diff --git a/crypto/openssl/crypto/aes/asm/aesv8-armx.pl b/crypto/openssl/crypto/aes/asm/aesv8-armx.pl index d0e0be6187bd..30dad3d03456 100755 --- a/crypto/openssl/crypto/aes/asm/aesv8-armx.pl +++ b/crypto/openssl/crypto/aes/asm/aesv8-armx.pl @@ -1,3739 +1,3753 @@ #! /usr/bin/env perl # Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # This module implements support for ARMv8 AES instructions. The # module is endian-agnostic in sense that it supports both big- and # little-endian cases. As does it support both 32- and 64-bit modes # of operation. Latter is achieved by limiting amount of utilized # registers to 16, which implies additional NEON load and integer # instructions. This has no effect on mighty Apple A7, where results # are literally equal to the theoretical estimates based on AES # instruction latencies and issue rates. On Cortex-A53, an in-order # execution core, this costs up to 10-15%, which is partially # compensated by implementing dedicated code path for 128-bit # CBC encrypt case. On Cortex-A57 parallelizable mode performance # seems to be limited by sheer amount of NEON instructions... # # April 2019 # # Key to performance of parallelize-able modes is round instruction # interleaving. But which factor to use? There is optimal one for # each combination of instruction latency and issue rate, beyond # which increasing interleave factor doesn't pay off. While on cons # side we have code size increase and resource waste on platforms for # which interleave factor is too high. In other words you want it to # be just right. So far interleave factor of 3x was serving well all # platforms. But for ThunderX2 optimal interleave factor was measured # to be 5x... # # Performance in cycles per byte processed with 128-bit key: # # CBC enc CBC dec CTR # Apple A7 2.39 1.20 1.20 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93 # Cortex-A72 1.33 0.85/0.88 0.92/0.96 # Denver 1.96 0.65/0.86 0.76/0.80 # Mongoose 1.33 1.23/1.20 1.30/1.20 # Kryo 1.26 0.87/0.94 1.00/1.00 # ThunderX2 5.95 1.25 1.30 # # (*) original 3.64/1.34/1.32 results were for r0p0 revision # and are still same even for updated module; # (**) numbers after slash are for 32-bit code, which is 3x- # interleaved; # $output is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!"; *STDOUT=*OUT; $prefix="aes_v8"; $_byte = ($flavour =~ /win/ ? 
"DCB" : ".byte"); $code=<<___; #include "arm_arch.h" #if __ARM_MAX_ARCH__>=7 ___ $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); $code.=<<___ if ($flavour !~ /64/); .arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) .fpu neon #ifdef __thumb2__ .syntax unified .thumb # define INST(a,b,c,d) $_byte c,d|0xc,a,b #else .code 32 # define INST(a,b,c,d) $_byte a,b,c,d #endif .text ___ # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to # maintain both 32- and 64-bit codes within single module and # transliterate common code to either flavour with regex vodoo. # {{{ my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10)); $code.=<<___; .align 5 .Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b .globl ${prefix}_set_encrypt_key .type ${prefix}_set_encrypt_key,%function .align 5 ${prefix}_set_encrypt_key: .Lenc_key: ___ $code.=<<___ if ($flavour =~ /64/); + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 ___ $code.=<<___; mov $ptr,#-1 cmp $inp,#0 b.eq .Lenc_key_abort cmp $out,#0 b.eq .Lenc_key_abort mov $ptr,#-2 cmp $bits,#128 b.lt .Lenc_key_abort cmp $bits,#256 b.gt .Lenc_key_abort tst $bits,#0x3f b.ne .Lenc_key_abort adr $ptr,.Lrcon cmp $bits,#192 veor $zero,$zero,$zero vld1.8 {$in0},[$inp],#16 mov $bits,#8 // reuse $bits vld1.32 {$rcon,$mask},[$ptr],#32 b.lt .Loop128 b.eq .L192 b .L256 .align 4 .Loop128: vtbl.8 $key,{$in0},$mask vext.8 $tmp,$zero,$in0,#12 vst1.32 {$in0},[$out],#16 aese $key,$zero subs $bits,$bits,#1 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $key,$key,$rcon veor $in0,$in0,$tmp vshl.u8 $rcon,$rcon,#1 veor $in0,$in0,$key b.ne .Loop128 vld1.32 {$rcon},[$ptr] vtbl.8 $key,{$in0},$mask vext.8 $tmp,$zero,$in0,#12 vst1.32 {$in0},[$out],#16 aese $key,$zero veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $key,$key,$rcon veor $in0,$in0,$tmp vshl.u8 $rcon,$rcon,#1 veor $in0,$in0,$key vtbl.8 $key,{$in0},$mask vext.8 $tmp,$zero,$in0,#12 vst1.32 {$in0},[$out],#16 aese $key,$zero veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $key,$key,$rcon veor $in0,$in0,$tmp veor $in0,$in0,$key vst1.32 {$in0},[$out] add $out,$out,#0x50 mov $rounds,#10 b .Ldone .align 4 .L192: vld1.8 {$in1},[$inp],#8 vmov.i8 $key,#8 // borrow $key vst1.32 {$in0},[$out],#16 vsub.i8 $mask,$mask,$key // adjust the mask .Loop192: vtbl.8 $key,{$in1},$mask vext.8 $tmp,$zero,$in0,#12 #ifdef __ARMEB__ vst1.32 {$in1},[$out],#16 sub $out,$out,#8 #else vst1.32 {$in1},[$out],#8 #endif aese $key,$zero subs $bits,$bits,#1 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in0,$in0,$tmp vdup.32 $tmp,${in0}[3] veor $tmp,$tmp,$in1 veor $key,$key,$rcon vext.8 $in1,$zero,$in1,#12 vshl.u8 $rcon,$rcon,#1 veor $in1,$in1,$tmp veor $in0,$in0,$key veor $in1,$in1,$key vst1.32 {$in0},[$out],#16 b.ne .Loop192 mov $rounds,#12 add $out,$out,#0x20 b .Ldone .align 4 .L256: vld1.8 {$in1},[$inp] mov $bits,#7 mov $rounds,#14 vst1.32 {$in0},[$out],#16 .Loop256: vtbl.8 $key,{$in1},$mask vext.8 $tmp,$zero,$in0,#12 vst1.32 {$in1},[$out],#16 aese $key,$zero subs 
$bits,$bits,#1 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in0,$in0,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $key,$key,$rcon veor $in0,$in0,$tmp vshl.u8 $rcon,$rcon,#1 veor $in0,$in0,$key vst1.32 {$in0},[$out],#16 b.eq .Ldone vdup.32 $key,${in0}[3] // just splat vext.8 $tmp,$zero,$in1,#12 aese $key,$zero veor $in1,$in1,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in1,$in1,$tmp vext.8 $tmp,$zero,$tmp,#12 veor $in1,$in1,$tmp veor $in1,$in1,$key b .Loop256 .Ldone: str $rounds,[$out] mov $ptr,#0 .Lenc_key_abort: mov x0,$ptr // return value `"ldr x29,[sp],#16" if ($flavour =~ /64/)` ret .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key .globl ${prefix}_set_decrypt_key .type ${prefix}_set_decrypt_key,%function .align 5 ${prefix}_set_decrypt_key: ___ $code.=<<___ if ($flavour =~ /64/); - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 ___ $code.=<<___ if ($flavour !~ /64/); stmdb sp!,{r4,lr} ___ $code.=<<___; bl .Lenc_key cmp x0,#0 b.ne .Ldec_key_abort sub $out,$out,#240 // restore original $out mov x4,#-16 add $inp,$out,x12,lsl#4 // end of key schedule vld1.32 {v0.16b},[$out] vld1.32 {v1.16b},[$inp] vst1.32 {v0.16b},[$inp],x4 vst1.32 {v1.16b},[$out],#16 .Loop_imc: vld1.32 {v0.16b},[$out] vld1.32 {v1.16b},[$inp] aesimc v0.16b,v0.16b aesimc v1.16b,v1.16b vst1.32 {v0.16b},[$inp],x4 vst1.32 {v1.16b},[$out],#16 cmp $inp,$out b.hi .Loop_imc vld1.32 {v0.16b},[$out] aesimc v0.16b,v0.16b vst1.32 {v0.16b},[$inp] eor x0,x0,x0 // return value .Ldec_key_abort: ___ $code.=<<___ if ($flavour !~ /64/); ldmia sp!,{r4,pc} ___ $code.=<<___ if ($flavour =~ /64/); ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret ___ $code.=<<___; .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key ___ }}} {{{ sub gen_block () { my $dir = shift; my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc"); my ($inp,$out,$key)=map("x$_",(0..2)); my $rounds="w3"; my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); $code.=<<___; .globl ${prefix}_${dir}crypt .type ${prefix}_${dir}crypt,%function .align 5 ${prefix}_${dir}crypt: +___ +$code.=<<___ if ($flavour =~ /64/); + AARCH64_VALID_CALL_TARGET +___ +$code.=<<___; ldr $rounds,[$key,#240] vld1.32 {$rndkey0},[$key],#16 vld1.8 {$inout},[$inp] sub $rounds,$rounds,#2 vld1.32 {$rndkey1},[$key],#16 .Loop_${dir}c: aes$e $inout,$rndkey0 aes$mc $inout,$inout vld1.32 {$rndkey0},[$key],#16 subs $rounds,$rounds,#2 aes$e $inout,$rndkey1 aes$mc $inout,$inout vld1.32 {$rndkey1},[$key],#16 b.gt .Loop_${dir}c aes$e $inout,$rndkey0 aes$mc $inout,$inout vld1.32 {$rndkey0},[$key] aes$e $inout,$rndkey1 veor $inout,$inout,$rndkey0 vst1.8 {$inout},[$out] ret .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt ___ } &gen_block("en"); &gen_block("de"); }}} # Performance in cycles per byte. # Processed with AES-ECB different key size. # It shows the value before and after optimization as below: # (before/after): # # AES-128-ECB AES-192-ECB AES-256-ECB # Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10 # Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14 # Optimization is implemented by loop unrolling and interleaving. # Commonly, we choose the unrolling factor as 5, if the input # data size smaller than 5 blocks, but not smaller than 3 blocks, # choose 3 as the unrolling factor. # If the input data size dsize >= 5*16 bytes, then take 5 blocks # as one iteration, every loop the left size lsize -= 5*16. # If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration, # every loop lsize -=3*16. 
# If lsize < 3*16 bytes, treat them as the tail, interleave the # two blocks AES instructions. # There is one special case, if the original input data size dsize # = 16 bytes, we will treat it seperately to improve the # performance: one independent code block without LR, FP load and # store, just looks like what the original ECB implementation does. {{{ my ($inp,$out,$len,$key)=map("x$_",(0..3)); my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8"); my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ### q7 last round key ### q10-q15 q7 Last 7 round keys ### q8-q9 preloaded round keys except last 7 keys for big size ### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte { my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); my ($dat3,$in3,$tmp3); # used only in 64-bit mode my ($dat4,$in4,$tmp4); if ($flavour =~ /64/) { ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); } $code.=<<___; .globl ${prefix}_ecb_encrypt .type ${prefix}_ecb_encrypt,%function .align 5 ${prefix}_ecb_encrypt: ___ $code.=<<___ if ($flavour =~ /64/); + AARCH64_VALID_CALL_TARGET subs $len,$len,#16 // Original input data size bigger than 16, jump to big size processing. b.ne .Lecb_big_size vld1.8 {$dat0},[$inp] cmp $enc,#0 // en- or decrypting? ldr $rounds,[$key,#240] vld1.32 {q5-q6},[$key],#32 // load key schedule... b.eq .Lecb_small_dec aese $dat0,q5 aesmc $dat0,$dat0 vld1.32 {q8-q9},[$key],#32 // load key schedule... aese $dat0,q6 aesmc $dat0,$dat0 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing b.eq .Lecb_128_enc .Lecb_round_loop: aese $dat0,q8 aesmc $dat0,$dat0 vld1.32 {q8},[$key],#16 // load key schedule... aese $dat0,q9 aesmc $dat0,$dat0 vld1.32 {q9},[$key],#16 // load key schedule... subs $rounds,$rounds,#2 // bias b.gt .Lecb_round_loop .Lecb_128_enc: vld1.32 {q10-q11},[$key],#32 // load key schedule... aese $dat0,q8 aesmc $dat0,$dat0 aese $dat0,q9 aesmc $dat0,$dat0 vld1.32 {q12-q13},[$key],#32 // load key schedule... aese $dat0,q10 aesmc $dat0,$dat0 aese $dat0,q11 aesmc $dat0,$dat0 vld1.32 {q14-q15},[$key],#32 // load key schedule... aese $dat0,q12 aesmc $dat0,$dat0 aese $dat0,q13 aesmc $dat0,$dat0 vld1.32 {$rndlast},[$key] aese $dat0,q14 aesmc $dat0,$dat0 aese $dat0,q15 veor $dat0,$dat0,$rndlast vst1.8 {$dat0},[$out] b .Lecb_Final_abort .Lecb_small_dec: aesd $dat0,q5 aesimc $dat0,$dat0 vld1.32 {q8-q9},[$key],#32 // load key schedule... aesd $dat0,q6 aesimc $dat0,$dat0 subs $rounds,$rounds,#10 // bias b.eq .Lecb_128_dec .Lecb_dec_round_loop: aesd $dat0,q8 aesimc $dat0,$dat0 vld1.32 {q8},[$key],#16 // load key schedule... aesd $dat0,q9 aesimc $dat0,$dat0 vld1.32 {q9},[$key],#16 // load key schedule... subs $rounds,$rounds,#2 // bias b.gt .Lecb_dec_round_loop .Lecb_128_dec: vld1.32 {q10-q11},[$key],#32 // load key schedule... aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat0,q9 aesimc $dat0,$dat0 vld1.32 {q12-q13},[$key],#32 // load key schedule... aesd $dat0,q10 aesimc $dat0,$dat0 aesd $dat0,q11 aesimc $dat0,$dat0 vld1.32 {q14-q15},[$key],#32 // load key schedule... aesd $dat0,q12 aesimc $dat0,$dat0 aesd $dat0,q13 aesimc $dat0,$dat0 vld1.32 {$rndlast},[$key] aesd $dat0,q14 aesimc $dat0,$dat0 aesd $dat0,q15 veor $dat0,$dat0,$rndlast vst1.8 {$dat0},[$out] b .Lecb_Final_abort .Lecb_big_size: ___ $code.=<<___ if ($flavour =~ /64/); stp x29,x30,[sp,#-16]! 
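	// Big-size path (length > 16 bytes): as described in the comment
	// block above, blocks are processed five at a time while at least
	// five remain, then three at a time, and finally a one- or two-block
	// tail.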
add x29,sp,#0 ___ $code.=<<___ if ($flavour !~ /64/); mov ip,sp stmdb sp!,{r4-r8,lr} vstmdb sp!,{d8-d15} @ ABI specification says so ldmia ip,{r4-r5} @ load remaining args subs $len,$len,#16 ___ $code.=<<___; mov $step,#16 b.lo .Lecb_done cclr $step,eq cmp $enc,#0 // en- or decrypting? ldr $rounds,[$key,#240] and $len,$len,#-16 vld1.8 {$dat},[$inp],$step vld1.32 {q8-q9},[$key] // load key schedule... sub $rounds,$rounds,#6 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys sub $rounds,$rounds,#2 vld1.32 {q10-q11},[$key_],#32 vld1.32 {q12-q13},[$key_],#32 vld1.32 {q14-q15},[$key_],#32 vld1.32 {$rndlast},[$key_] add $key_,$key,#32 mov $cnt,$rounds b.eq .Lecb_dec vld1.8 {$dat1},[$inp],#16 subs $len,$len,#32 // bias add $cnt,$rounds,#2 vorr $in1,$dat1,$dat1 vorr $dat2,$dat1,$dat1 vorr $dat1,$dat,$dat b.lo .Lecb_enc_tail vorr $dat1,$in1,$in1 vld1.8 {$dat2},[$inp],#16 ___ $code.=<<___ if ($flavour =~ /64/); cmp $len,#32 b.lo .Loop3x_ecb_enc vld1.8 {$dat3},[$inp],#16 vld1.8 {$dat4},[$inp],#16 sub $len,$len,#32 // bias mov $cnt,$rounds .Loop5x_ecb_enc: aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 aese $dat3,q8 aesmc $dat3,$dat3 aese $dat4,q8 aesmc $dat4,$dat4 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 aese $dat3,q9 aesmc $dat3,$dat3 aese $dat4,q9 aesmc $dat4,$dat4 vld1.32 {q9},[$key_],#16 b.gt .Loop5x_ecb_enc aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 aese $dat3,q8 aesmc $dat3,$dat3 aese $dat4,q8 aesmc $dat4,$dat4 cmp $len,#0x40 // because .Lecb_enc_tail4x sub $len,$len,#0x50 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 aese $dat3,q9 aesmc $dat3,$dat3 aese $dat4,q9 aesmc $dat4,$dat4 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo mov $key_,$key aese $dat0,q10 aesmc $dat0,$dat0 aese $dat1,q10 aesmc $dat1,$dat1 aese $dat2,q10 aesmc $dat2,$dat2 aese $dat3,q10 aesmc $dat3,$dat3 aese $dat4,q10 aesmc $dat4,$dat4 add $inp,$inp,x6 // $inp is adjusted in such way that // at exit from the loop $dat1-$dat4 // are loaded with last "words" add x6,$len,#0x60 // because .Lecb_enc_tail4x aese $dat0,q11 aesmc $dat0,$dat0 aese $dat1,q11 aesmc $dat1,$dat1 aese $dat2,q11 aesmc $dat2,$dat2 aese $dat3,q11 aesmc $dat3,$dat3 aese $dat4,q11 aesmc $dat4,$dat4 aese $dat0,q12 aesmc $dat0,$dat0 aese $dat1,q12 aesmc $dat1,$dat1 aese $dat2,q12 aesmc $dat2,$dat2 aese $dat3,q12 aesmc $dat3,$dat3 aese $dat4,q12 aesmc $dat4,$dat4 aese $dat0,q13 aesmc $dat0,$dat0 aese $dat1,q13 aesmc $dat1,$dat1 aese $dat2,q13 aesmc $dat2,$dat2 aese $dat3,q13 aesmc $dat3,$dat3 aese $dat4,q13 aesmc $dat4,$dat4 aese $dat0,q14 aesmc $dat0,$dat0 aese $dat1,q14 aesmc $dat1,$dat1 aese $dat2,q14 aesmc $dat2,$dat2 aese $dat3,q14 aesmc $dat3,$dat3 aese $dat4,q14 aesmc $dat4,$dat4 aese $dat0,q15 vld1.8 {$in0},[$inp],#16 aese $dat1,q15 vld1.8 {$in1},[$inp],#16 aese $dat2,q15 vld1.8 {$in2},[$inp],#16 aese $dat3,q15 vld1.8 {$in3},[$inp],#16 aese $dat4,q15 vld1.8 {$in4},[$inp],#16 cbz x6,.Lecb_enc_tail4x vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] veor $tmp0,$rndlast,$dat0 vorr $dat0,$in0,$in0 veor $tmp1,$rndlast,$dat1 vorr $dat1,$in1,$in1 veor $tmp2,$rndlast,$dat2 vorr $dat2,$in2,$in2 veor $tmp3,$rndlast,$dat3 vorr $dat3,$in3,$in3 veor $tmp4,$rndlast,$dat4 vst1.8 {$tmp0},[$out],#16 vorr $dat4,$in4,$in4 vst1.8 {$tmp1},[$out],#16 mov $cnt,$rounds vst1.8 {$tmp2},[$out],#16 vld1.32 {q9},[$key_],#16 
// re-pre-load rndkey[1] vst1.8 {$tmp3},[$out],#16 vst1.8 {$tmp4},[$out],#16 b.hs .Loop5x_ecb_enc add $len,$len,#0x50 cbz $len,.Lecb_done add $cnt,$rounds,#2 subs $len,$len,#0x30 vorr $dat0,$in2,$in2 vorr $dat1,$in3,$in3 vorr $dat2,$in4,$in4 b.lo .Lecb_enc_tail b .Loop3x_ecb_enc .align 4 .Lecb_enc_tail4x: veor $tmp1,$rndlast,$dat1 veor $tmp2,$rndlast,$dat2 veor $tmp3,$rndlast,$dat3 veor $tmp4,$rndlast,$dat4 vst1.8 {$tmp1},[$out],#16 vst1.8 {$tmp2},[$out],#16 vst1.8 {$tmp3},[$out],#16 vst1.8 {$tmp4},[$out],#16 b .Lecb_done .align 4 ___ $code.=<<___; .Loop3x_ecb_enc: aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 vld1.32 {q9},[$key_],#16 b.gt .Loop3x_ecb_enc aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 subs $len,$len,#0x30 mov.lo x6,$len // x6, $cnt, is zero at this point aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 add $inp,$inp,x6 // $inp is adjusted in such way that // at exit from the loop $dat1-$dat2 // are loaded with last "words" mov $key_,$key aese $dat0,q12 aesmc $dat0,$dat0 aese $dat1,q12 aesmc $dat1,$dat1 aese $dat2,q12 aesmc $dat2,$dat2 vld1.8 {$in0},[$inp],#16 aese $dat0,q13 aesmc $dat0,$dat0 aese $dat1,q13 aesmc $dat1,$dat1 aese $dat2,q13 aesmc $dat2,$dat2 vld1.8 {$in1},[$inp],#16 aese $dat0,q14 aesmc $dat0,$dat0 aese $dat1,q14 aesmc $dat1,$dat1 aese $dat2,q14 aesmc $dat2,$dat2 vld1.8 {$in2},[$inp],#16 aese $dat0,q15 aese $dat1,q15 aese $dat2,q15 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] add $cnt,$rounds,#2 veor $tmp0,$rndlast,$dat0 veor $tmp1,$rndlast,$dat1 veor $dat2,$dat2,$rndlast vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$tmp0},[$out],#16 vorr $dat0,$in0,$in0 vst1.8 {$tmp1},[$out],#16 vorr $dat1,$in1,$in1 vst1.8 {$dat2},[$out],#16 vorr $dat2,$in2,$in2 b.hs .Loop3x_ecb_enc cmn $len,#0x30 b.eq .Lecb_done nop .Lecb_enc_tail: aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 vld1.32 {q9},[$key_],#16 b.gt .Lecb_enc_tail aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 aese $dat1,q12 aesmc $dat1,$dat1 aese $dat2,q12 aesmc $dat2,$dat2 cmn $len,#0x20 aese $dat1,q13 aesmc $dat1,$dat1 aese $dat2,q13 aesmc $dat2,$dat2 aese $dat1,q14 aesmc $dat1,$dat1 aese $dat2,q14 aesmc $dat2,$dat2 aese $dat1,q15 aese $dat2,q15 b.eq .Lecb_enc_one veor $tmp1,$rndlast,$dat1 veor $tmp2,$rndlast,$dat2 vst1.8 {$tmp1},[$out],#16 vst1.8 {$tmp2},[$out],#16 b .Lecb_done .Lecb_enc_one: veor $tmp1,$rndlast,$dat2 vst1.8 {$tmp1},[$out],#16 b .Lecb_done ___ $code.=<<___; .align 5 .Lecb_dec: vld1.8 {$dat1},[$inp],#16 subs $len,$len,#32 // bias add $cnt,$rounds,#2 vorr $in1,$dat1,$dat1 vorr $dat2,$dat1,$dat1 vorr $dat1,$dat,$dat b.lo .Lecb_dec_tail vorr $dat1,$in1,$in1 vld1.8 {$dat2},[$inp],#16 ___ $code.=<<___ if ($flavour =~ /64/); cmp $len,#32 b.lo .Loop3x_ecb_dec vld1.8 {$dat3},[$inp],#16 vld1.8 {$dat4},[$inp],#16 sub $len,$len,#32 // bias mov $cnt,$rounds .Loop5x_ecb_dec: aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat3,q8 aesimc $dat3,$dat3 aesd $dat4,q8 aesimc $dat4,$dat4 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aesd $dat0,q9 aesimc 
$dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat3,q9 aesimc $dat3,$dat3 aesd $dat4,q9 aesimc $dat4,$dat4 vld1.32 {q9},[$key_],#16 b.gt .Loop5x_ecb_dec aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat3,q8 aesimc $dat3,$dat3 aesd $dat4,q8 aesimc $dat4,$dat4 cmp $len,#0x40 // because .Lecb_tail4x sub $len,$len,#0x50 aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat3,q9 aesimc $dat3,$dat3 aesd $dat4,q9 aesimc $dat4,$dat4 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo mov $key_,$key aesd $dat0,q10 aesimc $dat0,$dat0 aesd $dat1,q10 aesimc $dat1,$dat1 aesd $dat2,q10 aesimc $dat2,$dat2 aesd $dat3,q10 aesimc $dat3,$dat3 aesd $dat4,q10 aesimc $dat4,$dat4 add $inp,$inp,x6 // $inp is adjusted in such way that // at exit from the loop $dat1-$dat4 // are loaded with last "words" add x6,$len,#0x60 // because .Lecb_tail4x aesd $dat0,q11 aesimc $dat0,$dat0 aesd $dat1,q11 aesimc $dat1,$dat1 aesd $dat2,q11 aesimc $dat2,$dat2 aesd $dat3,q11 aesimc $dat3,$dat3 aesd $dat4,q11 aesimc $dat4,$dat4 aesd $dat0,q12 aesimc $dat0,$dat0 aesd $dat1,q12 aesimc $dat1,$dat1 aesd $dat2,q12 aesimc $dat2,$dat2 aesd $dat3,q12 aesimc $dat3,$dat3 aesd $dat4,q12 aesimc $dat4,$dat4 aesd $dat0,q13 aesimc $dat0,$dat0 aesd $dat1,q13 aesimc $dat1,$dat1 aesd $dat2,q13 aesimc $dat2,$dat2 aesd $dat3,q13 aesimc $dat3,$dat3 aesd $dat4,q13 aesimc $dat4,$dat4 aesd $dat0,q14 aesimc $dat0,$dat0 aesd $dat1,q14 aesimc $dat1,$dat1 aesd $dat2,q14 aesimc $dat2,$dat2 aesd $dat3,q14 aesimc $dat3,$dat3 aesd $dat4,q14 aesimc $dat4,$dat4 aesd $dat0,q15 vld1.8 {$in0},[$inp],#16 aesd $dat1,q15 vld1.8 {$in1},[$inp],#16 aesd $dat2,q15 vld1.8 {$in2},[$inp],#16 aesd $dat3,q15 vld1.8 {$in3},[$inp],#16 aesd $dat4,q15 vld1.8 {$in4},[$inp],#16 cbz x6,.Lecb_tail4x vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] veor $tmp0,$rndlast,$dat0 vorr $dat0,$in0,$in0 veor $tmp1,$rndlast,$dat1 vorr $dat1,$in1,$in1 veor $tmp2,$rndlast,$dat2 vorr $dat2,$in2,$in2 veor $tmp3,$rndlast,$dat3 vorr $dat3,$in3,$in3 veor $tmp4,$rndlast,$dat4 vst1.8 {$tmp0},[$out],#16 vorr $dat4,$in4,$in4 vst1.8 {$tmp1},[$out],#16 mov $cnt,$rounds vst1.8 {$tmp2},[$out],#16 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$tmp3},[$out],#16 vst1.8 {$tmp4},[$out],#16 b.hs .Loop5x_ecb_dec add $len,$len,#0x50 cbz $len,.Lecb_done add $cnt,$rounds,#2 subs $len,$len,#0x30 vorr $dat0,$in2,$in2 vorr $dat1,$in3,$in3 vorr $dat2,$in4,$in4 b.lo .Lecb_dec_tail b .Loop3x_ecb_dec .align 4 .Lecb_tail4x: veor $tmp1,$rndlast,$dat1 veor $tmp2,$rndlast,$dat2 veor $tmp3,$rndlast,$dat3 veor $tmp4,$rndlast,$dat4 vst1.8 {$tmp1},[$out],#16 vst1.8 {$tmp2},[$out],#16 vst1.8 {$tmp3},[$out],#16 vst1.8 {$tmp4},[$out],#16 b .Lecb_done .align 4 ___ $code.=<<___; .Loop3x_ecb_dec: aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 vld1.32 {q9},[$key_],#16 b.gt .Loop3x_ecb_dec aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 subs $len,$len,#0x30 mov.lo x6,$len // x6, $cnt, is zero at this point aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 add $inp,$inp,x6 // $inp is adjusted in such way that // at exit from the loop $dat1-$dat2 // are loaded with last "words" mov $key_,$key 
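	// The loop above has streamed the early round keys through q8/q9;
	// the remaining rounds are finished with the preloaded tail keys
	// q12-q15 and a final veor with the last round key, after which
	// q8/q9 are re-preloaded (note the reset key pointer) for the next
	// three-block pass.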
aesd $dat0,q12 aesimc $dat0,$dat0 aesd $dat1,q12 aesimc $dat1,$dat1 aesd $dat2,q12 aesimc $dat2,$dat2 vld1.8 {$in0},[$inp],#16 aesd $dat0,q13 aesimc $dat0,$dat0 aesd $dat1,q13 aesimc $dat1,$dat1 aesd $dat2,q13 aesimc $dat2,$dat2 vld1.8 {$in1},[$inp],#16 aesd $dat0,q14 aesimc $dat0,$dat0 aesd $dat1,q14 aesimc $dat1,$dat1 aesd $dat2,q14 aesimc $dat2,$dat2 vld1.8 {$in2},[$inp],#16 aesd $dat0,q15 aesd $dat1,q15 aesd $dat2,q15 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] add $cnt,$rounds,#2 veor $tmp0,$rndlast,$dat0 veor $tmp1,$rndlast,$dat1 veor $dat2,$dat2,$rndlast vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$tmp0},[$out],#16 vorr $dat0,$in0,$in0 vst1.8 {$tmp1},[$out],#16 vorr $dat1,$in1,$in1 vst1.8 {$dat2},[$out],#16 vorr $dat2,$in2,$in2 b.hs .Loop3x_ecb_dec cmn $len,#0x30 b.eq .Lecb_done nop .Lecb_dec_tail: aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 vld1.32 {q9},[$key_],#16 b.gt .Lecb_dec_tail aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat1,q12 aesimc $dat1,$dat1 aesd $dat2,q12 aesimc $dat2,$dat2 cmn $len,#0x20 aesd $dat1,q13 aesimc $dat1,$dat1 aesd $dat2,q13 aesimc $dat2,$dat2 aesd $dat1,q14 aesimc $dat1,$dat1 aesd $dat2,q14 aesimc $dat2,$dat2 aesd $dat1,q15 aesd $dat2,q15 b.eq .Lecb_dec_one veor $tmp1,$rndlast,$dat1 veor $tmp2,$rndlast,$dat2 vst1.8 {$tmp1},[$out],#16 vst1.8 {$tmp2},[$out],#16 b .Lecb_done .Lecb_dec_one: veor $tmp1,$rndlast,$dat2 vst1.8 {$tmp1},[$out],#16 .Lecb_done: ___ } $code.=<<___ if ($flavour !~ /64/); vldmia sp!,{d8-d15} ldmia sp!,{r4-r8,pc} ___ $code.=<<___ if ($flavour =~ /64/); ldr x29,[sp],#16 ___ $code.=<<___ if ($flavour =~ /64/); .Lecb_Final_abort: ret ___ $code.=<<___; .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt ___ }}} {{{ my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key); ### q8-q15 preloaded key schedule $code.=<<___; .globl ${prefix}_cbc_encrypt .type ${prefix}_cbc_encrypt,%function .align 5 ${prefix}_cbc_encrypt: ___ $code.=<<___ if ($flavour =~ /64/); + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 ___ $code.=<<___ if ($flavour !~ /64/); mov ip,sp stmdb sp!,{r4-r8,lr} vstmdb sp!,{d8-d15} @ ABI specification says so ldmia ip,{r4-r5} @ load remaining args ___ $code.=<<___; subs $len,$len,#16 mov $step,#16 b.lo .Lcbc_abort cclr $step,eq cmp $enc,#0 // en- or decrypting? ldr $rounds,[$key,#240] and $len,$len,#-16 vld1.8 {$ivec},[$ivp] vld1.8 {$dat},[$inp],$step vld1.32 {q8-q9},[$key] // load key schedule... 
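	// The next few instructions bias the round count by 6+2 and point
	// the spare key pointer at the last seven round keys, so q10-q15 and
	// q7 (the final key) stay resident for the whole call while the main
	// loops only stream the middle keys through q8/q9.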
sub $rounds,$rounds,#6 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys sub $rounds,$rounds,#2 vld1.32 {q10-q11},[$key_],#32 vld1.32 {q12-q13},[$key_],#32 vld1.32 {q14-q15},[$key_],#32 vld1.32 {$rndlast},[$key_] add $key_,$key,#32 mov $cnt,$rounds b.eq .Lcbc_dec cmp $rounds,#2 veor $dat,$dat,$ivec veor $rndzero_n_last,q8,$rndlast b.eq .Lcbc_enc128 vld1.32 {$in0-$in1},[$key_] add $key_,$key,#16 add $key4,$key,#16*4 add $key5,$key,#16*5 aese $dat,q8 aesmc $dat,$dat add $key6,$key,#16*6 add $key7,$key,#16*7 b .Lenter_cbc_enc .align 4 .Loop_cbc_enc: aese $dat,q8 aesmc $dat,$dat vst1.8 {$ivec},[$out],#16 .Lenter_cbc_enc: aese $dat,q9 aesmc $dat,$dat aese $dat,$in0 aesmc $dat,$dat vld1.32 {q8},[$key4] cmp $rounds,#4 aese $dat,$in1 aesmc $dat,$dat vld1.32 {q9},[$key5] b.eq .Lcbc_enc192 aese $dat,q8 aesmc $dat,$dat vld1.32 {q8},[$key6] aese $dat,q9 aesmc $dat,$dat vld1.32 {q9},[$key7] nop .Lcbc_enc192: aese $dat,q8 aesmc $dat,$dat subs $len,$len,#16 aese $dat,q9 aesmc $dat,$dat cclr $step,eq aese $dat,q10 aesmc $dat,$dat aese $dat,q11 aesmc $dat,$dat vld1.8 {q8},[$inp],$step aese $dat,q12 aesmc $dat,$dat veor q8,q8,$rndzero_n_last aese $dat,q13 aesmc $dat,$dat vld1.32 {q9},[$key_] // re-pre-load rndkey[1] aese $dat,q14 aesmc $dat,$dat aese $dat,q15 veor $ivec,$dat,$rndlast b.hs .Loop_cbc_enc vst1.8 {$ivec},[$out],#16 b .Lcbc_done .align 5 .Lcbc_enc128: vld1.32 {$in0-$in1},[$key_] aese $dat,q8 aesmc $dat,$dat b .Lenter_cbc_enc128 .Loop_cbc_enc128: aese $dat,q8 aesmc $dat,$dat vst1.8 {$ivec},[$out],#16 .Lenter_cbc_enc128: aese $dat,q9 aesmc $dat,$dat subs $len,$len,#16 aese $dat,$in0 aesmc $dat,$dat cclr $step,eq aese $dat,$in1 aesmc $dat,$dat aese $dat,q10 aesmc $dat,$dat aese $dat,q11 aesmc $dat,$dat vld1.8 {q8},[$inp],$step aese $dat,q12 aesmc $dat,$dat aese $dat,q13 aesmc $dat,$dat aese $dat,q14 aesmc $dat,$dat veor q8,q8,$rndzero_n_last aese $dat,q15 veor $ivec,$dat,$rndlast b.hs .Loop_cbc_enc128 vst1.8 {$ivec},[$out],#16 b .Lcbc_done ___ { my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); my ($dat3,$in3,$tmp3); # used only in 64-bit mode my ($dat4,$in4,$tmp4); if ($flavour =~ /64/) { ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); } $code.=<<___; .align 5 .Lcbc_dec: vld1.8 {$dat2},[$inp],#16 subs $len,$len,#32 // bias add $cnt,$rounds,#2 vorr $in1,$dat,$dat vorr $dat1,$dat,$dat vorr $in2,$dat2,$dat2 b.lo .Lcbc_dec_tail vorr $dat1,$dat2,$dat2 vld1.8 {$dat2},[$inp],#16 vorr $in0,$dat,$dat vorr $in1,$dat1,$dat1 vorr $in2,$dat2,$dat2 ___ $code.=<<___ if ($flavour =~ /64/); cmp $len,#32 b.lo .Loop3x_cbc_dec vld1.8 {$dat3},[$inp],#16 vld1.8 {$dat4},[$inp],#16 sub $len,$len,#32 // bias mov $cnt,$rounds vorr $in3,$dat3,$dat3 vorr $in4,$dat4,$dat4 .Loop5x_cbc_dec: aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat3,q8 aesimc $dat3,$dat3 aesd $dat4,q8 aesimc $dat4,$dat4 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat3,q9 aesimc $dat3,$dat3 aesd $dat4,q9 aesimc $dat4,$dat4 vld1.32 {q9},[$key_],#16 b.gt .Loop5x_cbc_dec aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat3,q8 aesimc $dat3,$dat3 aesd $dat4,q8 aesimc $dat4,$dat4 cmp $len,#0x40 // because .Lcbc_tail4x sub $len,$len,#0x50 aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat3,q9 aesimc $dat3,$dat3 aesd $dat4,q9 aesimc $dat4,$dat4 csel x6,xzr,$len,gt // 
borrow x6, $cnt, "gt" is not typo mov $key_,$key aesd $dat0,q10 aesimc $dat0,$dat0 aesd $dat1,q10 aesimc $dat1,$dat1 aesd $dat2,q10 aesimc $dat2,$dat2 aesd $dat3,q10 aesimc $dat3,$dat3 aesd $dat4,q10 aesimc $dat4,$dat4 add $inp,$inp,x6 // $inp is adjusted in such way that // at exit from the loop $dat1-$dat4 // are loaded with last "words" add x6,$len,#0x60 // because .Lcbc_tail4x aesd $dat0,q11 aesimc $dat0,$dat0 aesd $dat1,q11 aesimc $dat1,$dat1 aesd $dat2,q11 aesimc $dat2,$dat2 aesd $dat3,q11 aesimc $dat3,$dat3 aesd $dat4,q11 aesimc $dat4,$dat4 aesd $dat0,q12 aesimc $dat0,$dat0 aesd $dat1,q12 aesimc $dat1,$dat1 aesd $dat2,q12 aesimc $dat2,$dat2 aesd $dat3,q12 aesimc $dat3,$dat3 aesd $dat4,q12 aesimc $dat4,$dat4 aesd $dat0,q13 aesimc $dat0,$dat0 aesd $dat1,q13 aesimc $dat1,$dat1 aesd $dat2,q13 aesimc $dat2,$dat2 aesd $dat3,q13 aesimc $dat3,$dat3 aesd $dat4,q13 aesimc $dat4,$dat4 aesd $dat0,q14 aesimc $dat0,$dat0 aesd $dat1,q14 aesimc $dat1,$dat1 aesd $dat2,q14 aesimc $dat2,$dat2 aesd $dat3,q14 aesimc $dat3,$dat3 aesd $dat4,q14 aesimc $dat4,$dat4 veor $tmp0,$ivec,$rndlast aesd $dat0,q15 veor $tmp1,$in0,$rndlast vld1.8 {$in0},[$inp],#16 aesd $dat1,q15 veor $tmp2,$in1,$rndlast vld1.8 {$in1},[$inp],#16 aesd $dat2,q15 veor $tmp3,$in2,$rndlast vld1.8 {$in2},[$inp],#16 aesd $dat3,q15 veor $tmp4,$in3,$rndlast vld1.8 {$in3},[$inp],#16 aesd $dat4,q15 vorr $ivec,$in4,$in4 vld1.8 {$in4},[$inp],#16 cbz x6,.Lcbc_tail4x vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] veor $tmp0,$tmp0,$dat0 vorr $dat0,$in0,$in0 veor $tmp1,$tmp1,$dat1 vorr $dat1,$in1,$in1 veor $tmp2,$tmp2,$dat2 vorr $dat2,$in2,$in2 veor $tmp3,$tmp3,$dat3 vorr $dat3,$in3,$in3 veor $tmp4,$tmp4,$dat4 vst1.8 {$tmp0},[$out],#16 vorr $dat4,$in4,$in4 vst1.8 {$tmp1},[$out],#16 mov $cnt,$rounds vst1.8 {$tmp2},[$out],#16 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$tmp3},[$out],#16 vst1.8 {$tmp4},[$out],#16 b.hs .Loop5x_cbc_dec add $len,$len,#0x50 cbz $len,.Lcbc_done add $cnt,$rounds,#2 subs $len,$len,#0x30 vorr $dat0,$in2,$in2 vorr $in0,$in2,$in2 vorr $dat1,$in3,$in3 vorr $in1,$in3,$in3 vorr $dat2,$in4,$in4 vorr $in2,$in4,$in4 b.lo .Lcbc_dec_tail b .Loop3x_cbc_dec .align 4 .Lcbc_tail4x: veor $tmp1,$tmp0,$dat1 veor $tmp2,$tmp2,$dat2 veor $tmp3,$tmp3,$dat3 veor $tmp4,$tmp4,$dat4 vst1.8 {$tmp1},[$out],#16 vst1.8 {$tmp2},[$out],#16 vst1.8 {$tmp3},[$out],#16 vst1.8 {$tmp4},[$out],#16 b .Lcbc_done .align 4 ___ $code.=<<___; .Loop3x_cbc_dec: aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 vld1.32 {q9},[$key_],#16 b.gt .Loop3x_cbc_dec aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 veor $tmp0,$ivec,$rndlast subs $len,$len,#0x30 veor $tmp1,$in0,$rndlast mov.lo x6,$len // x6, $cnt, is zero at this point aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 veor $tmp2,$in1,$rndlast add $inp,$inp,x6 // $inp is adjusted in such way that // at exit from the loop $dat1-$dat2 // are loaded with last "words" vorr $ivec,$in2,$in2 mov $key_,$key aesd $dat0,q12 aesimc $dat0,$dat0 aesd $dat1,q12 aesimc $dat1,$dat1 aesd $dat2,q12 aesimc $dat2,$dat2 vld1.8 {$in0},[$inp],#16 aesd $dat0,q13 aesimc $dat0,$dat0 aesd $dat1,q13 aesimc $dat1,$dat1 aesd $dat2,q13 aesimc $dat2,$dat2 vld1.8 {$in1},[$inp],#16 aesd $dat0,q14 aesimc $dat0,$dat0 aesd $dat1,q14 aesimc $dat1,$dat1 aesd 
$dat2,q14 aesimc $dat2,$dat2 vld1.8 {$in2},[$inp],#16 aesd $dat0,q15 aesd $dat1,q15 aesd $dat2,q15 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] add $cnt,$rounds,#2 veor $tmp0,$tmp0,$dat0 veor $tmp1,$tmp1,$dat1 veor $dat2,$dat2,$tmp2 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$tmp0},[$out],#16 vorr $dat0,$in0,$in0 vst1.8 {$tmp1},[$out],#16 vorr $dat1,$in1,$in1 vst1.8 {$dat2},[$out],#16 vorr $dat2,$in2,$in2 b.hs .Loop3x_cbc_dec cmn $len,#0x30 b.eq .Lcbc_done nop .Lcbc_dec_tail: aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 vld1.32 {q9},[$key_],#16 b.gt .Lcbc_dec_tail aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat1,q12 aesimc $dat1,$dat1 aesd $dat2,q12 aesimc $dat2,$dat2 cmn $len,#0x20 aesd $dat1,q13 aesimc $dat1,$dat1 aesd $dat2,q13 aesimc $dat2,$dat2 veor $tmp1,$ivec,$rndlast aesd $dat1,q14 aesimc $dat1,$dat1 aesd $dat2,q14 aesimc $dat2,$dat2 veor $tmp2,$in1,$rndlast aesd $dat1,q15 aesd $dat2,q15 b.eq .Lcbc_dec_one veor $tmp1,$tmp1,$dat1 veor $tmp2,$tmp2,$dat2 vorr $ivec,$in2,$in2 vst1.8 {$tmp1},[$out],#16 vst1.8 {$tmp2},[$out],#16 b .Lcbc_done .Lcbc_dec_one: veor $tmp1,$tmp1,$dat2 vorr $ivec,$in2,$in2 vst1.8 {$tmp1},[$out],#16 .Lcbc_done: vst1.8 {$ivec},[$ivp] .Lcbc_abort: ___ } $code.=<<___ if ($flavour !~ /64/); vldmia sp!,{d8-d15} ldmia sp!,{r4-r8,pc} ___ $code.=<<___ if ($flavour =~ /64/); ldr x29,[sp],#16 ret ___ $code.=<<___; .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt ___ }}} {{{ my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my ($rounds,$cnt,$key_)=("w5","w6","x7"); my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); my $step="x12"; # aliases with $tctr2 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); # used only in 64-bit mode... my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23)); my ($dat,$tmp)=($dat0,$tmp0); ### q8-q15 preloaded key schedule $code.=<<___; .globl ${prefix}_ctr32_encrypt_blocks .type ${prefix}_ctr32_encrypt_blocks,%function .align 5 ${prefix}_ctr32_encrypt_blocks: ___ $code.=<<___ if ($flavour =~ /64/); + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 ___ $code.=<<___ if ($flavour !~ /64/); mov ip,sp stmdb sp!,{r4-r10,lr} vstmdb sp!,{d8-d15} @ ABI specification says so ldr r4, [ip] @ load remaining arg ___ $code.=<<___; ldr $rounds,[$key,#240] ldr $ctr, [$ivp, #12] #ifdef __ARMEB__ vld1.8 {$dat0},[$ivp] #else vld1.32 {$dat0},[$ivp] #endif vld1.32 {q8-q9},[$key] // load key schedule... 
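	// Only the low 32 bits of the IV (loaded from offset 12 above) act
	// as the block counter; it is byte-reversed on little-endian so it
	// can be incremented as an integer and rev'ed back into each block.
	// The round count is then biased by 4+2 so that q12-q15 and q7 hold
	// the last five round keys for the whole call.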
sub $rounds,$rounds,#4 mov $step,#16 cmp $len,#2 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys sub $rounds,$rounds,#2 vld1.32 {q12-q13},[$key_],#32 vld1.32 {q14-q15},[$key_],#32 vld1.32 {$rndlast},[$key_] add $key_,$key,#32 mov $cnt,$rounds cclr $step,lo #ifndef __ARMEB__ rev $ctr, $ctr #endif ___ $code.=<<___ if ($flavour =~ /64/); vorr $dat1,$dat0,$dat0 add $tctr1, $ctr, #1 vorr $dat2,$dat0,$dat0 add $ctr, $ctr, #2 vorr $ivec,$dat0,$dat0 rev $tctr1, $tctr1 vmov.32 ${dat1}[3],$tctr1 b.ls .Lctr32_tail rev $tctr2, $ctr sub $len,$len,#3 // bias vmov.32 ${dat2}[3],$tctr2 ___ $code.=<<___ if ($flavour !~ /64/); add $tctr1, $ctr, #1 vorr $ivec,$dat0,$dat0 rev $tctr1, $tctr1 vmov.32 ${ivec}[3],$tctr1 add $ctr, $ctr, #2 vorr $dat1,$ivec,$ivec b.ls .Lctr32_tail rev $tctr2, $ctr vmov.32 ${ivec}[3],$tctr2 sub $len,$len,#3 // bias vorr $dat2,$ivec,$ivec ___ $code.=<<___ if ($flavour =~ /64/); cmp $len,#32 b.lo .Loop3x_ctr32 add w13,$ctr,#1 add w14,$ctr,#2 vorr $dat3,$dat0,$dat0 rev w13,w13 vorr $dat4,$dat0,$dat0 rev w14,w14 vmov.32 ${dat3}[3],w13 sub $len,$len,#2 // bias vmov.32 ${dat4}[3],w14 add $ctr,$ctr,#2 b .Loop5x_ctr32 .align 4 .Loop5x_ctr32: aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 aese $dat3,q8 aesmc $dat3,$dat3 aese $dat4,q8 aesmc $dat4,$dat4 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 aese $dat3,q9 aesmc $dat3,$dat3 aese $dat4,q9 aesmc $dat4,$dat4 vld1.32 {q9},[$key_],#16 b.gt .Loop5x_ctr32 mov $key_,$key aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 aese $dat3,q8 aesmc $dat3,$dat3 aese $dat4,q8 aesmc $dat4,$dat4 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 aese $dat3,q9 aesmc $dat3,$dat3 aese $dat4,q9 aesmc $dat4,$dat4 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] aese $dat0,q12 aesmc $dat0,$dat0 add $tctr0,$ctr,#1 add $tctr1,$ctr,#2 aese $dat1,q12 aesmc $dat1,$dat1 add $tctr2,$ctr,#3 add w13,$ctr,#4 aese $dat2,q12 aesmc $dat2,$dat2 add w14,$ctr,#5 rev $tctr0,$tctr0 aese $dat3,q12 aesmc $dat3,$dat3 rev $tctr1,$tctr1 rev $tctr2,$tctr2 aese $dat4,q12 aesmc $dat4,$dat4 rev w13,w13 rev w14,w14 aese $dat0,q13 aesmc $dat0,$dat0 aese $dat1,q13 aesmc $dat1,$dat1 aese $dat2,q13 aesmc $dat2,$dat2 aese $dat3,q13 aesmc $dat3,$dat3 aese $dat4,q13 aesmc $dat4,$dat4 aese $dat0,q14 aesmc $dat0,$dat0 vld1.8 {$in0},[$inp],#16 aese $dat1,q14 aesmc $dat1,$dat1 vld1.8 {$in1},[$inp],#16 aese $dat2,q14 aesmc $dat2,$dat2 vld1.8 {$in2},[$inp],#16 aese $dat3,q14 aesmc $dat3,$dat3 vld1.8 {$in3},[$inp],#16 aese $dat4,q14 aesmc $dat4,$dat4 vld1.8 {$in4},[$inp],#16 aese $dat0,q15 veor $in0,$in0,$rndlast aese $dat1,q15 veor $in1,$in1,$rndlast aese $dat2,q15 veor $in2,$in2,$rndlast aese $dat3,q15 veor $in3,$in3,$rndlast aese $dat4,q15 veor $in4,$in4,$rndlast veor $in0,$in0,$dat0 vorr $dat0,$ivec,$ivec veor $in1,$in1,$dat1 vorr $dat1,$ivec,$ivec veor $in2,$in2,$dat2 vorr $dat2,$ivec,$ivec veor $in3,$in3,$dat3 vorr $dat3,$ivec,$ivec veor $in4,$in4,$dat4 vorr $dat4,$ivec,$ivec vst1.8 {$in0},[$out],#16 vmov.32 ${dat0}[3],$tctr0 vst1.8 {$in1},[$out],#16 vmov.32 ${dat1}[3],$tctr1 vst1.8 {$in2},[$out],#16 vmov.32 ${dat2}[3],$tctr2 vst1.8 {$in3},[$out],#16 vmov.32 ${dat3}[3],w13 vst1.8 {$in4},[$out],#16 vmov.32 ${dat4}[3],w14 mov $cnt,$rounds cbz $len,.Lctr32_done add $ctr,$ctr,#5 subs $len,$len,#5 b.hs .Loop5x_ctr32 
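	// Fewer than five blocks remain: undo the 5x bias on the length and
	// the counter, then fall through to the 3x loop or the 1-2 block
	// tail below.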
add $len,$len,#5 sub $ctr,$ctr,#5 cmp $len,#2 mov $step,#16 cclr $step,lo b.ls .Lctr32_tail sub $len,$len,#3 // bias add $ctr,$ctr,#3 ___ $code.=<<___; b .Loop3x_ctr32 .align 4 .Loop3x_ctr32: aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 vld1.32 {q9},[$key_],#16 b.gt .Loop3x_ctr32 aese $dat0,q8 aesmc $tmp0,$dat0 aese $dat1,q8 aesmc $tmp1,$dat1 vld1.8 {$in0},[$inp],#16 ___ $code.=<<___ if ($flavour =~ /64/); vorr $dat0,$ivec,$ivec ___ $code.=<<___ if ($flavour !~ /64/); add $tctr0,$ctr,#1 ___ $code.=<<___; aese $dat2,q8 aesmc $dat2,$dat2 vld1.8 {$in1},[$inp],#16 ___ $code.=<<___ if ($flavour =~ /64/); vorr $dat1,$ivec,$ivec ___ $code.=<<___ if ($flavour !~ /64/); rev $tctr0,$tctr0 ___ $code.=<<___; aese $tmp0,q9 aesmc $tmp0,$tmp0 aese $tmp1,q9 aesmc $tmp1,$tmp1 vld1.8 {$in2},[$inp],#16 mov $key_,$key aese $dat2,q9 aesmc $tmp2,$dat2 ___ $code.=<<___ if ($flavour =~ /64/); vorr $dat2,$ivec,$ivec add $tctr0,$ctr,#1 ___ $code.=<<___; aese $tmp0,q12 aesmc $tmp0,$tmp0 aese $tmp1,q12 aesmc $tmp1,$tmp1 veor $in0,$in0,$rndlast add $tctr1,$ctr,#2 aese $tmp2,q12 aesmc $tmp2,$tmp2 veor $in1,$in1,$rndlast add $ctr,$ctr,#3 aese $tmp0,q13 aesmc $tmp0,$tmp0 aese $tmp1,q13 aesmc $tmp1,$tmp1 veor $in2,$in2,$rndlast ___ $code.=<<___ if ($flavour =~ /64/); rev $tctr0,$tctr0 aese $tmp2,q13 aesmc $tmp2,$tmp2 vmov.32 ${dat0}[3], $tctr0 ___ $code.=<<___ if ($flavour !~ /64/); vmov.32 ${ivec}[3], $tctr0 aese $tmp2,q13 aesmc $tmp2,$tmp2 vorr $dat0,$ivec,$ivec ___ $code.=<<___; rev $tctr1,$tctr1 aese $tmp0,q14 aesmc $tmp0,$tmp0 ___ $code.=<<___ if ($flavour !~ /64/); vmov.32 ${ivec}[3], $tctr1 rev $tctr2,$ctr ___ $code.=<<___; aese $tmp1,q14 aesmc $tmp1,$tmp1 ___ $code.=<<___ if ($flavour =~ /64/); vmov.32 ${dat1}[3], $tctr1 rev $tctr2,$ctr aese $tmp2,q14 aesmc $tmp2,$tmp2 vmov.32 ${dat2}[3], $tctr2 ___ $code.=<<___ if ($flavour !~ /64/); vorr $dat1,$ivec,$ivec vmov.32 ${ivec}[3], $tctr2 aese $tmp2,q14 aesmc $tmp2,$tmp2 vorr $dat2,$ivec,$ivec ___ $code.=<<___; subs $len,$len,#3 aese $tmp0,q15 aese $tmp1,q15 aese $tmp2,q15 veor $in0,$in0,$tmp0 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] vst1.8 {$in0},[$out],#16 veor $in1,$in1,$tmp1 mov $cnt,$rounds vst1.8 {$in1},[$out],#16 veor $in2,$in2,$tmp2 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$in2},[$out],#16 b.hs .Loop3x_ctr32 adds $len,$len,#3 b.eq .Lctr32_done cmp $len,#1 mov $step,#16 cclr $step,eq .Lctr32_tail: aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 vld1.32 {q8},[$key_],#16 subs $cnt,$cnt,#2 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 vld1.32 {q9},[$key_],#16 b.gt .Lctr32_tail aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 vld1.8 {$in0},[$inp],$step aese $dat0,q12 aesmc $dat0,$dat0 aese $dat1,q12 aesmc $dat1,$dat1 vld1.8 {$in1},[$inp] aese $dat0,q13 aesmc $dat0,$dat0 aese $dat1,q13 aesmc $dat1,$dat1 veor $in0,$in0,$rndlast aese $dat0,q14 aesmc $dat0,$dat0 aese $dat1,q14 aesmc $dat1,$dat1 veor $in1,$in1,$rndlast aese $dat0,q15 aese $dat1,q15 cmp $len,#1 veor $in0,$in0,$dat0 veor $in1,$in1,$dat1 vst1.8 {$in0},[$out],#16 b.eq .Lctr32_done vst1.8 {$in1},[$out] .Lctr32_done: ___ $code.=<<___ if ($flavour !~ /64/); vldmia sp!,{d8-d15} ldmia sp!,{r4-r10,pc} ___ $code.=<<___ if ($flavour =~ /64/); ldr x29,[sp],#16 ret ___ $code.=<<___; .size 
${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks ___ }}} # Performance in cycles per byte. # Processed with AES-XTS different key size. # It shows the value before and after optimization as below: # (before/after): # # AES-128-XTS AES-256-XTS # Cortex-A57 3.36/1.09 4.02/1.37 # Cortex-A72 3.03/1.02 3.28/1.33 # Optimization is implemented by loop unrolling and interleaving. # Commonly, we choose the unrolling factor as 5, if the input # data size smaller than 5 blocks, but not smaller than 3 blocks, # choose 3 as the unrolling factor. # If the input data size dsize >= 5*16 bytes, then take 5 blocks # as one iteration, every loop the left size lsize -= 5*16. # If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes # will be processed specially, which be integrated into the 5*16 bytes # loop to improve the efficiency. # There is one special case, if the original input data size dsize # = 16 bytes, we will treat it seperately to improve the # performance: one independent code block without LR, FP load and # store. # Encryption will process the (length -tailcnt) bytes as mentioned # previously, then encrypt the composite block as last second # cipher block. # Decryption will process the (length -tailcnt -1) bytes as mentioned # previously, then decrypt the last second cipher block to get the # last plain block(tail), decrypt the composite block as last second # plain text block. {{{ my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b"); my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); my ($tmpin)=("v26.16b"); my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); # q7 last round key # q10-q15, q7 Last 7 round keys # q8-q9 preloaded round keys except last 7 keys for big size # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); my ($dat3,$in3,$tmp3); # used only in 64-bit mode my ($dat4,$in4,$tmp4); if ($flavour =~ /64/) { ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); } $code.=<<___ if ($flavour =~ /64/); .globl ${prefix}_xts_encrypt .type ${prefix}_xts_encrypt,%function .align 5 ${prefix}_xts_encrypt: ___ $code.=<<___ if ($flavour =~ /64/); + AARCH64_VALID_CALL_TARGET cmp $len,#16 // Original input data size bigger than 16, jump to big size processing. b.ne .Lxts_enc_big_size // Encrypt the iv with key2, as the first XEX iv. ldr $rounds,[$key2,#240] vld1.32 {$dat},[$key2],#16 vld1.8 {$iv0},[$ivp] sub $rounds,$rounds,#2 vld1.32 {$dat1},[$key2],#16 .Loop_enc_iv_enc: aese $iv0,$dat aesmc $iv0,$iv0 vld1.32 {$dat},[$key2],#16 subs $rounds,$rounds,#2 aese $iv0,$dat1 aesmc $iv0,$iv0 vld1.32 {$dat1},[$key2],#16 b.gt .Loop_enc_iv_enc aese $iv0,$dat aesmc $iv0,$iv0 vld1.32 {$dat},[$key2] aese $iv0,$dat1 veor $iv0,$iv0,$dat vld1.8 {$dat0},[$inp] veor $dat0,$iv0,$dat0 ldr $rounds,[$key1,#240] vld1.32 {q20-q21},[$key1],#32 // load key schedule... aese $dat0,q20 aesmc $dat0,$dat0 vld1.32 {q8-q9},[$key1],#32 // load key schedule... 
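	// Single-block XTS (plain XEX, no stealing): the plaintext has
	// already been xored with the encrypted tweak; it is run through
	// key1 below and the tweak is xored back in after the last round.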
aese $dat0,q21 aesmc $dat0,$dat0 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing b.eq .Lxts_128_enc .Lxts_enc_round_loop: aese $dat0,q8 aesmc $dat0,$dat0 vld1.32 {q8},[$key1],#16 // load key schedule... aese $dat0,q9 aesmc $dat0,$dat0 vld1.32 {q9},[$key1],#16 // load key schedule... subs $rounds,$rounds,#2 // bias b.gt .Lxts_enc_round_loop .Lxts_128_enc: vld1.32 {q10-q11},[$key1],#32 // load key schedule... aese $dat0,q8 aesmc $dat0,$dat0 aese $dat0,q9 aesmc $dat0,$dat0 vld1.32 {q12-q13},[$key1],#32 // load key schedule... aese $dat0,q10 aesmc $dat0,$dat0 aese $dat0,q11 aesmc $dat0,$dat0 vld1.32 {q14-q15},[$key1],#32 // load key schedule... aese $dat0,q12 aesmc $dat0,$dat0 aese $dat0,q13 aesmc $dat0,$dat0 vld1.32 {$rndlast},[$key1] aese $dat0,q14 aesmc $dat0,$dat0 aese $dat0,q15 veor $dat0,$dat0,$rndlast veor $dat0,$dat0,$iv0 vst1.8 {$dat0},[$out] b .Lxts_enc_final_abort .align 4 .Lxts_enc_big_size: ___ $code.=<<___ if ($flavour =~ /64/); stp $constnumx,$tmpinp,[sp,#-64]! stp $tailcnt,$midnumx,[sp,#48] stp $ivd10,$ivd20,[sp,#32] stp $ivd30,$ivd40,[sp,#16] // tailcnt store the tail value of length%16. and $tailcnt,$len,#0xf and $len,$len,#-16 subs $len,$len,#16 mov $step,#16 b.lo .Lxts_abort csel $step,xzr,$step,eq // Firstly, encrypt the iv with key2, as the first iv of XEX. ldr $rounds,[$key2,#240] vld1.32 {$dat},[$key2],#16 vld1.8 {$iv0},[$ivp] sub $rounds,$rounds,#2 vld1.32 {$dat1},[$key2],#16 .Loop_iv_enc: aese $iv0,$dat aesmc $iv0,$iv0 vld1.32 {$dat},[$key2],#16 subs $rounds,$rounds,#2 aese $iv0,$dat1 aesmc $iv0,$iv0 vld1.32 {$dat1},[$key2],#16 b.gt .Loop_iv_enc aese $iv0,$dat aesmc $iv0,$iv0 vld1.32 {$dat},[$key2] aese $iv0,$dat1 veor $iv0,$iv0,$dat // The iv for second block // $ivl- iv(low), $ivh - iv(high) // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 fmov $ivl,$ivd00 fmov $ivh,$ivd01 mov $constnum,#0x87 extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr#31 eor $ivl,$tmpmx,$ivl,lsl#1 fmov $ivd10,$ivl fmov $ivd11,$ivh ldr $rounds0,[$key1,#240] // next starting point vld1.8 {$dat},[$inp],$step vld1.32 {q8-q9},[$key1] // load key schedule... 
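	// Throughout this function the extr/and/eor/fmov pattern advances
	// the XTS tweak: the 128-bit value held in x9 (low) and x10 (high)
	// is shifted left by one bit and, when the top bit falls out, the
	// low word is reduced with the constant 0x87, i.e. multiplication
	// by x in GF(2^128).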
sub $rounds0,$rounds0,#6 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys sub $rounds0,$rounds0,#2 vld1.32 {q10-q11},[$key_],#32 vld1.32 {q12-q13},[$key_],#32 vld1.32 {q14-q15},[$key_],#32 vld1.32 {$rndlast},[$key_] add $key_,$key1,#32 mov $rounds,$rounds0 // Encryption .Lxts_enc: vld1.8 {$dat2},[$inp],#16 subs $len,$len,#32 // bias add $rounds,$rounds0,#2 vorr $in1,$dat,$dat vorr $dat1,$dat,$dat vorr $in3,$dat,$dat vorr $in2,$dat2,$dat2 vorr $in4,$dat2,$dat2 b.lo .Lxts_inner_enc_tail veor $dat,$dat,$iv0 // before encryption, xor with iv veor $dat2,$dat2,$iv1 // The iv for third block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr#31 eor $ivl,$tmpmx,$ivl,lsl#1 fmov $ivd20,$ivl fmov $ivd21,$ivh vorr $dat1,$dat2,$dat2 vld1.8 {$dat2},[$inp],#16 vorr $in0,$dat,$dat vorr $in1,$dat1,$dat1 veor $in2,$dat2,$iv2 // the third block veor $dat2,$dat2,$iv2 cmp $len,#32 b.lo .Lxts_outer_enc_tail // The iv for fourth block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr#31 eor $ivl,$tmpmx,$ivl,lsl#1 fmov $ivd30,$ivl fmov $ivd31,$ivh vld1.8 {$dat3},[$inp],#16 // The iv for fifth block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr#31 eor $ivl,$tmpmx,$ivl,lsl#1 fmov $ivd40,$ivl fmov $ivd41,$ivh vld1.8 {$dat4},[$inp],#16 veor $dat3,$dat3,$iv3 // the fourth block veor $dat4,$dat4,$iv4 sub $len,$len,#32 // bias mov $rounds,$rounds0 b .Loop5x_xts_enc .align 4 .Loop5x_xts_enc: aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 aese $dat3,q8 aesmc $dat3,$dat3 aese $dat4,q8 aesmc $dat4,$dat4 vld1.32 {q8},[$key_],#16 subs $rounds,$rounds,#2 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 aese $dat3,q9 aesmc $dat3,$dat3 aese $dat4,q9 aesmc $dat4,$dat4 vld1.32 {q9},[$key_],#16 b.gt .Loop5x_xts_enc aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 aese $dat3,q8 aesmc $dat3,$dat3 aese $dat4,q8 aesmc $dat4,$dat4 subs $len,$len,#0x50 // because .Lxts_enc_tail4x aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 aese $dat3,q9 aesmc $dat3,$dat3 aese $dat4,q9 aesmc $dat4,$dat4 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo mov $key_,$key1 aese $dat0,q10 aesmc $dat0,$dat0 aese $dat1,q10 aesmc $dat1,$dat1 aese $dat2,q10 aesmc $dat2,$dat2 aese $dat3,q10 aesmc $dat3,$dat3 aese $dat4,q10 aesmc $dat4,$dat4 add $inp,$inp,$xoffset // x0 is adjusted in such way that // at exit from the loop v1.16b-v26.16b // are loaded with last "words" add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x aese $dat0,q11 aesmc $dat0,$dat0 aese $dat1,q11 aesmc $dat1,$dat1 aese $dat2,q11 aesmc $dat2,$dat2 aese $dat3,q11 aesmc $dat3,$dat3 aese $dat4,q11 aesmc $dat4,$dat4 aese $dat0,q12 aesmc $dat0,$dat0 aese $dat1,q12 aesmc $dat1,$dat1 aese $dat2,q12 aesmc $dat2,$dat2 aese $dat3,q12 aesmc $dat3,$dat3 aese $dat4,q12 aesmc $dat4,$dat4 aese $dat0,q13 aesmc $dat0,$dat0 aese $dat1,q13 aesmc $dat1,$dat1 aese $dat2,q13 aesmc $dat2,$dat2 aese $dat3,q13 aesmc $dat3,$dat3 aese $dat4,q13 aesmc $dat4,$dat4 aese $dat0,q14 aesmc $dat0,$dat0 aese $dat1,q14 aesmc $dat1,$dat1 aese $dat2,q14 aesmc $dat2,$dat2 aese $dat3,q14 aesmc $dat3,$dat3 aese $dat4,q14 aesmc $dat4,$dat4 veor $tmp0,$rndlast,$iv0 aese $dat0,q15 // The iv for first block of one iteration extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr#31 eor 
$ivl,$tmpmx,$ivl,lsl#1 fmov $ivd00,$ivl fmov $ivd01,$ivh veor $tmp1,$rndlast,$iv1 vld1.8 {$in0},[$inp],#16 aese $dat1,q15 // The iv for second block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr#31 eor $ivl,$tmpmx,$ivl,lsl#1 fmov $ivd10,$ivl fmov $ivd11,$ivh veor $tmp2,$rndlast,$iv2 vld1.8 {$in1},[$inp],#16 aese $dat2,q15 // The iv for third block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr#31 eor $ivl,$tmpmx,$ivl,lsl#1 fmov $ivd20,$ivl fmov $ivd21,$ivh veor $tmp3,$rndlast,$iv3 vld1.8 {$in2},[$inp],#16 aese $dat3,q15 // The iv for fourth block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr#31 eor $ivl,$tmpmx,$ivl,lsl#1 fmov $ivd30,$ivl fmov $ivd31,$ivh veor $tmp4,$rndlast,$iv4 vld1.8 {$in3},[$inp],#16 aese $dat4,q15 // The iv for fifth block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd40,$ivl fmov $ivd41,$ivh vld1.8 {$in4},[$inp],#16 cbz $xoffset,.Lxts_enc_tail4x vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] veor $tmp0,$tmp0,$dat0 veor $dat0,$in0,$iv0 veor $tmp1,$tmp1,$dat1 veor $dat1,$in1,$iv1 veor $tmp2,$tmp2,$dat2 veor $dat2,$in2,$iv2 veor $tmp3,$tmp3,$dat3 veor $dat3,$in3,$iv3 veor $tmp4,$tmp4,$dat4 vst1.8 {$tmp0},[$out],#16 veor $dat4,$in4,$iv4 vst1.8 {$tmp1},[$out],#16 mov $rounds,$rounds0 vst1.8 {$tmp2},[$out],#16 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$tmp3},[$out],#16 vst1.8 {$tmp4},[$out],#16 b.hs .Loop5x_xts_enc // If left 4 blocks, borrow the five block's processing. cmn $len,#0x10 b.ne .Loop5x_enc_after vorr $iv4,$iv3,$iv3 vorr $iv3,$iv2,$iv2 vorr $iv2,$iv1,$iv1 vorr $iv1,$iv0,$iv0 fmov $ivl,$ivd40 fmov $ivh,$ivd41 veor $dat0,$iv0,$in0 veor $dat1,$iv1,$in1 veor $dat2,$in2,$iv2 veor $dat3,$in3,$iv3 veor $dat4,$in4,$iv4 b.eq .Loop5x_xts_enc .Loop5x_enc_after: add $len,$len,#0x50 cbz $len,.Lxts_enc_done add $rounds,$rounds0,#2 subs $len,$len,#0x30 b.lo .Lxts_inner_enc_tail veor $dat0,$iv0,$in2 veor $dat1,$iv1,$in3 veor $dat2,$in4,$iv2 b .Lxts_outer_enc_tail .align 4 .Lxts_enc_tail4x: add $inp,$inp,#16 veor $tmp1,$dat1,$tmp1 vst1.8 {$tmp1},[$out],#16 veor $tmp2,$dat2,$tmp2 vst1.8 {$tmp2},[$out],#16 veor $tmp3,$dat3,$tmp3 veor $tmp4,$dat4,$tmp4 vst1.8 {$tmp3-$tmp4},[$out],#32 b .Lxts_enc_done .align 4 .Lxts_outer_enc_tail: aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 vld1.32 {q8},[$key_],#16 subs $rounds,$rounds,#2 aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 vld1.32 {q9},[$key_],#16 b.gt .Lxts_outer_enc_tail aese $dat0,q8 aesmc $dat0,$dat0 aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 veor $tmp0,$iv0,$rndlast subs $len,$len,#0x30 // The iv for first block fmov $ivl,$ivd20 fmov $ivh,$ivd21 //mov $constnum,#0x87 extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr#31 eor $ivl,$tmpmx,$ivl,lsl#1 fmov $ivd00,$ivl fmov $ivd01,$ivh veor $tmp1,$iv1,$rndlast csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point aese $dat0,q9 aesmc $dat0,$dat0 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 veor $tmp2,$iv2,$rndlast add $xoffset,$xoffset,#0x20 add $inp,$inp,$xoffset mov $key_,$key1 aese $dat0,q12 aesmc $dat0,$dat0 aese $dat1,q12 aesmc $dat1,$dat1 aese $dat2,q12 aesmc $dat2,$dat2 aese $dat0,q13 aesmc $dat0,$dat0 aese $dat1,q13 aesmc $dat1,$dat1 aese $dat2,q13 aesmc $dat2,$dat2 aese $dat0,q14 aesmc 
$dat0,$dat0 aese $dat1,q14 aesmc $dat1,$dat1 aese $dat2,q14 aesmc $dat2,$dat2 aese $dat0,q15 aese $dat1,q15 aese $dat2,q15 vld1.8 {$in2},[$inp],#16 add $rounds,$rounds0,#2 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] veor $tmp0,$tmp0,$dat0 veor $tmp1,$tmp1,$dat1 veor $dat2,$dat2,$tmp2 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$tmp0},[$out],#16 vst1.8 {$tmp1},[$out],#16 vst1.8 {$dat2},[$out],#16 cmn $len,#0x30 b.eq .Lxts_enc_done .Lxts_encxor_one: vorr $in3,$in1,$in1 vorr $in4,$in2,$in2 nop .Lxts_inner_enc_tail: cmn $len,#0x10 veor $dat1,$in3,$iv0 veor $dat2,$in4,$iv1 b.eq .Lxts_enc_tail_loop veor $dat2,$in4,$iv0 .Lxts_enc_tail_loop: aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 vld1.32 {q8},[$key_],#16 subs $rounds,$rounds,#2 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 vld1.32 {q9},[$key_],#16 b.gt .Lxts_enc_tail_loop aese $dat1,q8 aesmc $dat1,$dat1 aese $dat2,q8 aesmc $dat2,$dat2 aese $dat1,q9 aesmc $dat1,$dat1 aese $dat2,q9 aesmc $dat2,$dat2 aese $dat1,q12 aesmc $dat1,$dat1 aese $dat2,q12 aesmc $dat2,$dat2 cmn $len,#0x20 aese $dat1,q13 aesmc $dat1,$dat1 aese $dat2,q13 aesmc $dat2,$dat2 veor $tmp1,$iv0,$rndlast aese $dat1,q14 aesmc $dat1,$dat1 aese $dat2,q14 aesmc $dat2,$dat2 veor $tmp2,$iv1,$rndlast aese $dat1,q15 aese $dat2,q15 b.eq .Lxts_enc_one veor $tmp1,$tmp1,$dat1 vst1.8 {$tmp1},[$out],#16 veor $tmp2,$tmp2,$dat2 vorr $iv0,$iv1,$iv1 vst1.8 {$tmp2},[$out],#16 fmov $ivl,$ivd10 fmov $ivh,$ivd11 mov $constnum,#0x87 extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd00,$ivl fmov $ivd01,$ivh b .Lxts_enc_done .Lxts_enc_one: veor $tmp1,$tmp1,$dat2 vorr $iv0,$iv0,$iv0 vst1.8 {$tmp1},[$out],#16 fmov $ivl,$ivd00 fmov $ivh,$ivd01 mov $constnum,#0x87 extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd00,$ivl fmov $ivd01,$ivh b .Lxts_enc_done .align 5 .Lxts_enc_done: // Process the tail block with cipher stealing. tst $tailcnt,#0xf b.eq .Lxts_abort mov $tmpinp,$inp mov $tmpoutp,$out sub $out,$out,#16 .composite_enc_loop: subs $tailcnt,$tailcnt,#1 ldrb $l2outp,[$out,$tailcnt] ldrb $loutp,[$tmpinp,$tailcnt] strb $l2outp,[$tmpoutp,$tailcnt] strb $loutp,[$out,$tailcnt] b.gt .composite_enc_loop .Lxts_enc_load_done: vld1.8 {$tmpin},[$out] veor $tmpin,$tmpin,$iv0 // Encrypt the composite block to get the last second encrypted text block ldr $rounds,[$key1,#240] // load key schedule... vld1.32 {$dat},[$key1],#16 sub $rounds,$rounds,#2 vld1.32 {$dat1},[$key1],#16 // load key schedule... 
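The extr/and/eor/lsl group that recurs above under the "The iv for ... block" comments is the XTS tweak update: the 128-bit tweak, kept as two 64-bit halves in $ivl/$ivh, is multiplied by x in GF(2^128) and reduced with the polynomial whose low byte is 0x87 (the constant loaded into $constnum). A minimal C sketch of the same step, assuming the tweak is held as two uint64_t halves exactly as the assembly does (the function name is illustrative, not part of the module):

    #include <stdint.h>

    /* Multiply the 128-bit XTS tweak by x in GF(2^128), reducing modulo
     * x^128 + x^7 + x^2 + x + 1 (low byte 0x87). */
    static void xts_tweak_times_x(uint64_t *lo, uint64_t *hi)
    {
        /* 0 or 0x87, depending on bit 127 of the tweak; this mirrors
         * "and $tmpmw,$constnum,$midnum,asr#31". */
        uint64_t carry = (uint64_t)((int64_t)*hi >> 63) & 0x87;

        *hi = (*hi << 1) | (*lo >> 63);  /* extr $ivh,$ivh,$ivl,#63     */
        *lo = (*lo << 1) ^ carry;        /* eor  $ivl,$tmpmx,$ivl,lsl#1 */
    }

Each of $iv0..$iv4 is produced by applying this step once per block, so the five-block loop consumes five consecutive tweaks per iteration.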
.Loop_final_enc: aese $tmpin,$dat0 aesmc $tmpin,$tmpin vld1.32 {$dat0},[$key1],#16 subs $rounds,$rounds,#2 aese $tmpin,$dat1 aesmc $tmpin,$tmpin vld1.32 {$dat1},[$key1],#16 b.gt .Loop_final_enc aese $tmpin,$dat0 aesmc $tmpin,$tmpin vld1.32 {$dat0},[$key1] aese $tmpin,$dat1 veor $tmpin,$tmpin,$dat0 veor $tmpin,$tmpin,$iv0 vst1.8 {$tmpin},[$out] .Lxts_abort: ldp $tailcnt,$midnumx,[sp,#48] ldp $ivd10,$ivd20,[sp,#32] ldp $ivd30,$ivd40,[sp,#16] ldp $constnumx,$tmpinp,[sp],#64 .Lxts_enc_final_abort: ret .size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt ___ }}} {{{ my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b"); my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); # q7 last round key # q10-q15, q7 Last 7 round keys # q8-q9 preloaded round keys except last 7 keys for big size # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte { my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); my ($dat3,$in3,$tmp3); # used only in 64-bit mode my ($dat4,$in4,$tmp4); if ($flavour =~ /64/) { ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); } $code.=<<___ if ($flavour =~ /64/); .globl ${prefix}_xts_decrypt .type ${prefix}_xts_decrypt,%function .align 5 ${prefix}_xts_decrypt: + AARCH64_VALID_CALL_TARGET ___ $code.=<<___ if ($flavour =~ /64/); cmp $len,#16 // Original input data size bigger than 16, jump to big size processing. b.ne .Lxts_dec_big_size // Encrypt the iv with key2, as the first XEX iv. ldr $rounds,[$key2,#240] vld1.32 {$dat},[$key2],#16 vld1.8 {$iv0},[$ivp] sub $rounds,$rounds,#2 vld1.32 {$dat1},[$key2],#16 .Loop_dec_small_iv_enc: aese $iv0,$dat aesmc $iv0,$iv0 vld1.32 {$dat},[$key2],#16 subs $rounds,$rounds,#2 aese $iv0,$dat1 aesmc $iv0,$iv0 vld1.32 {$dat1},[$key2],#16 b.gt .Loop_dec_small_iv_enc aese $iv0,$dat aesmc $iv0,$iv0 vld1.32 {$dat},[$key2] aese $iv0,$dat1 veor $iv0,$iv0,$dat vld1.8 {$dat0},[$inp] veor $dat0,$iv0,$dat0 ldr $rounds,[$key1,#240] vld1.32 {q20-q21},[$key1],#32 // load key schedule... aesd $dat0,q20 aesimc $dat0,$dat0 vld1.32 {q8-q9},[$key1],#32 // load key schedule... aesd $dat0,q21 aesimc $dat0,$dat0 subs $rounds,$rounds,#10 // bias b.eq .Lxts_128_dec .Lxts_dec_round_loop: aesd $dat0,q8 aesimc $dat0,$dat0 vld1.32 {q8},[$key1],#16 // load key schedule... aesd $dat0,q9 aesimc $dat0,$dat0 vld1.32 {q9},[$key1],#16 // load key schedule... subs $rounds,$rounds,#2 // bias b.gt .Lxts_dec_round_loop .Lxts_128_dec: vld1.32 {q10-q11},[$key1],#32 // load key schedule... aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat0,q9 aesimc $dat0,$dat0 vld1.32 {q12-q13},[$key1],#32 // load key schedule... aesd $dat0,q10 aesimc $dat0,$dat0 aesd $dat0,q11 aesimc $dat0,$dat0 vld1.32 {q14-q15},[$key1],#32 // load key schedule... 
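The 16-byte-only path here is plain XEX with no ciphertext stealing: the tweak is the IV encrypted under $key2, and the single data block is decrypted under $key1 between two XORs with that tweak. A rough C sketch of that flow, using hypothetical one-block helpers aes_encrypt_block()/aes_decrypt_block() that stand in for the inlined aese/aesd round loops (they are not functions of this module):

    #include <stdint.h>

    /* Hypothetical one-block primitives standing in for the inlined round
     * loops; each runs a full AES of 16 bytes with an expanded key. */
    void aes_encrypt_block(uint8_t out[16], const uint8_t in[16], const void *key);
    void aes_decrypt_block(uint8_t out[16], const uint8_t in[16], const void *key);

    /* XTS (XEX) decryption of a single 16-byte block, no stealing needed. */
    static void xts_decrypt_one(uint8_t out[16], const uint8_t in[16],
                                const void *key1, const void *key2,
                                const uint8_t iv[16])
    {
        uint8_t tweak[16], buf[16];
        aes_encrypt_block(tweak, iv, key2);     /* T = E_K2(IV)        */
        for (int i = 0; i < 16; i++)
            buf[i] = in[i] ^ tweak[i];          /* C ^ T               */
        aes_decrypt_block(buf, buf, key1);      /* D_K1(C ^ T)         */
        for (int i = 0; i < 16; i++)
            out[i] = buf[i] ^ tweak[i];         /* P = D_K1(C ^ T) ^ T */
    }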
aesd $dat0,q12 aesimc $dat0,$dat0 aesd $dat0,q13 aesimc $dat0,$dat0 vld1.32 {$rndlast},[$key1] aesd $dat0,q14 aesimc $dat0,$dat0 aesd $dat0,q15 veor $dat0,$dat0,$rndlast veor $dat0,$iv0,$dat0 vst1.8 {$dat0},[$out] b .Lxts_dec_final_abort .Lxts_dec_big_size: ___ $code.=<<___ if ($flavour =~ /64/); stp $constnumx,$tmpinp,[sp,#-64]! stp $tailcnt,$midnumx,[sp,#48] stp $ivd10,$ivd20,[sp,#32] stp $ivd30,$ivd40,[sp,#16] and $tailcnt,$len,#0xf and $len,$len,#-16 subs $len,$len,#16 mov $step,#16 b.lo .Lxts_dec_abort // Encrypt the iv with key2, as the first XEX iv ldr $rounds,[$key2,#240] vld1.32 {$dat},[$key2],#16 vld1.8 {$iv0},[$ivp] sub $rounds,$rounds,#2 vld1.32 {$dat1},[$key2],#16 .Loop_dec_iv_enc: aese $iv0,$dat aesmc $iv0,$iv0 vld1.32 {$dat},[$key2],#16 subs $rounds,$rounds,#2 aese $iv0,$dat1 aesmc $iv0,$iv0 vld1.32 {$dat1},[$key2],#16 b.gt .Loop_dec_iv_enc aese $iv0,$dat aesmc $iv0,$iv0 vld1.32 {$dat},[$key2] aese $iv0,$dat1 veor $iv0,$iv0,$dat // The iv for second block // $ivl- iv(low), $ivh - iv(high) // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 fmov $ivl,$ivd00 fmov $ivh,$ivd01 mov $constnum,#0x87 extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd10,$ivl fmov $ivd11,$ivh ldr $rounds0,[$key1,#240] // load rounds number // The iv for third block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd20,$ivl fmov $ivd21,$ivh vld1.32 {q8-q9},[$key1] // load key schedule... sub $rounds0,$rounds0,#6 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys sub $rounds0,$rounds0,#2 vld1.32 {q10-q11},[$key_],#32 // load key schedule... vld1.32 {q12-q13},[$key_],#32 vld1.32 {q14-q15},[$key_],#32 vld1.32 {$rndlast},[$key_] // The iv for fourth block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd30,$ivl fmov $ivd31,$ivh add $key_,$key1,#32 mov $rounds,$rounds0 b .Lxts_dec // Decryption .align 5 .Lxts_dec: tst $tailcnt,#0xf b.eq .Lxts_dec_begin subs $len,$len,#16 csel $step,xzr,$step,eq vld1.8 {$dat},[$inp],#16 b.lo .Lxts_done sub $inp,$inp,#16 .Lxts_dec_begin: vld1.8 {$dat},[$inp],$step subs $len,$len,#32 // bias add $rounds,$rounds0,#2 vorr $in1,$dat,$dat vorr $dat1,$dat,$dat vorr $in3,$dat,$dat vld1.8 {$dat2},[$inp],#16 vorr $in2,$dat2,$dat2 vorr $in4,$dat2,$dat2 b.lo .Lxts_inner_dec_tail veor $dat,$dat,$iv0 // before decryt, xor with iv veor $dat2,$dat2,$iv1 vorr $dat1,$dat2,$dat2 vld1.8 {$dat2},[$inp],#16 vorr $in0,$dat,$dat vorr $in1,$dat1,$dat1 veor $in2,$dat2,$iv2 // third block xox with third iv veor $dat2,$dat2,$iv2 cmp $len,#32 b.lo .Lxts_outer_dec_tail vld1.8 {$dat3},[$inp],#16 // The iv for fifth block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd40,$ivl fmov $ivd41,$ivh vld1.8 {$dat4},[$inp],#16 veor $dat3,$dat3,$iv3 // the fourth block veor $dat4,$dat4,$iv4 sub $len,$len,#32 // bias mov $rounds,$rounds0 b .Loop5x_xts_dec .align 4 .Loop5x_xts_dec: aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat3,q8 aesimc $dat3,$dat3 aesd $dat4,q8 aesimc $dat4,$dat4 vld1.32 {q8},[$key_],#16 // load key schedule... 
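The five-way interleaving in .Loop5x_xts_dec (and in .Loop5x_xts_enc earlier) keeps five blocks in flight so that the aesd/aesimc latency of one block is hidden behind work on the others; this is the 5x interleave factor discussed in the header comment. A compiler-level sketch of the same idea using the ACLE crypto intrinsics, applying one round key to five blocks per step (a simplified illustration of the scheduling, not the hand-tuned ordering used here; requires a compiler targeting AArch64 with the crypto extension):

    #include <arm_neon.h>

    /* One interleaved inner round for five blocks: AESD followed by
     * InvMixColumns with the same round key for each block. */
    static inline void aes_dec_round_x5(uint8x16_t blk[5], uint8x16_t rk)
    {
        for (int i = 0; i < 5; i++)
            blk[i] = vaesimcq_u8(vaesdq_u8(blk[i], rk));
    }

The final round in the real code omits the aesimc and instead XORs the last round key (folded together with the tweak into $tmp0..$tmp4).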
subs $rounds,$rounds,#2 aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat3,q9 aesimc $dat3,$dat3 aesd $dat4,q9 aesimc $dat4,$dat4 vld1.32 {q9},[$key_],#16 // load key schedule... b.gt .Loop5x_xts_dec aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat3,q8 aesimc $dat3,$dat3 aesd $dat4,q8 aesimc $dat4,$dat4 subs $len,$len,#0x50 // because .Lxts_dec_tail4x aesd $dat0,q9 aesimc $dat0,$dat aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat3,q9 aesimc $dat3,$dat3 aesd $dat4,q9 aesimc $dat4,$dat4 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo mov $key_,$key1 aesd $dat0,q10 aesimc $dat0,$dat0 aesd $dat1,q10 aesimc $dat1,$dat1 aesd $dat2,q10 aesimc $dat2,$dat2 aesd $dat3,q10 aesimc $dat3,$dat3 aesd $dat4,q10 aesimc $dat4,$dat4 add $inp,$inp,$xoffset // x0 is adjusted in such way that // at exit from the loop v1.16b-v26.16b // are loaded with last "words" add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x aesd $dat0,q11 aesimc $dat0,$dat0 aesd $dat1,q11 aesimc $dat1,$dat1 aesd $dat2,q11 aesimc $dat2,$dat2 aesd $dat3,q11 aesimc $dat3,$dat3 aesd $dat4,q11 aesimc $dat4,$dat4 aesd $dat0,q12 aesimc $dat0,$dat0 aesd $dat1,q12 aesimc $dat1,$dat1 aesd $dat2,q12 aesimc $dat2,$dat2 aesd $dat3,q12 aesimc $dat3,$dat3 aesd $dat4,q12 aesimc $dat4,$dat4 aesd $dat0,q13 aesimc $dat0,$dat0 aesd $dat1,q13 aesimc $dat1,$dat1 aesd $dat2,q13 aesimc $dat2,$dat2 aesd $dat3,q13 aesimc $dat3,$dat3 aesd $dat4,q13 aesimc $dat4,$dat4 aesd $dat0,q14 aesimc $dat0,$dat0 aesd $dat1,q14 aesimc $dat1,$dat1 aesd $dat2,q14 aesimc $dat2,$dat2 aesd $dat3,q14 aesimc $dat3,$dat3 aesd $dat4,q14 aesimc $dat4,$dat4 veor $tmp0,$rndlast,$iv0 aesd $dat0,q15 // The iv for first block of next iteration. 
extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd00,$ivl fmov $ivd01,$ivh veor $tmp1,$rndlast,$iv1 vld1.8 {$in0},[$inp],#16 aesd $dat1,q15 // The iv for second block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd10,$ivl fmov $ivd11,$ivh veor $tmp2,$rndlast,$iv2 vld1.8 {$in1},[$inp],#16 aesd $dat2,q15 // The iv for third block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd20,$ivl fmov $ivd21,$ivh veor $tmp3,$rndlast,$iv3 vld1.8 {$in2},[$inp],#16 aesd $dat3,q15 // The iv for fourth block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd30,$ivl fmov $ivd31,$ivh veor $tmp4,$rndlast,$iv4 vld1.8 {$in3},[$inp],#16 aesd $dat4,q15 // The iv for fifth block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd40,$ivl fmov $ivd41,$ivh vld1.8 {$in4},[$inp],#16 cbz $xoffset,.Lxts_dec_tail4x vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] veor $tmp0,$tmp0,$dat0 veor $dat0,$in0,$iv0 veor $tmp1,$tmp1,$dat1 veor $dat1,$in1,$iv1 veor $tmp2,$tmp2,$dat2 veor $dat2,$in2,$iv2 veor $tmp3,$tmp3,$dat3 veor $dat3,$in3,$iv3 veor $tmp4,$tmp4,$dat4 vst1.8 {$tmp0},[$out],#16 veor $dat4,$in4,$iv4 vst1.8 {$tmp1},[$out],#16 mov $rounds,$rounds0 vst1.8 {$tmp2},[$out],#16 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$tmp3},[$out],#16 vst1.8 {$tmp4},[$out],#16 b.hs .Loop5x_xts_dec cmn $len,#0x10 b.ne .Loop5x_dec_after // If x2($len) equal to -0x10, the left blocks is 4. // After specially processing, utilize the five blocks processing again. // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3. 
vorr $iv4,$iv3,$iv3 vorr $iv3,$iv2,$iv2 vorr $iv2,$iv1,$iv1 vorr $iv1,$iv0,$iv0 fmov $ivl,$ivd40 fmov $ivh,$ivd41 veor $dat0,$iv0,$in0 veor $dat1,$iv1,$in1 veor $dat2,$in2,$iv2 veor $dat3,$in3,$iv3 veor $dat4,$in4,$iv4 b.eq .Loop5x_xts_dec .Loop5x_dec_after: add $len,$len,#0x50 cbz $len,.Lxts_done add $rounds,$rounds0,#2 subs $len,$len,#0x30 b.lo .Lxts_inner_dec_tail veor $dat0,$iv0,$in2 veor $dat1,$iv1,$in3 veor $dat2,$in4,$iv2 b .Lxts_outer_dec_tail .align 4 .Lxts_dec_tail4x: add $inp,$inp,#16 tst $tailcnt,#0xf veor $tmp1,$dat1,$tmp0 vst1.8 {$tmp1},[$out],#16 veor $tmp2,$dat2,$tmp2 vst1.8 {$tmp2},[$out],#16 veor $tmp3,$dat3,$tmp3 veor $tmp4,$dat4,$tmp4 vst1.8 {$tmp3-$tmp4},[$out],#32 b.eq .Lxts_dec_abort vld1.8 {$dat0},[$inp],#16 b .Lxts_done .align 4 .Lxts_outer_dec_tail: aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 vld1.32 {q8},[$key_],#16 subs $rounds,$rounds,#2 aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 vld1.32 {q9},[$key_],#16 b.gt .Lxts_outer_dec_tail aesd $dat0,q8 aesimc $dat0,$dat0 aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 veor $tmp0,$iv0,$rndlast subs $len,$len,#0x30 // The iv for first block fmov $ivl,$ivd20 fmov $ivh,$ivd21 mov $constnum,#0x87 extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd00,$ivl fmov $ivd01,$ivh veor $tmp1,$iv1,$rndlast csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point aesd $dat0,q9 aesimc $dat0,$dat0 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 veor $tmp2,$iv2,$rndlast // The iv for second block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd10,$ivl fmov $ivd11,$ivh add $xoffset,$xoffset,#0x20 add $inp,$inp,$xoffset // $inp is adjusted to the last data mov $key_,$key1 // The iv for third block extr $midnumx,$ivh,$ivh,#32 extr $ivh,$ivh,$ivl,#63 and $tmpmw,$constnum,$midnum,asr #31 eor $ivl,$tmpmx,$ivl,lsl #1 fmov $ivd20,$ivl fmov $ivd21,$ivh aesd $dat0,q12 aesimc $dat0,$dat0 aesd $dat1,q12 aesimc $dat1,$dat1 aesd $dat2,q12 aesimc $dat2,$dat2 aesd $dat0,q13 aesimc $dat0,$dat0 aesd $dat1,q13 aesimc $dat1,$dat1 aesd $dat2,q13 aesimc $dat2,$dat2 aesd $dat0,q14 aesimc $dat0,$dat0 aesd $dat1,q14 aesimc $dat1,$dat1 aesd $dat2,q14 aesimc $dat2,$dat2 vld1.8 {$in2},[$inp],#16 aesd $dat0,q15 aesd $dat1,q15 aesd $dat2,q15 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] add $rounds,$rounds0,#2 veor $tmp0,$tmp0,$dat0 veor $tmp1,$tmp1,$dat1 veor $dat2,$dat2,$tmp2 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vst1.8 {$tmp0},[$out],#16 vst1.8 {$tmp1},[$out],#16 vst1.8 {$dat2},[$out],#16 cmn $len,#0x30 add $len,$len,#0x30 b.eq .Lxts_done sub $len,$len,#0x30 vorr $in3,$in1,$in1 vorr $in4,$in2,$in2 nop .Lxts_inner_dec_tail: // $len == -0x10 means two blocks left. 
cmn $len,#0x10 veor $dat1,$in3,$iv0 veor $dat2,$in4,$iv1 b.eq .Lxts_dec_tail_loop veor $dat2,$in4,$iv0 .Lxts_dec_tail_loop: aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 vld1.32 {q8},[$key_],#16 subs $rounds,$rounds,#2 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 vld1.32 {q9},[$key_],#16 b.gt .Lxts_dec_tail_loop aesd $dat1,q8 aesimc $dat1,$dat1 aesd $dat2,q8 aesimc $dat2,$dat2 aesd $dat1,q9 aesimc $dat1,$dat1 aesd $dat2,q9 aesimc $dat2,$dat2 aesd $dat1,q12 aesimc $dat1,$dat1 aesd $dat2,q12 aesimc $dat2,$dat2 cmn $len,#0x20 aesd $dat1,q13 aesimc $dat1,$dat1 aesd $dat2,q13 aesimc $dat2,$dat2 veor $tmp1,$iv0,$rndlast aesd $dat1,q14 aesimc $dat1,$dat1 aesd $dat2,q14 aesimc $dat2,$dat2 veor $tmp2,$iv1,$rndlast aesd $dat1,q15 aesd $dat2,q15 b.eq .Lxts_dec_one veor $tmp1,$tmp1,$dat1 veor $tmp2,$tmp2,$dat2 vorr $iv0,$iv2,$iv2 vorr $iv1,$iv3,$iv3 vst1.8 {$tmp1},[$out],#16 vst1.8 {$tmp2},[$out],#16 add $len,$len,#16 b .Lxts_done .Lxts_dec_one: veor $tmp1,$tmp1,$dat2 vorr $iv0,$iv1,$iv1 vorr $iv1,$iv2,$iv2 vst1.8 {$tmp1},[$out],#16 add $len,$len,#32 .Lxts_done: tst $tailcnt,#0xf b.eq .Lxts_dec_abort // Processing the last two blocks with cipher stealing. mov x7,x3 cbnz x2,.Lxts_dec_1st_done vld1.8 {$dat0},[$inp],#16 // Decrypt the last secod block to get the last plain text block .Lxts_dec_1st_done: eor $tmpin,$dat0,$iv1 ldr $rounds,[$key1,#240] vld1.32 {$dat0},[$key1],#16 sub $rounds,$rounds,#2 vld1.32 {$dat1},[$key1],#16 .Loop_final_2nd_dec: aesd $tmpin,$dat0 aesimc $tmpin,$tmpin vld1.32 {$dat0},[$key1],#16 // load key schedule... subs $rounds,$rounds,#2 aesd $tmpin,$dat1 aesimc $tmpin,$tmpin vld1.32 {$dat1},[$key1],#16 // load key schedule... b.gt .Loop_final_2nd_dec aesd $tmpin,$dat0 aesimc $tmpin,$tmpin vld1.32 {$dat0},[$key1] aesd $tmpin,$dat1 veor $tmpin,$tmpin,$dat0 veor $tmpin,$tmpin,$iv1 vst1.8 {$tmpin},[$out] mov $tmpinp,$inp add $tmpoutp,$out,#16 // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks // to get the last encrypted block. .composite_dec_loop: subs $tailcnt,$tailcnt,#1 ldrb $l2outp,[$out,$tailcnt] ldrb $loutp,[$tmpinp,$tailcnt] strb $l2outp,[$tmpoutp,$tailcnt] strb $loutp,[$out,$tailcnt] b.gt .composite_dec_loop .Lxts_dec_load_done: vld1.8 {$tmpin},[$out] veor $tmpin,$tmpin,$iv0 // Decrypt the composite block to get the last second plain text block ldr $rounds,[$key_,#240] vld1.32 {$dat},[$key_],#16 sub $rounds,$rounds,#2 vld1.32 {$dat1},[$key_],#16 .Loop_final_dec: aesd $tmpin,$dat0 aesimc $tmpin,$tmpin vld1.32 {$dat0},[$key_],#16 // load key schedule... subs $rounds,$rounds,#2 aesd $tmpin,$dat1 aesimc $tmpin,$tmpin vld1.32 {$dat1},[$key_],#16 // load key schedule... 
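The .composite_dec_loop above is the ciphertext-stealing swap: the first $tailcnt bytes of the block most recently written to the output become the short final output block, and the corresponding tail bytes of the input are spliced in to form the composite block that is then passed through the cipher once more with the earlier tweak. In C terms the swap amounts to the following (a sketch only; the names are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    /* Ciphertext-stealing byte swap for the final partial block.
     * last[]    : the 16-byte block just produced (second-to-last output)
     * in_tail[] : the remaining tail bytes of the input
     * out_tail[]: destination of the short final output block
     * Afterwards last[] holds the composite block that goes through the
     * cipher one more time. */
    static void cts_swap(uint8_t *last, const uint8_t *in_tail,
                         uint8_t *out_tail, size_t tail)
    {
        for (size_t i = 0; i < tail; i++) {
            out_tail[i] = last[i];     /* steal bytes for the short block  */
            last[i]     = in_tail[i];  /* splice input tail into composite */
        }
    }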
b.gt .Loop_final_dec aesd $tmpin,$dat0 aesimc $tmpin,$tmpin vld1.32 {$dat0},[$key_] aesd $tmpin,$dat1 veor $tmpin,$tmpin,$dat0 veor $tmpin,$tmpin,$iv0 vst1.8 {$tmpin},[$out] .Lxts_dec_abort: ldp $tailcnt,$midnumx,[sp,#48] ldp $ivd10,$ivd20,[sp,#32] ldp $ivd30,$ivd40,[sp,#16] ldp $constnumx,$tmpinp,[sp],#64 .Lxts_dec_final_abort: ret .size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt ___ } }}} $code.=<<___; #endif ___ ######################################## if ($flavour =~ /64/) { ######## 64-bit code my %opcode = ( "aesd" => 0x4e285800, "aese" => 0x4e284800, "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); local *unaes = sub { my ($mnemonic,$arg)=@_; $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && sprintf ".inst\t0x%08x\t//%s %s", $opcode{$mnemonic}|$1|($2<<5), $mnemonic,$arg; }; foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/geo; s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers s/@\s/\/\//o; # old->new style commentary #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or s/vmov\.i8/movi/o or # fix up legacy mnemonics s/vext\.8/ext/o or s/vrev32\.8/rev32/o or s/vtst\.8/cmtst/o or s/vshr/ushr/o or s/^(\s+)v/$1/o or # strip off v prefix s/\bbx\s+lr\b/ret/o; # fix up remaining legacy suffixes s/\.[ui]?8//o; m/\],#8/o and s/\.16b/\.8b/go; s/\.[ui]?32//o and s/\.16b/\.4s/go; s/\.[ui]?64//o and s/\.16b/\.2d/go; s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; # Switch preprocessor checks to aarch64 versions. s/__ARME([BL])__/__AARCH64E$1__/go; print $_,"\n"; } } else { ######## 32-bit code my %opcode = ( "aesd" => 0xf3b00340, "aese" => 0xf3b00300, "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); local *unaes = sub { my ($mnemonic,$arg)=@_; if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) |(($2&7)<<1) |(($2&8)<<2); # since ARMv7 instructions are always encoded little-endian. # correct solution is to use .inst directive, but older # assemblers don't implement it:-( sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", $word&0xff,($word>>8)&0xff, ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; } }; sub unvtbl { my $arg=shift; $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && sprintf "vtbl.8 d%d,{q%d},d%d\n\t". 
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; } sub unvdup32 { my $arg=shift; $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; } sub unvmov32 { my $arg=shift; $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; } foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/geo; s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers s/\/\/\s?/@ /o; # new->old style commentary # fix up remaining new-style suffixes s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or s/\],#[0-9]+/]!/o; s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or s/vtbl\.8\s+(.*)/unvtbl($1)/geo or s/vdup\.32\s+(.*)/unvdup32($1)/geo or s/vmov\.32\s+(.*)/unvmov32($1)/geo or s/^(\s+)b\./$1b/o or s/^(\s+)ret/$1bx\tlr/o; if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) { print " it $2\n"; } print $_,"\n"; } } close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/openssl/crypto/aes/asm/vpaes-armv8.pl b/crypto/openssl/crypto/aes/asm/vpaes-armv8.pl index dcd5065e68c0..49988e9c2b29 100755 --- a/crypto/openssl/crypto/aes/asm/vpaes-armv8.pl +++ b/crypto/openssl/crypto/aes/asm/vpaes-armv8.pl @@ -1,1281 +1,1284 @@ #! /usr/bin/env perl # Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html ###################################################################### ## Constant-time SSSE3 AES core implementation. ## version 0.1 ## ## By Mike Hamburg (Stanford University), 2009 ## Public domain. ## ## For details see http://shiftleft.org/papers/vector_aes/ and ## http://crypto.stanford.edu/vpaes/. ## ###################################################################### # ARMv8 NEON adaptation by # # Reason for undertaken effort is that there is at least one popular # SoC based on Cortex-A53 that doesn't have crypto extensions. # # CBC enc ECB enc/dec(*) [bit-sliced enc/dec] # Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ] # Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ] # X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ] # Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] # Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] # Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ] # ThunderX2(***) 39.4(**) 33.8/48.6(**) # # (*) ECB denotes approximate result for parallelizable modes # such as CBC decrypt, CTR, etc.; # (**) these results are worse than scalar compiler-generated # code, but it's constant-time and therefore preferred; # (***) presented for reference/comparison purposes; # $output is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!"; *STDOUT=*OUT; $code.=<<___; +#include "arm_arch.h" + .text .type _vpaes_consts,%object .align 7 // totally strategic alignment _vpaes_consts: .Lk_mc_forward: // mc_forward .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 .quad 0x080B0A0904070605, 0x000302010C0F0E0D .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 .quad 0x000302010C0F0E0D, 0x080B0A0904070605 .Lk_mc_backward:// mc_backward .quad 0x0605040702010003, 0x0E0D0C0F0A09080B .quad 0x020100030E0D0C0F, 0x0A09080B06050407 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 .quad 0x0A09080B06050407, 0x020100030E0D0C0F .Lk_sr: // sr .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 .quad 0x030E09040F0A0500, 0x0B06010C07020D08 .quad 0x0F060D040B020900, 0x070E050C030A0108 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 // // "Hot" constants // .Lk_inv: // inv, inva .quad 0x0E05060F0D080180, 0x040703090A0B0C02 .quad 0x01040A060F0B0780, 0x030D0E0C02050809 .Lk_ipt: // input transform (lo, hi) .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 .Lk_sbo: // sbou, sbot .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA .Lk_sb1: // sb1u, sb1t .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 .Lk_sb2: // sb2u, sb2t .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD // // Decryption stuff // .Lk_dipt: // decryption input transform .quad 0x0F505B040B545F00, 0x154A411E114E451A .quad 0x86E383E660056500, 0x12771772F491F194 .Lk_dsbo: // decryption sbox final output .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C .Lk_dsb9: // decryption sbox output *9*u, *9*t .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 .Lk_dsbd: // decryption sbox output *D*u, *D*t .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 .Lk_dsbb: // decryption sbox output *B*u, *B*t .quad 0xD022649296B44200, 0x602646F6B0F2D404 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B .Lk_dsbe: // decryption sbox output *E*u, *E*t .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 // // Key schedule constants // .Lk_dksd: // decryption key schedule: invskew x*D .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E .Lk_dksb: // decryption key schedule: invskew x*B .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 .Lk_dkse: // decryption key schedule: invskew x*E + 0x63 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 .Lk_dks9: // decryption key schedule: invskew x*9 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE .Lk_rcon: // rcon .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 .Lk_opt: // output transform .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 .Lk_deskew: // deskew tables: inverts the sbox's "skew" .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 .asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)" .size _vpaes_consts,.-_vpaes_consts .align 6 ___ { my ($inp,$out,$key) = 
map("x$_",(0..2)); my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23)); my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); $code.=<<___; // // _aes_preheat // // Fills register %r10 -> .aes_consts (so you can -fPIC) // and %xmm9-%xmm15 as specified below. // .type _vpaes_encrypt_preheat,%function .align 4 _vpaes_encrypt_preheat: adr x10, .Lk_inv movi v17.16b, #0x0f ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 ret .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat // // _aes_encrypt_core // // AES-encrypt %xmm0. // // Inputs: // %xmm0 = input // %xmm9-%xmm15 as in _vpaes_preheat // (%rdx) = scheduled keys // // Output in %xmm0 // Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax // Preserves %xmm6 - %xmm8 so you get some local vectors // // .type _vpaes_encrypt_core,%function .align 4 _vpaes_encrypt_core: mov x9, $key ldr w8, [$key,#240] // pull rounds adr x11, .Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 b .Lenc_entry .align 4 .Lenc_loop: // middle of middle round add x10, x11, #0x40 tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D sub w8, w8, #1 // nr-- .Lenc_entry: // top of round and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 cbnz w8, .Lenc_loop // middle of last round add x10, x11, #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 ret .size _vpaes_encrypt_core,.-_vpaes_encrypt_core .globl vpaes_encrypt .type vpaes_encrypt,%function .align 4 vpaes_encrypt: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! 
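These hunks replace the hard-coded PAC opcodes (.inst 0xd503233f / 0xd50323bf, i.e. paciasp/autiasp) with the arm_arch.h macros, so the same source signs and validates the return address when pointer authentication is enabled, emits a BTI landing pad when only branch protection is enabled, and emits nothing on toolchains without either extension. Roughly, the macros abstract definitions of this shape (a sketch for orientation only, not the actual arm_arch.h contents):

    /* Sketch only -- see crypto/arm_arch.h for the real definitions. */
    #if defined(__ARM_FEATURE_PAC_DEFAULT)
    #  define AARCH64_SIGN_LINK_REGISTER      hint #25  /* paciasp, 0xd503233f */
    #  define AARCH64_VALIDATE_LINK_REGISTER  hint #29  /* autiasp, 0xd50323bf */
    #elif defined(__ARM_FEATURE_BTI_DEFAULT)
    #  define AARCH64_SIGN_LINK_REGISTER      AARCH64_VALID_CALL_TARGET
    #  define AARCH64_VALIDATE_LINK_REGISTER
    #else
    #  define AARCH64_SIGN_LINK_REGISTER
    #  define AARCH64_VALIDATE_LINK_REGISTER
    #endif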
add x29,sp,#0 ld1 {v7.16b}, [$inp] bl _vpaes_encrypt_preheat bl _vpaes_encrypt_core st1 {v0.16b}, [$out] ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_encrypt,.-vpaes_encrypt .type _vpaes_encrypt_2x,%function .align 4 _vpaes_encrypt_2x: mov x9, $key ldr w8, [$key,#240] // pull rounds adr x11, .Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 and v9.16b, v15.16b, v17.16b ushr v8.16b, v15.16b, #4 tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 tbl v9.16b, {$iptlo}, v9.16b // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 tbl v10.16b, {$ipthi}, v8.16b eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 eor v8.16b, v9.16b, v16.16b eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 eor v8.16b, v8.16b, v10.16b b .Lenc_2x_entry .align 4 .Lenc_2x_loop: // middle of middle round add x10, x11, #0x40 tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u tbl v12.16b, {$sb1t}, v10.16b ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t tbl v8.16b, {$sb1u}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u tbl v13.16b, {$sb2t}, v10.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A eor v8.16b, v8.16b, v12.16b tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t tbl v10.16b, {$sb2u}, v11.16b ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B tbl v11.16b, {v8.16b}, v1.16b eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A eor v10.16b, v10.16b, v13.16b tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D tbl v8.16b, {v8.16b}, v4.16b eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B eor v11.16b, v11.16b, v10.16b tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C tbl v12.16b, {v11.16b},v1.16b eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D eor v8.16b, v8.16b, v11.16b and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D eor v8.16b, v8.16b, v12.16b sub w8, w8, #1 // nr-- .Lenc_2x_entry: // top of round and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i and v9.16b, v8.16b, v17.16b ushr v8.16b, v8.16b, #4 tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k tbl v13.16b, {$invhi},v9.16b eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j eor v9.16b, v9.16b, v8.16b tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i tbl v11.16b, {$invlo},v8.16b tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j tbl v12.16b, {$invlo},v9.16b eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k eor v11.16b, v11.16b, v13.16b eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k eor v12.16b, v12.16b, v13.16b tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak tbl v10.16b, {$invlo},v11.16b tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak tbl v11.16b, {$invlo},v12.16b eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v10.16b, v10.16b, v9.16b eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo eor v11.16b, v11.16b, v8.16b ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 cbnz w8, .Lenc_2x_loop // middle of last round add x10, x11, #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou tbl v12.16b, {$sbou}, v10.16b ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t tbl v8.16b, {$sbot}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A eor v8.16b, v8.16b, v12.16b tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 tbl v1.16b, {v8.16b},v1.16b ret .size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x .type _vpaes_decrypt_preheat,%function .align 4 _vpaes_decrypt_preheat: adr x10, .Lk_inv movi v17.16b, #0x0f adr x11, .Lk_dipt ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv ld1 {v20.2d-v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo ld1 {v24.2d-v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd ld1 {v28.2d-v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe ret .size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat // // Decryption core // // Same API as encryption core. 
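The encryption core above and the decryption core that follows both rely on the vector-permute trick from the original vpaes design: every state byte is split into its low and high nibble, and each S-box-like table becomes a pair of 16-entry tbl lookups whose results are XORed, so no data-dependent memory access ever occurs (which is what makes the code constant-time). The skeleton of one such lookup, written with AArch64 intrinsics for illustration (lo_tbl/hi_tbl stand in for one of the .Lk_* constant pairs; table contents are omitted):

    #include <arm_neon.h>

    /* Constant-time 16-lane table lookup as used throughout the cores:
     * split each byte into nibbles and combine two 16-entry tables. */
    static inline uint8x16_t nibble_lookup(uint8x16_t x,
                                           uint8x16_t lo_tbl,
                                           uint8x16_t hi_tbl)
    {
        uint8x16_t lo = vandq_u8(x, vdupq_n_u8(0x0f));   /* and  v1,v0,v17 */
        uint8x16_t hi = vshrq_n_u8(x, 4);                /* ushr v0,v0,#4  */
        return veorq_u8(vqtbl1q_u8(lo_tbl, lo),          /* tbl + tbl      */
                        vqtbl1q_u8(hi_tbl, hi));         /* ... then eor   */
    }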
// .type _vpaes_decrypt_core,%function .align 4 _vpaes_decrypt_core: mov x9, $key ldr w8, [$key,#240] // pull rounds // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 eor x11, x11, #0x30 // xor \$0x30, %r11 adr x10, .Lk_sr and x11, x11, #0x30 // and \$0x30, %r11 add x11, x11, x10 adr x10, .Lk_mc_forward+48 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 b .Ldec_entry .align 4 .Ldec_loop: // // Inverse mix columns // // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch sub w8, w8, #1 // sub \$1,%rax # nr-- .Ldec_entry: // top of round and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 cbnz w8, .Ldec_loop // 
middle of last round // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 ret .size _vpaes_decrypt_core,.-_vpaes_decrypt_core .globl vpaes_decrypt .type vpaes_decrypt,%function .align 4 vpaes_decrypt: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 ld1 {v7.16b}, [$inp] bl _vpaes_decrypt_preheat bl _vpaes_decrypt_core st1 {v0.16b}, [$out] ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_decrypt,.-vpaes_decrypt // v14-v15 input, v0-v1 output .type _vpaes_decrypt_2x,%function .align 4 _vpaes_decrypt_2x: mov x9, $key ldr w8, [$key,#240] // pull rounds // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 eor x11, x11, #0x30 // xor \$0x30, %r11 adr x10, .Lk_sr and x11, x11, #0x30 // and \$0x30, %r11 add x11, x11, x10 adr x10, .Lk_mc_forward+48 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 and v9.16b, v15.16b, v17.16b ushr v8.16b, v15.16b, #4 tbl v2.16b, {$iptlo},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 tbl v10.16b, {$iptlo},v9.16b ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi tbl v0.16b, {$ipthi},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 tbl v8.16b, {$ipthi},v8.16b eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 eor v10.16b, v10.16b, v16.16b eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 eor v8.16b, v8.16b, v10.16b b .Ldec_2x_entry .align 4 .Ldec_2x_loop: // // Inverse mix columns // // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u tbl v12.16b, {$sb9u}, v10.16b tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t tbl v9.16b, {$sb9t}, v11.16b eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 eor v8.16b, v12.16b, v16.16b // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu tbl v12.16b, {$sbdu}, v10.16b tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch tbl v8.16b, {v8.16b},v5.16b tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt tbl v9.16b, {$sbdt}, v11.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch eor v8.16b, v8.16b, v12.16b // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch eor v8.16b, v8.16b, v9.16b // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu tbl v12.16b, {$sbbu}, v10.16b tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch tbl v8.16b, {v8.16b},v5.16b tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt tbl v9.16b, {$sbbt}, v11.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 
= ch eor v8.16b, v8.16b, v12.16b // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch eor v8.16b, v8.16b, v9.16b // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu tbl v12.16b, {$sbeu}, v10.16b tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch tbl v8.16b, {v8.16b},v5.16b tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet tbl v9.16b, {$sbet}, v11.16b eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch eor v8.16b, v8.16b, v12.16b ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch eor v8.16b, v8.16b, v9.16b sub w8, w8, #1 // sub \$1,%rax # nr-- .Ldec_2x_entry: // top of round and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i and v9.16b, v8.16b, v17.16b ushr v8.16b, v8.16b, #4 tbl v2.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k tbl v10.16b, {$invhi},v9.16b eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j eor v9.16b, v9.16b, v8.16b tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i tbl v11.16b, {$invlo},v8.16b tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j tbl v12.16b, {$invlo},v9.16b eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k eor v11.16b, v11.16b, v10.16b eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k eor v12.16b, v12.16b, v10.16b tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak tbl v10.16b, {$invlo},v11.16b tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak tbl v11.16b, {$invlo},v12.16b eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v10.16b, v10.16b, v9.16b eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo eor v11.16b, v11.16b, v8.16b ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 cbnz w8, .Ldec_2x_loop // middle of last round // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou tbl v12.16b, {$sbou}, v10.16b // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t tbl v9.16b, {$sbot}, v11.16b ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A eor v8.16b, v9.16b, v12.16b tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 tbl v1.16b, {v8.16b},v2.16b ret .size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x ___ } { my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); $code.=<<___; //////////////////////////////////////////////////////// // // // AES key schedule // // // //////////////////////////////////////////////////////// .type _vpaes_key_preheat,%function .align 4 _vpaes_key_preheat: adr x10, .Lk_inv movi v16.16b, #0x5b // .Lk_s63 adr x11, .Lk_sb1 movi v17.16b, #0x0f // .Lk_s0F ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt adr x10, .Lk_dksd ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1 adr x11, .Lk_mc_forward ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 ld1 {v8.2d}, [x10] // .Lk_rcon ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] ret .size 
_vpaes_key_preheat,.-_vpaes_key_preheat .type _vpaes_schedule_core,%function .align 4 _vpaes_schedule_core: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp,#-16]! add x29,sp,#0 bl _vpaes_key_preheat // load the tables ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) // input transform mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 bl _vpaes_schedule_transform mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10 add x8, x8, x10 cbnz $dir, .Lschedule_am_decrypting // encrypting, output zeroth round key after transform st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) b .Lschedule_go .Lschedule_am_decrypting: // decrypting, output zeroth round key after shiftrows ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) eor x8, x8, #0x30 // xor \$0x30, %r8 .Lschedule_go: cmp $bits, #192 // cmp \$192, %esi b.hi .Lschedule_256 b.eq .Lschedule_192 // 128: fall though // // .schedule_128 // // 128-bit specific part of key schedule. // // This schedule is really simple, because all its parts // are accomplished by the subroutines. // .Lschedule_128: mov $inp, #10 // mov \$10, %esi .Loop_schedule_128: sub $inp, $inp, #1 // dec %esi bl _vpaes_schedule_round cbz $inp, .Lschedule_mangle_last bl _vpaes_schedule_mangle // write output b .Loop_schedule_128 // // .aes_schedule_192 // // 192-bit specific part of key schedule. // // The main body of this schedule is the same as the 128-bit // schedule, but with more smearing. The long, high side is // stored in %xmm7 as before, and the short, low side is in // the high bits of %xmm6. // // This schedule is somewhat nastier, however, because each // round produces 192 bits of key material, or 1.5 round keys. // Therefore, on each cycle we do 2 rounds and produce 3 round // keys. // .align 4 .Lschedule_192: sub $inp, $inp, #8 ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) bl _vpaes_schedule_transform // input transform mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros mov $inp, #4 // mov \$4, %esi .Loop_schedule_192: sub $inp, $inp, #1 // dec %esi bl _vpaes_schedule_round ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0 bl _vpaes_schedule_mangle // save key n bl _vpaes_schedule_192_smear bl _vpaes_schedule_mangle // save key n+1 bl _vpaes_schedule_round cbz $inp, .Lschedule_mangle_last bl _vpaes_schedule_mangle // save key n+2 bl _vpaes_schedule_192_smear b .Loop_schedule_192 // // .aes_schedule_256 // // 256-bit specific part of key schedule. // // The structure here is very similar to the 128-bit // schedule, but with an additional "low side" in // %xmm6. The low side's rounds are the same as the // high side's, except no rcon and no rotation. // .align 4 .Lschedule_256: ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) bl _vpaes_schedule_transform // input transform mov $inp, #7 // mov \$7, %esi .Loop_schedule_256: sub $inp, $inp, #1 // dec %esi bl _vpaes_schedule_mangle // output low result mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 // high round bl _vpaes_schedule_round cbz $inp, .Lschedule_mangle_last bl _vpaes_schedule_mangle // low round. 
swap xmm7 and xmm6 dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 movi v4.16b, #0 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 bl _vpaes_schedule_low_round mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 b .Loop_schedule_256 // // .aes_schedule_mangle_last // // Mangler for last round of key schedule // Mangles %xmm0 // when encrypting, outputs out(%xmm0) ^ 63 // when decrypting, outputs unskew(%xmm0) // // Always called right before return... jumps to cleanup and exits // .align 4 .Lschedule_mangle_last: // schedule last round key from xmm0 adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew cbnz $dir, .Lschedule_mangle_last_dec // encrypting ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform add $out, $out, #32 // add \$32, %rdx tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute .Lschedule_mangle_last_dec: ld1 {v20.2d-v21.2d}, [x11] // reload constants sub $out, $out, #16 // add \$-16, %rdx eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 bl _vpaes_schedule_transform // output transform st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key // cleanup eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 ldp x29, x30, [sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size _vpaes_schedule_core,.-_vpaes_schedule_core // // .aes_schedule_192_smear // // Smear the short, low side in the 192-bit key schedule. // // Inputs: // %xmm7: high side, b a x y // %xmm6: low side, d c 0 0 // %xmm13: 0 // // Outputs: // %xmm6: b+c+d b+c 0 0 // %xmm0: b+c+d b+c b a // .type _vpaes_schedule_192_smear,%function .align 4 _vpaes_schedule_192_smear: movi v1.16b, #0 dup v0.4s, v7.s[3] ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros ret .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear // // .aes_schedule_round // // Runs one main round of the key schedule on %xmm0, %xmm7 // // Specifically, runs subbytes on the high dword of %xmm0 // then rotates it by one byte and xors into the low dword of // %xmm7. // // Adds rcon from low byte of %xmm8, then rotates %xmm8 for // next rcon. // // Smears the dwords of %xmm7 by xoring the low into the // second low, result into third, result into highest. // // Returns results in %xmm7 = %xmm0. // Clobbers %xmm1-%xmm4, %r11. 
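The routine documented above (and defined just below) is the vector form of the textbook AES key-expansion step: take the last word of the previous round key, rotate it, push it through the S-box, add the round constant, then xor-chain ("smear") the result across the other words. A scalar C sketch of one 128-bit step, assuming each word is packed with its first key byte in the least-significant position and an S-box table sbox[] defined elsewhere (both are assumptions of this sketch, not part of the module):

    #include <stdint.h>

    extern const uint8_t sbox[256];   /* standard AES S-box, not shown */

    /* One AES-128 key-expansion step: w[0..3] is the previous round key
     * on entry and the next round key on return. */
    static void expand_128_step(uint32_t w[4], uint8_t rcon)
    {
        uint32_t t = w[3];
        t = (t >> 8) | (t << 24);                     /* RotWord            */
        t = (uint32_t)sbox[t & 0xff]
          | (uint32_t)sbox[(t >> 8)  & 0xff] << 8
          | (uint32_t)sbox[(t >> 16) & 0xff] << 16
          | (uint32_t)sbox[(t >> 24) & 0xff] << 24;   /* SubWord            */
        t ^= rcon;                                    /* add round constant */

        w[0] ^= t;                                    /* smear: each word   */
        w[1] ^= w[0];                                 /* xors the previous  */
        w[2] ^= w[1];                                 /* result             */
        w[3] ^= w[2];
    }

The low-round variant used for the 256-bit schedule is the same computation without the rotation and without the round constant, exactly as the comments in _vpaes_schedule_round describe.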
// .type _vpaes_schedule_round,%function .align 4 _vpaes_schedule_round: // extract rcon from xmm8 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1 ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 // rotate dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0 // fall through... // low round: same as high round, but no rotation and no rcon. _vpaes_schedule_low_round: // smear xmm7 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4 // subbytes and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output // add in smeared stuff eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 ret .size _vpaes_schedule_round,.-_vpaes_schedule_round // // .aes_schedule_transform // // Linear-transform %xmm0 according to tables at (%r11) // // Requires that %xmm9 = 0x0F0F... as in preheat // Output in %xmm0 // Clobbers %xmm1, %xmm2 // .type _vpaes_schedule_transform,%function .align 4 _vpaes_schedule_transform: and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 // vmovdqa (%r11), %xmm2 # lo tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 // vmovdqa 16(%r11), %xmm1 # hi tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 ret .size _vpaes_schedule_transform,.-_vpaes_schedule_transform // // .aes_schedule_mangle // // Mangle xmm0 from (basis-transformed) standard version // to our version. 
// // On encrypt, // xor with 0x63 // multiply by circulant 0,1,1,1 // apply shiftrows transform // // On decrypt, // xor with 0x63 // multiply by "inverse mixcolumns" circulant E,B,D,9 // deskew // apply shiftrows transform // // // Writes out to (%rdx), and increments or decrements it // Keeps track of round number mod 4 in %r8 // Preserves xmm0 // Clobbers xmm1-xmm5 // .type _vpaes_schedule_mangle,%function .align 4 _vpaes_schedule_mangle: mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later // vmovdqa .Lk_mc_forward(%rip),%xmm5 cbnz $dir, .Lschedule_mangle_dec // encrypting eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 add $out, $out, #16 // add \$16, %rdx tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 b .Lschedule_mangle_both .align 4 .Lschedule_mangle_dec: // inverse mix columns // lea .Lk_dksd(%rip),%r11 ushr v1.16b, v4.16b, #4 // vpsrlb \$4, %xmm4, %xmm1 # 1 = hi and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo // vmovdqa 0x00(%r11), %xmm2 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 // vmovdqa 0x10(%r11), %xmm3 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 // vmovdqa 0x20(%r11), %xmm2 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 // vmovdqa 0x30(%r11), %xmm3 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 // vmovdqa 0x40(%r11), %xmm2 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 // vmovdqa 0x50(%r11), %xmm3 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 // vmovdqa 0x60(%r11), %xmm2 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 // vmovdqa 0x70(%r11), %xmm4 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 sub $out, $out, #16 // add \$-16, %rdx .Lschedule_mangle_both: tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 add x8, x8, #64-16 // add \$-16, %r8 and x8, x8, #~(1<<6) // and \$0x30, %r8 st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) ret .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle .globl vpaes_set_encrypt_key .type vpaes_set_encrypt_key,%function .align 4 vpaes_set_encrypt_key: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! 
// ABI spec says so lsr w9, $bits, #5 // shr \$5,%eax add w9, w9, #5 // \$5,%eax str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; mov $dir, #0 // mov \$0,%ecx mov x8, #0x30 // mov \$0x30,%r8d bl _vpaes_schedule_core eor x0, x0, x0 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key .globl vpaes_set_decrypt_key .type vpaes_set_decrypt_key,%function .align 4 vpaes_set_decrypt_key: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so lsr w9, $bits, #5 // shr \$5,%eax add w9, w9, #5 // \$5,%eax str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; lsl w9, w9, #4 // shl \$4,%eax add $out, $out, #16 // lea 16(%rdx,%rax),%rdx add $out, $out, x9 mov $dir, #1 // mov \$1,%ecx lsr w8, $bits, #1 // shr \$1,%r8d and x8, x8, #32 // and \$32,%r8d eor x8, x8, #32 // xor \$32,%r8d # nbits==192?0:32 bl _vpaes_schedule_core ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key ___ } { my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5)); $code.=<<___; .globl vpaes_cbc_encrypt .type vpaes_cbc_encrypt,%function .align 4 vpaes_cbc_encrypt: + AARCH64_SIGN_LINK_REGISTER cbz $len, .Lcbc_abort cmp w5, #0 // check direction b.eq vpaes_cbc_decrypt - .inst 0xd503233f // paciasp stp x29,x30,[sp,#-16]! add x29,sp,#0 mov x17, $len // reassign mov x2, $key // reassign ld1 {v0.16b}, [$ivec] // load ivec bl _vpaes_encrypt_preheat b .Lcbc_enc_loop .align 4 .Lcbc_enc_loop: ld1 {v7.16b}, [$inp],#16 // load input eor v7.16b, v7.16b, v0.16b // xor with ivec bl _vpaes_encrypt_core st1 {v0.16b}, [$out],#16 // save output subs x17, x17, #16 b.hi .Lcbc_enc_loop st1 {v0.16b}, [$ivec] // write ivec ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp .Lcbc_abort: + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt .type vpaes_cbc_decrypt,%function .align 4 vpaes_cbc_decrypt: - .inst 0xd503233f // paciasp + // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to + // only from vpaes_cbc_encrypt which has already signed the return address. stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so stp d10,d11,[sp,#-16]! stp d12,d13,[sp,#-16]! stp d14,d15,[sp,#-16]! mov x17, $len // reassign mov x2, $key // reassign ld1 {v6.16b}, [$ivec] // load ivec bl _vpaes_decrypt_preheat tst x17, #16 b.eq .Lcbc_dec_loop2x ld1 {v7.16b}, [$inp], #16 // load input bl _vpaes_decrypt_core eor v0.16b, v0.16b, v6.16b // xor with ivec orr v6.16b, v7.16b, v7.16b // next ivec value st1 {v0.16b}, [$out], #16 subs x17, x17, #16 b.ls .Lcbc_dec_done .align 4 .Lcbc_dec_loop2x: ld1 {v14.16b,v15.16b}, [$inp], #32 bl _vpaes_decrypt_2x eor v0.16b, v0.16b, v6.16b // xor with ivec eor v1.16b, v1.16b, v14.16b orr v6.16b, v15.16b, v15.16b st1 {v0.16b,v1.16b}, [$out], #32 subs x17, x17, #32 b.hi .Lcbc_dec_loop2x .Lcbc_dec_done: st1 {v6.16b}, [$ivec] ldp d14,d15,[sp],#16 ldp d12,d13,[sp],#16 ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt ___ if (1) { $code.=<<___; .globl vpaes_ecb_encrypt .type vpaes_ecb_encrypt,%function .align 4 vpaes_ecb_encrypt: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! 
add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so stp d10,d11,[sp,#-16]! stp d12,d13,[sp,#-16]! stp d14,d15,[sp,#-16]! mov x17, $len mov x2, $key bl _vpaes_encrypt_preheat tst x17, #16 b.eq .Lecb_enc_loop ld1 {v7.16b}, [$inp],#16 bl _vpaes_encrypt_core st1 {v0.16b}, [$out],#16 subs x17, x17, #16 b.ls .Lecb_enc_done .align 4 .Lecb_enc_loop: ld1 {v14.16b,v15.16b}, [$inp], #32 bl _vpaes_encrypt_2x st1 {v0.16b,v1.16b}, [$out], #32 subs x17, x17, #32 b.hi .Lecb_enc_loop .Lecb_enc_done: ldp d14,d15,[sp],#16 ldp d12,d13,[sp],#16 ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt .globl vpaes_ecb_decrypt .type vpaes_ecb_decrypt,%function .align 4 vpaes_ecb_decrypt: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 stp d8,d9,[sp,#-16]! // ABI spec says so stp d10,d11,[sp,#-16]! stp d12,d13,[sp,#-16]! stp d14,d15,[sp,#-16]! mov x17, $len mov x2, $key bl _vpaes_decrypt_preheat tst x17, #16 b.eq .Lecb_dec_loop ld1 {v7.16b}, [$inp],#16 bl _vpaes_encrypt_core st1 {v0.16b}, [$out],#16 subs x17, x17, #16 b.ls .Lecb_dec_done .align 4 .Lecb_dec_loop: ld1 {v14.16b,v15.16b}, [$inp], #32 bl _vpaes_decrypt_2x st1 {v0.16b,v1.16b}, [$out], #32 subs x17, x17, #32 b.hi .Lecb_dec_loop .Lecb_dec_done: ldp d14,d15,[sp],#16 ldp d12,d13,[sp],#16 ldp d10,d11,[sp],#16 ldp d8,d9,[sp],#16 ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt ___ } } print $code; close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/openssl/crypto/aes/build.info b/crypto/openssl/crypto/aes/build.info index 271015e35e1b..dec340779281 100644 --- a/crypto/openssl/crypto/aes/build.info +++ b/crypto/openssl/crypto/aes/build.info @@ -1,128 +1,129 @@ LIBS=../../libcrypto $AESASM=aes_core.c aes_cbc.c IF[{- !$disabled{asm} -}] $AESASM_x86=aes-586.S $AESDEF_x86=AES_ASM $AESASM_x86_sse2=vpaes-x86.S aesni-x86.S $AESDEF_x86_sse2=VPAES_ASM OPENSSL_IA32_SSE2 $AESASM_x86_64=\ aes-x86_64.s vpaes-x86_64.s bsaes-x86_64.s aesni-x86_64.s \ aesni-sha1-x86_64.s aesni-sha256-x86_64.s aesni-mb-x86_64.s $AESDEF_x86_64=AES_ASM VPAES_ASM BSAES_ASM $AESASM_ia64=aes_core.c aes_cbc.c aes-ia64.s $AESDEF_ia64=AES_ASM $AESASM_sparcv9=\ aes_core.c aes_cbc.c aes-sparcv9.S aest4-sparcv9.S aesfx-sparcv9.S $AESDEF_sparcv9=AES_ASM $AESASM_mips32=aes_cbc.c aes-mips.S $AESDEF_mips32=AES_ASM $AESASM_mips64=$AESASM_mips32 $AESDEF_mips64=$AESDEF_mips32 $AESASM_s390x=aes-s390x.S # aes-390x.S implements AES_ctr32_encrypt and AES_xts_[en|de]crypt $AESDEF_s390x=AES_ASM AES_CTR_ASM AES_XTS_ASM $AESASM_armv4=aes_cbc.c aes-armv4.S bsaes-armv7.S aesv8-armx.S $AESDEF_armv4=AES_ASM BSAES_ASM $AESASM_aarch64=aes_core.c aes_cbc.c aesv8-armx.S vpaes-armv8.S $AESDEF_aarch64=VPAES_ASM $AESASM_parisc11=aes_core.c aes_cbc.c aes-parisc.s $AESDEF_parisc11=AES_ASM $AESASM_parisc20_64=$AESASM_parisc11 $AESDEF_parisc20_64=$AESDEF_parisc11 $AESASM_ppc32=aes_core.c aes_cbc.c aes-ppc.s vpaes-ppc.s aesp8-ppc.s $AESDEF_ppc32=AES_ASM VPAES_ASM $AESASM_ppc64=$AESASM_ppc32 $AESDEF_ppc64=$AESDEF_ppc32 $AESASM_c64xplus=aes-c64xplus.s aes_cbc.c # aes-c64xplus.s implements AES_ctr32_encrypt $AESDEF_c64xplus=AES_ASM AES_CTR_ASM # Now that we have defined all the arch specific variables, use the # appropriate one, and define the appropriate macros IF[$AESASM_{- $target{asm_arch} -}] $AESASM=$AESASM_{- $target{asm_arch} -} $AESDEF=$AESDEF_{- $target{asm_arch} -} 
IF[{- !$disabled{sse2} -}] $AESASM=$AESASM $AESASM_{- $target{asm_arch} -}_sse2 $AESDEF=$AESDEF $AESDEF_{- $target{asm_arch} -}_sse2 ENDIF ENDIF ENDIF $COMMON=aes_misc.c aes_ecb.c $AESASM SOURCE[../../libcrypto]=$COMMON aes_cfb.c aes_ofb.c aes_wrap.c IF[{- !$disabled{'deprecated-3.0'} -}] SOURCE[../../libcrypto]=aes_ige.c ENDIF SOURCE[../../providers/libfips.a]=$COMMON # Implementations are now spread across several libraries, so the defines # need to be applied to all affected libraries and modules. DEFINE[../../libcrypto]=$AESDEF DEFINE[../../providers/libfips.a]=$AESDEF DEFINE[../../providers/libdefault.a]=$AESDEF # We only need to include the AESDEF stuff in the legacy provider when it's a # separate module and it's dynamically linked with libcrypto. Otherwise, it # already gets everything that the static libcrypto.a has, and doesn't need it # added again. IF[{- !$disabled{module} && !$disabled{shared} -}] DEFINE[../../providers/liblegacy.a]=$AESDEF ENDIF GENERATE[aes-ia64.s]=asm/aes-ia64.S GENERATE[aes-586.S]=asm/aes-586.pl DEPEND[aes-586.S]=../perlasm/x86asm.pl GENERATE[vpaes-x86.S]=asm/vpaes-x86.pl DEPEND[vpaes-586.S]=../perlasm/x86asm.pl GENERATE[aesni-x86.S]=asm/aesni-x86.pl DEPEND[aesni-586.S]=../perlasm/x86asm.pl GENERATE[aes-x86_64.s]=asm/aes-x86_64.pl GENERATE[vpaes-x86_64.s]=asm/vpaes-x86_64.pl GENERATE[bsaes-x86_64.s]=asm/bsaes-x86_64.pl GENERATE[aesni-x86_64.s]=asm/aesni-x86_64.pl GENERATE[aesni-sha1-x86_64.s]=asm/aesni-sha1-x86_64.pl GENERATE[aesni-sha256-x86_64.s]=asm/aesni-sha256-x86_64.pl GENERATE[aesni-mb-x86_64.s]=asm/aesni-mb-x86_64.pl GENERATE[aes-sparcv9.S]=asm/aes-sparcv9.pl INCLUDE[aes-sparcv9.o]=.. GENERATE[aest4-sparcv9.S]=asm/aest4-sparcv9.pl INCLUDE[aest4-sparcv9.o]=.. DEPEND[aest4-sparcv9.S]=../perlasm/sparcv9_modes.pl GENERATE[aesfx-sparcv9.S]=asm/aesfx-sparcv9.pl INCLUDE[aesfx-sparcv9.o]=.. GENERATE[aes-ppc.s]=asm/aes-ppc.pl GENERATE[vpaes-ppc.s]=asm/vpaes-ppc.pl GENERATE[aesp8-ppc.s]=asm/aesp8-ppc.pl GENERATE[aes-parisc.s]=asm/aes-parisc.pl GENERATE[aes-mips.S]=asm/aes-mips.pl INCLUDE[aes-mips.o]=.. GENERATE[aesv8-armx.S]=asm/aesv8-armx.pl INCLUDE[aesv8-armx.o]=.. GENERATE[vpaes-armv8.S]=asm/vpaes-armv8.pl +INCLUDE[vpaes-armv8.o]=.. GENERATE[aes-armv4.S]=asm/aes-armv4.pl INCLUDE[aes-armv4.o]=.. GENERATE[bsaes-armv7.S]=asm/bsaes-armv7.pl INCLUDE[bsaes-armv7.o]=.. GENERATE[aes-s390x.S]=asm/aes-s390x.pl INCLUDE[aes-s390x.o]=.. GENERATE[aes-c64xplus.S]=asm/aes-c64xplus.pl diff --git a/crypto/openssl/crypto/arm64cpuid.pl b/crypto/openssl/crypto/arm64cpuid.pl index ac76dd449f37..11f0e5027942 100755 --- a/crypto/openssl/crypto/arm64cpuid.pl +++ b/crypto/openssl/crypto/arm64cpuid.pl @@ -1,157 +1,167 @@ #! /usr/bin/env perl # Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # $output is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!"; *STDOUT=*OUT; $code.=<<___; #include "arm_arch.h" .text .arch armv8-a+crypto .align 5 .globl _armv7_neon_probe .type _armv7_neon_probe,%function _armv7_neon_probe: + AARCH64_VALID_CALL_TARGET orr v15.16b, v15.16b, v15.16b ret .size _armv7_neon_probe,.-_armv7_neon_probe .globl _armv7_tick .type _armv7_tick,%function _armv7_tick: + AARCH64_VALID_CALL_TARGET #ifdef __APPLE__ mrs x0, CNTPCT_EL0 #else mrs x0, CNTVCT_EL0 #endif ret .size _armv7_tick,.-_armv7_tick .globl _armv8_aes_probe .type _armv8_aes_probe,%function _armv8_aes_probe: + AARCH64_VALID_CALL_TARGET aese v0.16b, v0.16b ret .size _armv8_aes_probe,.-_armv8_aes_probe .globl _armv8_sha1_probe .type _armv8_sha1_probe,%function _armv8_sha1_probe: + AARCH64_VALID_CALL_TARGET sha1h s0, s0 ret .size _armv8_sha1_probe,.-_armv8_sha1_probe .globl _armv8_sha256_probe .type _armv8_sha256_probe,%function _armv8_sha256_probe: + AARCH64_VALID_CALL_TARGET sha256su0 v0.4s, v0.4s ret .size _armv8_sha256_probe,.-_armv8_sha256_probe .globl _armv8_pmull_probe .type _armv8_pmull_probe,%function _armv8_pmull_probe: + AARCH64_VALID_CALL_TARGET pmull v0.1q, v0.1d, v0.1d ret .size _armv8_pmull_probe,.-_armv8_pmull_probe .globl _armv8_sha512_probe .type _armv8_sha512_probe,%function _armv8_sha512_probe: + AARCH64_VALID_CALL_TARGET .long 0xcec08000 // sha512su0 v0.2d,v0.2d ret .size _armv8_sha512_probe,.-_armv8_sha512_probe .globl _armv8_cpuid_probe .type _armv8_cpuid_probe,%function _armv8_cpuid_probe: + AARCH64_VALID_CALL_TARGET mrs x0, midr_el1 ret .size _armv8_cpuid_probe,.-_armv8_cpuid_probe .globl OPENSSL_cleanse .type OPENSSL_cleanse,%function .align 5 OPENSSL_cleanse: + AARCH64_VALID_CALL_TARGET cbz x1,.Lret // len==0? cmp x1,#15 b.hi .Lot // len>15 nop .Little: strb wzr,[x0],#1 // store byte-by-byte subs x1,x1,#1 b.ne .Little .Lret: ret .align 4 .Lot: tst x0,#7 b.eq .Laligned // inp is aligned strb wzr,[x0],#1 // store byte-by-byte sub x1,x1,#1 b .Lot .align 4 .Laligned: str xzr,[x0],#8 // store word-by-word sub x1,x1,#8 tst x1,#-8 b.ne .Laligned // len>=8 cbnz x1,.Little // len!=0? ret .size OPENSSL_cleanse,.-OPENSSL_cleanse .globl CRYPTO_memcmp .type CRYPTO_memcmp,%function .align 4 CRYPTO_memcmp: + AARCH64_VALID_CALL_TARGET eor w3,w3,w3 cbz x2,.Lno_data // len==0? cmp x2,#16 b.ne .Loop_cmp ldp x8,x9,[x0] ldp x10,x11,[x1] eor x8,x8,x10 eor x9,x9,x11 orr x8,x8,x9 mov x0,#1 cmp x8,#0 csel x0,xzr,x0,eq ret .align 4 .Loop_cmp: ldrb w4,[x0],#1 ldrb w5,[x1],#1 eor w4,w4,w5 orr w3,w3,w4 subs x2,x2,#1 b.ne .Loop_cmp .Lno_data: neg w0,w3 lsr w0,w0,#31 ret .size CRYPTO_memcmp,.-CRYPTO_memcmp ___ print $code; close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/openssl/crypto/arm_arch.h b/crypto/openssl/crypto/arm_arch.h index ec4a087fede2..7bedb385d971 100644 --- a/crypto/openssl/crypto/arm_arch.h +++ b/crypto/openssl/crypto/arm_arch.h @@ -1,124 +1,182 @@ /* * Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License 2.0 (the "License"). You may not use * this file except in compliance with the License. 
You can obtain a copy * in the file LICENSE in the source distribution or at * https://www.openssl.org/source/license.html */ #ifndef OSSL_CRYPTO_ARM_ARCH_H # define OSSL_CRYPTO_ARM_ARCH_H # if !defined(__ARM_ARCH__) # if defined(__CC_ARM) # define __ARM_ARCH__ __TARGET_ARCH_ARM # if defined(__BIG_ENDIAN) # define __ARMEB__ # else # define __ARMEL__ # endif # elif defined(__GNUC__) # if defined(__aarch64__) # define __ARM_ARCH__ 8 /* * Why doesn't gcc define __ARM_ARCH__? Instead it defines * bunch of below macros. See all_architectures[] table in * gcc/config/arm/arm.c. On a side note it defines * __ARMEL__/__ARMEB__ for little-/big-endian. */ # elif defined(__ARM_ARCH) # define __ARM_ARCH__ __ARM_ARCH # elif defined(__ARM_ARCH_8A__) # define __ARM_ARCH__ 8 # elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \ defined(__ARM_ARCH_7EM__) # define __ARM_ARCH__ 7 # elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \ defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \ defined(__ARM_ARCH_6T2__) # define __ARM_ARCH__ 6 # elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \ defined(__ARM_ARCH_5TEJ__) # define __ARM_ARCH__ 5 # elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) # define __ARM_ARCH__ 4 # else # error "unsupported ARM architecture" # endif # endif # endif # if !defined(__ARM_MAX_ARCH__) # define __ARM_MAX_ARCH__ __ARM_ARCH__ # endif # if __ARM_MAX_ARCH__<__ARM_ARCH__ # error "__ARM_MAX_ARCH__ can't be less than __ARM_ARCH__" # elif __ARM_MAX_ARCH__!=__ARM_ARCH__ # if __ARM_ARCH__<7 && __ARM_MAX_ARCH__>=7 && defined(__ARMEB__) # error "can't build universal big-endian binary" # endif # endif # ifndef __ASSEMBLER__ extern unsigned int OPENSSL_armcap_P; extern unsigned int OPENSSL_arm_midr; extern unsigned int OPENSSL_armv8_rsa_neonized; # endif # define ARMV7_NEON (1<<0) # define ARMV7_TICK (1<<1) # define ARMV8_AES (1<<2) # define ARMV8_SHA1 (1<<3) # define ARMV8_SHA256 (1<<4) # define ARMV8_PMULL (1<<5) # define ARMV8_SHA512 (1<<6) # define ARMV8_CPUID (1<<7) /* * MIDR_EL1 system register * * 63___ _ ___32_31___ _ ___24_23_____20_19_____16_15__ _ __4_3_______0 * | | | | | | | * |RES0 | Implementer | Variant | Arch | PartNum |Revision| * |____ _ _____|_____ _ _____|_________|_______ _|____ _ ___|________| * */ # define ARM_CPU_IMP_ARM 0x41 # define ARM_CPU_PART_CORTEX_A72 0xD08 # define ARM_CPU_PART_N1 0xD0C # define MIDR_PARTNUM_SHIFT 4 # define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT) # define MIDR_PARTNUM(midr) \ (((midr) & MIDR_PARTNUM_MASK) >> MIDR_PARTNUM_SHIFT) # define MIDR_IMPLEMENTER_SHIFT 24 # define MIDR_IMPLEMENTER_MASK (0xffU << MIDR_IMPLEMENTER_SHIFT) # define MIDR_IMPLEMENTER(midr) \ (((midr) & MIDR_IMPLEMENTER_MASK) >> MIDR_IMPLEMENTER_SHIFT) # define MIDR_ARCHITECTURE_SHIFT 16 # define MIDR_ARCHITECTURE_MASK (0xfU << MIDR_ARCHITECTURE_SHIFT) # define MIDR_ARCHITECTURE(midr) \ (((midr) & MIDR_ARCHITECTURE_MASK) >> MIDR_ARCHITECTURE_SHIFT) # define MIDR_CPU_MODEL_MASK \ (MIDR_IMPLEMENTER_MASK | \ MIDR_PARTNUM_MASK | \ MIDR_ARCHITECTURE_MASK) # define MIDR_CPU_MODEL(imp, partnum) \ (((imp) << MIDR_IMPLEMENTER_SHIFT) | \ (0xfU << MIDR_ARCHITECTURE_SHIFT) | \ ((partnum) << MIDR_PARTNUM_SHIFT)) # define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) + +#if defined(__ASSEMBLER__) + + /* + * Support 
macros for + * - Armv8.3-A Pointer Authentication and + * - Armv8.5-A Branch Target Identification + * features which require emitting a .note.gnu.property section with the + * appropriate architecture-dependent feature bits set. + * Read more: "ELF for the Arm® 64-bit Architecture" + */ + +# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 +# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */ +# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */ +# else +# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */ +# define AARCH64_VALID_CALL_TARGET +# endif + +# if defined(__ARM_FEATURE_PAC_DEFAULT) && \ + (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */ +# define GNU_PROPERTY_AARCH64_POINTER_AUTH \ + (1 << 1) /* Has Pointer Authentication */ +# define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */ +# define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */ +# elif defined(__ARM_FEATURE_PAC_DEFAULT) && \ + (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */ +# define GNU_PROPERTY_AARCH64_POINTER_AUTH \ + (1 << 1) /* Has Pointer Authentication */ +# define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */ +# define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */ +# else +# define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer Authentication */ +# if GNU_PROPERTY_AARCH64_BTI != 0 +# define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET +# else +# define AARCH64_SIGN_LINK_REGISTER +# endif +# define AARCH64_VALIDATE_LINK_REGISTER +# endif + +# if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 + .pushsection .note.gnu.property, "a"; + .balign 8; + .long 4; + .long 0x10; + .long 0x5; + .asciz "GNU"; + .long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ + .long 4; + .long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI); + .long 0; + .popsection; +# endif + +# endif /* defined __ASSEMBLER__ */ + #endif diff --git a/crypto/openssl/crypto/bn/asm/armv8-mont.pl b/crypto/openssl/crypto/bn/asm/armv8-mont.pl index 54d2e8245f15..21ab12bdf07e 100755 --- a/crypto/openssl/crypto/bn/asm/armv8-mont.pl +++ b/crypto/openssl/crypto/bn/asm/armv8-mont.pl @@ -1,1898 +1,1907 @@ #! /usr/bin/env perl # Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # March 2015 # # "Teaser" Montgomery multiplication module for ARMv8. Needs more # work. While it does improve RSA sign performance by 20-30% (less for # longer keys) on most processors, for some reason RSA2048 is not # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication # instruction issue rate is limited on processor in question, meaning # that dedicated squaring procedure is a must. Well, actually all # contemporary AArch64 processors seem to have limited multiplication # issue rate, i.e. 
they can't issue multiplication every cycle, which # explains moderate improvement coefficients in comparison to # compiler-generated code. Recall that compiler is instructed to use # umulh and therefore uses same amount of multiplication instructions # to do the job. Assembly's edge is to minimize number of "collateral" # instructions and of course instruction scheduling. # # April 2015 # # Squaring procedure that handles lengths divisible by 8 improves # RSA/DSA performance by 25-40-60% depending on processor and key # length. Overall improvement coefficients are always positive in # comparison to compiler-generated code. On Cortex-A57 improvement # is still modest on longest key lengths, while others exhibit e.g. # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster # on Cortex-A57 and ~60-100% faster on others. # $output is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $1"; *STDOUT=*OUT; ($lo0,$hi0,$aj,$m0,$alo,$ahi, $lo1,$hi1,$nj,$m1,$nlo,$nhi, $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24); # int bn_mul_mont( $rp="x0"; # BN_ULONG *rp, $ap="x1"; # const BN_ULONG *ap, $bp="x2"; # const BN_ULONG *bp, $np="x3"; # const BN_ULONG *np, $n0="x4"; # const BN_ULONG *n0, $num="x5"; # int num); $code.=<<___; +#include "arm_arch.h" #ifndef __KERNEL__ -# include "arm_arch.h" .extern OPENSSL_armv8_rsa_neonized .hidden OPENSSL_armv8_rsa_neonized #endif .text .globl bn_mul_mont .type bn_mul_mont,%function .align 5 bn_mul_mont: + AARCH64_SIGN_LINK_REGISTER .Lbn_mul_mont: tst $num,#3 b.ne .Lmul_mont cmp $num,#32 b.le .Lscalar_impl #ifndef __KERNEL__ adrp x17,OPENSSL_armv8_rsa_neonized ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized] cbnz w17, bn_mul8x_mont_neon #endif .Lscalar_impl: tst $num,#7 b.eq __bn_sqr8x_mont tst $num,#3 b.eq __bn_mul4x_mont .Lmul_mont: stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] ldr $m0,[$bp],#8 // bp[0] sub $tp,sp,$num,lsl#3 ldp $hi0,$aj,[$ap],#16 // ap[0..1] lsl $num,$num,#3 ldr $n0,[$n0] // *n0 and $tp,$tp,#-16 // ABI says so ldp $hi1,$nj,[$np],#16 // np[0..1] mul $lo0,$hi0,$m0 // ap[0]*bp[0] sub $j,$num,#16 // j=num-2 umulh $hi0,$hi0,$m0 mul $alo,$aj,$m0 // ap[1]*bp[0] umulh $ahi,$aj,$m0 mul $m1,$lo0,$n0 // "tp[0]"*n0 mov sp,$tp // alloca // (*) mul $lo1,$hi1,$m1 // np[0]*m1 umulh $hi1,$hi1,$m1 mul $nlo,$nj,$m1 // np[1]*m1 // (*) adds $lo1,$lo1,$lo0 // discarded // (*) As for removal of first multiplication and addition // instructions. The outcome of first addition is // guaranteed to be zero, which leaves two computationally // significant outcomes: it either carries or not. Then // question is when does it carry? Is there alternative // way to deduce it? If you follow operations, you can // observe that condition for carry is quite simple: // $lo0 being non-zero. So that carry can be calculated // by adding -1 to $lo0. That's what next instruction does. 
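	// Concretely: when $lo0 is zero the "subs xzr,$lo0,#1" below borrows
	// and leaves the carry flag clear, so the following adc adds nothing;
	// for any non-zero $lo0 it does not borrow, the carry flag is set, and
	// the adc injects the carry that the discarded "adds $lo1,$lo1,$lo0"
	// would have produced.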
subs xzr,$lo0,#1 // (*) umulh $nhi,$nj,$m1 adc $hi1,$hi1,xzr cbz $j,.L1st_skip .L1st: ldr $aj,[$ap],#8 adds $lo0,$alo,$hi0 sub $j,$j,#8 // j-- adc $hi0,$ahi,xzr ldr $nj,[$np],#8 adds $lo1,$nlo,$hi1 mul $alo,$aj,$m0 // ap[j]*bp[0] adc $hi1,$nhi,xzr umulh $ahi,$aj,$m0 adds $lo1,$lo1,$lo0 mul $nlo,$nj,$m1 // np[j]*m1 adc $hi1,$hi1,xzr umulh $nhi,$nj,$m1 str $lo1,[$tp],#8 // tp[j-1] cbnz $j,.L1st .L1st_skip: adds $lo0,$alo,$hi0 sub $ap,$ap,$num // rewind $ap adc $hi0,$ahi,xzr adds $lo1,$nlo,$hi1 sub $np,$np,$num // rewind $np adc $hi1,$nhi,xzr adds $lo1,$lo1,$lo0 sub $i,$num,#8 // i=num-1 adcs $hi1,$hi1,$hi0 adc $ovf,xzr,xzr // upmost overflow bit stp $lo1,$hi1,[$tp] .Louter: ldr $m0,[$bp],#8 // bp[i] ldp $hi0,$aj,[$ap],#16 ldr $tj,[sp] // tp[0] add $tp,sp,#8 mul $lo0,$hi0,$m0 // ap[0]*bp[i] sub $j,$num,#16 // j=num-2 umulh $hi0,$hi0,$m0 ldp $hi1,$nj,[$np],#16 mul $alo,$aj,$m0 // ap[1]*bp[i] adds $lo0,$lo0,$tj umulh $ahi,$aj,$m0 adc $hi0,$hi0,xzr mul $m1,$lo0,$n0 sub $i,$i,#8 // i-- // (*) mul $lo1,$hi1,$m1 // np[0]*m1 umulh $hi1,$hi1,$m1 mul $nlo,$nj,$m1 // np[1]*m1 // (*) adds $lo1,$lo1,$lo0 subs xzr,$lo0,#1 // (*) umulh $nhi,$nj,$m1 cbz $j,.Linner_skip .Linner: ldr $aj,[$ap],#8 adc $hi1,$hi1,xzr ldr $tj,[$tp],#8 // tp[j] adds $lo0,$alo,$hi0 sub $j,$j,#8 // j-- adc $hi0,$ahi,xzr adds $lo1,$nlo,$hi1 ldr $nj,[$np],#8 adc $hi1,$nhi,xzr mul $alo,$aj,$m0 // ap[j]*bp[i] adds $lo0,$lo0,$tj umulh $ahi,$aj,$m0 adc $hi0,$hi0,xzr mul $nlo,$nj,$m1 // np[j]*m1 adds $lo1,$lo1,$lo0 umulh $nhi,$nj,$m1 stur $lo1,[$tp,#-16] // tp[j-1] cbnz $j,.Linner .Linner_skip: ldr $tj,[$tp],#8 // tp[j] adc $hi1,$hi1,xzr adds $lo0,$alo,$hi0 sub $ap,$ap,$num // rewind $ap adc $hi0,$ahi,xzr adds $lo1,$nlo,$hi1 sub $np,$np,$num // rewind $np adcs $hi1,$nhi,$ovf adc $ovf,xzr,xzr adds $lo0,$lo0,$tj adc $hi0,$hi0,xzr adds $lo1,$lo1,$lo0 adcs $hi1,$hi1,$hi0 adc $ovf,$ovf,xzr // upmost overflow bit stp $lo1,$hi1,[$tp,#-16] cbnz $i,.Louter // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. ldr $tj,[sp] // tp[0] add $tp,sp,#8 ldr $nj,[$np],#8 // np[0] subs $j,$num,#8 // j=num-1 and clear borrow mov $ap,$rp .Lsub: sbcs $aj,$tj,$nj // tp[j]-np[j] ldr $tj,[$tp],#8 sub $j,$j,#8 // j-- ldr $nj,[$np],#8 str $aj,[$ap],#8 // rp[j]=tp[j]-np[j] cbnz $j,.Lsub sbcs $aj,$tj,$nj sbcs $ovf,$ovf,xzr // did it borrow? str $aj,[$ap],#8 // rp[num-1] ldr $tj,[sp] // tp[0] add $tp,sp,#8 ldr $aj,[$rp],#8 // rp[0] sub $num,$num,#8 // num-- nop .Lcond_copy: sub $num,$num,#8 // num-- csel $nj,$tj,$aj,lo // did it borrow? 
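	// lo means the .Lsub chain borrowed, so the original tp[j] is copied
	// back over rp[j]; otherwise rp[j]=tp[j]-np[j] already stored stands.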
ldr $tj,[$tp],#8 ldr $aj,[$rp],#8 stur xzr,[$tp,#-16] // wipe tp stur $nj,[$rp,#-16] cbnz $num,.Lcond_copy csel $nj,$tj,$aj,lo stur xzr,[$tp,#-8] // wipe tp stur $nj,[$rp,#-8] ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER ret .size bn_mul_mont,.-bn_mul_mont ___ { my ($A0,$A1,$N0,$N1)=map("v$_",(0..3)); my ($Z,$Temp)=("v4.16b","v5"); my @ACC=map("v$_",(6..13)); my ($Bi,$Ni,$M0)=map("v$_",(28..30)); my $sBi="s28"; my $sM0="s30"; my $zero="v14"; my $temp="v15"; my $ACCTemp="v16"; my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5)); my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11)); $code.=<<___; .type bn_mul8x_mont_neon,%function .align 5 bn_mul8x_mont_neon: + // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to + // only from bn_mul_mont which has already signed the return address. stp x29,x30,[sp,#-80]! mov x16,sp stp d8,d9,[sp,#16] stp d10,d11,[sp,#32] stp d12,d13,[sp,#48] stp d14,d15,[sp,#64] lsl $num,$num,#1 eor $zero.16b,$zero.16b,$zero.16b .align 4 .LNEON_8n: eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b sub $toutptr,sp,#128 eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b sub $toutptr,$toutptr,$num,lsl#4 eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b and $toutptr,$toutptr,#-64 eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b mov sp,$toutptr // alloca eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b add $toutptr,$toutptr,#256 eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b sub $inner,$num,#8 eor @ACC[6].16b,@ACC[6].16b,@ACC[6].16b eor @ACC[7].16b,@ACC[7].16b,@ACC[7].16b .LNEON_8n_init: st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32 subs $inner,$inner,#8 st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32 st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32 st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32 bne .LNEON_8n_init add $tinptr,sp,#256 ld1 {$A0.4s,$A1.4s},[$aptr],#32 add $bnptr,sp,#8 ldr $sM0,[$n0],#4 mov $outer,$num b .LNEON_8n_outer .align 4 .LNEON_8n_outer: ldr $sBi,[$bptr],#4 // *b++ uxtl $Bi.4s,$Bi.4h add $toutptr,sp,#128 ld1 {$N0.4s,$N1.4s},[$nptr],#32 umlal @ACC[0].2d,$Bi.2s,$A0.s[0] umlal @ACC[1].2d,$Bi.2s,$A0.s[1] umlal @ACC[2].2d,$Bi.2s,$A0.s[2] shl $Ni.2d,@ACC[0].2d,#16 ext $Ni.16b,$Ni.16b,$Ni.16b,#8 umlal @ACC[3].2d,$Bi.2s,$A0.s[3] add $Ni.2d,$Ni.2d,@ACC[0].2d umlal @ACC[4].2d,$Bi.2s,$A1.s[0] mul $Ni.2s,$Ni.2s,$M0.2s umlal @ACC[5].2d,$Bi.2s,$A1.s[1] st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0] umlal @ACC[6].2d,$Bi.2s,$A1.s[2] uxtl $Ni.4s,$Ni.4h umlal @ACC[7].2d,$Bi.2s,$A1.s[3] ___ for ($i=0; $i<7;) { $code.=<<___; ldr $sBi,[$bptr],#4 // *b++ umlal @ACC[0].2d,$Ni.2s,$N0.s[0] umlal @ACC[1].2d,$Ni.2s,$N0.s[1] uxtl $Bi.4s,$Bi.4h umlal @ACC[2].2d,$Ni.2s,$N0.s[2] ushr $temp.2d,@ACC[0].2d,#16 umlal @ACC[3].2d,$Ni.2s,$N0.s[3] umlal @ACC[4].2d,$Ni.2s,$N1.s[0] ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 add @ACC[0].2d,@ACC[0].2d,$temp.2d umlal @ACC[5].2d,$Ni.2s,$N1.s[1] ushr @ACC[0].2d,@ACC[0].2d,#16 umlal @ACC[6].2d,$Ni.2s,$N1.s[2] umlal @ACC[7].2d,$Ni.2s,$N1.s[3] add $ACCTemp.2d,@ACC[1].2d,@ACC[0].2d ins @ACC[1].d[0],$ACCTemp.d[0] st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i] ___ push(@ACC,shift(@ACC)); $i++; $code.=<<___; umlal @ACC[0].2d,$Bi.2s,$A0.s[0] ld1 {@ACC[7].2d},[$tinptr],#16 umlal @ACC[1].2d,$Bi.2s,$A0.s[1] umlal @ACC[2].2d,$Bi.2s,$A0.s[2] shl $Ni.2d,@ACC[0].2d,#16 ext $Ni.16b,$Ni.16b,$Ni.16b,#8 umlal @ACC[3].2d,$Bi.2s,$A0.s[3] add $Ni.2d,$Ni.2d,@ACC[0].2d umlal @ACC[4].2d,$Bi.2s,$A1.s[0] mul $Ni.2s,$Ni.2s,$M0.2s umlal @ACC[5].2d,$Bi.2s,$A1.s[1] st1 {$Bi.2s},[$bnptr],#8 // put aside smashed 
b[8*i+$i] umlal @ACC[6].2d,$Bi.2s,$A1.s[2] uxtl $Ni.4s,$Ni.4h umlal @ACC[7].2d,$Bi.2s,$A1.s[3] ___ } $code.=<<___; ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0] umlal @ACC[0].2d,$Ni.2s,$N0.s[0] ld1 {$A0.4s,$A1.4s},[$aptr],#32 umlal @ACC[1].2d,$Ni.2s,$N0.s[1] umlal @ACC[2].2d,$Ni.2s,$N0.s[2] mov $Temp.16b,@ACC[0].16b ushr $Temp.2d,$Temp.2d,#16 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 umlal @ACC[3].2d,$Ni.2s,$N0.s[3] umlal @ACC[4].2d,$Ni.2s,$N1.s[0] add @ACC[0].2d,@ACC[0].2d,$Temp.2d umlal @ACC[5].2d,$Ni.2s,$N1.s[1] ushr @ACC[0].2d,@ACC[0].2d,#16 eor $temp.16b,$temp.16b,$temp.16b ins @ACC[0].d[1],$temp.d[0] umlal @ACC[6].2d,$Ni.2s,$N1.s[2] umlal @ACC[7].2d,$Ni.2s,$N1.s[3] add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d st1 {$Ni.2s},[$bnptr],#8 // put aside smashed m[8*i+$i] add $bnptr,sp,#8 // rewind ___ push(@ACC,shift(@ACC)); $code.=<<___; sub $inner,$num,#8 b .LNEON_8n_inner .align 4 .LNEON_8n_inner: subs $inner,$inner,#8 umlal @ACC[0].2d,$Bi.2s,$A0.s[0] ld1 {@ACC[7].2d},[$tinptr] umlal @ACC[1].2d,$Bi.2s,$A0.s[1] ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+0] umlal @ACC[2].2d,$Bi.2s,$A0.s[2] ld1 {$N0.4s,$N1.4s},[$nptr],#32 umlal @ACC[3].2d,$Bi.2s,$A0.s[3] b.eq .LInner_jump add $tinptr,$tinptr,#16 // don't advance in last iteration .LInner_jump: umlal @ACC[4].2d,$Bi.2s,$A1.s[0] umlal @ACC[5].2d,$Bi.2s,$A1.s[1] umlal @ACC[6].2d,$Bi.2s,$A1.s[2] umlal @ACC[7].2d,$Bi.2s,$A1.s[3] ___ for ($i=1; $i<8; $i++) { $code.=<<___; ld1 {$Bi.2s},[$bnptr],#8 // pull smashed b[8*i+$i] umlal @ACC[0].2d,$Ni.2s,$N0.s[0] umlal @ACC[1].2d,$Ni.2s,$N0.s[1] umlal @ACC[2].2d,$Ni.2s,$N0.s[2] umlal @ACC[3].2d,$Ni.2s,$N0.s[3] umlal @ACC[4].2d,$Ni.2s,$N1.s[0] umlal @ACC[5].2d,$Ni.2s,$N1.s[1] umlal @ACC[6].2d,$Ni.2s,$N1.s[2] umlal @ACC[7].2d,$Ni.2s,$N1.s[3] st1 {@ACC[0].2d},[$toutptr],#16 ___ push(@ACC,shift(@ACC)); $code.=<<___; umlal @ACC[0].2d,$Bi.2s,$A0.s[0] ld1 {@ACC[7].2d},[$tinptr] umlal @ACC[1].2d,$Bi.2s,$A0.s[1] ld1 {$Ni.2s},[$bnptr],#8 // pull smashed m[8*i+$i] umlal @ACC[2].2d,$Bi.2s,$A0.s[2] b.eq .LInner_jump$i add $tinptr,$tinptr,#16 // don't advance in last iteration .LInner_jump$i: umlal @ACC[3].2d,$Bi.2s,$A0.s[3] umlal @ACC[4].2d,$Bi.2s,$A1.s[0] umlal @ACC[5].2d,$Bi.2s,$A1.s[1] umlal @ACC[6].2d,$Bi.2s,$A1.s[2] umlal @ACC[7].2d,$Bi.2s,$A1.s[3] ___ } $code.=<<___; b.ne .LInner_after_rewind$i sub $aptr,$aptr,$num,lsl#2 // rewind .LInner_after_rewind$i: umlal @ACC[0].2d,$Ni.2s,$N0.s[0] ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0] umlal @ACC[1].2d,$Ni.2s,$N0.s[1] ld1 {$A0.4s,$A1.4s},[$aptr],#32 umlal @ACC[2].2d,$Ni.2s,$N0.s[2] add $bnptr,sp,#8 // rewind umlal @ACC[3].2d,$Ni.2s,$N0.s[3] umlal @ACC[4].2d,$Ni.2s,$N1.s[0] umlal @ACC[5].2d,$Ni.2s,$N1.s[1] umlal @ACC[6].2d,$Ni.2s,$N1.s[2] st1 {@ACC[0].2d},[$toutptr],#16 umlal @ACC[7].2d,$Ni.2s,$N1.s[3] bne .LNEON_8n_inner ___ push(@ACC,shift(@ACC)); $code.=<<___; add $tinptr,sp,#128 st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32 eor $N0.16b,$N0.16b,$N0.16b // $N0 st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32 eor $N1.16b,$N1.16b,$N1.16b // $N1 st1 {@ACC[4].2d,@ACC[5].2d},[$toutptr],#32 st1 {@ACC[6].2d},[$toutptr] subs $outer,$outer,#8 ld1 {@ACC[0].2d,@ACC[1].2d},[$tinptr],#32 ld1 {@ACC[2].2d,@ACC[3].2d},[$tinptr],#32 ld1 {@ACC[4].2d,@ACC[5].2d},[$tinptr],#32 ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32 b.eq .LInner_8n_jump_2steps sub $nptr,$nptr,$num,lsl#2 // rewind b .LNEON_8n_outer .LInner_8n_jump_2steps: add $toutptr,sp,#128 st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame mov $Temp.16b,@ACC[0].16b ushr $temp.2d,@ACC[0].2d,#16 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 st1 
{$N0.2d,$N1.2d}, [sp],#32 add @ACC[0].2d,@ACC[0].2d,$temp.2d st1 {$N0.2d,$N1.2d}, [sp],#32 ushr $temp.2d,@ACC[0].2d,#16 st1 {$N0.2d,$N1.2d}, [sp],#32 zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h ins $temp.d[1],$zero.d[0] mov $inner,$num b .LNEON_tail_entry .align 4 .LNEON_tail: add @ACC[0].2d,@ACC[0].2d,$temp.2d mov $Temp.16b,@ACC[0].16b ushr $temp.2d,@ACC[0].2d,#16 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 ld1 {@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32 add @ACC[0].2d,@ACC[0].2d,$temp.2d ld1 {@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32 ushr $temp.2d,@ACC[0].2d,#16 ld1 {@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32 zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h ins $temp.d[1],$zero.d[0] .LNEON_tail_entry: ___ for ($i=1; $i<8; $i++) { $code.=<<___; add @ACC[1].2d,@ACC[1].2d,$temp.2d st1 {@ACC[0].s}[0], [$toutptr],#4 ushr $temp.2d,@ACC[1].2d,#16 mov $Temp.16b,@ACC[1].16b ext @ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8 add @ACC[1].2d,@ACC[1].2d,$temp.2d ushr $temp.2d,@ACC[1].2d,#16 zip1 @ACC[1].4h,$Temp.4h,@ACC[1].4h ins $temp.d[1],$zero.d[0] ___ push(@ACC,shift(@ACC)); } push(@ACC,shift(@ACC)); $code.=<<___; ld1 {@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32 subs $inner,$inner,#8 st1 {@ACC[7].s}[0], [$toutptr],#4 bne .LNEON_tail st1 {$temp.s}[0], [$toutptr],#4 // top-most bit sub $nptr,$nptr,$num,lsl#2 // rewind $nptr subs $aptr,sp,#0 // clear carry flag add $bptr,sp,$num,lsl#2 .LNEON_sub: ldp w4,w5,[$aptr],#8 ldp w6,w7,[$aptr],#8 ldp w8,w9,[$nptr],#8 ldp w10,w11,[$nptr],#8 sbcs w8,w4,w8 sbcs w9,w5,w9 sbcs w10,w6,w10 sbcs w11,w7,w11 sub x17,$bptr,$aptr stp w8,w9,[$rptr],#8 stp w10,w11,[$rptr],#8 cbnz x17,.LNEON_sub ldr w10, [$aptr] // load top-most bit mov x11,sp eor v0.16b,v0.16b,v0.16b sub x11,$bptr,x11 // this is num*4 eor v1.16b,v1.16b,v1.16b mov $aptr,sp sub $rptr,$rptr,x11 // rewind $rptr mov $nptr,$bptr // second 3/4th of frame sbcs w10,w10,wzr // result is carry flag .LNEON_copy_n_zap: ldp w4,w5,[$aptr],#8 ldp w6,w7,[$aptr],#8 ldp w8,w9,[$rptr],#8 ldp w10,w11,[$rptr] sub $rptr,$rptr,#8 b.cs .LCopy_1 mov w8,w4 mov w9,w5 mov w10,w6 mov w11,w7 .LCopy_1: st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe ldp w4,w5,[$aptr],#8 ldp w6,w7,[$aptr],#8 stp w8,w9,[$rptr],#8 stp w10,w11,[$rptr],#8 sub $aptr,$aptr,#32 ldp w8,w9,[$rptr],#8 ldp w10,w11,[$rptr] sub $rptr,$rptr,#8 b.cs .LCopy_2 mov w8, w4 mov w9, w5 mov w10, w6 mov w11, w7 .LCopy_2: st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe sub x17,$bptr,$aptr // preserves carry stp w8,w9,[$rptr],#8 stp w10,w11,[$rptr],#8 cbnz x17,.LNEON_copy_n_zap mov sp,x16 ldp d14,d15,[sp,#64] ldp d12,d13,[sp,#48] ldp d10,d11,[sp,#32] ldp d8,d9,[sp,#16] ldr x29,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER ret // bx lr .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon ___ } { ######################################################################## # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module. my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13)); my ($t0,$t1,$t2,$t3)=map("x$_",(14..17)); my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26)); my ($cnt,$carry,$topmost)=("x27","x28","x30"); my ($tp,$ap_end,$na0)=($bp,$np,$carry); $code.=<<___; .type __bn_sqr8x_mont,%function .align 5 __bn_sqr8x_mont: cmp $ap,$bp b.ne __bn_mul4x_mont .Lsqr8x_mont: - .inst 0xd503233f // paciasp + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to + // only from bn_mul_mont which has already signed the return address. stp x29,x30,[sp,#-128]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] stp $rp,$np,[sp,#96] // offload rp and np ldp $a0,$a1,[$ap,#8*0] ldp $a2,$a3,[$ap,#8*2] ldp $a4,$a5,[$ap,#8*4] ldp $a6,$a7,[$ap,#8*6] sub $tp,sp,$num,lsl#4 lsl $num,$num,#3 ldr $n0,[$n0] // *n0 mov sp,$tp // alloca sub $cnt,$num,#8*8 b .Lsqr8x_zero_start .Lsqr8x_zero: sub $cnt,$cnt,#8*8 stp xzr,xzr,[$tp,#8*0] stp xzr,xzr,[$tp,#8*2] stp xzr,xzr,[$tp,#8*4] stp xzr,xzr,[$tp,#8*6] .Lsqr8x_zero_start: stp xzr,xzr,[$tp,#8*8] stp xzr,xzr,[$tp,#8*10] stp xzr,xzr,[$tp,#8*12] stp xzr,xzr,[$tp,#8*14] add $tp,$tp,#8*16 cbnz $cnt,.Lsqr8x_zero add $ap_end,$ap,$num add $ap,$ap,#8*8 mov $acc0,xzr mov $acc1,xzr mov $acc2,xzr mov $acc3,xzr mov $acc4,xzr mov $acc5,xzr mov $acc6,xzr mov $acc7,xzr mov $tp,sp str $n0,[x29,#112] // offload n0 // Multiply everything but a[i]*a[i] .align 4 .Lsqr8x_outer_loop: // a[1]a[0] (i) // a[2]a[0] // a[3]a[0] // a[4]a[0] // a[5]a[0] // a[6]a[0] // a[7]a[0] // a[2]a[1] (ii) // a[3]a[1] // a[4]a[1] // a[5]a[1] // a[6]a[1] // a[7]a[1] // a[3]a[2] (iii) // a[4]a[2] // a[5]a[2] // a[6]a[2] // a[7]a[2] // a[4]a[3] (iv) // a[5]a[3] // a[6]a[3] // a[7]a[3] // a[5]a[4] (v) // a[6]a[4] // a[7]a[4] // a[6]a[5] (vi) // a[7]a[5] // a[7]a[6] (vii) mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i) mul $t1,$a2,$a0 mul $t2,$a3,$a0 mul $t3,$a4,$a0 adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0]) mul $t0,$a5,$a0 adcs $acc2,$acc2,$t1 mul $t1,$a6,$a0 adcs $acc3,$acc3,$t2 mul $t2,$a7,$a0 adcs $acc4,$acc4,$t3 umulh $t3,$a1,$a0 // hi(a[1..7]*a[0]) adcs $acc5,$acc5,$t0 umulh $t0,$a2,$a0 adcs $acc6,$acc6,$t1 umulh $t1,$a3,$a0 adcs $acc7,$acc7,$t2 umulh $t2,$a4,$a0 stp $acc0,$acc1,[$tp],#8*2 // t[0..1] adc $acc0,xzr,xzr // t[8] adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0]) umulh $t3,$a5,$a0 adcs $acc3,$acc3,$t0 umulh $t0,$a6,$a0 adcs $acc4,$acc4,$t1 umulh $t1,$a7,$a0 adcs $acc5,$acc5,$t2 mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii) adcs $acc6,$acc6,$t3 mul $t3,$a3,$a1 adcs $acc7,$acc7,$t0 mul $t0,$a4,$a1 adc $acc0,$acc0,$t1 mul $t1,$a5,$a1 adds $acc3,$acc3,$t2 mul $t2,$a6,$a1 adcs $acc4,$acc4,$t3 mul $t3,$a7,$a1 adcs $acc5,$acc5,$t0 umulh $t0,$a2,$a1 // hi(a[2..7]*a[1]) adcs $acc6,$acc6,$t1 umulh $t1,$a3,$a1 adcs $acc7,$acc7,$t2 umulh $t2,$a4,$a1 adcs $acc0,$acc0,$t3 umulh $t3,$a5,$a1 stp $acc2,$acc3,[$tp],#8*2 // t[2..3] adc $acc1,xzr,xzr // t[9] adds $acc4,$acc4,$t0 umulh $t0,$a6,$a1 adcs $acc5,$acc5,$t1 umulh $t1,$a7,$a1 adcs $acc6,$acc6,$t2 mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii) adcs $acc7,$acc7,$t3 mul $t3,$a4,$a2 adcs $acc0,$acc0,$t0 mul $t0,$a5,$a2 adc $acc1,$acc1,$t1 mul $t1,$a6,$a2 adds $acc5,$acc5,$t2 mul $t2,$a7,$a2 adcs $acc6,$acc6,$t3 umulh $t3,$a3,$a2 // hi(a[3..7]*a[2]) adcs $acc7,$acc7,$t0 umulh $t0,$a4,$a2 adcs $acc0,$acc0,$t1 umulh $t1,$a5,$a2 adcs $acc1,$acc1,$t2 umulh $t2,$a6,$a2 stp $acc4,$acc5,[$tp],#8*2 // t[4..5] adc $acc2,xzr,xzr // t[10] adds $acc6,$acc6,$t3 umulh $t3,$a7,$a2 adcs $acc7,$acc7,$t0 mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv) adcs $acc0,$acc0,$t1 mul $t1,$a5,$a3 adcs $acc1,$acc1,$t2 mul $t2,$a6,$a3 adc $acc2,$acc2,$t3 mul $t3,$a7,$a3 adds $acc7,$acc7,$t0 umulh $t0,$a4,$a3 // hi(a[4..7]*a[3]) adcs $acc0,$acc0,$t1 umulh $t1,$a5,$a3 adcs $acc1,$acc1,$t2 umulh $t2,$a6,$a3 adcs $acc2,$acc2,$t3 umulh $t3,$a7,$a3 stp $acc6,$acc7,[$tp],#8*2 // t[6..7] adc $acc3,xzr,xzr // t[11] adds $acc0,$acc0,$t0 mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v) adcs $acc1,$acc1,$t1 mul $t1,$a6,$a4 adcs $acc2,$acc2,$t2 mul $t2,$a7,$a4 adc $acc3,$acc3,$t3 umulh $t3,$a5,$a4 // hi(a[5..7]*a[4]) adds 
$acc1,$acc1,$t0 umulh $t0,$a6,$a4 adcs $acc2,$acc2,$t1 umulh $t1,$a7,$a4 adcs $acc3,$acc3,$t2 mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi) adc $acc4,xzr,xzr // t[12] adds $acc2,$acc2,$t3 mul $t3,$a7,$a5 adcs $acc3,$acc3,$t0 umulh $t0,$a6,$a5 // hi(a[6..7]*a[5]) adc $acc4,$acc4,$t1 umulh $t1,$a7,$a5 adds $acc3,$acc3,$t2 mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii) adcs $acc4,$acc4,$t3 umulh $t3,$a7,$a6 // hi(a[7]*a[6]) adc $acc5,xzr,xzr // t[13] adds $acc4,$acc4,$t0 sub $cnt,$ap_end,$ap // done yet? adc $acc5,$acc5,$t1 adds $acc5,$acc5,$t2 sub $t0,$ap_end,$num // rewinded ap adc $acc6,xzr,xzr // t[14] add $acc6,$acc6,$t3 cbz $cnt,.Lsqr8x_outer_break mov $n0,$a0 ldp $a0,$a1,[$tp,#8*0] ldp $a2,$a3,[$tp,#8*2] ldp $a4,$a5,[$tp,#8*4] ldp $a6,$a7,[$tp,#8*6] adds $acc0,$acc0,$a0 adcs $acc1,$acc1,$a1 ldp $a0,$a1,[$ap,#8*0] adcs $acc2,$acc2,$a2 adcs $acc3,$acc3,$a3 ldp $a2,$a3,[$ap,#8*2] adcs $acc4,$acc4,$a4 adcs $acc5,$acc5,$a5 ldp $a4,$a5,[$ap,#8*4] adcs $acc6,$acc6,$a6 mov $rp,$ap adcs $acc7,xzr,$a7 ldp $a6,$a7,[$ap,#8*6] add $ap,$ap,#8*8 //adc $carry,xzr,xzr // moved below mov $cnt,#-8*8 // a[8]a[0] // a[9]a[0] // a[a]a[0] // a[b]a[0] // a[c]a[0] // a[d]a[0] // a[e]a[0] // a[f]a[0] // a[8]a[1] // a[f]a[1]........................ // a[8]a[2] // a[f]a[2]........................ // a[8]a[3] // a[f]a[3]........................ // a[8]a[4] // a[f]a[4]........................ // a[8]a[5] // a[f]a[5]........................ // a[8]a[6] // a[f]a[6]........................ // a[8]a[7] // a[f]a[7]........................ .Lsqr8x_mul: mul $t0,$a0,$n0 adc $carry,xzr,xzr // carry bit, modulo-scheduled mul $t1,$a1,$n0 add $cnt,$cnt,#8 mul $t2,$a2,$n0 mul $t3,$a3,$n0 adds $acc0,$acc0,$t0 mul $t0,$a4,$n0 adcs $acc1,$acc1,$t1 mul $t1,$a5,$n0 adcs $acc2,$acc2,$t2 mul $t2,$a6,$n0 adcs $acc3,$acc3,$t3 mul $t3,$a7,$n0 adcs $acc4,$acc4,$t0 umulh $t0,$a0,$n0 adcs $acc5,$acc5,$t1 umulh $t1,$a1,$n0 adcs $acc6,$acc6,$t2 umulh $t2,$a2,$n0 adcs $acc7,$acc7,$t3 umulh $t3,$a3,$n0 adc $carry,$carry,xzr str $acc0,[$tp],#8 adds $acc0,$acc1,$t0 umulh $t0,$a4,$n0 adcs $acc1,$acc2,$t1 umulh $t1,$a5,$n0 adcs $acc2,$acc3,$t2 umulh $t2,$a6,$n0 adcs $acc3,$acc4,$t3 umulh $t3,$a7,$n0 ldr $n0,[$rp,$cnt] adcs $acc4,$acc5,$t0 adcs $acc5,$acc6,$t1 adcs $acc6,$acc7,$t2 adcs $acc7,$carry,$t3 //adc $carry,xzr,xzr // moved above cbnz $cnt,.Lsqr8x_mul // note that carry flag is guaranteed // to be zero at this point cmp $ap,$ap_end // done yet? b.eq .Lsqr8x_break ldp $a0,$a1,[$tp,#8*0] ldp $a2,$a3,[$tp,#8*2] ldp $a4,$a5,[$tp,#8*4] ldp $a6,$a7,[$tp,#8*6] adds $acc0,$acc0,$a0 ldur $n0,[$rp,#-8*8] adcs $acc1,$acc1,$a1 ldp $a0,$a1,[$ap,#8*0] adcs $acc2,$acc2,$a2 adcs $acc3,$acc3,$a3 ldp $a2,$a3,[$ap,#8*2] adcs $acc4,$acc4,$a4 adcs $acc5,$acc5,$a5 ldp $a4,$a5,[$ap,#8*4] adcs $acc6,$acc6,$a6 mov $cnt,#-8*8 adcs $acc7,$acc7,$a7 ldp $a6,$a7,[$ap,#8*6] add $ap,$ap,#8*8 //adc $carry,xzr,xzr // moved above b .Lsqr8x_mul .align 4 .Lsqr8x_break: ldp $a0,$a1,[$rp,#8*0] add $ap,$rp,#8*8 ldp $a2,$a3,[$rp,#8*2] sub $t0,$ap_end,$ap // is it last iteration? 
ldp $a4,$a5,[$rp,#8*4] sub $t1,$tp,$t0 ldp $a6,$a7,[$rp,#8*6] cbz $t0,.Lsqr8x_outer_loop stp $acc0,$acc1,[$tp,#8*0] ldp $acc0,$acc1,[$t1,#8*0] stp $acc2,$acc3,[$tp,#8*2] ldp $acc2,$acc3,[$t1,#8*2] stp $acc4,$acc5,[$tp,#8*4] ldp $acc4,$acc5,[$t1,#8*4] stp $acc6,$acc7,[$tp,#8*6] mov $tp,$t1 ldp $acc6,$acc7,[$t1,#8*6] b .Lsqr8x_outer_loop .align 4 .Lsqr8x_outer_break: // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0] ldp $t1,$t2,[sp,#8*1] ldp $a5,$a7,[$t0,#8*2] add $ap,$t0,#8*4 ldp $t3,$t0,[sp,#8*3] stp $acc0,$acc1,[$tp,#8*0] mul $acc0,$a1,$a1 stp $acc2,$acc3,[$tp,#8*2] umulh $a1,$a1,$a1 stp $acc4,$acc5,[$tp,#8*4] mul $a2,$a3,$a3 stp $acc6,$acc7,[$tp,#8*6] mov $tp,sp umulh $a3,$a3,$a3 adds $acc1,$a1,$t1,lsl#1 extr $t1,$t2,$t1,#63 sub $cnt,$num,#8*4 .Lsqr4x_shift_n_add: adcs $acc2,$a2,$t1 extr $t2,$t3,$t2,#63 sub $cnt,$cnt,#8*4 adcs $acc3,$a3,$t2 ldp $t1,$t2,[$tp,#8*5] mul $a4,$a5,$a5 ldp $a1,$a3,[$ap],#8*2 umulh $a5,$a5,$a5 mul $a6,$a7,$a7 umulh $a7,$a7,$a7 extr $t3,$t0,$t3,#63 stp $acc0,$acc1,[$tp,#8*0] adcs $acc4,$a4,$t3 extr $t0,$t1,$t0,#63 stp $acc2,$acc3,[$tp,#8*2] adcs $acc5,$a5,$t0 ldp $t3,$t0,[$tp,#8*7] extr $t1,$t2,$t1,#63 adcs $acc6,$a6,$t1 extr $t2,$t3,$t2,#63 adcs $acc7,$a7,$t2 ldp $t1,$t2,[$tp,#8*9] mul $a0,$a1,$a1 ldp $a5,$a7,[$ap],#8*2 umulh $a1,$a1,$a1 mul $a2,$a3,$a3 umulh $a3,$a3,$a3 stp $acc4,$acc5,[$tp,#8*4] extr $t3,$t0,$t3,#63 stp $acc6,$acc7,[$tp,#8*6] add $tp,$tp,#8*8 adcs $acc0,$a0,$t3 extr $t0,$t1,$t0,#63 adcs $acc1,$a1,$t0 ldp $t3,$t0,[$tp,#8*3] extr $t1,$t2,$t1,#63 cbnz $cnt,.Lsqr4x_shift_n_add ___ my ($np,$np_end)=($ap,$ap_end); $code.=<<___; ldp $np,$n0,[x29,#104] // pull np and n0 adcs $acc2,$a2,$t1 extr $t2,$t3,$t2,#63 adcs $acc3,$a3,$t2 ldp $t1,$t2,[$tp,#8*5] mul $a4,$a5,$a5 umulh $a5,$a5,$a5 stp $acc0,$acc1,[$tp,#8*0] mul $a6,$a7,$a7 umulh $a7,$a7,$a7 stp $acc2,$acc3,[$tp,#8*2] extr $t3,$t0,$t3,#63 adcs $acc4,$a4,$t3 extr $t0,$t1,$t0,#63 ldp $acc0,$acc1,[sp,#8*0] adcs $acc5,$a5,$t0 extr $t1,$t2,$t1,#63 ldp $a0,$a1,[$np,#8*0] adcs $acc6,$a6,$t1 extr $t2,xzr,$t2,#63 ldp $a2,$a3,[$np,#8*2] adc $acc7,$a7,$t2 ldp $a4,$a5,[$np,#8*4] // Reduce by 512 bits per iteration mul $na0,$n0,$acc0 // t[0]*n0 ldp $a6,$a7,[$np,#8*6] add $np_end,$np,$num ldp $acc2,$acc3,[sp,#8*2] stp $acc4,$acc5,[$tp,#8*4] ldp $acc4,$acc5,[sp,#8*4] stp $acc6,$acc7,[$tp,#8*6] ldp $acc6,$acc7,[sp,#8*6] add $np,$np,#8*8 mov $topmost,xzr // initial top-most carry mov $tp,sp mov $cnt,#8 .Lsqr8x_reduction: // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0) mul $t1,$a1,$na0 sub $cnt,$cnt,#1 mul $t2,$a2,$na0 str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing mul $t3,$a3,$na0 // (*) adds xzr,$acc0,$t0 subs xzr,$acc0,#1 // (*) mul $t0,$a4,$na0 adcs $acc0,$acc1,$t1 mul $t1,$a5,$na0 adcs $acc1,$acc2,$t2 mul $t2,$a6,$na0 adcs $acc2,$acc3,$t3 mul $t3,$a7,$na0 adcs $acc3,$acc4,$t0 umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0) adcs $acc4,$acc5,$t1 umulh $t1,$a1,$na0 adcs $acc5,$acc6,$t2 umulh $t2,$a2,$na0 adcs $acc6,$acc7,$t3 umulh $t3,$a3,$na0 adc $acc7,xzr,xzr adds $acc0,$acc0,$t0 umulh $t0,$a4,$na0 adcs $acc1,$acc1,$t1 umulh $t1,$a5,$na0 adcs $acc2,$acc2,$t2 umulh $t2,$a6,$na0 adcs $acc3,$acc3,$t3 umulh $t3,$a7,$na0 mul $na0,$n0,$acc0 // next t[0]*n0 adcs $acc4,$acc4,$t0 adcs $acc5,$acc5,$t1 adcs $acc6,$acc6,$t2 adc $acc7,$acc7,$t3 cbnz $cnt,.Lsqr8x_reduction ldp $t0,$t1,[$tp,#8*0] ldp $t2,$t3,[$tp,#8*2] mov $rp,$tp sub $cnt,$np_end,$np // done yet? 
adds $acc0,$acc0,$t0 adcs $acc1,$acc1,$t1 ldp $t0,$t1,[$tp,#8*4] adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 ldp $t2,$t3,[$tp,#8*6] adcs $acc4,$acc4,$t0 adcs $acc5,$acc5,$t1 adcs $acc6,$acc6,$t2 adcs $acc7,$acc7,$t3 //adc $carry,xzr,xzr // moved below cbz $cnt,.Lsqr8x8_post_condition ldur $n0,[$tp,#-8*8] ldp $a0,$a1,[$np,#8*0] ldp $a2,$a3,[$np,#8*2] ldp $a4,$a5,[$np,#8*4] mov $cnt,#-8*8 ldp $a6,$a7,[$np,#8*6] add $np,$np,#8*8 .Lsqr8x_tail: mul $t0,$a0,$n0 adc $carry,xzr,xzr // carry bit, modulo-scheduled mul $t1,$a1,$n0 add $cnt,$cnt,#8 mul $t2,$a2,$n0 mul $t3,$a3,$n0 adds $acc0,$acc0,$t0 mul $t0,$a4,$n0 adcs $acc1,$acc1,$t1 mul $t1,$a5,$n0 adcs $acc2,$acc2,$t2 mul $t2,$a6,$n0 adcs $acc3,$acc3,$t3 mul $t3,$a7,$n0 adcs $acc4,$acc4,$t0 umulh $t0,$a0,$n0 adcs $acc5,$acc5,$t1 umulh $t1,$a1,$n0 adcs $acc6,$acc6,$t2 umulh $t2,$a2,$n0 adcs $acc7,$acc7,$t3 umulh $t3,$a3,$n0 adc $carry,$carry,xzr str $acc0,[$tp],#8 adds $acc0,$acc1,$t0 umulh $t0,$a4,$n0 adcs $acc1,$acc2,$t1 umulh $t1,$a5,$n0 adcs $acc2,$acc3,$t2 umulh $t2,$a6,$n0 adcs $acc3,$acc4,$t3 umulh $t3,$a7,$n0 ldr $n0,[$rp,$cnt] adcs $acc4,$acc5,$t0 adcs $acc5,$acc6,$t1 adcs $acc6,$acc7,$t2 adcs $acc7,$carry,$t3 //adc $carry,xzr,xzr // moved above cbnz $cnt,.Lsqr8x_tail // note that carry flag is guaranteed // to be zero at this point ldp $a0,$a1,[$tp,#8*0] sub $cnt,$np_end,$np // done yet? sub $t2,$np_end,$num // rewinded np ldp $a2,$a3,[$tp,#8*2] ldp $a4,$a5,[$tp,#8*4] ldp $a6,$a7,[$tp,#8*6] cbz $cnt,.Lsqr8x_tail_break ldur $n0,[$rp,#-8*8] adds $acc0,$acc0,$a0 adcs $acc1,$acc1,$a1 ldp $a0,$a1,[$np,#8*0] adcs $acc2,$acc2,$a2 adcs $acc3,$acc3,$a3 ldp $a2,$a3,[$np,#8*2] adcs $acc4,$acc4,$a4 adcs $acc5,$acc5,$a5 ldp $a4,$a5,[$np,#8*4] adcs $acc6,$acc6,$a6 mov $cnt,#-8*8 adcs $acc7,$acc7,$a7 ldp $a6,$a7,[$np,#8*6] add $np,$np,#8*8 //adc $carry,xzr,xzr // moved above b .Lsqr8x_tail .align 4 .Lsqr8x_tail_break: ldr $n0,[x29,#112] // pull n0 add $cnt,$tp,#8*8 // end of current t[num] window subs xzr,$topmost,#1 // "move" top-most carry to carry bit adcs $t0,$acc0,$a0 adcs $t1,$acc1,$a1 ldp $acc0,$acc1,[$rp,#8*0] adcs $acc2,$acc2,$a2 ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0] adcs $acc3,$acc3,$a3 ldp $a2,$a3,[$t2,#8*2] adcs $acc4,$acc4,$a4 adcs $acc5,$acc5,$a5 ldp $a4,$a5,[$t2,#8*4] adcs $acc6,$acc6,$a6 adcs $acc7,$acc7,$a7 ldp $a6,$a7,[$t2,#8*6] add $np,$t2,#8*8 adc $topmost,xzr,xzr // top-most carry mul $na0,$n0,$acc0 stp $t0,$t1,[$tp,#8*0] stp $acc2,$acc3,[$tp,#8*2] ldp $acc2,$acc3,[$rp,#8*2] stp $acc4,$acc5,[$tp,#8*4] ldp $acc4,$acc5,[$rp,#8*4] cmp $cnt,x29 // did we hit the bottom? stp $acc6,$acc7,[$tp,#8*6] mov $tp,$rp // slide the window ldp $acc6,$acc7,[$rp,#8*6] mov $cnt,#8 b.ne .Lsqr8x_reduction // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. 
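	// Below, .Lsqr8x_sub streams t[]-n[] into rp[] while the borrow chain
	// runs, "sbcs xzr,$topmost,xzr" folds the top-most carry into that
	// borrow, and .Lsqr4x_cond_copy then uses csel ...,lo to copy the
	// original t[] back over rp[] if the subtraction borrowed, so no
	// conditional branch on the outcome is needed.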
ldr $rp,[x29,#96] // pull rp add $tp,$tp,#8*8 subs $t0,$acc0,$a0 sbcs $t1,$acc1,$a1 sub $cnt,$num,#8*8 mov $ap_end,$rp // $rp copy .Lsqr8x_sub: sbcs $t2,$acc2,$a2 ldp $a0,$a1,[$np,#8*0] sbcs $t3,$acc3,$a3 stp $t0,$t1,[$rp,#8*0] sbcs $t0,$acc4,$a4 ldp $a2,$a3,[$np,#8*2] sbcs $t1,$acc5,$a5 stp $t2,$t3,[$rp,#8*2] sbcs $t2,$acc6,$a6 ldp $a4,$a5,[$np,#8*4] sbcs $t3,$acc7,$a7 ldp $a6,$a7,[$np,#8*6] add $np,$np,#8*8 ldp $acc0,$acc1,[$tp,#8*0] sub $cnt,$cnt,#8*8 ldp $acc2,$acc3,[$tp,#8*2] ldp $acc4,$acc5,[$tp,#8*4] ldp $acc6,$acc7,[$tp,#8*6] add $tp,$tp,#8*8 stp $t0,$t1,[$rp,#8*4] sbcs $t0,$acc0,$a0 stp $t2,$t3,[$rp,#8*6] add $rp,$rp,#8*8 sbcs $t1,$acc1,$a1 cbnz $cnt,.Lsqr8x_sub sbcs $t2,$acc2,$a2 mov $tp,sp add $ap,sp,$num ldp $a0,$a1,[$ap_end,#8*0] sbcs $t3,$acc3,$a3 stp $t0,$t1,[$rp,#8*0] sbcs $t0,$acc4,$a4 ldp $a2,$a3,[$ap_end,#8*2] sbcs $t1,$acc5,$a5 stp $t2,$t3,[$rp,#8*2] sbcs $t2,$acc6,$a6 ldp $acc0,$acc1,[$ap,#8*0] sbcs $t3,$acc7,$a7 ldp $acc2,$acc3,[$ap,#8*2] sbcs xzr,$topmost,xzr // did it borrow? ldr x30,[x29,#8] // pull return address stp $t0,$t1,[$rp,#8*4] stp $t2,$t3,[$rp,#8*6] sub $cnt,$num,#8*4 .Lsqr4x_cond_copy: sub $cnt,$cnt,#8*4 csel $t0,$acc0,$a0,lo stp xzr,xzr,[$tp,#8*0] csel $t1,$acc1,$a1,lo ldp $a0,$a1,[$ap_end,#8*4] ldp $acc0,$acc1,[$ap,#8*4] csel $t2,$acc2,$a2,lo stp xzr,xzr,[$tp,#8*2] add $tp,$tp,#8*4 csel $t3,$acc3,$a3,lo ldp $a2,$a3,[$ap_end,#8*6] ldp $acc2,$acc3,[$ap,#8*6] add $ap,$ap,#8*4 stp $t0,$t1,[$ap_end,#8*0] stp $t2,$t3,[$ap_end,#8*2] add $ap_end,$ap_end,#8*4 stp xzr,xzr,[$ap,#8*0] stp xzr,xzr,[$ap,#8*2] cbnz $cnt,.Lsqr4x_cond_copy csel $t0,$acc0,$a0,lo stp xzr,xzr,[$tp,#8*0] csel $t1,$acc1,$a1,lo stp xzr,xzr,[$tp,#8*2] csel $t2,$acc2,$a2,lo csel $t3,$acc3,$a3,lo stp $t0,$t1,[$ap_end,#8*0] stp $t2,$t3,[$ap_end,#8*2] b .Lsqr8x_done .align 4 .Lsqr8x8_post_condition: adc $carry,xzr,xzr ldr x30,[x29,#8] // pull return address // $acc0-7,$carry hold result, $a0-7 hold modulus subs $a0,$acc0,$a0 ldr $ap,[x29,#96] // pull rp sbcs $a1,$acc1,$a1 stp xzr,xzr,[sp,#8*0] sbcs $a2,$acc2,$a2 stp xzr,xzr,[sp,#8*2] sbcs $a3,$acc3,$a3 stp xzr,xzr,[sp,#8*4] sbcs $a4,$acc4,$a4 stp xzr,xzr,[sp,#8*6] sbcs $a5,$acc5,$a5 stp xzr,xzr,[sp,#8*8] sbcs $a6,$acc6,$a6 stp xzr,xzr,[sp,#8*10] sbcs $a7,$acc7,$a7 stp xzr,xzr,[sp,#8*12] sbcs $carry,$carry,xzr // did it borrow? stp xzr,xzr,[sp,#8*14] // $a0-7 hold result-modulus csel $a0,$acc0,$a0,lo csel $a1,$acc1,$a1,lo csel $a2,$acc2,$a2,lo csel $a3,$acc3,$a3,lo stp $a0,$a1,[$ap,#8*0] csel $a4,$acc4,$a4,lo csel $a5,$acc5,$a5,lo stp $a2,$a3,[$ap,#8*2] csel $a6,$acc6,$a6,lo csel $a7,$acc7,$a7,lo stp $a4,$a5,[$ap,#8*4] stp $a6,$a7,[$ap,#8*6] .Lsqr8x_done: ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 - .inst 0xd50323bf // autiasp + // x30 is loaded earlier + AARCH64_VALIDATE_LINK_REGISTER ret .size __bn_sqr8x_mont,.-__bn_sqr8x_mont ___ } { ######################################################################## # Even though this might look as ARMv8 adaptation of mulx4x_mont from # x86_64-mont5 module, it's different in sense that it performs # reduction 256 bits at a time. 
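# In other words, the loops below fold the modulus into the accumulator four
# 64-bit limbs (n[0..3], i.e. 256 bits) at a time, putting the four t[0]*n0
# multipliers aside on the stack so the tail loops can apply them to the
# remaining limbs of n.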
my ($a0,$a1,$a2,$a3, $t0,$t1,$t2,$t3, $m0,$m1,$m2,$m3, $acc0,$acc1,$acc2,$acc3,$acc4, $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28)); my $bp_end=$rp; my ($carry,$topmost) = ($rp,"x30"); $code.=<<___; .type __bn_mul4x_mont,%function .align 5 __bn_mul4x_mont: - .inst 0xd503233f // paciasp + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to + // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address. stp x29,x30,[sp,#-128]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub $tp,sp,$num,lsl#3 lsl $num,$num,#3 ldr $n0,[$n0] // *n0 sub sp,$tp,#8*4 // alloca add $t0,$bp,$num add $ap_end,$ap,$num stp $rp,$t0,[x29,#96] // offload rp and &b[num] ldr $bi,[$bp,#8*0] // b[0] ldp $a0,$a1,[$ap,#8*0] // a[0..3] ldp $a2,$a3,[$ap,#8*2] add $ap,$ap,#8*4 mov $acc0,xzr mov $acc1,xzr mov $acc2,xzr mov $acc3,xzr ldp $m0,$m1,[$np,#8*0] // n[0..3] ldp $m2,$m3,[$np,#8*2] adds $np,$np,#8*4 // clear carry bit mov $carry,xzr mov $cnt,#0 mov $tp,sp .Loop_mul4x_1st_reduction: mul $t0,$a0,$bi // lo(a[0..3]*b[0]) adc $carry,$carry,xzr // modulo-scheduled mul $t1,$a1,$bi add $cnt,$cnt,#8 mul $t2,$a2,$bi and $cnt,$cnt,#31 mul $t3,$a3,$bi adds $acc0,$acc0,$t0 umulh $t0,$a0,$bi // hi(a[0..3]*b[0]) adcs $acc1,$acc1,$t1 mul $mi,$acc0,$n0 // t[0]*n0 adcs $acc2,$acc2,$t2 umulh $t1,$a1,$bi adcs $acc3,$acc3,$t3 umulh $t2,$a2,$bi adc $acc4,xzr,xzr umulh $t3,$a3,$bi ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) adds $acc1,$acc1,$t0 // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0) str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing adcs $acc2,$acc2,$t1 mul $t1,$m1,$mi adcs $acc3,$acc3,$t2 mul $t2,$m2,$mi adc $acc4,$acc4,$t3 // can't overflow mul $t3,$m3,$mi // (*) adds xzr,$acc0,$t0 subs xzr,$acc0,#1 // (*) umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0) adcs $acc0,$acc1,$t1 umulh $t1,$m1,$mi adcs $acc1,$acc2,$t2 umulh $t2,$m2,$mi adcs $acc2,$acc3,$t3 umulh $t3,$m3,$mi adcs $acc3,$acc4,$carry adc $carry,xzr,xzr adds $acc0,$acc0,$t0 sub $t0,$ap_end,$ap adcs $acc1,$acc1,$t1 adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 //adc $carry,$carry,xzr cbnz $cnt,.Loop_mul4x_1st_reduction cbz $t0,.Lmul4x4_post_condition ldp $a0,$a1,[$ap,#8*0] // a[4..7] ldp $a2,$a3,[$ap,#8*2] add $ap,$ap,#8*4 ldr $mi,[sp] // a[0]*n0 ldp $m0,$m1,[$np,#8*0] // n[4..7] ldp $m2,$m3,[$np,#8*2] add $np,$np,#8*4 .Loop_mul4x_1st_tail: mul $t0,$a0,$bi // lo(a[4..7]*b[i]) adc $carry,$carry,xzr // modulo-scheduled mul $t1,$a1,$bi add $cnt,$cnt,#8 mul $t2,$a2,$bi and $cnt,$cnt,#31 mul $t3,$a3,$bi adds $acc0,$acc0,$t0 umulh $t0,$a0,$bi // hi(a[4..7]*b[i]) adcs $acc1,$acc1,$t1 umulh $t1,$a1,$bi adcs $acc2,$acc2,$t2 umulh $t2,$a2,$bi adcs $acc3,$acc3,$t3 umulh $t3,$a3,$bi adc $acc4,xzr,xzr ldr $bi,[$bp,$cnt] // next b[i] (or b[0]) adds $acc1,$acc1,$t0 mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0) adcs $acc2,$acc2,$t1 mul $t1,$m1,$mi adcs $acc3,$acc3,$t2 mul $t2,$m2,$mi adc $acc4,$acc4,$t3 // can't overflow mul $t3,$m3,$mi adds $acc0,$acc0,$t0 umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0) adcs $acc1,$acc1,$t1 umulh $t1,$m1,$mi adcs $acc2,$acc2,$t2 umulh $t2,$m2,$mi adcs $acc3,$acc3,$t3 adcs $acc4,$acc4,$carry umulh $t3,$m3,$mi adc $carry,xzr,xzr ldr $mi,[sp,$cnt] // next t[0]*n0 str $acc0,[$tp],#8 // result!!! adds $acc0,$acc1,$t0 sub $t0,$ap_end,$ap // done yet? 
adcs $acc1,$acc2,$t1 adcs $acc2,$acc3,$t2 adcs $acc3,$acc4,$t3 //adc $carry,$carry,xzr cbnz $cnt,.Loop_mul4x_1st_tail sub $t1,$ap_end,$num // rewinded $ap cbz $t0,.Lmul4x_proceed ldp $a0,$a1,[$ap,#8*0] ldp $a2,$a3,[$ap,#8*2] add $ap,$ap,#8*4 ldp $m0,$m1,[$np,#8*0] ldp $m2,$m3,[$np,#8*2] add $np,$np,#8*4 b .Loop_mul4x_1st_tail .align 5 .Lmul4x_proceed: ldr $bi,[$bp,#8*4]! // *++b adc $topmost,$carry,xzr ldp $a0,$a1,[$t1,#8*0] // a[0..3] sub $np,$np,$num // rewind np ldp $a2,$a3,[$t1,#8*2] add $ap,$t1,#8*4 stp $acc0,$acc1,[$tp,#8*0] // result!!! ldp $acc0,$acc1,[sp,#8*4] // t[0..3] stp $acc2,$acc3,[$tp,#8*2] // result!!! ldp $acc2,$acc3,[sp,#8*6] ldp $m0,$m1,[$np,#8*0] // n[0..3] mov $tp,sp ldp $m2,$m3,[$np,#8*2] adds $np,$np,#8*4 // clear carry bit mov $carry,xzr .align 4 .Loop_mul4x_reduction: mul $t0,$a0,$bi // lo(a[0..3]*b[4]) adc $carry,$carry,xzr // modulo-scheduled mul $t1,$a1,$bi add $cnt,$cnt,#8 mul $t2,$a2,$bi and $cnt,$cnt,#31 mul $t3,$a3,$bi adds $acc0,$acc0,$t0 umulh $t0,$a0,$bi // hi(a[0..3]*b[4]) adcs $acc1,$acc1,$t1 mul $mi,$acc0,$n0 // t[0]*n0 adcs $acc2,$acc2,$t2 umulh $t1,$a1,$bi adcs $acc3,$acc3,$t3 umulh $t2,$a2,$bi adc $acc4,xzr,xzr umulh $t3,$a3,$bi ldr $bi,[$bp,$cnt] // next b[i] adds $acc1,$acc1,$t0 // (*) mul $t0,$m0,$mi str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing adcs $acc2,$acc2,$t1 mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0 adcs $acc3,$acc3,$t2 mul $t2,$m2,$mi adc $acc4,$acc4,$t3 // can't overflow mul $t3,$m3,$mi // (*) adds xzr,$acc0,$t0 subs xzr,$acc0,#1 // (*) umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0 adcs $acc0,$acc1,$t1 umulh $t1,$m1,$mi adcs $acc1,$acc2,$t2 umulh $t2,$m2,$mi adcs $acc2,$acc3,$t3 umulh $t3,$m3,$mi adcs $acc3,$acc4,$carry adc $carry,xzr,xzr adds $acc0,$acc0,$t0 adcs $acc1,$acc1,$t1 adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 //adc $carry,$carry,xzr cbnz $cnt,.Loop_mul4x_reduction adc $carry,$carry,xzr ldp $t0,$t1,[$tp,#8*4] // t[4..7] ldp $t2,$t3,[$tp,#8*6] ldp $a0,$a1,[$ap,#8*0] // a[4..7] ldp $a2,$a3,[$ap,#8*2] add $ap,$ap,#8*4 adds $acc0,$acc0,$t0 adcs $acc1,$acc1,$t1 adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 //adc $carry,$carry,xzr ldr $mi,[sp] // t[0]*n0 ldp $m0,$m1,[$np,#8*0] // n[4..7] ldp $m2,$m3,[$np,#8*2] add $np,$np,#8*4 .align 4 .Loop_mul4x_tail: mul $t0,$a0,$bi // lo(a[4..7]*b[4]) adc $carry,$carry,xzr // modulo-scheduled mul $t1,$a1,$bi add $cnt,$cnt,#8 mul $t2,$a2,$bi and $cnt,$cnt,#31 mul $t3,$a3,$bi adds $acc0,$acc0,$t0 umulh $t0,$a0,$bi // hi(a[4..7]*b[4]) adcs $acc1,$acc1,$t1 umulh $t1,$a1,$bi adcs $acc2,$acc2,$t2 umulh $t2,$a2,$bi adcs $acc3,$acc3,$t3 umulh $t3,$a3,$bi adc $acc4,xzr,xzr ldr $bi,[$bp,$cnt] // next b[i] adds $acc1,$acc1,$t0 mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0) adcs $acc2,$acc2,$t1 mul $t1,$m1,$mi adcs $acc3,$acc3,$t2 mul $t2,$m2,$mi adc $acc4,$acc4,$t3 // can't overflow mul $t3,$m3,$mi adds $acc0,$acc0,$t0 umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0) adcs $acc1,$acc1,$t1 umulh $t1,$m1,$mi adcs $acc2,$acc2,$t2 umulh $t2,$m2,$mi adcs $acc3,$acc3,$t3 umulh $t3,$m3,$mi adcs $acc4,$acc4,$carry ldr $mi,[sp,$cnt] // next a[0]*n0 adc $carry,xzr,xzr str $acc0,[$tp],#8 // result!!! adds $acc0,$acc1,$t0 sub $t0,$ap_end,$ap // done yet? adcs $acc1,$acc2,$t1 adcs $acc2,$acc3,$t2 adcs $acc3,$acc4,$t3 //adc $carry,$carry,xzr cbnz $cnt,.Loop_mul4x_tail sub $t1,$np,$num // rewinded np? 
adc $carry,$carry,xzr cbz $t0,.Loop_mul4x_break ldp $t0,$t1,[$tp,#8*4] ldp $t2,$t3,[$tp,#8*6] ldp $a0,$a1,[$ap,#8*0] ldp $a2,$a3,[$ap,#8*2] add $ap,$ap,#8*4 adds $acc0,$acc0,$t0 adcs $acc1,$acc1,$t1 adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 //adc $carry,$carry,xzr ldp $m0,$m1,[$np,#8*0] ldp $m2,$m3,[$np,#8*2] add $np,$np,#8*4 b .Loop_mul4x_tail .align 4 .Loop_mul4x_break: ldp $t2,$t3,[x29,#96] // pull rp and &b[num] adds $acc0,$acc0,$topmost add $bp,$bp,#8*4 // bp++ adcs $acc1,$acc1,xzr sub $ap,$ap,$num // rewind ap adcs $acc2,$acc2,xzr stp $acc0,$acc1,[$tp,#8*0] // result!!! adcs $acc3,$acc3,xzr ldp $acc0,$acc1,[sp,#8*4] // t[0..3] adc $topmost,$carry,xzr stp $acc2,$acc3,[$tp,#8*2] // result!!! cmp $bp,$t3 // done yet? ldp $acc2,$acc3,[sp,#8*6] ldp $m0,$m1,[$t1,#8*0] // n[0..3] ldp $m2,$m3,[$t1,#8*2] add $np,$t1,#8*4 b.eq .Lmul4x_post ldr $bi,[$bp] ldp $a0,$a1,[$ap,#8*0] // a[0..3] ldp $a2,$a3,[$ap,#8*2] adds $ap,$ap,#8*4 // clear carry bit mov $carry,xzr mov $tp,sp b .Loop_mul4x_reduction .align 4 .Lmul4x_post: // Final step. We see if result is larger than modulus, and // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. mov $rp,$t2 mov $ap_end,$t2 // $rp copy subs $t0,$acc0,$m0 add $tp,sp,#8*8 sbcs $t1,$acc1,$m1 sub $cnt,$num,#8*4 .Lmul4x_sub: sbcs $t2,$acc2,$m2 ldp $m0,$m1,[$np,#8*0] sub $cnt,$cnt,#8*4 ldp $acc0,$acc1,[$tp,#8*0] sbcs $t3,$acc3,$m3 ldp $m2,$m3,[$np,#8*2] add $np,$np,#8*4 ldp $acc2,$acc3,[$tp,#8*2] add $tp,$tp,#8*4 stp $t0,$t1,[$rp,#8*0] sbcs $t0,$acc0,$m0 stp $t2,$t3,[$rp,#8*2] add $rp,$rp,#8*4 sbcs $t1,$acc1,$m1 cbnz $cnt,.Lmul4x_sub sbcs $t2,$acc2,$m2 mov $tp,sp add $ap,sp,#8*4 ldp $a0,$a1,[$ap_end,#8*0] sbcs $t3,$acc3,$m3 stp $t0,$t1,[$rp,#8*0] ldp $a2,$a3,[$ap_end,#8*2] stp $t2,$t3,[$rp,#8*2] ldp $acc0,$acc1,[$ap,#8*0] ldp $acc2,$acc3,[$ap,#8*2] sbcs xzr,$topmost,xzr // did it borrow? ldr x30,[x29,#8] // pull return address sub $cnt,$num,#8*4 .Lmul4x_cond_copy: sub $cnt,$cnt,#8*4 csel $t0,$acc0,$a0,lo stp xzr,xzr,[$tp,#8*0] csel $t1,$acc1,$a1,lo ldp $a0,$a1,[$ap_end,#8*4] ldp $acc0,$acc1,[$ap,#8*4] csel $t2,$acc2,$a2,lo stp xzr,xzr,[$tp,#8*2] add $tp,$tp,#8*4 csel $t3,$acc3,$a3,lo ldp $a2,$a3,[$ap_end,#8*6] ldp $acc2,$acc3,[$ap,#8*6] add $ap,$ap,#8*4 stp $t0,$t1,[$ap_end,#8*0] stp $t2,$t3,[$ap_end,#8*2] add $ap_end,$ap_end,#8*4 cbnz $cnt,.Lmul4x_cond_copy csel $t0,$acc0,$a0,lo stp xzr,xzr,[$tp,#8*0] csel $t1,$acc1,$a1,lo stp xzr,xzr,[$tp,#8*2] csel $t2,$acc2,$a2,lo stp xzr,xzr,[$tp,#8*3] csel $t3,$acc3,$a3,lo stp xzr,xzr,[$tp,#8*4] stp $t0,$t1,[$ap_end,#8*0] stp $t2,$t3,[$ap_end,#8*2] b .Lmul4x_done .align 4 .Lmul4x4_post_condition: adc $carry,$carry,xzr ldr $ap,[x29,#96] // pull rp // $acc0-3,$carry hold result, $m0-7 hold modulus subs $a0,$acc0,$m0 ldr x30,[x29,#8] // pull return address sbcs $a1,$acc1,$m1 stp xzr,xzr,[sp,#8*0] sbcs $a2,$acc2,$m2 stp xzr,xzr,[sp,#8*2] sbcs $a3,$acc3,$m3 stp xzr,xzr,[sp,#8*4] sbcs xzr,$carry,xzr // did it borrow? 
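The .Lmul4x_post comment above states the idea behind this tail: comparing against the modulus is done by actually subtracting it, and only the resulting borrow decides which copy survives. A branch-free scalar model of that selection (helper name hypothetical; the assembly expresses the same thing with csel and the cond_copy loops):

    /* keep 'ret' when the subtraction borrowed (result was already < modulus),
     * otherwise take 'ret_minus_mod'; 'borrowed' is 0 or 1 */
    static void cond_copy(uint64_t *ret, const uint64_t *ret_minus_mod,
                          uint64_t borrowed, int num)
    {
        uint64_t mask = 0 - borrowed;              /* 1 -> all ones, 0 -> zero */
        for (int i = 0; i < num; i++)
            ret[i] = (ret[i] & mask) | (ret_minus_mod[i] & ~mask);
    }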
stp xzr,xzr,[sp,#8*6] // $a0-3 hold result-modulus csel $a0,$acc0,$a0,lo csel $a1,$acc1,$a1,lo csel $a2,$acc2,$a2,lo csel $a3,$acc3,$a3,lo stp $a0,$a1,[$ap,#8*0] stp $a2,$a3,[$ap,#8*2] .Lmul4x_done: ldp x19,x20,[x29,#16] mov sp,x29 ldp x21,x22,[x29,#32] mov x0,#1 ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldr x29,[sp],#128 - .inst 0xd50323bf // autiasp + // x30 loaded earlier + AARCH64_VALIDATE_LINK_REGISTER ret .size __bn_mul4x_mont,.-__bn_mul4x_mont ___ } $code.=<<___; .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by " .align 4 ___ print $code; close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/openssl/crypto/chacha/asm/chacha-armv8.pl b/crypto/openssl/crypto/chacha/asm/chacha-armv8.pl index dcdc4a04e367..e1a8b8159421 100755 --- a/crypto/openssl/crypto/chacha/asm/chacha-armv8.pl +++ b/crypto/openssl/crypto/chacha/asm/chacha-armv8.pl @@ -1,1293 +1,1293 @@ #! /usr/bin/env perl # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # June 2015 # # ChaCha20 for ARMv8. # # April 2019 # # Replace 3xNEON+1xIALU code path with 4+1. 4+1 is actually fastest # option on most(*), but not all, processors, yet 6+2 is retained. # This is because penalties are considered tolerable in comparison to # improvement on processors where 6+2 helps. Most notably +37% on # ThunderX2. It's server-oriented processor which will have to serve # as many requests as possible. While others are mostly clients, when # performance doesn't have to be absolute top-notch, just fast enough, # as majority of time is spent "entertaining" relatively slow human. # # Performance in cycles per byte out of large buffer. # # IALU/gcc-4.9 4xNEON+1xIALU 6xNEON+2xIALU # # Apple A7 5.50/+49% 2.72 1.60 # Cortex-A53 8.40/+80% 4.06 4.45(*) # Cortex-A57 8.06/+43% 4.15 4.40(*) # Denver 4.50/+82% 2.30 2.70(*) # X-Gene 9.50/+46% 8.20 8.90(*) # Mongoose 8.00/+44% 2.74 3.12(*) # Kryo 8.17/+50% 4.47 4.65(*) # ThunderX2 7.22/+48% 5.64 4.10 # # (*) slower than 4+1:-( # $output is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!"; *STDOUT=*OUT; sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; my $arg = pop; $arg = "#$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4)); my @x=map("x$_",(5..17,19..21)); my @d=map("x$_",(22..28,30)); sub ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); ( "&add_32 (@x[$a0],@x[$a0],@x[$b0])", "&add_32 (@x[$a1],@x[$a1],@x[$b1])", "&add_32 (@x[$a2],@x[$a2],@x[$b2])", "&add_32 (@x[$a3],@x[$a3],@x[$b3])", "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", "&ror_32 (@x[$d0],@x[$d0],16)", "&ror_32 (@x[$d1],@x[$d1],16)", "&ror_32 (@x[$d2],@x[$d2],16)", "&ror_32 (@x[$d3],@x[$d3],16)", "&add_32 (@x[$c0],@x[$c0],@x[$d0])", "&add_32 (@x[$c1],@x[$c1],@x[$d1])", "&add_32 (@x[$c2],@x[$c2],@x[$d2])", "&add_32 (@x[$c3],@x[$c3],@x[$d3])", "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", "&ror_32 (@x[$b0],@x[$b0],20)", "&ror_32 (@x[$b1],@x[$b1],20)", "&ror_32 (@x[$b2],@x[$b2],20)", "&ror_32 (@x[$b3],@x[$b3],20)", "&add_32 (@x[$a0],@x[$a0],@x[$b0])", "&add_32 (@x[$a1],@x[$a1],@x[$b1])", "&add_32 (@x[$a2],@x[$a2],@x[$b2])", "&add_32 (@x[$a3],@x[$a3],@x[$b3])", "&eor_32 (@x[$d0],@x[$d0],@x[$a0])", "&eor_32 (@x[$d1],@x[$d1],@x[$a1])", "&eor_32 (@x[$d2],@x[$d2],@x[$a2])", "&eor_32 (@x[$d3],@x[$d3],@x[$a3])", "&ror_32 (@x[$d0],@x[$d0],24)", "&ror_32 (@x[$d1],@x[$d1],24)", "&ror_32 (@x[$d2],@x[$d2],24)", "&ror_32 (@x[$d3],@x[$d3],24)", "&add_32 (@x[$c0],@x[$c0],@x[$d0])", "&add_32 (@x[$c1],@x[$c1],@x[$d1])", "&add_32 (@x[$c2],@x[$c2],@x[$d2])", "&add_32 (@x[$c3],@x[$c3],@x[$d3])", "&eor_32 (@x[$b0],@x[$b0],@x[$c0])", "&eor_32 (@x[$b1],@x[$b1],@x[$c1])", "&eor_32 (@x[$b2],@x[$b2],@x[$c2])", "&eor_32 (@x[$b3],@x[$b3],@x[$c3])", "&ror_32 (@x[$b0],@x[$b0],25)", "&ror_32 (@x[$b1],@x[$b1],25)", "&ror_32 (@x[$b2],@x[$b2],25)", "&ror_32 (@x[$b3],@x[$b3],25)" ); } $code.=<<___; +#include "arm_arch.h" #ifndef __KERNEL__ -# include "arm_arch.h" .extern OPENSSL_armcap_P .hidden OPENSSL_armcap_P #endif .text .align 5 .Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral .Lone: .long 1,2,3,4 .Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f .asciz "ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm" .globl ChaCha20_ctr32 .type ChaCha20_ctr32,%function .align 5 ChaCha20_ctr32: + AARCH64_SIGN_LINK_REGISTER cbz $len,.Labort cmp $len,#192 b.lo .Lshort #ifndef __KERNEL__ adrp x17,OPENSSL_armcap_P ldr w17,[x17,#:lo12:OPENSSL_armcap_P] tst w17,#ARMV7_NEON b.ne .LChaCha20_neon #endif .Lshort: - .inst 0xd503233f // paciasp stp x29,x30,[sp,#-96]! 
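sub ROUND above emits four interleaved copies of the ChaCha20 quarter round; the right-rotations by 16, 20, 24 and 25 it generates are the usual left-rotations by 16, 12, 8 and 7 written the other way around. For reference, a single quarter round in plain C (illustrative, not code taken from the module):

    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                     uint32_t *c, uint32_t *d)
    {
        *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
        *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
        *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
        *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
    }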
add x29,sp,#0 adr @x[0],.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#64 ldp @d[0],@d[1],[@x[0]] // load sigma ldp @d[2],@d[3],[$key] // load key ldp @d[4],@d[5],[$key,#16] ldp @d[6],@d[7],[$ctr] // load counter #ifdef __AARCH64EB__ ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 ror @d[4],@d[4],#32 ror @d[5],@d[5],#32 ror @d[6],@d[6],#32 ror @d[7],@d[7],#32 #endif .Loop_outer: mov.32 @x[0],@d[0] // unpack key block lsr @x[1],@d[0],#32 mov.32 @x[2],@d[1] lsr @x[3],@d[1],#32 mov.32 @x[4],@d[2] lsr @x[5],@d[2],#32 mov.32 @x[6],@d[3] lsr @x[7],@d[3],#32 mov.32 @x[8],@d[4] lsr @x[9],@d[4],#32 mov.32 @x[10],@d[5] lsr @x[11],@d[5],#32 mov.32 @x[12],@d[6] lsr @x[13],@d[6],#32 mov.32 @x[14],@d[7] lsr @x[15],@d[7],#32 mov $ctr,#10 subs $len,$len,#64 .Loop: sub $ctr,$ctr,#1 ___ foreach (&ROUND(0, 4, 8,12)) { eval; } foreach (&ROUND(0, 5,10,15)) { eval; } $code.=<<___; cbnz $ctr,.Loop add.32 @x[0],@x[0],@d[0] // accumulate key block add @x[1],@x[1],@d[0],lsr#32 add.32 @x[2],@x[2],@d[1] add @x[3],@x[3],@d[1],lsr#32 add.32 @x[4],@x[4],@d[2] add @x[5],@x[5],@d[2],lsr#32 add.32 @x[6],@x[6],@d[3] add @x[7],@x[7],@d[3],lsr#32 add.32 @x[8],@x[8],@d[4] add @x[9],@x[9],@d[4],lsr#32 add.32 @x[10],@x[10],@d[5] add @x[11],@x[11],@d[5],lsr#32 add.32 @x[12],@x[12],@d[6] add @x[13],@x[13],@d[6],lsr#32 add.32 @x[14],@x[14],@d[7] add @x[15],@x[15],@d[7],lsr#32 b.lo .Ltail add @x[0],@x[0],@x[1],lsl#32 // pack add @x[2],@x[2],@x[3],lsl#32 ldp @x[1],@x[3],[$inp,#0] // load input add @x[4],@x[4],@x[5],lsl#32 add @x[6],@x[6],@x[7],lsl#32 ldp @x[5],@x[7],[$inp,#16] add @x[8],@x[8],@x[9],lsl#32 add @x[10],@x[10],@x[11],lsl#32 ldp @x[9],@x[11],[$inp,#32] add @x[12],@x[12],@x[13],lsl#32 add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 #ifdef __AARCH64EB__ rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] rev @x[6],@x[6] rev @x[8],@x[8] rev @x[10],@x[10] rev @x[12],@x[12] rev @x[14],@x[14] #endif eor @x[0],@x[0],@x[1] eor @x[2],@x[2],@x[3] eor @x[4],@x[4],@x[5] eor @x[6],@x[6],@x[7] eor @x[8],@x[8],@x[9] eor @x[10],@x[10],@x[11] eor @x[12],@x[12],@x[13] eor @x[14],@x[14],@x[15] stp @x[0],@x[2],[$out,#0] // store output add @d[6],@d[6],#1 // increment counter stp @x[4],@x[6],[$out,#16] stp @x[8],@x[10],[$out,#32] stp @x[12],@x[14],[$out,#48] add $out,$out,#64 b.hi .Loop_outer ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 - .inst 0xd50323bf // autiasp .Labort: + AARCH64_VALIDATE_LINK_REGISTER ret .align 4 .Ltail: add $len,$len,#64 .Less_than_64: sub $out,$out,#1 add $inp,$inp,$len add $out,$out,$len add $ctr,sp,$len neg $len,$len add @x[0],@x[0],@x[1],lsl#32 // pack add @x[2],@x[2],@x[3],lsl#32 add @x[4],@x[4],@x[5],lsl#32 add @x[6],@x[6],@x[7],lsl#32 add @x[8],@x[8],@x[9],lsl#32 add @x[10],@x[10],@x[11],lsl#32 add @x[12],@x[12],@x[13],lsl#32 add @x[14],@x[14],@x[15],lsl#32 #ifdef __AARCH64EB__ rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] rev @x[6],@x[6] rev @x[8],@x[8] rev @x[10],@x[10] rev @x[12],@x[12] rev @x[14],@x[14] #endif stp @x[0],@x[2],[sp,#0] stp @x[4],@x[6],[sp,#16] stp @x[8],@x[10],[sp,#32] stp @x[12],@x[14],[sp,#48] .Loop_tail: ldrb w10,[$inp,$len] ldrb w11,[$ctr,$len] add $len,$len,#1 eor w10,w10,w11 strb w10,[$out,$len] cbnz $len,.Loop_tail stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp 
x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_ctr32,.-ChaCha20_ctr32 ___ {{{ my @K = map("v$_.4s",(0..3)); my ($xt0,$xt1,$xt2,$xt3, $CTR,$ROT24) = map("v$_.4s",(4..9)); my @X = map("v$_.4s",(16,20,24,28, 17,21,25,29, 18,22,26,30, 19,23,27,31)); my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @X; sub NEON_lane_ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my @x=map("'$_'",@X); ( "&add (@x[$a0],@x[$a0],@x[$b0])", # Q1 "&add (@x[$a1],@x[$a1],@x[$b1])", # Q2 "&add (@x[$a2],@x[$a2],@x[$b2])", # Q3 "&add (@x[$a3],@x[$a3],@x[$b3])", # Q4 "&eor (@x[$d0],@x[$d0],@x[$a0])", "&eor (@x[$d1],@x[$d1],@x[$a1])", "&eor (@x[$d2],@x[$d2],@x[$a2])", "&eor (@x[$d3],@x[$d3],@x[$a3])", "&rev32_16 (@x[$d0],@x[$d0])", "&rev32_16 (@x[$d1],@x[$d1])", "&rev32_16 (@x[$d2],@x[$d2])", "&rev32_16 (@x[$d3],@x[$d3])", "&add (@x[$c0],@x[$c0],@x[$d0])", "&add (@x[$c1],@x[$c1],@x[$d1])", "&add (@x[$c2],@x[$c2],@x[$d2])", "&add (@x[$c3],@x[$c3],@x[$d3])", "&eor ('$xt0',@x[$b0],@x[$c0])", "&eor ('$xt1',@x[$b1],@x[$c1])", "&eor ('$xt2',@x[$b2],@x[$c2])", "&eor ('$xt3',@x[$b3],@x[$c3])", "&ushr (@x[$b0],'$xt0',20)", "&ushr (@x[$b1],'$xt1',20)", "&ushr (@x[$b2],'$xt2',20)", "&ushr (@x[$b3],'$xt3',20)", "&sli (@x[$b0],'$xt0',12)", "&sli (@x[$b1],'$xt1',12)", "&sli (@x[$b2],'$xt2',12)", "&sli (@x[$b3],'$xt3',12)", "&add (@x[$a0],@x[$a0],@x[$b0])", "&add (@x[$a1],@x[$a1],@x[$b1])", "&add (@x[$a2],@x[$a2],@x[$b2])", "&add (@x[$a3],@x[$a3],@x[$b3])", "&eor ('$xt0',@x[$d0],@x[$a0])", "&eor ('$xt1',@x[$d1],@x[$a1])", "&eor ('$xt2',@x[$d2],@x[$a2])", "&eor ('$xt3',@x[$d3],@x[$a3])", "&tbl (@x[$d0],'{$xt0}','$ROT24')", "&tbl (@x[$d1],'{$xt1}','$ROT24')", "&tbl (@x[$d2],'{$xt2}','$ROT24')", "&tbl (@x[$d3],'{$xt3}','$ROT24')", "&add (@x[$c0],@x[$c0],@x[$d0])", "&add (@x[$c1],@x[$c1],@x[$d1])", "&add (@x[$c2],@x[$c2],@x[$d2])", "&add (@x[$c3],@x[$c3],@x[$d3])", "&eor ('$xt0',@x[$b0],@x[$c0])", "&eor ('$xt1',@x[$b1],@x[$c1])", "&eor ('$xt2',@x[$b2],@x[$c2])", "&eor ('$xt3',@x[$b3],@x[$c3])", "&ushr (@x[$b0],'$xt0',25)", "&ushr (@x[$b1],'$xt1',25)", "&ushr (@x[$b2],'$xt2',25)", "&ushr (@x[$b3],'$xt3',25)", "&sli (@x[$b0],'$xt0',7)", "&sli (@x[$b1],'$xt1',7)", "&sli (@x[$b2],'$xt2',7)", "&sli (@x[$b3],'$xt3',7)" ); } $code.=<<___; #ifdef __KERNEL__ .globl ChaCha20_neon #endif .type ChaCha20_neon,%function .align 5 ChaCha20_neon: + AARCH64_SIGN_LINK_REGISTER .LChaCha20_neon: - .inst 0xd503233f // paciasp stp x29,x30,[sp,#-96]! 
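Every change in these files follows the pattern visible just above: the hard-coded .inst encodings of paciasp (0xd503233f) and autiasp (0xd50323bf) are replaced with the AARCH64_SIGN_LINK_REGISTER and AARCH64_VALIDATE_LINK_REGISTER macros from arm_arch.h, so the same source can emit PAC-signed, BTI-only, or plain code depending on how the target is built. The real definitions live in crypto/arm_arch.h; the following is only an illustrative sketch of the intent, assuming the usual __ARM_FEATURE_* feature-test macros:

    #if defined(__ARM_FEATURE_PAC_DEFAULT) && (__ARM_FEATURE_PAC_DEFAULT & 1)
    # define AARCH64_SIGN_LINK_REGISTER      hint #25   /* paciasp */
    # define AARCH64_VALIDATE_LINK_REGISTER  hint #29   /* autiasp */
    #elif defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
    # define AARCH64_SIGN_LINK_REGISTER      hint #34   /* bti c: landing pad only */
    # define AARCH64_VALIDATE_LINK_REGISTER
    #else
    # define AARCH64_SIGN_LINK_REGISTER
    # define AARCH64_VALIDATE_LINK_REGISTER
    #endif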
add x29,sp,#0 adr @x[0],.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] cmp $len,#512 b.hs .L512_or_more_neon sub sp,sp,#64 ldp @d[0],@d[1],[@x[0]] // load sigma ld1 {@K[0]},[@x[0]],#16 ldp @d[2],@d[3],[$key] // load key ldp @d[4],@d[5],[$key,#16] ld1 {@K[1],@K[2]},[$key] ldp @d[6],@d[7],[$ctr] // load counter ld1 {@K[3]},[$ctr] stp d8,d9,[sp] // meet ABI requirements ld1 {$CTR,$ROT24},[@x[0]] #ifdef __AARCH64EB__ rev64 @K[0],@K[0] ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 ror @d[4],@d[4],#32 ror @d[5],@d[5],#32 ror @d[6],@d[6],#32 ror @d[7],@d[7],#32 #endif .Loop_outer_neon: dup $xa0,@{K[0]}[0] // unpack key block mov.32 @x[0],@d[0] dup $xa1,@{K[0]}[1] lsr @x[1],@d[0],#32 dup $xa2,@{K[0]}[2] mov.32 @x[2],@d[1] dup $xa3,@{K[0]}[3] lsr @x[3],@d[1],#32 dup $xb0,@{K[1]}[0] mov.32 @x[4],@d[2] dup $xb1,@{K[1]}[1] lsr @x[5],@d[2],#32 dup $xb2,@{K[1]}[2] mov.32 @x[6],@d[3] dup $xb3,@{K[1]}[3] lsr @x[7],@d[3],#32 dup $xd0,@{K[3]}[0] mov.32 @x[8],@d[4] dup $xd1,@{K[3]}[1] lsr @x[9],@d[4],#32 dup $xd2,@{K[3]}[2] mov.32 @x[10],@d[5] dup $xd3,@{K[3]}[3] lsr @x[11],@d[5],#32 add $xd0,$xd0,$CTR mov.32 @x[12],@d[6] dup $xc0,@{K[2]}[0] lsr @x[13],@d[6],#32 dup $xc1,@{K[2]}[1] mov.32 @x[14],@d[7] dup $xc2,@{K[2]}[2] lsr @x[15],@d[7],#32 dup $xc3,@{K[2]}[3] mov $ctr,#10 subs $len,$len,#320 .Loop_neon: sub $ctr,$ctr,#1 ___ my @plus_one=&ROUND(0,4,8,12); foreach (&NEON_lane_ROUND(0,4,8,12)) { eval; eval(shift(@plus_one)); } @plus_one=&ROUND(0,5,10,15); foreach (&NEON_lane_ROUND(0,5,10,15)) { eval; eval(shift(@plus_one)); } $code.=<<___; cbnz $ctr,.Loop_neon add $xd0,$xd0,$CTR zip1 $xt0,$xa0,$xa1 // transpose data zip1 $xt1,$xa2,$xa3 zip2 $xt2,$xa0,$xa1 zip2 $xt3,$xa2,$xa3 zip1.64 $xa0,$xt0,$xt1 zip2.64 $xa1,$xt0,$xt1 zip1.64 $xa2,$xt2,$xt3 zip2.64 $xa3,$xt2,$xt3 zip1 $xt0,$xb0,$xb1 zip1 $xt1,$xb2,$xb3 zip2 $xt2,$xb0,$xb1 zip2 $xt3,$xb2,$xb3 zip1.64 $xb0,$xt0,$xt1 zip2.64 $xb1,$xt0,$xt1 zip1.64 $xb2,$xt2,$xt3 zip2.64 $xb3,$xt2,$xt3 zip1 $xt0,$xc0,$xc1 add.32 @x[0],@x[0],@d[0] // accumulate key block zip1 $xt1,$xc2,$xc3 add @x[1],@x[1],@d[0],lsr#32 zip2 $xt2,$xc0,$xc1 add.32 @x[2],@x[2],@d[1] zip2 $xt3,$xc2,$xc3 add @x[3],@x[3],@d[1],lsr#32 zip1.64 $xc0,$xt0,$xt1 add.32 @x[4],@x[4],@d[2] zip2.64 $xc1,$xt0,$xt1 add @x[5],@x[5],@d[2],lsr#32 zip1.64 $xc2,$xt2,$xt3 add.32 @x[6],@x[6],@d[3] zip2.64 $xc3,$xt2,$xt3 add @x[7],@x[7],@d[3],lsr#32 zip1 $xt0,$xd0,$xd1 add.32 @x[8],@x[8],@d[4] zip1 $xt1,$xd2,$xd3 add @x[9],@x[9],@d[4],lsr#32 zip2 $xt2,$xd0,$xd1 add.32 @x[10],@x[10],@d[5] zip2 $xt3,$xd2,$xd3 add @x[11],@x[11],@d[5],lsr#32 zip1.64 $xd0,$xt0,$xt1 add.32 @x[12],@x[12],@d[6] zip2.64 $xd1,$xt0,$xt1 add @x[13],@x[13],@d[6],lsr#32 zip1.64 $xd2,$xt2,$xt3 add.32 @x[14],@x[14],@d[7] zip2.64 $xd3,$xt2,$xt3 add @x[15],@x[15],@d[7],lsr#32 b.lo .Ltail_neon add @x[0],@x[0],@x[1],lsl#32 // pack add @x[2],@x[2],@x[3],lsl#32 ldp @x[1],@x[3],[$inp,#0] // load input add $xa0,$xa0,@K[0] // accumulate key block add @x[4],@x[4],@x[5],lsl#32 add @x[6],@x[6],@x[7],lsl#32 ldp @x[5],@x[7],[$inp,#16] add $xb0,$xb0,@K[1] add @x[8],@x[8],@x[9],lsl#32 add @x[10],@x[10],@x[11],lsl#32 ldp @x[9],@x[11],[$inp,#32] add $xc0,$xc0,@K[2] add @x[12],@x[12],@x[13],lsl#32 add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $xd0,$xd0,@K[3] add $inp,$inp,#64 #ifdef __AARCH64EB__ rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] rev @x[6],@x[6] rev @x[8],@x[8] rev @x[10],@x[10] rev @x[12],@x[12] rev @x[14],@x[14] #endif ld1.8 {$xt0-$xt3},[$inp],#64 eor @x[0],@x[0],@x[1] add 
$xa1,$xa1,@K[0] eor @x[2],@x[2],@x[3] add $xb1,$xb1,@K[1] eor @x[4],@x[4],@x[5] add $xc1,$xc1,@K[2] eor @x[6],@x[6],@x[7] add $xd1,$xd1,@K[3] eor @x[8],@x[8],@x[9] eor $xa0,$xa0,$xt0 movi $xt0,#5 eor @x[10],@x[10],@x[11] eor $xb0,$xb0,$xt1 eor @x[12],@x[12],@x[13] eor $xc0,$xc0,$xt2 eor @x[14],@x[14],@x[15] eor $xd0,$xd0,$xt3 add $CTR,$CTR,$xt0 // += 5 ld1.8 {$xt0-$xt3},[$inp],#64 stp @x[0],@x[2],[$out,#0] // store output add @d[6],@d[6],#5 // increment counter stp @x[4],@x[6],[$out,#16] stp @x[8],@x[10],[$out,#32] stp @x[12],@x[14],[$out,#48] add $out,$out,#64 st1.8 {$xa0-$xd0},[$out],#64 add $xa2,$xa2,@K[0] add $xb2,$xb2,@K[1] add $xc2,$xc2,@K[2] add $xd2,$xd2,@K[3] ld1.8 {$xa0-$xd0},[$inp],#64 eor $xa1,$xa1,$xt0 eor $xb1,$xb1,$xt1 eor $xc1,$xc1,$xt2 eor $xd1,$xd1,$xt3 st1.8 {$xa1-$xd1},[$out],#64 add $xa3,$xa3,@K[0] add $xb3,$xb3,@K[1] add $xc3,$xc3,@K[2] add $xd3,$xd3,@K[3] ld1.8 {$xa1-$xd1},[$inp],#64 eor $xa2,$xa2,$xa0 eor $xb2,$xb2,$xb0 eor $xc2,$xc2,$xc0 eor $xd2,$xd2,$xd0 st1.8 {$xa2-$xd2},[$out],#64 eor $xa3,$xa3,$xa1 eor $xb3,$xb3,$xb1 eor $xc3,$xc3,$xc1 eor $xd3,$xd3,$xd1 st1.8 {$xa3-$xd3},[$out],#64 b.hi .Loop_outer_neon ldp d8,d9,[sp] // meet ABI requirements ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .align 4 .Ltail_neon: add $len,$len,#320 ldp d8,d9,[sp] // meet ABI requirements cmp $len,#64 b.lo .Less_than_64 add @x[0],@x[0],@x[1],lsl#32 // pack add @x[2],@x[2],@x[3],lsl#32 ldp @x[1],@x[3],[$inp,#0] // load input add @x[4],@x[4],@x[5],lsl#32 add @x[6],@x[6],@x[7],lsl#32 ldp @x[5],@x[7],[$inp,#16] add @x[8],@x[8],@x[9],lsl#32 add @x[10],@x[10],@x[11],lsl#32 ldp @x[9],@x[11],[$inp,#32] add @x[12],@x[12],@x[13],lsl#32 add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 #ifdef __AARCH64EB__ rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] rev @x[6],@x[6] rev @x[8],@x[8] rev @x[10],@x[10] rev @x[12],@x[12] rev @x[14],@x[14] #endif eor @x[0],@x[0],@x[1] eor @x[2],@x[2],@x[3] eor @x[4],@x[4],@x[5] eor @x[6],@x[6],@x[7] eor @x[8],@x[8],@x[9] eor @x[10],@x[10],@x[11] eor @x[12],@x[12],@x[13] eor @x[14],@x[14],@x[15] stp @x[0],@x[2],[$out,#0] // store output add $xa0,$xa0,@K[0] // accumulate key block stp @x[4],@x[6],[$out,#16] add $xb0,$xb0,@K[1] stp @x[8],@x[10],[$out,#32] add $xc0,$xc0,@K[2] stp @x[12],@x[14],[$out,#48] add $xd0,$xd0,@K[3] add $out,$out,#64 b.eq .Ldone_neon sub $len,$len,#64 cmp $len,#64 b.lo .Last_neon ld1.8 {$xt0-$xt3},[$inp],#64 eor $xa0,$xa0,$xt0 eor $xb0,$xb0,$xt1 eor $xc0,$xc0,$xt2 eor $xd0,$xd0,$xt3 st1.8 {$xa0-$xd0},[$out],#64 b.eq .Ldone_neon add $xa0,$xa1,@K[0] add $xb0,$xb1,@K[1] sub $len,$len,#64 add $xc0,$xc1,@K[2] cmp $len,#64 add $xd0,$xd1,@K[3] b.lo .Last_neon ld1.8 {$xt0-$xt3},[$inp],#64 eor $xa1,$xa0,$xt0 eor $xb1,$xb0,$xt1 eor $xc1,$xc0,$xt2 eor $xd1,$xd0,$xt3 st1.8 {$xa1-$xd1},[$out],#64 b.eq .Ldone_neon add $xa0,$xa2,@K[0] add $xb0,$xb2,@K[1] sub $len,$len,#64 add $xc0,$xc2,@K[2] cmp $len,#64 add $xd0,$xd2,@K[3] b.lo .Last_neon ld1.8 {$xt0-$xt3},[$inp],#64 eor $xa2,$xa0,$xt0 eor $xb2,$xb0,$xt1 eor $xc2,$xc0,$xt2 eor $xd2,$xd0,$xt3 st1.8 {$xa2-$xd2},[$out],#64 b.eq .Ldone_neon add $xa0,$xa3,@K[0] add $xb0,$xb3,@K[1] add $xc0,$xc3,@K[2] add $xd0,$xd3,@K[3] sub $len,$len,#64 .Last_neon: st1.8 {$xa0-$xd0},[sp] sub $out,$out,#1 add $inp,$inp,$len add $out,$out,$len add $ctr,sp,$len neg $len,$len .Loop_tail_neon: ldrb w10,[$inp,$len] ldrb w11,[$ctr,$len] 
add $len,$len,#1 eor w10,w10,w11 strb w10,[$out,$len] cbnz $len,.Loop_tail_neon stp xzr,xzr,[sp,#0] stp xzr,xzr,[sp,#16] stp xzr,xzr,[sp,#32] stp xzr,xzr,[sp,#48] .Ldone_neon: ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_neon,.-ChaCha20_neon ___ { my @K = map("v$_.4s",(0..6)); my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(8..31)); my $rot24 = @K[6]; my $ONE = "v7.4s"; sub NEONROUND { my $odd = pop; my ($a,$b,$c,$d,$t)=@_; ( "&add ('$a','$a','$b')", "&eor ('$d','$d','$a')", "&rev32_16 ('$d','$d')", # vrot ($d,16) "&add ('$c','$c','$d')", "&eor ('$t','$b','$c')", "&ushr ('$b','$t',20)", "&sli ('$b','$t',12)", "&add ('$a','$a','$b')", "&eor ('$d','$d','$a')", "&tbl ('$d','{$d}','$rot24')", "&add ('$c','$c','$d')", "&eor ('$t','$b','$c')", "&ushr ('$b','$t',25)", "&sli ('$b','$t',7)", "&ext ('$c','$c','$c',8)", "&ext ('$d','$d','$d',$odd?4:12)", "&ext ('$b','$b','$b',$odd?12:4)" ); } $code.=<<___; .type ChaCha20_512_neon,%function .align 5 ChaCha20_512_neon: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 adr @x[0],.Lsigma stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] .L512_or_more_neon: sub sp,sp,#128+64 eor $ONE,$ONE,$ONE ldp @d[0],@d[1],[@x[0]] // load sigma ld1 {@K[0]},[@x[0]],#16 ldp @d[2],@d[3],[$key] // load key ldp @d[4],@d[5],[$key,#16] ld1 {@K[1],@K[2]},[$key] ldp @d[6],@d[7],[$ctr] // load counter ld1 {@K[3]},[$ctr] ld1 {$ONE}[0],[@x[0]] add $key,@x[0],#16 // .Lrot24 #ifdef __AARCH64EB__ rev64 @K[0],@K[0] ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 ror @d[4],@d[4],#32 ror @d[5],@d[5],#32 ror @d[6],@d[6],#32 ror @d[7],@d[7],#32 #endif add @K[3],@K[3],$ONE // += 1 stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part add @K[3],@K[3],$ONE // not typo str @K[2],[sp,#32] add @K[4],@K[3],$ONE add @K[5],@K[4],$ONE add @K[6],@K[5],$ONE shl $ONE,$ONE,#2 // 1 -> 4 stp d8,d9,[sp,#128+0] // meet ABI requirements stp d10,d11,[sp,#128+16] stp d12,d13,[sp,#128+32] stp d14,d15,[sp,#128+48] sub $len,$len,#512 // not typo .Loop_outer_512_neon: mov $A0,@K[0] mov $A1,@K[0] mov $A2,@K[0] mov $A3,@K[0] mov $A4,@K[0] mov $A5,@K[0] mov $B0,@K[1] mov.32 @x[0],@d[0] // unpack key block mov $B1,@K[1] lsr @x[1],@d[0],#32 mov $B2,@K[1] mov.32 @x[2],@d[1] mov $B3,@K[1] lsr @x[3],@d[1],#32 mov $B4,@K[1] mov.32 @x[4],@d[2] mov $B5,@K[1] lsr @x[5],@d[2],#32 mov $D0,@K[3] mov.32 @x[6],@d[3] mov $D1,@K[4] lsr @x[7],@d[3],#32 mov $D2,@K[5] mov.32 @x[8],@d[4] mov $D3,@K[6] lsr @x[9],@d[4],#32 mov $C0,@K[2] mov.32 @x[10],@d[5] mov $C1,@K[2] lsr @x[11],@d[5],#32 add $D4,$D0,$ONE // +4 mov.32 @x[12],@d[6] add $D5,$D1,$ONE // +4 lsr @x[13],@d[6],#32 mov $C2,@K[2] mov.32 @x[14],@d[7] mov $C3,@K[2] lsr @x[15],@d[7],#32 mov $C4,@K[2] stp @K[3],@K[4],[sp,#48] // off-load key block, variable part mov $C5,@K[2] stp @K[5],@K[6],[sp,#80] mov $ctr,#5 ld1 {$rot24},[$key] subs $len,$len,#512 .Loop_upper_neon: sub $ctr,$ctr,#1 ___ my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); my 
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); my $diff = ($#thread0+1)*6 - $#thread67 - 1; my $i = 0; foreach (@thread0) { eval; eval(shift(@thread67)); eval(shift(@thread1)); eval(shift(@thread67)); eval(shift(@thread2)); eval(shift(@thread67)); eval(shift(@thread3)); eval(shift(@thread67)); eval(shift(@thread4)); eval(shift(@thread67)); eval(shift(@thread5)); eval(shift(@thread67)); } @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); foreach (@thread0) { eval; eval(shift(@thread67)); eval(shift(@thread1)); eval(shift(@thread67)); eval(shift(@thread2)); eval(shift(@thread67)); eval(shift(@thread3)); eval(shift(@thread67)); eval(shift(@thread4)); eval(shift(@thread67)); eval(shift(@thread5)); eval(shift(@thread67)); } $code.=<<___; cbnz $ctr,.Loop_upper_neon add.32 @x[0],@x[0],@d[0] // accumulate key block add @x[1],@x[1],@d[0],lsr#32 add.32 @x[2],@x[2],@d[1] add @x[3],@x[3],@d[1],lsr#32 add.32 @x[4],@x[4],@d[2] add @x[5],@x[5],@d[2],lsr#32 add.32 @x[6],@x[6],@d[3] add @x[7],@x[7],@d[3],lsr#32 add.32 @x[8],@x[8],@d[4] add @x[9],@x[9],@d[4],lsr#32 add.32 @x[10],@x[10],@d[5] add @x[11],@x[11],@d[5],lsr#32 add.32 @x[12],@x[12],@d[6] add @x[13],@x[13],@d[6],lsr#32 add.32 @x[14],@x[14],@d[7] add @x[15],@x[15],@d[7],lsr#32 add @x[0],@x[0],@x[1],lsl#32 // pack add @x[2],@x[2],@x[3],lsl#32 ldp @x[1],@x[3],[$inp,#0] // load input add @x[4],@x[4],@x[5],lsl#32 add @x[6],@x[6],@x[7],lsl#32 ldp @x[5],@x[7],[$inp,#16] add @x[8],@x[8],@x[9],lsl#32 add @x[10],@x[10],@x[11],lsl#32 ldp @x[9],@x[11],[$inp,#32] add @x[12],@x[12],@x[13],lsl#32 add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 #ifdef __AARCH64EB__ rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] rev @x[6],@x[6] rev @x[8],@x[8] rev @x[10],@x[10] rev @x[12],@x[12] rev @x[14],@x[14] #endif eor @x[0],@x[0],@x[1] eor @x[2],@x[2],@x[3] eor @x[4],@x[4],@x[5] eor @x[6],@x[6],@x[7] eor @x[8],@x[8],@x[9] eor @x[10],@x[10],@x[11] eor @x[12],@x[12],@x[13] eor @x[14],@x[14],@x[15] stp @x[0],@x[2],[$out,#0] // store output add @d[6],@d[6],#1 // increment counter mov.32 @x[0],@d[0] // unpack key block lsr @x[1],@d[0],#32 stp @x[4],@x[6],[$out,#16] mov.32 @x[2],@d[1] lsr @x[3],@d[1],#32 stp @x[8],@x[10],[$out,#32] mov.32 @x[4],@d[2] lsr @x[5],@d[2],#32 stp @x[12],@x[14],[$out,#48] add $out,$out,#64 mov.32 @x[6],@d[3] lsr @x[7],@d[3],#32 mov.32 @x[8],@d[4] lsr @x[9],@d[4],#32 mov.32 @x[10],@d[5] lsr @x[11],@d[5],#32 mov.32 @x[12],@d[6] lsr @x[13],@d[6],#32 mov.32 @x[14],@d[7] lsr @x[15],@d[7],#32 mov $ctr,#5 .Loop_lower_neon: sub $ctr,$ctr,#1 ___ @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); foreach (@thread0) { eval; eval(shift(@thread67)); eval(shift(@thread1)); eval(shift(@thread67)); eval(shift(@thread2)); eval(shift(@thread67)); eval(shift(@thread3)); eval(shift(@thread67)); eval(shift(@thread4)); eval(shift(@thread67)); eval(shift(@thread5)); eval(shift(@thread67)); } @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); 
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); foreach (@thread0) { eval; eval(shift(@thread67)); eval(shift(@thread1)); eval(shift(@thread67)); eval(shift(@thread2)); eval(shift(@thread67)); eval(shift(@thread3)); eval(shift(@thread67)); eval(shift(@thread4)); eval(shift(@thread67)); eval(shift(@thread5)); eval(shift(@thread67)); } $code.=<<___; cbnz $ctr,.Loop_lower_neon add.32 @x[0],@x[0],@d[0] // accumulate key block ldp @K[0],@K[1],[sp,#0] add @x[1],@x[1],@d[0],lsr#32 ldp @K[2],@K[3],[sp,#32] add.32 @x[2],@x[2],@d[1] ldp @K[4],@K[5],[sp,#64] add @x[3],@x[3],@d[1],lsr#32 ldr @K[6],[sp,#96] add $A0,$A0,@K[0] add.32 @x[4],@x[4],@d[2] add $A1,$A1,@K[0] add @x[5],@x[5],@d[2],lsr#32 add $A2,$A2,@K[0] add.32 @x[6],@x[6],@d[3] add $A3,$A3,@K[0] add @x[7],@x[7],@d[3],lsr#32 add $A4,$A4,@K[0] add.32 @x[8],@x[8],@d[4] add $A5,$A5,@K[0] add @x[9],@x[9],@d[4],lsr#32 add $C0,$C0,@K[2] add.32 @x[10],@x[10],@d[5] add $C1,$C1,@K[2] add @x[11],@x[11],@d[5],lsr#32 add $C2,$C2,@K[2] add.32 @x[12],@x[12],@d[6] add $C3,$C3,@K[2] add @x[13],@x[13],@d[6],lsr#32 add $C4,$C4,@K[2] add.32 @x[14],@x[14],@d[7] add $C5,$C5,@K[2] add @x[15],@x[15],@d[7],lsr#32 add $D4,$D4,$ONE // +4 add @x[0],@x[0],@x[1],lsl#32 // pack add $D5,$D5,$ONE // +4 add @x[2],@x[2],@x[3],lsl#32 add $D0,$D0,@K[3] ldp @x[1],@x[3],[$inp,#0] // load input add $D1,$D1,@K[4] add @x[4],@x[4],@x[5],lsl#32 add $D2,$D2,@K[5] add @x[6],@x[6],@x[7],lsl#32 add $D3,$D3,@K[6] ldp @x[5],@x[7],[$inp,#16] add $D4,$D4,@K[3] add @x[8],@x[8],@x[9],lsl#32 add $D5,$D5,@K[4] add @x[10],@x[10],@x[11],lsl#32 add $B0,$B0,@K[1] ldp @x[9],@x[11],[$inp,#32] add $B1,$B1,@K[1] add @x[12],@x[12],@x[13],lsl#32 add $B2,$B2,@K[1] add @x[14],@x[14],@x[15],lsl#32 add $B3,$B3,@K[1] ldp @x[13],@x[15],[$inp,#48] add $B4,$B4,@K[1] add $inp,$inp,#64 add $B5,$B5,@K[1] #ifdef __AARCH64EB__ rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] rev @x[6],@x[6] rev @x[8],@x[8] rev @x[10],@x[10] rev @x[12],@x[12] rev @x[14],@x[14] #endif ld1.8 {$T0-$T3},[$inp],#64 eor @x[0],@x[0],@x[1] eor @x[2],@x[2],@x[3] eor @x[4],@x[4],@x[5] eor @x[6],@x[6],@x[7] eor @x[8],@x[8],@x[9] eor $A0,$A0,$T0 eor @x[10],@x[10],@x[11] eor $B0,$B0,$T1 eor @x[12],@x[12],@x[13] eor $C0,$C0,$T2 eor @x[14],@x[14],@x[15] eor $D0,$D0,$T3 ld1.8 {$T0-$T3},[$inp],#64 stp @x[0],@x[2],[$out,#0] // store output add @d[6],@d[6],#7 // increment counter stp @x[4],@x[6],[$out,#16] stp @x[8],@x[10],[$out,#32] stp @x[12],@x[14],[$out,#48] add $out,$out,#64 st1.8 {$A0-$D0},[$out],#64 ld1.8 {$A0-$D0},[$inp],#64 eor $A1,$A1,$T0 eor $B1,$B1,$T1 eor $C1,$C1,$T2 eor $D1,$D1,$T3 st1.8 {$A1-$D1},[$out],#64 ld1.8 {$A1-$D1},[$inp],#64 eor $A2,$A2,$A0 ldp @K[0],@K[1],[sp,#0] eor $B2,$B2,$B0 ldp @K[2],@K[3],[sp,#32] eor $C2,$C2,$C0 eor $D2,$D2,$D0 st1.8 {$A2-$D2},[$out],#64 ld1.8 {$A2-$D2},[$inp],#64 eor $A3,$A3,$A1 eor $B3,$B3,$B1 eor $C3,$C3,$C1 eor $D3,$D3,$D1 st1.8 {$A3-$D3},[$out],#64 ld1.8 {$A3-$D3},[$inp],#64 eor $A4,$A4,$A2 eor $B4,$B4,$B2 eor $C4,$C4,$C2 eor $D4,$D4,$D2 st1.8 {$A4-$D4},[$out],#64 shl $A0,$ONE,#1 // 4 -> 8 eor $A5,$A5,$A3 eor $B5,$B5,$B3 eor $C5,$C5,$C3 eor $D5,$D5,$D3 st1.8 {$A5-$D5},[$out],#64 add @K[3],@K[3],$A0 // += 8 add @K[4],@K[4],$A0 add @K[5],@K[5],$A0 add @K[6],@K[6],$A0 b.hs .Loop_outer_512_neon adds $len,$len,#512 ushr $ONE,$ONE,#1 // 4 -> 2 ldp d10,d11,[sp,#128+16] // meet ABI requirements ldp d12,d13,[sp,#128+32] ldp d14,d15,[sp,#128+48] stp @K[0],@K[0],[sp,#0] // wipe 
off-load area stp @K[0],@K[0],[sp,#32] stp @K[0],@K[0],[sp,#64] b.eq .Ldone_512_neon sub $key,$key,#16 // .Lone cmp $len,#192 add sp,sp,#128 sub @K[3],@K[3],$ONE // -= 2 ld1 {$CTR,$ROT24},[$key] b.hs .Loop_outer_neon ldp d8,d9,[sp,#0] // meet ABI requirements eor @K[1],@K[1],@K[1] eor @K[2],@K[2],@K[2] eor @K[3],@K[3],@K[3] eor @K[4],@K[4],@K[4] eor @K[5],@K[5],@K[5] eor @K[6],@K[6],@K[6] b .Loop_outer .Ldone_512_neon: ldp d8,d9,[sp,#128+0] // meet ABI requirements ldp x19,x20,[x29,#16] add sp,sp,#128+64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_512_neon,.-ChaCha20_512_neon ___ } }}} foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or (m/\b(eor|ext|mov|tbl)\b/ and (s/\.4s/\.16b/g or 1)) or (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or (m/\b(dup|ld1)\b/ and (s/\.4(s}?\[[0-3]\])/.$1/g or 1)) or (s/\b(zip[12])\.64\b/$1/ and (s/\.4s/\.2d/g or 1)) or (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; # flush diff --git a/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl index 81ee3947d7e4..6c5d0e8b3cf0 100755 --- a/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl +++ b/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl @@ -1,1877 +1,1889 @@ #! /usr/bin/env perl # Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # ECP_NISTZ256 module for ARMv8. # # February 2015. # # Original ECP_NISTZ256 submission targeting x86_64 is detailed in # http://eprint.iacr.org/2013/816. # # with/without -DECP_NISTZ256_ASM # Apple A7 +190-360% # Cortex-A53 +190-400% # Cortex-A57 +190-350% # Denver +230-400% # # Ranges denote minimum and maximum improvement coefficients depending # on benchmark. Lower coefficients are for ECDSA sign, server-side # operation. Keep in mind that +400% means 5x improvement. # $output is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!"; *STDOUT=*OUT; { my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3, $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) = map("x$_",(0..17,19,20)); my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont $code.=<<___; #include "arm_arch.h" .text ___ ######################################################################## # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 # $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; open TABLE,") { s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; } close TABLE; # See ecp_nistz256_table.c for explanation for why it's 64*16*37. # 64*16*37-1 is because $#arr returns last valid index or @arr, not # amount of elements. die "insane number of elements" if ($#arr != 64*16*37-1); $code.=<<___; .globl ecp_nistz256_precomputed .type ecp_nistz256_precomputed,%object .align 12 ecp_nistz256_precomputed: ___ ######################################################################## # this conversion smashes P256_POINT_AFFINE by individual bytes with # 64 byte interval, similar to # 1111222233334444 # 1234123412341234 for(1..37) { @tbl = splice(@arr,0,64*16); for($i=0;$i<64;$i++) { undef @line; for($j=0;$j<64;$j++) { push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; } $code.=".byte\t"; $code.=join(',',map { sprintf "0x%02x",$_} @line); $code.="\n"; } } $code.=<<___; .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed .align 5 .Lpoly: .quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 .LRR: // 2^512 mod P precomputed for NIST P256 polynomial .quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd .Lone_mont: .quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe .Lone: .quad 1,0,0,0 .Lord: .quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 .LordK: .quad 0xccd1c8aaee00bc4f .asciz "ECP_NISTZ256 for ARMv8, CRYPTOGAMS by " // void ecp_nistz256_to_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_to_mont .type ecp_nistz256_to_mont,%function .align 6 ecp_nistz256_to_mont: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] ldr $bi,.LRR // bp[0] ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 adr $bp,.LRR // &bp[0] bl __ecp_nistz256_mul_mont ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont // void ecp_nistz256_from_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_from_mont .type ecp_nistz256_from_mont,%function .align 4 ecp_nistz256_from_mont: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! 
add x29,sp,#0 stp x19,x20,[sp,#16] mov $bi,#1 // bp[0] ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 adr $bp,.Lone // &bp[0] bl __ecp_nistz256_mul_mont ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont // void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], // const BN_ULONG x2[4]); .globl ecp_nistz256_mul_mont .type ecp_nistz256_mul_mont,%function .align 4 ecp_nistz256_mul_mont: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] ldr $bi,[$bp] // bp[0] ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 bl __ecp_nistz256_mul_mont ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont // void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_sqr_mont .type ecp_nistz256_sqr_mont,%function .align 4 ecp_nistz256_sqr_mont: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-32]! add x29,sp,#0 stp x19,x20,[sp,#16] ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 bl __ecp_nistz256_sqr_mont ldp x19,x20,[sp,#16] ldp x29,x30,[sp],#32 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont // void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4], // const BN_ULONG x2[4]); .globl ecp_nistz256_add .type ecp_nistz256_add,%function .align 4 ecp_nistz256_add: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 ldp $acc0,$acc1,[$ap] ldp $t0,$t1,[$bp] ldp $acc2,$acc3,[$ap,#16] ldp $t2,$t3,[$bp,#16] ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 bl __ecp_nistz256_add ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_add,.-ecp_nistz256_add // void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_div_by_2 .type ecp_nistz256_div_by_2,%function .align 4 ecp_nistz256_div_by_2: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 ldp $acc0,$acc1,[$ap] ldp $acc2,$acc3,[$ap,#16] ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 bl __ecp_nistz256_div_by_2 ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 // void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_mul_by_2 .type ecp_nistz256_mul_by_2,%function .align 4 ecp_nistz256_mul_by_2: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 ldp $acc0,$acc1,[$ap] ldp $acc2,$acc3,[$ap,#16] ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 mov $t0,$acc0 mov $t1,$acc1 mov $t2,$acc2 mov $t3,$acc3 bl __ecp_nistz256_add // ret = a+a // 2*a ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 // void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_mul_by_3 .type ecp_nistz256_mul_by_3,%function .align 4 ecp_nistz256_mul_by_3: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! 
add x29,sp,#0 ldp $acc0,$acc1,[$ap] ldp $acc2,$acc3,[$ap,#16] ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 mov $t0,$acc0 mov $t1,$acc1 mov $t2,$acc2 mov $t3,$acc3 mov $a0,$acc0 mov $a1,$acc1 mov $a2,$acc2 mov $a3,$acc3 bl __ecp_nistz256_add // ret = a+a // 2*a mov $t0,$a0 mov $t1,$a1 mov $t2,$a2 mov $t3,$a3 bl __ecp_nistz256_add // ret += a // 2*a+a=3*a ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 // void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], // const BN_ULONG x2[4]); .globl ecp_nistz256_sub .type ecp_nistz256_sub,%function .align 4 ecp_nistz256_sub: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 ldp $acc0,$acc1,[$ap] ldp $acc2,$acc3,[$ap,#16] ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 bl __ecp_nistz256_sub_from ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_sub,.-ecp_nistz256_sub // void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); .globl ecp_nistz256_neg .type ecp_nistz256_neg,%function .align 4 ecp_nistz256_neg: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 mov $bp,$ap mov $acc0,xzr // a = 0 mov $acc1,xzr mov $acc2,xzr mov $acc3,xzr ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 bl __ecp_nistz256_sub_from ldp x29,x30,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_neg,.-ecp_nistz256_neg // note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded // to $a0-$a3 and b[0] - to $bi .type __ecp_nistz256_mul_mont,%function .align 4 __ecp_nistz256_mul_mont: mul $acc0,$a0,$bi // a[0]*b[0] umulh $t0,$a0,$bi mul $acc1,$a1,$bi // a[1]*b[0] umulh $t1,$a1,$bi mul $acc2,$a2,$bi // a[2]*b[0] umulh $t2,$a2,$bi mul $acc3,$a3,$bi // a[3]*b[0] umulh $t3,$a3,$bi ldr $bi,[$bp,#8] // b[1] adds $acc1,$acc1,$t0 // accumulate high parts of multiplication lsl $t0,$acc0,#32 adcs $acc2,$acc2,$t1 lsr $t1,$acc0,#32 adcs $acc3,$acc3,$t2 adc $acc4,xzr,$t3 mov $acc5,xzr ___ for($i=1;$i<4;$i++) { # Reduction iteration is normally performed by accumulating # result of multiplication of modulus by "magic" digit [and # omitting least significant word, which is guaranteed to # be 0], but thanks to special form of modulus and "magic" # digit being equal to least significant word, it can be # performed with additions and subtractions alone. 
Indeed: # # ffff0001.00000000.0000ffff.ffffffff # * abcdefgh # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh # # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we # rewrite above as: # # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000 # - 0000abcd.efgh0000.00000000.00000000.abcdefgh # # or marking redundant operations: # # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.-------- # + abcdefgh.abcdefgh.0000abcd.efgh0000.-------- # - 0000abcd.efgh0000.--------.--------.-------- $code.=<<___; subs $t2,$acc0,$t0 // "*0xffff0001" sbc $t3,$acc0,$t1 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] mul $t0,$a0,$bi // lo(a[0]*b[i]) adcs $acc1,$acc2,$t1 mul $t1,$a1,$bi // lo(a[1]*b[i]) adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 mul $t2,$a2,$bi // lo(a[2]*b[i]) adcs $acc3,$acc4,$t3 mul $t3,$a3,$bi // lo(a[3]*b[i]) adc $acc4,$acc5,xzr adds $acc0,$acc0,$t0 // accumulate low parts of multiplication umulh $t0,$a0,$bi // hi(a[0]*b[i]) adcs $acc1,$acc1,$t1 umulh $t1,$a1,$bi // hi(a[1]*b[i]) adcs $acc2,$acc2,$t2 umulh $t2,$a2,$bi // hi(a[2]*b[i]) adcs $acc3,$acc3,$t3 umulh $t3,$a3,$bi // hi(a[3]*b[i]) adc $acc4,$acc4,xzr ___ $code.=<<___ if ($i<3); ldr $bi,[$bp,#8*($i+1)] // b[$i+1] ___ $code.=<<___; adds $acc1,$acc1,$t0 // accumulate high parts of multiplication lsl $t0,$acc0,#32 adcs $acc2,$acc2,$t1 lsr $t1,$acc0,#32 adcs $acc3,$acc3,$t2 adcs $acc4,$acc4,$t3 adc $acc5,xzr,xzr ___ } $code.=<<___; // last reduction subs $t2,$acc0,$t0 // "*0xffff0001" sbc $t3,$acc0,$t1 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] adcs $acc1,$acc2,$t1 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 adcs $acc3,$acc4,$t3 adc $acc4,$acc5,xzr adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus sbcs $t1,$acc1,$poly1 sbcs $t2,$acc2,xzr sbcs $t3,$acc3,$poly3 sbcs xzr,$acc4,xzr // did it borrow? csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus csel $acc1,$acc1,$t1,lo csel $acc2,$acc2,$t2,lo stp $acc0,$acc1,[$rp] csel $acc3,$acc3,$t3,lo stp $acc2,$acc3,[$rp,#16] ret .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont // note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded // to $a0-$a3 .type __ecp_nistz256_sqr_mont,%function .align 4 __ecp_nistz256_sqr_mont: // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. 
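The reduction trick described in the commentary before __ecp_nistz256_mul_mont's inner loop (and reused below in __ecp_nistz256_sqr_mont) amounts to one limb-dropping step: adding acc[0]*p to the accumulator zeroes the low limb, so the whole value shifts down by 64 bits, and the special limb values of the modulus let the assembly build acc[0]*p with shifts, adds and subtracts alone. A C model of one such step, with the multiplication by p[] kept explicit for clarity (function name and limb layout are illustrative, not the module's):

    #include <stdint.h>

    /* .Lpoly limbs, least significant first */
    static const uint64_t P256[4] = {
        0xffffffffffffffffULL, 0x00000000ffffffffULL,
        0x0000000000000000ULL, 0xffffffff00000001ULL
    };

    /* one reduction step: acc has four live limbs plus a carry limb acc[4];
     * n0 == 1 for this modulus, so the "magic" digit is simply acc[0] */
    static void p256_reduce_step(uint64_t acc[5])
    {
        uint64_t m = acc[0];
        unsigned __int128 c = 0;
        for (int i = 0; i < 4; i++) {            /* acc += m * P256 */
            c += (unsigned __int128)m * P256[i] + acc[i];
            acc[i] = (uint64_t)c;
            c >>= 64;
        }
        c += acc[4];
        /* acc[0] is now zero: drop it, i.e. divide by 2^64 */
        acc[0] = acc[1];
        acc[1] = acc[2];
        acc[2] = acc[3];
        acc[3] = (uint64_t)c;
        acc[4] = (uint64_t)(c >> 64);
    }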
mul $acc1,$a1,$a0 // a[1]*a[0] umulh $t1,$a1,$a0 mul $acc2,$a2,$a0 // a[2]*a[0] umulh $t2,$a2,$a0 mul $acc3,$a3,$a0 // a[3]*a[0] umulh $acc4,$a3,$a0 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication mul $t0,$a2,$a1 // a[2]*a[1] umulh $t1,$a2,$a1 adcs $acc3,$acc3,$t2 mul $t2,$a3,$a1 // a[3]*a[1] umulh $t3,$a3,$a1 adc $acc4,$acc4,xzr // can't overflow mul $acc5,$a3,$a2 // a[3]*a[2] umulh $acc6,$a3,$a2 adds $t1,$t1,$t2 // accumulate high parts of multiplication mul $acc0,$a0,$a0 // a[0]*a[0] adc $t2,$t3,xzr // can't overflow adds $acc3,$acc3,$t0 // accumulate low parts of multiplication umulh $a0,$a0,$a0 adcs $acc4,$acc4,$t1 mul $t1,$a1,$a1 // a[1]*a[1] adcs $acc5,$acc5,$t2 umulh $a1,$a1,$a1 adc $acc6,$acc6,xzr // can't overflow adds $acc1,$acc1,$acc1 // acc[1-6]*=2 mul $t2,$a2,$a2 // a[2]*a[2] adcs $acc2,$acc2,$acc2 umulh $a2,$a2,$a2 adcs $acc3,$acc3,$acc3 mul $t3,$a3,$a3 // a[3]*a[3] adcs $acc4,$acc4,$acc4 umulh $a3,$a3,$a3 adcs $acc5,$acc5,$acc5 adcs $acc6,$acc6,$acc6 adc $acc7,xzr,xzr adds $acc1,$acc1,$a0 // +a[i]*a[i] adcs $acc2,$acc2,$t1 adcs $acc3,$acc3,$a1 adcs $acc4,$acc4,$t2 adcs $acc5,$acc5,$a2 lsl $t0,$acc0,#32 adcs $acc6,$acc6,$t3 lsr $t1,$acc0,#32 adc $acc7,$acc7,$a3 ___ for($i=0;$i<3;$i++) { # reductions, see commentary in # multiplication for details $code.=<<___; subs $t2,$acc0,$t0 // "*0xffff0001" sbc $t3,$acc0,$t1 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] adcs $acc1,$acc2,$t1 lsl $t0,$acc0,#32 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 lsr $t1,$acc0,#32 adc $acc3,$t3,xzr // can't overflow ___ } $code.=<<___; subs $t2,$acc0,$t0 // "*0xffff0001" sbc $t3,$acc0,$t1 adds $acc0,$acc1,$t0 // +=acc[0]<<96 and omit acc[0] adcs $acc1,$acc2,$t1 adcs $acc2,$acc3,$t2 // +=acc[0]*0xffff0001 adc $acc3,$t3,xzr // can't overflow adds $acc0,$acc0,$acc4 // accumulate upper half adcs $acc1,$acc1,$acc5 adcs $acc2,$acc2,$acc6 adcs $acc3,$acc3,$acc7 adc $acc4,xzr,xzr adds $t0,$acc0,#1 // subs $t0,$acc0,#-1 // tmp = ret-modulus sbcs $t1,$acc1,$poly1 sbcs $t2,$acc2,xzr sbcs $t3,$acc3,$poly3 sbcs xzr,$acc4,xzr // did it borrow? csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus csel $acc1,$acc1,$t1,lo csel $acc2,$acc2,$t2,lo stp $acc0,$acc1,[$rp] csel $acc3,$acc3,$t3,lo stp $acc2,$acc3,[$rp,#16] ret .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont // Note that __ecp_nistz256_add expects both input vectors pre-loaded to // $a0-$a3 and $t0-$t3. This is done because it's used in multiple // contexts, e.g. in multiplication by 2 and 3... .type __ecp_nistz256_add,%function .align 4 __ecp_nistz256_add: adds $acc0,$acc0,$t0 // ret = a+b adcs $acc1,$acc1,$t1 adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 adc $ap,xzr,xzr // zap $ap adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus sbcs $t1,$acc1,$poly1 sbcs $t2,$acc2,xzr sbcs $t3,$acc3,$poly3 sbcs xzr,$ap,xzr // did subtraction borrow? csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus csel $acc1,$acc1,$t1,lo csel $acc2,$acc2,$t2,lo stp $acc0,$acc1,[$rp] csel $acc3,$acc3,$t3,lo stp $acc2,$acc3,[$rp,#16] ret .size __ecp_nistz256_add,.-__ecp_nistz256_add .type __ecp_nistz256_sub_from,%function .align 4 __ecp_nistz256_sub_from: ldp $t0,$t1,[$bp] ldp $t2,$t3,[$bp,#16] subs $acc0,$acc0,$t0 // ret = a-b sbcs $acc1,$acc1,$t1 sbcs $acc2,$acc2,$t2 sbcs $acc3,$acc3,$t3 sbc $ap,xzr,xzr // zap $ap subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus adcs $t1,$acc1,$poly1 adcs $t2,$acc2,xzr adc $t3,$acc3,$poly3 cmp $ap,xzr // did subtraction borrow? csel $acc0,$acc0,$t0,eq // ret = borrow ? 
ret+modulus : ret csel $acc1,$acc1,$t1,eq csel $acc2,$acc2,$t2,eq stp $acc0,$acc1,[$rp] csel $acc3,$acc3,$t3,eq stp $acc2,$acc3,[$rp,#16] ret .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from .type __ecp_nistz256_sub_morf,%function .align 4 __ecp_nistz256_sub_morf: ldp $t0,$t1,[$bp] ldp $t2,$t3,[$bp,#16] subs $acc0,$t0,$acc0 // ret = b-a sbcs $acc1,$t1,$acc1 sbcs $acc2,$t2,$acc2 sbcs $acc3,$t3,$acc3 sbc $ap,xzr,xzr // zap $ap subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus adcs $t1,$acc1,$poly1 adcs $t2,$acc2,xzr adc $t3,$acc3,$poly3 cmp $ap,xzr // did subtraction borrow? csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret csel $acc1,$acc1,$t1,eq csel $acc2,$acc2,$t2,eq stp $acc0,$acc1,[$rp] csel $acc3,$acc3,$t3,eq stp $acc2,$acc3,[$rp,#16] ret .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf .type __ecp_nistz256_div_by_2,%function .align 4 __ecp_nistz256_div_by_2: subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = a+modulus adcs $t1,$acc1,$poly1 adcs $t2,$acc2,xzr adcs $t3,$acc3,$poly3 adc $ap,xzr,xzr // zap $ap tst $acc0,#1 // is a even? csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus csel $acc1,$acc1,$t1,eq csel $acc2,$acc2,$t2,eq csel $acc3,$acc3,$t3,eq csel $ap,xzr,$ap,eq lsr $acc0,$acc0,#1 // ret >>= 1 orr $acc0,$acc0,$acc1,lsl#63 lsr $acc1,$acc1,#1 orr $acc1,$acc1,$acc2,lsl#63 lsr $acc2,$acc2,#1 orr $acc2,$acc2,$acc3,lsl#63 lsr $acc3,$acc3,#1 stp $acc0,$acc1,[$rp] orr $acc3,$acc3,$ap,lsl#63 stp $acc2,$acc3,[$rp,#16] ret .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 ___ ######################################################################## # following subroutines are "literal" implementation of those found in # ecp_nistz256.c # ######################################################################## # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); # { my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); # above map() describes stack layout with 4 temporary # 256-bit vectors on top. my ($rp_real,$ap_real) = map("x$_",(21,22)); $code.=<<___; .globl ecp_nistz256_point_double .type ecp_nistz256_point_double,%function .align 5 ecp_nistz256_point_double: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] sub sp,sp,#32*4 .Ldouble_shortcut: ldp $acc0,$acc1,[$ap,#32] mov $rp_real,$rp ldp $acc2,$acc3,[$ap,#48] mov $ap_real,$ap ldr $poly1,.Lpoly+8 mov $t0,$acc0 ldr $poly3,.Lpoly+24 mov $t1,$acc1 ldp $a0,$a1,[$ap_real,#64] // forward load for p256_sqr_mont mov $t2,$acc2 mov $t3,$acc3 ldp $a2,$a3,[$ap_real,#64+16] add $rp,sp,#$S bl __ecp_nistz256_add // p256_mul_by_2(S, in_y); add $rp,sp,#$Zsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); ldp $t0,$t1,[$ap_real] ldp $t2,$t3,[$ap_real,#16] mov $a0,$acc0 // put Zsqr aside for p256_sub mov $a1,$acc1 mov $a2,$acc2 mov $a3,$acc3 add $rp,sp,#$M bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x); add $bp,$ap_real,#0 mov $acc0,$a0 // restore Zsqr mov $acc1,$a1 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont mov $acc2,$a2 mov $acc3,$a3 ldp $a2,$a3,[sp,#$S+16] add $rp,sp,#$Zsqr bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); add $rp,sp,#$S bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); ldr $bi,[$ap_real,#32] ldp $a0,$a1,[$ap_real,#64] ldp $a2,$a3,[$ap_real,#64+16] add $bp,$ap_real,#32 add $rp,sp,#$tmp0 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); mov $t0,$acc0 mov $t1,$acc1 ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont mov $t2,$acc2 mov $t3,$acc3 ldp $a2,$a3,[sp,#$S+16] add $rp,$rp_real,#64 bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0); add $rp,sp,#$tmp0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont ldp $a0,$a1,[sp,#$M] ldp $a2,$a3,[sp,#$M+16] add $rp,$rp_real,#32 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); add $bp,sp,#$Zsqr add $rp,sp,#$M bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); mov $t0,$acc0 // duplicate M mov $t1,$acc1 mov $t2,$acc2 mov $t3,$acc3 mov $a0,$acc0 // put M aside mov $a1,$acc1 mov $a2,$acc2 mov $a3,$acc3 add $rp,sp,#$M bl __ecp_nistz256_add mov $t0,$a0 // restore M mov $t1,$a1 ldr $bi,[$ap_real] // forward load for p256_mul_mont mov $t2,$a2 ldp $a0,$a1,[sp,#$S] mov $t3,$a3 ldp $a2,$a3,[sp,#$S+16] bl __ecp_nistz256_add // p256_mul_by_3(M, M); add $bp,$ap_real,#0 add $rp,sp,#$S bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); mov $t0,$acc0 mov $t1,$acc1 ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont mov $t2,$acc2 mov $t3,$acc3 ldp $a2,$a3,[sp,#$M+16] add $rp,sp,#$tmp0 bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S); add $rp,$rp_real,#0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); add $bp,sp,#$tmp0 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); add $bp,sp,#$S add $rp,sp,#$S bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); ldr $bi,[sp,#$M] mov $a0,$acc0 // copy S mov $a1,$acc1 mov $a2,$acc2 mov $a3,$acc3 add $bp,sp,#$M bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); add $bp,$rp_real,#32 add $rp,$rp_real,#32 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x29,x30,[sp],#96 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_double,.-ecp_nistz256_point_double ___ } ######################################################################## # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, # const P256_POINT *in2); { my ($res_x,$res_y,$res_z, $H,$Hsqr,$R,$Rsqr,$Hcub, $U1,$U2,$S1,$S2)=map(32*$_,(0..11)); my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); # above map() describes stack layout with 12 temporary # 256-bit vectors on top. 
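The inline comments above spell out the field-operation sequence, which (as noted earlier in this file) mirrors the C reference ecp_nistz256_point_double in ecp_nistz256.c. As a compact, runnable illustration of that sequence only, here it is transcribed over a toy 61-bit prime field instead of the 256-bit Montgomery domain; the fe helpers stand in for the p256_* primitives and are purely illustrative. Inputs are assumed already reduced modulo the toy prime.

    #include <stdint.h>

    typedef uint64_t fe;
    typedef unsigned __int128 u128;

    #define TOY_P 0x1fffffffffffffffULL   /* 2^61 - 1, a Mersenne prime */

    static fe f_add(fe a, fe b) { fe r = a + b; return r >= TOY_P ? r - TOY_P : r; }
    static fe f_sub(fe a, fe b) { return a >= b ? a - b : a + TOY_P - b; }
    static fe f_mul(fe a, fe b) { return (fe)(((u128)a * b) % TOY_P); }
    static fe f_sqr(fe a)       { return f_mul(a, a); }
    static fe f_div2(fe a)      { return (a & 1) ? (a + TOY_P) >> 1 : a >> 1; }
    static fe f_mul2(fe a)      { return f_add(a, a); }
    static fe f_mul3(fe a)      { return f_add(f_mul2(a), a); }

    /* Jacobian doubling, following the p256_* call sequence in the comments. */
    void point_double_sketch(fe out[3], const fe in[3])
    {
        const fe in_x = in[0], in_y = in[1], in_z = in[2];
        fe S, M, Zsqr, tmp0, res_x, res_y, res_z;

        S     = f_mul2(in_y);          /* p256_mul_by_2(S, in_y);          */
        Zsqr  = f_sqr(in_z);           /* p256_sqr_mont(Zsqr, in_z);       */
        M     = f_add(Zsqr, in_x);     /* p256_add(M, Zsqr, in_x);         */
        Zsqr  = f_sub(in_x, Zsqr);     /* p256_sub(Zsqr, in_x, Zsqr);      */
        S     = f_sqr(S);              /* p256_sqr_mont(S, S);             */
        tmp0  = f_mul(in_z, in_y);     /* p256_mul_mont(tmp0, in_z, in_y); */
        res_z = f_mul2(tmp0);          /* p256_mul_by_2(res_z, tmp0);      */
        tmp0  = f_sqr(S);              /* p256_sqr_mont(tmp0, S);          */
        res_y = f_div2(tmp0);          /* p256_div_by_2(res_y, tmp0);      */
        M     = f_mul(M, Zsqr);        /* p256_mul_mont(M, M, Zsqr);       */
        M     = f_mul3(M);             /* p256_mul_by_3(M, M);             */
        S     = f_mul(S, in_x);        /* p256_mul_mont(S, S, in_x);       */
        tmp0  = f_mul2(S);             /* p256_mul_by_2(tmp0, S);          */
        res_x = f_sqr(M);              /* p256_sqr_mont(res_x, M);         */
        res_x = f_sub(res_x, tmp0);    /* p256_sub(res_x, res_x, tmp0);    */
        S     = f_sub(S, res_x);       /* p256_sub(S, S, res_x);           */
        S     = f_mul(S, M);           /* p256_mul_mont(S, S, M);          */
        res_y = f_sub(S, res_y);       /* p256_sub(res_y, S, res_y);       */

        out[0] = res_x; out[1] = res_y; out[2] = res_z;
    }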
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28)); $code.=<<___; .globl ecp_nistz256_point_add .type ecp_nistz256_point_add,%function .align 5 ecp_nistz256_point_add: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-96]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#32*12 ldp $a0,$a1,[$bp,#64] // in2_z ldp $a2,$a3,[$bp,#64+16] mov $rp_real,$rp mov $ap_real,$ap mov $bp_real,$bp ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 orr $t0,$a0,$a1 orr $t2,$a2,$a3 orr $in2infty,$t0,$t2 cmp $in2infty,#0 csetm $in2infty,ne // ~in2infty add $rp,sp,#$Z2sqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); ldp $a0,$a1,[$ap_real,#64] // in1_z ldp $a2,$a3,[$ap_real,#64+16] orr $t0,$a0,$a1 orr $t2,$a2,$a3 orr $in1infty,$t0,$t2 cmp $in1infty,#0 csetm $in1infty,ne // ~in1infty add $rp,sp,#$Z1sqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); ldr $bi,[$bp_real,#64] ldp $a0,$a1,[sp,#$Z2sqr] ldp $a2,$a3,[sp,#$Z2sqr+16] add $bp,$bp_real,#64 add $rp,sp,#$S1 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); ldr $bi,[$ap_real,#64] ldp $a0,$a1,[sp,#$Z1sqr] ldp $a2,$a3,[sp,#$Z1sqr+16] add $bp,$ap_real,#64 add $rp,sp,#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); ldr $bi,[$ap_real,#32] ldp $a0,$a1,[sp,#$S1] ldp $a2,$a3,[sp,#$S1+16] add $bp,$ap_real,#32 add $rp,sp,#$S1 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); ldr $bi,[$bp_real,#32] ldp $a0,$a1,[sp,#$S2] ldp $a2,$a3,[sp,#$S2+16] add $bp,$bp_real,#32 add $rp,sp,#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); add $bp,sp,#$S1 ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont ldp $a0,$a1,[$ap_real] ldp $a2,$a3,[$ap_real,#16] add $rp,sp,#$R bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); orr $acc0,$acc0,$acc1 // see if result is zero orr $acc2,$acc2,$acc3 orr $temp0,$acc0,$acc2 // ~is_equal(S1,S2) add $bp,sp,#$Z2sqr add $rp,sp,#$U1 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); ldr $bi,[sp,#$Z1sqr] ldp $a0,$a1,[$bp_real] ldp $a2,$a3,[$bp_real,#16] add $bp,sp,#$Z1sqr add $rp,sp,#$U2 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); add $bp,sp,#$U1 ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont ldp $a2,$a3,[sp,#$R+16] add $rp,sp,#$H bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); orr $acc0,$acc0,$acc1 // see if result is zero orr $acc2,$acc2,$acc3 orr $acc0,$acc0,$acc2 // ~is_equal(U1,U2) mvn $temp1,$in1infty // -1/0 -> 0/-1 mvn $temp2,$in2infty // -1/0 -> 0/-1 orr $acc0,$acc0,$temp1 orr $acc0,$acc0,$temp2 orr $acc0,$acc0,$temp0 cbnz $acc0,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) .Ladd_double: mov $ap,$ap_real mov $rp,$rp_real ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] add sp,sp,#32*(12-4) // difference in stack frames b .Ldouble_shortcut .align 4 .Ladd_proceed: add $rp,sp,#$Rsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); ldr $bi,[$ap_real,#64] ldp $a0,$a1,[sp,#$H] ldp $a2,$a3,[sp,#$H+16] add $bp,$ap_real,#64 add $rp,sp,#$res_z bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); ldp $a0,$a1,[sp,#$H] ldp $a2,$a3,[sp,#$H+16] add $rp,sp,#$Hsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); ldr $bi,[$bp_real,#64] ldp $a0,$a1,[sp,#$res_z] ldp $a2,$a3,[sp,#$res_z+16] add $bp,$bp_real,#64 add $rp,sp,#$res_z bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); ldr $bi,[sp,#$H] ldp $a0,$a1,[sp,#$Hsqr] 
ldp $a2,$a3,[sp,#$Hsqr+16] add $bp,sp,#$H add $rp,sp,#$Hcub bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); ldr $bi,[sp,#$Hsqr] ldp $a0,$a1,[sp,#$U1] ldp $a2,$a3,[sp,#$U1+16] add $bp,sp,#$Hsqr add $rp,sp,#$U2 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); mov $t0,$acc0 mov $t1,$acc1 mov $t2,$acc2 mov $t3,$acc3 add $rp,sp,#$Hsqr bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); add $bp,sp,#$Rsqr add $rp,sp,#$res_x bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); add $bp,sp,#$Hcub bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); add $bp,sp,#$U2 ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont ldp $a0,$a1,[sp,#$S1] ldp $a2,$a3,[sp,#$S1+16] add $rp,sp,#$res_y bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); add $bp,sp,#$Hcub add $rp,sp,#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); ldr $bi,[sp,#$R] ldp $a0,$a1,[sp,#$res_y] ldp $a2,$a3,[sp,#$res_y+16] add $bp,sp,#$R add $rp,sp,#$res_y bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); add $bp,sp,#$S2 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); ldp $a0,$a1,[sp,#$res_x] // res ldp $a2,$a3,[sp,#$res_x+16] ldp $t0,$t1,[$bp_real] // in2 ldp $t2,$t3,[$bp_real,#16] ___ for($i=0;$i<64;$i+=32) { # conditional moves $code.=<<___; ldp $acc0,$acc1,[$ap_real,#$i] // in1 cmp $in1infty,#0 // ~$in1intfy, remember? ldp $acc2,$acc3,[$ap_real,#$i+16] csel $t0,$a0,$t0,ne csel $t1,$a1,$t1,ne ldp $a0,$a1,[sp,#$res_x+$i+32] // res csel $t2,$a2,$t2,ne csel $t3,$a3,$t3,ne cmp $in2infty,#0 // ~$in2intfy, remember? ldp $a2,$a3,[sp,#$res_x+$i+48] csel $acc0,$t0,$acc0,ne csel $acc1,$t1,$acc1,ne ldp $t0,$t1,[$bp_real,#$i+32] // in2 csel $acc2,$t2,$acc2,ne csel $acc3,$t3,$acc3,ne ldp $t2,$t3,[$bp_real,#$i+48] stp $acc0,$acc1,[$rp_real,#$i] stp $acc2,$acc3,[$rp_real,#$i+16] ___ } $code.=<<___; ldp $acc0,$acc1,[$ap_real,#$i] // in1 cmp $in1infty,#0 // ~$in1intfy, remember? ldp $acc2,$acc3,[$ap_real,#$i+16] csel $t0,$a0,$t0,ne csel $t1,$a1,$t1,ne csel $t2,$a2,$t2,ne csel $t3,$a3,$t3,ne cmp $in2infty,#0 // ~$in2intfy, remember? csel $acc0,$t0,$acc0,ne csel $acc1,$t1,$acc1,ne csel $acc2,$t2,$acc2,ne csel $acc3,$t3,$acc3,ne stp $acc0,$acc1,[$rp_real,#$i] stp $acc2,$acc3,[$rp_real,#$i+16] .Ladd_done: add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#96 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_add,.-ecp_nistz256_point_add ___ } ######################################################################## # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, # const P256_POINT_AFFINE *in2); { my ($res_x,$res_y,$res_z, $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9)); my $Z1sqr = $S2; # above map() describes stack layout with 10 temporary # 256-bit vectors on top. my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); $code.=<<___; .globl ecp_nistz256_point_add_affine .type ecp_nistz256_point_add_affine,%function .align 5 ecp_nistz256_point_add_affine: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-80]! 
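The tail of ecp_nistz256_point_add above builds "not infinity" masks from the Z coordinates (orr/csetm) and then resolves the output with csel chains rather than branches: if in2 is infinity return in1, else if in1 is infinity return in2, else return the computed sum. A small C sketch of that mask logic, assuming 4x64-bit limbs and a 12-limb (X|Y|Z) point; the names are illustrative only.

    #include <stdint.h>

    typedef uint64_t limb;

    /* all-ones when z != 0, i.e. the point is NOT at infinity (orr + csetm ne) */
    static limb not_infinity_mask(const limb z[4])
    {
        limb v = z[0] | z[1] | z[2] | z[3];
        return (limb)0 - (limb)(v != 0);
    }

    /* out = in2 infinite ? in1 : (in1 infinite ? in2 : res), limb by limb */
    static void select_result(limb out[12], const limb res[12],
                              const limb in1[12], const limb in2[12],
                              limb in1_not_inf, limb in2_not_inf)
    {
        for (int i = 0; i < 12; i++) {
            limb t = (in1_not_inf & res[i]) | (~in1_not_inf & in2[i]);
            out[i] = (in2_not_inf & t) | (~in2_not_inf & in1[i]);
        }
    }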
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] sub sp,sp,#32*10 mov $rp_real,$rp mov $ap_real,$ap mov $bp_real,$bp ldr $poly1,.Lpoly+8 ldr $poly3,.Lpoly+24 ldp $a0,$a1,[$ap,#64] // in1_z ldp $a2,$a3,[$ap,#64+16] orr $t0,$a0,$a1 orr $t2,$a2,$a3 orr $in1infty,$t0,$t2 cmp $in1infty,#0 csetm $in1infty,ne // ~in1infty ldp $acc0,$acc1,[$bp] // in2_x ldp $acc2,$acc3,[$bp,#16] ldp $t0,$t1,[$bp,#32] // in2_y ldp $t2,$t3,[$bp,#48] orr $acc0,$acc0,$acc1 orr $acc2,$acc2,$acc3 orr $t0,$t0,$t1 orr $t2,$t2,$t3 orr $acc0,$acc0,$acc2 orr $t0,$t0,$t2 orr $in2infty,$acc0,$t0 cmp $in2infty,#0 csetm $in2infty,ne // ~in2infty add $rp,sp,#$Z1sqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); mov $a0,$acc0 mov $a1,$acc1 mov $a2,$acc2 mov $a3,$acc3 ldr $bi,[$bp_real] add $bp,$bp_real,#0 add $rp,sp,#$U2 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); add $bp,$ap_real,#0 ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont ldp $a0,$a1,[sp,#$Z1sqr] ldp $a2,$a3,[sp,#$Z1sqr+16] add $rp,sp,#$H bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); add $bp,$ap_real,#64 add $rp,sp,#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); ldr $bi,[$ap_real,#64] ldp $a0,$a1,[sp,#$H] ldp $a2,$a3,[sp,#$H+16] add $bp,$ap_real,#64 add $rp,sp,#$res_z bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); ldr $bi,[$bp_real,#32] ldp $a0,$a1,[sp,#$S2] ldp $a2,$a3,[sp,#$S2+16] add $bp,$bp_real,#32 add $rp,sp,#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); add $bp,$ap_real,#32 ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont ldp $a2,$a3,[sp,#$H+16] add $rp,sp,#$R bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); add $rp,sp,#$Hsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); ldp $a0,$a1,[sp,#$R] ldp $a2,$a3,[sp,#$R+16] add $rp,sp,#$Rsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); ldr $bi,[sp,#$H] ldp $a0,$a1,[sp,#$Hsqr] ldp $a2,$a3,[sp,#$Hsqr+16] add $bp,sp,#$H add $rp,sp,#$Hcub bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); ldr $bi,[$ap_real] ldp $a0,$a1,[sp,#$Hsqr] ldp $a2,$a3,[sp,#$Hsqr+16] add $bp,$ap_real,#0 add $rp,sp,#$U2 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); mov $t0,$acc0 mov $t1,$acc1 mov $t2,$acc2 mov $t3,$acc3 add $rp,sp,#$Hsqr bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); add $bp,sp,#$Rsqr add $rp,sp,#$res_x bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); add $bp,sp,#$Hcub bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); add $bp,sp,#$U2 ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont ldp $a0,$a1,[sp,#$Hcub] ldp $a2,$a3,[sp,#$Hcub+16] add $rp,sp,#$res_y bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); add $bp,$ap_real,#32 add $rp,sp,#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); ldr $bi,[sp,#$R] ldp $a0,$a1,[sp,#$res_y] ldp $a2,$a3,[sp,#$res_y+16] add $bp,sp,#$R add $rp,sp,#$res_y bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); add $bp,sp,#$S2 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); ldp $a0,$a1,[sp,#$res_x] // res ldp $a2,$a3,[sp,#$res_x+16] ldp $t0,$t1,[$bp_real] // in2 ldp $t2,$t3,[$bp_real,#16] ___ for($i=0;$i<64;$i+=32) { # conditional moves $code.=<<___; ldp $acc0,$acc1,[$ap_real,#$i] // in1 cmp $in1infty,#0 // ~$in1intfy, remember? ldp $acc2,$acc3,[$ap_real,#$i+16] csel $t0,$a0,$t0,ne csel $t1,$a1,$t1,ne ldp $a0,$a1,[sp,#$res_x+$i+32] // res csel $t2,$a2,$t2,ne csel $t3,$a3,$t3,ne cmp $in2infty,#0 // ~$in2intfy, remember? 
ldp $a2,$a3,[sp,#$res_x+$i+48] csel $acc0,$t0,$acc0,ne csel $acc1,$t1,$acc1,ne ldp $t0,$t1,[$bp_real,#$i+32] // in2 csel $acc2,$t2,$acc2,ne csel $acc3,$t3,$acc3,ne ldp $t2,$t3,[$bp_real,#$i+48] stp $acc0,$acc1,[$rp_real,#$i] stp $acc2,$acc3,[$rp_real,#$i+16] ___ $code.=<<___ if ($i == 0); adr $bp_real,.Lone_mont-64 ___ } $code.=<<___; ldp $acc0,$acc1,[$ap_real,#$i] // in1 cmp $in1infty,#0 // ~$in1intfy, remember? ldp $acc2,$acc3,[$ap_real,#$i+16] csel $t0,$a0,$t0,ne csel $t1,$a1,$t1,ne csel $t2,$a2,$t2,ne csel $t3,$a3,$t3,ne cmp $in2infty,#0 // ~$in2intfy, remember? csel $acc0,$t0,$acc0,ne csel $acc1,$t1,$acc1,ne csel $acc2,$t2,$acc2,ne csel $acc3,$t3,$acc3,ne stp $acc0,$acc1,[$rp_real,#$i] stp $acc2,$acc3,[$rp_real,#$i+16] add sp,x29,#0 // destroy frame ldp x19,x20,[x29,#16] ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x29,x30,[sp],#80 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine ___ } if (1) { my ($ord0,$ord1) = ($poly1,$poly3); my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24)); my $acc7 = $bi; $code.=<<___; //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], // uint64_t b[4]); .globl ecp_nistz256_ord_mul_mont .type ecp_nistz256_ord_mul_mont,%function .align 4 ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] adr $ordk,.Lord ldr $bi,[$bp] // bp[0] ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] ldp $ord0,$ord1,[$ordk,#0] ldp $ord2,$ord3,[$ordk,#16] ldr $ordk,[$ordk,#32] mul $acc0,$a0,$bi // a[0]*b[0] umulh $t0,$a0,$bi mul $acc1,$a1,$bi // a[1]*b[0] umulh $t1,$a1,$bi mul $acc2,$a2,$bi // a[2]*b[0] umulh $t2,$a2,$bi mul $acc3,$a3,$bi // a[3]*b[0] umulh $acc4,$a3,$bi mul $t4,$acc0,$ordk adds $acc1,$acc1,$t0 // accumulate high parts of multiplication adcs $acc2,$acc2,$t1 adcs $acc3,$acc3,$t2 adc $acc4,$acc4,xzr mov $acc5,xzr ___ for ($i=1;$i<4;$i++) { ################################################################ # ffff0000.ffffffff.yyyyyyyy.zzzzzzzz # * abcdefgh # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx # # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we # rewrite above as: # # xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx # - 0000abcd.efgh0000.abcdefgh.00000000.00000000 # + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh $code.=<<___; ldr $bi,[$bp,#8*$i] // b[i] lsl $t0,$t4,#32 subs $acc2,$acc2,$t4 lsr $t1,$t4,#32 sbcs $acc3,$acc3,$t0 sbcs $acc4,$acc4,$t1 sbc $acc5,$acc5,xzr subs xzr,$acc0,#1 umulh $t1,$ord0,$t4 mul $t2,$ord1,$t4 umulh $t3,$ord1,$t4 adcs $t2,$t2,$t1 mul $t0,$a0,$bi adc $t3,$t3,xzr mul $t1,$a1,$bi adds $acc0,$acc1,$t2 mul $t2,$a2,$bi adcs $acc1,$acc2,$t3 mul $t3,$a3,$bi adcs $acc2,$acc3,$t4 adcs $acc3,$acc4,$t4 adc $acc4,$acc5,xzr adds $acc0,$acc0,$t0 // accumulate low parts umulh $t0,$a0,$bi adcs $acc1,$acc1,$t1 umulh $t1,$a1,$bi adcs $acc2,$acc2,$t2 umulh $t2,$a2,$bi adcs $acc3,$acc3,$t3 umulh $t3,$a3,$bi adc $acc4,$acc4,xzr mul $t4,$acc0,$ordk adds $acc1,$acc1,$t0 // accumulate high parts adcs $acc2,$acc2,$t1 adcs $acc3,$acc3,$t2 adcs $acc4,$acc4,$t3 adc $acc5,xzr,xzr ___ } $code.=<<___; lsl $t0,$t4,#32 // last reduction subs $acc2,$acc2,$t4 lsr $t1,$t4,#32 sbcs $acc3,$acc3,$t0 sbcs $acc4,$acc4,$t1 sbc $acc5,$acc5,xzr subs xzr,$acc0,#1 umulh $t1,$ord0,$t4 mul $t2,$ord1,$t4 umulh 
$t3,$ord1,$t4 adcs $t2,$t2,$t1 adc $t3,$t3,xzr adds $acc0,$acc1,$t2 adcs $acc1,$acc2,$t3 adcs $acc2,$acc3,$t4 adcs $acc3,$acc4,$t4 adc $acc4,$acc5,xzr subs $t0,$acc0,$ord0 // ret -= modulus sbcs $t1,$acc1,$ord1 sbcs $t2,$acc2,$ord2 sbcs $t3,$acc3,$ord3 sbcs xzr,$acc4,xzr csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus csel $acc1,$acc1,$t1,lo csel $acc2,$acc2,$t2,lo stp $acc0,$acc1,[$rp] csel $acc3,$acc3,$t3,lo stp $acc2,$acc3,[$rp,#16] ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x23,x24,[sp,#48] ldr x29,[sp],#64 ret .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont //////////////////////////////////////////////////////////////////////// // void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], // uint64_t rep); .globl ecp_nistz256_ord_sqr_mont .type ecp_nistz256_ord_sqr_mont,%function .align 4 ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-64]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] adr $ordk,.Lord ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] ldp $ord0,$ord1,[$ordk,#0] ldp $ord2,$ord3,[$ordk,#16] ldr $ordk,[$ordk,#32] b .Loop_ord_sqr .align 4 .Loop_ord_sqr: sub $bp,$bp,#1 //////////////////////////////////////////////////////////////// // | | | | | |a1*a0| | // | | | | |a2*a0| | | // | |a3*a2|a3*a0| | | | // | | | |a2*a1| | | | // | | |a3*a1| | | | | // *| | | | | | | | 2| // +|a3*a3|a2*a2|a1*a1|a0*a0| // |--+--+--+--+--+--+--+--| // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx // // "can't overflow" below mark carrying into high part of // multiplication result, which can't overflow, because it // can never be all ones. mul $acc1,$a1,$a0 // a[1]*a[0] umulh $t1,$a1,$a0 mul $acc2,$a2,$a0 // a[2]*a[0] umulh $t2,$a2,$a0 mul $acc3,$a3,$a0 // a[3]*a[0] umulh $acc4,$a3,$a0 adds $acc2,$acc2,$t1 // accumulate high parts of multiplication mul $t0,$a2,$a1 // a[2]*a[1] umulh $t1,$a2,$a1 adcs $acc3,$acc3,$t2 mul $t2,$a3,$a1 // a[3]*a[1] umulh $t3,$a3,$a1 adc $acc4,$acc4,xzr // can't overflow mul $acc5,$a3,$a2 // a[3]*a[2] umulh $acc6,$a3,$a2 adds $t1,$t1,$t2 // accumulate high parts of multiplication mul $acc0,$a0,$a0 // a[0]*a[0] adc $t2,$t3,xzr // can't overflow adds $acc3,$acc3,$t0 // accumulate low parts of multiplication umulh $a0,$a0,$a0 adcs $acc4,$acc4,$t1 mul $t1,$a1,$a1 // a[1]*a[1] adcs $acc5,$acc5,$t2 umulh $a1,$a1,$a1 adc $acc6,$acc6,xzr // can't overflow adds $acc1,$acc1,$acc1 // acc[1-6]*=2 mul $t2,$a2,$a2 // a[2]*a[2] adcs $acc2,$acc2,$acc2 umulh $a2,$a2,$a2 adcs $acc3,$acc3,$acc3 mul $t3,$a3,$a3 // a[3]*a[3] adcs $acc4,$acc4,$acc4 umulh $a3,$a3,$a3 adcs $acc5,$acc5,$acc5 adcs $acc6,$acc6,$acc6 adc $acc7,xzr,xzr adds $acc1,$acc1,$a0 // +a[i]*a[i] mul $t4,$acc0,$ordk adcs $acc2,$acc2,$t1 adcs $acc3,$acc3,$a1 adcs $acc4,$acc4,$t2 adcs $acc5,$acc5,$a2 adcs $acc6,$acc6,$t3 adc $acc7,$acc7,$a3 ___ for($i=0; $i<4; $i++) { # reductions $code.=<<___; subs xzr,$acc0,#1 umulh $t1,$ord0,$t4 mul $t2,$ord1,$t4 umulh $t3,$ord1,$t4 adcs $t2,$t2,$t1 adc $t3,$t3,xzr adds $acc0,$acc1,$t2 adcs $acc1,$acc2,$t3 adcs $acc2,$acc3,$t4 adc $acc3,xzr,$t4 // can't overflow ___ $code.=<<___ if ($i<3); mul $t3,$acc0,$ordk ___ $code.=<<___; lsl $t0,$t4,#32 subs $acc1,$acc1,$t4 lsr $t1,$t4,#32 sbcs $acc2,$acc2,$t0 sbc $acc3,$acc3,$t1 // can't borrow ___ ($t3,$t4) = ($t4,$t3); } $code.=<<___; adds $acc0,$acc0,$acc4 // accumulate upper half adcs $acc1,$acc1,$acc5 adcs $acc2,$acc2,$acc6 adcs $acc3,$acc3,$acc7 adc $acc4,xzr,xzr subs 
$t0,$acc0,$ord0 // ret -= modulus sbcs $t1,$acc1,$ord1 sbcs $t2,$acc2,$ord2 sbcs $t3,$acc3,$ord3 sbcs xzr,$acc4,xzr csel $a0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus csel $a1,$acc1,$t1,lo csel $a2,$acc2,$t2,lo csel $a3,$acc3,$t3,lo cbnz $bp,.Loop_ord_sqr stp $a0,$a1,[$rp] stp $a2,$a3,[$rp,#16] ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x23,x24,[sp,#48] ldr x29,[sp],#64 ret .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont ___ } } ######################################################################## # scatter-gather subroutines { my ($out,$inp,$index,$mask)=map("x$_",(0..3)); $code.=<<___; // void ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1, // int x2); .globl ecp_nistz256_scatter_w5 .type ecp_nistz256_scatter_w5,%function .align 4 ecp_nistz256_scatter_w5: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 add $out,$out,$index,lsl#2 ldp x4,x5,[$inp] // X ldp x6,x7,[$inp,#16] stur w4,[$out,#64*0-4] lsr x4,x4,#32 str w5,[$out,#64*1-4] lsr x5,x5,#32 str w6,[$out,#64*2-4] lsr x6,x6,#32 str w7,[$out,#64*3-4] lsr x7,x7,#32 str w4,[$out,#64*4-4] str w5,[$out,#64*5-4] str w6,[$out,#64*6-4] str w7,[$out,#64*7-4] add $out,$out,#64*8 ldp x4,x5,[$inp,#32] // Y ldp x6,x7,[$inp,#48] stur w4,[$out,#64*0-4] lsr x4,x4,#32 str w5,[$out,#64*1-4] lsr x5,x5,#32 str w6,[$out,#64*2-4] lsr x6,x6,#32 str w7,[$out,#64*3-4] lsr x7,x7,#32 str w4,[$out,#64*4-4] str w5,[$out,#64*5-4] str w6,[$out,#64*6-4] str w7,[$out,#64*7-4] add $out,$out,#64*8 ldp x4,x5,[$inp,#64] // Z ldp x6,x7,[$inp,#80] stur w4,[$out,#64*0-4] lsr x4,x4,#32 str w5,[$out,#64*1-4] lsr x5,x5,#32 str w6,[$out,#64*2-4] lsr x6,x6,#32 str w7,[$out,#64*3-4] lsr x7,x7,#32 str w4,[$out,#64*4-4] str w5,[$out,#64*5-4] str w6,[$out,#64*6-4] str w7,[$out,#64*7-4] ldr x29,[sp],#16 ret .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 // void ecp_nistz256_gather_w5(P256_POINT *x0,const void *x1, // int x2); .globl ecp_nistz256_gather_w5 .type ecp_nistz256_gather_w5,%function .align 4 ecp_nistz256_gather_w5: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! 
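The diagram in .Loop_ord_sqr above describes the usual squaring shortcut: compute each off-diagonal product a[i]*a[j] once, double the accumulated middle limbs, then add the diagonal a[i]^2 terms. A plain-C sketch of just that widening square follows (the assembly additionally interleaves the Montgomery reduction modulo the group order); unsigned __int128 accumulators are assumed and the names are illustrative.

    #include <stdint.h>

    typedef uint64_t limb;
    typedef unsigned __int128 u128;

    /* 256-bit -> 512-bit square, each cross product computed once */
    static void sqr_256_to_512(limb r[8], const limb a[4])
    {
        u128 acc[8] = {0};

        /* off-diagonal products a[i]*a[j], i > j, land at limbs i+j / i+j+1 */
        for (int i = 1; i < 4; i++)
            for (int j = 0; j < i; j++) {
                u128 p = (u128)a[i] * a[j];
                acc[i + j]     += (limb)p;
                acc[i + j + 1] += (limb)(p >> 64);
            }

        /* "acc[1-6]*=2": double the accumulated middle limbs */
        for (int i = 1; i < 7; i++)
            acc[i] += acc[i];

        /* "+a[i]*a[i]": add the diagonal terms */
        for (int i = 0; i < 4; i++) {
            u128 p = (u128)a[i] * a[i];
            acc[2 * i]     += (limb)p;
            acc[2 * i + 1] += (limb)(p >> 64);
        }

        /* propagate carries into the eight result limbs */
        u128 c = 0;
        for (int i = 0; i < 8; i++) {
            c += acc[i];
            r[i] = (limb)c;
            c >>= 64;
        }
    }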
add x29,sp,#0 cmp $index,xzr csetm x3,ne add $index,$index,x3 add $inp,$inp,$index,lsl#2 ldr w4,[$inp,#64*0] ldr w5,[$inp,#64*1] ldr w6,[$inp,#64*2] ldr w7,[$inp,#64*3] ldr w8,[$inp,#64*4] ldr w9,[$inp,#64*5] ldr w10,[$inp,#64*6] ldr w11,[$inp,#64*7] add $inp,$inp,#64*8 orr x4,x4,x8,lsl#32 orr x5,x5,x9,lsl#32 orr x6,x6,x10,lsl#32 orr x7,x7,x11,lsl#32 csel x4,x4,xzr,ne csel x5,x5,xzr,ne csel x6,x6,xzr,ne csel x7,x7,xzr,ne stp x4,x5,[$out] // X stp x6,x7,[$out,#16] ldr w4,[$inp,#64*0] ldr w5,[$inp,#64*1] ldr w6,[$inp,#64*2] ldr w7,[$inp,#64*3] ldr w8,[$inp,#64*4] ldr w9,[$inp,#64*5] ldr w10,[$inp,#64*6] ldr w11,[$inp,#64*7] add $inp,$inp,#64*8 orr x4,x4,x8,lsl#32 orr x5,x5,x9,lsl#32 orr x6,x6,x10,lsl#32 orr x7,x7,x11,lsl#32 csel x4,x4,xzr,ne csel x5,x5,xzr,ne csel x6,x6,xzr,ne csel x7,x7,xzr,ne stp x4,x5,[$out,#32] // Y stp x6,x7,[$out,#48] ldr w4,[$inp,#64*0] ldr w5,[$inp,#64*1] ldr w6,[$inp,#64*2] ldr w7,[$inp,#64*3] ldr w8,[$inp,#64*4] ldr w9,[$inp,#64*5] ldr w10,[$inp,#64*6] ldr w11,[$inp,#64*7] orr x4,x4,x8,lsl#32 orr x5,x5,x9,lsl#32 orr x6,x6,x10,lsl#32 orr x7,x7,x11,lsl#32 csel x4,x4,xzr,ne csel x5,x5,xzr,ne csel x6,x6,xzr,ne csel x7,x7,xzr,ne stp x4,x5,[$out,#64] // Z stp x6,x7,[$out,#80] ldr x29,[sp],#16 ret .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 // void ecp_nistz256_scatter_w7(void *x0,const P256_POINT_AFFINE *x1, // int x2); .globl ecp_nistz256_scatter_w7 .type ecp_nistz256_scatter_w7,%function .align 4 ecp_nistz256_scatter_w7: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 add $out,$out,$index mov $index,#64/8 .Loop_scatter_w7: ldr x3,[$inp],#8 subs $index,$index,#1 prfm pstl1strm,[$out,#4096+64*0] prfm pstl1strm,[$out,#4096+64*1] prfm pstl1strm,[$out,#4096+64*2] prfm pstl1strm,[$out,#4096+64*3] prfm pstl1strm,[$out,#4096+64*4] prfm pstl1strm,[$out,#4096+64*5] prfm pstl1strm,[$out,#4096+64*6] prfm pstl1strm,[$out,#4096+64*7] strb w3,[$out,#64*0] lsr x3,x3,#8 strb w3,[$out,#64*1] lsr x3,x3,#8 strb w3,[$out,#64*2] lsr x3,x3,#8 strb w3,[$out,#64*3] lsr x3,x3,#8 strb w3,[$out,#64*4] lsr x3,x3,#8 strb w3,[$out,#64*5] lsr x3,x3,#8 strb w3,[$out,#64*6] lsr x3,x3,#8 strb w3,[$out,#64*7] add $out,$out,#64*8 b.ne .Loop_scatter_w7 ldr x29,[sp],#16 ret .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 // void ecp_nistz256_gather_w7(P256_POINT_AFFINE *x0,const void *x1, // int x2); .globl ecp_nistz256_gather_w7 .type ecp_nistz256_gather_w7,%function .align 4 ecp_nistz256_gather_w7: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! 
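For orientation, the w5/w7 scatter and gather routines in this section store each table entry column-wise with a 64-byte stride, and the gather treats index 0 as "return all zeroes" (the csetm mask) while otherwise reading entry index-1. A C sketch of the w7 byte layout, mirroring the offsets used by the assembly; how callers pair the two index conventions is outside this sketch and the names are illustrative.

    #include <stdint.h>

    /* byte j of the entry scattered at `index` lives at table[index + 64*j] */
    static void scatter_w7_sketch(uint8_t *table, const uint8_t point[64], int index)
    {
        for (int j = 0; j < 64; j++)
            table[index + 64 * j] = point[j];
    }

    /* index 0 yields all zeroes; otherwise entry index-1 is read back,
     * matching the csetm/add mask-and-adjust sequence above */
    static void gather_w7_sketch(uint8_t point[64], const uint8_t *table, int index)
    {
        uint8_t mask = (uint8_t)0 - (uint8_t)(index != 0);
        int idx = index - (index != 0);

        for (int j = 0; j < 64; j++)
            point[j] = table[idx + 64 * j] & mask;
    }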
add x29,sp,#0 cmp $index,xzr csetm x3,ne add $index,$index,x3 add $inp,$inp,$index mov $index,#64/8 nop .Loop_gather_w7: ldrb w4,[$inp,#64*0] prfm pldl1strm,[$inp,#4096+64*0] subs $index,$index,#1 ldrb w5,[$inp,#64*1] prfm pldl1strm,[$inp,#4096+64*1] ldrb w6,[$inp,#64*2] prfm pldl1strm,[$inp,#4096+64*2] ldrb w7,[$inp,#64*3] prfm pldl1strm,[$inp,#4096+64*3] ldrb w8,[$inp,#64*4] prfm pldl1strm,[$inp,#4096+64*4] ldrb w9,[$inp,#64*5] prfm pldl1strm,[$inp,#4096+64*5] ldrb w10,[$inp,#64*6] prfm pldl1strm,[$inp,#4096+64*6] ldrb w11,[$inp,#64*7] prfm pldl1strm,[$inp,#4096+64*7] add $inp,$inp,#64*8 orr x4,x4,x5,lsl#8 orr x6,x6,x7,lsl#8 orr x8,x8,x9,lsl#8 orr x4,x4,x6,lsl#16 orr x10,x10,x11,lsl#8 orr x4,x4,x8,lsl#32 orr x4,x4,x10,lsl#48 and x4,x4,x3 str x4,[$out],#8 b.ne .Loop_gather_w7 ldr x29,[sp],#16 ret .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 ___ } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/crypto/openssl/crypto/modes/asm/aes-gcm-armv8_64.pl b/crypto/openssl/crypto/modes/asm/aes-gcm-armv8_64.pl index 302b6f5a8ec8..ac061f797d32 100755 --- a/crypto/openssl/crypto/modes/asm/aes-gcm-armv8_64.pl +++ b/crypto/openssl/crypto/modes/asm/aes-gcm-armv8_64.pl @@ -1,6097 +1,6103 @@ #! /usr/bin/env perl # Copyright 2019-2023 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # #======================================================================== # Written by Fangming Fang for the OpenSSL project, # derived from https://github.com/ARM-software/AArch64cryptolib, original # author Samuel Lee . The module is, however, dual # licensed under OpenSSL and CRYPTOGAMS licenses depending on where you # obtain it. For further details see http://www.openssl.org/~appro/cryptogams/. 
#======================================================================== # # Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants # # main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks # # ____________________________________________________ # | | # | PRE | # |____________________________________________________| # | | | | # | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 | # |________________|________________|__________________| # | | | | # | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 | # |________________|________________|__________________| # | | | | # | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 | # |________________|________________|__________________| # | | | | # | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 | # |________________|____(mostly)____|__________________| # | | # | MODULO | # |____________________________________________________| # # PRE: # Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0 # EXT low_acc, low_acc, low_acc, #8 # EOR res_curr (4k+0), res_curr (4k+0), low_acc # # CTR block: # Increment and byte reverse counter in scalar registers and transfer to SIMD registers # REV ctr32, rev_ctr32 # ORR ctr64, constctr96_top32, ctr32, LSL #32 # INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF # INS ctr_next.d[1], ctr64X # ADD rev_ctr32, #1 # # AES block: # Do AES encryption/decryption on CTR block X and EOR it with input block X. Take 256 bytes key below for example. # Doing small trick here of loading input in scalar registers, EORing with last key and then transferring # Given we are very constrained in our ASIMD registers this is quite important # # Encrypt: # LDR input_low, [ input_ptr ], #8 # LDR input_high, [ input_ptr ], #8 # EOR input_low, k14_low # EOR input_high, k14_high # INS res_curr.d[0], input_low # INS res_curr.d[1], input_high # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k13 # EOR res_curr, res_curr, ctr_curr # ST1 { res_curr.16b }, [ output_ptr ], #16 # # Decrypt: # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr # AESE ctr_curr, k13 # LDR res_curr, [ input_ptr ], #16 # EOR res_curr, res_curr, ctr_curr # MOV output_low, res_curr.d[0] # MOV output_high, res_curr.d[1] # EOR output_low, k14_low # EOR 
output_high, k14_high # STP output_low, output_high, [ output_ptr ], #16 # # GHASH block X: # do 128b karatsuba polynomial multiplication on block # We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b # # multiplication: # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64 # # The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies: # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64 # # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are # multiplying with "twisted" powers of H # # Note: We can PMULL directly into the acc_x in first GHASH of the loop # Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical # path latency dominates the performance # # This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers # than indicated here # REV64 res_curr, res_curr # INS t_m.d[0], res_curr.d[1] # EOR t_m.8B, t_m.8B, res_curr.8B # PMULL2 t_h, res_curr, HX # PMULL t_l, res_curr, HX # PMULL t_m, t_m, HX_k # EOR acc_h, acc_h, t_h # EOR acc_l, acc_l, t_l # EOR acc_m, acc_m, t_m # # MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo # with a reversed constant # EOR acc_m, acc_m, acc_h # EOR acc_m, acc_m, acc_l // Finish off karatsuba processing # PMULL t_mod, acc_h, mod_constant # EXT acc_h, acc_h, acc_h, #8 # EOR acc_m, acc_m, acc_h # EOR acc_m, acc_m, t_mod # PMULL acc_h, acc_m, mod_constant # EXT acc_m, acc_m, acc_m, #8 # EOR acc_l, acc_l, acc_h # EOR acc_l, acc_l, acc_m $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; $input_ptr="x0"; #argument block $bit_length="x1"; $output_ptr="x2"; $current_tag="x3"; $counter="x16"; $cc="x8"; { my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); my ($output_l0,$output_h0)=map("x$_",(6..7)); my $ctr32w="w9"; my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15)); my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); my $t0="v8"; my $t0d="d8"; my ($t1,$t2,$t3)=map("v$_",(28..30)); my ($t1d,$t2d,$t3d)=map("d$_",(28..30)); my $t4="v8"; my $t4d="d8"; my $t5="v28"; my $t5d="d28"; my $t6="v31"; my $t6d="d31"; my $t7="v4"; my $t7d="d4"; my $t8="v29"; my $t8d="d29"; my $t9="v30"; my $t9d="d30"; my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7)); my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7)); my $mod_constantd="d8"; my $mod_constant="v8"; my $mod_t="v31"; my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27)); my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s)=map("v$_.4s",(18..27)); my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27)); my $rk2q1="v20.1q"; my $rk3q1="v21.1q"; my $rk4v="v22"; my $rk4d="d22"; $code=<<___; #include "arm_arch.h" #if __ARM_MAX_ARCH__>=8 ___ $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); $code.=<<___ if ($flavour !~ /64/); .fpu neon #ifdef __thumb2__ .syntax unified .thumb # define INST(a,b,c,d) $_byte c,0xef,a,b #else .code 32 # define INST(a,b,c,d) $_byte a,b,c,0xf2 #endif .text ___ ######################################################################################### # size_t aes_gcm_enc_128_kernel(const unsigned char *in, # size_t len, # unsigned char *out, # const void *key, # unsigned char ivec[16], # u64 *Xi); # $code.=<<___; .global aes_gcm_enc_128_kernel .type aes_gcm_enc_128_kernel,%function .align 4 aes_gcm_enc_128_kernel: + AARCH64_VALID_CALL_TARGET cbz x1, .L128_enc_ret stp x19, x20, [sp, #-112]! 
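The "CTR block" recipe in the header comment above keeps the first 96 bits of the counter block fixed and carries the 32-bit big-endian counter in scalar registers, byte-reversing it only when a new block is composed. Here is a schematic C version of that recipe (not of the actual register scheduling; the kernel also special-cases the very first block by loading the counter vector directly). A little-endian host is assumed, as in the non-__AARCH64EB__ paths, and the names are illustrative.

    #include <stdint.h>
    #include <string.h>

    static uint32_t bswap32(uint32_t x)
    {
        return (x >> 24) | ((x >> 8) & 0xff00u) | ((x << 8) & 0xff0000u) | (x << 24);
    }

    /* ivec is the 16-byte counter block: 96-bit prefix || 32-bit BE counter */
    static void next_ctr_blocks(uint8_t out[][16], const uint8_t ivec[16], int nblocks)
    {
        uint64_t ctr96_b64, ctr64;
        uint32_t ctr96_t32, ctr32, rev_ctr32;

        memcpy(&ctr96_b64, ivec, 8);                  /* low 64 bits, fixed      */
        memcpy(&ctr64, ivec + 8, 8);
        ctr96_t32 = (uint32_t)ctr64;                  /* fixed middle 32 bits    */
        rev_ctr32 = bswap32((uint32_t)(ctr64 >> 32)); /* counter in host order   */

        for (int i = 0; i < nblocks; i++) {
            ctr32 = bswap32(rev_ctr32);               /* REV ctr32, rev_ctr32    */
            ctr64 = (uint64_t)ctr96_t32 | ((uint64_t)ctr32 << 32); /* ORR, LSL#32 */
            memcpy(out[i], &ctr96_b64, 8);            /* INS ctr_next.d[0], ...  */
            memcpy(out[i] + 8, &ctr64, 8);            /* INS ctr_next.d[1], ...  */
            rev_ctr32++;                              /* ADD rev_ctr32, #1       */
        }
    }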
mov x16, x4 mov x8, x5 stp x21, x22, [sp, #16] stp x23, x24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev $ctr96_b64x, $ctr96_b64x rev $ctr96_t32x, $ctr96_t32x #endif ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10 #ifdef __AARCH64EB__ ror $rk10_l, $rk10_l, #32 ror $rk10_h, $rk10_h, #32 #endif ld1 {$acc_lb}, [$current_tag] ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr ld1 {$rk0s}, [$cc], #16 @ load rk0 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 lsr $rctr32x, $ctr96_t32x, #32 ldr $h4q, [$current_tag, #112] @ load h4l | h4h #ifndef __AARCH64EB__ ext $h4b, $h4b, $h4b, #8 #endif fmov $ctr1d, $ctr96_b64x @ CTR block 1 rev $rctr32w, $rctr32w @ rev_ctr32 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w ld1 {$rk1s}, [$cc], #16 @ load rk1 rev $ctr32w, $rctr32w @ CTR block 1 add $rctr32w, $rctr32w, #1 @ CTR block 1 fmov $ctr3d, $ctr96_b64x @ CTR block 3 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible fmov $ctr1.d[1], $ctr32x @ CTR block 1 rev $ctr32w, $rctr32w @ CTR block 2 fmov $ctr2d, $ctr96_b64x @ CTR block 2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 add $rctr32w, $rctr32w, #1 @ CTR block 2 fmov $ctr2.d[1], $ctr32x @ CTR block 2 rev $ctr32w, $rctr32w @ CTR block 3 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 ld1 {$rk2s}, [$cc], #16 @ load rk2 add $rctr32w, $rctr32w, #1 @ CTR block 3 fmov $ctr3.d[1], $ctr32x @ CTR block 3 ldr $h3q, [$current_tag, #80] @ load h3l | h3h #ifndef __AARCH64EB__ ext $h3b, $h3b, $h3b, #8 #endif aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ld1 {$rk3s}, [$cc], #16 @ load rk3 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ldr $h1q, [$current_tag, #32] @ load h1l | h1h #ifndef __AARCH64EB__ ext $h1b, $h1b, $h1b, #8 #endif aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ld1 {$rk4s}, [$cc], #16 @ load rk4 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ld1 {$rk5s}, [$cc], #16 @ load rk5 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 ld1 {$rk6s}, [$cc], #16 @ load rk6 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ld1 {$rk7s}, [$cc], #16 @ load rk7 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ld1 {$rk8s}, [$cc], #16 @ load rk8 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ldr $h2q, [$current_tag, #64] @ load h2l | h2h #ifndef __AARCH64EB__ ext $h2b, $h2b, $h2b, #8 #endif aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ld1 {$rk9s}, [$cc], #16 @ load rk9 aese $ctr3b, 
$rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 aese $ctr2b, $rk9 @ AES block 2 - round 9 aese $ctr0b, $rk9 @ AES block 0 - round 9 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k aese $ctr1b, $rk9 @ AES block 1 - round 9 aese $ctr3b, $rk9 @ AES block 3 - round 9 b.ge .L128_enc_tail @ handle tail ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext #ifdef __AARCH64EB__ rev $input_l2, $input_l2 rev $input_h2, $input_h2 #endif ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext #ifdef __AARCH64EB__ rev $input_l1, $input_l1 rev $input_h1, $input_h1 #endif ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext #ifdef __AARCH64EB__ rev $input_l3, $input_l3 rev $input_h3, $input_h3 #endif eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high rev $ctr32w, $rctr32w @ CTR block 4 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result 
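The header's Karatsuba note reduces the four 64-bit PMULLs of a 128-bit carry-less product to three. The identity it quotes can be checked in plain C with a bit-loop standing in for PMULL (GHASH's bit reflection is ignored here); the names are illustrative.

    #include <stdint.h>
    #include <assert.h>

    typedef unsigned __int128 u128;

    /* software carry-less GF(2)[x] 64x64 -> 128 multiply, i.e. what PMULL does */
    static u128 clmul64(uint64_t a, uint64_t b)
    {
        u128 r = 0;
        for (int i = 0; i < 64; i++)
            if ((b >> i) & 1)
                r ^= (u128)a << i;
        return r;
    }

    /* 128x128 -> 256 carry-less product, schoolbook: four multiplies */
    static void clmul128_schoolbook(u128 r[2], uint64_t al, uint64_t ah,
                                    uint64_t bl, uint64_t bh)
    {
        u128 lo = clmul64(al, bl), hi = clmul64(ah, bh);
        u128 mid = clmul64(ah, bl) ^ clmul64(al, bh);
        r[0] = lo ^ (mid << 64);
        r[1] = hi ^ (mid >> 64);
    }

    /* same product with three multiplies via the quoted Karatsuba identity */
    static void clmul128_karatsuba(u128 r[2], uint64_t al, uint64_t ah,
                                   uint64_t bl, uint64_t bh)
    {
        u128 lo = clmul64(al, bl), hi = clmul64(ah, bh);
        u128 mid = clmul64(ah ^ al, bh ^ bl) ^ hi ^ lo;
        r[0] = lo ^ (mid << 64);
        r[1] = hi ^ (mid >> 64);
    }

    int main(void)
    {
        uint64_t al = 0x0123456789abcdefULL, ah = 0xfedcba9876543210ULL;
        uint64_t bl = 0x0f1e2d3c4b5a6978ULL, bh = 0x8796a5b4c3d2e1f0ULL;
        u128 a[2], b[2];
        clmul128_schoolbook(a, al, ah, bl, bh);
        clmul128_karatsuba(b, al, ah, bl, bh);
        assert(a[0] == b[0] && a[1] == b[1]);
        return 0;
    }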
fmov $ctr0d, $ctr96_b64x @ CTR block 4 add $rctr32w, $rctr32w, #1 @ CTR block 4 fmov $ctr0.d[1], $ctr32x @ CTR block 4 rev $ctr32w, $rctr32w @ CTR block 5 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result fmov $ctr1d, $ctr96_b64x @ CTR block 5 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 add $rctr32w, $rctr32w, #1 @ CTR block 5 add $input_ptr, $input_ptr, #64 @ AES input_ptr update fmov $ctr1.d[1], $ctr32x @ CTR block 5 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low rev $ctr32w, $rctr32w @ CTR block 6 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 add $rctr32w, $rctr32w, #1 @ CTR block 6 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result fmov $ctr2d, $ctr96_b64x @ CTR block 6 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks fmov $ctr2.d[1], $ctr32x @ CTR block 6 rev $ctr32w, $rctr32w @ CTR block 7 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result b.ge .L128_enc_prepretail @ do prepretail .L128_enc_main_loop: @ main loop start ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext #ifdef __AARCH64EB__ rev $input_l3, $input_l3 rev $input_h3, $input_h3 #endif rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 eor $res0b, $res0b, $acc_lb @ PRE 1 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 rev $ctr32w, $rctr32w @ CTR block 4k+8 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid mov $t0d, $res0.d[1] @ GHASH block 4k - mid orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high pmull $t5.1q, $res2.1d, $h2.1d @ GHASH 
block 4k+2 - low pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low movi $mod_constant.8b, #0xc2 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 shl $mod_constantd, $mod_constantd, #56 @ mod_constant aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext #ifdef __AARCH64EB__ rev $input_l1, $input_l1 rev $input_h1, $input_h1 #endif aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext #ifdef __AARCH64EB__ rev $input_l2, $input_l2 rev $input_h2, $input_h2 #endif pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high add $input_ptr, $input_ptr, #64 @ AES input_ptr update fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - 
round 6 eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 rev $ctr32w, $rctr32w @ CTR block 4k+9 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 rev $ctr32w, $rctr32w @ CTR block 4k+10 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result rev $ctr32w, $rctr32w @ CTR block 4k+11 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result b.lt .L128_enc_main_loop .L128_enc_prepretail: @ PREPRETAIL rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) eor $res0b, $res0b, $acc_lb @ PRE 1 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH 
block 4k - low mov $t0d, $res0.d[1] @ GHASH block 4k - mid mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low movi $mod_constant.8b, #0xc2 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid shl $mod_constantd, $mod_constantd, #56 @ mod_constant aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 pmull $t1.1q, $acc_h.1d, $mod_constant.1d eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 ext $acc_hb, $acc_hb, $acc_hb, #8 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 eor $acc_mb, $acc_mb, $acc_lb aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 eor $acc_mb, $acc_mb, $t1.16b aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 eor $acc_mb, 
$acc_mb, $acc_hb aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 pmull $t1.1q, $acc_m.1d, $mod_constant.1d aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 ext $acc_mb, $acc_mb, $acc_mb, #8 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 eor $acc_lb, $acc_lb, $t1.16b aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 eor $acc_lb, $acc_lb, $acc_mb aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 .L128_enc_tail: @ TAIL sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif cmp $main_end_input_ptr, #48 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result b.gt .L128_enc_blocks_more_than_3 sub $rctr32w, $rctr32w, #1 movi $acc_l.8b, #0 mov $ctr3b, $ctr2b cmp $main_end_input_ptr, #32 mov $ctr2b, $ctr1b movi $acc_h.8b, #0 movi $acc_m.8b, #0 b.gt .L128_enc_blocks_more_than_2 mov $ctr3b, $ctr1b cmp $main_end_input_ptr, #16 sub $rctr32w, $rctr32w, #1 b.gt .L128_enc_blocks_more_than_1 sub $rctr32w, $rctr32w, #1 b .L128_enc_blocks_less_than_1 .L128_enc_blocks_more_than_3: @ blocks left > 3 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif rev64 $res0b, $res1b @ GHASH final-3 block eor $res0b, $res0b, $t0.16b @ feed in partial tag eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low fmov $res1d, $input_l0 @ AES final-2 block - mov low movi $t0.8b, #0 @ suppress further partial tag feed in fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid eor $res1b, $res1b, $ctr1b @ AES final-2 block - result eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid .L128_enc_blocks_more_than_2: @ blocks left > 2 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result rev64 $res0b, $res1b @ GHASH final-2 block ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif eor $res0b, $res0b, $t0.16b @ feed in partial tag eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low fmov $res1d, $input_l0 @ AES final-1 block - mov low eor $input_h0, 
$input_h0, $rk10_h @ AES final-1 block - round 10 high pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid eor $res1b, $res1b, $ctr2b @ AES final-1 block - result eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid movi $t0.8b, #0 @ suppress further partial tag feed in eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid .L128_enc_blocks_more_than_1: @ blocks left > 1 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result rev64 $res0b, $res1b @ GHASH final-1 block ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif eor $res0b, $res0b, $t0.16b @ feed in partial tag eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low fmov $res1d, $input_l0 @ AES final block - mov low pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high fmov $res1.d[1], $input_h0 @ AES final block - mov high mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid eor $res1b, $res1b, $ctr3b @ AES final block - result ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid movi $t0.8b, #0 @ suppress further partial tag feed in .L128_enc_blocks_less_than_1: @ blocks left <= 1 and $bit_length, $bit_length, #127 @ bit_length %= 128 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff sub $bit_length, $bit_length, #128 @ bit_length -= 128 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) and $bit_length, $bit_length, #127 @ bit_length %= 128 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block cmp $bit_length, #64 csel $input_l0, $rk10_l, $rk10_h, lt csel $input_h0, $rk10_h, xzr, lt fmov $ctr0d, $input_l0 @ ctr0b is mask for last block fmov $ctr0.d[1], $input_h0 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits rev64 $res0b, $res1b @ GHASH final block eor $res0b, $res0b, $t0.16b @ feed in partial tag mov $t0d, $res0.d[1] @ GHASH final block - mid pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid #ifndef __AARCH64EB__ rev $ctr32w, $rctr32w #else mov $ctr32w, $rctr32w #endif pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid movi $mod_constant.8b, #0xc2 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up shl $mod_constantd, $mod_constantd, #56 @ mod_constant eor $acc_mb, $acc_mb, $t9.16b @ MODULO - 
karatsuba tidy up pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low st1 { $res1b}, [$output_ptr] @ store all 16B str $ctr32w, [$counter, #12] @ store the updated counter eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb mov x0, $len st1 { $acc_l.16b }, [$current_tag] ldp x21, x22, [sp, #16] ldp x23, x24, [sp, #32] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #64] ldp d12, d13, [sp, #80] ldp d14, d15, [sp, #96] ldp x19, x20, [sp], #112 ret .L128_enc_ret: mov w0, #0x0 ret .size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel ___ ######################################################################################### # size_t aes_gcm_dec_128_kernel(const unsigned char *in, # size_t len, # unsigned char *out, # const void *key, # unsigned char ivec[16], # u64 *Xi); # $code.=<<___; .global aes_gcm_dec_128_kernel .type aes_gcm_dec_128_kernel,%function .align 4 aes_gcm_dec_128_kernel: + AARCH64_VALID_CALL_TARGET cbz x1, .L128_dec_ret stp x19, x20, [sp, #-112]! mov x16, x4 mov x8, x5 stp x21, x22, [sp, #16] stp x23, x24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev $ctr96_b64x, $ctr96_b64x rev $ctr96_t32x, $ctr96_t32x #endif ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10 #ifdef __AARCH64EB__ ror $rk10_h, $rk10_h, 32 ror $rk10_l, $rk10_l, 32 #endif sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ld1 {$rk0s}, [$cc], #16 @ load rk0 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible ldr $h2q, [$current_tag, #64] @ load h2l | h2h #ifndef __AARCH64EB__ ext $h2b, $h2b, $h2b, #8 #endif lsr $rctr32x, $ctr96_t32x, #32 fmov $ctr2d, $ctr96_b64x @ CTR block 2 ld1 {$rk1s}, [$cc], #16 @ load rk1 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w rev $rctr32w, $rctr32w @ rev_ctr32 fmov $ctr1d, $ctr96_b64x @ CTR block 1 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 rev $ctr32w, $rctr32w @ CTR block 1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 ld1 {$rk2s}, [$cc], #16 @ load rk2 add $rctr32w, $rctr32w, #1 @ CTR block 1 fmov $ctr1.d[1], $ctr32x @ CTR block 1 rev $ctr32w, $rctr32w @ CTR block 2 add $rctr32w, $rctr32w, #1 @ CTR block 2 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 fmov $ctr2.d[1], $ctr32x @ CTR block 2 rev $ctr32w, $rctr32w @ CTR block 3 fmov $ctr3d, $ctr96_b64x @ CTR block 3 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 add $rctr32w, $rctr32w, #1 @ CTR block 3 fmov $ctr3.d[1], $ctr32x @ CTR block 3 add $end_input_ptr, $input_ptr, 
$bit_length, lsr #3 @ end_input_ptr aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ld1 {$rk3s}, [$cc], #16 @ load rk3 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ld1 {$rk4s}, [$cc], #16 @ load rk4 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ld1 {$rk5s}, [$cc], #16 @ load rk5 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ld1 {$rk6s}, [$cc], #16 @ load rk6 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ld1 { $acc_lb}, [$current_tag] ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ld1 {$rk7s}, [$cc], #16 @ load rk7 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ld1 {$rk8s}, [$cc], #16 @ load rk8 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ldr $h3q, [$current_tag, #80] @ load h3l | h3h #ifndef __AARCH64EB__ ext $h3b, $h3b, $h3b, #8 #endif aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ld1 {$rk9s}, [$cc], #16 @ load rk9 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 ldr $h1q, [$current_tag, #32] @ load h1l | h1h #ifndef __AARCH64EB__ ext $h1b, $h1b, $h1b, #8 #endif aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h ldr $h4q, [$current_tag, #112] @ load h4l | h4h #ifndef __AARCH64EB__ ext $h4b, $h4b, $h4b, #8 #endif trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l add $main_end_input_ptr, $main_end_input_ptr, $input_ptr aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h aese $ctr2b, $rk9 @ AES block 2 - round 9 aese $ctr3b, $rk9 @ AES block 3 - round 9 aese $ctr0b, $rk9 @ AES block 0 - round 9 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks aese $ctr1b, $rk9 @ AES block 1 - round 9 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k b.ge .L128_dec_tail @ handle tail ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES 
block 0 - load ciphertext; AES block 1 - load ciphertext eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result rev64 $res0b, $res0b @ GHASH block 0 rev $ctr32w, $rctr32w @ CTR block 4 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 add $rctr32w, $rctr32w, #1 @ CTR block 4 ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext rev64 $res1b, $res1b @ GHASH block 1 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high fmov $ctr0d, $ctr96_b64x @ CTR block 4 fmov $ctr0.d[1], $ctr32x @ CTR block 4 rev $ctr32w, $rctr32w @ CTR block 5 eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low #ifdef __AARCH64EB__ rev $output_l1, $output_l1 #endif fmov $ctr1d, $ctr96_b64x @ CTR block 5 add $rctr32w, $rctr32w, #1 @ CTR block 5 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 fmov $ctr1.d[1], $ctr32x @ CTR block 5 rev $ctr32w, $rctr32w @ CTR block 6 add $rctr32w, $rctr32w, #1 @ CTR block 6 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high #ifdef __AARCH64EB__ rev $output_h1, $output_h1 #endif eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result b.ge .L128_dec_prepretail @ do prepretail .L128_dec_main_loop: @ main loop start eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 rev64 $res2b, $res2b @ GHASH block 4k+2 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 rev $ctr32w, $rctr32w @ CTR block 4k+7 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low eor $res0b, $res0b, $acc_lb @ PRE 1 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 rev64 $res3b, $res3b @ GHASH block 4k+3 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 mov $t0d, $res0.d[1] @ GHASH block 4k - mid aese $ctr3b, $rk0 \n aesmc 
$ctr3b, $ctr3b @ AES block 4k+7 - round 0 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low #ifdef __AARCH64EB__ rev $output_l3, $output_l3 #endif pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high #ifdef __AARCH64EB__ rev $output_h2, $output_h2 #endif mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high #ifdef __AARCH64EB__ rev $output_h3, $output_h3 #endif aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low #ifdef __AARCH64EB__ rev $output_l2, $output_l2 #endif aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 movi $mod_constant.8b, #0xc2 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+3 - load ciphertext aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 shl $mod_constantd, $mod_constantd, #56 @ mod_constant aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up aese $ctr3b, 
$rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 rev $ctr32w, $rctr32w @ CTR block 4k+8 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 rev64 $res1b, $res1b @ GHASH block 4k+5 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 rev $ctr32w, $rctr32w @ CTR block 4k+9 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL rev64 $res0b, $res0b @ GHASH block 4k+4 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 rev $ctr32w, $rctr32w @ CTR block 4k+10 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high #ifdef __AARCH64EB__ rev $output_h1, $output_h1 #endif stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low #ifdef __AARCH64EB__ rev $output_l1, $output_l1 #endif stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 b.lt L128_dec_main_loop .L128_dec_prepretail: @ PREPRETAIL ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid aese $ctr0b, $rk0 
\n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high eor $res0b, $res0b, $acc_lb @ PRE 1 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 rev64 $res2b, $res2b @ GHASH block 4k+2 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 rev $ctr32w, $rctr32w @ CTR block 4k+7 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low mov $t0d, $res0.d[1] @ GHASH block 4k - mid fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid rev64 $res3b, $res3b @ GHASH block 4k+3 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high movi $mod_constant.8b, #0xc2 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low #ifdef __AARCH64EB__ rev $output_l3, $output_l3 #endif pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low #ifdef __AARCH64EB__ rev $output_l2, $output_l2 #endif eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 shl $mod_constantd, $mod_constantd, #56 @ mod_constant aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 aese $ctr2b, $rk4 \n aesmc $ctr2b, 
$ctr2b @ AES block 4k+6 - round 4 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high #ifdef __AARCH64EB__ rev $output_h3, $output_h3 #endif aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high #ifdef __AARCH64EB__ rev $output_h2, $output_h2 #endif aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low .L128_dec_tail: @ TAIL sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low cmp $main_end_input_ptr, #48 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif b.gt .L128_dec_blocks_more_than_3 mov $ctr3b, $ctr2b sub $rctr32w, $rctr32w, #1 movi $acc_l.8b, #0 movi 
$acc_h.8b, #0 mov $ctr2b, $ctr1b movi $acc_m.8b, #0 cmp $main_end_input_ptr, #32 b.gt .L128_dec_blocks_more_than_2 cmp $main_end_input_ptr, #16 mov $ctr3b, $ctr1b sub $rctr32w, $rctr32w, #1 b.gt .L128_dec_blocks_more_than_1 sub $rctr32w, $rctr32w, #1 b .L128_dec_blocks_less_than_1 .L128_dec_blocks_more_than_3: @ blocks left > 3 rev64 $res0b, $res1b @ GHASH final-3 block ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext eor $res0b, $res0b, $t0.16b @ feed in partial tag mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid movi $t0.8b, #0 @ suppress further partial tag feed in eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif .L128_dec_blocks_more_than_2: @ blocks left > 2 rev64 $res0b, $res1b @ GHASH final-2 block ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext eor $res0b, $res0b, $t0.16b @ feed in partial tag eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid movi $t0.8b, #0 @ suppress further partial tag feed in pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif .L128_dec_blocks_more_than_1: @ blocks left > 1 rev64 $res0b, $res1b @ GHASH final-1 block ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext eor $res0b, $res0b, $t0.16b @ feed in partial tag mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid eor $ctr0b, $res1b, $ctr3b @ AES final block - result eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result mov $output_l0, $ctr0.d[0] @ AES final block - mov low mov $output_h0, $ctr0.d[1] @ AES final block - mov high ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid movi $t0.8b, #0 @ suppress further partial tag feed in eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 
block - low eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid .L128_dec_blocks_less_than_1: @ blocks left <= 1 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff and $bit_length, $bit_length, #127 @ bit_length %= 128 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff sub $bit_length, $bit_length, #128 @ bit_length -= 128 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) and $bit_length, $bit_length, #127 @ bit_length %= 128 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block cmp $bit_length, #64 csel $ctr96_b64x, $rk10_h, xzr, lt csel $ctr32x, $rk10_l, $rk10_h, lt fmov $ctr0d, $ctr32x @ ctr0b is mask for last block mov $ctr0.d[1], $ctr96_b64x and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits rev64 $res0b, $res1b @ GHASH final block eor $res0b, $res0b, $t0.16b @ feed in partial tag ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite and $output_h0, $output_h0, $ctr96_b64x pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high mov $t0d, $res0.d[1] @ GHASH final block - mid eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes and $output_l0, $output_l0, $ctr32x #ifndef __AARCH64EB__ rev $ctr32w, $rctr32w #else mov $ctr32w, $rctr32w #endif eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid movi $mod_constant.8b, #0xc2 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes shl $mod_constantd, $mod_constantd, #56 @ mod_constant eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up orr $output_l0, $output_l0, $end_input_ptr str $ctr32w, [$counter, #12] @ store the updated counter orr $output_h0, $output_h0, $main_end_input_ptr stp $output_l0, $output_h0, [$output_ptr] ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb mov x0, $len st1 { $acc_l.16b }, [$current_tag] ldp x21, x22, [sp, #16] ldp x23, x24, [sp, #32] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #64] ldp d12, d13, [sp, #80] ldp d14, d15, [sp, #96] ldp x19, x20, [sp], #112 ret .L128_dec_ret: mov w0, #0x0 ret .size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel ___ } { my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); my 
($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
my ($output_l0,$output_h0)=map("x$_",(6..7));
my $ctr32w="w9";
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
my $t0="v8"; my $t0d="d8";
my $t3="v4"; my $t3d="d4";
my ($t1,$t2)=map("v$_",(30..31));
my ($t1d,$t2d)=map("d$_",(30..31));
my $t4="v30"; my $t4d="d30";
my $t5="v8"; my $t5d="d8";
my $t6="v31"; my $t6d="d31";
my $t7="v5"; my $t7d="d5";
my $t8="v6"; my $t8d="d6";
my $t9="v30"; my $t9d="d30";
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
my $mod_constantd="d8"; my $mod_constant="v8"; my $mod_t="v31";
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s)=map("v$_.4s",(18..29));
my $rk2q1="v20.1q"; my $rk3q1="v21.1q";
my $rk4v="v22"; my $rk4d="d22";
#########################################################################################
#       size_t aes_gcm_enc_192_kernel(const unsigned char *in,
#                                      size_t len,
#                                      unsigned char *out,
#                                      const void *key,
#                                      unsigned char ivec[16],
#                                      u64 *Xi);
#
$code.=<<___;
.global aes_gcm_enc_192_kernel
.type   aes_gcm_enc_192_kernel,%function
.align  4
aes_gcm_enc_192_kernel:
+ AARCH64_VALID_CALL_TARGET
	cbz	x1, .L192_enc_ret
	stp	x19, x20, [sp, #-112]!
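	// The prologue below rebuilds each 16-byte counter block from ivec[16]: the low
	// 64 bits of the IV stay fixed in $ctr96_b64x, IV bytes 8..11 stay fixed in the
	// low half of $ctr96_t32x, and only the trailing 32-bit big-endian counter is
	// incremented (held in host order in $rctr32w after a `rev`, then converted back
	// with `rev` and merged via `orr ..., lsl #32`). A minimal C model of that setup,
	// assuming a little-endian host; the helper name make_ctr_block is made up for
	// illustration and is not part of this module:
	//
	//   #include <stdint.h>
	//   #include <string.h>
	//
	//   /* Build counter block number n from the 16-byte ivec (12-byte IV plus a
	//      32-bit big-endian counter), mirroring the ldp/rev/orr/fmov sequence. */
	//   static void make_ctr_block(const unsigned char ivec[16], uint32_t n,
	//                              unsigned char block[16])
	//   {
	//       uint64_t ctr96_b64, ctr96_t32;
	//       memcpy(&ctr96_b64, ivec, 8);          /* ldp: IV bytes 0..7            */
	//       memcpy(&ctr96_t32, ivec + 8, 8);      /* IV bytes 8..11 | counter      */
	//       uint32_t rctr32 = __builtin_bswap32((uint32_t)(ctr96_t32 >> 32)); /* rev_ctr32 */
	//       uint64_t hi = (ctr96_t32 & 0xffffffffu)                  /* keep IV bytes 8..11 */
	//                   | ((uint64_t)__builtin_bswap32(rctr32 + n) << 32); /* rev + orr, lsl #32 */
	//       memcpy(block, &ctr96_b64, 8);         /* fmov $ctrNd, $ctr96_b64x      */
	//       memcpy(block + 8, &hi, 8);            /* fmov $ctrN.d[1], $ctr32x      */
	//   }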
mov x16, x4 mov x8, x5 stp x21, x22, [sp, #16] stp x23, x24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev $ctr96_b64x, $ctr96_b64x rev $ctr96_t32x, $ctr96_t32x #endif ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12 #ifdef __AARCH64EB__ ror $rk12_l, $rk12_l, #32 ror $rk12_h, $rk12_h, #32 #endif ld1 {$rk0s}, [$cc], #16 @ load rk0 ld1 {$rk1s}, [$cc], #16 @ load rk1 ld1 {$rk2s}, [$cc], #16 @ load rk2 lsr $rctr32x, $ctr96_t32x, #32 ld1 {$rk3s}, [$cc], #16 @ load rk3 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w ld1 {$rk4s}, [$cc], #16 @ load rk4 rev $rctr32w, $rctr32w @ rev_ctr32 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 fmov $ctr3d, $ctr96_b64x @ CTR block 3 rev $ctr32w, $rctr32w @ CTR block 1 add $rctr32w, $rctr32w, #1 @ CTR block 1 fmov $ctr1d, $ctr96_b64x @ CTR block 1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible fmov $ctr1.d[1], $ctr32x @ CTR block 1 rev $ctr32w, $rctr32w @ CTR block 2 add $rctr32w, $rctr32w, #1 @ CTR block 2 fmov $ctr2d, $ctr96_b64x @ CTR block 2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 fmov $ctr2.d[1], $ctr32x @ CTR block 2 rev $ctr32w, $rctr32w @ CTR block 3 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 ld1 {$rk5s}, [$cc], #16 @ load rk5 fmov $ctr3.d[1], $ctr32x @ CTR block 3 ld1 {$rk6s}, [$cc], #16 @ load rk6 ld1 {$rk7s}, [$cc], #16 @ load rk7 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ld1 { $acc_lb}, [$current_tag] ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ld1 {$rk8s}, [$cc], #16 @ load rk8 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ldr $h4q, [$current_tag, #112] @ load h4l | h4h #ifndef __AARCH64EB__ ext $h4b, $h4b, $h4b, #8 #endif aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ld1 {$rk9s}, [$cc], #16 @ load rk9 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 ld1 {$rk10s}, [$cc], #16 @ load rk10 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ldr $h1q, [$current_tag, #32] @ load h1l | h1h #ifndef __AARCH64EB__ ext $h1b, $h1b, $h1b, #8 #endif aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ld1 {$rk11s}, [$cc], #16 @ load rk11 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ldr $h3q, [$current_tag, #80] @ load h3l | h3h #ifndef __AARCH64EB__ ext $h3b, $h3b, $h3b, #8 #endif aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 aese $ctr0b, 
$rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ldr $h2q, [$current_tag, #64] @ load h2l | h2h #ifndef __AARCH64EB__ ext $h2b, $h2b, $h2b, #8 #endif aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k aese $ctr2b, $rk11 @ AES block 2 - round 11 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr add $main_end_input_ptr, $main_end_input_ptr, $input_ptr aese $ctr1b, $rk11 @ AES block 1 - round 11 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks aese $ctr0b, $rk11 @ AES block 0 - round 11 add $rctr32w, $rctr32w, #1 @ CTR block 3 aese $ctr3b, $rk11 @ AES block 3 - round 11 b.ge .L192_enc_tail @ handle tail rev $ctr32w, $rctr32w @ CTR block 4 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext #ifdef __AARCH64EB__ rev $input_l2, $input_l2 rev $input_h2, $input_h2 #endif ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext #ifdef __AARCH64EB__ rev $input_l3, $input_l3 rev $input_h3, $input_h3 #endif ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext #ifdef __AARCH64EB__ rev $input_l1, $input_l1 rev $input_h1, $input_h1 #endif add $input_ptr, $input_ptr, #64 @ AES input_ptr update cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high 
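	// The `eor ..., $rk12_l/$rk12_h` just above and below pre-applies the final
	// AddRoundKey to the plaintext words while the counter blocks are still in the
	// aese/aesmc pipeline: for AES-192 the keystream state finishes with
	// `aese ..., $rk11` followed by an xor with rk12, and since
	// C = P ^ (S ^ rk12) = (P ^ rk12) ^ S, that last key xor can be folded into the
	// data load. A small C illustration of the identity, with hypothetical names
	// (block128/xor128/ctr_xor_block are not part of this module):
	//
	//   #include <stdint.h>
	//
	//   typedef struct { uint64_t lo, hi; } block128;
	//
	//   static inline block128 xor128(block128 a, block128 b)
	//   {
	//       return (block128){ a.lo ^ b.lo, a.hi ^ b.hi };
	//   }
	//
	//   /* s11 = state after the final aese with rk11 (last round key not yet
	//      applied); xoring rk12 into the plaintext gives the same ciphertext. */
	//   static block128 ctr_xor_block(block128 p, block128 s11, block128 rk12)
	//   {
	//       return xor128(xor128(p, rk12), s11);  /* == xor128(p, xor128(s11, rk12)) */
	//   }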
eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low add $rctr32w, $rctr32w, #1 @ CTR block 4 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result fmov $ctr0d, $ctr96_b64x @ CTR block 4 fmov $ctr0.d[1], $ctr32x @ CTR block 4 rev $ctr32w, $rctr32w @ CTR block 5 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 add $rctr32w, $rctr32w, #1 @ CTR block 5 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result fmov $ctr1d, $ctr96_b64x @ CTR block 5 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high fmov $ctr1.d[1], $ctr32x @ CTR block 5 rev $ctr32w, $rctr32w @ CTR block 6 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 add $rctr32w, $rctr32w, #1 @ CTR block 6 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result fmov $ctr2d, $ctr96_b64x @ CTR block 6 fmov $ctr2.d[1], $ctr32x @ CTR block 6 rev $ctr32w, $rctr32w @ CTR block 7 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result b.ge .L192_enc_prepretail @ do prepretail .L192_enc_main_loop: @ main loop start aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext #ifdef __AARCH64EB__ rev $input_l1, $input_l1 rev $input_h1, $input_h1 #endif ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext #ifdef __AARCH64EB__ rev $input_l2, $input_l2 rev $input_h2, $input_h2 #endif aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext #ifdef __AARCH64EB__ rev $input_l3, $input_l3 rev $input_h3, $input_h3 #endif pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low eor $res0b, $res0b, $acc_lb @ PRE 1 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k 
- low mov $t0d, $res0.d[1] @ GHASH block 4k - mid aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid rev $ctr32w, $rctr32w @ CTR block 4k+8 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 add $input_ptr, $input_ptr, #64 @ AES input_ptr update aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 movi $mod_constant.8b, #0xc2 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 shl $mod_constantd, $mod_constantd, #56 @ mod_constant aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 
4k+4 - round 7 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 rev $ctr32w, $rctr32w @ CTR block 4k+9 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 rev $ctr32w, $rctr32w @ CTR block 4k+10 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low aese $ctr3b, $rk11 @ AES block 4k+7 - round 11 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10 st1 { $res2b}, [$output_ptr], #16 @ AES block 
4k+6 - store result fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10 rev $ctr32w, $rctr32w @ CTR block 4k+11 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result b.lt .L192_enc_main_loop .L192_enc_prepretail: @ PREPRETAIL aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 eor $res0b, $res0b, $acc_lb @ PRE 1 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low mov $t0d, $res0.d[1] @ GHASH block 4k - mid pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 pmull $t8.1q, $res3.1d, 
$h1.1d @ GHASH block 4k+3 - low movi $mod_constant.8b, #0xc2 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 shl $mod_constantd, $mod_constantd, #56 @ mod_constant aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 eor $acc_mb, $acc_mb, $acc_lb aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 pmull $t1.1q, $acc_h.1d, $mod_constant.1d aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 ext $acc_hb, $acc_hb, $acc_hb, #8 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 eor $acc_mb, $acc_mb, $t1.16b aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 eor $acc_mb, $acc_mb, $acc_hb aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 pmull $t1.1q, $acc_m.1d, $mod_constant.1d ext $acc_mb, $acc_mb, $acc_mb, #8 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 eor $acc_lb, $acc_lb, $t1.16b aese $ctr0b, $rk11 @ AES block 4k+4 - round 11 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11 eor $acc_lb, $acc_lb, $acc_mb .L192_enc_tail: @ TAIL sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high cmp $main_end_input_ptr, #48 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag b.gt .L192_enc_blocks_more_than_3 sub $rctr32w, $rctr32w, #1 movi $acc_m.8b, #0 mov $ctr3b, $ctr2b movi $acc_h.8b, #0 cmp $main_end_input_ptr, #32 mov $ctr2b, $ctr1b movi $acc_l.8b, #0 b.gt .L192_enc_blocks_more_than_2 sub $rctr32w, $rctr32w, 
#1 mov $ctr3b, $ctr1b cmp $main_end_input_ptr, #16 b.gt .L192_enc_blocks_more_than_1 sub $rctr32w, $rctr32w, #1 b .L192_enc_blocks_less_than_1 .L192_enc_blocks_more_than_3: @ blocks left > 3 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif rev64 $res0b, $res1b @ GHASH final-3 block eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low eor $res0b, $res0b, $t0.16b @ feed in partial tag eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high fmov $res1d, $input_l0 @ AES final-2 block - mov low fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid movi $t0.8b, #0 @ suppress further partial tag feed in pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid eor $res1b, $res1b, $ctr1b @ AES final-2 block - result .L192_enc_blocks_more_than_2: @ blocks left > 2 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result rev64 $res0b, $res1b @ GHASH final-2 block ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif eor $res0b, $res0b, $t0.16b @ feed in partial tag eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low fmov $res1d, $input_l0 @ AES final-1 block - mov low fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid movi $t0.8b, #0 @ suppress further partial tag feed in eor $res1b, $res1b, $ctr2b @ AES final-1 block - result eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid .L192_enc_blocks_more_than_1: @ blocks left > 1 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif rev64 $res0b, $res1b @ GHASH final-1 block eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low eor $res0b, $res0b, $t0.16b @ feed in partial tag movi $t0.8b, #0 @ suppress further partial tag feed in mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high fmov $res1d, $input_l0 @ AES final block - mov low pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high fmov $res1.d[1], $input_h0 @ AES final block - mov high ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid eor $res1b, $res1b, $ctr3b @ AES final 
block - result eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid .L192_enc_blocks_less_than_1: @ blocks left <= 1 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored #ifndef __AARCH64EB__ rev $ctr32w, $rctr32w #else mov $ctr32w, $rctr32w #endif and $bit_length, $bit_length, #127 @ bit_length %= 128 sub $bit_length, $bit_length, #128 @ bit_length -= 128 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff and $bit_length, $bit_length, #127 @ bit_length %= 128 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block cmp $bit_length, #64 csel $input_l0, $rk12_l, $rk12_h, lt csel $input_h0, $rk12_h, xzr, lt fmov $ctr0d, $input_l0 @ ctr0b is mask for last block fmov $ctr0.d[1], $input_h0 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits rev64 $res0b, $res1b @ GHASH final block eor $res0b, $res0b, $t0.16b @ feed in partial tag mov $t0d, $res0.d[1] @ GHASH final block - mid pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid movi $mod_constant.8b, #0xc2 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up shl $mod_constantd, $mod_constantd, #56 @ mod_constant bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low str $ctr32w, [$counter, #12] @ store the updated counter st1 { $res1b}, [$output_ptr] @ store all 16B eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb mov x0, $len st1 { $acc_l.16b }, [$current_tag] ldp x21, x22, [sp, #16] ldp x23, x24, [sp, #32] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #64] ldp d12, d13, [sp, #80] ldp d14, d15, [sp, #96] ldp x19, x20, [sp], #112 ret .L192_enc_ret: mov w0, #0x0 ret .size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel ___ ######################################################################################### # size_t aes_gcm_dec_192_kernel(const unsigned char *in, # size_t len, # unsigned char *out, # const void *key, # unsigned char ivec[16], # u64 *Xi); # $code.=<<___; .global aes_gcm_dec_192_kernel .type aes_gcm_dec_192_kernel,%function .align 4 aes_gcm_dec_192_kernel: + AARCH64_VALID_CALL_TARGET cbz x1, .L192_dec_ret stp x19, x20, [sp, #-112]! 
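// Reader's note on the entry convention of this kernel, summarized from the
// code around this point (not an authoritative ABI statement):
//  - x1 carries the input length in *bits*; it is shifted right by 3 below to
//    form the byte length, which is also the value returned in x0 on the
//    normal exit path (w0 is set to 0 only by the early .L192_dec_ret exit).
//  - the ivec pointer (x4) is copied to x16 and the remaining pointer
//    argument (x5) to x8 immediately after this point, freeing the low
//    argument registers for reuse as scratch in the main loop.
//  - x19-x24 and d8-d15 are preserved in the 112-byte stack frame opened by
//    the stp just above and restored before the final ret.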
mov x16, x4 mov x8, x5 stp x21, x22, [sp, #16] stp x23, x24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev $ctr96_b64x, $ctr96_b64x rev $ctr96_t32x, $ctr96_t32x #endif ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12 #ifdef __AARCH64EB__ ror $rk12_l, $rk12_l, #32 ror $rk12_h, $rk12_h, #32 #endif ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible ld1 {$rk0s}, [$cc], #16 @ load rk0 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr ld1 {$rk1s}, [$cc], #16 @ load rk1 lsr $rctr32x, $ctr96_t32x, #32 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w fmov $ctr3d, $ctr96_b64x @ CTR block 3 rev $rctr32w, $rctr32w @ rev_ctr32 fmov $ctr1d, $ctr96_b64x @ CTR block 1 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 ld1 {$rk2s}, [$cc], #16 @ load rk2 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 rev $ctr32w, $rctr32w @ CTR block 1 add $rctr32w, $rctr32w, #1 @ CTR block 1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 ld1 {$rk3s}, [$cc], #16 @ load rk3 fmov $ctr1.d[1], $ctr32x @ CTR block 1 rev $ctr32w, $rctr32w @ CTR block 2 add $rctr32w, $rctr32w, #1 @ CTR block 2 fmov $ctr2d, $ctr96_b64x @ CTR block 2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 fmov $ctr2.d[1], $ctr32x @ CTR block 2 rev $ctr32w, $rctr32w @ CTR block 3 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 fmov $ctr3.d[1], $ctr32x @ CTR block 3 ld1 {$rk4s}, [$cc], #16 @ load rk4 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ld1 {$rk5s}, [$cc], #16 @ load rk5 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ldr $h4q, [$current_tag, #112] @ load h4l | h4h #ifndef __AARCH64EB__ ext $h4b, $h4b, $h4b, #8 #endif aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ldr $h2q, [$current_tag, #64] @ load h2l | h2h #ifndef __AARCH64EB__ ext $h2b, $h2b, $h2b, #8 #endif aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ldr $h3q, [$current_tag, #80] @ load h3l | h3h #ifndef __AARCH64EB__ ext $h3b, $h3b, $h3b, #8 #endif aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ldr $h1q, [$current_tag, #32] @ load h1l | h1h #ifndef __AARCH64EB__ ext $h1b, $h1b, $h1b, #8 #endif aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ld1 {$rk6s}, [$cc], #16 @ load rk6 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 ld1 {$rk7s}, [$cc], #16 @ load rk7 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ld1 {$rk8s}, [$cc], #16 @ load rk8 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ld1 {$rk9s}, [$cc], #16 @ load rk9 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 ld1 { $acc_lb}, [$current_tag] ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 add $rctr32w, $rctr32w, #1 @ CTR block 3 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 ld1 {$rk10s}, [$cc], #16 @ load rk10 
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 ld1 {$rk11s}, [$cc], #16 @ load rk11 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h aese $ctr3b, $rk11 @ AES block 3 - round 11 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k aese $ctr2b, $rk11 @ AES block 2 - round 11 aese $ctr1b, $rk11 @ AES block 1 - round 11 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k aese $ctr0b, $rk11 @ AES block 0 - round 11 b.ge .L192_dec_tail @ handle tail ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result rev $ctr32w, $rctr32w @ CTR block 4 ld1 {$res2b, $res3b}, [$input_ptr], #32 @ AES block 2,3 - load ciphertext mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 add $rctr32w, $rctr32w, #1 @ CTR block 4 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high rev64 $res0b, $res0b @ GHASH block 0 fmov $ctr0d, $ctr96_b64x @ CTR block 4 rev64 $res1b, $res1b @ GHASH block 1 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low #ifdef __AARCH64EB__ rev $output_l1, $output_l1 #endif fmov $ctr0.d[1], $ctr32x @ CTR block 4 rev $ctr32w, $rctr32w @ CTR 
block 5 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 fmov $ctr1d, $ctr96_b64x @ CTR block 5 eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high #ifdef __AARCH64EB__ rev $output_h1, $output_h1 #endif add $rctr32w, $rctr32w, #1 @ CTR block 5 fmov $ctr1.d[1], $ctr32x @ CTR block 5 eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif rev $ctr32w, $rctr32w @ CTR block 6 eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result add $rctr32w, $rctr32w, #1 @ CTR block 6 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result b.ge .L192_dec_prepretail @ do prepretail .L192_dec_main_loop: @ main loop start aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result rev64 $res3b, $res3b @ GHASH block 4k+3 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 eor $res0b, $res0b, $acc_lb @ PRE 1 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 mov $t0d, $res0.d[1] @ GHASH block 4k - mid pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid rev $ctr32w, $rctr32w @ CTR block 4k+7 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high #ifdef __AARCH64EB__ rev $output_h2, $output_h2 #endif aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 rev64 $res2b, $res2b @ GHASH block 4k+2 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low #ifdef __AARCH64EB__ rev $output_l2, $output_l2 #endif aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid aese 
$ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 movi $mod_constant.8b, #0xc2 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 shl $mod_constantd, $mod_constantd, #56 @ mod_constant aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low #ifdef __AARCH64EB__ rev $output_l3, $output_l3 #endif aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment aese $ctr0b, $rk11 @ AES block 4k+4 - round 11 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext aese 
$ctr1b, $rk11 @ AES block 4k+5 - round 11 ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext rev $ctr32w, $rctr32w @ CTR block 4k+8 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high #ifdef __AARCH64EB__ rev $output_h3, $output_h3 #endif eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result rev64 $res1b, $res1b @ GHASH block 4k+5 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 rev $ctr32w, $rctr32w @ CTR block 4k+9 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low #ifdef __AARCH64EB__ rev $output_l1, $output_l1 #endif fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 rev $ctr32w, $rctr32w @ CTR block 4k+10 eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high #ifdef __AARCH64EB__ rev $output_h1, $output_h1 #endif eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 rev64 $res0b, $res0b @ GHASH block 4k+4 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result b.lt .L192_dec_main_loop .L192_dec_prepretail: @ PREPRETAIL mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid eor $res0b, $res0b, $acc_lb @ PRE 1 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 mov $output_l3, $ctr3.d[0] @ AES block 
4k+3 - mov low aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low mov $t0d, $res0.d[1] @ GHASH block 4k - mid fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 rev64 $res2b, $res2b @ GHASH block 4k+2 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 rev $ctr32w, $rctr32w @ CTR block 4k+7 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high #ifdef __AARCH64EB__ rev $output_h3, $output_h3 #endif fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low #ifdef __AARCH64EB__ rev $output_l2, $output_l2 #endif pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high #ifdef __AARCH64EB__ rev $output_h2, $output_h2 #endif eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low #ifdef __AARCH64EB__ rev $output_l3, $output_l3 #endif stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result rev64 $res3b, $res3b @ GHASH block 4k+3 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high movi $mod_constant.8b, #0xc2 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 shl $mod_constantd, $mod_constantd, #56 @ mod_constant eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high aese $ctr0b, $rk5 \n aesmc $ctr0b, 
$ctr0b @ AES block 4k+4 - round 5 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 aese $ctr0b, $rk11 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low aese $ctr2b, $rk11 aese $ctr1b, $rk11 aese $ctr3b, $rk11 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low .L192_dec_tail: @ TAIL sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag cmp $main_end_input_ptr, #48 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif b.gt .L192_dec_blocks_more_than_3 movi $acc_l.8b, #0 movi 
$acc_h.8b, #0 mov $ctr3b, $ctr2b mov $ctr2b, $ctr1b sub $rctr32w, $rctr32w, #1 movi $acc_m.8b, #0 cmp $main_end_input_ptr, #32 b.gt .L192_dec_blocks_more_than_2 mov $ctr3b, $ctr1b cmp $main_end_input_ptr, #16 sub $rctr32w, $rctr32w, #1 b.gt .L192_dec_blocks_more_than_1 sub $rctr32w, $rctr32w, #1 b .L192_dec_blocks_less_than_1 .L192_dec_blocks_more_than_3: @ blocks left > 3 rev64 $res0b, $res1b @ GHASH final-3 block ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result eor $res0b, $res0b, $t0.16b @ feed in partial tag eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif movi $t0.8b, #0 @ suppress further partial tag feed in pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif .L192_dec_blocks_more_than_2: @ blocks left > 2 rev64 $res0b, $res1b @ GHASH final-2 block ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext eor $res0b, $res0b, $t0.16b @ feed in partial tag movi $t0.8b, #0 @ suppress further partial tag feed in eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid .L192_dec_blocks_more_than_1: @ blocks left > 1 rev64 $res0b, $res1b @ GHASH final-1 block eor $res0b, $res0b, $t0.16b @ feed in partial tag ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high eor $ctr0b, $res1b, $ctr3b @ AES final block - result stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low mov $output_h0, $ctr0.d[1] @ AES final block - mov high ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid mov $output_l0, $ctr0.d[0] @ AES final block - mov low pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - 
mid movi $t0.8b, #0 @ suppress further partial tag feed in eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid .L192_dec_blocks_less_than_1: @ blocks left <= 1 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite and $bit_length, $bit_length, #127 @ bit_length %= 128 sub $bit_length, $bit_length, #128 @ bit_length -= 128 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) and $bit_length, $bit_length, #127 @ bit_length %= 128 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block cmp $bit_length, #64 csel $ctr32x, $rk12_l, $rk12_h, lt csel $ctr96_b64x, $rk12_h, xzr, lt fmov $ctr0d, $ctr32x @ ctr0b is mask for last block and $output_l0, $output_l0, $ctr32x bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes orr $output_l0, $output_l0, $end_input_ptr mov $ctr0.d[1], $ctr96_b64x #ifndef __AARCH64EB__ rev $ctr32w, $rctr32w #else mov $ctr32w, $rctr32w #endif and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits str $ctr32w, [$counter, #12] @ store the updated counter rev64 $res0b, $res1b @ GHASH final block eor $res0b, $res0b, $t0.16b @ feed in partial tag bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes and $output_h0, $output_h0, $ctr96_b64x pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high mov $t0d, $res0.d[1] @ GHASH final block - mid pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid movi $mod_constant.8b, #0xc2 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up shl $mod_constantd, $mod_constantd, #56 @ mod_constant eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid orr $output_h0, $output_h0, $main_end_input_ptr stp $output_l0, $output_h0, [$output_ptr] ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb mov x0, $len st1 { $acc_l.16b }, [$current_tag] ldp x21, x22, [sp, #16] ldp x23, x24, [sp, #32] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #64] ldp d12, d13, [sp, #80] ldp d14, d15, [sp, #96] ldp x19, x20, [sp], #112 ret .L192_dec_ret: mov w0, #0x0 ret .size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel ___ } { my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); my 
($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); my ($output_l0,$output_h0)=map("x$_",(6..7)); my $ctr32w="w9"; my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15)); my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); my $t0="v8"; my $t0d="d8"; my $t1="v4"; my $t1d="d4"; my $t2="v8"; my $t2d="d8"; my $t3="v4"; my $t3d="d4"; my $t4="v4"; my $t4d="d4"; my $t5="v5"; my $t5d="d5"; my $t6="v8"; my $t6d="d8"; my $t7="v5"; my $t7d="d5"; my $t8="v6"; my $t8d="d6"; my $t9="v4"; my $t9d="d4"; my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7)); my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7)); my $mod_constantd="d8"; my $mod_constant="v8"; my $mod_t="v7"; my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31)); my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s,$rk12s,$rk13s)=map("v$_.4s",(18..31)); my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31)); my $rk2q1="v20.1q"; my $rk3q1="v21.1q"; my $rk4v="v22"; my $rk4d="d22"; ######################################################################################### # size_t aes_gcm_enc_256_kernel(const unsigned char *in, # size_t len, # unsigned char *out, # const void *key, # unsigned char ivec[16], # u64 *Xi); # $code.=<<___; .global aes_gcm_enc_256_kernel .type aes_gcm_enc_256_kernel,%function .align 4 aes_gcm_enc_256_kernel: + AARCH64_VALID_CALL_TARGET cbz x1, .L256_enc_ret stp x19, x20, [sp, #-112]! 
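// Note on how this 256-bit kernel feeds its key material, as read from the
// loads that follow (offsets are those used by the code below):
//  - round keys rk0..rk13 are streamed into v18..v31 with post-incremented
//    ld1 loads from the key-schedule pointer, while the two 64-bit halves of
//    the last round key (rk14, at offset 224 = 14*16) stay in general
//    registers; they are xor-ed into each plaintext block up front, so the
//    final AddRoundKey is merged into the plaintext/keystream xor once the
//    aese/aesmc chain ends at rk13.
//  - the hash keys h1..h4 are loaded from the GHASH state block at offsets
//    32/64/80/112, and trn1/trn2 plus eor pack the h12k/h34k values used for
//    the karatsuba-style middle terms of the GHASH multiplies.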
mov x16, x4 mov x8, x5 stp x21, x22, [sp, #16] stp x23, x24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev $ctr96_b64x, $ctr96_b64x rev $ctr96_t32x, $ctr96_t32x #endif ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14 #ifdef __AARCH64EB__ ror $rk14_l, $rk14_l, #32 ror $rk14_h, $rk14_h, #32 #endif ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ld1 {$rk0s}, [$cc], #16 @ load rk0 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ld1 {$rk1s}, [$cc], #16 @ load rk1 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr lsr $rctr32x, $ctr96_t32x, #32 fmov $ctr2d, $ctr96_b64x @ CTR block 2 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w rev $rctr32w, $rctr32w @ rev_ctr32 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks fmov $ctr1d, $ctr96_b64x @ CTR block 1 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 rev $ctr32w, $rctr32w @ CTR block 1 fmov $ctr3d, $ctr96_b64x @ CTR block 3 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 add $rctr32w, $rctr32w, #1 @ CTR block 1 ld1 {$rk2s}, [$cc], #16 @ load rk2 fmov $ctr1.d[1], $ctr32x @ CTR block 1 rev $ctr32w, $rctr32w @ CTR block 2 add $rctr32w, $rctr32w, #1 @ CTR block 2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 ld1 {$rk3s}, [$cc], #16 @ load rk3 fmov $ctr2.d[1], $ctr32x @ CTR block 2 rev $ctr32w, $rctr32w @ CTR block 3 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 fmov $ctr3.d[1], $ctr32x @ CTR block 3 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ld1 {$rk4s}, [$cc], #16 @ load rk4 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ld1 {$rk5s}, [$cc], #16 @ load rk5 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ld1 {$rk6s}, [$cc], #16 @ load rk6 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ldr $h3q, [$current_tag, #80] @ load h3l | h3h #ifndef __AARCH64EB__ ext $h3b, $h3b, $h3b, #8 #endif aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ld1 {$rk7s}, [$cc], #16 @ load rk7 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ld1 {$rk8s}, [$cc], #16 @ load rk8 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 ldr $h2q, [$current_tag, #64] @ load h2l | h2h #ifndef __AARCH64EB__ ext $h2b, $h2b, $h2b, #8 #endif aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ld1 {$rk9s}, [$cc], #16 @ load rk9 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ldr $h4q, [$current_tag, #112] @ load h4l | h4h #ifndef __AARCH64EB__ ext $h4b, $h4b, $h4b, #8 #endif aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 ld1 {$rk10s}, [$cc], #16 @ load rk10 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 ld1 {$rk11s}, [$cc], #16 @ load rk11 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 add $rctr32w, $rctr32w, #1 @ CTR block 3 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES 
block 0 - round 3 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 ld1 { $acc_lb}, [$current_tag] ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 ld1 {$rk12s}, [$cc], #16 @ load rk12 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 ldr $h1q, [$current_tag, #32] @ load h1l | h1h #ifndef __AARCH64EB__ ext $h1b, $h1b, $h1b, #8 #endif aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 ld1 {$rk13s}, [$cc], #16 @ load rk13 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11 aese $ctr2b, $rk13 @ AES block 2 - round 13 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12 aese $ctr1b, $rk13 @ AES block 1 - round 13 aese $ctr0b, $rk13 @ AES block 0 - round 13 aese $ctr3b, $rk13 @ AES block 3 - round 13 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k b.ge .L256_enc_tail @ handle tail ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext #ifdef __AARCH64EB__ rev $input_l1, $input_l1 rev $input_h1, $input_h1 #endif rev $ctr32w, $rctr32w @ CTR block 4 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 
#endif ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext #ifdef __AARCH64EB__ rev $input_l3, $input_l3 rev $input_h3, $input_h3 #endif ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext #ifdef __AARCH64EB__ rev $input_l2, $input_l2 rev $input_h2, $input_h2 #endif add $input_ptr, $input_ptr, #64 @ AES input_ptr update eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low add $rctr32w, $rctr32w, #1 @ CTR block 4 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result fmov $ctr0d, $ctr96_b64x @ CTR block 4 fmov $ctr0.d[1], $ctr32x @ CTR block 4 rev $ctr32w, $rctr32w @ CTR block 5 add $rctr32w, $rctr32w, #1 @ CTR block 5 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result fmov $ctr1d, $ctr96_b64x @ CTR block 5 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 fmov $ctr1.d[1], $ctr32x @ CTR block 5 rev $ctr32w, $rctr32w @ CTR block 6 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result add $rctr32w, $rctr32w, #1 @ CTR block 6 fmov $ctr2d, $ctr96_b64x @ CTR block 6 fmov $ctr2.d[1], $ctr32x @ CTR block 6 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result rev $ctr32w, $rctr32w @ CTR block 7 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result b.ge L256_enc_prepretail @ do prepretail .L256_enc_main_loop: @ main loop start aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext #ifdef __AARCH64EB__ rev $input_l3, $input_l3 rev $input_h3, $input_h3 #endif aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext #ifdef __AARCH64EB__ rev $input_l2, $input_l2 rev $input_h2, $input_h2 #endif aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 
- round 2 eor $res0b, $res0b, $acc_lb @ PRE 1 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high mov $t0d, $res0.d[1] @ GHASH block 4k - mid aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext #ifdef __AARCH64EB__ rev $input_l1, $input_l1 rev $input_h1, $input_h1 #endif aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid pmull2 $t7.1q, $res3.2d, 
$h1.2d @ GHASH block 4k+3 - high eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 movi $mod_constant.8b, #0xc2 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 shl $mod_constantd, $mod_constantd, #56 @ mod_constant aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 add $input_ptr, $input_ptr, #64 @ AES input_ptr update pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid rev $ctr32w, $rctr32w @ CTR block 4k+8 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align 
with low
        eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
        fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
        fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
        rev $ctr32w, $rctr32w @ CTR block 4k+9
        add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
        eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
        fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
        aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
        fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
        aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
        rev $ctr32w, $rctr32w @ CTR block 4k+10
        st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
        eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
        fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
        ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
        st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
        add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
        aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
        eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
        fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
        st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
        fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
        rev $ctr32w, $rctr32w @ CTR block 4k+11
        eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
        eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
        st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
        b.lt .L256_enc_main_loop
.L256_enc_prepretail: @ PREPRETAIL
        aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
        rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
        aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
        fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
        aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
        rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
        fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
        ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
        aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
        aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
        eor $res0b, $res0b, $acc_lb @ PRE 1
        rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
        aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
        aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
        mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
        aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
        pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
        mov $t0d, $res0.d[1] @ GHASH block 4k - mid
        pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
        aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
        aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
        eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
        aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
        aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
        aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
        pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
        pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
        pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
        aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
        eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
        mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
        aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
        eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
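        @ (informal sketch of the GHASH arithmetic that is interleaved with the AES
        @ rounds here and in the main loop; names refer to the registers used above)
        @ each byte-reversed ciphertext block Xi is folded in with one Karatsuba
        @ multiplication over GF(2^128):
        @     acc_h ^= hi64(Xi)*hi64(Hk)                         - pmull2
        @     acc_l ^= lo64(Xi)*lo64(Hk)                         - pmull
        @     acc_m ^= (hi64(Xi)^lo64(Xi))*(hi64(Hk)^lo64(Hk))   - pmull on the
        @                pre-folded halves kept in h34k/h12k
        @ block 4k is multiplied by H^4, 4k+1 by H^3, 4k+2 by H^2 and 4k+3 by H, so
        @ the three accumulators need only one reduction by the 0xc2 constant per
        @ four blocks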
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 movi $mod_constant.8b, #0xc2 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 shl $mod_constantd, $mod_constantd, #56 @ mod_constant aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up pmull $t1.1q, $acc_h.1d, $mod_constant.1d ext $acc_hb, $acc_hb, $acc_hb, #8 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 eor $acc_mb, $acc_mb, $acc_lb aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 eor $acc_mb, $acc_mb, $t1.16b aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 aese 
$ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 eor $acc_mb, $acc_mb, $acc_hb aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 pmull $t1.1q, $acc_m.1d, $mod_constant.1d aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 ext $acc_mb, $acc_mb, $acc_mb, #8 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 eor $acc_lb, $acc_lb, $t1.16b aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 eor $acc_lb, $acc_lb, $acc_mb .L256_enc_tail: @ TAIL ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high cmp $main_end_input_ptr, #48 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result b.gt .L256_enc_blocks_more_than_3 cmp $main_end_input_ptr, #32 mov $ctr3b, $ctr2b movi $acc_l.8b, #0 movi $acc_h.8b, #0 sub $rctr32w, $rctr32w, #1 mov $ctr2b, $ctr1b movi $acc_m.8b, #0 b.gt .L256_enc_blocks_more_than_2 mov $ctr3b, $ctr1b sub $rctr32w, $rctr32w, #1 cmp $main_end_input_ptr, #16 b.gt .L256_enc_blocks_more_than_1 sub $rctr32w, $rctr32w, #1 b .L256_enc_blocks_less_than_1 .L256_enc_blocks_more_than_3: @ blocks left > 3 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif rev64 $res0b, $res1b @ GHASH final-3 block eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low eor $res0b, $res0b, $t0.16b @ feed in partial tag eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid fmov $res1d, $input_l0 @ AES final-2 block - mov low fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid movi $t0.8b, #0 @ suppress further partial tag feed in mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid eor $res1b, $res1b, $ctr1b @ AES final-2 block - result .L256_enc_blocks_more_than_2: @ blocks left > 2 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif rev64 $res0b, $res1b @ GHASH final-2 block eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low eor $res0b, $res0b, $t0.16b @ feed in partial tag fmov $res1d, $input_l0 @ AES final-1 block - mov low eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - 
round 14 high fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high movi $t0.8b, #0 @ suppress further partial tag feed in pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid eor $res1b, $res1b, $ctr2b @ AES final-1 block - result eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid .L256_enc_blocks_more_than_1: @ blocks left > 1 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result rev64 $res0b, $res1b @ GHASH final-1 block ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high #ifdef __AARCH64EB__ rev $input_l0, $input_l0 rev $input_h0, $input_h0 #endif eor $res0b, $res0b, $t0.16b @ feed in partial tag movi $t0.8b, #0 @ suppress further partial tag feed in eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid fmov $res1d, $input_l0 @ AES final block - mov low fmov $res1.d[1], $input_h0 @ AES final block - mov high pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low eor $res1b, $res1b, $ctr3b @ AES final block - result eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low .L256_enc_blocks_less_than_1: @ blocks left <= 1 and $bit_length, $bit_length, #127 @ bit_length %= 128 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff sub $bit_length, $bit_length, #128 @ bit_length -= 128 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff and $bit_length, $bit_length, #127 @ bit_length %= 128 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block cmp $bit_length, #64 csel $input_l0, $rk14_l, $rk14_h, lt csel $input_h0, $rk14_h, xzr, lt fmov $ctr0d, $input_l0 @ ctr0b is mask for last block fmov $ctr0.d[1], $input_h0 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits rev64 $res0b, $res1b @ GHASH final block eor $res0b, $res0b, $t0.16b @ feed in partial tag bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high mov $t0d, $res0.d[1] @ GHASH final block - mid #ifndef __AARCH64EB__ rev $ctr32w, $rctr32w #else mov $ctr32w, $rctr32w #endif pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid movi $mod_constant.8b, #0xc2 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up shl $mod_constantd, $mod_constantd, #56 @ 
mod_constant eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment str $ctr32w, [$counter, #12] @ store the updated counter st1 { $res1b}, [$output_ptr] @ store all 16B eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb mov x0, $len st1 { $acc_l.16b }, [$current_tag] ldp x21, x22, [sp, #16] ldp x23, x24, [sp, #32] ldp d8, d9, [sp, #48] ldp d10, d11, [sp, #64] ldp d12, d13, [sp, #80] ldp d14, d15, [sp, #96] ldp x19, x20, [sp], #112 ret .L256_enc_ret: mov w0, #0x0 ret .size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel ___ { my $t8="v4"; my $t8d="d4"; my $t9="v6"; my $t9d="d6"; ######################################################################################### # size_t aes_gcm_dec_256_kernel(const unsigned char *in, # size_t len, # unsigned char *out, # const void *key, # unsigned char ivec[16], # u64 *Xi); # $code.=<<___; .global aes_gcm_dec_256_kernel .type aes_gcm_dec_256_kernel,%function .align 4 aes_gcm_dec_256_kernel: + AARCH64_VALID_CALL_TARGET cbz x1, .L256_dec_ret stp x19, x20, [sp, #-112]! mov x16, x4 mov x8, x5 stp x21, x22, [sp, #16] stp x23, x24, [sp, #32] stp d8, d9, [sp, #48] stp d10, d11, [sp, #64] stp d12, d13, [sp, #80] stp d14, d15, [sp, #96] lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev $ctr96_b64x, $ctr96_b64x rev $ctr96_t32x, $ctr96_t32x #endif ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14 #ifdef __AARCH64EB__ ror $rk14_h, $rk14_h, #32 ror $rk14_l, $rk14_l, #32 #endif ld1 {$rk0s}, [$cc], #16 @ load rk0 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 ld1 {$rk1s}, [$cc], #16 @ load rk1 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr ld1 {$rk2s}, [$cc], #16 @ load rk2 lsr $rctr32x, $ctr96_t32x, #32 ld1 {$rk3s}, [$cc], #16 @ load rk3 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w ld1 {$rk4s}, [$cc], #16 @ load rk4 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr rev $rctr32w, $rctr32w @ rev_ctr32 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 fmov $ctr3d, $ctr96_b64x @ CTR block 3 rev $ctr32w, $rctr32w @ CTR block 1 add $rctr32w, $rctr32w, #1 @ CTR block 1 fmov $ctr1d, $ctr96_b64x @ CTR block 1 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible fmov $ctr1.d[1], $ctr32x @ CTR block 1 rev $ctr32w, $rctr32w @ CTR block 2 add $rctr32w, $rctr32w, #1 @ CTR block 2 fmov $ctr2d, $ctr96_b64x @ CTR block 2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 fmov $ctr2.d[1], $ctr32x @ CTR block 2 rev $ctr32w, $rctr32w @ CTR block 3 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 ld1 {$rk5s}, [$cc], #16 @ load rk5 fmov $ctr3.d[1], $ctr32x @ CTR block 3 add $rctr32w, $rctr32w, #1 @ CTR block 3 ld1 {$rk6s}, 
[$cc], #16 @ load rk6 ld1 {$rk7s}, [$cc], #16 @ load rk7 ld1 {$rk8s}, [$cc], #16 @ load rk8 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 ldr $h3q, [$current_tag, #80] @ load h3l | h3h #ifndef __AARCH64EB__ ext $h3b, $h3b, $h3b, #8 #endif aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 ldr $h4q, [$current_tag, #112] @ load h4l | h4h #ifndef __AARCH64EB__ ext $h4b, $h4b, $h4b, #8 #endif aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ldr $h2q, [$current_tag, #64] @ load h2l | h2h #ifndef __AARCH64EB__ ext $h2b, $h2b, $h2b, #8 #endif aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 ld1 {$rk9s}, [$cc], #16 @ load rk9 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 ld1 { $acc_lb}, [$current_tag] ext $acc_lb, $acc_lb, $acc_lb, #8 rev64 $acc_lb, $acc_lb aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 ld1 {$rk10s}, [$cc], #16 @ load rk10 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 ld1 {$rk11s}, [$cc], #16 @ load rk11 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 ldr $h1q, [$current_tag, #32] @ load h1l | h1h #ifndef __AARCH64EB__ ext $h1b, $h1b, $h1b, #8 #endif aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 ld1 {$rk12s}, [$cc], #16 @ load rk12 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 ld1 {$rk13s}, [$cc], #16 @ load rk13 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 aese 
$ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k aese $ctr1b, $rk13 @ AES block 1 - round 13 aese $ctr2b, $rk13 @ AES block 2 - round 13 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k aese $ctr3b, $rk13 @ AES block 3 - round 13 aese $ctr0b, $rk13 @ AES block 0 - round 13 b.ge .L256_dec_tail @ handle tail ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext rev $ctr32w, $rctr32w @ CTR block 4 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result rev64 $res1b, $res1b @ GHASH block 1 ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low rev64 $res0b, $res0b @ GHASH block 0 add $rctr32w, $rctr32w, #1 @ CTR block 4 fmov $ctr0d, $ctr96_b64x @ CTR block 4 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 fmov $ctr0.d[1], $ctr32x @ CTR block 4 rev $ctr32w, $rctr32w @ CTR block 5 add $rctr32w, $rctr32w, #1 @ CTR block 5 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result fmov $ctr1d, $ctr96_b64x @ CTR block 5 ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext fmov $ctr1.d[1], $ctr32x @ CTR block 5 rev $ctr32w, $rctr32w @ CTR block 6 add $rctr32w, $rctr32w, #1 @ CTR block 6 eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low #ifdef __AARCH64EB__ rev $output_l1, $output_l1 #endif orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high #ifdef __AARCH64EB__ rev $output_h1, $output_h1 #endif stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks b.ge .L256_dec_prepretail @ do prepretail .L256_dec_main_loop: @ main loop start mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 fmov $ctr2d, $ctr96_b64x @ 
CTR block 4k+6 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 eor $res0b, $res0b, $acc_lb @ PRE 1 rev $ctr32w, $rctr32w @ CTR block 4k+7 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high mov $t0d, $res0.d[1] @ GHASH block 4k - mid fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high #ifdef __AARCH64EB__ rev $output_h2, $output_h2 #endif aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 rev64 $res2b, $res2b @ GHASH block 4k+2 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low #ifdef __AARCH64EB__ rev $output_l2, $output_l2 #endif aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 rev64 $res3b, $res3b @ GHASH block 4k+3 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low #ifdef __AARCH64EB__ rev $output_l3, $output_l3 #endif pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high #ifdef __AARCH64EB__ rev $output_h3, $output_h3 #endif eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid rev $ctr32w, $rctr32w @ CTR block 4k+8 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid aese $ctr0b, $rk6 \n aesmc 
$ctr0b, $ctr0b @ AES block 4k+4 - round 6 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid movi $mod_constant.8b, #0xc2 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 shl $mod_constantd, $mod_constantd, #56 @ mod_constant aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - 
fold into mid aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result rev $ctr32w, $rctr32w @ CTR block 4k+9 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 rev $ctr32w, $rctr32w @ CTR block 4k+10 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 rev64 $res1b, $res1b @ GHASH block 4k+5 eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high #ifdef __AARCH64EB__ rev $output_h1, $output_h1 #endif stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low #ifdef __AARCH64EB__ rev $output_l1, $output_l1 #endif stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result rev64 $res0b, $res0b @ GHASH block 4k+4 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low b.lt .L256_dec_main_loop .L256_dec_prepretail: @ PREPRETAIL ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 rev $ctr32w, $rctr32w @ CTR block 4k+7 eor $res0b, $res0b, $acc_lb @ PRE 1 rev64 $res2b, $res2b @ GHASH block 4k+2 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low mov $t0d, $res0.d[1] @ GHASH block 4k - mid fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high aese $ctr2b, $rk1 \n aesmc 
$ctr2b, $ctr2b @ AES block 4k+6 - round 1 rev64 $res3b, $res3b @ GHASH block 4k+3 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 movi $mod_constant.8b, #0xc2 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 shl $mod_constantd, 
$mod_constantd, #56 @ mod_constant aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high #ifdef __AARCH64EB__ rev $output_h2, $output_h2 #endif aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low #ifdef __AARCH64EB__ rev $output_l3, $output_l3 #endif aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low #ifdef __AARCH64EB__ rev $output_l2, $output_l2 #endif aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high #ifdef __AARCH64EB__ rev $output_h3, $output_h3 #endif aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low .L256_dec_tail: @ TAIL sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag cmp $main_end_input_ptr, #48 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif b.gt .L256_dec_blocks_more_than_3 sub $rctr32w, $rctr32w, #1 mov $ctr3b, $ctr2b movi $acc_m.8b, #0 movi $acc_l.8b, #0 cmp $main_end_input_ptr, #32 movi 
$acc_h.8b, #0 mov $ctr2b, $ctr1b b.gt .L256_dec_blocks_more_than_2 sub $rctr32w, $rctr32w, #1 mov $ctr3b, $ctr1b cmp $main_end_input_ptr, #16 b.gt .L256_dec_blocks_more_than_1 sub $rctr32w, $rctr32w, #1 b .L256_dec_blocks_less_than_1 .L256_dec_blocks_more_than_3: @ blocks left > 3 rev64 $res0b, $res1b @ GHASH final-3 block ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid eor $res0b, $res0b, $t0.16b @ feed in partial tag eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid movi $t0.8b, #0 @ suppress further partial tag feed in pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif .L256_dec_blocks_more_than_2: @ blocks left > 2 rev64 $res0b, $res1b @ GHASH final-2 block ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext eor $res0b, $res0b, $t0.16b @ feed in partial tag stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low movi $t0.8b, #0 @ suppress further partial tag feed in pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low #ifdef __AARCH64EB__ rev $output_l0, $output_l0 #endif eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high #ifdef __AARCH64EB__ rev $output_h0, $output_h0 #endif .L256_dec_blocks_more_than_1: @ blocks left > 1 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result rev64 $res0b, $res1b @ GHASH final-1 block ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext eor $res0b, $res0b, $t0.16b @ feed in partial tag movi $t0.8b, #0 @ suppress further partial tag feed in mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid eor $ctr0b, $res1b, $ctr3b @ AES final block - result pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low mov $output_l0, $ctr0.d[0] @ AES final block - mov low ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid mov $output_h0, $ctr0.d[1] @ AES final block - mov high pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low #ifdef __AARCH64EB__ rev 
$output_l0, $output_l0
#endif
        eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
        eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
        eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
        eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
#ifdef __AARCH64EB__
        rev $output_h0, $output_h0
#endif
.L256_dec_blocks_less_than_1: @ blocks left <= 1
        and $bit_length, $bit_length, #127 @ bit_length %= 128
        mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
        sub $bit_length, $bit_length, #128 @ bit_length -= 128
        mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
        ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
        neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
        and $bit_length, $bit_length, #127 @ bit_length %= 128
        lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
        cmp $bit_length, #64
        csel $ctr32x, $rk14_l, $rk14_h, lt
        csel $ctr96_b64x, $rk14_h, xzr, lt
        fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
        and $output_l0, $output_l0, $ctr32x
        mov $ctr0.d[1], $ctr96_b64x
        bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
#ifndef __AARCH64EB__
        rev $ctr32w, $rctr32w
#else
        mov $ctr32w, $rctr32w
#endif
        bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
        orr $output_l0, $output_l0, $end_input_ptr
        and $output_h0, $output_h0, $ctr96_b64x
        orr $output_h0, $output_h0, $main_end_input_ptr
        and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
        rev64 $res0b, $res1b @ GHASH final block
        eor $res0b, $res0b, $t0.16b @ feed in partial tag
        pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
        mov $t0d, $res0.d[1] @ GHASH final block - mid
        eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
        pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
        pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
        eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
        eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
        eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
        movi $mod_constant.8b, #0xc2
        eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
        shl $mod_constantd, $mod_constantd, #56 @ mod_constant
        eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
        pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
        ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
        eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
        eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
        pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
        ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
        eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
        stp $output_l0, $output_h0, [$output_ptr]
        str $ctr32w, [$counter, #12] @ store the updated counter
        eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
        ext $acc_lb, $acc_lb, $acc_lb, #8
        rev64 $acc_lb, $acc_lb
        mov x0, $len
        st1 { $acc_l.16b }, [$current_tag]
        ldp x21, x22, [sp, #16]
        ldp x23, x24, [sp, #32]
        ldp d8, d9, [sp, #48]
        ldp d10, d11, [sp, #64]
        ldp d12, d13, [sp, #80]
        ldp d14, d15, [sp, #96]
        ldp x19, x20, [sp], #112
        ret
.L256_dec_ret:
        mov w0, #0x0
        ret
.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
___
}
}
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by "
.align 2
#endif
___
if ($flavour =~ /64/) { ######## 64-bit code
sub unvmov {
    my $arg=shift;
    $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
    sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
            $3<8?$3:$3+8,($4 eq "lo")?0:1;
}
foreach(split("\n",$code)) {
    s/@\s/\/\//o; # old->new style commentary
    print $_,"\n";
}
} else { ######## 32-bit code
sub unvdup32 {
    my $arg=shift;
    $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
    sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvpmullp64 {
    my ($mnemonic,$arg)=@_;
    if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
        my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
                             |(($2&7)<<17)|(($2&8)<<4)
                             |(($3&7)<<1) |(($3&8)<<2);
        $word |= 0x00010001 if ($mnemonic =~ "2");
        # since ARMv7 instructions are always encoded little-endian.
        # correct solution is to use .inst directive, but older
        # assemblers don't implement it:-(
        sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                $word&0xff,($word>>8)&0xff,
                ($word>>16)&0xff,($word>>24)&0xff,
                $mnemonic,$arg;
    }
}
foreach(split("\n",$code)) {
    s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
    s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
    s/\/\/\s?/@ /o; # new->old style commentary
    # fix up remaining new-style suffixes
    s/\],#[0-9]+/]!/o;
    s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
    s/vdup\.32\s+(.*)/unvdup32($1)/geo or
    s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
    s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
    s/^(\s+)b\./$1b/o or
    s/^(\s+)ret/$1bx\tlr/o;
    if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
        print " it $2\n";
    }
    s/__AARCH64E([BL])__/__ARME$1__/go;
    print $_,"\n";
}
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush
diff --git a/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl b/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl
index b3d94041729e..cb7720ae9cfb 100644
--- a/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl
+++ b/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl
@@ -1,800 +1,811 @@
#! /usr/bin/env perl
# Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
# Just like aesv8-armx.pl this module supports both AArch32 and
# AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
# November 2017
#
# AArch64 register bank to "accommodate" 4x aggregated reduction and
# improve performance by 20-70% depending on processor.
# # Current performance in cycles per processed byte: # # 64-bit PMULL 32-bit PMULL 32-bit NEON(*) # Apple A7 0.58 0.92 5.62 # Cortex-A53 0.85 1.01 8.39 # Cortex-A57 0.73 1.17 7.61 # Denver 0.51 0.65 6.02 # Mongoose 0.65 1.10 8.06 # Kryo 0.76 1.16 8.00 # ThunderX2 1.05 # # (*) presented for reference/comparison purposes; # $output is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!"; *STDOUT=*OUT; $Xi="x0"; # argument block $Htbl="x1"; $inp="x2"; $len="x3"; $inc="x12"; { my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3)); my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14)); my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte"); $code=<<___; #include "arm_arch.h" #if __ARM_MAX_ARCH__>=7 ___ $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); $code.=<<___ if ($flavour !~ /64/); .fpu neon #ifdef __thumb2__ .syntax unified .thumb # define INST(a,b,c,d) $_byte c,0xef,a,b #else .code 32 # define INST(a,b,c,d) $_byte a,b,c,0xf2 #endif .text ___ ################################################################################ # void gcm_init_v8(u128 Htable[16],const u64 H[2]); # # input: 128-bit H - secret parameter E(K,0^128) # output: precomputed table filled with degrees of twisted H; # H is twisted to handle reverse bitness of GHASH; # only few of 16 slots of Htable[16] are used; # data is opaque to outside world (which allows to # optimize the code independently); # $code.=<<___; .global gcm_init_v8 .type gcm_init_v8,%function .align 4 gcm_init_v8: +___ +$code.=<<___ if ($flavour =~ /64/); + AARCH64_VALID_CALL_TARGET +___ +$code.=<<___; vld1.64 {$t1},[x1] @ load input H vmov.i8 $xC2,#0xe1 vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 vext.8 $IN,$t1,$t1,#8 vshr.u64 $t2,$xC2,#63 vdup.32 $t1,${t1}[1] vext.8 $t0,$t2,$xC2,#8 @ t0=0xc2....01 vshr.u64 $t2,$IN,#63 vshr.s32 $t1,$t1,#31 @ broadcast carry bit vand $t2,$t2,$t0 vshl.i64 $IN,$IN,#1 vext.8 $t2,$t2,$t2,#8 vand $t0,$t0,$t1 vorr $IN,$IN,$t2 @ H<<<=1 veor $H,$IN,$t0 @ twisted H vst1.64 {$H},[x0],#16 @ store Htable[0] @ calculate H^2 vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing vpmull.p64 $Xl,$H,$H veor $t0,$t0,$H vpmull2.p64 $Xh,$H,$H vpmull.p64 $Xm,$t0,$t0 vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing veor $t2,$Xl,$Xh veor $Xm,$Xm,$t1 veor $Xm,$Xm,$t2 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl veor $Xl,$Xm,$t2 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase vpmull.p64 $Xl,$Xl,$xC2 veor $t2,$t2,$Xh veor $H2,$Xl,$t2 vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing veor $t1,$t1,$H2 vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2] ___ if ($flavour =~ /64/) { my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7)); $code.=<<___; @ calculate H^3 and H^4 vpmull.p64 $Xl,$H, $H2 vpmull.p64 $Yl,$H2,$H2 vpmull2.p64 $Xh,$H, $H2 vpmull2.p64 $Yh,$H2,$H2 vpmull.p64 $Xm,$t0,$t1 vpmull.p64 $Ym,$t1,$t1 vext.8 $t0,$Xl,$Xh,#8 @ Karatsuba post-processing vext.8 $t1,$Yl,$Yh,#8 veor $t2,$Xl,$Xh veor $Xm,$Xm,$t0 veor $t3,$Yl,$Yh veor $Ym,$Ym,$t1 veor $Xm,$Xm,$t2 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase veor $Ym,$Ym,$t3 
vpmull.p64 $t3,$Yl,$xC2 vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Yh#lo,$Ym#hi vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl vmov $Ym#hi,$Yl#lo veor $Xl,$Xm,$t2 veor $Yl,$Ym,$t3 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase vext.8 $t3,$Yl,$Yl,#8 vpmull.p64 $Xl,$Xl,$xC2 vpmull.p64 $Yl,$Yl,$xC2 veor $t2,$t2,$Xh veor $t3,$t3,$Yh veor $H, $Xl,$t2 @ H^3 veor $H2,$Yl,$t3 @ H^4 vext.8 $t0,$H, $H,#8 @ Karatsuba pre-processing vext.8 $t1,$H2,$H2,#8 veor $t0,$t0,$H veor $t1,$t1,$H2 vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed vst1.64 {$H-$H2},[x0] @ store Htable[3..5] ___ } $code.=<<___; ret .size gcm_init_v8,.-gcm_init_v8 ___ ################################################################################ # void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]); # # input: Xi - current hash value; # Htable - table precomputed in gcm_init_v8; # output: Xi - next hash value Xi; # $code.=<<___; .global gcm_gmult_v8 .type gcm_gmult_v8,%function .align 4 gcm_gmult_v8: +___ +$code.=<<___ if ($flavour =~ /64/); + AARCH64_VALID_CALL_TARGET +___ +$code.=<<___; vld1.64 {$t1},[$Xi] @ load Xi vmov.i8 $xC2,#0xe1 vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ... vshl.u64 $xC2,$xC2,#57 #ifndef __ARMEB__ vrev64.8 $t1,$t1 #endif vext.8 $IN,$t1,$t1,#8 vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo veor $t1,$t1,$IN @ Karatsuba pre-processing vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing veor $t2,$Xl,$Xh veor $Xm,$Xm,$t1 veor $Xm,$Xm,$t2 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl veor $Xl,$Xm,$t2 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction vpmull.p64 $Xl,$Xl,$xC2 veor $t2,$t2,$Xh veor $Xl,$Xl,$t2 #ifndef __ARMEB__ vrev64.8 $Xl,$Xl #endif vext.8 $Xl,$Xl,$Xl,#8 vst1.64 {$Xl},[$Xi] @ write out Xi ret .size gcm_gmult_v8,.-gcm_gmult_v8 ___ ################################################################################ # void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); # # input: table precomputed in gcm_init_v8; # current hash value Xi; # pointer to input data; # length of input data in bytes, but divisible by block size; # output: next hash value Xi; # $code.=<<___; .global gcm_ghash_v8 .type gcm_ghash_v8,%function .align 4 gcm_ghash_v8: ___ $code.=<<___ if ($flavour =~ /64/); + AARCH64_VALID_CALL_TARGET cmp $len,#64 b.hs .Lgcm_ghash_v8_4x ___ $code.=<<___ if ($flavour !~ /64/); vstmdb sp!,{d8-d15} @ 32-bit ABI says so ___ $code.=<<___; vld1.64 {$Xl},[$Xi] @ load [rotated] Xi @ "[rotated]" means that @ loaded value would have @ to be rotated in order to @ make it appear as in @ algorithm specification subs $len,$len,#32 @ see if $len is 32 or larger mov $inc,#16 @ $inc is used as post- @ increment for input pointer; @ as loop is modulo-scheduled @ $inc is zeroed just in time @ to preclude overstepping @ inp[len], which means that @ last block[s] are actually @ loaded twice, but last @ copy is not processed vld1.64 {$H-$Hhl},[$Htbl],#32 @ load twisted H, ..., H^2 vmov.i8 $xC2,#0xe1 vld1.64 {$H2},[$Htbl] cclr $inc,eq @ is it time to zero $inc? 
vext.8 $Xl,$Xl,$Xl,#8 @ rotate Xi vld1.64 {$t0},[$inp],#16 @ load [rotated] I[0] vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant #ifndef __ARMEB__ vrev64.8 $t0,$t0 vrev64.8 $Xl,$Xl #endif vext.8 $IN,$t0,$t0,#8 @ rotate I[0] b.lo .Lodd_tail_v8 @ $len was less than 32 ___ { my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7)); ####### # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = # [(H*Ii+1) + (H*Xi+1)] mod P = # [(H*Ii+1) + H^2*(Ii+Xi)] mod P # $code.=<<___; vld1.64 {$t1},[$inp],$inc @ load [rotated] I[1] #ifndef __ARMEB__ vrev64.8 $t1,$t1 #endif vext.8 $In,$t1,$t1,#8 veor $IN,$IN,$Xl @ I[i]^=Xi vpmull.p64 $Xln,$H,$In @ H·Ii+1 veor $t1,$t1,$In @ Karatsuba pre-processing vpmull2.p64 $Xhn,$H,$In b .Loop_mod2x_v8 .align 4 .Loop_mod2x_v8: vext.8 $t2,$IN,$IN,#8 subs $len,$len,#32 @ is there more data? vpmull.p64 $Xl,$H2,$IN @ H^2.lo·Xi.lo cclr $inc,lo @ is it time to zero $inc? vpmull.p64 $Xmn,$Hhl,$t1 veor $t2,$t2,$IN @ Karatsuba pre-processing vpmull2.p64 $Xh,$H2,$IN @ H^2.hi·Xi.hi veor $Xl,$Xl,$Xln @ accumulate vpmull2.p64 $Xm,$Hhl,$t2 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) vld1.64 {$t0},[$inp],$inc @ load [rotated] I[i+2] veor $Xh,$Xh,$Xhn cclr $inc,eq @ is it time to zero $inc? veor $Xm,$Xm,$Xmn vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing veor $t2,$Xl,$Xh veor $Xm,$Xm,$t1 vld1.64 {$t1},[$inp],$inc @ load [rotated] I[i+3] #ifndef __ARMEB__ vrev64.8 $t0,$t0 #endif veor $Xm,$Xm,$t2 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction #ifndef __ARMEB__ vrev64.8 $t1,$t1 #endif vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl vext.8 $In,$t1,$t1,#8 vext.8 $IN,$t0,$t0,#8 veor $Xl,$Xm,$t2 vpmull.p64 $Xln,$H,$In @ H·Ii+1 veor $IN,$IN,$Xh @ accumulate $IN early vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction vpmull.p64 $Xl,$Xl,$xC2 veor $IN,$IN,$t2 veor $t1,$t1,$In @ Karatsuba pre-processing veor $IN,$IN,$Xl vpmull2.p64 $Xhn,$H,$In b.hs .Loop_mod2x_v8 @ there was at least 32 more bytes veor $Xh,$Xh,$t2 vext.8 $IN,$t0,$t0,#8 @ re-construct $IN adds $len,$len,#32 @ re-construct $len veor $Xl,$Xl,$Xh @ re-construct $Xl b.eq .Ldone_v8 @ is $len zero? 
___ } $code.=<<___; .Lodd_tail_v8: vext.8 $t2,$Xl,$Xl,#8 veor $IN,$IN,$Xl @ inp^=Xi veor $t1,$t0,$t2 @ $t1 is rotated inp^Xi vpmull.p64 $Xl,$H,$IN @ H.lo·Xi.lo veor $t1,$t1,$IN @ Karatsuba pre-processing vpmull2.p64 $Xh,$H,$IN @ H.hi·Xi.hi vpmull.p64 $Xm,$Hhl,$t1 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing veor $t2,$Xl,$Xh veor $Xm,$Xm,$t1 veor $Xm,$Xm,$t2 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl veor $Xl,$Xm,$t2 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction vpmull.p64 $Xl,$Xl,$xC2 veor $t2,$t2,$Xh veor $Xl,$Xl,$t2 .Ldone_v8: #ifndef __ARMEB__ vrev64.8 $Xl,$Xl #endif vext.8 $Xl,$Xl,$Xl,#8 vst1.64 {$Xl},[$Xi] @ write out Xi ___ $code.=<<___ if ($flavour !~ /64/); vldmia sp!,{d8-d15} @ 32-bit ABI says so ___ $code.=<<___; ret .size gcm_ghash_v8,.-gcm_ghash_v8 ___ if ($flavour =~ /64/) { # 4x subroutine my ($I0,$j1,$j2,$j3, $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23)); $code.=<<___; .type gcm_ghash_v8_4x,%function .align 4 gcm_ghash_v8_4x: .Lgcm_ghash_v8_4x: vld1.64 {$Xl},[$Xi] @ load [rotated] Xi vld1.64 {$H-$H2},[$Htbl],#48 @ load twisted H, ..., H^2 vmov.i8 $xC2,#0xe1 vld1.64 {$H3-$H4},[$Htbl] @ load twisted H^3, ..., H^4 vshl.u64 $xC2,$xC2,#57 @ compose 0xc2.0 constant vld1.64 {$I0-$j3},[$inp],#64 #ifndef __ARMEB__ vrev64.8 $Xl,$Xl vrev64.8 $j1,$j1 vrev64.8 $j2,$j2 vrev64.8 $j3,$j3 vrev64.8 $I0,$I0 #endif vext.8 $I3,$j3,$j3,#8 vext.8 $I2,$j2,$j2,#8 vext.8 $I1,$j1,$j1,#8 vpmull.p64 $Yl,$H,$I3 @ H·Ii+3 veor $j3,$j3,$I3 vpmull2.p64 $Yh,$H,$I3 vpmull.p64 $Ym,$Hhl,$j3 vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2 veor $j2,$j2,$I2 vpmull2.p64 $I2,$H2,$I2 vpmull2.p64 $j2,$Hhl,$j2 veor $Yl,$Yl,$t0 veor $Yh,$Yh,$I2 veor $Ym,$Ym,$j2 vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1 veor $j1,$j1,$I1 vpmull2.p64 $I1,$H3,$I1 vpmull.p64 $j1,$H34,$j1 veor $Yl,$Yl,$j3 veor $Yh,$Yh,$I1 veor $Ym,$Ym,$j1 subs $len,$len,#128 b.lo .Ltail4x b .Loop4x .align 4 .Loop4x: veor $t0,$I0,$Xl vld1.64 {$I0-$j3},[$inp],#64 vext.8 $IN,$t0,$t0,#8 #ifndef __ARMEB__ vrev64.8 $j1,$j1 vrev64.8 $j2,$j2 vrev64.8 $j3,$j3 vrev64.8 $I0,$I0 #endif vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii) veor $t0,$t0,$IN vpmull2.p64 $Xh,$H4,$IN vext.8 $I3,$j3,$j3,#8 vpmull2.p64 $Xm,$H34,$t0 veor $Xl,$Xl,$Yl veor $Xh,$Xh,$Yh vext.8 $I2,$j2,$j2,#8 veor $Xm,$Xm,$Ym vext.8 $I1,$j1,$j1,#8 vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing veor $t2,$Xl,$Xh vpmull.p64 $Yl,$H,$I3 @ H·Ii+3 veor $j3,$j3,$I3 veor $Xm,$Xm,$t1 vpmull2.p64 $Yh,$H,$I3 veor $Xm,$Xm,$t2 vpmull.p64 $Ym,$Hhl,$j3 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl vpmull.p64 $t0,$H2,$I2 @ H^2·Ii+2 veor $j2,$j2,$I2 vpmull2.p64 $I2,$H2,$I2 veor $Xl,$Xm,$t2 vpmull2.p64 $j2,$Hhl,$j2 veor $Yl,$Yl,$t0 veor $Yh,$Yh,$I2 veor $Ym,$Ym,$j2 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction vpmull.p64 $Xl,$Xl,$xC2 vpmull.p64 $j3,$H3,$I1 @ H^3·Ii+1 veor $j1,$j1,$I1 veor $t2,$t2,$Xh vpmull2.p64 $I1,$H3,$I1 vpmull.p64 $j1,$H34,$j1 veor $Xl,$Xl,$t2 veor $Yl,$Yl,$j3 veor $Yh,$Yh,$I1 vext.8 $Xl,$Xl,$Xl,#8 veor $Ym,$Ym,$j1 subs $len,$len,#64 b.hs .Loop4x .Ltail4x: veor $t0,$I0,$Xl vext.8 $IN,$t0,$t0,#8 vpmull.p64 $Xl,$H4,$IN @ H^4·(Xi+Ii) veor $t0,$t0,$IN vpmull2.p64 $Xh,$H4,$IN vpmull2.p64 $Xm,$H34,$t0 veor $Xl,$Xl,$Yl veor $Xh,$Xh,$Yh veor $Xm,$Xm,$Ym adds $len,$len,#64 b.eq .Ldone4x cmp $len,#32 b.lo .Lone b.eq .Ltwo .Lthree: vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing veor $t2,$Xl,$Xh veor $Xm,$Xm,$t1 vld1.64 
{$I0-$j2},[$inp] veor $Xm,$Xm,$t2 #ifndef __ARMEB__ vrev64.8 $j1,$j1 vrev64.8 $j2,$j2 vrev64.8 $I0,$I0 #endif vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl vext.8 $I2,$j2,$j2,#8 vext.8 $I1,$j1,$j1,#8 veor $Xl,$Xm,$t2 vpmull.p64 $Yl,$H,$I2 @ H·Ii+2 veor $j2,$j2,$I2 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction vpmull.p64 $Xl,$Xl,$xC2 veor $t2,$t2,$Xh vpmull2.p64 $Yh,$H,$I2 vpmull.p64 $Ym,$Hhl,$j2 veor $Xl,$Xl,$t2 vpmull.p64 $j3,$H2,$I1 @ H^2·Ii+1 veor $j1,$j1,$I1 vext.8 $Xl,$Xl,$Xl,#8 vpmull2.p64 $I1,$H2,$I1 veor $t0,$I0,$Xl vpmull2.p64 $j1,$Hhl,$j1 vext.8 $IN,$t0,$t0,#8 veor $Yl,$Yl,$j3 veor $Yh,$Yh,$I1 veor $Ym,$Ym,$j1 vpmull.p64 $Xl,$H3,$IN @ H^3·(Xi+Ii) veor $t0,$t0,$IN vpmull2.p64 $Xh,$H3,$IN vpmull.p64 $Xm,$H34,$t0 veor $Xl,$Xl,$Yl veor $Xh,$Xh,$Yh veor $Xm,$Xm,$Ym b .Ldone4x .align 4 .Ltwo: vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing veor $t2,$Xl,$Xh veor $Xm,$Xm,$t1 vld1.64 {$I0-$j1},[$inp] veor $Xm,$Xm,$t2 #ifndef __ARMEB__ vrev64.8 $j1,$j1 vrev64.8 $I0,$I0 #endif vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl vext.8 $I1,$j1,$j1,#8 veor $Xl,$Xm,$t2 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction vpmull.p64 $Xl,$Xl,$xC2 veor $t2,$t2,$Xh veor $Xl,$Xl,$t2 vext.8 $Xl,$Xl,$Xl,#8 vpmull.p64 $Yl,$H,$I1 @ H·Ii+1 veor $j1,$j1,$I1 veor $t0,$I0,$Xl vext.8 $IN,$t0,$t0,#8 vpmull2.p64 $Yh,$H,$I1 vpmull.p64 $Ym,$Hhl,$j1 vpmull.p64 $Xl,$H2,$IN @ H^2·(Xi+Ii) veor $t0,$t0,$IN vpmull2.p64 $Xh,$H2,$IN vpmull2.p64 $Xm,$Hhl,$t0 veor $Xl,$Xl,$Yl veor $Xh,$Xh,$Yh veor $Xm,$Xm,$Ym b .Ldone4x .align 4 .Lone: vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing veor $t2,$Xl,$Xh veor $Xm,$Xm,$t1 vld1.64 {$I0},[$inp] veor $Xm,$Xm,$t2 #ifndef __ARMEB__ vrev64.8 $I0,$I0 #endif vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl veor $Xl,$Xm,$t2 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction vpmull.p64 $Xl,$Xl,$xC2 veor $t2,$t2,$Xh veor $Xl,$Xl,$t2 vext.8 $Xl,$Xl,$Xl,#8 veor $t0,$I0,$Xl vext.8 $IN,$t0,$t0,#8 vpmull.p64 $Xl,$H,$IN veor $t0,$t0,$IN vpmull2.p64 $Xh,$H,$IN vpmull.p64 $Xm,$Hhl,$t0 .Ldone4x: vext.8 $t1,$Xl,$Xh,#8 @ Karatsuba post-processing veor $t2,$Xl,$Xh veor $Xm,$Xm,$t1 veor $Xm,$Xm,$t2 vpmull.p64 $t2,$Xl,$xC2 @ 1st phase of reduction vmov $Xh#lo,$Xm#hi @ Xh|Xm - 256-bit result vmov $Xm#hi,$Xl#lo @ Xm is rotated Xl veor $Xl,$Xm,$t2 vext.8 $t2,$Xl,$Xl,#8 @ 2nd phase of reduction vpmull.p64 $Xl,$Xl,$xC2 veor $t2,$t2,$Xh veor $Xl,$Xl,$t2 vext.8 $Xl,$Xl,$Xl,#8 #ifndef __ARMEB__ vrev64.8 $Xl,$Xl #endif vst1.64 {$Xl},[$Xi] @ write out Xi ret .size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x ___ } } $code.=<<___; .asciz "GHASH for ARMv8, CRYPTOGAMS by " .align 2 #endif ___ if ($flavour =~ /64/) { ######## 64-bit code sub unvmov { my $arg=shift; $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o && sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1, $3<8?$3:$3+8,($4 eq "lo")?0:1; } foreach(split("\n",$code)) { s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or s/vmov\.i8/movi/o or # fix up legacy mnemonics s/vmov\s+(.*)/unvmov($1)/geo or s/vext\.8/ext/o or s/vshr\.s/sshr\.s/o or s/vshr/ushr/o or s/^(\s+)v/$1/o or # strip off v prefix s/\bbx\s+lr\b/ret/o; s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers s/@\s/\/\//o; # old->new style commentary # fix up remaining legacy suffixes s/\.[ui]?8(\s)/$1/o; s/\.[uis]?32//o and 
s/\.16b/\.4s/go; m/\.p64/o and s/\.16b/\.1q/o; # 1st pmull argument m/l\.p64/o and s/\.16b/\.1d/go; # 2nd and 3rd pmull arguments s/\.[uisp]?64//o and s/\.16b/\.2d/go; s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; # Switch preprocessor checks to aarch64 versions. s/__ARME([BL])__/__AARCH64E$1__/go; print $_,"\n"; } } else { ######## 32-bit code sub unvdup32 { my $arg=shift; $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; } sub unvpmullp64 { my ($mnemonic,$arg)=@_; if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) { my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19) |(($2&7)<<17)|(($2&8)<<4) |(($3&7)<<1) |(($3&8)<<2); $word |= 0x00010001 if ($mnemonic =~ "2"); # since ARMv7 instructions are always encoded little-endian. # correct solution is to use .inst directive, but older # assemblers don't implement it:-( sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", $word&0xff,($word>>8)&0xff, ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; } } foreach(split("\n",$code)) { s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers s/\/\/\s?/@ /o; # new->old style commentary # fix up remaining new-style suffixes s/\],#[0-9]+/]!/o; s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or s/vdup\.32\s+(.*)/unvdup32($1)/geo or s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or s/^(\s+)b\./$1b/o or s/^(\s+)ret/$1bx\tlr/o; if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) { print " it $2\n"; } print $_,"\n"; } } close STDOUT or die "error closing STDOUT: $!"; # enforce flush diff --git a/crypto/openssl/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/openssl/crypto/poly1305/asm/poly1305-armv8.pl index dc39f4053fe6..ca1be8d72d3b 100755 --- a/crypto/openssl/crypto/poly1305/asm/poly1305-armv8.pl +++ b/crypto/openssl/crypto/poly1305/asm/poly1305-armv8.pl @@ -1,947 +1,960 @@ #! /usr/bin/env perl # Copyright 2016-2023 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # This module implements Poly1305 hash for ARMv8. # # June 2015 # # Numbers are cycles per processed byte with poly1305_blocks alone. # # IALU/gcc-4.9 NEON # # Apple A7 1.86/+5% 0.72 # Cortex-A53 2.69/+58% 1.47 # Cortex-A57 2.70/+7% 1.14 # Denver 1.64/+50% 1.18(*) # X-Gene 2.13/+68% 2.27 # Mongoose 1.77/+75% 1.12 # Kryo 2.70/+55% 1.13 # ThunderX2 1.17/+95% 1.36 # # (*) estimate based on resources availability is less than 1.0, # i.e. measured result is worse than expected, presumably binary # translator is not almighty; # $output is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!"; *STDOUT=*OUT; my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); my ($mac,$nonce)=($inp,$len); my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); $code.=<<___; #include "arm_arch.h" .text // forward "declarations" are required for Apple .extern OPENSSL_armcap_P .hidden OPENSSL_armcap_P .globl poly1305_init .hidden poly1305_init .globl poly1305_blocks .hidden poly1305_blocks .globl poly1305_emit .hidden poly1305_emit .type poly1305_init,%function .align 5 poly1305_init: + AARCH64_VALID_CALL_TARGET cmp $inp,xzr stp xzr,xzr,[$ctx] // zero hash value stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] csel x0,xzr,x0,eq b.eq .Lno_key adrp x17,OPENSSL_armcap_P ldr w17,[x17,#:lo12:OPENSSL_armcap_P] ldp $r0,$r1,[$inp] // load key mov $s1,#0xfffffffc0fffffff movk $s1,#0x0fff,lsl#48 #ifdef __AARCH64EB__ rev $r0,$r0 // flip bytes rev $r1,$r1 #endif and $r0,$r0,$s1 // &=0ffffffc0fffffff and $s1,$s1,#-4 and $r1,$r1,$s1 // &=0ffffffc0ffffffc stp $r0,$r1,[$ctx,#32] // save key value tst w17,#ARMV7_NEON adr $d0,.Lpoly1305_blocks adr $r0,.Lpoly1305_blocks_neon adr $d1,.Lpoly1305_emit adr $r1,.Lpoly1305_emit_neon csel $d0,$d0,$r0,eq csel $d1,$d1,$r1,eq #ifdef __ILP32__ stp w12,w13,[$len] #else stp $d0,$d1,[$len] #endif mov x0,#1 .Lno_key: ret .size poly1305_init,.-poly1305_init .type poly1305_blocks,%function .align 5 poly1305_blocks: .Lpoly1305_blocks: + // The symbol .Lpoly1305_blocks is not a .globl symbol + // but a pointer to it is returned by poly1305_init + AARCH64_VALID_CALL_TARGET ands $len,$len,#-16 b.eq .Lno_data ldp $h0,$h1,[$ctx] // load hash value ldp $r0,$r1,[$ctx,#32] // load key value ldr $h2,[$ctx,#16] add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) b .Loop .align 5 .Loop: ldp $t0,$t1,[$inp],#16 // load input sub $len,$len,#16 #ifdef __AARCH64EB__ rev $t0,$t0 rev $t1,$t1 #endif adds $h0,$h0,$t0 // accumulate input adcs $h1,$h1,$t1 mul $d0,$h0,$r0 // h0*r0 adc $h2,$h2,$padbit umulh $d1,$h0,$r0 mul $t0,$h1,$s1 // h1*5*r1 umulh $t1,$h1,$s1 adds $d0,$d0,$t0 mul $t0,$h0,$r1 // h0*r1 adc $d1,$d1,$t1 umulh $d2,$h0,$r1 adds $d1,$d1,$t0 mul $t0,$h1,$r0 // h1*r0 adc $d2,$d2,xzr umulh $t1,$h1,$r0 adds $d1,$d1,$t0 mul $t0,$h2,$s1 // h2*5*r1 adc $d2,$d2,$t1 mul $t1,$h2,$r0 // h2*r0 adds $d1,$d1,$t0 adc $d2,$d2,$t1 and $t0,$d2,#-4 // final reduction and $h2,$d2,#3 add $t0,$t0,$d2,lsr#2 adds $h0,$d0,$t0 adcs $h1,$d1,xzr adc $h2,$h2,xzr cbnz $len,.Loop stp $h0,$h1,[$ctx] // store hash value str $h2,[$ctx,#16] .Lno_data: ret .size poly1305_blocks,.-poly1305_blocks .type poly1305_emit,%function .align 5 poly1305_emit: .Lpoly1305_emit: + // The symbol .poly1305_emit is not a .globl symbol + // but a pointer to it is returned by poly1305_init + AARCH64_VALID_CALL_TARGET ldp $h0,$h1,[$ctx] // load hash base 2^64 ldr $h2,[$ctx,#16] ldp $t0,$t1,[$nonce] // load nonce adds $d0,$h0,#5 // compare to modulus adcs $d1,$h1,xzr adc $d2,$h2,xzr tst $d2,#-4 // see if it's carried/borrowed csel $h0,$h0,$d0,eq csel $h1,$h1,$d1,eq #ifdef __AARCH64EB__ ror $t0,$t0,#32 // flip nonce words ror $t1,$t1,#32 #endif adds $h0,$h0,$t0 // accumulate nonce adc $h1,$h1,$t1 #ifdef __AARCH64EB__ rev $h0,$h0 // flip output bytes rev $h1,$h1 #endif stp $h0,$h1,[$mac] // write result ret .size poly1305_emit,.-poly1305_emit ___ my 
($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); my ($T0,$T1,$MASK) = map("v$_",(29..31)); my ($in2,$zeros)=("x16","x17"); my $is_base2_26 = $zeros; # borrow $code.=<<___; .type poly1305_mult,%function .align 5 poly1305_mult: mul $d0,$h0,$r0 // h0*r0 umulh $d1,$h0,$r0 mul $t0,$h1,$s1 // h1*5*r1 umulh $t1,$h1,$s1 adds $d0,$d0,$t0 mul $t0,$h0,$r1 // h0*r1 adc $d1,$d1,$t1 umulh $d2,$h0,$r1 adds $d1,$d1,$t0 mul $t0,$h1,$r0 // h1*r0 adc $d2,$d2,xzr umulh $t1,$h1,$r0 adds $d1,$d1,$t0 mul $t0,$h2,$s1 // h2*5*r1 adc $d2,$d2,$t1 mul $t1,$h2,$r0 // h2*r0 adds $d1,$d1,$t0 adc $d2,$d2,$t1 and $t0,$d2,#-4 // final reduction and $h2,$d2,#3 add $t0,$t0,$d2,lsr#2 adds $h0,$d0,$t0 adcs $h1,$d1,xzr adc $h2,$h2,xzr ret .size poly1305_mult,.-poly1305_mult .type poly1305_splat,%function .align 5 poly1305_splat: and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 ubfx x13,$h0,#26,#26 extr x14,$h1,$h0,#52 and x14,x14,#0x03ffffff ubfx x15,$h1,#14,#26 extr x16,$h2,$h1,#40 str w12,[$ctx,#16*0] // r0 add w12,w13,w13,lsl#2 // r1*5 str w13,[$ctx,#16*1] // r1 add w13,w14,w14,lsl#2 // r2*5 str w12,[$ctx,#16*2] // s1 str w14,[$ctx,#16*3] // r2 add w14,w15,w15,lsl#2 // r3*5 str w13,[$ctx,#16*4] // s2 str w15,[$ctx,#16*5] // r3 add w15,w16,w16,lsl#2 // r4*5 str w14,[$ctx,#16*6] // s3 str w16,[$ctx,#16*7] // r4 str w15,[$ctx,#16*8] // s4 ret .size poly1305_splat,.-poly1305_splat .type poly1305_blocks_neon,%function .align 5 poly1305_blocks_neon: .Lpoly1305_blocks_neon: + // The symbol .Lpoly1305_blocks_neon is not a .globl symbol + // but a pointer to it is returned by poly1305_init + AARCH64_VALID_CALL_TARGET ldr $is_base2_26,[$ctx,#24] cmp $len,#128 b.hs .Lblocks_neon cbz $is_base2_26,.Lpoly1305_blocks .Lblocks_neon: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-80]! add x29,sp,#0 ands $len,$len,#-16 b.eq .Lno_data_neon cbz $is_base2_26,.Lbase2_64_neon ldp w10,w11,[$ctx] // load hash value base 2^26 ldp w12,w13,[$ctx,#8] ldr w14,[$ctx,#16] tst $len,#31 b.eq .Leven_neon ldp $r0,$r1,[$ctx,#32] // load key value add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 lsr $h1,x12,#12 adds $h0,$h0,x12,lsl#52 add $h1,$h1,x13,lsl#14 adc $h1,$h1,xzr lsr $h2,x14,#24 adds $h1,$h1,x14,lsl#40 adc $d2,$h2,xzr // can be partially reduced... ldp $d0,$d1,[$inp],#16 // load input sub $len,$len,#16 add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) and $t0,$d2,#-4 // ... 
so reduce and $h2,$d2,#3 add $t0,$t0,$d2,lsr#2 adds $h0,$h0,$t0 adcs $h1,$h1,xzr adc $h2,$h2,xzr #ifdef __AARCH64EB__ rev $d0,$d0 rev $d1,$d1 #endif adds $h0,$h0,$d0 // accumulate input adcs $h1,$h1,$d1 adc $h2,$h2,$padbit bl poly1305_mult ldr x30,[sp,#8] cbz $padbit,.Lstore_base2_64_neon and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 ubfx x11,$h0,#26,#26 extr x12,$h1,$h0,#52 and x12,x12,#0x03ffffff ubfx x13,$h1,#14,#26 extr x14,$h2,$h1,#40 cbnz $len,.Leven_neon stp w10,w11,[$ctx] // store hash value base 2^26 stp w12,w13,[$ctx,#8] str w14,[$ctx,#16] b .Lno_data_neon .align 4 .Lstore_base2_64_neon: stp $h0,$h1,[$ctx] // store hash value base 2^64 stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed b .Lno_data_neon .align 4 .Lbase2_64_neon: ldp $r0,$r1,[$ctx,#32] // load key value ldp $h0,$h1,[$ctx] // load hash value base 2^64 ldr $h2,[$ctx,#16] tst $len,#31 b.eq .Linit_neon ldp $d0,$d1,[$inp],#16 // load input sub $len,$len,#16 add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) #ifdef __AARCH64EB__ rev $d0,$d0 rev $d1,$d1 #endif adds $h0,$h0,$d0 // accumulate input adcs $h1,$h1,$d1 adc $h2,$h2,$padbit bl poly1305_mult .Linit_neon: and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 ubfx x11,$h0,#26,#26 extr x12,$h1,$h0,#52 and x12,x12,#0x03ffffff ubfx x13,$h1,#14,#26 extr x14,$h2,$h1,#40 stp d8,d9,[sp,#16] // meet ABI requirements stp d10,d11,[sp,#32] stp d12,d13,[sp,#48] stp d14,d15,[sp,#64] fmov ${H0},x10 fmov ${H1},x11 fmov ${H2},x12 fmov ${H3},x13 fmov ${H4},x14 ////////////////////////////////// initialize r^n table mov $h0,$r0 // r^1 add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) mov $h1,$r1 mov $h2,xzr add $ctx,$ctx,#48+12 bl poly1305_splat bl poly1305_mult // r^2 sub $ctx,$ctx,#4 bl poly1305_splat bl poly1305_mult // r^3 sub $ctx,$ctx,#4 bl poly1305_splat bl poly1305_mult // r^4 sub $ctx,$ctx,#4 bl poly1305_splat ldr x30,[sp,#8] add $in2,$inp,#32 adr $zeros,.Lzeros subs $len,$len,#64 csel $in2,$zeros,$in2,lo mov x4,#1 stur x4,[$ctx,#-24] // set is_base2_26 sub $ctx,$ctx,#48 // restore original $ctx b .Ldo_neon .align 4 .Leven_neon: add $in2,$inp,#32 adr $zeros,.Lzeros subs $len,$len,#64 csel $in2,$zeros,$in2,lo stp d8,d9,[sp,#16] // meet ABI requirements stp d10,d11,[sp,#32] stp d12,d13,[sp,#48] stp d14,d15,[sp,#64] fmov ${H0},x10 fmov ${H1},x11 fmov ${H2},x12 fmov ${H3},x13 fmov ${H4},x14 .Ldo_neon: ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) ldp x9,x13,[$in2],#48 lsl $padbit,$padbit,#24 add x15,$ctx,#48 #ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 #endif and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 and x5,x9,#0x03ffffff ubfx x6,x8,#26,#26 ubfx x7,x9,#26,#26 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 extr x8,x12,x8,#52 extr x9,x13,x9,#52 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 fmov $IN23_0,x4 and x8,x8,#0x03ffffff and x9,x9,#0x03ffffff ubfx x10,x12,#14,#26 ubfx x11,x13,#14,#26 add x12,$padbit,x12,lsr#40 add x13,$padbit,x13,lsr#40 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 fmov $IN23_1,x6 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 fmov $IN23_2,x8 fmov $IN23_3,x10 fmov $IN23_4,x12 ldp x8,x12,[$inp],#16 // inp[0:1] ldp x9,x13,[$inp],#48 ld1 {$R0,$R1,$S1,$R2},[x15],#64 ld1 {$S2,$R3,$S3,$R4},[x15],#64 ld1 {$S4},[x15] #ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 #endif and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 and x5,x9,#0x03ffffff ubfx x6,x8,#26,#26 ubfx x7,x9,#26,#26 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 extr x8,x12,x8,#52 extr x9,x13,x9,#52 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 fmov 
$IN01_0,x4 and x8,x8,#0x03ffffff and x9,x9,#0x03ffffff ubfx x10,x12,#14,#26 ubfx x11,x13,#14,#26 add x12,$padbit,x12,lsr#40 add x13,$padbit,x13,lsr#40 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 fmov $IN01_1,x6 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 movi $MASK.2d,#-1 fmov $IN01_2,x8 fmov $IN01_3,x10 fmov $IN01_4,x12 ushr $MASK.2d,$MASK.2d,#38 b.ls .Lskip_loop .align 4 .Loop_neon: //////////////////////////////////////////////////////////////// // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r // \___________________/ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r // \___________________/ \____________________/ // // Note that we start with inp[2:3]*r^2. This is because it // doesn't depend on reduction in previous iteration. //////////////////////////////////////////////////////////////// // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 subs $len,$len,#64 umull $ACC4,$IN23_0,${R4}[2] csel $in2,$zeros,$in2,lo umull $ACC3,$IN23_0,${R3}[2] umull $ACC2,$IN23_0,${R2}[2] ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) umull $ACC1,$IN23_0,${R1}[2] ldp x9,x13,[$in2],#48 umull $ACC0,$IN23_0,${R0}[2] #ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 #endif umlal $ACC4,$IN23_1,${R3}[2] and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 umlal $ACC3,$IN23_1,${R2}[2] and x5,x9,#0x03ffffff umlal $ACC2,$IN23_1,${R1}[2] ubfx x6,x8,#26,#26 umlal $ACC1,$IN23_1,${R0}[2] ubfx x7,x9,#26,#26 umlal $ACC0,$IN23_1,${S4}[2] add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 umlal $ACC4,$IN23_2,${R2}[2] extr x8,x12,x8,#52 umlal $ACC3,$IN23_2,${R1}[2] extr x9,x13,x9,#52 umlal $ACC2,$IN23_2,${R0}[2] add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 umlal $ACC1,$IN23_2,${S4}[2] fmov $IN23_0,x4 umlal $ACC0,$IN23_2,${S3}[2] and x8,x8,#0x03ffffff umlal $ACC4,$IN23_3,${R1}[2] and x9,x9,#0x03ffffff umlal $ACC3,$IN23_3,${R0}[2] ubfx x10,x12,#14,#26 umlal $ACC2,$IN23_3,${S4}[2] ubfx x11,x13,#14,#26 umlal $ACC1,$IN23_3,${S3}[2] add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 umlal $ACC0,$IN23_3,${S2}[2] fmov $IN23_1,x6 add $IN01_2,$IN01_2,$H2 add x12,$padbit,x12,lsr#40 umlal $ACC4,$IN23_4,${R0}[2] add x13,$padbit,x13,lsr#40 umlal $ACC3,$IN23_4,${S4}[2] add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 umlal $ACC2,$IN23_4,${S3}[2] add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 umlal $ACC1,$IN23_4,${S2}[2] fmov $IN23_2,x8 umlal $ACC0,$IN23_4,${S1}[2] fmov $IN23_3,x10 //////////////////////////////////////////////////////////////// // (hash+inp[0:1])*r^4 and accumulate add $IN01_0,$IN01_0,$H0 fmov $IN23_4,x12 umlal $ACC3,$IN01_2,${R1}[0] ldp x8,x12,[$inp],#16 // inp[0:1] umlal $ACC0,$IN01_2,${S3}[0] ldp x9,x13,[$inp],#48 umlal $ACC4,$IN01_2,${R2}[0] umlal $ACC1,$IN01_2,${S4}[0] umlal $ACC2,$IN01_2,${R0}[0] #ifdef __AARCH64EB__ rev x8,x8 rev x12,x12 rev x9,x9 rev x13,x13 #endif add $IN01_1,$IN01_1,$H1 umlal $ACC3,$IN01_0,${R3}[0] umlal $ACC4,$IN01_0,${R4}[0] and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 umlal $ACC2,$IN01_0,${R2}[0] and x5,x9,#0x03ffffff umlal $ACC0,$IN01_0,${R0}[0] ubfx x6,x8,#26,#26 umlal $ACC1,$IN01_0,${R1}[0] ubfx x7,x9,#26,#26 add $IN01_3,$IN01_3,$H3 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 umlal $ACC3,$IN01_1,${R2}[0] extr x8,x12,x8,#52 umlal 
$ACC4,$IN01_1,${R3}[0] extr x9,x13,x9,#52 umlal $ACC0,$IN01_1,${S4}[0] add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 umlal $ACC2,$IN01_1,${R1}[0] fmov $IN01_0,x4 umlal $ACC1,$IN01_1,${R0}[0] and x8,x8,#0x03ffffff add $IN01_4,$IN01_4,$H4 and x9,x9,#0x03ffffff umlal $ACC3,$IN01_3,${R0}[0] ubfx x10,x12,#14,#26 umlal $ACC0,$IN01_3,${S2}[0] ubfx x11,x13,#14,#26 umlal $ACC4,$IN01_3,${R1}[0] add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 umlal $ACC1,$IN01_3,${S3}[0] fmov $IN01_1,x6 umlal $ACC2,$IN01_3,${S4}[0] add x12,$padbit,x12,lsr#40 umlal $ACC3,$IN01_4,${S4}[0] add x13,$padbit,x13,lsr#40 umlal $ACC0,$IN01_4,${S1}[0] add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 umlal $ACC4,$IN01_4,${R0}[0] add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 umlal $ACC1,$IN01_4,${S2}[0] fmov $IN01_2,x8 umlal $ACC2,$IN01_4,${S3}[0] fmov $IN01_3,x10 fmov $IN01_4,x12 ///////////////////////////////////////////////////////////////// // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein // and P. Schwabe // // [see discussion in poly1305-armv4 module] ushr $T0.2d,$ACC3,#26 xtn $H3,$ACC3 ushr $T1.2d,$ACC0,#26 and $ACC0,$ACC0,$MASK.2d add $ACC4,$ACC4,$T0.2d // h3 -> h4 bic $H3,#0xfc,lsl#24 // &=0x03ffffff add $ACC1,$ACC1,$T1.2d // h0 -> h1 ushr $T0.2d,$ACC4,#26 xtn $H4,$ACC4 ushr $T1.2d,$ACC1,#26 xtn $H1,$ACC1 bic $H4,#0xfc,lsl#24 add $ACC2,$ACC2,$T1.2d // h1 -> h2 add $ACC0,$ACC0,$T0.2d shl $T0.2d,$T0.2d,#2 shrn $T1.2s,$ACC2,#26 xtn $H2,$ACC2 add $ACC0,$ACC0,$T0.2d // h4 -> h0 bic $H1,#0xfc,lsl#24 add $H3,$H3,$T1.2s // h2 -> h3 bic $H2,#0xfc,lsl#24 shrn $T0.2s,$ACC0,#26 xtn $H0,$ACC0 ushr $T1.2s,$H3,#26 bic $H3,#0xfc,lsl#24 bic $H0,#0xfc,lsl#24 add $H1,$H1,$T0.2s // h0 -> h1 add $H4,$H4,$T1.2s // h3 -> h4 b.hi .Loop_neon .Lskip_loop: dup $IN23_2,${IN23_2}[0] add $IN01_2,$IN01_2,$H2 //////////////////////////////////////////////////////////////// // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 adds $len,$len,#32 b.ne .Long_tail dup $IN23_2,${IN01_2}[0] add $IN23_0,$IN01_0,$H0 add $IN23_3,$IN01_3,$H3 add $IN23_1,$IN01_1,$H1 add $IN23_4,$IN01_4,$H4 .Long_tail: dup $IN23_0,${IN23_0}[0] umull2 $ACC0,$IN23_2,${S3} umull2 $ACC3,$IN23_2,${R1} umull2 $ACC4,$IN23_2,${R2} umull2 $ACC2,$IN23_2,${R0} umull2 $ACC1,$IN23_2,${S4} dup $IN23_1,${IN23_1}[0] umlal2 $ACC0,$IN23_0,${R0} umlal2 $ACC2,$IN23_0,${R2} umlal2 $ACC3,$IN23_0,${R3} umlal2 $ACC4,$IN23_0,${R4} umlal2 $ACC1,$IN23_0,${R1} dup $IN23_3,${IN23_3}[0] umlal2 $ACC0,$IN23_1,${S4} umlal2 $ACC3,$IN23_1,${R2} umlal2 $ACC2,$IN23_1,${R1} umlal2 $ACC4,$IN23_1,${R3} umlal2 $ACC1,$IN23_1,${R0} dup $IN23_4,${IN23_4}[0] umlal2 $ACC3,$IN23_3,${R0} umlal2 $ACC4,$IN23_3,${R1} umlal2 $ACC0,$IN23_3,${S2} umlal2 $ACC1,$IN23_3,${S3} umlal2 $ACC2,$IN23_3,${S4} umlal2 $ACC3,$IN23_4,${S4} umlal2 $ACC0,$IN23_4,${S1} umlal2 $ACC4,$IN23_4,${R0} umlal2 $ACC1,$IN23_4,${S2} umlal2 $ACC2,$IN23_4,${S3} b.eq .Lshort_tail //////////////////////////////////////////////////////////////// // (hash+inp[0:1])*r^4:r^3 and accumulate add $IN01_0,$IN01_0,$H0 umlal $ACC3,$IN01_2,${R1} umlal $ACC0,$IN01_2,${S3} umlal $ACC4,$IN01_2,${R2} umlal $ACC1,$IN01_2,${S4} umlal $ACC2,$IN01_2,${R0} add $IN01_1,$IN01_1,$H1 umlal $ACC3,$IN01_0,${R3} umlal $ACC0,$IN01_0,${R0} umlal $ACC4,$IN01_0,${R4} umlal $ACC1,$IN01_0,${R1} umlal $ACC2,$IN01_0,${R2} add $IN01_3,$IN01_3,$H3 umlal $ACC3,$IN01_1,${R2} umlal $ACC0,$IN01_1,${S4} umlal $ACC4,$IN01_1,${R3} umlal $ACC1,$IN01_1,${R0} umlal $ACC2,$IN01_1,${R1} add $IN01_4,$IN01_4,$H4 umlal $ACC3,$IN01_3,${R0} umlal $ACC0,$IN01_3,${S2} umlal $ACC4,$IN01_3,${R1} umlal $ACC1,$IN01_3,${S3} 
umlal $ACC2,$IN01_3,${S4} umlal $ACC3,$IN01_4,${S4} umlal $ACC0,$IN01_4,${S1} umlal $ACC4,$IN01_4,${R0} umlal $ACC1,$IN01_4,${S2} umlal $ACC2,$IN01_4,${S3} .Lshort_tail: //////////////////////////////////////////////////////////////// // horizontal add addp $ACC3,$ACC3,$ACC3 ldp d8,d9,[sp,#16] // meet ABI requirements addp $ACC0,$ACC0,$ACC0 ldp d10,d11,[sp,#32] addp $ACC4,$ACC4,$ACC4 ldp d12,d13,[sp,#48] addp $ACC1,$ACC1,$ACC1 ldp d14,d15,[sp,#64] addp $ACC2,$ACC2,$ACC2 //////////////////////////////////////////////////////////////// // lazy reduction, but without narrowing ushr $T0.2d,$ACC3,#26 and $ACC3,$ACC3,$MASK.2d ushr $T1.2d,$ACC0,#26 and $ACC0,$ACC0,$MASK.2d add $ACC4,$ACC4,$T0.2d // h3 -> h4 add $ACC1,$ACC1,$T1.2d // h0 -> h1 ushr $T0.2d,$ACC4,#26 and $ACC4,$ACC4,$MASK.2d ushr $T1.2d,$ACC1,#26 and $ACC1,$ACC1,$MASK.2d add $ACC2,$ACC2,$T1.2d // h1 -> h2 add $ACC0,$ACC0,$T0.2d shl $T0.2d,$T0.2d,#2 ushr $T1.2d,$ACC2,#26 and $ACC2,$ACC2,$MASK.2d add $ACC0,$ACC0,$T0.2d // h4 -> h0 add $ACC3,$ACC3,$T1.2d // h2 -> h3 ushr $T0.2d,$ACC0,#26 and $ACC0,$ACC0,$MASK.2d ushr $T1.2d,$ACC3,#26 and $ACC3,$ACC3,$MASK.2d add $ACC1,$ACC1,$T0.2d // h0 -> h1 add $ACC4,$ACC4,$T1.2d // h3 -> h4 //////////////////////////////////////////////////////////////// // write the result, can be partially reduced st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 st1 {$ACC4}[0],[$ctx] .Lno_data_neon: ldr x29,[sp],#80 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size poly1305_blocks_neon,.-poly1305_blocks_neon .type poly1305_emit_neon,%function .align 5 poly1305_emit_neon: .Lpoly1305_emit_neon: + // The symbol .Lpoly1305_emit_neon is not a .globl symbol + // but a pointer to it is returned by poly1305_init + AARCH64_VALID_CALL_TARGET ldr $is_base2_26,[$ctx,#24] cbz $is_base2_26,poly1305_emit ldp w10,w11,[$ctx] // load hash value base 2^26 ldp w12,w13,[$ctx,#8] ldr w14,[$ctx,#16] add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 lsr $h1,x12,#12 adds $h0,$h0,x12,lsl#52 add $h1,$h1,x13,lsl#14 adc $h1,$h1,xzr lsr $h2,x14,#24 adds $h1,$h1,x14,lsl#40 adc $h2,$h2,xzr // can be partially reduced... ldp $t0,$t1,[$nonce] // load nonce and $d0,$h2,#-4 // ... 
so reduce add $d0,$d0,$h2,lsr#2 and $h2,$h2,#3 adds $h0,$h0,$d0 adcs $h1,$h1,xzr adc $h2,$h2,xzr adds $d0,$h0,#5 // compare to modulus adcs $d1,$h1,xzr adc $d2,$h2,xzr tst $d2,#-4 // see if it's carried/borrowed csel $h0,$h0,$d0,eq csel $h1,$h1,$d1,eq #ifdef __AARCH64EB__ ror $t0,$t0,#32 // flip nonce words ror $t1,$t1,#32 #endif adds $h0,$h0,$t0 // accumulate nonce adc $h1,$h1,$t1 #ifdef __AARCH64EB__ rev $h0,$h0 // flip output bytes rev $h1,$h1 #endif stp $h0,$h1,[$mac] // write result ret .size poly1305_emit_neon,.-poly1305_emit_neon .align 5 .Lzeros: .long 0,0,0,0,0,0,0,0 .asciz "Poly1305 for ARMv8, CRYPTOGAMS by " .align 2 ___ foreach (split("\n",$code)) { s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); s/\.[124]([sd])\[/.$1\[/; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/openssl/crypto/sha/asm/keccak1600-armv8.pl b/crypto/openssl/crypto/sha/asm/keccak1600-armv8.pl index 65102e7c292f..cf54b62c636a 100755 --- a/crypto/openssl/crypto/sha/asm/keccak1600-armv8.pl +++ b/crypto/openssl/crypto/sha/asm/keccak1600-armv8.pl @@ -1,876 +1,878 @@ #!/usr/bin/env perl # Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # Keccak-1600 for ARMv8. # # June 2017. # # This is straightforward KECCAK_1X_ALT implementation. It makes no # sense to attempt SIMD/NEON implementation for following reason. # 64-bit lanes of vector registers can't be addressed as easily as in # 32-bit mode. This means that 64-bit NEON is bound to be slower than # 32-bit NEON, and this implementation is faster than 32-bit NEON on # same processor. Even though it takes more scalar xor's and andn's, # it gets compensated by availability of rotate. Not to forget that # most processors achieve higher issue rate with scalar instructions. # # February 2018. # # Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT # variant with register permutation/rotation twist that allows to # eliminate copies to temporary registers. If you look closely you'll # notice that it uses only one lane of vector registers. The new # instructions effectively facilitate parallel hashing, which we don't # support [yet?]. But lowest-level core procedure is prepared for it. # The inner round is 67 [vector] instructions, so it's not actually # obvious that it will provide performance improvement [in serial # hash] as long as vector instructions issue rate is limited to 1 per # cycle... # ###################################################################### # Numbers are cycles per processed byte. 
# # r=1088(*) # # Cortex-A53 13 # Cortex-A57 12 # X-Gene 14 # Mongoose 10 # Kryo 12 # Denver 7.8 # Apple A7 7.2 # ThunderX2 9.7 # # (*) Corresponds to SHA3-256. No improvement coefficients are listed # because they vary too much from compiler to compiler. Newer # compiler does much better and improvement varies from 5% on # Cortex-A57 to 25% on Cortex-A53. While in comparison to older # compiler this code is at least 2x faster... # $output is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!"; *STDOUT=*OUT; my @rhotates = ([ 0, 1, 62, 28, 27 ], [ 36, 44, 6, 55, 20 ], [ 3, 10, 43, 25, 39 ], [ 41, 45, 15, 21, 8 ], [ 18, 2, 61, 56, 14 ]); $code.=<<___; +#include "arm_arch.h" + .text .align 8 // strategic alignment and padding that allows to use // address value as loop termination condition... .quad 0,0,0,0,0,0,0,0 .type iotas,%object iotas: .quad 0x0000000000000001 .quad 0x0000000000008082 .quad 0x800000000000808a .quad 0x8000000080008000 .quad 0x000000000000808b .quad 0x0000000080000001 .quad 0x8000000080008081 .quad 0x8000000000008009 .quad 0x000000000000008a .quad 0x0000000000000088 .quad 0x0000000080008009 .quad 0x000000008000000a .quad 0x000000008000808b .quad 0x800000000000008b .quad 0x8000000000008089 .quad 0x8000000000008003 .quad 0x8000000000008002 .quad 0x8000000000000080 .quad 0x000000000000800a .quad 0x800000008000000a .quad 0x8000000080008081 .quad 0x8000000000008080 .quad 0x0000000080000001 .quad 0x8000000080008008 .size iotas,.-iotas ___ {{{ my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ], (0, 5, 10, 15, 20)); $A[3][3] = "x25"; # x18 is reserved my @C = map("x$_", (26,27,28,30)); $code.=<<___; .type KeccakF1600_int,%function .align 5 KeccakF1600_int: adr $C[2],iotas - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp $C[2],x30,[sp,#16] // 32 bytes on top are mine b .Loop .align 4 .Loop: ////////////////////////////////////////// Theta eor $C[0],$A[0][0],$A[1][0] stp $A[0][4],$A[1][4],[sp,#0] // offload pair... 
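// (Theta, for reference: C[x] = A[0][x]^A[1][x]^A[2][x]^A[3][x]^A[4][x], and every
// lane in column x is then XORed with D[x] = C[x-1] ^ ROL64(C[x+1],1); the
// eor ...,ror#63 instructions further down compute exactly those D[x] values.)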
eor $C[1],$A[0][1],$A[1][1] eor $C[2],$A[0][2],$A[1][2] eor $C[3],$A[0][3],$A[1][3] ___ $C[4]=$A[0][4]; $C[5]=$A[1][4]; $code.=<<___; eor $C[4],$A[0][4],$A[1][4] eor $C[0],$C[0],$A[2][0] eor $C[1],$C[1],$A[2][1] eor $C[2],$C[2],$A[2][2] eor $C[3],$C[3],$A[2][3] eor $C[4],$C[4],$A[2][4] eor $C[0],$C[0],$A[3][0] eor $C[1],$C[1],$A[3][1] eor $C[2],$C[2],$A[3][2] eor $C[3],$C[3],$A[3][3] eor $C[4],$C[4],$A[3][4] eor $C[0],$C[0],$A[4][0] eor $C[2],$C[2],$A[4][2] eor $C[1],$C[1],$A[4][1] eor $C[3],$C[3],$A[4][3] eor $C[4],$C[4],$A[4][4] eor $C[5],$C[0],$C[2],ror#63 eor $A[0][1],$A[0][1],$C[5] eor $A[1][1],$A[1][1],$C[5] eor $A[2][1],$A[2][1],$C[5] eor $A[3][1],$A[3][1],$C[5] eor $A[4][1],$A[4][1],$C[5] eor $C[5],$C[1],$C[3],ror#63 eor $C[2],$C[2],$C[4],ror#63 eor $C[3],$C[3],$C[0],ror#63 eor $C[4],$C[4],$C[1],ror#63 eor $C[1], $A[0][2],$C[5] // mov $C[1],$A[0][2] eor $A[1][2],$A[1][2],$C[5] eor $A[2][2],$A[2][2],$C[5] eor $A[3][2],$A[3][2],$C[5] eor $A[4][2],$A[4][2],$C[5] eor $A[0][0],$A[0][0],$C[4] eor $A[1][0],$A[1][0],$C[4] eor $A[2][0],$A[2][0],$C[4] eor $A[3][0],$A[3][0],$C[4] eor $A[4][0],$A[4][0],$C[4] ___ $C[4]=undef; $C[5]=undef; $code.=<<___; ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3] eor $A[1][3],$A[1][3],$C[2] eor $A[2][3],$A[2][3],$C[2] eor $A[3][3],$A[3][3],$C[2] eor $A[4][3],$A[4][3],$C[2] eor $C[2], $A[0][4],$C[3] // mov $C[2],$A[0][4] eor $A[1][4],$A[1][4],$C[3] eor $A[2][4],$A[2][4],$C[3] eor $A[3][4],$A[3][4],$C[3] eor $A[4][4],$A[4][4],$C[3] ////////////////////////////////////////// Rho+Pi mov $C[3],$A[0][1] ror $A[0][1],$A[1][1],#64-$rhotates[1][1] //mov $C[1],$A[0][2] ror $A[0][2],$A[2][2],#64-$rhotates[2][2] //mov $C[0],$A[0][3] ror $A[0][3],$A[3][3],#64-$rhotates[3][3] //mov $C[2],$A[0][4] ror $A[0][4],$A[4][4],#64-$rhotates[4][4] ror $A[1][1],$A[1][4],#64-$rhotates[1][4] ror $A[2][2],$A[2][3],#64-$rhotates[2][3] ror $A[3][3],$A[3][2],#64-$rhotates[3][2] ror $A[4][4],$A[4][1],#64-$rhotates[4][1] ror $A[1][4],$A[4][2],#64-$rhotates[4][2] ror $A[2][3],$A[3][4],#64-$rhotates[3][4] ror $A[3][2],$A[2][1],#64-$rhotates[2][1] ror $A[4][1],$A[1][3],#64-$rhotates[1][3] ror $A[4][2],$A[2][4],#64-$rhotates[2][4] ror $A[3][4],$A[4][3],#64-$rhotates[4][3] ror $A[2][1],$A[1][2],#64-$rhotates[1][2] ror $A[1][3],$A[3][1],#64-$rhotates[3][1] ror $A[2][4],$A[4][0],#64-$rhotates[4][0] ror $A[4][3],$A[3][0],#64-$rhotates[3][0] ror $A[1][2],$A[2][0],#64-$rhotates[2][0] ror $A[3][1],$A[1][0],#64-$rhotates[1][0] ror $A[1][0],$C[0],#64-$rhotates[0][3] ror $A[2][0],$C[3],#64-$rhotates[0][1] ror $A[3][0],$C[2],#64-$rhotates[0][4] ror $A[4][0],$C[1],#64-$rhotates[0][2] ////////////////////////////////////////// Chi+Iota bic $C[0],$A[0][2],$A[0][1] bic $C[1],$A[0][3],$A[0][2] bic $C[2],$A[0][0],$A[0][4] bic $C[3],$A[0][1],$A[0][0] eor $A[0][0],$A[0][0],$C[0] bic $C[0],$A[0][4],$A[0][3] eor $A[0][1],$A[0][1],$C[1] ldr $C[1],[sp,#16] eor $A[0][3],$A[0][3],$C[2] eor $A[0][4],$A[0][4],$C[3] eor $A[0][2],$A[0][2],$C[0] ldr $C[3],[$C[1]],#8 // Iota[i++] bic $C[0],$A[1][2],$A[1][1] tst $C[1],#255 // are we done? 
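// (The iotas pointer doubles as the loop counter: the table starts 64 bytes into
// a 256-byte-aligned block -- the .align 8 plus eight zero quads above it -- so
// after the 24th post-incremented load its low byte wraps to zero and the tst
// above makes the bne .Loop at the bottom fall through.)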
str $C[1],[sp,#16] bic $C[1],$A[1][3],$A[1][2] bic $C[2],$A[1][0],$A[1][4] eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota bic $C[3],$A[1][1],$A[1][0] eor $A[1][0],$A[1][0],$C[0] bic $C[0],$A[1][4],$A[1][3] eor $A[1][1],$A[1][1],$C[1] eor $A[1][3],$A[1][3],$C[2] eor $A[1][4],$A[1][4],$C[3] eor $A[1][2],$A[1][2],$C[0] bic $C[0],$A[2][2],$A[2][1] bic $C[1],$A[2][3],$A[2][2] bic $C[2],$A[2][0],$A[2][4] bic $C[3],$A[2][1],$A[2][0] eor $A[2][0],$A[2][0],$C[0] bic $C[0],$A[2][4],$A[2][3] eor $A[2][1],$A[2][1],$C[1] eor $A[2][3],$A[2][3],$C[2] eor $A[2][4],$A[2][4],$C[3] eor $A[2][2],$A[2][2],$C[0] bic $C[0],$A[3][2],$A[3][1] bic $C[1],$A[3][3],$A[3][2] bic $C[2],$A[3][0],$A[3][4] bic $C[3],$A[3][1],$A[3][0] eor $A[3][0],$A[3][0],$C[0] bic $C[0],$A[3][4],$A[3][3] eor $A[3][1],$A[3][1],$C[1] eor $A[3][3],$A[3][3],$C[2] eor $A[3][4],$A[3][4],$C[3] eor $A[3][2],$A[3][2],$C[0] bic $C[0],$A[4][2],$A[4][1] bic $C[1],$A[4][3],$A[4][2] bic $C[2],$A[4][0],$A[4][4] bic $C[3],$A[4][1],$A[4][0] eor $A[4][0],$A[4][0],$C[0] bic $C[0],$A[4][4],$A[4][3] eor $A[4][1],$A[4][1],$C[1] eor $A[4][3],$A[4][3],$C[2] eor $A[4][4],$A[4][4],$C[3] eor $A[4][2],$A[4][2],$C[0] bne .Loop ldr x30,[sp,#24] - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size KeccakF1600_int,.-KeccakF1600_int .type KeccakF1600,%function .align 5 KeccakF1600: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#48 str x0,[sp,#32] // offload argument mov $C[0],x0 ldp $A[0][0],$A[0][1],[x0,#16*0] ldp $A[0][2],$A[0][3],[$C[0],#16*1] ldp $A[0][4],$A[1][0],[$C[0],#16*2] ldp $A[1][1],$A[1][2],[$C[0],#16*3] ldp $A[1][3],$A[1][4],[$C[0],#16*4] ldp $A[2][0],$A[2][1],[$C[0],#16*5] ldp $A[2][2],$A[2][3],[$C[0],#16*6] ldp $A[2][4],$A[3][0],[$C[0],#16*7] ldp $A[3][1],$A[3][2],[$C[0],#16*8] ldp $A[3][3],$A[3][4],[$C[0],#16*9] ldp $A[4][0],$A[4][1],[$C[0],#16*10] ldp $A[4][2],$A[4][3],[$C[0],#16*11] ldr $A[4][4],[$C[0],#16*12] bl KeccakF1600_int ldr $C[0],[sp,#32] stp $A[0][0],$A[0][1],[$C[0],#16*0] stp $A[0][2],$A[0][3],[$C[0],#16*1] stp $A[0][4],$A[1][0],[$C[0],#16*2] stp $A[1][1],$A[1][2],[$C[0],#16*3] stp $A[1][3],$A[1][4],[$C[0],#16*4] stp $A[2][0],$A[2][1],[$C[0],#16*5] stp $A[2][2],$A[2][3],[$C[0],#16*6] stp $A[2][4],$A[3][0],[$C[0],#16*7] stp $A[3][1],$A[3][2],[$C[0],#16*8] stp $A[3][3],$A[3][4],[$C[0],#16*9] stp $A[4][0],$A[4][1],[$C[0],#16*10] stp $A[4][2],$A[4][3],[$C[0],#16*11] str $A[4][4],[$C[0],#16*12] ldp x19,x20,[x29,#16] add sp,sp,#48 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size KeccakF1600,.-KeccakF1600 .globl SHA3_absorb .type SHA3_absorb,%function .align 5 SHA3_absorb: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#64 stp x0,x1,[sp,#32] // offload arguments stp x2,x3,[sp,#48] mov $C[0],x0 // uint64_t A[5][5] mov $C[1],x1 // const void *inp mov $C[2],x2 // size_t len mov $C[3],x3 // size_t bsz ldp $A[0][0],$A[0][1],[$C[0],#16*0] ldp $A[0][2],$A[0][3],[$C[0],#16*1] ldp $A[0][4],$A[1][0],[$C[0],#16*2] ldp $A[1][1],$A[1][2],[$C[0],#16*3] ldp $A[1][3],$A[1][4],[$C[0],#16*4] ldp $A[2][0],$A[2][1],[$C[0],#16*5] ldp $A[2][2],$A[2][3],[$C[0],#16*6] ldp $A[2][4],$A[3][0],[$C[0],#16*7] ldp $A[3][1],$A[3][2],[$C[0],#16*8] ldp $A[3][3],$A[3][4],[$C[0],#16*9] ldp $A[4][0],$A[4][1],[$C[0],#16*10] ldp $A[4][2],$A[4][3],[$C[0],#16*11] ldr $A[4][4],[$C[0],#16*12] b .Loop_absorb .align 4 .Loop_absorb: subs $C[0],$C[2],$C[3] // len - bsz blo .Labsorbed str $C[0],[sp,#48] // save len - bsz ___ for (my $i=0; $i<24; $i+=2) { my $j = $i+1; $code.=<<___; ldr $C[0],[$C[1]],#8 // *inp++ #ifdef __AARCH64EB__ rev $C[0],$C[0] #endif eor $A[$i/5][$i%5],$A[$i/5][$i%5],$C[0] cmp $C[3],#8*($i+2) blo .Lprocess_block ldr $C[0],[$C[1]],#8 // *inp++ #ifdef __AARCH64EB__ rev $C[0],$C[0] #endif eor $A[$j/5][$j%5],$A[$j/5][$j%5],$C[0] beq .Lprocess_block ___ } $code.=<<___; ldr $C[0],[$C[1]],#8 // *inp++ #ifdef __AARCH64EB__ rev $C[0],$C[0] #endif eor $A[4][4],$A[4][4],$C[0] .Lprocess_block: str $C[1],[sp,#40] // save inp bl KeccakF1600_int ldr $C[1],[sp,#40] // restore arguments ldp $C[2],$C[3],[sp,#48] b .Loop_absorb .align 4 .Labsorbed: ldr $C[1],[sp,#32] stp $A[0][0],$A[0][1],[$C[1],#16*0] stp $A[0][2],$A[0][3],[$C[1],#16*1] stp $A[0][4],$A[1][0],[$C[1],#16*2] stp $A[1][1],$A[1][2],[$C[1],#16*3] stp $A[1][3],$A[1][4],[$C[1],#16*4] stp $A[2][0],$A[2][1],[$C[1],#16*5] stp $A[2][2],$A[2][3],[$C[1],#16*6] stp $A[2][4],$A[3][0],[$C[1],#16*7] stp $A[3][1],$A[3][2],[$C[1],#16*8] stp $A[3][3],$A[3][4],[$C[1],#16*9] stp $A[4][0],$A[4][1],[$C[1],#16*10] stp $A[4][2],$A[4][3],[$C[1],#16*11] str $A[4][4],[$C[1],#16*12] mov x0,$C[2] // return value ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_absorb,.-SHA3_absorb ___ { my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22)); $code.=<<___; .globl SHA3_squeeze .type SHA3_squeeze,%function .align 5 SHA3_squeeze: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-48]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] mov $A_flat,x0 // put aside arguments mov $out,x1 mov $len,x2 mov $bsz,x3 .Loop_squeeze: ldr x4,[x0],#8 cmp $len,#8 blo .Lsqueeze_tail #ifdef __AARCH64EB__ rev x4,x4 #endif str x4,[$out],#8 subs $len,$len,#8 beq .Lsqueeze_done subs x3,x3,#8 bhi .Loop_squeeze mov x0,$A_flat bl KeccakF1600 mov x0,$A_flat mov x3,$bsz b .Loop_squeeze .align 4 .Lsqueeze_tail: strb w4,[$out],#1 lsr x4,x4,#8 subs $len,$len,#1 beq .Lsqueeze_done strb w4,[$out],#1 lsr x4,x4,#8 subs $len,$len,#1 beq .Lsqueeze_done strb w4,[$out],#1 lsr x4,x4,#8 subs $len,$len,#1 beq .Lsqueeze_done strb w4,[$out],#1 lsr x4,x4,#8 subs $len,$len,#1 beq .Lsqueeze_done strb w4,[$out],#1 lsr x4,x4,#8 subs $len,$len,#1 beq .Lsqueeze_done strb w4,[$out],#1 lsr x4,x4,#8 subs $len,$len,#1 beq .Lsqueeze_done strb w4,[$out],#1 .Lsqueeze_done: ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x29,x30,[sp],#48 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_squeeze,.-SHA3_squeeze ___ } }}} {{{ my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b", "v".($_+3).".16b", "v".($_+4).".16b" ], (0, 5, 10, 15, 20)); my @C = map("v$_.16b", (25..31)); my @D = @C[4,5,6,2,3]; $code.=<<___; .type KeccakF1600_ce,%function .align 5 KeccakF1600_ce: mov x9,#24 adr x10,iotas b .Loop_ce .align 4 .Loop_ce: ////////////////////////////////////////////////// Theta eor3 $C[0],$A[4][0],$A[3][0],$A[2][0] eor3 $C[1],$A[4][1],$A[3][1],$A[2][1] eor3 $C[2],$A[4][2],$A[3][2],$A[2][2] eor3 $C[3],$A[4][3],$A[3][3],$A[2][3] eor3 $C[4],$A[4][4],$A[3][4],$A[2][4] eor3 $C[0],$C[0], $A[1][0],$A[0][0] eor3 $C[1],$C[1], $A[1][1],$A[0][1] eor3 $C[2],$C[2], $A[1][2],$A[0][2] eor3 $C[3],$C[3], $A[1][3],$A[0][3] eor3 $C[4],$C[4], $A[1][4],$A[0][4] rax1 $C[5],$C[0],$C[2] // D[1] rax1 $C[6],$C[1],$C[3] // D[2] rax1 $C[2],$C[2],$C[4] // D[3] rax1 $C[3],$C[3],$C[0] // D[4] rax1 $C[4],$C[4],$C[1] // D[0] ////////////////////////////////////////////////// Theta+Rho+Pi xar $C[0], $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0] xar $A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1] xar $A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4] xar $A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2] xar $A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4] xar $A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0] xar $C[1], $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0] xar $A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2] xar $A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3] xar $A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4] xar $A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3] xar $A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0] xar $A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4] xar $D[4], $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4] xar $A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1] xar $A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1] xar $A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3] xar $A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0] xar $A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3] eor $A[0][0],$A[0][0],$D[0] xar $D[3], $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3] xar $A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3] xar $D[1], $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2] xar $D[2], $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1] xar $D[0], $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2] ////////////////////////////////////////////////// Chi+Iota bcax $A[4][0],$C[1], $A[4][2],$A[1][3] // A[1][3]=A[4][1] bcax $A[4][1],$A[1][3],$A[4][3],$A[4][2] // A[1][3]=A[4][1] bcax $A[4][2],$A[4][2],$A[4][4],$A[4][3] bcax 
$A[4][3],$A[4][3],$C[1], $A[4][4] bcax $A[4][4],$A[4][4],$A[1][3],$C[1] // A[1][3]=A[4][1] ld1r {$C[1]},[x10],#8 bcax $A[3][2],$D[1], $A[3][4],$A[0][3] // A[0][3]=A[3][3] bcax $A[3][3],$A[0][3],$A[3][0],$A[3][4] // A[0][3]=A[3][3] bcax $A[3][4],$A[3][4],$A[3][1],$A[3][0] bcax $A[3][0],$A[3][0],$D[1], $A[3][1] bcax $A[3][1],$A[3][1],$A[0][3],$D[1] // A[0][3]=A[3][3] bcax $A[2][0],$C[0], $A[2][2],$D[2] bcax $A[2][1],$D[2], $A[2][3],$A[2][2] bcax $A[2][2],$A[2][2],$A[2][4],$A[2][3] bcax $A[2][3],$A[2][3],$C[0], $A[2][4] bcax $A[2][4],$A[2][4],$D[2], $C[0] bcax $A[1][2],$D[0], $A[1][4],$A[0][4] // A[0][4]=A[1][3] bcax $A[1][3],$A[0][4],$A[1][0],$A[1][4] // A[0][4]=A[1][3] bcax $A[1][4],$A[1][4],$A[1][1],$A[1][0] bcax $A[1][0],$A[1][0],$D[0], $A[1][1] bcax $A[1][1],$A[1][1],$A[0][4],$D[0] // A[0][4]=A[1][3] bcax $A[0][3],$D[3], $A[0][0],$D[4] bcax $A[0][4],$D[4], $A[0][1],$A[0][0] bcax $A[0][0],$A[0][0],$A[0][2],$A[0][1] bcax $A[0][1],$A[0][1],$D[3], $A[0][2] bcax $A[0][2],$A[0][2],$D[4], $D[3] eor $A[0][0],$A[0][0],$C[1] subs x9,x9,#1 bne .Loop_ce ret .size KeccakF1600_ce,.-KeccakF1600_ce .type KeccakF1600_cext,%function .align 5 KeccakF1600_cext: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-80]! add x29,sp,#0 stp d8,d9,[sp,#16] // per ABI requirement stp d10,d11,[sp,#32] stp d12,d13,[sp,#48] stp d14,d15,[sp,#64] ___ for($i=0; $i<24; $i+=2) { # load A[5][5] my $j=$i+1; $code.=<<___; ldp d$i,d$j,[x0,#8*$i] ___ } $code.=<<___; ldr d24,[x0,#8*$i] bl KeccakF1600_ce ldr x30,[sp,#8] ___ for($i=0; $i<24; $i+=2) { # store A[5][5] my $j=$i+1; $code.=<<___; stp d$i,d$j,[x0,#8*$i] ___ } $code.=<<___; str d24,[x0,#8*$i] ldp d8,d9,[sp,#16] ldp d10,d11,[sp,#32] ldp d12,d13,[sp,#48] ldp d14,d15,[sp,#64] ldr x29,[sp],#80 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size KeccakF1600_cext,.-KeccakF1600_cext ___ { my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3)); $code.=<<___; .globl SHA3_absorb_cext .type SHA3_absorb_cext,%function .align 5 SHA3_absorb_cext: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-80]! 
add x29,sp,#0 stp d8,d9,[sp,#16] // per ABI requirement stp d10,d11,[sp,#32] stp d12,d13,[sp,#48] stp d14,d15,[sp,#64] ___ for($i=0; $i<24; $i+=2) { # load A[5][5] my $j=$i+1; $code.=<<___; ldp d$i,d$j,[x0,#8*$i] ___ } $code.=<<___; ldr d24,[x0,#8*$i] b .Loop_absorb_ce .align 4 .Loop_absorb_ce: subs $len,$len,$bsz // len - bsz blo .Labsorbed_ce ___ for (my $i=0; $i<24; $i+=2) { my $j = $i+1; $code.=<<___; ldr d31,[$inp],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor $A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b cmp $bsz,#8*($i+2) blo .Lprocess_block_ce ldr d31,[$inp],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor $A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b beq .Lprocess_block_ce ___ } $code.=<<___; ldr d31,[$inp],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor $A[4][4],$A[4][4],v31.16b .Lprocess_block_ce: bl KeccakF1600_ce b .Loop_absorb_ce .align 4 .Labsorbed_ce: ___ for($i=0; $i<24; $i+=2) { # store A[5][5] my $j=$i+1; $code.=<<___; stp d$i,d$j,[x0,#8*$i] ___ } $code.=<<___; str d24,[x0,#8*$i] add x0,$len,$bsz // return value ldp d8,d9,[sp,#16] ldp d10,d11,[sp,#32] ldp d12,d13,[sp,#48] ldp d14,d15,[sp,#64] ldp x29,x30,[sp],#80 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_absorb_cext,.-SHA3_absorb_cext ___ } { my ($ctx,$out,$len,$bsz) = map("x$_",(0..3)); $code.=<<___; .globl SHA3_squeeze_cext .type SHA3_squeeze_cext,%function .align 5 SHA3_squeeze_cext: - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-16]! add x29,sp,#0 mov x9,$ctx mov x10,$bsz .Loop_squeeze_ce: ldr x4,[x9],#8 cmp $len,#8 blo .Lsqueeze_tail_ce #ifdef __AARCH64EB__ rev x4,x4 #endif str x4,[$out],#8 beq .Lsqueeze_done_ce sub $len,$len,#8 subs x10,x10,#8 bhi .Loop_squeeze_ce bl KeccakF1600_cext ldr x30,[sp,#8] mov x9,$ctx mov x10,$bsz b .Loop_squeeze_ce .align 4 .Lsqueeze_tail_ce: strb w4,[$out],#1 lsr x4,x4,#8 subs $len,$len,#1 beq .Lsqueeze_done_ce strb w4,[$out],#1 lsr x4,x4,#8 subs $len,$len,#1 beq .Lsqueeze_done_ce strb w4,[$out],#1 lsr x4,x4,#8 subs $len,$len,#1 beq .Lsqueeze_done_ce strb w4,[$out],#1 lsr x4,x4,#8 subs $len,$len,#1 beq .Lsqueeze_done_ce strb w4,[$out],#1 lsr x4,x4,#8 subs $len,$len,#1 beq .Lsqueeze_done_ce strb w4,[$out],#1 lsr x4,x4,#8 subs $len,$len,#1 beq .Lsqueeze_done_ce strb w4,[$out],#1 .Lsqueeze_done_ce: ldr x29,[sp],#16 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_squeeze_cext,.-SHA3_squeeze_cext ___ } }}} $code.=<<___; .asciz "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by " ___ { my %opcode = ( "rax1" => 0xce608c00, "eor3" => 0xce000000, "bcax" => 0xce200000, "xar" => 0xce800000 ); sub unsha3 { my ($mnemonic,$arg)=@_; $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/ && sprintf ".inst\t0x%08x\t//%s %s", $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10), $mnemonic,$arg; } } foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/ge; m/\bld1r\b/ and s/\.16b/.2d/g or s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/openssl/crypto/sha/asm/sha1-armv8.pl b/crypto/openssl/crypto/sha/asm/sha1-armv8.pl index cdea8845af85..5f23a20c1ab7 100755 --- a/crypto/openssl/crypto/sha/asm/sha1-armv8.pl +++ b/crypto/openssl/crypto/sha/asm/sha1-armv8.pl @@ -1,358 +1,361 @@ #! /usr/bin/env perl # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). 
You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # SHA1 for ARMv8. # # Performance in cycles per processed byte and improvement coefficient # over code generated with "default" compiler: # # hardware-assisted software(*) # Apple A7 2.31 4.13 (+14%) # Cortex-A53 2.24 8.03 (+97%) # Cortex-A57 2.35 7.88 (+74%) # Denver 2.13 3.97 (+0%)(**) # X-Gene 8.80 (+200%) # Mongoose 2.05 6.50 (+160%) # Kryo 1.88 8.00 (+90%) # ThunderX2 2.64 6.36 (+150%) # # (*) Software results are presented mostly for reference purposes. # (**) Keep in mind that Denver relies on binary translation, which # optimizes compiler output at run-time. # $output is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $1"; *STDOUT=*OUT; ($ctx,$inp,$num)=("x0","x1","x2"); @Xw=map("w$_",(3..17,19)); @Xx=map("x$_",(3..17,19)); @V=($A,$B,$C,$D,$E)=map("w$_",(20..24)); ($t0,$t1,$t2,$K)=map("w$_",(25..28)); sub BODY_00_19 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=($i+2)&15; $code.=<<___ if ($i<15 && !($i&1)); lsr @Xx[$i+1],@Xx[$i],#32 ___ $code.=<<___ if ($i<14 && !($i&1)); ldur @Xx[$i+2],[$inp,#`($i+2)*4-64`] ___ $code.=<<___ if ($i<14 && ($i&1)); #ifdef __AARCH64EB__ ror @Xx[$i+1],@Xx[$i+1],#32 #else rev32 @Xx[$i+1],@Xx[$i+1] #endif ___ $code.=<<___ if ($i<14); bic $t0,$d,$b and $t1,$c,$b ror $t2,$a,#27 add $d,$d,$K // future e+=K orr $t0,$t0,$t1 add $e,$e,$t2 // e+=rot(a,5) ror $b,$b,#2 add $d,$d,@Xw[($i+1)&15] // future e+=X[i] add $e,$e,$t0 // e+=F(b,c,d) ___ $code.=<<___ if ($i==19); movz $K,#0xeba1 movk $K,#0x6ed9,lsl#16 ___ $code.=<<___ if ($i>=14); eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] bic $t0,$d,$b and $t1,$c,$b ror $t2,$a,#27 eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] add $d,$d,$K // future e+=K orr $t0,$t0,$t1 add $e,$e,$t2 // e+=rot(a,5) eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] ror $b,$b,#2 add $d,$d,@Xw[($i+1)&15] // future e+=X[i] add $e,$e,$t0 // e+=F(b,c,d) ror @Xw[$j],@Xw[$j],#31 ___ } sub BODY_40_59 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=($i+2)&15; $code.=<<___ if ($i==59); movz $K,#0xc1d6 movk $K,#0xca62,lsl#16 ___ $code.=<<___; orr $t0,$b,$c and $t1,$b,$c eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] ror $t2,$a,#27 and $t0,$t0,$d add $d,$d,$K // future e+=K eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] add $e,$e,$t2 // e+=rot(a,5) orr $t0,$t0,$t1 ror $b,$b,#2 eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] add $d,$d,@Xw[($i+1)&15] // future e+=X[i] add $e,$e,$t0 // e+=F(b,c,d) ror @Xw[$j],@Xw[$j],#31 ___ } sub BODY_20_39 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=($i+2)&15; $code.=<<___ if ($i==39); movz $K,#0xbcdc movk $K,#0x8f1b,lsl#16 ___ $code.=<<___ if ($i<78); eor @Xw[$j],@Xw[$j],@Xw[($j+2)&15] eor $t0,$d,$b 
ror $t2,$a,#27 add $d,$d,$K // future e+=K eor @Xw[$j],@Xw[$j],@Xw[($j+8)&15] eor $t0,$t0,$c add $e,$e,$t2 // e+=rot(a,5) ror $b,$b,#2 eor @Xw[$j],@Xw[$j],@Xw[($j+13)&15] add $d,$d,@Xw[($i+1)&15] // future e+=X[i] add $e,$e,$t0 // e+=F(b,c,d) ror @Xw[$j],@Xw[$j],#31 ___ $code.=<<___ if ($i==78); ldp @Xw[1],@Xw[2],[$ctx] eor $t0,$d,$b ror $t2,$a,#27 add $d,$d,$K // future e+=K eor $t0,$t0,$c add $e,$e,$t2 // e+=rot(a,5) ror $b,$b,#2 add $d,$d,@Xw[($i+1)&15] // future e+=X[i] add $e,$e,$t0 // e+=F(b,c,d) ___ $code.=<<___ if ($i==79); ldp @Xw[3],@Xw[4],[$ctx,#8] eor $t0,$d,$b ror $t2,$a,#27 eor $t0,$t0,$c add $e,$e,$t2 // e+=rot(a,5) ror $b,$b,#2 ldr @Xw[5],[$ctx,#16] add $e,$e,$t0 // e+=F(b,c,d) ___ } $code.=<<___; +#include "arm_arch.h" #ifndef __KERNEL__ -# include "arm_arch.h" .extern OPENSSL_armcap_P .hidden OPENSSL_armcap_P #endif .text .globl sha1_block_data_order .type sha1_block_data_order,%function .align 6 sha1_block_data_order: + AARCH64_VALID_CALL_TARGET adrp x16,OPENSSL_armcap_P ldr w16,[x16,#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA1 b.ne .Lv8_entry + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-96]! add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] ldp $A,$B,[$ctx] ldp $C,$D,[$ctx,#8] ldr $E,[$ctx,#16] .Loop: ldr @Xx[0],[$inp],#64 movz $K,#0x7999 sub $num,$num,#1 movk $K,#0x5a82,lsl#16 #ifdef __AARCH64EB__ ror $Xx[0],@Xx[0],#32 #else rev32 @Xx[0],@Xx[0] #endif add $E,$E,$K // warm it up add $E,$E,@Xw[0] ___ for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; add $B,$B,@Xw[2] add $C,$C,@Xw[3] add $A,$A,@Xw[1] add $D,$D,@Xw[4] add $E,$E,@Xw[5] stp $A,$B,[$ctx] stp $C,$D,[$ctx,#8] str $E,[$ctx,#16] cbnz $num,.Loop ldp x19,x20,[sp,#16] ldp x21,x22,[sp,#32] ldp x23,x24,[sp,#48] ldp x25,x26,[sp,#64] ldp x27,x28,[sp,#80] ldr x29,[sp],#96 ret .size sha1_block_data_order,.-sha1_block_data_order ___ {{{ my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3)); my @MSG=map("v$_.16b",(4..7)); my @Kxx=map("v$_.4s",(16..19)); my ($W0,$W1)=("v20.4s","v21.4s"); my $ABCD_SAVE="v22.16b"; $code.=<<___; .type sha1_block_armv8,%function .align 6 sha1_block_armv8: .Lv8_entry: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! 
add x29,sp,#0 adr x4,.Lconst eor $E,$E,$E ld1.32 {$ABCD},[$ctx],#16 ld1.32 {$E}[0],[$ctx] sub $ctx,$ctx,#16 ld1.32 {@Kxx[0]-@Kxx[3]},[x4] .Loop_hw: ld1 {@MSG[0]-@MSG[3]},[$inp],#64 sub $num,$num,#1 rev32 @MSG[0],@MSG[0] rev32 @MSG[1],@MSG[1] add.i32 $W0,@Kxx[0],@MSG[0] rev32 @MSG[2],@MSG[2] orr $ABCD_SAVE,$ABCD,$ABCD // offload add.i32 $W1,@Kxx[0],@MSG[1] rev32 @MSG[3],@MSG[3] sha1h $E1,$ABCD sha1c $ABCD,$E,$W0 // 0 add.i32 $W0,@Kxx[$j],@MSG[2] sha1su0 @MSG[0],@MSG[1],@MSG[2] ___ for ($j=0,$i=1;$i<20-3;$i++) { my $f=("c","p","m","p")[$i/5]; $code.=<<___; sha1h $E0,$ABCD // $i sha1$f $ABCD,$E1,$W1 add.i32 $W1,@Kxx[$j],@MSG[3] sha1su1 @MSG[0],@MSG[3] ___ $code.=<<___ if ($i<20-4); sha1su0 @MSG[1],@MSG[2],@MSG[3] ___ ($E0,$E1)=($E1,$E0); ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); $j++ if ((($i+3)%5)==0); } $code.=<<___; sha1h $E0,$ABCD // $i sha1p $ABCD,$E1,$W1 add.i32 $W1,@Kxx[$j],@MSG[3] sha1h $E1,$ABCD // 18 sha1p $ABCD,$E0,$W0 sha1h $E0,$ABCD // 19 sha1p $ABCD,$E1,$W1 add.i32 $E,$E,$E0 add.i32 $ABCD,$ABCD,$ABCD_SAVE cbnz $num,.Loop_hw st1.32 {$ABCD},[$ctx],#16 st1.32 {$E}[0],[$ctx] ldr x29,[sp],#16 ret .size sha1_block_armv8,.-sha1_block_armv8 .align 6 .Lconst: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 .asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by " .align 2 ___ }}} { my %opcode = ( "sha1c" => 0x5e000000, "sha1p" => 0x5e001000, "sha1m" => 0x5e002000, "sha1su0" => 0x5e003000, "sha1h" => 0x5e280800, "sha1su1" => 0x5e281800 ); sub unsha1 { my ($mnemonic,$arg)=@_; $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o && sprintf ".inst\t0x%08x\t//%s %s", $opcode{$mnemonic}|$1|($2<<5)|($3<<16), $mnemonic,$arg; } } foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/geo; s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo; s/\.\w?32\b//o and s/\.16b/\.4s/go; m/(ld|st)1[^\[]+\[0\]/o and s/\.4s/\.s/go; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/openssl/crypto/sha/asm/sha512-armv8.pl b/crypto/openssl/crypto/sha/asm/sha512-armv8.pl index 6bcff0b7d3f3..f900882fee8b 100755 --- a/crypto/openssl/crypto/sha/asm/sha512-armv8.pl +++ b/crypto/openssl/crypto/sha/asm/sha512-armv8.pl @@ -1,889 +1,894 @@ #! /usr/bin/env perl # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # # Permission to use under GPLv2 terms is granted. # ==================================================================== # # SHA256/512 for ARMv8. 
# # Performance in cycles per processed byte and improvement coefficient # over code generated with "default" compiler: # # SHA256-hw SHA256(*) SHA512 # Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) # Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) # Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) # Denver 2.01 10.5 (+26%) 6.70 (+8%) # X-Gene 20.0 (+100%) 12.8 (+300%(***)) # Mongoose 2.36 13.0 (+50%) 8.36 (+33%) # Kryo 1.92 17.4 (+30%) 11.2 (+8%) # ThunderX2 2.54 13.2 (+40%) 8.40 (+18%) # # (*) Software SHA256 results are of lesser relevance, presented # mostly for informational purposes. # (**) The result is a trade-off: it's possible to improve it by # 10% (or by 1 cycle per round), but at the cost of 20% loss # on Cortex-A53 (or by 4 cycles per round). # (***) Super-impressive coefficients over gcc-generated code are # indication of some compiler "pathology", most notably code # generated with -mgeneral-regs-only is significantly faster # and the gap is only 40-90%. # # October 2016. # # Originally it was reckoned that it makes no sense to implement NEON # version of SHA256 for 64-bit processors. This is because performance # improvement on most wide-spread Cortex-A5x processors was observed # to be marginal, same on Cortex-A53 and ~10% on A57. But then it was # observed that 32-bit NEON SHA256 performs significantly better than # 64-bit scalar version on *some* of the more recent processors. As # result 64-bit NEON version of SHA256 was added to provide best # all-round performance. For example it executes ~30% faster on X-Gene # and Mongoose. [For reference, NEON version of SHA512 is bound to # deliver much less improvement, likely *negative* on Cortex-A5x. # Which is why NEON support is limited to SHA256.] # $output is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!"; *STDOUT=*OUT; } else { $output and open STDOUT,">$output"; } if ($output =~ /512/) { $BITS=512; $SZ=8; @Sigma0=(28,34,39); @Sigma1=(14,18,41); @sigma0=(1, 8, 7); @sigma1=(19,61, 6); $rounds=80; $reg_t="x"; } else { $BITS=256; $SZ=4; @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @sigma1=(17,19,10); $rounds=64; $reg_t="w"; } $func="sha${BITS}_block_data_order"; ($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); @X=map("$reg_t$_",(3..15,0..2)); @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); ($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); sub BODY_00_xx { my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; my $j=($i+1)&15; my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); $T0=@X[$i+3] if ($i<11); $code.=<<___ if ($i<16); #ifndef __AARCH64EB__ rev @X[$i],@X[$i] // $i #endif ___ $code.=<<___ if ($i<13 && ($i&1)); ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ ___ $code.=<<___ if ($i==13); ldp @X[14],@X[15],[$inp] ___ $code.=<<___ if ($i>=14); ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] ___ $code.=<<___ if ($i>0 && $i<16); add $a,$a,$t1 // h+=Sigma0(a) ___ $code.=<<___ if ($i>=11); str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] ___ # While ARMv8 specifies merged rotate-n-logical operation such as # 'eor x,y,z,ror#n', it was found to negatively affect performance # on Apple A7. The reason seems to be that it requires even 'y' to # be available earlier. This means that such merged instruction is # not necessarily best choice on critical path... On the other hand # Cortex-A5x handles merged instructions much better than disjoint # rotate and logical... See (**) footnote above. 
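For reference, the rotation-constant arrays set up above (@Sigma0, @Sigma1, @sigma0, @sigma1) are the FIPS 180-4 Sigma/sigma rotation amounts, and the and/bic/orr and "(b^c)&=(a^b)" sequences emitted by BODY_00_xx compute Ch and Maj. A minimal plain-C sketch of those reference functions (illustrative only, not part of the generated module; the function names are ad hoc):

    #include <stdint.h>

    static inline uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
    static inline uint64_t ror64(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }

    /* SHA-256: Sigma0=(2,13,22), Sigma1=(6,11,25), sigma0=(7,18,3), sigma1=(17,19,10) */
    static inline uint32_t Sigma0_256(uint32_t x) { return ror32(x,2)  ^ ror32(x,13) ^ ror32(x,22); }
    static inline uint32_t Sigma1_256(uint32_t x) { return ror32(x,6)  ^ ror32(x,11) ^ ror32(x,25); }
    static inline uint32_t sigma0_256(uint32_t x) { return ror32(x,7)  ^ ror32(x,18) ^ (x >> 3);    }
    static inline uint32_t sigma1_256(uint32_t x) { return ror32(x,17) ^ ror32(x,19) ^ (x >> 10);   }

    /* SHA-512: Sigma0=(28,34,39), Sigma1=(14,18,41), sigma0=(1,8,7), sigma1=(19,61,6) */
    static inline uint64_t Sigma0_512(uint64_t x) { return ror64(x,28) ^ ror64(x,34) ^ ror64(x,39); }
    static inline uint64_t Sigma1_512(uint64_t x) { return ror64(x,14) ^ ror64(x,18) ^ ror64(x,41); }
    static inline uint64_t sigma0_512(uint64_t x) { return ror64(x,1)  ^ ror64(x,8)  ^ (x >> 7);    }
    static inline uint64_t sigma1_512(uint64_t x) { return ror64(x,19) ^ ror64(x,61) ^ (x >> 6);    }

    /* Ch as built from and/bic/orr, Maj via the (b^c)&(a^b) trick used in the
     * scalar rounds (shown on 64-bit values; the identity is width-agnostic). */
    static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)  { return (e & f) | (~e & g); }
    static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c) { return ((b ^ c) & (a ^ b)) ^ b; }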
$code.=<<___ if ($i<15); ror $t0,$e,#$Sigma1[0] add $h,$h,$t2 // h+=K[i] eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` and $t1,$f,$e bic $t2,$g,$e add $h,$h,@X[$i&15] // h+=X[i] orr $t1,$t1,$t2 // Ch(e,f,g) eor $t2,$a,$b // a^b, b^c in next round eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) ror $T0,$a,#$Sigma0[0] add $h,$h,$t1 // h+=Ch(e,f,g) eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` add $h,$h,$t0 // h+=Sigma1(e) and $t3,$t3,$t2 // (b^c)&=(a^b) add $d,$d,$h // d+=h eor $t3,$t3,$b // Maj(a,b,c) eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) add $h,$h,$t3 // h+=Maj(a,b,c) ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round //add $h,$h,$t1 // h+=Sigma0(a) ___ $code.=<<___ if ($i>=15); ror $t0,$e,#$Sigma1[0] add $h,$h,$t2 // h+=K[i] ror $T1,@X[($j+1)&15],#$sigma0[0] and $t1,$f,$e ror $T2,@X[($j+14)&15],#$sigma1[0] bic $t2,$g,$e ror $T0,$a,#$Sigma0[0] add $h,$h,@X[$i&15] // h+=X[i] eor $t0,$t0,$e,ror#$Sigma1[1] eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] orr $t1,$t1,$t2 // Ch(e,f,g) eor $t2,$a,$b // a^b, b^c in next round eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) eor $T0,$T0,$a,ror#$Sigma0[1] add $h,$h,$t1 // h+=Ch(e,f,g) and $t3,$t3,$t2 // (b^c)&=(a^b) eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) add $h,$h,$t0 // h+=Sigma1(e) eor $t3,$t3,$b // Maj(a,b,c) eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) add @X[$j],@X[$j],@X[($j+9)&15] add $d,$d,$h // d+=h add $h,$h,$t3 // h+=Maj(a,b,c) ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round add @X[$j],@X[$j],$T1 add $h,$h,$t1 // h+=Sigma0(a) add @X[$j],@X[$j],$T2 ___ ($t2,$t3)=($t3,$t2); } $code.=<<___; +#include "arm_arch.h" #ifndef __KERNEL__ -# include "arm_arch.h" .extern OPENSSL_armcap_P .hidden OPENSSL_armcap_P #endif .text .globl $func .type $func,%function .align 6 $func: + AARCH64_VALID_CALL_TARGET #ifndef __KERNEL__ adrp x16,OPENSSL_armcap_P ldr w16,[x16,#:lo12:OPENSSL_armcap_P] ___ $code.=<<___ if ($SZ==4); tst w16,#ARMV8_SHA256 b.ne .Lv8_entry tst w16,#ARMV7_NEON b.ne .Lneon_entry ___ $code.=<<___ if ($SZ==8); tst w16,#ARMV8_SHA512 b.ne .Lv8_entry ___ $code.=<<___; #endif - .inst 0xd503233f // paciasp + AARCH64_SIGN_LINK_REGISTER stp x29,x30,[sp,#-128]! 
add x29,sp,#0 stp x19,x20,[sp,#16] stp x21,x22,[sp,#32] stp x23,x24,[sp,#48] stp x25,x26,[sp,#64] stp x27,x28,[sp,#80] sub sp,sp,#4*$SZ ldp $A,$B,[$ctx] // load context ldp $C,$D,[$ctx,#2*$SZ] ldp $E,$F,[$ctx,#4*$SZ] add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input ldp $G,$H,[$ctx,#6*$SZ] adr $Ktbl,.LK$BITS stp $ctx,$num,[x29,#96] .Loop: ldp @X[0],@X[1],[$inp],#2*$SZ ldr $t2,[$Ktbl],#$SZ // *K++ eor $t3,$B,$C // magic seed str $inp,[x29,#112] ___ for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } $code.=".Loop_16_xx:\n"; for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; cbnz $t2,.Loop_16_xx ldp $ctx,$num,[x29,#96] ldr $inp,[x29,#112] sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind ldp @X[0],@X[1],[$ctx] ldp @X[2],@X[3],[$ctx,#2*$SZ] add $inp,$inp,#14*$SZ // advance input pointer ldp @X[4],@X[5],[$ctx,#4*$SZ] add $A,$A,@X[0] ldp @X[6],@X[7],[$ctx,#6*$SZ] add $B,$B,@X[1] add $C,$C,@X[2] add $D,$D,@X[3] stp $A,$B,[$ctx] add $E,$E,@X[4] add $F,$F,@X[5] stp $C,$D,[$ctx,#2*$SZ] add $G,$G,@X[6] add $H,$H,@X[7] cmp $inp,$num stp $E,$F,[$ctx,#4*$SZ] stp $G,$H,[$ctx,#6*$SZ] b.ne .Loop ldp x19,x20,[x29,#16] add sp,sp,#4*$SZ ldp x21,x22,[x29,#32] ldp x23,x24,[x29,#48] ldp x25,x26,[x29,#64] ldp x27,x28,[x29,#80] ldp x29,x30,[sp],#128 - .inst 0xd50323bf // autiasp + AARCH64_VALIDATE_LINK_REGISTER ret .size $func,.-$func .align 6 .type .LK$BITS,%object .LK$BITS: ___ $code.=<<___ if ($SZ==8); .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0 // terminator ___ $code.=<<___ if ($SZ==4); .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator ___ $code.=<<___; .size .LK$BITS,.-.LK$BITS .asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by " .align 2 ___ if ($SZ==4) { my $Ktbl="x3"; my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); my @MSG=map("v$_.16b",(4..7)); my ($W0,$W1)=("v16.4s","v17.4s"); my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); $code.=<<___; #ifndef __KERNEL__ .type sha256_block_armv8,%function .align 6 sha256_block_armv8: .Lv8_entry: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. stp x29,x30,[sp,#-16]! add x29,sp,#0 ld1.32 {$ABCD,$EFGH},[$ctx] adr $Ktbl,.LK256 .Loop_hw: ld1 {@MSG[0]-@MSG[3]},[$inp],#64 sub $num,$num,#1 ld1.32 {$W0},[$Ktbl],#16 rev32 @MSG[0],@MSG[0] rev32 @MSG[1],@MSG[1] rev32 @MSG[2],@MSG[2] rev32 @MSG[3],@MSG[3] orr $ABCD_SAVE,$ABCD,$ABCD // offload orr $EFGH_SAVE,$EFGH,$EFGH ___ for($i=0;$i<12;$i++) { $code.=<<___; ld1.32 {$W1},[$Ktbl],#16 add.i32 $W0,$W0,@MSG[0] sha256su0 @MSG[0],@MSG[1] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W0 sha256h2 $EFGH,$abcd,$W0 sha256su1 @MSG[0],@MSG[2],@MSG[3] ___ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); } $code.=<<___; ld1.32 {$W1},[$Ktbl],#16 add.i32 $W0,$W0,@MSG[0] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W0 sha256h2 $EFGH,$abcd,$W0 ld1.32 {$W0},[$Ktbl],#16 add.i32 $W1,$W1,@MSG[1] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W1 sha256h2 $EFGH,$abcd,$W1 ld1.32 {$W1},[$Ktbl] add.i32 $W0,$W0,@MSG[2] sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W0 sha256h2 $EFGH,$abcd,$W0 add.i32 $W1,$W1,@MSG[3] orr $abcd,$ABCD,$ABCD sha256h $ABCD,$EFGH,$W1 sha256h2 $EFGH,$abcd,$W1 add.i32 $ABCD,$ABCD,$ABCD_SAVE add.i32 $EFGH,$EFGH,$EFGH_SAVE cbnz $num,.Loop_hw st1.32 {$ABCD,$EFGH},[$ctx] ldr x29,[sp],#16 ret .size sha256_block_armv8,.-sha256_block_armv8 #endif ___ } if ($SZ==4) { ######################################### NEON stuff # # You'll surely note a lot of similarities with sha256-armv4 module, # and of course it's not a coincidence. sha256-armv4 was used as # initial template, but was adapted for ARMv8 instruction set and # extensively re-tuned for all-round performance. 
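NEON has no 32-bit vector rotate instruction, so the Xupdate/Xpreload message-schedule code below composes each rotation from a ushr (unsigned shift right) followed by sli (shift left and insert). A small arm_neon.h sketch of that idiom, using sigma1's rotate-by-17 as the example constant; the helper name is made up and the snippet is illustrative rather than part of the module:

    #include <stdio.h>
    #include <arm_neon.h>

    /* Rotate each 32-bit lane right by 17, composed the same way as the
     * ushr_32/sli_32 pairs in Xupdate: shift right, then insert the
     * left-shifted copy while keeping the low 15 bits already in t. */
    static inline uint32x4_t ror32x4_by17(uint32x4_t x)
    {
        uint32x4_t t = vshrq_n_u32(x, 17);      /* x >> 17 per lane      */
        return vsliq_n_u32(t, x, 32 - 17);      /* OR in x << 15 per lane */
    }

    int main(void)
    {
        uint32x4_t v = vdupq_n_u32(0x80000001u);
        uint32x4_t r = ror32x4_by17(v);
        printf("0x%08x\n", vgetq_lane_u32(r, 0));   /* expect 0x0000c000 */
        return 0;
    }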
my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); my $Ktbl="x16"; my $Xfer="x17"; my @X = map("q$_",(0..3)); my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); my $j=0; sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; my $arg = pop; $arg = "#$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } sub Xupdate() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T2,$T0,$sigma0[0]); eval(shift(@insns)); &ushr_32 ($T1,$T0,$sigma0[2]); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] eval(shift(@insns)); &sli_32 ($T2,$T0,32-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T3,$T0,$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T1,$T1,$T2); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T3,$T0,32-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T4,$T7,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T4,$T7,32-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T5,$T7,$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T3,$T7,$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &sli_u32 ($T3,$T7,32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T4); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &ushr_32 ($T6,@X[0],$sigma1[0]); eval(shift(@insns)); &ushr_32 ($T7,@X[0],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T6,@X[0],32-$sigma1[0]); eval(shift(@insns)); &ushr_32 ($T5,@X[0],$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T7,$T7,$T6); eval(shift(@insns)); eval(shift(@insns)); &sli_32 ($T5,@X[0],32-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &ld1_32 ("{$T0}","[$Ktbl], #16"); eval(shift(@insns)); &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); &eor_8 ($T5,$T5,$T5); eval(shift(@insns)); eval(shift(@insns)); &mov (&Dhi($T5), &Dlo($T7)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 ($T0,$T0,@X[0]); while($#insns>=1) { eval(shift(@insns)); } &st1_32 ("{$T0}","[$Xfer], #16"); eval(shift(@insns)); push(@X,shift(@X)); # "rotate" X[] } sub Xpreload() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); my ($a,$b,$c,$d,$e,$f,$g,$h); eval(shift(@insns)); eval(shift(@insns)); &ld1_8 ("{@X[0]}","[$inp],#16"); eval(shift(@insns)); eval(shift(@insns)); &ld1_32 
("{$T0}","[$Ktbl],#16"); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &rev32 (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &add_32 ($T0,$T0,@X[0]); foreach (@insns) { eval; } # remaining instructions &st1_32 ("{$T0}","[$Xfer], #16"); push(@X,shift(@X)); # "rotate" X[] } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. '&add ($h,$h,$t1)', # h+=X[i]+K[i] '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past '&and ($t1,$f,$e)', '&bic ($t4,$g,$e)', '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past '&orr ($t1,$t1,$t4)', # Ch(e,f,g) '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', '&add ($h,$h,$t1)', # h+=Ch(e,f,g) '&ror ($t0,$t0,"#$Sigma1[0]")', '&eor ($t2,$a,$b)', # a^b, b^c in next round '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) '&add ($h,$h,$t0)', # h+=Sigma1(e) '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. '&ldr ($t1,"[$Ktbl]") if ($j==15);'. '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) '&ror ($t4,$t4,"#$Sigma0[0]")', '&add ($d,$d,$h)', # d+=h '&eor ($t3,$t3,$b)', # Maj(a,b,c) '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' ) } $code.=<<___; #ifdef __KERNEL__ .globl sha256_block_neon #endif .type sha256_block_neon,%function .align 4 sha256_block_neon: + AARCH64_VALID_CALL_TARGET .Lneon_entry: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later stp x29, x30, [sp, #-16]! mov x29, sp sub sp,sp,#16*4 adr $Ktbl,.LK256 add $num,$inp,$num,lsl#6 // len to point at the end of inp ld1.8 {@X[0]},[$inp], #16 ld1.8 {@X[1]},[$inp], #16 ld1.8 {@X[2]},[$inp], #16 ld1.8 {@X[3]},[$inp], #16 ld1.32 {$T0},[$Ktbl], #16 ld1.32 {$T1},[$Ktbl], #16 ld1.32 {$T2},[$Ktbl], #16 ld1.32 {$T3},[$Ktbl], #16 rev32 @X[0],@X[0] // yes, even on rev32 @X[1],@X[1] // big-endian rev32 @X[2],@X[2] rev32 @X[3],@X[3] mov $Xfer,sp add.32 $T0,$T0,@X[0] add.32 $T1,$T1,@X[1] add.32 $T2,$T2,@X[2] st1.32 {$T0-$T1},[$Xfer], #32 add.32 $T3,$T3,@X[3] st1.32 {$T2-$T3},[$Xfer] sub $Xfer,$Xfer,#32 ldp $A,$B,[$ctx] ldp $C,$D,[$ctx,#8] ldp $E,$F,[$ctx,#16] ldp $G,$H,[$ctx,#24] ldr $t1,[sp,#0] mov $t2,wzr eor $t3,$B,$C mov $t4,wzr b .L_00_48 .align 4 .L_00_48: ___ &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); &Xupdate(\&body_00_15); $code.=<<___; cmp $t1,#0 // check for K256 terminator ldr $t1,[sp,#0] sub $Xfer,$Xfer,#64 bne .L_00_48 sub $Ktbl,$Ktbl,#256 // rewind $Ktbl cmp $inp,$num mov $Xfer, #64 csel $Xfer, $Xfer, xzr, eq sub $inp,$inp,$Xfer // avoid SEGV mov $Xfer,sp ___ &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); $code.=<<___; add $A,$A,$t4 // h+=Sigma0(a) from the past ldp $t0,$t1,[$ctx,#0] add $A,$A,$t2 // h+=Maj(a,b,c) from the past ldp $t2,$t3,[$ctx,#8] add $A,$A,$t0 // accumulate add $B,$B,$t1 ldp $t0,$t1,[$ctx,#16] add $C,$C,$t2 add $D,$D,$t3 ldp $t2,$t3,[$ctx,#24] add $E,$E,$t0 add $F,$F,$t1 ldr $t1,[sp,#0] stp $A,$B,[$ctx,#0] add $G,$G,$t2 mov $t2,wzr stp $C,$D,[$ctx,#8] add $H,$H,$t3 stp $E,$F,[$ctx,#16] eor $t3,$B,$C stp $G,$H,[$ctx,#24] mov $t4,wzr mov $Xfer,sp b.ne .L_00_48 ldr x29,[x29] add sp,sp,#16*4+16 ret .size sha256_block_neon,.-sha256_block_neon ___ } if ($SZ==8) { my $Ktbl="x3"; my @H = map("v$_.16b",(0..4)); my ($fg,$de,$m9_10)=map("v$_.16b",(5..7)); my @MSG=map("v$_.16b",(16..23)); my ($W0,$W1)=("v24.2d","v25.2d"); my ($AB,$CD,$EF,$GH)=map("v$_.16b",(26..29)); 
$code.=<<___; #ifndef __KERNEL__ .type sha512_block_armv8,%function .align 6 sha512_block_armv8: .Lv8_entry: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later stp x29,x30,[sp,#-16]! add x29,sp,#0 ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input ld1 {@MSG[4]-@MSG[7]},[$inp],#64 ld1.64 {@H[0]-@H[3]},[$ctx] // load context adr $Ktbl,.LK512 rev64 @MSG[0],@MSG[0] rev64 @MSG[1],@MSG[1] rev64 @MSG[2],@MSG[2] rev64 @MSG[3],@MSG[3] rev64 @MSG[4],@MSG[4] rev64 @MSG[5],@MSG[5] rev64 @MSG[6],@MSG[6] rev64 @MSG[7],@MSG[7] b .Loop_hw .align 4 .Loop_hw: ld1.64 {$W0},[$Ktbl],#16 subs $num,$num,#1 sub x4,$inp,#128 orr $AB,@H[0],@H[0] // offload orr $CD,@H[1],@H[1] orr $EF,@H[2],@H[2] orr $GH,@H[3],@H[3] csel $inp,$inp,x4,ne // conditional rewind ___ for($i=0;$i<32;$i++) { $code.=<<___; add.i64 $W0,$W0,@MSG[0] ld1.64 {$W1},[$Ktbl],#16 ext $W0,$W0,$W0,#8 ext $fg,@H[2],@H[3],#8 ext $de,@H[1],@H[2],#8 add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]" sha512su0 @MSG[0],@MSG[1] ext $m9_10,@MSG[4],@MSG[5],#8 sha512h @H[3],$fg,$de sha512su1 @MSG[0],@MSG[7],$m9_10 add.i64 @H[4],@H[1],@H[3] // "D + T1" sha512h2 @H[3],$H[1],@H[0] ___ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); @H = (@H[3],@H[0],@H[4],@H[2],@H[1]); } for(;$i<40;$i++) { $code.=<<___ if ($i<39); ld1.64 {$W1},[$Ktbl],#16 ___ $code.=<<___ if ($i==39); sub $Ktbl,$Ktbl,#$rounds*$SZ // rewind ___ $code.=<<___; add.i64 $W0,$W0,@MSG[0] ld1 {@MSG[0]},[$inp],#16 // load next input ext $W0,$W0,$W0,#8 ext $fg,@H[2],@H[3],#8 ext $de,@H[1],@H[2],#8 add.i64 @H[3],@H[3],$W0 // "T1 + H + K512[i]" sha512h @H[3],$fg,$de rev64 @MSG[0],@MSG[0] add.i64 @H[4],@H[1],@H[3] // "D + T1" sha512h2 @H[3],$H[1],@H[0] ___ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); @H = (@H[3],@H[0],@H[4],@H[2],@H[1]); } $code.=<<___; add.i64 @H[0],@H[0],$AB // accumulate add.i64 @H[1],@H[1],$CD add.i64 @H[2],@H[2],$EF add.i64 @H[3],@H[3],$GH cbnz $num,.Loop_hw st1.64 {@H[0]-@H[3]},[$ctx] // store context ldr x29,[sp],#16 ret .size sha512_block_armv8,.-sha512_block_armv8 #endif ___ } { my %opcode = ( "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); sub unsha256 { my ($mnemonic,$arg)=@_; $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o && sprintf ".inst\t0x%08x\t//%s %s", $opcode{$mnemonic}|$1|($2<<5)|($3<<16), $mnemonic,$arg; } } { my %opcode = ( "sha512h" => 0xce608000, "sha512h2" => 0xce608400, "sha512su0" => 0xcec08000, "sha512su1" => 0xce608800 ); sub unsha512 { my ($mnemonic,$arg)=@_; $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o && sprintf ".inst\t0x%08x\t//%s %s", $opcode{$mnemonic}|$1|($2<<5)|($3<<16), $mnemonic,$arg; } } open SELF,$0; while() { next if (/^#!/); last if (!s/^#/\/\// and !/^$/); print; } close SELF; foreach(split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers s/\.[ui]?8(\s)/$1/; s/\.\w?64\b// and s/\.16b/\.2d/g or s/\.\w?32\b// and s/\.16b/\.4s/g; m/\bext\b/ and s/\.2d/\.16b/g or m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/openssl/crypto/sha/build.info b/crypto/openssl/crypto/sha/build.info index 186ec13cc82a..deb7b1530d46 100644 --- a/crypto/openssl/crypto/sha/build.info +++ b/crypto/openssl/crypto/sha/build.info @@ -1,176 +1,177 @@ LIBS=../../libcrypto $SHA1ASM= IF[{- !$disabled{asm} -}] $SHA1ASM_x86=sha1-586.S sha256-586.S 
sha512-586.S $SHA1DEF_x86=SHA1_ASM SHA256_ASM SHA512_ASM $SHA1ASM_x86_64=\ sha1-x86_64.s sha256-x86_64.s sha512-x86_64.s sha1-mb-x86_64.s \ sha256-mb-x86_64.s $SHA1DEF_x86_64=SHA1_ASM SHA256_ASM SHA512_ASM $SHA1ASM_ia64=sha1-ia64.s sha256-ia64.s sha512-ia64.s $SHA1DEF_ia64=SHA1_ASM SHA256_ASM SHA512_ASM $SHA1ASM_sparcv9=sha1-sparcv9.S sha256-sparcv9.S sha512-sparcv9.S $SHA1DEF_sparcv9=SHA1_ASM SHA256_ASM SHA512_ASM $SHA1ASM_alpha=sha1-alpha.S $SHA1DEF_alpha=SHA1_ASM $SHA1ASM_mips32=sha1-mips.S sha256-mips.S $SHA1DEF_mips32=SHA1_ASM SHA256_ASM $SHA1ASM_mips64=$SHA1ASM_mips32 sha512-mips.S $SHA1DEF_mips64=$SHA1DEF_mips32 SHA512_ASM $SHA1ASM_s390x=sha1-s390x.S sha256-s390x.S sha512-s390x.S $SHA1DEF_s390x=SHA1_ASM SHA256_ASM SHA512_ASM $SHA1ASM_armv4=sha1-armv4-large.S sha256-armv4.S sha512-armv4.S $SHA1DEF_armv4=SHA1_ASM SHA256_ASM SHA512_ASM $SHA1ASM_aarch64=sha1-armv8.S sha256-armv8.S sha512-armv8.S $SHA1DEF_aarch64=SHA1_ASM SHA256_ASM SHA512_ASM $SHA1ASM_parisc11=sha1-parisc.s sha256-parisc.s sha512-parisc.s $SHA1DEF_parisc11=SHA1_ASM SHA256_ASM SHA512_ASM $SHA1ASM_parisc20_64=$SHA1ASM_parisc11 $SHA1DEF_parisc20_64=$SHA1DEF_parisc11 $SHA1ASM_ppc32=\ sha_ppc.c sha1-ppc.s sha256-ppc.s sha512-ppc.s sha256p8-ppc.s \ sha512p8-ppc.s $SHA1DEF_ppc32=SHA1_ASM SHA256_ASM SHA512_ASM $SHA1ASM_ppc64=$SHA1ASM_ppc32 $SHA1DEF_ppc64=$SHA1DEF_ppc32 $SHA1ASM_c64xplus=sha1-c64xplus.s sha256-c64xplus.s sha512-c64xplus.s $SHA1DEF_c64xplus=SHA1_ASM SHA256_ASM SHA512_ASM # Now that we have defined all the arch specific variables, use the # appropriate one, and define the appropriate macros IF[$SHA1ASM_{- $target{asm_arch} -}] $SHA1ASM=$SHA1ASM_{- $target{asm_arch} -} $SHA1DEF=$SHA1DEF_{- $target{asm_arch} -} ENDIF ENDIF $KECCAK1600ASM=keccak1600.c IF[{- !$disabled{asm} -}] $KECCAK1600ASM_x86= $KECCAK1600ASM_x86_64=keccak1600-x86_64.s $KECCAK1600ASM_s390x=keccak1600-s390x.S $KECCAK1600ASM_armv4=keccak1600-armv4.S $KECCAK1600ASM_aarch64=keccak1600-armv8.S $KECCAK1600ASM_ppc64=keccak1600-ppc64.s # Now that we have defined all the arch specific variables, use the # appropriate one, and define the appropriate macros IF[$KECCAK1600ASM_{- $target{asm_arch} -}] $KECCAK1600ASM=$KECCAK1600ASM_{- $target{asm_arch} -} $KECCAK1600DEF=KECCAK1600_ASM ENDIF ENDIF $COMMON=sha1dgst.c sha256.c sha512.c sha3.c $SHA1ASM $KECCAK1600ASM SOURCE[../../libcrypto]=$COMMON sha1_one.c SOURCE[../../providers/libfips.a]= $COMMON # Implementations are now spread across several libraries, so the defines # need to be applied to all affected libraries and modules. DEFINE[../../libcrypto]=$SHA1DEF $KECCAK1600DEF DEFINE[../../providers/libfips.a]=$SHA1DEF $KECCAK1600DEF DEFINE[../../providers/libdefault.a]=$SHA1DEF $KECCAK1600DEF # We only need to include the SHA1DEF and KECCAK1600DEF stuff in the # legacy provider when it's a separate module and it's dynamically # linked with libcrypto. Otherwise, it already gets everything that # the static libcrypto.a has, and doesn't need it added again. 
IF[{- !$disabled{module} && !$disabled{shared} -}] DEFINE[../../providers/liblegacy.a]=$SHA1DEF $KECCAK1600DEF ENDIF GENERATE[sha1-586.S]=asm/sha1-586.pl DEPEND[sha1-586.S]=../perlasm/x86asm.pl GENERATE[sha256-586.S]=asm/sha256-586.pl DEPEND[sha256-586.S]=../perlasm/x86asm.pl GENERATE[sha512-586.S]=asm/sha512-586.pl DEPEND[sha512-586.S]=../perlasm/x86asm.pl GENERATE[sha1-ia64.s]=asm/sha1-ia64.pl GENERATE[sha256-ia64.s]=asm/sha512-ia64.pl GENERATE[sha512-ia64.s]=asm/sha512-ia64.pl GENERATE[sha1-alpha.S]=asm/sha1-alpha.pl GENERATE[sha1-x86_64.s]=asm/sha1-x86_64.pl GENERATE[sha1-mb-x86_64.s]=asm/sha1-mb-x86_64.pl GENERATE[sha256-x86_64.s]=asm/sha512-x86_64.pl GENERATE[sha256-mb-x86_64.s]=asm/sha256-mb-x86_64.pl GENERATE[sha512-x86_64.s]=asm/sha512-x86_64.pl GENERATE[keccak1600-x86_64.s]=asm/keccak1600-x86_64.pl GENERATE[sha1-sparcv9a.S]=asm/sha1-sparcv9a.pl GENERATE[sha1-sparcv9.S]=asm/sha1-sparcv9.pl INCLUDE[sha1-sparcv9.o]=.. GENERATE[sha256-sparcv9.S]=asm/sha512-sparcv9.pl INCLUDE[sha256-sparcv9.o]=.. GENERATE[sha512-sparcv9.S]=asm/sha512-sparcv9.pl INCLUDE[sha512-sparcv9.o]=.. GENERATE[sha1-ppc.s]=asm/sha1-ppc.pl GENERATE[sha256-ppc.s]=asm/sha512-ppc.pl GENERATE[sha512-ppc.s]=asm/sha512-ppc.pl GENERATE[sha256p8-ppc.s]=asm/sha512p8-ppc.pl GENERATE[sha512p8-ppc.s]=asm/sha512p8-ppc.pl GENERATE[keccak1600-ppc64.s]=asm/keccak1600-ppc64.pl GENERATE[sha1-parisc.s]=asm/sha1-parisc.pl GENERATE[sha256-parisc.s]=asm/sha512-parisc.pl GENERATE[sha512-parisc.s]=asm/sha512-parisc.pl GENERATE[sha1-mips.S]=asm/sha1-mips.pl INCLUDE[sha1-mips.o]=.. GENERATE[sha256-mips.S]=asm/sha512-mips.pl INCLUDE[sha256-mips.o]=.. GENERATE[sha512-mips.S]=asm/sha512-mips.pl INCLUDE[sha512-mips.o]=.. GENERATE[sha1-armv4-large.S]=asm/sha1-armv4-large.pl INCLUDE[sha1-armv4-large.o]=.. GENERATE[sha256-armv4.S]=asm/sha256-armv4.pl INCLUDE[sha256-armv4.o]=.. GENERATE[sha512-armv4.S]=asm/sha512-armv4.pl INCLUDE[sha512-armv4.o]=.. GENERATE[keccak1600-armv4.S]=asm/keccak1600-armv4.pl INCLUDE[keccak1600-armv4.o]=.. GENERATE[sha1-armv8.S]=asm/sha1-armv8.pl INCLUDE[sha1-armv8.o]=.. GENERATE[sha256-armv8.S]=asm/sha512-armv8.pl INCLUDE[sha256-armv8.o]=.. GENERATE[sha512-armv8.S]=asm/sha512-armv8.pl INCLUDE[sha512-armv8.o]=.. GENERATE[keccak1600-armv8.S]=asm/keccak1600-armv8.pl +INCLUDE[keccak1600-armv8.o]=.. GENERATE[sha1-s390x.S]=asm/sha1-s390x.pl INCLUDE[sha1-s390x.o]=.. GENERATE[sha256-s390x.S]=asm/sha512-s390x.pl INCLUDE[sha256-s390x.o]=.. GENERATE[sha512-s390x.S]=asm/sha512-s390x.pl INCLUDE[sha512-s390x.o]=.. GENERATE[keccak1600-s390x.S]=asm/keccak1600-s390x.pl GENERATE[sha1-c64xplus.S]=asm/sha1-c64xplus.pl GENERATE[sha256-c64xplus.S]=asm/sha256-c64xplus.pl GENERATE[sha512-c64xplus.S]=asm/sha512-c64xplus.pl GENERATE[keccak1600-c64x.S]=asm/keccak1600-c64x.pl # These are not yet used GENERATE[keccak1600-avx2.S]=asm/keccak1600-avx2.pl GENERATE[keccak1600-avx512.S]=asm/keccak1600-avx512.pl GENERATE[keccak1600-avx512vl.S]=asm/keccak1600-avx512vl.pl GENERATE[keccak1600-mmx.S]=asm/keccak1600-mmx.pl GENERATE[keccak1600p8-ppc.S]=asm/keccak1600p8-ppc.pl GENERATE[sha1-thumb.S]=asm/sha1-thumb.pl
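Throughout the diff, the hard-coded hint encodings (.inst 0xd503233f // paciasp, .inst 0xd50323bf // autiasp) are replaced by the AARCH64_SIGN_LINK_REGISTER, AARCH64_VALIDATE_LINK_REGISTER and AARCH64_VALID_CALL_TARGET macros, and "arm_arch.h" is now included unconditionally so those macros are always visible; the added INCLUDE[keccak1600-armv8.o]=.. line in build.info lets the generated keccak1600-armv8.S find that header. A paraphrased C-preprocessor sketch of how such macros are typically defined, so the BTI/PAuth instructions only appear when the corresponding compiler features are enabled; the authoritative definitions live in crypto/arm_arch.h and the exact guards there may differ:

    /* Sketch only; see crypto/arm_arch.h for the real definitions. */
    #if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
    # define AARCH64_VALID_CALL_TARGET hint #34        /* BTI 'c' */
    #else
    # define AARCH64_VALID_CALL_TARGET                 /* no-op */
    #endif

    #if defined(__ARM_FEATURE_PAC_DEFAULT) && (__ARM_FEATURE_PAC_DEFAULT & 1)
    # define AARCH64_SIGN_LINK_REGISTER hint #25       /* PACIASP */
    # define AARCH64_VALIDATE_LINK_REGISTER hint #29   /* AUTIASP */
    #elif defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
    # define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET
    # define AARCH64_VALIDATE_LINK_REGISTER
    #else
    # define AARCH64_SIGN_LINK_REGISTER
    # define AARCH64_VALIDATE_LINK_REGISTER
    #endif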