Changeset View
Changeset View
Standalone View
Standalone View
head/sys/dev/if_wg/module/crypto/zinc/chacha20/chacha20-arm.pl
Property | Old Value | New Value |
---|---|---|
fbsd:nokeywords | null | ON \ No newline at end of property |
svn:eol-style | null | native \ No newline at end of property |
svn:executable | null | * \ No newline at end of property |
svn:mime-type | null | text/plain \ No newline at end of property |
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# December 2014
#
# ChaCha20 for ARMv4.
#
# September 2018
#
# Improve scalar performance per Eric Biggers' suggestion to eliminate
# separate rotates. This requires b[0..3] and d[0..3] to be maintained
# pre-rotated, hence odd twists prior inner loop and when accumulating
# key material. Since amount of instructions is reduced as result, even
# NEON performance is improved somewhat, most notably by ~9% on low-end
# Cortex-A5/A7. Full unroll was shown to provide even better scalar
# performance on Cortex-A5/A7, naturally at the cost of manyfold size
# increase. We let it be. Oversized code works in benchmarks, but is not
# necessarily optimal in real life, when it's likely to be out-of-cache
# upon entry and evict significant part of cache upon completion.
#
# Performance in cycles per byte out of large buffer.
#
#			IALU/gcc-4.4	1xNEON		3xNEON+1xIALU
#
# Cortex-A5		14.2(*)/+160%	21.8		12.9(**)
# Cortex-A8		10.2(*)/+190%	13.9		6.10
# Cortex-A9		10.8(*)/+150%	14.3		6.50
# Cortex-A15		11.0/+40%	16.0		4.90
# Snapdragon S4		13.9(***)/+90%	13.6		4.90
#
# (*)	most "favourable" result for aligned data on little-endian
#	processor, result for misaligned data is 10-15% lower;
# (**)	pure 4xNEON [with "vertical" layout] was shown to provide ~8%
#	better performance on Cortex-A5/A7, but not on others;
# (***)	it's 17% slower than original, trade-off is considered
#	acceptable, because of improvement on others, specifically
#	+36% on Cortex-A5/A7 and +20% on Cortex-A9;
# Command-line handling (CRYPTOGAMS convention): first argument is the
# "flavour" (e.g. linux32, ios32) and/or the output file name.  Unless
# flavour is absent or "void", pipe our generated perlasm through the
# arm-xlate.pl translator; otherwise write raw output to the file.
# NOTE: web-scrape residue (" | |||||") stripped from every line; logic
# is otherwise unchanged from upstream.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate arm-xlate.pl next to this script or in ../../perlasm.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # All subsequent prints go through the translator.
    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
# Thunk for any otherwise-undefined sub call: the sub's name becomes the
# mnemonic (first underscore turned into a dot, so vadd_i32(...) emits
# "vadd.i32") and the call's last argument gets a '#' immediate prefix
# when it is purely numeric.  The resulting instruction line is appended
# to the global $code buffer.  Simplified x86-style perlasm helper.
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);	# numeric => immediate operand
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
# Scalar-register allocation for the 4x4 ChaCha state:
# @x[i] is the GPR holding state word i; "rx" entries (8..11,13,15) are
# words kept in the stack off-load area rather than a register.
# @t[0..3] (r8-r11) are scratch temporaries.
my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x"));
my @t=map("r$_",(8..11));
# ROUND($a0,$b0,$c0,$d0): build the instruction stream for one pair of
# scalar ChaCha rounds (four interleaved quarter-rounds), starting from
# the given column indices.  Returns a list of Perl strings; each, when
# eval'ed, appends one instruction to $code via the AUTOLOAD thunk.
# b[] and d[] are kept pre-rotated (see September 2018 header note), so
# rotate amounts here are the "twisted" ones, not the canonical
# 16/12/8/7 of RFC 8439.
sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my $odd = $d0&1;
my ($xc,$xc_)=(@t[0..1]);
my ($xd,$xd_)=$odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]);
my @ret;

	# Consider order in which variables are addressed by their
	# index:
	#
	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14
	#
	# 'a', 'b' are permanently allocated in registers, @x[0..7],
	# while 'c's and pair of 'd's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end. If you observe 'd' column, you'll
	# notice that 15 and 13 are reused in next pair of rounds.
	# This is why these two are chosen for offloading to memory,
	# to make loads count more.

	push @ret,(
	"&add	(@x[$a0],@x[$a0],@x[$b0],'ror#13')",
	"&add	(@x[$a1],@x[$a1],@x[$b1],'ror#13')",
	"&eor	($xd,@x[$a0],$xd,'ror#24')",
	"&eor	($xd_,@x[$a1],$xd_,'ror#24')",

	"&add	($xc,$xc,$xd,'ror#16')",
	"&add	($xc_,$xc_,$xd_,'ror#16')",
	"&eor	(@x[$b0],$xc, @x[$b0],'ror#13')",
	"&eor	(@x[$b1],$xc_,@x[$b1],'ror#13')",

	"&add	(@x[$a0],@x[$a0],@x[$b0],'ror#20')",
	"&add	(@x[$a1],@x[$a1],@x[$b1],'ror#20')",
	"&eor	($xd,@x[$a0],$xd,'ror#16')",
	"&eor	($xd_,@x[$a1],$xd_,'ror#16')" );

	# Mid-round spill/reload of the memory-resident 'd' words; which
	# of the pair goes to/from the stack depends on round parity.
	push @ret,(
	"&str	($xd,'[sp,#4*(16+$d0)]')" )		if ($odd);
	push @ret,(
	"&add	($xc,$xc,$xd,'ror#24')" );
	push @ret,(
	"&ldr	($xd,'[sp,#4*(16+$d2)]')" )		if ($odd);
	push @ret,(
	"&str	($xd_,'[sp,#4*(16+$d1)]')" )		if (!$odd);
	push @ret,(
	"&add	($xc_,$xc_,$xd_,'ror#24')" );
	push @ret,(
	"&ldr	($xd_,'[sp,#4*(16+$d3)]')" )		if (!$odd);

	push @ret,(
	"&str	($xc,'[sp,#4*(16+$c0)]')",
	"&eor	(@x[$b0],@x[$b0],$xc,'ror#12')",
	"&str	($xc_,'[sp,#4*(16+$c1)]')",
	"&eor	(@x[$b1],@x[$b1],$xc_,'ror#12')" );

	$xd=@x[$d2]					if (!$odd);
	$xd_=@x[$d3]					if ($odd);

	push @ret,(
	"&ldr	($xc,'[sp,#4*(16+$c2)]')",
	"&add	(@x[$a2],@x[$a2],@x[$b2],'ror#13')",
	"&ldr	($xc_,'[sp,#4*(16+$c3)]')",
	"&add	(@x[$a3],@x[$a3],@x[$b3],'ror#13')",
	"&eor	($xd,@x[$a2],$xd,'ror#24')",
	"&eor	($xd_,@x[$a3],$xd_,'ror#24')",

	"&add	($xc,$xc,$xd,'ror#16')",
	"&add	($xc_,$xc_,$xd_,'ror#16')",
	"&eor	(@x[$b2],$xc, @x[$b2],'ror#13')",
	"&eor	(@x[$b3],$xc_,@x[$b3],'ror#13')",

	"&add	(@x[$a2],@x[$a2],@x[$b2],'ror#20')",
	"&add	(@x[$a3],@x[$a3],@x[$b3],'ror#20')",
	"&eor	($xd,@x[$a2],$xd,'ror#16')",
	"&eor	($xd_,@x[$a3],$xd_,'ror#16')",

	"&add	($xc,$xc,$xd,'ror#24')",
	"&add	($xc_,$xc_,$xd_,'ror#24')",
	"&eor	(@x[$b2],@x[$b2],$xc,'ror#12')",
	"&eor	(@x[$b3],@x[$b3],$xc_,'ror#12')" );

	@ret;
}
# Emit the file prologue and the integer-only ChaCha20_ctr32 entry:
# constants (.Lsigma/.Lone/.Lrot8), optional NEON capability dispatch,
# stack-frame setup (16-word off-load area + copy of sigma|key|counter),
# and the outer-loop head that applies the b[]/d[] pre-rotation "twist"
# before entering the 2-round inner loop.  @x[...]/@t[...] interpolate
# to register names here.  (Scrape residue removed; asm unchanged.)
$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
# define ChaCha20_ctr32 chacha20_arm_cryptogams
# define ChaCha20_neon  chacha20_neon
#endif

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
# define ldrhsb ldrbhs
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
.Lrot8:
.long	0x02010003,0x06050407
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.LChaCha20_ctr32
#else
.word	-1
#endif

.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-24]
	ldr	r4,[r14,r4]
# ifdef	__APPLE__
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4-r7}		@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4-r7}		@ copy counter and nonce
	ldmia	r3,{r4-r11}		@ load key
	ldmia	r14,{r0-r3}		@ load sigma
	stmdb	sp!,{r4-r11}		@ copy key
	stmdb	sp!,{r0-r3}		@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	b	.Loop_outer_enter

.align	4
.Loop_outer:
	ldmia	sp,{r0-r9}		@ load key material
	str	@t[3],[sp,#4*(32+2)]	@ save len
	str	r12, [sp,#4*(32+1)]	@ save inp
	str	r14, [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	@t[3], [sp,#4*(15)]
	mov	@x[4],@x[4],ror#19	@ twist b[0..3]
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	mov	@x[5],@x[5],ror#19
	ldr	@t[2], [sp,#4*(13)]
	mov	@x[6],@x[6],ror#19
	ldr	@x[14],[sp,#4*(14)]
	mov	@x[7],@x[7],ror#19
	mov	@t[3],@t[3],ror#8	@ twist d[0..3]
	mov	@x[12],@x[12],ror#8
	mov	@t[2],@t[2],ror#8
	mov	@x[14],@x[14],ror#8
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop

.align	4
.Loop:
	subs	@t[3],@t[3],#1
___
# Emit the 2-round (one even column round + one odd diagonal round)
# inner-loop body: each eval appends one instruction to $code.
foreach (&ROUND(0, 4, 8,12)) { eval; }
foreach (&ROUND(0, 5,10,15)) { eval; }
# Emit the aligned (word-access) epilogue of the scalar loop: accumulate
# key material into both halves of the 512-bit block (un-twisting b[]
# with ror#13 and d[] with ror#24), XOR with input when len>=64 (hs),
# bump the counter, and either loop, finish, or fall through to the
# byte-at-a-time paths.  (Scrape residue removed; asm unchanged.)
$code.=<<___;
	bne	.Loop

	ldr	@t[3],[sp,#4*(32+2)]	@ load len

	str	@t[0], [sp,#4*(16+8)]	@ modulo-scheduled store
	str	@t[1], [sp,#4*(16+9)]
	str	@x[12],[sp,#4*(16+12)]
	str	@t[2], [sp,#4*(16+13)]
	str	@x[14],[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ @x[0-7] and second half at sp+4*(16+8)

	cmp	@t[3],#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	@t[0],[sp,#4*(0)]	@ load key material
	ldr	@t[1],[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	@t[2],r12,r14
	tst	@t[2],#3		@ are input and output aligned?
	ldr	@t[2],[sp,#4*(2)]
	bne	.Lunaligned
	cmp	@t[3],#64		@ restore flags
# else
	ldr	@t[2],[sp,#4*(2)]
# endif
	ldr	@t[3],[sp,#4*(3)]

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]

	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]	@ xor with input
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(4)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[1],[r14,#-12]
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@t[0],@x[4],ror#13 @ accumulate key material
	add	@x[5],@t[1],@x[5],ror#13
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@t[2],@x[6],ror#13
	add	@x[7],@t[3],@x[7],ror#13
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	add	@t[0],sp,#4*(8)
	str	@x[4],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[6],@x[6],@t[2]
	eorhs	@x[7],@x[7],@t[3]
	str	@x[5],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[6],[r14,#-8]
	add	@x[0],sp,#4*(16+8)
	str	@x[7],[r14,#-4]

	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half

	add	@x[0],@x[0],@t[0]	@ accumulate key material
	add	@x[1],@x[1],@t[1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]" while at it
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]" while at it
	add	@x[2],@x[2],@t[2]
	add	@x[3],@x[3],@t[3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[0],@x[0]
	rev	@x[1],@x[1]
	rev	@x[2],@x[2]
	rev	@x[3],@x[3]
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[0],@x[0],@t[0]
	eorhs	@x[1],@x[1],@t[1]
	add	@t[0],sp,#4*(12)
	str	@x[0],[r14],#16		@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[2],@x[2],@t[2]
	eorhs	@x[3],@x[3],@t[3]
	str	@x[1],[r14,#-12]
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
	str	@x[2],[r14,#-8]
	str	@x[3],[r14,#-4]

	add	@x[4],@t[0],@x[4],ror#24 @ accumulate key material
	add	@x[5],@t[1],@x[5],ror#24
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[0],[r12],#16		@ load input
	ldrhs	@t[1],[r12,#-12]
	add	@x[6],@t[2],@x[6],ror#24
	add	@x[7],@t[3],@x[7],ror#24
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	@t[2],[r12,#-8]
	ldrhs	@t[3],[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	@x[4],@x[4]
	rev	@x[5],@x[5]
	rev	@x[6],@x[6]
	rev	@x[7],@x[7]
# endif
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	@x[4],@x[4],@t[0]
	eorhs	@x[5],@x[5],@t[1]
	str	@x[4],[r14],#16		@ store output
	str	@x[5],[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	str	@x[6],[r14,#-8]
	str	@x[7],[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

.align	4
.Lunaligned:				@ unaligned endian-neutral path
	cmp	@t[3],#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	@t[3],[sp,#4*(3)]
___
# Emit the unaligned / endian-neutral path: for each group of four state
# words, accumulate key material (with the appropriate un-twist for the
# b[] and d[] quadrants), then XOR and store byte-by-byte, shifting each
# word down 8 bits per pass.  On the lo (len<64) path the key-material
# temporaries are zeroed so the raw keystream is written to the stack
# buffer instead.  (Scrape residue removed; asm unchanged.)
for ($i=0;$i<16;$i+=4) {
my $j=$i&0x7;
my $twist="";
if ($i==4)     { $twist = ",ror#13"; }	# un-twist b[0..3]
elsif ($i==12) { $twist = ",ror#24"; }	# un-twist d[0..3]

$code.=<<___	if ($i==4);
	add	@x[0],sp,#4*(16+8)
___
$code.=<<___	if ($i==8);
	ldmia	@x[0],{@x[0]-@x[7]}	@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	@t[2],[sp,#4*(16+10)]	@ copy "@x[10]"
	strhi	@t[3],[sp,#4*(16+11)]	@ copy "@x[11]"
___
$code.=<<___;
	add	@x[$j+0],@t[0],@x[$j+0]$twist @ accumulate key material
___
$code.=<<___	if ($i==12);
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	@t[0],@t[0],#1		@ next counter value
	strhi	@t[0],[sp,#4*(12)]	@ save next counter value
___
$code.=<<___;
	add	@x[$j+1],@t[1],@x[$j+1]$twist
	add	@x[$j+2],@t[2],@x[$j+2]$twist
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[0],@t[0],@t[0]	@ zero or ...
	ldrhsb	@t[0],[r12],#16		@ ... load input
	eorlo	@t[1],@t[1],@t[1]
	ldrhsb	@t[1],[r12,#-12]

	add	@x[$j+3],@t[3],@x[$j+3]$twist
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	@t[2],@t[2],@t[2]
	ldrhsb	@t[2],[r12,#-8]
	eorlo	@t[3],@t[3],@t[3]
	ldrhsb	@t[3],[r12,#-4]

	eor	@x[$j+0],@t[0],@x[$j+0]	@ xor with input (or zero)
	eor	@x[$j+1],@t[1],@x[$j+1]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-15]	@ load more input
	ldrhsb	@t[1],[r12,#-11]
	eor	@x[$j+2],@t[2],@x[$j+2]
	strb	@x[$j+0],[r14],#16	@ store output
	eor	@x[$j+3],@t[3],@x[$j+3]
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-7]
	ldrhsb	@t[3],[r12,#-3]
	strb	@x[$j+1],[r14,#-12]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-8]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-14]	@ load more input
	ldrhsb	@t[1],[r12,#-10]
	strb	@x[$j+3],[r14,#-4]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-15]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-6]
	ldrhsb	@t[3],[r12,#-2]
	strb	@x[$j+1],[r14,#-11]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+2],[r14,#-7]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[0],[r12,#-13]	@ load more input
	ldrhsb	@t[1],[r12,#-9]
	strb	@x[$j+3],[r14,#-3]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+0],[r14,#-14]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	@t[2],[r12,#-5]
	ldrhsb	@t[3],[r12,#-1]
	strb	@x[$j+1],[r14,#-10]
	strb	@x[$j+2],[r14,#-6]
	eor	@x[$j+0],@t[0],@x[$j+0],lsr#8
	strb	@x[$j+3],[r14,#-2]
	eor	@x[$j+1],@t[1],@x[$j+1],lsr#8
	strb	@x[$j+0],[r14,#-13]
	eor	@x[$j+2],@t[2],@x[$j+2],lsr#8
	strb	@x[$j+1],[r14,#-9]
	eor	@x[$j+3],@t[3],@x[$j+3],lsr#8
	strb	@x[$j+2],[r14,#-5]
	strb	@x[$j+3],[r14,#-1]
___
$code.=<<___	if ($i<12);
	add	@t[0],sp,#4*(4+$i)
	ldmia	@t[0],{@t[0]-@t[3]}	@ load key material
___
}
# Emit the end of the unaligned path (loop/done decision), the
# byte-at-a-time tail loop (XOR remaining input against the keystream
# buffered on the stack), and the function epilogue, including the
# ARMv4/Thumb-interworking return.  (Scrape residue removed.)
$code.=<<___;
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	@t[0],[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	@t[3],@t[0],#64		@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	@t[1],sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	@t[2],[@t[1]],#1	@ read buffer on stack
	ldrb	@t[3],[r12],#1		@ read input
	subs	@t[0],@t[0],#1
	eor	@t[3],@t[3],@t[2]
	strb	@t[3],[r14],#1		@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.long	0xe12fff1e		@ interoperable with Thumb ISA:-)
#endif
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___
# NEON section: open a fresh lexical scope (closed with }}} past this
# chunk) and allocate all sixteen q-registers: three parallel 4x4
# states ($aN/$bN/$cN/$dN for N=0..2) plus four temporaries ($t0..$t3).
{{{
my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) =
    map("q$_",(0..15));
# This can replace vshr-by-24+vsli-by-8. It gives ~3% improvement on
# Cortex-A5/A7, but hurts Cortex-A9 by 5% and Snapdragon S4 by 14%!
#
# vperm($dst,$src,$tbl): append two vtbl.8 lines to $code performing a
# byte permutation of $src into $dst using table $tbl (the "#lo"/"#hi"
# d-register halves are resolved later by arm-xlate.pl).  Currently
# only referenced from a commented-out NEONROUND entry.
sub vperm()
{ my ($dst,$src,$tbl) = @_;
    $code .= "	vtbl.8	$dst#lo,{$src#lo},$tbl#lo\n";
    $code .= "	vtbl.8	$dst#hi,{$src#hi},$tbl#lo\n";
}
# NEONROUND($a,$b,$c,$d,$t,$odd): build one NEON ChaCha quarter-round
# over a whole 4x4 state held in q-registers $a/$b/$c/$d ($t is a
# temporary).  Rotates by 16 use vrev32.16; 12, 8 and 7 use the
# vshr+vsli pair.  The trailing vext.8 trio re-diagonalizes the state:
# $odd selects the forward (odd) or inverse (even) lane rotation of the
# 'a' and 'c' rows.  Returns a list of strings to be eval'ed (each
# appends one instruction to $code via AUTOLOAD).
sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&vadd_i32	($a,$a,$b)",
	"&veor		($d,$d,$a)",
	"&vrev32_16	($d,$d)",	# vrot ($d,16)

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,20)",
	"&vsli_32	($b,$t,12)",

	"&vadd_i32	($a,$a,$b)",
	"&veor		($t,$d,$a)",
	"&vshr_u32	($d,$t,24)",
	"&vsli_32	($d,$t,8)",
	#"&vperm	($d,$t,$t3)",

	"&vadd_i32	($c,$c,$d)",
	"&veor		($t,$b,$c)",
	"&vshr_u32	($b,$t,25)",
	"&vsli_32	($b,$t,7)",

	"&vext_8	($a,$a,$a,$odd?4:12)",
	"&vext_8	($d,$d,$d,8)",
	"&vext_8	($c,$c,$c,$odd?12:4)"
	);
}
# Emit the NEON entry (3xNEON+1xIALU layout): save d8-d15 per AAPCS,
# stage sigma|key|counter|nonce plus the one/two/four increment
# constants on the stack, replicate the state into three NEON "threads"
# with counters +0/+1/+2 (the scalar thread handles counter+3), then
# the outer-loop head with the same scalar b[]/d[] twist as the
# integer-only path.  (Scrape residue removed; asm unchanged.)
$code.=<<___;
#if (defined(__KERNEL__) && defined(CONFIG_KERNEL_MODE_NEON)) || (!defined(__KERNEL__) && __ARM_MAX_ARCH__>=7)
.arch	armv7-a
.fpu	neon

# ifdef __KERNEL__
.globl	ChaCha20_neon
@ For optimal performance it's appropriate for caller to enforce
@ minimum input length, 193 bytes is suggested.
# endif
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0-r2,r4-r11,lr}
.LChaCha20_neon:
	adr	r14,.Lsigma
	vstmdb	sp!,{d8-d15}		@ ABI spec says so
	stmdb	sp!,{r0-r3}

	vld1.32	{$b0-$c0},[r3]		@ load key
	ldmia	r3,{r4-r11}		@ load key

	sub	sp,sp,#4*(16+16)
	vld1.32	{$d0},[r12]		@ load counter and nonce
	add	r12,sp,#4*8
	ldmia	r14,{r0-r3}		@ load sigma
	vld1.32	{$a0},[r14]!		@ load sigma
	vld1.32	{$t0},[r14]!		@ one
	@ vld1.32	{$t3#lo},[r14]	@ rot8
	vst1.32	{$c0-$d0},[r12]		@ copy 1/2key|counter|nonce
	vst1.32	{$a0-$b0},[sp]		@ copy sigma|1/2key

	str	r10,[sp,#4*(16+10)]	@ off-load "@x[10]"
	str	r11,[sp,#4*(16+11)]	@ off-load "@x[11]"
	vshl.i32	$t1#lo,$t0#lo,#1	@ two
	vstr	$t0#lo,[sp,#4*(16+0)]
	vshl.i32	$t2#lo,$t0#lo,#2	@ four
	vstr	$t1#lo,[sp,#4*(16+2)]
	vmov	$a1,$a0
	vstr	$t2#lo,[sp,#4*(16+4)]
	vmov	$a2,$a0
	@ vstr	$t3#lo,[sp,#4*(16+6)]
	vmov	$b1,$b0
	vmov	$b2,$b0
	b	.Loop_neon_enter

.align	4
.Loop_neon_outer:
	ldmia	sp,{r0-r9}		@ load key material
	cmp	@t[3],#64*2		@ if len<=64*2
	bls	.Lbreak_neon		@ switch to integer-only
	@ vldr	$t3#lo,[sp,#4*(16+6)]	@ rot8
	vmov	$a1,$a0
	str	@t[3],[sp,#4*(32+2)]	@ save len
	vmov	$a2,$a0
	str	r12, [sp,#4*(32+1)]	@ save inp
	vmov	$b1,$b0
	str	r14, [sp,#4*(32+0)]	@ save out
	vmov	$b2,$b0
.Loop_neon_enter:
	ldr	@t[3], [sp,#4*(15)]
	mov	@x[4],@x[4],ror#19	@ twist b[0..3]
	vadd.i32	$d1,$d0,$t0		@ counter+1
	ldr	@x[12],[sp,#4*(12)]	@ modulo-scheduled load
	mov	@x[5],@x[5],ror#19
	vmov	$c1,$c0
	ldr	@t[2], [sp,#4*(13)]
	mov	@x[6],@x[6],ror#19
	vmov	$c2,$c0
	ldr	@x[14],[sp,#4*(14)]
	mov	@x[7],@x[7],ror#19
	vadd.i32	$d2,$d1,$t0		@ counter+2
	add	@x[12],@x[12],#3	@ counter+3
	mov	@t[3],@t[3],ror#8	@ twist d[0..3]
	mov	@x[12],@x[12],ror#8
	mov	@t[2],@t[2],ror#8
	mov	@x[14],@x[14],ror#8
	str	@t[3], [sp,#4*(16+15)]
	mov	@t[3],#10
	b	.Loop_neon

.align	4
.Loop_neon:
	subs	@t[3],@t[3],#1
___
# Interleave instruction emission for the four concurrent "threads":
# three NEON quarter-rounds and one scalar 2-round stream (three scalar
# instructions per NEON instruction), first the even (column) rounds,
# then the odd (diagonal) rounds.
my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0);
my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0);
my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0);
my @thread3=&ROUND(0,4,8,12);

foreach (@thread0) {
	eval;			eval(shift(@thread3));
	eval(shift(@thread1));	eval(shift(@thread3));
	eval(shift(@thread2));	eval(shift(@thread3));
}

@thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1);
@thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1);
@thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1);
@thread3=&ROUND(0,5,10,15);

foreach (@thread0) {
	eval;			eval(shift(@thread3));
	eval(shift(@thread1));	eval(shift(@thread3));
	eval(shift(@thread2));	eval(shift(@thread3));
}
$code.=<<___; | |||||
bne .Loop_neon | |||||
add @t[3],sp,#32 | |||||
vld1.32 {$t0-$t1},[sp] @ load key material | |||||
vld1.32 {$t2-$t3},[@t[3]] | |||||
ldr @t[3],[sp,#4*(32+2)] @ load len | |||||
str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store | |||||
str @t[1], [sp,#4*(16+9)] | |||||
str @x[12],[sp,#4*(16+12)] | |||||
str @t[2], [sp,#4*(16+13)] | |||||
str @x[14],[sp,#4*(16+14)] | |||||
@ at this point we have first half of 512-bit result in | |||||
@ @x[0-7] and second half at sp+4*(16+8) | |||||
ldr r12,[sp,#4*(32+1)] @ load inp | |||||
ldr r14,[sp,#4*(32+0)] @ load out | |||||
vadd.i32 $a0,$a0,$t0 @ accumulate key material | |||||
vadd.i32 $a1,$a1,$t0 | |||||
vadd.i32 $a2,$a2,$t0 | |||||
vldr $t0#lo,[sp,#4*(16+0)] @ one | |||||
vadd.i32 $b0,$b0,$t1 | |||||
vadd.i32 $b1,$b1,$t1 | |||||
vadd.i32 $b2,$b2,$t1 | |||||
vldr $t1#lo,[sp,#4*(16+2)] @ two | |||||
vadd.i32 $c0,$c0,$t2 | |||||
vadd.i32 $c1,$c1,$t2 | |||||
vadd.i32 $c2,$c2,$t2 | |||||
vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1 | |||||
vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2 | |||||
vadd.i32 $d0,$d0,$t3 | |||||
vadd.i32 $d1,$d1,$t3 | |||||
vadd.i32 $d2,$d2,$t3 | |||||
cmp @t[3],#64*4 | |||||
blo .Ltail_neon | |||||
vld1.8 {$t0-$t1},[r12]! @ load input | |||||
mov @t[3],sp | |||||
vld1.8 {$t2-$t3},[r12]! | |||||
veor $a0,$a0,$t0 @ xor with input | |||||
veor $b0,$b0,$t1 | |||||
vld1.8 {$t0-$t1},[r12]! | |||||
veor $c0,$c0,$t2 | |||||
veor $d0,$d0,$t3 | |||||
vld1.8 {$t2-$t3},[r12]! | |||||
veor $a1,$a1,$t0 | |||||
vst1.8 {$a0-$b0},[r14]! @ store output | |||||
veor $b1,$b1,$t1 | |||||
vld1.8 {$t0-$t1},[r12]! | |||||
veor $c1,$c1,$t2 | |||||
vst1.8 {$c0-$d0},[r14]! | |||||
veor $d1,$d1,$t3 | |||||
vld1.8 {$t2-$t3},[r12]! | |||||
veor $a2,$a2,$t0 | |||||
vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration | |||||
veor $t0#hi,$t0#hi,$t0#hi | |||||
vldr $t0#lo,[sp,#4*(16+4)] @ four | |||||
veor $b2,$b2,$t1 | |||||
vld1.32 {$c0-$d0},[@t[3]] | |||||
veor $c2,$c2,$t2 | |||||
vst1.8 {$a1-$b1},[r14]! | |||||
veor $d2,$d2,$t3 | |||||
vst1.8 {$c1-$d1},[r14]! | |||||
vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value | |||||
vldr $t0#lo,[sp,#4*(16+0)] @ one | |||||
ldmia sp,{@t[0]-@t[3]} @ load key material | |||||
add @x[0],@x[0],@t[0] @ accumulate key material | |||||
ldr @t[0],[r12],#16 @ load input | |||||
vst1.8 {$a2-$b2},[r14]! | |||||
add @x[1],@x[1],@t[1] | |||||
ldr @t[1],[r12,#-12] | |||||
vst1.8 {$c2-$d2},[r14]! | |||||
add @x[2],@x[2],@t[2] | |||||
ldr @t[2],[r12,#-8] | |||||
add @x[3],@x[3],@t[3] | |||||
ldr @t[3],[r12,#-4] | |||||
# ifdef __ARMEB__ | |||||
rev @x[0],@x[0] | |||||
rev @x[1],@x[1] | |||||
rev @x[2],@x[2] | |||||
rev @x[3],@x[3] | |||||
# endif | |||||
eor @x[0],@x[0],@t[0] @ xor with input | |||||
add @t[0],sp,#4*(4) | |||||
eor @x[1],@x[1],@t[1] | |||||
str @x[0],[r14],#16 @ store output | |||||
eor @x[2],@x[2],@t[2] | |||||
str @x[1],[r14,#-12] | |||||
eor @x[3],@x[3],@t[3] | |||||
ldmia @t[0],{@t[0]-@t[3]} @ load key material | |||||
str @x[2],[r14,#-8] | |||||
str @x[3],[r14,#-4] | |||||
add @x[4],@t[0],@x[4],ror#13 @ accumulate key material | |||||
ldr @t[0],[r12],#16 @ load input | |||||
add @x[5],@t[1],@x[5],ror#13 | |||||
ldr @t[1],[r12,#-12] | |||||
add @x[6],@t[2],@x[6],ror#13 | |||||
ldr @t[2],[r12,#-8] | |||||
add @x[7],@t[3],@x[7],ror#13 | |||||
ldr @t[3],[r12,#-4] | |||||
# ifdef __ARMEB__ | |||||
rev @x[4],@x[4] | |||||
rev @x[5],@x[5] | |||||
rev @x[6],@x[6] | |||||
rev @x[7],@x[7] | |||||
# endif | |||||
eor @x[4],@x[4],@t[0] | |||||
add @t[0],sp,#4*(8) | |||||
eor @x[5],@x[5],@t[1] | |||||
str @x[4],[r14],#16 @ store output | |||||
eor @x[6],@x[6],@t[2] | |||||
str @x[5],[r14,#-12] | |||||
eor @x[7],@x[7],@t[3] | |||||
ldmia @t[0],{@t[0]-@t[3]} @ load key material | |||||
str @x[6],[r14,#-8] | |||||
add @x[0],sp,#4*(16+8) | |||||
str @x[7],[r14,#-4] | |||||
ldmia @x[0],{@x[0]-@x[7]} @ load second half | |||||
add @x[0],@x[0],@t[0] @ accumulate key material | |||||
ldr @t[0],[r12],#16 @ load input | |||||
add @x[1],@x[1],@t[1] | |||||
ldr @t[1],[r12,#-12] | |||||
# ifdef __thumb2__ | |||||
it hi | |||||
# endif | |||||
strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it | |||||
add @x[2],@x[2],@t[2] | |||||
ldr @t[2],[r12,#-8] | |||||
# ifdef __thumb2__ | |||||
it hi | |||||
# endif | |||||
strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it | |||||
add @x[3],@x[3],@t[3] | |||||
ldr @t[3],[r12,#-4] | |||||
# ifdef __ARMEB__ | |||||
rev @x[0],@x[0] | |||||
rev @x[1],@x[1] | |||||
rev @x[2],@x[2] | |||||
rev @x[3],@x[3] | |||||
# endif | |||||
eor @x[0],@x[0],@t[0] | |||||
add @t[0],sp,#4*(12) | |||||
eor @x[1],@x[1],@t[1] | |||||
str @x[0],[r14],#16 @ store output | |||||
eor @x[2],@x[2],@t[2] | |||||
str @x[1],[r14,#-12] | |||||
eor @x[3],@x[3],@t[3] | |||||
ldmia @t[0],{@t[0]-@t[3]} @ load key material | |||||
str @x[2],[r14,#-8] | |||||
str @x[3],[r14,#-4] | |||||
add @x[4],@t[0],@x[4],ror#24 @ accumulate key material | |||||
add @t[0],@t[0],#4 @ next counter value | |||||
add @x[5],@t[1],@x[5],ror#24 | |||||
str @t[0],[sp,#4*(12)] @ save next counter value | |||||
ldr @t[0],[r12],#16 @ load input | |||||
add @x[6],@t[2],@x[6],ror#24 | |||||
add @x[4],@x[4],#3 @ counter+3 | |||||
ldr @t[1],[r12,#-12] | |||||
add @x[7],@t[3],@x[7],ror#24 | |||||
ldr @t[2],[r12,#-8] | |||||
ldr @t[3],[r12,#-4] | |||||
# ifdef __ARMEB__ | |||||
rev @x[4],@x[4] | |||||
rev @x[5],@x[5] | |||||
rev @x[6],@x[6] | |||||
rev @x[7],@x[7] | |||||
# endif | |||||
eor @x[4],@x[4],@t[0] | |||||
# ifdef __thumb2__ | |||||
it hi | |||||
# endif | |||||
ldrhi @t[0],[sp,#4*(32+2)] @ re-load len | |||||
eor @x[5],@x[5],@t[1] | |||||
eor @x[6],@x[6],@t[2] | |||||
str @x[4],[r14],#16 @ store output | |||||
eor @x[7],@x[7],@t[3] | |||||
str @x[5],[r14,#-12] | |||||
sub @t[3],@t[0],#64*4 @ len-=64*4 | |||||
str @x[6],[r14,#-8] | |||||
str @x[7],[r14,#-4] | |||||
bhi .Loop_neon_outer | |||||
b .Ldone_neon | |||||
.align 4 | |||||
.Lbreak_neon: | |||||
@ harmonize NEON and integer-only stack frames: load data | |||||
@ from NEON frame, but save to integer-only one; distance | |||||
@ between the two is 4*(32+4+16-32)=4*(20). | |||||
str @t[3], [sp,#4*(20+32+2)] @ save len | |||||
add @t[3],sp,#4*(32+4) | |||||
str r12, [sp,#4*(20+32+1)] @ save inp | |||||
str r14, [sp,#4*(20+32+0)] @ save out | |||||
ldr @x[12],[sp,#4*(16+10)] | |||||
ldr @x[14],[sp,#4*(16+11)] | |||||
vldmia @t[3],{d8-d15} @ fulfill ABI requirement | |||||
str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]" | |||||
str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]" | |||||
ldr @t[3], [sp,#4*(15)] | |||||
mov @x[4],@x[4],ror#19 @ twist b[0..3] | |||||
ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load | |||||
mov @x[5],@x[5],ror#19 | |||||
ldr @t[2], [sp,#4*(13)] | |||||
mov @x[6],@x[6],ror#19 | |||||
ldr @x[14],[sp,#4*(14)] | |||||
mov @x[7],@x[7],ror#19 | |||||
mov @t[3],@t[3],ror#8 @ twist d[0..3] | |||||
mov @x[12],@x[12],ror#8 | |||||
mov @t[2],@t[2],ror#8 | |||||
mov @x[14],@x[14],ror#8 | |||||
str @t[3], [sp,#4*(20+16+15)] | |||||
add @t[3],sp,#4*(20) | |||||
vst1.32 {$a0-$b0},[@t[3]]! @ copy key | |||||
add sp,sp,#4*(20) @ switch frame | |||||
vst1.32 {$c0-$d0},[@t[3]] | |||||
mov @t[3],#10 | |||||
b .Loop @ go integer-only | |||||
.align 4 | |||||
.Ltail_neon: | |||||
cmp @t[3],#64*3 | |||||
bhs .L192_or_more_neon | |||||
cmp @t[3],#64*2 | |||||
bhs .L128_or_more_neon | |||||
cmp @t[3],#64*1 | |||||
bhs .L64_or_more_neon | |||||
add @t[0],sp,#4*(8) | |||||
vst1.8 {$a0-$b0},[sp] | |||||
add @t[2],sp,#4*(0) | |||||
vst1.8 {$c0-$d0},[@t[0]] | |||||
b .Loop_tail_neon | |||||
.align 4 | |||||
.L64_or_more_neon: | |||||
vld1.8 {$t0-$t1},[r12]! | |||||
vld1.8 {$t2-$t3},[r12]! | |||||
veor $a0,$a0,$t0 | |||||
veor $b0,$b0,$t1 | |||||
veor $c0,$c0,$t2 | |||||
veor $d0,$d0,$t3 | |||||
vst1.8 {$a0-$b0},[r14]! | |||||
vst1.8 {$c0-$d0},[r14]! | |||||
beq .Ldone_neon | |||||
add @t[0],sp,#4*(8) | |||||
vst1.8 {$a1-$b1},[sp] | |||||
add @t[2],sp,#4*(0) | |||||
vst1.8 {$c1-$d1},[@t[0]] | |||||
sub @t[3],@t[3],#64*1 @ len-=64*1 | |||||
b .Loop_tail_neon | |||||
.align 4 | |||||
.L128_or_more_neon: | |||||
vld1.8 {$t0-$t1},[r12]! | |||||
vld1.8 {$t2-$t3},[r12]! | |||||
veor $a0,$a0,$t0 | |||||
veor $b0,$b0,$t1 | |||||
vld1.8 {$t0-$t1},[r12]! | |||||
veor $c0,$c0,$t2 | |||||
veor $d0,$d0,$t3 | |||||
vld1.8 {$t2-$t3},[r12]! | |||||
veor $a1,$a1,$t0 | |||||
veor $b1,$b1,$t1 | |||||
vst1.8 {$a0-$b0},[r14]! | |||||
veor $c1,$c1,$t2 | |||||
vst1.8 {$c0-$d0},[r14]! | |||||
veor $d1,$d1,$t3 | |||||
vst1.8 {$a1-$b1},[r14]! | |||||
vst1.8 {$c1-$d1},[r14]! | |||||
beq .Ldone_neon | |||||
add @t[0],sp,#4*(8) | |||||
vst1.8 {$a2-$b2},[sp] | |||||
add @t[2],sp,#4*(0) | |||||
vst1.8 {$c2-$d2},[@t[0]] | |||||
sub @t[3],@t[3],#64*2 @ len-=64*2 | |||||
b .Loop_tail_neon | |||||
.align 4 | |||||
.L192_or_more_neon: | |||||
vld1.8 {$t0-$t1},[r12]! | |||||
vld1.8 {$t2-$t3},[r12]! | |||||
veor $a0,$a0,$t0 | |||||
veor $b0,$b0,$t1 | |||||
vld1.8 {$t0-$t1},[r12]! | |||||
veor $c0,$c0,$t2 | |||||
veor $d0,$d0,$t3 | |||||
vld1.8 {$t2-$t3},[r12]! | |||||
veor $a1,$a1,$t0 | |||||
veor $b1,$b1,$t1 | |||||
vld1.8 {$t0-$t1},[r12]! | |||||
veor $c1,$c1,$t2 | |||||
vst1.8 {$a0-$b0},[r14]! | |||||
veor $d1,$d1,$t3 | |||||
vld1.8 {$t2-$t3},[r12]! | |||||
veor $a2,$a2,$t0 | |||||
vst1.8 {$c0-$d0},[r14]! | |||||
veor $b2,$b2,$t1 | |||||
vst1.8 {$a1-$b1},[r14]! | |||||
veor $c2,$c2,$t2 | |||||
vst1.8 {$c1-$d1},[r14]! | |||||
veor $d2,$d2,$t3 | |||||
vst1.8 {$a2-$b2},[r14]! | |||||
vst1.8 {$c2-$d2},[r14]! | |||||
beq .Ldone_neon | |||||
ldmia sp,{@t[0]-@t[3]} @ load key material | |||||
add @x[0],@x[0],@t[0] @ accumulate key material | |||||
add @t[0],sp,#4*(4) | |||||
add @x[1],@x[1],@t[1] | |||||
add @x[2],@x[2],@t[2] | |||||
add @x[3],@x[3],@t[3] | |||||
ldmia @t[0],{@t[0]-@t[3]} @ load key material | |||||
add @x[4],@t[0],@x[4],ror#13 @ accumulate key material | |||||
add @t[0],sp,#4*(8) | |||||
add @x[5],@t[1],@x[5],ror#13 | |||||
add @x[6],@t[2],@x[6],ror#13 | |||||
add @x[7],@t[3],@x[7],ror#13 | |||||
ldmia @t[0],{@t[0]-@t[3]} @ load key material | |||||
# ifdef __ARMEB__ | |||||
rev @x[0],@x[0] | |||||
rev @x[1],@x[1] | |||||
rev @x[2],@x[2] | |||||
rev @x[3],@x[3] | |||||
rev @x[4],@x[4] | |||||
rev @x[5],@x[5] | |||||
rev @x[6],@x[6] | |||||
rev @x[7],@x[7] | |||||
# endif | |||||
stmia sp,{@x[0]-@x[7]} | |||||
add @x[0],sp,#4*(16+8) | |||||
ldmia @x[0],{@x[0]-@x[7]} @ load second half | |||||
add @x[0],@x[0],@t[0] @ accumulate key material | |||||
add @t[0],sp,#4*(12) | |||||
add @x[1],@x[1],@t[1] | |||||
add @x[2],@x[2],@t[2] | |||||
add @x[3],@x[3],@t[3] | |||||
ldmia @t[0],{@t[0]-@t[3]} @ load key material | |||||
add @x[4],@t[0],@x[4],ror#24 @ accumulate key material | |||||
add @t[0],sp,#4*(8) | |||||
add @x[5],@t[1],@x[5],ror#24 | |||||
add @x[4],@x[4],#3 @ counter+3 | |||||
add @x[6],@t[2],@x[6],ror#24 | |||||
add @x[7],@t[3],@x[7],ror#24 | |||||
ldr @t[3],[sp,#4*(32+2)] @ re-load len | |||||
# ifdef __ARMEB__ | |||||
rev @x[0],@x[0] | |||||
rev @x[1],@x[1] | |||||
rev @x[2],@x[2] | |||||
rev @x[3],@x[3] | |||||
rev @x[4],@x[4] | |||||
rev @x[5],@x[5] | |||||
rev @x[6],@x[6] | |||||
rev @x[7],@x[7] | |||||
# endif | |||||
stmia @t[0],{@x[0]-@x[7]} | |||||
add @t[2],sp,#4*(0) | |||||
sub @t[3],@t[3],#64*3 @ len-=64*3 | |||||
.Loop_tail_neon: | |||||
ldrb @t[0],[@t[2]],#1 @ read buffer on stack | |||||
ldrb @t[1],[r12],#1 @ read input | |||||
subs @t[3],@t[3],#1 | |||||
eor @t[0],@t[0],@t[1] | |||||
strb @t[0],[r14],#1 @ store output | |||||
bne .Loop_tail_neon | |||||
.Ldone_neon: | |||||
add sp,sp,#4*(32+4) | |||||
vldmia sp,{d8-d15} | |||||
add sp,sp,#4*(16+3) | |||||
ldmia sp!,{r4-r11,pc} | |||||
.size ChaCha20_neon,.-ChaCha20_neon | |||||
# ifndef __KERNEL__ | |||||
.comm OPENSSL_armcap_P,4,4 | |||||
# endif | |||||
#endif | |||||
___ | |||||
}}} | |||||
# Self-copy pass: reproduce this script's leading comment banner (license
# and provenance) at the top of the generated assembly, rewriting Perl '#'
# comments to the ARM assembler's '@' comment character.  Copying stops at
# the first line that is neither a '#' comment nor blank, i.e. the first
# line of actual Perl code.
open my $self, '<', $0 or die "can't open $0: $!";
while (my $line = <$self>) {
	next if ($line =~ /^#!/);	# skip the shebang itself
	last if (!($line =~ s/^#/@/) and $line !~ /^$/);
	print $line;
}
close $self;

# Template expansion: emit the accumulated $code, evaluating `...`
# back-tick expressions (e.g. computed stack offsets) via s///e, and
# mapping NEON "qN#lo"/"qN#hi" register halves to the aliased dN names
# (d(2N) is the low half, d(2N+1) the high half of qN).
foreach my $line (split("\n",$code)) {
	$line =~ s/\`([^\`]*)\`/eval $1/geo;
	$line =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
	print $line,"\n";
}
# STDOUT is typically redirected to the output .S file; a failed close
# means buffered writes were lost, so treat it as a hard error instead of
# silently emitting a truncated file.
close STDOUT or die "error closing STDOUT: $!";