Changeset View
Changeset View
Standalone View
Standalone View
module/crypto/zinc/chacha20/chacha20-arm64.pl
- This file was added.
#!/usr/bin/env perl | |||||
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause | |||||
# | |||||
# This code is taken from the OpenSSL project but the author, Andy Polyakov, | |||||
# has relicensed it under the licenses specified in the SPDX header above. | |||||
# The original headers, including the original license headers, are | |||||
# included below for completeness. | |||||
# | |||||
# ==================================================================== | |||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | |||||
# project. The module is, however, dual licensed under OpenSSL and | |||||
# CRYPTOGAMS licenses depending on where you obtain it. For further | |||||
# details see http://www.openssl.org/~appro/cryptogams/. | |||||
# ==================================================================== | |||||
# | |||||
# June 2015 | |||||
# | |||||
# ChaCha20 for ARMv8. | |||||
# | |||||
# Performance in cycles per byte out of large buffer. | |||||
# | |||||
# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU(*) | |||||
# | |||||
# Apple A7 5.50/+49% 3.33 1.70 | |||||
# Cortex-A53 8.40/+80% 4.72 4.72(**) | |||||
# Cortex-A57 8.06/+43% 4.90 4.43(***) | |||||
# Denver 4.50/+82% 2.63 2.67(**) | |||||
# X-Gene 9.50/+46% 8.82 8.89(**) | |||||
# Mongoose 8.00/+44% 3.64 3.25(***) | |||||
# Kryo 8.17/+50% 4.83 4.65(***) | |||||
# | |||||
# (*) since no non-Apple processor exhibits significantly better | |||||
# performance, the code path is #ifdef __APPLE__-ed; | |||||
# (**) it's expected that doubling interleave factor doesn't help | |||||
# all processors, only those with higher NEON latency and | |||||
# higher instruction issue rate; | |||||
# (***) expected improvement was actually higher; | |||||
$flavour=shift; | |||||
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } | |||||
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } | |||||
if ($flavour && $flavour ne "void") { | |||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |||||
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or | |||||
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or | |||||
die "can't locate arm-xlate.pl"; | |||||
open STDOUT,"| \"$^X\" $xlate $flavour $output"; | |||||
} else { | |||||
open STDOUT,">$output"; | |||||
} | |||||
sub AUTOLOAD() # thunk [simplified] x86-style perlasm | |||||
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; | |||||
my $arg = pop; | |||||
$arg = "#$arg" if ($arg*1 eq $arg); | |||||
$code .= "\t$opcode\t".join(',',@_,$arg)."\n"; | |||||
} | |||||
my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4)); | |||||
my @x=map("x$_",(5..17,19..21)); | |||||
my @d=map("x$_",(22..28,30)); | |||||
sub ROUND { | |||||
my ($a0,$b0,$c0,$d0)=@_; | |||||
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); | |||||
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); | |||||
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); | |||||
( | |||||
"&add_32 (@x[$a0],@x[$a0],@x[$b0])", | |||||
"&add_32 (@x[$a1],@x[$a1],@x[$b1])", | |||||
"&add_32 (@x[$a2],@x[$a2],@x[$b2])", | |||||
"&add_32 (@x[$a3],@x[$a3],@x[$b3])", | |||||
"&eor_32 (@x[$d0],@x[$d0],@x[$a0])", | |||||
"&eor_32 (@x[$d1],@x[$d1],@x[$a1])", | |||||
"&eor_32 (@x[$d2],@x[$d2],@x[$a2])", | |||||
"&eor_32 (@x[$d3],@x[$d3],@x[$a3])", | |||||
"&ror_32 (@x[$d0],@x[$d0],16)", | |||||
"&ror_32 (@x[$d1],@x[$d1],16)", | |||||
"&ror_32 (@x[$d2],@x[$d2],16)", | |||||
"&ror_32 (@x[$d3],@x[$d3],16)", | |||||
"&add_32 (@x[$c0],@x[$c0],@x[$d0])", | |||||
"&add_32 (@x[$c1],@x[$c1],@x[$d1])", | |||||
"&add_32 (@x[$c2],@x[$c2],@x[$d2])", | |||||
"&add_32 (@x[$c3],@x[$c3],@x[$d3])", | |||||
"&eor_32 (@x[$b0],@x[$b0],@x[$c0])", | |||||
"&eor_32 (@x[$b1],@x[$b1],@x[$c1])", | |||||
"&eor_32 (@x[$b2],@x[$b2],@x[$c2])", | |||||
"&eor_32 (@x[$b3],@x[$b3],@x[$c3])", | |||||
"&ror_32 (@x[$b0],@x[$b0],20)", | |||||
"&ror_32 (@x[$b1],@x[$b1],20)", | |||||
"&ror_32 (@x[$b2],@x[$b2],20)", | |||||
"&ror_32 (@x[$b3],@x[$b3],20)", | |||||
"&add_32 (@x[$a0],@x[$a0],@x[$b0])", | |||||
"&add_32 (@x[$a1],@x[$a1],@x[$b1])", | |||||
"&add_32 (@x[$a2],@x[$a2],@x[$b2])", | |||||
"&add_32 (@x[$a3],@x[$a3],@x[$b3])", | |||||
"&eor_32 (@x[$d0],@x[$d0],@x[$a0])", | |||||
"&eor_32 (@x[$d1],@x[$d1],@x[$a1])", | |||||
"&eor_32 (@x[$d2],@x[$d2],@x[$a2])", | |||||
"&eor_32 (@x[$d3],@x[$d3],@x[$a3])", | |||||
"&ror_32 (@x[$d0],@x[$d0],24)", | |||||
"&ror_32 (@x[$d1],@x[$d1],24)", | |||||
"&ror_32 (@x[$d2],@x[$d2],24)", | |||||
"&ror_32 (@x[$d3],@x[$d3],24)", | |||||
"&add_32 (@x[$c0],@x[$c0],@x[$d0])", | |||||
"&add_32 (@x[$c1],@x[$c1],@x[$d1])", | |||||
"&add_32 (@x[$c2],@x[$c2],@x[$d2])", | |||||
"&add_32 (@x[$c3],@x[$c3],@x[$d3])", | |||||
"&eor_32 (@x[$b0],@x[$b0],@x[$c0])", | |||||
"&eor_32 (@x[$b1],@x[$b1],@x[$c1])", | |||||
"&eor_32 (@x[$b2],@x[$b2],@x[$c2])", | |||||
"&eor_32 (@x[$b3],@x[$b3],@x[$c3])", | |||||
"&ror_32 (@x[$b0],@x[$b0],25)", | |||||
"&ror_32 (@x[$b1],@x[$b1],25)", | |||||
"&ror_32 (@x[$b2],@x[$b2],25)", | |||||
"&ror_32 (@x[$b3],@x[$b3],25)" | |||||
); | |||||
} | |||||
$code.=<<___; | |||||
#ifndef __KERNEL__ | |||||
# include "arm_arch.h" | |||||
.extern OPENSSL_armcap_P | |||||
#else | |||||
# define ChaCha20_ctr32 chacha20_arm | |||||
# define ChaCha20_neon chacha20_neon | |||||
#endif | |||||
.text | |||||
.align 5 | |||||
.Lsigma: | |||||
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral | |||||
.Lone: | |||||
.long 1,0,0,0 | |||||
#ifndef __KERNEL__ | |||||
.LOPENSSL_armcap_P: | |||||
# ifdef __ILP32__ | |||||
.long OPENSSL_armcap_P-. | |||||
# else | |||||
.quad OPENSSL_armcap_P-. | |||||
# endif | |||||
#endif | |||||
.globl ChaCha20_ctr32 | |||||
.type ChaCha20_ctr32,%function | |||||
.align 5 | |||||
ChaCha20_ctr32: | |||||
cbz $len,.Labort | |||||
#ifndef __KERNEL__ | |||||
adr @x[0],.LOPENSSL_armcap_P | |||||
cmp $len,#192 | |||||
b.lo .Lshort | |||||
# ifdef __ILP32__ | |||||
ldrsw @x[1],[@x[0]] | |||||
# else | |||||
ldr @x[1],[@x[0]] | |||||
# endif | |||||
ldr w17,[@x[1],@x[0]] | |||||
tst w17,#ARMV7_NEON | |||||
b.ne ChaCha20_neon | |||||
.Lshort: | |||||
#endif | |||||
stp x29,x30,[sp,#-96]! | |||||
add x29,sp,#0 | |||||
adr @x[0],.Lsigma | |||||
stp x19,x20,[sp,#16] | |||||
stp x21,x22,[sp,#32] | |||||
stp x23,x24,[sp,#48] | |||||
stp x25,x26,[sp,#64] | |||||
stp x27,x28,[sp,#80] | |||||
sub sp,sp,#64 | |||||
ldp @d[0],@d[1],[@x[0]] // load sigma | |||||
ldp @d[2],@d[3],[$key] // load key | |||||
ldp @d[4],@d[5],[$key,#16] | |||||
ldp @d[6],@d[7],[$ctr] // load counter | |||||
#ifdef __AARCH64EB__ | |||||
ror @d[2],@d[2],#32 | |||||
ror @d[3],@d[3],#32 | |||||
ror @d[4],@d[4],#32 | |||||
ror @d[5],@d[5],#32 | |||||
ror @d[6],@d[6],#32 | |||||
ror @d[7],@d[7],#32 | |||||
#endif | |||||
.Loop_outer: | |||||
mov.32 @x[0],@d[0] // unpack key block | |||||
lsr @x[1],@d[0],#32 | |||||
mov.32 @x[2],@d[1] | |||||
lsr @x[3],@d[1],#32 | |||||
mov.32 @x[4],@d[2] | |||||
lsr @x[5],@d[2],#32 | |||||
mov.32 @x[6],@d[3] | |||||
lsr @x[7],@d[3],#32 | |||||
mov.32 @x[8],@d[4] | |||||
lsr @x[9],@d[4],#32 | |||||
mov.32 @x[10],@d[5] | |||||
lsr @x[11],@d[5],#32 | |||||
mov.32 @x[12],@d[6] | |||||
lsr @x[13],@d[6],#32 | |||||
mov.32 @x[14],@d[7] | |||||
lsr @x[15],@d[7],#32 | |||||
mov $ctr,#10 | |||||
subs $len,$len,#64 | |||||
.Loop: | |||||
sub $ctr,$ctr,#1 | |||||
___ | |||||
foreach (&ROUND(0, 4, 8,12)) { eval; } | |||||
foreach (&ROUND(0, 5,10,15)) { eval; } | |||||
$code.=<<___; | |||||
cbnz $ctr,.Loop | |||||
add.32 @x[0],@x[0],@d[0] // accumulate key block | |||||
add @x[1],@x[1],@d[0],lsr#32 | |||||
add.32 @x[2],@x[2],@d[1] | |||||
add @x[3],@x[3],@d[1],lsr#32 | |||||
add.32 @x[4],@x[4],@d[2] | |||||
add @x[5],@x[5],@d[2],lsr#32 | |||||
add.32 @x[6],@x[6],@d[3] | |||||
add @x[7],@x[7],@d[3],lsr#32 | |||||
add.32 @x[8],@x[8],@d[4] | |||||
add @x[9],@x[9],@d[4],lsr#32 | |||||
add.32 @x[10],@x[10],@d[5] | |||||
add @x[11],@x[11],@d[5],lsr#32 | |||||
add.32 @x[12],@x[12],@d[6] | |||||
add @x[13],@x[13],@d[6],lsr#32 | |||||
add.32 @x[14],@x[14],@d[7] | |||||
add @x[15],@x[15],@d[7],lsr#32 | |||||
b.lo .Ltail | |||||
add @x[0],@x[0],@x[1],lsl#32 // pack | |||||
add @x[2],@x[2],@x[3],lsl#32 | |||||
ldp @x[1],@x[3],[$inp,#0] // load input | |||||
add @x[4],@x[4],@x[5],lsl#32 | |||||
add @x[6],@x[6],@x[7],lsl#32 | |||||
ldp @x[5],@x[7],[$inp,#16] | |||||
add @x[8],@x[8],@x[9],lsl#32 | |||||
add @x[10],@x[10],@x[11],lsl#32 | |||||
ldp @x[9],@x[11],[$inp,#32] | |||||
add @x[12],@x[12],@x[13],lsl#32 | |||||
add @x[14],@x[14],@x[15],lsl#32 | |||||
ldp @x[13],@x[15],[$inp,#48] | |||||
add $inp,$inp,#64 | |||||
#ifdef __AARCH64EB__ | |||||
rev @x[0],@x[0] | |||||
rev @x[2],@x[2] | |||||
rev @x[4],@x[4] | |||||
rev @x[6],@x[6] | |||||
rev @x[8],@x[8] | |||||
rev @x[10],@x[10] | |||||
rev @x[12],@x[12] | |||||
rev @x[14],@x[14] | |||||
#endif | |||||
eor @x[0],@x[0],@x[1] | |||||
eor @x[2],@x[2],@x[3] | |||||
eor @x[4],@x[4],@x[5] | |||||
eor @x[6],@x[6],@x[7] | |||||
eor @x[8],@x[8],@x[9] | |||||
eor @x[10],@x[10],@x[11] | |||||
eor @x[12],@x[12],@x[13] | |||||
eor @x[14],@x[14],@x[15] | |||||
stp @x[0],@x[2],[$out,#0] // store output | |||||
add @d[6],@d[6],#1 // increment counter | |||||
stp @x[4],@x[6],[$out,#16] | |||||
stp @x[8],@x[10],[$out,#32] | |||||
stp @x[12],@x[14],[$out,#48] | |||||
add $out,$out,#64 | |||||
b.hi .Loop_outer | |||||
ldp x19,x20,[x29,#16] | |||||
add sp,sp,#64 | |||||
ldp x21,x22,[x29,#32] | |||||
ldp x23,x24,[x29,#48] | |||||
ldp x25,x26,[x29,#64] | |||||
ldp x27,x28,[x29,#80] | |||||
ldp x29,x30,[sp],#96 | |||||
.Labort: | |||||
ret | |||||
.align 4 | |||||
.Ltail: | |||||
add $len,$len,#64 | |||||
.Less_than_64: | |||||
sub $out,$out,#1 | |||||
add $inp,$inp,$len | |||||
add $out,$out,$len | |||||
add $ctr,sp,$len | |||||
neg $len,$len | |||||
add @x[0],@x[0],@x[1],lsl#32 // pack | |||||
add @x[2],@x[2],@x[3],lsl#32 | |||||
add @x[4],@x[4],@x[5],lsl#32 | |||||
add @x[6],@x[6],@x[7],lsl#32 | |||||
add @x[8],@x[8],@x[9],lsl#32 | |||||
add @x[10],@x[10],@x[11],lsl#32 | |||||
add @x[12],@x[12],@x[13],lsl#32 | |||||
add @x[14],@x[14],@x[15],lsl#32 | |||||
#ifdef __AARCH64EB__ | |||||
rev @x[0],@x[0] | |||||
rev @x[2],@x[2] | |||||
rev @x[4],@x[4] | |||||
rev @x[6],@x[6] | |||||
rev @x[8],@x[8] | |||||
rev @x[10],@x[10] | |||||
rev @x[12],@x[12] | |||||
rev @x[14],@x[14] | |||||
#endif | |||||
stp @x[0],@x[2],[sp,#0] | |||||
stp @x[4],@x[6],[sp,#16] | |||||
stp @x[8],@x[10],[sp,#32] | |||||
stp @x[12],@x[14],[sp,#48] | |||||
.Loop_tail: | |||||
ldrb w10,[$inp,$len] | |||||
ldrb w11,[$ctr,$len] | |||||
add $len,$len,#1 | |||||
eor w10,w10,w11 | |||||
strb w10,[$out,$len] | |||||
cbnz $len,.Loop_tail | |||||
stp xzr,xzr,[sp,#0] | |||||
stp xzr,xzr,[sp,#16] | |||||
stp xzr,xzr,[sp,#32] | |||||
stp xzr,xzr,[sp,#48] | |||||
ldp x19,x20,[x29,#16] | |||||
add sp,sp,#64 | |||||
ldp x21,x22,[x29,#32] | |||||
ldp x23,x24,[x29,#48] | |||||
ldp x25,x26,[x29,#64] | |||||
ldp x27,x28,[x29,#80] | |||||
ldp x29,x30,[sp],#96 | |||||
ret | |||||
.size ChaCha20_ctr32,.-ChaCha20_ctr32 | |||||
___ | |||||
{{{ | |||||
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) = | |||||
map("v$_.4s",(0..7,16..23)); | |||||
my (@K)=map("v$_.4s",(24..30)); | |||||
my $ONE="v31.4s"; | |||||
sub NEONROUND { | |||||
my $odd = pop; | |||||
my ($a,$b,$c,$d,$t)=@_; | |||||
( | |||||
"&add ('$a','$a','$b')", | |||||
"&eor ('$d','$d','$a')", | |||||
"&rev32_16 ('$d','$d')", # vrot ($d,16) | |||||
"&add ('$c','$c','$d')", | |||||
"&eor ('$t','$b','$c')", | |||||
"&ushr ('$b','$t',20)", | |||||
"&sli ('$b','$t',12)", | |||||
"&add ('$a','$a','$b')", | |||||
"&eor ('$t','$d','$a')", | |||||
"&ushr ('$d','$t',24)", | |||||
"&sli ('$d','$t',8)", | |||||
"&add ('$c','$c','$d')", | |||||
"&eor ('$t','$b','$c')", | |||||
"&ushr ('$b','$t',25)", | |||||
"&sli ('$b','$t',7)", | |||||
"&ext ('$a','$a','$a',$odd?4:12)", | |||||
"&ext ('$d','$d','$d',8)", | |||||
"&ext ('$c','$c','$c',$odd?12:4)" | |||||
); | |||||
} | |||||
$code.=<<___; | |||||
#if !defined(__KERNEL__) || defined(CONFIG_KERNEL_MODE_NEON) | |||||
#ifdef __KERNEL__ | |||||
.globl ChaCha20_neon | |||||
.type ChaCha20_neon,%function | |||||
#endif | |||||
.type ChaCha20_neon,%function | |||||
.align 5 | |||||
ChaCha20_neon: | |||||
stp x29,x30,[sp,#-96]! | |||||
add x29,sp,#0 | |||||
adr @x[0],.Lsigma | |||||
stp x19,x20,[sp,#16] | |||||
stp x21,x22,[sp,#32] | |||||
stp x23,x24,[sp,#48] | |||||
stp x25,x26,[sp,#64] | |||||
stp x27,x28,[sp,#80] | |||||
#ifdef __APPLE__ | |||||
cmp $len,#512 | |||||
b.hs .L512_or_more_neon | |||||
#endif | |||||
sub sp,sp,#64 | |||||
ldp @d[0],@d[1],[@x[0]] // load sigma | |||||
ld1 {@K[0]},[@x[0]],#16 | |||||
ldp @d[2],@d[3],[$key] // load key | |||||
ldp @d[4],@d[5],[$key,#16] | |||||
ld1 {@K[1],@K[2]},[$key] | |||||
ldp @d[6],@d[7],[$ctr] // load counter | |||||
ld1 {@K[3]},[$ctr] | |||||
ld1 {$ONE},[@x[0]] | |||||
#ifdef __AARCH64EB__ | |||||
rev64 @K[0],@K[0] | |||||
ror @d[2],@d[2],#32 | |||||
ror @d[3],@d[3],#32 | |||||
ror @d[4],@d[4],#32 | |||||
ror @d[5],@d[5],#32 | |||||
ror @d[6],@d[6],#32 | |||||
ror @d[7],@d[7],#32 | |||||
#endif | |||||
add @K[3],@K[3],$ONE // += 1 | |||||
add @K[4],@K[3],$ONE | |||||
add @K[5],@K[4],$ONE | |||||
shl $ONE,$ONE,#2 // 1 -> 4 | |||||
.Loop_outer_neon: | |||||
mov.32 @x[0],@d[0] // unpack key block | |||||
lsr @x[1],@d[0],#32 | |||||
mov $A0,@K[0] | |||||
mov.32 @x[2],@d[1] | |||||
lsr @x[3],@d[1],#32 | |||||
mov $A1,@K[0] | |||||
mov.32 @x[4],@d[2] | |||||
lsr @x[5],@d[2],#32 | |||||
mov $A2,@K[0] | |||||
mov.32 @x[6],@d[3] | |||||
mov $B0,@K[1] | |||||
lsr @x[7],@d[3],#32 | |||||
mov $B1,@K[1] | |||||
mov.32 @x[8],@d[4] | |||||
mov $B2,@K[1] | |||||
lsr @x[9],@d[4],#32 | |||||
mov $D0,@K[3] | |||||
mov.32 @x[10],@d[5] | |||||
mov $D1,@K[4] | |||||
lsr @x[11],@d[5],#32 | |||||
mov $D2,@K[5] | |||||
mov.32 @x[12],@d[6] | |||||
mov $C0,@K[2] | |||||
lsr @x[13],@d[6],#32 | |||||
mov $C1,@K[2] | |||||
mov.32 @x[14],@d[7] | |||||
mov $C2,@K[2] | |||||
lsr @x[15],@d[7],#32 | |||||
mov $ctr,#10 | |||||
subs $len,$len,#256 | |||||
.Loop_neon: | |||||
sub $ctr,$ctr,#1 | |||||
___ | |||||
my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); | |||||
my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); | |||||
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); | |||||
my @thread3=&ROUND(0,4,8,12); | |||||
foreach (@thread0) { | |||||
eval; eval(shift(@thread3)); | |||||
eval(shift(@thread1)); eval(shift(@thread3)); | |||||
eval(shift(@thread2)); eval(shift(@thread3)); | |||||
} | |||||
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); | |||||
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); | |||||
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); | |||||
@thread3=&ROUND(0,5,10,15); | |||||
foreach (@thread0) { | |||||
eval; eval(shift(@thread3)); | |||||
eval(shift(@thread1)); eval(shift(@thread3)); | |||||
eval(shift(@thread2)); eval(shift(@thread3)); | |||||
} | |||||
$code.=<<___; | |||||
cbnz $ctr,.Loop_neon | |||||
add.32 @x[0],@x[0],@d[0] // accumulate key block | |||||
add $A0,$A0,@K[0] | |||||
add @x[1],@x[1],@d[0],lsr#32 | |||||
add $A1,$A1,@K[0] | |||||
add.32 @x[2],@x[2],@d[1] | |||||
add $A2,$A2,@K[0] | |||||
add @x[3],@x[3],@d[1],lsr#32 | |||||
add $C0,$C0,@K[2] | |||||
add.32 @x[4],@x[4],@d[2] | |||||
add $C1,$C1,@K[2] | |||||
add @x[5],@x[5],@d[2],lsr#32 | |||||
add $C2,$C2,@K[2] | |||||
add.32 @x[6],@x[6],@d[3] | |||||
add $D0,$D0,@K[3] | |||||
add @x[7],@x[7],@d[3],lsr#32 | |||||
add.32 @x[8],@x[8],@d[4] | |||||
add $D1,$D1,@K[4] | |||||
add @x[9],@x[9],@d[4],lsr#32 | |||||
add.32 @x[10],@x[10],@d[5] | |||||
add $D2,$D2,@K[5] | |||||
add @x[11],@x[11],@d[5],lsr#32 | |||||
add.32 @x[12],@x[12],@d[6] | |||||
add $B0,$B0,@K[1] | |||||
add @x[13],@x[13],@d[6],lsr#32 | |||||
add.32 @x[14],@x[14],@d[7] | |||||
add $B1,$B1,@K[1] | |||||
add @x[15],@x[15],@d[7],lsr#32 | |||||
add $B2,$B2,@K[1] | |||||
b.lo .Ltail_neon | |||||
add @x[0],@x[0],@x[1],lsl#32 // pack | |||||
add @x[2],@x[2],@x[3],lsl#32 | |||||
ldp @x[1],@x[3],[$inp,#0] // load input | |||||
add @x[4],@x[4],@x[5],lsl#32 | |||||
add @x[6],@x[6],@x[7],lsl#32 | |||||
ldp @x[5],@x[7],[$inp,#16] | |||||
add @x[8],@x[8],@x[9],lsl#32 | |||||
add @x[10],@x[10],@x[11],lsl#32 | |||||
ldp @x[9],@x[11],[$inp,#32] | |||||
add @x[12],@x[12],@x[13],lsl#32 | |||||
add @x[14],@x[14],@x[15],lsl#32 | |||||
ldp @x[13],@x[15],[$inp,#48] | |||||
add $inp,$inp,#64 | |||||
#ifdef __AARCH64EB__ | |||||
rev @x[0],@x[0] | |||||
rev @x[2],@x[2] | |||||
rev @x[4],@x[4] | |||||
rev @x[6],@x[6] | |||||
rev @x[8],@x[8] | |||||
rev @x[10],@x[10] | |||||
rev @x[12],@x[12] | |||||
rev @x[14],@x[14] | |||||
#endif | |||||
ld1.8 {$T0-$T3},[$inp],#64 | |||||
eor @x[0],@x[0],@x[1] | |||||
eor @x[2],@x[2],@x[3] | |||||
eor @x[4],@x[4],@x[5] | |||||
eor @x[6],@x[6],@x[7] | |||||
eor @x[8],@x[8],@x[9] | |||||
eor $A0,$A0,$T0 | |||||
eor @x[10],@x[10],@x[11] | |||||
eor $B0,$B0,$T1 | |||||
eor @x[12],@x[12],@x[13] | |||||
eor $C0,$C0,$T2 | |||||
eor @x[14],@x[14],@x[15] | |||||
eor $D0,$D0,$T3 | |||||
ld1.8 {$T0-$T3},[$inp],#64 | |||||
stp @x[0],@x[2],[$out,#0] // store output | |||||
add @d[6],@d[6],#4 // increment counter | |||||
stp @x[4],@x[6],[$out,#16] | |||||
add @K[3],@K[3],$ONE // += 4 | |||||
stp @x[8],@x[10],[$out,#32] | |||||
add @K[4],@K[4],$ONE | |||||
stp @x[12],@x[14],[$out,#48] | |||||
add @K[5],@K[5],$ONE | |||||
add $out,$out,#64 | |||||
st1.8 {$A0-$D0},[$out],#64 | |||||
ld1.8 {$A0-$D0},[$inp],#64 | |||||
eor $A1,$A1,$T0 | |||||
eor $B1,$B1,$T1 | |||||
eor $C1,$C1,$T2 | |||||
eor $D1,$D1,$T3 | |||||
st1.8 {$A1-$D1},[$out],#64 | |||||
eor $A2,$A2,$A0 | |||||
eor $B2,$B2,$B0 | |||||
eor $C2,$C2,$C0 | |||||
eor $D2,$D2,$D0 | |||||
st1.8 {$A2-$D2},[$out],#64 | |||||
b.hi .Loop_outer_neon | |||||
ldp x19,x20,[x29,#16] | |||||
add sp,sp,#64 | |||||
ldp x21,x22,[x29,#32] | |||||
ldp x23,x24,[x29,#48] | |||||
ldp x25,x26,[x29,#64] | |||||
ldp x27,x28,[x29,#80] | |||||
ldp x29,x30,[sp],#96 | |||||
ret | |||||
.Ltail_neon: | |||||
add $len,$len,#256 | |||||
cmp $len,#64 | |||||
b.lo .Less_than_64 | |||||
add @x[0],@x[0],@x[1],lsl#32 // pack | |||||
add @x[2],@x[2],@x[3],lsl#32 | |||||
ldp @x[1],@x[3],[$inp,#0] // load input | |||||
add @x[4],@x[4],@x[5],lsl#32 | |||||
add @x[6],@x[6],@x[7],lsl#32 | |||||
ldp @x[5],@x[7],[$inp,#16] | |||||
add @x[8],@x[8],@x[9],lsl#32 | |||||
add @x[10],@x[10],@x[11],lsl#32 | |||||
ldp @x[9],@x[11],[$inp,#32] | |||||
add @x[12],@x[12],@x[13],lsl#32 | |||||
add @x[14],@x[14],@x[15],lsl#32 | |||||
ldp @x[13],@x[15],[$inp,#48] | |||||
add $inp,$inp,#64 | |||||
#ifdef __AARCH64EB__ | |||||
rev @x[0],@x[0] | |||||
rev @x[2],@x[2] | |||||
rev @x[4],@x[4] | |||||
rev @x[6],@x[6] | |||||
rev @x[8],@x[8] | |||||
rev @x[10],@x[10] | |||||
rev @x[12],@x[12] | |||||
rev @x[14],@x[14] | |||||
#endif | |||||
eor @x[0],@x[0],@x[1] | |||||
eor @x[2],@x[2],@x[3] | |||||
eor @x[4],@x[4],@x[5] | |||||
eor @x[6],@x[6],@x[7] | |||||
eor @x[8],@x[8],@x[9] | |||||
eor @x[10],@x[10],@x[11] | |||||
eor @x[12],@x[12],@x[13] | |||||
eor @x[14],@x[14],@x[15] | |||||
stp @x[0],@x[2],[$out,#0] // store output | |||||
add @d[6],@d[6],#4 // increment counter | |||||
stp @x[4],@x[6],[$out,#16] | |||||
stp @x[8],@x[10],[$out,#32] | |||||
stp @x[12],@x[14],[$out,#48] | |||||
add $out,$out,#64 | |||||
b.eq .Ldone_neon | |||||
sub $len,$len,#64 | |||||
cmp $len,#64 | |||||
b.lo .Less_than_128 | |||||
ld1.8 {$T0-$T3},[$inp],#64 | |||||
eor $A0,$A0,$T0 | |||||
eor $B0,$B0,$T1 | |||||
eor $C0,$C0,$T2 | |||||
eor $D0,$D0,$T3 | |||||
st1.8 {$A0-$D0},[$out],#64 | |||||
b.eq .Ldone_neon | |||||
sub $len,$len,#64 | |||||
cmp $len,#64 | |||||
b.lo .Less_than_192 | |||||
ld1.8 {$T0-$T3},[$inp],#64 | |||||
eor $A1,$A1,$T0 | |||||
eor $B1,$B1,$T1 | |||||
eor $C1,$C1,$T2 | |||||
eor $D1,$D1,$T3 | |||||
st1.8 {$A1-$D1},[$out],#64 | |||||
b.eq .Ldone_neon | |||||
sub $len,$len,#64 | |||||
st1.8 {$A2-$D2},[sp] | |||||
b .Last_neon | |||||
.Less_than_128: | |||||
st1.8 {$A0-$D0},[sp] | |||||
b .Last_neon | |||||
.Less_than_192: | |||||
st1.8 {$A1-$D1},[sp] | |||||
b .Last_neon | |||||
.align 4 | |||||
.Last_neon: | |||||
sub $out,$out,#1 | |||||
add $inp,$inp,$len | |||||
add $out,$out,$len | |||||
add $ctr,sp,$len | |||||
neg $len,$len | |||||
.Loop_tail_neon: | |||||
ldrb w10,[$inp,$len] | |||||
ldrb w11,[$ctr,$len] | |||||
add $len,$len,#1 | |||||
eor w10,w10,w11 | |||||
strb w10,[$out,$len] | |||||
cbnz $len,.Loop_tail_neon | |||||
stp xzr,xzr,[sp,#0] | |||||
stp xzr,xzr,[sp,#16] | |||||
stp xzr,xzr,[sp,#32] | |||||
stp xzr,xzr,[sp,#48] | |||||
.Ldone_neon: | |||||
ldp x19,x20,[x29,#16] | |||||
add sp,sp,#64 | |||||
ldp x21,x22,[x29,#32] | |||||
ldp x23,x24,[x29,#48] | |||||
ldp x25,x26,[x29,#64] | |||||
ldp x27,x28,[x29,#80] | |||||
ldp x29,x30,[sp],#96 | |||||
ret | |||||
.size ChaCha20_neon,.-ChaCha20_neon | |||||
___ | |||||
{ | |||||
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; | |||||
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, | |||||
$A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23)); | |||||
$code.=<<___; | |||||
#ifdef __APPLE__ | |||||
.type ChaCha20_512_neon,%function | |||||
.align 5 | |||||
ChaCha20_512_neon: | |||||
stp x29,x30,[sp,#-96]! | |||||
add x29,sp,#0 | |||||
adr @x[0],.Lsigma | |||||
stp x19,x20,[sp,#16] | |||||
stp x21,x22,[sp,#32] | |||||
stp x23,x24,[sp,#48] | |||||
stp x25,x26,[sp,#64] | |||||
stp x27,x28,[sp,#80] | |||||
.L512_or_more_neon: | |||||
sub sp,sp,#128+64 | |||||
ldp @d[0],@d[1],[@x[0]] // load sigma | |||||
ld1 {@K[0]},[@x[0]],#16 | |||||
ldp @d[2],@d[3],[$key] // load key | |||||
ldp @d[4],@d[5],[$key,#16] | |||||
ld1 {@K[1],@K[2]},[$key] | |||||
ldp @d[6],@d[7],[$ctr] // load counter | |||||
ld1 {@K[3]},[$ctr] | |||||
ld1 {$ONE},[@x[0]] | |||||
# ifdef __AARCH64EB__ | |||||
rev64 @K[0],@K[0] | |||||
ror @d[2],@d[2],#32 | |||||
ror @d[3],@d[3],#32 | |||||
ror @d[4],@d[4],#32 | |||||
ror @d[5],@d[5],#32 | |||||
ror @d[6],@d[6],#32 | |||||
ror @d[7],@d[7],#32 | |||||
# endif | |||||
add @K[3],@K[3],$ONE // += 1 | |||||
stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part | |||||
add @K[3],@K[3],$ONE // not typo | |||||
str @K[2],[sp,#32] | |||||
add @K[4],@K[3],$ONE | |||||
add @K[5],@K[4],$ONE | |||||
add @K[6],@K[5],$ONE | |||||
shl $ONE,$ONE,#2 // 1 -> 4 | |||||
stp d8,d9,[sp,#128+0] // meet ABI requirements | |||||
stp d10,d11,[sp,#128+16] | |||||
stp d12,d13,[sp,#128+32] | |||||
stp d14,d15,[sp,#128+48] | |||||
sub $len,$len,#512 // not typo | |||||
.Loop_outer_512_neon: | |||||
mov $A0,@K[0] | |||||
mov $A1,@K[0] | |||||
mov $A2,@K[0] | |||||
mov $A3,@K[0] | |||||
mov $A4,@K[0] | |||||
mov $A5,@K[0] | |||||
mov $B0,@K[1] | |||||
mov.32 @x[0],@d[0] // unpack key block | |||||
mov $B1,@K[1] | |||||
lsr @x[1],@d[0],#32 | |||||
mov $B2,@K[1] | |||||
mov.32 @x[2],@d[1] | |||||
mov $B3,@K[1] | |||||
lsr @x[3],@d[1],#32 | |||||
mov $B4,@K[1] | |||||
mov.32 @x[4],@d[2] | |||||
mov $B5,@K[1] | |||||
lsr @x[5],@d[2],#32 | |||||
mov $D0,@K[3] | |||||
mov.32 @x[6],@d[3] | |||||
mov $D1,@K[4] | |||||
lsr @x[7],@d[3],#32 | |||||
mov $D2,@K[5] | |||||
mov.32 @x[8],@d[4] | |||||
mov $D3,@K[6] | |||||
lsr @x[9],@d[4],#32 | |||||
mov $C0,@K[2] | |||||
mov.32 @x[10],@d[5] | |||||
mov $C1,@K[2] | |||||
lsr @x[11],@d[5],#32 | |||||
add $D4,$D0,$ONE // +4 | |||||
mov.32 @x[12],@d[6] | |||||
add $D5,$D1,$ONE // +4 | |||||
lsr @x[13],@d[6],#32 | |||||
mov $C2,@K[2] | |||||
mov.32 @x[14],@d[7] | |||||
mov $C3,@K[2] | |||||
lsr @x[15],@d[7],#32 | |||||
mov $C4,@K[2] | |||||
stp @K[3],@K[4],[sp,#48] // off-load key block, variable part | |||||
mov $C5,@K[2] | |||||
str @K[5],[sp,#80] | |||||
mov $ctr,#5 | |||||
subs $len,$len,#512 | |||||
.Loop_upper_neon: | |||||
sub $ctr,$ctr,#1 | |||||
___ | |||||
my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); | |||||
my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); | |||||
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); | |||||
my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); | |||||
my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); | |||||
my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); | |||||
my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); | |||||
my $diff = ($#thread0+1)*6 - $#thread67 - 1; | |||||
my $i = 0; | |||||
foreach (@thread0) { | |||||
eval; eval(shift(@thread67)); | |||||
eval(shift(@thread1)); eval(shift(@thread67)); | |||||
eval(shift(@thread2)); eval(shift(@thread67)); | |||||
eval(shift(@thread3)); eval(shift(@thread67)); | |||||
eval(shift(@thread4)); eval(shift(@thread67)); | |||||
eval(shift(@thread5)); eval(shift(@thread67)); | |||||
} | |||||
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); | |||||
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); | |||||
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); | |||||
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); | |||||
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); | |||||
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); | |||||
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); | |||||
foreach (@thread0) { | |||||
eval; eval(shift(@thread67)); | |||||
eval(shift(@thread1)); eval(shift(@thread67)); | |||||
eval(shift(@thread2)); eval(shift(@thread67)); | |||||
eval(shift(@thread3)); eval(shift(@thread67)); | |||||
eval(shift(@thread4)); eval(shift(@thread67)); | |||||
eval(shift(@thread5)); eval(shift(@thread67)); | |||||
} | |||||
$code.=<<___; | |||||
cbnz $ctr,.Loop_upper_neon | |||||
add.32 @x[0],@x[0],@d[0] // accumulate key block | |||||
add @x[1],@x[1],@d[0],lsr#32 | |||||
add.32 @x[2],@x[2],@d[1] | |||||
add @x[3],@x[3],@d[1],lsr#32 | |||||
add.32 @x[4],@x[4],@d[2] | |||||
add @x[5],@x[5],@d[2],lsr#32 | |||||
add.32 @x[6],@x[6],@d[3] | |||||
add @x[7],@x[7],@d[3],lsr#32 | |||||
add.32 @x[8],@x[8],@d[4] | |||||
add @x[9],@x[9],@d[4],lsr#32 | |||||
add.32 @x[10],@x[10],@d[5] | |||||
add @x[11],@x[11],@d[5],lsr#32 | |||||
add.32 @x[12],@x[12],@d[6] | |||||
add @x[13],@x[13],@d[6],lsr#32 | |||||
add.32 @x[14],@x[14],@d[7] | |||||
add @x[15],@x[15],@d[7],lsr#32 | |||||
add @x[0],@x[0],@x[1],lsl#32 // pack | |||||
add @x[2],@x[2],@x[3],lsl#32 | |||||
ldp @x[1],@x[3],[$inp,#0] // load input | |||||
add @x[4],@x[4],@x[5],lsl#32 | |||||
add @x[6],@x[6],@x[7],lsl#32 | |||||
ldp @x[5],@x[7],[$inp,#16] | |||||
add @x[8],@x[8],@x[9],lsl#32 | |||||
add @x[10],@x[10],@x[11],lsl#32 | |||||
ldp @x[9],@x[11],[$inp,#32] | |||||
add @x[12],@x[12],@x[13],lsl#32 | |||||
add @x[14],@x[14],@x[15],lsl#32 | |||||
ldp @x[13],@x[15],[$inp,#48] | |||||
add $inp,$inp,#64 | |||||
# ifdef __AARCH64EB__ | |||||
rev @x[0],@x[0] | |||||
rev @x[2],@x[2] | |||||
rev @x[4],@x[4] | |||||
rev @x[6],@x[6] | |||||
rev @x[8],@x[8] | |||||
rev @x[10],@x[10] | |||||
rev @x[12],@x[12] | |||||
rev @x[14],@x[14] | |||||
# endif | |||||
eor @x[0],@x[0],@x[1] | |||||
eor @x[2],@x[2],@x[3] | |||||
eor @x[4],@x[4],@x[5] | |||||
eor @x[6],@x[6],@x[7] | |||||
eor @x[8],@x[8],@x[9] | |||||
eor @x[10],@x[10],@x[11] | |||||
eor @x[12],@x[12],@x[13] | |||||
eor @x[14],@x[14],@x[15] | |||||
stp @x[0],@x[2],[$out,#0] // store output | |||||
add @d[6],@d[6],#1 // increment counter | |||||
mov.32 @x[0],@d[0] // unpack key block | |||||
lsr @x[1],@d[0],#32 | |||||
stp @x[4],@x[6],[$out,#16] | |||||
mov.32 @x[2],@d[1] | |||||
lsr @x[3],@d[1],#32 | |||||
stp @x[8],@x[10],[$out,#32] | |||||
mov.32 @x[4],@d[2] | |||||
lsr @x[5],@d[2],#32 | |||||
stp @x[12],@x[14],[$out,#48] | |||||
add $out,$out,#64 | |||||
mov.32 @x[6],@d[3] | |||||
lsr @x[7],@d[3],#32 | |||||
mov.32 @x[8],@d[4] | |||||
lsr @x[9],@d[4],#32 | |||||
mov.32 @x[10],@d[5] | |||||
lsr @x[11],@d[5],#32 | |||||
mov.32 @x[12],@d[6] | |||||
lsr @x[13],@d[6],#32 | |||||
mov.32 @x[14],@d[7] | |||||
lsr @x[15],@d[7],#32 | |||||
mov $ctr,#5 | |||||
.Loop_lower_neon: | |||||
sub $ctr,$ctr,#1 | |||||
___ | |||||
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); | |||||
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); | |||||
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); | |||||
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0); | |||||
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0); | |||||
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0); | |||||
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); | |||||
foreach (@thread0) { | |||||
eval; eval(shift(@thread67)); | |||||
eval(shift(@thread1)); eval(shift(@thread67)); | |||||
eval(shift(@thread2)); eval(shift(@thread67)); | |||||
eval(shift(@thread3)); eval(shift(@thread67)); | |||||
eval(shift(@thread4)); eval(shift(@thread67)); | |||||
eval(shift(@thread5)); eval(shift(@thread67)); | |||||
} | |||||
@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); | |||||
@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); | |||||
@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); | |||||
@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1); | |||||
@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1); | |||||
@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1); | |||||
@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15)); | |||||
foreach (@thread0) { | |||||
eval; eval(shift(@thread67)); | |||||
eval(shift(@thread1)); eval(shift(@thread67)); | |||||
eval(shift(@thread2)); eval(shift(@thread67)); | |||||
eval(shift(@thread3)); eval(shift(@thread67)); | |||||
eval(shift(@thread4)); eval(shift(@thread67)); | |||||
eval(shift(@thread5)); eval(shift(@thread67)); | |||||
} | |||||
$code.=<<___; | |||||
cbnz $ctr,.Loop_lower_neon | |||||
add.32 @x[0],@x[0],@d[0] // accumulate key block | |||||
ldp @K[0],@K[1],[sp,#0] | |||||
add @x[1],@x[1],@d[0],lsr#32 | |||||
ldp @K[2],@K[3],[sp,#32] | |||||
add.32 @x[2],@x[2],@d[1] | |||||
ldp @K[4],@K[5],[sp,#64] | |||||
add @x[3],@x[3],@d[1],lsr#32 | |||||
add $A0,$A0,@K[0] | |||||
add.32 @x[4],@x[4],@d[2] | |||||
add $A1,$A1,@K[0] | |||||
add @x[5],@x[5],@d[2],lsr#32 | |||||
add $A2,$A2,@K[0] | |||||
add.32 @x[6],@x[6],@d[3] | |||||
add $A3,$A3,@K[0] | |||||
add @x[7],@x[7],@d[3],lsr#32 | |||||
add $A4,$A4,@K[0] | |||||
add.32 @x[8],@x[8],@d[4] | |||||
add $A5,$A5,@K[0] | |||||
add @x[9],@x[9],@d[4],lsr#32 | |||||
add $C0,$C0,@K[2] | |||||
add.32 @x[10],@x[10],@d[5] | |||||
add $C1,$C1,@K[2] | |||||
add @x[11],@x[11],@d[5],lsr#32 | |||||
add $C2,$C2,@K[2] | |||||
add.32 @x[12],@x[12],@d[6] | |||||
add $C3,$C3,@K[2] | |||||
add @x[13],@x[13],@d[6],lsr#32 | |||||
add $C4,$C4,@K[2] | |||||
add.32 @x[14],@x[14],@d[7] | |||||
add $C5,$C5,@K[2] | |||||
add @x[15],@x[15],@d[7],lsr#32 | |||||
add $D4,$D4,$ONE // +4 | |||||
add @x[0],@x[0],@x[1],lsl#32 // pack | |||||
add $D5,$D5,$ONE // +4 | |||||
add @x[2],@x[2],@x[3],lsl#32 | |||||
add $D0,$D0,@K[3] | |||||
ldp @x[1],@x[3],[$inp,#0] // load input | |||||
add $D1,$D1,@K[4] | |||||
add @x[4],@x[4],@x[5],lsl#32 | |||||
add $D2,$D2,@K[5] | |||||
add @x[6],@x[6],@x[7],lsl#32 | |||||
add $D3,$D3,@K[6] | |||||
ldp @x[5],@x[7],[$inp,#16] | |||||
add $D4,$D4,@K[3] | |||||
add @x[8],@x[8],@x[9],lsl#32 | |||||
add $D5,$D5,@K[4] | |||||
add @x[10],@x[10],@x[11],lsl#32 | |||||
add $B0,$B0,@K[1] | |||||
ldp @x[9],@x[11],[$inp,#32] | |||||
add $B1,$B1,@K[1] | |||||
add @x[12],@x[12],@x[13],lsl#32 | |||||
add $B2,$B2,@K[1] | |||||
add @x[14],@x[14],@x[15],lsl#32 | |||||
add $B3,$B3,@K[1] | |||||
ldp @x[13],@x[15],[$inp,#48] | |||||
add $B4,$B4,@K[1] | |||||
add $inp,$inp,#64 | |||||
add $B5,$B5,@K[1] | |||||
# ifdef __AARCH64EB__ | |||||
rev @x[0],@x[0] | |||||
rev @x[2],@x[2] | |||||
rev @x[4],@x[4] | |||||
rev @x[6],@x[6] | |||||
rev @x[8],@x[8] | |||||
rev @x[10],@x[10] | |||||
rev @x[12],@x[12] | |||||
rev @x[14],@x[14] | |||||
# endif | |||||
ld1.8 {$T0-$T3},[$inp],#64 | |||||
eor @x[0],@x[0],@x[1] | |||||
eor @x[2],@x[2],@x[3] | |||||
eor @x[4],@x[4],@x[5] | |||||
eor @x[6],@x[6],@x[7] | |||||
eor @x[8],@x[8],@x[9] | |||||
eor $A0,$A0,$T0 | |||||
eor @x[10],@x[10],@x[11] | |||||
eor $B0,$B0,$T1 | |||||
eor @x[12],@x[12],@x[13] | |||||
eor $C0,$C0,$T2 | |||||
eor @x[14],@x[14],@x[15] | |||||
eor $D0,$D0,$T3 | |||||
ld1.8 {$T0-$T3},[$inp],#64 | |||||
stp @x[0],@x[2],[$out,#0] // store output | |||||
add @d[6],@d[6],#7 // increment counter | |||||
stp @x[4],@x[6],[$out,#16] | |||||
stp @x[8],@x[10],[$out,#32] | |||||
stp @x[12],@x[14],[$out,#48] | |||||
add $out,$out,#64 | |||||
st1.8 {$A0-$D0},[$out],#64 | |||||
ld1.8 {$A0-$D0},[$inp],#64 | |||||
eor $A1,$A1,$T0 | |||||
eor $B1,$B1,$T1 | |||||
eor $C1,$C1,$T2 | |||||
eor $D1,$D1,$T3 | |||||
st1.8 {$A1-$D1},[$out],#64 | |||||
ld1.8 {$A1-$D1},[$inp],#64 | |||||
eor $A2,$A2,$A0 | |||||
ldp @K[0],@K[1],[sp,#0] | |||||
eor $B2,$B2,$B0 | |||||
ldp @K[2],@K[3],[sp,#32] | |||||
eor $C2,$C2,$C0 | |||||
eor $D2,$D2,$D0 | |||||
st1.8 {$A2-$D2},[$out],#64 | |||||
ld1.8 {$A2-$D2},[$inp],#64 | |||||
eor $A3,$A3,$A1 | |||||
eor $B3,$B3,$B1 | |||||
eor $C3,$C3,$C1 | |||||
eor $D3,$D3,$D1 | |||||
st1.8 {$A3-$D3},[$out],#64 | |||||
ld1.8 {$A3-$D3},[$inp],#64 | |||||
eor $A4,$A4,$A2 | |||||
eor $B4,$B4,$B2 | |||||
eor $C4,$C4,$C2 | |||||
eor $D4,$D4,$D2 | |||||
st1.8 {$A4-$D4},[$out],#64 | |||||
shl $A0,$ONE,#1 // 4 -> 8 | |||||
eor $A5,$A5,$A3 | |||||
eor $B5,$B5,$B3 | |||||
eor $C5,$C5,$C3 | |||||
eor $D5,$D5,$D3 | |||||
st1.8 {$A5-$D5},[$out],#64 | |||||
add @K[3],@K[3],$A0 // += 8 | |||||
add @K[4],@K[4],$A0 | |||||
add @K[5],@K[5],$A0 | |||||
add @K[6],@K[6],$A0 | |||||
b.hs .Loop_outer_512_neon | |||||
adds $len,$len,#512 | |||||
ushr $A0,$ONE,#2 // 4 -> 1 | |||||
ldp d8,d9,[sp,#128+0] // meet ABI requirements | |||||
ldp d10,d11,[sp,#128+16] | |||||
ldp d12,d13,[sp,#128+32] | |||||
ldp d14,d15,[sp,#128+48] | |||||
stp @K[0],$ONE,[sp,#0] // wipe off-load area | |||||
stp @K[0],$ONE,[sp,#32] | |||||
stp @K[0],$ONE,[sp,#64] | |||||
b.eq .Ldone_512_neon | |||||
cmp $len,#192 | |||||
sub @K[3],@K[3],$A0 // -= 1 | |||||
sub @K[4],@K[4],$A0 | |||||
sub @K[5],@K[5],$A0 | |||||
add sp,sp,#128 | |||||
b.hs .Loop_outer_neon | |||||
eor @K[1],@K[1],@K[1] | |||||
eor @K[2],@K[2],@K[2] | |||||
eor @K[3],@K[3],@K[3] | |||||
eor @K[4],@K[4],@K[4] | |||||
eor @K[5],@K[5],@K[5] | |||||
eor @K[6],@K[6],@K[6] | |||||
b .Loop_outer | |||||
.Ldone_512_neon: | |||||
ldp x19,x20,[x29,#16] | |||||
add sp,sp,#128+64 | |||||
ldp x21,x22,[x29,#32] | |||||
ldp x23,x24,[x29,#48] | |||||
ldp x25,x26,[x29,#64] | |||||
ldp x27,x28,[x29,#80] | |||||
ldp x29,x30,[sp],#96 | |||||
ret | |||||
.size ChaCha20_512_neon,.-ChaCha20_512_neon | |||||
#endif | |||||
#endif | |||||
___ | |||||
} | |||||
}}} | |||||
open SELF,$0; | |||||
while(<SELF>) { | |||||
next if (/^#!/); | |||||
last if (!s/^#/\/\// and !/^$/); | |||||
print; | |||||
} | |||||
close SELF; | |||||
foreach (split("\n",$code)) { | |||||
s/\`([^\`]*)\`/eval $1/geo; | |||||
(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or | |||||
(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or | |||||
(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or | |||||
(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or | |||||
(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); | |||||
print $_,"\n"; | |||||
} | |||||
close STDOUT; # flush |