Page MenuHomeFreeBSD

D8434.id72758.diff
No OneTemporary

D8434.id72758.diff

Index: head/lib/libmd/Makefile
===================================================================
--- head/lib/libmd/Makefile
+++ head/lib/libmd/Makefile
@@ -116,18 +116,15 @@
SRCS+= rmd160.S
CFLAGS+= -DRMD160_ASM
.endif
-.if exists(${MACHINE_ARCH}/skein_block_asm.s)
-.if defined(XAS) || ${MK_BINUTILS_BOOTSTRAP} != "no"
-AFLAGS += --strip-local-absolute
+.if exists(${MACHINE_ARCH}/skein_block_asm.S)
# Fully unroll all loops in the assembly optimized version
-AFLAGS+= --defsym SKEIN_LOOP=0 --defsym SKEIN_USE_ASM=1792
-SRCS+= skein_block_asm.s
+ACFLAGS+= -DSKEIN_LOOP=0
+SRCS+= skein_block_asm.S
CFLAGS+= -DSKEIN_ASM -DSKEIN_USE_ASM=1792 # list of block functions to replace with assembly: 256+512+1024 = 1792
.else
.warning as not available: not using optimized Skein asm
.endif
-.endif
-.if exists(${MACHINE_ARCH}/sha.S) || exists(${MACHINE_ARCH}/rmd160.S) || exists(${MACHINE_ARCH}/skein_block_asm.s)
+.if exists(${MACHINE_ARCH}/sha.S) || exists(${MACHINE_ARCH}/rmd160.S) || exists(${MACHINE_ARCH}/skein_block_asm.S)
ACFLAGS+= -DELF -Wa,--noexecstack
.endif
.endif # ${USE_ASM_SOURCES} != 0
Index: head/sys/crypto/skein/amd64/skein_block_asm.S
===================================================================
--- head/sys/crypto/skein/amd64/skein_block_asm.S
+++ head/sys/crypto/skein/amd64/skein_block_asm.S
@@ -0,0 +1,1333 @@
+#
+#----------------------------------------------------------------
+# 64-bit x86 assembler code (gnu as) for Skein block functions
+#
+# Author: Doug Whiting, Hifn/Exar
+#
+# This code is released to the public domain.
+#----------------------------------------------------------------
+# $FreeBSD$
+#
+ .text
+ .altmacro
+#ifndef __clang__
+ .psize 0,128 #list file has no page boundaries
+#endif
+#
+_MASK_ALL_ = (256+512+1024) #all three algorithm bits
+_MAX_FRAME_ = 240
+#
+#################
+#ifndef SKEIN_USE_ASM
+_USE_ASM_ = _MASK_ALL_
+#else
+_USE_ASM_ = SKEIN_USE_ASM
+#endif
+#################
+#configure loop unrolling
+#ifndef SKEIN_LOOP
+_SKEIN_LOOP = 2 #default is fully unrolled for 256/512, twice for 1024
+#else
+_SKEIN_LOOP = SKEIN_LOOP
+ .irp _NN_,%_SKEIN_LOOP #only display loop unrolling if default changed on command line
+#.print "+++ SKEIN_LOOP = \_NN_"
+ .endr
+#endif
+# the unroll counts (0 --> fully unrolled)
+SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10
+SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10
+SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10
+#
+SKEIN_ASM_UNROLL = 0
+ .irp _NN_,256,512,1024
+ .if (SKEIN_UNROLL_\_NN_) == 0
+SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + \_NN_
+ .endif
+ .endr
+#################
+#
+.ifndef SKEIN_ROUNDS
+ROUNDS_256 = 72
+ROUNDS_512 = 72
+ROUNDS_1024 = 80
+.else
+ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
+ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5)
+ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5)
+# only display rounds if default size is changed on command line
+.irp _NN_,256,512,1024
+ .if _USE_ASM_ && \_NN_
+ .irp _RR_,%(ROUNDS_\_NN_)
+ .if _NN_ < 1024
+.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
+ .else
+.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
+ .endif
+ .endr
+ .endif
+.endr
+.endif
+#################
+#
+.ifdef SKEIN_CODE_SIZE
+_SKEIN_CODE_SIZE = (1)
+.else
+.ifdef SKEIN_PERF #use code size if SKEIN_PERF is defined
+_SKEIN_CODE_SIZE = (1)
+.else
+_SKEIN_CODE_SIZE = (0)
+.endif
+.endif
+#
+#################
+#
+.ifndef SKEIN_DEBUG
+_SKEIN_DEBUG = 0
+.else
+_SKEIN_DEBUG = 1
+.endif
+#################
+#
+# define offsets of fields in hash context structure
+#
+HASH_BITS = 0 #bits of hash output
+BCNT = 8 + HASH_BITS #number of bytes in BUFFER[]
+TWEAK = 8 + BCNT #tweak values[0..1]
+X_VARS = 16 + TWEAK #chaining vars
+#
+#(Note: buffer[] in context structure is NOT needed here :-)
+#
+KW_PARITY = 0x1BD11BDAA9FC1A22 #overall parity of key schedule words
+FIRST_MASK = ~ (1 << 6)
+FIRST_MASK64= ~ (1 << 62)
+#
+# rotation constants for Skein
+#
+RC_256_0_0 = 14
+RC_256_0_1 = 16
+
+RC_256_1_0 = 52
+RC_256_1_1 = 57
+
+RC_256_2_0 = 23
+RC_256_2_1 = 40
+
+RC_256_3_0 = 5
+RC_256_3_1 = 37
+
+RC_256_4_0 = 25
+RC_256_4_1 = 33
+
+RC_256_5_0 = 46
+RC_256_5_1 = 12
+
+RC_256_6_0 = 58
+RC_256_6_1 = 22
+
+RC_256_7_0 = 32
+RC_256_7_1 = 32
+
+RC_512_0_0 = 46
+RC_512_0_1 = 36
+RC_512_0_2 = 19
+RC_512_0_3 = 37
+
+RC_512_1_0 = 33
+RC_512_1_1 = 27
+RC_512_1_2 = 14
+RC_512_1_3 = 42
+
+RC_512_2_0 = 17
+RC_512_2_1 = 49
+RC_512_2_2 = 36
+RC_512_2_3 = 39
+
+RC_512_3_0 = 44
+RC_512_3_1 = 9
+RC_512_3_2 = 54
+RC_512_3_3 = 56
+
+RC_512_4_0 = 39
+RC_512_4_1 = 30
+RC_512_4_2 = 34
+RC_512_4_3 = 24
+
+RC_512_5_0 = 13
+RC_512_5_1 = 50
+RC_512_5_2 = 10
+RC_512_5_3 = 17
+
+RC_512_6_0 = 25
+RC_512_6_1 = 29
+RC_512_6_2 = 39
+RC_512_6_3 = 43
+
+RC_512_7_0 = 8
+RC_512_7_1 = 35
+RC_512_7_2 = 56
+RC_512_7_3 = 22
+
+RC_1024_0_0 = 24
+RC_1024_0_1 = 13
+RC_1024_0_2 = 8
+RC_1024_0_3 = 47
+RC_1024_0_4 = 8
+RC_1024_0_5 = 17
+RC_1024_0_6 = 22
+RC_1024_0_7 = 37
+
+RC_1024_1_0 = 38
+RC_1024_1_1 = 19
+RC_1024_1_2 = 10
+RC_1024_1_3 = 55
+RC_1024_1_4 = 49
+RC_1024_1_5 = 18
+RC_1024_1_6 = 23
+RC_1024_1_7 = 52
+
+RC_1024_2_0 = 33
+RC_1024_2_1 = 4
+RC_1024_2_2 = 51
+RC_1024_2_3 = 13
+RC_1024_2_4 = 34
+RC_1024_2_5 = 41
+RC_1024_2_6 = 59
+RC_1024_2_7 = 17
+
+RC_1024_3_0 = 5
+RC_1024_3_1 = 20
+RC_1024_3_2 = 48
+RC_1024_3_3 = 41
+RC_1024_3_4 = 47
+RC_1024_3_5 = 28
+RC_1024_3_6 = 16
+RC_1024_3_7 = 25
+
+RC_1024_4_0 = 41
+RC_1024_4_1 = 9
+RC_1024_4_2 = 37
+RC_1024_4_3 = 31
+RC_1024_4_4 = 12
+RC_1024_4_5 = 47
+RC_1024_4_6 = 44
+RC_1024_4_7 = 30
+
+RC_1024_5_0 = 16
+RC_1024_5_1 = 34
+RC_1024_5_2 = 56
+RC_1024_5_3 = 51
+RC_1024_5_4 = 4
+RC_1024_5_5 = 53
+RC_1024_5_6 = 42
+RC_1024_5_7 = 41
+
+RC_1024_6_0 = 31
+RC_1024_6_1 = 44
+RC_1024_6_2 = 47
+RC_1024_6_3 = 46
+RC_1024_6_4 = 19
+RC_1024_6_5 = 42
+RC_1024_6_6 = 44
+RC_1024_6_7 = 25
+
+RC_1024_7_0 = 9
+RC_1024_7_1 = 48
+RC_1024_7_2 = 35
+RC_1024_7_3 = 52
+RC_1024_7_4 = 23
+RC_1024_7_5 = 31
+RC_1024_7_6 = 37
+RC_1024_7_7 = 20
+#
+# Input: reg
+# Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
+#
+.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM
+ .if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM #is there anything to do?
+ rolq $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg
+ .endif
+.endm
+#
+#----------------------------------------------------------------
+#
+# MACROS: define local vars and configure stack
+#
+#----------------------------------------------------------------
+# declare allocated space on the stack
+.macro StackVar localName,localSize
+\localName = _STK_OFFS_
+_STK_OFFS_ = _STK_OFFS_+(\localSize)
+.endm #StackVar
+#
+#----------------------------------------------------------------
+#
+# MACRO: Configure stack frame, allocate local vars
+#
+.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
+ WCNT = (\BLK_BITS)/64
+#
+_PushCnt_ = 0 #save nonvolatile regs on stack
+ .irp _reg_,rbp,rbx,r12,r13,r14,r15
+ pushq %\_reg_
+_PushCnt_ = _PushCnt_ + 1 #track count to keep alignment
+ .endr
+#
+_STK_OFFS_ = 0 #starting offset from rsp
+ #---- local variables #<-- rsp
+ StackVar X_stk ,8*(WCNT) #local context vars
+ StackVar ksTwk ,8*3 #key schedule: tweak words
+ StackVar ksKey ,8*(WCNT)+8 #key schedule: key words
+ .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0
+ StackVar ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
+ .endif
+ StackVar Wcopy ,8*(WCNT) #copy of input block
+ .if _SKEIN_DEBUG
+ .if \debugCnt + 0 #temp location for debug X[] info
+ StackVar xDebug_\BLK_BITS ,8*(\debugCnt)
+ .endif
+ .endif
+ .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
+ StackVar align16,8 #keep 16-byte aligned (adjust for retAddr?)
+tmpStk_\BLK_BITS = align16 #use this
+ .endif
+ #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
+ StackVar ctxPtr ,8 #context ptr
+ StackVar blkPtr ,8 #pointer to block data
+ StackVar blkCnt ,8 #number of full blocks to process
+ StackVar bitAdd ,8 #bit count to add to tweak
+LOCAL_SIZE = _STK_OFFS_ #size of "local" vars
+ #----
+ StackVar savRegs,8*_PushCnt_ #saved registers
+ StackVar retAddr,8 #return address
+ #---- caller's stack frame (aligned mod 16)
+#
+# set up the stack frame pointer (rbp)
+#
+FRAME_OFFS = ksTwk + 128 #allow short (negative) offset to ksTwk, kwKey
+ .if FRAME_OFFS > _STK_OFFS_ #keep rbp in the "locals" range
+FRAME_OFFS = _STK_OFFS_
+ .endif
+F_O = -FRAME_OFFS
+#
+ #put some useful defines in the .lst file (for grep)
+__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
+__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
+__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
+#
+# Notes on stack frame setup:
+# * the most frequently used variable is X_stk[], based at [rsp+0]
+# * the next most used is the key schedule arrays, ksKey and ksTwk
+# so rbp is "centered" there, allowing short offsets to the key
+# schedule even in 1024-bit Skein case
+# * the Wcopy variables are infrequently accessed, but they have long
+# offsets from both rsp and rbp only in the 1024-bit case.
+# * all other local vars and calling parameters can be accessed
+# with short offsets, except in the 1024-bit case
+#
+ subq $LOCAL_SIZE,%rsp #make room for the locals
+ leaq FRAME_OFFS(%rsp),%rbp #maximize use of short offsets
+ movq %rdi, ctxPtr+F_O(%rbp) #save caller's parameters on the stack
+ movq %rsi, blkPtr+F_O(%rbp)
+ movq %rdx, blkCnt+F_O(%rbp)
+ movq %rcx, bitAdd+F_O(%rbp)
+#
+.endm #Setup_Stack
+#
+#----------------------------------------------------------------
+#
+.macro Reset_Stack
+ addq $LOCAL_SIZE,%rsp #get rid of locals (wipe?)
+ .irp _reg_,r15,r14,r13,r12,rbx,rbp
+ popq %\_reg_ #restore caller's regs
+_PushCnt_ = _PushCnt_ - 1
+ .endr
+ .if _PushCnt_
+ .error "Mismatched push/pops?"
+ .endif
+.endm # Reset_Stack
+#
+#----------------------------------------------------------------
+# macros to help debug internals
+#
+.if _SKEIN_DEBUG
+ .extern Skein_Show_Block #calls to C routines
+ .extern Skein_Show_Round
+#
+SKEIN_RND_SPECIAL = 1000
+SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0
+SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1
+SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2
+#
+.macro Skein_Debug_Block BLK_BITS
+#
+#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
+# const u08b_t *blkPtr, const u64b_t *wPtr,
+# const u64b_t *ksPtr,const u64b_t *tsPtr)
+#
+_NN_ = 0
+ .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
+ pushq %\_reg_ #save all volatile regs on tack before the call
+_NN_ = _NN_ + 1
+ .endr
+ # get and push call parameters
+ movq $\BLK_BITS ,%rdi #bits
+ movq ctxPtr+F_O(%rbp),%rsi #h (pointer)
+ leaq X_VARS (%rsi),%rdx #X (pointer)
+ movq blkPtr+F_O(%rbp),%rcx #blkPtr
+ leaq Wcopy +F_O(%rbp),%r8 #wPtr
+ leaq ksKey +F_O(%rbp),%r9 #key pointer
+ leaq ksTwk +F_O(%rbp),%rax #tweak pointer
+ pushq %rax # (pass on the stack)
+ call Skein_Show_Block #call external debug handler
+ addq $8*1,%rsp #discard parameters on stack
+ .if (_NN_ % 2 ) == 0 #check stack alignment
+ .error "Stack misalignment problem in Skein_Debug_Block_\_BLK_BITS"
+ .endif
+ .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
+ popq %\_reg_ #restore regs
+_NN_ = _NN_ - 1
+ .endr
+ .if _NN_
+ .error "Push/pop mismatch problem in Skein_Debug_Block_\_BLK_BITS"
+ .endif
+.endm # Skein_Debug_Block
+#
+# the macro to "call" to debug a round
+#
+.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
+ # call the appropriate (local) debug "function"
+ pushq %rdx #save rdx, so we can use it for round "number"
+ .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
+ movq $\R,%rdx
+ .else #compute round number using edi
+_rOffs_ = \RDI_OFFS + 0
+ .if \BLK_BITS == 1024
+ movq rIdx_offs+8(%rsp),%rdx #get rIdx off the stack (adjust for pushq rdx above)
+ leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx
+ .else
+ leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx
+ .endif
+ .endif
+ call Skein_Debug_Round_\BLK_BITS
+ popq %rdx #restore origianl rdx value
+#
+ afterOp
+.endm # Skein_Debug_Round
+.else #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
+.macro Skein_Debug_Block BLK_BITS
+.endm
+#
+.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
+.endm
+#
+.endif # _SKEIN_DEBUG
+#
+#----------------------------------------------------------------
+#
+.macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
+ .if \immOffs + 0
+ leaq \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
+ .elseif ((\useAddOp + 0) == 0)
+ .ifndef ASM_NO_LEA #lea seems to be faster on Core 2 Duo CPUs!
+ leaq (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
+ .else
+ addq %\srcReg_A\srcReg_B,%\dstReg
+ .endif
+ .else
+ addq %\srcReg_A\srcReg_B,%\dstReg
+ .endif
+.endm
+
+# keep Intel-style ordering here, to match addReg
+.macro xorReg dstReg,srcReg_A,srcReg_B
+ xorq %\srcReg_A\srcReg_B,%\dstReg
+.endm
+#
+#----------------------------------------------------------------
+#
+.macro C_label lName
+ \lName: #use both "genders" to work across linkage conventions
+_\lName:
+ .global \lName
+ .global _\lName
+.endm
+#
+#=================================== Skein_256 =============================================
+#
+.if _USE_ASM_ & 256
+#
+# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
+#
+#################
+#
+# code
+#
+C_label Skein_256_Process_Block
+ Setup_Stack 256,((ROUNDS_256/8)+1)
+ movq TWEAK+8(%rdi),%r14
+ jmp Skein_256_block_loop
+ .p2align 4
+ # main hash loop for Skein_256
+Skein_256_block_loop:
+ #
+ # general register usage:
+ # RAX..RDX = X0..X3
+ # R08..R12 = ks[0..4]
+ # R13..R15 = ts[0..2]
+ # RSP, RBP = stack/frame pointers
+ # RDI = round counter or context pointer
+ # RSI = temp
+ #
+ movq TWEAK+0(%rdi) ,%r13
+ addq bitAdd+F_O(%rbp) ,%r13 #computed updated tweak value T0
+ movq %r14 ,%r15
+ xorq %r13 ,%r15 #now %r13.%r15 is set as the tweak
+
+ movq $KW_PARITY ,%r12
+ movq X_VARS+ 0(%rdi),%r8
+ movq X_VARS+ 8(%rdi),%r9
+ movq X_VARS+16(%rdi),%r10
+ movq X_VARS+24(%rdi),%r11
+ movq %r13,TWEAK+0(%rdi) #save updated tweak value ctx->h.T[0]
+ xorq %r8 ,%r12 #start accumulating overall parity
+
+ movq blkPtr +F_O(%rbp) ,%rsi #esi --> input block
+ xorq %r9 ,%r12
+ movq 0(%rsi) ,%rax #get X[0..3]
+ xorq %r10 ,%r12
+ movq 8(%rsi) ,%rbx
+ xorq %r11 ,%r12
+ movq 16(%rsi) ,%rcx
+ movq 24(%rsi) ,%rdx
+
+ movq %rax,Wcopy+ 0+F_O(%rbp) #save copy of input block
+ movq %rbx,Wcopy+ 8+F_O(%rbp)
+ movq %rcx,Wcopy+16+F_O(%rbp)
+ movq %rdx,Wcopy+24+F_O(%rbp)
+
+ addq %r8 ,%rax #initial key injection
+ addq %r9 ,%rbx
+ addq %r10,%rcx
+ addq %r11,%rdx
+ addq %r13,%rbx
+ addq %r14,%rcx
+
+.if _SKEIN_DEBUG
+ movq %r14,TWEAK+ 8(%rdi) #save updated tweak T[1] (start bit cleared?)
+ movq %r8 ,ksKey+ 0+F_O(%rbp) #save key schedule on stack for Skein_Debug_Block
+ movq %r9 ,ksKey+ 8+F_O(%rbp)
+ movq %r10,ksKey+16+F_O(%rbp)
+ movq %r11,ksKey+24+F_O(%rbp)
+ movq %r12,ksKey+32+F_O(%rbp)
+
+ movq %r13,ksTwk+ 0+F_O(%rbp)
+ movq %r14,ksTwk+ 8+F_O(%rbp)
+ movq %r15,ksTwk+16+F_O(%rbp)
+
+ movq %rax,X_stk + 0(%rsp) #save X[] on stack for Skein_Debug_Block
+ movq %rbx,X_stk + 8(%rsp)
+ movq %rcx,X_stk +16(%rsp)
+ movq %rdx,X_stk +24(%rsp)
+
+ Skein_Debug_Block 256 #debug dump
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
+.endif
+#
+.if ((SKEIN_ASM_UNROLL & 256) == 0)
+ movq %r8 ,ksKey+40+F_O(%rbp) #save key schedule on stack for looping code
+ movq %r9 ,ksKey+ 8+F_O(%rbp)
+ movq %r10,ksKey+16+F_O(%rbp)
+ movq %r11,ksKey+24+F_O(%rbp)
+ movq %r12,ksKey+32+F_O(%rbp)
+
+ movq %r13,ksTwk+24+F_O(%rbp)
+ movq %r14,ksTwk+ 8+F_O(%rbp)
+ movq %r15,ksTwk+16+F_O(%rbp)
+.endif
+ addq $WCNT*8,%rsi #skip the block
+ movq %rsi,blkPtr +F_O(%rbp) #update block pointer
+ #
+ # now the key schedule is computed. Start the rounds
+ #
+.if SKEIN_ASM_UNROLL & 256
+_UNROLL_CNT = ROUNDS_256/8
+.else
+_UNROLL_CNT = SKEIN_UNROLL_256
+ .if ((ROUNDS_256/8) % _UNROLL_CNT)
+ .error "Invalid SKEIN_UNROLL_256"
+ .endif
+ xorq %rdi,%rdi #rdi = iteration count
+Skein_256_round_loop:
+.endif
+_Rbase_ = 0
+.rept _UNROLL_CNT*2
+ # all X and ks vars in regs # (ops to "rotate" ks vars, via mem, if not unrolled)
+ # round 4*_RBase_ + 0
+ addReg rax, rbx
+ RotL64 rbx, 256,%((4*_Rbase_+0) % 8),0
+ addReg rcx, rdx
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
+ .endif
+ xorReg rbx, rax
+ RotL64 rdx, 256,%((4*_Rbase_+0) % 8),1
+ xorReg rdx, rcx
+ .if SKEIN_ASM_UNROLL & 256
+ .irp _r0_,%( 8+(_Rbase_+3) % 5)
+ .irp _r1_,%(13+(_Rbase_+2) % 3)
+ leaq (%r\_r0_,%r\_r1_),%rdi #precompute key injection value for %rcx
+ .endr
+ .endr
+ .endif
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
+ .endif
+ Skein_Debug_Round 256,%(4*_Rbase_+1)
+
+ # round 4*_Rbase_ + 1
+ addReg rax, rdx
+ RotL64 rdx, 256,%((4*_Rbase_+1) % 8),0
+ xorReg rdx, rax
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
+ .endif
+ addReg rcx, rbx
+ RotL64 rbx, 256,%((4*_Rbase_+1) % 8),1
+ xorReg rbx, rcx
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
+ .endif
+ Skein_Debug_Round 256,%(4*_Rbase_+2)
+ .if SKEIN_ASM_UNROLL & 256
+ .irp _r0_,%( 8+(_Rbase_+2) % 5)
+ .irp _r1_,%(13+(_Rbase_+1) % 3)
+ leaq (%r\_r0_,%r\_r1_),%rsi #precompute key injection value for %rbx
+ .endr
+ .endr
+ .endif
+ # round 4*_Rbase_ + 2
+ addReg rax, rbx
+ RotL64 rbx, 256,%((4*_Rbase_+2) % 8),0
+ addReg rcx, rdx
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
+ .endif
+ xorReg rbx, rax
+ RotL64 rdx, 256,%((4*_Rbase_+2) % 8),1
+ xorReg rdx, rcx
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8) #"rotate" the key
+ leaq 1(%r11,%rdi),%r11 #precompute key + tweak
+ .endif
+ Skein_Debug_Round 256,%(4*_Rbase_+3)
+ # round 4*_Rbase_ + 3
+ addReg rax, rdx
+ RotL64 rdx, 256,%((4*_Rbase_+3) % 8),0
+ addReg rcx, rbx
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ addq ksTwk+8*2+F_O(%rbp,%rdi,8),%r10 #precompute key + tweak
+ movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8) #"rotate" the tweak
+ .endif
+ xorReg rdx, rax
+ RotL64 rbx, 256,%((4*_Rbase_+3) % 8),1
+ xorReg rbx, rcx
+ Skein_Debug_Round 256,%(4*_Rbase_+4)
+ .if (SKEIN_ASM_UNROLL & 256) == 0
+ addReg r9 ,r13 #precompute key+tweak
+ .endif
+ #inject key schedule words
+_Rbase_ = _Rbase_+1
+ .if SKEIN_ASM_UNROLL & 256
+ addReg rax,r,%(8+((_Rbase_+0) % 5))
+ addReg rbx,rsi
+ addReg rcx,rdi
+ addReg rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
+ .else
+ incq %rdi
+ addReg rax,r8
+ addReg rcx,r10
+ addReg rbx,r9
+ addReg rdx,r11
+ .endif
+ Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
+.endr #rept _UNROLL_CNT
+#
+.if (SKEIN_ASM_UNROLL & 256) == 0
+ cmpq $2*(ROUNDS_256/8),%rdi
+ jb Skein_256_round_loop
+.endif # (SKEIN_ASM_UNROLL & 256) == 0
+ movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context
+
+ #----------------------------
+ # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
+ movq $FIRST_MASK64 ,%r14
+ xorq Wcopy + 0+F_O (%rbp),%rax
+ xorq Wcopy + 8+F_O (%rbp),%rbx
+ xorq Wcopy +16+F_O (%rbp),%rcx
+ xorq Wcopy +24+F_O (%rbp),%rdx
+ andq TWEAK + 8 (%rdi),%r14
+ movq %rax,X_VARS+ 0(%rdi) #store final result
+ movq %rbx,X_VARS+ 8(%rdi)
+ movq %rcx,X_VARS+16(%rdi)
+ movq %rdx,X_VARS+24(%rdi)
+
+ Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
+
+ # go back for more blocks, if needed
+ decq blkCnt+F_O(%rbp)
+ jnz Skein_256_block_loop
+ movq %r14,TWEAK + 8(%rdi)
+ Reset_Stack
+ ret
+Skein_256_Process_Block_End:
+
+ .if _SKEIN_DEBUG
+Skein_Debug_Round_256: #here with rdx == round "number" from macro
+ pushq %rsi #save two regs for BLK_BITS-specific parms
+ pushq %rdi
+ movq 24(%rsp),%rdi #get back original rdx (pushed on stack in macro call) to rdi
+ movq %rax,X_stk+ 0+F_O(%rbp) #save X[] state on stack so debug routines can access it
+ movq %rbx,X_stk+ 8+F_O(%rbp) #(use FP_ since rsp has changed!)
+ movq %rcx,X_stk+16+F_O(%rbp)
+ movq %rdi,X_stk+24+F_O(%rbp)
+
+ movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr
+ movq $256,%rdi #now <rdi,rsi,rdx> are set for the call
+ jmp Skein_Debug_Round_Common
+ .endif
+#
+.if _SKEIN_CODE_SIZE
+C_label Skein_256_Process_Block_CodeSize
+ movq $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
+ ret
+#
+C_label Skein_256_Unroll_Cnt
+ .if _UNROLL_CNT <> ROUNDS_256/8
+ movq $_UNROLL_CNT,%rax
+ .else
+ xorq %rax,%rax
+ .endif
+ ret
+.endif
+#
+.endif #_USE_ASM_ & 256
+#
+#=================================== Skein_512 =============================================
+#
+.if _USE_ASM_ & 512
+#
+# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
+#
+# X[i] == %r[8+i] #register assignments for X[] values during rounds (i=0..7)
+#
+#################
+# MACRO: one round for 512-bit blocks
+#
+.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
+#
+ addReg r\rn0, r\rn1
+ RotL64 r\rn1, 512,%((\_Rn_) % 8),0
+ xorReg r\rn1, r\rn0
+ \op1
+ addReg r\rn2, r\rn3
+ RotL64 r\rn3, 512,%((\_Rn_) % 8),1
+ xorReg r\rn3, r\rn2
+ \op2
+ addReg r\rn4, r\rn5
+ RotL64 r\rn5, 512,%((\_Rn_) % 8),2
+ xorReg r\rn5, r\rn4
+ \op3
+ addReg r\rn6, r\rn7
+ RotL64 r\rn7, 512,%((\_Rn_) % 8),3
+ xorReg r\rn7, r\rn6
+ \op4
+ Skein_Debug_Round 512,%(\_Rn_+1),-4
+#
+.endm #R_512_OneRound
+#
+#################
+# MACRO: eight rounds for 512-bit blocks
+#
+.macro R_512_FourRounds _RR_ #RR = base round number (0 % 8)
+ .if (SKEIN_ASM_UNROLL && 512)
+ # here for fully unrolled case.
+ _II_ = ((\_RR_)/4) + 1 #key injection counter
+ R_512_OneRound 8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
+ R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
+ R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
+ R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
+ # inject the key schedule
+ addq ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
+ addReg r11, rax
+ addq ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
+ addReg r12, rbx
+ addq ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
+ addReg r13, rcx
+ addReg r14, rdx
+ addReg r15, rsi,,,(_II_)
+ .else
+ # here for looping case #"rotate" key/tweak schedule (move up on stack)
+ incq %rdi #bump key injection counter
+ R_512_OneRound 8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
+ R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8) >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
+ R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
+ R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
+ # inject the key schedule
+ addq ksKey+8*0+F_O(%rbp,%rdi,8),%r8
+ addReg r11, rax
+ addReg r12, rbx
+ addq ksKey+8*1+F_O(%rbp,%rdi,8),%r9
+ addReg r13, rcx
+ addReg r14, rdx
+ addq ksKey+8*2+F_O(%rbp,%rdi,8),%r10
+ addReg r15, rsi
+ addReg r15, rdi #inject the round number
+ .endif
+
+ #show the result of the key injection
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
+.endm #R_512_EightRounds
+#
+#################
+# instantiated code
+#
+C_label Skein_512_Process_Block
+ Setup_Stack 512,ROUNDS_512/8
+ movq TWEAK+ 8(%rdi),%rbx
+ jmp Skein_512_block_loop
+ .p2align 4
+ # main hash loop for Skein_512
+Skein_512_block_loop:
+ # general register usage:
+ # RAX..RDX = temps for key schedule pre-loads
+ # R8 ..R15 = X0..X7
+ # RSP, RBP = stack/frame pointers
+ # RDI = round counter or context pointer
+ # RSI = temp
+ #
+ movq TWEAK + 0(%rdi),%rax
+ addq bitAdd+F_O(%rbp),%rax #computed updated tweak value T0
+ movq %rbx,%rcx
+ xorq %rax,%rcx #%rax/%rbx/%rcx = tweak schedule
+ movq %rax,TWEAK+ 0 (%rdi) #save updated tweak value ctx->h.T[0]
+ movq %rax,ksTwk+ 0+F_O(%rbp)
+ movq $KW_PARITY,%rdx
+ movq blkPtr +F_O(%rbp),%rsi #%rsi --> input block
+ movq %rbx,ksTwk+ 8+F_O(%rbp)
+ movq %rcx,ksTwk+16+F_O(%rbp)
+ .irp _Rn_,8,9,10,11,12,13,14,15
+ movq X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_
+ xorq %r\_Rn_,%rdx #compute overall parity
+ movq %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp)
+ .endr #load state into %r8 ..%r15, compute parity
+ movq %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
+
+ addReg r13,rax #precompute key injection for tweak
+ addReg r14, rbx
+.if _SKEIN_DEBUG
+ movq %rbx,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
+.endif
+ movq 0(%rsi),%rax #load input block
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rcx
+ movq 24(%rsi),%rdx
+ addReg r8 , rax #do initial key injection
+ addReg r9 , rbx
+ movq %rax,Wcopy+ 0+F_O(%rbp) #keep local copy for feedforward
+ movq %rbx,Wcopy+ 8+F_O(%rbp)
+ addReg r10, rcx
+ addReg r11, rdx
+ movq %rcx,Wcopy+16+F_O(%rbp)
+ movq %rdx,Wcopy+24+F_O(%rbp)
+
+ movq 32(%rsi),%rax
+ movq 40(%rsi),%rbx
+ movq 48(%rsi),%rcx
+ movq 56(%rsi),%rdx
+ addReg r12, rax
+ addReg r13, rbx
+ addReg r14, rcx
+ addReg r15, rdx
+ movq %rax,Wcopy+32+F_O(%rbp)
+ movq %rbx,Wcopy+40+F_O(%rbp)
+ movq %rcx,Wcopy+48+F_O(%rbp)
+ movq %rdx,Wcopy+56+F_O(%rbp)
+
+.if _SKEIN_DEBUG
+ .irp _Rn_,8,9,10,11,12,13,14,15 #save values on stack for debug output
+ movq %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp)
+ .endr
+
+ Skein_Debug_Block 512 #debug dump
+ Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
+.endif
+ addq $8*WCNT,%rsi #skip the block
+ movq %rsi,blkPtr+F_O(%rbp) #update block pointer
+ #
+ #################
+ # now the key schedule is computed. Start the rounds
+ #
+.if SKEIN_ASM_UNROLL & 512
+_UNROLL_CNT = ROUNDS_512/8
+.else
+_UNROLL_CNT = SKEIN_UNROLL_512
+ .if ((ROUNDS_512/8) % _UNROLL_CNT)
+ .error "Invalid SKEIN_UNROLL_512"
+ .endif
+ xorq %rdi,%rdi #rdi = round counter
+Skein_512_round_loop:
+.endif
+#
+_Rbase_ = 0
+.rept _UNROLL_CNT*2
+ R_512_FourRounds %(4*_Rbase_+00)
+_Rbase_ = _Rbase_+1
+.endr #rept _UNROLL_CNT
+#
+.if (SKEIN_ASM_UNROLL & 512) == 0
+ cmpq $2*(ROUNDS_512/8),%rdi
+ jb Skein_512_round_loop
+ movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context
+.endif
+ # end of rounds
+ #################
+ # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7}
+ .irp _Rn_,8,9,10,11,12,13,14,15
+ .if (\_Rn_ == 8)
+ movq $FIRST_MASK64,%rbx
+ .endif
+ xorq Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR
+ movq %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi) #and store result
+ .if (\_Rn_ == 14)
+ andq TWEAK+ 8(%rdi),%rbx
+ .endif
+ .endr
+ Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
+
+ # go back for more blocks, if needed
+ decq blkCnt+F_O(%rbp)
+ jnz Skein_512_block_loop
+ movq %rbx,TWEAK + 8(%rdi)
+
+ Reset_Stack
+ ret
+Skein_512_Process_Block_End:
+#
+ .if _SKEIN_DEBUG
+# call here with rdx = "round number"
+Skein_Debug_Round_512:
+ pushq %rsi #save two regs for BLK_BITS-specific parms
+ pushq %rdi
+ .irp _Rn_,8,9,10,11,12,13,14,15 #save X[] state on stack so debug routines can access it
+ movq %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp)
+ .endr
+ movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr
+ movq $512,%rdi #now <rdi,rsi,rdx> are set for the call
+ jmp Skein_Debug_Round_Common
+ .endif
+#
+.if _SKEIN_CODE_SIZE
+C_label Skein_512_Process_Block_CodeSize
+ movq $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
+ ret
+#
+C_label Skein_512_Unroll_Cnt
+ .if _UNROLL_CNT <> (ROUNDS_512/8)
+ movq $_UNROLL_CNT,%rax
+ .else
+ xorq %rax,%rax
+ .endif
+ ret
+.endif
+#
+.endif # _USE_ASM_ & 512
+#
+#=================================== Skein1024 =============================================
+.if _USE_ASM_ & 1024
+#
+# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
+#
+#################
+# use details of permutation to make register assignments
+#
+o1K_rdi = 0 #offsets in X[] associated with each register
+o1K_rsi = 1
+o1K_rbp = 2
+o1K_rax = 3
+o1K_rcx = 4 #rcx is "shared" with X6, since X4/X6 alternate
+o1K_rbx = 5
+o1K_rdx = 7
+o1K_r8 = 8
+o1K_r9 = 9
+o1K_r10 = 10
+o1K_r11 = 11
+o1K_r12 = 12
+o1K_r13 = 13
+o1K_r14 = 14
+o1K_r15 = 15
+#
+rIdx_offs = tmpStk_1024
+#
+.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
+ addReg \reg0 , \reg1 #perform the MIX
+ RotL64 \reg1 , 1024,%((\_RN0_) % 8),\_Rn1_
+ xorReg \reg1 , \reg0
+.if ((\_RN0_) && 3) == 3 #time to do key injection?
+ .if _SKEIN_DEBUG
+ movq %\reg0 , xDebug_1024+8*\w0(%rsp) #save intermediate values for Debug_Round
+ movq %\reg1 , xDebug_1024+8*\w1(%rsp) # (before inline key injection)
+ .endif
+_II_ = ((\_RN0_)/4)+1 #injection count
+ .if SKEIN_ASM_UNROLL && 1024 #here to do fully unrolled key injection
+ addq ksKey+ 8*((_II_+\w0) % 17)(%rsp),%\reg0
+ addq ksKey+ 8*((_II_+\w1) % 17)(%rsp),%\reg1
+ .if \w1 == 13 #tweak injection
+ addq ksTwk+ 8*((_II_+ 0) % 3)(%rsp),%\reg1
+ .elseif \w0 == 14
+ addq ksTwk+ 8*((_II_+ 1) % 3)(%rsp),%\reg0
+ .elseif \w1 == 15
+ addq $_II_, %\reg1 #(injection counter)
+ .endif
+ .else #here to do looping key injection
+ .if (\w0 == 0)
+ movq %rdi, X_stk+8*\w0(%rsp) #if so, store N0 so we can use reg as index
+ movq rIdx_offs(%rsp),%rdi #get the injection counter index into rdi
+ .else
+ addq ksKey+8+8*\w0(%rsp,%rdi,8),%\reg0 #even key injection
+ .endif
+ .if \w1 == 13 #tweak injection
+ addq ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
+ .elseif \w0 == 14
+ addq ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
+ .elseif \w1 == 15
+ addReg \reg1,rdi,,,1 #(injection counter)
+ .endif
+ addq ksKey+8+8*\w1(%rsp,%rdi,8),%\reg1 #odd key injection
+ .endif
+.endif
+ # insert the op provided, .if any
+ \op1
+.endm
+#################
+# MACRO: four rounds for 1024-bit blocks
+#
+.macro r1024_FourRounds _RR_ #RR = base round number (0 mod 4)
+ # should be here with X4 set properly, X6 stored on stack
+_Rn_ = (\_RR_) + 0
+ r1024_Mix 0, 1,rdi,rsi,_Rn_,0
+ r1024_Mix 2, 3,rbp,rax,_Rn_,1
+ r1024_Mix 4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
+ r1024_Mix 8, 9,r8 ,r9 ,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack
+ r1024_Mix 10,11,r10,r11,_Rn_,5
+ r1024_Mix 12,13,r12,r13,_Rn_,6
+ r1024_Mix 6, 7,rcx,rdx,_Rn_,3
+ r1024_Mix 14,15,r14,r15,_Rn_,7
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,%(_Rn_+1)
+ .endif
+_Rn_ = (\_RR_) + 1
+ r1024_Mix 0, 9,rdi,r9 ,_Rn_,0
+ r1024_Mix 2,13,rbp,r13,_Rn_,1
+ r1024_Mix 6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
+ r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack
+ r1024_Mix 12, 3,r12,rax,_Rn_,5
+ r1024_Mix 14, 5,r14,rbx,_Rn_,6
+ r1024_Mix 4,15,rcx,r15,_Rn_,3
+ r1024_Mix 8, 1,r8 ,rsi,_Rn_,7
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,%(_Rn_+1)
+ .endif
+_Rn_ = (\_RR_) + 2
+ r1024_Mix 0, 7,rdi,rdx,_Rn_,0
+ r1024_Mix 2, 5,rbp,rbx,_Rn_,1
+ r1024_Mix 4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
+ r1024_Mix 12,15,r12,r15,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack
+ r1024_Mix 14,13,r14,r13,_Rn_,5
+ r1024_Mix 8,11,r8 ,r11,_Rn_,6
+ r1024_Mix 6, 1,rcx,rsi,_Rn_,3
+ r1024_Mix 10, 9,r10,r9 ,_Rn_,7
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,%(_Rn_+1)
+ .endif
+_Rn_ = (\_RR_) + 3
+ r1024_Mix 0,15,rdi,r15,_Rn_,0
+ r1024_Mix 2,11,rbp,r11,_Rn_,1
+ r1024_Mix 6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
+ r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack
+ r1024_Mix 8, 5,r8 ,rbx,_Rn_,5
+ r1024_Mix 10, 3,r10,rax,_Rn_,6
+ r1024_Mix 4, 9,rcx,r9 ,_Rn_,3
+ r1024_Mix 12, 7,r12,rdx,_Rn_,7
+ .if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,%(_Rn_+1)
+ .endif
+
+ .if (SKEIN_ASM_UNROLL && 1024) == 0 #here with rdi == rIdx, X0 on stack
+ #"rotate" the key schedule on the stack
+i8 = o1K_r8
+i0 = o1K_rdi
+ movq %r8 , X_stk+8*i8(%rsp) #free up a register (save it on the stack)
+ movq ksKey+8* 0(%rsp,%rdi,8),%r8 #get key word
+ movq %r8 , ksKey+8*17(%rsp,%rdi,8) #rotate key (must do key first or tweak clobbers it!)
+ movq ksTwk+8* 0(%rsp,%rdi,8),%r8 #get tweak word
+ movq %r8 , ksTwk+8* 3(%rsp,%rdi,8) #rotate tweak (onto the stack)
+ movq X_stk+8*i8(%rsp) ,%r8 #get the reg back
+ incq %rdi #bump the index
+ movq %rdi, rIdx_offs (%rsp) #save rdi again
+ movq ksKey+8*i0(%rsp,%rdi,8),%rdi #get the key schedule word for X0 back
+ addq X_stk+8*i0(%rsp) ,%rdi #perform the X0 key injection
+ .endif
+ #show the result of the key injection
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
+.endm #r1024_FourRounds
+#
+################
+# code
+#
+C_label Skein1024_Process_Block
+#
+ Setup_Stack 1024,ROUNDS_1024/8,WCNT
+ movq TWEAK+ 8(%rdi),%r9
+ jmp Skein1024_block_loop
+ # main hash loop for Skein1024
+ .p2align 4
+Skein1024_block_loop:
+ # general register usage:
+ # RSP = stack pointer
+ # RAX..RDX,RSI,RDI = X1, X3..X7 (state words)
+ # R8 ..R15 = X8..X15 (state words)
+ # RBP = temp (used for X0 and X2)
+ #
+ .if (SKEIN_ASM_UNROLL & 1024) == 0
+ xorq %rax,%rax #init loop index on the stack
+ movq %rax,rIdx_offs(%rsp)
+ .endif
+ movq TWEAK+ 0(%rdi),%r8
+ addq bitAdd+ F_O(%rbp),%r8 #computed updated tweak value T0
+ movq %r9 ,%r10
+ xorq %r8 ,%r10 #%rax/%rbx/%rcx = tweak schedule
+ movq %r8 ,TWEAK+ 0(%rdi) #save updated tweak value ctx->h.T[0]
+ movq %r8 ,ksTwk+ 0+F_O(%rbp)
+ movq %r9 ,ksTwk+ 8+F_O(%rbp) #keep values in %r8 ,%r9 for initial tweak injection below
+ movq %r10,ksTwk+16+F_O(%rbp)
+ .if _SKEIN_DEBUG
+ movq %r9 ,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
+ .endif
+ movq blkPtr +F_O(%rbp),%rsi # rsi --> input block
+ movq $KW_PARITY ,%rax #overall key schedule parity
+
+ # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
+ .irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps
+ movq X_VARS+8*\_rN_(%rdi),%r14 #get state word
+ movq 8*\_rN_(%rsi),%r15 #get msg word
+ xorq %r14,%rax #update key schedule overall parity
+ movq %r14,ksKey +8*\_rN_+F_O(%rbp) #save key schedule word on stack
+ movq %r15,Wcopy +8*\_rN_+F_O(%rbp) #save local msg Wcopy
+ addq %r15,%r14 #do the initial key injection
+ movq %r14,X_stk +8*\_rN_ (%rsp) #save initial state var on stack
+ .endr
+ # now process the rest, using the "real" registers
+ # (MUST do it in reverse order to inject tweaks r8/r9 first)
+ .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
+_oo_ = o1K_\_rr_ #offset assocated with the register
+ movq X_VARS+8*_oo_(%rdi),%\_rr_ #get key schedule word from context
+ movq 8*_oo_(%rsi),%rcx #get next input msg word
+ movq %\_rr_, ksKey +8*_oo_(%rsp) #save key schedule on stack
+ xorq %\_rr_, %rax #accumulate key schedule parity
+ movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward
+ addq %rcx,%\_rr_ #do the initial key injection
+ .if _oo_ == 13 #do the initial tweak injection
+ addReg \_rr_,r8 # (only in words 13/14)
+ .elseif _oo_ == 14
+ addReg \_rr_,r9
+ .endif
+ .endr
+ movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity
+.if _SKEIN_DEBUG
+ Skein_Debug_Block 1024 #initial debug dump
+.endif
+ addq $8*WCNT,%rsi #bump the msg ptr
+ movq %rsi,blkPtr+F_O(%rbp) #save bumped msg ptr
+ # re-load words 0..4 from stack, enter the main loop
+ .irp _rr_,rdi,rsi,rbp,rax,rcx #(no need to re-load x6, already on stack)
+ movq X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
+ .endr
+.if _SKEIN_DEBUG
+ Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection
+.endif
+ #
+ #################
+ # now the key schedule is computed. Start the rounds
+ #
+.if SKEIN_ASM_UNROLL & 1024
+_UNROLL_CNT = ROUNDS_1024/8
+.else
+_UNROLL_CNT = SKEIN_UNROLL_1024
+ .if ((ROUNDS_1024/8) % _UNROLL_CNT)
+ .error "Invalid SKEIN_UNROLL_1024"
+ .endif
+Skein1024_round_loop:
+.endif
+#
+_Rbase_ = 0
+.rept _UNROLL_CNT*2 #implement the rounds, 4 at a time
+ r1024_FourRounds %(4*_Rbase_+00)
+_Rbase_ = _Rbase_+1
+.endr #rept _UNROLL_CNT
+#
+.if (SKEIN_ASM_UNROLL & 1024) == 0
+ cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see .if we are done
+ jb Skein1024_round_loop
+.endif
+ # end of rounds
+ #################
+ #
+ # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}
+ movq %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack
+ movq ctxPtr(%rsp),%rdx
+
+ .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x7
+_oo_ = o1K_\_rr_
+ xorq Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
+ movq %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
+ .if (_oo_ == 9)
+ movq $FIRST_MASK64 ,%r9
+ .endif
+ .if (_oo_ == 14)
+ andq TWEAK+ 8(%rdx),%r9
+ .endif
+ .endr
+ #
+ movq X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
+ movq X_stk +8*7(%rsp),%rbx
+ xorq Wcopy +8*6(%rsp),%rax
+ xorq Wcopy +8*7(%rsp),%rbx
+ movq %rax,X_VARS+8*6(%rdx)
+ decq blkCnt(%rsp) #set zero flag iff done
+ movq %rbx,X_VARS+8*7(%rdx)
+
+ Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
+ # go back for more blocks, if needed
+ movq ctxPtr(%rsp),%rdi #don't muck with the flags here!
+ lea FRAME_OFFS(%rsp),%rbp
+ jnz Skein1024_block_loop
+ movq %r9 ,TWEAK+ 8(%rdx)
+ Reset_Stack
+ ret
+#
+Skein1024_Process_Block_End:
+#
+.if _SKEIN_DEBUG
+Skein_Debug_Round_1024:
+ # call here with rdx = "round number",
+_SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr
+ #
+ #save rest of X[] state on stack so debug routines can access it
+ .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
+ movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
+ .endr
+ # Figure out what to do with x0 (rdi). When rdx == 0 mod 4, it's already on stack
+ cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save
+ jae save_x0
+ testq $3,%rdx #otherwise only if rdx != 0 mod 4
+ jz save_x0_not
+save_x0:
+ movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
+save_x0_not:
+ #figure out the x4/x6 swapping state and save the correct one!
+ cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
+ jae save_x4
+ testq $1,%rdx #and even ones have r4 as well
+ jz save_x4
+ movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp)
+ jmp debug_1024_go
+save_x4:
+ movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
+debug_1024_go:
+ #now all is saved in Xstk[] except for rdx
+ push %rsi #save two regs for BLK_BITS-specific parms
+ push %rdi
+_SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32)
+
+ movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call)
+ movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]
+
+ movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr
+ movq $1024,%rdi #rdi = block size
+ jmp Skein_Debug_Round_Common
+.endif
+#
+.if _SKEIN_CODE_SIZE
+C_label Skein1024_Process_Block_CodeSize
+ movq $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax
+ ret
+#
+C_label Skein1024_Unroll_Cnt
+ .if _UNROLL_CNT <> (ROUNDS_1024/8)
+ movq $_UNROLL_CNT,%rax
+ .else
+ xorq %rax,%rax
+ .endif
+ ret
+.endif
+#
+.endif # _USE_ASM_ and 1024
+#
+.if _SKEIN_DEBUG
+#----------------------------------------------------------------
+#local debug routine to set up for calls to:
+# void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
+# [ rdi rsi rdx rcx]
+#
+# here with %rdx = round number
+# %rsi = ctx_hdr_ptr
+# %rdi = block size (256/512/1024)
+# on stack: saved rdi, saved rsi, retAddr, saved rdx
+#
+Skein_Debug_Round_Common:
+_SP_OFFS_ = 32 #account for four words on stack already
+ .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs
+ pushq %\_rr_
+_SP_OFFS_ = _SP_OFFS_+8
+ .endr
+ .if (_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here
+ .error "Debug_Round_Common: stack alignment"
+ .endif
+ # compute %rcx = ptr to the X[] array on the stack (final parameter to call)
+ leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
+ cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"?
+ jnz _got_rcxA
+ leaq X_VARS(%rsi),%rcx
+_got_rcxA:
+ .if _USE_ASM_ & 1024
+ # special handling for 1024-bit case
+ # (for rounds right before with key injection:
+ # use xDebug_1024[] instead of X_stk[])
+ cmpq $SKEIN_RND_SPECIAL,%rdx
+ jae _got_rcxB #must be a normal round
+ orq %rdx,%rdx
+ jz _got_rcxB #just before key injection
+ test $3,%rdx
+ jne _got_rcxB
+ cmp $1024,%rdi #only 1024-bit(s) for now
+ jne _got_rcxB
+ leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx
+_got_rcxB:
+ .endif
+ call Skein_Show_Round #call external debug handler
+
+ .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs
+ popq %\_rr_
+_SP_OFFS_ = _SP_OFFS_-8
+ .endr
+ .if _SP_OFFS_ - 32
+ .error "Debug_Round_Common: push/pop misalignment!"
+ .endif
+ popq %rdi
+ popq %rsi
+ ret
+.endif
+#----------------------------------------------------------------
+ .section .note.GNU-stack,"",@progbits
+
+ .end
Index: head/sys/crypto/skein/amd64/skein_block_asm.s
===================================================================
--- head/sys/crypto/skein/amd64/skein_block_asm.s
+++ head/sys/crypto/skein/amd64/skein_block_asm.s
@@ -1,1333 +0,0 @@
-#
-#----------------------------------------------------------------
-# 64-bit x86 assembler code (gnu as) for Skein block functions
-#
-# Author: Doug Whiting, Hifn/Exar
-#
-# This code is released to the public domain.
-#----------------------------------------------------------------
-# $FreeBSD$
-#
- .text
- .altmacro
-#ifndef __clang__
- .psize 0,128 #list file has no page boundaries
-#endif
-#
-_MASK_ALL_ = (256+512+1024) #all three algorithm bits
-_MAX_FRAME_ = 240
-#
-#################
-#ifndef SKEIN_USE_ASM
-_USE_ASM_ = _MASK_ALL_
-#else
-_USE_ASM_ = SKEIN_USE_ASM
-#endif
-#################
-#configure loop unrolling
-#ifndef SKEIN_LOOP
-_SKEIN_LOOP = 2 #default is fully unrolled for 256/512, twice for 1024
-#else
-_SKEIN_LOOP = SKEIN_LOOP
- .irp _NN_,%_SKEIN_LOOP #only display loop unrolling if default changed on command line
-#.print "+++ SKEIN_LOOP = \_NN_"
- .endr
-#endif
-# the unroll counts (0 --> fully unrolled)
-SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10
-SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10
-SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10
-#
-SKEIN_ASM_UNROLL = 0
- .irp _NN_,256,512,1024
- .if (SKEIN_UNROLL_\_NN_) == 0
-SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + \_NN_
- .endif
- .endr
-#################
-#
-.ifndef SKEIN_ROUNDS
-ROUNDS_256 = 72
-ROUNDS_512 = 72
-ROUNDS_1024 = 80
-.else
-ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)
-ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5)
-ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5)
-# only display rounds if default size is changed on command line
-.irp _NN_,256,512,1024
- .if _USE_ASM_ && \_NN_
- .irp _RR_,%(ROUNDS_\_NN_)
- .if _NN_ < 1024
-.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
- .else
-.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_"
- .endif
- .endr
- .endif
-.endr
-.endif
-#################
-#
-.ifdef SKEIN_CODE_SIZE
-_SKEIN_CODE_SIZE = (1)
-.else
-.ifdef SKEIN_PERF #use code size if SKEIN_PERF is defined
-_SKEIN_CODE_SIZE = (1)
-.else
-_SKEIN_CODE_SIZE = (0)
-.endif
-.endif
-#
-#################
-#
-.ifndef SKEIN_DEBUG
-_SKEIN_DEBUG = 0
-.else
-_SKEIN_DEBUG = 1
-.endif
-#################
-#
-# define offsets of fields in hash context structure
-#
-HASH_BITS = 0 #bits of hash output
-BCNT = 8 + HASH_BITS #number of bytes in BUFFER[]
-TWEAK = 8 + BCNT #tweak values[0..1]
-X_VARS = 16 + TWEAK #chaining vars
-#
-#(Note: buffer[] in context structure is NOT needed here :-)
-#
-KW_PARITY = 0x1BD11BDAA9FC1A22 #overall parity of key schedule words
-FIRST_MASK = ~ (1 << 6)
-FIRST_MASK64= ~ (1 << 62)
-#
-# rotation constants for Skein
-#
-RC_256_0_0 = 14
-RC_256_0_1 = 16
-
-RC_256_1_0 = 52
-RC_256_1_1 = 57
-
-RC_256_2_0 = 23
-RC_256_2_1 = 40
-
-RC_256_3_0 = 5
-RC_256_3_1 = 37
-
-RC_256_4_0 = 25
-RC_256_4_1 = 33
-
-RC_256_5_0 = 46
-RC_256_5_1 = 12
-
-RC_256_6_0 = 58
-RC_256_6_1 = 22
-
-RC_256_7_0 = 32
-RC_256_7_1 = 32
-
-RC_512_0_0 = 46
-RC_512_0_1 = 36
-RC_512_0_2 = 19
-RC_512_0_3 = 37
-
-RC_512_1_0 = 33
-RC_512_1_1 = 27
-RC_512_1_2 = 14
-RC_512_1_3 = 42
-
-RC_512_2_0 = 17
-RC_512_2_1 = 49
-RC_512_2_2 = 36
-RC_512_2_3 = 39
-
-RC_512_3_0 = 44
-RC_512_3_1 = 9
-RC_512_3_2 = 54
-RC_512_3_3 = 56
-
-RC_512_4_0 = 39
-RC_512_4_1 = 30
-RC_512_4_2 = 34
-RC_512_4_3 = 24
-
-RC_512_5_0 = 13
-RC_512_5_1 = 50
-RC_512_5_2 = 10
-RC_512_5_3 = 17
-
-RC_512_6_0 = 25
-RC_512_6_1 = 29
-RC_512_6_2 = 39
-RC_512_6_3 = 43
-
-RC_512_7_0 = 8
-RC_512_7_1 = 35
-RC_512_7_2 = 56
-RC_512_7_3 = 22
-
-RC_1024_0_0 = 24
-RC_1024_0_1 = 13
-RC_1024_0_2 = 8
-RC_1024_0_3 = 47
-RC_1024_0_4 = 8
-RC_1024_0_5 = 17
-RC_1024_0_6 = 22
-RC_1024_0_7 = 37
-
-RC_1024_1_0 = 38
-RC_1024_1_1 = 19
-RC_1024_1_2 = 10
-RC_1024_1_3 = 55
-RC_1024_1_4 = 49
-RC_1024_1_5 = 18
-RC_1024_1_6 = 23
-RC_1024_1_7 = 52
-
-RC_1024_2_0 = 33
-RC_1024_2_1 = 4
-RC_1024_2_2 = 51
-RC_1024_2_3 = 13
-RC_1024_2_4 = 34
-RC_1024_2_5 = 41
-RC_1024_2_6 = 59
-RC_1024_2_7 = 17
-
-RC_1024_3_0 = 5
-RC_1024_3_1 = 20
-RC_1024_3_2 = 48
-RC_1024_3_3 = 41
-RC_1024_3_4 = 47
-RC_1024_3_5 = 28
-RC_1024_3_6 = 16
-RC_1024_3_7 = 25
-
-RC_1024_4_0 = 41
-RC_1024_4_1 = 9
-RC_1024_4_2 = 37
-RC_1024_4_3 = 31
-RC_1024_4_4 = 12
-RC_1024_4_5 = 47
-RC_1024_4_6 = 44
-RC_1024_4_7 = 30
-
-RC_1024_5_0 = 16
-RC_1024_5_1 = 34
-RC_1024_5_2 = 56
-RC_1024_5_3 = 51
-RC_1024_5_4 = 4
-RC_1024_5_5 = 53
-RC_1024_5_6 = 42
-RC_1024_5_7 = 41
-
-RC_1024_6_0 = 31
-RC_1024_6_1 = 44
-RC_1024_6_2 = 47
-RC_1024_6_3 = 46
-RC_1024_6_4 = 19
-RC_1024_6_5 = 42
-RC_1024_6_6 = 44
-RC_1024_6_7 = 25
-
-RC_1024_7_0 = 9
-RC_1024_7_1 = 48
-RC_1024_7_2 = 35
-RC_1024_7_3 = 52
-RC_1024_7_4 = 23
-RC_1024_7_5 = 31
-RC_1024_7_6 = 37
-RC_1024_7_7 = 20
-#
-# Input: reg
-# Output: <reg> <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024
-#
-.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM
- .if RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM #is there anything to do?
- rolq $RC_\BLK_SIZE\()_\ROUND_NUM\()_\MIX_NUM,%\reg
- .endif
-.endm
-#
-#----------------------------------------------------------------
-#
-# MACROS: define local vars and configure stack
-#
-#----------------------------------------------------------------
-# declare allocated space on the stack
-.macro StackVar localName,localSize
-\localName = _STK_OFFS_
-_STK_OFFS_ = _STK_OFFS_+(\localSize)
-.endm #StackVar
-#
-#----------------------------------------------------------------
-#
-# MACRO: Configure stack frame, allocate local vars
-#
-.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt
- WCNT = (\BLK_BITS)/64
-#
-_PushCnt_ = 0 #save nonvolatile regs on stack
- .irp _reg_,rbp,rbx,r12,r13,r14,r15
- pushq %\_reg_
-_PushCnt_ = _PushCnt_ + 1 #track count to keep alignment
- .endr
-#
-_STK_OFFS_ = 0 #starting offset from rsp
- #---- local variables #<-- rsp
- StackVar X_stk ,8*(WCNT) #local context vars
- StackVar ksTwk ,8*3 #key schedule: tweak words
- StackVar ksKey ,8*(WCNT)+8 #key schedule: key words
- .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0
- StackVar ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen
- .endif
- StackVar Wcopy ,8*(WCNT) #copy of input block
- .if _SKEIN_DEBUG
- .if \debugCnt + 0 #temp location for debug X[] info
- StackVar xDebug_\BLK_BITS ,8*(\debugCnt)
- .endif
- .endif
- .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0
- StackVar align16,8 #keep 16-byte aligned (adjust for retAddr?)
-tmpStk_\BLK_BITS = align16 #use this
- .endif
- #---- saved caller parameters (from regs rdi, rsi, rdx, rcx)
- StackVar ctxPtr ,8 #context ptr
- StackVar blkPtr ,8 #pointer to block data
- StackVar blkCnt ,8 #number of full blocks to process
- StackVar bitAdd ,8 #bit count to add to tweak
-LOCAL_SIZE = _STK_OFFS_ #size of "local" vars
- #----
- StackVar savRegs,8*_PushCnt_ #saved registers
- StackVar retAddr,8 #return address
- #---- caller's stack frame (aligned mod 16)
-#
-# set up the stack frame pointer (rbp)
-#
-FRAME_OFFS = ksTwk + 128 #allow short (negative) offset to ksTwk, kwKey
- .if FRAME_OFFS > _STK_OFFS_ #keep rbp in the "locals" range
-FRAME_OFFS = _STK_OFFS_
- .endif
-F_O = -FRAME_OFFS
-#
- #put some useful defines in the .lst file (for grep)
-__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE
-__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_
-__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS
-#
-# Notes on stack frame setup:
-# * the most frequently used variable is X_stk[], based at [rsp+0]
-# * the next most used is the key schedule arrays, ksKey and ksTwk
-# so rbp is "centered" there, allowing short offsets to the key
-# schedule even in 1024-bit Skein case
-# * the Wcopy variables are infrequently accessed, but they have long
-# offsets from both rsp and rbp only in the 1024-bit case.
-# * all other local vars and calling parameters can be accessed
-# with short offsets, except in the 1024-bit case
-#
- subq $LOCAL_SIZE,%rsp #make room for the locals
- leaq FRAME_OFFS(%rsp),%rbp #maximize use of short offsets
- movq %rdi, ctxPtr+F_O(%rbp) #save caller's parameters on the stack
- movq %rsi, blkPtr+F_O(%rbp)
- movq %rdx, blkCnt+F_O(%rbp)
- movq %rcx, bitAdd+F_O(%rbp)
-#
-.endm #Setup_Stack
-#
-#----------------------------------------------------------------
-#
-.macro Reset_Stack
- addq $LOCAL_SIZE,%rsp #get rid of locals (wipe?)
- .irp _reg_,r15,r14,r13,r12,rbx,rbp
- popq %\_reg_ #restore caller's regs
-_PushCnt_ = _PushCnt_ - 1
- .endr
- .if _PushCnt_
- .error "Mismatched push/pops?"
- .endif
-.endm # Reset_Stack
-#
-#----------------------------------------------------------------
-# macros to help debug internals
-#
-.if _SKEIN_DEBUG
- .extern Skein_Show_Block #calls to C routines
- .extern Skein_Show_Round
-#
-SKEIN_RND_SPECIAL = 1000
-SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0
-SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1
-SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2
-#
-.macro Skein_Debug_Block BLK_BITS
-#
-#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,
-# const u08b_t *blkPtr, const u64b_t *wPtr,
-# const u64b_t *ksPtr,const u64b_t *tsPtr)
-#
-_NN_ = 0
- .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11
- pushq %\_reg_ #save all volatile regs on tack before the call
-_NN_ = _NN_ + 1
- .endr
- # get and push call parameters
- movq $\BLK_BITS ,%rdi #bits
- movq ctxPtr+F_O(%rbp),%rsi #h (pointer)
- leaq X_VARS (%rsi),%rdx #X (pointer)
- movq blkPtr+F_O(%rbp),%rcx #blkPtr
- leaq Wcopy +F_O(%rbp),%r8 #wPtr
- leaq ksKey +F_O(%rbp),%r9 #key pointer
- leaq ksTwk +F_O(%rbp),%rax #tweak pointer
- pushq %rax # (pass on the stack)
- call Skein_Show_Block #call external debug handler
- addq $8*1,%rsp #discard parameters on stack
- .if (_NN_ % 2 ) == 0 #check stack alignment
- .error "Stack misalignment problem in Skein_Debug_Block_\_BLK_BITS"
- .endif
- .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax
- popq %\_reg_ #restore regs
-_NN_ = _NN_ - 1
- .endr
- .if _NN_
- .error "Push/pop mismatch problem in Skein_Debug_Block_\_BLK_BITS"
- .endif
-.endm # Skein_Debug_Block
-#
-# the macro to "call" to debug a round
-#
-.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
- # call the appropriate (local) debug "function"
- pushq %rdx #save rdx, so we can use it for round "number"
- .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL)
- movq $\R,%rdx
- .else #compute round number using edi
-_rOffs_ = \RDI_OFFS + 0
- .if \BLK_BITS == 1024
- movq rIdx_offs+8(%rsp),%rdx #get rIdx off the stack (adjust for pushq rdx above)
- leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx
- .else
- leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx
- .endif
- .endif
- call Skein_Debug_Round_\BLK_BITS
- popq %rdx #restore origianl rdx value
-#
- afterOp
-.endm # Skein_Debug_Round
-.else #------- _SKEIN_DEBUG (dummy macros if debug not enabled)
-.macro Skein_Debug_Block BLK_BITS
-.endm
-#
-.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp
-.endm
-#
-.endif # _SKEIN_DEBUG
-#
-#----------------------------------------------------------------
-#
-.macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs
- .if \immOffs + 0
- leaq \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
- .elseif ((\useAddOp + 0) == 0)
- .ifndef ASM_NO_LEA #lea seems to be faster on Core 2 Duo CPUs!
- leaq (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg
- .else
- addq %\srcReg_A\srcReg_B,%\dstReg
- .endif
- .else
- addq %\srcReg_A\srcReg_B,%\dstReg
- .endif
-.endm
-
-# keep Intel-style ordering here, to match addReg
-.macro xorReg dstReg,srcReg_A,srcReg_B
- xorq %\srcReg_A\srcReg_B,%\dstReg
-.endm
-#
-#----------------------------------------------------------------
-#
-.macro C_label lName
- \lName: #use both "genders" to work across linkage conventions
-_\lName:
- .global \lName
- .global _\lName
-.endm
-#
-#=================================== Skein_256 =============================================
-#
-.if _USE_ASM_ & 256
-#
-# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
-#
-#################
-#
-# code
-#
-C_label Skein_256_Process_Block
- Setup_Stack 256,((ROUNDS_256/8)+1)
- movq TWEAK+8(%rdi),%r14
- jmp Skein_256_block_loop
- .p2align 4
- # main hash loop for Skein_256
-Skein_256_block_loop:
- #
- # general register usage:
- # RAX..RDX = X0..X3
- # R08..R12 = ks[0..4]
- # R13..R15 = ts[0..2]
- # RSP, RBP = stack/frame pointers
- # RDI = round counter or context pointer
- # RSI = temp
- #
- movq TWEAK+0(%rdi) ,%r13
- addq bitAdd+F_O(%rbp) ,%r13 #computed updated tweak value T0
- movq %r14 ,%r15
- xorq %r13 ,%r15 #now %r13.%r15 is set as the tweak
-
- movq $KW_PARITY ,%r12
- movq X_VARS+ 0(%rdi),%r8
- movq X_VARS+ 8(%rdi),%r9
- movq X_VARS+16(%rdi),%r10
- movq X_VARS+24(%rdi),%r11
- movq %r13,TWEAK+0(%rdi) #save updated tweak value ctx->h.T[0]
- xorq %r8 ,%r12 #start accumulating overall parity
-
- movq blkPtr +F_O(%rbp) ,%rsi #esi --> input block
- xorq %r9 ,%r12
- movq 0(%rsi) ,%rax #get X[0..3]
- xorq %r10 ,%r12
- movq 8(%rsi) ,%rbx
- xorq %r11 ,%r12
- movq 16(%rsi) ,%rcx
- movq 24(%rsi) ,%rdx
-
- movq %rax,Wcopy+ 0+F_O(%rbp) #save copy of input block
- movq %rbx,Wcopy+ 8+F_O(%rbp)
- movq %rcx,Wcopy+16+F_O(%rbp)
- movq %rdx,Wcopy+24+F_O(%rbp)
-
- addq %r8 ,%rax #initial key injection
- addq %r9 ,%rbx
- addq %r10,%rcx
- addq %r11,%rdx
- addq %r13,%rbx
- addq %r14,%rcx
-
-.if _SKEIN_DEBUG
- movq %r14,TWEAK+ 8(%rdi) #save updated tweak T[1] (start bit cleared?)
- movq %r8 ,ksKey+ 0+F_O(%rbp) #save key schedule on stack for Skein_Debug_Block
- movq %r9 ,ksKey+ 8+F_O(%rbp)
- movq %r10,ksKey+16+F_O(%rbp)
- movq %r11,ksKey+24+F_O(%rbp)
- movq %r12,ksKey+32+F_O(%rbp)
-
- movq %r13,ksTwk+ 0+F_O(%rbp)
- movq %r14,ksTwk+ 8+F_O(%rbp)
- movq %r15,ksTwk+16+F_O(%rbp)
-
- movq %rax,X_stk + 0(%rsp) #save X[] on stack for Skein_Debug_Block
- movq %rbx,X_stk + 8(%rsp)
- movq %rcx,X_stk +16(%rsp)
- movq %rdx,X_stk +24(%rsp)
-
- Skein_Debug_Block 256 #debug dump
- Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL
-.endif
-#
-.if ((SKEIN_ASM_UNROLL & 256) == 0)
- movq %r8 ,ksKey+40+F_O(%rbp) #save key schedule on stack for looping code
- movq %r9 ,ksKey+ 8+F_O(%rbp)
- movq %r10,ksKey+16+F_O(%rbp)
- movq %r11,ksKey+24+F_O(%rbp)
- movq %r12,ksKey+32+F_O(%rbp)
-
- movq %r13,ksTwk+24+F_O(%rbp)
- movq %r14,ksTwk+ 8+F_O(%rbp)
- movq %r15,ksTwk+16+F_O(%rbp)
-.endif
- addq $WCNT*8,%rsi #skip the block
- movq %rsi,blkPtr +F_O(%rbp) #update block pointer
- #
- # now the key schedule is computed. Start the rounds
- #
-.if SKEIN_ASM_UNROLL & 256
-_UNROLL_CNT = ROUNDS_256/8
-.else
-_UNROLL_CNT = SKEIN_UNROLL_256
- .if ((ROUNDS_256/8) % _UNROLL_CNT)
- .error "Invalid SKEIN_UNROLL_256"
- .endif
- xorq %rdi,%rdi #rdi = iteration count
-Skein_256_round_loop:
-.endif
-_Rbase_ = 0
-.rept _UNROLL_CNT*2
- # all X and ks vars in regs # (ops to "rotate" ks vars, via mem, if not unrolled)
- # round 4*_RBase_ + 0
- addReg rax, rbx
- RotL64 rbx, 256,%((4*_Rbase_+0) % 8),0
- addReg rcx, rdx
- .if (SKEIN_ASM_UNROLL & 256) == 0
- movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8
- .endif
- xorReg rbx, rax
- RotL64 rdx, 256,%((4*_Rbase_+0) % 8),1
- xorReg rdx, rcx
- .if SKEIN_ASM_UNROLL & 256
- .irp _r0_,%( 8+(_Rbase_+3) % 5)
- .irp _r1_,%(13+(_Rbase_+2) % 3)
- leaq (%r\_r0_,%r\_r1_),%rdi #precompute key injection value for %rcx
- .endr
- .endr
- .endif
- .if (SKEIN_ASM_UNROLL & 256) == 0
- movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13
- .endif
- Skein_Debug_Round 256,%(4*_Rbase_+1)
-
- # round 4*_Rbase_ + 1
- addReg rax, rdx
- RotL64 rdx, 256,%((4*_Rbase_+1) % 8),0
- xorReg rdx, rax
- .if (SKEIN_ASM_UNROLL & 256) == 0
- movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9
- .endif
- addReg rcx, rbx
- RotL64 rbx, 256,%((4*_Rbase_+1) % 8),1
- xorReg rbx, rcx
- .if (SKEIN_ASM_UNROLL & 256) == 0
- movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11
- .endif
- Skein_Debug_Round 256,%(4*_Rbase_+2)
- .if SKEIN_ASM_UNROLL & 256
- .irp _r0_,%( 8+(_Rbase_+2) % 5)
- .irp _r1_,%(13+(_Rbase_+1) % 3)
- leaq (%r\_r0_,%r\_r1_),%rsi #precompute key injection value for %rbx
- .endr
- .endr
- .endif
- # round 4*_Rbase_ + 2
- addReg rax, rbx
- RotL64 rbx, 256,%((4*_Rbase_+2) % 8),0
- addReg rcx, rdx
- .if (SKEIN_ASM_UNROLL & 256) == 0
- movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10
- .endif
- xorReg rbx, rax
- RotL64 rdx, 256,%((4*_Rbase_+2) % 8),1
- xorReg rdx, rcx
- .if (SKEIN_ASM_UNROLL & 256) == 0
- movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8) #"rotate" the key
- leaq 1(%r11,%rdi),%r11 #precompute key + tweak
- .endif
- Skein_Debug_Round 256,%(4*_Rbase_+3)
- # round 4*_Rbase_ + 3
- addReg rax, rdx
- RotL64 rdx, 256,%((4*_Rbase_+3) % 8),0
- addReg rcx, rbx
- .if (SKEIN_ASM_UNROLL & 256) == 0
- addq ksTwk+8*2+F_O(%rbp,%rdi,8),%r10 #precompute key + tweak
- movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8) #"rotate" the tweak
- .endif
- xorReg rdx, rax
- RotL64 rbx, 256,%((4*_Rbase_+3) % 8),1
- xorReg rbx, rcx
- Skein_Debug_Round 256,%(4*_Rbase_+4)
- .if (SKEIN_ASM_UNROLL & 256) == 0
- addReg r9 ,r13 #precompute key+tweak
- .endif
- #inject key schedule words
-_Rbase_ = _Rbase_+1
- .if SKEIN_ASM_UNROLL & 256
- addReg rax,r,%(8+((_Rbase_+0) % 5))
- addReg rbx,rsi
- addReg rcx,rdi
- addReg rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_
- .else
- incq %rdi
- addReg rax,r8
- addReg rcx,r10
- addReg rbx,r9
- addReg rdx,r11
- .endif
- Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT
-.endr #rept _UNROLL_CNT
-#
-.if (SKEIN_ASM_UNROLL & 256) == 0
- cmpq $2*(ROUNDS_256/8),%rdi
- jb Skein_256_round_loop
-.endif # (SKEIN_ASM_UNROLL & 256) == 0
- movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context
-
- #----------------------------
- # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3}
- movq $FIRST_MASK64 ,%r14
- xorq Wcopy + 0+F_O (%rbp),%rax
- xorq Wcopy + 8+F_O (%rbp),%rbx
- xorq Wcopy +16+F_O (%rbp),%rcx
- xorq Wcopy +24+F_O (%rbp),%rdx
- andq TWEAK + 8 (%rdi),%r14
- movq %rax,X_VARS+ 0(%rdi) #store final result
- movq %rbx,X_VARS+ 8(%rdi)
- movq %rcx,X_VARS+16(%rdi)
- movq %rdx,X_VARS+24(%rdi)
-
- Skein_Debug_Round 256,SKEIN_RND_FEED_FWD
-
- # go back for more blocks, if needed
- decq blkCnt+F_O(%rbp)
- jnz Skein_256_block_loop
- movq %r14,TWEAK + 8(%rdi)
- Reset_Stack
- ret
-Skein_256_Process_Block_End:
-
- .if _SKEIN_DEBUG
-Skein_Debug_Round_256: #here with rdx == round "number" from macro
- pushq %rsi #save two regs for BLK_BITS-specific parms
- pushq %rdi
- movq 24(%rsp),%rdi #get back original rdx (pushed on stack in macro call) to rdi
- movq %rax,X_stk+ 0+F_O(%rbp) #save X[] state on stack so debug routines can access it
- movq %rbx,X_stk+ 8+F_O(%rbp) #(use FP_ since rsp has changed!)
- movq %rcx,X_stk+16+F_O(%rbp)
- movq %rdi,X_stk+24+F_O(%rbp)
-
- movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr
- movq $256,%rdi #now <rdi,rsi,rdx> are set for the call
- jmp Skein_Debug_Round_Common
- .endif
-#
-.if _SKEIN_CODE_SIZE
-C_label Skein_256_Process_Block_CodeSize
- movq $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax
- ret
-#
-C_label Skein_256_Unroll_Cnt
- .if _UNROLL_CNT <> ROUNDS_256/8
- movq $_UNROLL_CNT,%rax
- .else
- xorq %rax,%rax
- .endif
- ret
-.endif
-#
-.endif #_USE_ASM_ & 256
-#
-#=================================== Skein_512 =============================================
-#
-.if _USE_ASM_ & 512
-#
-# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)
-#
-# X[i] == %r[8+i] #register assignments for X[] values during rounds (i=0..7)
-#
-#################
-# MACRO: one round for 512-bit blocks
-#
-.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4
-#
- addReg r\rn0, r\rn1
- RotL64 r\rn1, 512,%((\_Rn_) % 8),0
- xorReg r\rn1, r\rn0
- \op1
- addReg r\rn2, r\rn3
- RotL64 r\rn3, 512,%((\_Rn_) % 8),1
- xorReg r\rn3, r\rn2
- \op2
- addReg r\rn4, r\rn5
- RotL64 r\rn5, 512,%((\_Rn_) % 8),2
- xorReg r\rn5, r\rn4
- \op3
- addReg r\rn6, r\rn7
- RotL64 r\rn7, 512,%((\_Rn_) % 8),3
- xorReg r\rn7, r\rn6
- \op4
- Skein_Debug_Round 512,%(\_Rn_+1),-4
-#
-.endm #R_512_OneRound
-#
-#################
-# MACRO: eight rounds for 512-bit blocks
-#
-.macro R_512_FourRounds _RR_ #RR = base round number (0 % 8)
- .if (SKEIN_ASM_UNROLL && 512)
- # here for fully unrolled case.
- _II_ = ((\_RR_)/4) + 1 #key injection counter
- R_512_OneRound 8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*(((_II_)+3) % 9)+F_O(%rbp),%rax>,,<movq ksKey+8*(((_II_)+4) % 9)+F_O(%rbp),%rbx>
- R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*(((_II_)+5) % 9)+F_O(%rbp),%rcx>,,<movq ksKey+8*(((_II_)+6) % 9)+F_O(%rbp),%rdx>
- R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*(((_II_)+7) % 9)+F_O(%rbp),%rsi>,,<addq ksTwk+8*(((_II_)+0) % 3)+F_O(%rbp),%rcx>
- R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<addq ksTwk+8*(((_II_)+1) % 3)+F_O(%rbp),%rdx>,
- # inject the key schedule
- addq ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8
- addReg r11, rax
- addq ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9
- addReg r12, rbx
- addq ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10
- addReg r13, rcx
- addReg r14, rdx
- addReg r15, rsi,,,(_II_)
- .else
- # here for looping case #"rotate" key/tweak schedule (move up on stack)
- incq %rdi #bump key injection counter
- R_512_OneRound 8, 9,10,11,12,13,14,15,%((\_RR_)+0),<movq ksKey+8*6+F_O(%rbp,%rdi,8),%rdx>,<movq ksTwk-8*1+F_O(%rbp,%rdi,8),%rax>,<movq ksKey-8*1+F_O(%rbp,%rdi,8),%rsi>
- R_512_OneRound 10, 9,12,15,14,13, 8,11,%((\_RR_)+1),<movq ksKey+8*5+F_O(%rbp,%rdi,8),%rcx>,<movq %rax,ksTwk+8*2+F_O(%rbp,%rdi,8) >,<movq %rsi,ksKey+8*8+F_O(%rbp,%rdi,8)>
- R_512_OneRound 12, 9,14,11, 8,13,10,15,%((\_RR_)+2),<movq ksKey+8*4+F_O(%rbp,%rdi,8),%rbx>,<addq ksTwk+8*1+F_O(%rbp,%rdi,8),%rdx>,<movq ksKey+8*7+F_O(%rbp,%rdi,8),%rsi>
- R_512_OneRound 14, 9, 8,15,10,13,12,11,%((\_RR_)+3),<movq ksKey+8*3+F_O(%rbp,%rdi,8),%rax>,<addq ksTwk+8*0+F_O(%rbp,%rdi,8),%rcx>
- # inject the key schedule
- addq ksKey+8*0+F_O(%rbp,%rdi,8),%r8
- addReg r11, rax
- addReg r12, rbx
- addq ksKey+8*1+F_O(%rbp,%rdi,8),%r9
- addReg r13, rcx
- addReg r14, rdx
- addq ksKey+8*2+F_O(%rbp,%rdi,8),%r10
- addReg r15, rsi
- addReg r15, rdi #inject the round number
- .endif
-
- #show the result of the key injection
- Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT
-.endm #R_512_EightRounds
-#
-#################
-# instantiated code
-#
-C_label Skein_512_Process_Block
- Setup_Stack 512,ROUNDS_512/8
- movq TWEAK+ 8(%rdi),%rbx
- jmp Skein_512_block_loop
- .p2align 4
- # main hash loop for Skein_512
-Skein_512_block_loop:
- # general register usage:
- # RAX..RDX = temps for key schedule pre-loads
- # R8 ..R15 = X0..X7
- # RSP, RBP = stack/frame pointers
- # RDI = round counter or context pointer
- # RSI = temp
- #
- movq TWEAK + 0(%rdi),%rax
- addq bitAdd+F_O(%rbp),%rax #computed updated tweak value T0
- movq %rbx,%rcx
- xorq %rax,%rcx #%rax/%rbx/%rcx = tweak schedule
- movq %rax,TWEAK+ 0 (%rdi) #save updated tweak value ctx->h.T[0]
- movq %rax,ksTwk+ 0+F_O(%rbp)
- movq $KW_PARITY,%rdx
- movq blkPtr +F_O(%rbp),%rsi #%rsi --> input block
- movq %rbx,ksTwk+ 8+F_O(%rbp)
- movq %rcx,ksTwk+16+F_O(%rbp)
- .irp _Rn_,8,9,10,11,12,13,14,15
- movq X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_
- xorq %r\_Rn_,%rdx #compute overall parity
- movq %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp)
- .endr #load state into %r8 ..%r15, compute parity
- movq %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity
-
- addReg r13,rax #precompute key injection for tweak
- addReg r14, rbx
-.if _SKEIN_DEBUG
- movq %rbx,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below
-.endif
- movq 0(%rsi),%rax #load input block
- movq 8(%rsi),%rbx
- movq 16(%rsi),%rcx
- movq 24(%rsi),%rdx
- addReg r8 , rax #do initial key injection
- addReg r9 , rbx
- movq %rax,Wcopy+ 0+F_O(%rbp) #keep local copy for feedforward
- movq %rbx,Wcopy+ 8+F_O(%rbp)
- addReg r10, rcx
- addReg r11, rdx
- movq %rcx,Wcopy+16+F_O(%rbp)
- movq %rdx,Wcopy+24+F_O(%rbp)
-
- movq 32(%rsi),%rax
- movq 40(%rsi),%rbx
- movq 48(%rsi),%rcx
- movq 56(%rsi),%rdx
- addReg r12, rax
- addReg r13, rbx
- addReg r14, rcx
- addReg r15, rdx
- movq %rax,Wcopy+32+F_O(%rbp)
- movq %rbx,Wcopy+40+F_O(%rbp)
- movq %rcx,Wcopy+48+F_O(%rbp)
- movq %rdx,Wcopy+56+F_O(%rbp)
-
-.if _SKEIN_DEBUG
- .irp _Rn_,8,9,10,11,12,13,14,15 #save values on stack for debug output
- movq %r\_Rn_,X_stk+8*(\_Rn_-8)(%rsp)
- .endr
-
- Skein_Debug_Block 512 #debug dump
- Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL
-.endif
- addq $8*WCNT,%rsi #skip the block
- movq %rsi,blkPtr+F_O(%rbp) #update block pointer
- #
- #################
- # now the key schedule is computed. Start the rounds
- #
-.if SKEIN_ASM_UNROLL & 512
-_UNROLL_CNT = ROUNDS_512/8
-.else
-_UNROLL_CNT = SKEIN_UNROLL_512
- .if ((ROUNDS_512/8) % _UNROLL_CNT)
- .error "Invalid SKEIN_UNROLL_512"
- .endif
- xorq %rdi,%rdi #rdi = round counter
-Skein_512_round_loop:
-.endif
-#
-_Rbase_ = 0
-.rept _UNROLL_CNT*2
- R_512_FourRounds %(4*_Rbase_+00)
-_Rbase_ = _Rbase_+1
-.endr #rept _UNROLL_CNT
-#
-.if (SKEIN_ASM_UNROLL & 512) == 0
- cmpq $2*(ROUNDS_512/8),%rdi
- jb Skein_512_round_loop
- movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context
-.endif
- # end of rounds
- #################
- # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7}
- .irp _Rn_,8,9,10,11,12,13,14,15
- .if (\_Rn_ == 8)
- movq $FIRST_MASK64,%rbx
- .endif
- xorq Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR
- movq %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi) #and store result
- .if (\_Rn_ == 14)
- andq TWEAK+ 8(%rdi),%rbx
- .endif
- .endr
- Skein_Debug_Round 512,SKEIN_RND_FEED_FWD
-
- # go back for more blocks, if needed
- decq blkCnt+F_O(%rbp)
- jnz Skein_512_block_loop
- movq %rbx,TWEAK + 8(%rdi)
-
- Reset_Stack
- ret
-Skein_512_Process_Block_End:
-#
- .if _SKEIN_DEBUG
-# call here with rdx = "round number"
-Skein_Debug_Round_512:
- pushq %rsi #save two regs for BLK_BITS-specific parms
- pushq %rdi
- .irp _Rn_,8,9,10,11,12,13,14,15 #save X[] state on stack so debug routines can access it
- movq %r\_Rn_,X_stk+8*(\_Rn_-8)+F_O(%rbp)
- .endr
- movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr
- movq $512,%rdi #now <rdi,rsi,rdx> are set for the call
- jmp Skein_Debug_Round_Common
- .endif
-#
-.if _SKEIN_CODE_SIZE
-C_label Skein_512_Process_Block_CodeSize
- movq $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax
- ret
-#
-C_label Skein_512_Unroll_Cnt
- .if _UNROLL_CNT <> (ROUNDS_512/8)
- movq $_UNROLL_CNT,%rax
- .else
- xorq %rax,%rax
- .endif
- ret
-.endif
-#
-.endif # _USE_ASM_ & 512
-#
-#=================================== Skein1024 =============================================
-.if _USE_ASM_ & 1024
-#
-# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)#
-#
-#################
-# use details of permutation to make register assignments
-#
-o1K_rdi = 0 #offsets in X[] associated with each register
-o1K_rsi = 1
-o1K_rbp = 2
-o1K_rax = 3
-o1K_rcx = 4 #rcx is "shared" with X6, since X4/X6 alternate
-o1K_rbx = 5
-o1K_rdx = 7
-o1K_r8 = 8
-o1K_r9 = 9
-o1K_r10 = 10
-o1K_r11 = 11
-o1K_r12 = 12
-o1K_r13 = 13
-o1K_r14 = 14
-o1K_r15 = 15
-#
-rIdx_offs = tmpStk_1024
-#
-.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1
- addReg \reg0 , \reg1 #perform the MIX
- RotL64 \reg1 , 1024,%((\_RN0_) % 8),\_Rn1_
- xorReg \reg1 , \reg0
-.if ((\_RN0_) && 3) == 3 #time to do key injection?
- .if _SKEIN_DEBUG
- movq %\reg0 , xDebug_1024+8*\w0(%rsp) #save intermediate values for Debug_Round
- movq %\reg1 , xDebug_1024+8*\w1(%rsp) # (before inline key injection)
- .endif
-_II_ = ((\_RN0_)/4)+1 #injection count
- .if SKEIN_ASM_UNROLL && 1024 #here to do fully unrolled key injection
- addq ksKey+ 8*((_II_+\w0) % 17)(%rsp),%\reg0
- addq ksKey+ 8*((_II_+\w1) % 17)(%rsp),%\reg1
- .if \w1 == 13 #tweak injection
- addq ksTwk+ 8*((_II_+ 0) % 3)(%rsp),%\reg1
- .elseif \w0 == 14
- addq ksTwk+ 8*((_II_+ 1) % 3)(%rsp),%\reg0
- .elseif \w1 == 15
- addq $_II_, %\reg1 #(injection counter)
- .endif
- .else #here to do looping key injection
- .if (\w0 == 0)
- movq %rdi, X_stk+8*\w0(%rsp) #if so, store N0 so we can use reg as index
- movq rIdx_offs(%rsp),%rdi #get the injection counter index into rdi
- .else
- addq ksKey+8+8*\w0(%rsp,%rdi,8),%\reg0 #even key injection
- .endif
- .if \w1 == 13 #tweak injection
- addq ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1
- .elseif \w0 == 14
- addq ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0
- .elseif \w1 == 15
- addReg \reg1,rdi,,,1 #(injection counter)
- .endif
- addq ksKey+8+8*\w1(%rsp,%rdi,8),%\reg1 #odd key injection
- .endif
-.endif
- # insert the op provided, .if any
- \op1
-.endm
-#################
-# MACRO: four rounds for 1024-bit blocks
-#
-.macro r1024_FourRounds _RR_ #RR = base round number (0 mod 4)
- # should be here with X4 set properly, X6 stored on stack
-_Rn_ = (\_RR_) + 0
- r1024_Mix 0, 1,rdi,rsi,_Rn_,0
- r1024_Mix 2, 3,rbp,rax,_Rn_,1
- r1024_Mix 4, 5,rcx,rbx,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
- r1024_Mix 8, 9,r8 ,r9 ,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack
- r1024_Mix 10,11,r10,r11,_Rn_,5
- r1024_Mix 12,13,r12,r13,_Rn_,6
- r1024_Mix 6, 7,rcx,rdx,_Rn_,3
- r1024_Mix 14,15,r14,r15,_Rn_,7
- .if _SKEIN_DEBUG
- Skein_Debug_Round 1024,%(_Rn_+1)
- .endif
-_Rn_ = (\_RR_) + 1
- r1024_Mix 0, 9,rdi,r9 ,_Rn_,0
- r1024_Mix 2,13,rbp,r13,_Rn_,1
- r1024_Mix 6,11,rcx,r11,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
- r1024_Mix 10, 7,r10,rdx,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack
- r1024_Mix 12, 3,r12,rax,_Rn_,5
- r1024_Mix 14, 5,r14,rbx,_Rn_,6
- r1024_Mix 4,15,rcx,r15,_Rn_,3
- r1024_Mix 8, 1,r8 ,rsi,_Rn_,7
- .if _SKEIN_DEBUG
- Skein_Debug_Round 1024,%(_Rn_+1)
- .endif
-_Rn_ = (\_RR_) + 2
- r1024_Mix 0, 7,rdi,rdx,_Rn_,0
- r1024_Mix 2, 5,rbp,rbx,_Rn_,1
- r1024_Mix 4, 3,rcx,rax,_Rn_,2,<movq %rcx,X_stk+8*4(%rsp)> #save X4 on stack (x4/x6 alternate)
- r1024_Mix 12,15,r12,r15,_Rn_,4,<movq X_stk+8*6(%rsp),%rcx> #load X6 from stack
- r1024_Mix 14,13,r14,r13,_Rn_,5
- r1024_Mix 8,11,r8 ,r11,_Rn_,6
- r1024_Mix 6, 1,rcx,rsi,_Rn_,3
- r1024_Mix 10, 9,r10,r9 ,_Rn_,7
- .if _SKEIN_DEBUG
- Skein_Debug_Round 1024,%(_Rn_+1)
- .endif
-_Rn_ = (\_RR_) + 3
- r1024_Mix 0,15,rdi,r15,_Rn_,0
- r1024_Mix 2,11,rbp,r11,_Rn_,1
- r1024_Mix 6,13,rcx,r13,_Rn_,2,<movq %rcx,X_stk+8*6(%rsp)> #save X6 on stack (x4/x6 alternate)
- r1024_Mix 14, 1,r14,rsi,_Rn_,4,<movq X_stk+8*4(%rsp),%rcx> #load X4 from stack
- r1024_Mix 8, 5,r8 ,rbx,_Rn_,5
- r1024_Mix 10, 3,r10,rax,_Rn_,6
- r1024_Mix 4, 9,rcx,r9 ,_Rn_,3
- r1024_Mix 12, 7,r12,rdx,_Rn_,7
- .if _SKEIN_DEBUG
- Skein_Debug_Round 1024,%(_Rn_+1)
- .endif
-
- .if (SKEIN_ASM_UNROLL && 1024) == 0 #here with rdi == rIdx, X0 on stack
- #"rotate" the key schedule on the stack
-i8 = o1K_r8
-i0 = o1K_rdi
- movq %r8 , X_stk+8*i8(%rsp) #free up a register (save it on the stack)
- movq ksKey+8* 0(%rsp,%rdi,8),%r8 #get key word
- movq %r8 , ksKey+8*17(%rsp,%rdi,8) #rotate key (must do key first or tweak clobbers it!)
- movq ksTwk+8* 0(%rsp,%rdi,8),%r8 #get tweak word
- movq %r8 , ksTwk+8* 3(%rsp,%rdi,8) #rotate tweak (onto the stack)
- movq X_stk+8*i8(%rsp) ,%r8 #get the reg back
- incq %rdi #bump the index
- movq %rdi, rIdx_offs (%rsp) #save rdi again
- movq ksKey+8*i0(%rsp,%rdi,8),%rdi #get the key schedule word for X0 back
- addq X_stk+8*i0(%rsp) ,%rdi #perform the X0 key injection
- .endif
- #show the result of the key injection
- Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
-.endm #r1024_FourRounds
-#
-################
-# code
-#
-C_label Skein1024_Process_Block
-#
- Setup_Stack 1024,ROUNDS_1024/8,WCNT
- movq TWEAK+ 8(%rdi),%r9
- jmp Skein1024_block_loop
- # main hash loop for Skein1024
- .p2align 4
-Skein1024_block_loop:
- # general register usage:
- # RSP = stack pointer
- # RAX..RDX,RSI,RDI = X1, X3..X7 (state words)
- # R8 ..R15 = X8..X15 (state words)
- # RBP = temp (used for X0 and X2)
- #
- .if (SKEIN_ASM_UNROLL & 1024) == 0
- xorq %rax,%rax #init loop index on the stack
- movq %rax,rIdx_offs(%rsp)
- .endif
- movq TWEAK+ 0(%rdi),%r8
- addq bitAdd+ F_O(%rbp),%r8 #computed updated tweak value T0
- movq %r9 ,%r10
- xorq %r8 ,%r10 #%rax/%rbx/%rcx = tweak schedule
- movq %r8 ,TWEAK+ 0(%rdi) #save updated tweak value ctx->h.T[0]
- movq %r8 ,ksTwk+ 0+F_O(%rbp)
- movq %r9 ,ksTwk+ 8+F_O(%rbp) #keep values in %r8 ,%r9 for initial tweak injection below
- movq %r10,ksTwk+16+F_O(%rbp)
- .if _SKEIN_DEBUG
- movq %r9 ,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
- .endif
- movq blkPtr +F_O(%rbp),%rsi # rsi --> input block
- movq $KW_PARITY ,%rax #overall key schedule parity
-
- # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
- .irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps
- movq X_VARS+8*\_rN_(%rdi),%r14 #get state word
- movq 8*\_rN_(%rsi),%r15 #get msg word
- xorq %r14,%rax #update key schedule overall parity
- movq %r14,ksKey +8*\_rN_+F_O(%rbp) #save key schedule word on stack
- movq %r15,Wcopy +8*\_rN_+F_O(%rbp) #save local msg Wcopy
- addq %r15,%r14 #do the initial key injection
- movq %r14,X_stk +8*\_rN_ (%rsp) #save initial state var on stack
- .endr
- # now process the rest, using the "real" registers
- # (MUST do it in reverse order to inject tweaks r8/r9 first)
- .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
-_oo_ = o1K_\_rr_ #offset assocated with the register
- movq X_VARS+8*_oo_(%rdi),%\_rr_ #get key schedule word from context
- movq 8*_oo_(%rsi),%rcx #get next input msg word
- movq %\_rr_, ksKey +8*_oo_(%rsp) #save key schedule on stack
- xorq %\_rr_, %rax #accumulate key schedule parity
- movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward
- addq %rcx,%\_rr_ #do the initial key injection
- .if _oo_ == 13 #do the initial tweak injection
- addReg \_rr_,r8 # (only in words 13/14)
- .elseif _oo_ == 14
- addReg \_rr_,r9
- .endif
- .endr
- movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity
-.if _SKEIN_DEBUG
- Skein_Debug_Block 1024 #initial debug dump
-.endif
- addq $8*WCNT,%rsi #bump the msg ptr
- movq %rsi,blkPtr+F_O(%rbp) #save bumped msg ptr
- # re-load words 0..4 from stack, enter the main loop
- .irp _rr_,rdi,rsi,rbp,rax,rcx #(no need to re-load x6, already on stack)
- movq X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
- .endr
-.if _SKEIN_DEBUG
- Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection
-.endif
- #
- #################
- # now the key schedule is computed. Start the rounds
- #
-.if SKEIN_ASM_UNROLL & 1024
-_UNROLL_CNT = ROUNDS_1024/8
-.else
-_UNROLL_CNT = SKEIN_UNROLL_1024
- .if ((ROUNDS_1024/8) % _UNROLL_CNT)
- .error "Invalid SKEIN_UNROLL_1024"
- .endif
-Skein1024_round_loop:
-.endif
-#
-_Rbase_ = 0
-.rept _UNROLL_CNT*2 #implement the rounds, 4 at a time
- r1024_FourRounds %(4*_Rbase_+00)
-_Rbase_ = _Rbase_+1
-.endr #rept _UNROLL_CNT
-#
-.if (SKEIN_ASM_UNROLL & 1024) == 0
- cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see .if we are done
- jb Skein1024_round_loop
-.endif
- # end of rounds
- #################
- #
- # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}
- movq %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack
- movq ctxPtr(%rsp),%rdx
-
- .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x7
-_oo_ = o1K_\_rr_
- xorq Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
- movq %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
- .if (_oo_ == 9)
- movq $FIRST_MASK64 ,%r9
- .endif
- .if (_oo_ == 14)
- andq TWEAK+ 8(%rdx),%r9
- .endif
- .endr
- #
- movq X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
- movq X_stk +8*7(%rsp),%rbx
- xorq Wcopy +8*6(%rsp),%rax
- xorq Wcopy +8*7(%rsp),%rbx
- movq %rax,X_VARS+8*6(%rdx)
- decq blkCnt(%rsp) #set zero flag iff done
- movq %rbx,X_VARS+8*7(%rdx)
-
- Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,<cmpq $0,blkCnt(%rsp)>
- # go back for more blocks, if needed
- movq ctxPtr(%rsp),%rdi #don't muck with the flags here!
- lea FRAME_OFFS(%rsp),%rbp
- jnz Skein1024_block_loop
- movq %r9 ,TWEAK+ 8(%rdx)
- Reset_Stack
- ret
-#
-Skein1024_Process_Block_End:
-#
-.if _SKEIN_DEBUG
-Skein_Debug_Round_1024:
- # call here with rdx = "round number",
-_SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr
- #
- #save rest of X[] state on stack so debug routines can access it
- .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
- movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
- .endr
- # Figure out what to do with x0 (rdi). When rdx == 0 mod 4, it's already on stack
- cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save
- jae save_x0
- testq $3,%rdx #otherwise only if rdx != 0 mod 4
- jz save_x0_not
-save_x0:
- movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
-save_x0_not:
- #figure out the x4/x6 swapping state and save the correct one!
- cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
- jae save_x4
- testq $1,%rdx #and even ones have r4 as well
- jz save_x4
- movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp)
- jmp debug_1024_go
-save_x4:
- movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
-debug_1024_go:
- #now all is saved in Xstk[] except for rdx
- push %rsi #save two regs for BLK_BITS-specific parms
- push %rdi
-_SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32)
-
- movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call)
- movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]
-
- movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr
- movq $1024,%rdi #rdi = block size
- jmp Skein_Debug_Round_Common
-.endif
-#
-.if _SKEIN_CODE_SIZE
-C_label Skein1024_Process_Block_CodeSize
- movq $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax
- ret
-#
-C_label Skein1024_Unroll_Cnt
- .if _UNROLL_CNT <> (ROUNDS_1024/8)
- movq $_UNROLL_CNT,%rax
- .else
- xorq %rax,%rax
- .endif
- ret
-.endif
-#
-.endif # _USE_ASM_ and 1024
-#
-.if _SKEIN_DEBUG
-#----------------------------------------------------------------
-#local debug routine to set up for calls to:
-# void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
-# [ rdi rsi rdx rcx]
-#
-# here with %rdx = round number
-# %rsi = ctx_hdr_ptr
-# %rdi = block size (256/512/1024)
-# on stack: saved rdi, saved rsi, retAddr, saved rdx
-#
-Skein_Debug_Round_Common:
-_SP_OFFS_ = 32 #account for four words on stack already
- .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs
- pushq %\_rr_
-_SP_OFFS_ = _SP_OFFS_+8
- .endr
- .if (_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here
- .error "Debug_Round_Common: stack alignment"
- .endif
- # compute %rcx = ptr to the X[] array on the stack (final parameter to call)
- leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
- cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"?
- jnz _got_rcxA
- leaq X_VARS(%rsi),%rcx
-_got_rcxA:
- .if _USE_ASM_ & 1024
- # special handling for 1024-bit case
- # (for rounds right before with key injection:
- # use xDebug_1024[] instead of X_stk[])
- cmpq $SKEIN_RND_SPECIAL,%rdx
- jae _got_rcxB #must be a normal round
- orq %rdx,%rdx
- jz _got_rcxB #just before key injection
- test $3,%rdx
- jne _got_rcxB
- cmp $1024,%rdi #only 1024-bit(s) for now
- jne _got_rcxB
- leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx
-_got_rcxB:
- .endif
- call Skein_Show_Round #call external debug handler
-
- .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs
- popq %\_rr_
-_SP_OFFS_ = _SP_OFFS_-8
- .endr
- .if _SP_OFFS_ - 32
- .error "Debug_Round_Common: push/pop misalignment!"
- .endif
- popq %rdi
- popq %rsi
- ret
-.endif
-#----------------------------------------------------------------
- .section .note.GNU-stack,"",@progbits
-
- .end
Index: head/sys/modules/crypto/Makefile
===================================================================
--- head/sys/modules/crypto/Makefile
+++ head/sys/modules/crypto/Makefile
@@ -28,13 +28,13 @@
SRCS += skein.c skein_block.c
# unroll the 256 and 512 loops, half unroll the 1024
CFLAGS.skein_block.c += -DSKEIN_LOOP=995
-.if exists(${MACHINE_ARCH}/skein_block_asm.s)
+.if exists(${MACHINE_ARCH}/skein_block_asm.S)
.PATH: ${SRCTOP}/sys/crypto/skein/${MACHINE_ARCH}
-SRCS += skein_block_asm.s
+SRCS += skein_block_asm.S
CFLAGS += -DSKEIN_ASM -DSKEIN_USE_ASM=1792 # list of block functions to replace with assembly: 256+512+1024 = 1792
ACFLAGS += -DELF -Wa,--noexecstack
# Fully unroll all loops in the assembly optimized version
-AFLAGS+= --defsym SKEIN_LOOP=0 --defsym SKEIN_USE_ASM=1792
+ACFLAGS += -DSKEIN_LOOP=0
.endif
SRCS += siphash.c
SRCS += gmac.c gfmult.c

File Metadata

Mime Type
text/plain
Expires
Mon, Dec 22, 9:10 AM (18 h, 39 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
27142957
Default Alt Text
D8434.id72758.diff (92 KB)

Event Timeline