Index: lib/libmd/Makefile =================================================================== --- lib/libmd/Makefile +++ lib/libmd/Makefile @@ -116,18 +116,18 @@ SRCS+= rmd160.S CFLAGS+= -DRMD160_ASM .endif -.if exists(${MACHINE_ARCH}/skein_block_asm.s) +.if exists(${MACHINE_ARCH}/skein_block_asm.S) .if defined(XAS) || ${MK_BINUTILS_BOOTSTRAP} != "no" AFLAGS += --strip-local-absolute # Fully unroll all loops in the assembly optimized version AFLAGS+= --defsym SKEIN_LOOP=0 -SRCS+= skein_block_asm.s +SRCS+= skein_block_asm.S CFLAGS+= -DSKEIN_ASM -DSKEIN_USE_ASM=1792 # list of block functions to replace with assembly: 256+512+1024 = 1792 .else .warning as not available: not using optimized Skein asm .endif .endif -.if exists(${MACHINE_ARCH}/sha.S) || exists(${MACHINE_ARCH}/rmd160.S) || exists(${MACHINE_ARCH}/skein_block_asm.s) +.if exists(${MACHINE_ARCH}/sha.S) || exists(${MACHINE_ARCH}/rmd160.S) || exists(${MACHINE_ARCH}/skein_block_asm.S) ACFLAGS+= -DELF -Wa,--noexecstack .endif .endif # ${USE_ASM_SOURCES} != 0 Index: sys/crypto/skein/amd64/skein_block_asm.S =================================================================== --- sys/crypto/skein/amd64/skein_block_asm.S +++ sys/crypto/skein/amd64/skein_block_asm.S @@ -10,60 +10,22 @@ # .text .altmacro - .psize 0,128 #list file has no page boundaries +# .psize 0,128 #list file has no page boundaries # _MASK_ALL_ = (256+512+1024) #all three algorithm bits _MAX_FRAME_ = 240 # ################# -.ifndef SKEIN_USE_ASM -_USE_ASM_ = _MASK_ALL_ -.else -_USE_ASM_ = SKEIN_USE_ASM -.endif +_USE_ASM_ = SKEIN_USE_ASM ################# -.ifndef SKEIN_LOOP #configure loop unrolling -_SKEIN_LOOP = 2 #default is fully unrolled for 256/512, twice for 1024 -.else -_SKEIN_LOOP = SKEIN_LOOP - .irp _NN_,%_SKEIN_LOOP #only display loop unrolling if default changed on command line -#.print "+++ SKEIN_LOOP = \_NN_" - .endr -.endif +_SKEIN_LOOP = SKEIN_LOOP #default is fully unrolled for 256/512, twice for 1024 # the unroll counts (0 --> fully unrolled) -SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10 -SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10 -SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10 -# SKEIN_ASM_UNROLL = 0 - .irp _NN_,256,512,1024 - .if (SKEIN_UNROLL_\_NN_) == 0 -SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + \_NN_ - .endif - .endr ################# # -.ifndef SKEIN_ROUNDS ROUNDS_256 = 72 ROUNDS_512 = 72 ROUNDS_1024 = 80 -.else -ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5) -ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5) -ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5) -# only display rounds if default size is changed on command line -.irp _NN_,256,512,1024 - .if _USE_ASM_ && \_NN_ - .irp _RR_,%(ROUNDS_\_NN_) - .if _NN_ < 1024 -.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" - .else -.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" - .endif - .endr - .endif -.endr -.endif ################# # .ifdef SKEIN_CODE_SIZE @@ -78,13 +40,6 @@ # ################# # -.ifndef SKEIN_DEBUG -_SKEIN_DEBUG = 0 -.else -_SKEIN_DEBUG = 1 -.endif -################# -# # define offsets of fields in hash context structure # HASH_BITS = 0 #bits of hash output @@ -235,17 +190,6 @@ RC_1024_7_5 = 31 RC_1024_7_6 = 37 RC_1024_7_7 = 20 -# -# Input: reg -# Output: <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024 -# -.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM -_RCNT_ = RC_\BLK_SIZE&_\ROUND_NUM&_\MIX_NUM - .if _RCNT_ #is there anything to do? 
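For reference: the rename from skein_block_asm.s to skein_block_asm.S makes the file pass through the C preprocessor, so the -D defines above reach it, and SKEIN_USE_ASM is a bitmask of block sizes, 256 + 512 + 1024 = 1792 meaning "use the assembly block function for all three Skein sizes". A minimal standalone C sketch of that bitmask convention (illustrative, not code from the tree):

#include <stdio.h>

/* SKEIN_USE_ASM names the block sizes whose block functions come from
 * skein_block_asm.S; the C fallbacks are compiled out for those sizes. */
#define SKEIN_USE_ASM 1792   /* 256 + 512 + 1024 */

int
main(void)
{
	int sizes[] = { 256, 512, 1024 };
	int i;

	for (i = 0; i < 3; i++)
		printf("Skein-%d block function: %s\n", sizes[i],
		    (SKEIN_USE_ASM & sizes[i]) ? "assembly" : "C");
	return (0);
}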
- rolq $_RCNT_,%\reg - .endif -.endm -# #---------------------------------------------------------------- # # MACROS: define local vars and configure stack @@ -279,11 +223,6 @@ StackVar ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen .endif StackVar Wcopy ,8*(WCNT) #copy of input block - .if _SKEIN_DEBUG - .if \debugCnt + 0 #temp location for debug X[] info - StackVar xDebug_\BLK_BITS ,8*(\debugCnt) - .endif - .endif .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0 StackVar align16,8 #keep 16-byte aligned (adjust for retAddr?) tmpStk_\BLK_BITS = align16 #use this @@ -334,7 +273,7 @@ #---------------------------------------------------------------- # .macro Reset_Stack - addq $LOCAL_SIZE,%rsp #get rid of locals (wipe??) + addq $LOCAL_SIZE,%rsp #get rid of locals (wipe) .irp _reg_,r15,r14,r13,r12,rbx,rbp popq %\_reg_ #restore caller's regs _PushCnt_ = _PushCnt_ - 1 @@ -345,83 +284,7 @@ .endm # Reset_Stack # #---------------------------------------------------------------- -# macros to help debug internals # -.if _SKEIN_DEBUG - .extern Skein_Show_Block #calls to C routines - .extern Skein_Show_Round -# -SKEIN_RND_SPECIAL = 1000 -SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0 -SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1 -SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2 -# -.macro Skein_Debug_Block BLK_BITS -# -#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X, -# const u08b_t *blkPtr, const u64b_t *wPtr, -# const u64b_t *ksPtr,const u64b_t *tsPtr) -# -_NN_ = 0 - .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11 - pushq %\_reg_ #save all volatile regs on tack before the call -_NN_ = _NN_ + 1 - .endr - # get and push call parameters - movq $\BLK_BITS ,%rdi #bits - movq ctxPtr+F_O(%rbp),%rsi #h (pointer) - leaq X_VARS (%rsi),%rdx #X (pointer) - movq blkPtr+F_O(%rbp),%rcx #blkPtr - leaq Wcopy +F_O(%rbp),%r8 #wPtr - leaq ksKey +F_O(%rbp),%r9 #key pointer - leaq ksTwk +F_O(%rbp),%rax #tweak pointer - pushq %rax # (pass on the stack) - call Skein_Show_Block #call external debug handler - addq $8*1,%rsp #discard parameters on stack - .if (_NN_ % 2 ) == 0 #check stack alignment - .error "Stack misalignment problem in Skein_Debug_Block_\_BLK_BITS" - .endif - .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax - popq %\_reg_ #restore regs -_NN_ = _NN_ - 1 - .endr - .if _NN_ - .error "Push/pop mismatch problem in Skein_Debug_Block_\_BLK_BITS" - .endif -.endm # Skein_Debug_Block -# -# the macro to "call" to debug a round -# -.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp - # call the appropriate (local) debug "function" - pushq %rdx #save rdx, so we can use it for round "number" - .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL) - movq $\R,%rdx - .else #compute round number using edi -_rOffs_ = \RDI_OFFS + 0 - .if \BLK_BITS == 1024 - movq rIdx_offs+8(%rsp),%rdx #get rIdx off the stack (adjust for pushq rdx above) - leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx - .else - leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx - .endif - .endif - call Skein_Debug_Round_\BLK_BITS - popq %rdx #restore origianl rdx value -# - afterOp -.endm # Skein_Debug_Round -.else #------- _SKEIN_DEBUG (dummy macros if debug not enabled) -.macro Skein_Debug_Block BLK_BITS -.endm -# -.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp -.endm -# -.endif # _SKEIN_DEBUG -# -#---------------------------------------------------------------- -# .macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs .if \immOffs + 0 leaq \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg @@ -440,6 +303,13 @@ .macro xorReg 
dstReg,srcReg_A,srcReg_B xorq %\srcReg_A\srcReg_B,%\dstReg .endm + +# SkeinMix +.macro skeinMix a,b,c + addReg \a,\b + rolq \c,%\b + xorReg \b,\a +.endm # #---------------------------------------------------------------- # @@ -510,150 +380,807 @@ addq %r13,%rbx addq %r14,%rcx -.if _SKEIN_DEBUG - movq %r14,TWEAK+ 8(%rdi) #save updated tweak T[1] (start bit cleared?) - movq %r8 ,ksKey+ 0+F_O(%rbp) #save key schedule on stack for Skein_Debug_Block - movq %r9 ,ksKey+ 8+F_O(%rbp) - movq %r10,ksKey+16+F_O(%rbp) - movq %r11,ksKey+24+F_O(%rbp) - movq %r12,ksKey+32+F_O(%rbp) - - movq %r13,ksTwk+ 0+F_O(%rbp) - movq %r14,ksTwk+ 8+F_O(%rbp) - movq %r15,ksTwk+16+F_O(%rbp) - - movq %rax,X_stk + 0(%rsp) #save X[] on stack for Skein_Debug_Block - movq %rbx,X_stk + 8(%rsp) - movq %rcx,X_stk +16(%rsp) - movq %rdx,X_stk +24(%rsp) - - Skein_Debug_Block 256 #debug dump - Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL -.endif -# -.if ((SKEIN_ASM_UNROLL & 256) == 0) - movq %r8 ,ksKey+40+F_O(%rbp) #save key schedule on stack for looping code - movq %r9 ,ksKey+ 8+F_O(%rbp) - movq %r10,ksKey+16+F_O(%rbp) - movq %r11,ksKey+24+F_O(%rbp) - movq %r12,ksKey+32+F_O(%rbp) - - movq %r13,ksTwk+24+F_O(%rbp) - movq %r14,ksTwk+ 8+F_O(%rbp) - movq %r15,ksTwk+16+F_O(%rbp) -.endif addq $WCNT*8,%rsi #skip the block movq %rsi,blkPtr +F_O(%rbp) #update block pointer # # now the key schedule is computed. Start the rounds # -.if SKEIN_ASM_UNROLL & 256 _UNROLL_CNT = ROUNDS_256/8 -.else -_UNROLL_CNT = SKEIN_UNROLL_256 - .if ((ROUNDS_256/8) % _UNROLL_CNT) - .error "Invalid SKEIN_UNROLL_256" - .endif - xorq %rdi,%rdi #rdi = iteration count -Skein_256_round_loop: -.endif -_Rbase_ = 0 -.rept _UNROLL_CNT*2 - # all X and ks vars in regs # (ops to "rotate" ks vars, via mem, if not unrolled) - # round 4*_RBase_ + 0 - addReg rax, rbx - RotL64 rbx, 256,%((4*_Rbase_+0) % 8),0 - addReg rcx, rdx - .if (SKEIN_ASM_UNROLL & 256) == 0 - movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8 - .endif - xorReg rbx, rax - RotL64 rdx, 256,%((4*_Rbase_+0) % 8),1 - xorReg rdx, rcx - .if SKEIN_ASM_UNROLL & 256 - .irp _r0_,%( 8+(_Rbase_+3) % 5) - .irp _r1_,%(13+(_Rbase_+2) % 3) - leaq (%r\_r0_,%r\_r1_),%rdi #precompute key injection value for %rcx - .endr - .endr - .endif - .if (SKEIN_ASM_UNROLL & 256) == 0 - movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13 - .endif - Skein_Debug_Round 256,%(4*_Rbase_+1) - # round 4*_Rbase_ + 1 - addReg rax, rdx - RotL64 rdx, 256,%((4*_Rbase_+1) % 8),0 - xorReg rdx, rax - .if (SKEIN_ASM_UNROLL & 256) == 0 - movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9 - .endif - addReg rcx, rbx - RotL64 rbx, 256,%((4*_Rbase_+1) % 8),1 - xorReg rbx, rcx - .if (SKEIN_ASM_UNROLL & 256) == 0 - movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11 - .endif - Skein_Debug_Round 256,%(4*_Rbase_+2) - .if SKEIN_ASM_UNROLL & 256 - .irp _r0_,%( 8+(_Rbase_+2) % 5) - .irp _r1_,%(13+(_Rbase_+1) % 3) - leaq (%r\_r0_,%r\_r1_),%rsi #precompute key injection value for %rbx - .endr - .endr - .endif - # round 4*_Rbase_ + 2 - addReg rax, rbx - RotL64 rbx, 256,%((4*_Rbase_+2) % 8),0 - addReg rcx, rdx - .if (SKEIN_ASM_UNROLL & 256) == 0 - movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10 - .endif - xorReg rbx, rax - RotL64 rdx, 256,%((4*_Rbase_+2) % 8),1 - xorReg rdx, rcx - .if (SKEIN_ASM_UNROLL & 256) == 0 - movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8) #"rotate" the key - leaq 1(%r11,%rdi),%r11 #precompute key + tweak - .endif - Skein_Debug_Round 256,%(4*_Rbase_+3) - # round 4*_Rbase_ + 3 - addReg rax, rdx - RotL64 rdx, 256,%((4*_Rbase_+3) % 8),0 - addReg rcx, rbx - .if (SKEIN_ASM_UNROLL & 256) == 0 - addq ksTwk+8*2+F_O(%rbp,%rdi,8),%r10 
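The new skeinMix macro, and every inlined leaq/rolq/xorq triple in the unrolled rounds that follow, is one Threefish MIX step. A hedged C sketch of what it computes; rotl64, mix and two_rounds_256 are illustrative names, and r0..r3 stand for the RC_256_<round>_<mix> rotation constants defined earlier in the file:

#include <stdint.h>

static inline uint64_t
rotl64(uint64_t x, unsigned r)
{
	return ((x << (r & 63)) | (x >> ((64 - r) & 63)));
}

/* The MIX step performed by skeinMix (and by each inlined
 * leaq/rolq/xorq triple): a += b; b = rotl(b, r); b ^= a. */
static inline void
mix(uint64_t *a, uint64_t *b, unsigned r)
{
	*a += *b;
	*b = rotl64(*b, r);
	*b ^= *a;
}

/* Two consecutive Skein-256 rounds as the unrolled code emits them:
 * even rounds mix the (x0,x1) and (x2,x3) pairs, odd rounds mix
 * (x0,x3) and (x2,x1). */
static void
two_rounds_256(uint64_t x[4], unsigned r0, unsigned r1, unsigned r2,
    unsigned r3)
{
	mix(&x[0], &x[1], r0);
	mix(&x[2], &x[3], r1);
	mix(&x[0], &x[3], r2);
	mix(&x[2], &x[1], r3);
}

In the 256-bit path the state lives in %rax..%rdx, which is why the leaq/xorq operands alternate between those two pairing patterns from round to round.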
#precompute key + tweak - movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8) #"rotate" the tweak - .endif - xorReg rdx, rax - RotL64 rbx, 256,%((4*_Rbase_+3) % 8),1 - xorReg rbx, rcx - Skein_Debug_Round 256,%(4*_Rbase_+4) - .if (SKEIN_ASM_UNROLL & 256) == 0 - addReg r9 ,r13 #precompute key+tweak - .endif - #inject key schedule words -_Rbase_ = _Rbase_+1 - .if SKEIN_ASM_UNROLL & 256 - addReg rax,r,%(8+((_Rbase_+0) % 5)) - addReg rbx,rsi - addReg rcx,rdi - addReg rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_ - .else - incq %rdi - addReg rax,r8 - addReg rcx,r10 - addReg rbx,r9 - addReg rdx,r11 - .endif - Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT -.endr #rept _UNROLL_CNT + + # round 0 + leaq (%rbx, %rax),%rax + rolq $RC_256_0_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_0_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r11,%r15),%rdi + + # round 1 + leaq (%rdx, %rax),%rax + rolq $RC_256_1_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_1_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r10,%r14),%rsi + + #round 2 + leaq (%rbx, %rax),%rax + rolq $RC_256_2_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_2_1,%rdx + xorq %rcx, %rdx + + #round 3 + leaq (%rdx, %rax),%rax + rolq $RC_256_3_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_3_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r9, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 1(%r12,%rdx),%rdx + + # round 4 + leaq (%rbx, %rax),%rax + rolq $RC_256_4_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_4_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r12,%r13),%rdi + + # round 5 + leaq (%rdx, %rax),%rax + rolq $RC_256_5_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_5_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r11,%r15),%rsi + + #round 6 + leaq (%rbx, %rax),%rax + rolq $RC_256_6_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_6_1,%rdx + xorq %rcx, %rdx + + #round 7 + leaq (%rdx, %rax),%rax + rolq $RC_256_7_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_7_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r10, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 2(%r8,%rdx),%rdx + + # round 8 + leaq (%rbx, %rax),%rax + rolq $RC_256_0_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_0_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r8,%r14),%rdi + + # round 9 + leaq (%rdx, %rax),%rax + rolq $RC_256_1_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_1_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r12,%r13),%rsi + + #round 10 + leaq (%rbx, %rax),%rax + rolq $RC_256_2_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_2_1,%rdx + xorq %rcx, %rdx + + #round 11 + leaq (%rdx, %rax),%rax + rolq $RC_256_3_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_3_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r11, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 3(%r9,%rdx),%rdx + + # round 12 + leaq (%rbx, %rax),%rax + rolq $RC_256_4_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_4_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r9,%r15),%rdi + + # round 13 + leaq (%rdx, %rax),%rax + rolq $RC_256_5_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_5_1,%rbx + xorq 
%rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r8,%r14),%rsi + + #round 14 + leaq (%rbx, %rax),%rax + rolq $RC_256_6_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_6_1,%rdx + xorq %rcx, %rdx + + #round 15 + leaq (%rdx, %rax),%rax + rolq $RC_256_7_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_7_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r12, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 4(%r10,%rdx),%rdx + + # round 16 + leaq (%rbx, %rax),%rax + rolq $RC_256_0_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_0_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r10,%r13),%rdi + + # round 17 + leaq (%rdx, %rax),%rax + rolq $RC_256_1_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_1_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r9,%r15),%rsi + + #round 18 + leaq (%rbx, %rax),%rax + rolq $RC_256_2_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_2_1,%rdx + xorq %rcx, %rdx + + #round 19 + leaq (%rdx, %rax),%rax + rolq $RC_256_3_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_3_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r8, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 5(%r11,%rdx),%rdx + + # round 20 + leaq (%rbx, %rax),%rax + rolq $RC_256_4_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_4_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r11,%r14),%rdi + + # round 21 + leaq (%rdx, %rax),%rax + rolq $RC_256_5_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_5_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r10,%r13),%rsi + + #round 22 + leaq (%rbx, %rax),%rax + rolq $RC_256_6_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_6_1,%rdx + xorq %rcx, %rdx + + #round 23 + leaq (%rdx, %rax),%rax + rolq $RC_256_7_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_7_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r9, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 6(%r12,%rdx),%rdx + + # round 24 + leaq (%rbx, %rax),%rax + rolq $RC_256_0_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_0_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r12,%r15),%rdi + + # round 25 + leaq (%rdx, %rax),%rax + rolq $RC_256_1_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_1_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r11,%r14),%rsi + + #round 26 + leaq (%rbx, %rax),%rax + rolq $RC_256_2_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_2_1,%rdx + xorq %rcx, %rdx + + #round 27 + leaq (%rdx, %rax),%rax + rolq $RC_256_3_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_3_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r10, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 7(%r8,%rdx),%rdx + + # round 28 + leaq (%rbx, %rax),%rax + rolq $RC_256_4_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_4_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r8,%r13),%rdi + + # round 29 + leaq (%rdx, %rax),%rax + rolq $RC_256_5_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_5_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r12,%r15),%rsi + + #round 30 + leaq (%rbx, %rax),%rax + rolq $RC_256_6_0,%rbx + leaq 
(%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_6_1,%rdx + xorq %rcx, %rdx + + #round 31 + leaq (%rdx, %rax),%rax + rolq $RC_256_7_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_7_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r11, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 8(%r9,%rdx),%rdx + + # round 32 + leaq (%rbx, %rax),%rax + rolq $RC_256_0_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_0_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r9,%r14),%rdi + + # round 33 + leaq (%rdx, %rax),%rax + rolq $RC_256_1_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_1_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r8,%r13),%rsi + + #round 34 + leaq (%rbx, %rax),%rax + rolq $RC_256_2_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_2_1,%rdx + xorq %rcx, %rdx + + #round 35 + leaq (%rdx, %rax),%rax + rolq $RC_256_3_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_3_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r12, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 9(%r10,%rdx),%rdx + + # round 36 + leaq (%rbx, %rax),%rax + rolq $RC_256_4_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_4_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r10,%r15),%rdi + + # round 37 + leaq (%rdx, %rax),%rax + rolq $RC_256_5_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_5_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r9,%r14),%rsi + + #round 38 + leaq (%rbx, %rax),%rax + rolq $RC_256_6_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_6_1,%rdx + xorq %rcx, %rdx + + #round 39 + leaq (%rdx, %rax),%rax + rolq $RC_256_7_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_7_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r8, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 10(%r11,%rdx),%rdx + + # round 40 + leaq (%rbx, %rax),%rax + rolq $RC_256_0_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_0_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r11,%r13),%rdi + + # round 41 + leaq (%rdx, %rax),%rax + rolq $RC_256_1_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_1_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r10,%r15),%rsi + + #round 42 + leaq (%rbx, %rax),%rax + rolq $RC_256_2_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_2_1,%rdx + xorq %rcx, %rdx + + #round 43 + leaq (%rdx, %rax),%rax + rolq $RC_256_3_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_3_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r9, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 11(%r12,%rdx),%rdx + + # round 44 + leaq (%rbx, %rax),%rax + rolq $RC_256_4_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_4_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r12,%r14),%rdi + + # round 45 + leaq (%rdx, %rax),%rax + rolq $RC_256_5_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_5_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r11,%r13),%rsi + + #round 46 + leaq (%rbx, %rax),%rax + rolq $RC_256_6_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_6_1,%rdx + xorq %rcx, %rdx + + #round 47 + leaq (%rdx, %rax),%rax + rolq $RC_256_7_0,%rdx + leaq 
(%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_7_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r10, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 12(%r8,%rdx),%rdx + + # round 48 + leaq (%rbx, %rax),%rax + rolq $RC_256_0_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_0_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r8,%r15),%rdi + + # round 49 + leaq (%rdx, %rax),%rax + rolq $RC_256_1_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_1_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r12,%r14),%rsi + + #round 50 + leaq (%rbx, %rax),%rax + rolq $RC_256_2_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_2_1,%rdx + xorq %rcx, %rdx + + #round 51 + leaq (%rdx, %rax),%rax + rolq $RC_256_3_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_3_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r11, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 13(%r9,%rdx),%rdx + + # round 52 + leaq (%rbx, %rax),%rax + rolq $RC_256_4_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_4_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r9,%r13),%rdi + + # round 53 + leaq (%rdx, %rax),%rax + rolq $RC_256_5_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_5_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r8,%r15),%rsi + + #round 54 + leaq (%rbx, %rax),%rax + rolq $RC_256_6_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_6_1,%rdx + xorq %rcx, %rdx + + #round 55 + leaq (%rdx, %rax),%rax + rolq $RC_256_7_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_7_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r12, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 14(%r10,%rdx),%rdx + + # round 56 + leaq (%rbx, %rax),%rax + rolq $RC_256_0_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_0_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r10,%r14),%rdi + + # round 57 + leaq (%rdx, %rax),%rax + rolq $RC_256_1_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_1_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r9,%r13),%rsi + + #round 58 + leaq (%rbx, %rax),%rax + rolq $RC_256_2_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_2_1,%rdx + xorq %rcx, %rdx + + #round 59 + leaq (%rdx, %rax),%rax + rolq $RC_256_3_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_3_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r8, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 15(%r11,%rdx),%rdx + + # round 60 + leaq (%rbx, %rax),%rax + rolq $RC_256_4_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_4_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r11,%r15),%rdi + + # round 61 + leaq (%rdx, %rax),%rax + rolq $RC_256_5_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_5_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r10,%r14),%rsi + + #round 62 + leaq (%rbx, %rax),%rax + rolq $RC_256_6_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_6_1,%rdx + xorq %rcx, %rdx + + #round 63 + leaq (%rdx, %rax),%rax + rolq $RC_256_7_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_7_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r9, %rax),%rax + leaq (%rsi, %rbx),%rbx + 
leaq (%rdi, %rcx),%rcx + leaq 16(%r12,%rdx),%rdx + + # round 64 + leaq (%rbx, %rax),%rax + rolq $RC_256_0_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_0_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r12,%r13),%rdi + + # round 65 + leaq (%rdx, %rax),%rax + rolq $RC_256_1_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_1_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r11,%r15),%rsi + + #round 66 + leaq (%rbx, %rax),%rax + rolq $RC_256_2_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_2_1,%rdx + xorq %rcx, %rdx + + #round 67 + leaq (%rdx, %rax),%rax + rolq $RC_256_3_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_3_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r10, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 17(%r8,%rdx),%rdx + + # round 68 + leaq (%rbx, %rax),%rax + rolq $RC_256_4_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_4_1,%rdx + xorq %rcx, %rdx + + #precompute key injection value for %rcx + leaq (%r8,%r14),%rdi + + # round 69 + leaq (%rdx, %rax),%rax + rolq $RC_256_5_0,%rdx + xorq %rax, %rdx + leaq (%rbx, %rcx),%rcx + rolq $RC_256_5_1,%rbx + xorq %rcx, %rbx + + #precompute key injection value for %rbx + leaq (%r12,%r13),%rsi + + #round 70 + leaq (%rbx, %rax),%rax + rolq $RC_256_6_0,%rbx + leaq (%rdx, %rcx),%rcx + xorq %rax, %rbx + rolq $RC_256_6_1,%rdx + xorq %rcx, %rdx + + #round 71 + leaq (%rdx, %rax),%rax + rolq $RC_256_7_0,%rdx + leaq (%rbx, %rcx),%rcx + xorq %rax, %rdx + rolq $RC_256_7_1,%rbx + xorq %rcx, %rbx + + # Key injection + leaq (%r11, %rax),%rax + leaq (%rsi, %rbx),%rbx + leaq (%rdi, %rcx),%rcx + leaq 18(%r9,%rdx),%rdx + # -.if (SKEIN_ASM_UNROLL & 256) == 0 - cmpq $2*(ROUNDS_256/8),%rdi - jb Skein_256_round_loop -.endif # (SKEIN_ASM_UNROLL & 256) == 0 movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context #---------------------------- @@ -669,8 +1196,6 @@ movq %rcx,X_VARS+16(%rdi) movq %rdx,X_VARS+24(%rdi) - Skein_Debug_Round 256,SKEIN_RND_FEED_FWD - # go back for more blocks, if needed decq blkCnt+F_O(%rbp) jnz Skein_256_block_loop @@ -679,20 +1204,6 @@ ret Skein_256_Process_Block_End: - .if _SKEIN_DEBUG -Skein_Debug_Round_256: #here with rdx == round "number" from macro - pushq %rsi #save two regs for BLK_BITS-specific parms - pushq %rdi - movq 24(%rsp),%rdi #get back original rdx (pushed on stack in macro call) to rdi - movq %rax,X_stk+ 0+F_O(%rbp) #save X[] state on stack so debug routines can access it - movq %rbx,X_stk+ 8+F_O(%rbp) #(use FP_ since rsp has changed!) 
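Each "# Key injection" block above adds the next Threefish-256 subkey: the "precompute key injection value" leaqs build the key-plus-tweak sums in %rsi/%rdi one round early, and the literal 1..18 in the final leaq is the injection counter. Assuming the usual register assignment for this routine (%r8..%r12 holding the five-word key schedule, %r13..%r15 the three-word tweak schedule), this is the subkey pattern from the Threefish spec; the C below is an illustrative restatement, not code from the tree:

#include <stdint.h>

/* Threefish-256 subkey s, injected after every fourth round.  ks[4] and
 * ts[2] are the parity words required by the spec:
 *   ks[4] = C240 ^ ks[0] ^ ks[1] ^ ks[2] ^ ks[3],  ts[2] = ts[0] ^ ts[1],
 * with C240 = 0x1BD11BDAA9FC1A22 in Skein 1.3. */
static void
inject_subkey_256(uint64_t x[4], const uint64_t ks[5], const uint64_t ts[3],
    unsigned s)
{
	x[0] += ks[(s + 0) % 5];
	x[1] += ks[(s + 1) % 5] + ts[s % 3];
	x[2] += ks[(s + 2) % 5] + ts[(s + 1) % 3];
	x[3] += ks[(s + 3) % 5] + s;
}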
- movq %rcx,X_stk+16+F_O(%rbp) - movq %rdi,X_stk+24+F_O(%rbp) - - movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr - movq $256,%rdi #now are set for the call - jmp Skein_Debug_Round_Common - .endif # .if _SKEIN_CODE_SIZE C_label Skein_256_Process_Block_CodeSize @@ -719,74 +1230,6 @@ # X[i] == %r[8+i] #register assignments for X[] values during rounds (i=0..7) # ################# -# MACRO: one round for 512-bit blocks -# -.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4 -# - addReg r\rn0, r\rn1 - RotL64 r\rn1, 512,%((_Rn_) % 8),0 - xorReg r\rn1, r\rn0 - op1 - addReg r\rn2, r\rn3 - RotL64 r\rn3, 512,%((_Rn_) % 8),1 - xorReg r\rn3, r\rn2 - op2 - addReg r\rn4, r\rn5 - RotL64 r\rn5, 512,%((_Rn_) % 8),2 - xorReg r\rn5, r\rn4 - op3 - addReg r\rn6, r\rn7 - RotL64 r\rn7, 512,%((_Rn_) % 8),3 - xorReg r\rn7, r\rn6 - op4 - Skein_Debug_Round 512,%(_Rn_+1),-4 -# -.endm #R_512_OneRound -# -################# -# MACRO: eight rounds for 512-bit blocks -# -.macro R_512_FourRounds _RR_ #RR = base round number (0 % 8) - .if (SKEIN_ASM_UNROLL && 512) - # here for fully unrolled case. - _II_ = ((_RR_)/4) + 1 #key injection counter - R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),,, - R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),,, - R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),,, - R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),, - # inject the key schedule - addq ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8 - addReg r11, rax - addq ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9 - addReg r12, rbx - addq ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10 - addReg r13, rcx - addReg r14, rdx - addReg r15, rsi,,,(_II_) - .else - # here for looping case #"rotate" key/tweak schedule (move up on stack) - incq %rdi #bump key injection counter - R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),,, - R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),,, - R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),,, - R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),, - # inject the key schedule - addq ksKey+8*0+F_O(%rbp,%rdi,8),%r8 - addReg r11, rax - addReg r12, rbx - addq ksKey+8*1+F_O(%rbp,%rdi,8),%r9 - addReg r13, rcx - addReg r14, rdx - addq ksKey+8*2+F_O(%rbp,%rdi,8),%r10 - addReg r15, rsi - addReg r15, rdi #inject the round number - .endif - - #show the result of the key injection - Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT -.endm #R_512_EightRounds -# -################# # instantiated code # C_label Skein_512_Process_Block @@ -814,17 +1257,14 @@ movq %rbx,ksTwk+ 8+F_O(%rbp) movq %rcx,ksTwk+16+F_O(%rbp) .irp _Rn_,8,9,10,11,12,13,14,15 - movq X_VARS+8*(_Rn_-8)(%rdi),%r\_Rn_ + movq X_VARS+8*(\_Rn_-8)(%rdi),%r\_Rn_ xorq %r\_Rn_,%rdx #compute overall parity - movq %r\_Rn_,ksKey+8*(_Rn_-8)+F_O(%rbp) + movq %r\_Rn_,ksKey+8*(\_Rn_-8)+F_O(%rbp) .endr #load state into %r8 ..%r15, compute parity movq %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity addReg r13,rax #precompute key injection for tweak addReg r14, rbx -.if _SKEIN_DEBUG - movq %rbx,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below -.endif movq 0(%rsi),%rax #load input block movq 8(%rsi),%rbx movq 16(%rsi),%rcx @@ -851,14 +1291,6 @@ movq %rcx,Wcopy+48+F_O(%rbp) movq %rdx,Wcopy+56+F_O(%rbp) -.if _SKEIN_DEBUG - .irp _Rn_,8,9,10,11,12,13,14,15 #save values on stack for debug output - movq %r\_Rn_,X_stk+8*(_Rn_-8)(%rsp) - .endr - - Skein_Debug_Block 512 #debug dump - Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL -.endif addq $8*WCNT,%rsi #skip the block movq %rsi,blkPtr+F_O(%rbp) #update block pointer # @@ -865,42 +1297,1335 @@ 
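The .irp loop above turns the incoming chaining value into the Threefish-512 key schedule and accumulates its XOR parity into the ninth key word, while Wcopy keeps the input block around for the feedforward at the end. A rough C equivalent, assuming the parity accumulator is seeded with the Skein key-schedule constant elsewhere in the file; the names below are illustrative:

#include <stddef.h>
#include <stdint.h>

#define SKEIN_KS_PARITY	0x1BD11BDAA9FC1A22ULL	/* Skein 1.3 constant */

static void
skein512_block_setup(const uint64_t chain[8], const uint64_t w[8],
    uint64_t ks[9], uint64_t wcopy[8])
{
	size_t i;

	ks[8] = SKEIN_KS_PARITY;
	for (i = 0; i < 8; i++) {
		ks[i] = chain[i];	/* chaining value is the block key */
		ks[8] ^= chain[i];	/* overall parity word */
		wcopy[i] = w[i];	/* saved for the feedforward XOR */
	}
}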
################# # now the key schedule is computed. Start the rounds # -.if SKEIN_ASM_UNROLL & 512 _UNROLL_CNT = ROUNDS_512/8 -.else -_UNROLL_CNT = SKEIN_UNROLL_512 - .if ((ROUNDS_512/8) % _UNROLL_CNT) - .error "Invalid SKEIN_UNROLL_512" - .endif - xorq %rdi,%rdi #rdi = round counter -Skein_512_round_loop: -.endif -# -_Rbase_ = 0 -.rept _UNROLL_CNT*2 - R_512_FourRounds %(4*_Rbase_+00) -_Rbase_ = _Rbase_+1 -.endr #rept _UNROLL_CNT -# -.if (SKEIN_ASM_UNROLL & 512) == 0 - cmpq $2*(ROUNDS_512/8),%rdi - jb Skein_512_round_loop - movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context -.endif + + + #Round 0 + leaq (%r9, %r8),%r8 + rolq $RC_512_0_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((1)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_0_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_0_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((1)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_0_3,%r15 + xorq %r14, %r15 + + # Round 1 + leaq (%r9, %r10),%r10 + rolq $RC_512_1_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((1)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_1_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_1_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((1)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_1_3,%r11 + xorq %r8, %r11 + + # Round 2 + leaq (%r9, %r12),%r12 + rolq $RC_512_2_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((1)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_2_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_2_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((1)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_2_3,%r15 + xorq %r10, %r15 + + # Round 3 + leaq (%r9, %r14),%r14 + rolq $RC_512_3_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((1)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_3_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_3_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_3_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((1)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((1)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((1)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 1(%rsi,%r15),%r15 + + #Round 4 + leaq (%r9, %r8),%r8 + rolq $RC_512_4_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((2)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_4_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_4_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((2)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_4_3,%r15 + xorq %r14, %r15 + + # Round 5 + leaq (%r9, %r10),%r10 + rolq $RC_512_5_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((2)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_5_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_5_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((2)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_5_3,%r11 + xorq %r8, %r11 + + # Round 6 + leaq (%r9, %r12),%r12 + rolq $RC_512_6_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((2)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_6_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_6_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((2)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_6_3,%r15 + xorq %r10, %r15 + + # Round 7 + leaq (%r9, %r14),%r14 + rolq $RC_512_7_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((2)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_7_1,%r15 + xorq %r8, 
%r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_7_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_7_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((2)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((2)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((2)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 2(%rsi,%r15),%r15 + + #Round 8 + leaq (%r9, %r8),%r8 + rolq $RC_512_0_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((3)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_0_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_0_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((3)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_0_3,%r15 + xorq %r14, %r15 + + # Round 9 + leaq (%r9, %r10),%r10 + rolq $RC_512_1_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((3)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_1_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_1_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((3)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_1_3,%r11 + xorq %r8, %r11 + + # Round 10 + leaq (%r9, %r12),%r12 + rolq $RC_512_2_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((3)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_2_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_2_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((3)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_2_3,%r15 + xorq %r10, %r15 + + # Round 11 + leaq (%r9, %r14),%r14 + rolq $RC_512_3_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((3)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_3_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_3_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_3_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((3)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((3)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((3)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 3(%rsi,%r15),%r15 + + #Round 12 + leaq (%r9, %r8),%r8 + rolq $RC_512_4_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((4)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_4_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_4_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((4)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_4_3,%r15 + xorq %r14, %r15 + + # Round 13 + leaq (%r9, %r10),%r10 + rolq $RC_512_5_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((4)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_5_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_5_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((4)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_5_3,%r11 + xorq %r8, %r11 + + # Round 14 + leaq (%r9, %r12),%r12 + rolq $RC_512_6_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((4)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_6_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_6_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((4)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_6_3,%r15 + xorq %r10, %r15 + + # Round 15 + leaq (%r9, %r14),%r14 + rolq $RC_512_7_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((4)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_7_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_7_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_7_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + 
addq ksKey+8*(((4)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((4)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((4)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 4(%rsi,%r15),%r15 + + #Round 16 + leaq (%r9, %r8),%r8 + rolq $RC_512_0_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((5)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_0_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_0_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((5)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_0_3,%r15 + xorq %r14, %r15 + + # Round 17 + leaq (%r9, %r10),%r10 + rolq $RC_512_1_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((5)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_1_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_1_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((5)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_1_3,%r11 + xorq %r8, %r11 + + # Round 18 + leaq (%r9, %r12),%r12 + rolq $RC_512_2_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((5)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_2_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_2_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((5)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_2_3,%r15 + xorq %r10, %r15 + + # Round 19 + leaq (%r9, %r14),%r14 + rolq $RC_512_3_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((5)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_3_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_3_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_3_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((5)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((5)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((5)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 5(%rsi,%r15),%r15 + + #Round 20 + leaq (%r9, %r8),%r8 + rolq $RC_512_4_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((6)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_4_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_4_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((6)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_4_3,%r15 + xorq %r14, %r15 + + # Round 21 + leaq (%r9, %r10),%r10 + rolq $RC_512_5_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((6)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_5_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_5_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((6)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_5_3,%r11 + xorq %r8, %r11 + + # Round 22 + leaq (%r9, %r12),%r12 + rolq $RC_512_6_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((6)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_6_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_6_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((6)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_6_3,%r15 + xorq %r10, %r15 + + # Round 23 + leaq (%r9, %r14),%r14 + rolq $RC_512_7_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((6)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_7_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_7_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_7_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((6)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((6)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq 
ksKey+8*(((6)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 6(%rsi,%r15),%r15 + + #Round 24 + leaq (%r9, %r8),%r8 + rolq $RC_512_0_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((7)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_0_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_0_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((7)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_0_3,%r15 + xorq %r14, %r15 + + # Round 25 + leaq (%r9, %r10),%r10 + rolq $RC_512_1_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((7)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_1_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_1_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((7)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_1_3,%r11 + xorq %r8, %r11 + + # Round 26 + leaq (%r9, %r12),%r12 + rolq $RC_512_2_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((7)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_2_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_2_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((7)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_2_3,%r15 + xorq %r10, %r15 + + # Round 27 + leaq (%r9, %r14),%r14 + rolq $RC_512_3_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((7)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_3_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_3_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_3_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((7)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((7)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((7)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 7(%rsi,%r15),%r15 + + #Round 28 + leaq (%r9, %r8),%r8 + rolq $RC_512_4_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((8)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_4_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_4_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((8)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_4_3,%r15 + xorq %r14, %r15 + + # Round 29 + leaq (%r9, %r10),%r10 + rolq $RC_512_5_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((8)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_5_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_5_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((8)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_5_3,%r11 + xorq %r8, %r11 + + # Round 30 + leaq (%r9, %r12),%r12 + rolq $RC_512_6_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((8)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_6_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_6_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((8)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_6_3,%r15 + xorq %r10, %r15 + + # Round 31 + leaq (%r9, %r14),%r14 + rolq $RC_512_7_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((8)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_7_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_7_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_7_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((8)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((8)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((8)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 8(%rsi,%r15),%r15 + + #Round 32 + leaq (%r9, %r8),%r8 + rolq $RC_512_0_0,%r9 + 
xorq %r8, %r9 + movq ksKey+8*(((9)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_0_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_0_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((9)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_0_3,%r15 + xorq %r14, %r15 + + # Round 33 + leaq (%r9, %r10),%r10 + rolq $RC_512_1_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((9)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_1_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_1_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((9)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_1_3,%r11 + xorq %r8, %r11 + + # Round 34 + leaq (%r9, %r12),%r12 + rolq $RC_512_2_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((9)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_2_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_2_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((9)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_2_3,%r15 + xorq %r10, %r15 + + # Round 35 + leaq (%r9, %r14),%r14 + rolq $RC_512_3_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((9)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_3_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_3_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_3_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((9)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((9)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((9)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 9(%rsi,%r15),%r15 + + #Round 36 + leaq (%r9, %r8),%r8 + rolq $RC_512_4_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((10)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_4_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_4_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((10)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_4_3,%r15 + xorq %r14, %r15 + + # Round 37 + leaq (%r9, %r10),%r10 + rolq $RC_512_5_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((10)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_5_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_5_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((10)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_5_3,%r11 + xorq %r8, %r11 + + # Round 38 + leaq (%r9, %r12),%r12 + rolq $RC_512_6_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((10)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_6_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_6_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((10)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_6_3,%r15 + xorq %r10, %r15 + + # Round 39 + leaq (%r9, %r14),%r14 + rolq $RC_512_7_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((10)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_7_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_7_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_7_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((10)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((10)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((10)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 10(%rsi,%r15),%r15 + + #Round 40 + leaq (%r9, %r8),%r8 + rolq $RC_512_0_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((11)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_0_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq 
$RC_512_0_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((11)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_0_3,%r15 + xorq %r14, %r15 + + # Round 41 + leaq (%r9, %r10),%r10 + rolq $RC_512_1_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((11)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_1_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_1_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((11)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_1_3,%r11 + xorq %r8, %r11 + + # Round 42 + leaq (%r9, %r12),%r12 + rolq $RC_512_2_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((11)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_2_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_2_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((11)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_2_3,%r15 + xorq %r10, %r15 + + # Round 43 + leaq (%r9, %r14),%r14 + rolq $RC_512_3_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((11)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_3_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_3_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_3_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((11)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((11)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((11)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 11(%rsi,%r15),%r15 + + #Round 44 + leaq (%r9, %r8),%r8 + rolq $RC_512_4_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((12)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_4_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_4_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((12)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_4_3,%r15 + xorq %r14, %r15 + + # Round 45 + leaq (%r9, %r10),%r10 + rolq $RC_512_5_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((12)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_5_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_5_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((12)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_5_3,%r11 + xorq %r8, %r11 + + # Round 46 + leaq (%r9, %r12),%r12 + rolq $RC_512_6_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((12)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_6_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_6_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((12)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_6_3,%r15 + xorq %r10, %r15 + + # Round 47 + leaq (%r9, %r14),%r14 + rolq $RC_512_7_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((12)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_7_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_7_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_7_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((12)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((12)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((12)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 12(%rsi,%r15),%r15 + + #Round 48 + leaq (%r9, %r8),%r8 + rolq $RC_512_0_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((13)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_0_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_0_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((13)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_0_3,%r15 + xorq %r14, %r15 + 
+ # Round 49 + leaq (%r9, %r10),%r10 + rolq $RC_512_1_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((13)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_1_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_1_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((13)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_1_3,%r11 + xorq %r8, %r11 + + # Round 50 + leaq (%r9, %r12),%r12 + rolq $RC_512_2_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((13)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_2_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_2_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((13)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_2_3,%r15 + xorq %r10, %r15 + + # Round 51 + leaq (%r9, %r14),%r14 + rolq $RC_512_3_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((13)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_3_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_3_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_3_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((13)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((13)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((13)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 13(%rsi,%r15),%r15 + + #Round 52 + leaq (%r9, %r8),%r8 + rolq $RC_512_4_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((14)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_4_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_4_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((14)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_4_3,%r15 + xorq %r14, %r15 + + # Round 53 + leaq (%r9, %r10),%r10 + rolq $RC_512_5_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((14)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_5_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_5_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((14)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_5_3,%r11 + xorq %r8, %r11 + + # Round 54 + leaq (%r9, %r12),%r12 + rolq $RC_512_6_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((14)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_6_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_6_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((14)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_6_3,%r15 + xorq %r10, %r15 + + # Round 55 + leaq (%r9, %r14),%r14 + rolq $RC_512_7_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((14)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_7_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_7_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_7_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((14)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((14)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((14)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 14(%rsi,%r15),%r15 + + #Round 56 + leaq (%r9, %r8),%r8 + rolq $RC_512_0_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((15)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_0_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_0_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((15)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_0_3,%r15 + xorq %r14, %r15 + + # Round 57 + leaq (%r9, %r10),%r10 + rolq $RC_512_1_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((15)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + 
rolq $RC_512_1_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_1_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((15)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_1_3,%r11 + xorq %r8, %r11 + + # Round 58 + leaq (%r9, %r12),%r12 + rolq $RC_512_2_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((15)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_2_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_2_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((15)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_2_3,%r15 + xorq %r10, %r15 + + # Round 59 + leaq (%r9, %r14),%r14 + rolq $RC_512_3_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((15)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_3_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_3_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_3_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((15)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((15)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((15)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 15(%rsi,%r15),%r15 + + #Round 60 + leaq (%r9, %r8),%r8 + rolq $RC_512_4_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((16)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_4_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_4_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((16)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_4_3,%r15 + xorq %r14, %r15 + + # Round 61 + leaq (%r9, %r10),%r10 + rolq $RC_512_5_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((16)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_5_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_5_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((16)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_5_3,%r11 + xorq %r8, %r11 + + # Round 62 + leaq (%r9, %r12),%r12 + rolq $RC_512_6_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((16)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_6_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_6_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((16)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_6_3,%r15 + xorq %r10, %r15 + + # Round 63 + leaq (%r9, %r14),%r14 + rolq $RC_512_7_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((16)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_7_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_7_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_7_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((16)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((16)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((16)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 16(%rsi,%r15),%r15 + + #Round 64 + leaq (%r9, %r8),%r8 + rolq $RC_512_0_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((17)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_0_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_0_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((17)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_0_3,%r15 + xorq %r14, %r15 + + # Round 65 + leaq (%r9, %r10),%r10 + rolq $RC_512_1_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((17)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_1_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_1_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((17)+6) % 
9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_1_3,%r11 + xorq %r8, %r11 + + # Round 66 + leaq (%r9, %r12),%r12 + rolq $RC_512_2_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((17)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_2_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_2_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((17)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_2_3,%r15 + xorq %r10, %r15 + + # Round 67 + leaq (%r9, %r14),%r14 + rolq $RC_512_3_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((17)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_3_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_3_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_3_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((17)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((17)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((17)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 17(%rsi,%r15),%r15 + + #Round 68 + leaq (%r9, %r8),%r8 + rolq $RC_512_4_0,%r9 + xorq %r8, %r9 + movq ksKey+8*(((18)+3) % 9)+F_O(%rbp),%rax + leaq (%r11, %r10),%r10 + rolq $RC_512_4_1,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_512_4_2,%r13 + xorq %r12, %r13 + movq ksKey+8*(((18)+4) % 9)+F_O(%rbp),%rbx + leaq (%r15, %r14),%r14 + rolq $RC_512_4_3,%r15 + xorq %r14, %r15 + + # Round 69 + leaq (%r9, %r10),%r10 + rolq $RC_512_5_0,%r9 + xorq %r10, %r9 + movq ksKey+8*(((18)+5) % 9)+F_O(%rbp),%rcx + leaq (%r15, %r12),%r12 + rolq $RC_512_5_1,%r15 + xorq %r12, %r15 + leaq (%r13, %r14),%r14 + rolq $RC_512_5_2,%r13 + xorq %r14, %r13 + movq ksKey+8*(((18)+6) % 9)+F_O(%rbp),%rdx + leaq (%r11, %r8),%r8 + rolq $RC_512_5_3,%r11 + xorq %r8, %r11 + + # Round 70 + leaq (%r9, %r12),%r12 + rolq $RC_512_6_0,%r9 + xorq %r12, %r9 + movq ksKey+8*(((18)+7) % 9)+F_O(%rbp),%rsi + leaq (%r11, %r14),%r14 + rolq $RC_512_6_1,%r11 + xorq %r14, %r11 + leaq (%r13, %r8),%r8 + rolq $RC_512_6_2,%r13 + xorq %r8, %r13 + addq ksTwk+8*(((18)+0) % 3)+F_O(%rbp),%rcx + leaq (%r15, %r10),%r10 + rolq $RC_512_6_3,%r15 + xorq %r10, %r15 + + # Round 71 + leaq (%r9, %r14),%r14 + rolq $RC_512_7_0,%r9 + xorq %r14, %r9 + addq ksTwk+8*(((18)+1)%3)+F_O(%rbp),%rdx + leaq (%r15, %r8),%r8 + rolq $RC_512_7_1,%r15 + xorq %r8, %r15 + leaq (%r13, %r10),%r10 + rolq $RC_512_7_2,%r13 + xorq %r10, %r13 + leaq (%r11, %r12),%r12 + rolq $RC_512_7_3,%r11 + xorq %r12, %r11 + + # inject the key schedule + addq ksKey+8*(((18)+0)%9)+F_O(%rbp),%r8 + leaq (%rax, %r11),%r11 + addq ksKey+8*(((18)+1)%9)+F_O(%rbp),%r9 + leaq (%rbx, %r12),%r12 + addq ksKey+8*(((18)+2)%9)+F_O(%rbp),%r10 + leaq (%rcx, %r13),%r13 + leaq (%rdx, %r14),%r14 + leaq 18(%rsi,%r15),%r15 # end of rounds ################# # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7} .irp _Rn_,8,9,10,11,12,13,14,15 - .if (_Rn_ == 8) + .if (\_Rn_ == 8) movq $FIRST_MASK64,%rbx .endif - xorq Wcopy+8*(_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR - movq %r\_Rn_,X_VARS+8*(_Rn_-8)(%rdi) #and store result - .if (_Rn_ == 14) + xorq Wcopy+8*(\_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR + movq %r\_Rn_,X_VARS+8*(\_Rn_-8)(%rdi) #and store result + .if (\_Rn_ == 14) andq TWEAK+ 8(%rdi),%rbx .endif .endr - Skein_Debug_Round 512,SKEIN_RND_FEED_FWD # go back for more blocks, if needed decq blkCnt+F_O(%rbp) @@ -911,18 +2636,6 @@ ret Skein_512_Process_Block_End: # - .if _SKEIN_DEBUG -# call here with rdx = "round number" -Skein_Debug_Round_512: - pushq %rsi #save two regs for BLK_BITS-specific parms - pushq 
%rdi - .irp _Rn_,8,9,10,11,12,13,14,15 #save X[] state on stack so debug routines can access it - movq %r\_Rn_,X_stk+8*(_Rn_-8)+F_O(%rbp) - .endr - movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr - movq $512,%rdi #now are set for the call - jmp Skein_Debug_Round_Common - .endif # .if _SKEIN_CODE_SIZE C_label Skein_512_Process_Block_CodeSize @@ -966,125 +2679,12 @@ # rIdx_offs = tmpStk_1024 # -.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1 - addReg \reg0 , \reg1 #perform the MIX - RotL64 \reg1 , 1024,%((_RN0_) % 8),_Rn1_ - xorReg \reg1 , \reg0 -.if ((_RN0_) && 3) == 3 #time to do key injection? - .if _SKEIN_DEBUG - movq %\reg0 , xDebug_1024+8*w0(%rsp) #save intermediate values for Debug_Round - movq %\reg1 , xDebug_1024+8*w1(%rsp) # (before inline key injection) - .endif -_II_ = ((_RN0_)/4)+1 #injection count - .if SKEIN_ASM_UNROLL && 1024 #here to do fully unrolled key injection - addq ksKey+ 8*((_II_+w0) % 17)(%rsp),%\reg0 - addq ksKey+ 8*((_II_+w1) % 17)(%rsp),%\reg1 - .if w1 == 13 #tweak injection - addq ksTwk+ 8*((_II_+ 0) % 3)(%rsp),%\reg1 - .elseif w0 == 14 - addq ksTwk+ 8*((_II_+ 1) % 3)(%rsp),%\reg0 - .elseif w1 == 15 - addq $_II_, %\reg1 #(injection counter) - .endif - .else #here to do looping key injection - .if (w0 == 0) - movq %rdi, X_stk+8*w0(%rsp) #if so, store N0 so we can use reg as index - movq rIdx_offs(%rsp),%rdi #get the injection counter index into rdi - .else - addq ksKey+8+8*w0(%rsp,%rdi,8),%\reg0 #even key injection - .endif - .if w1 == 13 #tweak injection - addq ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1 - .elseif w0 == 14 - addq ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0 - .elseif w1 == 15 - addReg \reg1,rdi,,,1 #(injection counter) - .endif - addq ksKey+8+8*w1(%rsp,%rdi,8),%\reg1 #odd key injection - .endif -.endif - # insert the op provided, .if any - op1 -.endm -################# -# MACRO: four rounds for 1024-bit blocks -# -.macro r1024_FourRounds _RR_ #RR = base round number (0 mod 4) - # should be here with X4 set properly, X6 stored on stack -_Rn_ = (_RR_) + 0 - r1024_Mix 0, 1,rdi,rsi,_Rn_,0 - r1024_Mix 2, 3,rbp,rax,_Rn_,1 - r1024_Mix 4, 5,rcx,rbx,_Rn_,2, #save X4 on stack (x4/x6 alternate) - r1024_Mix 8, 9,r8 ,r9 ,_Rn_,4, #load X6 from stack - r1024_Mix 10,11,r10,r11,_Rn_,5 - r1024_Mix 12,13,r12,r13,_Rn_,6 - r1024_Mix 6, 7,rcx,rdx,_Rn_,3 - r1024_Mix 14,15,r14,r15,_Rn_,7 - .if _SKEIN_DEBUG - Skein_Debug_Round 1024,%(_Rn_+1) - .endif -_Rn_ = (_RR_) + 1 - r1024_Mix 0, 9,rdi,r9 ,_Rn_,0 - r1024_Mix 2,13,rbp,r13,_Rn_,1 - r1024_Mix 6,11,rcx,r11,_Rn_,2, #save X6 on stack (x4/x6 alternate) - r1024_Mix 10, 7,r10,rdx,_Rn_,4, #load X4 from stack - r1024_Mix 12, 3,r12,rax,_Rn_,5 - r1024_Mix 14, 5,r14,rbx,_Rn_,6 - r1024_Mix 4,15,rcx,r15,_Rn_,3 - r1024_Mix 8, 1,r8 ,rsi,_Rn_,7 - .if _SKEIN_DEBUG - Skein_Debug_Round 1024,%(_Rn_+1) - .endif -_Rn_ = (_RR_) + 2 - r1024_Mix 0, 7,rdi,rdx,_Rn_,0 - r1024_Mix 2, 5,rbp,rbx,_Rn_,1 - r1024_Mix 4, 3,rcx,rax,_Rn_,2, #save X4 on stack (x4/x6 alternate) - r1024_Mix 12,15,r12,r15,_Rn_,4, #load X6 from stack - r1024_Mix 14,13,r14,r13,_Rn_,5 - r1024_Mix 8,11,r8 ,r11,_Rn_,6 - r1024_Mix 6, 1,rcx,rsi,_Rn_,3 - r1024_Mix 10, 9,r10,r9 ,_Rn_,7 - .if _SKEIN_DEBUG - Skein_Debug_Round 1024,%(_Rn_+1) - .endif -_Rn_ = (_RR_) + 3 - r1024_Mix 0,15,rdi,r15,_Rn_,0 - r1024_Mix 2,11,rbp,r11,_Rn_,1 - r1024_Mix 6,13,rcx,r13,_Rn_,2, #save X6 on stack (x4/x6 alternate) - r1024_Mix 14, 1,r14,rsi,_Rn_,4, #load X4 from stack - r1024_Mix 8, 5,r8 ,rbx,_Rn_,5 - r1024_Mix 10, 3,r10,rax,_Rn_,6 - r1024_Mix 4, 9,rcx,r9 ,_Rn_,3 - r1024_Mix 12, 7,r12,rdx,_Rn_,7 - .if _SKEIN_DEBUG - 
Skein_Debug_Round 1024,%(_Rn_+1) - .endif - - .if (SKEIN_ASM_UNROLL && 1024) == 0 #here with rdi == rIdx, X0 on stack - #"rotate" the key schedule on the stack -i8 = o1K_r8 -i0 = o1K_rdi - movq %r8 , X_stk+8*i8(%rsp) #free up a register (save it on the stack) - movq ksKey+8* 0(%rsp,%rdi,8),%r8 #get key word - movq %r8 , ksKey+8*17(%rsp,%rdi,8) #rotate key (must do key first or tweak clobbers it!) - movq ksTwk+8* 0(%rsp,%rdi,8),%r8 #get tweak word - movq %r8 , ksTwk+8* 3(%rsp,%rdi,8) #rotate tweak (onto the stack) - movq X_stk+8*i8(%rsp) ,%r8 #get the reg back - incq %rdi #bump the index - movq %rdi, rIdx_offs (%rsp) #save rdi again - movq ksKey+8*i0(%rsp,%rdi,8),%rdi #get the key schedule word for X0 back - addq X_stk+8*i0(%rsp) ,%rdi #perform the X0 key injection - .endif - #show the result of the key injection - Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT -.endm #r1024_FourRounds -# ################ # code # C_label Skein1024_Process_Block # - Setup_Stack 1024,ROUNDS_1024/8,WCNT + Setup_Stack 1024,((ROUNDS_1024/8)+1),WCNT movq TWEAK+ 8(%rdi),%r9 jmp Skein1024_block_loop # main hash loop for Skein1024 @@ -1096,10 +2696,6 @@ # R8 ..R15 = X8..X15 (state words) # RBP = temp (used for X0 and X2) # - .if (SKEIN_ASM_UNROLL & 1024) == 0 - xorq %rax,%rax #init loop index on the stack - movq %rax,rIdx_offs(%rsp) - .endif movq TWEAK+ 0(%rdi),%r8 addq bitAdd+ F_O(%rbp),%r8 #computed updated tweak value T0 movq %r9 ,%r10 @@ -1108,21 +2704,18 @@ movq %r8 ,ksTwk+ 0+F_O(%rbp) movq %r9 ,ksTwk+ 8+F_O(%rbp) #keep values in %r8 ,%r9 for initial tweak injection below movq %r10,ksTwk+16+F_O(%rbp) - .if _SKEIN_DEBUG - movq %r9 ,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block - .endif movq blkPtr +F_O(%rbp),%rsi # rsi --> input block movq $KW_PARITY ,%rax #overall key schedule parity # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3] .irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps - movq X_VARS+8*_rN_(%rdi),%r14 #get state word - movq 8*_rN_(%rsi),%r15 #get msg word + movq X_VARS+8*\_rN_(%rdi),%r14 #get state word + movq 8*\_rN_(%rsi),%r15 #get msg word xorq %r14,%rax #update key schedule overall parity - movq %r14,ksKey +8*_rN_+F_O(%rbp) #save key schedule word on stack - movq %r15,Wcopy +8*_rN_+F_O(%rbp) #save local msg Wcopy + movq %r14,ksKey +8*\_rN_+F_O(%rbp) #save key schedule word on stack + movq %r15,Wcopy +8*\_rN_+F_O(%rbp) #save local msg Wcopy addq %r15,%r14 #do the initial key injection - movq %r14,X_stk +8*_rN_ (%rsp) #save initial state var on stack + movq %r14,X_stk +8*\_rN_ (%rsp) #save initial state var on stack .endr # now process the rest, using the "real" registers # (MUST do it in reverse order to inject tweaks r8/r9 first) @@ -1135,15 +2728,12 @@ movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward addq %rcx,%\_rr_ #do the initial key injection .if _oo_ == 13 #do the initial tweak injection - addReg _rr_,r8 # (only in words 13/14) + addReg \_rr_,r8 # (only in words 13/14) .elseif _oo_ == 14 - addReg _rr_,r9 + addReg \_rr_,r9 .endif .endr movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity -.if _SKEIN_DEBUG - Skein_Debug_Block 1024 #initial debug dump -.endif addq $8*WCNT,%rsi #bump the msg ptr movq %rsi,blkPtr+F_O(%rbp) #save bumped msg ptr # re-load words 0..4 from stack, enter the main loop @@ -1150,33 +2740,2633 @@ .irp _rr_,rdi,rsi,rbp,rax,rcx #(no need to re-load x6, already on stack) movq X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go! 
.endr -.if _SKEIN_DEBUG - Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection -.endif # ################# # now the key schedule is computed. Start the rounds # -.if SKEIN_ASM_UNROLL & 1024 _UNROLL_CNT = ROUNDS_1024/8 -.else -_UNROLL_CNT = SKEIN_UNROLL_1024 - .if ((ROUNDS_1024/8) % _UNROLL_CNT) - .error "Invalid SKEIN_UNROLL_1024" - .endif -Skein1024_round_loop: -.endif -# -_Rbase_ = 0 -.rept _UNROLL_CNT*2 #implement the rounds, 4 at a time - r1024_FourRounds %(4*_Rbase_+00) -_Rbase_ = _Rbase_+1 -.endr #rept _UNROLL_CNT -# -.if (SKEIN_ASM_UNROLL & 1024) == 0 - cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see .if we are done - jb Skein1024_round_loop -.endif + + + # round 0 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_0_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_0_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_0_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_0_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_0_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_0_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_0_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_0_7,%r15 + xorq %r14, %r15 + + # round 1 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_1_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_1_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_1_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_1_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_1_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_1_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_1_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_1_7,%rsi + xorq %r8, %rsi + + # round 2 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_2_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_2_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_2_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_2_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_2_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_2_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_2_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_2_7,%r9 + xorq %r10, %r9 + + # round 3 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_3_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((1+0) % 17)(%rsp),%rdi + addq ksKey+8*((1+15) % 17)(%rsp),%r15 + addq $1,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_3_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((1+2) % 17)(%rsp),%rbp + addq ksKey+8*((1+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_3_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((1+6) % 17)(%rsp),%rcx + addq ksKey+8*((1+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((1+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_3_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((1+14) % 17)(%rsp),%r14 + addq ksKey+8*((1+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((1+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_3_5,%rbx + xorq %r8, 
%rbx + addq ksKey+8*((1+8) % 17)(%rsp),%r8 + addq ksKey+8*((1+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_3_6,%rax + xorq %r10, %rax + addq ksKey+8*((1+10) % 17)(%rsp),%r10 + addq ksKey+8*((1+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_3_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((1+4) % 17)(%rsp),%rcx + addq ksKey+8*((1+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_3_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((1+12) % 17)(%rsp),%r12 + addq ksKey+8*((1+7) % 17)(%rsp),%rdx + + # round 4 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_4_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_4_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_4_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_4_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_4_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_4_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_4_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_4_7,%r15 + xorq %r14, %r15 + + # round 5 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_5_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_5_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_5_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_5_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_5_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_5_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_5_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_5_7,%rsi + xorq %r8, %rsi + + # round 6 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_6_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_6_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_6_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_6_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_6_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_6_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_6_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_6_7,%r9 + xorq %r10, %r9 + + # round 7 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_7_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((2+0) % 17)(%rsp),%rdi + addq ksKey+8*((2+15) % 17)(%rsp),%r15 + addq $2,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_7_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((2+2) % 17)(%rsp),%rbp + addq ksKey+8*((2+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_7_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((2+6) % 17)(%rsp),%rcx + addq ksKey+8*((2+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((2+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_7_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((2+14) % 17)(%rsp),%r14 + addq ksKey+8*((2+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((2+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_7_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((2+8) % 17)(%rsp),%r8 + addq ksKey+8*((2+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_7_6,%rax + xorq %r10, %rax + addq ksKey+8*((2+10) % 
17)(%rsp),%r10 + addq ksKey+8*((2+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_7_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((2+4) % 17)(%rsp),%rcx + addq ksKey+8*((2+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_7_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((2+12) % 17)(%rsp),%r12 + addq ksKey+8*((2+7) % 17)(%rsp),%rdx + + # round 8 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_0_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_0_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_0_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_0_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_0_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_0_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_0_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_0_7,%r15 + xorq %r14, %r15 + + # round 9 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_1_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_1_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_1_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_1_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_1_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_1_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_1_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_1_7,%rsi + xorq %r8, %rsi + + # round 10 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_2_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_2_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_2_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_2_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_2_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_2_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_2_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_2_7,%r9 + xorq %r10, %r9 + + # round 11 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_3_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((3+0) % 17)(%rsp),%rdi + addq ksKey+8*((3+15) % 17)(%rsp),%r15 + addq $3,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_3_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((3+2) % 17)(%rsp),%rbp + addq ksKey+8*((3+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_3_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((3+6) % 17)(%rsp),%rcx + addq ksKey+8*((3+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((3+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_3_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((3+14) % 17)(%rsp),%r14 + addq ksKey+8*((3+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((3+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_3_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((3+8) % 17)(%rsp),%r8 + addq ksKey+8*((3+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_3_6,%rax + xorq %r10, %rax + addq ksKey+8*((3+10) % 17)(%rsp),%r10 + addq ksKey+8*((3+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_3_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((3+4) % 17)(%rsp),%rcx + addq 
ksKey+8*((3+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_3_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((3+12) % 17)(%rsp),%r12 + addq ksKey+8*((3+7) % 17)(%rsp),%rdx + + # round 12 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_4_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_4_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_4_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_4_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_4_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_4_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_4_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_4_7,%r15 + xorq %r14, %r15 + + # round 13 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_5_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_5_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_5_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_5_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_5_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_5_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_5_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_5_7,%rsi + xorq %r8, %rsi + + # round 14 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_6_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_6_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_6_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_6_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_6_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_6_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_6_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_6_7,%r9 + xorq %r10, %r9 + + # round 15 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_7_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((4+0) % 17)(%rsp),%rdi + addq ksKey+8*((4+15) % 17)(%rsp),%r15 + addq $4,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_7_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((4+2) % 17)(%rsp),%rbp + addq ksKey+8*((4+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_7_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((4+6) % 17)(%rsp),%rcx + addq ksKey+8*((4+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((4+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_7_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((4+14) % 17)(%rsp),%r14 + addq ksKey+8*((4+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((4+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_7_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((4+8) % 17)(%rsp),%r8 + addq ksKey+8*((4+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_7_6,%rax + xorq %r10, %rax + addq ksKey+8*((4+10) % 17)(%rsp),%r10 + addq ksKey+8*((4+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_7_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((4+4) % 17)(%rsp),%rcx + addq ksKey+8*((4+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_7_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((4+12) % 17)(%rsp),%r12 + addq ksKey+8*((4+7) % 
17)(%rsp),%rdx + + # round 16 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_0_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_0_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_0_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_0_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_0_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_0_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_0_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_0_7,%r15 + xorq %r14, %r15 + + # round 17 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_1_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_1_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_1_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_1_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_1_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_1_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_1_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_1_7,%rsi + xorq %r8, %rsi + + # round 18 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_2_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_2_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_2_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_2_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_2_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_2_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_2_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_2_7,%r9 + xorq %r10, %r9 + + # round 19 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_3_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((5+0) % 17)(%rsp),%rdi + addq ksKey+8*((5+15) % 17)(%rsp),%r15 + addq $5,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_3_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((5+2) % 17)(%rsp),%rbp + addq ksKey+8*((5+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_3_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((5+6) % 17)(%rsp),%rcx + addq ksKey+8*((5+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((5+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_3_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((5+14) % 17)(%rsp),%r14 + addq ksKey+8*((5+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((5+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_3_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((5+8) % 17)(%rsp),%r8 + addq ksKey+8*((5+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_3_6,%rax + xorq %r10, %rax + addq ksKey+8*((5+10) % 17)(%rsp),%r10 + addq ksKey+8*((5+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_3_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((5+4) % 17)(%rsp),%rcx + addq ksKey+8*((5+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_3_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((5+12) % 17)(%rsp),%r12 + addq ksKey+8*((5+7) % 17)(%rsp),%rdx + + # round 20 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_4_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_4_1,%rax + xorq %rbp, %rax + leaq 
(%rbx, %rcx),%rcx + rolq $RC_1024_4_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_4_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_4_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_4_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_4_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_4_7,%r15 + xorq %r14, %r15 + + # round 21 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_5_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_5_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_5_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_5_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_5_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_5_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_5_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_5_7,%rsi + xorq %r8, %rsi + + # round 22 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_6_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_6_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_6_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_6_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_6_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_6_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_6_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_6_7,%r9 + xorq %r10, %r9 + + # round 23 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_7_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((6+0) % 17)(%rsp),%rdi + addq ksKey+8*((6+15) % 17)(%rsp),%r15 + addq $6,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_7_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((6+2) % 17)(%rsp),%rbp + addq ksKey+8*((6+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_7_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((6+6) % 17)(%rsp),%rcx + addq ksKey+8*((6+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((6+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_7_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((6+14) % 17)(%rsp),%r14 + addq ksKey+8*((6+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((6+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_7_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((6+8) % 17)(%rsp),%r8 + addq ksKey+8*((6+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_7_6,%rax + xorq %r10, %rax + addq ksKey+8*((6+10) % 17)(%rsp),%r10 + addq ksKey+8*((6+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_7_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((6+4) % 17)(%rsp),%rcx + addq ksKey+8*((6+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_7_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((6+12) % 17)(%rsp),%r12 + addq ksKey+8*((6+7) % 17)(%rsp),%rdx + + # round 24 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_0_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_0_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_0_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_0_4,%r9 + 
xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_0_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_0_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_0_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_0_7,%r15 + xorq %r14, %r15 + + # round 25 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_1_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_1_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_1_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_1_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_1_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_1_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_1_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_1_7,%rsi + xorq %r8, %rsi + + # round 26 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_2_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_2_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_2_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_2_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_2_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_2_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_2_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_2_7,%r9 + xorq %r10, %r9 + + # round 27 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_3_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((7+0) % 17)(%rsp),%rdi + addq ksKey+8*((7+15) % 17)(%rsp),%r15 + addq $7,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_3_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((7+2) % 17)(%rsp),%rbp + addq ksKey+8*((7+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_3_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((7+6) % 17)(%rsp),%rcx + addq ksKey+8*((7+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((7+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_3_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((7+14) % 17)(%rsp),%r14 + addq ksKey+8*((7+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((7+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_3_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((7+8) % 17)(%rsp),%r8 + addq ksKey+8*((7+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_3_6,%rax + xorq %r10, %rax + addq ksKey+8*((7+10) % 17)(%rsp),%r10 + addq ksKey+8*((7+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_3_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((7+4) % 17)(%rsp),%rcx + addq ksKey+8*((7+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_3_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((7+12) % 17)(%rsp),%r12 + addq ksKey+8*((7+7) % 17)(%rsp),%rdx + + # round 28 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_4_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_4_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_4_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_4_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_4_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq 
$RC_1024_4_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_4_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_4_7,%r15 + xorq %r14, %r15 + + # round 29 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_5_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_5_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_5_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_5_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_5_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_5_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_5_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_5_7,%rsi + xorq %r8, %rsi + + # round 30 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_6_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_6_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_6_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_6_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_6_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_6_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_6_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_6_7,%r9 + xorq %r10, %r9 + + # round 31 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_7_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((8+0) % 17)(%rsp),%rdi + addq ksKey+8*((8+15) % 17)(%rsp),%r15 + addq $8,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_7_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((8+2) % 17)(%rsp),%rbp + addq ksKey+8*((8+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_7_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((8+6) % 17)(%rsp),%rcx + addq ksKey+8*((8+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((8+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_7_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((8+14) % 17)(%rsp),%r14 + addq ksKey+8*((8+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((8+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_7_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((8+8) % 17)(%rsp),%r8 + addq ksKey+8*((8+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_7_6,%rax + xorq %r10, %rax + addq ksKey+8*((8+10) % 17)(%rsp),%r10 + addq ksKey+8*((8+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_7_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((8+4) % 17)(%rsp),%rcx + addq ksKey+8*((8+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_7_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((8+12) % 17)(%rsp),%r12 + addq ksKey+8*((8+7) % 17)(%rsp),%rdx + + # round 32 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_0_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_0_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_0_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_0_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_0_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_0_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_0_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_0_7,%r15 + xorq %r14, %r15 + + 
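#
#----------------------------------------------------------------
# For reference while reading the unrolled code: each "round" block is
# one Threefish-1024 MIX pass over the eight word pairs,
#	x_even += x_odd ; x_odd = ROTL64(x_odd,RC) ; x_odd ^= x_even
# with the fixed word permutation expressed by the register pairing of
# the next round, and a key-schedule word (mod 17), a tweak word
# (mod 3, into words 13/14) and the subkey counter (into word 15)
# injected after every fourth round.  A minimal sketch of one MIX on
# word pair 0 (X0 in %rdi, X1 in %rsi), reusing the RC_1024_0_0
# rotation constant defined earlier in this file -- illustrative
# only, not part of the patch itself:
#
#	leaq	(%rsi,%rdi),%rdi	# X0 += X1
#	rolq	$RC_1024_0_0,%rsi	# X1  = ROTL64(X1,RC_1024_0_0)
#	xorq	%rdi,%rsi		# X1 ^= X0
#----------------------------------------------------------------
#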
# round 33 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_1_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_1_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_1_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_1_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_1_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_1_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_1_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_1_7,%rsi + xorq %r8, %rsi + + # round 34 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_2_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_2_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_2_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_2_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_2_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_2_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_2_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_2_7,%r9 + xorq %r10, %r9 + + # round 35 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_3_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((9+0) % 17)(%rsp),%rdi + addq ksKey+8*((9+15) % 17)(%rsp),%r15 + addq $9,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_3_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((9+2) % 17)(%rsp),%rbp + addq ksKey+8*((9+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_3_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((9+6) % 17)(%rsp),%rcx + addq ksKey+8*((9+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((9+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_3_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((9+14) % 17)(%rsp),%r14 + addq ksKey+8*((9+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((9+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_3_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((9+8) % 17)(%rsp),%r8 + addq ksKey+8*((9+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_3_6,%rax + xorq %r10, %rax + addq ksKey+8*((9+10) % 17)(%rsp),%r10 + addq ksKey+8*((9+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_3_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((9+4) % 17)(%rsp),%rcx + addq ksKey+8*((9+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_3_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((9+12) % 17)(%rsp),%r12 + addq ksKey+8*((9+7) % 17)(%rsp),%rdx + + # round 36 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_4_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_4_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_4_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_4_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_4_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_4_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_4_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_4_7,%r15 + xorq %r14, %r15 + + # round 37 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_5_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_5_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq 
$RC_1024_5_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_5_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_5_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_5_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_5_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_5_7,%rsi + xorq %r8, %rsi + + # round 38 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_6_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_6_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_6_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_6_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_6_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_6_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_6_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_6_7,%r9 + xorq %r10, %r9 + + # round 39 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_7_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((10+0) % 17)(%rsp),%rdi + addq ksKey+8*((10+15) % 17)(%rsp),%r15 + addq $10,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_7_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((10+2) % 17)(%rsp),%rbp + addq ksKey+8*((10+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_7_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((10+6) % 17)(%rsp),%rcx + addq ksKey+8*((10+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((10+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_7_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((10+14) % 17)(%rsp),%r14 + addq ksKey+8*((10+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((10+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_7_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((10+8) % 17)(%rsp),%r8 + addq ksKey+8*((10+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_7_6,%rax + xorq %r10, %rax + addq ksKey+8*((10+10) % 17)(%rsp),%r10 + addq ksKey+8*((10+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_7_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((10+4) % 17)(%rsp),%rcx + addq ksKey+8*((10+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_7_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((10+12) % 17)(%rsp),%r12 + addq ksKey+8*((10+7) % 17)(%rsp),%rdx + + # round 40 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_0_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_0_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_0_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_0_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_0_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_0_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_0_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_0_7,%r15 + xorq %r14, %r15 + + # round 41 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_1_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_1_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_1_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_1_4,%rdx + xorq 
%r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_1_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_1_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_1_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_1_7,%rsi + xorq %r8, %rsi + + # round 42 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_2_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_2_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_2_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_2_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_2_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_2_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_2_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_2_7,%r9 + xorq %r10, %r9 + + # round 43 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_3_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((11+0) % 17)(%rsp),%rdi + addq ksKey+8*((11+15) % 17)(%rsp),%r15 + addq $11,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_3_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((11+2) % 17)(%rsp),%rbp + addq ksKey+8*((11+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_3_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((11+6) % 17)(%rsp),%rcx + addq ksKey+8*((11+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((11+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_3_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((11+14) % 17)(%rsp),%r14 + addq ksKey+8*((11+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((11+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_3_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((11+8) % 17)(%rsp),%r8 + addq ksKey+8*((11+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_3_6,%rax + xorq %r10, %rax + addq ksKey+8*((11+10) % 17)(%rsp),%r10 + addq ksKey+8*((11+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_3_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((11+4) % 17)(%rsp),%rcx + addq ksKey+8*((11+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_3_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((11+12) % 17)(%rsp),%r12 + addq ksKey+8*((11+7) % 17)(%rsp),%rdx + + # round 44 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_4_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_4_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_4_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_4_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_4_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_4_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_4_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_4_7,%r15 + xorq %r14, %r15 + + # round 45 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_5_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_5_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_5_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_5_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_5_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + 
rolq $RC_1024_5_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_5_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_5_7,%rsi + xorq %r8, %rsi + + # round 46 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_6_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_6_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_6_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_6_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_6_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_6_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_6_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_6_7,%r9 + xorq %r10, %r9 + + # round 47 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_7_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((12+0) % 17)(%rsp),%rdi + addq ksKey+8*((12+15) % 17)(%rsp),%r15 + addq $12,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_7_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((12+2) % 17)(%rsp),%rbp + addq ksKey+8*((12+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_7_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((12+6) % 17)(%rsp),%rcx + addq ksKey+8*((12+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((12+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_7_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((12+14) % 17)(%rsp),%r14 + addq ksKey+8*((12+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((12+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_7_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((12+8) % 17)(%rsp),%r8 + addq ksKey+8*((12+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_7_6,%rax + xorq %r10, %rax + addq ksKey+8*((12+10) % 17)(%rsp),%r10 + addq ksKey+8*((12+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_7_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((12+4) % 17)(%rsp),%rcx + addq ksKey+8*((12+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_7_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((12+12) % 17)(%rsp),%r12 + addq ksKey+8*((12+7) % 17)(%rsp),%rdx + + # round 48 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_0_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_0_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_0_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_0_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_0_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_0_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_0_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_0_7,%r15 + xorq %r14, %r15 + + # round 49 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_1_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_1_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_1_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_1_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_1_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_1_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_1_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_1_7,%rsi 
+ xorq %r8, %rsi + + # round 50 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_2_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_2_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_2_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_2_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_2_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_2_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_2_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_2_7,%r9 + xorq %r10, %r9 + + # round 51 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_3_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((13+0) % 17)(%rsp),%rdi + addq ksKey+8*((13+15) % 17)(%rsp),%r15 + addq $13,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_3_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((13+2) % 17)(%rsp),%rbp + addq ksKey+8*((13+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_3_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((13+6) % 17)(%rsp),%rcx + addq ksKey+8*((13+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((13+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_3_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((13+14) % 17)(%rsp),%r14 + addq ksKey+8*((13+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((13+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_3_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((13+8) % 17)(%rsp),%r8 + addq ksKey+8*((13+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_3_6,%rax + xorq %r10, %rax + addq ksKey+8*((13+10) % 17)(%rsp),%r10 + addq ksKey+8*((13+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_3_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((13+4) % 17)(%rsp),%rcx + addq ksKey+8*((13+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_3_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((13+12) % 17)(%rsp),%r12 + addq ksKey+8*((13+7) % 17)(%rsp),%rdx + + # round 52 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_4_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_4_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_4_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_4_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_4_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_4_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_4_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_4_7,%r15 + xorq %r14, %r15 + + # round 53 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_5_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_5_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_5_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_5_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_5_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_5_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_5_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_5_7,%rsi + xorq %r8, %rsi + + # round 54 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_6_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_6_1,%rbx + xorq 
%rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_6_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_6_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_6_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_6_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_6_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_6_7,%r9 + xorq %r10, %r9 + + # round 55 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_7_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((14+0) % 17)(%rsp),%rdi + addq ksKey+8*((14+15) % 17)(%rsp),%r15 + addq $14,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_7_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((14+2) % 17)(%rsp),%rbp + addq ksKey+8*((14+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_7_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((14+6) % 17)(%rsp),%rcx + addq ksKey+8*((14+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((14+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_7_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((14+14) % 17)(%rsp),%r14 + addq ksKey+8*((14+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((14+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_7_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((14+8) % 17)(%rsp),%r8 + addq ksKey+8*((14+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_7_6,%rax + xorq %r10, %rax + addq ksKey+8*((14+10) % 17)(%rsp),%r10 + addq ksKey+8*((14+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_7_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((14+4) % 17)(%rsp),%rcx + addq ksKey+8*((14+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_7_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((14+12) % 17)(%rsp),%r12 + addq ksKey+8*((14+7) % 17)(%rsp),%rdx + + # round 56 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_0_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_0_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_0_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_0_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_0_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_0_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_0_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_0_7,%r15 + xorq %r14, %r15 + + # round 57 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_1_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_1_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_1_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_1_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_1_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_1_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_1_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_1_7,%rsi + xorq %r8, %rsi + + # round 58 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_2_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_2_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_2_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, 
%r12),%r12 + rolq $RC_1024_2_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_2_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_2_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_2_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_2_7,%r9 + xorq %r10, %r9 + + # round 59 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_3_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((15+0) % 17)(%rsp),%rdi + addq ksKey+8*((15+15) % 17)(%rsp),%r15 + addq $15,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_3_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((15+2) % 17)(%rsp),%rbp + addq ksKey+8*((15+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_3_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((15+6) % 17)(%rsp),%rcx + addq ksKey+8*((15+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((15+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_3_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((15+14) % 17)(%rsp),%r14 + addq ksKey+8*((15+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((15+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_3_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((15+8) % 17)(%rsp),%r8 + addq ksKey+8*((15+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_3_6,%rax + xorq %r10, %rax + addq ksKey+8*((15+10) % 17)(%rsp),%r10 + addq ksKey+8*((15+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_3_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((15+4) % 17)(%rsp),%rcx + addq ksKey+8*((15+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_3_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((15+12) % 17)(%rsp),%r12 + addq ksKey+8*((15+7) % 17)(%rsp),%rdx + + # round 60 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_4_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_4_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_4_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_4_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_4_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_4_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_4_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_4_7,%r15 + xorq %r14, %r15 + + # round 61 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_5_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_5_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_5_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_5_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_5_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_5_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_5_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_5_7,%rsi + xorq %r8, %rsi + + # round 62 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_6_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_6_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_6_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_6_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_6_5,%r13 + 
xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_6_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_6_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_6_7,%r9 + xorq %r10, %r9 + + # round 63 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_7_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((16+0) % 17)(%rsp),%rdi + addq ksKey+8*((16+15) % 17)(%rsp),%r15 + addq $16,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_7_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((16+2) % 17)(%rsp),%rbp + addq ksKey+8*((16+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_7_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((16+6) % 17)(%rsp),%rcx + addq ksKey+8*((16+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((16+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_7_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((16+14) % 17)(%rsp),%r14 + addq ksKey+8*((16+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((16+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_7_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((16+8) % 17)(%rsp),%r8 + addq ksKey+8*((16+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_7_6,%rax + xorq %r10, %rax + addq ksKey+8*((16+10) % 17)(%rsp),%r10 + addq ksKey+8*((16+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_7_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((16+4) % 17)(%rsp),%rcx + addq ksKey+8*((16+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_7_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((16+12) % 17)(%rsp),%r12 + addq ksKey+8*((16+7) % 17)(%rsp),%rdx + + # round 64 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_0_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_0_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_0_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_0_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_0_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_0_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_0_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_0_7,%r15 + xorq %r14, %r15 + + # round 65 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_1_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_1_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_1_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_1_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_1_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_1_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_1_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_1_7,%rsi + xorq %r8, %rsi + + # round 66 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_2_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_2_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_2_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_2_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_2_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_2_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_2_3,%rsi + xorq %rcx, %rsi + leaq 
(%r9, %r10),%r10 + rolq $RC_1024_2_7,%r9 + xorq %r10, %r9 + + # round 67 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_3_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((17+0) % 17)(%rsp),%rdi + addq ksKey+8*((17+15) % 17)(%rsp),%r15 + addq $17,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_3_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((17+2) % 17)(%rsp),%rbp + addq ksKey+8*((17+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_3_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((17+6) % 17)(%rsp),%rcx + addq ksKey+8*((17+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((17+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_3_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((17+14) % 17)(%rsp),%r14 + addq ksKey+8*((17+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((17+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_3_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((17+8) % 17)(%rsp),%r8 + addq ksKey+8*((17+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_3_6,%rax + xorq %r10, %rax + addq ksKey+8*((17+10) % 17)(%rsp),%r10 + addq ksKey+8*((17+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_3_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((17+4) % 17)(%rsp),%rcx + addq ksKey+8*((17+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_3_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((17+12) % 17)(%rsp),%r12 + addq ksKey+8*((17+7) % 17)(%rsp),%rdx + + # round 68 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_4_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_4_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_4_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_4_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_4_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_4_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_4_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_4_7,%r15 + xorq %r14, %r15 + + # round 69 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_5_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_5_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_5_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_5_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_5_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_5_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_5_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_5_7,%rsi + xorq %r8, %rsi + + # round 70 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_6_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_6_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_6_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_6_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_6_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_6_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_6_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_6_7,%r9 + xorq %r10, %r9 + + # round 71 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_7_0,%r15 + xorq %rdi, %r15 + addq 
ksKey+8*((18+0) % 17)(%rsp),%rdi + addq ksKey+8*((18+15) % 17)(%rsp),%r15 + addq $18,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_7_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((18+2) % 17)(%rsp),%rbp + addq ksKey+8*((18+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_7_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((18+6) % 17)(%rsp),%rcx + addq ksKey+8*((18+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((18+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_7_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((18+14) % 17)(%rsp),%r14 + addq ksKey+8*((18+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((18+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_7_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((18+8) % 17)(%rsp),%r8 + addq ksKey+8*((18+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_7_6,%rax + xorq %r10, %rax + addq ksKey+8*((18+10) % 17)(%rsp),%r10 + addq ksKey+8*((18+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_7_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((18+4) % 17)(%rsp),%rcx + addq ksKey+8*((18+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_7_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((18+12) % 17)(%rsp),%r12 + addq ksKey+8*((18+7) % 17)(%rsp),%rdx + + # round 72 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_0_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_0_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_0_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_0_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_0_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_0_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_0_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_0_7,%r15 + xorq %r14, %r15 + + # round 73 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_1_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_1_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_1_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_1_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_1_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_1_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_1_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_1_7,%rsi + xorq %r8, %rsi + + # round 74 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_2_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_2_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_2_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_2_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_2_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_2_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_2_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_2_7,%r9 + xorq %r10, %r9 + + # round 75 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_3_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((19+0) % 17)(%rsp),%rdi + addq ksKey+8*((19+15) % 17)(%rsp),%r15 + addq $19,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_3_1,%r11 + xorq %rbp, %r11 
+ addq ksKey+8*((19+2) % 17)(%rsp),%rbp + addq ksKey+8*((19+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_3_2,%r13 + xorq %rcx, %r13 + addq ksKey+8*((19+6) % 17)(%rsp),%rcx + addq ksKey+8*((19+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((19+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_3_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((19+14) % 17)(%rsp),%r14 + addq ksKey+8*((19+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((19+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_3_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((19+8) % 17)(%rsp),%r8 + addq ksKey+8*((19+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_3_6,%rax + xorq %r10, %rax + addq ksKey+8*((19+10) % 17)(%rsp),%r10 + addq ksKey+8*((19+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_3_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((19+4) % 17)(%rsp),%rcx + addq ksKey+8*((19+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_3_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((19+12) % 17)(%rsp),%r12 + addq ksKey+8*((19+7) % 17)(%rsp),%rdx + + # round 76 + leaq (%rsi, %rdi),%rdi + rolq $RC_1024_4_0,%rsi + xorq %rdi, %rsi + leaq (%rax, %rbp),%rbp + rolq $RC_1024_4_1,%rax + xorq %rbp, %rax + leaq (%rbx, %rcx),%rcx + rolq $RC_1024_4_2,%rbx + xorq %rcx, %rbx + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r9, %r8),%r8 + rolq $RC_1024_4_4,%r9 + xorq %r8, %r9 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r11, %r10),%r10 + rolq $RC_1024_4_5,%r11 + xorq %r10, %r11 + leaq (%r13, %r12),%r12 + rolq $RC_1024_4_6,%r13 + xorq %r12, %r13 + leaq (%rdx, %rcx),%rcx + rolq $RC_1024_4_3,%rdx + xorq %rcx, %rdx + leaq (%r15, %r14),%r14 + rolq $RC_1024_4_7,%r15 + xorq %r14, %r15 + + # round 77 + leaq (%r9, %rdi),%rdi + rolq $RC_1024_5_0,%r9 + xorq %rdi, %r9 + leaq (%r13, %rbp),%rbp + rolq $RC_1024_5_1,%r13 + xorq %rbp, %r13 + leaq (%r11, %rcx),%rcx + rolq $RC_1024_5_2,%r11 + xorq %rcx, %r11 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rdx, %r10),%r10 + rolq $RC_1024_5_4,%rdx + xorq %r10, %rdx + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rax, %r12),%r12 + rolq $RC_1024_5_5,%rax + xorq %r12, %rax + leaq (%rbx, %r14),%r14 + rolq $RC_1024_5_6,%rbx + xorq %r14, %rbx + leaq (%r15, %rcx),%rcx + rolq $RC_1024_5_3,%r15 + xorq %rcx, %r15 + leaq (%rsi, %r8),%r8 + rolq $RC_1024_5_7,%rsi + xorq %r8, %rsi + + # round 78 + leaq (%rdx, %rdi),%rdi + rolq $RC_1024_6_0,%rdx + xorq %rdi, %rdx + leaq (%rbx, %rbp),%rbp + rolq $RC_1024_6_1,%rbx + xorq %rbp, %rbx + leaq (%rax, %rcx),%rcx + rolq $RC_1024_6_2,%rax + xorq %rcx, %rax + movq %rcx,X_stk+8*4(%rsp) #save X4 on stack (x4/x6 alternate) + leaq (%r15, %r12),%r12 + rolq $RC_1024_6_4,%r15 + xorq %r12, %r15 + movq X_stk+8*6(%rsp),%rcx #load X6 from stack + leaq (%r13, %r14),%r14 + rolq $RC_1024_6_5,%r13 + xorq %r14, %r13 + leaq (%r11, %r8),%r8 + rolq $RC_1024_6_6,%r11 + xorq %r8, %r11 + leaq (%rsi, %rcx),%rcx + rolq $RC_1024_6_3,%rsi + xorq %rcx, %rsi + leaq (%r9, %r10),%r10 + rolq $RC_1024_6_7,%r9 + xorq %r10, %r9 + + # round 79 + leaq (%r15, %rdi),%rdi + rolq $RC_1024_7_0,%r15 + xorq %rdi, %r15 + addq ksKey+8*((20+0) % 17)(%rsp),%rdi + addq ksKey+8*((20+15) % 17)(%rsp),%r15 + addq $20,%r15 + leaq (%r11, %rbp),%rbp + rolq $RC_1024_7_1,%r11 + xorq %rbp, %r11 + addq ksKey+8*((20+2) % 17)(%rsp),%rbp + addq ksKey+8*((20+11) % 17)(%rsp),%r11 + leaq (%r13, %rcx),%rcx + rolq $RC_1024_7_2,%r13 + xorq %rcx, %r13 + addq 
ksKey+8*((20+6) % 17)(%rsp),%rcx + addq ksKey+8*((20+13) % 17)(%rsp),%r13 + addq ksTwk+ 8*((20+0) %3)(%rsp),%r13 + movq %rcx,X_stk+8*6(%rsp) #save X6 on stack (x4/x6 alternate) + leaq (%rsi, %r14),%r14 + rolq $RC_1024_7_4,%rsi + xorq %r14, %rsi + addq ksKey+8*((20+14) % 17)(%rsp),%r14 + addq ksKey+8*((20+1) % 17)(%rsp),%rsi + addq ksTwk+ 8*((20+1)%3)(%rsp),%r14 + movq X_stk+8*4(%rsp),%rcx #load X4 from stack + leaq (%rbx, %r8),%r8 + rolq $RC_1024_7_5,%rbx + xorq %r8, %rbx + addq ksKey+8*((20+8) % 17)(%rsp),%r8 + addq ksKey+8*((20+5) % 17)(%rsp),%rbx + leaq (%rax, %r10),%r10 + rolq $RC_1024_7_6,%rax + xorq %r10, %rax + addq ksKey+8*((20+10) % 17)(%rsp),%r10 + addq ksKey+8*((20+3) % 17)(%rsp),%rax + leaq (%r9, %rcx),%rcx + rolq $RC_1024_7_3,%r9 + xorq %rcx, %r9 + addq ksKey+8*((20+4) % 17)(%rsp),%rcx + addq ksKey+8*((20+9) % 17)(%rsp),%r9 + leaq (%rdx, %r12),%r12 + rolq $RC_1024_7_7,%rdx + xorq %r12, %rdx + addq ksKey+8*((20+12) % 17)(%rsp),%r12 + addq ksKey+8*((20+7) % 17)(%rsp),%rdx + # end of rounds ################# # @@ -1204,7 +5394,6 @@ decq blkCnt(%rsp) #set zero flag iff done movq %rbx,X_VARS+8*7(%rdx) - Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,, # go back for more blocks, if needed movq ctxPtr(%rsp),%rdi #don't muck with the flags here! lea FRAME_OFFS(%rsp),%rbp @@ -1215,45 +5404,6 @@ # Skein1024_Process_Block_End: # -.if _SKEIN_DEBUG -Skein_Debug_Round_1024: - # call here with rdx = "round number", -_SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr - # - #save rest of X[] state on stack so debug routines can access it - .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15 - movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp) - .endr - # Figure out what to do with x0 (rdi). When rdx == 0 mod 4, it's already on stack - cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save - jae save_x0 - testq $3,%rdx #otherwise only if rdx != 0 mod 4 - jz save_x0_not -save_x0: - movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp) -save_x0_not: - #figure out the x4/x6 swapping state and save the correct one! 
- cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4 - jae save_x4 - testq $1,%rdx #and even ones have r4 as well - jz save_x4 - movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp) - jmp debug_1024_go -save_x4: - movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp) -debug_1024_go: - #now all is saved in Xstk[] except for rdx - push %rsi #save two regs for BLK_BITS-specific parms - push %rdi -_SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32) - - movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call) - movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[] - - movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr - movq $1024,%rdi #rdi = block size - jmp Skein_Debug_Round_Common -.endif # .if _SKEIN_CODE_SIZE C_label Skein1024_Process_Block_CodeSize @@ -1271,61 +5421,7 @@ # .endif # _USE_ASM_ and 1024 # -.if _SKEIN_DEBUG #---------------------------------------------------------------- -#local debug routine to set up for calls to: -# void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X) -# [ rdi rsi rdx rcx] -# -# here with %rdx = round number -# %rsi = ctx_hdr_ptr -# %rdi = block size (256/512/1024) -# on stack: saved rdi, saved rsi, retAddr, saved rdx -# -Skein_Debug_Round_Common: -_SP_OFFS_ = 32 #account for four words on stack already - .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs - pushq %\_rr_ -_SP_OFFS_ = _SP_OFFS_+8 - .endr - .if (_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here - .error "Debug_Round_Common: stack alignment" - .endif - # compute %rcx = ptr to the X[] array on the stack (final parameter to call) - leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address - cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"? - jnz _got_rcxA - leaq X_VARS(%rsi),%rcx -_got_rcxA: - .if _USE_ASM_ & 1024 - # special handling for 1024-bit case - # (for rounds right before with key injection: - # use xDebug_1024[] instead of X_stk[]) - cmpq $SKEIN_RND_SPECIAL,%rdx - jae _got_rcxB #must be a normal round - orq %rdx,%rdx - jz _got_rcxB #just before key injection - test $3,%rdx - jne _got_rcxB - cmp $1024,%rdi #only 1024-bit(s) for now - jne _got_rcxB - leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx -_got_rcxB: - .endif - call Skein_Show_Round #call external debug handler +# .section .note.GNU-stack,"",@progbits - .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs - popq %\_rr_ -_SP_OFFS_ = _SP_OFFS_-8 - .endr - .if _SP_OFFS_ - 32 - .error "Debug_Round_Common: push/pop misalignment!" - .endif - popq %rdi - popq %rsi - ret -.endif -#---------------------------------------------------------------- - .section .note.GNU-stack,"",@progbits - .end Index: sys/crypto/skein/amd64/skein_block_asm.s =================================================================== --- sys/crypto/skein/amd64/skein_block_asm.s +++ sys/crypto/skein/amd64/skein_block_asm.s @@ -1,1331 +0,0 @@ -# -#---------------------------------------------------------------- -# 64-bit x86 assembler code (gnu as) for Skein block functions -# -# Author: Doug Whiting, Hifn/Exar -# -# This code is released to the public domain. 
-#---------------------------------------------------------------- -# $FreeBSD$ -# - .text - .altmacro - .psize 0,128 #list file has no page boundaries -# -_MASK_ALL_ = (256+512+1024) #all three algorithm bits -_MAX_FRAME_ = 240 -# -################# -.ifndef SKEIN_USE_ASM -_USE_ASM_ = _MASK_ALL_ -.else -_USE_ASM_ = SKEIN_USE_ASM -.endif -################# -.ifndef SKEIN_LOOP #configure loop unrolling -_SKEIN_LOOP = 2 #default is fully unrolled for 256/512, twice for 1024 -.else -_SKEIN_LOOP = SKEIN_LOOP - .irp _NN_,%_SKEIN_LOOP #only display loop unrolling if default changed on command line -#.print "+++ SKEIN_LOOP = \_NN_" - .endr -.endif -# the unroll counts (0 --> fully unrolled) -SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10 -SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10 -SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10 -# -SKEIN_ASM_UNROLL = 0 - .irp _NN_,256,512,1024 - .if (SKEIN_UNROLL_\_NN_) == 0 -SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + \_NN_ - .endif - .endr -################# -# -.ifndef SKEIN_ROUNDS -ROUNDS_256 = 72 -ROUNDS_512 = 72 -ROUNDS_1024 = 80 -.else -ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5) -ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5) -ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5) -# only display rounds if default size is changed on command line -.irp _NN_,256,512,1024 - .if _USE_ASM_ && \_NN_ - .irp _RR_,%(ROUNDS_\_NN_) - .if _NN_ < 1024 -.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" - .else -.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" - .endif - .endr - .endif -.endr -.endif -################# -# -.ifdef SKEIN_CODE_SIZE -_SKEIN_CODE_SIZE = (1) -.else -.ifdef SKEIN_PERF #use code size if SKEIN_PERF is defined -_SKEIN_CODE_SIZE = (1) -.else -_SKEIN_CODE_SIZE = (0) -.endif -.endif -# -################# -# -.ifndef SKEIN_DEBUG -_SKEIN_DEBUG = 0 -.else -_SKEIN_DEBUG = 1 -.endif -################# -# -# define offsets of fields in hash context structure -# -HASH_BITS = 0 #bits of hash output -BCNT = 8 + HASH_BITS #number of bytes in BUFFER[] -TWEAK = 8 + BCNT #tweak values[0..1] -X_VARS = 16 + TWEAK #chaining vars -# -#(Note: buffer[] in context structure is NOT needed here :-) -# -KW_PARITY = 0x1BD11BDAA9FC1A22 #overall parity of key schedule words -FIRST_MASK = ~ (1 << 6) -FIRST_MASK64= ~ (1 << 62) -# -# rotation constants for Skein -# -RC_256_0_0 = 14 -RC_256_0_1 = 16 - -RC_256_1_0 = 52 -RC_256_1_1 = 57 - -RC_256_2_0 = 23 -RC_256_2_1 = 40 - -RC_256_3_0 = 5 -RC_256_3_1 = 37 - -RC_256_4_0 = 25 -RC_256_4_1 = 33 - -RC_256_5_0 = 46 -RC_256_5_1 = 12 - -RC_256_6_0 = 58 -RC_256_6_1 = 22 - -RC_256_7_0 = 32 -RC_256_7_1 = 32 - -RC_512_0_0 = 46 -RC_512_0_1 = 36 -RC_512_0_2 = 19 -RC_512_0_3 = 37 - -RC_512_1_0 = 33 -RC_512_1_1 = 27 -RC_512_1_2 = 14 -RC_512_1_3 = 42 - -RC_512_2_0 = 17 -RC_512_2_1 = 49 -RC_512_2_2 = 36 -RC_512_2_3 = 39 - -RC_512_3_0 = 44 -RC_512_3_1 = 9 -RC_512_3_2 = 54 -RC_512_3_3 = 56 - -RC_512_4_0 = 39 -RC_512_4_1 = 30 -RC_512_4_2 = 34 -RC_512_4_3 = 24 - -RC_512_5_0 = 13 -RC_512_5_1 = 50 -RC_512_5_2 = 10 -RC_512_5_3 = 17 - -RC_512_6_0 = 25 -RC_512_6_1 = 29 -RC_512_6_2 = 39 -RC_512_6_3 = 43 - -RC_512_7_0 = 8 -RC_512_7_1 = 35 -RC_512_7_2 = 56 -RC_512_7_3 = 22 - -RC_1024_0_0 = 24 -RC_1024_0_1 = 13 -RC_1024_0_2 = 8 -RC_1024_0_3 = 47 -RC_1024_0_4 = 8 -RC_1024_0_5 = 17 -RC_1024_0_6 = 22 -RC_1024_0_7 = 37 - -RC_1024_1_0 = 38 -RC_1024_1_1 = 19 -RC_1024_1_2 = 10 -RC_1024_1_3 = 55 -RC_1024_1_4 = 49 -RC_1024_1_5 = 18 -RC_1024_1_6 = 23 -RC_1024_1_7 = 52 - -RC_1024_2_0 = 33 -RC_1024_2_1 = 4 -RC_1024_2_2 = 51 -RC_1024_2_3 = 13 -RC_1024_2_4 = 34 -RC_1024_2_5 = 41 -RC_1024_2_6 = 
59 -RC_1024_2_7 = 17 - -RC_1024_3_0 = 5 -RC_1024_3_1 = 20 -RC_1024_3_2 = 48 -RC_1024_3_3 = 41 -RC_1024_3_4 = 47 -RC_1024_3_5 = 28 -RC_1024_3_6 = 16 -RC_1024_3_7 = 25 - -RC_1024_4_0 = 41 -RC_1024_4_1 = 9 -RC_1024_4_2 = 37 -RC_1024_4_3 = 31 -RC_1024_4_4 = 12 -RC_1024_4_5 = 47 -RC_1024_4_6 = 44 -RC_1024_4_7 = 30 - -RC_1024_5_0 = 16 -RC_1024_5_1 = 34 -RC_1024_5_2 = 56 -RC_1024_5_3 = 51 -RC_1024_5_4 = 4 -RC_1024_5_5 = 53 -RC_1024_5_6 = 42 -RC_1024_5_7 = 41 - -RC_1024_6_0 = 31 -RC_1024_6_1 = 44 -RC_1024_6_2 = 47 -RC_1024_6_3 = 46 -RC_1024_6_4 = 19 -RC_1024_6_5 = 42 -RC_1024_6_6 = 44 -RC_1024_6_7 = 25 - -RC_1024_7_0 = 9 -RC_1024_7_1 = 48 -RC_1024_7_2 = 35 -RC_1024_7_3 = 52 -RC_1024_7_4 = 23 -RC_1024_7_5 = 31 -RC_1024_7_6 = 37 -RC_1024_7_7 = 20 -# -# Input: reg -# Output: <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024 -# -.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM -_RCNT_ = RC_\BLK_SIZE&_\ROUND_NUM&_\MIX_NUM - .if _RCNT_ #is there anything to do? - rolq $_RCNT_,%\reg - .endif -.endm -# -#---------------------------------------------------------------- -# -# MACROS: define local vars and configure stack -# -#---------------------------------------------------------------- -# declare allocated space on the stack -.macro StackVar localName,localSize -\localName = _STK_OFFS_ -_STK_OFFS_ = _STK_OFFS_+(\localSize) -.endm #StackVar -# -#---------------------------------------------------------------- -# -# MACRO: Configure stack frame, allocate local vars -# -.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt - WCNT = (\BLK_BITS)/64 -# -_PushCnt_ = 0 #save nonvolatile regs on stack - .irp _reg_,rbp,rbx,r12,r13,r14,r15 - pushq %\_reg_ -_PushCnt_ = _PushCnt_ + 1 #track count to keep alignment - .endr -# -_STK_OFFS_ = 0 #starting offset from rsp - #---- local variables #<-- rsp - StackVar X_stk ,8*(WCNT) #local context vars - StackVar ksTwk ,8*3 #key schedule: tweak words - StackVar ksKey ,8*(WCNT)+8 #key schedule: key words - .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0 - StackVar ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen - .endif - StackVar Wcopy ,8*(WCNT) #copy of input block - .if _SKEIN_DEBUG - .if \debugCnt + 0 #temp location for debug X[] info - StackVar xDebug_\BLK_BITS ,8*(\debugCnt) - .endif - .endif - .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0 - StackVar align16,8 #keep 16-byte aligned (adjust for retAddr?) 
-tmpStk_\BLK_BITS = align16 #use this - .endif - #---- saved caller parameters (from regs rdi, rsi, rdx, rcx) - StackVar ctxPtr ,8 #context ptr - StackVar blkPtr ,8 #pointer to block data - StackVar blkCnt ,8 #number of full blocks to process - StackVar bitAdd ,8 #bit count to add to tweak -LOCAL_SIZE = _STK_OFFS_ #size of "local" vars - #---- - StackVar savRegs,8*_PushCnt_ #saved registers - StackVar retAddr,8 #return address - #---- caller's stack frame (aligned mod 16) -# -# set up the stack frame pointer (rbp) -# -FRAME_OFFS = ksTwk + 128 #allow short (negative) offset to ksTwk, kwKey - .if FRAME_OFFS > _STK_OFFS_ #keep rbp in the "locals" range -FRAME_OFFS = _STK_OFFS_ - .endif -F_O = -FRAME_OFFS -# - #put some useful defines in the .lst file (for grep) -__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE -__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_ -__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS -# -# Notes on stack frame setup: -# * the most frequently used variable is X_stk[], based at [rsp+0] -# * the next most used is the key schedule arrays, ksKey and ksTwk -# so rbp is "centered" there, allowing short offsets to the key -# schedule even in 1024-bit Skein case -# * the Wcopy variables are infrequently accessed, but they have long -# offsets from both rsp and rbp only in the 1024-bit case. -# * all other local vars and calling parameters can be accessed -# with short offsets, except in the 1024-bit case -# - subq $LOCAL_SIZE,%rsp #make room for the locals - leaq FRAME_OFFS(%rsp),%rbp #maximize use of short offsets - movq %rdi, ctxPtr+F_O(%rbp) #save caller's parameters on the stack - movq %rsi, blkPtr+F_O(%rbp) - movq %rdx, blkCnt+F_O(%rbp) - movq %rcx, bitAdd+F_O(%rbp) -# -.endm #Setup_Stack -# -#---------------------------------------------------------------- -# -.macro Reset_Stack - addq $LOCAL_SIZE,%rsp #get rid of locals (wipe??) - .irp _reg_,r15,r14,r13,r12,rbx,rbp - popq %\_reg_ #restore caller's regs -_PushCnt_ = _PushCnt_ - 1 - .endr - .if _PushCnt_ - .error "Mismatched push/pops?" 
- .endif -.endm # Reset_Stack -# -#---------------------------------------------------------------- -# macros to help debug internals -# -.if _SKEIN_DEBUG - .extern Skein_Show_Block #calls to C routines - .extern Skein_Show_Round -# -SKEIN_RND_SPECIAL = 1000 -SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0 -SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1 -SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2 -# -.macro Skein_Debug_Block BLK_BITS -# -#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X, -# const u08b_t *blkPtr, const u64b_t *wPtr, -# const u64b_t *ksPtr,const u64b_t *tsPtr) -# -_NN_ = 0 - .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11 - pushq %\_reg_ #save all volatile regs on tack before the call -_NN_ = _NN_ + 1 - .endr - # get and push call parameters - movq $\BLK_BITS ,%rdi #bits - movq ctxPtr+F_O(%rbp),%rsi #h (pointer) - leaq X_VARS (%rsi),%rdx #X (pointer) - movq blkPtr+F_O(%rbp),%rcx #blkPtr - leaq Wcopy +F_O(%rbp),%r8 #wPtr - leaq ksKey +F_O(%rbp),%r9 #key pointer - leaq ksTwk +F_O(%rbp),%rax #tweak pointer - pushq %rax # (pass on the stack) - call Skein_Show_Block #call external debug handler - addq $8*1,%rsp #discard parameters on stack - .if (_NN_ % 2 ) == 0 #check stack alignment - .error "Stack misalignment problem in Skein_Debug_Block_\_BLK_BITS" - .endif - .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax - popq %\_reg_ #restore regs -_NN_ = _NN_ - 1 - .endr - .if _NN_ - .error "Push/pop mismatch problem in Skein_Debug_Block_\_BLK_BITS" - .endif -.endm # Skein_Debug_Block -# -# the macro to "call" to debug a round -# -.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp - # call the appropriate (local) debug "function" - pushq %rdx #save rdx, so we can use it for round "number" - .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL) - movq $\R,%rdx - .else #compute round number using edi -_rOffs_ = \RDI_OFFS + 0 - .if \BLK_BITS == 1024 - movq rIdx_offs+8(%rsp),%rdx #get rIdx off the stack (adjust for pushq rdx above) - leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx - .else - leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx - .endif - .endif - call Skein_Debug_Round_\BLK_BITS - popq %rdx #restore origianl rdx value -# - afterOp -.endm # Skein_Debug_Round -.else #------- _SKEIN_DEBUG (dummy macros if debug not enabled) -.macro Skein_Debug_Block BLK_BITS -.endm -# -.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp -.endm -# -.endif # _SKEIN_DEBUG -# -#---------------------------------------------------------------- -# -.macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs - .if \immOffs + 0 - leaq \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg - .elseif ((\useAddOp + 0) == 0) - .ifndef ASM_NO_LEA #lea seems to be faster on Core 2 Duo CPUs! 
- leaq (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg - .else - addq %\srcReg_A\srcReg_B,%\dstReg - .endif - .else - addq %\srcReg_A\srcReg_B,%\dstReg - .endif -.endm - -# keep Intel-style ordering here, to match addReg -.macro xorReg dstReg,srcReg_A,srcReg_B - xorq %\srcReg_A\srcReg_B,%\dstReg -.endm -# -#---------------------------------------------------------------- -# -.macro C_label lName - \lName: #use both "genders" to work across linkage conventions -_\lName: - .global \lName - .global _\lName -.endm -# -#=================================== Skein_256 ============================================= -# -.if _USE_ASM_ & 256 -# -# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# -# -################# -# -# code -# -C_label Skein_256_Process_Block - Setup_Stack 256,((ROUNDS_256/8)+1) - movq TWEAK+8(%rdi),%r14 - jmp Skein_256_block_loop - .p2align 4 - # main hash loop for Skein_256 -Skein_256_block_loop: - # - # general register usage: - # RAX..RDX = X0..X3 - # R08..R12 = ks[0..4] - # R13..R15 = ts[0..2] - # RSP, RBP = stack/frame pointers - # RDI = round counter or context pointer - # RSI = temp - # - movq TWEAK+0(%rdi) ,%r13 - addq bitAdd+F_O(%rbp) ,%r13 #computed updated tweak value T0 - movq %r14 ,%r15 - xorq %r13 ,%r15 #now %r13.%r15 is set as the tweak - - movq $KW_PARITY ,%r12 - movq X_VARS+ 0(%rdi),%r8 - movq X_VARS+ 8(%rdi),%r9 - movq X_VARS+16(%rdi),%r10 - movq X_VARS+24(%rdi),%r11 - movq %r13,TWEAK+0(%rdi) #save updated tweak value ctx->h.T[0] - xorq %r8 ,%r12 #start accumulating overall parity - - movq blkPtr +F_O(%rbp) ,%rsi #esi --> input block - xorq %r9 ,%r12 - movq 0(%rsi) ,%rax #get X[0..3] - xorq %r10 ,%r12 - movq 8(%rsi) ,%rbx - xorq %r11 ,%r12 - movq 16(%rsi) ,%rcx - movq 24(%rsi) ,%rdx - - movq %rax,Wcopy+ 0+F_O(%rbp) #save copy of input block - movq %rbx,Wcopy+ 8+F_O(%rbp) - movq %rcx,Wcopy+16+F_O(%rbp) - movq %rdx,Wcopy+24+F_O(%rbp) - - addq %r8 ,%rax #initial key injection - addq %r9 ,%rbx - addq %r10,%rcx - addq %r11,%rdx - addq %r13,%rbx - addq %r14,%rcx - -.if _SKEIN_DEBUG - movq %r14,TWEAK+ 8(%rdi) #save updated tweak T[1] (start bit cleared?) - movq %r8 ,ksKey+ 0+F_O(%rbp) #save key schedule on stack for Skein_Debug_Block - movq %r9 ,ksKey+ 8+F_O(%rbp) - movq %r10,ksKey+16+F_O(%rbp) - movq %r11,ksKey+24+F_O(%rbp) - movq %r12,ksKey+32+F_O(%rbp) - - movq %r13,ksTwk+ 0+F_O(%rbp) - movq %r14,ksTwk+ 8+F_O(%rbp) - movq %r15,ksTwk+16+F_O(%rbp) - - movq %rax,X_stk + 0(%rsp) #save X[] on stack for Skein_Debug_Block - movq %rbx,X_stk + 8(%rsp) - movq %rcx,X_stk +16(%rsp) - movq %rdx,X_stk +24(%rsp) - - Skein_Debug_Block 256 #debug dump - Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL -.endif -# -.if ((SKEIN_ASM_UNROLL & 256) == 0) - movq %r8 ,ksKey+40+F_O(%rbp) #save key schedule on stack for looping code - movq %r9 ,ksKey+ 8+F_O(%rbp) - movq %r10,ksKey+16+F_O(%rbp) - movq %r11,ksKey+24+F_O(%rbp) - movq %r12,ksKey+32+F_O(%rbp) - - movq %r13,ksTwk+24+F_O(%rbp) - movq %r14,ksTwk+ 8+F_O(%rbp) - movq %r15,ksTwk+16+F_O(%rbp) -.endif - addq $WCNT*8,%rsi #skip the block - movq %rsi,blkPtr +F_O(%rbp) #update block pointer - # - # now the key schedule is computed. 
Start the rounds - # -.if SKEIN_ASM_UNROLL & 256 -_UNROLL_CNT = ROUNDS_256/8 -.else -_UNROLL_CNT = SKEIN_UNROLL_256 - .if ((ROUNDS_256/8) % _UNROLL_CNT) - .error "Invalid SKEIN_UNROLL_256" - .endif - xorq %rdi,%rdi #rdi = iteration count -Skein_256_round_loop: -.endif -_Rbase_ = 0 -.rept _UNROLL_CNT*2 - # all X and ks vars in regs # (ops to "rotate" ks vars, via mem, if not unrolled) - # round 4*_RBase_ + 0 - addReg rax, rbx - RotL64 rbx, 256,%((4*_Rbase_+0) % 8),0 - addReg rcx, rdx - .if (SKEIN_ASM_UNROLL & 256) == 0 - movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8 - .endif - xorReg rbx, rax - RotL64 rdx, 256,%((4*_Rbase_+0) % 8),1 - xorReg rdx, rcx - .if SKEIN_ASM_UNROLL & 256 - .irp _r0_,%( 8+(_Rbase_+3) % 5) - .irp _r1_,%(13+(_Rbase_+2) % 3) - leaq (%r\_r0_,%r\_r1_),%rdi #precompute key injection value for %rcx - .endr - .endr - .endif - .if (SKEIN_ASM_UNROLL & 256) == 0 - movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13 - .endif - Skein_Debug_Round 256,%(4*_Rbase_+1) - - # round 4*_Rbase_ + 1 - addReg rax, rdx - RotL64 rdx, 256,%((4*_Rbase_+1) % 8),0 - xorReg rdx, rax - .if (SKEIN_ASM_UNROLL & 256) == 0 - movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9 - .endif - addReg rcx, rbx - RotL64 rbx, 256,%((4*_Rbase_+1) % 8),1 - xorReg rbx, rcx - .if (SKEIN_ASM_UNROLL & 256) == 0 - movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11 - .endif - Skein_Debug_Round 256,%(4*_Rbase_+2) - .if SKEIN_ASM_UNROLL & 256 - .irp _r0_,%( 8+(_Rbase_+2) % 5) - .irp _r1_,%(13+(_Rbase_+1) % 3) - leaq (%r\_r0_,%r\_r1_),%rsi #precompute key injection value for %rbx - .endr - .endr - .endif - # round 4*_Rbase_ + 2 - addReg rax, rbx - RotL64 rbx, 256,%((4*_Rbase_+2) % 8),0 - addReg rcx, rdx - .if (SKEIN_ASM_UNROLL & 256) == 0 - movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10 - .endif - xorReg rbx, rax - RotL64 rdx, 256,%((4*_Rbase_+2) % 8),1 - xorReg rdx, rcx - .if (SKEIN_ASM_UNROLL & 256) == 0 - movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8) #"rotate" the key - leaq 1(%r11,%rdi),%r11 #precompute key + tweak - .endif - Skein_Debug_Round 256,%(4*_Rbase_+3) - # round 4*_Rbase_ + 3 - addReg rax, rdx - RotL64 rdx, 256,%((4*_Rbase_+3) % 8),0 - addReg rcx, rbx - .if (SKEIN_ASM_UNROLL & 256) == 0 - addq ksTwk+8*2+F_O(%rbp,%rdi,8),%r10 #precompute key + tweak - movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8) #"rotate" the tweak - .endif - xorReg rdx, rax - RotL64 rbx, 256,%((4*_Rbase_+3) % 8),1 - xorReg rbx, rcx - Skein_Debug_Round 256,%(4*_Rbase_+4) - .if (SKEIN_ASM_UNROLL & 256) == 0 - addReg r9 ,r13 #precompute key+tweak - .endif - #inject key schedule words -_Rbase_ = _Rbase_+1 - .if SKEIN_ASM_UNROLL & 256 - addReg rax,r,%(8+((_Rbase_+0) % 5)) - addReg rbx,rsi - addReg rcx,rdi - addReg rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_ - .else - incq %rdi - addReg rax,r8 - addReg rcx,r10 - addReg rbx,r9 - addReg rdx,r11 - .endif - Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT -.endr #rept _UNROLL_CNT -# -.if (SKEIN_ASM_UNROLL & 256) == 0 - cmpq $2*(ROUNDS_256/8),%rdi - jb Skein_256_round_loop -.endif # (SKEIN_ASM_UNROLL & 256) == 0 - movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context - - #---------------------------- - # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3} - movq $FIRST_MASK64 ,%r14 - xorq Wcopy + 0+F_O (%rbp),%rax - xorq Wcopy + 8+F_O (%rbp),%rbx - xorq Wcopy +16+F_O (%rbp),%rcx - xorq Wcopy +24+F_O (%rbp),%rdx - andq TWEAK + 8 (%rdi),%r14 - movq %rax,X_VARS+ 0(%rdi) #store final result - movq %rbx,X_VARS+ 8(%rdi) - movq %rcx,X_VARS+16(%rdi) - movq %rdx,X_VARS+24(%rdi) - - Skein_Debug_Round 256,SKEIN_RND_FEED_FWD - - # go back for more blocks, if needed - decq blkCnt+F_O(%rbp) - jnz 
Skein_256_block_loop - movq %r14,TWEAK + 8(%rdi) - Reset_Stack - ret -Skein_256_Process_Block_End: - - .if _SKEIN_DEBUG -Skein_Debug_Round_256: #here with rdx == round "number" from macro - pushq %rsi #save two regs for BLK_BITS-specific parms - pushq %rdi - movq 24(%rsp),%rdi #get back original rdx (pushed on stack in macro call) to rdi - movq %rax,X_stk+ 0+F_O(%rbp) #save X[] state on stack so debug routines can access it - movq %rbx,X_stk+ 8+F_O(%rbp) #(use FP_ since rsp has changed!) - movq %rcx,X_stk+16+F_O(%rbp) - movq %rdi,X_stk+24+F_O(%rbp) - - movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr - movq $256,%rdi #now are set for the call - jmp Skein_Debug_Round_Common - .endif -# -.if _SKEIN_CODE_SIZE -C_label Skein_256_Process_Block_CodeSize - movq $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax - ret -# -C_label Skein_256_Unroll_Cnt - .if _UNROLL_CNT <> ROUNDS_256/8 - movq $_UNROLL_CNT,%rax - .else - xorq %rax,%rax - .endif - ret -.endif -# -.endif #_USE_ASM_ & 256 -# -#=================================== Skein_512 ============================================= -# -.if _USE_ASM_ & 512 -# -# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd) -# -# X[i] == %r[8+i] #register assignments for X[] values during rounds (i=0..7) -# -################# -# MACRO: one round for 512-bit blocks -# -.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4 -# - addReg r\rn0, r\rn1 - RotL64 r\rn1, 512,%((_Rn_) % 8),0 - xorReg r\rn1, r\rn0 - op1 - addReg r\rn2, r\rn3 - RotL64 r\rn3, 512,%((_Rn_) % 8),1 - xorReg r\rn3, r\rn2 - op2 - addReg r\rn4, r\rn5 - RotL64 r\rn5, 512,%((_Rn_) % 8),2 - xorReg r\rn5, r\rn4 - op3 - addReg r\rn6, r\rn7 - RotL64 r\rn7, 512,%((_Rn_) % 8),3 - xorReg r\rn7, r\rn6 - op4 - Skein_Debug_Round 512,%(_Rn_+1),-4 -# -.endm #R_512_OneRound -# -################# -# MACRO: eight rounds for 512-bit blocks -# -.macro R_512_FourRounds _RR_ #RR = base round number (0 % 8) - .if (SKEIN_ASM_UNROLL && 512) - # here for fully unrolled case. 
- _II_ = ((_RR_)/4) + 1 #key injection counter - R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),,, - R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),,, - R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),,, - R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),, - # inject the key schedule - addq ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8 - addReg r11, rax - addq ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9 - addReg r12, rbx - addq ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10 - addReg r13, rcx - addReg r14, rdx - addReg r15, rsi,,,(_II_) - .else - # here for looping case #"rotate" key/tweak schedule (move up on stack) - incq %rdi #bump key injection counter - R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),,, - R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),,, - R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),,, - R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),, - # inject the key schedule - addq ksKey+8*0+F_O(%rbp,%rdi,8),%r8 - addReg r11, rax - addReg r12, rbx - addq ksKey+8*1+F_O(%rbp,%rdi,8),%r9 - addReg r13, rcx - addReg r14, rdx - addq ksKey+8*2+F_O(%rbp,%rdi,8),%r10 - addReg r15, rsi - addReg r15, rdi #inject the round number - .endif - - #show the result of the key injection - Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT -.endm #R_512_EightRounds -# -################# -# instantiated code -# -C_label Skein_512_Process_Block - Setup_Stack 512,ROUNDS_512/8 - movq TWEAK+ 8(%rdi),%rbx - jmp Skein_512_block_loop - .p2align 4 - # main hash loop for Skein_512 -Skein_512_block_loop: - # general register usage: - # RAX..RDX = temps for key schedule pre-loads - # R8 ..R15 = X0..X7 - # RSP, RBP = stack/frame pointers - # RDI = round counter or context pointer - # RSI = temp - # - movq TWEAK + 0(%rdi),%rax - addq bitAdd+F_O(%rbp),%rax #computed updated tweak value T0 - movq %rbx,%rcx - xorq %rax,%rcx #%rax/%rbx/%rcx = tweak schedule - movq %rax,TWEAK+ 0 (%rdi) #save updated tweak value ctx->h.T[0] - movq %rax,ksTwk+ 0+F_O(%rbp) - movq $KW_PARITY,%rdx - movq blkPtr +F_O(%rbp),%rsi #%rsi --> input block - movq %rbx,ksTwk+ 8+F_O(%rbp) - movq %rcx,ksTwk+16+F_O(%rbp) - .irp _Rn_,8,9,10,11,12,13,14,15 - movq X_VARS+8*(_Rn_-8)(%rdi),%r\_Rn_ - xorq %r\_Rn_,%rdx #compute overall parity - movq %r\_Rn_,ksKey+8*(_Rn_-8)+F_O(%rbp) - .endr #load state into %r8 ..%r15, compute parity - movq %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity - - addReg r13,rax #precompute key injection for tweak - addReg r14, rbx -.if _SKEIN_DEBUG - movq %rbx,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below -.endif - movq 0(%rsi),%rax #load input block - movq 8(%rsi),%rbx - movq 16(%rsi),%rcx - movq 24(%rsi),%rdx - addReg r8 , rax #do initial key injection - addReg r9 , rbx - movq %rax,Wcopy+ 0+F_O(%rbp) #keep local copy for feedforward - movq %rbx,Wcopy+ 8+F_O(%rbp) - addReg r10, rcx - addReg r11, rdx - movq %rcx,Wcopy+16+F_O(%rbp) - movq %rdx,Wcopy+24+F_O(%rbp) - - movq 32(%rsi),%rax - movq 40(%rsi),%rbx - movq 48(%rsi),%rcx - movq 56(%rsi),%rdx - addReg r12, rax - addReg r13, rbx - addReg r14, rcx - addReg r15, rdx - movq %rax,Wcopy+32+F_O(%rbp) - movq %rbx,Wcopy+40+F_O(%rbp) - movq %rcx,Wcopy+48+F_O(%rbp) - movq %rdx,Wcopy+56+F_O(%rbp) - -.if _SKEIN_DEBUG - .irp _Rn_,8,9,10,11,12,13,14,15 #save values on stack for debug output - movq %r\_Rn_,X_stk+8*(_Rn_-8)(%rsp) - .endr - - Skein_Debug_Block 512 #debug dump - Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL -.endif - addq $8*WCNT,%rsi #skip the block - movq %rsi,blkPtr+F_O(%rbp) #update block pointer - # - ################# - # now the key 
schedule is computed. Start the rounds - # -.if SKEIN_ASM_UNROLL & 512 -_UNROLL_CNT = ROUNDS_512/8 -.else -_UNROLL_CNT = SKEIN_UNROLL_512 - .if ((ROUNDS_512/8) % _UNROLL_CNT) - .error "Invalid SKEIN_UNROLL_512" - .endif - xorq %rdi,%rdi #rdi = round counter -Skein_512_round_loop: -.endif -# -_Rbase_ = 0 -.rept _UNROLL_CNT*2 - R_512_FourRounds %(4*_Rbase_+00) -_Rbase_ = _Rbase_+1 -.endr #rept _UNROLL_CNT -# -.if (SKEIN_ASM_UNROLL & 512) == 0 - cmpq $2*(ROUNDS_512/8),%rdi - jb Skein_512_round_loop - movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context -.endif - # end of rounds - ################# - # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7} - .irp _Rn_,8,9,10,11,12,13,14,15 - .if (_Rn_ == 8) - movq $FIRST_MASK64,%rbx - .endif - xorq Wcopy+8*(_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR - movq %r\_Rn_,X_VARS+8*(_Rn_-8)(%rdi) #and store result - .if (_Rn_ == 14) - andq TWEAK+ 8(%rdi),%rbx - .endif - .endr - Skein_Debug_Round 512,SKEIN_RND_FEED_FWD - - # go back for more blocks, if needed - decq blkCnt+F_O(%rbp) - jnz Skein_512_block_loop - movq %rbx,TWEAK + 8(%rdi) - - Reset_Stack - ret -Skein_512_Process_Block_End: -# - .if _SKEIN_DEBUG -# call here with rdx = "round number" -Skein_Debug_Round_512: - pushq %rsi #save two regs for BLK_BITS-specific parms - pushq %rdi - .irp _Rn_,8,9,10,11,12,13,14,15 #save X[] state on stack so debug routines can access it - movq %r\_Rn_,X_stk+8*(_Rn_-8)+F_O(%rbp) - .endr - movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr - movq $512,%rdi #now are set for the call - jmp Skein_Debug_Round_Common - .endif -# -.if _SKEIN_CODE_SIZE -C_label Skein_512_Process_Block_CodeSize - movq $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax - ret -# -C_label Skein_512_Unroll_Cnt - .if _UNROLL_CNT <> (ROUNDS_512/8) - movq $_UNROLL_CNT,%rax - .else - xorq %rax,%rax - .endif - ret -.endif -# -.endif # _USE_ASM_ & 512 -# -#=================================== Skein1024 ============================================= -.if _USE_ASM_ & 1024 -# -# void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# -# -################# -# use details of permutation to make register assignments -# -o1K_rdi = 0 #offsets in X[] associated with each register -o1K_rsi = 1 -o1K_rbp = 2 -o1K_rax = 3 -o1K_rcx = 4 #rcx is "shared" with X6, since X4/X6 alternate -o1K_rbx = 5 -o1K_rdx = 7 -o1K_r8 = 8 -o1K_r9 = 9 -o1K_r10 = 10 -o1K_r11 = 11 -o1K_r12 = 12 -o1K_r13 = 13 -o1K_r14 = 14 -o1K_r15 = 15 -# -rIdx_offs = tmpStk_1024 -# -.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1 - addReg \reg0 , \reg1 #perform the MIX - RotL64 \reg1 , 1024,%((_RN0_) % 8),_Rn1_ - xorReg \reg1 , \reg0 -.if ((_RN0_) && 3) == 3 #time to do key injection? 
- .if _SKEIN_DEBUG - movq %\reg0 , xDebug_1024+8*w0(%rsp) #save intermediate values for Debug_Round - movq %\reg1 , xDebug_1024+8*w1(%rsp) # (before inline key injection) - .endif -_II_ = ((_RN0_)/4)+1 #injection count - .if SKEIN_ASM_UNROLL && 1024 #here to do fully unrolled key injection - addq ksKey+ 8*((_II_+w0) % 17)(%rsp),%\reg0 - addq ksKey+ 8*((_II_+w1) % 17)(%rsp),%\reg1 - .if w1 == 13 #tweak injection - addq ksTwk+ 8*((_II_+ 0) % 3)(%rsp),%\reg1 - .elseif w0 == 14 - addq ksTwk+ 8*((_II_+ 1) % 3)(%rsp),%\reg0 - .elseif w1 == 15 - addq $_II_, %\reg1 #(injection counter) - .endif - .else #here to do looping key injection - .if (w0 == 0) - movq %rdi, X_stk+8*w0(%rsp) #if so, store N0 so we can use reg as index - movq rIdx_offs(%rsp),%rdi #get the injection counter index into rdi - .else - addq ksKey+8+8*w0(%rsp,%rdi,8),%\reg0 #even key injection - .endif - .if w1 == 13 #tweak injection - addq ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1 - .elseif w0 == 14 - addq ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0 - .elseif w1 == 15 - addReg \reg1,rdi,,,1 #(injection counter) - .endif - addq ksKey+8+8*w1(%rsp,%rdi,8),%\reg1 #odd key injection - .endif -.endif - # insert the op provided, .if any - op1 -.endm -################# -# MACRO: four rounds for 1024-bit blocks -# -.macro r1024_FourRounds _RR_ #RR = base round number (0 mod 4) - # should be here with X4 set properly, X6 stored on stack -_Rn_ = (_RR_) + 0 - r1024_Mix 0, 1,rdi,rsi,_Rn_,0 - r1024_Mix 2, 3,rbp,rax,_Rn_,1 - r1024_Mix 4, 5,rcx,rbx,_Rn_,2, #save X4 on stack (x4/x6 alternate) - r1024_Mix 8, 9,r8 ,r9 ,_Rn_,4, #load X6 from stack - r1024_Mix 10,11,r10,r11,_Rn_,5 - r1024_Mix 12,13,r12,r13,_Rn_,6 - r1024_Mix 6, 7,rcx,rdx,_Rn_,3 - r1024_Mix 14,15,r14,r15,_Rn_,7 - .if _SKEIN_DEBUG - Skein_Debug_Round 1024,%(_Rn_+1) - .endif -_Rn_ = (_RR_) + 1 - r1024_Mix 0, 9,rdi,r9 ,_Rn_,0 - r1024_Mix 2,13,rbp,r13,_Rn_,1 - r1024_Mix 6,11,rcx,r11,_Rn_,2, #save X6 on stack (x4/x6 alternate) - r1024_Mix 10, 7,r10,rdx,_Rn_,4, #load X4 from stack - r1024_Mix 12, 3,r12,rax,_Rn_,5 - r1024_Mix 14, 5,r14,rbx,_Rn_,6 - r1024_Mix 4,15,rcx,r15,_Rn_,3 - r1024_Mix 8, 1,r8 ,rsi,_Rn_,7 - .if _SKEIN_DEBUG - Skein_Debug_Round 1024,%(_Rn_+1) - .endif -_Rn_ = (_RR_) + 2 - r1024_Mix 0, 7,rdi,rdx,_Rn_,0 - r1024_Mix 2, 5,rbp,rbx,_Rn_,1 - r1024_Mix 4, 3,rcx,rax,_Rn_,2, #save X4 on stack (x4/x6 alternate) - r1024_Mix 12,15,r12,r15,_Rn_,4, #load X6 from stack - r1024_Mix 14,13,r14,r13,_Rn_,5 - r1024_Mix 8,11,r8 ,r11,_Rn_,6 - r1024_Mix 6, 1,rcx,rsi,_Rn_,3 - r1024_Mix 10, 9,r10,r9 ,_Rn_,7 - .if _SKEIN_DEBUG - Skein_Debug_Round 1024,%(_Rn_+1) - .endif -_Rn_ = (_RR_) + 3 - r1024_Mix 0,15,rdi,r15,_Rn_,0 - r1024_Mix 2,11,rbp,r11,_Rn_,1 - r1024_Mix 6,13,rcx,r13,_Rn_,2, #save X6 on stack (x4/x6 alternate) - r1024_Mix 14, 1,r14,rsi,_Rn_,4, #load X4 from stack - r1024_Mix 8, 5,r8 ,rbx,_Rn_,5 - r1024_Mix 10, 3,r10,rax,_Rn_,6 - r1024_Mix 4, 9,rcx,r9 ,_Rn_,3 - r1024_Mix 12, 7,r12,rdx,_Rn_,7 - .if _SKEIN_DEBUG - Skein_Debug_Round 1024,%(_Rn_+1) - .endif - - .if (SKEIN_ASM_UNROLL && 1024) == 0 #here with rdi == rIdx, X0 on stack - #"rotate" the key schedule on the stack -i8 = o1K_r8 -i0 = o1K_rdi - movq %r8 , X_stk+8*i8(%rsp) #free up a register (save it on the stack) - movq ksKey+8* 0(%rsp,%rdi,8),%r8 #get key word - movq %r8 , ksKey+8*17(%rsp,%rdi,8) #rotate key (must do key first or tweak clobbers it!) 
- movq ksTwk+8* 0(%rsp,%rdi,8),%r8 #get tweak word
- movq %r8 , ksTwk+8* 3(%rsp,%rdi,8) #rotate tweak (onto the stack)
- movq X_stk+8*i8(%rsp) ,%r8 #get the reg back
- incq %rdi #bump the index
- movq %rdi, rIdx_offs (%rsp) #save rdi again
- movq ksKey+8*i0(%rsp,%rdi,8),%rdi #get the key schedule word for X0 back
- addq X_stk+8*i0(%rsp) ,%rdi #perform the X0 key injection
- .endif
- #show the result of the key injection
- Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT
-.endm #r1024_FourRounds
-#
-################
-# code
-#
-C_label Skein1024_Process_Block
-#
- Setup_Stack 1024,ROUNDS_1024/8,WCNT
- movq TWEAK+ 8(%rdi),%r9
- jmp Skein1024_block_loop
- # main hash loop for Skein1024
- .p2align 4
-Skein1024_block_loop:
- # general register usage:
- # RSP = stack pointer
- # RAX..RDX,RSI,RDI = X1, X3..X7 (state words)
- # R8 ..R15 = X8..X15 (state words)
- # RBP = temp (used for X0 and X2)
- #
- .if (SKEIN_ASM_UNROLL & 1024) == 0
- xorq %rax,%rax #init loop index on the stack
- movq %rax,rIdx_offs(%rsp)
- .endif
- movq TWEAK+ 0(%rdi),%r8
- addq bitAdd+ F_O(%rbp),%r8 #computed updated tweak value T0
- movq %r9 ,%r10
- xorq %r8 ,%r10 #%rax/%rbx/%rcx = tweak schedule
- movq %r8 ,TWEAK+ 0(%rdi) #save updated tweak value ctx->h.T[0]
- movq %r8 ,ksTwk+ 0+F_O(%rbp)
- movq %r9 ,ksTwk+ 8+F_O(%rbp) #keep values in %r8 ,%r9 for initial tweak injection below
- movq %r10,ksTwk+16+F_O(%rbp)
- .if _SKEIN_DEBUG
- movq %r9 ,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block
- .endif
- movq blkPtr +F_O(%rbp),%rsi # rsi --> input block
- movq $KW_PARITY ,%rax #overall key schedule parity
-
- # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3]
- .irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps
- movq X_VARS+8*_rN_(%rdi),%r14 #get state word
- movq 8*_rN_(%rsi),%r15 #get msg word
- xorq %r14,%rax #update key schedule overall parity
- movq %r14,ksKey +8*_rN_+F_O(%rbp) #save key schedule word on stack
- movq %r15,Wcopy +8*_rN_+F_O(%rbp) #save local msg Wcopy
- addq %r15,%r14 #do the initial key injection
- movq %r14,X_stk +8*_rN_ (%rsp) #save initial state var on stack
- .endr
- # now process the rest, using the "real" registers
- # (MUST do it in reverse order to inject tweaks r8/r9 first)
- .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx
-_oo_ = o1K_\_rr_ #offset assocated with the register
- movq X_VARS+8*_oo_(%rdi),%\_rr_ #get key schedule word from context
- movq 8*_oo_(%rsi),%rcx #get next input msg word
- movq %\_rr_, ksKey +8*_oo_(%rsp) #save key schedule on stack
- xorq %\_rr_, %rax #accumulate key schedule parity
- movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward
- addq %rcx,%\_rr_ #do the initial key injection
- .if _oo_ == 13 #do the initial tweak injection
- addReg _rr_,r8 # (only in words 13/14)
- .elseif _oo_ == 14
- addReg _rr_,r9
- .endif
- .endr
- movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity
-.if _SKEIN_DEBUG
- Skein_Debug_Block 1024 #initial debug dump
-.endif
- addq $8*WCNT,%rsi #bump the msg ptr
- movq %rsi,blkPtr+F_O(%rbp) #save bumped msg ptr
- # re-load words 0..4 from stack, enter the main loop
- .irp _rr_,rdi,rsi,rbp,rax,rcx #(no need to re-load x6, already on stack)
- movq X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go!
- .endr
-.if _SKEIN_DEBUG
- Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection
-.endif
- #
- #################
- # now the key schedule is computed. Start the rounds
- #
-.if SKEIN_ASM_UNROLL & 1024
-_UNROLL_CNT = ROUNDS_1024/8
-.else
-_UNROLL_CNT = SKEIN_UNROLL_1024
- .if ((ROUNDS_1024/8) % _UNROLL_CNT)
- .error "Invalid SKEIN_UNROLL_1024"
- .endif
-Skein1024_round_loop:
-.endif
-#
-_Rbase_ = 0
-.rept _UNROLL_CNT*2 #implement the rounds, 4 at a time
- r1024_FourRounds %(4*_Rbase_+00)
-_Rbase_ = _Rbase_+1
-.endr #rept _UNROLL_CNT
-#
-.if (SKEIN_ASM_UNROLL & 1024) == 0
- cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see .if we are done
- jb Skein1024_round_loop
-.endif
- # end of rounds
- #################
- #
- # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15}
- movq %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack
- movq ctxPtr(%rsp),%rdx
-
- .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x7
-_oo_ = o1K_\_rr_
- xorq Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR
- movq %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context
- .if (_oo_ == 9)
- movq $FIRST_MASK64 ,%r9
- .endif
- .if (_oo_ == 14)
- andq TWEAK+ 8(%rdx),%r9
- .endif
- .endr
- #
- movq X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above)
- movq X_stk +8*7(%rsp),%rbx
- xorq Wcopy +8*6(%rsp),%rax
- xorq Wcopy +8*7(%rsp),%rbx
- movq %rax,X_VARS+8*6(%rdx)
- decq blkCnt(%rsp) #set zero flag iff done
- movq %rbx,X_VARS+8*7(%rdx)
-
- Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,,
- # go back for more blocks, if needed
- movq ctxPtr(%rsp),%rdi #don't muck with the flags here!
- lea FRAME_OFFS(%rsp),%rbp
- jnz Skein1024_block_loop
- movq %r9 ,TWEAK+ 8(%rdx)
- Reset_Stack
- ret
-#
-Skein1024_Process_Block_End:
-#
-.if _SKEIN_DEBUG
-Skein_Debug_Round_1024:
- # call here with rdx = "round number",
-_SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr
- #
- #save rest of X[] state on stack so debug routines can access it
- .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15
- movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp)
- .endr
- # Figure out what to do with x0 (rdi). When rdx == 0 mod 4, it's already on stack
- cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save
- jae save_x0
- testq $3,%rdx #otherwise only if rdx != 0 mod 4
- jz save_x0_not
-save_x0:
- movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp)
-save_x0_not:
- #figure out the x4/x6 swapping state and save the correct one!
- cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4
- jae save_x4
- testq $1,%rdx #and even ones have r4 as well
- jz save_x4
- movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp)
- jmp debug_1024_go
-save_x4:
- movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp)
-debug_1024_go:
- #now all is saved in Xstk[] except for rdx
- push %rsi #save two regs for BLK_BITS-specific parms
- push %rdi
-_SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32)
-
- movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call)
- movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[]
-
- movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr
- movq $1024,%rdi #rdi = block size
- jmp Skein_Debug_Round_Common
-.endif
-#
-.if _SKEIN_CODE_SIZE
-C_label Skein1024_Process_Block_CodeSize
- movq $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax
- ret
-#
-C_label Skein1024_Unroll_Cnt
- .if _UNROLL_CNT <> (ROUNDS_1024/8)
- movq $_UNROLL_CNT,%rax
- .else
- xorq %rax,%rax
- .endif
- ret
-.endif
-#
-.endif # _USE_ASM_ and 1024
-#
-.if _SKEIN_DEBUG
-#----------------------------------------------------------------
-#local debug routine to set up for calls to:
-# void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)
-# [ rdi rsi rdx rcx]
-#
-# here with %rdx = round number
-# %rsi = ctx_hdr_ptr
-# %rdi = block size (256/512/1024)
-# on stack: saved rdi, saved rsi, retAddr, saved rdx
-#
-Skein_Debug_Round_Common:
-_SP_OFFS_ = 32 #account for four words on stack already
- .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs
- pushq %\_rr_
-_SP_OFFS_ = _SP_OFFS_+8
- .endr
- .if (_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here
- .error "Debug_Round_Common: stack alignment"
- .endif
- # compute %rcx = ptr to the X[] array on the stack (final parameter to call)
- leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address
- cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"?
- jnz _got_rcxA
- leaq X_VARS(%rsi),%rcx
-_got_rcxA:
- .if _USE_ASM_ & 1024
- # special handling for 1024-bit case
- # (for rounds right before with key injection:
- # use xDebug_1024[] instead of X_stk[])
- cmpq $SKEIN_RND_SPECIAL,%rdx
- jae _got_rcxB #must be a normal round
- orq %rdx,%rdx
- jz _got_rcxB #just before key injection
- test $3,%rdx
- jne _got_rcxB
- cmp $1024,%rdi #only 1024-bit(s) for now
- jne _got_rcxB
- leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx
-_got_rcxB:
- .endif
- call Skein_Show_Round #call external debug handler
-
- .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs
- popq %\_rr_
-_SP_OFFS_ = _SP_OFFS_-8
- .endr
- .if _SP_OFFS_ - 32
- .error "Debug_Round_Common: push/pop misalignment!"
- .endif
- popq %rdi
- popq %rsi
- ret
-.endif
-#----------------------------------------------------------------
- .section .note.GNU-stack,"",@progbits
-
- .end
Index: sys/modules/crypto/Makefile
===================================================================
--- sys/modules/crypto/Makefile
+++ sys/modules/crypto/Makefile
@@ -30,13 +30,11 @@
 SRCS += skein.c skein_block.c
 # unroll the 256 and 512 loops, half unroll the 1024
 CFLAGS+= -DSKEIN_LOOP=995
-.if exists(${MACHINE_ARCH}/skein_block_asm.s)
+.if exists(${MACHINE_ARCH}/skein_block_asm.S)
 .PATH: ${SRCTOP}/sys/crypto/skein/${MACHINE_ARCH}
-SRCS += skein_block_asm.s
+SRCS += skein_block_asm.S
 CFLAGS += -DSKEIN_ASM -DSKEIN_USE_ASM=1792 # list of block functions to replace with assembly: 256+512+1024 = 1792
 ACFLAGS += -DELF -Wa,--noexecstack
-# Fully unroll all loops in the assembly optimized version
-AFLAGS+= --defsym SKEIN_LOOP=0
 .endif
 SRCS += siphash.c
 SRCS += gmac.c gfmult.c