Changeset View
Changeset View
Standalone View
Standalone View
sys/contrib/skein/asm/skein_block_xmm32.s
- This file was added.
Property | Old Value | New Value |
---|---|---|
svn:eol-style | null | native \ No newline at end of property |
svn:keywords | null | FreeBSD=%H \ No newline at end of property |
svn:mime-type | null | text/plain \ No newline at end of property |
# | |||||
#---------------------------------------------------------------- | |||||
cem: Identical to skein_block_xmm32.asm? | |||||
Not Done. Inline Actions. different assembly dialect. Probably don't need the unused ones I guess. This was just everything from the directories that made sense in the original distribution package. allanjude: different assembly dialect. Probably don't need the unused ones I guess. This was just… | |||||
# 32-bit x86 assembler code for Skein block functions using XMM registers | |||||
# | |||||
# Author: Doug Whiting, Hifn/Exar | |||||
# | |||||
# This code is released to the public domain. | |||||
#---------------------------------------------------------------- | |||||
# | |||||
.text | |||||
.altmacro #turn on alternate macro mode (the macros below use its features) | |||||
.psize 0,128 #no page breaks in the listing file | |||||
# | |||||
_MASK_ALL_ = (256+512+1024) #one selector bit per block size: 256, 512 and 1024 | |||||
SAVE_REGS = 1 | |||||
# | |||||
################# | |||||
# Choose which block functions get assembled. Every size is enabled to begin | |||||
# with; a caller-supplied SKEIN_USE_ASM replaces that choice only when it | |||||
# contains at least one of the three known selector bits. | |||||
_USE_ASM_ = _MASK_ALL_ | |||||
.ifdef SKEIN_USE_ASM | |||||
.if SKEIN_USE_ASM & _MASK_ALL_ | |||||
_USE_ASM_ = SKEIN_USE_ASM | |||||
.endif | |||||
.endif | |||||
# | |||||
################# | |||||
# SKEIN_LOOP is read as three decimal digits, one unroll count per block size | |||||
# (hundreds = 256, tens = 512, units = 1024); a digit of 0 requests a fully | |||||
# unrolled round loop for that size. | |||||
.ifndef SKEIN_LOOP | |||||
_SKEIN_LOOP = 002 #default is all fully unrolled, except Skein1024 | |||||
.else | |||||
_SKEIN_LOOP = SKEIN_LOOP | |||||
.endif | |||||
#-------------- | |||||
# the unroll counts (0 --> fully unrolled) | |||||
SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10 | |||||
SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10 | |||||
SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10 | |||||
# | |||||
# SKEIN_ASM_UNROLL accumulates the block sizes whose unroll count is 0. | |||||
# Later code tests it against a block size to pick the fully unrolled path. | |||||
SKEIN_ASM_UNROLL = 0 | |||||
.irp _NN_,256,512,1024 | |||||
.if (SKEIN_UNROLL_\_NN_) == 0 | |||||
SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + \_NN_ | |||||
.endif | |||||
.endr | |||||
# | |||||
################# | |||||
# | |||||
# Round counts. Without SKEIN_ROUNDS the counts are 72/72/80. With it, the | |||||
# value is read as three decimal digits (hundreds = 256, tens = 512, | |||||
# units = 1024) and each digit d selects 8*(((d+5) % 10)+5) rounds. | |||||
.ifndef SKEIN_ROUNDS | |||||
ROUNDS_256 = 72 | |||||
ROUNDS_512 = 72 | |||||
ROUNDS_1024 = 80 | |||||
.else | |||||
ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5) | |||||
ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5) | |||||
ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5) | |||||
# Report each selected round count at assembly time. | |||||
# NOTE(review): both .print branches below are identical as shown here. | |||||
.irp _NN_,256,512,1024 | |||||
.if _USE_ASM_ && \_NN_ | |||||
.irp _RR_,%(ROUNDS_\_NN_) | |||||
.if \_NN_ < 1024 | |||||
.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" | |||||
.else | |||||
.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" | |||||
.endif | |||||
.endr | |||||
.endif | |||||
.endr | |||||
.endif | |||||
################# | |||||
# | |||||
# _SKEIN_CODE_SIZE becomes defined when either SKEIN_PERF or SKEIN_CODE_SIZE | |||||
# is defined; it stays undefined otherwise (later code tests it with .ifdef). | |||||
.ifdef SKEIN_PERF | |||||
_SKEIN_CODE_SIZE = (1) | |||||
.endif | |||||
.ifdef SKEIN_CODE_SIZE | |||||
_SKEIN_CODE_SIZE = (1) | |||||
.endif | |||||
# | |||||
################# | |||||
# | |||||
# _SKEIN_DEBUG is always defined: 1 when SKEIN_DEBUG is defined, else 0. | |||||
_SKEIN_DEBUG = 0 | |||||
.ifdef SKEIN_DEBUG | |||||
_SKEIN_DEBUG = 1 | |||||
.endif | |||||
################# | |||||
# | |||||
# define offsets of fields in hash context structure | |||||
# | |||||
# Byte offsets into the hash context, each derived from the previous one. | |||||
# With the sizes written below they evaluate to 0, 4, 8 and 24. | |||||
HASH_BITS = 0 ## bits of hash output | |||||
BCNT = 4 + HASH_BITS #number of bytes in BUFFER[] | |||||
TWEAK = 4 + BCNT #tweak values[0..1] | |||||
X_VARS = 16 + TWEAK #chaining vars | |||||
# | |||||
#(Note: buffer[] in context structure is NOT needed here :-) | |||||
# | |||||
# The two halves below are combined into one 64-bit value that seeds the | |||||
# xor of all key words in the block functions. | |||||
KW_PARITY_LO= 0xA9FC1A22 #overall parity of key schedule words (hi32/lo32) | |||||
KW_PARITY_HI= 0x1BD11BDA | |||||
# FIRST_MASK8 is ANDed into the byte at TWEAK+15 after each block, which | |||||
# clears bit 6 of that byte. | |||||
FIRST_MASK8 = ~ (1 << 6) #FIRST block flag bit | |||||
# | |||||
# rotation constants for Skein | |||||
# | |||||
# Naming: RC_<block bits>_<row>_<column>. | |||||
# The round macros build these names at assembly time: the row comes from | |||||
# the round number reduced to the range 0..7 and the column identifies the | |||||
# mix within the round (2 columns for 256, 4 for 512, 8 for 1024). | |||||
# Each value is a left-rotate count for a 64-bit word. | |||||
#----- Skein-256 | |||||
RC_256_0_0 = 14 | |||||
RC_256_0_1 = 16 | |||||
RC_256_1_0 = 52 | |||||
RC_256_1_1 = 57 | |||||
RC_256_2_0 = 23 | |||||
RC_256_2_1 = 40 | |||||
RC_256_3_0 = 5 | |||||
RC_256_3_1 = 37 | |||||
RC_256_4_0 = 25 | |||||
RC_256_4_1 = 33 | |||||
RC_256_5_0 = 46 | |||||
RC_256_5_1 = 12 | |||||
RC_256_6_0 = 58 | |||||
RC_256_6_1 = 22 | |||||
RC_256_7_0 = 32 | |||||
RC_256_7_1 = 32 | |||||
#----- Skein-512 | |||||
RC_512_0_0 = 46 | |||||
RC_512_0_1 = 36 | |||||
RC_512_0_2 = 19 | |||||
RC_512_0_3 = 37 | |||||
RC_512_1_0 = 33 | |||||
RC_512_1_1 = 27 | |||||
RC_512_1_2 = 14 | |||||
RC_512_1_3 = 42 | |||||
RC_512_2_0 = 17 | |||||
RC_512_2_1 = 49 | |||||
RC_512_2_2 = 36 | |||||
RC_512_2_3 = 39 | |||||
RC_512_3_0 = 44 | |||||
RC_512_3_1 = 9 | |||||
RC_512_3_2 = 54 | |||||
RC_512_3_3 = 56 | |||||
RC_512_4_0 = 39 | |||||
RC_512_4_1 = 30 | |||||
RC_512_4_2 = 34 | |||||
RC_512_4_3 = 24 | |||||
RC_512_5_0 = 13 | |||||
RC_512_5_1 = 50 | |||||
RC_512_5_2 = 10 | |||||
RC_512_5_3 = 17 | |||||
RC_512_6_0 = 25 | |||||
RC_512_6_1 = 29 | |||||
RC_512_6_2 = 39 | |||||
RC_512_6_3 = 43 | |||||
RC_512_7_0 = 8 | |||||
RC_512_7_1 = 35 | |||||
RC_512_7_2 = 56 | |||||
RC_512_7_3 = 22 | |||||
#----- Skein-1024 | |||||
RC_1024_0_0 = 24 | |||||
RC_1024_0_1 = 13 | |||||
RC_1024_0_2 = 8 | |||||
RC_1024_0_3 = 47 | |||||
RC_1024_0_4 = 8 | |||||
RC_1024_0_5 = 17 | |||||
RC_1024_0_6 = 22 | |||||
RC_1024_0_7 = 37 | |||||
RC_1024_1_0 = 38 | |||||
RC_1024_1_1 = 19 | |||||
RC_1024_1_2 = 10 | |||||
RC_1024_1_3 = 55 | |||||
RC_1024_1_4 = 49 | |||||
RC_1024_1_5 = 18 | |||||
RC_1024_1_6 = 23 | |||||
RC_1024_1_7 = 52 | |||||
RC_1024_2_0 = 33 | |||||
RC_1024_2_1 = 4 | |||||
RC_1024_2_2 = 51 | |||||
RC_1024_2_3 = 13 | |||||
RC_1024_2_4 = 34 | |||||
RC_1024_2_5 = 41 | |||||
RC_1024_2_6 = 59 | |||||
RC_1024_2_7 = 17 | |||||
RC_1024_3_0 = 5 | |||||
RC_1024_3_1 = 20 | |||||
RC_1024_3_2 = 48 | |||||
RC_1024_3_3 = 41 | |||||
RC_1024_3_4 = 47 | |||||
RC_1024_3_5 = 28 | |||||
RC_1024_3_6 = 16 | |||||
RC_1024_3_7 = 25 | |||||
RC_1024_4_0 = 41 | |||||
RC_1024_4_1 = 9 | |||||
RC_1024_4_2 = 37 | |||||
RC_1024_4_3 = 31 | |||||
RC_1024_4_4 = 12 | |||||
RC_1024_4_5 = 47 | |||||
RC_1024_4_6 = 44 | |||||
RC_1024_4_7 = 30 | |||||
RC_1024_5_0 = 16 | |||||
RC_1024_5_1 = 34 | |||||
RC_1024_5_2 = 56 | |||||
RC_1024_5_3 = 51 | |||||
RC_1024_5_4 = 4 | |||||
RC_1024_5_5 = 53 | |||||
RC_1024_5_6 = 42 | |||||
RC_1024_5_7 = 41 | |||||
RC_1024_6_0 = 31 | |||||
RC_1024_6_1 = 44 | |||||
RC_1024_6_2 = 47 | |||||
RC_1024_6_3 = 46 | |||||
RC_1024_6_4 = 19 | |||||
RC_1024_6_5 = 42 | |||||
RC_1024_6_6 = 44 | |||||
RC_1024_6_7 = 25 | |||||
RC_1024_7_0 = 9 | |||||
RC_1024_7_1 = 48 | |||||
RC_1024_7_2 = 35 | |||||
RC_1024_7_3 = 52 | |||||
RC_1024_7_4 = 23 | |||||
RC_1024_7_5 = 31 | |||||
RC_1024_7_6 = 37 | |||||
RC_1024_7_7 = 20 | |||||
# | |||||
#---------------------------------------------------------------- | |||||
# declare allocated space on the stack | |||||
# StackVar localName,localSize | |||||
# Gives the symbol localName the current value of the running offset | |||||
# _STK_OFFS_, then advances _STK_OFFS_ by localSize bytes. | |||||
# Assembly-time bookkeeping only: no instructions are emitted. | |||||
.macro StackVar localName,localSize | |||||
.set \localName,_STK_OFFS_ | |||||
.set _STK_OFFS_,_STK_OFFS_+(\localSize) | |||||
.endm #StackVar | |||||
# | |||||
#---------------------------------------------------------------- | |||||
# | |||||
# MACRO: Configure stack frame, allocate local vars | |||||
# | |||||
# Setup_Stack WCNT,RND_CNT | |||||
#   WCNT    = number of 64-bit words per block | |||||
#   RND_CNT = total round count, used only to size ksRot | |||||
# Defines the local-variable offsets (relative to the aligned esp) and the | |||||
# caller-parameter offsets (relative to ebx), then emits the prologue. | |||||
# After the emitted code has run: | |||||
#   ebx --> pushal save area, followed by return address and parameters | |||||
#   esp --> 16-byte aligned local area of at least LOCAL_SIZE bytes | |||||
#   ebp  =  esp + FRAME_OFFS | |||||
#   edi --> hash context (the ctxPtr parameter) | |||||
#   ecx  =  the blkCnt parameter | |||||
.macro Setup_Stack WCNT,RND_CNT | |||||
_STK_OFFS_ = 0 #starting offset from esp, forced on 16-byte alignment | |||||
#----- local variables #<-- esp | |||||
StackVar X_stk , 8*(WCNT) #local context vars | |||||
StackVar Wcopy , 8*(WCNT) #copy of input block | |||||
StackVar ksTwk ,16*3 #key schedule: tweak words | |||||
StackVar ksKey ,16*(WCNT)+16#key schedule: key words | |||||
FRAME_OFFS = ksTwk+128 #<-- ebp | |||||
F_O = FRAME_OFFS #syntactic shorthand | |||||
# ksRot is reserved only when this block size is not fully unrolled | |||||
.if (SKEIN_ASM_UNROLL && (WCNT*64)) == 0 | |||||
StackVar ksRot,16*(RND_CNT/4)#leave space for ks "rotation" to happen | |||||
.endif | |||||
LOCAL_SIZE = _STK_OFFS_ #size of local vars | |||||
# | |||||
#"restart" the stack defns, because we relocate esp to guarantee alignment | |||||
# (i.e., these vars are NOT at fixed offsets from esp) | |||||
_STK_OFFS_ = 0 | |||||
#----- | |||||
StackVar savRegs,8*4 #pushad data | |||||
StackVar retAddr,4 #return address | |||||
#----- caller parameters | |||||
StackVar ctxPtr ,4 #context ptr | |||||
StackVar blkPtr ,4 #pointer to block data | |||||
StackVar blkCnt ,4 #number of full blocks to process | |||||
StackVar bitAdd ,4 #bit count to add to tweak | |||||
#----- caller's stack frame | |||||
# | |||||
# Notes on stack frame setup: | |||||
# * the most used variable (except for Skein-256) is X_stk[], based at [esp+0] | |||||
# * the next most used is the key schedule words | |||||
# so ebp is "centered" there, allowing short offsets to the key/tweak | |||||
# schedule in 256/512-bit Skein cases, but not possible for Skein-1024 :-( | |||||
# * the Wcopy variables are infrequently accessed, and they have long | |||||
# offsets from both esp and ebp only in the 1024-bit case. | |||||
# * all other local vars and calling parameters can be accessed | |||||
# with short offsets, except in the 1024-bit case | |||||
# | |||||
pushal #save all regs | |||||
movl %esp,%ebx #keep ebx as pointer to caller parms | |||||
subl $LOCAL_SIZE,%esp #make room for the locals | |||||
andl $~15,%esp #force alignment | |||||
movl ctxPtr(%ebx),%edi #edi --> Skein context | |||||
leal FRAME_OFFS(%esp),%ebp #maximize use of short offsets from ebp | |||||
movl blkCnt(%ebx),%ecx #keep block cnt in ecx | |||||
.endm #Setup_Stack | |||||
# | |||||
#---------------------------------------------------------------- | |||||
# | |||||
# Reset_Stack procStart | |||||
# Epilogue matching Setup_Stack: points esp back at the pushal save area | |||||
# held in ebx and restores all general registers. The caller of the macro | |||||
# supplies the ret. The procStart argument is not used in the body. | |||||
.macro Reset_Stack,procStart | |||||
movl %ebx,%esp #get rid of locals (wipe??) | |||||
popal #restore all regs | |||||
.endm # Reset_Stack | |||||
# | |||||
#---------------------------------------------------------------- | |||||
# macros to help debug internals | |||||
# | |||||
.if _SKEIN_DEBUG | |||||
.extern _Skein_Show_Block #calls to C routines | |||||
.extern _Skein_Show_Round | |||||
# | |||||
# Round numbers at or above SKEIN_RND_SPECIAL are markers, not real rounds | |||||
SKEIN_RND_SPECIAL = 1000 | |||||
SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0 | |||||
SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1 | |||||
SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2 | |||||
# | |||||
# Skein_Debug_Block BLK_BITS | |||||
#   BLK_BITS = 256, 512 or 1024; selects the _Put_XMM_ and _Get_XMM_ helpers | |||||
# Dumps the XMM state to X_stk, calls the C routine _Skein_Show_Block with | |||||
# seven stack arguments, then reloads the XMM state. General registers are | |||||
# preserved with pushal and popal around the call. | |||||
.macro Skein_Debug_Block BLK_BITS | |||||
# | |||||
#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X, | |||||
# const u08b_t *blkPtr, const u64b_t *wPtr, | |||||
# const u64b_t *ksPtr,const u64b_t *tsPtr)# | |||||
# | |||||
call _Put_XMM_\BLK_BITS | |||||
pushal #save all regs | |||||
leal ksTwk+1-F_O(%ebp),%eax #+1 = flag: "stride" size = 2 qwords | |||||
leal ksKey+1-F_O(%ebp),%esi | |||||
leal Wcopy+32(%esp),%ecx #adjust offset by 32 for pushad | |||||
movl ctxPtr(%ebx) ,%edx #ctx_hdr_ptr | |||||
leal X_VARS(%edx) ,%edx #edx ==> cxt->X[] | |||||
pushl %eax #tsPtr | |||||
pushl %esi #ksPtr | |||||
pushl %ecx #wPtr | |||||
pushl blkPtr(%ebx) #blkPtr | |||||
pushl %edx #ctx->Xptr | |||||
pushl ctxPtr(%ebx) #ctx_hdr_ptr | |||||
movl $\BLK_BITS,%eax | |||||
pushl %eax #bits | |||||
call _Skein_Show_Block | |||||
addl $7*4,%esp #discard parameter space on stack | |||||
popal #restore regs | |||||
# | |||||
call _Get_XMM_\BLK_BITS | |||||
.endm #Skein_Debug_Block | |||||
# | |||||
# Skein_Debug_Round BLK_BITS,R,saveRegs=0 | |||||
#   BLK_BITS = 256, 512 or 1024 | |||||
#   R        = round number, or one of the SKEIN_RND_ marker values | |||||
#   saveRegs = nonzero to dump the XMM state to X_stk before the call and | |||||
#              reload it afterwards | |||||
# Calls the C routine _Skein_Show_Round. The X pointer passed is X_stk, | |||||
# except for SKEIN_RND_FEED_FWD where it is the X[] array in the context. | |||||
# In looping builds the round number is computed at run time from edx. | |||||
.macro Skein_Debug_Round BLK_BITS,R,saveRegs=0 | |||||
# | |||||
#void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)# | |||||
# | |||||
.if \saveRegs | |||||
call _Put_XMM_\BLK_BITS | |||||
.endif | |||||
pushal #save all regs | |||||
# the argument is referenced with an explicit backslash, like every other use in this macro | |||||
.if \R <> SKEIN_RND_FEED_FWD | |||||
leal 32+X_stk(%esp),%eax #adjust offset by 32 for pushal | |||||
.else | |||||
movl ctxPtr(%ebx),%eax | |||||
addl $X_VARS,%eax | |||||
.endif | |||||
pushl %eax #Xptr | |||||
.if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL) | |||||
movl $\R,%eax | |||||
.else #compute round number from edx, R | |||||
leal 1+(((\R)-1) && 3)(,%edx,4),%eax | |||||
.endif | |||||
pushl %eax #round number | |||||
pushl ctxPtr(%ebx) #ctx_hdr_ptr | |||||
movl $\BLK_BITS,%eax | |||||
pushl %eax #bits | |||||
call _Skein_Show_Round | |||||
addl $4*4,%esp #discard parameter space on stack | |||||
popal #restore regs | |||||
.if \saveRegs | |||||
call _Get_XMM_\BLK_BITS #save internal vars for debug dump | |||||
.endif | |||||
.endm #Skein_Debug_Round | |||||
.endif #ifdef SKEIN_DEBUG | |||||
# | |||||
#---------------------------------------------------------------- | |||||
# useful macros | |||||
# _ldX xn: load the low 64 bits of XMM register number xn from X_stk slot xn | |||||
.macro _ldX xn | |||||
movq X_stk+8*(\xn)(%esp),%xmm\xn | |||||
.endm | |||||
# _stX xn: store the low 64 bits of XMM register number xn to X_stk slot xn | |||||
.macro _stX xn | |||||
movq %xmm\xn,X_stk+8*(\xn)(%esp) | |||||
.endm | |||||
# | |||||
#---------------------------------------------------------------- | |||||
# | |||||
# C_label lName | |||||
# Exports and defines two labels at the current location: lName itself and | |||||
# lName with a leading underscore, so either linkage convention resolves. | |||||
.macro C_label lName | |||||
.globl \lName | |||||
.globl _\lName | |||||
\lName: | |||||
_\lName: | |||||
.endm | |||||
# | |||||
.if _USE_ASM_ & 256 | |||||
# | |||||
# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# | |||||
# | |||||
################# | |||||
# | |||||
# Skein-256 round macros | |||||
# | |||||
# R_256_OneRound _RR_,x0,x1,x2,x3,t0,t1 | |||||
#   _RR_        = round number; it selects the RC_256 rotation constant pair | |||||
#   x0,x1,x2,x3 = XMM register numbers holding the four state words | |||||
#   t0,t1       = XMM register numbers used as scratch | |||||
# Emits two mixes: x0 += x1, then x1 = (x1 rotated left) xor x0, and the | |||||
# same for the pair x2,x3. Each 64-bit rotate is built from a left shift | |||||
# of the word, a right shift of a copy, and two xors. | |||||
# The two rotation counts are exchanged when x0 is not 0; R_256_FourRounds | |||||
# passes x0 = 2 on its second and fourth rounds. | |||||
.macro R_256_OneRound _RR_,x0,x1,x2,x3,t0,t1 | |||||
.irp _qq_,%((\_RR_) && 7) #figure out which rotation constants to use | |||||
.if \x0 == 0 | |||||
_RC0_ = RC_256_\_qq_&&_0 | |||||
_RC1_ = RC_256_\_qq_&&_1 | |||||
.else | |||||
_RC0_ = RC_256_\_qq_&&_1 | |||||
_RC1_ = RC_256_\_qq_&&_0 | |||||
.endif | |||||
.endr | |||||
# first mix: registers x0 and x1, scratch t0, rotate count _RC0_ | |||||
paddq %xmm\x1,%xmm\x0 | |||||
movq %xmm\x1,%xmm\t0 | |||||
psllq $ _RC0_,%xmm\x1 | |||||
psrlq $64-_RC0_,%xmm\t0 | |||||
xorpd %xmm\x0,%xmm\x1 | |||||
xorpd %xmm\t0,%xmm\x1 | |||||
# second mix: registers x2 and x3, scratch t1, rotate count _RC1_ | |||||
paddq %xmm\x3,%xmm\x2 | |||||
movq %xmm\x3,%xmm\t1 | |||||
psllq $ _RC1_,%xmm\x3 | |||||
psrlq $64-_RC1_,%xmm\t1 | |||||
xorpd %xmm\x2,%xmm\x3 | |||||
xorpd %xmm\t1,%xmm\x3 | |||||
.if _SKEIN_DEBUG | |||||
Skein_Debug_Round 256,%(\_RR_+1),SAVE_REGS | |||||
.endif | |||||
.endm #R_256_OneRound | |||||
# | |||||
# R_256_FourRounds _RN_ | |||||
#   _RN_ = number of the first of the four rounds | |||||
# Emits four rounds followed by one key injection. edx is incremented and | |||||
# its new value is added to xmm3 as the injection counter. | |||||
# Fully unrolled build: key and tweak slots are addressed from ebp with | |||||
# slot indices computed at assembly time. | |||||
# Looping build: slots are addressed from the rolling pointer esi; the | |||||
# slot-0 key and tweak words are then copied just past the end of their | |||||
# arrays and esi advances by one 16-byte slot. | |||||
# Uses xmm4 and xmm5 as scratch. | |||||
.macro R_256_FourRounds _RN_ | |||||
R_256_OneRound %(_RN_+0),0,1,2,3,4,5 | |||||
R_256_OneRound (_RN_+1),2,1,0,3,4,5 | |||||
R_256_OneRound (_RN_+2),0,1,2,3,4,5 | |||||
R_256_OneRound (_RN_+3),2,1,0,3,4,5 | |||||
#inject key schedule | |||||
incl %edx #bump round number | |||||
movd %edx,%xmm4 | |||||
.if _UNROLL_CNT == (ROUNDS_256/8) | |||||
#fully unrolled version | |||||
_RK_ = ((_RN_)/4) #key injection counter | |||||
paddq ksKey+16*((_RK_+1) % 5)-F_O(%ebp),%xmm0 | |||||
paddq ksKey+16*((_RK_+2) % 5)-F_O(%ebp),%xmm1 | |||||
paddq ksKey+16*((_RK_+3) % 5)-F_O(%ebp),%xmm2 | |||||
paddq ksKey+16*((_RK_+4) % 5)-F_O(%ebp),%xmm3 | |||||
paddq ksTwk+16*((_RK_+1) % 3)-F_O(%ebp),%xmm1 | |||||
paddq ksTwk+16*((_RK_+2) % 3)-F_O(%ebp),%xmm2 | |||||
paddq %xmm4,%xmm3 | |||||
.else #looping version | |||||
paddq ksKey+16*1-F_O(%esi),%xmm0 | |||||
paddq ksKey+16*2-F_O(%esi),%xmm1 | |||||
paddq ksKey+16*3-F_O(%esi),%xmm2 | |||||
paddq ksKey+16*4-F_O(%esi),%xmm3 | |||||
paddq ksTwk+16*1-F_O(%esi),%xmm1 | |||||
paddq ksTwk+16*2-F_O(%esi),%xmm2 | |||||
paddq %xmm4,%xmm3 | |||||
# | |||||
movq ksKey-F_O(%esi),%xmm4 #first, "rotate" key schedule on the stack | |||||
movq ksTwk-F_O(%esi),%xmm5 # (for next time through) | |||||
movq %xmm4,ksKey+16*(WCNT+1)-F_O(%esi) | |||||
movq %xmm5,ksTwk+16*3-F_O(%esi) | |||||
addl $16,%esi #bump rolling pointer | |||||
.endif | |||||
.if _SKEIN_DEBUG | |||||
Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,SAVE_REGS | |||||
.endif | |||||
.endm #R256_FourRounds | |||||
# | |||||
.if _SKEIN_DEBUG # routines for saving/restoring X_stk for debug routines | |||||
# _Put_XMM_256: store the low 64 bits of xmm0..xmm3 into X_stk[0..3]. | |||||
# Reached with a call, so each offset carries an extra 4 bytes to step over | |||||
# the return address that the call pushed below the X_stk base. | |||||
_Put_XMM_256: | |||||
.irp _NN_,0,1,2,3 | |||||
movq %xmm\_NN_,X_stk+4+\_NN_*8(%esp) | |||||
.endr | |||||
ret | |||||
# | |||||
# _Get_XMM_256: reload xmm0..xmm3 from X_stk[0..3] (same 4-byte adjustment). | |||||
_Get_XMM_256: | |||||
.irp _NN_,0,1,2,3 | |||||
movq X_stk+4+\_NN_*8(%esp),%xmm\_NN_ #loop variable written with an explicit backslash, as in _Put_XMM_256 | |||||
.endr | |||||
ret | |||||
.endif | |||||
# | |||||
################# | |||||
# | |||||
# code | |||||
# | |||||
# Skein_256_Process_Block entry point (exported with and without a leading | |||||
# underscore by C_label). Parameters are read from the stack through ebx: | |||||
# ctxPtr, blkPtr, blkCnt, bitAdd. All general registers are saved by pushal | |||||
# in Setup_Stack and restored by popal in Reset_Stack. | |||||
# Register use inside the block loop: | |||||
#   xmm0..xmm3 = working state X[0..3] (low 64 bits of each register) | |||||
#   xmm4..xmm7 = scratch | |||||
#   edi --> context, esi --> input block (later the rolling key pointer) | |||||
#   ecx = blocks remaining, edx = key injection counter | |||||
# NOTE(review): ecx is decremented only after a block has been processed, | |||||
# so blkCnt is presumably required to be nonzero - confirm against callers. | |||||
C_label Skein_256_Process_Block | |||||
WCNT = 4 #WCNT=4 for Skein-256 | |||||
Setup_Stack WCNT,ROUNDS_256 | |||||
# main hash loop for Skein_256 | |||||
Skein_256_block_loop: | |||||
movd bitAdd (%ebx),%xmm4 | |||||
movq TWEAK+0(%edi),%xmm5 | |||||
movq TWEAK+8(%edi),%xmm6 | |||||
paddq %xmm4 ,%xmm5 #bump T0 by the bitAdd parameter | |||||
movq %xmm5,TWEAK(%edi) #save updated tweak value T0 (for next time) | |||||
movapd %xmm6,%xmm7 | |||||
xorpd %xmm5,%xmm7 #compute overall tweak parity | |||||
movdqa %xmm5,ksTwk -F_O(%ebp)#save the expanded tweak schedule on the stack | |||||
movdqa %xmm6,ksTwk+16-F_O(%ebp) | |||||
movdqa %xmm7,ksTwk+32-F_O(%ebp) | |||||
movl blkPtr(%ebx),%esi #esi --> input block | |||||
movl $KW_PARITY_LO,%eax #init key schedule parity accumulator | |||||
movl $KW_PARITY_HI,%edx | |||||
movd %eax ,%xmm4 | |||||
movd %edx ,%xmm0 | |||||
unpcklps %xmm0,%xmm4 #replicate parity dword to 64 bits | |||||
# | |||||
.irp _NN_,0,1,2,3 #copy in the chaining vars | |||||
movq X_VARS+8*\_NN_(%edi),%xmm\_NN_ | |||||
xorpd %xmm\_NN_,%xmm4 #update overall parity | |||||
movdqa %xmm\_NN_,ksKey+16*\_NN_-F_O(%ebp) | |||||
.endr | |||||
movdqa %xmm4,ksKey+16*WCNT-F_O(%ebp)#save overall parity at the end of the array | |||||
# | |||||
paddq %xmm5,%xmm1 #inject the initial tweak words | |||||
paddq %xmm6,%xmm2 | |||||
# | |||||
.irp _NN_,0,1,2,3 #perform the initial key injection | |||||
movq 8*\_NN_(%esi),%xmm4#and save a copy of the input block on stack | |||||
movq %xmm4,8*\_NN_+Wcopy(%esp) | |||||
paddq %xmm4,%xmm\_NN_ #inject the key word | |||||
.endr | |||||
# | |||||
.if _SKEIN_DEBUG #debug dump of state at this point | |||||
Skein_Debug_Block 256 | |||||
Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL,SAVE_REGS | |||||
.endif | |||||
addl $WCNT*8,%esi #skip to the next block | |||||
movl %esi,blkPtr(%ebx) #save the updated block pointer | |||||
# | |||||
# now the key schedule is computed. Start the rounds | |||||
# | |||||
xorl %edx,%edx #edx = iteration count | |||||
.if SKEIN_ASM_UNROLL & 256 | |||||
_UNROLL_CNT = ROUNDS_256/8 #fully unrolled | |||||
.else | |||||
_UNROLL_CNT = SKEIN_UNROLL_256 #partial unroll count | |||||
.if ((ROUNDS_256/8) % _UNROLL_CNT) | |||||
.error "Invalid SKEIN_UNROLL_256" #sanity check | |||||
.endif | |||||
movl %ebp,%esi #use this as "rolling" pointer into ksTwk/ksKey | |||||
Skein_256_round_loop: # (since there's no 16* scaled address mode) | |||||
.endif | |||||
# | |||||
_Rbase_ = 0 | |||||
.rept _UNROLL_CNT*2 # here with X[0..3] in XMM0..XMM3 | |||||
R_256_FourRounds _Rbase_ | |||||
_Rbase_ = _Rbase_+4 | |||||
.endr #rept _UNROLL_CNT*2 | |||||
# | |||||
.if _UNROLL_CNT <> (ROUNDS_256/8) | |||||
cmpl $2*(ROUNDS_256/8),%edx | |||||
jb Skein_256_round_loop | |||||
.endif | |||||
#---------------------------- | |||||
# feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3} | |||||
.irp _NN_,0,1,2,3 | |||||
movq Wcopy+8*\_NN_(%esp),%xmm4 | |||||
xorpd %xmm4,%xmm\_NN_ | |||||
movq %xmm\_NN_,X_VARS+8*\_NN_(%edi) | |||||
.endr | |||||
andb $FIRST_MASK8,TWEAK +15(%edi) | |||||
.if _SKEIN_DEBUG | |||||
Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,SAVE_REGS | |||||
.endif | |||||
# go back for more blocks, if needed | |||||
decl %ecx | |||||
jnz Skein_256_block_loop | |||||
Reset_Stack _Skein_256_Process_Block | |||||
ret | |||||
# | |||||
# Optional helpers, assembled only when _SKEIN_CODE_SIZE is defined. | |||||
.ifdef _SKEIN_CODE_SIZE | |||||
# returns in eax the byte distance from the start of | |||||
# Skein_256_Process_Block to this label | |||||
C_label Skein_256_Process_Block_CodeSize | |||||
movl $_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block,%eax | |||||
ret | |||||
# | |||||
# returns in eax the unroll count used for Skein-256, or 0 when the round | |||||
# loop was fully unrolled | |||||
C_label Skein_256_Unroll_Cnt | |||||
.if _UNROLL_CNT <> ROUNDS_256/8 | |||||
movl $_UNROLL_CNT,%eax | |||||
.else | |||||
xorl %eax,%eax | |||||
.endif | |||||
ret | |||||
.endif | |||||
.endif #_USE_ASM_ & 256 | |||||
# | |||||
#---------------------------------------------------------------- | |||||
# | |||||
.if _USE_ASM_ & 512 | |||||
# | |||||
# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# | |||||
# | |||||
################# | |||||
# MACRO: one round | |||||
# | |||||
# R_512_Round _RR_, a0,a1,Ra, b0,b1,Rb, c0,c1,Rc, d0,d1,Rd | |||||
#   _RR_ = round number, used to select the row of RC_512 constants | |||||
#   each triple = sum register number, rotated register number, RC column | |||||
# Emits four mixes of the form: sum += rot, rot = (rot rotated left) xor sum. | |||||
# All eight XMM registers hold state, so scratch space is made by spilling | |||||
# to X_stk: register c0 is stored and used as scratch for the first mix, | |||||
# register a0 is stored and used as scratch for the other three. Each is | |||||
# reloaded before its state value is needed again. | |||||
.macro R_512_Round _RR_, a0,a1,Ra, b0,b1,Rb, c0,c1,Rc, d0,d1,Rd | |||||
.irp _qq_,%((\_RR_) && 7) | |||||
_Ra_ = RC_512_\_qq_&&_\Ra | |||||
_Rb_ = RC_512_\_qq_&&_\Rb | |||||
_Rc_ = RC_512_\_qq_&&_\Rc | |||||
_Rd_ = RC_512_\_qq_&&_\Rd | |||||
.endr | |||||
# mix a (scratch = register c0) | |||||
paddq %xmm\a1 , %xmm\a0 | |||||
_stX c0 | |||||
movq %xmm\a1 , %xmm\c0 | |||||
psllq $ _Ra_ , %xmm\a1 | |||||
psrlq $64-_Ra_ , %xmm\c0 | |||||
xorpd %xmm\c0 , %xmm\a1 | |||||
xorpd %xmm\a0 , %xmm\a1 | |||||
# mix b (scratch = register a0 from here on) | |||||
paddq %xmm\b1 , %xmm\b0 | |||||
_stX a0 | |||||
movq %xmm\b1 , %xmm\a0 | |||||
psllq $ _Rb_ , %xmm\b1 | |||||
psrlq $64-_Rb_ , %xmm\a0 | |||||
xorpd %xmm\b0 , %xmm\b1 | |||||
_ldX c0 | |||||
xorpd %xmm\a0 , %xmm\b1 | |||||
# mix c | |||||
paddq %xmm\c1 , %xmm\c0 | |||||
movq %xmm\c1 , %xmm\a0 | |||||
psllq $ _Rc_ , %xmm\c1 | |||||
psrlq $64-_Rc_ , %xmm\a0 | |||||
xorpd %xmm\c0 , %xmm\c1 | |||||
xorpd %xmm\a0 , %xmm\c1 | |||||
# mix d | |||||
paddq %xmm\d1 , %xmm\d0 | |||||
movq %xmm\d1 , %xmm\a0 | |||||
psllq $ _Rd_ , %xmm\d1 | |||||
psrlq $64-_Rd_ , %xmm\a0 | |||||
xorpd %xmm\a0 , %xmm\d1 | |||||
_ldX a0 | |||||
xorpd %xmm\d0 , %xmm\d1 | |||||
.if _SKEIN_DEBUG | |||||
Skein_Debug_Round 512,%(_RR_+1),SAVE_REGS | |||||
.endif | |||||
.endm | |||||
# | |||||
# MACRO: four rounds | |||||
# R_512_FourRounds _RN_ | |||||
#   _RN_ = number of the first of the four rounds | |||||
# Emits four rounds, each with its own register pairing, followed by one | |||||
# key injection into xmm0..xmm7. xmm0 is spilled to X_stk while it carries | |||||
# the incremented counter from edx (added to xmm7) and, in the looping | |||||
# build, while it moves the slot-0 key and tweak words past the end of | |||||
# their arrays; it is reloaded afterwards. | |||||
# Fully unrolled build addresses slots from ebp, looping build from esi. | |||||
.macro R_512_FourRounds _RN_ | |||||
R_512_Round %((_RN_) ), 0,1,0, 2,3,1, 4,5,2, 6,7,3 | |||||
R_512_Round %((_RN_)+1), 2,1,0, 4,7,1, 6,5,2, 0,3,3 | |||||
R_512_Round %((_RN_)+2), 4,1,0, 6,3,1, 0,5,2, 2,7,3 | |||||
R_512_Round %((_RN_)+3), 6,1,0, 0,7,1, 2,5,2, 4,3,3 | |||||
#inject key schedule | |||||
.irp _NN_,0,1,2,3,4,5,6,7 | |||||
.if _UNROLL_CNT == (ROUNDS_512/8) | |||||
paddq ksKey+16*((((\_RN_)/4)+(\_NN_)+1)%9)-F_O(%ebp),%xmm\_NN_ | |||||
.else | |||||
paddq ksKey+16*((\_NN_)+1)-F_O(%esi),%xmm\_NN_ | |||||
.endif | |||||
.endr | |||||
_stX 0 #free up a register | |||||
incl %edx #bump round counter | |||||
movd %edx,%xmm0 #inject the tweak | |||||
.if _UNROLL_CNT == (ROUNDS_512/8) | |||||
paddq ksTwk+16*(((_RN_)+1) % 3)-F_O(%ebp),%xmm5 | |||||
paddq ksTwk+16*(((_RN_)+2) % 3)-F_O(%ebp),%xmm6 | |||||
paddq %xmm0 ,%xmm7 | |||||
.else #looping version | |||||
paddq ksTwk+16*1-F_O(%esi),%xmm5 | |||||
paddq ksTwk+16*2-F_O(%esi),%xmm6 | |||||
paddq %xmm0 ,%xmm7 | |||||
# "rotate" key schedule on the stack (for next time through) | |||||
movq ksKey -F_O(%esi),%xmm0 | |||||
movq %xmm0,ksKey+16*(WCNT+1)-F_O(%esi) | |||||
movq ksTwk -F_O(%esi),%xmm0 | |||||
movq %xmm0,ksTwk+16*3 -F_O(%esi) | |||||
addl $16,%esi #bump rolling pointer | |||||
.endif | |||||
_ldX 0 #restore X0 | |||||
.if _SKEIN_DEBUG | |||||
Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT,SAVE_REGS | |||||
.endif | |||||
.endm #R_512_FourRounds | |||||
################# | |||||
.if _SKEIN_DEBUG # routines for saving/restoring X_stk for debug routines | |||||
# _Put_XMM_512: store the low 64 bits of xmm0..xmm7 into X_stk[0..7]. | |||||
# Reached with a call, so each offset carries an extra 4 bytes to step over | |||||
# the return address that the call pushed below the X_stk base. | |||||
_Put_XMM_512: | |||||
.irp _NN_,0,1,2,3,4,5,6,7 | |||||
movq %xmm\_NN_,X_stk+4+\_NN_*8(%esp) | |||||
.endr | |||||
ret | |||||
# | |||||
# _Get_XMM_512: reload xmm0..xmm7 from X_stk[0..7] (same 4-byte adjustment). | |||||
_Get_XMM_512: | |||||
.irp _NN_,0,1,2,3,4,5,6,7 | |||||
movq X_stk+4+\_NN_*8(%esp),%xmm\_NN_ | |||||
.endr | |||||
ret | |||||
.endif | |||||
# | |||||
################# | |||||
# | |||||
# Skein_512_Process_Block entry point (exported with and without a leading | |||||
# underscore by C_label). Parameters are read from the stack through ebx: | |||||
# ctxPtr, blkPtr, blkCnt, bitAdd. All general registers are saved by pushal | |||||
# in Setup_Stack and restored by popal in Reset_Stack. | |||||
# Register use inside the block loop: | |||||
#   xmm0..xmm7 = working state X[0..7] once the initial injection is done | |||||
#                (the round macros spill to X_stk when they need scratch) | |||||
#   edi --> context, esi --> input block (later the rolling key pointer) | |||||
#   ecx = blocks remaining, edx = key injection counter | |||||
# NOTE(review): ecx is decremented only after a block has been processed, | |||||
# so blkCnt is presumably required to be nonzero - confirm against callers. | |||||
C_label Skein_512_Process_Block | |||||
WCNT = 8 #WCNT=8 for Skein-512 | |||||
Setup_Stack WCNT,ROUNDS_512 | |||||
# main hash loop for Skein_512 | |||||
Skein_512_block_loop: | |||||
movd bitAdd(%ebx) ,%xmm0 | |||||
movq TWEAK+0(%edi),%xmm1 | |||||
movq TWEAK+8(%edi),%xmm2 | |||||
paddq %xmm0,%xmm1 #bump T0 by the bitAdd parameter | |||||
movq %xmm1,TWEAK(%edi) #save updated tweak value T0 (for next time) | |||||
movq %xmm2,%xmm0 | |||||
xorpd %xmm1,%xmm0 #compute overall tweak parity | |||||
movdqa %xmm1,ksTwk -F_O(%ebp)#save the expanded tweak schedule on the stack | |||||
movdqa %xmm2,ksTwk+16*1-F_O(%ebp) | |||||
movdqa %xmm0,ksTwk+16*2-F_O(%ebp) | |||||
movl blkPtr(%ebx),%esi #esi --> input block | |||||
movl $KW_PARITY_LO,%eax #init key schedule parity accumulator | |||||
movl $KW_PARITY_HI,%edx | |||||
movd %eax ,%xmm0 | |||||
movd %edx ,%xmm7 | |||||
unpcklps %xmm7,%xmm0 #replicate parity dword to 64 bits | |||||
# | |||||
.irp _NN_,7,6,5,4,3,2,1 #copy in the chaining vars (skip #0 for now) | |||||
movq X_VARS+8*\_NN_(%edi),%xmm\_NN_ | |||||
xorpd %xmm\_NN_,%xmm0 #update overall parity | |||||
movdqa %xmm\_NN_,ksKey+16*\_NN_-F_O(%ebp) | |||||
.if \_NN_ == 5 | |||||
paddq %xmm1,%xmm5 #inject the initial tweak words | |||||
paddq %xmm2,%xmm6 # (before they get trashed in %xmm1/2) | |||||
.endif | |||||
.endr | |||||
movq X_VARS(%edi),%xmm4 #handle #0 now | |||||
xorpd %xmm4,%xmm0 #update overall parity | |||||
movdqa %xmm4,ksKey+16* 0 -F_O(%ebp) #save the key value in slot #0 | |||||
movdqa %xmm0,ksKey+16*WCNT-F_O(%ebp) #save overall parity at the end of the array | |||||
# | |||||
movq %xmm4,%xmm0 | |||||
# xmm4 is the scratch register of this loop, so word #4 is left out here | |||||
# and injected separately right after the loop | |||||
.irp _NN_,7,6,5,3,2,1,0 #perform the initial key injection (except #4) | |||||
movq 8*\_NN_(%esi),%xmm4 #and save a copy of the input block on stack | |||||
movq %xmm4,8*\_NN_+Wcopy(%esp) | |||||
paddq %xmm4,%xmm\_NN_ | |||||
.endr | |||||
movq 8*4(%esi),%xmm4 #get input block word #4 | |||||
movq %xmm4,8*4+Wcopy(%esp) | |||||
paddq ksKey+16*4-F_O(%ebp),%xmm4#inject the initial key | |||||
# | |||||
.if _SKEIN_DEBUG #debug dump of state at this point | |||||
Skein_Debug_Block 512 | |||||
Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL,SAVE_REGS | |||||
.endif | |||||
addl $WCNT*8,%esi #skip to the next block | |||||
movl %esi,blkPtr(%ebx) #save the updated block pointer | |||||
# | |||||
# now the key schedule is computed. Start the rounds | |||||
# | |||||
xorl %edx,%edx #edx = round counter | |||||
.if SKEIN_ASM_UNROLL & 512 | |||||
_UNROLL_CNT = ROUNDS_512/8 | |||||
.else | |||||
_UNROLL_CNT = SKEIN_UNROLL_512 | |||||
.if ((ROUNDS_512/8) % _UNROLL_CNT) | |||||
.error "Invalid SKEIN_UNROLL_512" | |||||
.endif | |||||
movl %ebp,%esi #use this as "rolling" pointer into ksTwk/ksKey | |||||
Skein_512_round_loop: # (since there's no 16* scaled address mode) | |||||
.endif | |||||
_Rbase_ = 0 | |||||
.rept _UNROLL_CNT*2 | |||||
R_512_FourRounds %_Rbase_ | |||||
_Rbase_ = _Rbase_+4 | |||||
.endr #rept _UNROLL_CNT | |||||
# | |||||
.if (SKEIN_ASM_UNROLL & 512) == 0 | |||||
cmpl $2*(ROUNDS_512/8),%edx | |||||
jb Skein_512_round_loop | |||||
.endif | |||||
#---------------------------- | |||||
# feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7} | |||||
andb $FIRST_MASK8,TWEAK +15(%edi) | |||||
.irp _NN_,0,2,4,6 #do the aligned ones first | |||||
xorpd Wcopy+8*\_NN_(%esp),%xmm\_NN_ | |||||
movq %xmm\_NN_,X_VARS+8*\_NN_(%edi) | |||||
.endr | |||||
.irp _NN_,1,3,5,7 #now we have some register space available | |||||
movq Wcopy+8*\_NN_(%esp),%xmm0 | |||||
xorpd %xmm0,%xmm&\_NN_ | |||||
movq %xmm&\_NN_,X_VARS+8*\_NN_(%edi) | |||||
.endr | |||||
.if _SKEIN_DEBUG | |||||
Skein_Debug_Round 512,SKEIN_RND_FEED_FWD | |||||
.endif | |||||
# go back for more blocks, if needed | |||||
decl %ecx | |||||
jnz Skein_512_block_loop | |||||
Reset_Stack _Skein_512_Process_Block | |||||
ret | |||||
# | |||||
# Optional helpers, assembled only when _SKEIN_CODE_SIZE is defined. | |||||
.ifdef _SKEIN_CODE_SIZE | |||||
# returns in eax the byte distance from the start of | |||||
# Skein_512_Process_Block to this label | |||||
C_label Skein_512_Process_Block_CodeSize | |||||
movl $(_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block),%eax | |||||
ret | |||||
# | |||||
# returns in eax the unroll count used for Skein-512, or 0 when the round | |||||
# loop was fully unrolled | |||||
C_label Skein_512_Unroll_Cnt | |||||
.if _UNROLL_CNT <> ROUNDS_512/8 | |||||
movl $_UNROLL_CNT,%eax | |||||
.else | |||||
xorl %eax,%eax | |||||
.endif | |||||
ret | |||||
.endif | |||||
# | |||||
.endif # _USE_ASM_ & 512 | |||||
# | |||||
#---------------------------------------------------------------- | |||||
# | |||||
.if _USE_ASM_ & 1024 | |||||
.global _Skein1024_Process_Block | |||||
# | |||||
# void Skein_1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# | |||||
# | |||||
# State words with an index below R_1024_REGS stay in the XMM register of | |||||
# the same number; the remaining words live in X_stk and are loaded into | |||||
# xmm5 or xmm6 by MixStep_1024 when needed. | |||||
R_1024_REGS = (5) #keep this many block variables in registers | |||||
# | |||||
################ | |||||
.if _SKEIN_DEBUG # routines for saving/restoring X_stk for debug routines | |||||
# _Put_XMM_1024: store the low 64 bits of the first R_1024_REGS XMM | |||||
# registers into the matching X_stk slots. Here _NN_ is an ordinary symbol | |||||
# stepped by the .rept loop; the inner .irp turns its value into the | |||||
# register number. The extra 4 in each offset steps over the return address | |||||
# pushed by the call that reaches this routine. | |||||
_Put_XMM_1024: | |||||
_NN_ = 0 | |||||
.rept R_1024_REGS | |||||
.irp _rr_,%(_NN_) | |||||
movq %xmm\_rr_,X_stk+4+8*_NN_(%esp) | |||||
.endr | |||||
_NN_ = _NN_+1 | |||||
.endr | |||||
ret | |||||
# | |||||
# _Get_XMM_1024: reload the same registers from X_stk. | |||||
_Get_XMM_1024: | |||||
_NN_ = 0 | |||||
.rept R_1024_REGS | |||||
.irp _rr_,%(_NN_) | |||||
movq X_stk+4+8*_NN_(%esp),%xmm\_rr_ | |||||
.endr | |||||
_NN_ = _NN_+1 | |||||
.endr | |||||
ret | |||||
.endif | |||||
# | |||||
################# | |||||
# MACRO: one mix step | |||||
# MixStep_1024 x0,x1,rotIdx0,rotIdx1,_debug_=0 | |||||
#   x0,x1   = state word indices (0..15) of the pair to mix | |||||
#   rotIdx0 = round number, used to select the row of RC_1024 constants | |||||
#   rotIdx1 = column of the RC_1024 constant within that row | |||||
#   _debug_ = nonzero to emit a debug dump after the mix (debug builds only) | |||||
# Emits one mix: x0 += x1, then x1 = (x1 rotated left) xor x0. | |||||
# A word whose index is below R_1024_REGS is used in place in the XMM | |||||
# register of the same number. Any other word is loaded from X_stk into | |||||
# xmm5 (for x0) or xmm6 (for x1) and stored back afterwards. | |||||
# xmm7 is always used as scratch. | |||||
.macro MixStep_1024 x0,x1,rotIdx0,rotIdx1,_debug_=0 | |||||
_r0_ = \x0 #default, if already loaded | |||||
_r1_ = \x1 | |||||
# load the regs (if necessary) | |||||
.if (\x0 >= R_1024_REGS) | |||||
_r0_ = 5 | |||||
movq X_stk+8*(\x0)(%esp),%xmm5 | |||||
.endif | |||||
.if (\x1 >= R_1024_REGS) | |||||
_r1_ = 6 | |||||
movq X_stk+8*(\x1)(%esp),%xmm6 | |||||
.endif | |||||
# do the mix | |||||
.irp _rx_,%((rotIdx0) && 7) | |||||
_Rc_ = RC_1024_\_rx_&&_\rotIdx1 #rotation constant | |||||
.endr | |||||
.irp _x0_,%_r0_ | |||||
.irp _x1_,%_r1_ | |||||
paddq %xmm\_x1_,%xmm\_x0_ | |||||
movq %xmm\_x1_,%xmm7 | |||||
psllq $ _Rc_ ,%xmm\_x1_ | |||||
psrlq $64-_Rc_ ,%xmm7 | |||||
xorpd %xmm\_x0_,%xmm\_x1_ | |||||
xorpd %xmm7 ,%xmm\_x1_ | |||||
.endr | |||||
.endr | |||||
# save the regs (if necessary) | |||||
.if (\x0 >= R_1024_REGS) | |||||
movq %xmm5,X_stk+8*(\x0)(%esp) | |||||
.endif | |||||
.if (\x1 >= R_1024_REGS) | |||||
movq %xmm6,X_stk+8*(\x1)(%esp) | |||||
.endif | |||||
# debug output | |||||
# the round argument is spelled exactly as in the parameter list so that it is substituted | |||||
.if _SKEIN_DEBUG && (\_debug_) | |||||
Skein_Debug_Round 1024,%((\rotIdx0)+1),SAVE_REGS | |||||
.endif | |||||
.endm | |||||
################# | |||||
# MACRO: four rounds | |||||
# | |||||
# R_1024_FourRounds _RR_ | |||||
#   _RR_ = number of the first of the four rounds | |||||
# Emits four rounds of eight mixes each (the last mix of every round asks | |||||
# for a debug dump), then one key injection over all sixteen state words. | |||||
# edx is incremented and its new value is added to word 15; words 13 and | |||||
# 14 also receive a tweak word. Words below R_1024_REGS are updated in | |||||
# their XMM registers, the others through xmm6 and X_stk. | |||||
# Looping build: slots are addressed from the rolling pointer esi, and the | |||||
# slot-0 key and tweak words are afterwards copied past the end of their | |||||
# arrays before esi advances by one 16-byte slot. | |||||
# Fully unrolled build: slots are addressed from ebp, with indices taken | |||||
# from the assembly-time symbol _Rbase_ rather than from the argument. | |||||
# Uses xmm5, xmm6 and xmm7 as scratch. | |||||
.macro R_1024_FourRounds _RR_ | |||||
#--------- round _RR_ | |||||
MixStep_1024 0, 1,%((\_RR_)+0),0 | |||||
MixStep_1024 2, 3,%((\_RR_)+0),1 | |||||
MixStep_1024 4, 5,%((\_RR_)+0),2 | |||||
MixStep_1024 6, 7,%((\_RR_)+0),3 | |||||
MixStep_1024 8, 9,%((\_RR_)+0),4 | |||||
MixStep_1024 10,11,%((\_RR_)+0),5 | |||||
MixStep_1024 12,13,%((\_RR_)+0),6 | |||||
MixStep_1024 14,15,%((\_RR_)+0),7,1 | |||||
#--------- round _RR_+1 | |||||
MixStep_1024 0, 9,%((\_RR_)+1),0 | |||||
MixStep_1024 2,13,%((\_RR_)+1),1 | |||||
MixStep_1024 6,11,%((\_RR_)+1),2 | |||||
MixStep_1024 4,15,%((\_RR_)+1),3 | |||||
MixStep_1024 10, 7,%((\_RR_)+1),4 | |||||
MixStep_1024 12, 3,%((\_RR_)+1),5 | |||||
MixStep_1024 14, 5,%((\_RR_)+1),6 | |||||
MixStep_1024 8, 1,%((\_RR_)+1),7,1 | |||||
#--------- round _RR_+2 | |||||
MixStep_1024 0, 7,%((\_RR_)+2),0 | |||||
MixStep_1024 2, 5,%((\_RR_)+2),1 | |||||
MixStep_1024 4, 3,%((\_RR_)+2),2 | |||||
MixStep_1024 6, 1,%((\_RR_)+2),3 | |||||
MixStep_1024 12,15,%((\_RR_)+2),4 | |||||
MixStep_1024 14,13,%((\_RR_)+2),5 | |||||
MixStep_1024 8,11,%((\_RR_)+2),6 | |||||
MixStep_1024 10, 9,%((\_RR_)+2),7,1 | |||||
#--------- round _RR_+3 | |||||
MixStep_1024 0,15,%((\_RR_)+3),0 | |||||
MixStep_1024 2,11,%((\_RR_)+3),1 | |||||
MixStep_1024 6,13,%((\_RR_)+3),2 | |||||
MixStep_1024 4, 9,%((\_RR_)+3),3 | |||||
MixStep_1024 14, 1,%((\_RR_)+3),4 | |||||
MixStep_1024 8, 5,%((\_RR_)+3),5 | |||||
MixStep_1024 10, 3,%((\_RR_)+3),6 | |||||
MixStep_1024 12, 7,%((\_RR_)+3),7,1 | |||||
incl %edx #edx = round number | |||||
movd %edx,%xmm7 | |||||
#inject the key | |||||
.irp _NN_,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 | |||||
.if _UNROLL_CNT <> (ROUNDS_1024/8) | |||||
.if \_NN_ < R_1024_REGS | |||||
paddq ksKey+16*\_NN_+16-F_O(%esi),%xmm&\_NN_ | |||||
.else | |||||
movq X_stk+ 8*\_NN_(%esp),%xmm6 | |||||
.if \_NN_ == 15 | |||||
paddq %xmm7,%xmm6 | |||||
.elseif \_NN_ == 14 | |||||
paddq ksTwk+16*2-F_O(%esi),%xmm6 | |||||
.elseif \_NN_ == 13 | |||||
paddq ksTwk+16*1-F_O(%esi),%xmm6 | |||||
.endif | |||||
paddq ksKey+16*\_NN_+16-F_O(%esi),%xmm6 | |||||
movq %xmm6,X_stk+ 8*\_NN_(%esp) | |||||
.endif | |||||
.else | |||||
.if \_NN_ < R_1024_REGS | |||||
paddq ksKey+16*(((_Rbase_/4)+(\_NN_)+1) % 17)-F_O(%ebp),%xmm&\_NN_ | |||||
.else | |||||
movq X_stk+ 8*\_NN_(%esp), %xmm6 | |||||
paddq ksKey+16*(((_Rbase_/4)+(\_NN_)+1) % 17)-F_O(%ebp),%xmm6 | |||||
.if \_NN_ == 15 | |||||
paddq %xmm7,%xmm6 | |||||
.elseif \_NN_ == 14 | |||||
paddq ksTwk+16*(((_Rbase_/4)+2) % 3)-F_O(%ebp),%xmm6 | |||||
.elseif \_NN_ == 13 | |||||
paddq ksTwk+16*(((_Rbase_/4)+1) % 3)-F_O(%ebp),%xmm6 | |||||
.endif | |||||
movq %xmm6,X_stk+ 8*\_NN_(%esp) | |||||
.endif | |||||
.endif | |||||
.endr | |||||
.if _UNROLL_CNT <> (ROUNDS_1024/8) #rotate the key schedule on the stack | |||||
movq ksKey-F_O(%esi), %xmm6 | |||||
movq ksTwk-F_O(%esi), %xmm7 | |||||
movq %xmm6,ksKey+16*(WCNT+1)-F_O(%esi) | |||||
movq %xmm7,ksTwk+16* 3 -F_O(%esi) | |||||
addl $16,%esi #bump rolling pointer | |||||
.endif | |||||
.if _SKEIN_DEBUG | |||||
Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,SAVE_REGS | |||||
.endif | |||||
.endm #R_1024_FourRounds | |||||
# | |||||
################ | |||||
# Skein1024_Process_Block: Skein-1024 block function (32-bit x86, XMM registers) | |||||
# | |||||
# Register usage, as visible in this routine: | |||||
#   edi = context pointer biased by +0x80 (TWEAK and X_VARS are addressed as ...-0x80(%edi)) | |||||
#   ebx = base for the bitAdd and blkPtr fields (presumably the argument frame set | |||||
#         up by Setup_Stack, which is outside this excerpt -- confirm there) | |||||
#   ecx = number of blocks still to process (decremented once per block) | |||||
#   ebp = base of the ksTwk/ksKey schedule (all offsets carry -F_O) | |||||
#   esi = input block pointer, later the rolling key schedule pointer (looping builds) | |||||
#   edx = number of four-round groups / key injections completed for this block | |||||
#   eax = esp+0x80, so that Wcopy and X_stk offsets stay short | |||||
#   xmm0..xmm(R_1024_REGS-1) hold X[0..R_1024_REGS-1]; the remaining state words | |||||
#         live in X_stk and pass through scratch registers | |||||
# | |||||
# Per block: bump tweak T0 by bitAdd, build the tweak and key schedules (including | |||||
# the parity words), add key and tweak to the input words, run the rounds, then | |||||
# feed forward (ctx->X[i] = X[i] ^ w[i]) into the context. | |||||
# | |||||
C_label Skein1024_Process_Block | |||||
# | |||||
WCNT = 16 #WCNT=16 for Skein-1024 | |||||
Setup_Stack WCNT,ROUNDS_1024 | |||||
addl $0x80,%edi #bias the edi ctxt offsets to keep them all short | |||||
# main hash loop for Skein1024 | |||||
Skein1024_block_loop: | |||||
movd bitAdd(%ebx) ,%xmm0 | |||||
movq TWEAK+0-0x80(%edi),%xmm1 | |||||
movq TWEAK+8-0x80(%edi),%xmm2 | |||||
paddq %xmm0,%xmm1 #bump T0 by the bitAdd parameter | |||||
movq %xmm1,TWEAK-0x80(%edi) #save updated tweak value T0 (for next time) | |||||
movq %xmm2,%xmm0 | |||||
xorpd %xmm1,%xmm0 #compute overall tweak parity | |||||
movdqa %xmm1,ksTwk -F_O(%ebp)#save the expanded tweak schedule on the stack | |||||
movdqa %xmm2,ksTwk+16-F_O(%ebp) | |||||
movdqa %xmm0,ksTwk+32-F_O(%ebp) | |||||
movl blkPtr(%ebx),%esi #esi --> input block | |||||
movl $KW_PARITY_LO,%eax #init key schedule parity accumulator | |||||
movl $KW_PARITY_HI,%edx | |||||
movd %eax ,%xmm7 | |||||
movd %edx ,%xmm6 | |||||
unpcklps %xmm6,%xmm7 #combine the LO and HI dwords into one 64-bit parity constant | |||||
# | |||||
leal 0x80(%esp),%eax #use short offsets for Wcopy, X_stk writes below | |||||
# for each word: fold the key word into the parity (xmm7), store it in ksKey, | |||||
# copy the input word to Wcopy, and add key (plus tweak for words 13 and 14). | |||||
# words below R_1024_REGS stay in their own xmm register; the rest all go | |||||
# through register number R_1024_REGS and are written to X_stk. | |||||
.irp _NN_,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 | |||||
movq X_VARS+8*\_NN_-0x80(%edi),%xmm6 | |||||
xorpd %xmm6,%xmm7 #update overall parity | |||||
movdqa %xmm6,ksKey+16*\_NN_-F_O(%ebp) #save the key schedule on the stack | |||||
.if \_NN_ < R_1024_REGS | |||||
_rr_ = \_NN_ | |||||
.else | |||||
_rr_ = R_1024_REGS | |||||
.endif | |||||
.irp _rn_,%(_rr_) | |||||
movq 8*\_NN_(%esi),%xmm\_rn_ #load one word of the input block | |||||
movq %xmm\_rn_,Wcopy+8*\_NN_-0x80(%eax) #keep a copy on the stack (for feedforward later) | |||||
paddq %xmm6,%xmm\_rn_ #inject the key into the block | |||||
.if \_NN_ == 13 | |||||
paddq %xmm1,%xmm\_rn_ #inject the initial tweak words | |||||
.elseif \_NN_ == 14 | |||||
paddq %xmm2,%xmm\_rn_ | |||||
.endif | |||||
.if \_NN_ >= R_1024_REGS #only save X[5..15] on stack, leave X[0..4] in regs | |||||
movq %xmm\_rn_,X_stk+8*\_NN_-0x80(%eax) | |||||
.endif | |||||
.endr | |||||
.endr | |||||
movdqa %xmm7,ksKey+16*WCNT-F_O(%ebp) #save overall key parity at the end of the array | |||||
# | |||||
.if _SKEIN_DEBUG #debug dump of state at this point | |||||
Skein_Debug_Block 1024 | |||||
Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL,SAVE_REGS | |||||
.endif | |||||
addl $WCNT*8,%esi #skip to the next block | |||||
movl %esi,blkPtr(%ebx) #save the updated block pointer | |||||
# | |||||
# now the key schedule is computed. Start the rounds | |||||
# | |||||
xorl %edx,%edx #edx = round counter | |||||
.if SKEIN_ASM_UNROLL & 1024 | |||||
_UNROLL_CNT = ROUNDS_1024/8 | |||||
.else | |||||
_UNROLL_CNT = SKEIN_UNROLL_1024 | |||||
.if ((ROUNDS_1024/8) % _UNROLL_CNT) | |||||
.error "Invalid SKEIN_UNROLL_1024" | |||||
.endif | |||||
movl %ebp,%esi #use this as "rolling" pointer into ksTwk/ksKey | |||||
Skein_1024_round_loop: | |||||
.endif | |||||
# | |||||
# each R_1024_FourRounds covers four rounds and increments edx once, so one pass | |||||
# over this .rept covers _UNROLL_CNT*8 rounds | |||||
_Rbase_ = 0 | |||||
.rept _UNROLL_CNT*2 | |||||
R_1024_FourRounds %_Rbase_ | |||||
_Rbase_ = _Rbase_+4 | |||||
.endr #rept _UNROLL_CNT*2 | |||||
# | |||||
.if (SKEIN_ASM_UNROLL & 1024) == 0 | |||||
cmp $2*(ROUNDS_1024/8),%edx #all four-round groups done? | |||||
jb Skein_1024_round_loop | |||||
.endif | |||||
andb $FIRST_MASK8,TWEAK +15-0x80(%edi) #clear tweak bit for next time thru | |||||
#---------------------------- | |||||
# feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15} | |||||
# | |||||
# A 128-bit xorpd memory operand must be 16-byte aligned. Wcopy word 0 is already | |||||
# used that way, so even words (offset 8*NN, a multiple of 16) can be XORed | |||||
# straight from memory, while odd words must be loaded with movq first. Only | |||||
# the low 64 bits of each result are stored. | |||||
# The odd test is written (NN) % 2: the former "NN && 1" is a logical AND in | |||||
# GNU as, i.e. "NN != 0", which sent every even word except 0 down the slower | |||||
# movq path. | |||||
leal 0x80(%esp),%eax #allow short offsets to X_stk and Wcopy | |||||
.irp _NN_,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 | |||||
.if \_NN_ < R_1024_REGS | |||||
.if (\_NN_) % 2 #X already in regs: no load needed; odd word of Wcopy | |||||
movq Wcopy+ 8*\_NN_-0x80(%eax),%xmm7 #unaligned | |||||
xorpd %xmm7,%xmm\_NN_ | |||||
.else | |||||
xorpd Wcopy+ 8*\_NN_-0x80(%eax),%xmm\_NN_ #aligned | |||||
.endif | |||||
movq %xmm\_NN_,X_VARS+8*\_NN_-0x80(%edi) | |||||
.else | |||||
movq X_stk+8*\_NN_-0x80(%eax),%xmm7 #load X value from stack | |||||
.if (\_NN_) % 2 #odd word of Wcopy | |||||
movq Wcopy+8*\_NN_-0x80(%eax),%xmm6 #unaligned | |||||
xorpd %xmm6,%xmm7 | |||||
.else | |||||
xorpd Wcopy+8*\_NN_-0x80(%eax),%xmm7 #aligned | |||||
.endif | |||||
movq %xmm7,X_VARS+8*\_NN_-0x80(%edi) | |||||
.endif | |||||
.endr | |||||
.if _SKEIN_DEBUG | |||||
Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD #no need to save regs on stack here | |||||
.endif | |||||
# go back for more blocks, if needed | |||||
decl %ecx | |||||
jnz Skein1024_block_loop | |||||
Reset_Stack _Skein1024_Process_Block | |||||
ret | |||||
# | |||||
.ifdef _SKEIN_CODE_SIZE | |||||
# size query: eax = distance from the _Skein1024_Process_Block label to this | |||||
# routine's own label | |||||
C_label Skein1024_Process_Block_CodeSize | |||||
movl $(_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block),%eax | |||||
ret | |||||
# | |||||
# unroll query: eax = 0 when the rounds are fully unrolled, else _UNROLL_CNT | |||||
C_label Skein1024_Unroll_Cnt | |||||
.if _UNROLL_CNT == (ROUNDS_1024/8) | |||||
xorl %eax,%eax | |||||
.else | |||||
movl $_UNROLL_CNT,%eax | |||||
.endif | |||||
ret | |||||
.endif | |||||
# | |||||
.endif # _USE_ASM_ & 1024 | |||||
#---------------------------------------------------------------- | |||||
.end |
Identical to skein_block_xmm32.asm?