Index: lib/libcrypt/Makefile =================================================================== --- lib/libcrypt/Makefile +++ lib/libcrypt/Makefile @@ -10,16 +10,25 @@ SHLIB_MAJOR= 5 LIB= crypt -.PATH: ${.CURDIR}/../libmd ${.CURDIR}/../../sys/crypto/sha2 +.PATH: ${.CURDIR}/../libmd ${.CURDIR}/../../sys/crypto/sha2 ${.CURDIR}/../../sys/crypto/skein SRCS= crypt.c misc.c \ crypt-md5.c md5c.c \ crypt-nthash.c md4c.c \ crypt-sha256.c sha256c.c \ - crypt-sha512.c sha512c.c + crypt-sha512.c sha512c.c \ + skein.c skein_block.c MAN= crypt.3 MLINKS= crypt.3 crypt_get_format.3 crypt.3 crypt_set_format.3 CFLAGS+= -I${.CURDIR}/../libmd -I${.CURDIR}/../libutil \ - -I${.CURDIR}/../../sys/crypto/sha2 + -I${.CURDIR}/../../sys/crypto/sha2 -I${.CURDIR}/../../sys/crypto/skein + +# Use assembly optimized skein if available +.if exists(${MACHINE_ARCH}/skein_block_asm.s) +.PATH: ${.CURDIR}/../../sys/crypto/skein/${MACHINE_ARCH} +SRCS += skein_block_asm.s +CFLAGS += -DSKEIN_ASM -DSKEIN_USE_ASM=1792 # list of block functions to replace with assembly: 256+512+1024 = 1792 +ACFLAGS += -DELF -Wa,--noexecstack +.endif # Pull in the strong crypto, if it is present. 
.if exists(${.CURDIR}/../../secure/lib/libcrypt) && ${MK_CRYPT} != "no" @@ -32,7 +41,10 @@ MD5Init MD5Final MD5Update MD5Pad \ SHA256_Init SHA256_Final SHA256_Update \ SHA384_Init SHA384_Final SHA384_Update \ - SHA512_Init SHA512_Final SHA512_Update + SHA512_Init SHA512_Final SHA512_Update \ + SKEIN256_Init SKEIN256_Final SKEIN256_Update \ + SKEIN512_Init SKEIN512_Final SKEIN512_Update \ + SKEIN1024_Init SKEIN1024_Final SKEIN1024_Update CFLAGS+= -D${sym}=__${sym} .endfor Index: lib/libmd/Makefile =================================================================== --- lib/libmd/Makefile +++ lib/libmd/Makefile @@ -9,12 +9,16 @@ sha0c.c sha0hl.c sha1c.c sha1hl.c \ sha256c.c sha256hl.c \ sha384hl.c \ - sha512c.c sha512hl.c -INCS= md4.h md5.h ripemd.h sha.h sha256.h sha384.h sha512.h + sha512c.c sha512hl.c \ + skein.c skein_block.c \ + skein256hl.c skein512hl.c skein1024hl.c +INCS= md4.h md5.h ripemd.h sha.h sha256.h sha384.h sha512.h \ + skein.h skein_port.h skein_freebsd.h skein_iv.h \ + brg_types.h brg_endian.h WARNS?= 0 -MAN+= md4.3 md5.3 ripemd.3 sha.3 sha256.3 sha512.3 +MAN+= md4.3 md5.3 ripemd.3 sha.3 sha256.3 sha512.3 skein.3 MLINKS+=md4.3 MD4Init.3 md4.3 MD4Update.3 md4.3 MD4Final.3 MLINKS+=md4.3 MD4End.3 md4.3 MD4File.3 md4.3 MD4FileChunk.3 MLINKS+=md4.3 MD4Data.3 @@ -43,11 +47,27 @@ MLINKS+=sha512.3 SHA512_Final.3 sha512.3 SHA512_End.3 MLINKS+=sha512.3 SHA512_File.3 sha512.3 SHA512_FileChunk.3 MLINKS+=sha512.3 SHA512_Data.3 +MLINKS+=skein.3 SKEIN256_Init.3 skein.3 SKEIN256_Update.3 +MLINKS+=skein.3 SKEIN256_Final.3 skein.3 SKEIN256_End.3 +MLINKS+=skein.3 SKEIN256_File.3 skein.3 SKEIN256_FileChunk.3 +MLINKS+=skein.3 SKEIN256_Data.3 skein.3 skein256.3 +MLINKS+=skein.3 SKEIN512_Init.3 skein.3 SKEIN512_Update.3 +MLINKS+=skein.3 SKEIN512_Final.3 skein.3 SKEIN512_End.3 +MLINKS+=skein.3 SKEIN512_File.3 skein.3 SKEIN512_FileChunk.3 +MLINKS+=skein.3 SKEIN512_Data.3 skein.3 skein512.3 +MLINKS+=skein.3 SKEIN1024_Init.3 skein.3 SKEIN1024_Update.3 +MLINKS+=skein.3 
SKEIN1024_Final.3 skein.3 SKEIN1024_End.3 +MLINKS+=skein.3 SKEIN1024_File.3 skein.3 SKEIN1024_FileChunk.3 +MLINKS+=skein.3 SKEIN1024_Data.3 skein.3 skein1024.3 + CLEANFILES+= md[245]hl.c md[245].ref md[245].3 mddriver \ rmd160.ref rmd160hl.c rmddriver \ sha0.ref sha0hl.c sha1.ref sha1hl.c shadriver \ sha256.ref sha256hl.c sha384hl.c sha384.ref \ - sha512.ref sha512hl.c + sha512.ref sha512hl.c \ + skein256hl.c skein512hl.c skein1024hl.c \ + skein256.ref skein512.ref skein1024.ref \ + skeindriver # Define WEAK_REFS to provide weak aliases for libmd symbols # @@ -56,8 +76,10 @@ # * macros are used to rename symbols to libcrypt internal names # * no weak aliases are generated CFLAGS+= -I${.CURDIR} -I${.CURDIR}/../../sys/crypto/sha2 +CFLAGS+= -I${.CURDIR}/../../sys/crypto/skein CFLAGS+= -DWEAK_REFS .PATH: ${.CURDIR}/${MACHINE_ARCH} ${.CURDIR}/../../sys/crypto/sha2 +.PATH: ${.CURDIR}/../../sys/crypto/skein ${.CURDIR}/../../sys/crypto/skein/${MACHINE_ARCH} .if exists(${MACHINE_ARCH}/sha.S) SRCS+= sha.S @@ -67,7 +89,11 @@ SRCS+= rmd160.S CFLAGS+= -DRMD160_ASM .endif -.if exists(${MACHINE_ARCH}/sha.S) || exists(${MACHINE_ARCH}/rmd160.S) +.if exists(${MACHINE_ARCH}/skein_block_asm.s) +SRCS+= skein_block_asm.s +CFLAGS+= -DSKEIN_ASM -DSKEIN_USE_ASM=1792 # list of block functions to replace with assembly: 256+512+1024 = 1792 +.endif +.if exists(${MACHINE_ARCH}/sha.S) || exists(${MACHINE_ARCH}/rmd160.S) || exists(${MACHINE_ARCH}/skein_block_asm.s) ACFLAGS+= -DELF -Wa,--noexecstack .endif @@ -113,6 +139,25 @@ -e 's/RIPEMD160__/RIPEMD160_/g' \ ${.ALLSRC}) > ${.TARGET} +skein256hl.c: mdXhl.c + (echo '#define LENGTH 32'; \ + sed -e 's/mdX/skein/g' -e 's/MDX/SKEIN256_/g' \ + -e 's/SKEIN256__/SKEIN256_/g' \ + ${.ALLSRC}) > ${.TARGET} + +skein512hl.c: mdXhl.c + (echo '#define LENGTH 64'; \ + sed -e 's/mdX/skein/g' -e 's/MDX/SKEIN512_/g' \ + -e 's/SKEIN512__/SKEIN512_/g' \ + ${.ALLSRC}) > ${.TARGET} + +skein1024hl.c: mdXhl.c + (echo '#define LENGTH 128'; \ + sed -e 's/mdX/skein/g' -e 
's/MDX/SKEIN1024_/g' \ + -e 's/SKEIN1024__/SKEIN1024_/g' \ + ${.ALLSRC}) > ${.TARGET} + + .for i in 2 4 5 md${i}.3: ${.CURDIR}/mdX.3 sed -e "s/mdX/md${i}/g" -e "s/MDX/MD${i}/g" ${.ALLSRC} > ${.TARGET} @@ -224,8 +269,51 @@ @echo 'RIPEMD160 ("12345678901234567890123456789012345678901234567890123456789012345678901234567890") =' \ '9b752e45573d4b39f4dbd3323cab82bf63326bfb' >> ${.TARGET} +skein256.ref: + echo 'SKEIN256 test suite:' > ${.TARGET} + @echo 'SKEIN256 ("") = c8877087da56e072870daa843f176e9453115929094c3a40c463a196c29bf7ba' >> ${.TARGET} + @echo 'SKEIN256 ("abc") = 258bdec343b9fde1639221a5ae0144a96e552e5288753c5fec76c05fc2fc1870' >> ${.TARGET} + @echo 'SKEIN256 ("message digest") =' \ + '4d2ce0062b5eb3a4db95bc1117dd8aa014f6cd50fdc8e64f31f7d41f9231e488' >> ${.TARGET} + @echo 'SKEIN256 ("abcdefghijklmnopqrstuvwxyz") =' \ + '46d8440685461b00e3ddb891b2ecc6855287d2bd8834a95fb1c1708b00ea5e82' >> ${.TARGET} + @echo 'SKEIN256 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789") =' \ + '7c5eb606389556b33d34eb2536459528dc0af97adbcd0ce273aeb650f598d4b2' >> ${.TARGET} + @echo 'SKEIN256 ("12345678901234567890123456789012345678901234567890123456789012345678901234567890") =' \ + '4def7a7e5464a140ae9c3a80279fbebce4bd00f9faad819ab7e001512f67a10d' >> ${.TARGET} + +skein512.ref: + echo 'SKEIN512 test suite:' > ${.TARGET} + @echo 'SKEIN512 ("") =' \ + 'bc5b4c50925519c290cc634277ae3d6257212395cba733bbad37a4af0fa06af41fca7903d06564fea7a2d3730dbdb80c1f85562dfcc070334ea4d1d9e72cba7a' >> ${.TARGET} + @echo 'SKEIN512 ("abc") =' \ + '8f5dd9ec798152668e35129496b029a960c9a9b88662f7f9482f110b31f9f93893ecfb25c009baad9e46737197d5630379816a886aa05526d3a70df272d96e75' >> ${.TARGET} + @echo 'SKEIN512 ("message digest") =' \ + '15b73c158ffb875fed4d72801ded0794c720b121c0c78edf45f900937e6933d9e21a3a984206933d504b5dbb2368000411477ee1b204c986068df77886542fcc' >> ${.TARGET} + @echo 'SKEIN512 ("abcdefghijklmnopqrstuvwxyz") =' \ + 
'23793ad900ef12f9165c8080da6fdfd2c8354a2929b8aadf83aa82a3c6470342f57cf8c035ec0d97429b626c4d94f28632c8f5134fd367dca5cf293d2ec13f8c' >> ${.TARGET} + @echo 'SKEIN512 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789") =' \ + '0c6bed927e022f5ddcf81877d42e5f75798a9f8fd3ede3d83baac0a2f364b082e036c11af35fe478745459dd8f5c0b73efe3c56ba5bb2009208d5a29cc6e469c' >> ${.TARGET} + @echo 'SKEIN512 ("12345678901234567890123456789012345678901234567890123456789012345678901234567890") =' \ + '2ca9fcffb3456f297d1b5f407014ecb856f0baac8eb540f534b1f187196f21e88f31103128c2f03fcc9857d7a58eb66f9525e2302d88833ee069295537a434ce' >> ${.TARGET} + +skein1024.ref: + echo 'SKEIN1024 test suite:' > ${.TARGET} + @echo 'SKEIN1024 ("") =' \ + '0fff9563bb3279289227ac77d319b6fff8d7e9f09da1247b72a0a265cd6d2a62645ad547ed8193db48cff847c06494a03f55666d3b47eb4c20456c9373c86297d630d5578ebd34cb40991578f9f52b18003efa35d3da6553ff35db91b81ab890bec1b189b7f52cb2a783ebb7d823d725b0b4a71f6824e88f68f982eefc6d19c6' >> ${.TARGET} + @echo 'SKEIN1024 ("abc") =' \ + '35a599a0f91abcdb4cb73c19b8cb8d947742d82c309137a7caed29e8e0a2ca7a9ff9a90c34c1908cc7e7fd99bb15032fb86e76df21b72628399b5f7c3cc209d7bb31c99cd4e19465622a049afbb87c03b5ce3888d17e6e667279ec0aa9b3e2712624c01b5f5bbe1a564220bdcf6990af0c2539019f313fdd7406cca3892a1f1f' >> ${.TARGET} + @echo 'SKEIN1024 ("message digest") =' \ + 'ea891f5268acd0fac97467fc1aa89d1ce8681a9992a42540e53babee861483110c2d16f49e73bac27653ff173003e40cfb08516cd34262e6af95a5d8645c9c1abb3e813604d508b8511b30f9a5c1b352aa0791c7d2f27b2706dccea54bc7de6555b5202351751c3299f97c09cf89c40f67187e2521c0fad82b30edbb224f0458' >> ${.TARGET} + @echo 'SKEIN1024 ("abcdefghijklmnopqrstuvwxyz") =' \ + 'f23d95c2a25fbcd0e797cd058fec39d3c52d2b5afd7a9af1df934e63257d1d3dcf3246e7329c0f1104c1e51e3d22e300507b0c3b9f985bb1f645ef49835080536becf83788e17fed09c9982ba65c3cb7ffe6a5f745b911c506962adf226e435c42f6f6bc08d288f9c810e807e3216ef444f3db22744441deefa4900982a1371f' >> ${.TARGET} + @echo 'SKEIN1024 
("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789") =' \ + 'cf3889e8a8d11bfd3938055d7d061437962bc5eac8ae83b1b71c94be201b8cf657fdbfc38674997a008c0c903f56a23feb3ae30e012377f1cfa080a9ca7fe8b96138662653fb3335c7d06595bf8baf65e215307532094cfdfa056bd8052ab792a3944a2adaa47b30335b8badb8fe9eb94fe329cdca04e58bbc530f0af709f469' >> ${.TARGET} + @echo 'SKEIN1024 ("12345678901234567890123456789012345678901234567890123456789012345678901234567890") =' \ + 'cf21a613620e6c119eca31fdfaad449a8e02f95ca256c21d2a105f8e4157048f9fe1e897893ea18b64e0e37cb07d5ac947f27ba544caf7cbc1ad094e675aed77a366270f7eb7f46543bccfa61c526fd628408058ed00ed566ac35a9761d002e629c4fb0d430b2f4ad016fcc49c44d2981c4002da0eecc42144160e2eaea4855a' >> ${.TARGET} + test: md4.ref md5.ref sha0.ref rmd160.ref sha1.ref sha256.ref sha384.ref \ - sha512.ref + sha512.ref skein256.ref skein512.ref skein1024.ref @${ECHO} if any of these test fail, the code produces wrong results @${ECHO} and should NOT be used. ${CC} ${CFLAGS} ${LDFLAGS} -DMD=4 -o mddriver ${.CURDIR}/mddriver.c libmd.a @@ -255,5 +343,15 @@ ./shadriver | cmp sha512.ref - @${ECHO} SHA-512 passed test -rm -f shadriver + ${CC} ${CFLAGS} ${LDFLAGS} -DSKEIN=256 -o skeindriver ${.CURDIR}/skeindriver.c libmd.a + ./skeindriver | cmp skein256.ref - + @${ECHO} SKEIN256 passed test + ${CC} ${CFLAGS} ${LDFLAGS} -DSKEIN=512 -o skeindriver ${.CURDIR}/skeindriver.c libmd.a + ./skeindriver | cmp skein512.ref - + @${ECHO} SKEIN512 passed test + ${CC} ${CFLAGS} ${LDFLAGS} -DSKEIN=1024 -o skeindriver ${.CURDIR}/skeindriver.c libmd.a + ./skeindriver | cmp skein1024.ref - + @${ECHO} SKEIN1024 passed test + -rm -f skeindriver .include Index: lib/libmd/mdX.3 =================================================================== --- lib/libmd/mdX.3 +++ lib/libmd/mdX.3 @@ -8,7 +8,7 @@ .\" .\" $FreeBSD$ .\" -.Dd February 11, 1999 +.Dd April 26, 2016 .Dt MDX 3 .Os .Sh NAME @@ -145,7 +145,11 @@ .Sh SEE ALSO .Xr md4 3 , .Xr md5 3 , -.Xr sha 3 +.Xr ripemd 3 , +.Xr sha 3 , +.Xr 
sha256 3 , +.Xr sha512 3 , +.Xr skein 3 .Rs .%A R. Rivest .%T The MD4 Message-Digest Algorithm Index: lib/libmd/ripemd.3 =================================================================== --- lib/libmd/ripemd.3 +++ lib/libmd/ripemd.3 @@ -9,7 +9,7 @@ .\" From: Id: mdX.3,v 1.14 1999/02/11 20:31:49 wollman Exp .\" $FreeBSD$ .\" -.Dd March 28, 2014 +.Dd April 26, 2016 .Dt RIPEMD 3 .Os .Sh NAME @@ -125,7 +125,10 @@ .Sh SEE ALSO .Xr md4 3 , .Xr md5 3 , -.Xr sha 3 +.Xr sha 3 , +.Xr sha256 3 , +.Xr sha512 3 , +.Xr skein 3 .Sh HISTORY These functions appeared in .Fx 4.0 . Index: lib/libmd/sha.3 =================================================================== --- lib/libmd/sha.3 +++ lib/libmd/sha.3 @@ -9,7 +9,7 @@ .\" From: Id: mdX.3,v 1.14 1999/02/11 20:31:49 wollman Exp .\" $FreeBSD$ .\" -.Dd March 28, 2014 +.Dd April 26, 2016 .Dt SHA 3 .Os .Sh NAME @@ -157,7 +157,9 @@ .Xr md4 3 , .Xr md5 3 , .Xr ripemd 3 , -.Xr sha256 3 +.Xr sha256 3 , +.Xr sha512 3 , +.Xr skein 3 .Sh HISTORY These functions appeared in .Fx 4.0 . Index: lib/libmd/sha256.3 =================================================================== --- lib/libmd/sha256.3 +++ lib/libmd/sha256.3 @@ -9,7 +9,7 @@ .\" From: Id: mdX.3,v 1.14 1999/02/11 20:31:49 wollman Exp .\" $FreeBSD$ .\" -.Dd March 28, 2014 +.Dd April 26, 2016 .Dt SHA256 3 .Os .Sh NAME @@ -123,7 +123,9 @@ .Xr md4 3 , .Xr md5 3 , .Xr ripemd 3 , -.Xr sha 3 +.Xr sha 3 , +.Xr sha512 3 , +.Xr skein 3 .Sh HISTORY These functions appeared in .Fx 6.0 . Index: lib/libmd/sha512.3 =================================================================== --- lib/libmd/sha512.3 +++ lib/libmd/sha512.3 @@ -159,7 +159,9 @@ .Xr md4 3 , .Xr md5 3 , .Xr ripemd 3 , -.Xr sha 3 +.Xr sha 3 , +.Xr sha256 3 , +.Xr skein 3 .Sh HISTORY These functions appeared in .Fx 9.0 . 
Index: lib/libmd/skein.3 =================================================================== --- /dev/null +++ lib/libmd/skein.3 @@ -0,0 +1,202 @@ +.\" +.\" ---------------------------------------------------------------------------- +.\" "THE BEER-WARE LICENSE" (Revision 42): +.\" <phk@FreeBSD.org> wrote this file. As long as you retain this notice you +.\" can do whatever you want with this stuff. If we meet some day, and you think +.\" this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp +.\" ---------------------------------------------------------------------------- +.\" +.\" From: Id: sha512.3 292782 2015-12-27 17:33:59Z allanjude +.\" $FreeBSD$ +.\" +.Dd April 26, 2016 +.Dt SKEIN 3 +.Os +.Sh NAME +.Nm SKEIN256_Init , +.Nm SKEIN256_Update , +.Nm SKEIN256_Final , +.Nm SKEIN256_End , +.Nm SKEIN256_File , +.Nm SKEIN256_FileChunk , +.Nm SKEIN256_Data , +.Nm SKEIN512_Init , +.Nm SKEIN512_Update , +.Nm SKEIN512_Final , +.Nm SKEIN512_End , +.Nm SKEIN512_File , +.Nm SKEIN512_FileChunk , +.Nm SKEIN512_Data , +.Nm SKEIN1024_Init , +.Nm SKEIN1024_Update , +.Nm SKEIN1024_Final , +.Nm SKEIN1024_End , +.Nm SKEIN1024_File , +.Nm SKEIN1024_FileChunk , +.Nm SKEIN1024_Data +.Nd calculate the ``SKEIN'' family of message digests +.Sh LIBRARY +.Lb libmd +.Sh SYNOPSIS +.In sys/types.h +.In skein.h +.Ft void +.Fn SKEIN256_Init "SKEIN256_CTX *context" +.Ft void +.Fn SKEIN256_Update "SKEIN256_CTX *context" "const unsigned char *data" "size_t len" +.Ft void +.Fn SKEIN256_Final "unsigned char digest[32]" "SKEIN256_CTX *context" +.Ft "char *" +.Fn SKEIN256_End "SKEIN256_CTX *context" "char *buf" +.Ft "char *" +.Fn SKEIN256_File "const char *filename" "char *buf" +.Ft "char *" +.Fn SKEIN256_FileChunk "const char *filename" "char *buf" "off_t offset" "off_t length" +.Ft "char *" +.Fn SKEIN256_Data "const unsigned char *data" "unsigned int len" "char *buf" +.Ft void +.Fn SKEIN512_Init "SKEIN512_CTX *context" +.Ft void +.Fn SKEIN512_Update "SKEIN512_CTX *context" "const unsigned char 
*data" "size_t len" +.Ft void +.Fn SKEIN512_Final "unsigned char digest[64]" "SKEIN512_CTX *context" +.Ft "char *" +.Fn SKEIN512_End "SKEIN512_CTX *context" "char *buf" +.Ft "char *" +.Fn SKEIN512_File "const char *filename" "char *buf" +.Ft "char *" +.Fn SKEIN512_FileChunk "const char *filename" "char *buf" "off_t offset" "off_t length" +.Ft "char *" +.Fn SKEIN512_Data "const unsigned char *data" "unsigned int len" "char *buf" +.Ft void +.Fn SKEIN1024_Init "SKEIN1024_CTX *context" +.Ft void +.Fn SKEIN1024_Update "SKEIN1024_CTX *context" "const unsigned char *data" "size_t len" +.Ft void +.Fn SKEIN1024_Final "unsigned char digest[128]" "SKEIN1024_CTX *context" +.Ft "char *" +.Fn SKEIN1024_End "SKEIN1024_CTX *context" "char *buf" +.Ft "char *" +.Fn SKEIN1024_File "const char *filename" "char *buf" +.Ft "char *" +.Fn SKEIN1024_FileChunk "const char *filename" "char *buf" "off_t offset" "off_t length" +.Ft "char *" +.Fn SKEIN1024_Data "const unsigned char *data" "unsigned int len" "char *buf" +.Sh DESCRIPTION +The +.Li SKEIN +functions calculate a 256, 512, or 1024-bit cryptographic checksum (digest) +for any number of input bytes. +A cryptographic checksum is a one-way +hash function; that is, it is computationally impractical to find +the input corresponding to a particular output. +The net result is +a +.Dq fingerprint +of the input-data, which does not disclose the actual input. +.Pp +The +.Fn SKEIN256_Init , +.Fn SKEIN256_Update , +and +.Fn SKEIN256_Final +functions are the core functions. +Allocate an +.Vt SKEIN256_CTX , +initialize it with +.Fn SKEIN256_Init , +run over the data with +.Fn SKEIN256_Update , +and finally extract the result using +.Fn SKEIN256_Final . +.Pp +.Fn SKEIN256_End +is a wrapper for +.Fn SKEIN256_Final +which converts the return value to a 65-character +(including the terminating '\e0') +.Tn ASCII +string which represents the 256 bits in hexadecimal. 
+.Pp +.Fn SKEIN256_File +calculates the digest of a file, and uses +.Fn SKEIN256_End +to return the result. +If the file cannot be opened, a null pointer is returned. +.Fn SKEIN256_FileChunk +is similar to +.Fn SKEIN256_File , +but it only calculates the digest over a byte-range of the file specified, +starting at +.Fa offset +and spanning +.Fa length +bytes. +If the +.Fa length +parameter is specified as 0, or more than the length of the remaining part +of the file, +.Fn SKEIN256_FileChunk +calculates the digest from +.Fa offset +to the end of file. +.Fn SKEIN256_Data +calculates the digest of a chunk of data in memory, and uses +.Fn SKEIN256_End +to return the result. +.Pp +When using +.Fn SKEIN256_End , +.Fn SKEIN256_File , +or +.Fn SKEIN256_Data , +the +.Fa buf +argument can be a null pointer, in which case the returned string +is allocated with +.Xr malloc 3 +and subsequently must be explicitly deallocated using +.Xr free 3 +after use. +If the +.Fa buf +argument is non-null it must point to at least 65 characters of buffer space. +.Pp +The +.Li SKEIN512_ +and +.Li SKEIN1024_ +functions are similar to the +.Li SKEIN256_ +functions except they produce a 512-bit, 129 character, +or 1024-bit, 257 character, output. +.Sh SEE ALSO +.Xr md4 3 , +.Xr md5 3 , +.Xr ripemd 3 , +.Xr sha 3 , +.Xr sha256 3 , +.Xr sha512 3 +.Sh HISTORY +These functions appeared in +.Fx 11.0 . +.Sh AUTHORS +.An -nosplit +The core hash routines were imported from version 1.3 of the optimized +reference implementation written by +.An Doug Whiting +as submitted to the NIST SHA-3 contest. +The algorithms were developed by +.An Niels Ferguson , +.An Stefan Lucks , +.An Bruce Schneier , +.An Doug Whiting , +.An Mihir Bellare , +.An Tadayoshi Kohno , +.An Jon Callas , +and +.An Jesse Walker . +.Sh BUGS +No method is known to exist which finds two files having the same hash value, +nor to find a file with a specific hash value. +There is on the other hand no guarantee that such a method does not exist. 
Index: lib/libmd/skeindriver.c =================================================================== --- /dev/null +++ lib/libmd/skeindriver.c @@ -0,0 +1,68 @@ +/* SKEINDRIVER.C - test driver for SKEIN */ + +/* Copyright (C) 1990-2, RSA Data Security, Inc. Created 1990. All rights + * reserved. + * + * RSA Data Security, Inc. makes no representations concerning either the + * merchantability of this software or the suitability of this software for + * any particular purpose. It is provided "as is" without express or implied + * warranty of any kind. + * + * These notices must be retained in any copies of any part of this + * documentation and/or software. */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <stdio.h> +#include <time.h> +#include <string.h> + +#include "skein.h" + +/* The following makes SKEIN default to SKEIN512 if it has not already been + * defined with C compiler flags. */ +#ifndef SKEIN +#define SKEIN 512 +#endif + +#if SKEIN == 256 +#undef SKEIN_Data +#define SKEIN_Data SKEIN256_Data +#elif SKEIN == 512 +#undef SKEIN_Data +#define SKEIN_Data SKEIN512_Data +#elif SKEIN == 1024 +#undef SKEIN_Data +#define SKEIN_Data SKEIN1024_Data +#endif + +/* Digests a string and prints the result. */ +static void +SKEINString(char *string) +{ + char buf[2*128 + 1]; + + printf("SKEIN%d (\"%s\") = %s\n", + SKEIN, string, SKEIN_Data(string, strlen(string), buf)); +} + +/* Digests a reference suite of strings and prints the results. 
*/ +int +main(void) +{ + printf("SKEIN%d test suite:\n", SKEIN); + + SKEINString(""); + SKEINString("abc"); + SKEINString("message digest"); + SKEINString("abcdefghijklmnopqrstuvwxyz"); + SKEINString("ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz0123456789"); + SKEINString("1234567890123456789012345678901234567890" + "1234567890123456789012345678901234567890"); + + return 0; +} Index: sbin/md5/Makefile =================================================================== --- sbin/md5/Makefile +++ sbin/md5/Makefile @@ -8,7 +8,10 @@ ${BINDIR}/md5 ${BINDIR}/sha1 \ ${BINDIR}/md5 ${BINDIR}/sha256 \ ${BINDIR}/md5 ${BINDIR}/sha384 \ - ${BINDIR}/md5 ${BINDIR}/sha512 + ${BINDIR}/md5 ${BINDIR}/sha512 \ + ${BINDIR}/md5 ${BINDIR}/skein256 \ + ${BINDIR}/md5 ${BINDIR}/skein512 \ + ${BINDIR}/md5 ${BINDIR}/skein1024 MLINKS= md5.1 rmd160.1 \ md5.1 sha1.1 \ Index: sbin/md5/md5.c =================================================================== --- sbin/md5/md5.c +++ sbin/md5/md5.c @@ -30,6 +30,7 @@ #include #include #include +#include <skein.h> #include #include #include @@ -59,6 +60,9 @@ extern const char *SHA384_TestOutput[MDTESTCOUNT]; extern const char *SHA512_TestOutput[MDTESTCOUNT]; extern const char *RIPEMD160_TestOutput[MDTESTCOUNT]; +extern const char *SKEIN256_TestOutput[MDTESTCOUNT]; +extern const char *SKEIN512_TestOutput[MDTESTCOUNT]; +extern const char *SKEIN1024_TestOutput[MDTESTCOUNT]; typedef struct Algorithm_t { const char *progname; @@ -85,11 +89,14 @@ SHA384_CTX sha384; SHA512_CTX sha512; RIPEMD160_CTX ripemd160; + SKEIN256_CTX skein256; + SKEIN512_CTX skein512; + SKEIN1024_CTX skein1024; } DIGEST_CTX; /* max(MD5_DIGEST_LENGTH, SHA_DIGEST_LENGTH, SHA256_DIGEST_LENGTH, SHA512_DIGEST_LENGTH, - RIPEMD160_DIGEST_LENGTH)*2+1 */ + RIPEMD160_DIGEST_LENGTH, SKEIN1024_DIGEST_LENGTH)*2+1 */ -#define HEX_DIGEST_LENGTH 129 +#define HEX_DIGEST_LENGTH 257 /* algorithm function table */ @@ -112,7 +119,16 @@ &SHA512_Data, &SHA512_File }, { "rmd160", "RMD160", &RIPEMD160_TestOutput, 
(DIGEST_Init*)&RIPEMD160_Init, (DIGEST_Update*)&RIPEMD160_Update, - (DIGEST_End*)&RIPEMD160_End, &RIPEMD160_Data, &RIPEMD160_File } + (DIGEST_End*)&RIPEMD160_End, &RIPEMD160_Data, &RIPEMD160_File }, + { "skein256", "Skein256", &SKEIN256_TestOutput, + (DIGEST_Init*)&SKEIN256_Init, (DIGEST_Update*)&SKEIN256_Update, + (DIGEST_End*)&SKEIN256_End, &SKEIN256_Data, &SKEIN256_File }, + { "skein512", "Skein512", &SKEIN512_TestOutput, + (DIGEST_Init*)&SKEIN512_Init, (DIGEST_Update*)&SKEIN512_Update, + (DIGEST_End*)&SKEIN512_End, &SKEIN512_Data, &SKEIN512_File }, + { "skein1024", "Skein1024", &SKEIN1024_TestOutput, + (DIGEST_Init*)&SKEIN1024_Init, (DIGEST_Update*)&SKEIN1024_Update, + (DIGEST_End*)&SKEIN1024_End, &SKEIN1024_Data, &SKEIN1024_File } }; static void @@ -281,8 +297,8 @@ printf(" done\n"); printf("Digest = %s", p); printf("\nTime = %f seconds\n", seconds); - printf("Speed = %f bytes/second\n", - (float) TEST_BLOCK_LEN * (float) TEST_BLOCK_COUNT / seconds); + printf("Speed = %f MiB/second\n", (float) TEST_BLOCK_LEN * + (float) TEST_BLOCK_COUNT / seconds / (1 << 20)); } /* * Digests a reference suite of strings and prints the results. 
@@ -366,6 +382,39 @@ "5feb69c6bf7c29d95715ad55f57d8ac5b2b7dd32" }; +const char *SKEIN256_TestOutput[MDTESTCOUNT] = { + "c8877087da56e072870daa843f176e9453115929094c3a40c463a196c29bf7ba", + "7fba44ff1a31d71a0c1f82e6e82fb5e9ac6c92a39c9185b9951fed82d82fe635", + "258bdec343b9fde1639221a5ae0144a96e552e5288753c5fec76c05fc2fc1870", + "4d2ce0062b5eb3a4db95bc1117dd8aa014f6cd50fdc8e64f31f7d41f9231e488", + "46d8440685461b00e3ddb891b2ecc6855287d2bd8834a95fb1c1708b00ea5e82", + "7c5eb606389556b33d34eb2536459528dc0af97adbcd0ce273aeb650f598d4b2", + "4def7a7e5464a140ae9c3a80279fbebce4bd00f9faad819ab7e001512f67a10d", + "d9c017dbe355f318d036469eb9b5fbe129fc2b5786a9dc6746a516eab6fe0126" +}; + +const char *SKEIN512_TestOutput[MDTESTCOUNT] = { + "bc5b4c50925519c290cc634277ae3d6257212395cba733bbad37a4af0fa06af41fca7903d06564fea7a2d3730dbdb80c1f85562dfcc070334ea4d1d9e72cba7a", + "b1cd8d33f61b3737adfd59bb13ad82f4a9548e92f22956a8976cca3fdb7fee4fe91698146c4197cec85d38b83c5d93bdba92c01fd9a53870d0c7f967bc62bdce", + "8f5dd9ec798152668e35129496b029a960c9a9b88662f7f9482f110b31f9f93893ecfb25c009baad9e46737197d5630379816a886aa05526d3a70df272d96e75", + "15b73c158ffb875fed4d72801ded0794c720b121c0c78edf45f900937e6933d9e21a3a984206933d504b5dbb2368000411477ee1b204c986068df77886542fcc", + "23793ad900ef12f9165c8080da6fdfd2c8354a2929b8aadf83aa82a3c6470342f57cf8c035ec0d97429b626c4d94f28632c8f5134fd367dca5cf293d2ec13f8c", + "0c6bed927e022f5ddcf81877d42e5f75798a9f8fd3ede3d83baac0a2f364b082e036c11af35fe478745459dd8f5c0b73efe3c56ba5bb2009208d5a29cc6e469c", + "2ca9fcffb3456f297d1b5f407014ecb856f0baac8eb540f534b1f187196f21e88f31103128c2f03fcc9857d7a58eb66f9525e2302d88833ee069295537a434ce", + "1131f2aaa0e97126c9314f9f968cc827259bbfabced2943bb8c9274448998fb3b78738b4580dd500c76105fd3c03e465e1414f2c29664286b1f79d3e51128125" +}; + +const char *SKEIN1024_TestOutput[MDTESTCOUNT] = { + 
"0fff9563bb3279289227ac77d319b6fff8d7e9f09da1247b72a0a265cd6d2a62645ad547ed8193db48cff847c06494a03f55666d3b47eb4c20456c9373c86297d630d5578ebd34cb40991578f9f52b18003efa35d3da6553ff35db91b81ab890bec1b189b7f52cb2a783ebb7d823d725b0b4a71f6824e88f68f982eefc6d19c6", + "6ab4c4ba9814a3d976ec8bffa7fcc638ceba0544a97b3c98411323ffd2dc936315d13dc93c13c4e88cda6f5bac6f2558b2d8694d3b6143e40d644ae43ca940685cb37f809d3d0550c56cba8036dee729a4f8fb960732e59e64d57f7f7710f8670963cdcdc95b41daab4855fcf8b6762a64b173ee61343a2c7689af1d293eba97", + "35a599a0f91abcdb4cb73c19b8cb8d947742d82c309137a7caed29e8e0a2ca7a9ff9a90c34c1908cc7e7fd99bb15032fb86e76df21b72628399b5f7c3cc209d7bb31c99cd4e19465622a049afbb87c03b5ce3888d17e6e667279ec0aa9b3e2712624c01b5f5bbe1a564220bdcf6990af0c2539019f313fdd7406cca3892a1f1f", + "ea891f5268acd0fac97467fc1aa89d1ce8681a9992a42540e53babee861483110c2d16f49e73bac27653ff173003e40cfb08516cd34262e6af95a5d8645c9c1abb3e813604d508b8511b30f9a5c1b352aa0791c7d2f27b2706dccea54bc7de6555b5202351751c3299f97c09cf89c40f67187e2521c0fad82b30edbb224f0458", + "f23d95c2a25fbcd0e797cd058fec39d3c52d2b5afd7a9af1df934e63257d1d3dcf3246e7329c0f1104c1e51e3d22e300507b0c3b9f985bb1f645ef49835080536becf83788e17fed09c9982ba65c3cb7ffe6a5f745b911c506962adf226e435c42f6f6bc08d288f9c810e807e3216ef444f3db22744441deefa4900982a1371f", + "cf3889e8a8d11bfd3938055d7d061437962bc5eac8ae83b1b71c94be201b8cf657fdbfc38674997a008c0c903f56a23feb3ae30e012377f1cfa080a9ca7fe8b96138662653fb3335c7d06595bf8baf65e215307532094cfdfa056bd8052ab792a3944a2adaa47b30335b8badb8fe9eb94fe329cdca04e58bbc530f0af709f469", + "cf21a613620e6c119eca31fdfaad449a8e02f95ca256c21d2a105f8e4157048f9fe1e897893ea18b64e0e37cb07d5ac947f27ba544caf7cbc1ad094e675aed77a366270f7eb7f46543bccfa61c526fd628408058ed00ed566ac35a9761d002e629c4fb0d430b2f4ad016fcc49c44d2981c4002da0eecc42144160e2eaea4855a", + 
"e6799b78db54085a2be7ff4c8007f147fa88d326abab30be0560b953396d8802feee9a15419b48a467574e9283be15685ca8a079ee52b27166b64dd70b124b1d4e4f6aca37224c3f2685e67e67baef9f94b905698adc794a09672aba977a61b20966912acdb08c21a2c37001785355dc884751a21f848ab36e590331ff938138" +}; + static void MDTestSuite(const Algorithm_t *alg) { Index: sys/contrib/skein/SHA3api_ref.h =================================================================== --- /dev/null +++ sys/contrib/skein/SHA3api_ref.h @@ -0,0 +1,66 @@ +#ifndef _AHS_API_H_ +#define _AHS_API_H_ + +/*********************************************************************** +** +** Interface declarations of the AHS API using the Skein hash function. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +************************************************************************/ + +#include "skein.h" + +typedef enum + { + SUCCESS = SKEIN_SUCCESS, + FAIL = SKEIN_FAIL, + BAD_HASHLEN = SKEIN_BAD_HASHLEN + } + HashReturn; + +typedef size_t DataLength; /* bit count type */ +typedef u08b_t BitSequence; /* bit stream type */ + +typedef struct + { + uint_t statebits; /* 256, 512, or 1024 */ + union + { + Skein_Ctxt_Hdr_t h; /* common header "overlay" */ + Skein_256_Ctxt_t ctx_256; + Skein_512_Ctxt_t ctx_512; + Skein1024_Ctxt_t ctx1024; + } u; + } + hashState; + +/* "incremental" hashing API */ +HashReturn Init (hashState *state, int hashbitlen); +HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen); +HashReturn Final (hashState *state, BitSequence *hashval); + +/* "all-in-one" call */ +HashReturn Hash (int hashbitlen, const BitSequence *data, + DataLength databitlen, BitSequence *hashval); + + +/* +** Re-define the compile-time constants below to change the selection +** of the Skein state size in the Init() function in SHA3api_ref.c. 
+** +** That is, the NIST API does not allow for explicit selection of the +** Skein block size, so it must be done implicitly in the Init() function. +** The selection is controlled by these constants. +*/ +#ifndef SKEIN_256_NIST_MAX_HASHBITS +#define SKEIN_256_NIST_MAX_HASHBITS (0) +#endif + +#ifndef SKEIN_512_NIST_MAX_HASHBITS +#define SKEIN_512_NIST_MAX_HASHBITS (512) +#endif + +#endif /* ifdef _AHS_API_H_ */ Index: sys/contrib/skein/SHA3api_ref.c =================================================================== --- /dev/null +++ sys/contrib/skein/SHA3api_ref.c @@ -0,0 +1,115 @@ +/*********************************************************************** +** +** Implementation of the AHS API using the Skein hash function. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +************************************************************************/ + +#include <string.h> /* get the memcpy/memset functions */ +#include "skein.h" /* get the Skein API definitions */ +#include "SHA3api_ref.h"/* get the AHS API definitions */ + +/******************************************************************/ +/* AHS API code */ +/******************************************************************/ + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* select the context size and init the context */ +HashReturn Init(hashState *state, int hashbitlen) + { +#if SKEIN_256_NIST_MAX_HASHBITS + if (hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS) + { + Skein_Assert(hashbitlen > 0,BAD_HASHLEN); + state->statebits = 64*SKEIN_256_STATE_WORDS; + return Skein_256_Init(&state->u.ctx_256,(size_t) hashbitlen); + } +#endif + if (hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS) + { + state->statebits = 64*SKEIN_512_STATE_WORDS; + return Skein_512_Init(&state->u.ctx_512,(size_t) hashbitlen); + } + else + { + state->statebits = 64*SKEIN1024_STATE_WORDS; + return Skein1024_Init(&state->u.ctx1024,(size_t) hashbitlen); + } + } + 
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* process data to be hashed */ +HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen) + { + /* only the final Update() call is allowed do partial bytes, else assert an error */ + Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, FAIL); + + Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL); + if ((databitlen & 7) == 0) /* partial bytes? */ + { + switch ((state->statebits >> 8) & 3) + { + case 2: return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3); + case 1: return Skein_256_Update(&state->u.ctx_256,data,databitlen >> 3); + case 0: return Skein1024_Update(&state->u.ctx1024,data,databitlen >> 3); + default: return FAIL; + } + } + else + { /* handle partial final byte */ + size_t bCnt = (databitlen >> 3) + 1; /* number of bytes to handle (nonzero here!) */ + u08b_t b,mask; + + mask = (u08b_t) (1u << (7 - (databitlen & 7))); /* partial byte bit mask */ + b = (u08b_t) ((data[bCnt-1] & (0-mask)) | mask); /* apply bit padding on final byte */ + + switch ((state->statebits >> 8) & 3) + { + case 2: Skein_512_Update(&state->u.ctx_512,data,bCnt-1); /* process all but the final byte */ + Skein_512_Update(&state->u.ctx_512,&b , 1 ); /* process the (masked) partial byte */ + break; + case 1: Skein_256_Update(&state->u.ctx_256,data,bCnt-1); /* process all but the final byte */ + Skein_256_Update(&state->u.ctx_256,&b , 1 ); /* process the (masked) partial byte */ + break; + case 0: Skein1024_Update(&state->u.ctx1024,data,bCnt-1); /* process all but the final byte */ + Skein1024_Update(&state->u.ctx1024,&b , 1 ); /* process the (masked) partial byte */ + break; + default: return FAIL; + } + Skein_Set_Bit_Pad_Flag(state->u.h); /* set tweak flag for the final call */ + + return SUCCESS; + } + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize hash computation and output the 
result (hashbitlen bits) */
+HashReturn Final(hashState *state, BitSequence *hashval)
+    {
+    Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL);
+    switch ((state->statebits >> 8) & 3)    /* statebits 256 -> 1, 512 -> 2, 1024 -> 0 */
+        {
+        case 2:  return Skein_512_Final(&state->u.ctx_512,hashval);
+        case 1:  return Skein_256_Final(&state->u.ctx_256,hashval);
+        case 0:  return Skein1024_Final(&state->u.ctx1024,hashval);
+        default: return FAIL;
+        }
+    }
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* all-in-one hash function: Init + Update + Final on a local hashState; returns the status of the first step that fails, or SUCCESS once hashval has been written by Final */
+HashReturn Hash(int hashbitlen, const BitSequence *data, /* all-in-one call */
+                DataLength databitlen,BitSequence *hashval)
+    {
+    hashState  state;
+    HashReturn r = Init(&state,hashbitlen);
+    if (r == SUCCESS)
+        { /* these calls do not fail when called properly, but propagate any failure rather than hiding it */
+        r = Update(&state,data,databitlen);
+        if (r == SUCCESS) r = Final(&state,hashval);    /* skip Final after a failed Update; never discard Final's status */
+        }
+    return r;
+    }
Index: sys/contrib/skein/asm/skein_block_x64.asm
===================================================================
--- /dev/null
+++ sys/contrib/skein/asm/skein_block_x64.asm
@@ -0,0 +1,1335 @@
+;
+;----------------------------------------------------------------
+; 64-bit x86 assembler code (Microsoft ML64) for Skein block functions
+;
+; Author: Doug Whiting, Hifn
+;
+; This code is released to the public domain.
+;---------------------------------------------------------------- +; + .code +; +_MASK_ALL_ equ (256+512+1024) ;all three algorithm bits +_MAX_FRAME_ equ 240 +; +;;;;;;;;;;;;;;;;; +ifndef SKEIN_USE_ASM +_USE_ASM_ = _MASK_ALL_ +elseif SKEIN_USE_ASM and _MASK_ALL_ +_USE_ASM_ = SKEIN_USE_ASM +else +_USE_ASM_ = _MASK_ALL_ +endif +;;;;;;;;;;;;;;;;; +ifndef SKEIN_LOOP ;configure loop unrolling +_SKEIN_LOOP = 0 ;default is all fully unrolled +else +_SKEIN_LOOP = SKEIN_LOOP +endif +; the unroll counts (0 --> fully unrolled) +SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) mod 10 +SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) mod 10 +SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) mod 10 +; +SKEIN_ASM_UNROLL = 0 + irp _NN_,<256,512,1024> + if (SKEIN_UNROLL_&_NN_) eq 0 +SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + _NN_ + endif + endm +;;;;;;;;;;;;;;;;; +; +ifndef SKEIN_ROUNDS +ROUNDS_256 = 72 +ROUNDS_512 = 72 +ROUNDS_1024 = 80 +else +ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) mod 10) + 5) +ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) mod 10) + 5) +ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) mod 10) + 5) +endif +; +irp _NN_,<256,512,1024> + if _USE_ASM_ and _NN_ + irp _RR_,<%(ROUNDS_&_NN_)> + if _NN_ eq 1024 +%out +++ SKEIN_ROUNDS_&_NN_ = _RR_ + else +%out +++ SKEIN_ROUNDS_&_NN_ = _RR_ + endif + endm + endif +endm +;;;;;;;;;;;;;;;;; +; +ifndef SKEIN_CODE_SIZE +ifdef SKEIN_PERF +SKEIN_CODE_SIZE equ (1) +endif +endif +; +;;;;;;;;;;;;;;;;; +; +ifndef SKEIN_DEBUG +_SKEIN_DEBUG = 0 +else +_SKEIN_DEBUG = 1 +endif +;;;;;;;;;;;;;;;;; +; +; define offsets of fields in hash context structure +; +HASH_BITS = 0 ;# bits of hash output +BCNT = 8 + HASH_BITS ;number of bytes in BUFFER[] +TWEAK = 8 + BCNT ;tweak values[0..1] +X_VARS = 16 + TWEAK ;chaining vars +; +;(Note: buffer[] in context structure is NOT needed here :-) +; +r08 equ +r09 equ +; +KW_PARITY = 01BD11BDAA9FC1A22h ;overall parity of key schedule words +FIRST_MASK = NOT (1 SHL 62) +; +; rotation constants for Skein +; +RC_256_0_0 = 14 +RC_256_0_1 = 16 + +RC_256_1_0 = 
52 +RC_256_1_1 = 57 + +RC_256_2_0 = 23 +RC_256_2_1 = 40 + +RC_256_3_0 = 5 +RC_256_3_1 = 37 + +RC_256_4_0 = 25 +RC_256_4_1 = 33 + +RC_256_5_0 = 46 +RC_256_5_1 = 12 + +RC_256_6_0 = 58 +RC_256_6_1 = 22 + +RC_256_7_0 = 32 +RC_256_7_1 = 32 + +RC_512_0_0 = 46 +RC_512_0_1 = 36 +RC_512_0_2 = 19 +RC_512_0_3 = 37 + +RC_512_1_0 = 33 +RC_512_1_1 = 27 +RC_512_1_2 = 14 +RC_512_1_3 = 42 + +RC_512_2_0 = 17 +RC_512_2_1 = 49 +RC_512_2_2 = 36 +RC_512_2_3 = 39 + +RC_512_3_0 = 44 +RC_512_3_1 = 9 +RC_512_3_2 = 54 +RC_512_3_3 = 56 + +RC_512_4_0 = 39 +RC_512_4_1 = 30 +RC_512_4_2 = 34 +RC_512_4_3 = 24 + +RC_512_5_0 = 13 +RC_512_5_1 = 50 +RC_512_5_2 = 10 +RC_512_5_3 = 17 + +RC_512_6_0 = 25 +RC_512_6_1 = 29 +RC_512_6_2 = 39 +RC_512_6_3 = 43 + +RC_512_7_0 = 8 +RC_512_7_1 = 35 +RC_512_7_2 = 56 +RC_512_7_3 = 22 + +RC_1024_0_0 = 24 +RC_1024_0_1 = 13 +RC_1024_0_2 = 8 +RC_1024_0_3 = 47 +RC_1024_0_4 = 8 +RC_1024_0_5 = 17 +RC_1024_0_6 = 22 +RC_1024_0_7 = 37 + +RC_1024_1_0 = 38 +RC_1024_1_1 = 19 +RC_1024_1_2 = 10 +RC_1024_1_3 = 55 +RC_1024_1_4 = 49 +RC_1024_1_5 = 18 +RC_1024_1_6 = 23 +RC_1024_1_7 = 52 + +RC_1024_2_0 = 33 +RC_1024_2_1 = 4 +RC_1024_2_2 = 51 +RC_1024_2_3 = 13 +RC_1024_2_4 = 34 +RC_1024_2_5 = 41 +RC_1024_2_6 = 59 +RC_1024_2_7 = 17 + +RC_1024_3_0 = 5 +RC_1024_3_1 = 20 +RC_1024_3_2 = 48 +RC_1024_3_3 = 41 +RC_1024_3_4 = 47 +RC_1024_3_5 = 28 +RC_1024_3_6 = 16 +RC_1024_3_7 = 25 + +RC_1024_4_0 = 41 +RC_1024_4_1 = 9 +RC_1024_4_2 = 37 +RC_1024_4_3 = 31 +RC_1024_4_4 = 12 +RC_1024_4_5 = 47 +RC_1024_4_6 = 44 +RC_1024_4_7 = 30 + +RC_1024_5_0 = 16 +RC_1024_5_1 = 34 +RC_1024_5_2 = 56 +RC_1024_5_3 = 51 +RC_1024_5_4 = 4 +RC_1024_5_5 = 53 +RC_1024_5_6 = 42 +RC_1024_5_7 = 41 + +RC_1024_6_0 = 31 +RC_1024_6_1 = 44 +RC_1024_6_2 = 47 +RC_1024_6_3 = 46 +RC_1024_6_4 = 19 +RC_1024_6_5 = 42 +RC_1024_6_6 = 44 +RC_1024_6_7 = 25 + +RC_1024_7_0 = 9 +RC_1024_7_1 = 48 +RC_1024_7_2 = 35 +RC_1024_7_3 = 52 +RC_1024_7_4 = 23 +RC_1024_7_5 = 31 +RC_1024_7_6 = 37 +RC_1024_7_7 = 20 +; +; Input: reg +; Output: <<< 
RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024 +; +RotL64 macro reg,BLK_SIZE,ROUND_NUM,MIX_NUM +_RCNT_ = ( RC_&BLK_SIZE&_&ROUND_NUM&_&MIX_NUM AND 63 ) + if _RCNT_ ;is there anything to do? + rol reg,_RCNT_ + endif +endm +; +;---------------------------------------------------------------- +; +; MACROS: define local vars and configure stack +; +;---------------------------------------------------------------- +; declare allocated space on the stack +StackVar macro localName,localSize +localName = _STK_OFFS_ +_STK_OFFS_ = _STK_OFFS_+(localSize) +endm ;StackVar +; +;---------------------------------------------------------------- +; +; MACRO: Configure stack frame, allocate local vars +; +Setup_Stack macro BLK_BITS,KS_CNT,NO_FRAME,debugCnt + WCNT = (BLK_BITS)/64 +; +_PushCnt_ = 0 ;save nonvolatile regs on stack + irp _reg_, + push _reg_ + .pushreg _reg_ ;pseudo-op push for exception handling +_PushCnt_ = _PushCnt_ + 1 ;track count to keep alignment + endm +; +_STK_OFFS_ = 0 ;starting offset from rsp + ;---- local variables ;<-- rsp + StackVar X_stk ,8*(WCNT) ;local context vars + StackVar ksTwk ,8*3 ;key schedule: tweak words + StackVar ksKey ,8*(WCNT)+8 ;key schedule: key words + if (SKEIN_ASM_UNROLL and (BLK_BITS)) eq 0 + StackVar ksRot ,16*(KS_CNT+0);leave space for "rotation" to happen + endif + StackVar Wcopy ,8*(WCNT) ;copy of input block + if _SKEIN_DEBUG + ifnb ;temp location for debug X[] info + StackVar xDebug_&BLK_BITS ,8*(debugCnt) + endif + endif + if ((8*_PushCnt_ + _STK_OFFS_) and 8) eq 0 + StackVar align16,8 ;keep 16-byte aligned (adjust for retAddr?) 
+tmpStk_&BLK_BITS = align16 ;use this + endif +LOCAL_SIZE = _STK_OFFS_ ;size of local vars + ;---- + StackVar savRegs,8*_PushCnt_ ;saved registers + StackVar retAddr,8 ;return address + ;---- caller parameters + StackVar ctxPtr ,8 ;context ptr + StackVar blkPtr ,8 ;pointer to block data + StackVar blkCnt ,8 ;number of full blocks to process + StackVar bitAdd ,8 ;bit count to add to tweak + ;---- caller's stack frame +; +; set up the stack frame pointer (rbp) +; +FRAME_OFFS = ksTwk + 128 ;allow short (negative) offset to ksTwk, kwKey + if FRAME_OFFS gt _STK_OFFS_ ;keep rbp in the "locals" range +FRAME_OFFS = _STK_OFFS_ + endif + if FRAME_OFFS gt _MAX_FRAME_ ;keep Microsoft .setframe happy +FRAME_OFFS = _MAX_FRAME_ + endif +; +ifdef SKEIN_ASM_INFO + if FRAME_OFFS+128 lt savRegs +%out +++ SKEIN_&BLK_BITS: Unable to reach all of Wcopy with short offset from rbp. + elseif FRAME_OFFS+128 lt Wcopy +%out +++ SKEIN_&BLK_BITS: Unable to reach end of Wcopy with short offset from rbp. + elseif FRAME_OFFS+128 lt _STK_OFFS_ +%out +++ SKEIN_&BLK_BITS: Unable to reach caller parms with short offset from rbp + endif +endif + ;put some useful defines in the .lst file (for grep) +__STK_LCL_SIZE_&BLK_BITS = LOCAL_SIZE +__STK_TOT_SIZE_&BLK_BITS = _STK_OFFS_ +__STK_FRM_OFFS_&BLK_BITS = FRAME_OFFS +; +; Notes on stack frame setup: +; * the most frequently used variable is X_stk[], based at [rsp+0] +; * the next most used is the key schedule arrays, ksKey and ksTwk +; so rbp is "centered" there, allowing short offsets to the key +; schedule even in 1024-bit Skein case +; * the Wcopy variables are infrequently accessed, but they have long +; offsets from both rsp and rbp only in the 1024-bit case. 
+; * all other local vars and calling parameters can be accessed +; with short offsets, except in the 1024-bit case +; + sub rsp,LOCAL_SIZE ;make room for the locals + .allocstack LOCAL_SIZE ;pseudo op for exception handling + lea rbp,[rsp+FRAME_OFFS] ;maximize use of short offsets + ifb + .setframe rbp, FRAME_OFFS ;pseudo op for exception handling + endif + mov [FP_+ctxPtr],rcx ;save caller's parameters on the stack + mov [FP_+blkPtr],rdx + mov [FP_+blkCnt],r08 + mov [FP_+bitAdd],r09 + .endprolog ;pseudo op to support exception handling + + mov rdi,[FP_+ctxPtr ] ;rdi --> context +; +endm ;Setup_Stack +; +FP_ equ ;keep as many short offsets as possible +; +;---------------------------------------------------------------- +; +Reset_Stack macro procStart + add rsp,LOCAL_SIZE ;get rid of locals (wipe??) + irp _reg_, + pop _reg_ +_PushCnt_ = _PushCnt_ - 1 + endm + if _PushCnt_ + .err "Mismatched push/pops?" + endif + + ;display code size in bytes to stdout + irp _BCNT_,<%($+1-procStart)> ;account for return opcode +_ProcBytes_ = _BCNT_ +if _BCNT_ ge 10000 +%out procStart code size = _BCNT_ bytes +elseif _BCNT_ ge 1000 +%out procStart code size = _BCNT_ bytes +else +%out procStart code size = _BCNT_ bytes +endif + endm ;irp _BCNT_ +endm ; Reset_Stack +; +;---------------------------------------------------------------- +; macros to help debug internals +; +if _SKEIN_DEBUG + extrn Skein_Show_Block:proc ;calls to C routines + extrn Skein_Show_Round:proc +; +SKEIN_RND_SPECIAL = 1000 +SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0 +SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1 +SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2 +; +Skein_Debug_Block macro BLK_BITS +; +;void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X, +; const u08b_t *blkPtr, const u64b_t *wPtr, +; const u64b_t *ksPtr,const u64b_t *tsPtr); +; + irp _reg_, + push _reg_ ;save all volatile regs on tack before the call + endm + ; get and push call parameters + lea rax,[FP_+ksTwk] ;tweak pointer + 
push rax + lea rax,[FP_+ksKey] ;key pointer + push rax + lea rax,[FP_+Wcopy] ;wPtr + push rax + mov r09,[FP_+blkPtr] ;blkPtr + push r09 ;(push register parameters anyway to make room on stack) + mov rdx,[FP_+ctxPtr] + lea r08,[rdx+X_VARS] ;X (pointer) + push r08 + push rdx ;h (pointer) + mov rcx, BLK_BITS ;bits + push rdx + call Skein_Show_Block ;call external debug handler + add rsp,7*8 ;discard parameters on stack + irp _reg_, + pop _reg_ ;restore regs + endm +endm ; Skein_Debug_Block +; +; +; the macro to "call" to debug a round +; +Skein_Debug_Round macro BLK_BITS,R,RDI_OFFS,afterOp + ; call the appropriate (local) debug function + push r08 + if (SKEIN_ASM_UNROLL and BLK_BITS) or (R ge SKEIN_RND_SPECIAL) + mov r08, R + else ;compute round number using edi +_rOffs_ = RDI_OFFS + 0 + if BLK_BITS eq 1024 + mov r08,[rsp+8+rIdx_offs] ;get rIdx off the stack (adjust for push r08) + lea r08,[4*r08+1+(((R)-1) and 3)+_rOffs_] + else + lea r08,[4*rdi+1+(((R)-1) and 3)+_rOffs_] + endif + endif + call Skein_Debug_Round_&BLK_BITS + pop r08 +; + afterOp +endm ; Skein_Debug_Round +else ;------- _SKEIN_DEBUG (dummy macros if debug not enabled) +Skein_Debug_Block macro BLK_BITS,afterOp +endm +; +Skein_Debug_Round macro BLK_BITS,R,RDI_OFFS,afterOp +endm +; +endif ; _SKEIN_DEBUG +; +;---------------------------------------------------------------- +; +addReg macro dstReg,srcReg_A,srcReg_B,useAddOp,immOffs + ifnb + lea dstReg,[srcReg_A&&srcReg_B + dstReg + immOffs] + elseif ((useAddOp + 0) eq 0) + ifndef ASM_NO_LEA + ;lea seems to be faster on Core 2 Duo CPUs! 
+ lea dstReg,[srcReg_A&&srcReg_B + dstReg] + else + add dstReg, srcReg_A&&srcReg_B + endif + else + add dstReg, srcReg_A&&srcReg_B + endif +endm +; +;=================================== Skein_256 ============================================= +; +if _USE_ASM_ and 256 + public Skein_256_Process_Block +; +; void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd); +; +;;;;;;;;;;;;;;;;; +; +; code +; +Skein_256_Process_Block proc frame + Setup_Stack 256,((ROUNDS_256/8)+1) + mov r14,[rdi+TWEAK+8] + jmp short Skein_256_block_loop + align 16 + ; main hash loop for Skein_256 +Skein_256_block_loop: + ; + ; general register usage: + ; RAX..RDX = X0..X3 + ; R08..R12 = ks[0..4] + ; R13..R15 = ts[0..2] + ; RSP, RBP = stack/frame pointers + ; RDI = round counter or context pointer + ; RSI = temp + ; + mov r13,[rdi+TWEAK+0] + add r13,[FP_+bitAdd] ;computed updated tweak value T0 + mov r15,r14 + xor r15,r13 ;now r13.r15 is set as the tweak + + mov r12,KW_PARITY + mov r08,[rdi+X_VARS+ 0] + mov r09,[rdi+X_VARS+ 8] + mov r10,[rdi+X_VARS+16] + mov r11,[rdi+X_VARS+24] + mov [rdi+TWEAK+0],r13 ;save updated tweak value ctx->h.T[0] + xor r12,r08 ;start accumulating overall parity + + mov rsi,[FP_+blkPtr ] ;esi --> input block + xor r12,r09 + mov rax,[rsi+ 0] ;get X[0..3] + xor r12,r10 + mov rbx,[rsi+ 8] + xor r12,r11 + mov rcx,[rsi+16] + mov rdx,[rsi+24] + + mov [FP_+Wcopy+ 0],rax ;save copy of input block + mov [FP_+Wcopy+ 8],rbx + mov [FP_+Wcopy+16],rcx + mov [FP_+Wcopy+24],rdx + + add rax, r08 ;initial key injection + add rbx, r09 + add rcx, r10 + add rdx, r11 + add rbx, r13 + add rcx, r14 + +if _SKEIN_DEBUG + mov [rdi+TWEAK+ 8],r14 ;save updated tweak T[1] (start bit cleared?) 
+ mov [FP_+ksKey+ 0],r08 ;save key schedule on stack for Skein_Debug_Block + mov [FP_+ksKey+ 8],r09 + mov [FP_+ksKey+16],r10 + mov [FP_+ksKey+24],r11 + mov [FP_+ksKey+32],r12 + + mov [FP_+ksTwk+ 0],r13 + mov [FP_+ksTwk+ 8],r14 + mov [FP_+ksTwk+16],r15 + + mov [rsp+X_stk + 0],rax ;save X[] on stack for Skein_Debug_Block + mov [rsp+X_stk + 8],rbx + mov [rsp+X_stk +16],rcx + mov [rsp+X_stk +24],rdx + + Skein_Debug_Block 256 ;debug dump + Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL +endif +; +if ((SKEIN_ASM_UNROLL and 256) eq 0) + mov [FP_+ksKey+40],r08 ;save key schedule on stack for looping code + mov [FP_+ksKey+ 8],r09 + mov [FP_+ksKey+16],r10 + mov [FP_+ksKey+24],r11 + mov [FP_+ksKey+32],r12 + + mov [FP_+ksTwk+24],r13 + mov [FP_+ksTwk+ 8],r14 + mov [FP_+ksTwk+16],r15 +endif + add rsi, WCNT*8 ;skip the block + mov [FP_+blkPtr ],rsi ;update block pointer +; +opLoop macro op1,op2 + if (SKEIN_ASM_UNROLL and 256) eq 0 + op1 + else + op2 + endif +endm +; + ; + ; now the key schedule is computed. 
Start the rounds + ; +if SKEIN_ASM_UNROLL and 256 +_UNROLL_CNT = ROUNDS_256/8 +else +_UNROLL_CNT = SKEIN_UNROLL_256 + if ((ROUNDS_256/8) mod _UNROLL_CNT) + .err "Invalid SKEIN_UNROLL_256" + endif + xor rdi,rdi ;rdi = iteration count +Skein_256_round_loop: +endif +_Rbase_ = 0 +rept _UNROLL_CNT*2 + ; all X and ks vars in regs ; (ops to "rotate" ks vars, via mem, if not unrolled) + ; round 4*_RBase_ + 0 + addReg rax, rbx + RotL64 rbx, 256,%((4*_RBase_+0) and 7),0 + addReg rcx, rdx + opLoop + xor rbx, rax + RotL64 rdx, 256,%((4*_RBase_+0) and 7),1 + xor rdx, rcx + if SKEIN_ASM_UNROLL and 256 + irp _r0_,<%(08+(_Rbase_+3) mod 5)> + irp _r1_,<%(13+(_Rbase_+2) mod 3)> + lea rdi,[r&_r0_+r&_r1_] ;precompute key injection value for rcx + endm + endm + endif + opLoop + Skein_Debug_Round 256,%(4*_RBase_+1) + + ; round 4*_RBase_ + 1 + addReg rax, rdx + RotL64 rdx, 256,%((4*_RBase_+1) and 7),0 + xor rdx, rax + opLoop + addReg rcx, rbx + RotL64 rbx, 256,%((4*_RBase_+1) and 7),1 + xor rbx, rcx + opLoop + Skein_Debug_Round 256,%(4*_RBase_+2) + if SKEIN_ASM_UNROLL and 256 + irp _r0_,<%(08+(_Rbase_+2) mod 5)> + irp _r1_,<%(13+(_Rbase_+1) mod 3)> + lea rsi,[r&_r0_+r&_r1_] ;precompute key injection value for rbx + endm + endm + endif + ; round 4*_RBase_ + 2 + addReg rax, rbx + RotL64 rbx, 256,%((4*_RBase_+2) and 7),0 + addReg rcx, rdx + opLoop + xor rbx, rax + RotL64 rdx, 256,%((4*_RBase_+2) and 7),1 + xor rdx, rcx + opLoop ;"rotate" the key + opLoop ;precompute key + tweak + Skein_Debug_Round 256,%(4*_RBase_+3) + ; round 4*_RBase_ + 3 + addReg rax, rdx + RotL64 rdx, 256,%((4*_RBase_+3) and 7),0 + addReg rcx, rbx + opLoop ;precompute key + tweak + opLoop ;"rotate" the tweak + xor rdx, rax + RotL64 rbx, 256,%((4*_RBase_+3) and 7),1 + xor rbx, rcx + Skein_Debug_Round 256,%(4*_RBase_+4) + opLoop ;precompute key+tweak + ;inject key schedule words +_Rbase_ = _Rbase_+1 + if SKEIN_ASM_UNROLL and 256 + addReg rax,r,%(08+((_Rbase_+0) mod 5)) + addReg rbx,rsi + addReg rcx,rdi + addReg 
rdx,r,%(08+((_Rbase_+3) mod 5)),,_Rbase_ + else + inc rdi + addReg rax,r08 + addReg rcx,r10 + addReg rbx,r09 + addReg rdx,r11 + endif + Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT +endm ;rept _UNROLL_CNT + +; +if (SKEIN_ASM_UNROLL and 256) eq 0 + cmp rdi,2*(ROUNDS_256/8) + jb Skein_256_round_loop +endif ; (SKEIN_ASM_UNROLL and 256) eq 0 + mov rdi,[FP_+ctxPtr ] ;restore edi --> context + + ;---------------------------- + ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3} + xor rax,[FP_+Wcopy + 0] + mov r14,FIRST_MASK + xor rbx,[FP_+Wcopy + 8] + xor rcx,[FP_+Wcopy +16] + xor rdx,[FP_+Wcopy +24] + mov [rdi+X_VARS+ 0],rax ;store final result + and r14,[rdi+TWEAK + 8] + dec qword ptr [FP_+blkCnt] ;set zero flag + mov [rdi+X_VARS+ 8],rbx + mov [rdi+X_VARS+16],rcx + mov [rdi+X_VARS+24],rdx + + Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,, + + ; go back for more blocks, if needed + jnz Skein_256_block_loop + mov [rdi+TWEAK + 8],r14 + Reset_Stack Skein_256_Process_Block + ret + + if _SKEIN_DEBUG +Skein_Debug_Round_256: + mov [FP_+X_stk+ 0],rax ;first, save X[] state on stack so debug routines can access it + mov [FP_+X_stk+ 8],rbx ;(use FP_ since rsp has changed!) 
+ mov [FP_+X_stk+16],rcx + mov [FP_+X_stk+24],rdx + push rdx ;save two regs for BLK_BITS-specific parms + push rcx + mov rdx,[FP_+ctxPtr] ;ctx_hdr_ptr + mov rcx, 256 + jmp Skein_Debug_Round_Common + endif + +Skein_256_Process_Block endp +; +ifdef SKEIN_CODE_SIZE + public Skein_256_Process_Block_CodeSize +Skein_256_Process_Block_CodeSize proc + mov rax,_ProcBytes_ + ret +Skein_256_Process_Block_CodeSize endp +; + public Skein_256_Unroll_Cnt +Skein_256_Unroll_Cnt proc + if _UNROLL_CNT ne ROUNDS_256/8 + mov rax,_UNROLL_CNT + else + xor rax,rax + endif + ret +Skein_256_Unroll_Cnt endp +endif +; +endif ;_USE_ASM_ and 256 +; +;=================================== Skein_512 ============================================= +; +if _USE_ASM_ and 512 + public Skein_512_Process_Block +; +; void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd); +; +rX_512_0 equ r08 ;register assignments for X[] values during rounds +rX_512_1 equ r09 +rX_512_2 equ r10 +rX_512_3 equ r11 +rX_512_4 equ r12 +rX_512_5 equ r13 +rX_512_6 equ r14 +rX_512_7 equ r15 +; +;;;;;;;;;;;;;;;;; +; MACRO: one round for 512-bit blocks +; +R_512_OneRound macro r0,r1,r2,r3,r4,r5,r6,r7,_Rn_,op1,op2,op3,op4 +; + addReg rX_512_&r0, rX_512_&r1 + RotL64 rX_512_&r1, 512,%((_Rn_) and 7),0 + xor rX_512_&r1, rX_512_&r0 + op1 + addReg rX_512_&r2, rX_512_&r3 + RotL64 rX_512_&r3, 512,%((_Rn_) and 7),1 + xor rX_512_&r3, rX_512_&r2 + op2 + addReg rX_512_&r4, rX_512_&r5 + RotL64 rX_512_&r5, 512,%((_Rn_) and 7),2 + xor rX_512_&r5, rX_512_&r4 + op3 + addReg rX_512_&r6, rX_512_&r7 + RotL64 rX_512_&r7, 512,%((_Rn_) and 7),3 + xor rX_512_&r7, rX_512_&r6 + op4 + Skein_Debug_Round 512,%(_Rn_+1),-4 +; +endm ;R_512_OneRound +; +;;;;;;;;;;;;;;;;; +; MACRO: eight rounds for 512-bit blocks +; +R_512_FourRounds macro _RR_ ;RR = base round number (0 mod 8) + if SKEIN_ASM_UNROLL and 512 + ; here for fully unrolled case. 
+ _II_ = ((_RR_)/4) + 1 ;key injection counter + R_512_OneRound 0,1,2,3,4,5,6,7,%((_RR_)+0),,, + R_512_OneRound 2,1,4,7,6,5,0,3,%((_RR_)+1),,, + R_512_OneRound 4,1,6,3,0,5,2,7,%((_RR_)+2),,, + R_512_OneRound 6,1,0,7,2,5,4,3,%((_RR_)+3),, + ; inject the key schedule + add r08,[FP_+ksKey+8*(((_II_)+0) mod 9)] + addReg r11,rax + add r09,[FP_+ksKey+8*(((_II_)+1) mod 9)] + addReg r12,rbx + add r10,[FP_+ksKey+8*(((_II_)+2) mod 9)] + addReg r13,rcx + addReg r14,rdx + addReg r15,rsi,,,(_II_) + else + ; here for looping case ;"rotate" key/tweak schedule (move up on stack) + inc rdi ;bump key injection counter + R_512_OneRound 0,1,2,3,4,5,6,7,%((_RR_)+0),, , + R_512_OneRound 2,1,4,7,6,5,0,3,%((_RR_)+1),,, + R_512_OneRound 4,1,6,3,0,5,2,7,%((_RR_)+2),, , + R_512_OneRound 6,1,0,7,2,5,4,3,%((_RR_)+3),, + ; inject the key schedule + add r08,[FP_+ksKey+8*rdi+8*0] + addReg r11,rax + addReg r12,rbx + add r09,[FP_+ksKey+8*rdi+8*1] + addReg r13,rcx + addReg r14,rdx + add r10,[FP_+ksKey+8*rdi+8*2] + addReg r15,rsi + addReg r15,rdi ;inject the round number + endif + ;show the result of the key injection + Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT +endm ;R_512_EightRounds +; +;;;;;;;;;;;;;;;;; +; instantiated code +; +Skein_512_Process_Block proc frame + Setup_Stack 512,ROUNDS_512/8 + mov rbx,[rdi+TWEAK+ 8] + jmp short Skein_512_block_loop + align 16 + ; main hash loop for Skein_512 +Skein_512_block_loop: + ; general register usage: + ; RAX..RDX = temps for key schedule pre-loads + ; R08..R15 = X0..X7 + ; RSP, RBP = stack/frame pointers + ; RDI = round counter or context pointer + ; RSI = temp + ; + mov rax,[rdi+TWEAK+ 0] + add rax,[FP_+bitAdd] ;computed updated tweak value T0 + mov rcx,rbx + xor rcx,rax ;rax/rbx/rcx = tweak schedule + mov [rdi+TWEAK+ 0],rax ;save updated tweak value ctx->h.T[0] + mov [FP_+ksTwk+ 0],rax + mov rdx,KW_PARITY + mov rsi,[FP_+blkPtr ] ;rsi --> input block + mov [FP_+ksTwk+ 8],rbx + mov [FP_+ksTwk+16],rcx + + irp _Rn_,<0,1,2,3,4,5,6,7> + mov 
rX_512_&_Rn_,[rdi+X_VARS+8*(_Rn_)] + xor rdx,rX_512_&_Rn_ ;compute overall parity + mov [FP_+ksKey+8*(_Rn_)],rX_512_&_Rn_ + endm ;load state into r08..r15, compute parity + mov [FP_+ksKey+8*(8)],rdx ;save key schedule parity + + addReg rX_512_5,rax ;precompute key injection for tweak + addReg rX_512_6,rbx +if _SKEIN_DEBUG + mov [rdi+TWEAK+ 8],rbx ;save updated tweak value ctx->h.T[1] for Skein_Debug_Block below +endif + mov rax,[rsi+ 0] ;load input block + mov rbx,[rsi+ 8] + mov rcx,[rsi+16] + mov rdx,[rsi+24] + addReg r08,rax ;do initial key injection + addReg r09,rbx + mov [FP_+Wcopy+ 0],rax ;keep local copy for feedforward + mov [FP_+Wcopy+ 8],rbx + addReg r10,rcx + addReg r11,rdx + mov [FP_+Wcopy+16],rcx + mov [FP_+Wcopy+24],rdx + + mov rax,[rsi+32] + mov rbx,[rsi+40] + mov rcx,[rsi+48] + mov rdx,[rsi+56] + addReg r12,rax + addReg r13,rbx + addReg r14,rcx + addReg r15,rdx + mov [FP_+Wcopy+32],rax + mov [FP_+Wcopy+40],rbx + mov [FP_+Wcopy+48],rcx + mov [FP_+Wcopy+56],rdx + +if _SKEIN_DEBUG + irp _Rn_,<0,1,2,3,4,5,6,7> ;save values on stack for debug output + mov [rsp+X_stk+8*(_Rn_)],rX_512_&_Rn_ + endm + + Skein_Debug_Block 512 ;debug dump + Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL +endif + add rsi, 8*WCNT ;skip the block + mov [FP_+blkPtr ],rsi ;update block pointer + ; + ;;;;;;;;;;;;;;;;; + ; now the key schedule is computed. 
Start the rounds + ; +if SKEIN_ASM_UNROLL and 512 +_UNROLL_CNT = ROUNDS_512/8 +else +_UNROLL_CNT = SKEIN_UNROLL_512 + if ((ROUNDS_512/8) mod _UNROLL_CNT) + .err "Invalid SKEIN_UNROLL_512" + endif + xor rdi,rdi ;rdi = round counter +Skein_512_round_loop: +endif +; +_Rbase_ = 0 +rept _UNROLL_CNT*2 + R_512_FourRounds %(4*_Rbase_+00) +_Rbase_ = _Rbase_+1 +endm ;rept _UNROLL_CNT +; +if (SKEIN_ASM_UNROLL and 512) eq 0 + cmp rdi,2*(ROUNDS_512/8) + jb Skein_512_round_loop + mov rdi,[FP_+ctxPtr ] ;restore rdi --> context +endif + ; end of rounds + ;;;;;;;;;;;;;;;;; + ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7} + irp _Rn_,<0,1,2,3,4,5,6,7> + if (_Rn_ eq 0) + mov rbx,FIRST_MASK + endif + xor rX_512_&_Rn_,[FP_+Wcopy+8*(_Rn_)] ;feedforward XOR + mov [rdi+X_VARS+8*(_Rn_)],rX_512_&_Rn_ ;and store result + if (_Rn_ eq 6) + and rbx,[rdi+TWEAK+ 8] + endif + endm + Skein_Debug_Round 512,SKEIN_RND_FEED_FWD + + ; go back for more blocks, if needed + dec qword ptr [FP_+blkCnt] + jnz Skein_512_block_loop + mov [rdi+TWEAK + 8],rbx + + Reset_Stack Skein_512_Process_Block + ret +; + if _SKEIN_DEBUG +; call here with r08 = "round number" +Skein_Debug_Round_512: + push rdx ;save two regs for BLK_BITS-specific parms + push rcx + mov rcx,[rsp+24] ;get back original r08 (pushed on stack in macro call) + mov [FP_+X_stk],rcx ;and save it in X_stk + irp _Rn_,<1,2,3,4,5,6,7> ;save rest of X[] state on stack so debug routines can access it + mov [FP_+X_stk+8*(_Rn_)],rX_512_&_Rn_ + endm + mov rdx,[FP_+ctxPtr] ;ctx_hdr_ptr + mov rcx, 512 ;block size + jmp Skein_Debug_Round_Common + endif +; +Skein_512_Process_Block endp +; +ifdef SKEIN_CODE_SIZE + public Skein_512_Process_Block_CodeSize +Skein_512_Process_Block_CodeSize proc + mov rax,_ProcBytes_ + ret +Skein_512_Process_Block_CodeSize endp +; + public Skein_512_Unroll_Cnt +Skein_512_Unroll_Cnt proc + if _UNROLL_CNT ne ROUNDS_512/8 + mov rax,_UNROLL_CNT + else + xor rax,rax + endif + ret +Skein_512_Unroll_Cnt endp +endif +; +endif ; _USE_ASM_ and 
512 +; +;=================================== Skein1024 ============================================= +if _USE_ASM_ and 1024 + public Skein1024_Process_Block +; +; void Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd); +; +;;;;;;;;;;;;;;;;; +; use details of permutation to make register assignments +; +r1K_x0 equ rdi +r1K_x1 equ rsi +r1K_x2 equ rbp +r1K_x3 equ rax +r1K_x4 equ rcx ;"shared" with X6, since X4/X6 alternate +r1K_x5 equ rbx +r1K_x6 equ rcx +r1K_x7 equ rdx +r1K_x8 equ r08 +r1K_x9 equ r09 +r1K_xA equ r10 +r1K_xB equ r11 +r1K_xC equ r12 +r1K_xD equ r13 +r1K_xE equ r14 +r1K_xF equ r15 +; +rIdx equ r1K_x0 ;index register for looping versions +rIdx_offs equ tmpStk_1024 +; +R1024_Mix macro w0,w1,_RN0_,_Rn1_,op1 +_w0 = 0&w0&h ;handle the hex conversion +_w1 = 0&w1&h +_II_ = ((_RN0_)/4)+1 ;injection count + ; + addReg r1K_x&w0 , r1K_x&w1 ;perform the MIX + RotL64 r1K_x&w1 , 1024,%((_RN0_) and 7),_Rn1_ + xor r1K_x&w1 , r1K_x&w0 + if ((_RN0_) and 3) eq 3 ;time to do key injection? 
+ if _SKEIN_DEBUG + mov [rsp+xDebug_1024+8*_w0],r1K_x&w0 ;save intermediate values for Debug_Round + mov [rsp+xDebug_1024+8*_w1],r1K_x&w1 ; (before inline key injection) + endif + if SKEIN_ASM_UNROLL and 1024 ;here to do fully unrolled key injection + add r1K_x&w0, [rsp+ksKey+ 8*((_II_+_w0) mod 17)] + add r1K_x&w1, [rsp+ksKey+ 8*((_II_+_w1) mod 17)] + if _w1 eq 13 ;tweak injection + add r1K_x&w1, [rsp+ksTwk+ 8*((_II_+0 ) mod 3)] + elseif _w0 eq 14 + add r1K_x&w0, [rsp+ksTwk+ 8*((_II_+1 ) mod 3)] + elseif _w1 eq 15 + add r1K_x&w1, _II_ ;(injection counter) + endif + else ;here to do looping key injection + if (_w0 eq 0) + mov [rsp+X_stk+8*_w0],r1K_x0 ;if so, store N0 so we can use reg as index + mov rIdx, [rsp+rIdx_offs] ;get the injection counter index into rIdx (N0) + else + add r1K_x&w0, [rsp+ksKey+8+8*rIdx+8*_w0] ;even key injection + endif + if _w1 eq 13 ;tweak injection + add r1K_x&w1, [rsp+ksTwk+8+8*rIdx+8*0 ] + elseif _w0 eq 14 + add r1K_x&w0, [rsp+ksTwk+8+8*rIdx+8*1 ] + elseif _w1 eq 15 + addReg r1K_x&w1, rIdx,,,1 ;(injection counter) + endif + add r1K_x&w1, [rsp+ksKey+8+8*rIdx+8*_w1] ;odd key injection + endif + endif + ; insert the op provided, if any + op1 +endm +;;;;;;;;;;;;;;;;; +; MACRO: one round for 1024-bit blocks +; +R1024_OneRound macro x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF,_Rn_ + if (x0 ne 0) or ((x4 ne 4) and (x4 ne 6)) or (x4 ne (x6 xor 2)) + .err "faulty register assignment!" 
+ endif + R1024_Mix x0,x1,_Rn_,0 + R1024_Mix x2,x3,_Rn_,1 + R1024_Mix x4,x5,_Rn_,2, ;save x4 on stack (x4/x6 alternate) + R1024_Mix x8,x9,_Rn_,4, ;load x6 from stack + R1024_Mix xA,xB,_Rn_,5 + R1024_Mix xC,xD,_Rn_,6 + R1024_Mix x6,x7,_Rn_,3 + R1024_Mix xE,xF,_Rn_,7 + if _SKEIN_DEBUG + Skein_Debug_Round 1024,%(_Rn_+1) + endif +endm ;R1024_OneRound +;;;;;;;;;;;;;;;;; +; MACRO: four rounds for 1024-bit blocks +; +R1024_FourRounds macro _RR_ ;RR = base round number (0 mod 4) + ; should be here with r1K_x4 set properly, x6 stored on stack + R1024_OneRound 0,1,2,3,4,5,6,7,8,9,A,B,C,D,E,F,%((_RR_)+0) + R1024_OneRound 0,9,2,D,6,B,4,F,A,7,C,3,E,5,8,1,%((_RR_)+1) + R1024_Oneround 0,7,2,5,4,3,6,1,C,F,E,D,8,B,A,9,%((_RR_)+2) + R1024_Oneround 0,F,2,B,6,D,4,9,E,1,8,5,A,3,C,7,%((_RR_)+3) + if (SKEIN_ASM_UNROLL and 1024) eq 0 ;here with r1K_x0 == rIdx, X0 on stack + ;rotate the key schedule on the stack + mov [rsp+X_stk+ 8* 8],r1K_x8;free up a reg + mov r1K_x8,[rsp+ksKey+8*rIdx+8* 0] ;get key + mov [rsp+ksKey+8*rIdx+8*17],r1K_x8 ;rotate it (must do key first or tweak clobbers it!) 
+ mov r1K_x8,[rsp+ksTwk+8*rIdx+8* 0] ;get tweak + mov [rsp+ksTwk+8*rIdx+8* 3],r1K_x8 ;rotate it + mov r1K_x8,[rsp+X_stk+ 8* 8] ;get the reg back + inc rIdx ;bump the index + mov [rsp+rIdx_offs],rIdx ;save it + mov r1K_x0,[rsp+ksKey+8*rIdx] ;get the key schedule word for X0 + add r1K_x0,[rsp+X_stk+8*0] ;perform the X0 key injection + endif + ;show the result of the key injection + Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT +endm ;R1024_FourRounds +; +;;;;;;;;;;;;;;;; +; code +; +Skein1024_Process_Block proc frame +; + Setup_Stack 1024,ROUNDS_1024/8,NO_FRAME, + mov r09,[rdi+TWEAK+ 8] + jmp short Skein1024_block_loop + align 16 + ; main hash loop for Skein1024 +Skein1024_block_loop: + ; general register usage: + ; RSP = stack pointer + ; RAX..RDX,RSI,RDI= X1, X3..X7 (state words) + ; R08..R15 = X8..X15 (state words) + ; RBP = temp (used for X0 and X2) + ; + if (SKEIN_ASM_UNROLL and 1024) eq 0 + xor rax,rax ;init loop index on the stack + mov [rsp+rIdx_offs],rax + endif + mov r08,[rdi+TWEAK+ 0] + add r08,[FP_+bitAdd] ;computed updated tweak value T0 + mov r10,r09 + xor r10,r08 ;rax/rbx/rcx = tweak schedule + mov [rdi+TWEAK+ 0],r08 ;save updated tweak value ctx->h.T[0] + mov [FP_+ksTwk+ 0],r08 + mov [FP_+ksTwk+ 8],r09 ;keep values in r08,r09 for initial tweak injection below + mov [FP_+ksTwk+16],r10 + if _SKEIN_DEBUG + mov [rdi+TWEAK+ 8],r09 ;save updated tweak value ctx->h.T[1] for Skein_Debug_Block + endif + mov rsi ,[FP_+blkPtr ] ;r1K_x2 --> input block + mov rax , KW_PARITY ;overall key schedule parity + + ; logic here assumes the set {rdi,rsi,rbp,rax} = r1K_x{0,1,2,3} + + irp _rN_,<0,1,2,3,4,6> ;process the "initial" words, using r14,r15 as temps + mov r14,[rdi+X_VARS+8*_rN_] ;get state word + mov r15,[rsi+ 8*_rN_] ;get msg word + xor rax,r14 ;update key schedule parity + mov [FP_+ksKey +8*_rN_],r14 ;save key schedule word on stack + mov [FP_+Wcopy +8*_rN_],r15 ;save local msg Wcopy + add r14,r15 ;do the initial key injection + mov [rsp+X_stk +8*_rN_],r14 ;save 
initial state var on stack + endm + ; now process the rest, using the "real" registers + ; (MUST do it in reverse order to inject tweaks r08/r09 first) + irp _rN_, +_rr_ = 0&_rN_&h + mov r1K_x&_rN_,[rdi+X_VARS+8*_rr_] ;get key schedule word from context + mov r1K_x4 ,[rsi+ 8*_rr_] ;get next input msg word + mov [rsp+ksKey +8*_rr_],r1K_x&_rN_ ;save key schedule on stack + xor rax , r1K_x&_rN_ ;accumulate key schedule parity + mov [FP_+Wcopy +8*_rr_],r1K_x4 ;save copy of msg word for feedforward + add r1K_x&_rN_, r1K_x4 ;do the initial key injection + if _rr_ eq 13 ;do the initial tweak injection + addReg r1K_x&_rN_,r08 ; (only in words 13/14) + elseif _rr_ eq 14 + addReg r1K_x&_rN_,r09 + endif + endm + mov [FP_+ksKey+8*WCNT],rax ;save key schedule parity +if _SKEIN_DEBUG + Skein_Debug_Block 1024 ;debug dump +endif + addReg rsi,8*WCNT ;bump the msg ptr + mov [FP_+blkPtr],rsi ;save bumped msg ptr + ; re-load words 0..4 [rbp,rsi,rdi,rax,rbx] from stack, enter the main loop + irp _rN_,<0,1,2,3,4> ;(no need to re-load x6) + mov r1K_x&_rN_,[rsp+X_stk+8*_rN_] ;re-load state and get ready to go! + endm +if _SKEIN_DEBUG + Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL ;show state after initial key injection +endif + ; + ;;;;;;;;;;;;;;;;; + ; now the key schedule is computed. Start the rounds + ; +if SKEIN_ASM_UNROLL and 1024 +_UNROLL_CNT = ROUNDS_1024/8 +else +_UNROLL_CNT = SKEIN_UNROLL_1024 + if ((ROUNDS_1024/8) mod _UNROLL_CNT) + .err "Invalid SKEIN_UNROLL1024" + endif +Skein1024_round_loop: +endif +; +_Rbase_ = 0 +rept _UNROLL_CNT*2 ;implement the rounds, 4 at a time + R1024_FourRounds %(4*_Rbase_+00) +_Rbase_ = _Rbase_+1 +endm ;rept _UNROLL_CNT +; +if (SKEIN_ASM_UNROLL and 1024) eq 0 + cmp qword ptr [rsp+tmpStk_1024],2*(ROUNDS_1024/8) ;see if we are done + jb Skein1024_round_loop +endif + ; end of rounds + ;;;;;;;;;;;;;;;;; + ; + ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15} + mov [rsp+X_stk+8*7],r1K_x7 ;we need a register. 
x6 already on stack + mov r1K_x7,[rsp+ctxPtr] + + irp _rN_,<0,1,2,3,4,5,8,9,A,B,C,D,E,F> ;do all but x6,x7 + xor r1K_x&_rN_,[rsp +Wcopy +8*(0&_rN_&h)] ;feedforward XOR + mov [r1K_x7+X_VARS+8*(0&_rN_&h)],r1K_x&_rN_ ;save result into context + if (0&_rN_&h eq 9) + mov r09,FIRST_MASK + endif + if (0&_rN_&h eq 0eh) + and r09,[r1K_x7+TWEAK+ 8] + endif + endm + ; + mov rax,[rsp+X_stk +8*6] ;now process x6,x7 + mov rbx,[rsp+X_stk +8*7] + xor rax,[rsp+Wcopy +8*6] + xor rbx,[rsp+Wcopy +8*7] + mov [r1K_x7+X_VARS+8*6],rax + dec qword ptr [rsp+blkCnt] ;set zero flag iff done + mov [r1K_x7+X_VARS+8*7],rbx + + Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,, + ; go back for more blocks, if needed + mov rdi,[rsp+ctxPtr] ;don't muck with the flags here! + lea rbp,[rsp+FRAME_OFFS] + jnz Skein1024_block_loop + mov [r1K_x7+TWEAK+ 8],r09 + Reset_Stack Skein1024_Process_Block + ret +; +if _SKEIN_DEBUG +; call here with r08 = "round number" +Skein_Debug_Round_1024: +_SP_OFFS_ = 8*2 ;stack "offset" here: r08, return addr + SP_ equ ;useful shorthand below +; + irp _wN_,<1,2,3,5,7,9,A,B,C,D,E,F> ;save rest of X[] state on stack so debug routines can access it + mov [SP_+X_stk+8*(0&_wN_&h)],r1K_x&_wN_ + endm + ;figure out what to do with x0. On rounds R where R==0 mod 4, it's already on the stack + cmp r08,SKEIN_RND_SPECIAL ;special rounds always save + jae save_x0 + test r08,3 + jz save_x0_not +save_x0: + mov [SP_+X_stk+8*0],r1K_x0 +save_x0_not: + ;figure out the x4/x6 swapping state and save the correct one! 
+ cmp r08,SKEIN_RND_SPECIAL ;special rounds always do x4 + jae save_x4 + test r08,1 ;and even ones have r4 as well + jz save_x4 + mov [SP_+X_stk+8*6],r1K_x6 + jmp short debug_1024_go +save_x4: + mov [SP_+X_stk+8*4],r1K_x4 +debug_1024_go: + ;now all is saved in Xstk[] except for X8 + push rdx ;save two regs for BLK_BITS-specific parms + push rcx +_SP_OFFS_ = _SP_OFFS_ + 16 ;adjust stack offset accordingly + ; now stack offset is 32 to X_stk + mov rcx,[SP_ - 8] ;get back original r08 (pushed on stack in macro call) + mov [SP_+X_stk+8*8],rcx ;and save it in its rightful place in X_stk[8] + mov rdx,[SP_+ctxPtr] ;ctx_hdr_ptr + mov rcx, 1024 ;block size + jmp Skein_Debug_Round_Common +endif +; +Skein1024_Process_Block endp +; +ifdef SKEIN_CODE_SIZE + public Skein1024_Process_Block_CodeSize +Skein1024_Process_Block_CodeSize proc + mov rax,_ProcBytes_ + ret +Skein1024_Process_Block_CodeSize endp +; + public Skein1024_Unroll_Cnt +Skein1024_Unroll_Cnt proc + if _UNROLL_CNT ne ROUNDS_1024/8 + mov rax,_UNROLL_CNT + else + xor rax,rax + endif + ret +Skein1024_Unroll_Cnt endp +endif +; +endif ; _USE_ASM_ and 1024 +; +if _SKEIN_DEBUG +;---------------------------------------------------------------- +;local debug routine to set up for calls to: +; void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X); +; +; here with r08 = round number +; rdx = ctx_hdr_ptr +; rcx = block size (256/512/1024) +; +Skein_Debug_Round_Common: +_SP_OFFS_ = 32 ;current stack "offset": r08, retAddr, rcx, rdx + irp _rr_, ;save the rest of the regs + push _rr_ +_SP_OFFS_ = _SP_OFFS_+8 + endm + if (_SP_OFFS_ and 0Fh) ; make sure stack is still 16-byte aligned here + .err "Debug_Round_Common: stack alignment" + endif + ; compute r09 = ptr to the X[] array on the stack + lea r09,[SP_+X_stk] ;adjust for reg pushes, return address + cmp r08,SKEIN_RND_FEED_FWD ;special handling for feedforward "round"? 
+ jnz _got_r09a + lea r09,[rdx+X_VARS] +_got_r09a: + if _USE_ASM_ and 1024 + ; special handling for 1024-bit case + ; (for rounds right before with key injection: + ; use xDebug_1024[] instead of X_stk[]) + cmp r08,SKEIN_RND_SPECIAL + jae _got_r09b ;must be a normal round + or r08,r08 + jz _got_r09b ;just before key injection + test r08,3 + jne _got_r09b + cmp rcx,1024 ;only 1024-bit(s) for now + jne _got_r09b + lea r09,[SP_+xDebug_1024] +_got_r09b: + endif + sub rsp, 8*4 ;make room for parms on stack + call Skein_Show_Round ;call external debug handler + add rsp, 8*4 ;discard parm space on the stack + + irp _rr_, ;restore regs + pop _rr_ +_SP_OFFS_ = _SP_OFFS_-8 + endm + if _SP_OFFS_ - 32 + .err "Debug_Round_Common: push/pop misalignment!" + endif + pop rcx + pop rdx + ret +endif +;---------------------------------------------------------------- + end Index: sys/contrib/skein/asm/skein_block_x64.s =================================================================== --- /dev/null +++ sys/contrib/skein/asm/skein_block_x64.s @@ -0,0 +1,1328 @@ +# +#---------------------------------------------------------------- +# 64-bit x86 assembler code (gnu as) for Skein block functions +# +# Author: Doug Whiting, Hifn/Exar +# +# This code is released to the public domain. 
+#---------------------------------------------------------------- +# + .text + .altmacro + .psize 0,128 #list file has no page boundaries +# +_MASK_ALL_ = (256+512+1024) #all three algorithm bits +_MAX_FRAME_ = 240 +# +################# +.ifndef SKEIN_USE_ASM +_USE_ASM_ = _MASK_ALL_ +.else +_USE_ASM_ = SKEIN_USE_ASM +.endif +################# +.ifndef SKEIN_LOOP #configure loop unrolling +_SKEIN_LOOP = 2 #default is fully unrolled for 256/512, twice for 1024 +.else +_SKEIN_LOOP = SKEIN_LOOP + .irp _NN_,%_SKEIN_LOOP #only display loop unrolling if default changed on command line +.print "+++ SKEIN_LOOP = \_NN_" + .endr +.endif +# the unroll counts (0 --> fully unrolled) +SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10 +SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10 +SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10 +# +SKEIN_ASM_UNROLL = 0 + .irp _NN_,256,512,1024 + .if (SKEIN_UNROLL_\_NN_) == 0 +SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + \_NN_ + .endif + .endr +################# +# +.ifndef SKEIN_ROUNDS +ROUNDS_256 = 72 +ROUNDS_512 = 72 +ROUNDS_1024 = 80 +.else +ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5) +ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5) +ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5) +# only display rounds if default size is changed on command line +.irp _NN_,256,512,1024 + .if _USE_ASM_ && \_NN_ + .irp _RR_,%(ROUNDS_\_NN_) + .if _NN_ < 1024 +.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" + .else +.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" + .endif + .endr + .endif +.endr +.endif +################# +# +.ifdef SKEIN_CODE_SIZE +_SKEIN_CODE_SIZE = (1) +.else +.ifdef SKEIN_PERF #use code size if SKEIN_PERF is defined +_SKEIN_CODE_SIZE = (1) +.else +_SKEIN_CODE_SIZE = (0) +.endif +.endif +# +################# +# +.ifndef SKEIN_DEBUG +_SKEIN_DEBUG = 0 +.else +_SKEIN_DEBUG = 1 +.endif +################# +# +# define offsets of fields in hash context structure +# +HASH_BITS = 0 #bits of hash output +BCNT = 8 + HASH_BITS #number of bytes in BUFFER[] +TWEAK = 8 + BCNT 
#tweak values[0..1] +X_VARS = 16 + TWEAK #chaining vars +# +#(Note: buffer[] in context structure is NOT needed here :-) +# +KW_PARITY = 0x1BD11BDAA9FC1A22 #overall parity of key schedule words +FIRST_MASK = ~ (1 << 6) +FIRST_MASK64= ~ (1 << 62) +# +# rotation constants for Skein +# +RC_256_0_0 = 14 +RC_256_0_1 = 16 + +RC_256_1_0 = 52 +RC_256_1_1 = 57 + +RC_256_2_0 = 23 +RC_256_2_1 = 40 + +RC_256_3_0 = 5 +RC_256_3_1 = 37 + +RC_256_4_0 = 25 +RC_256_4_1 = 33 + +RC_256_5_0 = 46 +RC_256_5_1 = 12 + +RC_256_6_0 = 58 +RC_256_6_1 = 22 + +RC_256_7_0 = 32 +RC_256_7_1 = 32 + +RC_512_0_0 = 46 +RC_512_0_1 = 36 +RC_512_0_2 = 19 +RC_512_0_3 = 37 + +RC_512_1_0 = 33 +RC_512_1_1 = 27 +RC_512_1_2 = 14 +RC_512_1_3 = 42 + +RC_512_2_0 = 17 +RC_512_2_1 = 49 +RC_512_2_2 = 36 +RC_512_2_3 = 39 + +RC_512_3_0 = 44 +RC_512_3_1 = 9 +RC_512_3_2 = 54 +RC_512_3_3 = 56 + +RC_512_4_0 = 39 +RC_512_4_1 = 30 +RC_512_4_2 = 34 +RC_512_4_3 = 24 + +RC_512_5_0 = 13 +RC_512_5_1 = 50 +RC_512_5_2 = 10 +RC_512_5_3 = 17 + +RC_512_6_0 = 25 +RC_512_6_1 = 29 +RC_512_6_2 = 39 +RC_512_6_3 = 43 + +RC_512_7_0 = 8 +RC_512_7_1 = 35 +RC_512_7_2 = 56 +RC_512_7_3 = 22 + +RC_1024_0_0 = 24 +RC_1024_0_1 = 13 +RC_1024_0_2 = 8 +RC_1024_0_3 = 47 +RC_1024_0_4 = 8 +RC_1024_0_5 = 17 +RC_1024_0_6 = 22 +RC_1024_0_7 = 37 + +RC_1024_1_0 = 38 +RC_1024_1_1 = 19 +RC_1024_1_2 = 10 +RC_1024_1_3 = 55 +RC_1024_1_4 = 49 +RC_1024_1_5 = 18 +RC_1024_1_6 = 23 +RC_1024_1_7 = 52 + +RC_1024_2_0 = 33 +RC_1024_2_1 = 4 +RC_1024_2_2 = 51 +RC_1024_2_3 = 13 +RC_1024_2_4 = 34 +RC_1024_2_5 = 41 +RC_1024_2_6 = 59 +RC_1024_2_7 = 17 + +RC_1024_3_0 = 5 +RC_1024_3_1 = 20 +RC_1024_3_2 = 48 +RC_1024_3_3 = 41 +RC_1024_3_4 = 47 +RC_1024_3_5 = 28 +RC_1024_3_6 = 16 +RC_1024_3_7 = 25 + +RC_1024_4_0 = 41 +RC_1024_4_1 = 9 +RC_1024_4_2 = 37 +RC_1024_4_3 = 31 +RC_1024_4_4 = 12 +RC_1024_4_5 = 47 +RC_1024_4_6 = 44 +RC_1024_4_7 = 30 + +RC_1024_5_0 = 16 +RC_1024_5_1 = 34 +RC_1024_5_2 = 56 +RC_1024_5_3 = 51 +RC_1024_5_4 = 4 +RC_1024_5_5 = 53 +RC_1024_5_6 = 42 +RC_1024_5_7 = 41 + 
+RC_1024_6_0 = 31 +RC_1024_6_1 = 44 +RC_1024_6_2 = 47 +RC_1024_6_3 = 46 +RC_1024_6_4 = 19 +RC_1024_6_5 = 42 +RC_1024_6_6 = 44 +RC_1024_6_7 = 25 + +RC_1024_7_0 = 9 +RC_1024_7_1 = 48 +RC_1024_7_2 = 35 +RC_1024_7_3 = 52 +RC_1024_7_4 = 23 +RC_1024_7_5 = 31 +RC_1024_7_6 = 37 +RC_1024_7_7 = 20 +# +# Input: reg +# Output: <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024 +# +.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM +_RCNT_ = RC_\BLK_SIZE&_\ROUND_NUM&_\MIX_NUM + .if _RCNT_ #is there anything to do? + rolq $_RCNT_,%\reg + .endif +.endm +# +#---------------------------------------------------------------- +# +# MACROS: define local vars and configure stack +# +#---------------------------------------------------------------- +# declare allocated space on the stack +.macro StackVar localName,localSize +\localName = _STK_OFFS_ +_STK_OFFS_ = _STK_OFFS_+(\localSize) +.endm #StackVar +# +#---------------------------------------------------------------- +# +# MACRO: Configure stack frame, allocate local vars +# +.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt + WCNT = (\BLK_BITS)/64 +# +_PushCnt_ = 0 #save nonvolatile regs on stack + .irp _reg_,rbp,rbx,r12,r13,r14,r15 + pushq %\_reg_ +_PushCnt_ = _PushCnt_ + 1 #track count to keep alignment + .endr +# +_STK_OFFS_ = 0 #starting offset from rsp + #---- local variables #<-- rsp + StackVar X_stk ,8*(WCNT) #local context vars + StackVar ksTwk ,8*3 #key schedule: tweak words + StackVar ksKey ,8*(WCNT)+8 #key schedule: key words + .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0 + StackVar ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen + .endif + StackVar Wcopy ,8*(WCNT) #copy of input block + .if _SKEIN_DEBUG + .if \debugCnt + 0 #temp location for debug X[] info + StackVar xDebug_\BLK_BITS ,8*(\debugCnt) + .endif + .endif + .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0 + StackVar align16,8 #keep 16-byte aligned (adjust for retAddr?) 
+tmpStk_\BLK_BITS = align16 #use this + .endif + #---- saved caller parameters (from regs rdi, rsi, rdx, rcx) + StackVar ctxPtr ,8 #context ptr + StackVar blkPtr ,8 #pointer to block data + StackVar blkCnt ,8 #number of full blocks to process + StackVar bitAdd ,8 #bit count to add to tweak +LOCAL_SIZE = _STK_OFFS_ #size of "local" vars + #---- + StackVar savRegs,8*_PushCnt_ #saved registers + StackVar retAddr,8 #return address + #---- caller's stack frame (aligned mod 16) +# +# set up the stack frame pointer (rbp) +# +FRAME_OFFS = ksTwk + 128 #allow short (negative) offset to ksTwk, kwKey + .if FRAME_OFFS > _STK_OFFS_ #keep rbp in the "locals" range +FRAME_OFFS = _STK_OFFS_ + .endif +F_O = -FRAME_OFFS +# + #put some useful defines in the .lst file (for grep) +__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE +__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_ +__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS +# +# Notes on stack frame setup: +# * the most frequently used variable is X_stk[], based at [rsp+0] +# * the next most used is the key schedule arrays, ksKey and ksTwk +# so rbp is "centered" there, allowing short offsets to the key +# schedule even in 1024-bit Skein case +# * the Wcopy variables are infrequently accessed, but they have long +# offsets from both rsp and rbp only in the 1024-bit case. +# * all other local vars and calling parameters can be accessed +# with short offsets, except in the 1024-bit case +# + subq $LOCAL_SIZE,%rsp #make room for the locals + leaq FRAME_OFFS(%rsp),%rbp #maximize use of short offsets + movq %rdi, ctxPtr+F_O(%rbp) #save caller's parameters on the stack + movq %rsi, blkPtr+F_O(%rbp) + movq %rdx, blkCnt+F_O(%rbp) + movq %rcx, bitAdd+F_O(%rbp) +# +.endm #Setup_Stack +# +#---------------------------------------------------------------- +# +.macro Reset_Stack + addq $LOCAL_SIZE,%rsp #get rid of locals (wipe??) 
+ .irp _reg_,r15,r14,r13,r12,rbx,rbp + popq %\_reg_ #restore caller's regs +_PushCnt_ = _PushCnt_ - 1 + .endr + .if _PushCnt_ + .error "Mismatched push/pops?" + .endif +.endm # Reset_Stack +# +#---------------------------------------------------------------- +# macros to help debug internals +# +.if _SKEIN_DEBUG + .extern Skein_Show_Block #calls to C routines + .extern Skein_Show_Round +# +SKEIN_RND_SPECIAL = 1000 +SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0 +SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1 +SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2 +# +.macro Skein_Debug_Block BLK_BITS +# +#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X, +# const u08b_t *blkPtr, const u64b_t *wPtr, +# const u64b_t *ksPtr,const u64b_t *tsPtr) +# +_NN_ = 0 + .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11 + pushq %\_reg_ #save all volatile regs on stack before the call +_NN_ = _NN_ + 1 + .endr + # get and push call parameters + movq $\BLK_BITS ,%rdi #bits + movq ctxPtr+F_O(%rbp),%rsi #h (pointer) + leaq X_VARS (%rsi),%rdx #X (pointer) + movq blkPtr+F_O(%rbp),%rcx #blkPtr + leaq Wcopy +F_O(%rbp),%r8 #wPtr + leaq ksKey +F_O(%rbp),%r9 #key pointer + leaq ksTwk +F_O(%rbp),%rax #tweak pointer + pushq %rax # (pass on the stack) + call Skein_Show_Block #call external debug handler + addq $8*1,%rsp #discard parameters on stack + .if (_NN_ % 2 ) == 0 #check stack alignment + .error "Stack misalignment problem in Skein_Debug_Block_\BLK_BITS" + .endif + .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax + popq %\_reg_ #restore regs +_NN_ = _NN_ - 1 + .endr + .if _NN_ + .error "Push/pop mismatch problem in Skein_Debug_Block_\BLK_BITS" + .endif +.endm # Skein_Debug_Block +# +# the macro to "call" to debug a round +# +.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp + # call the appropriate (local) debug "function" + pushq %rdx #save rdx, so we can use it for round "number" + .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL) + movq $\R,%rdx + .else #compute round 
number using rdi +_rOffs_ = \RDI_OFFS + 0 + .if \BLK_BITS == 1024 + movq rIdx_offs+8(%rsp),%rdx #get rIdx off the stack (adjust for pushq rdx above) + leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx + .else + leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx + .endif + .endif + call Skein_Debug_Round_\BLK_BITS + popq %rdx #restore original rdx value +# + afterOp +.endm # Skein_Debug_Round +.else #------- _SKEIN_DEBUG (dummy macros if debug not enabled) +.macro Skein_Debug_Block BLK_BITS +.endm +# +.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp +.endm +# +.endif # _SKEIN_DEBUG +# +#---------------------------------------------------------------- +# +.macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs + .if \immOffs + 0 + leaq \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg + .elseif ((\useAddOp + 0) == 0) + .ifndef ASM_NO_LEA #lea seems to be faster on Core 2 Duo CPUs! + leaq (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg + .else + addq %\srcReg_A\srcReg_B,%\dstReg + .endif + .else + addq %\srcReg_A\srcReg_B,%\dstReg + .endif +.endm + +# keep Intel-style ordering here, to match addReg +.macro xorReg dstReg,srcReg_A,srcReg_B + xorq %\srcReg_A\srcReg_B,%\dstReg +.endm +# +#---------------------------------------------------------------- +# +.macro C_label lName + \lName: #use both "genders" to work across linkage conventions +_\lName: + .global \lName + .global _\lName +.endm +# +#=================================== Skein_256 ============================================= +# +.if _USE_ASM_ & 256 +# +# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# +# +################# +# +# code +# +C_label Skein_256_Process_Block + Setup_Stack 256,((ROUNDS_256/8)+1) + movq TWEAK+8(%rdi),%r14 + jmp Skein_256_block_loop + .p2align 4 + # main hash loop for Skein_256 +Skein_256_block_loop: + # + # general register usage: + # RAX..RDX = X0..X3 + # R08..R12 = ks[0..4] + # R13..R15 = ts[0..2] + # RSP, RBP = stack/frame pointers + # RDI = 
round counter or context pointer + # RSI = temp + # + movq TWEAK+0(%rdi) ,%r13 + addq bitAdd+F_O(%rbp) ,%r13 #computed updated tweak value T0 + movq %r14 ,%r15 + xorq %r13 ,%r15 #now %r13..%r15 is set as the tweak + + movq $KW_PARITY ,%r12 + movq X_VARS+ 0(%rdi),%r8 + movq X_VARS+ 8(%rdi),%r9 + movq X_VARS+16(%rdi),%r10 + movq X_VARS+24(%rdi),%r11 + movq %r13,TWEAK+0(%rdi) #save updated tweak value ctx->h.T[0] + xorq %r8 ,%r12 #start accumulating overall parity + + movq blkPtr +F_O(%rbp) ,%rsi #%rsi --> input block + xorq %r9 ,%r12 + movq 0(%rsi) ,%rax #get X[0..3] + xorq %r10 ,%r12 + movq 8(%rsi) ,%rbx + xorq %r11 ,%r12 + movq 16(%rsi) ,%rcx + movq 24(%rsi) ,%rdx + + movq %rax,Wcopy+ 0+F_O(%rbp) #save copy of input block + movq %rbx,Wcopy+ 8+F_O(%rbp) + movq %rcx,Wcopy+16+F_O(%rbp) + movq %rdx,Wcopy+24+F_O(%rbp) + + addq %r8 ,%rax #initial key injection + addq %r9 ,%rbx + addq %r10,%rcx + addq %r11,%rdx + addq %r13,%rbx + addq %r14,%rcx + +.if _SKEIN_DEBUG + movq %r14,TWEAK+ 8(%rdi) #save updated tweak T[1] (start bit cleared?) 
+ movq %r8 ,ksKey+ 0+F_O(%rbp) #save key schedule on stack for Skein_Debug_Block + movq %r9 ,ksKey+ 8+F_O(%rbp) + movq %r10,ksKey+16+F_O(%rbp) + movq %r11,ksKey+24+F_O(%rbp) + movq %r12,ksKey+32+F_O(%rbp) + + movq %r13,ksTwk+ 0+F_O(%rbp) + movq %r14,ksTwk+ 8+F_O(%rbp) + movq %r15,ksTwk+16+F_O(%rbp) + + movq %rax,X_stk + 0(%rsp) #save X[] on stack for Skein_Debug_Block + movq %rbx,X_stk + 8(%rsp) + movq %rcx,X_stk +16(%rsp) + movq %rdx,X_stk +24(%rsp) + + Skein_Debug_Block 256 #debug dump + Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL +.endif +# +.if ((SKEIN_ASM_UNROLL & 256) == 0) + movq %r8 ,ksKey+40+F_O(%rbp) #save key schedule on stack for looping code + movq %r9 ,ksKey+ 8+F_O(%rbp) + movq %r10,ksKey+16+F_O(%rbp) + movq %r11,ksKey+24+F_O(%rbp) + movq %r12,ksKey+32+F_O(%rbp) + + movq %r13,ksTwk+24+F_O(%rbp) + movq %r14,ksTwk+ 8+F_O(%rbp) + movq %r15,ksTwk+16+F_O(%rbp) +.endif + addq $WCNT*8,%rsi #skip the block + movq %rsi,blkPtr +F_O(%rbp) #update block pointer + # + # now the key schedule is computed. 
Start the rounds + # +.if SKEIN_ASM_UNROLL & 256 +_UNROLL_CNT = ROUNDS_256/8 +.else +_UNROLL_CNT = SKEIN_UNROLL_256 + .if ((ROUNDS_256/8) % _UNROLL_CNT) + .error "Invalid SKEIN_UNROLL_256" + .endif + xorq %rdi,%rdi #rdi = iteration count +Skein_256_round_loop: +.endif +_Rbase_ = 0 +.rept _UNROLL_CNT*2 + # all X and ks vars in regs # (ops to "rotate" ks vars, via mem, if not unrolled) + # round 4*_RBase_ + 0 + addReg rax, rbx + RotL64 rbx, 256,%((4*_Rbase_+0) % 8),0 + addReg rcx, rdx + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8 + .endif + xorReg rbx, rax + RotL64 rdx, 256,%((4*_Rbase_+0) % 8),1 + xorReg rdx, rcx + .if SKEIN_ASM_UNROLL & 256 + .irp _r0_,%( 8+(_Rbase_+3) % 5) + .irp _r1_,%(13+(_Rbase_+2) % 3) + leaq (%r\_r0_,%r\_r1_),%rdi #precompute key injection value for %rcx + .endr + .endr + .endif + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13 + .endif + Skein_Debug_Round 256,%(4*_Rbase_+1) + + # round 4*_Rbase_ + 1 + addReg rax, rdx + RotL64 rdx, 256,%((4*_Rbase_+1) % 8),0 + xorReg rdx, rax + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9 + .endif + addReg rcx, rbx + RotL64 rbx, 256,%((4*_Rbase_+1) % 8),1 + xorReg rbx, rcx + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11 + .endif + Skein_Debug_Round 256,%(4*_Rbase_+2) + .if SKEIN_ASM_UNROLL & 256 + .irp _r0_,%( 8+(_Rbase_+2) % 5) + .irp _r1_,%(13+(_Rbase_+1) % 3) + leaq (%r\_r0_,%r\_r1_),%rsi #precompute key injection value for %rbx + .endr + .endr + .endif + # round 4*_Rbase_ + 2 + addReg rax, rbx + RotL64 rbx, 256,%((4*_Rbase_+2) % 8),0 + addReg rcx, rdx + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10 + .endif + xorReg rbx, rax + RotL64 rdx, 256,%((4*_Rbase_+2) % 8),1 + xorReg rdx, rcx + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8) #"rotate" the key + leaq 1(%r11,%rdi),%r11 #precompute key + tweak + .endif + Skein_Debug_Round 256,%(4*_Rbase_+3) + # 
round 4*_Rbase_ + 3 + addReg rax, rdx + RotL64 rdx, 256,%((4*_Rbase_+3) % 8),0 + addReg rcx, rbx + .if (SKEIN_ASM_UNROLL & 256) == 0 + addq ksTwk+8*2+F_O(%rbp,%rdi,8),%r10 #precompute key + tweak + movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8) #"rotate" the tweak + .endif + xorReg rdx, rax + RotL64 rbx, 256,%((4*_Rbase_+3) % 8),1 + xorReg rbx, rcx + Skein_Debug_Round 256,%(4*_Rbase_+4) + .if (SKEIN_ASM_UNROLL & 256) == 0 + addReg r9 ,r13 #precompute key+tweak + .endif + #inject key schedule words +_Rbase_ = _Rbase_+1 + .if SKEIN_ASM_UNROLL & 256 + addReg rax,r,%(8+((_Rbase_+0) % 5)) + addReg rbx,rsi + addReg rcx,rdi + addReg rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_ + .else + incq %rdi + addReg rax,r8 + addReg rcx,r10 + addReg rbx,r9 + addReg rdx,r11 + .endif + Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT +.endr #rept _UNROLL_CNT +# +.if (SKEIN_ASM_UNROLL & 256) == 0 + cmpq $2*(ROUNDS_256/8),%rdi + jb Skein_256_round_loop +.endif # (SKEIN_ASM_UNROLL & 256) == 0 + movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context + + #---------------------------- + # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3} + movq $FIRST_MASK64 ,%r14 + xorq Wcopy + 0+F_O (%rbp),%rax + xorq Wcopy + 8+F_O (%rbp),%rbx + xorq Wcopy +16+F_O (%rbp),%rcx + xorq Wcopy +24+F_O (%rbp),%rdx + andq TWEAK + 8 (%rdi),%r14 + movq %rax,X_VARS+ 0(%rdi) #store final result + movq %rbx,X_VARS+ 8(%rdi) + movq %rcx,X_VARS+16(%rdi) + movq %rdx,X_VARS+24(%rdi) + + Skein_Debug_Round 256,SKEIN_RND_FEED_FWD + + # go back for more blocks, if needed + decq blkCnt+F_O(%rbp) + jnz Skein_256_block_loop + movq %r14,TWEAK + 8(%rdi) + Reset_Stack + ret +Skein_256_Process_Block_End: + + .if _SKEIN_DEBUG +Skein_Debug_Round_256: #here with rdx == round "number" from macro + pushq %rsi #save two regs for BLK_BITS-specific parms + pushq %rdi + movq 24(%rsp),%rdi #get back original rdx (pushed on stack in macro call) to rdi + movq %rax,X_stk+ 0+F_O(%rbp) #save X[] state on stack so debug routines can access it + movq %rbx,X_stk+ 8+F_O(%rbp) 
#(use FP_ since rsp has changed!) + movq %rcx,X_stk+16+F_O(%rbp) + movq %rdi,X_stk+24+F_O(%rbp) + + movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr + movq $256,%rdi #now are set for the call + jmp Skein_Debug_Round_Common + .endif +# +.if _SKEIN_CODE_SIZE +C_label Skein_256_Process_Block_CodeSize + movq $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax + ret +# +C_label Skein_256_Unroll_Cnt + .if _UNROLL_CNT <> ROUNDS_256/8 + movq $_UNROLL_CNT,%rax + .else + xorq %rax,%rax + .endif + ret +.endif +# +.endif #_USE_ASM_ & 256 +# +#=================================== Skein_512 ============================================= +# +.if _USE_ASM_ & 512 +# +# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd) +# +# X[i] == %r[8+i] #register assignments for X[] values during rounds (i=0..7) +# +################# +# MACRO: one round for 512-bit blocks +# +.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4 +# + addReg r\rn0, r\rn1 + RotL64 r\rn1, 512,%((_Rn_) % 8),0 + xorReg r\rn1, r\rn0 + op1 + addReg r\rn2, r\rn3 + RotL64 r\rn3, 512,%((_Rn_) % 8),1 + xorReg r\rn3, r\rn2 + op2 + addReg r\rn4, r\rn5 + RotL64 r\rn5, 512,%((_Rn_) % 8),2 + xorReg r\rn5, r\rn4 + op3 + addReg r\rn6, r\rn7 + RotL64 r\rn7, 512,%((_Rn_) % 8),3 + xorReg r\rn7, r\rn6 + op4 + Skein_Debug_Round 512,%(_Rn_+1),-4 +# +.endm #R_512_OneRound +# +################# +# MACRO: four rounds for 512-bit blocks +# +.macro R_512_FourRounds _RR_ #RR = base round number (0 mod 4) + .if (SKEIN_ASM_UNROLL && 512) + # here for fully unrolled case. 
+ _II_ = ((_RR_)/4) + 1 #key injection counter + R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),,, + R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),,, + R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),,, + R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),, + # inject the key schedule + addq ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8 + addReg r11, rax + addq ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9 + addReg r12, rbx + addq ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10 + addReg r13, rcx + addReg r14, rdx + addReg r15, rsi,,,(_II_) + .else + # here for looping case #"rotate" key/tweak schedule (move up on stack) + incq %rdi #bump key injection counter + R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),,, + R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),,, + R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),,, + R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),, + # inject the key schedule + addq ksKey+8*0+F_O(%rbp,%rdi,8),%r8 + addReg r11, rax + addReg r12, rbx + addq ksKey+8*1+F_O(%rbp,%rdi,8),%r9 + addReg r13, rcx + addReg r14, rdx + addq ksKey+8*2+F_O(%rbp,%rdi,8),%r10 + addReg r15, rsi + addReg r15, rdi #inject the round number + .endif + + #show the result of the key injection + Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT +.endm #R_512_FourRounds +# +################# +# instantiated code +# +C_label Skein_512_Process_Block + Setup_Stack 512,ROUNDS_512/8 + movq TWEAK+ 8(%rdi),%rbx + jmp Skein_512_block_loop + .p2align 4 + # main hash loop for Skein_512 +Skein_512_block_loop: + # general register usage: + # RAX..RDX = temps for key schedule pre-loads + # R8 ..R15 = X0..X7 + # RSP, RBP = stack/frame pointers + # RDI = round counter or context pointer + # RSI = temp + # + movq TWEAK + 0(%rdi),%rax + addq bitAdd+F_O(%rbp),%rax #computed updated tweak value T0 + movq %rbx,%rcx + xorq %rax,%rcx #%rax/%rbx/%rcx = tweak schedule + movq %rax,TWEAK+ 0 (%rdi) #save updated tweak value ctx->h.T[0] + movq %rax,ksTwk+ 0+F_O(%rbp) + movq $KW_PARITY,%rdx + movq blkPtr 
+F_O(%rbp),%rsi #%rsi --> input block + movq %rbx,ksTwk+ 8+F_O(%rbp) + movq %rcx,ksTwk+16+F_O(%rbp) + .irp _Rn_,8,9,10,11,12,13,14,15 + movq X_VARS+8*(_Rn_-8)(%rdi),%r\_Rn_ + xorq %r\_Rn_,%rdx #compute overall parity + movq %r\_Rn_,ksKey+8*(_Rn_-8)+F_O(%rbp) + .endr #load state into %r8 ..%r15, compute parity + movq %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity + + addReg r13,rax #precompute key injection for tweak + addReg r14, rbx +.if _SKEIN_DEBUG + movq %rbx,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below +.endif + movq 0(%rsi),%rax #load input block + movq 8(%rsi),%rbx + movq 16(%rsi),%rcx + movq 24(%rsi),%rdx + addReg r8 , rax #do initial key injection + addReg r9 , rbx + movq %rax,Wcopy+ 0+F_O(%rbp) #keep local copy for feedforward + movq %rbx,Wcopy+ 8+F_O(%rbp) + addReg r10, rcx + addReg r11, rdx + movq %rcx,Wcopy+16+F_O(%rbp) + movq %rdx,Wcopy+24+F_O(%rbp) + + movq 32(%rsi),%rax + movq 40(%rsi),%rbx + movq 48(%rsi),%rcx + movq 56(%rsi),%rdx + addReg r12, rax + addReg r13, rbx + addReg r14, rcx + addReg r15, rdx + movq %rax,Wcopy+32+F_O(%rbp) + movq %rbx,Wcopy+40+F_O(%rbp) + movq %rcx,Wcopy+48+F_O(%rbp) + movq %rdx,Wcopy+56+F_O(%rbp) + +.if _SKEIN_DEBUG + .irp _Rn_,8,9,10,11,12,13,14,15 #save values on stack for debug output + movq %r\_Rn_,X_stk+8*(_Rn_-8)(%rsp) + .endr + + Skein_Debug_Block 512 #debug dump + Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL +.endif + addq $8*WCNT,%rsi #skip the block + movq %rsi,blkPtr+F_O(%rbp) #update block pointer + # + ################# + # now the key schedule is computed. 
Start the rounds + # +.if SKEIN_ASM_UNROLL & 512 +_UNROLL_CNT = ROUNDS_512/8 +.else +_UNROLL_CNT = SKEIN_UNROLL_512 + .if ((ROUNDS_512/8) % _UNROLL_CNT) + .error "Invalid SKEIN_UNROLL_512" + .endif + xorq %rdi,%rdi #rdi = round counter +Skein_512_round_loop: +.endif +# +_Rbase_ = 0 +.rept _UNROLL_CNT*2 + R_512_FourRounds %(4*_Rbase_+00) +_Rbase_ = _Rbase_+1 +.endr #rept _UNROLL_CNT*2 +# +.if (SKEIN_ASM_UNROLL & 512) == 0 + cmpq $2*(ROUNDS_512/8),%rdi + jb Skein_512_round_loop + movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context +.endif + # end of rounds + ################# + # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7} + .irp _Rn_,8,9,10,11,12,13,14,15 + .if (_Rn_ == 8) + movq $FIRST_MASK64,%rbx + .endif + xorq Wcopy+8*(_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR + movq %r\_Rn_,X_VARS+8*(_Rn_-8)(%rdi) #and store result + .if (_Rn_ == 14) + andq TWEAK+ 8(%rdi),%rbx + .endif + .endr + Skein_Debug_Round 512,SKEIN_RND_FEED_FWD + + # go back for more blocks, if needed + decq blkCnt+F_O(%rbp) + jnz Skein_512_block_loop + movq %rbx,TWEAK + 8(%rdi) + + Reset_Stack + ret +Skein_512_Process_Block_End: +# + .if _SKEIN_DEBUG +# call here with rdx = "round number" +Skein_Debug_Round_512: + pushq %rsi #save two regs for BLK_BITS-specific parms + pushq %rdi + .irp _Rn_,8,9,10,11,12,13,14,15 #save X[] state on stack so debug routines can access it + movq %r\_Rn_,X_stk+8*(_Rn_-8)+F_O(%rbp) + .endr + movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr + movq $512,%rdi #now are set for the call + jmp Skein_Debug_Round_Common + .endif +# +.if _SKEIN_CODE_SIZE +C_label Skein_512_Process_Block_CodeSize + movq $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax + ret +# +C_label Skein_512_Unroll_Cnt + .if _UNROLL_CNT <> (ROUNDS_512/8) + movq $_UNROLL_CNT,%rax + .else + xorq %rax,%rax + .endif + ret +.endif +# +.endif # _USE_ASM_ & 512 +# +#=================================== Skein1024 ============================================= +.if _USE_ASM_ & 1024 +# +# void 
Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# +# +################# +# use details of permutation to make register assignments +# +o1K_rdi = 0 #offsets in X[] associated with each register +o1K_rsi = 1 +o1K_rbp = 2 +o1K_rax = 3 +o1K_rcx = 4 #rcx is "shared" with X6, since X4/X6 alternate +o1K_rbx = 5 +o1K_rdx = 7 +o1K_r8 = 8 +o1K_r9 = 9 +o1K_r10 = 10 +o1K_r11 = 11 +o1K_r12 = 12 +o1K_r13 = 13 +o1K_r14 = 14 +o1K_r15 = 15 +# +rIdx_offs = tmpStk_1024 +# +.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1 + addReg \reg0 , \reg1 #perform the MIX + RotL64 \reg1 , 1024,%((_RN0_) % 8),_Rn1_ + xorReg \reg1 , \reg0 +.if ((_RN0_) && 3) == 3 #time to do key injection? + .if _SKEIN_DEBUG + movq %\reg0 , xDebug_1024+8*w0(%rsp) #save intermediate values for Debug_Round + movq %\reg1 , xDebug_1024+8*w1(%rsp) # (before inline key injection) + .endif +_II_ = ((_RN0_)/4)+1 #injection count + .if SKEIN_ASM_UNROLL && 1024 #here to do fully unrolled key injection + addq ksKey+ 8*((_II_+w0) % 17)(%rsp),%\reg0 + addq ksKey+ 8*((_II_+w1) % 17)(%rsp),%\reg1 + .if w1 == 13 #tweak injection + addq ksTwk+ 8*((_II_+ 0) % 3)(%rsp),%\reg1 + .elseif w0 == 14 + addq ksTwk+ 8*((_II_+ 1) % 3)(%rsp),%\reg0 + .elseif w1 == 15 + addq $_II_, %\reg1 #(injection counter) + .endif + .else #here to do looping key injection + .if (w0 == 0) + movq %rdi, X_stk+8*w0(%rsp) #if so, store N0 so we can use reg as index + movq rIdx_offs(%rsp),%rdi #get the injection counter index into rdi + .else + addq ksKey+8+8*w0(%rsp,%rdi,8),%\reg0 #even key injection + .endif + .if w1 == 13 #tweak injection + addq ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1 + .elseif w0 == 14 + addq ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0 + .elseif w1 == 15 + addReg \reg1,rdi,,,1 #(injection counter) + .endif + addq ksKey+8+8*w1(%rsp,%rdi,8),%\reg1 #odd key injection + .endif +.endif + # insert the op provided, if any + op1 +.endm +################# +# MACRO: four rounds for 1024-bit blocks +# +.macro 
r1024_FourRounds _RR_ #RR = base round number (0 mod 4) + # should be here with X4 set properly, X6 stored on stack +_Rn_ = (_RR_) + 0 + r1024_Mix 0, 1,rdi,rsi,_Rn_,0 + r1024_Mix 2, 3,rbp,rax,_Rn_,1 + r1024_Mix 4, 5,rcx,rbx,_Rn_,2, #save X4 on stack (x4/x6 alternate) + r1024_Mix 8, 9,r8 ,r9 ,_Rn_,4, #load X6 from stack + r1024_Mix 10,11,r10,r11,_Rn_,5 + r1024_Mix 12,13,r12,r13,_Rn_,6 + r1024_Mix 6, 7,rcx,rdx,_Rn_,3 + r1024_Mix 14,15,r14,r15,_Rn_,7 + .if _SKEIN_DEBUG + Skein_Debug_Round 1024,%(_Rn_+1) + .endif +_Rn_ = (_RR_) + 1 + r1024_Mix 0, 9,rdi,r9 ,_Rn_,0 + r1024_Mix 2,13,rbp,r13,_Rn_,1 + r1024_Mix 6,11,rcx,r11,_Rn_,2, #save X6 on stack (x4/x6 alternate) + r1024_Mix 10, 7,r10,rdx,_Rn_,4, #load X4 from stack + r1024_Mix 12, 3,r12,rax,_Rn_,5 + r1024_Mix 14, 5,r14,rbx,_Rn_,6 + r1024_Mix 4,15,rcx,r15,_Rn_,3 + r1024_Mix 8, 1,r8 ,rsi,_Rn_,7 + .if _SKEIN_DEBUG + Skein_Debug_Round 1024,%(_Rn_+1) + .endif +_Rn_ = (_RR_) + 2 + r1024_Mix 0, 7,rdi,rdx,_Rn_,0 + r1024_Mix 2, 5,rbp,rbx,_Rn_,1 + r1024_Mix 4, 3,rcx,rax,_Rn_,2, #save X4 on stack (x4/x6 alternate) + r1024_Mix 12,15,r12,r15,_Rn_,4, #load X6 from stack + r1024_Mix 14,13,r14,r13,_Rn_,5 + r1024_Mix 8,11,r8 ,r11,_Rn_,6 + r1024_Mix 6, 1,rcx,rsi,_Rn_,3 + r1024_Mix 10, 9,r10,r9 ,_Rn_,7 + .if _SKEIN_DEBUG + Skein_Debug_Round 1024,%(_Rn_+1) + .endif +_Rn_ = (_RR_) + 3 + r1024_Mix 0,15,rdi,r15,_Rn_,0 + r1024_Mix 2,11,rbp,r11,_Rn_,1 + r1024_Mix 6,13,rcx,r13,_Rn_,2, #save X6 on stack (x4/x6 alternate) + r1024_Mix 14, 1,r14,rsi,_Rn_,4, #load X4 from stack + r1024_Mix 8, 5,r8 ,rbx,_Rn_,5 + r1024_Mix 10, 3,r10,rax,_Rn_,6 + r1024_Mix 4, 9,rcx,r9 ,_Rn_,3 + r1024_Mix 12, 7,r12,rdx,_Rn_,7 + .if _SKEIN_DEBUG + Skein_Debug_Round 1024,%(_Rn_+1) + .endif + + .if (SKEIN_ASM_UNROLL && 1024) == 0 #here with rdi == rIdx, X0 on stack + #"rotate" the key schedule on the stack +i8 = o1K_r8 +i0 = o1K_rdi + movq %r8 , X_stk+8*i8(%rsp) #free up a register (save it on the stack) + movq ksKey+8* 0(%rsp,%rdi,8),%r8 #get key word + movq %r8 , 
ksKey+8*17(%rsp,%rdi,8) #rotate key (must do key first or tweak clobbers it!) + movq ksTwk+8* 0(%rsp,%rdi,8),%r8 #get tweak word + movq %r8 , ksTwk+8* 3(%rsp,%rdi,8) #rotate tweak (onto the stack) + movq X_stk+8*i8(%rsp) ,%r8 #get the reg back + incq %rdi #bump the index + movq %rdi, rIdx_offs (%rsp) #save rdi again + movq ksKey+8*i0(%rsp,%rdi,8),%rdi #get the key schedule word for X0 back + addq X_stk+8*i0(%rsp) ,%rdi #perform the X0 key injection + .endif + #show the result of the key injection + Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT +.endm #r1024_FourRounds +# +################ +# code +# +C_label Skein1024_Process_Block +# + Setup_Stack 1024,ROUNDS_1024/8,WCNT + movq TWEAK+ 8(%rdi),%r9 + jmp Skein1024_block_loop + # main hash loop for Skein1024 + .p2align 4 +Skein1024_block_loop: + # general register usage during the rounds (see the o1K_ offsets): + # RSP = stack pointer + # RDI,RSI,RBP,RAX,RBX,RDX = X0,X1,X2,X3,X5,X7 (state words) + # R8 ..R15 = X8..X15 (state words) + # RCX = X4 or X6 (they alternate; the other one is kept on the stack) + # + .if (SKEIN_ASM_UNROLL & 1024) == 0 + xorq %rax,%rax #init loop index on the stack + movq %rax,rIdx_offs(%rsp) + .endif + movq TWEAK+ 0(%rdi),%r8 + addq bitAdd+ F_O(%rbp),%r8 #computed updated tweak value T0 + movq %r9 ,%r10 + xorq %r8 ,%r10 #%r8/%r9/%r10 = tweak schedule (T0, T1, T0^T1) + movq %r8 ,TWEAK+ 0(%rdi) #save updated tweak value ctx->h.T[0] + movq %r8 ,ksTwk+ 0+F_O(%rbp) + movq %r9 ,ksTwk+ 8+F_O(%rbp) #keep values in %r8 ,%r9 for initial tweak injection below + movq %r10,ksTwk+16+F_O(%rbp) + .if _SKEIN_DEBUG + movq %r9 ,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block + .endif + movq blkPtr +F_O(%rbp),%rsi # rsi --> input block + movq $KW_PARITY ,%rax #overall key schedule parity + + # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3] + .irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps + movq X_VARS+8*_rN_(%rdi),%r14 #get state word + movq 8*_rN_(%rsi),%r15 #get msg word + xorq %r14,%rax #update key schedule overall parity + movq %r14,ksKey 
+8*_rN_+F_O(%rbp) #save key schedule word on stack + movq %r15,Wcopy +8*_rN_+F_O(%rbp) #save local msg Wcopy + addq %r15,%r14 #do the initial key injection + movq %r14,X_stk +8*_rN_ (%rsp) #save initial state var on stack + .endr + # now process the rest, using the "real" registers + # (MUST do it in reverse order to inject tweaks r8/r9 first) + .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx +_oo_ = o1K_\_rr_ #offset assocated with the register + movq X_VARS+8*_oo_(%rdi),%\_rr_ #get key schedule word from context + movq 8*_oo_(%rsi),%rcx #get next input msg word + movq %\_rr_, ksKey +8*_oo_(%rsp) #save key schedule on stack + xorq %\_rr_, %rax #accumulate key schedule parity + movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward + addq %rcx,%\_rr_ #do the initial key injection + .if _oo_ == 13 #do the initial tweak injection + addReg _rr_,r8 # (only in words 13/14) + .elseif _oo_ == 14 + addReg _rr_,r9 + .endif + .endr + movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity +.if _SKEIN_DEBUG + Skein_Debug_Block 1024 #initial debug dump +.endif + addq $8*WCNT,%rsi #bump the msg ptr + movq %rsi,blkPtr+F_O(%rbp) #save bumped msg ptr + # re-load words 0..4 from stack, enter the main loop + .irp _rr_,rdi,rsi,rbp,rax,rcx #(no need to re-load x6, already on stack) + movq X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go! + .endr +.if _SKEIN_DEBUG + Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection +.endif + # + ################# + # now the key schedule is computed. 
Start the rounds + # +.if SKEIN_ASM_UNROLL & 1024 +_UNROLL_CNT = ROUNDS_1024/8 +.else +_UNROLL_CNT = SKEIN_UNROLL_1024 + .if ((ROUNDS_1024/8) % _UNROLL_CNT) + .error "Invalid SKEIN_UNROLL_1024" + .endif +Skein1024_round_loop: +.endif +# +_Rbase_ = 0 +.rept _UNROLL_CNT*2 #implement the rounds, 4 at a time + r1024_FourRounds %(4*_Rbase_+00) +_Rbase_ = _Rbase_+1 +.endr #rept _UNROLL_CNT +# +.if (SKEIN_ASM_UNROLL & 1024) == 0 + cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see .if we are done + jb Skein1024_round_loop +.endif + # end of rounds + ################# + # + # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15} + movq %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack + movq ctxPtr(%rsp),%rdx + + .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x7 +_oo_ = o1K_\_rr_ + xorq Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR + movq %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context + .if (_oo_ == 9) + movq $FIRST_MASK64 ,%r9 + .endif + .if (_oo_ == 14) + andq TWEAK+ 8(%rdx),%r9 + .endif + .endr + # + movq X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above) + movq X_stk +8*7(%rsp),%rbx + xorq Wcopy +8*6(%rsp),%rax + xorq Wcopy +8*7(%rsp),%rbx + movq %rax,X_VARS+8*6(%rdx) + decq blkCnt(%rsp) #set zero flag iff done + movq %rbx,X_VARS+8*7(%rdx) + + Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,, + # go back for more blocks, if needed + movq ctxPtr(%rsp),%rdi #don't muck with the flags here! + lea FRAME_OFFS(%rsp),%rbp + jnz Skein1024_block_loop + movq %r9 ,TWEAK+ 8(%rdx) + Reset_Stack + ret +# +Skein1024_Process_Block_End: +# +.if _SKEIN_DEBUG +Skein_Debug_Round_1024: + # call here with rdx = "round number", +_SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr + # + #save rest of X[] state on stack so debug routines can access it + .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15 + movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp) + .endr + # Figure out what to do with x0 (rdi). 
When rdx == 0 mod 4, it's already on stack + cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save + jae save_x0 + testq $3,%rdx #otherwise only if rdx != 0 mod 4 + jz save_x0_not +save_x0: + movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp) +save_x0_not: + #figure out the x4/x6 swapping state and save the correct one! + cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4 + jae save_x4 + testq $1,%rdx #and even ones have r4 as well + jz save_x4 + movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp) + jmp debug_1024_go +save_x4: + movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp) +debug_1024_go: + #now all is saved in Xstk[] except for rdx + push %rsi #save two regs for BLK_BITS-specific parms + push %rdi +_SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32) + + movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call) + movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[] + + movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr + movq $1024,%rdi #rdi = block size + jmp Skein_Debug_Round_Common +.endif +# +.if _SKEIN_CODE_SIZE +C_label Skein1024_Process_Block_CodeSize + movq $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax + ret +# +C_label Skein1024_Unroll_Cnt + .if _UNROLL_CNT <> (ROUNDS_1024/8) + movq $_UNROLL_CNT,%rax + .else + xorq %rax,%rax + .endif + ret +.endif +# +.endif # _USE_ASM_ and 1024 +# +.if _SKEIN_DEBUG +#---------------------------------------------------------------- +#local debug routine to set up for calls to: +# void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X) +# [ rdi rsi rdx rcx] +# +# here with %rdx = round number +# %rsi = ctx_hdr_ptr +# %rdi = block size (256/512/1024) +# on stack: saved rdi, saved rsi, retAddr, saved rdx +# +Skein_Debug_Round_Common: +_SP_OFFS_ = 32 #account for four words on stack already + .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs + pushq %\_rr_ +_SP_OFFS_ = _SP_OFFS_+8 + .endr + .if 
(_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here + .error "Debug_Round_Common: stack alignment" + .endif + # compute %rcx = ptr to the X[] array on the stack (final parameter to call) + leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address + cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"? + jnz _got_rcxA + leaq X_VARS(%rsi),%rcx +_got_rcxA: + .if _USE_ASM_ & 1024 + # special handling for 1024-bit case + # (for rounds right before with key injection: + # use xDebug_1024[] instead of X_stk[]) + cmpq $SKEIN_RND_SPECIAL,%rdx + jae _got_rcxB #must be a normal round + orq %rdx,%rdx + jz _got_rcxB #just before key injection + test $3,%rdx + jne _got_rcxB + cmp $1024,%rdi #only 1024-bit(s) for now + jne _got_rcxB + leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx +_got_rcxB: + .endif + call Skein_Show_Round #call external debug handler + + .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs + popq %\_rr_ +_SP_OFFS_ = _SP_OFFS_-8 + .endr + .if _SP_OFFS_ - 32 + .error "Debug_Round_Common: push/pop misalignment!" + .endif + popq %rdi + popq %rsi + ret +.endif +#---------------------------------------------------------------- + .end Index: sys/contrib/skein/asm/skein_block_x86.asm =================================================================== --- /dev/null +++ sys/contrib/skein/asm/skein_block_x86.asm @@ -0,0 +1,1180 @@ +; +;---------------------------------------------------------------- +; 32-bit x86 assembler code for Skein block functions +; +; Author: Doug Whiting, Hifn +; +; This code is released to the public domain. 
+;---------------------------------------------------------------- +; + .386p + .model flat + .code +; +_MASK_ALL_ equ (256+512+1024) ;all three algorithm bits +; +;;;;;;;;;;;;;;;;; +ifndef SKEIN_USE_ASM +_USE_ASM_ = _MASK_ALL_ +elseif SKEIN_USE_ASM and _MASK_ALL_ +_USE_ASM_ = SKEIN_USE_ASM +else +_USE_ASM_ = _MASK_ALL_ +endif +;;;;;;;;;;;;;;;;; +ifndef SKEIN_LOOP +_SKEIN_LOOP = 0 ;default is all fully unrolled +else +_SKEIN_LOOP = SKEIN_LOOP +endif +; the unroll counts (0 --> fully unrolled) +SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) mod 10 +SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) mod 10 +SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) mod 10 +; +SKEIN_ASM_UNROLL = 0 + irp _NN_,<256,512,1024> + if (SKEIN_UNROLL_&_NN_) eq 0 +SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + _NN_ + endif + endm +;;;;;;;;;;;;;;;;; +; +ifndef SKEIN_ROUNDS +ROUNDS_256 = 72 +ROUNDS_512 = 72 +ROUNDS_1024 = 80 +else +ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) mod 10) + 5) +ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) mod 10) + 5) +ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) mod 10) + 5) +endif +irp _NN_,<256,512,1024> + if _USE_ASM_ and _NN_ + irp _RR_,<%(ROUNDS_&_NN_)> + if _NN_ eq 1024 +%out +++ SKEIN_ROUNDS_&_NN_ = _RR_ + else +%out +++ SKEIN_ROUNDS_&_NN_ = _RR_ + endif + endm + endif +endm +;;;;;;;;;;;;;;;;; +; +ifdef SKEIN_CODE_SIZE +_SKEIN_CODE_SIZE equ (1) +else +ifdef SKEIN_PERF ;use code size if SKEIN_PERF is defined +_SKEIN_CODE_SIZE equ (1) +endif +endif +; +;;;;;;;;;;;;;;;;; +; +ifndef SKEIN_DEBUG +_SKEIN_DEBUG = 0 +else +_SKEIN_DEBUG = 1 +endif +;;;;;;;;;;;;;;;;; +; +; define offsets of fields in hash context structure +; +HASH_BITS = 0 ;# bits of hash output +BCNT = 4 + HASH_BITS ;number of bytes in BUFFER[] +TWEAK = 4 + BCNT ;tweak values[0..1] +X_VARS = 16 + TWEAK ;chaining vars +; +;(Note: buffer[] in context structure is NOT needed here :-) +; +KW_PARITY_LO= 0A9FC1A22h ;overall parity of key schedule words (hi32/lo32) +KW_PARITY_HI= 01BD11BDAh ;overall parity of key schedule words (hi32/lo32) +FIRST_MASK 
= NOT (1 SHL 30) ;FIRST block flag bit +; +; rotation constants for Skein +; +RC_256_0_0 = 14 +RC_256_0_1 = 16 + +RC_256_1_0 = 52 +RC_256_1_1 = 57 + +RC_256_2_0 = 23 +RC_256_2_1 = 40 + +RC_256_3_0 = 5 +RC_256_3_1 = 37 + +RC_256_4_0 = 25 +RC_256_4_1 = 33 + +RC_256_5_0 = 46 +RC_256_5_1 = 12 + +RC_256_6_0 = 58 +RC_256_6_1 = 22 + +RC_256_7_0 = 32 +RC_256_7_1 = 32 + +RC_512_0_0 = 46 +RC_512_0_1 = 36 +RC_512_0_2 = 19 +RC_512_0_3 = 37 + +RC_512_1_0 = 33 +RC_512_1_1 = 27 +RC_512_1_2 = 14 +RC_512_1_3 = 42 + +RC_512_2_0 = 17 +RC_512_2_1 = 49 +RC_512_2_2 = 36 +RC_512_2_3 = 39 + +RC_512_3_0 = 44 +RC_512_3_1 = 9 +RC_512_3_2 = 54 +RC_512_3_3 = 56 + +RC_512_4_0 = 39 +RC_512_4_1 = 30 +RC_512_4_2 = 34 +RC_512_4_3 = 24 + +RC_512_5_0 = 13 +RC_512_5_1 = 50 +RC_512_5_2 = 10 +RC_512_5_3 = 17 + +RC_512_6_0 = 25 +RC_512_6_1 = 29 +RC_512_6_2 = 39 +RC_512_6_3 = 43 + +RC_512_7_0 = 8 +RC_512_7_1 = 35 +RC_512_7_2 = 56 +RC_512_7_3 = 22 + +RC_1024_0_0 = 24 +RC_1024_0_1 = 13 +RC_1024_0_2 = 8 +RC_1024_0_3 = 47 +RC_1024_0_4 = 8 +RC_1024_0_5 = 17 +RC_1024_0_6 = 22 +RC_1024_0_7 = 37 + +RC_1024_1_0 = 38 +RC_1024_1_1 = 19 +RC_1024_1_2 = 10 +RC_1024_1_3 = 55 +RC_1024_1_4 = 49 +RC_1024_1_5 = 18 +RC_1024_1_6 = 23 +RC_1024_1_7 = 52 + +RC_1024_2_0 = 33 +RC_1024_2_1 = 4 +RC_1024_2_2 = 51 +RC_1024_2_3 = 13 +RC_1024_2_4 = 34 +RC_1024_2_5 = 41 +RC_1024_2_6 = 59 +RC_1024_2_7 = 17 + +RC_1024_3_0 = 5 +RC_1024_3_1 = 20 +RC_1024_3_2 = 48 +RC_1024_3_3 = 41 +RC_1024_3_4 = 47 +RC_1024_3_5 = 28 +RC_1024_3_6 = 16 +RC_1024_3_7 = 25 + +RC_1024_4_0 = 41 +RC_1024_4_1 = 9 +RC_1024_4_2 = 37 +RC_1024_4_3 = 31 +RC_1024_4_4 = 12 +RC_1024_4_5 = 47 +RC_1024_4_6 = 44 +RC_1024_4_7 = 30 + +RC_1024_5_0 = 16 +RC_1024_5_1 = 34 +RC_1024_5_2 = 56 +RC_1024_5_3 = 51 +RC_1024_5_4 = 4 +RC_1024_5_5 = 53 +RC_1024_5_6 = 42 +RC_1024_5_7 = 41 + +RC_1024_6_0 = 31 +RC_1024_6_1 = 44 +RC_1024_6_2 = 47 +RC_1024_6_3 = 46 +RC_1024_6_4 = 19 +RC_1024_6_5 = 42 +RC_1024_6_6 = 44 +RC_1024_6_7 = 25 + +RC_1024_7_0 = 9 +RC_1024_7_1 = 48 +RC_1024_7_2 = 35 
+RC_1024_7_3 = 52 +RC_1024_7_4 = 23 +RC_1024_7_5 = 31 +RC_1024_7_6 = 37 +RC_1024_7_7 = 20 +; +; Input: rHi,rLo +; Output: <<< _RCNT_ +Rol64 macro rHi,rLo,tmp,_RCNT_ + if _RCNT_ ;is there anything to do? + if _RCNT_ lt 32 + mov tmp,rLo + shld rLo,rHi,_RCNT_ + shld rHi,tmp,_RCNT_ + elseif _RCNT_ gt 32 + mov tmp,rLo + shrd rLo,rHi,((64-_RCNT_) AND 63) + shrd rHi,tmp,((64-_RCNT_) AND 63) + else + xchg rHi,rLo ;special case for _RCNT_ == 32 + endif + endif +endm +; +; Input: rHi,rLo +; Output: <<< rName&&rNum, and tmp trashed; +RotL64 macro rHi,rLo,tmp,BLK_SIZE,ROUND_NUM,MIX_NUM +_RCNT_ = ( RC_&BLK_SIZE&_&ROUND_NUM&_&MIX_NUM AND 63 ) + Rol64 rHi,rLo,tmp,_RCNT_ +endm +; +;---------------------------------------------------------------- +; declare allocated space on the stack +StackVar macro localName,localSize +localName = _STK_OFFS_ +_STK_OFFS_ = _STK_OFFS_+(localSize) +endm ;StackVar +; +;---------------------------------------------------------------- +; +; MACRO: Configure stack frame, allocate local vars +; +Setup_Stack macro WCNT,KS_CNT +_STK_OFFS_ = 0 ;starting offset from esp + ;----- local variables ;<-- esp + StackVar X_stk ,8*(WCNT) ;local context vars + StackVar Wcopy ,8*(WCNT) ;copy of input block + StackVar ksTwk ,8*3 ;key schedule: tweak words + StackVar ksKey ,8*(WCNT)+8 ;key schedule: key words + if WCNT le 8 +FRAME_OFFS = _STK_OFFS_ ;<-- ebp + else +FRAME_OFFS = _STK_OFFS_-8*4 ;<-- ebp + endif + if (SKEIN_ASM_UNROLL and (WCNT*64)) eq 0 + StackVar ksRot ,16*(KS_CNT+0);leave space for "rotation" to happen + endif +LOCAL_SIZE = _STK_OFFS_ ;size of local vars + ;----- + StackVar savRegs,8*4 ;pushad data + StackVar retAddr,4 ;return address + ;----- caller parameters + StackVar ctxPtr ,4 ;context ptr + StackVar blkPtr ,4 ;pointer to block data + StackVar blkCnt ,4 ;number of full blocks to process + StackVar bitAdd ,4 ;bit count to add to tweak + ;----- caller's stack frame +; +; Notes on stack frame setup: +; * the most frequently used variable is X_stk[], 
based at [esp+0] +; * the next most used is the key schedule words +; so ebp is "centered" there, allowing short offsets to the key/tweak +; schedule even in 1024-bit Skein case +; * the Wcopy variables are infrequently accessed, but they have long +; offsets from both esp and ebp only in the 1024-bit case. +; * all other local vars and calling parameters can be accessed +; with short offsets, except in the 1024-bit case +; + pushad ;save all regs + sub esp,LOCAL_SIZE ;make room for the locals + lea ebp,[esp+FRAME_OFFS] ;maximize use of short offsets + mov edi,[FP_+ctxPtr ] ;edi --> context +; +endm ;Setup_Stack +; +FP_ equ <ebp-FRAME_OFFS> ;ebp-relative base of the stack vars (ebp = esp+FRAME_OFFS): keep as many short offsets as possible +; +;---------------------------------------------------------------- +; +Reset_Stack macro procStart + add esp,LOCAL_SIZE ;get rid of locals (wipe??) + popad ;restore all regs + + ;display code size in bytes to stdout + irp _BCNT_,<%($+1-procStart)> ;account for return opcode +if _BCNT_ ge 10000 ;(align it all pretty) +%out procStart code size = _BCNT_ bytes +elseif _BCNT_ ge 1000 +%out procStart code size = _BCNT_ bytes +else +%out procStart code size = _BCNT_ bytes +endif + endm ;irp _BCNT_ + +endm ; Reset_Stack +; +;---------------------------------------------------------------- +; macros to help debug internals +; +if _SKEIN_DEBUG + extrn _Skein_Show_Block:near ;calls to C routines + extrn _Skein_Show_Round:near +; +SKEIN_RND_SPECIAL = 1000 +SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0 +SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1 +SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2 +; +Skein_Debug_Block macro BLK_BITS +; +;void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X, +; const u08b_t *blkPtr, const u64b_t *wPtr, +; const u64b_t *ksPtr,const u64b_t *tsPtr); +; + pushad ;save all regs + lea eax,[FP_+ksTwk] + lea ebx,[FP_+ksKey] + lea ecx,[esp+32+Wcopy] + mov edx,[FP_+ctxPtr] ;ctx_hdr_ptr + lea edx,[edx+X_VARS] ;edx ==> ctx->X[] + push eax ;tsPtr + push ebx ;ksPtr + push ecx ;wPtr + 
push dword ptr [FP_+blkPtr] ;blkPtr + push edx ;ctx->Xptr + push dword ptr [FP_+ctxPtr] ;ctx_hdr_ptr + mov eax,BLK_BITS + push eax ;bits + ifdef _MINGW_ + call _Skein_Show_Block-4 ;strange linkage?? + else + call _Skein_Show_Block + endif + add esp,7*4 ;discard parameter space on stack + popad ;restore regs +endm ;Skein_Debug_Block + +; +Skein_Debug_Round macro BLK_SIZE,R,saveRegs +; +;void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X); +; + ifnb <saveRegs> ;optional 3rd argument: spill eax..edx into X_stk first + mov [esp+X_stk+ 0],eax ;save internal vars for debug dump + mov [esp+X_stk+ 4],ebx + mov [esp+X_stk+ 8],ecx + mov [esp+X_stk+12],edx + endif + pushad ;save all regs + if R ne SKEIN_RND_FEED_FWD + lea eax,[esp+32+X_stk] + else + mov eax,[FP_+ctxPtr] + add eax,X_VARS + endif + push eax ;Xptr + if (SKEIN_ASM_UNROLL and BLK_SIZE) or (R ge SKEIN_RND_SPECIAL) + mov eax,R + else + lea eax,[4*edi+1+(((R)-1) and 3)] ;compute round number using edi + endif + push eax ;round number + push dword ptr [FP_+ctxPtr] ;ctx_hdr_ptr + mov eax,BLK_SIZE + push eax ;bits + ifdef _MINGW_ + call _Skein_Show_Round-4 ;strange linkage?? 
+ else + call _Skein_Show_Round + endif + add esp,4*4 ;discard parameter space on stack + popad ;restore regs +endm ;Skein_Debug_Round +endif ;ifdef SKEIN_DEBUG +; +;---------------------------------------------------------------- +; +; MACRO: a mix step +; +MixStep macro BLK_SIZE,ld_A,ld_C,st_A,st_C,RotNum0,RotNum1,_debug_ + ifnb <ld_A> ;load X[A] into ebx:eax unless the argument is blank + mov eax,[esp+X_stk+8*(ld_A)+0] + mov ebx,[esp+X_stk+8*(ld_A)+4] + endif + ifnb <ld_C> ;load X[C] into edx:ecx unless the argument is blank + mov ecx,[esp+X_stk+8*(ld_C)+0] + mov edx,[esp+X_stk+8*(ld_C)+4] + endif + add eax, ecx ;X[A] += X[C] + adc ebx, edx + ifnb <st_A> ;store X[A] back unless the argument is blank + mov [esp+X_stk+8*(st_A)+0],eax + mov [esp+X_stk+8*(st_A)+4],ebx + endif +__rNum0 = (RotNum0) AND 7 + RotL64 ecx, edx, esi,%(BLK_SIZE),%(__rNum0),%(RotNum1) ;X[C] <<<= RC_ + xor ecx, eax ;X[C] ^= X[A] + xor edx, ebx + if _SKEIN_DEBUG or (0 eq (_debug_ + 0)) + ifb <st_C> ;no store slot given: write X[C] back to the slot it was loaded from + mov [esp+X_stk+8*(ld_C)+0],ecx + mov [esp+X_stk+8*(ld_C)+4],edx + else + mov [esp+X_stk+8*(st_C)+0],ecx + mov [esp+X_stk+8*(st_C)+4],edx + endif + endif + if _SKEIN_DEBUG and (0 ne (_debug_ + 0)) + Skein_Debug_Round BLK_SIZE,%(RotNum0+1) + endif +endm ;MixStep +; +;;;;;;;;;;;;;;;;; +; +; MACRO: key schedule injection +; +ks_Inject macro BLK_SIZE,X_load,X_stor,rLo,rHi,rndBase,keyIdx,twkIdx,ROUND_ADD + ;are rLo,rHi values already loaded? 
if not, load them now + ifnb <X_load> ;blank X_load means rHi:rLo already hold the word + mov rLo,[esp+X_stk +8*(X_load) ] + mov rHi,[esp+X_stk +8*(X_load)+4] + endif + + ;inject the 64-bit key schedule value (and maybe the tweak as well) +if SKEIN_ASM_UNROLL and BLK_SIZE +_kOffs_ = ((rndBase)+(keyIdx)) mod ((BLK_SIZE/64)+1) + add rLo,[FP_+ksKey+8*_kOffs_+ 0] + adc rHi,[FP_+ksKey+8*_kOffs_+ 4] + ifnb <twkIdx> ;only words given a tweak index get a tweak word added +_tOffs_ = ((rndBase)+(twkIdx)) mod 3 + add rLo,[FP_+ksTwk+8*_tOffs_+ 0] + adc rHi,[FP_+ksTwk+8*_tOffs_+ 4] + endif + ifnb <ROUND_ADD> ;only the word given ROUND_ADD gets the round counter added + add rLo,(ROUND_ADD) + adc rHi,0 + endif +else + add rLo,[FP_+ksKey+8*(keyIdx)+8*edi ] + adc rHi,[FP_+ksKey+8*(keyIdx)+8*edi+4] + ifnb <twkIdx> + add rLo,[FP_+ksTwk+8*(twkIdx)+8*edi ] + adc rHi,[FP_+ksTwk+8*(twkIdx)+8*edi+4] + endif + ifnb <ROUND_ADD> + add rLo,edi ;edi is the round number + adc rHi,0 + endif +endif + + ;do we need to store updated rLo,rHi values? if so, do it now + ifnb <X_stor> ;blank X_stor leaves the result in rHi:rLo only + mov [esp+X_stk +8*(X_stor) ],rLo + mov [esp+X_stk +8*(X_stor)+4],rHi + endif +endm ;ks_Inject +; +;---------------------------------------------------------------- +; MACRO: key schedule rotation +; +ks_Rotate macro rLo,rHi,WCNT + mov rLo,[FP_+ksKey+8*edi+ 0] ;"rotate" the key schedule in memory + mov rHi,[FP_+ksKey+8*edi+ 4] + mov [FP_+ksKey+8*edi+8*(WCNT+1)+ 0],rLo + mov [FP_+ksKey+8*edi+8*(WCNT+1)+ 4],rHi + mov rLo,[FP_+ksTwk+8*edi+ 0] + mov rHi,[FP_+ksTwk+8*edi+ 4] + mov [FP_+ksTwk+8*edi+8*3+ 0],rLo + mov [FP_+ksTwk+8*edi+8*3+ 4],rHi +endm +; +;---------------------------------------------------------------- +; +if _USE_ASM_ and 256 + public _Skein_256_Process_Block +; +; void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd); +; +;;;;;;;;;;;;;;;;; +; +; MACRO: two rounds +; +R_256_TwoRounds macro _RR_,ld_0 + ; here with edx:ecx = X[1] + ;--------- round _RR_ + MixStep 256,ld_0, ,0,1,((_RR_)+0),0 + MixStep 256, 2,3,2,3,((_RR_)+0),1,1 + + ; here with edx:ecx = X[3] + ;--------- round _RR_ + 1 + MixStep 256, 0, ,0,3,((_RR_)+1),0 + MixStep 256, 2,1,2,1,((_RR_)+1),1,1 + + ; here with edx:ecx 
= X[1] +endm ;R_256_TwoRounds +; +;;;;;;;;;;;;;;;;; +; +; code +; +_Skein_256_Process_Block proc near + WCNT = 4 ;WCNT=4 for Skein-256 + Setup_Stack WCNT,(ROUNDS_256/8) + + ; main hash loop for Skein_256 +Skein_256_block_loop: + mov eax,[edi+TWEAK+ 0] ;ebx:eax = tweak word T0 + mov ebx,[edi+TWEAK+ 4] + mov ecx,[edi+TWEAK+ 8] ;edx:ecx = tweak word T1 + mov edx,[edi+TWEAK+12] + + add eax,[FP_+bitAdd ] ;bump T0 by the bitAdd parameter + adc ebx, 0 + mov [edi+TWEAK ],eax ;save updated tweak value T0 + mov [edi+TWEAK+ 4],ebx + + mov [FP_+ksTwk ],eax ;build the tweak schedule on the stack + mov [FP_+ksTwk+ 4],ebx + xor eax,ecx ;ebx:eax = T0 ^ T1 + xor ebx,edx + mov [FP_+ksTwk+ 8],ecx + mov [FP_+ksTwk+12],edx + mov [FP_+ksTwk+16],eax + mov [FP_+ksTwk+20],ebx + + mov eax,KW_PARITY_LO ;init parity accumulator + mov ebx,KW_PARITY_HI +; +_NN_ = 0 + rept WCNT ;copy in the chaining vars + mov ecx,[edi+X_VARS+_NN_ ] + mov edx,[edi+X_VARS+_NN_+ 4] + xor eax,ecx ;compute overall parity along the way + xor ebx,edx + mov [FP_+ksKey +_NN_ ],ecx + mov [FP_+ksKey +_NN_+ 4],edx +_NN_ = _NN_+8 + endm +; + mov [FP_+ksKey +_NN_ ],eax ;save overall parity at the end of the array + mov [FP_+ksKey +_NN_+ 4],ebx + + mov esi,[FP_+blkPtr ] ;esi --> input block +; +_NN_ = WCNT*8-16 ;work down from the end + rept WCNT/2 ;perform initial key injection + mov eax,[esi+_NN_ + 0] + mov ebx,[esi+_NN_ + 4] + mov ecx,[esi+_NN_ + 8] + mov edx,[esi+_NN_ +12] + mov [esp+_NN_+Wcopy + 0],eax + mov [esp+_NN_+Wcopy + 4],ebx + mov [esp+_NN_+Wcopy + 8],ecx + mov [esp+_NN_+Wcopy +12],edx + add eax,[FP_+_NN_+ksKey + 0] + adc ebx,[FP_+_NN_+ksKey + 4] + add ecx,[FP_+_NN_+ksKey + 8] + adc edx,[FP_+_NN_+ksKey +12] + if _NN_ eq (WCNT*8-16) ;inject the tweak words + add eax,[FP_+ ksTwk + 8]; (at the appropriate points) + adc ebx,[FP_+ ksTwk +12] + elseif _NN_ eq (WCNT*8-32) + add ecx,[FP_+ ksTwk + 0] + adc edx,[FP_+ ksTwk + 4] + endif + if _NN_ or _SKEIN_DEBUG + mov [esp+_NN_+X_stk + 0],eax + mov [esp+_NN_+X_stk + 4],ebx 
+ mov [esp+_NN_+X_stk + 8],ecx + mov [esp+_NN_+X_stk +12],edx + endif +_NN_ = _NN_ - 16 ;end at X[0], so regs are already loaded for first MIX! + endm +; +if _SKEIN_DEBUG ;debug dump of state at this point + Skein_Debug_Block WCNT*64 + Skein_Debug_Round WCNT*64,SKEIN_RND_KEY_INITIAL +endif + add esi, WCNT*8 ;skip the block + mov [FP_+blkPtr ],esi ;update block pointer + ; + ; now the key schedule is computed. Start the rounds + ; +if SKEIN_ASM_UNROLL and 256 +_UNROLL_CNT = ROUNDS_256/8 +else +_UNROLL_CNT = SKEIN_UNROLL_256 ;unroll count + if ((ROUNDS_256/8) mod _UNROLL_CNT) + .err "Invalid SKEIN_UNROLL_256" + endif + xor edi,edi ;edi = iteration count +Skein_256_round_loop: +endif +_Rbase_ = 0 +rept _UNROLL_CNT*2 + ; here with X[0], X[1] already loaded into eax..edx + R_256_TwoRounds %(4*_Rbase_+00), + R_256_TwoRounds %(4*_Rbase_+02),0 + + ;inject key schedule + if _UNROLL_CNT ne (ROUNDS_256/8) + ks_Rotate eax,ebx,WCNT + inc edi ;edi = round number + endif +_Rbase_ = _Rbase_+1 + ks_Inject 256,3,3,eax,ebx,_Rbase_,3, ,_Rbase_ + ks_Inject 256,2,2,eax,ebx,_Rbase_,2,1 + ks_Inject 256, , ,ecx,edx,_Rbase_,1,0 + ks_Inject 256,0, ,eax,ebx,_Rbase_,0 + if _SKEIN_DEBUG + Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,saveRegs + endif +endm ;rept _UNROLL_CNT +; + if _UNROLL_CNT ne (ROUNDS_256/8) + cmp edi,2*(ROUNDS_256/8) + jb Skein_256_round_loop + mov edi,[FP_+ctxPtr ] ;restore edi --> context + endif + ;---------------------------- + ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3} +_NN_ = 0 + rept WCNT/2 + if _NN_ ;eax..edx already loaded the first time + mov eax,[esp+X_stk + _NN_ + 0] + mov ebx,[esp+X_stk + _NN_ + 4] + mov ecx,[esp+X_stk + _NN_ + 8] + mov edx,[esp+X_stk + _NN_ +12] + endif + if _NN_ eq 0 + and dword ptr [edi +TWEAK +12],FIRST_MASK + endif + xor eax,[esp+Wcopy + _NN_ + 0] + xor ebx,[esp+Wcopy + _NN_ + 4] + xor ecx,[esp+Wcopy + _NN_ + 8] + xor edx,[esp+Wcopy + _NN_ +12] + mov [edi+X_VARS+ _NN_ + 0],eax + mov [edi+X_VARS+ _NN_ + 4],ebx + mov [edi+X_VARS+ _NN_ + 
8],ecx + mov [edi+X_VARS+ _NN_ +12],edx +_NN_ = _NN_+16 + endm +if _SKEIN_DEBUG + Skein_Debug_Round 256,SKEIN_RND_FEED_FWD +endif + ; go back for more blocks, if needed + dec dword ptr [FP_+blkCnt] + jnz Skein_256_block_loop + + Reset_Stack _Skein_256_Process_Block + ret +_Skein_256_Process_Block endp +; +ifdef _SKEIN_CODE_SIZE + public _Skein_256_Process_Block_CodeSize +_Skein_256_Process_Block_CodeSize proc + mov eax,_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block + ret +_Skein_256_Process_Block_CodeSize endp +; + public _Skein_256_Unroll_Cnt +_Skein_256_Unroll_Cnt proc + if _UNROLL_CNT ne ROUNDS_256/8 + mov eax,_UNROLL_CNT + else + xor eax,eax + endif + ret +_Skein_256_Unroll_Cnt endp +endif +endif ;_USE_ASM_ and 256 +; +;---------------------------------------------------------------- +; +if _USE_ASM_ and 512 + public _Skein_512_Process_Block +; +; void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd); +; +;;;;;;;;;;;;;;;;; +; MACRO: four rounds +; +R_512_FourRounds macro _RR_,ld_0 + ; here with edx:ecx = X[1] + ;--------- round _RR_ + ; R512(0,1,2,3,4,5,6,7,R_0, 1); + MixStep 512, ld_0, ,0,1,((_RR_)+0),0 + MixStep 512, 2,3,2,3,((_RR_)+0),1 + MixStep 512, 4,5,4,5,((_RR_)+0),2 + MixStep 512, 6,7,6, ,((_RR_)+0),3,1 + + ; here with edx:ecx = X[7] + ; R512(2,1,4,7,6,5,0,3,R_1, 2); + MixStep 512, 4, ,4,7,((_RR_)+1),1 + MixStep 512, 6,5,6,5,((_RR_)+1),2 + MixStep 512, 0,3,0,3,((_RR_)+1),3 + MixStep 512, 2,1,2, ,((_RR_)+1),0,1 + + ; here with edx:ecx = X[1] + ; R512(4,1,6,3,0,5,2,7,R_2, 3); + MixStep 512, 4, ,4,1,((_RR_)+2),0 + MixStep 512, 6,3,6,3,((_RR_)+2),1 + MixStep 512, 0,5,0,5,((_RR_)+2),2 + MixStep 512, 2,7,2, ,((_RR_)+2),3,1 + + ; here with edx:ecx = X[7] + ; R512(6,1,0,7,2,5,4,3,R_3, 4); + MixStep 512, 0, ,0,7,((_RR_)+3),1 + MixStep 512, 2,5,2,5,((_RR_)+3),2 + MixStep 512, 4,3,4,3,((_RR_)+3),3 + MixStep 512, 6,1,6, ,((_RR_)+3),0,1 + +endm ;R_512_FourRounds +; +;;;;;;;;;;;;;;;;; +; code +; 
+_Skein_512_Process_Block proc near + WCNT = 8 ;WCNT=8 for Skein-512 + Setup_Stack WCNT,(ROUNDS_512/8) + + ; main hash loop for Skein_512 +Skein_512_block_loop: + mov eax,[edi+TWEAK+ 0] ;ebx:eax = tweak word T0 + mov ebx,[edi+TWEAK+ 4] + mov ecx,[edi+TWEAK+ 8] ;edx:ecx = tweak word T1 + mov edx,[edi+TWEAK+12] + + add eax,[FP_+bitAdd ] ;bump T0 by the bitAdd parameter + adc ebx, 0 + mov [edi+TWEAK ],eax ;save updated tweak value T0 + mov [edi+TWEAK+ 4],ebx + + mov [FP_+ksTwk ],eax ;build the tweak schedule on the stack + mov [FP_+ksTwk+ 4],ebx + xor eax,ecx ;ebx:eax = T0 ^ T1 + xor ebx,edx + mov [FP_+ksTwk+ 8],ecx + mov [FP_+ksTwk+12],edx + mov [FP_+ksTwk+16],eax + mov [FP_+ksTwk+20],ebx + + mov eax,KW_PARITY_LO ;init parity accumulator + mov ebx,KW_PARITY_HI +; +_NN_ = 0 + rept WCNT ;copy in the chaining vars + mov ecx,[edi+X_VARS+_NN_ ] + mov edx,[edi+X_VARS+_NN_+ 4] + xor eax,ecx ;compute overall parity along the way + xor ebx,edx + mov [FP_+ksKey +_NN_ ],ecx + mov [FP_+ksKey +_NN_+ 4],edx +_NN_ = _NN_+8 + endm +; + mov [FP_+ksKey +_NN_ ],eax ;save overall parity at the end of the array + mov [FP_+ksKey +_NN_+ 4],ebx + + mov esi,[FP_+blkPtr ] ;esi --> input block +; +_NN_ = WCNT*8-16 ;work down from the end + rept WCNT/2 ;perform initial key injection + mov eax,[esi+_NN_ + 0] + mov ebx,[esi+_NN_ + 4] + mov ecx,[esi+_NN_ + 8] + mov edx,[esi+_NN_ +12] + mov [esp+_NN_+Wcopy + 0],eax + mov [esp+_NN_+Wcopy + 4],ebx + mov [esp+_NN_+Wcopy + 8],ecx + mov [esp+_NN_+Wcopy +12],edx + add eax,[FP_+_NN_+ksKey + 0] + adc ebx,[FP_+_NN_+ksKey + 4] + add ecx,[FP_+_NN_+ksKey + 8] + adc edx,[FP_+_NN_+ksKey +12] + if _NN_ eq (WCNT*8-16) ;inject the tweak words + add eax,[FP_+ ksTwk + 8]; (at the appropriate points) + adc ebx,[FP_+ ksTwk +12] + elseif _NN_ eq (WCNT*8-32) + add ecx,[FP_+ ksTwk + 0] + adc edx,[FP_+ ksTwk + 4] + endif + if _NN_ or _SKEIN_DEBUG + mov [esp+_NN_+X_stk + 0],eax + mov [esp+_NN_+X_stk + 4],ebx + mov [esp+_NN_+X_stk + 8],ecx + mov [esp+_NN_+X_stk +12],edx + 
endif +_NN_ = _NN_ - 16 ;end at X[0], so regs are already loaded for first MIX! + endm +; +if _SKEIN_DEBUG ;debug dump of state at this point + Skein_Debug_Block WCNT*64 + Skein_Debug_Round WCNT*64,SKEIN_RND_KEY_INITIAL +endif + add esi, WCNT*8 ;skip the block + mov [FP_+blkPtr ],esi ;update block pointer + ; + ; now the key schedule is computed. Start the rounds + ; +if SKEIN_ASM_UNROLL and 512 +_UNROLL_CNT = ROUNDS_512/8 +else +_UNROLL_CNT = SKEIN_UNROLL_512 + if ((ROUNDS_512/8) mod _UNROLL_CNT) + .err "Invalid SKEIN_UNROLL_512" + endif + xor edi,edi ;edi = round counter +Skein_512_round_loop: +endif +_Rbase_ = 0 +rept _UNROLL_CNT*2 + ; here with X[0], X[1] already loaded into eax..edx + R_512_FourRounds %(4*_Rbase_+00), + + ;inject odd key schedule words + if _UNROLL_CNT ne (ROUNDS_512/8) + ks_Rotate eax,ebx,WCNT + inc edi ;edi = round number + endif +_Rbase_ = _Rbase_+1 + ks_Inject 512,7,7,eax,ebx,_Rbase_,7, ,_Rbase_ + ks_Inject 512,6,6,eax,ebx,_Rbase_,6,1 + ks_Inject 512,5,5,eax,ebx,_Rbase_,5,0 + ks_Inject 512,4,4,eax,ebx,_Rbase_,4 + ks_Inject 512,3,3,eax,ebx,_Rbase_,3 + ks_Inject 512,2,2,eax,ebx,_Rbase_,2 + ks_Inject 512, , ,ecx,edx,_Rbase_,1 + ks_Inject 512,0, ,eax,ebx,_Rbase_,0 + if _SKEIN_DEBUG + Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT ,saveRegs + endif +endm ;rept _UNROLL_CNT +; +if (SKEIN_ASM_UNROLL and 512) eq 0 + cmp edi,2*(ROUNDS_512/8) + jb Skein_512_round_loop + mov edi,[FP_+ctxPtr ] ;restore edi --> context +endif + ;---------------------------- + ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7} +_NN_ = 0 + rept WCNT/2 + if _NN_ ;eax..edx already loaded the first time + mov eax,[esp+X_stk + _NN_ + 0] + mov ebx,[esp+X_stk + _NN_ + 4] + mov ecx,[esp+X_stk + _NN_ + 8] + mov edx,[esp+X_stk + _NN_ +12] + endif + if _NN_ eq 0 + and dword ptr [edi + TWEAK+12],FIRST_MASK + endif + xor eax,[esp+Wcopy + _NN_ + 0] + xor ebx,[esp+Wcopy + _NN_ + 4] + xor ecx,[esp+Wcopy + _NN_ + 8] + xor edx,[esp+Wcopy + _NN_ +12] + mov [edi+X_VARS+ _NN_ + 0],eax + mov 
[edi+X_VARS+ _NN_ + 4],ebx + mov [edi+X_VARS+ _NN_ + 8],ecx + mov [edi+X_VARS+ _NN_ +12],edx +_NN_ = _NN_+16 + endm +if _SKEIN_DEBUG + Skein_Debug_Round 512,SKEIN_RND_FEED_FWD +endif + ; go back for more blocks, if needed + dec dword ptr [FP_+blkCnt] + jnz Skein_512_block_loop + + Reset_Stack _Skein_512_Process_Block + ret +_Skein_512_Process_Block endp +; +ifdef _SKEIN_CODE_SIZE + public _Skein_512_Process_Block_CodeSize +_Skein_512_Process_Block_CodeSize proc + mov eax,_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block + ret +_Skein_512_Process_Block_CodeSize endp +; + public _Skein_512_Unroll_Cnt +_Skein_512_Unroll_Cnt proc + if _UNROLL_CNT ne ROUNDS_512/8 + mov eax,_UNROLL_CNT + else + xor eax,eax + endif + ret +_Skein_512_Unroll_Cnt endp +endif +; +endif ; _USE_ASM_ and 512 +; +;---------------------------------------------------------------- +; +if _USE_ASM_ and 1024 + public _Skein1024_Process_Block +; +; void Skein_1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd); +; +;;;;;;;;;;;;;;;;; +; MACRO: four rounds +; +R_1024_FourRounds macro _RR_,ld_0 + ; here with edx:ecx = X[1] + + ;--------- round _RR_ + MixStep 1024, ld_0, , 0, 1,((_RR_)+0),0 + MixStep 1024, 2, 3, 2, 3,((_RR_)+0),1 + MixStep 1024, 4, 5, 4, 5,((_RR_)+0),2 + MixStep 1024, 6, 7, 6, 7,((_RR_)+0),3 + MixStep 1024, 8, 9, 8, 9,((_RR_)+0),4 + MixStep 1024, 10,11,10,11,((_RR_)+0),5 + MixStep 1024, 12,13,12,13,((_RR_)+0),6 + MixStep 1024, 14,15,14, ,((_RR_)+0),7,1 + ; here with edx:ecx = X[15] + + ;--------- round _RR_+1 + MixStep 1024, 4, , 4,15,((_RR_)+1),3 + MixStep 1024, 0, 9, 0, 9,((_RR_)+1),0 + MixStep 1024, 2,13, 2,13,((_RR_)+1),1 + MixStep 1024, 6,11, 6,11,((_RR_)+1),2 + MixStep 1024, 10, 7,10, 7,((_RR_)+1),4 + MixStep 1024, 12, 3,12, 3,((_RR_)+1),5 + MixStep 1024, 14, 5,14, 5,((_RR_)+1),6 + MixStep 1024, 8, 1, 8, ,((_RR_)+1),7,1 + ; here with edx:ecx = X[1] + + ;--------- round _RR_+2 + MixStep 1024, 6, , 6, 1,((_RR_)+2),3 + MixStep 1024, 
0, 7, 0, 7,((_RR_)+2),0 + MixStep 1024, 2, 5, 2, 5,((_RR_)+2),1 + MixStep 1024, 4, 3, 4, 3,((_RR_)+2),2 + MixStep 1024, 12,15,12,15,((_RR_)+2),4 + MixStep 1024, 14,13,14,13,((_RR_)+2),5 + MixStep 1024, 8,11, 8,11,((_RR_)+2),6 + MixStep 1024, 10, 9,10, ,((_RR_)+2),7,1 + ; here with edx:ecx = X[9] + + ;--------- round _RR_+3 + MixStep 1024, 4, , 4, 9,((_RR_)+3),3 + MixStep 1024, 0,15, 0,15,((_RR_)+3),0 + MixStep 1024, 2,11, 2,11,((_RR_)+3),1 + MixStep 1024, 6,13, 6,13,((_RR_)+3),2 + MixStep 1024, 8, 5, 8, 5,((_RR_)+3),5 + MixStep 1024, 10, 3,10, 3,((_RR_)+3),6 + MixStep 1024, 12, 7,12, 7,((_RR_)+3),7 + MixStep 1024, 14, 1,14, ,((_RR_)+3),4,1 + + ; here with edx:ecx = X[1] +endm ;R_1024_FourRounds +; +;;;;;;;;;;;;;;;;; +; code +; +_Skein1024_Process_Block proc near +; + WCNT = 16 ;WCNT=16 for Skein-1024 + Setup_Stack WCNT,(ROUNDS_1024/8) + + ; main hash loop for Skein1024 +Skein1024_block_loop: + mov eax,[edi+TWEAK+ 0] ;ebx:eax = tweak word T0 + mov ebx,[edi+TWEAK+ 4] + mov ecx,[edi+TWEAK+ 8] ;edx:ecx = tweak word T1 + mov edx,[edi+TWEAK+12] + + add eax,[FP_+bitAdd ] ;bump T0 by the bitAdd parameter + adc ebx, 0 + mov [edi+TWEAK ],eax ;save updated tweak value T0 + mov [edi+TWEAK+ 4],ebx + + mov [FP_+ksTwk ],eax ;build the tweak schedule on the stack + mov [FP_+ksTwk+ 4],ebx + xor eax,ecx ;ebx:eax = T0 ^ T1 + xor ebx,edx + mov [FP_+ksTwk+ 8],ecx + mov [FP_+ksTwk+12],edx + mov [FP_+ksTwk+16],eax + mov [FP_+ksTwk+20],ebx + + mov eax,KW_PARITY_LO ;init parity accumulator + mov ebx,KW_PARITY_HI +EDI_BIAS equ 70h ;bias the edi offsets to make them short! 
+ add edi, EDI_BIAS +CT_ equ +; +_NN_ = 0 + rept WCNT ;copy in the chaining vars + mov ecx,[CT_+X_VARS+_NN_ ] + mov edx,[CT_+X_VARS+_NN_+ 4] + xor eax,ecx ;compute overall parity along the way + xor ebx,edx + mov [FP_+ksKey +_NN_ ],ecx + mov [FP_+ksKey +_NN_+ 4],edx +_NN_ = _NN_+8 + endm +; + mov [FP_+ksKey +_NN_ ],eax ;save overall parity at the end of the array + mov [FP_+ksKey +_NN_+ 4],ebx + + mov esi,[FP_+blkPtr ] ;esi --> input block + lea edi,[esp+Wcopy] +; +_NN_ = WCNT*8-16 ;work down from the end + rept WCNT/2 ;perform initial key injection + mov eax,[esi+_NN_ + 0] + mov ebx,[esi+_NN_ + 4] + mov ecx,[esi+_NN_ + 8] + mov edx,[esi+_NN_ +12] + mov [edi+_NN_+ + 0],eax + mov [edi+_NN_+ + 4],ebx + mov [edi+_NN_+ + 8],ecx + mov [edi+_NN_+ +12],edx + add eax,[FP_+_NN_+ksKey + 0] + adc ebx,[FP_+_NN_+ksKey + 4] + add ecx,[FP_+_NN_+ksKey + 8] + adc edx,[FP_+_NN_+ksKey +12] + if _NN_ eq (WCNT*8-16) ;inject the tweak words + add eax,[FP_+ ksTwk + 8]; (at the appropriate points) + adc ebx,[FP_+ ksTwk +12] + elseif _NN_ eq (WCNT*8-32) + add ecx,[FP_+ ksTwk + 0] + adc edx,[FP_+ ksTwk + 4] + endif + if _NN_ or _SKEIN_DEBUG + mov [esp+_NN_+X_stk + 0],eax + mov [esp+_NN_+X_stk + 4],ebx + mov [esp+_NN_+X_stk + 8],ecx + mov [esp+_NN_+X_stk +12],edx + endif +_NN_ = _NN_ - 16 ;end at X[0], so regs are already loaded for first MIX! + endm +; +if _SKEIN_DEBUG ;debug dump of state at this point + Skein_Debug_Block WCNT*64 + Skein_Debug_Round WCNT*64,SKEIN_RND_KEY_INITIAL +endif + sub esi,-WCNT*8 ;skip the block (short immediate) + mov [FP_+blkPtr ],esi ;update block pointer + ; + ; now the key schedule is computed. 
Start the rounds + ; +if SKEIN_ASM_UNROLL and 1024 +_UNROLL_CNT = ROUNDS_1024/8 +else +_UNROLL_CNT = SKEIN_UNROLL_1024 + if ((ROUNDS_1024/8) mod _UNROLL_CNT) + .err "Invalid SKEIN_UNROLL_1024" + endif + xor edi,edi ;edi = round counter +Skein_1024_round_loop: +endif + +_Rbase_ = 0 +rept _UNROLL_CNT*2 + ; here with X[0], X[1] already loaded into eax..edx + R_1024_FourRounds %(4*_Rbase_+00), + + ;inject odd key schedule words + ;inject odd key schedule words + if _UNROLL_CNT ne (ROUNDS_1024/8) + ks_Rotate eax,ebx,WCNT + inc edi ;edi = round number + endif +_Rbase_ = _Rbase_+1 + ks_Inject 1024,15,15,eax,ebx,_Rbase_,15, ,_Rbase_ + ks_Inject 1024,14,14,eax,ebx,_Rbase_,14,1 + ks_Inject 1024,13,13,eax,ebx,_Rbase_,13,0 + irp _w,<12,11,10,9,8,7,6,5,4,3,2> + ks_Inject 1024,_w,_w,eax,ebx,_Rbase_,_w + endm + ks_Inject 1024, , ,ecx,edx,_Rbase_,1 + ks_Inject 1024, 0, ,eax,ebx,_Rbase_,0 + + if _SKEIN_DEBUG + Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,saveRegs + endif +endm ;rept _UNROLL_CNT +; +if (SKEIN_ASM_UNROLL and 1024) eq 0 + cmp edi,2*(ROUNDS_1024/8) + jb Skein_1024_round_loop +endif + mov edi,[FP_+ctxPtr ] ;restore edi --> context + add edi,EDI_BIAS ;and bias it for short offsets below + ;---------------------------- + ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15} + lea esi,[esp+Wcopy] ;use short offsets below +_NN_ = 0 + rept WCNT/2 + if _NN_ ;eax..edx already loaded the first time + mov eax,[esp+X_stk + _NN_ + 0] + mov ebx,[esp+X_stk + _NN_ + 4] + mov ecx,[esp+X_stk + _NN_ + 8] + mov edx,[esp+X_stk + _NN_ +12] + endif + if _NN_ eq 0 + and dword ptr [CT_ + TWEAK+12],FIRST_MASK + endif + xor eax,[esi + _NN_ + 0] + xor ebx,[esi + _NN_ + 4] + xor ecx,[esi + _NN_ + 8] + xor edx,[esi + _NN_ +12] + mov [CT_+X_VARS+ _NN_ + 0],eax + mov [CT_+X_VARS+ _NN_ + 4],ebx + mov [CT_+X_VARS+ _NN_ + 8],ecx + mov [CT_+X_VARS+ _NN_ +12],edx +_NN_ = _NN_+16 + endm + sub edi,EDI_BIAS ;undo the bias for return + +if _SKEIN_DEBUG + Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD +endif + ; go 
back for more blocks, if needed + dec dword ptr [FP_+blkCnt] + jnz Skein1024_block_loop + + Reset_Stack _Skein1024_Process_Block + ret +_Skein1024_Process_Block endp +; +ifdef _SKEIN_CODE_SIZE + public _Skein1024_Process_Block_CodeSize +_Skein1024_Process_Block_CodeSize proc + mov eax,_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block + ret +_Skein1024_Process_Block_CodeSize endp +; + public _Skein1024_Unroll_Cnt +_Skein1024_Unroll_Cnt proc + if _UNROLL_CNT ne ROUNDS_1024/8 + mov eax,_UNROLL_CNT + else + xor eax,eax + endif + ret +_Skein1024_Unroll_Cnt endp +endif +; +endif ; _USE_ASM_ and 1024 +;---------------------------------------------------------------- + end Index: sys/contrib/skein/asm/skein_block_xmm32.asm =================================================================== --- /dev/null +++ sys/contrib/skein/asm/skein_block_xmm32.asm @@ -0,0 +1,1167 @@ +; +;---------------------------------------------------------------- +; 32-bit x86 assembler code for Skein block functions using XMM registers +; +; Author: Doug Whiting, Hifn +; +; This code is released to the public domain. 
+;---------------------------------------------------------------- +; + .386p + .model flat + .code + .xmm ;enable XMM instructions +; +_MASK_ALL_ equ (256+512+1024) ;all three algorithm bits +; +;;;;;;;;;;;;;;;;; +; _USE_ASM_ = bit mask (256/512/1024) of the block functions to assemble; +; it falls back to all three when SKEIN_USE_ASM is undefined or has none of those bits set +ifndef SKEIN_USE_ASM +_USE_ASM_ = _MASK_ALL_ +elseif SKEIN_USE_ASM and _MASK_ALL_ +_USE_ASM_ = SKEIN_USE_ASM +else +_USE_ASM_ = _MASK_ALL_ +endif +; +;;;;;;;;;;;;;;;;; +ifndef SKEIN_LOOP +_SKEIN_LOOP = 0 ;default is all fully unrolled +else +_SKEIN_LOOP = SKEIN_LOOP +endif +;-------------- +; the unroll counts (0 --> fully unrolled) +; (SKEIN_LOOP is read as three decimal digits: hundreds = 256, tens = 512, units = 1024) +SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) mod 10 +SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) mod 10 +SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) mod 10 +; +; SKEIN_ASM_UNROLL = bit mask (256/512/1024) of the variants whose unroll count is 0 +SKEIN_ASM_UNROLL = 0 + irp _NN_,<256,512,1024> + if (SKEIN_UNROLL_&_NN_) eq 0 +SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + _NN_ + endif + endm +; +;;;;;;;;;;;;;;;;; +; +; default round counts are 72/72/80; otherwise SKEIN_ROUNDS is read as three decimal digits +; (hundreds = 256, tens = 512, units = 1024) and digit d gives 8*(((d+5) mod 10)+5) rounds, +; e.g. d=0 -> 80, d=5 -> 40, d=9 -> 72 +ifndef SKEIN_ROUNDS +ROUNDS_256 = 72 +ROUNDS_512 = 72 +ROUNDS_1024 = 80 +else +ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) mod 10) + 5) +ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) mod 10) + 5) +ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) mod 10) + 5) +endif +; echo the round count of every assembled variant during assembly +irp _NN_,<256,512,1024> + if _USE_ASM_ and _NN_ + irp _RR_,<%(ROUNDS_&_NN_)> + if _NN_ eq 1024 +%out +++ SKEIN_ROUNDS_&_NN_ = _RR_ + else +%out +++ SKEIN_ROUNDS_&_NN_ = _RR_ + endif + endm + endif +endm +;;;;;;;;;;;;;;;;; +; +ifdef SKEIN_CODE_SIZE +_SKEIN_CODE_SIZE equ (1) +else +ifdef SKEIN_PERF ;use code size if SKEIN_PERF is defined +_SKEIN_CODE_SIZE equ (1) +endif +endif +; +;;;;;;;;;;;;;;;;; +; +ifndef SKEIN_DEBUG +_SKEIN_DEBUG = 0 +else +_SKEIN_DEBUG = 1 +endif +;;;;;;;;;;;;;;;;; +; +; define offsets of fields in hash context structure +; +HASH_BITS = 0 ;# bits of hash output +BCNT = 4 + HASH_BITS ;number of bytes in BUFFER[] +TWEAK = 4 + BCNT ;tweak values[0..1] +X_VARS = 16 + TWEAK ;chaining vars +; (resulting byte offsets: HASH_BITS=0, BCNT=4, TWEAK=8, X_VARS=24) +; +;(Note: buffer[] in context structure is NOT needed here :-) +; +KW_PARITY_LO= 0A9FC1A22h ;overall parity of key schedule words (hi32/lo32) +KW_PARITY_HI= 01BD11BDAh 
+FIRST_MASK8 = NOT (1 SHL 6) ;FIRST block flag bit (and-ed into byte 15 of the tweak after each block) +; +; rotation constants for Skein +; (named RC_bits_r_m: r = round number and 7, m = index of the mix step within the round) +; +RC_256_0_0 = 14 +RC_256_0_1 = 16 + +RC_256_1_0 = 52 +RC_256_1_1 = 57 + +RC_256_2_0 = 23 +RC_256_2_1 = 40 + +RC_256_3_0 = 5 +RC_256_3_1 = 37 + +RC_256_4_0 = 25 +RC_256_4_1 = 33 + +RC_256_5_0 = 46 +RC_256_5_1 = 12 + +RC_256_6_0 = 58 +RC_256_6_1 = 22 + +RC_256_7_0 = 32 +RC_256_7_1 = 32 + +RC_512_0_0 = 46 +RC_512_0_1 = 36 +RC_512_0_2 = 19 +RC_512_0_3 = 37 + +RC_512_1_0 = 33 +RC_512_1_1 = 27 +RC_512_1_2 = 14 +RC_512_1_3 = 42 + +RC_512_2_0 = 17 +RC_512_2_1 = 49 +RC_512_2_2 = 36 +RC_512_2_3 = 39 + +RC_512_3_0 = 44 +RC_512_3_1 = 9 +RC_512_3_2 = 54 +RC_512_3_3 = 56 + +RC_512_4_0 = 39 +RC_512_4_1 = 30 +RC_512_4_2 = 34 +RC_512_4_3 = 24 + +RC_512_5_0 = 13 +RC_512_5_1 = 50 +RC_512_5_2 = 10 +RC_512_5_3 = 17 + +RC_512_6_0 = 25 +RC_512_6_1 = 29 +RC_512_6_2 = 39 +RC_512_6_3 = 43 + +RC_512_7_0 = 8 +RC_512_7_1 = 35 +RC_512_7_2 = 56 +RC_512_7_3 = 22 + +RC_1024_0_0 = 24 +RC_1024_0_1 = 13 +RC_1024_0_2 = 8 +RC_1024_0_3 = 47 +RC_1024_0_4 = 8 +RC_1024_0_5 = 17 +RC_1024_0_6 = 22 +RC_1024_0_7 = 37 + +RC_1024_1_0 = 38 +RC_1024_1_1 = 19 +RC_1024_1_2 = 10 +RC_1024_1_3 = 55 +RC_1024_1_4 = 49 +RC_1024_1_5 = 18 +RC_1024_1_6 = 23 +RC_1024_1_7 = 52 + +RC_1024_2_0 = 33 +RC_1024_2_1 = 4 +RC_1024_2_2 = 51 +RC_1024_2_3 = 13 +RC_1024_2_4 = 34 +RC_1024_2_5 = 41 +RC_1024_2_6 = 59 +RC_1024_2_7 = 17 + +RC_1024_3_0 = 5 +RC_1024_3_1 = 20 +RC_1024_3_2 = 48 +RC_1024_3_3 = 41 +RC_1024_3_4 = 47 +RC_1024_3_5 = 28 +RC_1024_3_6 = 16 +RC_1024_3_7 = 25 + +RC_1024_4_0 = 41 +RC_1024_4_1 = 9 +RC_1024_4_2 = 37 +RC_1024_4_3 = 31 +RC_1024_4_4 = 12 +RC_1024_4_5 = 47 +RC_1024_4_6 = 44 +RC_1024_4_7 = 30 + +RC_1024_5_0 = 16 +RC_1024_5_1 = 34 +RC_1024_5_2 = 56 +RC_1024_5_3 = 51 +RC_1024_5_4 = 4 +RC_1024_5_5 = 53 +RC_1024_5_6 = 42 +RC_1024_5_7 = 41 + +RC_1024_6_0 = 31 +RC_1024_6_1 = 44 +RC_1024_6_2 = 47 +RC_1024_6_3 = 46 +RC_1024_6_4 = 19 +RC_1024_6_5 = 42 +RC_1024_6_6 = 44 +RC_1024_6_7 = 25 + +RC_1024_7_0 = 9 +RC_1024_7_1 = 48 +RC_1024_7_2 = 
35 +RC_1024_7_3 = 52 +RC_1024_7_4 = 23 +RC_1024_7_5 = 31 +RC_1024_7_6 = 37 +RC_1024_7_7 = 20 +; +; mov64: 64-bit move, a thin wrapper around movq +mov64 macro x0,x1 + movq x0,x1 +endm +; +;---------------------------------------------------------------- +; declare allocated space on the stack +; (defines localName as the current _STK_OFFS_, then advances _STK_OFFS_ by localSize) +StackVar macro localName,localSize +localName = _STK_OFFS_ +_STK_OFFS_ = _STK_OFFS_+(localSize) +endm ;StackVar +; +;---------------------------------------------------------------- +; +; MACRO: Configure stack frame, allocate local vars +; +Setup_Stack macro WCNT,RND_CNT +_STK_OFFS_ = 0 ;starting offset from esp, forced on 16-byte alignment + ;----- local variables ;<-- esp + StackVar X_stk , 8*(WCNT) ;local context vars + StackVar Wcopy , 8*(WCNT) ;copy of input block + StackVar ksTwk ,16*3 ;key schedule: tweak words + StackVar ksKey ,16*(WCNT)+16;key schedule: key words +FRAME_OFFS = ksTwk+128 ;<-- ebp + if (SKEIN_ASM_UNROLL and (WCNT*64)) eq 0 + StackVar ksRot,16*(RND_CNT/4);leave space for ks "rotation" to happen + endif +LOCAL_SIZE = _STK_OFFS_ ;size of local vars + ; + ;"restart" the stack defns, because we relocate esp to guarantee alignment + ; (i.e., these vars are NOT at fixed offsets from esp) +_STK_OFFS_ = 0 + ;----- + StackVar savRegs,8*4 ;pushad data + StackVar retAddr,4 ;return address + ;----- caller parameters + StackVar ctxPtr ,4 ;context ptr + StackVar blkPtr ,4 ;pointer to block data + StackVar blkCnt ,4 ;number of full blocks to process + StackVar bitAdd ,4 ;bit count to add to tweak + ;----- caller's stack frame +; +; Notes on stack frame setup: +; * the most used variable (except for Skein-256) is X_stk[], based at [esp+0] +; * the next most used is the key schedule words +; so ebp is "centered" there, allowing short offsets to the key/tweak +; schedule in 256/512-bit Skein cases, but not possible for Skein-1024 :-( +; * the Wcopy variables are infrequently accessed, and they have long +; offsets from both esp and ebp only in the 1024-bit case. 
+; * all other local vars and calling parameters can be accessed +; with short offsets, except in the 1024-bit case +; + pushad ;save all regs + mov ebx,esp ;keep ebx as pointer to caller parms + sub esp,LOCAL_SIZE ;make room for the locals + and esp,not 15 ;force alignment + mov edi,[ebx+ctxPtr ] ;edi --> Skein context + lea ebp,[esp+FRAME_OFFS] ;maximize use of short offsets from ebp + mov ecx,ptr32 [ebx+blkCnt] ;keep block cnt in ecx +; +endm ;Setup_Stack +; +; NOTE(review): the five equ lines below have no operand text in this copy; they presumably +; carried angle-bracketed text macros that were lost -- confirm against the upstream file +FP_ equ ;keep as many short offsets as possible +SI_ equ ;keep as many short offsets as possible +ptr64 equ ;useful abbreviations +ptr32 equ +ptr08 equ +; +;---------------------------------------------------------------- +; +; MACRO: release the locals, restore the registers saved by Setup_Stack, and +; report the size of the procedure that starts at procStart +Reset_Stack macro procStart + mov esp,ebx ;get rid of locals (wipe??) + popad ;restore all regs + + ;display code size in bytes to stdout + irp _BCNT_,<%($+1-procStart)> ;account for return opcode +if _BCNT_ ge 10000 ;(align it all pretty) +%out procStart code size = _BCNT_ bytes +elseif _BCNT_ ge 1000 +%out procStart code size = _BCNT_ bytes +else +%out procStart code size = _BCNT_ bytes +endif + endm ;irp _BCNT_ + +endm ; Reset_Stack +; +;---------------------------------------------------------------- +; macros to help debug internals +; +if _SKEIN_DEBUG + extrn _Skein_Show_Block:near ;calls to C routines + extrn _Skein_Show_Round:near +; +; round codes at or above SKEIN_RND_SPECIAL are passed through as-is by Skein_Debug_Round +SKEIN_RND_SPECIAL = 1000 +SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0 +SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1 +SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2 +; +Skein_Debug_Block macro BLK_BITS +; +;void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X, +; const u08b_t *blkPtr, const u64b_t *wPtr, +; const u64b_t *ksPtr,const u64b_t *tsPtr); +; + Put_XMM_&BLK_BITS + pushad ;save all regs + lea eax,[FP_+ksTwk+1] ;+1 = flag: "stride" size = 2 qwords + lea esi,[FP_+ksKey+1] + lea ecx,[esp+32+Wcopy] ;adjust offset by 32 for pushad + mov edx,[ebx+ctxPtr] ;ctx_hdr_ptr + lea edx,[edx+X_VARS] ;edx ==> ctx->X[] + push eax ;tsPtr + push 
esi ;ksPtr + push ecx ;wPtr + push ptr32 [ebx+blkPtr] ;blkPtr + push edx ;ctx->Xptr + push ptr32 [ebx+ctxPtr] ;ctx_hdr_ptr + mov eax,BLK_BITS + push eax ;bits + ifdef _MINGW_ + call _Skein_Show_Block-4 ;strange linkage?? + else + call _Skein_Show_Block + endif + add esp,7*4 ;discard parameter space on stack + popad ;restore regs +; + Get_XMM_&BLK_BITS +endm ;Skein_Debug_Block + +; +Skein_Debug_Round macro BLK_BITS,R,saveRegs +; +;void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X); +; +; NOTE(review): the ifnb below has no operand in this copy (presumably it tested the +; saveRegs argument) -- confirm against the upstream file + ifnb + Put_XMM_&BLK_BITS + endif + pushad ;save all regs + if R ne SKEIN_RND_FEED_FWD + lea eax,[esp+32+X_stk] ;adjust offset by 32 for pushad + else + mov eax,[ebx+ctxPtr] + add eax,X_VARS + endif + push eax ;Xptr + ; round number: R itself when this variant is fully unrolled or R is a special code, + ; otherwise derived from the loop counter in edx + if (SKEIN_ASM_UNROLL and BLK_BITS) or (R ge SKEIN_RND_SPECIAL) + mov eax,R + else + lea eax,[4*edx+1+(((R)-1) and 3)] ;compute round number using edx + endif + push eax ;round number + push ptr32 [ebx+ctxPtr] ;ctx_hdr_ptr + mov eax,BLK_BITS + push eax ;bits + ifdef _MINGW_ + call _Skein_Show_Round-4 ;strange linkage?? 
+ else + call _Skein_Show_Round + endif + add esp,4*4 ;discard parameter space on stack + popad ;restore regs + + ifnb + Get_XMM_&BLK_BITS ;save internal vars for debug dump + endif +endm ;Skein_Debug_Round +endif ;ifdef SKEIN_DEBUG +; +;---------------------------------------------------------------- +; useful macros +_ldX macro xn + ifnb + mov64 xmm&xn,ptr64 [esp+X_stk+8*xn] + endif +endm + +_stX macro xn + ifnb + mov64 ptr64 [esp+X_stk+8*xn],xmm&xn + endif +endm +; +;---------------------------------------------------------------- +; +if _USE_ASM_ and 256 + public _Skein_256_Process_Block +; +; void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd); +; +;;;;;;;;;;;;;;;;; +; +; Skein-256 round macros +; +R_256_OneRound macro _RR_,x0,x1,x2,x3,t0,t1 + irp _qq_,<%((_RR_) and 7)> ;figure out which rotation constants to use + if x0 eq 0 +_RC0_ = RC_256_&_qq_&_0 +_RC1_ = RC_256_&_qq_&_1 + else +_RC0_ = RC_256_&_qq_&_1 +_RC1_ = RC_256_&_qq_&_0 + endif + endm +; + paddq xmm&x0,xmm&x1 + mov64 xmm&t0,xmm&x1 + psllq xmm&x1, _RC0_ + psrlq xmm&t0,64-_RC0_ + xorpd xmm&x1,xmm&x0 + xorpd xmm&x1,xmm&t0 +; + paddq xmm&x2,xmm&x3 + mov64 xmm&t1,xmm&x3 + psllq xmm&x3, _RC1_ + psrlq xmm&t1,64-_RC1_ + xorpd xmm&x3,xmm&x2 + xorpd xmm&x3,xmm&t1 + if _SKEIN_DEBUG + Skein_Debug_Round 256,%(_RR_+1),saveRegs + endif +endm ;R_256_OneRound +; +R_256_FourRounds macro _RN_ + R_256_OneRound (_RN_+0),0,1,2,3,4,5 + R_256_OneRound (_RN_+1),2,1,0,3,4,5 + + R_256_OneRound (_RN_+2),0,1,2,3,4,5 + R_256_OneRound (_RN_+3),2,1,0,3,4,5 + + ;inject key schedule + inc edx ;bump round number + movd xmm4,edx + if _UNROLL_CNT eq (ROUNDS_256/8) + ;fully unrolled version +_RK_ = ((_RN_)/4) ;key injection counter + paddq xmm0,[FP_+ksKey+16*((_RK_+1) mod 5)] + paddq xmm1,[FP_+ksKey+16*((_RK_+2) mod 5)] + paddq xmm2,[FP_+ksKey+16*((_RK_+3) mod 5)] + paddq xmm3,[FP_+ksKey+16*((_RK_+4) mod 5)] + paddq xmm1,[FP_+ksTwk+16*((_RK_+1) mod 3)] + paddq 
xmm2,[FP_+ksTwk+16*((_RK_+2) mod 3)] + paddq xmm3,xmm4 + else ;looping version + paddq xmm0,[SI_+ksKey+16*1] + paddq xmm1,[SI_+ksKey+16*2] + paddq xmm2,[SI_+ksKey+16*3] + paddq xmm3,[SI_+ksKey+16*4] + paddq xmm1,[SI_+ksTwk+16*1] + paddq xmm2,[SI_+ksTwk+16*2] + paddq xmm3,xmm4 +; + mov64 xmm4,;first, "rotate" key schedule on the stack + mov64 xmm5,; (for next time through) + mov64 ,xmm4 + mov64 ,xmm5 + add esi,16 ;bump rolling pointer + endif + if _SKEIN_DEBUG + Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,saveRegs + endif +endm ;R256_FourRounds +; +if _SKEIN_DEBUG ; macros for saving/restoring X_stk for debug routines +Put_XMM_256 equ +Get_XMM_256 equ + +_Put_XMM_256: + irp _NN_,<0,1,2,3> + mov64 ptr64 [esp+X_stk+4+_NN_*8],xmm&_NN_ + endm + ret +; +_Get_XMM_256: + irp _NN_,<0,1,2,3> + mov64 xmm&_NN_,ptr64 [esp+X_stk+4+_NN_*8] + endm + ret +endif +; +;;;;;;;;;;;;;;;;; +; +; code +; +_Skein_256_Process_Block proc near + WCNT = 4 ;WCNT=4 for Skein-256 + Setup_Stack WCNT,ROUNDS_256 + ; main hash loop for Skein_256 +Skein_256_block_loop: + movd xmm4,ptr32 [ebx+bitAdd] + mov64 xmm5,ptr64 [edi+TWEAK+0] + mov64 xmm6,ptr64 [edi+TWEAK+8] + paddq xmm5,xmm4 ;bump T0 by the bitAdd parameter + mov64 ptr64 [edi+TWEAK],xmm5 ;save updated tweak value T0 (for next time) + movapd xmm7,xmm6 + xorpd xmm7,xmm5 ;compute overall tweak parity + movdqa [FP_+ksTwk ],xmm5 ;save the expanded tweak schedule on the stack + movdqa [FP_+ksTwk+16],xmm6 + movdqa [FP_+ksTwk+32],xmm7 + + mov esi,[ebx+blkPtr] ;esi --> input block + mov eax,KW_PARITY_LO ;init key schedule parity accumulator + mov edx,KW_PARITY_HI + movd xmm4,eax + movd xmm0,edx + unpcklps xmm4,xmm0 ;pack two 32-bit words into xmm4 +; + irp _NN_,<0,1,2,3> ;copy in the chaining vars + mov64 xmm&_NN_,ptr64 [edi+X_VARS+8*_NN_] + xorpd xmm4,xmm&_NN_ ;update overall parity + movdqa [FP_+ksKey+16*_NN_],xmm&_NN_ + endm + movdqa [FP_+ksKey+16*WCNT],xmm4;save overall parity at the end of the array +; + paddq xmm1,xmm5 ;inject the initial tweak 
words + paddq xmm2,xmm6 +; + irp _NN_,<0,1,2,3> ;perform the initial key injection + mov64 xmm4,ptr64 [esi+8*_NN_] ;and save a copy of the input block on stack + mov64 ptr64 [esp+8*_NN_+Wcopy],xmm4 + paddq xmm&_NN_,xmm4 + endm +; +if _SKEIN_DEBUG ;debug dump of state at this point + Skein_Debug_Block 256 + Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL,saveRegs +endif + add esi, WCNT*8 ;skip to the next block + mov [ebx+blkPtr ],esi ;save the updated block pointer + ; + ; now the key schedule is computed. Start the rounds + ; + xor edx,edx ;edx = iteration count +if SKEIN_ASM_UNROLL and 256 +_UNROLL_CNT = ROUNDS_256/8 ;fully unrolled +else +_UNROLL_CNT = SKEIN_UNROLL_256 ;partial unroll count + if ((ROUNDS_256/8) mod _UNROLL_CNT) + .err "Invalid SKEIN_UNROLL_256" ;sanity check + endif + mov esi,ebp ;use this as "rolling" pointer into ksTwk/ksKey +Skein_256_round_loop: ; (since there's no 16* scaled address mode) +endif +; +_Rbase_ = 0 +rept _UNROLL_CNT*2 ; here with X[0..3] in XMM0..XMM3 + R_256_FourRounds _Rbase_ +_Rbase_ = _Rbase_+4 +endm ;rept _UNROLL_CNT*2 +; + if _UNROLL_CNT ne (ROUNDS_256/8) + cmp edx,2*(ROUNDS_256/8) + jb Skein_256_round_loop + endif + ;---------------------------- + ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3} + irp _NN_,<0,1,2,3> + mov64 xmm4,ptr64 [esp+Wcopy+8*_NN_] + xorpd xmm&_NN_,xmm4 + mov64 ptr64 [edi+X_VARS+8*_NN_],xmm&_NN_ + endm + and ptr08 [edi +TWEAK +15],FIRST_MASK8 +if _SKEIN_DEBUG + Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,saveRegs +endif + ; go back for more blocks, if needed + dec ecx + jnz Skein_256_block_loop + + Reset_Stack _Skein_256_Process_Block + ret +; +_Skein_256_Process_Block endp +; +ifdef _SKEIN_CODE_SIZE + public _Skein_256_Process_Block_CodeSize +_Skein_256_Process_Block_CodeSize proc + mov eax,_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block + ret +_Skein_256_Process_Block_CodeSize endp +; + public _Skein_256_Unroll_Cnt +_Skein_256_Unroll_Cnt proc + if _UNROLL_CNT ne ROUNDS_256/8 + mov 
eax,_UNROLL_CNT + else + xor eax,eax + endif + ret +_Skein_256_Unroll_Cnt endp +endif +endif ;_USE_ASM_ and 256 +; +;---------------------------------------------------------------- +; +if _USE_ASM_ and 512 + public _Skein_512_Process_Block +; +; void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd); +; +;;;;;;;;;;;;;;;;; +; MACRO: one round +; +R_512_Round macro _RR_, a0,a1,Ra, b0,b1,Rb, c0,c1,Rc, d0,d1,Rd +irp _nr_,<%((_RR_) and 7)> +_Ra_ = RC_512_&_nr_&_&Ra +_Rb_ = RC_512_&_nr_&_&Rb +_Rc_ = RC_512_&_nr_&_&Rc +_Rd_ = RC_512_&_nr_&_&Rd +endm + paddq xmm&a0,xmm&a1 + _stX c0 + mov64 xmm&c0,xmm&a1 + psllq xmm&a1, _Ra_ + psrlq xmm&c0,64-_Ra_ + xorpd xmm&a1,xmm&c0 + xorpd xmm&a1,xmm&a0 + + paddq xmm&b0,xmm&b1 + _stX a0 + mov64 xmm&a0,xmm&b1 + psllq xmm&b1, _Rb_ + psrlq xmm&a0,64-_Rb_ + xorpd xmm&b1,xmm&b0 + _ldX c0 + xorpd xmm&b1,xmm&a0 + + paddq xmm&c0,xmm&c1 + mov64 xmm&a0,xmm&c1 + psllq xmm&c1, _Rc_ + psrlq xmm&a0,64-_Rc_ + xorpd xmm&c1,xmm&c0 + xorpd xmm&c1,xmm&a0 + + paddq xmm&d0,xmm&d1 + mov64 xmm&a0,xmm&d1 + psllq xmm&d1, _Rd_ + psrlq xmm&a0,64-_Rd_ + xorpd xmm&d1,xmm&a0 + _ldX a0 + xorpd xmm&d1,xmm&d0 + if _SKEIN_DEBUG + Skein_Debug_Round 512,%(_RR_+1),saveRegs + endif +endm +; +; MACRO: four rounds +R_512_FourRounds macro _RN_ + R_512_Round (_RN_) , 0,1,0, 2,3,1, 4,5,2, 6,7,3 + R_512_Round (_RN_)+1, 2,1,0, 4,7,1, 6,5,2, 0,3,3 + R_512_Round (_RN_)+2, 4,1,0, 6,3,1, 0,5,2, 2,7,3 + R_512_Round (_RN_)+3, 6,1,0, 0,7,1, 2,5,2, 4,3,3 + + ;inject key schedule + irp _NN_,<0,1,2,3,4,5,6,7> + if _UNROLL_CNT eq (ROUNDS_512/8) + paddq xmm&_NN_,[FP_+ksKey+16*((((_RN_)/4)+(_NN_)+1) mod 9)] + else + paddq xmm&_NN_,[SI_+ksKey+16*((_NN_)+1)] + endif + endm + _stX 0 ;free up a register + inc edx ;bump round counter + movd xmm0,edx ;inject the tweak + if _UNROLL_CNT eq (ROUNDS_512/8) + paddq xmm5,[FP_+ksTwk+16*(((_RN_)+1) mod 3)] + paddq xmm6,[FP_+ksTwk+16*(((_RN_)+2) mod 3)] + paddq xmm7,xmm0 + else ;looping version + paddq 
xmm5,[SI_+ksTwk+16*1] + paddq xmm6,[SI_+ksTwk+16*2] + paddq xmm7,xmm0 +; + mov64 xmm0,;first, "rotate" key schedule on the stack + mov64 ,xmm0 + mov64 xmm0,; (for next time through) + mov64 ,xmm0 + add esi,16 ;bump rolling pointer + endif + _ldX 0 ;restore X0 + if _SKEIN_DEBUG + Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT,saveRegs + endif +endm ;R_512_FourRounds +;;;;;;;;;;;;;;;;; +if _SKEIN_DEBUG ; macros for saving/restoring X_stk for debug routines +Put_XMM_512 equ +Get_XMM_512 equ + +_Put_XMM_512: + irp _NN_,<0,1,2,3,4,5,6,7> + mov64 ptr64 [esp+X_stk+4+_NN_*8],xmm&_NN_ + endm + ret +; +_Get_XMM_512: + irp _NN_,<0,1,2,3,4,5,6,7> + mov64 xmm&_NN_,ptr64 [esp+X_stk+4+_NN_*8] + endm + ret +endif +; +;;;;;;;;;;;;;;;;; +; code +; +_Skein_512_Process_Block proc near + WCNT = 8 ;WCNT=8 for Skein-512 + Setup_Stack WCNT,ROUNDS_512 + ; main hash loop for Skein_512 +Skein_512_block_loop: + movd xmm0,ptr32 [ebx+bitAdd] + mov64 xmm1,ptr64 [edi+TWEAK+0] + mov64 xmm2,ptr64 [edi+TWEAK+8] + paddq xmm1,xmm0 ;bump T0 by the bitAdd parameter + mov64 ptr64 [edi+TWEAK],xmm1 ;save updated tweak value T0 (for next time) + mov64 xmm0,xmm2 + xorpd xmm0,xmm1 ;compute overall tweak parity + movdqa [FP_+ksTwk ],xmm1 ;save the expanded tweak schedule on the stack + movdqa [FP_+ksTwk+16*1],xmm2 + movdqa [FP_+ksTwk+16*2],xmm0 + + mov esi,[ebx+blkPtr] ;esi --> input block + mov eax,KW_PARITY_LO ;init key schedule parity accumulator + mov edx,KW_PARITY_HI + movd xmm0,eax + movd xmm7,edx + unpcklps xmm0,xmm7 ;pack two 32-bit words into xmm0 +; + irp _NN_,<7,6,5,4,3,2,1> ;copy in the chaining vars (skip #0 for now) + mov64 xmm&_NN_,ptr64 [edi+X_VARS+8*_NN_] + xorpd xmm0,xmm&_NN_ ;update overall parity + movdqa [FP_+ksKey+16*_NN_],xmm&_NN_ + if _NN_ eq 5 + paddq xmm5,xmm1 ;inject the initial tweak words + paddq xmm6,xmm2 ; (before they get trashed in xmm1/2) + endif + endm + mov64 xmm4,ptr64 [edi+X_VARS] ;handle #0 now + xorpd xmm0,xmm4 ;update overall parity + movdqa [FP_+ksKey+16* 0 ],xmm4;save the 
key value in slot #0 + movdqa [FP_+ksKey+16*WCNT],xmm0;save overall parity at the end of the array +; + mov64 xmm0,xmm4 + irp _NN_,<7,6,5, 4,3,2,1,0> ;perform the initial key injection (except #4) + mov64 xmm4,ptr64 [esi+ 8*_NN_];and save a copy of the input block on stack + mov64 ptr64 [esp+ 8*_NN_+Wcopy],xmm4 + paddq xmm&_NN_,xmm4 + endm + mov64 xmm4,ptr64 [esi+ 8*4] ;get input block word #4 + mov64 ptr64 [esp+ 8*4+Wcopy],xmm4 + paddq xmm4,[FP_+ksKey+16*4] ;inject the initial key +; +if _SKEIN_DEBUG ;debug dump of state at this point + Skein_Debug_Block 512 + Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL,saveRegs +endif + add esi, WCNT*8 ;skip to the next block + mov [ebx+blkPtr],esi ;save the updated block pointer + ; + ; now the key schedule is computed. Start the rounds + ; + xor edx,edx ;edx = round counter +if SKEIN_ASM_UNROLL and 512 +_UNROLL_CNT = ROUNDS_512/8 +else +_UNROLL_CNT = SKEIN_UNROLL_512 + if ((ROUNDS_512/8) mod _UNROLL_CNT) + .err "Invalid SKEIN_UNROLL_512" + endif + mov esi,ebp ;use this as "rolling" pointer into ksTwk/ksKey +Skein_512_round_loop: ; (since there's no 16* scaled address mode) +endif +_Rbase_ = 0 +rept _UNROLL_CNT*2 + R_512_FourRounds _Rbase_ +_Rbase_ = _Rbase_+4 +endm ;rept _UNROLL_CNT +; +if (SKEIN_ASM_UNROLL and 512) eq 0 + cmp edx,2*(ROUNDS_512/8) + jb Skein_512_round_loop +endif + ;---------------------------- + ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7} + and ptr08 [edi +TWEAK +15],FIRST_MASK8 +irp _NN_,<0,2,4,6> ;do the aligned ones first + xorpd xmm&_NN_,[esp+Wcopy+8*_NN_] + mov64 ptr64 [edi+X_VARS+8*_NN_],xmm&_NN_ +endm +irp _NN_,<1,3,5,7> ;now we have some register space available + mov64 xmm0,ptr64 [esp+Wcopy+8*_NN_] + xorpd xmm&_NN_,xmm0 + mov64 ptr64 [edi+X_VARS+8*_NN_],xmm&_NN_ +endm +if _SKEIN_DEBUG + Skein_Debug_Round 512,SKEIN_RND_FEED_FWD +endif + ; go back for more blocks, if needed + dec ecx + jnz Skein_512_block_loop + + Reset_Stack _Skein_512_Process_Block + ret +_Skein_512_Process_Block endp +; +ifdef 
_SKEIN_CODE_SIZE + public _Skein_512_Process_Block_CodeSize +_Skein_512_Process_Block_CodeSize proc + mov eax,_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block + ret +_Skein_512_Process_Block_CodeSize endp +; + public _Skein_512_Unroll_Cnt +_Skein_512_Unroll_Cnt proc + if _UNROLL_CNT ne ROUNDS_512/8 + mov eax,_UNROLL_CNT + else + xor eax,eax + endif + ret +_Skein_512_Unroll_Cnt endp +endif +; +endif ; _USE_ASM_ and 512 +; +;---------------------------------------------------------------- +; +if _USE_ASM_ and 1024 + public _Skein1024_Process_Block +; +; void Skein_1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd); +; +R_1024_REGS equ (5) ;keep this many block variables in registers +; +;;;;;;;;;;;;;;;; +if _SKEIN_DEBUG ; macros for saving/restoring X_stk for debug routines +Put_XMM_1024 equ +Get_XMM_1024 equ + +_Put_XMM_1024: +_NN_ = 0 + rept R_1024_REGS + irp _rr_,<%(_NN_)> + mov64 ptr64 [esp+X_stk+4+8*_NN_],xmm&_rr_ + endm +_NN_ = _NN_+1 + endm + ret +; +_Get_XMM_1024: +_NN_ = 0 + rept R_1024_REGS + irp _rr_,<%(_NN_)> + mov64 xmm&_rr_,ptr64 [esp+X_stk+4+8*_NN_] + endm +_NN_ = _NN_+1 + endm + ret +endif +; +;;;;;;;;;;;;;;;;; +; MACRO: one mix step +MixStep_1024 macro x0,x1,rotIdx0,rotIdx1,_debug_ +_r0_ = x0 ;default, if already loaded +_r1_ = x1 + ; load the regs (if necessary) + if (x0 ge R_1024_REGS) +_r0_ = 5 + mov64 xmm5,ptr64 [esp+X_stk+8*(x0)] + endif + if (x1 ge R_1024_REGS) +_r1_ = 6 + mov64 xmm6,ptr64 [esp+X_stk+8*(x1)] + endif + ; do the mix + irp _rx_,<%((rotIdx0) and 7)> +_Rc_ = RC_1024_&_rx_&_&rotIdx1 ;rotation constant + endm + irp _x0_,<%_r0_> + irp _x1_,<%_r1_> + paddq xmm&_x0_,xmm&_x1_ + mov64 xmm7 ,xmm&_x1_ + psllq xmm&_x1_, _Rc_ + psrlq xmm7 ,64-_Rc_ + xorpd xmm&_x1_,xmm&_x0_ + xorpd xmm&_x1_,xmm7 + endm + endm + ; save the regs (if necessary) + if (x0 ge R_1024_REGS) + mov64 ptr64 [esp+X_stk+8*(x0)],xmm5 + endif + if (x1 ge R_1024_REGS) + mov64 ptr64 [esp+X_stk+8*(x1)],xmm6 + endif + ; 
debug output + if _SKEIN_DEBUG and (0 ne (_debug_ + 0)) + Skein_Debug_Round 1024,%((RotIdx0)+1),saveRegs + endif +endm +;;;;;;;;;;;;;;;;; +; MACRO: four rounds +; +R_1024_FourRounds macro _RR_ + ;--------- round _RR_ + MixStep_1024 0, 1,%((_RR_)+0),0 + MixStep_1024 2, 3,%((_RR_)+0),1 + MixStep_1024 4, 5,%((_RR_)+0),2 + MixStep_1024 6, 7,%((_RR_)+0),3 + MixStep_1024 8, 9,%((_RR_)+0),4 + MixStep_1024 10,11,%((_RR_)+0),5 + MixStep_1024 12,13,%((_RR_)+0),6 + MixStep_1024 14,15,%((_RR_)+0),7,1 + ;--------- round _RR_+1 + MixStep_1024 0, 9,%((_RR_)+1),0 + MixStep_1024 2,13,%((_RR_)+1),1 + MixStep_1024 6,11,%((_RR_)+1),2 + MixStep_1024 4,15,%((_RR_)+1),3 + MixStep_1024 10, 7,%((_RR_)+1),4 + MixStep_1024 12, 3,%((_RR_)+1),5 + MixStep_1024 14, 5,%((_RR_)+1),6 + MixStep_1024 8, 1,%((_RR_)+1),7,1 + ;--------- round _RR_+2 + MixStep_1024 0, 7,%((_RR_)+2),0 + MixStep_1024 2, 5,%((_RR_)+2),1 + MixStep_1024 4, 3,%((_RR_)+2),2 + MixStep_1024 6, 1,%((_RR_)+2),3 + MixStep_1024 12,15,%((_RR_)+2),4 + MixStep_1024 14,13,%((_RR_)+2),5 + MixStep_1024 8,11,%((_RR_)+2),6 + MixStep_1024 10, 9,%((_RR_)+2),7,1 + ;--------- round _RR_+3 + MixStep_1024 0,15,%((_RR_)+3),0 + MixStep_1024 2,11,%((_RR_)+3),1 + MixStep_1024 6,13,%((_RR_)+3),2 + MixStep_1024 4, 9,%((_RR_)+3),3 + MixStep_1024 14, 1,%((_RR_)+3),4 + MixStep_1024 8, 5,%((_RR_)+3),5 + MixStep_1024 10, 3,%((_RR_)+3),6 + MixStep_1024 12, 7,%((_RR_)+3),7,1 + + inc edx ;edx = round number + movd xmm7,edx + ;inject the key +irp _NN_,<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0> + if _UNROLL_CNT ne (ROUNDS_1024/8) + if _NN_ lt R_1024_REGS + paddq xmm&_NN_,ptr64 [SI_+ksKey+16*_NN_+16] + else + mov64 xmm6 ,ptr64 [esp+X_stk+ 8*_NN_] + if _NN_ eq 15 + paddq xmm6,xmm7 + elseif _NN_ eq 14 + paddq xmm6,ptr64 [SI_+ksTwk+16*2] + elseif _NN_ eq 13 + paddq xmm6,ptr64 [SI_+ksTwk+16*1] + endif + paddq xmm6 ,ptr64 [SI_+ksKey+16*_NN_+16] + mov64 ptr64 [esp+X_stk+ 8*_NN_],xmm6 + endif + else + if _NN_ lt R_1024_REGS + paddq xmm&_NN_,ptr64 
[FP_+ksKey+16*(((_Rbase_/4)+(_NN_)+1) mod 17)] + else + mov64 xmm6,ptr64 [esp+X_stk+ 8*_NN_] + paddq xmm6,ptr64 [FP_+ksKey+16*(((_Rbase_/4)+(_NN_)+1) mod 17)] + if _NN_ eq 15 + paddq xmm6,xmm7 + elseif _NN_ eq 14 + paddq xmm6,ptr64 [FP_+ksTwk+16*(((_Rbase_/4)+2) mod 3)] + elseif _NN_ eq 13 + paddq xmm6,ptr64 [FP_+ksTwk+16*(((_Rbase_/4)+1) mod 3)] + endif + mov64 ptr64 [esp+X_stk+ 8*_NN_],xmm6 + endif + endif +endm +if _UNROLL_CNT ne (ROUNDS_1024/8) ;rotate the key schedule on the stack + mov64 xmm6,ptr64 [SI_+ksKey] + mov64 xmm7,ptr64 [SI_+ksTwk] + mov64 ptr64 [SI_+ksKey+16*(WCNT+1)],xmm6 + mov64 ptr64 [SI_+ksTwk+16* 3 ],xmm7 + add esi,16 ;bump rolling pointer +endif +if _SKEIN_DEBUG + Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,saveRegs +endif +endm ;R_1024_FourRounds +;;;;;;;;;;;;;;;; +; code +; +_Skein1024_Process_Block proc near +; + WCNT = 16 ;WCNT=16 for Skein-1024 + Setup_Stack WCNT,ROUNDS_1024 + add edi,80h ;bias the edi ctxt offsets to keep them all short +ctx equ ;offset alias + ; main hash loop for Skein1024 +Skein1024_block_loop: + movd xmm0,ptr32 [ebx+bitAdd] + mov64 xmm1,ptr64 [ctx+TWEAK+0] + mov64 xmm2,ptr64 [ctx+TWEAK+8] + paddq xmm1,xmm0 ;bump T0 by the bitAdd parameter + mov64 ptr64 [ctx+TWEAK],xmm1 ;save updated tweak value T0 (for next time) + mov64 xmm0,xmm2 + xorpd xmm0,xmm1 ;compute overall tweak parity + movdqa [FP_+ksTwk ],xmm1 ;save the expanded tweak schedule on the stack + movdqa [FP_+ksTwk+16],xmm2 + movdqa [FP_+ksTwk+32],xmm0 + + mov esi,[ebx+blkPtr] ;esi --> input block + mov eax,KW_PARITY_LO ;init key schedule parity accumulator + mov edx,KW_PARITY_HI + movd xmm7,eax + movd xmm6,edx + unpcklps xmm7,xmm6 ;pack two 32-bit words into xmm7 +; + lea eax,[esp+80h] ;use short offsets for Wcopy, X_stk writes below +SP_ equ ;[eax+OFFS] mode is one byte shorter than [esp+OFFS] +irp _NN_,<15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0> + mov64 xmm6,ptr64 [ctx+X_VARS+8*_NN_] + xorpd xmm7,xmm6 ;update overall parity + movdqa [FP_+ksKey+16*_NN_],xmm6;save 
the key schedule on the stack + if _NN_ lt R_1024_REGS + _rr_ = _NN_ + else + _rr_ = R_1024_REGS + endif + irp _rn_,<%(_rr_)> + mov64 xmm&_rn_,ptr64 [esi+ 8*_NN_];save copy of the input block on stack + mov64 ptr64 [SP_+ Wcopy + 8*_NN_],xmm&_rn_ ;(for feedforward later) + paddq xmm&_rn_,xmm6 ;inject the key into the block + if _NN_ eq 13 + paddq xmm&_rn_,xmm1 ;inject the initial tweak words + elseif _NN_ eq 14 + paddq xmm&_rn_,xmm2 + endif + if _NN_ ge R_1024_REGS ;only save X[5..15] on stack, leave X[0..4] in regs + mov64 ptr64 [SP_+X_stk+8*_NN_],xmm&_rn_ + endif + endm +endm + movdqa [FP_+ksKey+16*WCNT],xmm7;save overall key parity at the end of the array +; +if _SKEIN_DEBUG ;debug dump of state at this point + Skein_Debug_Block 1024 + Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL,saveRegs +endif + add esi, WCNT*8 ;skip to the next block + mov [ebx+blkPtr],esi ;save the updated block pointer + ; + ; now the key schedule is computed. Start the rounds + ; + xor edx,edx ;edx = round counter +if SKEIN_ASM_UNROLL and 1024 +_UNROLL_CNT = ROUNDS_1024/8 +else +_UNROLL_CNT = SKEIN_UNROLL_1024 + if ((ROUNDS_1024/8) mod _UNROLL_CNT) + .err "Invalid SKEIN_UNROLL_1024" + endif + mov esi,ebp ;use this as "rolling" pointer into ksTwk/ksKey +Skein_1024_round_loop: +endif +; +_Rbase_ = 0 +rept _UNROLL_CNT*2 + R_1024_FourRounds %_Rbase_ +_Rbase_ = _Rbase_+4 +endm ;rept _UNROLL_CNT +; +if (SKEIN_ASM_UNROLL and 1024) eq 0 + cmp edx,2*(ROUNDS_1024/8) + jb Skein_1024_round_loop +endif + and ptr08 [ctx +TWEAK +15],FIRST_MASK8 ;clear tweak bit for next time thru + ;---------------------------- + ; feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15} + lea eax,[esp+80h] ;allow short offsets to X_stk and Wcopy +irp _NN_,<0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15> + if _NN_ lt R_1024_REGS + if _NN_ and 1 ;already in regs: no load needed + mov64 xmm7 ,ptr64 [SP_+ Wcopy + 8*_NN_] ;unaligned + xorpd xmm&_NN_,xmm7 + else + xorpd xmm&_NN_, [SP_+ Wcopy + 8*_NN_] ;aligned + endif + mov64 ptr64 [ctx+ 
X_vars+ 8*_NN_],xmm&_NN_ + else + mov64 xmm7,ptr64 [SP_+ X_stk + 8*_NN_] ;load X value from stack + if _NN_ and 1 + mov64 xmm6,ptr64 [SP_+ Wcopy + 8*_NN_] ;unaligned + xorpd xmm7,xmm6 + else + xorpd xmm7, [SP_+ Wcopy + 8*_NN_] ;aligned + endif + mov64 ptr64 [ctx+ X_vars+ 8*_NN_],xmm7 + endif +endm +if _SKEIN_DEBUG + Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD ;no need to save regs on stack here +endif + ; go back for more blocks, if needed + dec ecx + jnz Skein1024_block_loop + + Reset_Stack _Skein1024_Process_Block + ret +_Skein1024_Process_Block endp +; +ifdef _SKEIN_CODE_SIZE + public _Skein1024_Process_Block_CodeSize +_Skein1024_Process_Block_CodeSize proc + mov eax,_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block + ret +_Skein1024_Process_Block_CodeSize endp +; + public _Skein1024_Unroll_Cnt +_Skein1024_Unroll_Cnt proc + if _UNROLL_CNT ne ROUNDS_1024/8 + mov eax,_UNROLL_CNT + else + xor eax,eax + endif + ret +_Skein1024_Unroll_Cnt endp +endif +; +endif ; _USE_ASM_ and 1024 +;---------------------------------------------------------------- + end Index: sys/contrib/skein/asm/skein_block_xmm32.s =================================================================== --- /dev/null +++ sys/contrib/skein/asm/skein_block_xmm32.s @@ -0,0 +1,1110 @@ +# +#---------------------------------------------------------------- +# 32-bit x86 assembler code for Skein block functions using XMM registers +# +# Author: Doug Whiting, Hifn/Exar +# +# This code is released to the public domain. 
+#---------------------------------------------------------------- +# + .text + .altmacro #use advanced macro features + .psize 0,128 #list file has no page boundaries +# +_MASK_ALL_ = (256+512+1024) #all three algorithm bits +SAVE_REGS = 1 +# +################# +.ifndef SKEIN_USE_ASM +_USE_ASM_ = _MASK_ALL_ +.elseif SKEIN_USE_ASM & _MASK_ALL_ +_USE_ASM_ = SKEIN_USE_ASM +.else +_USE_ASM_ = _MASK_ALL_ +.endif +# +################# +.ifndef SKEIN_LOOP +_SKEIN_LOOP = 002 #default is all fully unrolled, except Skein1024 +.else +_SKEIN_LOOP = SKEIN_LOOP +.endif +#-------------- +# the unroll counts (0 --> fully unrolled) +SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10 +SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10 +SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10 +# +SKEIN_ASM_UNROLL = 0 + .irp _NN_,256,512,1024 + .if (SKEIN_UNROLL_\_NN_) == 0 +SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + \_NN_ + .endif + .endr +# +################# +# +.ifndef SKEIN_ROUNDS +ROUNDS_256 = 72 +ROUNDS_512 = 72 +ROUNDS_1024 = 80 +.else +ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5) +ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5) +ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5) +.irp _NN_,256,512,1024 + .if _USE_ASM_ && \_NN_ + .irp _RR_,%(ROUNDS_\_NN_) + .if \_NN_ < 1024 +.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" + .else +.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" + .endif + .endr + .endif +.endr +.endif +################# +# +.ifdef SKEIN_CODE_SIZE +_SKEIN_CODE_SIZE = (1) +.else +.ifdef SKEIN_PERF #use code size if SKEIN_PERF is defined +_SKEIN_CODE_SIZE = (1) +.endif +.endif +# +################# +# +.ifndef SKEIN_DEBUG +_SKEIN_DEBUG = 0 +.else +_SKEIN_DEBUG = 1 +.endif +################# +# +# define offsets of fields in hash context structure +# +HASH_BITS = 0 ## bits of hash output +BCNT = 4 + HASH_BITS #number of bytes in BUFFER[] +TWEAK = 4 + BCNT #tweak values[0..1] +X_VARS = 16 + TWEAK #chaining vars +# +#(Note: buffer[] in context structure is NOT needed here :-) +# +KW_PARITY_LO= 
0xA9FC1A22 #overall parity of key schedule words (hi32/lo32) +KW_PARITY_HI= 0x1BD11BDA +FIRST_MASK8 = ~ (1 << 6) #FIRST block flag bit +# +# rotation constants for Skein +# +RC_256_0_0 = 14 +RC_256_0_1 = 16 + +RC_256_1_0 = 52 +RC_256_1_1 = 57 + +RC_256_2_0 = 23 +RC_256_2_1 = 40 + +RC_256_3_0 = 5 +RC_256_3_1 = 37 + +RC_256_4_0 = 25 +RC_256_4_1 = 33 + +RC_256_5_0 = 46 +RC_256_5_1 = 12 + +RC_256_6_0 = 58 +RC_256_6_1 = 22 + +RC_256_7_0 = 32 +RC_256_7_1 = 32 + +RC_512_0_0 = 46 +RC_512_0_1 = 36 +RC_512_0_2 = 19 +RC_512_0_3 = 37 + +RC_512_1_0 = 33 +RC_512_1_1 = 27 +RC_512_1_2 = 14 +RC_512_1_3 = 42 + +RC_512_2_0 = 17 +RC_512_2_1 = 49 +RC_512_2_2 = 36 +RC_512_2_3 = 39 + +RC_512_3_0 = 44 +RC_512_3_1 = 9 +RC_512_3_2 = 54 +RC_512_3_3 = 56 + +RC_512_4_0 = 39 +RC_512_4_1 = 30 +RC_512_4_2 = 34 +RC_512_4_3 = 24 + +RC_512_5_0 = 13 +RC_512_5_1 = 50 +RC_512_5_2 = 10 +RC_512_5_3 = 17 + +RC_512_6_0 = 25 +RC_512_6_1 = 29 +RC_512_6_2 = 39 +RC_512_6_3 = 43 + +RC_512_7_0 = 8 +RC_512_7_1 = 35 +RC_512_7_2 = 56 +RC_512_7_3 = 22 + +RC_1024_0_0 = 24 +RC_1024_0_1 = 13 +RC_1024_0_2 = 8 +RC_1024_0_3 = 47 +RC_1024_0_4 = 8 +RC_1024_0_5 = 17 +RC_1024_0_6 = 22 +RC_1024_0_7 = 37 + +RC_1024_1_0 = 38 +RC_1024_1_1 = 19 +RC_1024_1_2 = 10 +RC_1024_1_3 = 55 +RC_1024_1_4 = 49 +RC_1024_1_5 = 18 +RC_1024_1_6 = 23 +RC_1024_1_7 = 52 + +RC_1024_2_0 = 33 +RC_1024_2_1 = 4 +RC_1024_2_2 = 51 +RC_1024_2_3 = 13 +RC_1024_2_4 = 34 +RC_1024_2_5 = 41 +RC_1024_2_6 = 59 +RC_1024_2_7 = 17 + +RC_1024_3_0 = 5 +RC_1024_3_1 = 20 +RC_1024_3_2 = 48 +RC_1024_3_3 = 41 +RC_1024_3_4 = 47 +RC_1024_3_5 = 28 +RC_1024_3_6 = 16 +RC_1024_3_7 = 25 + +RC_1024_4_0 = 41 +RC_1024_4_1 = 9 +RC_1024_4_2 = 37 +RC_1024_4_3 = 31 +RC_1024_4_4 = 12 +RC_1024_4_5 = 47 +RC_1024_4_6 = 44 +RC_1024_4_7 = 30 + +RC_1024_5_0 = 16 +RC_1024_5_1 = 34 +RC_1024_5_2 = 56 +RC_1024_5_3 = 51 +RC_1024_5_4 = 4 +RC_1024_5_5 = 53 +RC_1024_5_6 = 42 +RC_1024_5_7 = 41 + +RC_1024_6_0 = 31 +RC_1024_6_1 = 44 +RC_1024_6_2 = 47 +RC_1024_6_3 = 46 +RC_1024_6_4 = 19 +RC_1024_6_5 = 42 
+RC_1024_6_6 = 44 +RC_1024_6_7 = 25 + +RC_1024_7_0 = 9 +RC_1024_7_1 = 48 +RC_1024_7_2 = 35 +RC_1024_7_3 = 52 +RC_1024_7_4 = 23 +RC_1024_7_5 = 31 +RC_1024_7_6 = 37 +RC_1024_7_7 = 20 +# +#---------------------------------------------------------------- +# declare allocated space on the stack +.macro StackVar localName,localSize +\localName = _STK_OFFS_ +_STK_OFFS_ = _STK_OFFS_+(\localSize) +.endm #StackVar +# +#---------------------------------------------------------------- +# +# MACRO: Configure stack frame, allocate local vars +# +.macro Setup_Stack WCNT,RND_CNT +_STK_OFFS_ = 0 #starting offset from esp, forced on 16-byte alignment + #----- local variables #<-- esp + StackVar X_stk , 8*(WCNT) #local context vars + StackVar Wcopy , 8*(WCNT) #copy of input block + StackVar ksTwk ,16*3 #key schedule: tweak words + StackVar ksKey ,16*(WCNT)+16#key schedule: key words +FRAME_OFFS = ksTwk+128 #<-- ebp +F_O = FRAME_OFFS #syntactic shorthand + .if (SKEIN_ASM_UNROLL && (WCNT*64)) == 0 + StackVar ksRot,16*(RND_CNT/4)#leave space for ks "rotation" to happen + .endif +LOCAL_SIZE = _STK_OFFS_ #size of local vars + # + #"restart" the stack defns, because we relocate esp to guarantee alignment + # (i.e., these vars are NOT at fixed offsets from esp) +_STK_OFFS_ = 0 + #----- + StackVar savRegs,8*4 #pushad data + StackVar retAddr,4 #return address + #----- caller parameters + StackVar ctxPtr ,4 #context ptr + StackVar blkPtr ,4 #pointer to block data + StackVar blkCnt ,4 #number of full blocks to process + StackVar bitAdd ,4 #bit count to add to tweak + #----- caller's stack frame +# +# Notes on stack frame setup: +# * the most used variable (except for Skein-256) is X_stk[], based at [esp+0] +# * the next most used is the key schedule words +# so ebp is "centered" there, allowing short offsets to the key/tweak +# schedule in 256/512-bit Skein cases, but not posible for Skein-1024 :-( +# * the Wcopy variables are infrequently accessed, and they have long +# offsets from both esp 
and ebp only in the 1024-bit case. +# * all other local vars and calling parameters can be accessed +# with short offsets, except in the 1024-bit case +# + pushal #save all regs + movl %esp,%ebx #keep ebx as pointer to caller parms + subl $LOCAL_SIZE,%esp #make room for the locals + andl $~15,%esp #force alignment + movl ctxPtr(%ebx),%edi #edi --> Skein context + leal FRAME_OFFS(%esp),%ebp #maximize use of short offsets from ebp + movl blkCnt(%ebx),%ecx #keep block cnt in ecx +.endm #Setup_Stack +# +#---------------------------------------------------------------- +# +.macro Reset_Stack,procStart + movl %ebx,%esp #get rid of locals (wipe??) + popal #restore all regs +.endm # Reset_Stack +# +#---------------------------------------------------------------- +# macros to help debug internals +# +.if _SKEIN_DEBUG + .extern _Skein_Show_Block #calls to C routines + .extern _Skein_Show_Round +# +SKEIN_RND_SPECIAL = 1000 +SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0 +SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1 +SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2 +# +.macro Skein_Debug_Block BLK_BITS +# +#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X, +# const u08b_t *blkPtr, const u64b_t *wPtr, +# const u64b_t *ksPtr,const u64b_t *tsPtr)# +# + call _Put_XMM_\BLK_BITS + pushal #save all regs + leal ksTwk+1-F_O(%ebp),%eax #+1 = flag: "stride" size = 2 qwords + leal ksKey+1-F_O(%ebp),%esi + leal Wcopy+32(%esp),%ecx #adjust offset by 32 for pushad + movl ctxPtr(%ebx) ,%edx #ctx_hdr_ptr + leal X_VARS(%edx) ,%edx #edx ==> cxt->X[] + pushl %eax #tsPtr + pushl %esi #ksPtr + pushl %ecx #wPtr + pushl blkPtr(%ebx) #blkPtr + pushl %edx #ctx->Xptr + pushl ctxPtr(%ebx) #ctx_hdr_ptr + movl $\BLK_BITS,%eax + pushl %eax #bits + call _Skein_Show_Block + addl $7*4,%esp #discard parameter space on stack + popal #restore regs +# + call _Get_XMM_\BLK_BITS +.endm #Skein_Debug_Block + +# +.macro Skein_Debug_Round BLK_BITS,R,saveRegs=0 +# +#void Skein_Show_Round(uint_t bits,const 
Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X)# +# + .if \saveRegs + call _Put_XMM_\BLK_BITS + .endif + pushal #save all regs + .if R <> SKEIN_RND_FEED_FWD + leal 32+X_stk(%esp),%eax #adjust offset by 32 for pushal + .else + movl ctxPtr(%ebx),%eax + addl $X_VARS,%eax + .endif + pushl %eax #Xptr + .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL) + movl $\R,%eax + .else #compute round number from edx, R + leal 1+(((\R)-1) && 3)(,%edx,4),%eax + .endif + pushl %eax #round number + pushl ctxPtr(%ebx) #ctx_hdr_ptr + movl $\BLK_BITS,%eax + pushl %eax #bits + call _Skein_Show_Round + addl $4*4,%esp #discard parameter space on stack + popal #restore regs + .if \saveRegs + call _Get_XMM_\BLK_BITS #save internal vars for debug dump + .endif +.endm #Skein_Debug_Round +.endif #ifdef SKEIN_DEBUG +# +#---------------------------------------------------------------- +# useful macros +.macro _ldX xn + movq X_stk+8*(\xn)(%esp),%xmm\xn +.endm + +.macro _stX xn + movq %xmm\xn,X_stk+8*(\xn)(%esp) +.endm +# +#---------------------------------------------------------------- +# +.macro C_label lName + \lName: #use both "genders" to work across linkage conventions +_\lName: + .global \lName + .global _\lName +.endm +# + +.if _USE_ASM_ & 256 +# +# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# +# +################# +# +# Skein-256 round macros +# +.macro R_256_OneRound _RR_,x0,x1,x2,x3,t0,t1 + .irp _qq_,%((\_RR_) && 7) #figure out which rotation constants to use + .if \x0 == 0 +_RC0_ = RC_256_\_qq_&&_0 +_RC1_ = RC_256_\_qq_&&_1 + .else +_RC0_ = RC_256_\_qq_&&_1 +_RC1_ = RC_256_\_qq_&&_0 + .endif + .endr +# + paddq %xmm\x1,%xmm\x0 + movq %xmm\x1,%xmm\t0 + psllq $ _RC0_,%xmm\x1 + psrlq $64-_RC0_,%xmm\t0 + xorpd %xmm\x0,%xmm\x1 + xorpd %xmm\t0,%xmm\x1 +# + paddq %xmm\x3,%xmm\x2 + movq %xmm\x3,%xmm\t1 + psllq $ _RC1_,%xmm\x3 + psrlq $64-_RC1_,%xmm\t1 + xorpd %xmm\x2,%xmm\x3 + xorpd %xmm\t1,%xmm\x3 + .if _SKEIN_DEBUG + 
Skein_Debug_Round 256,%(\_RR_+1),SAVE_REGS + .endif +.endm #R_256_OneRound +# +.macro R_256_FourRounds _RN_ + R_256_OneRound %(_RN_+0),0,1,2,3,4,5 + R_256_OneRound (_RN_+1),2,1,0,3,4,5 + + R_256_OneRound (_RN_+2),0,1,2,3,4,5 + R_256_OneRound (_RN_+3),2,1,0,3,4,5 + + #inject key schedule + incl %edx #bump round number + movd %edx,%xmm4 + .if _UNROLL_CNT == (ROUNDS_256/8) + #fully unrolled version +_RK_ = ((_RN_)/4) #key injection counter + paddq ksKey+16*((_RK_+1) % 5)-F_O(%ebp),%xmm0 + paddq ksKey+16*((_RK_+2) % 5)-F_O(%ebp),%xmm1 + paddq ksKey+16*((_RK_+3) % 5)-F_O(%ebp),%xmm2 + paddq ksKey+16*((_RK_+4) % 5)-F_O(%ebp),%xmm3 + paddq ksTwk+16*((_RK_+1) % 3)-F_O(%ebp),%xmm1 + paddq ksTwk+16*((_RK_+2) % 3)-F_O(%ebp),%xmm2 + paddq %xmm4,%xmm3 + .else #looping version + paddq ksKey+16*1-F_O(%esi),%xmm0 + paddq ksKey+16*2-F_O(%esi),%xmm1 + paddq ksKey+16*3-F_O(%esi),%xmm2 + paddq ksKey+16*4-F_O(%esi),%xmm3 + paddq ksTwk+16*1-F_O(%esi),%xmm1 + paddq ksTwk+16*2-F_O(%esi),%xmm2 + paddq %xmm4,%xmm3 +# + movq ksKey-F_O(%esi),%xmm4 #first, "rotate" key schedule on the stack + movq ksTwk-F_O(%esi),%xmm5 # (for next time through) + movq %xmm4,ksKey+16*(WCNT+1)-F_O(%esi) + movq %xmm5,ksTwk+16*3-F_O(%esi) + addl $16,%esi #bump rolling pointer + .endif + .if _SKEIN_DEBUG + Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT,SAVE_REGS + .endif +.endm #R256_FourRounds +# +.if _SKEIN_DEBUG # macros for saving/restoring X_stk for debug routines +_Put_XMM_256: + .irp _NN_,0,1,2,3 + movq %xmm\_NN_,X_stk+4+\_NN_*8(%esp) + .endr + ret +# +_Get_XMM_256: + .irp _NN_,0,1,2,3 + movq X_stk+4+_NN_*8(%esp),%xmm\_NN_ + .endr + ret +.endif +# +################# +# +# code +# +C_label Skein_256_Process_Block + WCNT = 4 #WCNT=4 for Skein-256 + Setup_Stack WCNT,ROUNDS_256 + # main hash loop for Skein_256 +Skein_256_block_loop: + movd bitAdd (%ebx),%xmm4 + movq TWEAK+0(%edi),%xmm5 + movq TWEAK+8(%edi),%xmm6 + paddq %xmm4 ,%xmm5 #bump T0 by the bitAdd parameter + movq %xmm5,TWEAK(%edi) #save updated tweak value 
T0 (for next time) + movapd %xmm6,%xmm7 + xorpd %xmm5,%xmm7 #compute overall tweak parity + movdqa %xmm5,ksTwk -F_O(%ebp)#save the expanded tweak schedule on the stack + movdqa %xmm6,ksTwk+16-F_O(%ebp) + movdqa %xmm7,ksTwk+32-F_O(%ebp) + + movl blkPtr(%ebx),%esi #esi --> input block + movl $KW_PARITY_LO,%eax #init key schedule parity accumulator + movl $KW_PARITY_HI,%edx + movd %eax ,%xmm4 + movd %edx ,%xmm0 + unpcklps %xmm0,%xmm4 #replicate parity dword to 64 bits +# + .irp _NN_,0,1,2,3 #copy in the chaining vars + movq X_VARS+8*\_NN_(%edi),%xmm\_NN_ + xorpd %xmm\_NN_,%xmm4 #update overall parity + movdqa %xmm\_NN_,ksKey+16*_NN_-F_O(%ebp) + .endr + movdqa %xmm4,ksKey+16*WCNT-F_O(%ebp)#save overall parity at the end of the array +# + paddq %xmm5,%xmm1 #inject the initial tweak words + paddq %xmm6,%xmm2 +# + .irp _NN_,0,1,2,3 #perform the initial key injection + movq 8*\_NN_(%esi),%xmm4#and save a copy of the input block on stack + movq %xmm4,8*\_NN_+Wcopy(%esp) + paddq %xmm4,%xmm\_NN_ #inject the key word + .endr +# +.if _SKEIN_DEBUG #debug dump of state at this point + Skein_Debug_Block 256 + Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL,SAVE_REGS +.endif + addl $WCNT*8,%esi #skip to the next block + movl %esi,blkPtr(%ebx) #save the updated block pointer + # + # now the key schedule is computed. 
Start the rounds + # + xorl %edx,%edx #edx = iteration count +.if SKEIN_ASM_UNROLL & 256 +_UNROLL_CNT = ROUNDS_256/8 #fully unrolled +.else +_UNROLL_CNT = SKEIN_UNROLL_256 #partial unroll count + .if ((ROUNDS_256/8) % _UNROLL_CNT) + .error "Invalid SKEIN_UNROLL_256" #sanity check + .endif + movl %ebp,%esi #use this as "rolling" pointer into ksTwk/ksKey +Skein_256_round_loop: # (since there's no 16* scaled address mode) +.endif +# +_Rbase_ = 0 +.rept _UNROLL_CNT*2 # here with X[0..3] in XMM0..XMM3 + R_256_FourRounds _Rbase_ +_Rbase_ = _Rbase_+4 +.endr #rept _UNROLL_CNT*2 +# + .if _UNROLL_CNT <> (ROUNDS_256/8) + cmpl $2*(ROUNDS_256/8),%edx + jb Skein_256_round_loop + .endif + #---------------------------- + # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3} + .irp _NN_,0,1,2,3 + movq Wcopy+8*\_NN_(%esp),%xmm4 + xorpd %xmm4,%xmm\_NN_ + movq %xmm\_NN_,X_VARS+8*\_NN_(%edi) + .endr + andb $FIRST_MASK8,TWEAK +15(%edi) +.if _SKEIN_DEBUG + Skein_Debug_Round 256,SKEIN_RND_FEED_FWD,SAVE_REGS +.endif + # go back for more blocks, if needed + decl %ecx + jnz Skein_256_block_loop + Reset_Stack _Skein_256_Process_Block + ret +# +.ifdef _SKEIN_CODE_SIZE +C_label Skein_256_Process_Block_CodeSize + movl $_Skein_256_Process_Block_CodeSize - _Skein_256_Process_Block,%eax + ret +# +C_label Skein_256_Unroll_Cnt + .if _UNROLL_CNT <> ROUNDS_256/8 + movl $_UNROLL_CNT,%eax + .else + xorl %eax,%eax + .endif + ret +.endif +.endif #_USE_ASM_ & 256 +# +#---------------------------------------------------------------- +# +.if _USE_ASM_ & 512 +# +# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# +# +################# +# MACRO: one round +# +.macro R_512_Round _RR_, a0,a1,Ra, b0,b1,Rb, c0,c1,Rc, d0,d1,Rd + .irp _qq_,%((\_RR_) && 7) +_Ra_ = RC_512_\_qq_&&_\Ra +_Rb_ = RC_512_\_qq_&&_\Rb +_Rc_ = RC_512_\_qq_&&_\Rc +_Rd_ = RC_512_\_qq_&&_\Rd + .endr + paddq %xmm\a1 , %xmm\a0 + _stX c0 + movq %xmm\a1 , %xmm\c0 + psllq $ _Ra_ , %xmm\a1 + psrlq 
$64-_Ra_ , %xmm\c0 + xorpd %xmm\c0 , %xmm\a1 + xorpd %xmm\a0 , %xmm\a1 + + paddq %xmm\b1 , %xmm\b0 + _stX a0 + movq %xmm\b1 , %xmm\a0 + psllq $ _Rb_ , %xmm\b1 + psrlq $64-_Rb_ , %xmm\a0 + xorpd %xmm\b0 , %xmm\b1 + _ldX c0 + xorpd %xmm\a0 , %xmm\b1 + + paddq %xmm\c1 , %xmm\c0 + movq %xmm\c1 , %xmm\a0 + psllq $ _Rc_ , %xmm\c1 + psrlq $64-_Rc_ , %xmm\a0 + xorpd %xmm\c0 , %xmm\c1 + xorpd %xmm\a0 , %xmm\c1 + + paddq %xmm\d1 , %xmm\d0 + movq %xmm\d1 , %xmm\a0 + psllq $ _Rd_ , %xmm\d1 + psrlq $64-_Rd_ , %xmm\a0 + xorpd %xmm\a0 , %xmm\d1 + _ldX a0 + xorpd %xmm\d0 , %xmm\d1 + .if _SKEIN_DEBUG + Skein_Debug_Round 512,%(_RR_+1),SAVE_REGS + .endif +.endm +# +# MACRO: four rounds +.macro R_512_FourRounds _RN_ + R_512_Round %((_RN_) ), 0,1,0, 2,3,1, 4,5,2, 6,7,3 + R_512_Round %((_RN_)+1), 2,1,0, 4,7,1, 6,5,2, 0,3,3 + R_512_Round %((_RN_)+2), 4,1,0, 6,3,1, 0,5,2, 2,7,3 + R_512_Round %((_RN_)+3), 6,1,0, 0,7,1, 2,5,2, 4,3,3 + + #inject key schedule +.irp _NN_,0,1,2,3,4,5,6,7 + .if _UNROLL_CNT == (ROUNDS_512/8) + paddq ksKey+16*((((\_RN_)/4)+(\_NN_)+1)%9)-F_O(%ebp),%xmm\_NN_ + .else + paddq ksKey+16*((\_NN_)+1)-F_O(%esi),%xmm\_NN_ + .endif +.endr + _stX 0 #free up a register + incl %edx #bump round counter + movd %edx,%xmm0 #inject the tweak + .if _UNROLL_CNT == (ROUNDS_512/8) + paddq ksTwk+16*(((_RN_)+1) % 3)-F_O(%ebp),%xmm5 + paddq ksTwk+16*(((_RN_)+2) % 3)-F_O(%ebp),%xmm6 + paddq %xmm0 ,%xmm7 + .else #looping version + paddq ksTwk+16*1-F_O(%esi),%xmm5 + paddq ksTwk+16*2-F_O(%esi),%xmm6 + paddq %xmm0 ,%xmm7 + # "rotate" key schedule on the stack (for next time through) + movq ksKey -F_O(%esi),%xmm0 + movq %xmm0,ksKey+16*(WCNT+1)-F_O(%esi) + movq ksTwk -F_O(%esi),%xmm0 + movq %xmm0,ksTwk+16*3 -F_O(%esi) + addl $16,%esi #bump rolling pointer + .endif + _ldX 0 #restore X0 + .if _SKEIN_DEBUG + Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT,SAVE_REGS + .endif +.endm #R_512_FourRounds +################# +.if _SKEIN_DEBUG # macros for saving/restoring X_stk for debug routines +_Put_XMM_512: 
+ .irp _NN_,0,1,2,3,4,5,6,7 + movq %xmm\_NN_,X_stk+4+\_NN_*8(%esp) + .endr + ret +# +_Get_XMM_512: + .irp _NN_,0,1,2,3,4,5,6,7 + movq X_stk+4+\_NN_*8(%esp),%xmm\_NN_ + .endr + ret +.endif +# +################# +# +C_label Skein_512_Process_Block + WCNT = 8 #WCNT=8 for Skein-512 + Setup_Stack WCNT,ROUNDS_512 + # main hash loop for Skein_512 +Skein_512_block_loop: + movd bitAdd(%ebx) ,%xmm0 + movq TWEAK+0(%edi),%xmm1 + movq TWEAK+8(%edi),%xmm2 + paddq %xmm0,%xmm1 #bump T0 by the bitAdd parameter + movq %xmm1,TWEAK(%edi) #save updated tweak value T0 (for next time) + movq %xmm2,%xmm0 + xorpd %xmm1,%xmm0 #compute overall tweak parity + movdqa %xmm1,ksTwk -F_O(%ebp)#save the expanded tweak schedule on the stack + movdqa %xmm2,ksTwk+16*1-F_O(%ebp) + movdqa %xmm0,ksTwk+16*2-F_O(%ebp) + + movl blkPtr(%ebx),%esi #esi --> input block + movl $KW_PARITY_LO,%eax #init key schedule parity accumulator + movl $KW_PARITY_HI,%edx + movd %eax ,%xmm0 + movd %edx ,%xmm7 + unpcklps %xmm7,%xmm0 #replicate parity dword to 64 bits +# + .irp _NN_,7,6,5,4,3,2,1 #copy in the chaining vars (skip #0 for now) + movq X_VARS+8*\_NN_(%edi),%xmm\_NN_ + xorpd %xmm\_NN_,%xmm0 #update overall parity + movdqa %xmm\_NN_,ksKey+16*\_NN_-F_O(%ebp) + .if \_NN_ == 5 + paddq %xmm1,%xmm5 #inject the initial tweak words + paddq %xmm2,%xmm6 # (before they get trashed in %xmm1/2) + .endif + .endr + movq X_VARS(%edi),%xmm4 #handle #0 now + xorpd %xmm4,%xmm0 #update overall parity + movdqa %xmm4,ksKey+16* 0 -F_O(%ebp) #save the key value in slot #0 + movdqa %xmm0,ksKey+16*WCNT-F_O(%ebp) #save overall parity at the end of the array +# + movq %xmm4,%xmm0 + .irp _NN_,7,6,5, 4,3,2,1,0 #perform the initial key injection (except #4) + movq 8*\_NN_(%esi),%xmm4 #and save a copy of the input block on stack + movq %xmm4,8*\_NN_+Wcopy(%esp) + paddq %xmm4,%xmm\_NN_ + .endr + movq 8*4(%esi),%xmm4 #get input block word #4 + movq %xmm4,8*4+Wcopy(%esp) + paddq ksKey+16*4-F_O(%ebp),%xmm4#inject the initial key +# +.if _SKEIN_DEBUG 
#debug dump of state at this point + Skein_Debug_Block 512 + Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL,SAVE_REGS +.endif + addl $WCNT*8,%esi #skip to the next block + movl %esi,blkPtr(%ebx) #save the updated block pointer + # + # now the key schedule is computed. Start the rounds + # + xorl %edx,%edx #edx = round counter +.if SKEIN_ASM_UNROLL & 512 +_UNROLL_CNT = ROUNDS_512/8 +.else +_UNROLL_CNT = SKEIN_UNROLL_512 + .if ((ROUNDS_512/8) % _UNROLL_CNT) + .error "Invalid SKEIN_UNROLL_512" + .endif + movl %ebp,%esi #use this as "rolling" pointer into ksTwk/ksKey +Skein_512_round_loop: # (since there's no 16* scaled address mode) +.endif +_Rbase_ = 0 +.rept _UNROLL_CNT*2 + R_512_FourRounds %_Rbase_ +_Rbase_ = _Rbase_+4 +.endr #rept _UNROLL_CNT +# +.if (SKEIN_ASM_UNROLL & 512) == 0 + cmpl $2*(ROUNDS_512/8),%edx + jb Skein_512_round_loop +.endif + #---------------------------- + # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7} + andb $FIRST_MASK8,TWEAK +15(%edi) +.irp _NN_,0,2,4,6 #do the aligned ones first + xorpd Wcopy+8*\_NN_(%esp),%xmm\_NN_ + movq %xmm\_NN_,X_VARS+8*_NN_(%edi) +.endr +.irp _NN_,1,3,5,7 #now we have some register space available + movq Wcopy+8*\_NN_(%esp),%xmm0 + xorpd %xmm0,%xmm&\_NN_ + movq %xmm&\_NN_,X_VARS+8*\_NN_(%edi) +.endr +.if _SKEIN_DEBUG + Skein_Debug_Round 512,SKEIN_RND_FEED_FWD +.endif + # go back for more blocks, if needed + decl %ecx + jnz Skein_512_block_loop + + Reset_Stack _Skein_512_Process_Block + ret +# +.ifdef _SKEIN_CODE_SIZE +C_label Skein_512_Process_Block_CodeSize + movl $(_Skein_512_Process_Block_CodeSize - _Skein_512_Process_Block),%eax + ret +# +C_label Skein_512_Unroll_Cnt + .if _UNROLL_CNT <> ROUNDS_512/8 + movl $_UNROLL_CNT,%eax + .else + xorl %eax,%eax + .endif + ret +.endif +# +.endif # _USE_ASM_ & 512 +# +#---------------------------------------------------------------- +# +.if _USE_ASM_ & 1024 + .global _Skein1024_Process_Block +# +# void Skein_1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t 
blkCnt,size_t bitcntAdd)# +# +R_1024_REGS = (5) #keep this many block variables in registers +# +################ +.if _SKEIN_DEBUG # macros for saving/restoring X_stk for debug routines +_Put_XMM_1024: +_NN_ = 0 + .rept R_1024_REGS + .irp _rr_,%(_NN_) + movq %xmm\_rr_,X_stk+4+8*_NN_(%esp) + .endr +_NN_ = _NN_+1 + .endr + ret +# +_Get_XMM_1024: +_NN_ = 0 + .rept R_1024_REGS + .irp _rr_,%(_NN_) + movq X_stk+4+8*_NN_(%esp),%xmm\_rr_ + .endr +_NN_ = _NN_+1 + .endr + ret +.endif +# +################# +# MACRO: one mix step +.macro MixStep_1024 x0,x1,rotIdx0,rotIdx1,_debug_=0 +_r0_ = \x0 #default, if already loaded +_r1_ = \x1 + # load the regs (if necessary) + .if (\x0 >= R_1024_REGS) +_r0_ = 5 + movq X_stk+8*(\x0)(%esp),%xmm5 + .endif + .if (\x1 >= R_1024_REGS) +_r1_ = 6 + movq X_stk+8*(\x1)(%esp),%xmm6 + .endif + # do the mix + .irp _rx_,%((rotIdx0) && 7) +_Rc_ = RC_1024_\_rx_&&_\rotIdx1 #rotation constant + .endr + .irp _x0_,%_r0_ + .irp _x1_,%_r1_ + paddq %xmm\_x1_,%xmm\_x0_ + movq %xmm\_x1_,%xmm7 + psllq $ _Rc_ ,%xmm\_x1_ + psrlq $64-_Rc_ ,%xmm7 + xorpd %xmm\_x0_,%xmm\_x1_ + xorpd %xmm7 ,%xmm\_x1_ + .endr + .endr + # save the regs (if necessary) + .if (\x0 >= R_1024_REGS) + movq %xmm5,X_stk+8*(\x0)(%esp) + .endif + .if (\x1 >= R_1024_REGS) + movq %xmm6,X_stk+8*(\x1)(%esp) + .endif + # debug output + .if _SKEIN_DEBUG && (\_debug_) + Skein_Debug_Round 1024,%((\RotIdx0)+1),SAVE_REGS + .endif +.endm +################# +# MACRO: four rounds +# +.macro R_1024_FourRounds _RR_ + #--------- round _RR_ + MixStep_1024 0, 1,%((\_RR_)+0),0 + MixStep_1024 2, 3,%((\_RR_)+0),1 + MixStep_1024 4, 5,%((\_RR_)+0),2 + MixStep_1024 6, 7,%((\_RR_)+0),3 + MixStep_1024 8, 9,%((\_RR_)+0),4 + MixStep_1024 10,11,%((\_RR_)+0),5 + MixStep_1024 12,13,%((\_RR_)+0),6 + MixStep_1024 14,15,%((\_RR_)+0),7,1 + #--------- round _RR_+1 + MixStep_1024 0, 9,%((\_RR_)+1),0 + MixStep_1024 2,13,%((\_RR_)+1),1 + MixStep_1024 6,11,%((\_RR_)+1),2 + MixStep_1024 4,15,%((\_RR_)+1),3 + MixStep_1024 10, 
7,%((\_RR_)+1),4 + MixStep_1024 12, 3,%((\_RR_)+1),5 + MixStep_1024 14, 5,%((\_RR_)+1),6 + MixStep_1024 8, 1,%((\_RR_)+1),7,1 + #--------- round _RR_+2 + MixStep_1024 0, 7,%((\_RR_)+2),0 + MixStep_1024 2, 5,%((\_RR_)+2),1 + MixStep_1024 4, 3,%((\_RR_)+2),2 + MixStep_1024 6, 1,%((\_RR_)+2),3 + MixStep_1024 12,15,%((\_RR_)+2),4 + MixStep_1024 14,13,%((\_RR_)+2),5 + MixStep_1024 8,11,%((\_RR_)+2),6 + MixStep_1024 10, 9,%((\_RR_)+2),7,1 + #--------- round _RR_+3 + MixStep_1024 0,15,%((\_RR_)+3),0 + MixStep_1024 2,11,%((\_RR_)+3),1 + MixStep_1024 6,13,%((\_RR_)+3),2 + MixStep_1024 4, 9,%((\_RR_)+3),3 + MixStep_1024 14, 1,%((\_RR_)+3),4 + MixStep_1024 8, 5,%((\_RR_)+3),5 + MixStep_1024 10, 3,%((\_RR_)+3),6 + MixStep_1024 12, 7,%((\_RR_)+3),7,1 + + incl %edx #edx = round number + movd %edx,%xmm7 + + #inject the key +.irp _NN_,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 + .if _UNROLL_CNT <> (ROUNDS_1024/8) + .if \_NN_ < R_1024_REGS + paddq ksKey+16*\_NN_+16-F_O(%esi),%xmm&\_NN_ + .else + movq X_stk+ 8*\_NN_(%esp),%xmm6 + .if \_NN_ == 15 + paddq %xmm7,%xmm6 + .elseif \_NN_ == 14 + paddq ksTwk+16*2-F_O(%esi),%xmm6 + .elseif \_NN_ == 13 + paddq ksTwk+16*1-F_O(%esi),%xmm6 + .endif + paddq ksKey+16*\_NN_+16-F_O(%esi),%xmm6 + movq %xmm6,X_stk+ 8*\_NN_(%esp) + .endif + .else + .if \_NN_ < R_1024_REGS + paddq ksKey+16*(((_Rbase_/4)+(\_NN_)+1) % 17)-F_O(%ebp),%xmm&\_NN_ + .else + movq X_stk+ 8*\_NN_(%esp), %xmm6 + paddq ksKey+16*(((_Rbase_/4)+(\_NN_)+1) % 17)-F_O(%ebp),%xmm6 + .if \_NN_ == 15 + paddq %xmm7,%xmm6 + .elseif \_NN_ == 14 + paddq ksTwk+16*(((_Rbase_/4)+2) % 3)-F_O(%ebp),%xmm6 + .elseif \_NN_ == 13 + paddq ksTwk+16*(((_Rbase_/4)+1) % 3)-F_O(%ebp),%xmm6 + .endif + movq %xmm6,X_stk+ 8*\_NN_(%esp) + .endif + .endif +.endr + .if _UNROLL_CNT <> (ROUNDS_1024/8) #rotate the key schedule on the stack + movq ksKey-F_O(%esi), %xmm6 + movq ksTwk-F_O(%esi), %xmm7 + movq %xmm6,ksKey+16*(WCNT+1)-F_O(%esi) + movq %xmm7,ksTwk+16* 3 -F_O(%esi) + addl $16,%esi #bump rolling pointer + .endif + 
.if _SKEIN_DEBUG + Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT ,SAVE_REGS + .endif +.endm #R_1024_FourRounds +# +################ +# +C_label Skein1024_Process_Block +# + WCNT = 16 #WCNT=16 for Skein-1024 + Setup_Stack WCNT,ROUNDS_1024 + addl $0x80,%edi #bias the edi ctxt offsets to keep them all short + # main hash loop for Skein1024 +Skein1024_block_loop: + movd bitAdd(%ebx) ,%xmm0 + movq TWEAK+0-0x80(%edi),%xmm1 + movq TWEAK+8-0x80(%edi),%xmm2 + paddq %xmm0,%xmm1 #bump T0 by the bitAdd parameter + movq %xmm1,TWEAK-0x80(%edi) #save updated tweak value T0 (for next time) + movq %xmm2,%xmm0 + xorpd %xmm1,%xmm0 #compute overall tweak parity + movdqa %xmm1,ksTwk -F_O(%ebp)#save the expanded tweak schedule on the stack + movdqa %xmm2,ksTwk+16-F_O(%ebp) + movdqa %xmm0,ksTwk+32-F_O(%ebp) + + movl blkPtr(%ebx),%esi #esi --> input block + movl $KW_PARITY_LO,%eax #init key schedule parity accumulator + movl $KW_PARITY_HI,%edx + movd %eax ,%xmm7 + movd %edx ,%xmm6 + unpcklps %xmm6,%xmm7 #replicate parity dword to 64 bits +# + leal 0x80(%esp),%eax #use short offsets for Wcopy, X_stk writes below +.irp _NN_,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 + movq X_VARS+8*\_NN_-0x80(%edi),%xmm6 + xorpd %xmm6,%xmm7 #update overall parity + movdqa %xmm6,ksKey+16*\_NN_-F_O(%ebp) #save the key schedule on the stack + .if \_NN_ < R_1024_REGS + _rr_ = \_NN_ + .else + _rr_ = R_1024_REGS + .endif + .irp _rn_,%(_rr_) + movq 8*\_NN_(%esi),%xmm\_rn_ #save copy of the input block on stack + movq %xmm\_rn_,Wcopy+8*\_NN_-0x80(%eax) #(for feedforward later) + paddq %xmm6,%xmm\_rn_ #inject the key into the block + .if \_NN_ == 13 + paddq %xmm1,%xmm\_rn_ #inject the initial tweak words + .elseif \_NN_ == 14 + paddq %xmm2,%xmm\_rn_ + .endif + .if \_NN_ >= R_1024_REGS #only save X[5..15] on stack, leave X[0..4] in regs + movq %xmm\_rn_,X_stk+8*\_NN_-0x80(%eax) + .endif + .endr +.endr + movdqa %xmm7,ksKey+16*WCNT-F_O(%ebp) #save overall key parity at the end of the array +# +.if _SKEIN_DEBUG #debug dump of 
state at this point + Skein_Debug_Block 1024 + Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL,SAVE_REGS +.endif + addl $WCNT*8,%esi #skip to the next block + movl %esi,blkPtr(%ebx) #save the updated block pointer + # + # now the key schedule is computed. Start the rounds + # + xorl %edx,%edx #edx = round counter +.if SKEIN_ASM_UNROLL & 1024 +_UNROLL_CNT = ROUNDS_1024/8 +.else +_UNROLL_CNT = SKEIN_UNROLL_1024 + .if ((ROUNDS_1024/8) % _UNROLL_CNT) + .error "Invalid SKEIN_UNROLL_1024" + .endif + movl %ebp,%esi #use this as "rolling" pointer into ksTwk/ksKey +Skein_1024_round_loop: +.endif +# +_Rbase_ = 0 +.rept _UNROLL_CNT*2 + R_1024_FourRounds %_Rbase_ +_Rbase_ = _Rbase_+4 +.endr #rept _UNROLL_CNT +# +.if (SKEIN_ASM_UNROLL & 1024) == 0 + cmp $2*(ROUNDS_1024/8),%edx + jb Skein_1024_round_loop +.endif + andb $FIRST_MASK8,TWEAK +15-0x80(%edi) #clear tweak bit for next time thru + #---------------------------- + # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15} + leal 0x80(%esp),%eax #allow short offsets to X_stk and Wcopy +.irp _NN_,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .if \_NN_ < R_1024_REGS + .if \_NN_ && 1 #already in regs: no load needed + movq Wcopy+ 8*\_NN_-0x80(%eax),%xmm7 #unaligned + xorpd %xmm7,%xmm\_NN_ + .else + xorpd Wcopy+ 8*\_NN_-0x80(%eax),%xmm\_NN_ #aligned + .endif + movq %xmm\_NN_,X_VARS+8*\_NN_-0x80(%edi) + .else + movq X_stk+8*\_NN_-0x80(%eax),%xmm7 #load X value from stack + .if \_NN_ && 1 + movq Wcopy+8*\_NN_-0x80(%eax),%xmm6 #unaligned + xorpd %xmm6,%xmm7 + .else + xorpd Wcopy+8*\_NN_-0x80(%eax),%xmm7 #aligned + .endif + movq %xmm7,X_VARS+8*\_NN_-0x80(%edi) + .endif +.endr +.if _SKEIN_DEBUG + Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD #no need to save regs on stack here +.endif + # go back for more blocks, if needed + decl %ecx + jnz Skein1024_block_loop + + Reset_Stack _Skein1024_Process_Block + ret +# +.ifdef _SKEIN_CODE_SIZE +C_label Skein1024_Process_Block_CodeSize + movl $(_Skein1024_Process_Block_CodeSize - _Skein1024_Process_Block),%eax + 
ret +# +C_label Skein1024_Unroll_Cnt + .if _UNROLL_CNT <> ROUNDS_1024/8 + movl $_UNROLL_CNT,%eax + .else + xorl %eax,%eax + .endif + ret +.endif +# +.endif # _USE_ASM_ & 1024 +#---------------------------------------------------------------- + .end Index: sys/contrib/skein/brg_endian.h =================================================================== --- /dev/null +++ sys/contrib/skein/brg_endian.h @@ -0,0 +1,148 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue 20/10/2006 +*/ + +#ifndef BRG_ENDIAN_H +#define BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include <sys/endian.h> +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include <machine/endian.h> +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined(AVR) +# include <endian.h> +# if !defined( __BEOS__ ) +# include <byteswap.h> +# endif +# endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && 
__BYTE_ORDER == __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) || defined( AVR ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* 
**** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order +#endif +#endif + +/* special handler for IA64, which may be either endianness (?) */ +/* here we assume little-endian, but this may need to be changed */ +#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) +# define PLATFORM_MUST_ALIGN (1) +#ifndef PLATFORM_BYTE_ORDER +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif +#endif + +#ifndef PLATFORM_MUST_ALIGN +# define PLATFORM_MUST_ALIGN (0) +#endif + +#endif /* ifndef BRG_ENDIAN_H */ Index: sys/contrib/skein/brg_types.h =================================================================== --- /dev/null +++ sys/contrib/skein/brg_types.h @@ -0,0 +1,188 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue 09/09/2006 + + The unsigned integer types defined here are of the form uint_<nn>t where + <nn> is the length of the type; for example, the unsigned 32-bit type is + 'uint_32t'. These are NOT the same as the 'C99 integer types' that are + defined in the inttypes.h and stdint.h headers since attempts to use these + types have shown that support for them is still highly variable. However, + since the latter are of the form uint<nn>_t, a regular expression search + and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t') + can be used to convert the types used here to the C99 standard types. +*/ + +#ifndef BRG_TYPES_H +#define BRG_TYPES_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <limits.h> + +#ifndef BRG_UI8 +# define BRG_UI8 +# if UCHAR_MAX == 255u + typedef unsigned char uint_8t; +# else +# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h +# endif +#endif + +#ifndef BRG_UI16 +# define BRG_UI16 +# if USHRT_MAX == 65535u + typedef unsigned short uint_16t; +# else +# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h +# endif +#endif + +#ifndef BRG_UI32 +# define BRG_UI32 +# if UINT_MAX == 4294967295u +# define li_32(h) 0x##h##u + typedef unsigned int uint_32t; +# elif ULONG_MAX == 4294967295u +# define li_32(h) 0x##h##ul + typedef unsigned long uint_32t; +# elif defined( _CRAY ) +# error This code needs 32-bit data types, which Cray machines do not provide +# else +# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h +# endif +#endif + +#ifndef BRG_UI64 +# if defined( __BORLANDC__ ) && !defined( __MSDOS__ ) +# define BRG_UI64 +# define li_64(h) 0x##h##ui64 + typedef unsigned __int64 uint_64t; +# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */ +# define BRG_UI64 +# define li_64(h) 0x##h##ui64 + typedef unsigned __int64 uint_64t; +# elif defined( __sun ) && 
defined(ULONG_MAX) && ULONG_MAX == 0xfffffffful +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u +# if UINT_MAX == 18446744073709551615u +# define BRG_UI64 +# define li_64(h) 0x##h##u + typedef unsigned int uint_64t; +# endif +# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u +# if ULONG_MAX == 18446744073709551615ul +# define BRG_UI64 +# define li_64(h) 0x##h##ul + typedef unsigned long uint_64t; +# endif +# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u +# if ULLONG_MAX == 18446744073709551615ull +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# endif +# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u +# if ULONG_LONG_MAX == 18446744073709551615ull +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# endif +# elif defined(__GNUC__) /* DLW: avoid mingw problem with -ansi */ +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# endif +#endif + +#if defined( NEED_UINT_64T ) && !defined( BRG_UI64 ) +# error Please define uint_64t as an unsigned 64 bit type in brg_types.h +#endif + +#ifndef RETURN_VALUES +# define RETURN_VALUES +# if defined( DLL_EXPORT ) +# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) +# define VOID_RETURN __declspec( dllexport ) void __stdcall +# define INT_RETURN __declspec( dllexport ) int __stdcall +# elif defined( __GNUC__ ) +# define VOID_RETURN __declspec( __dllexport__ ) void +# define INT_RETURN __declspec( __dllexport__ ) int +# else +# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +# endif +# elif defined( DLL_IMPORT ) +# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) +# define VOID_RETURN __declspec( dllimport ) void __stdcall +# define INT_RETURN __declspec( dllimport ) int __stdcall +# elif defined( __GNUC__ ) +# define VOID_RETURN __declspec( 
__dllimport__ ) void +# define INT_RETURN __declspec( __dllimport__ ) int +# else +# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +# endif +# elif defined( __WATCOMC__ ) +# define VOID_RETURN void __cdecl +# define INT_RETURN int __cdecl +# else +# define VOID_RETURN void +# define INT_RETURN int +# endif +#endif + +/* These defines are used to declare buffers in a way that allows + faster operations on longer variables to be used. In all these + defines 'size' must be a power of 2 and >= 8 + + dec_unit_type(size,x) declares a variable 'x' of length + 'size' bits + + dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize' + bytes defined as an array of variables + each of 'size' bits (bsize must be a + multiple of size / 8) + + ptr_cast(x,size) casts a pointer to a pointer to a + variable of length 'size' bits +*/ + +#define ui_type(size) uint_##size##t +#define dec_unit_type(size,x) typedef ui_type(size) x +#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[bsize / (size >> 3)] +#define ptr_cast(x,size) ((ui_type(size)*)(x)) + +#if defined(__cplusplus) +} +#endif + +#endif Index: sys/contrib/skein/skein.h =================================================================== --- /dev/null +++ sys/contrib/skein/skein.h @@ -0,0 +1,327 @@ +#ifndef _SKEIN_H_ +#define _SKEIN_H_ 1 +/************************************************************************** +** +** Interface declarations and internal definitions for Skein hashing. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +*************************************************************************** +** +** The following compile-time switches may be defined to control some +** tradeoffs between speed, code size, error checking, and security. +** +** The "default" note explains what happens when the switch is not defined. 
+** +** SKEIN_DEBUG -- make callouts from inside Skein code +** to examine/display intermediate values. +** [default: no callouts (no overhead)] +** +** SKEIN_ERR_CHECK -- how error checking is handled inside Skein +** code. If not defined, most error checking +** is disabled (for performance). Otherwise, +** the switch value is interpreted as: +** 0: use assert() to flag errors +** 1: return SKEIN_FAIL to flag errors +** +***************************************************************************/ +#ifdef __cplusplus +extern "C" +{ +#endif + +#include <stddef.h> /* get size_t definition */ +#include "skein_port.h" /* get platform-specific definitions */ + +enum + { + SKEIN_SUCCESS = 0, /* return codes from Skein calls */ + SKEIN_FAIL = 1, + SKEIN_BAD_HASHLEN = 2 + }; + +#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ + +#define SKEIN_256_STATE_WORDS ( 4) +#define SKEIN_512_STATE_WORDS ( 8) +#define SKEIN1024_STATE_WORDS (16) +#define SKEIN_MAX_STATE_WORDS (16) + +#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS) + +#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS) + +#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) + +typedef struct + { + size_t hashBitLen; /* size of hash result, in bits */ + size_t bCnt; /* current byte count in buffer b[] */ + u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */ + } Skein_Ctxt_Hdr_t; + +typedef struct /* 256-bit Skein hash context structure */ + { + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN_256_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN_256_BLOCK_BYTES]; /* partial block buffer (8-byte 
aligned) */ + } Skein_256_Ctxt_t; + +typedef struct /* 512-bit Skein hash context structure */ + { + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + } Skein_512_Ctxt_t; + +typedef struct /* 1024-bit Skein hash context structure */ + { + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN1024_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN1024_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + } Skein1024_Ctxt_t; + +/* Skein APIs for (incremental) "straight hashing" */ +int Skein_256_Init (Skein_256_Ctxt_t *ctx, size_t hashBitLen); +int Skein_512_Init (Skein_512_Ctxt_t *ctx, size_t hashBitLen); +int Skein1024_Init (Skein1024_Ctxt_t *ctx, size_t hashBitLen); + +int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); +int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); +int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); + +int Skein_256_Final (Skein_256_Ctxt_t *ctx, u08b_t * hashVal); +int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); +int Skein1024_Final (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); + +/* +** Skein APIs for "extended" initialization: MAC keys, tree hashing. +** After an InitExt() call, just use Update/Final calls as with Init(). +** +** Notes: Same parameters as _Init() calls, plus treeInfo/key/keyBytes. +** When keyBytes == 0 and treeInfo == SKEIN_SEQUENTIAL, +** the results of InitExt() are identical to calling Init(). +** The function Init() may be called once to "precompute" the IV for +** a given hashBitLen value, then by saving a copy of the context +** the IV computation may be avoided in later calls. 
+** Similarly, the function InitExt() may be called once per MAC key +** to precompute the MAC IV, then a copy of the context saved and +** reused for each new MAC computation. +**/ +int Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes); +int Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes); +int Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes); + +/* +** Skein APIs for MAC and tree hash: +** Final_Pad: pad, do final block, but no OUTPUT type +** Output: do just the output stage +*/ +int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t * hashVal); +int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t * hashVal); +int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t * hashVal); + +#ifndef SKEIN_TREE_HASH +#define SKEIN_TREE_HASH (1) +#endif +#if SKEIN_TREE_HASH +int Skein_256_Output (Skein_256_Ctxt_t *ctx, u08b_t * hashVal); +int Skein_512_Output (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); +int Skein1024_Output (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); +#endif + +/***************************************************************** +** "Internal" Skein definitions +** -- not needed for sequential hashing API, but will be +** helpful for other uses of Skein (e.g., tree hash mode). +** -- included here so that they can be shared between +** reference and optimized code. 
+******************************************************************/ + +/* tweak word T[1]: bit field starting positions */ +#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ + +#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) /* bits 112..118: level in hash tree */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ + +/* tweak word T[1]: flag bit definition(s) */ +#define SKEIN_T1_FLAG_FIRST (((u64b_t) 1 ) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_FINAL (((u64b_t) 1 ) << SKEIN_T1_POS_FINAL) +#define SKEIN_T1_FLAG_BIT_PAD (((u64b_t) 1 ) << SKEIN_T1_POS_BIT_PAD) + +/* tweak word T[1]: tree level bit field mask */ +#define SKEIN_T1_TREE_LVL_MASK (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL) +#define SKEIN_T1_TREE_LEVEL(n) (((u64b_t) (n)) << SKEIN_T1_POS_TREE_LVL) + +/* tweak word T[1]: block type field */ +#define SKEIN_BLK_TYPE_KEY ( 0) /* key, for MAC and KDF */ +#define SKEIN_BLK_TYPE_CFG ( 4) /* configuration block */ +#define SKEIN_BLK_TYPE_PERS ( 8) /* personalization string */ +#define SKEIN_BLK_TYPE_PK (12) /* public key (for digital signature hashing) */ +#define SKEIN_BLK_TYPE_KDF (16) /* key identifier for KDF */ +#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */ +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ +#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */ + +#define SKEIN_T1_BLK_TYPE(T) (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) +#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY) /* key, for MAC and KDF */ +#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG) /* configuration block */ +#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS) /* personalization 
string */ +#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK) /* public key (for digital signature hashing) */ +#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF) /* key identifier for KDF */ +#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */ +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ +#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */ + +#define SKEIN_T1_BLK_TYPE_CFG_FINAL (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL) +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define SKEIN_VERSION (1) + +#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */ +#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian)*/ +#endif + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((u64b_t) (hi32)) << 32)) +#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) + +#define SKEIN_CFG_STR_LEN (4*8) + +/* bit field definitions in config block treeInfo word */ +#define SKEIN_CFG_TREE_LEAF_SIZE_POS ( 0) +#define SKEIN_CFG_TREE_NODE_SIZE_POS ( 8) +#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16) + +#define SKEIN_CFG_TREE_LEAF_SIZE_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS) +#define SKEIN_CFG_TREE_NODE_SIZE_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS) +#define SKEIN_CFG_TREE_MAX_LEVEL_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS) + +#define SKEIN_CFG_TREE_INFO(leaf,node,maxLvl) \ + ( (((u64b_t)(leaf )) << SKEIN_CFG_TREE_LEAF_SIZE_POS) | \ + (((u64b_t)(node )) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \ + (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) ) + +#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0,0,0) /* use as treeInfo in InitExt() call for sequential processing */ + +/* +** Skein macros for getting/setting tweak words, etc. 
+** These are useful for partial input bytes, hash tree init/update, etc. +**/ +#define Skein_Get_Tweak(ctxPtr,TWK_NUM) ((ctxPtr)->h.T[TWK_NUM]) +#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} + +#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr,0) +#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr,1) +#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) +#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) + +/* set both tweak words at once */ +#define Skein_Set_T0_T1(ctxPtr,T0,T1) \ + { \ + Skein_Set_T0(ctxPtr,(T0)); \ + Skein_Set_T1(ctxPtr,(T1)); \ + } + +#define Skein_Set_Type(ctxPtr,BLK_TYPE) \ + Skein_Set_T1(ctxPtr,SKEIN_T1_BLK_TYPE_##BLK_TYPE) + +/* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */ +#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ + { Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } + +#define Skein_Clear_First_Flag(hdr) { (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; } +#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; } + +#define Skein_Set_Tree_Level(hdr,height) { (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);} + +/***************************************************************** +** "Internal" Skein definitions for debugging and error checking +******************************************************************/ +#ifdef SKEIN_DEBUG /* examine/display intermediate values? */ +#include "skein_debug.h" +#else /* default is no callouts */ +#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr) +#define Skein_Show_Round(bits,ctx,r,X) +#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr) +#define Skein_Show_Final(bits,ctx,cnt,outPtr) +#define Skein_Show_Key(bits,ctx,key,keyBytes) +#endif + +#ifndef SKEIN_ERR_CHECK /* run-time checks (e.g., bad params, uninitialized context)? 
*/ +#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */ +#define Skein_assert(x) +#elif defined(SKEIN_ASSERT) +#include <assert.h> +#define Skein_Assert(x,retCode) assert(x) +#define Skein_assert(x) assert(x) +#else +#include <assert.h> +#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /* caller error */ +#define Skein_assert(x) assert(x) /* internal error */ +#endif + +/***************************************************************** +** Skein block function constants (shared across Ref and Opt code) +******************************************************************/ +enum + { + /* Skein_256 round rotation constants */ + R_256_0_0=14, R_256_0_1=16, + R_256_1_0=52, R_256_1_1=57, + R_256_2_0=23, R_256_2_1=40, + R_256_3_0= 5, R_256_3_1=37, + R_256_4_0=25, R_256_4_1=33, + R_256_5_0=46, R_256_5_1=12, + R_256_6_0=58, R_256_6_1=22, + R_256_7_0=32, R_256_7_1=32, + + /* Skein_512 round rotation constants */ + R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, + R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, + R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, + R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, + R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, + R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, + R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, + R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22, + + /* Skein1024 round rotation constants */ + R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37, + R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52, + R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17, + R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25, + R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, 
R1024_4_6=44, R1024_4_7=30, + R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41, + R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25, + R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20 + }; + +#ifndef SKEIN_ROUNDS +#define SKEIN_256_ROUNDS_TOTAL (72) /* number of rounds for the different block sizes */ +#define SKEIN_512_ROUNDS_TOTAL (72) +#define SKEIN1024_ROUNDS_TOTAL (80) +#else /* allow command-line define in range 8*(5..14) */ +#define SKEIN_256_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/100) + 5) % 10) + 5)) +#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5)) +#define SKEIN1024_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS ) + 5) % 10) + 5)) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _SKEIN_H_ */ Index: sys/contrib/skein/skein.c =================================================================== --- /dev/null +++ sys/contrib/skein/skein.c @@ -0,0 +1,753 @@ +/*********************************************************************** +** +** Implementation of the Skein hash function. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +************************************************************************/ + +#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */ + +#include <string.h> /* get the memcpy/memset functions */ +#include "skein.h" /* get the Skein API definitions */ +#include "skein_iv.h" /* get precomputed IVs */ + +/*****************************************************************/ +/* External function to process blkCnt (nonzero) full block(s) of data. 
*/ +void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd); +void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd); +void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd); + +/*****************************************************************/ +/* 256-bit Skein */ +/*****************************************************************/ + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a straight hashing operation */ +int Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen) + { + union + { + u08b_t b[SKEIN_256_STATE_BYTES]; + u64b_t w[SKEIN_256_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) + { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 256: memcpy(ctx->X,SKEIN_256_IV_256,sizeof(ctx->X)); break; + case 224: memcpy(ctx->X,SKEIN_256_IV_224,sizeof(ctx->X)); break; + case 160: memcpy(ctx->X,SKEIN_256_IV_160,sizeof(ctx->X)); break; + case 128: memcpy(ctx->X,SKEIN_256_IV_128,sizeof(ctx->X)); break; +#endif + default: + /* here if there is no precomputed IV value available */ + /* build/process the config block, type == CONFIG (could be precomputed) */ + Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ + + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */ + + /* compute the initial chaining values from config block */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */ + Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + break; 
+ } + /* The chaining vars ctx->X are now initialized for the given hashBitLen. */ + /* Set up to process the data message portion of the hash (default) */ + Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a MAC and/or tree hash operation */ +/* [identical to Skein_256_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */ +int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes) + { + union + { + u08b_t b[SKEIN_256_STATE_BYTES]; + u64b_t w[SKEIN_256_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) /* is there a key? */ + { + memset(ctx->X,0,sizeof(ctx->X)); /* no key: use all zeroes as key for config block */ + } + else /* here to pre-process a key */ + { + Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X)); + /* do a mini-Init right here */ + ctx->h.hashBitLen=8*sizeof(ctx->X); /* set output hash bit count = state size */ + Skein_Start_New_Type(ctx,KEY); /* set tweaks: T0 = 0; T1 = KEY type */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the initial chaining variables */ + Skein_256_Update(ctx,key,keyBytes); /* hash the key */ + Skein_256_Final_Pad(ctx,cfg.b); /* put result into cfg.b[] */ + memcpy(ctx->X,cfg.b,sizeof(cfg.b)); /* copy over into ctx->X[] */ +#if SKEIN_NEED_SWAP + { + uint_t i; + for (i=0;i<SKEIN_256_STATE_WORDS;i++) /* convert key bytes to context words */ + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* build/process the config block, type == CONFIG (could be precomputed for each key) */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx,CFG_FINAL); + + memset(&cfg.w,0,sizeof(cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + cfg.w[1] = 
Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(treeInfo); /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + + Skein_Show_Key(256,&ctx->h,key,keyBytes); + + /* compute the initial chaining values from config block */ + Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx,MSG); + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* process the input bytes */ +int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) + { + size_t n; + + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* process full blocks, if any */ + if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) + { + if (ctx->h.bCnt) /* finish up any buffered message data */ + { + n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if (n) + { + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES); + Skein_256_Process_Block(ctx,ctx->b,1,SKEIN_256_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* now process any remaining full blocks, directly from input message data */ + if (msgByteCnt > SKEIN_256_BLOCK_BYTES) + { + n = (msgByteCnt-1) / SKEIN_256_BLOCK_BYTES; /* number of full blocks to process */ + Skein_256_Process_Block(ctx,msg,n,SKEIN_256_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_256_BLOCK_BYTES; + msg += n * SKEIN_256_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) + { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES); + 
memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the result */ +int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN_256_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); + + Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN_256_BLOCK_BYTES < byteCnt;i++) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN_256_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN_256_BLOCK_BYTES) + n = SKEIN_256_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN_256_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t Skein_256_API_CodeSize(void) + { + return ((u08b_t *) Skein_256_API_CodeSize) - + ((u08b_t *) Skein_256_Init); + } +#endif + 
+/*****************************************************************/
+/*     512-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation (no key, sequential tree info): loads a precomputed IV when one exists for hashBitLen, else derives it from a config block; returns SKEIN_SUCCESS */
+int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
+{
+    union
+    {
+        u08b_t  b[SKEIN_512_STATE_BYTES];
+        u64b_t  w[SKEIN_512_STATE_WORDS];
+    } cfg;                                  /* config block */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
+
+    switch (hashBitLen)
+    {                                       /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+    case  512: memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X));  break;
+    case  384: memcpy(ctx->X,SKEIN_512_IV_384,sizeof(ctx->X));  break;
+    case  256: memcpy(ctx->X,SKEIN_512_IV_256,sizeof(ctx->X));  break;
+    case  224: memcpy(ctx->X,SKEIN_512_IV_224,sizeof(ctx->X));  break;
+#endif
+    default:
+        /* here if there is no precomputed IV value available */
+        /* build/process the config block, type == CONFIG (could be precomputed) */
+        Skein_Start_New_Type(ctx,CFG_FINAL);            /* set tweaks: T0=0; T1=CFG | FINAL */
+
+        cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);      /* set the schema, version */
+        cfg.w[1] = Skein_Swap64(hashBitLen);            /* hash result length in bits */
+        cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+        memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */
+
+        /* compute the initial chaining values from config block */
+        memset(ctx->X,0,sizeof(ctx->X));                /* zero the chaining variables */
+        Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+        break;
+    }
+
+    /* The chaining vars ctx->X are now initialized for the given hashBitLen. */
+    /* Set up to process the data message portion of the hash (default) */
+    Skein_Start_New_Type(ctx,MSG);          /* T0=0, T1= MSG type */
+
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation: an optional key (key/keyBytes) is hashed into the starting chaining values, treeInfo goes into config word 2; returns SKEIN_SUCCESS */
+/* [identical to Skein_512_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+{
+    union
+    {
+        u08b_t  b[SKEIN_512_STATE_BYTES];
+        u64b_t  w[SKEIN_512_STATE_WORDS];
+    } cfg;                                  /* config block */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+    {
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+    }
+    else                                        /* here to pre-process a key */
+    {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));
+        /* do a mini-Init right here */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein_512_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein_512_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        memcpy(ctx->X,cfg.b,sizeof(ctx->X));    /* copy over into ctx->X[]; sized by the destination so a larger cfg.b can never overrun it */
+#if SKEIN_NEED_SWAP
+        {
+            uint_t i;
+            for (i=0;i<SKEIN_512_STATE_WORDS;i++)   /* convert key bytes to context words */
+                ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+#endif
+    }
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(512,&ctx->h,key,keyBytes);
+
+    /* compute the initial chaining values from config block */
+    Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash (default) */
+    ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
+    Skein_Start_New_Type(ctx,MSG);
+
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* process the input bytes: whole blocks are compressed as soon as more data follows them; the last (partial or exactly full) block always stays buffered in ctx->b[] for Final */
+int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+{
+    size_t n;
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* process full blocks, if any */
+    if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES)
+    {
+        if (ctx->h.bCnt)                              /* finish up any buffered message data */
+        {
+            n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
+            if (n)
+            {
+                Skein_assert(n < msgByteCnt);         /* check on our logic here */
+                memcpy(&ctx->b[ctx->h.bCnt],msg,n);
+                msgByteCnt  -= n;
+                msg         += n;
+                ctx->h.bCnt += n;
+            }
+            Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
+            Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+            ctx->h.bCnt = 0;
+        }
+        /* now process any remaining full blocks, directly from input message data */
+        if (msgByteCnt > SKEIN_512_BLOCK_BYTES)
+        {
+            n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;   /* number of full blocks to process; the -1 leaves at least one byte behind for the final block */
+            Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
+            msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
+            msg        += n * SKEIN_512_BLOCK_BYTES;
+        }
+        Skein_assert(ctx->h.bCnt == 0);
+    }
+
+    /* copy any remaining source message data bytes into b[] */
+    if (msgByteCnt)
+    {
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
+        memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+    }
+
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result: processes the buffered last block, then produces (hashBitLen+7)/8 output bytes into hashVal */
+int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    size_t i,n,byteCnt;
+    u64b_t X[SKEIN_512_STATE_WORDS];
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
+    if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)            /* zero pad b[] if necessary */
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+
+    Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
+
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)
+    {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN_512_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN_512_BLOCK_BYTES)
+            n  = SKEIN_512_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES);
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+    }
+    return SKEIN_SUCCESS;
+}
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) /* size probe: byte distance between this function and Skein_512_Init */
+size_t Skein_512_API_CodeSize(void)
+{
+    return ((u08b_t *) Skein_512_API_CodeSize) -
+           ((u08b_t *) Skein_512_Init);
+}
+#endif
+
+/*****************************************************************/
+/*    1024-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation (no key, sequential tree info): loads a precomputed IV when one exists for hashBitLen, else derives it from a config block; returns SKEIN_SUCCESS */
+int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen)
+{
+    union
+    {
+        u08b_t  b[SKEIN1024_STATE_BYTES];
+        u64b_t  w[SKEIN1024_STATE_WORDS];
+    } cfg;                                  /* config block */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    ctx->h.hashBitLen = hashBitLen;         /* output hash bit count */
+
+    switch (hashBitLen)
+    {                                       /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+    case  512: memcpy(ctx->X,SKEIN1024_IV_512 ,sizeof(ctx->X)); break;
+    case  384: memcpy(ctx->X,SKEIN1024_IV_384 ,sizeof(ctx->X)); break;
+    case 1024: memcpy(ctx->X,SKEIN1024_IV_1024,sizeof(ctx->X)); break;
+#endif
+    default:
+        /* here if there is no precomputed IV value available */
+        /* build/process the config block, type == CONFIG (could be precomputed) */
+        Skein_Start_New_Type(ctx,CFG_FINAL);            /* set tweaks: T0=0; T1=CFG | FINAL */
+
+        cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);      /* set the schema, version */
+        cfg.w[1] = Skein_Swap64(hashBitLen);            /* hash result length in bits */
+        cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+        memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */
+
+        /* compute the initial chaining values from config block */
+        memset(ctx->X,0,sizeof(ctx->X));                /* zero the chaining variables */
+        Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+        break;
+    }
+
+    /* The chaining vars ctx->X are now initialized for the given hashBitLen. */
+    /* Set up to process the data message portion of the hash (default) */
+    Skein_Start_New_Type(ctx,MSG);          /* T0=0, T1= MSG type */
+
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a MAC and/or tree hash operation: an optional key (key/keyBytes) is hashed into the starting chaining values, treeInfo goes into config word 2; returns SKEIN_SUCCESS */
+/* [identical to Skein1024_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */
+int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes)
+{
+    union
+    {
+        u08b_t  b[SKEIN1024_STATE_BYTES];
+        u64b_t  w[SKEIN1024_STATE_WORDS];
+    } cfg;                                  /* config block */
+
+    Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN);
+    Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL);
+
+    /* compute the initial chaining values ctx->X[], based on key */
+    if (keyBytes == 0)                          /* is there a key? */
+    {
+        memset(ctx->X,0,sizeof(ctx->X));        /* no key: use all zeroes as key for config block */
+    }
+    else                                        /* here to pre-process a key */
+    {
+        Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X));
+        /* do a mini-Init right here */
+        ctx->h.hashBitLen=8*sizeof(ctx->X);     /* set output hash bit count = state size */
+        Skein_Start_New_Type(ctx,KEY);          /* set tweaks: T0 = 0; T1 = KEY type */
+        memset(ctx->X,0,sizeof(ctx->X));        /* zero the initial chaining variables */
+        Skein1024_Update(ctx,key,keyBytes);     /* hash the key */
+        Skein1024_Final_Pad(ctx,cfg.b);         /* put result into cfg.b[] */
+        memcpy(ctx->X,cfg.b,sizeof(ctx->X));    /* copy over into ctx->X[]; sized by the destination so a larger cfg.b can never overrun it */
+#if SKEIN_NEED_SWAP
+        {
+            uint_t i;
+            for (i=0;i<SKEIN1024_STATE_WORDS;i++)   /* convert key bytes to context words */
+                ctx->X[i] = Skein_Swap64(ctx->X[i]);
+        }
+#endif
+    }
+    /* build/process the config block, type == CONFIG (could be precomputed for each key) */
+    ctx->h.hashBitLen = hashBitLen;             /* output hash bit count */
+    Skein_Start_New_Type(ctx,CFG_FINAL);
+
+    memset(&cfg.w,0,sizeof(cfg.w));             /* pre-pad cfg.w[] with zeroes */
+    cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+    cfg.w[1] = Skein_Swap64(hashBitLen);        /* hash result length in bits */
+    cfg.w[2] = Skein_Swap64(treeInfo);          /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+
+    Skein_Show_Key(1024,&ctx->h,key,keyBytes);
+
+    /* compute the initial chaining values from config block */
+    Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN);
+
+    /* The chaining vars ctx->X are now initialized */
+    /* Set up to process the data message portion of the hash (default) */
+    ctx->h.bCnt = 0;                            /* buffer b[] starts out empty */
+    Skein_Start_New_Type(ctx,MSG);
+
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* process the input bytes: whole blocks are compressed as soon as more data follows them; the last (partial or exactly full) block always stays buffered in ctx->b[] for Final */
+int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt)
+{
+    size_t n;
+
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* process full blocks, if any */
+    if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES)
+    {
+        if (ctx->h.bCnt)                              /* finish up any buffered message data */
+        {
+            n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
+            if (n)
+            {
+                Skein_assert(n < msgByteCnt);         /* check on our logic here */
+                memcpy(&ctx->b[ctx->h.bCnt],msg,n);
+                msgByteCnt  -= n;
+                msg         += n;
+                ctx->h.bCnt += n;
+            }
+            Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES);
+            Skein1024_Process_Block(ctx,ctx->b,1,SKEIN1024_BLOCK_BYTES);
+            ctx->h.bCnt = 0;
+        }
+        /* now process any remaining full blocks, directly from input message data */
+        if (msgByteCnt > SKEIN1024_BLOCK_BYTES)
+        {
+            n = (msgByteCnt-1) / SKEIN1024_BLOCK_BYTES;   /* number of full blocks to process; the -1 leaves at least one byte behind for the final block */
+            Skein1024_Process_Block(ctx,msg,n,SKEIN1024_BLOCK_BYTES);
+            msgByteCnt -= n * SKEIN1024_BLOCK_BYTES;
+            msg        += n * SKEIN1024_BLOCK_BYTES;
+        }
+        Skein_assert(ctx->h.bCnt == 0);
+    }
+
+    /* copy any remaining source message data bytes into b[] */
+    if (msgByteCnt)
+    {
+        Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES);
+        memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+        ctx->h.bCnt += msgByteCnt;
+    }
+
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result: processes the buffered last block, then produces (hashBitLen+7)/8 output bytes into hashVal */
+int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    size_t i,n,byteCnt;
+    u64b_t X[SKEIN1024_STATE_WORDS];
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
+    if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)            /* zero pad b[] if necessary */
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+
+    Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
+
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++)
+    {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN1024_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN1024_BLOCK_BYTES)
+            n  = SKEIN1024_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES);
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+    }
+    return SKEIN_SUCCESS;
+}
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) /* size probe: byte distance between this function and Skein1024_Init */
+size_t Skein1024_API_CodeSize(void)
+{
+    return ((u08b_t *) Skein1024_API_CodeSize) -
+           ((u08b_t *) Skein1024_Init);
+}
+#endif
+
+/**************** Functions to support MAC/tree hashing ***************/
+/* (this code is identical for Optimized and Reference
versions) */
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the block, no OUTPUT stage: processes the buffered last block and writes the SKEIN_256_BLOCK_BYTES state bytes to hashVal */
+int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;        /* tag as the final block */
+    if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)   /* zero pad b[] if necessary */
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+    Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);    /* process the final block */
+
+    Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_256_BLOCK_BYTES);   /* "output" the state bytes */
+
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the block, no OUTPUT stage: processes the buffered last block and writes the SKEIN_512_BLOCK_BYTES state bytes to hashVal */
+int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;        /* tag as the final block */
+    if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)   /* zero pad b[] if necessary */
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+    Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);    /* process the final block */
+
+    Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_512_BLOCK_BYTES);   /* "output" the state bytes */
+
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the block, no OUTPUT stage: processes the buffered last block and writes the SKEIN1024_BLOCK_BYTES state bytes to hashVal */
+int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;        /* tag as the final block */
+    if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)   /* zero pad b[] if necessary */
+        memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+    Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);    /* process the final block */
+
+    Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN1024_BLOCK_BYTES);   /* "output" the state bytes */
+
+    return SKEIN_SUCCESS;
+}
+
+#if SKEIN_TREE_HASH
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* just do the OUTPUT stage: counter-mode expansion of the current ctx->X into (hashBitLen+7)/8 bytes at hashVal, without processing a final message block */
+int Skein_256_Output(Skein_256_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    size_t i,n,byteCnt;
+    u64b_t X[SKEIN_256_STATE_WORDS];
+    Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;    /* total number of output bytes */
+
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN_256_BLOCK_BYTES < byteCnt;i++)
+    {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN_256_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN_256_BLOCK_BYTES)
+            n  = SKEIN_256_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN_256_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES);
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+    }
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* just do the OUTPUT stage: counter-mode expansion of the current ctx->X into (hashBitLen+7)/8 bytes at hashVal, without processing a final message block */
+int Skein_512_Output(Skein_512_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    size_t i,n,byteCnt;
+    u64b_t X[SKEIN_512_STATE_WORDS];
+    Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;    /* total number of output bytes */
+
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++)
+    {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN_512_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN_512_BLOCK_BYTES)
+            n  = SKEIN_512_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES);
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+    }
+    return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* just do the OUTPUT stage: counter-mode expansion of the current ctx->X into (hashBitLen+7)/8 bytes at hashVal, without processing a final message block */
+int Skein1024_Output(Skein1024_Ctxt_t *ctx, u08b_t *hashVal)
+{
+    size_t i,n,byteCnt;
+    u64b_t X[SKEIN1024_STATE_WORDS];
+    Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+    /* now output the result */
+    byteCnt = (ctx->h.hashBitLen + 7) >> 3;    /* total number of output bytes */
+
+    /* run Threefish in "counter mode" to generate output */
+    memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+    memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+    for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++)
+    {
+        ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+        Skein_Start_New_Type(ctx,OUT_FINAL);
+        Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+        n = byteCnt - i*SKEIN1024_BLOCK_BYTES;   /* number of output bytes left to go */
+        if (n >= SKEIN1024_BLOCK_BYTES)
+            n  = SKEIN1024_BLOCK_BYTES;
+        Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+        Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES);
+        memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+    }
+    return SKEIN_SUCCESS;
+}
+#endif
Index: sys/contrib/skein/skein_block.c
===================================================================
--- /dev/null
+++ sys/contrib/skein/skein_block.c
@@ -0,0 +1,689 @@
+/***********************************************************************
+**
+** Implementation of the Skein block functions.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+** Compile-time switches:
+**
+**  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
+**                    versions use ASM code for block processing
+**                    [default: use C for all block sizes]
+**
+************************************************************************/
+
+#include <string.h> /* NOTE(review): header name was lost in extraction; <string.h> assumed -- confirm against the original patch */
+#include "skein.h"
+
+#ifndef SKEIN_USE_ASM
+#define SKEIN_USE_ASM   (0)                     /* default is all C code (no ASM) */
+#endif
+
+#ifndef SKEIN_LOOP
+#define SKEIN_LOOP 001                          /* default: unroll 256 and 512, but not 1024 */
+#endif
+
+#define BLK_BITS        (WCNT*64)               /* some useful definitions for code here */
+#define KW_TWK_BASE     (0)
+#define KW_KEY_BASE     (3)
+#define ks              (kw + KW_KEY_BASE)
+#define ts              (kw + KW_TWK_BASE)
+
+#ifdef SKEIN_DEBUG
+#define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; }
+#else
+#define DebugSaveTweak(ctx)
+#endif
+
+/*****************************  Skein_256 ******************************/
+#if !(SKEIN_USE_ASM & 256)
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
+    { /* do it in C */
+    enum
+        {
+        WCNT = SKEIN_256_STATE_WORDS
+        };
+#undef  RCNT
+#define RCNT  (SKEIN_256_ROUNDS_TOTAL/8)
+
+#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_256
(((SKEIN_LOOP)/100)%10) +#else +#define SKEIN_UNROLL_256 (0) +#endif + +#if SKEIN_UNROLL_256 +#if (RCNT % SKEIN_UNROLL_256) +#error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */ +#endif + size_t r; + u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ +#else + u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ +#endif + u64b_t X0,X1,X2,X3; /* local copy of context vars, for speed */ + u64b_t w [WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + const u64b_t *Xptr[4]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3; +#endif + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* this implementation only supports 2**64 input bytes (no carry out here) */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1] + ts[0]; + X2 = w[2] + ks[2] + ts[1]; + X3 = w[3] + ks[3]; + + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); /* show starting state values */ + + blkPtr += SKEIN_256_BLOCK_BYTES; + + /* run the rounds */ + +#define Round256(p0,p1,p2,p3,ROT,rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ + +#if SKEIN_UNROLL_256 == 0 +#define R256(p0,p1,p2,p3,ROT,rNum) /* fully unrolled */ \ + Round256(p0,p1,p2,p3,ROT,rNum) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr); + +#define I256(R) \ + 
X0 += ks[((R)+1) % 5]; /* inject the key schedule value */ \ + X1 += ks[((R)+2) % 5] + ts[((R)+1) % 3]; \ + X2 += ks[((R)+3) % 5] + ts[((R)+2) % 3]; \ + X3 += ks[((R)+4) % 5] + (R)+1; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); +#else /* looping version */ +#define R256(p0,p1,p2,p3,ROT,rNum) \ + Round256(p0,p1,p2,p3,ROT,rNum) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr); + +#define I256(R) \ + X0 += ks[r+(R)+0]; /* inject the key schedule value */ \ + X1 += ks[r+(R)+1] + ts[r+(R)+0]; \ + X2 += ks[r+(R)+2] + ts[r+(R)+1]; \ + X3 += ks[r+(R)+3] + r+(R) ; \ + ks[r + (R)+4 ] = ks[r+(R)-1]; /* rotate key schedule */\ + ts[r + (R)+2 ] = ts[r+(R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); + + for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_256) /* loop thru it */ +#endif + { +#define R256_8_rounds(R) \ + R256(0,1,2,3,R_256_0,8*(R) + 1); \ + R256(0,3,2,1,R_256_1,8*(R) + 2); \ + R256(0,1,2,3,R_256_2,8*(R) + 3); \ + R256(0,3,2,1,R_256_3,8*(R) + 4); \ + I256(2*(R)); \ + R256(0,1,2,3,R_256_4,8*(R) + 5); \ + R256(0,3,2,1,R_256_5,8*(R) + 6); \ + R256(0,1,2,3,R_256_6,8*(R) + 7); \ + R256(0,3,2,1,R_256_7,8*(R) + 8); \ + I256(2*(R)+1); + + R256_8_rounds( 0); + +#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN))) + + #if R256_Unroll_R( 1) + R256_8_rounds( 1); + #endif + #if R256_Unroll_R( 2) + R256_8_rounds( 2); + #endif + #if R256_Unroll_R( 3) + R256_8_rounds( 3); + #endif + #if R256_Unroll_R( 4) + R256_8_rounds( 4); + #endif + #if R256_Unroll_R( 5) + R256_8_rounds( 5); + #endif + #if R256_Unroll_R( 6) + R256_8_rounds( 6); + #endif + #if R256_Unroll_R( 7) + R256_8_rounds( 7); + #endif + #if R256_Unroll_R( 8) + R256_8_rounds( 8); + #endif + #if R256_Unroll_R( 9) + R256_8_rounds( 9); + #endif + #if R256_Unroll_R(10) + R256_8_rounds(10); + #endif + #if R256_Unroll_R(11) + R256_8_rounds(11); + #endif + #if R256_Unroll_R(12) + R256_8_rounds(12); + #endif + #if 
R256_Unroll_R(13) + R256_8_rounds(13); + #endif + #if R256_Unroll_R(14) + R256_8_rounds(14); + #endif + #if (SKEIN_UNROLL_256 > 14) +#error "need more unrolling in Skein_256_Process_Block" + #endif + } + /* do the final "feedforward" xor, update context chaining vars */ + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + + Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t Skein_256_Process_Block_CodeSize(void) + { + return ((u08b_t *) Skein_256_Process_Block_CodeSize) - + ((u08b_t *) Skein_256_Process_Block); + } +uint_t Skein_256_Unroll_Cnt(void) + { + return SKEIN_UNROLL_256; + } +#endif +#endif + +/***************************** Skein_512 ******************************/ +#if !(SKEIN_USE_ASM & 512) +void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) + { /* do it in C */ + enum + { + WCNT = SKEIN_512_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN_512_ROUNDS_TOTAL/8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10) +#else +#define SKEIN_UNROLL_512 (0) +#endif + +#if SKEIN_UNROLL_512 +#if (RCNT % SKEIN_UNROLL_512) +#error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */ +#endif + size_t r; + u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ +#else + u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ +#endif + u64b_t X0,X1,X2,X3,X4,X5,X6,X7; /* local copy of vars, for speed */ + u64b_t w [WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + const u64b_t *Xptr[8]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3; + Xptr[4] = &X4; Xptr[5] = &X5; Xptr[6] = &X6; Xptr[7] = &X7; 
+#endif + + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* this implementation only supports 2**64 input bytes (no carry out here) */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; + + blkPtr += SKEIN_512_BLOCK_BYTES; + + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); + /* run the rounds */ +#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ + X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ + X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ + +#if SKEIN_UNROLL_512 == 0 +#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* unrolled */ \ + Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr); + +#define I512(R) \ + X0 += ks[((R)+1) % 9]; /* inject the key schedule value */ \ + X1 += ks[((R)+2) % 9]; \ + X2 += ks[((R)+3) % 9]; \ + X3 += ks[((R)+4) % 9]; \ + X4 += ks[((R)+5) % 9]; \ + X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ + X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ + X7 += ks[((R)+8) % 9] + (R)+1; \ + 
Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); +#else /* looping version */ +#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr); + +#define I512(R) \ + X0 += ks[r+(R)+0]; /* inject the key schedule value */ \ + X1 += ks[r+(R)+1]; \ + X2 += ks[r+(R)+2]; \ + X3 += ks[r+(R)+3]; \ + X4 += ks[r+(R)+4]; \ + X5 += ks[r+(R)+5] + ts[r+(R)+0]; \ + X6 += ks[r+(R)+6] + ts[r+(R)+1]; \ + X7 += ks[r+(R)+7] + r+(R) ; \ + ks[r + (R)+8] = ks[r+(R)-1]; /* rotate key schedule */ \ + ts[r + (R)+2] = ts[r+(R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); + + for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512) /* loop thru it */ +#endif /* end of looped code definitions */ + { +#define R512_8_rounds(R) /* do 8 full rounds */ \ + R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ + R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ + R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ + R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ + I512(2*(R)); \ + R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ + R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ + R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ + R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ + I512(2*(R)+1); /* and key injection */ + + R512_8_rounds( 0); + +#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN))) + + #if R512_Unroll_R( 1) + R512_8_rounds( 1); + #endif + #if R512_Unroll_R( 2) + R512_8_rounds( 2); + #endif + #if R512_Unroll_R( 3) + R512_8_rounds( 3); + #endif + #if R512_Unroll_R( 4) + R512_8_rounds( 4); + #endif + #if R512_Unroll_R( 5) + R512_8_rounds( 5); + #endif + #if R512_Unroll_R( 6) + R512_8_rounds( 6); + #endif + #if R512_Unroll_R( 7) + R512_8_rounds( 7); + #endif + #if R512_Unroll_R( 8) + R512_8_rounds( 8); + #endif + #if R512_Unroll_R( 9) + R512_8_rounds( 9); + #endif + #if R512_Unroll_R(10) + R512_8_rounds(10); + #endif + #if R512_Unroll_R(11) + R512_8_rounds(11); + #endif + #if 
R512_Unroll_R(12) + R512_8_rounds(12); + #endif + #if R512_Unroll_R(13) + R512_8_rounds(13); + #endif + #if R512_Unroll_R(14) + R512_8_rounds(14); + #endif + #if (SKEIN_UNROLL_512 > 14) +#error "need more unrolling in Skein_512_Process_Block" + #endif + } + + /* do the final "feedforward" xor, update context chaining vars */ + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + ctx->X[4] = X4 ^ w[4]; + ctx->X[5] = X5 ^ w[5]; + ctx->X[6] = X6 ^ w[6]; + ctx->X[7] = X7 ^ w[7]; + Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t Skein_512_Process_Block_CodeSize(void) + { + return ((u08b_t *) Skein_512_Process_Block_CodeSize) - + ((u08b_t *) Skein_512_Process_Block); + } +uint_t Skein_512_Unroll_Cnt(void) + { + return SKEIN_UNROLL_512; + } +#endif +#endif + +/***************************** Skein1024 ******************************/ +#if !(SKEIN_USE_ASM & 1024) +void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) + { /* do it in C, always looping (unrolled is bigger AND slower!) 
*/ + enum + { + WCNT = SKEIN1024_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN1024_ROUNDS_TOTAL/8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10) +#else +#define SKEIN_UNROLL_1024 (0) +#endif + +#if (SKEIN_UNROLL_1024 != 0) +#if (RCNT % SKEIN_UNROLL_1024) +#error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */ +#endif + size_t r; + u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ +#else + u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ +#endif + + u64b_t X00,X01,X02,X03,X04,X05,X06,X07, /* local copy of vars, for speed */ + X08,X09,X10,X11,X12,X13,X14,X15; + u64b_t w [WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + const u64b_t *Xptr[16]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[ 0] = &X00; Xptr[ 1] = &X01; Xptr[ 2] = &X02; Xptr[ 3] = &X03; + Xptr[ 4] = &X04; Xptr[ 5] = &X05; Xptr[ 6] = &X06; Xptr[ 7] = &X07; + Xptr[ 8] = &X08; Xptr[ 9] = &X09; Xptr[10] = &X10; Xptr[11] = &X11; + Xptr[12] = &X12; Xptr[13] = &X13; Xptr[14] = &X14; Xptr[15] = &X15; +#endif + + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* this implementation only supports 2**64 input bytes (no carry out here) */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[ 0] = ctx->X[ 0]; + ks[ 1] = ctx->X[ 1]; + ks[ 2] = ctx->X[ 2]; + ks[ 3] = ctx->X[ 3]; + ks[ 4] = ctx->X[ 4]; + ks[ 5] = ctx->X[ 5]; + ks[ 6] = ctx->X[ 6]; + ks[ 7] = ctx->X[ 7]; + ks[ 8] = ctx->X[ 8]; + ks[ 9] = ctx->X[ 9]; + ks[10] = ctx->X[10]; + ks[11] = ctx->X[11]; + ks[12] = ctx->X[12]; + ks[13] = ctx->X[13]; + ks[14] = ctx->X[14]; + ks[15] = ctx->X[15]; + ks[16] = ks[ 0] ^ ks[ 1] ^ ks[ 2] ^ ks[ 3] ^ + ks[ 4] ^ ks[ 5] ^ ks[ 6] ^ ks[ 7] ^ + ks[ 8] ^ ks[ 9] ^ ks[10] ^ ks[11] ^ + ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + + X00 = w[ 0] + ks[ 0]; /* do the first full key injection */ + X01 = w[ 1] + ks[ 1]; + X02 = w[ 2] + ks[ 2]; + X03 = w[ 3] + ks[ 3]; + X04 = w[ 4] + ks[ 4]; + X05 = w[ 5] + ks[ 5]; + X06 = w[ 6] + ks[ 6]; + X07 = w[ 7] + ks[ 7]; + X08 = w[ 8] + ks[ 8]; + X09 = w[ 9] + ks[ 9]; + X10 = w[10] + ks[10]; + X11 = w[11] + ks[11]; + X12 = w[12] + ks[12]; + X13 = w[13] + ks[13] + ts[0]; + X14 = w[14] + ks[14] + ts[1]; + X15 = w[15] + ks[15]; + + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); + +#define Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ + X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ + X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ + X##p8 += X##p9; X##p9 = RotL_64(X##p9,ROT##_4); X##p9 ^= X##p8; \ + X##pA += X##pB; X##pB = RotL_64(X##pB,ROT##_5); X##pB ^= X##pA; \ + X##pC += X##pD; 
X##pD = RotL_64(X##pD,ROT##_6); X##pD ^= X##pC; \ + X##pE += X##pF; X##pF = RotL_64(X##pF,ROT##_7); X##pF ^= X##pE; \ + +#if SKEIN_UNROLL_1024 == 0 +#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ + Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rn,Xptr); + +#define I1024(R) \ + X00 += ks[((R)+ 1) % 17]; /* inject the key schedule value */ \ + X01 += ks[((R)+ 2) % 17]; \ + X02 += ks[((R)+ 3) % 17]; \ + X03 += ks[((R)+ 4) % 17]; \ + X04 += ks[((R)+ 5) % 17]; \ + X05 += ks[((R)+ 6) % 17]; \ + X06 += ks[((R)+ 7) % 17]; \ + X07 += ks[((R)+ 8) % 17]; \ + X08 += ks[((R)+ 9) % 17]; \ + X09 += ks[((R)+10) % 17]; \ + X10 += ks[((R)+11) % 17]; \ + X11 += ks[((R)+12) % 17]; \ + X12 += ks[((R)+13) % 17]; \ + X13 += ks[((R)+14) % 17] + ts[((R)+1) % 3]; \ + X14 += ks[((R)+15) % 17] + ts[((R)+2) % 3]; \ + X15 += ks[((R)+16) % 17] + (R)+1; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); +#else /* looping version */ +#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ + Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rn,Xptr); + +#define I1024(R) \ + X00 += ks[r+(R)+ 0]; /* inject the key schedule value */ \ + X01 += ks[r+(R)+ 1]; \ + X02 += ks[r+(R)+ 2]; \ + X03 += ks[r+(R)+ 3]; \ + X04 += ks[r+(R)+ 4]; \ + X05 += ks[r+(R)+ 5]; \ + X06 += ks[r+(R)+ 6]; \ + X07 += ks[r+(R)+ 7]; \ + X08 += ks[r+(R)+ 8]; \ + X09 += ks[r+(R)+ 9]; \ + X10 += ks[r+(R)+10]; \ + X11 += ks[r+(R)+11]; \ + X12 += ks[r+(R)+12]; \ + X13 += ks[r+(R)+13] + ts[r+(R)+0]; \ + X14 += ks[r+(R)+14] + ts[r+(R)+1]; \ + X15 += ks[r+(R)+15] + r+(R) ; \ + ks[r + (R)+16] = ks[r+(R)-1]; /* rotate key schedule */ \ + ts[r + (R)+ 2] = ts[r+(R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); + + for (r=1;r <= 2*RCNT;r+=2*SKEIN_UNROLL_1024) /* loop thru it */ +#endif + { +#define R1024_8_rounds(R) /* do 8 full rounds */ \ + 
R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_0,8*(R) + 1); \ + R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_1,8*(R) + 2); \ + R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_2,8*(R) + 3); \ + R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_3,8*(R) + 4); \ + I1024(2*(R)); \ + R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_4,8*(R) + 5); \ + R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_5,8*(R) + 6); \ + R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_6,8*(R) + 7); \ + R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_7,8*(R) + 8); \ + I1024(2*(R)+1); + + R1024_8_rounds( 0); + +#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_1024 > (NN))) + + #if R1024_Unroll_R( 1) + R1024_8_rounds( 1); + #endif + #if R1024_Unroll_R( 2) + R1024_8_rounds( 2); + #endif + #if R1024_Unroll_R( 3) + R1024_8_rounds( 3); + #endif + #if R1024_Unroll_R( 4) + R1024_8_rounds( 4); + #endif + #if R1024_Unroll_R( 5) + R1024_8_rounds( 5); + #endif + #if R1024_Unroll_R( 6) + R1024_8_rounds( 6); + #endif + #if R1024_Unroll_R( 7) + R1024_8_rounds( 7); + #endif + #if R1024_Unroll_R( 8) + R1024_8_rounds( 8); + #endif + #if R1024_Unroll_R( 9) + R1024_8_rounds( 9); + #endif + #if R1024_Unroll_R(10) + R1024_8_rounds(10); + #endif + #if R1024_Unroll_R(11) + R1024_8_rounds(11); + #endif + #if R1024_Unroll_R(12) + R1024_8_rounds(12); + #endif + #if R1024_Unroll_R(13) + R1024_8_rounds(13); + #endif + #if R1024_Unroll_R(14) + R1024_8_rounds(14); + #endif + #if (SKEIN_UNROLL_1024 > 14) +#error "need more unrolling in Skein_1024_Process_Block" + #endif + } + /* do the final "feedforward" xor, update context chaining vars */ + + ctx->X[ 0] = X00 ^ w[ 0]; + ctx->X[ 1] = X01 ^ w[ 1]; + ctx->X[ 2] = X02 ^ w[ 2]; + ctx->X[ 3] = X03 ^ w[ 3]; + ctx->X[ 4] = X04 ^ w[ 4]; + ctx->X[ 5] = X05 ^ w[ 5]; + ctx->X[ 6] = X06 ^ w[ 6]; + ctx->X[ 7] = X07 ^ w[ 7]; + ctx->X[ 8] 
= X08 ^ w[ 8]; + ctx->X[ 9] = X09 ^ w[ 9]; + ctx->X[10] = X10 ^ w[10]; + ctx->X[11] = X11 ^ w[11]; + ctx->X[12] = X12 ^ w[12]; + ctx->X[13] = X13 ^ w[13]; + ctx->X[14] = X14 ^ w[14]; + ctx->X[15] = X15 ^ w[15]; + + Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + blkPtr += SKEIN1024_BLOCK_BYTES; + } + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t Skein1024_Process_Block_CodeSize(void) + { + return ((u08b_t *) Skein1024_Process_Block_CodeSize) - + ((u08b_t *) Skein1024_Process_Block); + } +uint_t Skein1024_Unroll_Cnt(void) + { + return SKEIN_UNROLL_1024; + } +#endif +#endif Index: sys/contrib/skein/skein_debug.h =================================================================== --- /dev/null +++ sys/contrib/skein/skein_debug.h @@ -0,0 +1,48 @@ +#ifndef _SKEIN_DEBUG_H_ +#define _SKEIN_DEBUG_H_ +/*********************************************************************** +** +** Interface definitions for Skein hashing debug output. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. 
+** +************************************************************************/ + +#ifdef SKEIN_DEBUG +/* callout functions used inside Skein code */ +void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,const u08b_t *blkPtr, + const u64b_t *wPtr,const u64b_t *ksPtr,const u64b_t *tsPtr); +void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X); +void Skein_Show_R_Ptr(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X_ptr[]); +void Skein_Show_Final(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t cnt,const u08b_t *outPtr); +void Skein_Show_Key (uint_t bits,const Skein_Ctxt_Hdr_t *h,const u08b_t *key,size_t keyBytes); + +extern uint_t skein_DebugFlag; /* flags to control debug output (0 --> none) */ + +#define SKEIN_RND_SPECIAL (1000u) +#define SKEIN_RND_KEY_INITIAL (SKEIN_RND_SPECIAL+0u) +#define SKEIN_RND_KEY_INJECT (SKEIN_RND_SPECIAL+1u) +#define SKEIN_RND_FEED_FWD (SKEIN_RND_SPECIAL+2u) + +/* flag bits: skein_DebugFlag */ +#define SKEIN_DEBUG_KEY (1u << 1) /* show MAC key */ +#define SKEIN_DEBUG_CONFIG (1u << 2) /* show config block processing */ +#define SKEIN_DEBUG_STATE (1u << 3) /* show input state during Show_Block() */ +#define SKEIN_DEBUG_TWEAK (1u << 4) /* show input state during Show_Block() */ +#define SKEIN_DEBUG_KEYSCHED (1u << 5) /* show expanded key schedule */ +#define SKEIN_DEBUG_INPUT_64 (1u << 6) /* show input block as 64-bit words */ +#define SKEIN_DEBUG_INPUT_08 (1u << 7) /* show input block as 8-bit bytes */ +#define SKEIN_DEBUG_INJECT (1u << 8) /* show state after key injection & feedforward points */ +#define SKEIN_DEBUG_ROUNDS (1u << 9) /* show state after all rounds */ +#define SKEIN_DEBUG_FINAL (1u <<10) /* show final output of Skein */ +#define SKEIN_DEBUG_HDR (1u <<11) /* show block header */ +#define SKEIN_DEBUG_THREEFISH (1u <<12) /* use Threefish name instead of Skein */ +#define SKEIN_DEBUG_PERMUTE (1u <<13) /* use word permutations */ +#define SKEIN_DEBUG_ALL ((~0u) & 
~(SKEIN_DEBUG_THREEFISH | SKEIN_DEBUG_PERMUTE)) +#define THREEFISH_DEBUG_ALL (SKEIN_DEBUG_ALL | SKEIN_DEBUG_THREEFISH) + +#endif /* SKEIN_DEBUG */ + +#endif /* _SKEIN_DEBUG_H_ */ Index: sys/contrib/skein/skein_debug.c =================================================================== --- /dev/null +++ sys/contrib/skein/skein_debug.c @@ -0,0 +1,247 @@ +/*********************************************************************** +** +** Debug output functions for Skein hashing. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +************************************************************************/ +#include <stdio.h> /* printf(), fflush() used by the dump helpers below */ + +#ifdef SKEIN_DEBUG /* only instantiate this code if SKEIN_DEBUG is on */ +#include "skein.h" + +static const char INDENT[] = " "; /* how much to indent on new line */ + +uint_t skein_DebugFlag = 0; /* off by default. Must be set externally */ + +static void Show64_step(size_t cnt,const u64b_t *X,size_t step) + { /* print cnt 64-bit words X[0],X[step],X[2*step],... as hex, four per output line */ + size_t i,j; + for (i=j=0;i < cnt;i++,j+=step) + { + if (i % 4 == 0) printf(INDENT); + printf(" %08X.%08X ",(uint_32t)(X[j] >> 32),(uint_32t)X[j]); + if (i % 4 == 3 || i==cnt-1) printf("\n"); + fflush(stdout); + } + } + +#define Show64(cnt,X) Show64_step(cnt,X,1) + +static void Show64_flag(size_t cnt,const u64b_t *X) + { + size_t xptr = (size_t) X; + size_t step = (xptr & 1) ?
2 : 1; + if (step != 1) + { + X = (const u64b_t *) (xptr & ~1); + } + Show64_step(cnt,X,step); + } + +static void Show08(size_t cnt,const u08b_t *b) + { + size_t i; + for (i=0;i < cnt;i++) + { + if (i %16 == 0) printf(INDENT); + else if (i % 4 == 0) printf(" "); + printf(" %02X",b[i]); + if (i %16 == 15 || i==cnt-1) printf("\n"); + fflush(stdout); + } + } + +static const char *AlgoHeader(uint_t bits) + { + if (skein_DebugFlag & SKEIN_DEBUG_THREEFISH) + switch (bits) + { + case 256: return ":Threefish-256: "; + case 512: return ":Threefish-512: "; + case 1024: return ":Threefish-1024:"; + } + else + switch (bits) + { + case 256: return ":Skein-256: "; + case 512: return ":Skein-512: "; + case 1024: return ":Skein-1024:"; + } + return NULL; + } + +void Skein_Show_Final(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t cnt,const u08b_t *outPtr) + { + if (skein_DebugFlag & SKEIN_DEBUG_CONFIG || ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) != SKEIN_T1_BLK_TYPE_CFG)) + if (skein_DebugFlag & SKEIN_DEBUG_FINAL) + { + printf("\n%s Final output=\n",AlgoHeader(bits)); + Show08(cnt,outPtr); + printf(" ++++++++++\n"); + fflush(stdout); + } + } + +/* show state after a round (or "pseudo-round") */ +void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X) + { + static uint_t injectNum=0; /* not multi-thread safe! */ + + if (skein_DebugFlag & SKEIN_DEBUG_CONFIG || ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) != SKEIN_T1_BLK_TYPE_CFG)) + if (skein_DebugFlag) + { + if (r >= SKEIN_RND_SPECIAL) + { /* a key injection (or feedforward) point */ + injectNum = (r == SKEIN_RND_KEY_INITIAL) ? 
0 : injectNum+1; + if ( skein_DebugFlag & SKEIN_DEBUG_INJECT || + ((skein_DebugFlag & SKEIN_DEBUG_FINAL) && r == SKEIN_RND_FEED_FWD)) + { + printf("\n%s",AlgoHeader(bits)); + switch (r) + { + case SKEIN_RND_KEY_INITIAL: + printf(" [state after initial key injection]"); + break; + case SKEIN_RND_KEY_INJECT: + printf(" [state after key injection #%02d]",injectNum); + break; + case SKEIN_RND_FEED_FWD: + printf(" [state after plaintext feedforward]"); + injectNum = 0; + break; + } + printf("=\n"); + Show64(bits/64,X); + if (r== SKEIN_RND_FEED_FWD) + printf(" ----------\n"); + } + } + else if (skein_DebugFlag & SKEIN_DEBUG_ROUNDS) + { + uint_t j; + u64b_t p[SKEIN_MAX_STATE_WORDS]; + const u08b_t *perm; + const static u08b_t PERM_256 [4][ 4] = { { 0,1,2,3 }, { 0,3,2,1 }, { 0,1,2,3 }, { 0,3,2,1 } }; + const static u08b_t PERM_512 [4][ 8] = { { 0,1,2,3,4,5,6,7 }, + { 2,1,4,7,6,5,0,3 }, + { 4,1,6,3,0,5,2,7 }, + { 6,1,0,7,2,5,4,3 } + }; + const static u08b_t PERM_1024[4][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 }, + { 0, 9, 2,13, 6,11, 4,15,10, 7,12, 3,14, 5, 8, 1 }, + { 0, 7, 2, 5, 4, 3, 6, 1,12,15,14,13, 8,11,10, 9 }, + { 0,15, 2,11, 6,13, 4, 9,14, 1, 8, 5,10, 3,12, 7 } + }; + + if ((skein_DebugFlag & SKEIN_DEBUG_PERMUTE) && (r & 3)) + { + printf("\n%s [state after round %2d (permuted)]=\n",AlgoHeader(bits),(int)r); + switch (bits) + { + case 256: perm = PERM_256 [r&3]; break; + case 512: perm = PERM_512 [r&3]; break; + default: perm = PERM_1024[r&3]; break; + } + for (j=0;j<bits/64;j++) /* NOTE(review): text from here through the start of Skein_Show_Block was restored after extraction loss -- confirm against upstream skein_debug.c */ + p[j] = X[perm[j]]; + Show64(bits/64,p); + } + else + { + printf("\n%s [state after round %2d]=\n",AlgoHeader(bits),(int)r); + Show64(bits/64,X); + } + } + } + } + +/* show state after a round (or "pseudo-round"), given a list of pointers to variables */ +void Skein_Show_R_Ptr(uint_t bits,const Skein_Ctxt_Hdr_t *h,size_t r,const u64b_t *X_ptr[]) + { + uint_t i; + u64b_t X[SKEIN_MAX_STATE_WORDS]; + + for (i=0;i<bits/64;i++) /* copy over the words */ + X[i] = X_ptr[i][0]; + Skein_Show_Round(bits,h,r,X); + } + + +/* show the state at the start of a block */ +void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X,const u08b_t *blkPtr, + const u64b_t *wPtr,const u64b_t *ksPtr,const u64b_t *tsPtr) + { + uint_t n; + if (skein_DebugFlag & SKEIN_DEBUG_CONFIG || ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) != SKEIN_T1_BLK_TYPE_CFG)) + if (skein_DebugFlag) + { + if (skein_DebugFlag & SKEIN_DEBUG_HDR) + { + printf("\n%s Block: outBits=%4d. T0=%06X.",AlgoHeader(bits),(uint_t) h->hashBitLen,(uint_t)h->T[0]); + printf(" Type="); + n = (uint_t) ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) >> SKEIN_T1_POS_BLK_TYPE); + switch (n) + { + case SKEIN_BLK_TYPE_KEY: printf("KEY. "); break; + case SKEIN_BLK_TYPE_CFG: printf("CFG.
"); break; + case SKEIN_BLK_TYPE_PERS: printf("PERS."); break; + case SKEIN_BLK_TYPE_PK : printf("PK. "); break; + case SKEIN_BLK_TYPE_KDF: printf("KDF. "); break; + case SKEIN_BLK_TYPE_MSG: printf("MSG. "); break; + case SKEIN_BLK_TYPE_OUT: printf("OUT. "); break; + default: printf("0x%02X.",n); break; + } + printf(" Flags="); + printf((h->T[1] & SKEIN_T1_FLAG_FIRST) ? " First":" "); + printf((h->T[1] & SKEIN_T1_FLAG_FINAL) ? " Final":" "); + printf((h->T[1] & SKEIN_T1_FLAG_BIT_PAD) ? " Pad" :" "); + n = (uint_t) ((h->T[1] & SKEIN_T1_TREE_LVL_MASK) >> SKEIN_T1_POS_TREE_LVL); + if (n) + printf(" TreeLevel = %02X",n); + printf("\n"); + fflush(stdout); + } + if (skein_DebugFlag & SKEIN_DEBUG_TWEAK) + { + printf(" Tweak:\n"); + Show64(2,h->T); + } + if (skein_DebugFlag & SKEIN_DEBUG_STATE) + { + printf(" %s words:\n",(skein_DebugFlag & SKEIN_DEBUG_THREEFISH)?"Key":"State"); + Show64(bits/64,X); + } + if (skein_DebugFlag & SKEIN_DEBUG_KEYSCHED) + { + printf(" Tweak schedule:\n"); + Show64_flag(3,tsPtr); + printf(" Key schedule:\n"); + Show64_flag((bits/64)+1,ksPtr); + } + if (skein_DebugFlag & SKEIN_DEBUG_INPUT_64) + { + printf(" Input block (words):\n"); + Show64(bits/64,wPtr); + } + if (skein_DebugFlag & SKEIN_DEBUG_INPUT_08) + { + printf(" Input block (bytes):\n"); + Show08(bits/8,blkPtr); + } + } + } + +void Skein_Show_Key(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u08b_t *key,size_t keyBytes) + { + if (keyBytes) + if (skein_DebugFlag & SKEIN_DEBUG_CONFIG || ((h->T[1] & SKEIN_T1_BLK_TYPE_MASK) != SKEIN_T1_BLK_TYPE_CFG)) + if (skein_DebugFlag & SKEIN_DEBUG_KEY) + { + printf("\n%s MAC key = %4u bytes\n",AlgoHeader(bits),(unsigned) keyBytes); + Show08(keyBytes,key); + } + } +#endif Index: sys/contrib/skein/skein_iv.h =================================================================== --- /dev/null +++ sys/contrib/skein/skein_iv.h @@ -0,0 +1,199 @@ +#ifndef _SKEIN_IV_H_ +#define _SKEIN_IV_H_ + +#include "skein.h" /* get Skein macros and types */ + +/* 
+***************** Pre-computed Skein IVs ******************* +** +** NOTE: these values are not "magic" constants, but +** are generated using the Threefish block function. +** They are pre-computed here only for speed; i.e., to +** avoid the need for a Threefish call during Init(). +** +** The IV for any fixed hash length may be pre-computed. +** Only the most common values are included here. +** +************************************************************ +**/ + +#define MK_64 SKEIN_MK_64 + +/* blkSize = 256 bits. hashSize = 128 bits */ +const u64b_t SKEIN_256_IV_128[] = + { + MK_64(0xE1111906,0x964D7260), + MK_64(0x883DAAA7,0x7C8D811C), + MK_64(0x10080DF4,0x91960F7A), + MK_64(0xCCF7DDE5,0xB45BC1C2) + }; + +/* blkSize = 256 bits. hashSize = 160 bits */ +const u64b_t SKEIN_256_IV_160[] = + { + MK_64(0x14202314,0x72825E98), + MK_64(0x2AC4E9A2,0x5A77E590), + MK_64(0xD47A5856,0x8838D63E), + MK_64(0x2DD2E496,0x8586AB7D) + }; + +/* blkSize = 256 bits. hashSize = 224 bits */ +const u64b_t SKEIN_256_IV_224[] = + { + MK_64(0xC6098A8C,0x9AE5EA0B), + MK_64(0x876D5686,0x08C5191C), + MK_64(0x99CB88D7,0xD7F53884), + MK_64(0x384BDDB1,0xAEDDB5DE) + }; + +/* blkSize = 256 bits. hashSize = 256 bits */ +const u64b_t SKEIN_256_IV_256[] = + { + MK_64(0xFC9DA860,0xD048B449), + MK_64(0x2FCA6647,0x9FA7D833), + MK_64(0xB33BC389,0x6656840F), + MK_64(0x6A54E920,0xFDE8DA69) + }; + +/* blkSize = 512 bits. hashSize = 128 bits */ +const u64b_t SKEIN_512_IV_128[] = + { + MK_64(0xA8BC7BF3,0x6FBF9F52), + MK_64(0x1E9872CE,0xBD1AF0AA), + MK_64(0x309B1790,0xB32190D3), + MK_64(0xBCFBB854,0x3F94805C), + MK_64(0x0DA61BCD,0x6E31B11B), + MK_64(0x1A18EBEA,0xD46A32E3), + MK_64(0xA2CC5B18,0xCE84AA82), + MK_64(0x6982AB28,0x9D46982D) + }; + +/* blkSize = 512 bits. 
hashSize = 160 bits */ +const u64b_t SKEIN_512_IV_160[] = + { + MK_64(0x28B81A2A,0xE013BD91), + MK_64(0xC2F11668,0xB5BDF78F), + MK_64(0x1760D8F3,0xF6A56F12), + MK_64(0x4FB74758,0x8239904F), + MK_64(0x21EDE07F,0x7EAF5056), + MK_64(0xD908922E,0x63ED70B8), + MK_64(0xB8EC76FF,0xECCB52FA), + MK_64(0x01A47BB8,0xA3F27A6E) + }; + +/* blkSize = 512 bits. hashSize = 224 bits */ +const u64b_t SKEIN_512_IV_224[] = + { + MK_64(0xCCD06162,0x48677224), + MK_64(0xCBA65CF3,0xA92339EF), + MK_64(0x8CCD69D6,0x52FF4B64), + MK_64(0x398AED7B,0x3AB890B4), + MK_64(0x0F59D1B1,0x457D2BD0), + MK_64(0x6776FE65,0x75D4EB3D), + MK_64(0x99FBC70E,0x997413E9), + MK_64(0x9E2CFCCF,0xE1C41EF7) + }; + +/* blkSize = 512 bits. hashSize = 256 bits */ +const u64b_t SKEIN_512_IV_256[] = + { + MK_64(0xCCD044A1,0x2FDB3E13), + MK_64(0xE8359030,0x1A79A9EB), + MK_64(0x55AEA061,0x4F816E6F), + MK_64(0x2A2767A4,0xAE9B94DB), + MK_64(0xEC06025E,0x74DD7683), + MK_64(0xE7A436CD,0xC4746251), + MK_64(0xC36FBAF9,0x393AD185), + MK_64(0x3EEDBA18,0x33EDFC13) + }; + +/* blkSize = 512 bits. hashSize = 384 bits */ +const u64b_t SKEIN_512_IV_384[] = + { + MK_64(0xA3F6C6BF,0x3A75EF5F), + MK_64(0xB0FEF9CC,0xFD84FAA4), + MK_64(0x9D77DD66,0x3D770CFE), + MK_64(0xD798CBF3,0xB468FDDA), + MK_64(0x1BC4A666,0x8A0E4465), + MK_64(0x7ED7D434,0xE5807407), + MK_64(0x548FC1AC,0xD4EC44D6), + MK_64(0x266E1754,0x6AA18FF8) + }; + +/* blkSize = 512 bits. hashSize = 512 bits */ +const u64b_t SKEIN_512_IV_512[] = + { + MK_64(0x4903ADFF,0x749C51CE), + MK_64(0x0D95DE39,0x9746DF03), + MK_64(0x8FD19341,0x27C79BCE), + MK_64(0x9A255629,0xFF352CB1), + MK_64(0x5DB62599,0xDF6CA7B0), + MK_64(0xEABE394C,0xA9D5C3F4), + MK_64(0x991112C7,0x1A75B523), + MK_64(0xAE18A40B,0x660FCC33) + }; + +/* blkSize = 1024 bits. 
hashSize = 384 bits */ +const u64b_t SKEIN1024_IV_384[] = + { + MK_64(0x5102B6B8,0xC1894A35), + MK_64(0xFEEBC9E3,0xFE8AF11A), + MK_64(0x0C807F06,0xE32BED71), + MK_64(0x60C13A52,0xB41A91F6), + MK_64(0x9716D35D,0xD4917C38), + MK_64(0xE780DF12,0x6FD31D3A), + MK_64(0x797846B6,0xC898303A), + MK_64(0xB172C2A8,0xB3572A3B), + MK_64(0xC9BC8203,0xA6104A6C), + MK_64(0x65909338,0xD75624F4), + MK_64(0x94BCC568,0x4B3F81A0), + MK_64(0x3EBBF51E,0x10ECFD46), + MK_64(0x2DF50F0B,0xEEB08542), + MK_64(0x3B5A6530,0x0DBC6516), + MK_64(0x484B9CD2,0x167BBCE1), + MK_64(0x2D136947,0xD4CBAFEA) + }; + +/* blkSize = 1024 bits. hashSize = 512 bits */ +const u64b_t SKEIN1024_IV_512[] = + { + MK_64(0xCAEC0E5D,0x7C1B1B18), + MK_64(0xA01B0E04,0x5F03E802), + MK_64(0x33840451,0xED912885), + MK_64(0x374AFB04,0xEAEC2E1C), + MK_64(0xDF25A0E2,0x813581F7), + MK_64(0xE4004093,0x8B12F9D2), + MK_64(0xA662D539,0xC2ED39B6), + MK_64(0xFA8B85CF,0x45D8C75A), + MK_64(0x8316ED8E,0x29EDE796), + MK_64(0x053289C0,0x2E9F91B8), + MK_64(0xC3F8EF1D,0x6D518B73), + MK_64(0xBDCEC3C4,0xD5EF332E), + MK_64(0x549A7E52,0x22974487), + MK_64(0x67070872,0x5B749816), + MK_64(0xB9CD28FB,0xF0581BD1), + MK_64(0x0E2940B8,0x15804974) + }; + +/* blkSize = 1024 bits. 
hashSize = 1024 bits */ +const u64b_t SKEIN1024_IV_1024[] = + { + MK_64(0xD593DA07,0x41E72355), + MK_64(0x15B5E511,0xAC73E00C), + MK_64(0x5180E5AE,0xBAF2C4F0), + MK_64(0x03BD41D3,0xFCBCAFAF), + MK_64(0x1CAEC6FD,0x1983A898), + MK_64(0x6E510B8B,0xCDD0589F), + MK_64(0x77E2BDFD,0xC6394ADA), + MK_64(0xC11E1DB5,0x24DCB0A3), + MK_64(0xD6D14AF9,0xC6329AB5), + MK_64(0x6A9B0BFC,0x6EB67E0D), + MK_64(0x9243C60D,0xCCFF1332), + MK_64(0x1A1F1DDE,0x743F02D4), + MK_64(0x0996753C,0x10ED0BB8), + MK_64(0x6572DD22,0xF2B4969A), + MK_64(0x61FD3062,0xD00A579A), + MK_64(0x1DE0536E,0x8682E539) + }; + +#endif /* _SKEIN_IV_H_ */ Index: sys/contrib/skein/skein_port.h =================================================================== --- /dev/null +++ sys/contrib/skein/skein_port.h @@ -0,0 +1,124 @@ +#ifndef _SKEIN_PORT_H_ +#define _SKEIN_PORT_H_ +/******************************************************************* +** +** Platform-specific definitions for Skein hash function. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +** Many thanks to Brian Gladman for his portable header files. +** +** To port Skein to an "unsupported" platform, change the definitions +** in this file appropriately. +** +********************************************************************/ + +#include "brg_types.h" /* get integer type definitions */ + +typedef unsigned int uint_t; /* native unsigned integer */ +typedef uint_8t u08b_t; /* 8-bit unsigned integer */ +typedef uint_64t u64b_t; /* 64-bit unsigned integer */ + +#ifndef RotL_64 +#define RotL_64(x,N) (((x) << (N)) | ((x) >> (64-(N)))) +#endif + +/* + * Skein is "natively" little-endian (unlike SHA-xxx), for optimal + * performance on x86 CPUs. 
The Skein code requires the following + * definitions for dealing with endianness: + * + * SKEIN_NEED_SWAP: 0 for little-endian, 1 for big-endian + * Skein_Put64_LSB_First + * Skein_Get64_LSB_First + * Skein_Swap64 + * + * If SKEIN_NEED_SWAP is defined at compile time, it is used here + * along with the portable versions of Put64/Get64/Swap64, which + * are slow in general. + * + * Otherwise, an "auto-detect" of endianness is attempted below. + * If the default handling doesn't work well, the user may insert + * platform-specific code instead (e.g., for big-endian CPUs). + * + */ +#ifndef SKEIN_NEED_SWAP /* compile-time "override" for endianness? */ + +#include "brg_endian.h" /* get endianness selection */ +#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN + /* here for big-endian CPUs */ +#define SKEIN_NEED_SWAP (1) +#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + /* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */ +#define SKEIN_NEED_SWAP (0) +#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */ +#define Skein_Put64_LSB_First(dst08,src64,bCnt) memcpy(dst08,src64,bCnt) +#define Skein_Get64_LSB_First(dst64,src08,wCnt) memcpy(dst64,src08,8*(wCnt)) +#endif +#else +#error "Skein needs endianness setting!" +#endif + +#endif /* ifndef SKEIN_NEED_SWAP */ + +/* + ****************************************************************** + * Provide any definitions still needed. 
+ ****************************************************************** + */ +#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */ +#if SKEIN_NEED_SWAP +#define Skein_Swap64(w64) \ + ( (( ((u64b_t)(w64)) & 0xFF) << 56) | \ + (((((u64b_t)(w64)) >> 8) & 0xFF) << 48) | \ + (((((u64b_t)(w64)) >>16) & 0xFF) << 40) | \ + (((((u64b_t)(w64)) >>24) & 0xFF) << 32) | \ + (((((u64b_t)(w64)) >>32) & 0xFF) << 24) | \ + (((((u64b_t)(w64)) >>40) & 0xFF) << 16) | \ + (((((u64b_t)(w64)) >>48) & 0xFF) << 8) | \ + (((((u64b_t)(w64)) >>56) & 0xFF) ) ) +#else +#define Skein_Swap64(w64) (w64) +#endif +#endif /* ifndef Skein_Swap64 */ + + +#ifndef Skein_Put64_LSB_First +void Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt) +#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ + { /* this version is fully portable (big-endian or little-endian), but slow */ + size_t n; + + for (n=0;n<bCnt;n++) /* output byte n is byte (n&7) of word n>>3, least-significant byte first */ + dst[n] = (u08b_t) (src[n>>3] >> (8*(n&7))); + } +#else + ; /* output only the function prototype */ +#endif +#endif /* ifndef Skein_Put64_LSB_First */ + + +#ifndef Skein_Get64_LSB_First +void Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt) +#ifdef SKEIN_PORT_CODE /* instantiate the function code here?
*/ + { /* this version is fully portable (big-endian or little-endian), but slow */ + size_t n; + + for (n=0;n<8*wCnt;n+=8) + dst[n/8] = (((u64b_t) src[n ]) ) + + (((u64b_t) src[n+1]) << 8) + + (((u64b_t) src[n+2]) << 16) + + (((u64b_t) src[n+3]) << 24) + + (((u64b_t) src[n+4]) << 32) + + (((u64b_t) src[n+5]) << 40) + + (((u64b_t) src[n+6]) << 48) + + (((u64b_t) src[n+7]) << 56) ; + } +#else + ; /* output only the function prototype */ +#endif +#endif /* ifndef Skein_Get64_LSB_First */ + +#endif /* ifndef _SKEIN_PORT_H_ */ Index: sys/crypto/skein/amd64/skein_block_asm.s =================================================================== --- /dev/null +++ sys/crypto/skein/amd64/skein_block_asm.s @@ -0,0 +1,1328 @@ +# +#---------------------------------------------------------------- +# 64-bit x86 assembler code (gnu as) for Skein block functions +# +# Author: Doug Whiting, Hifn/Exar +# +# This code is released to the public domain. +#---------------------------------------------------------------- +# + .text + .altmacro + .psize 0,128 #list file has no page boundaries +# +_MASK_ALL_ = (256+512+1024) #all three algorithm bits +_MAX_FRAME_ = 240 +# +################# +.ifndef SKEIN_USE_ASM +_USE_ASM_ = _MASK_ALL_ +.else +_USE_ASM_ = SKEIN_USE_ASM +.endif +################# +.ifndef SKEIN_LOOP #configure loop unrolling +_SKEIN_LOOP = 2 #default is fully unrolled for 256/512, twice for 1024 +.else +_SKEIN_LOOP = SKEIN_LOOP + .irp _NN_,%_SKEIN_LOOP #only display loop unrolling if default changed on command line +.print "+++ SKEIN_LOOP = \_NN_" + .endr +.endif +# the unroll counts (0 --> fully unrolled) +SKEIN_UNROLL_256 = (_SKEIN_LOOP / 100) % 10 +SKEIN_UNROLL_512 = (_SKEIN_LOOP / 10) % 10 +SKEIN_UNROLL_1024 = (_SKEIN_LOOP ) % 10 +# +SKEIN_ASM_UNROLL = 0 + .irp _NN_,256,512,1024 + .if (SKEIN_UNROLL_\_NN_) == 0 +SKEIN_ASM_UNROLL = SKEIN_ASM_UNROLL + \_NN_ + .endif + .endr +################# +# +.ifndef SKEIN_ROUNDS +ROUNDS_256 = 72 +ROUNDS_512 = 72 +ROUNDS_1024 = 80 
+.else +ROUNDS_256 = 8*((((SKEIN_ROUNDS / 100) + 5) % 10) + 5) +ROUNDS_512 = 8*((((SKEIN_ROUNDS / 10) + 5) % 10) + 5) +ROUNDS_1024 = 8*((((SKEIN_ROUNDS ) + 5) % 10) + 5) +# only display rounds if default size is changed on command line +.irp _NN_,256,512,1024 + .if _USE_ASM_ && \_NN_ + .irp _RR_,%(ROUNDS_\_NN_) + .if _NN_ < 1024 +.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" + .else +.print "+++ SKEIN_ROUNDS_\_NN_ = \_RR_" + .endif + .endr + .endif +.endr +.endif +################# +# +.ifdef SKEIN_CODE_SIZE +_SKEIN_CODE_SIZE = (1) +.else +.ifdef SKEIN_PERF #use code size if SKEIN_PERF is defined +_SKEIN_CODE_SIZE = (1) +.else +_SKEIN_CODE_SIZE = (0) +.endif +.endif +# +################# +# +.ifndef SKEIN_DEBUG +_SKEIN_DEBUG = 0 +.else +_SKEIN_DEBUG = 1 +.endif +################# +# +# define offsets of fields in hash context structure +# +HASH_BITS = 0 #bits of hash output +BCNT = 8 + HASH_BITS #number of bytes in BUFFER[] +TWEAK = 8 + BCNT #tweak values[0..1] +X_VARS = 16 + TWEAK #chaining vars +# +#(Note: buffer[] in context structure is NOT needed here :-) +# +KW_PARITY = 0x1BD11BDAA9FC1A22 #overall parity of key schedule words +FIRST_MASK = ~ (1 << 6) +FIRST_MASK64= ~ (1 << 62) +# +# rotation constants for Skein +# +RC_256_0_0 = 14 +RC_256_0_1 = 16 + +RC_256_1_0 = 52 +RC_256_1_1 = 57 + +RC_256_2_0 = 23 +RC_256_2_1 = 40 + +RC_256_3_0 = 5 +RC_256_3_1 = 37 + +RC_256_4_0 = 25 +RC_256_4_1 = 33 + +RC_256_5_0 = 46 +RC_256_5_1 = 12 + +RC_256_6_0 = 58 +RC_256_6_1 = 22 + +RC_256_7_0 = 32 +RC_256_7_1 = 32 + +RC_512_0_0 = 46 +RC_512_0_1 = 36 +RC_512_0_2 = 19 +RC_512_0_3 = 37 + +RC_512_1_0 = 33 +RC_512_1_1 = 27 +RC_512_1_2 = 14 +RC_512_1_3 = 42 + +RC_512_2_0 = 17 +RC_512_2_1 = 49 +RC_512_2_2 = 36 +RC_512_2_3 = 39 + +RC_512_3_0 = 44 +RC_512_3_1 = 9 +RC_512_3_2 = 54 +RC_512_3_3 = 56 + +RC_512_4_0 = 39 +RC_512_4_1 = 30 +RC_512_4_2 = 34 +RC_512_4_3 = 24 + +RC_512_5_0 = 13 +RC_512_5_1 = 50 +RC_512_5_2 = 10 +RC_512_5_3 = 17 + +RC_512_6_0 = 25 +RC_512_6_1 = 29 +RC_512_6_2 = 39 
+RC_512_6_3 = 43 + +RC_512_7_0 = 8 +RC_512_7_1 = 35 +RC_512_7_2 = 56 +RC_512_7_3 = 22 + +RC_1024_0_0 = 24 +RC_1024_0_1 = 13 +RC_1024_0_2 = 8 +RC_1024_0_3 = 47 +RC_1024_0_4 = 8 +RC_1024_0_5 = 17 +RC_1024_0_6 = 22 +RC_1024_0_7 = 37 + +RC_1024_1_0 = 38 +RC_1024_1_1 = 19 +RC_1024_1_2 = 10 +RC_1024_1_3 = 55 +RC_1024_1_4 = 49 +RC_1024_1_5 = 18 +RC_1024_1_6 = 23 +RC_1024_1_7 = 52 + +RC_1024_2_0 = 33 +RC_1024_2_1 = 4 +RC_1024_2_2 = 51 +RC_1024_2_3 = 13 +RC_1024_2_4 = 34 +RC_1024_2_5 = 41 +RC_1024_2_6 = 59 +RC_1024_2_7 = 17 + +RC_1024_3_0 = 5 +RC_1024_3_1 = 20 +RC_1024_3_2 = 48 +RC_1024_3_3 = 41 +RC_1024_3_4 = 47 +RC_1024_3_5 = 28 +RC_1024_3_6 = 16 +RC_1024_3_7 = 25 + +RC_1024_4_0 = 41 +RC_1024_4_1 = 9 +RC_1024_4_2 = 37 +RC_1024_4_3 = 31 +RC_1024_4_4 = 12 +RC_1024_4_5 = 47 +RC_1024_4_6 = 44 +RC_1024_4_7 = 30 + +RC_1024_5_0 = 16 +RC_1024_5_1 = 34 +RC_1024_5_2 = 56 +RC_1024_5_3 = 51 +RC_1024_5_4 = 4 +RC_1024_5_5 = 53 +RC_1024_5_6 = 42 +RC_1024_5_7 = 41 + +RC_1024_6_0 = 31 +RC_1024_6_1 = 44 +RC_1024_6_2 = 47 +RC_1024_6_3 = 46 +RC_1024_6_4 = 19 +RC_1024_6_5 = 42 +RC_1024_6_6 = 44 +RC_1024_6_7 = 25 + +RC_1024_7_0 = 9 +RC_1024_7_1 = 48 +RC_1024_7_2 = 35 +RC_1024_7_3 = 52 +RC_1024_7_4 = 23 +RC_1024_7_5 = 31 +RC_1024_7_6 = 37 +RC_1024_7_7 = 20 +# +# Input: reg +# Output: <<< RC_BlkSize_roundNum_mixNum, BlkSize=256/512/1024 +# +.macro RotL64 reg,BLK_SIZE,ROUND_NUM,MIX_NUM +_RCNT_ = RC_\BLK_SIZE&_\ROUND_NUM&_\MIX_NUM + .if _RCNT_ #is there anything to do? 
+ rolq $_RCNT_,%\reg + .endif +.endm +# +#---------------------------------------------------------------- +# +# MACROS: define local vars and configure stack +# +#---------------------------------------------------------------- +# declare allocated space on the stack +.macro StackVar localName,localSize +\localName = _STK_OFFS_ +_STK_OFFS_ = _STK_OFFS_+(\localSize) +.endm #StackVar +# +#---------------------------------------------------------------- +# +# MACRO: Configure stack frame, allocate local vars +# +.macro Setup_Stack BLK_BITS,KS_CNT,debugCnt + WCNT = (\BLK_BITS)/64 +# +_PushCnt_ = 0 #save nonvolatile regs on stack + .irp _reg_,rbp,rbx,r12,r13,r14,r15 + pushq %\_reg_ +_PushCnt_ = _PushCnt_ + 1 #track count to keep alignment + .endr +# +_STK_OFFS_ = 0 #starting offset from rsp + #---- local variables #<-- rsp + StackVar X_stk ,8*(WCNT) #local context vars + StackVar ksTwk ,8*3 #key schedule: tweak words + StackVar ksKey ,8*(WCNT)+8 #key schedule: key words + .if (SKEIN_ASM_UNROLL && (\BLK_BITS)) == 0 + StackVar ksRot ,16*(\KS_CNT) #leave space for "rotation" to happen + .endif + StackVar Wcopy ,8*(WCNT) #copy of input block + .if _SKEIN_DEBUG + .if \debugCnt + 0 #temp location for debug X[] info + StackVar xDebug_\BLK_BITS ,8*(\debugCnt) + .endif + .endif + .if ((8*_PushCnt_ + _STK_OFFS_) % 8) == 0 + StackVar align16,8 #keep 16-byte aligned (adjust for retAddr?) 
+tmpStk_\BLK_BITS = align16 #use this + .endif + #---- saved caller parameters (from regs rdi, rsi, rdx, rcx) + StackVar ctxPtr ,8 #context ptr + StackVar blkPtr ,8 #pointer to block data + StackVar blkCnt ,8 #number of full blocks to process + StackVar bitAdd ,8 #bit count to add to tweak +LOCAL_SIZE = _STK_OFFS_ #size of "local" vars + #---- + StackVar savRegs,8*_PushCnt_ #saved registers + StackVar retAddr,8 #return address + #---- caller's stack frame (aligned mod 16) +# +# set up the stack frame pointer (rbp) +# +FRAME_OFFS = ksTwk + 128 #allow short (negative) offset to ksTwk, kwKey + .if FRAME_OFFS > _STK_OFFS_ #keep rbp in the "locals" range +FRAME_OFFS = _STK_OFFS_ + .endif +F_O = -FRAME_OFFS +# + #put some useful defines in the .lst file (for grep) +__STK_LCL_SIZE_\BLK_BITS = LOCAL_SIZE +__STK_TOT_SIZE_\BLK_BITS = _STK_OFFS_ +__STK_FRM_OFFS_\BLK_BITS = FRAME_OFFS +# +# Notes on stack frame setup: +# * the most frequently used variable is X_stk[], based at [rsp+0] +# * the next most used is the key schedule arrays, ksKey and ksTwk +# so rbp is "centered" there, allowing short offsets to the key +# schedule even in 1024-bit Skein case +# * the Wcopy variables are infrequently accessed, but they have long +# offsets from both rsp and rbp only in the 1024-bit case. +# * all other local vars and calling parameters can be accessed +# with short offsets, except in the 1024-bit case +# + subq $LOCAL_SIZE,%rsp #make room for the locals + leaq FRAME_OFFS(%rsp),%rbp #maximize use of short offsets + movq %rdi, ctxPtr+F_O(%rbp) #save caller's parameters on the stack + movq %rsi, blkPtr+F_O(%rbp) + movq %rdx, blkCnt+F_O(%rbp) + movq %rcx, bitAdd+F_O(%rbp) +# +.endm #Setup_Stack +# +#---------------------------------------------------------------- +# +.macro Reset_Stack + addq $LOCAL_SIZE,%rsp #get rid of locals (wipe??) 
+ .irp _reg_,r15,r14,r13,r12,rbx,rbp + popq %\_reg_ #restore caller's regs +_PushCnt_ = _PushCnt_ - 1 + .endr + .if _PushCnt_ + .error "Mismatched push/pops?" + .endif +.endm # Reset_Stack +# +#---------------------------------------------------------------- +# macros to help debug internals +# +.if _SKEIN_DEBUG + .extern Skein_Show_Block #calls to C routines + .extern Skein_Show_Round +# +SKEIN_RND_SPECIAL = 1000 +SKEIN_RND_KEY_INITIAL = SKEIN_RND_SPECIAL+0 +SKEIN_RND_KEY_INJECT = SKEIN_RND_SPECIAL+1 +SKEIN_RND_FEED_FWD = SKEIN_RND_SPECIAL+2 +# +.macro Skein_Debug_Block BLK_BITS +# +#void Skein_Show_Block(uint_t bits,const Skein_Ctxt_Hdr_t *h,const u64b_t *X, +# const u08b_t *blkPtr, const u64b_t *wPtr, +# const u64b_t *ksPtr,const u64b_t *tsPtr) +# +_NN_ = 0 + .irp _reg_,rax,rcx,rdx,rsi,rdi,r8,r9,r10,r11 + pushq %\_reg_ #save all volatile regs on stack before the call +_NN_ = _NN_ + 1 + .endr + # get and push call parameters + movq $\BLK_BITS ,%rdi #bits + movq ctxPtr+F_O(%rbp),%rsi #h (pointer) + leaq X_VARS (%rsi),%rdx #X (pointer) + movq blkPtr+F_O(%rbp),%rcx #blkPtr + leaq Wcopy +F_O(%rbp),%r8 #wPtr + leaq ksKey +F_O(%rbp),%r9 #key pointer + leaq ksTwk +F_O(%rbp),%rax #tweak pointer + pushq %rax # (pass on the stack) + call Skein_Show_Block #call external debug handler + addq $8*1,%rsp #discard parameters on stack + .if (_NN_ % 2 ) == 0 #check stack alignment + .error "Stack misalignment problem in Skein_Debug_Block_\_BLK_BITS" + .endif + .irp _reg_,r11,r10,r9,r8,rdi,rsi,rdx,rcx,rax + popq %\_reg_ #restore regs +_NN_ = _NN_ - 1 + .endr + .if _NN_ + .error "Push/pop mismatch problem in Skein_Debug_Block_\_BLK_BITS" + .endif +.endm # Skein_Debug_Block +# +# the macro to "call" to debug a round +# +.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp + # call the appropriate (local) debug "function" + pushq %rdx #save rdx, so we can use it for round "number" + .if (SKEIN_ASM_UNROLL && \BLK_BITS) || (\R >= SKEIN_RND_SPECIAL) + movq $\R,%rdx + .else #compute round 
number using edi +_rOffs_ = \RDI_OFFS + 0 + .if \BLK_BITS == 1024 + movq rIdx_offs+8(%rsp),%rdx #get rIdx off the stack (adjust for pushq rdx above) + leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdx,4),%rdx + .else + leaq 1+(((\R)-1) && 3)+_rOffs_(,%rdi,4),%rdx + .endif + .endif + call Skein_Debug_Round_\BLK_BITS + popq %rdx #restore original rdx value +# + afterOp +.endm # Skein_Debug_Round +.else #------- _SKEIN_DEBUG (dummy macros if debug not enabled) +.macro Skein_Debug_Block BLK_BITS +.endm +# +.macro Skein_Debug_Round BLK_BITS,R,RDI_OFFS,afterOp +.endm +# +.endif # _SKEIN_DEBUG +# +#---------------------------------------------------------------- +# +.macro addReg dstReg,srcReg_A,srcReg_B,useAddOp,immOffs + .if \immOffs + 0 + leaq \immOffs(%\srcReg_A\srcReg_B,%\dstReg),%\dstReg + .elseif ((\useAddOp + 0) == 0) + .ifndef ASM_NO_LEA #lea seems to be faster on Core 2 Duo CPUs! + leaq (%\srcReg_A\srcReg_B,%\dstReg),%\dstReg + .else + addq %\srcReg_A\srcReg_B,%\dstReg + .endif + .else + addq %\srcReg_A\srcReg_B,%\dstReg + .endif +.endm + +# keep Intel-style ordering here, to match addReg +.macro xorReg dstReg,srcReg_A,srcReg_B + xorq %\srcReg_A\srcReg_B,%\dstReg +.endm +# +#---------------------------------------------------------------- +# +.macro C_label lName + \lName: #use both "genders" to work across linkage conventions +_\lName: + .global \lName + .global _\lName +.endm +# +#=================================== Skein_256 ============================================= +# +.if _USE_ASM_ & 256 +# +# void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# +# +################# +# +# code +# +C_label Skein_256_Process_Block + Setup_Stack 256,((ROUNDS_256/8)+1) + movq TWEAK+8(%rdi),%r14 + jmp Skein_256_block_loop + .p2align 4 + # main hash loop for Skein_256 +Skein_256_block_loop: + # + # general register usage: + # RAX..RDX = X0..X3 + # R08..R12 = ks[0..4] + # R13..R15 = ts[0..2] + # RSP, RBP = stack/frame pointers + # RDI = 
round counter or context pointer + # RSI = temp + # + movq TWEAK+0(%rdi) ,%r13 + addq bitAdd+F_O(%rbp) ,%r13 #computed updated tweak value T0 + movq %r14 ,%r15 + xorq %r13 ,%r15 #now %r13.%r15 is set as the tweak + + movq $KW_PARITY ,%r12 + movq X_VARS+ 0(%rdi),%r8 + movq X_VARS+ 8(%rdi),%r9 + movq X_VARS+16(%rdi),%r10 + movq X_VARS+24(%rdi),%r11 + movq %r13,TWEAK+0(%rdi) #save updated tweak value ctx->h.T[0] + xorq %r8 ,%r12 #start accumulating overall parity + + movq blkPtr +F_O(%rbp) ,%rsi #esi --> input block + xorq %r9 ,%r12 + movq 0(%rsi) ,%rax #get X[0..3] + xorq %r10 ,%r12 + movq 8(%rsi) ,%rbx + xorq %r11 ,%r12 + movq 16(%rsi) ,%rcx + movq 24(%rsi) ,%rdx + + movq %rax,Wcopy+ 0+F_O(%rbp) #save copy of input block + movq %rbx,Wcopy+ 8+F_O(%rbp) + movq %rcx,Wcopy+16+F_O(%rbp) + movq %rdx,Wcopy+24+F_O(%rbp) + + addq %r8 ,%rax #initial key injection + addq %r9 ,%rbx + addq %r10,%rcx + addq %r11,%rdx + addq %r13,%rbx + addq %r14,%rcx + +.if _SKEIN_DEBUG + movq %r14,TWEAK+ 8(%rdi) #save updated tweak T[1] (start bit cleared?) 
+ movq %r8 ,ksKey+ 0+F_O(%rbp) #save key schedule on stack for Skein_Debug_Block + movq %r9 ,ksKey+ 8+F_O(%rbp) + movq %r10,ksKey+16+F_O(%rbp) + movq %r11,ksKey+24+F_O(%rbp) + movq %r12,ksKey+32+F_O(%rbp) + + movq %r13,ksTwk+ 0+F_O(%rbp) + movq %r14,ksTwk+ 8+F_O(%rbp) + movq %r15,ksTwk+16+F_O(%rbp) + + movq %rax,X_stk + 0(%rsp) #save X[] on stack for Skein_Debug_Block + movq %rbx,X_stk + 8(%rsp) + movq %rcx,X_stk +16(%rsp) + movq %rdx,X_stk +24(%rsp) + + Skein_Debug_Block 256 #debug dump + Skein_Debug_Round 256,SKEIN_RND_KEY_INITIAL +.endif +# +.if ((SKEIN_ASM_UNROLL & 256) == 0) + movq %r8 ,ksKey+40+F_O(%rbp) #save key schedule on stack for looping code + movq %r9 ,ksKey+ 8+F_O(%rbp) + movq %r10,ksKey+16+F_O(%rbp) + movq %r11,ksKey+24+F_O(%rbp) + movq %r12,ksKey+32+F_O(%rbp) + + movq %r13,ksTwk+24+F_O(%rbp) + movq %r14,ksTwk+ 8+F_O(%rbp) + movq %r15,ksTwk+16+F_O(%rbp) +.endif + addq $WCNT*8,%rsi #skip the block + movq %rsi,blkPtr +F_O(%rbp) #update block pointer + # + # now the key schedule is computed. 
Start the rounds + # +.if SKEIN_ASM_UNROLL & 256 +_UNROLL_CNT = ROUNDS_256/8 +.else +_UNROLL_CNT = SKEIN_UNROLL_256 + .if ((ROUNDS_256/8) % _UNROLL_CNT) + .error "Invalid SKEIN_UNROLL_256" + .endif + xorq %rdi,%rdi #rdi = iteration count +Skein_256_round_loop: +.endif +_Rbase_ = 0 +.rept _UNROLL_CNT*2 + # all X and ks vars in regs # (ops to "rotate" ks vars, via mem, if not unrolled) + # round 4*_RBase_ + 0 + addReg rax, rbx + RotL64 rbx, 256,%((4*_Rbase_+0) % 8),0 + addReg rcx, rdx + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksKey+8*1+F_O(%rbp,%rdi,8),%r8 + .endif + xorReg rbx, rax + RotL64 rdx, 256,%((4*_Rbase_+0) % 8),1 + xorReg rdx, rcx + .if SKEIN_ASM_UNROLL & 256 + .irp _r0_,%( 8+(_Rbase_+3) % 5) + .irp _r1_,%(13+(_Rbase_+2) % 3) + leaq (%r\_r0_,%r\_r1_),%rdi #precompute key injection value for %rcx + .endr + .endr + .endif + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksTwk+8*1+F_O(%rbp,%rdi,8),%r13 + .endif + Skein_Debug_Round 256,%(4*_Rbase_+1) + + # round 4*_Rbase_ + 1 + addReg rax, rdx + RotL64 rdx, 256,%((4*_Rbase_+1) % 8),0 + xorReg rdx, rax + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksKey+8*2+F_O(%rbp,%rdi,8),%r9 + .endif + addReg rcx, rbx + RotL64 rbx, 256,%((4*_Rbase_+1) % 8),1 + xorReg rbx, rcx + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksKey+8*4+F_O(%rbp,%rdi,8),%r11 + .endif + Skein_Debug_Round 256,%(4*_Rbase_+2) + .if SKEIN_ASM_UNROLL & 256 + .irp _r0_,%( 8+(_Rbase_+2) % 5) + .irp _r1_,%(13+(_Rbase_+1) % 3) + leaq (%r\_r0_,%r\_r1_),%rsi #precompute key injection value for %rbx + .endr + .endr + .endif + # round 4*_Rbase_ + 2 + addReg rax, rbx + RotL64 rbx, 256,%((4*_Rbase_+2) % 8),0 + addReg rcx, rdx + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq ksKey+8*3+F_O(%rbp,%rdi,8),%r10 + .endif + xorReg rbx, rax + RotL64 rdx, 256,%((4*_Rbase_+2) % 8),1 + xorReg rdx, rcx + .if (SKEIN_ASM_UNROLL & 256) == 0 + movq %r8,ksKey+8*6+F_O(%rbp,%rdi,8) #"rotate" the key + leaq 1(%r11,%rdi),%r11 #precompute key + tweak + .endif + Skein_Debug_Round 256,%(4*_Rbase_+3) + # 
round 4*_Rbase_ + 3 + addReg rax, rdx + RotL64 rdx, 256,%((4*_Rbase_+3) % 8),0 + addReg rcx, rbx + .if (SKEIN_ASM_UNROLL & 256) == 0 + addq ksTwk+8*2+F_O(%rbp,%rdi,8),%r10 #precompute key + tweak + movq %r13,ksTwk+8*4+F_O(%rbp,%rdi,8) #"rotate" the tweak + .endif + xorReg rdx, rax + RotL64 rbx, 256,%((4*_Rbase_+3) % 8),1 + xorReg rbx, rcx + Skein_Debug_Round 256,%(4*_Rbase_+4) + .if (SKEIN_ASM_UNROLL & 256) == 0 + addReg r9 ,r13 #precompute key+tweak + .endif + #inject key schedule words +_Rbase_ = _Rbase_+1 + .if SKEIN_ASM_UNROLL & 256 + addReg rax,r,%(8+((_Rbase_+0) % 5)) + addReg rbx,rsi + addReg rcx,rdi + addReg rdx,r,%(8+((_Rbase_+3) % 5)),,_Rbase_ + .else + incq %rdi + addReg rax,r8 + addReg rcx,r10 + addReg rbx,r9 + addReg rdx,r11 + .endif + Skein_Debug_Round 256,SKEIN_RND_KEY_INJECT +.endr #rept _UNROLL_CNT +# +.if (SKEIN_ASM_UNROLL & 256) == 0 + cmpq $2*(ROUNDS_256/8),%rdi + jb Skein_256_round_loop +.endif # (SKEIN_ASM_UNROLL & 256) == 0 + movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context + + #---------------------------- + # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..3} + movq $FIRST_MASK64 ,%r14 + xorq Wcopy + 0+F_O (%rbp),%rax + xorq Wcopy + 8+F_O (%rbp),%rbx + xorq Wcopy +16+F_O (%rbp),%rcx + xorq Wcopy +24+F_O (%rbp),%rdx + andq TWEAK + 8 (%rdi),%r14 + movq %rax,X_VARS+ 0(%rdi) #store final result + movq %rbx,X_VARS+ 8(%rdi) + movq %rcx,X_VARS+16(%rdi) + movq %rdx,X_VARS+24(%rdi) + + Skein_Debug_Round 256,SKEIN_RND_FEED_FWD + + # go back for more blocks, if needed + decq blkCnt+F_O(%rbp) + jnz Skein_256_block_loop + movq %r14,TWEAK + 8(%rdi) + Reset_Stack + ret +Skein_256_Process_Block_End: + + .if _SKEIN_DEBUG +Skein_Debug_Round_256: #here with rdx == round "number" from macro + pushq %rsi #save two regs for BLK_BITS-specific parms + pushq %rdi + movq 24(%rsp),%rdi #get back original rdx (pushed on stack in macro call) to rdi + movq %rax,X_stk+ 0+F_O(%rbp) #save X[] state on stack so debug routines can access it + movq %rbx,X_stk+ 8+F_O(%rbp) 
#(use FP_ since rsp has changed!) + movq %rcx,X_stk+16+F_O(%rbp) + movq %rdi,X_stk+24+F_O(%rbp) + + movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr + movq $256,%rdi #now are set for the call + jmp Skein_Debug_Round_Common + .endif +# +.if _SKEIN_CODE_SIZE +C_label Skein_256_Process_Block_CodeSize + movq $(Skein_256_Process_Block_End-Skein_256_Process_Block),%rax + ret +# +C_label Skein_256_Unroll_Cnt + .if _UNROLL_CNT <> ROUNDS_256/8 + movq $_UNROLL_CNT,%rax + .else + xorq %rax,%rax + .endif + ret +.endif +# +.endif #_USE_ASM_ & 256 +# +#=================================== Skein_512 ============================================= +# +.if _USE_ASM_ & 512 +# +# void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd) +# +# X[i] == %r[8+i] #register assignments for X[] values during rounds (i=0..7) +# +################# +# MACRO: one round for 512-bit blocks +# +.macro R_512_OneRound rn0,rn1,rn2,rn3,rn4,rn5,rn6,rn7,_Rn_,op1,op2,op3,op4 +# + addReg r\rn0, r\rn1 + RotL64 r\rn1, 512,%((_Rn_) % 8),0 + xorReg r\rn1, r\rn0 + op1 + addReg r\rn2, r\rn3 + RotL64 r\rn3, 512,%((_Rn_) % 8),1 + xorReg r\rn3, r\rn2 + op2 + addReg r\rn4, r\rn5 + RotL64 r\rn5, 512,%((_Rn_) % 8),2 + xorReg r\rn5, r\rn4 + op3 + addReg r\rn6, r\rn7 + RotL64 r\rn7, 512,%((_Rn_) % 8),3 + xorReg r\rn7, r\rn6 + op4 + Skein_Debug_Round 512,%(_Rn_+1),-4 +# +.endm #R_512_OneRound +# +################# +# MACRO: eight rounds for 512-bit blocks +# +.macro R_512_FourRounds _RR_ #RR = base round number (0 % 8) + .if (SKEIN_ASM_UNROLL && 512) + # here for fully unrolled case. 
+ _II_ = ((_RR_)/4) + 1 #key injection counter + R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),,, + R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),,, + R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),,, + R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),, + # inject the key schedule + addq ksKey+8*(((_II_)+0)%9)+F_O(%rbp),%r8 + addReg r11, rax + addq ksKey+8*(((_II_)+1)%9)+F_O(%rbp),%r9 + addReg r12, rbx + addq ksKey+8*(((_II_)+2)%9)+F_O(%rbp),%r10 + addReg r13, rcx + addReg r14, rdx + addReg r15, rsi,,,(_II_) + .else + # here for looping case #"rotate" key/tweak schedule (move up on stack) + incq %rdi #bump key injection counter + R_512_OneRound 8, 9,10,11,12,13,14,15,%((_RR_)+0),,, + R_512_OneRound 10, 9,12,15,14,13, 8,11,%((_RR_)+1),,, + R_512_OneRound 12, 9,14,11, 8,13,10,15,%((_RR_)+2),,, + R_512_OneRound 14, 9, 8,15,10,13,12,11,%((_RR_)+3),, + # inject the key schedule + addq ksKey+8*0+F_O(%rbp,%rdi,8),%r8 + addReg r11, rax + addReg r12, rbx + addq ksKey+8*1+F_O(%rbp,%rdi,8),%r9 + addReg r13, rcx + addReg r14, rdx + addq ksKey+8*2+F_O(%rbp,%rdi,8),%r10 + addReg r15, rsi + addReg r15, rdi #inject the round number + .endif + + #show the result of the key injection + Skein_Debug_Round 512,SKEIN_RND_KEY_INJECT +.endm #R_512_EightRounds +# +################# +# instantiated code +# +C_label Skein_512_Process_Block + Setup_Stack 512,ROUNDS_512/8 + movq TWEAK+ 8(%rdi),%rbx + jmp Skein_512_block_loop + .p2align 4 + # main hash loop for Skein_512 +Skein_512_block_loop: + # general register usage: + # RAX..RDX = temps for key schedule pre-loads + # R8 ..R15 = X0..X7 + # RSP, RBP = stack/frame pointers + # RDI = round counter or context pointer + # RSI = temp + # + movq TWEAK + 0(%rdi),%rax + addq bitAdd+F_O(%rbp),%rax #computed updated tweak value T0 + movq %rbx,%rcx + xorq %rax,%rcx #%rax/%rbx/%rcx = tweak schedule + movq %rax,TWEAK+ 0 (%rdi) #save updated tweak value ctx->h.T[0] + movq %rax,ksTwk+ 0+F_O(%rbp) + movq $KW_PARITY,%rdx + movq blkPtr 
+F_O(%rbp),%rsi #%rsi --> input block + movq %rbx,ksTwk+ 8+F_O(%rbp) + movq %rcx,ksTwk+16+F_O(%rbp) + .irp _Rn_,8,9,10,11,12,13,14,15 + movq X_VARS+8*(_Rn_-8)(%rdi),%r\_Rn_ + xorq %r\_Rn_,%rdx #compute overall parity + movq %r\_Rn_,ksKey+8*(_Rn_-8)+F_O(%rbp) + .endr #load state into %r8 ..%r15, compute parity + movq %rdx,ksKey+8*(8)+F_O(%rbp)#save key schedule parity + + addReg r13,rax #precompute key injection for tweak + addReg r14, rbx +.if _SKEIN_DEBUG + movq %rbx,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block below +.endif + movq 0(%rsi),%rax #load input block + movq 8(%rsi),%rbx + movq 16(%rsi),%rcx + movq 24(%rsi),%rdx + addReg r8 , rax #do initial key injection + addReg r9 , rbx + movq %rax,Wcopy+ 0+F_O(%rbp) #keep local copy for feedforward + movq %rbx,Wcopy+ 8+F_O(%rbp) + addReg r10, rcx + addReg r11, rdx + movq %rcx,Wcopy+16+F_O(%rbp) + movq %rdx,Wcopy+24+F_O(%rbp) + + movq 32(%rsi),%rax + movq 40(%rsi),%rbx + movq 48(%rsi),%rcx + movq 56(%rsi),%rdx + addReg r12, rax + addReg r13, rbx + addReg r14, rcx + addReg r15, rdx + movq %rax,Wcopy+32+F_O(%rbp) + movq %rbx,Wcopy+40+F_O(%rbp) + movq %rcx,Wcopy+48+F_O(%rbp) + movq %rdx,Wcopy+56+F_O(%rbp) + +.if _SKEIN_DEBUG + .irp _Rn_,8,9,10,11,12,13,14,15 #save values on stack for debug output + movq %r\_Rn_,X_stk+8*(_Rn_-8)(%rsp) + .endr + + Skein_Debug_Block 512 #debug dump + Skein_Debug_Round 512,SKEIN_RND_KEY_INITIAL +.endif + addq $8*WCNT,%rsi #skip the block + movq %rsi,blkPtr+F_O(%rbp) #update block pointer + # + ################# + # now the key schedule is computed. 
Start the rounds + # +.if SKEIN_ASM_UNROLL & 512 +_UNROLL_CNT = ROUNDS_512/8 +.else +_UNROLL_CNT = SKEIN_UNROLL_512 + .if ((ROUNDS_512/8) % _UNROLL_CNT) + .error "Invalid SKEIN_UNROLL_512" + .endif + xorq %rdi,%rdi #rdi = round counter +Skein_512_round_loop: +.endif +# +_Rbase_ = 0 +.rept _UNROLL_CNT*2 + R_512_FourRounds %(4*_Rbase_+00) +_Rbase_ = _Rbase_+1 +.endr #rept _UNROLL_CNT +# +.if (SKEIN_ASM_UNROLL & 512) == 0 + cmpq $2*(ROUNDS_512/8),%rdi + jb Skein_512_round_loop + movq ctxPtr +F_O(%rbp),%rdi #restore rdi --> context +.endif + # end of rounds + ################# + # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..7} + .irp _Rn_,8,9,10,11,12,13,14,15 + .if (_Rn_ == 8) + movq $FIRST_MASK64,%rbx + .endif + xorq Wcopy+8*(_Rn_-8)+F_O(%rbp),%r\_Rn_ #feedforward XOR + movq %r\_Rn_,X_VARS+8*(_Rn_-8)(%rdi) #and store result + .if (_Rn_ == 14) + andq TWEAK+ 8(%rdi),%rbx + .endif + .endr + Skein_Debug_Round 512,SKEIN_RND_FEED_FWD + + # go back for more blocks, if needed + decq blkCnt+F_O(%rbp) + jnz Skein_512_block_loop + movq %rbx,TWEAK + 8(%rdi) + + Reset_Stack + ret +Skein_512_Process_Block_End: +# + .if _SKEIN_DEBUG +# call here with rdx = "round number" +Skein_Debug_Round_512: + pushq %rsi #save two regs for BLK_BITS-specific parms + pushq %rdi + .irp _Rn_,8,9,10,11,12,13,14,15 #save X[] state on stack so debug routines can access it + movq %r\_Rn_,X_stk+8*(_Rn_-8)+F_O(%rbp) + .endr + movq ctxPtr+F_O(%rbp),%rsi #ctx_hdr_ptr + movq $512,%rdi #now are set for the call + jmp Skein_Debug_Round_Common + .endif +# +.if _SKEIN_CODE_SIZE +C_label Skein_512_Process_Block_CodeSize + movq $(Skein_512_Process_Block_End-Skein_512_Process_Block),%rax + ret +# +C_label Skein_512_Unroll_Cnt + .if _UNROLL_CNT <> (ROUNDS_512/8) + movq $_UNROLL_CNT,%rax + .else + xorq %rax,%rax + .endif + ret +.endif +# +.endif # _USE_ASM_ & 512 +# +#=================================== Skein1024 ============================================= +.if _USE_ASM_ & 1024 +# +# void 
Skein1024_Process_Block(Skein_1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t bitcntAdd)# +# +################# +# use details of permutation to make register assignments +# +o1K_rdi = 0 #offsets in X[] associated with each register +o1K_rsi = 1 +o1K_rbp = 2 +o1K_rax = 3 +o1K_rcx = 4 #rcx is "shared" with X6, since X4/X6 alternate +o1K_rbx = 5 +o1K_rdx = 7 +o1K_r8 = 8 +o1K_r9 = 9 +o1K_r10 = 10 +o1K_r11 = 11 +o1K_r12 = 12 +o1K_r13 = 13 +o1K_r14 = 14 +o1K_r15 = 15 +# +rIdx_offs = tmpStk_1024 +# +.macro r1024_Mix w0,w1,reg0,reg1,_RN0_,_Rn1_,op1 + addReg \reg0 , \reg1 #perform the MIX + RotL64 \reg1 , 1024,%((_RN0_) % 8),_Rn1_ + xorReg \reg1 , \reg0 +.if ((_RN0_) && 3) == 3 #time to do key injection? + .if _SKEIN_DEBUG + movq %\reg0 , xDebug_1024+8*w0(%rsp) #save intermediate values for Debug_Round + movq %\reg1 , xDebug_1024+8*w1(%rsp) # (before inline key injection) + .endif +_II_ = ((_RN0_)/4)+1 #injection count + .if SKEIN_ASM_UNROLL && 1024 #here to do fully unrolled key injection + addq ksKey+ 8*((_II_+w0) % 17)(%rsp),%\reg0 + addq ksKey+ 8*((_II_+w1) % 17)(%rsp),%\reg1 + .if w1 == 13 #tweak injection + addq ksTwk+ 8*((_II_+ 0) % 3)(%rsp),%\reg1 + .elseif w0 == 14 + addq ksTwk+ 8*((_II_+ 1) % 3)(%rsp),%\reg0 + .elseif w1 == 15 + addq $_II_, %\reg1 #(injection counter) + .endif + .else #here to do looping key injection + .if (w0 == 0) + movq %rdi, X_stk+8*w0(%rsp) #if so, store N0 so we can use reg as index + movq rIdx_offs(%rsp),%rdi #get the injection counter index into rdi + .else + addq ksKey+8+8*w0(%rsp,%rdi,8),%\reg0 #even key injection + .endif + .if w1 == 13 #tweak injection + addq ksTwk+8+8* 0(%rsp,%rdi,8),%\reg1 + .elseif w0 == 14 + addq ksTwk+8+8* 1(%rsp,%rdi,8),%\reg0 + .elseif w1 == 15 + addReg \reg1,rdi,,,1 #(injection counter) + .endif + addq ksKey+8+8*w1(%rsp,%rdi,8),%\reg1 #odd key injection + .endif +.endif + # insert the op provided, .if any + op1 +.endm +################# +# MACRO: four rounds for 1024-bit blocks +# +.macro 
r1024_FourRounds _RR_ #RR = base round number (0 mod 4) + # should be here with X4 set properly, X6 stored on stack + # NOTE(review): the r1024_Mix calls tagged "save X4/X6 on stack" and "load X4/X6 from stack" end in an empty op1 argument in this text, so no store/load is emitted for them; the operand looks lost in extraction - confirm against the upstream file +_Rn_ = (_RR_) + 0 + r1024_Mix 0, 1,rdi,rsi,_Rn_,0 + r1024_Mix 2, 3,rbp,rax,_Rn_,1 + r1024_Mix 4, 5,rcx,rbx,_Rn_,2, #save X4 on stack (x4/x6 alternate) + r1024_Mix 8, 9,r8 ,r9 ,_Rn_,4, #load X6 from stack + r1024_Mix 10,11,r10,r11,_Rn_,5 + r1024_Mix 12,13,r12,r13,_Rn_,6 + r1024_Mix 6, 7,rcx,rdx,_Rn_,3 + r1024_Mix 14,15,r14,r15,_Rn_,7 + .if _SKEIN_DEBUG + Skein_Debug_Round 1024,%(_Rn_+1) + .endif +_Rn_ = (_RR_) + 1 + r1024_Mix 0, 9,rdi,r9 ,_Rn_,0 + r1024_Mix 2,13,rbp,r13,_Rn_,1 + r1024_Mix 6,11,rcx,r11,_Rn_,2, #save X6 on stack (x4/x6 alternate) + r1024_Mix 10, 7,r10,rdx,_Rn_,4, #load X4 from stack + r1024_Mix 12, 3,r12,rax,_Rn_,5 + r1024_Mix 14, 5,r14,rbx,_Rn_,6 + r1024_Mix 4,15,rcx,r15,_Rn_,3 + r1024_Mix 8, 1,r8 ,rsi,_Rn_,7 + .if _SKEIN_DEBUG + Skein_Debug_Round 1024,%(_Rn_+1) + .endif +_Rn_ = (_RR_) + 2 + r1024_Mix 0, 7,rdi,rdx,_Rn_,0 + r1024_Mix 2, 5,rbp,rbx,_Rn_,1 + r1024_Mix 4, 3,rcx,rax,_Rn_,2, #save X4 on stack (x4/x6 alternate) + r1024_Mix 12,15,r12,r15,_Rn_,4, #load X6 from stack + r1024_Mix 14,13,r14,r13,_Rn_,5 + r1024_Mix 8,11,r8 ,r11,_Rn_,6 + r1024_Mix 6, 1,rcx,rsi,_Rn_,3 + r1024_Mix 10, 9,r10,r9 ,_Rn_,7 + .if _SKEIN_DEBUG + Skein_Debug_Round 1024,%(_Rn_+1) + .endif +_Rn_ = (_RR_) + 3 + r1024_Mix 0,15,rdi,r15,_Rn_,0 + r1024_Mix 2,11,rbp,r11,_Rn_,1 + r1024_Mix 6,13,rcx,r13,_Rn_,2, #save X6 on stack (x4/x6 alternate) + r1024_Mix 14, 1,r14,rsi,_Rn_,4, #load X4 from stack + r1024_Mix 8, 5,r8 ,rbx,_Rn_,5 + r1024_Mix 10, 3,r10,rax,_Rn_,6 + r1024_Mix 4, 9,rcx,r9 ,_Rn_,3 + r1024_Mix 12, 7,r12,rdx,_Rn_,7 + .if _SKEIN_DEBUG + Skein_Debug_Round 1024,%(_Rn_+1) + .endif + + .if (SKEIN_ASM_UNROLL && 1024) == 0 #here with rdi == rIdx, X0 on stack + #"rotate" the key schedule on the stack +i8 = o1K_r8 +i0 = o1K_rdi + movq %r8 , X_stk+8*i8(%rsp) #free up a register (save it on the stack) + movq ksKey+8* 0(%rsp,%rdi,8),%r8 #get key word + movq %r8 , 
ksKey+8*17(%rsp,%rdi,8) #rotate key (must do key first or tweak clobbers it!) + movq ksTwk+8* 0(%rsp,%rdi,8),%r8 #get tweak word + movq %r8 , ksTwk+8* 3(%rsp,%rdi,8) #rotate tweak (onto the stack) + movq X_stk+8*i8(%rsp) ,%r8 #get the reg back + incq %rdi #bump the index + movq %rdi, rIdx_offs (%rsp) #save rdi again + movq ksKey+8*i0(%rsp,%rdi,8),%rdi #get the key schedule word for X0 back + addq X_stk+8*i0(%rsp) ,%rdi #perform the X0 key injection + .endif + #show the result of the key injection + Skein_Debug_Round 1024,SKEIN_RND_KEY_INJECT +.endm #r1024_FourRounds +# +################ +# code +# +C_label Skein1024_Process_Block +# + Setup_Stack 1024,ROUNDS_1024/8,WCNT + movq TWEAK+ 8(%rdi),%r9 + jmp Skein1024_block_loop + # main hash loop for Skein1024 + .p2align 4 +Skein1024_block_loop: + # general register usage: + # RSP = stack pointer + # RAX..RDX,RSI,RDI = X1, X3..X7 (state words) + # R8 ..R15 = X8..X15 (state words) + # RBP = temp (used for X0 and X2) + # + .if (SKEIN_ASM_UNROLL & 1024) == 0 + xorq %rax,%rax #init loop index on the stack + movq %rax,rIdx_offs(%rsp) + .endif + movq TWEAK+ 0(%rdi),%r8 + addq bitAdd+ F_O(%rbp),%r8 #computed updated tweak value T0 + movq %r9 ,%r10 + xorq %r8 ,%r10 #%rax/%rbx/%rcx = tweak schedule + movq %r8 ,TWEAK+ 0(%rdi) #save updated tweak value ctx->h.T[0] + movq %r8 ,ksTwk+ 0+F_O(%rbp) + movq %r9 ,ksTwk+ 8+F_O(%rbp) #keep values in %r8 ,%r9 for initial tweak injection below + movq %r10,ksTwk+16+F_O(%rbp) + .if _SKEIN_DEBUG + movq %r9 ,TWEAK+ 8(%rdi) #save updated tweak value ctx->h.T[1] for Skein_Debug_Block + .endif + movq blkPtr +F_O(%rbp),%rsi # rsi --> input block + movq $KW_PARITY ,%rax #overall key schedule parity + + # the logic here assumes the set {rdi,rsi,rbp,rax} = X[0,1,2,3] + .irp _rN_,0,1,2,3,4,6 #process the "initial" words, using r14/r15 as temps + movq X_VARS+8*_rN_(%rdi),%r14 #get state word + movq 8*_rN_(%rsi),%r15 #get msg word + xorq %r14,%rax #update key schedule overall parity + movq %r14,ksKey 
+8*_rN_+F_O(%rbp) #save key schedule word on stack + movq %r15,Wcopy +8*_rN_+F_O(%rbp) #save local msg Wcopy + addq %r15,%r14 #do the initial key injection + movq %r14,X_stk +8*_rN_ (%rsp) #save initial state var on stack + .endr + # now process the rest, using the "real" registers + # (MUST do it in reverse order to inject tweaks r8/r9 first) + .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rdx,rbx +_oo_ = o1K_\_rr_ #offset associated with the register + movq X_VARS+8*_oo_(%rdi),%\_rr_ #get key schedule word from context + movq 8*_oo_(%rsi),%rcx #get next input msg word + movq %\_rr_, ksKey +8*_oo_(%rsp) #save key schedule on stack + xorq %\_rr_, %rax #accumulate key schedule parity + movq %rcx,Wcopy+8*_oo_+F_O(%rbp) #save copy of msg word for feedforward + addq %rcx,%\_rr_ #do the initial key injection + .if _oo_ == 13 #do the initial tweak injection + addReg _rr_,r8 # (only in words 13/14) + .elseif _oo_ == 14 + addReg _rr_,r9 + .endif + .endr + movq %rax,ksKey+8*WCNT+F_O(%rbp) #save key schedule parity +.if _SKEIN_DEBUG + Skein_Debug_Block 1024 #initial debug dump +.endif + addq $8*WCNT,%rsi #bump the msg ptr + movq %rsi,blkPtr+F_O(%rbp) #save bumped msg ptr + # re-load words 0..4 from stack, enter the main loop + .irp _rr_,rdi,rsi,rbp,rax,rcx #(no need to re-load x6, already on stack) + movq X_stk+8*o1K_\_rr_(%rsp),%\_rr_ #re-load state and get ready to go! + .endr +.if _SKEIN_DEBUG + Skein_Debug_Round 1024,SKEIN_RND_KEY_INITIAL #show state after initial key injection +.endif + # + ################# + # now the key schedule is computed. 
Start the rounds + # +.if SKEIN_ASM_UNROLL & 1024 +_UNROLL_CNT = ROUNDS_1024/8 +.else +_UNROLL_CNT = SKEIN_UNROLL_1024 + .if ((ROUNDS_1024/8) % _UNROLL_CNT) + .error "Invalid SKEIN_UNROLL_1024" + .endif +Skein1024_round_loop: +.endif +# +_Rbase_ = 0 +.rept _UNROLL_CNT*2 #implement the rounds, 4 at a time + r1024_FourRounds %(4*_Rbase_+00) +_Rbase_ = _Rbase_+1 +.endr #rept _UNROLL_CNT +# +.if (SKEIN_ASM_UNROLL & 1024) == 0 + cmpq $2*(ROUNDS_1024/8),tmpStk_1024(%rsp) #see if we are done + jb Skein1024_round_loop +.endif + # end of rounds + ################# + # + # feedforward: ctx->X[i] = X[i] ^ w[i], {i=0..15} + movq %rdx,X_stk+8*o1K_rdx(%rsp) #we need a register. x6 already on stack + movq ctxPtr(%rsp),%rdx + + .irp _rr_,rdi,rsi,rbp,rax,rcx,rbx,r8,r9,r10,r11,r12,r13,r14,r15 #do all but x6,x7 +_oo_ = o1K_\_rr_ + xorq Wcopy +8*_oo_(%rsp),%\_rr_ #feedforward XOR + movq %\_rr_,X_VARS+8*_oo_(%rdx) #save result into context + .if (_oo_ == 9) + movq $FIRST_MASK64 ,%r9 + .endif + .if (_oo_ == 14) + andq TWEAK+ 8(%rdx),%r9 + .endif + .endr + # + movq X_stk +8*6(%rsp),%rax #now process x6,x7 (skipped in .irp above) + movq X_stk +8*7(%rsp),%rbx + xorq Wcopy +8*6(%rsp),%rax + xorq Wcopy +8*7(%rsp),%rbx + movq %rax,X_VARS+8*6(%rdx) + decq blkCnt(%rsp) #set zero flag iff done + movq %rbx,X_VARS+8*7(%rdx) + + Skein_Debug_Round 1024,SKEIN_RND_FEED_FWD,, + # go back for more blocks, if needed + movq ctxPtr(%rsp),%rdi #don't muck with the flags here! + lea FRAME_OFFS(%rsp),%rbp + jnz Skein1024_block_loop + movq %r9 ,TWEAK+ 8(%rdx) + Reset_Stack + ret +# +Skein1024_Process_Block_End: +# +.if _SKEIN_DEBUG +Skein_Debug_Round_1024: + # call here with rdx = "round number", +_SP_OFFS_ = 8*2 #stack "offset" here: rdx, return addr + # + #save rest of X[] state on stack so debug routines can access it + .irp _rr_,rsi,rbp,rax,rbx,r8,r9,r10,r11,r12,r13,r14,r15 + movq %\_rr_,X_stk+8*o1K_\_rr_+_SP_OFFS_(%rsp) + .endr + # Figure out what to do with x0 (rdi). 
When rdx == 0 mod 4, it's already on stack + cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always save + jae save_x0 + testq $3,%rdx #otherwise only if rdx != 0 mod 4 + jz save_x0_not +save_x0: + movq %rdi,X_stk+8*o1K_rdi+_SP_OFFS_(%rsp) +save_x0_not: + #figure out the x4/x6 swapping state and save the correct one! + cmpq $SKEIN_RND_SPECIAL,%rdx #special rounds always do x4 + jae save_x4 + testq $1,%rdx #and even ones have r4 as well + jz save_x4 + movq %rcx,X_stk+8*6+_SP_OFFS_(%rsp) + jmp debug_1024_go +save_x4: + movq %rcx,X_stk+8*4+_SP_OFFS_(%rsp) +debug_1024_go: + #now all is saved in Xstk[] except for rdx + push %rsi #save two regs for BLK_BITS-specific parms + push %rdi +_SP_OFFS_ = _SP_OFFS_ + 16 #adjust stack offset accordingly (now 32) + + movq _SP_OFFS_-8(%rsp),%rsi #get back original %rdx (pushed on stack in macro call) + movq %rsi,X_stk+8*o1K_rdx+_SP_OFFS_(%rsp) #and save it in its rightful place in X_stk[] + + movq ctxPtr+_SP_OFFS_(%rsp),%rsi #rsi = ctx_hdr_ptr + movq $1024,%rdi #rdi = block size + jmp Skein_Debug_Round_Common +.endif +# +.if _SKEIN_CODE_SIZE +C_label Skein1024_Process_Block_CodeSize + movq $(Skein1024_Process_Block_End-Skein1024_Process_Block),%rax + ret +# +C_label Skein1024_Unroll_Cnt + .if _UNROLL_CNT <> (ROUNDS_1024/8) + movq $_UNROLL_CNT,%rax + .else + xorq %rax,%rax + .endif + ret +.endif +# +.endif # _USE_ASM_ and 1024 +# +.if _SKEIN_DEBUG +#---------------------------------------------------------------- +#local debug routine to set up for calls to: +# void Skein_Show_Round(uint_t bits,const Skein_Ctxt_Hdr_t *h,int r,const u64b_t *X) +# [ rdi rsi rdx rcx] +# +# here with %rdx = round number +# %rsi = ctx_hdr_ptr +# %rdi = block size (256/512/1024) +# on stack: saved rdi, saved rsi, retAddr, saved rdx +# +Skein_Debug_Round_Common: +_SP_OFFS_ = 32 #account for four words on stack already + .irp _rr_,rax,rbx,rcx,rbp,r8,r9,r10,r11,r12,r13,r14,r15 #save the rest of the regs + pushq %\_rr_ +_SP_OFFS_ = _SP_OFFS_+8 + .endr + .if 
(_SP_OFFS_ % 16) # make sure stack is still 16-byte aligned here + .error "Debug_Round_Common: stack alignment" + .endif + # compute %rcx = ptr to the X[] array on the stack (final parameter to call) + leaq X_stk+_SP_OFFS_(%rsp),%rcx #adjust for reg pushes, return address + cmpq $SKEIN_RND_FEED_FWD,%rdx #special handling for feedforward "round"? + jnz _got_rcxA + leaq X_VARS(%rsi),%rcx +_got_rcxA: + .if _USE_ASM_ & 1024 + # special handling for 1024-bit case + # (for rounds right before with key injection: + # use xDebug_1024[] instead of X_stk[]) + cmpq $SKEIN_RND_SPECIAL,%rdx + jae _got_rcxB #must be a normal round + orq %rdx,%rdx + jz _got_rcxB #just before key injection + test $3,%rdx + jne _got_rcxB + cmp $1024,%rdi #only 1024-bit(s) for now + jne _got_rcxB + leaq xDebug_1024+_SP_OFFS_(%rsp),%rcx +_got_rcxB: + .endif + call Skein_Show_Round #call external debug handler + + .irp _rr_,r15,r14,r13,r12,r11,r10,r9,r8,rbp,rcx,rbx,rax #restore regs + popq %\_rr_ +_SP_OFFS_ = _SP_OFFS_-8 + .endr + .if _SP_OFFS_ - 32 + .error "Debug_Round_Common: push/pop misalignment!" + .endif + popq %rdi + popq %rsi + ret +.endif +#---------------------------------------------------------------- + .end Index: sys/crypto/skein/brg_endian.h =================================================================== --- /dev/null +++ sys/crypto/skein/brg_endian.h @@ -0,0 +1,148 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. 
distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 20/10/2006 +*/ + +#ifndef BRG_ENDIAN_H +#define BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +/* Include files where endian defines and byteswap functions may reside */ +#if defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include <sys/endian.h> +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include <machine/endian.h> +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined(AVR) +# include <endian.h> +# if !defined( __BEOS__ ) +# include <byteswap.h> +# endif +# endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define
PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || 
defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) || defined( AVR ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order +#endif +#endif + +/* special handler for IA64, which may be either endianness (?) */ +/* here we assume little-endian, but this may need to be changed */ +#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) +# define PLATFORM_MUST_ALIGN (1) +#ifndef PLATFORM_BYTE_ORDER +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif +#endif + +#ifndef PLATFORM_MUST_ALIGN +# define PLATFORM_MUST_ALIGN (0) +#endif + +#endif /* ifndef BRG_ENDIAN_H */ Index: sys/crypto/skein/brg_types.h =================================================================== --- /dev/null +++ sys/crypto/skein/brg_types.h @@ -0,0 +1,188 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. 
distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 09/09/2006 + + The unsigned integer types defined here are of the form uint_<nn>t where + <nn> is the length of the type; for example, the unsigned 32-bit type is + 'uint_32t'. These are NOT the same as the 'C99 integer types' that are + defined in the inttypes.h and stdint.h headers since attempts to use these + types have shown that support for them is still highly variable. However, + since the latter are of the form uint<nn>_t, a regular expression search + and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t') + can be used to convert the types used here to the C99 standard types.
+*/ + +#ifndef BRG_TYPES_H +#define BRG_TYPES_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <limits.h> + +#ifndef BRG_UI8 +# define BRG_UI8 +# if UCHAR_MAX == 255u + typedef unsigned char uint_8t; +# else +# error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h +# endif +#endif + +#ifndef BRG_UI16 +# define BRG_UI16 +# if USHRT_MAX == 65535u + typedef unsigned short uint_16t; +# else +# error Please define uint_16t as a 16-bit unsigned short type in brg_types.h +# endif +#endif + +#ifndef BRG_UI32 +# define BRG_UI32 +# if UINT_MAX == 4294967295u +# define li_32(h) 0x##h##u + typedef unsigned int uint_32t; +# elif ULONG_MAX == 4294967295u +# define li_32(h) 0x##h##ul + typedef unsigned long uint_32t; +# elif defined( _CRAY ) +# error This code needs 32-bit data types, which Cray machines do not provide +# else +# error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h +# endif +#endif + +#ifndef BRG_UI64 +# if defined( __BORLANDC__ ) && !defined( __MSDOS__ ) +# define BRG_UI64 +# define li_64(h) 0x##h##ui64 + typedef unsigned __int64 uint_64t; +# elif defined( _MSC_VER ) && ( _MSC_VER < 1300 ) /* 1300 == VC++ 7.0 */ +# define BRG_UI64 +# define li_64(h) 0x##h##ui64 + typedef unsigned __int64 uint_64t; +# elif defined( __sun ) && defined(ULONG_MAX) && ULONG_MAX == 0xfffffffful +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# elif defined( UINT_MAX ) && UINT_MAX > 4294967295u +# if UINT_MAX == 18446744073709551615u +# define BRG_UI64 +# define li_64(h) 0x##h##u + typedef unsigned int uint_64t; +# endif +# elif defined( ULONG_MAX ) && ULONG_MAX > 4294967295u +# if ULONG_MAX == 18446744073709551615ul +# define BRG_UI64 +# define li_64(h) 0x##h##ul + typedef unsigned long uint_64t; +# endif +# elif defined( ULLONG_MAX ) && ULLONG_MAX > 4294967295u +# if ULLONG_MAX == 18446744073709551615ull +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t;
+# endif +# elif defined( ULONG_LONG_MAX ) && ULONG_LONG_MAX > 4294967295u +# if ULONG_LONG_MAX == 18446744073709551615ull +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# endif +# elif defined(__GNUC__) /* DLW: avoid mingw problem with -ansi */ +# define BRG_UI64 +# define li_64(h) 0x##h##ull + typedef unsigned long long uint_64t; +# endif +#endif + +#if defined( NEED_UINT_64T ) && !defined( BRG_UI64 ) +# error Please define uint_64t as an unsigned 64 bit type in brg_types.h +#endif + +#ifndef RETURN_VALUES +# define RETURN_VALUES +# if defined( DLL_EXPORT ) +# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) +# define VOID_RETURN __declspec( dllexport ) void __stdcall +# define INT_RETURN __declspec( dllexport ) int __stdcall +# elif defined( __GNUC__ ) +# define VOID_RETURN __declspec( __dllexport__ ) void +# define INT_RETURN __declspec( __dllexport__ ) int +# else +# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +# endif +# elif defined( DLL_IMPORT ) +# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) +# define VOID_RETURN __declspec( dllimport ) void __stdcall +# define INT_RETURN __declspec( dllimport ) int __stdcall +# elif defined( __GNUC__ ) +# define VOID_RETURN __declspec( __dllimport__ ) void +# define INT_RETURN __declspec( __dllimport__ ) int +# else +# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +# endif +# elif defined( __WATCOMC__ ) +# define VOID_RETURN void __cdecl +# define INT_RETURN int __cdecl +# else +# define VOID_RETURN void +# define INT_RETURN int +# endif +#endif + +/* These defines are used to declare buffers in a way that allows + faster operations on longer variables to be used. 
In all these + defines 'size' must be a power of 2 and >= 8 + + dec_unit_type(size,x) declares a variable 'x' of length + 'size' bits + + dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize' + bytes defined as an array of variables + each of 'size' bits (bsize must be a + multiple of size / 8) + + ptr_cast(x,size) casts a pointer to a pointer to a + varaiable of length 'size' bits +*/ + +#define ui_type(size) uint_##size##t +#define dec_unit_type(size,x) typedef ui_type(size) x +#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[bsize / (size >> 3)] +#define ptr_cast(x,size) ((ui_type(size)*)(x)) + +#if defined(__cplusplus) +} +#endif + +#endif Index: sys/crypto/skein/skein.h =================================================================== --- /dev/null +++ sys/crypto/skein/skein.h @@ -0,0 +1,336 @@ +#ifndef _SKEIN_H_ +#define _SKEIN_H_ 1 +/************************************************************************** +** +** Interface declarations and internal definitions for Skein hashing. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +*************************************************************************** +** +** The following compile-time switches may be defined to control some +** tradeoffs between speed, code size, error checking, and security. +** +** The "default" note explains what happens when the switch is not defined. +** +** SKEIN_DEBUG -- make callouts from inside Skein code +** to examine/display intermediate values. +** [default: no callouts (no overhead)] +** +** SKEIN_ERR_CHECK -- how error checking is handled inside Skein +** code. If not defined, most error checking +** is disabled (for performance). 
Otherwise, +** the switch value is interpreted as: +** 0: use assert() to flag errors +** 1: return SKEIN_FAIL to flag errors +** +***************************************************************************/ +#ifdef __cplusplus +extern "C" +{ +#endif + +#ifndef _KERNEL +#include <stddef.h> /* get size_t definition */ +#endif +#include "skein_port.h" /* get platform-specific definitions */ + +enum + { + SKEIN_SUCCESS = 0, /* return codes from Skein calls */ + SKEIN_FAIL = 1, + SKEIN_BAD_HASHLEN = 2 + }; + +#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ + +#define SKEIN_256_STATE_WORDS ( 4) +#define SKEIN_512_STATE_WORDS ( 8) +#define SKEIN1024_STATE_WORDS (16) +#define SKEIN_MAX_STATE_WORDS (16) + +#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS) + +#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS) + +#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) + +typedef struct + { + size_t hashBitLen; /* size of hash result, in bits */ + size_t bCnt; /* current byte count in buffer b[] */ + u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */ + } Skein_Ctxt_Hdr_t; + +typedef struct /* 256-bit Skein hash context structure */ + { + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN_256_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN_256_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + } Skein_256_Ctxt_t; + +typedef struct /* 512-bit Skein hash context structure */ + { + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block
buffer (8-byte aligned) */ + } Skein_512_Ctxt_t; + +typedef struct /* 1024-bit Skein hash context structure */ + { + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN1024_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN1024_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + } Skein1024_Ctxt_t; + +/* Skein APIs for (incremental) "straight hashing" */ +int Skein_256_Init (Skein_256_Ctxt_t *ctx, size_t hashBitLen); +int Skein_512_Init (Skein_512_Ctxt_t *ctx, size_t hashBitLen); +int Skein1024_Init (Skein1024_Ctxt_t *ctx, size_t hashBitLen); + +int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); +int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); +int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); + +int Skein_256_Final (Skein_256_Ctxt_t *ctx, u08b_t * hashVal); +int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); +int Skein1024_Final (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); + +/* +** Skein APIs for "extended" initialization: MAC keys, tree hashing. +** After an InitExt() call, just use Update/Final calls as with Init(). +** +** Notes: Same parameters as _Init() calls, plus treeInfo/key/keyBytes. +** When keyBytes == 0 and treeInfo == SKEIN_SEQUENTIAL, +** the results of InitExt() are identical to calling Init(). +** The function Init() may be called once to "precompute" the IV for +** a given hashBitLen value, then by saving a copy of the context +** the IV computation may be avoided in later calls. +** Similarly, the function InitExt() may be called once per MAC key +** to precompute the MAC IV, then a copy of the context saved and +** reused for each new MAC computation. 
+**/ +int Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes); +int Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes); +int Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, u64b_t treeInfo, const u08b_t *key, size_t keyBytes); + +/* +** Skein APIs for MAC and tree hash: +** Final_Pad: pad, do final block, but no OUTPUT type +** Output: do just the output stage +*/ +int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t * hashVal); +int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t * hashVal); +int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t * hashVal); + +#ifndef SKEIN_TREE_HASH +#define SKEIN_TREE_HASH (1) +#endif +#if SKEIN_TREE_HASH +int Skein_256_Output (Skein_256_Ctxt_t *ctx, u08b_t * hashVal); +int Skein_512_Output (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); +int Skein1024_Output (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); +#endif + +/***************************************************************** +** "Internal" Skein definitions +** -- not needed for sequential hashing API, but will be +** helpful for other uses of Skein (e.g., tree hash mode). +** -- included here so that they can be shared between +** reference and optimized code. 
+******************************************************************/ + +/* tweak word T[1]: bit field starting positions */ +#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ + +#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) /* bits 112..118: level in hash tree */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ + +/* tweak word T[1]: flag bit definition(s) */ +#define SKEIN_T1_FLAG_FIRST (((u64b_t) 1 ) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_FINAL (((u64b_t) 1 ) << SKEIN_T1_POS_FINAL) +#define SKEIN_T1_FLAG_BIT_PAD (((u64b_t) 1 ) << SKEIN_T1_POS_BIT_PAD) + +/* tweak word T[1]: tree level bit field mask */ +#define SKEIN_T1_TREE_LVL_MASK (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL) +#define SKEIN_T1_TREE_LEVEL(n) (((u64b_t) (n)) << SKEIN_T1_POS_TREE_LVL) + +/* tweak word T[1]: block type field */ +#define SKEIN_BLK_TYPE_KEY ( 0) /* key, for MAC and KDF */ +#define SKEIN_BLK_TYPE_CFG ( 4) /* configuration block */ +#define SKEIN_BLK_TYPE_PERS ( 8) /* personalization string */ +#define SKEIN_BLK_TYPE_PK (12) /* public key (for digital signature hashing) */ +#define SKEIN_BLK_TYPE_KDF (16) /* key identifier for KDF */ +#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */ +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ +#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */ + +#define SKEIN_T1_BLK_TYPE(T) (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) +#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY) /* key, for MAC and KDF */ +#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG) /* configuration block */ +#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS) /* personalization 
string */ +#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK) /* public key (for digital signature hashing) */ +#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF) /* key identifier for KDF */ +#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */ +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ +#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */ + +#define SKEIN_T1_BLK_TYPE_CFG_FINAL (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL) +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define SKEIN_VERSION (1) + +#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */ +#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian)*/ +#endif + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((u64b_t) (hi32)) << 32)) +#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) + +#define SKEIN_CFG_STR_LEN (4*8) + +/* bit field definitions in config block treeInfo word */ +#define SKEIN_CFG_TREE_LEAF_SIZE_POS ( 0) +#define SKEIN_CFG_TREE_NODE_SIZE_POS ( 8) +#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16) + +#define SKEIN_CFG_TREE_LEAF_SIZE_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS) +#define SKEIN_CFG_TREE_NODE_SIZE_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS) +#define SKEIN_CFG_TREE_MAX_LEVEL_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS) + +#define SKEIN_CFG_TREE_INFO(leaf,node,maxLvl) \ + ( (((u64b_t)(leaf )) << SKEIN_CFG_TREE_LEAF_SIZE_POS) | \ + (((u64b_t)(node )) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \ + (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) ) + +#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0,0,0) /* use as treeInfo in InitExt() call for sequential processing */ + +/* +** Skein macros for getting/setting tweak words, etc. 
+** These are useful for partial input bytes, hash tree init/update, etc. +**/ +#define Skein_Get_Tweak(ctxPtr,TWK_NUM) ((ctxPtr)->h.T[TWK_NUM]) +#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} + +#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr,0) +#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr,1) +#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) +#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) + +/* set both tweak words at once */ +#define Skein_Set_T0_T1(ctxPtr,T0,T1) \ + { \ + Skein_Set_T0(ctxPtr,(T0)); \ + Skein_Set_T1(ctxPtr,(T1)); \ + } + +#define Skein_Set_Type(ctxPtr,BLK_TYPE) \ + Skein_Set_T1(ctxPtr,SKEIN_T1_BLK_TYPE_##BLK_TYPE) + +/* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */ +#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ + { Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } + +#define Skein_Clear_First_Flag(hdr) { (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; } +#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; } + +#define Skein_Set_Tree_Level(hdr,height) { (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);} + +/***************************************************************** +** "Internal" Skein definitions for debugging and error checking +******************************************************************/ +#ifdef SKEIN_DEBUG /* examine/display intermediate values? */ +#include "skein_debug.h" +#else /* default is no callouts */ +#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr) +#define Skein_Show_Round(bits,ctx,r,X) +#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr) +#define Skein_Show_Final(bits,ctx,cnt,outPtr) +#define Skein_Show_Key(bits,ctx,key,keyBytes) +#endif + +#ifndef SKEIN_ERR_CHECK /* run-time checks (e.g., bad params, uninitialized context)? 
*/ +#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */ +#define Skein_assert(x) +#elif defined(SKEIN_ASSERT) +#include <assert.h> +#define Skein_Assert(x,retCode) assert(x) +#define Skein_assert(x) assert(x) +#else +#include <assert.h> +#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /* caller error */ +#define Skein_assert(x) assert(x) /* internal error */ +#endif + +/***************************************************************** +** Skein block function constants (shared across Ref and Opt code) +******************************************************************/ +enum + { + /* Skein_256 round rotation constants */ + R_256_0_0=14, R_256_0_1=16, + R_256_1_0=52, R_256_1_1=57, + R_256_2_0=23, R_256_2_1=40, + R_256_3_0= 5, R_256_3_1=37, + R_256_4_0=25, R_256_4_1=33, + R_256_5_0=46, R_256_5_1=12, + R_256_6_0=58, R_256_6_1=22, + R_256_7_0=32, R_256_7_1=32, + + /* Skein_512 round rotation constants */ + R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, + R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, + R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, + R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, + R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, + R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, + R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, + R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22, + + /* Skein1024 round rotation constants */ + R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37, + R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52, + R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17, + R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25, + R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47,
R1024_4_6=44, R1024_4_7=30, + R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41, + R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25, + R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20 + }; + +#ifndef SKEIN_ROUNDS +#define SKEIN_256_ROUNDS_TOTAL (72) /* number of rounds for the different block sizes */ +#define SKEIN_512_ROUNDS_TOTAL (72) +#define SKEIN1024_ROUNDS_TOTAL (80) +#else /* allow command-line define in range 8*(5..14) */ +#define SKEIN_256_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/100) + 5) % 10) + 5)) +#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5)) +#define SKEIN1024_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS ) + 5) % 10) + 5)) +#endif + +void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd); +void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd); +void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd); + +#ifdef __cplusplus +} +#endif + +/* Pull in FreeBSD specific shims */ +#include "skein_freebsd.h" + +#endif /* ifndef _SKEIN_H_ */ Index: sys/crypto/skein/skein.c =================================================================== --- /dev/null +++ sys/crypto/skein/skein.c @@ -0,0 +1,858 @@ +/*********************************************************************** +** +** Implementation of the Skein hash function. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. 
+** +************************************************************************/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/endian.h> +#include <sys/types.h> + +/* get the memcpy/memset functions */ +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <string.h> +#endif + +#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */ + +#include "skein.h" /* get the Skein API definitions */ +#include "skein_iv.h" /* get precomputed IVs */ + +/*****************************************************************/ +/* External function to process blkCnt (nonzero) full block(s) of data. */ +void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd); +void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd); +void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd); + +/*****************************************************************/ +/* 256-bit Skein */ +/*****************************************************************/ + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a straight hashing operation */ +int Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen) + { + union + { + u08b_t b[SKEIN_256_STATE_BYTES]; + u64b_t w[SKEIN_256_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) + { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 256: memcpy(ctx->X,SKEIN_256_IV_256,sizeof(ctx->X)); break; + case 224: memcpy(ctx->X,SKEIN_256_IV_224,sizeof(ctx->X)); break; + case 160: memcpy(ctx->X,SKEIN_256_IV_160,sizeof(ctx->X)); break; + case 128: memcpy(ctx->X,SKEIN_256_IV_128,sizeof(ctx->X)); break; +#endif + default: + /* here if there is no precomputed IV value available */ + /* build/process the config block, type == CONFIG (could be precomputed) */ +
Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ + + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */ + + /* compute the initial chaining values from config block */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */ + Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + break; + } + /* The chaining vars ctx->X are now initialized for the given hashBitLen. */ + /* Set up to process the data message portion of the hash (default) */ + Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a MAC and/or tree hash operation */ +/* [identical to Skein_256_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */ +int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes) + { + union + { + u08b_t b[SKEIN_256_STATE_BYTES]; + u64b_t w[SKEIN_256_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) /* is there a key? 
*/ + { + memset(ctx->X,0,sizeof(ctx->X)); /* no key: use all zeroes as key for config block */ + } + else /* here to pre-process a key */ + { + Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X)); + /* do a mini-Init right here */ + ctx->h.hashBitLen=8*sizeof(ctx->X); /* set output hash bit count = state size */ + Skein_Start_New_Type(ctx,KEY); /* set tweaks: T0 = 0; T1 = KEY type */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the initial chaining variables */ + Skein_256_Update(ctx,key,keyBytes); /* hash the key */ + Skein_256_Final_Pad(ctx,cfg.b); /* put result into cfg.b[] */ + memcpy(ctx->X,cfg.b,sizeof(cfg.b)); /* copy over into ctx->X[] */ +#if SKEIN_NEED_SWAP + { + uint_t i; + for (i=0;i<SKEIN_256_STATE_WORDS;i++) /* convert key bytes to context words */ + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* build/process the config block, type == CONFIG (could be precomputed for each key) */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx,CFG_FINAL); + + memset(&cfg.w,0,sizeof(cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(treeInfo); /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + + Skein_Show_Key(256,&ctx->h,key,keyBytes); + + /* compute the initial chaining values from config block */ + Skein_256_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx,MSG); + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* process the input bytes */ +int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) + { + size_t n; + + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* process full blocks, if any */ + if (msgByteCnt +
ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) + { + if (ctx->h.bCnt) /* finish up any buffered message data */ + { + n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if (n) + { + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES); + Skein_256_Process_Block(ctx,ctx->b,1,SKEIN_256_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* now process any remaining full blocks, directly from input message data */ + if (msgByteCnt > SKEIN_256_BLOCK_BYTES) + { + n = (msgByteCnt-1) / SKEIN_256_BLOCK_BYTES; /* number of full blocks to process */ + Skein_256_Process_Block(ctx,msg,n,SKEIN_256_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_256_BLOCK_BYTES; + msg += n * SKEIN_256_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) + { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES); + memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the result */ +int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN_256_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); + + Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + 
memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN_256_BLOCK_BYTES < byteCnt;i++) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN_256_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN_256_BLOCK_BYTES) + n = SKEIN_256_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN_256_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t Skein_256_API_CodeSize(void) + { + return ((u08b_t *) Skein_256_API_CodeSize) - + ((u08b_t *) Skein_256_Init); + } +#endif + +/*****************************************************************/ +/* 512-bit Skein */ +/*****************************************************************/ + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a straight hashing operation */ +int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen) + { + union + { + u08b_t b[SKEIN_512_STATE_BYTES]; + u64b_t w[SKEIN_512_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) + { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 512: memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X)); break; + case 384: memcpy(ctx->X,SKEIN_512_IV_384,sizeof(ctx->X)); break; + case 256: memcpy(ctx->X,SKEIN_512_IV_256,sizeof(ctx->X)); break; + case 224: memcpy(ctx->X,SKEIN_512_IV_224,sizeof(ctx->X)); break; +#endif + default: + /* here 
if there is no precomputed IV value available */ + /* build/process the config block, type == CONFIG (could be precomputed) */ + Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ + + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */ + + /* compute the initial chaining values from config block */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */ + Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + break; + } + + /* The chaining vars ctx->X are now initialized for the given hashBitLen. */ + /* Set up to process the data message portion of the hash (default) */ + Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a MAC and/or tree hash operation */ +/* [identical to Skein_512_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */ +int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes) + { + union + { + u08b_t b[SKEIN_512_STATE_BYTES]; + u64b_t w[SKEIN_512_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) /* is there a key? 
*/ + { + memset(ctx->X,0,sizeof(ctx->X)); /* no key: use all zeroes as key for config block */ + } + else /* here to pre-process a key */ + { + Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X)); + /* do a mini-Init right here */ + ctx->h.hashBitLen=8*sizeof(ctx->X); /* set output hash bit count = state size */ + Skein_Start_New_Type(ctx,KEY); /* set tweaks: T0 = 0; T1 = KEY type */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the initial chaining variables */ + Skein_512_Update(ctx,key,keyBytes); /* hash the key */ + Skein_512_Final_Pad(ctx,cfg.b); /* put result into cfg.b[] */ + memcpy(ctx->X,cfg.b,sizeof(cfg.b)); /* copy over into ctx->X[] */ +#if SKEIN_NEED_SWAP + { + uint_t i; + for (i=0;i<SKEIN_512_STATE_WORDS;i++) /* convert key bytes to context words */ + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* build/process the config block, type == CONFIG (could be precomputed for each key) */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx,CFG_FINAL); + + memset(&cfg.w,0,sizeof(cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(treeInfo); /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + + Skein_Show_Key(512,&ctx->h,key,keyBytes); + + /* compute the initial chaining values from config block */ + Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx,MSG); + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* process the input bytes */ +int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) + { + size_t n; + + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* process full blocks, if any */ + if (msgByteCnt +
ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) + { + if (ctx->h.bCnt) /* finish up any buffered message data */ + { + n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if (n) + { + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES); + Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* now process any remaining full blocks, directly from input message data */ + if (msgByteCnt > SKEIN_512_BLOCK_BYTES) + { + n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; /* number of full blocks to process */ + Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; + msg += n * SKEIN_512_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) + { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES); + memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the result */ +int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN_512_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + + Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + 
memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t Skein_512_API_CodeSize(void) + { + return ((u08b_t *) Skein_512_API_CodeSize) - + ((u08b_t *) Skein_512_Init); + } +#endif + +/*****************************************************************/ +/* 1024-bit Skein */ +/*****************************************************************/ + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a straight hashing operation */ +int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen) + { + union + { + u08b_t b[SKEIN1024_STATE_BYTES]; + u64b_t w[SKEIN1024_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) + { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 512: memcpy(ctx->X,SKEIN1024_IV_512 ,sizeof(ctx->X)); break; + case 384: memcpy(ctx->X,SKEIN1024_IV_384 ,sizeof(ctx->X)); break; + case 1024: memcpy(ctx->X,SKEIN1024_IV_1024,sizeof(ctx->X)); break; +#endif + default: + /* here if there is no precomputed IV value available */ + /* 
build/process the config block, type == CONFIG (could be precomputed) */ + Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ + + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */ + + /* compute the initial chaining values from config block */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */ + Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + break; + } + + /* The chaining vars ctx->X are now initialized for the given hashBitLen. */ + /* Set up to process the data message portion of the hash (default) */ + Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a MAC and/or tree hash operation */ +/* [identical to Skein1024_Init() when keyBytes == 0 && treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] */ +int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t treeInfo, const u08b_t *key, size_t keyBytes) + { + union + { + u08b_t b[SKEIN1024_STATE_BYTES]; + u64b_t w[SKEIN1024_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL,SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) /* is there a key? 
*/ + { + memset(ctx->X,0,sizeof(ctx->X)); /* no key: use all zeroes as key for config block */ + } + else /* here to pre-process a key */ + { + Skein_assert(sizeof(cfg.b) >= sizeof(ctx->X)); + /* do a mini-Init right here */ + ctx->h.hashBitLen=8*sizeof(ctx->X); /* set output hash bit count = state size */ + Skein_Start_New_Type(ctx,KEY); /* set tweaks: T0 = 0; T1 = KEY type */ + memset(ctx->X,0,sizeof(ctx->X)); /* zero the initial chaining variables */ + Skein1024_Update(ctx,key,keyBytes); /* hash the key */ + Skein1024_Final_Pad(ctx,cfg.b); /* put result into cfg.b[] */ + memcpy(ctx->X,cfg.b,sizeof(cfg.b)); /* copy over into ctx->X[] */ +#if SKEIN_NEED_SWAP + { + uint_t i; + for (i=0;i<SKEIN1024_STATE_WORDS;i++) /* convert key bytes to context words */ + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* build/process the config block, type == CONFIG (could be precomputed for each key) */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx,CFG_FINAL); + + memset(&cfg.w,0,sizeof(cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(treeInfo); /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + + Skein_Show_Key(1024,&ctx->h,key,keyBytes); + + /* compute the initial chaining values from config block */ + Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx,MSG); + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* process the input bytes */ +int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) + { + size_t n; + + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* process full blocks, if any */ + if (msgByteCnt +
ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) + { + if (ctx->h.bCnt) /* finish up any buffered message data */ + { + n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if (n) + { + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES); + Skein1024_Process_Block(ctx,ctx->b,1,SKEIN1024_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* now process any remaining full blocks, directly from input message data */ + if (msgByteCnt > SKEIN1024_BLOCK_BYTES) + { + n = (msgByteCnt-1) / SKEIN1024_BLOCK_BYTES; /* number of full blocks to process */ + Skein1024_Process_Block(ctx,msg,n,SKEIN1024_BLOCK_BYTES); + msgByteCnt -= n * SKEIN1024_BLOCK_BYTES; + msg += n * SKEIN1024_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) + { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES); + memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the result */ +int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN1024_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt); + + Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + 
memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN1024_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN1024_BLOCK_BYTES) + n = SKEIN1024_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t Skein1024_API_CodeSize(void) + { + return ((u08b_t *) Skein1024_API_CodeSize) - + ((u08b_t *) Skein1024_Init); + } +#endif + +/**************** Functions to support MAC/tree hashing ***************/ +/* (this code is identical for Optimized and Reference versions) */ + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the block, no OUTPUT stage */ +int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, u08b_t *hashVal) + { + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); + Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_256_BLOCK_BYTES); /* "output" the state bytes */ + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash 
computation and output the block, no OUTPUT stage */ +int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) + { + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN_512_BLOCK_BYTES); /* "output" the state bytes */ + + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the block, no OUTPUT stage */ +int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t *hashVal) + { + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt); + Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + Skein_Put64_LSB_First(hashVal,ctx->X,SKEIN1024_BLOCK_BYTES); /* "output" the state bytes */ + + return SKEIN_SUCCESS; + } + +#if SKEIN_TREE_HASH +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* just do the OUTPUT stage */ +int Skein_256_Output(Skein_256_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN_256_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + 
memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN_256_BLOCK_BYTES < byteCnt;i++) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN_256_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN_256_BLOCK_BYTES) + n = SKEIN_256_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN_256_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES); + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* just do the OUTPUT stage */ +int Skein_512_Output(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN_512_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES); /* same first argument as Skein_512_Final (was 256) */ +
memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* just do the OUTPUT stage */ +int Skein1024_Output(Skein1024_Ctxt_t *ctx, u08b_t *hashVal) + { + size_t i,n,byteCnt; + u64b_t X[SKEIN1024_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++) + { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN1024_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN1024_BLOCK_BYTES) + n = SKEIN1024_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES); /* same first argument as Skein1024_Final (was 256) */ + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; + } + + +/* Adapt the functions to match the prototype expected by libmd */ +void +SKEIN256_Init(SKEIN256_CTX * ctx) +{ + + Skein_256_Init(ctx, 256); +} + +void +SKEIN512_Init(SKEIN512_CTX * ctx) +{ + + Skein_512_Init(ctx, 512); +} + +void +SKEIN1024_Init(SKEIN1024_CTX * ctx) +{ + + Skein1024_Init(ctx, 1024); +} + +void +SKEIN256_Update(SKEIN256_CTX * ctx, const void *in, size_t len) +{ + + Skein_256_Update(ctx, in, len); +} + +void +SKEIN512_Update(SKEIN512_CTX * ctx, const void *in, size_t len) +{ + + Skein_512_Update(ctx, in, len); +} + +void
+SKEIN1024_Update(SKEIN1024_CTX * ctx, const void *in, size_t len) +{ + + Skein1024_Update(ctx, in, len); +} + +void +SKEIN256_Final(unsigned char digest[SKEIN_256_BLOCK_BYTES], SKEIN256_CTX * ctx) +{ + + Skein_256_Final(ctx, digest); +} + +void +SKEIN512_Final(unsigned char digest[SKEIN_512_BLOCK_BYTES], SKEIN512_CTX * ctx) +{ + + Skein_512_Final(ctx, digest); +} + +void +SKEIN1024_Final(unsigned char digest[SKEIN1024_BLOCK_BYTES], SKEIN1024_CTX * ctx) +{ + + Skein1024_Final(ctx, digest); +} + +#ifdef WEAK_REFS +/* When building libmd, provide weak references. Note: this is not + activated in the context of compiling these sources for internal + use in libcrypt. NOTE(review): the alias names below are the Skein_* core functions, which this file also defines directly (the Init variants take an extra hashBitLen argument); presumably the SKEIN256/512/1024 _Init/_Update/_Final adapters were intended -- confirm against skein_freebsd.h. + */ +#undef Skein_256_Init +__weak_reference(_libmd_SKEIN256_Init, Skein_256_Init); +#undef Skein_256_Update +__weak_reference(_libmd_SKEIN256_Update, Skein_256_Update); +#undef Skein_256_Final +__weak_reference(_libmd_SKEIN256_Final, Skein_256_Final); + +#undef Skein_512_Init +__weak_reference(_libmd_SKEIN512_Init, Skein_512_Init); +#undef Skein_512_Update +__weak_reference(_libmd_SKEIN512_Update, Skein_512_Update); +#undef Skein_512_Final +__weak_reference(_libmd_SKEIN512_Final, Skein_512_Final); + +#undef Skein1024_Init +__weak_reference(_libmd_SKEIN1024_Init, Skein1024_Init); +#undef Skein1024_Update +__weak_reference(_libmd_SKEIN1024_Update, Skein1024_Update); +#undef Skein1024_Final +__weak_reference(_libmd_SKEIN1024_Final, Skein1024_Final); +#endif + +#endif Index: sys/crypto/skein/skein_block.c =================================================================== --- /dev/null +++ sys/crypto/skein/skein_block.c @@ -0,0 +1,700 @@ +/*********************************************************************** +** +** Implementation of the Skein block functions. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. 
+** +** Compile-time switches: +** +** SKEIN_USE_ASM -- set bits (256/512/1024) to select which +** versions use ASM code for block processing +** [default: use C for all block sizes] +** +************************************************************************/ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#ifdef _KERNEL +#include +#else +#include +#endif + +#include "skein.h" + +#ifndef SKEIN_USE_ASM +#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */ +#endif + +#ifndef SKEIN_LOOP +#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */ +#endif + +#define BLK_BITS (WCNT*64) /* some useful definitions for code here */ +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) + +#ifdef SKEIN_DEBUG +#define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; } +#else +#define DebugSaveTweak(ctx) +#endif + +/***************************** Skein_256 ******************************/ +#if !(SKEIN_USE_ASM & 256) +void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) + { /* do it in C */ + enum + { + WCNT = SKEIN_256_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN_256_ROUNDS_TOTAL/8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10) +#else +#define SKEIN_UNROLL_256 (0) +#endif + +#if SKEIN_UNROLL_256 +#if (RCNT % SKEIN_UNROLL_256) +#error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */ +#endif + size_t r; + u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ +#else + u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ +#endif + u64b_t X0,X1,X2,X3; /* local copy of context vars, for speed */ + u64b_t w [WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + const u64b_t *Xptr[4]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = 
&X2; Xptr[3] = &X3; +#endif + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* this implementation only supports 2**64 input bytes (no carry out here) */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1] + ts[0]; + X2 = w[2] + ks[2] + ts[1]; + X3 = w[3] + ks[3]; + + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); /* show starting state values */ + + blkPtr += SKEIN_256_BLOCK_BYTES; + + /* run the rounds */ + +#define Round256(p0,p1,p2,p3,ROT,rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ + +#if SKEIN_UNROLL_256 == 0 +#define R256(p0,p1,p2,p3,ROT,rNum) /* fully unrolled */ \ + Round256(p0,p1,p2,p3,ROT,rNum) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr); + +#define I256(R) \ + X0 += ks[((R)+1) % 5]; /* inject the key schedule value */ \ + X1 += ks[((R)+2) % 5] + ts[((R)+1) % 3]; \ + X2 += ks[((R)+3) % 5] + ts[((R)+2) % 3]; \ + X3 += ks[((R)+4) % 5] + (R)+1; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); +#else /* looping version */ +#define R256(p0,p1,p2,p3,ROT,rNum) \ + Round256(p0,p1,p2,p3,ROT,rNum) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr); + +#define I256(R) \ + X0 += ks[r+(R)+0]; /* inject the key schedule value */ \ + X1 += ks[r+(R)+1] + ts[r+(R)+0]; \ + X2 += ks[r+(R)+2] + ts[r+(R)+1]; \ + X3 += ks[r+(R)+3] + r+(R) ; \ + ks[r + (R)+4 ] = ks[r+(R)-1]; /* rotate key schedule 
*/\ + ts[r + (R)+2 ] = ts[r+(R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); + + for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_256) /* loop thru it */ +#endif + { +#define R256_8_rounds(R) \ + R256(0,1,2,3,R_256_0,8*(R) + 1); \ + R256(0,3,2,1,R_256_1,8*(R) + 2); \ + R256(0,1,2,3,R_256_2,8*(R) + 3); \ + R256(0,3,2,1,R_256_3,8*(R) + 4); \ + I256(2*(R)); \ + R256(0,1,2,3,R_256_4,8*(R) + 5); \ + R256(0,3,2,1,R_256_5,8*(R) + 6); \ + R256(0,1,2,3,R_256_6,8*(R) + 7); \ + R256(0,3,2,1,R_256_7,8*(R) + 8); \ + I256(2*(R)+1); + + R256_8_rounds( 0); + +#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN))) + + #if R256_Unroll_R( 1) + R256_8_rounds( 1); + #endif + #if R256_Unroll_R( 2) + R256_8_rounds( 2); + #endif + #if R256_Unroll_R( 3) + R256_8_rounds( 3); + #endif + #if R256_Unroll_R( 4) + R256_8_rounds( 4); + #endif + #if R256_Unroll_R( 5) + R256_8_rounds( 5); + #endif + #if R256_Unroll_R( 6) + R256_8_rounds( 6); + #endif + #if R256_Unroll_R( 7) + R256_8_rounds( 7); + #endif + #if R256_Unroll_R( 8) + R256_8_rounds( 8); + #endif + #if R256_Unroll_R( 9) + R256_8_rounds( 9); + #endif + #if R256_Unroll_R(10) + R256_8_rounds(10); + #endif + #if R256_Unroll_R(11) + R256_8_rounds(11); + #endif + #if R256_Unroll_R(12) + R256_8_rounds(12); + #endif + #if R256_Unroll_R(13) + R256_8_rounds(13); + #endif + #if R256_Unroll_R(14) + R256_8_rounds(14); + #endif + #if (SKEIN_UNROLL_256 > 14) +#error "need more unrolling in Skein_256_Process_Block" + #endif + } + /* do the final "feedforward" xor, update context chaining vars */ + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + + Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t Skein_256_Process_Block_CodeSize(void) + { + return 
((u08b_t *) Skein_256_Process_Block_CodeSize) - + ((u08b_t *) Skein_256_Process_Block); + } +uint_t Skein_256_Unroll_Cnt(void) + { + return SKEIN_UNROLL_256; + } +#endif +#endif + +/***************************** Skein_512 ******************************/ +#if !(SKEIN_USE_ASM & 512) +void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) + { /* do it in C */ + enum + { + WCNT = SKEIN_512_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN_512_ROUNDS_TOTAL/8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10) +#else +#define SKEIN_UNROLL_512 (0) +#endif + +#if SKEIN_UNROLL_512 +#if (RCNT % SKEIN_UNROLL_512) +#error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */ +#endif + size_t r; + u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ +#else + u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ +#endif + u64b_t X0,X1,X2,X3,X4,X5,X6,X7; /* local copy of vars, for speed */ + u64b_t w [WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + const u64b_t *Xptr[8]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3; + Xptr[4] = &X4; Xptr[5] = &X5; Xptr[6] = &X6; Xptr[7] = &X7; +#endif + + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* this implementation only supports 2**64 input bytes (no carry out here) */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; + + blkPtr += SKEIN_512_BLOCK_BYTES; + + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); + /* run the rounds */ +#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ + X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ + X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ + +#if SKEIN_UNROLL_512 == 0 +#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* unrolled */ \ + Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr); + +#define I512(R) \ + X0 += ks[((R)+1) % 9]; /* inject the key schedule value */ \ + X1 += ks[((R)+2) % 9]; \ + X2 += ks[((R)+3) % 9]; \ + X3 += ks[((R)+4) % 9]; \ + X4 += ks[((R)+5) % 9]; \ + X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ + X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ + X7 += ks[((R)+8) % 9] + (R)+1; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); +#else /* looping version */ +#define 
R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr); + +#define I512(R) \ + X0 += ks[r+(R)+0]; /* inject the key schedule value */ \ + X1 += ks[r+(R)+1]; \ + X2 += ks[r+(R)+2]; \ + X3 += ks[r+(R)+3]; \ + X4 += ks[r+(R)+4]; \ + X5 += ks[r+(R)+5] + ts[r+(R)+0]; \ + X6 += ks[r+(R)+6] + ts[r+(R)+1]; \ + X7 += ks[r+(R)+7] + r+(R) ; \ + ks[r + (R)+8] = ks[r+(R)-1]; /* rotate key schedule */ \ + ts[r + (R)+2] = ts[r+(R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); + + for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512) /* loop thru it */ +#endif /* end of looped code definitions */ + { +#define R512_8_rounds(R) /* do 8 full rounds */ \ + R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ + R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ + R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ + R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ + I512(2*(R)); \ + R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ + R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ + R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ + R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ + I512(2*(R)+1); /* and key injection */ + + R512_8_rounds( 0); + +#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN))) + + #if R512_Unroll_R( 1) + R512_8_rounds( 1); + #endif + #if R512_Unroll_R( 2) + R512_8_rounds( 2); + #endif + #if R512_Unroll_R( 3) + R512_8_rounds( 3); + #endif + #if R512_Unroll_R( 4) + R512_8_rounds( 4); + #endif + #if R512_Unroll_R( 5) + R512_8_rounds( 5); + #endif + #if R512_Unroll_R( 6) + R512_8_rounds( 6); + #endif + #if R512_Unroll_R( 7) + R512_8_rounds( 7); + #endif + #if R512_Unroll_R( 8) + R512_8_rounds( 8); + #endif + #if R512_Unroll_R( 9) + R512_8_rounds( 9); + #endif + #if R512_Unroll_R(10) + R512_8_rounds(10); + #endif + #if R512_Unroll_R(11) + R512_8_rounds(11); + #endif + #if R512_Unroll_R(12) + R512_8_rounds(12); + #endif + #if R512_Unroll_R(13) + R512_8_rounds(13); + #endif 
+ #if R512_Unroll_R(14) + R512_8_rounds(14); + #endif + #if (SKEIN_UNROLL_512 > 14) +#error "need more unrolling in Skein_512_Process_Block" + #endif + } + + /* do the final "feedforward" xor, update context chaining vars */ + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + ctx->X[4] = X4 ^ w[4]; + ctx->X[5] = X5 ^ w[5]; + ctx->X[6] = X6 ^ w[6]; + ctx->X[7] = X7 ^ w[7]; + Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t Skein_512_Process_Block_CodeSize(void) + { + return ((u08b_t *) Skein_512_Process_Block_CodeSize) - + ((u08b_t *) Skein_512_Process_Block); + } +uint_t Skein_512_Unroll_Cnt(void) + { + return SKEIN_UNROLL_512; + } +#endif +#endif + +/***************************** Skein1024 ******************************/ +#if !(SKEIN_USE_ASM & 1024) +void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) + { /* do it in C, always looping (unrolled is bigger AND slower!) 
*/ + enum + { + WCNT = SKEIN1024_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN1024_ROUNDS_TOTAL/8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10) +#else +#define SKEIN_UNROLL_1024 (0) +#endif + +#if (SKEIN_UNROLL_1024 != 0) +#if (RCNT % SKEIN_UNROLL_1024) +#error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */ +#endif + size_t r; + u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ +#else + u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ +#endif + + u64b_t X00,X01,X02,X03,X04,X05,X06,X07, /* local copy of vars, for speed */ + X08,X09,X10,X11,X12,X13,X14,X15; + u64b_t w [WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + const u64b_t *Xptr[16]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[ 0] = &X00; Xptr[ 1] = &X01; Xptr[ 2] = &X02; Xptr[ 3] = &X03; + Xptr[ 4] = &X04; Xptr[ 5] = &X05; Xptr[ 6] = &X06; Xptr[ 7] = &X07; + Xptr[ 8] = &X08; Xptr[ 9] = &X09; Xptr[10] = &X10; Xptr[11] = &X11; + Xptr[12] = &X12; Xptr[13] = &X13; Xptr[14] = &X14; Xptr[15] = &X15; +#endif + + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* this implementation only supports 2**64 input bytes (no carry out here) */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[ 0] = ctx->X[ 0]; + ks[ 1] = ctx->X[ 1]; + ks[ 2] = ctx->X[ 2]; + ks[ 3] = ctx->X[ 3]; + ks[ 4] = ctx->X[ 4]; + ks[ 5] = ctx->X[ 5]; + ks[ 6] = ctx->X[ 6]; + ks[ 7] = ctx->X[ 7]; + ks[ 8] = ctx->X[ 8]; + ks[ 9] = ctx->X[ 9]; + ks[10] = ctx->X[10]; + ks[11] = ctx->X[11]; + ks[12] = ctx->X[12]; + ks[13] = ctx->X[13]; + ks[14] = ctx->X[14]; + ks[15] = ctx->X[15]; + ks[16] = ks[ 0] ^ ks[ 1] ^ ks[ 2] ^ ks[ 3] ^ + ks[ 4] ^ ks[ 5] ^ ks[ 6] ^ ks[ 7] ^ + ks[ 8] ^ ks[ 9] ^ ks[10] ^ ks[11] ^ + ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + + X00 = w[ 0] + ks[ 0]; /* do the first full key injection */ + X01 = w[ 1] + ks[ 1]; + X02 = w[ 2] + ks[ 2]; + X03 = w[ 3] + ks[ 3]; + X04 = w[ 4] + ks[ 4]; + X05 = w[ 5] + ks[ 5]; + X06 = w[ 6] + ks[ 6]; + X07 = w[ 7] + ks[ 7]; + X08 = w[ 8] + ks[ 8]; + X09 = w[ 9] + ks[ 9]; + X10 = w[10] + ks[10]; + X11 = w[11] + ks[11]; + X12 = w[12] + ks[12]; + X13 = w[13] + ks[13] + ts[0]; + X14 = w[14] + ks[14] + ts[1]; + X15 = w[15] + ks[15]; + + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); + +#define Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ + X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ + X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ + X##p8 += X##p9; X##p9 = RotL_64(X##p9,ROT##_4); X##p9 ^= X##p8; \ + X##pA += X##pB; X##pB = RotL_64(X##pB,ROT##_5); X##pB ^= X##pA; \ + X##pC += X##pD; 
X##pD = RotL_64(X##pD,ROT##_6); X##pD ^= X##pC; \ + X##pE += X##pF; X##pF = RotL_64(X##pF,ROT##_7); X##pF ^= X##pE; \ + +#if SKEIN_UNROLL_1024 == 0 +#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ + Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rn,Xptr); + +#define I1024(R) \ + X00 += ks[((R)+ 1) % 17]; /* inject the key schedule value */ \ + X01 += ks[((R)+ 2) % 17]; \ + X02 += ks[((R)+ 3) % 17]; \ + X03 += ks[((R)+ 4) % 17]; \ + X04 += ks[((R)+ 5) % 17]; \ + X05 += ks[((R)+ 6) % 17]; \ + X06 += ks[((R)+ 7) % 17]; \ + X07 += ks[((R)+ 8) % 17]; \ + X08 += ks[((R)+ 9) % 17]; \ + X09 += ks[((R)+10) % 17]; \ + X10 += ks[((R)+11) % 17]; \ + X11 += ks[((R)+12) % 17]; \ + X12 += ks[((R)+13) % 17]; \ + X13 += ks[((R)+14) % 17] + ts[((R)+1) % 3]; \ + X14 += ks[((R)+15) % 17] + ts[((R)+2) % 3]; \ + X15 += ks[((R)+16) % 17] + (R)+1; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); +#else /* looping version */ +#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ + Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rn,Xptr); + +#define I1024(R) \ + X00 += ks[r+(R)+ 0]; /* inject the key schedule value */ \ + X01 += ks[r+(R)+ 1]; \ + X02 += ks[r+(R)+ 2]; \ + X03 += ks[r+(R)+ 3]; \ + X04 += ks[r+(R)+ 4]; \ + X05 += ks[r+(R)+ 5]; \ + X06 += ks[r+(R)+ 6]; \ + X07 += ks[r+(R)+ 7]; \ + X08 += ks[r+(R)+ 8]; \ + X09 += ks[r+(R)+ 9]; \ + X10 += ks[r+(R)+10]; \ + X11 += ks[r+(R)+11]; \ + X12 += ks[r+(R)+12]; \ + X13 += ks[r+(R)+13] + ts[r+(R)+0]; \ + X14 += ks[r+(R)+14] + ts[r+(R)+1]; \ + X15 += ks[r+(R)+15] + r+(R) ; \ + ks[r + (R)+16] = ks[r+(R)-1]; /* rotate key schedule */ \ + ts[r + (R)+ 2] = ts[r+(R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); + + for (r=1;r <= 2*RCNT;r+=2*SKEIN_UNROLL_1024) /* loop thru it */ +#endif + { +#define R1024_8_rounds(R) /* do 8 full rounds */ \ + 
R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_0,8*(R) + 1); \ + R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_1,8*(R) + 2); \ + R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_2,8*(R) + 3); \ + R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_3,8*(R) + 4); \ + I1024(2*(R)); \ + R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_4,8*(R) + 5); \ + R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_5,8*(R) + 6); \ + R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_6,8*(R) + 7); \ + R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_7,8*(R) + 8); \ + I1024(2*(R)+1); + + R1024_8_rounds( 0); + +#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_1024 > (NN))) + + #if R1024_Unroll_R( 1) + R1024_8_rounds( 1); + #endif + #if R1024_Unroll_R( 2) + R1024_8_rounds( 2); + #endif + #if R1024_Unroll_R( 3) + R1024_8_rounds( 3); + #endif + #if R1024_Unroll_R( 4) + R1024_8_rounds( 4); + #endif + #if R1024_Unroll_R( 5) + R1024_8_rounds( 5); + #endif + #if R1024_Unroll_R( 6) + R1024_8_rounds( 6); + #endif + #if R1024_Unroll_R( 7) + R1024_8_rounds( 7); + #endif + #if R1024_Unroll_R( 8) + R1024_8_rounds( 8); + #endif + #if R1024_Unroll_R( 9) + R1024_8_rounds( 9); + #endif + #if R1024_Unroll_R(10) + R1024_8_rounds(10); + #endif + #if R1024_Unroll_R(11) + R1024_8_rounds(11); + #endif + #if R1024_Unroll_R(12) + R1024_8_rounds(12); + #endif + #if R1024_Unroll_R(13) + R1024_8_rounds(13); + #endif + #if R1024_Unroll_R(14) + R1024_8_rounds(14); + #endif + #if (SKEIN_UNROLL_1024 > 14) +#error "need more unrolling in Skein_1024_Process_Block" + #endif + } + /* do the final "feedforward" xor, update context chaining vars */ + + ctx->X[ 0] = X00 ^ w[ 0]; + ctx->X[ 1] = X01 ^ w[ 1]; + ctx->X[ 2] = X02 ^ w[ 2]; + ctx->X[ 3] = X03 ^ w[ 3]; + ctx->X[ 4] = X04 ^ w[ 4]; + ctx->X[ 5] = X05 ^ w[ 5]; + ctx->X[ 6] = X06 ^ w[ 6]; + ctx->X[ 7] = X07 ^ w[ 7]; + ctx->X[ 8] 
= X08 ^ w[ 8]; + ctx->X[ 9] = X09 ^ w[ 9]; + ctx->X[10] = X10 ^ w[10]; + ctx->X[11] = X11 ^ w[11]; + ctx->X[12] = X12 ^ w[12]; + ctx->X[13] = X13 ^ w[13]; + ctx->X[14] = X14 ^ w[14]; + ctx->X[15] = X15 ^ w[15]; + + Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + blkPtr += SKEIN1024_BLOCK_BYTES; + } + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; + } + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t Skein1024_Process_Block_CodeSize(void) + { + return ((u08b_t *) Skein1024_Process_Block_CodeSize) - + ((u08b_t *) Skein1024_Process_Block); + } +uint_t Skein1024_Unroll_Cnt(void) + { + return SKEIN_UNROLL_1024; + } +#endif +#endif Index: sys/crypto/skein/skein_freebsd.h =================================================================== --- /dev/null +++ sys/crypto/skein/skein_freebsd.h @@ -0,0 +1,51 @@ +#ifndef _SKEIN_FREEBSD_H_ +#define _SKEIN_FREEBSD_H_ + +#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) + +#define SKEIN256_BLOCK_LENGTH SKEIN_256_BLOCK_BYTES +#define SKEIN256_DIGEST_LENGTH 32 +#define SKEIN256_DIGEST_STRING_LENGTH (SKEIN256_DIGEST_LENGTH * 2 + 1) +#define SKEIN512_BLOCK_LENGTH SKEIN_512_BLOCK_BYTES +#define SKEIN512_DIGEST_LENGTH 64 +#define SKEIN512_DIGEST_STRING_LENGTH (SKEIN512_DIGEST_LENGTH * 2 + 1) +#define SKEIN1024_BLOCK_LENGTH SKEIN1024_BLOCK_BYTES +#define SKEIN1024_DIGEST_LENGTH 128 +#define SKEIN1024_DIGEST_STRING_LENGTH (SKEIN1024_DIGEST_LENGTH * 2 + 1) + +/* Make the context types look like the other hashes on FreeBSD */ +typedef Skein_256_Ctxt_t SKEIN256_CTX; +typedef Skein_512_Ctxt_t SKEIN512_CTX; +typedef Skein1024_Ctxt_t SKEIN1024_CTX; + +/* Make the prototypes look like the other hashes */ +void SKEIN256_Init (SKEIN256_CTX *ctx); +void SKEIN512_Init (SKEIN512_CTX *ctx); +void SKEIN1024_Init (SKEIN1024_CTX *ctx); + +void 
SKEIN256_Update(SKEIN256_CTX * ctx, const void *in, size_t len); +void SKEIN512_Update(SKEIN512_CTX * ctx, const void *in, size_t len); +void SKEIN1024_Update(SKEIN1024_CTX * ctx, const void *in, size_t len); + +void SKEIN256_Final(unsigned char digest[SKEIN256_DIGEST_LENGTH], SKEIN256_CTX * ctx); +void SKEIN512_Final(unsigned char digest[SKEIN512_DIGEST_LENGTH], SKEIN512_CTX * ctx); +void SKEIN1024_Final(unsigned char digest[SKEIN1024_DIGEST_LENGTH], SKEIN1024_CTX * ctx); + +#ifndef _KERNEL +char *SKEIN256_End(SKEIN256_CTX *, char *); +char *SKEIN512_End(SKEIN512_CTX *, char *); +char *SKEIN1024_End(SKEIN1024_CTX *, char *); +char *SKEIN256_Data(const void *, unsigned int, char *); +char *SKEIN512_Data(const void *, unsigned int, char *); +char *SKEIN1024_Data(const void *, unsigned int, char *); +char *SKEIN256_File(const char *, char *); +char *SKEIN512_File(const char *, char *); +char *SKEIN1024_File(const char *, char *); +char *SKEIN256_FileChunk(const char *, char *, off_t, off_t); +char *SKEIN512_FileChunk(const char *, char *, off_t, off_t); +char *SKEIN1024_FileChunk(const char *, char *, off_t, off_t); +#endif + +#endif /* ifndef _SKEIN_FREEBSD_H_ */ Index: sys/crypto/skein/skein_iv.h =================================================================== --- /dev/null +++ sys/crypto/skein/skein_iv.h @@ -0,0 +1,199 @@ +#ifndef _SKEIN_IV_H_ +#define _SKEIN_IV_H_ + +#include "skein.h" /* get Skein macros and types */ + +/* +***************** Pre-computed Skein IVs ******************* +** +** NOTE: these values are not "magic" constants, but +** are generated using the Threefish block function. +** They are pre-computed here only for speed; i.e., to +** avoid the need for a Threefish call during Init(). +** +** The IV for any fixed hash length may be pre-computed. +** Only the most common values are included here. +** +************************************************************ +**/ + +#define MK_64 SKEIN_MK_64 + +/* blkSize = 256 bits. 
hashSize = 128 bits */ +const u64b_t SKEIN_256_IV_128[] = + { + MK_64(0xE1111906,0x964D7260), + MK_64(0x883DAAA7,0x7C8D811C), + MK_64(0x10080DF4,0x91960F7A), + MK_64(0xCCF7DDE5,0xB45BC1C2) + }; + +/* blkSize = 256 bits. hashSize = 160 bits */ +const u64b_t SKEIN_256_IV_160[] = + { + MK_64(0x14202314,0x72825E98), + MK_64(0x2AC4E9A2,0x5A77E590), + MK_64(0xD47A5856,0x8838D63E), + MK_64(0x2DD2E496,0x8586AB7D) + }; + +/* blkSize = 256 bits. hashSize = 224 bits */ +const u64b_t SKEIN_256_IV_224[] = + { + MK_64(0xC6098A8C,0x9AE5EA0B), + MK_64(0x876D5686,0x08C5191C), + MK_64(0x99CB88D7,0xD7F53884), + MK_64(0x384BDDB1,0xAEDDB5DE) + }; + +/* blkSize = 256 bits. hashSize = 256 bits */ +const u64b_t SKEIN_256_IV_256[] = + { + MK_64(0xFC9DA860,0xD048B449), + MK_64(0x2FCA6647,0x9FA7D833), + MK_64(0xB33BC389,0x6656840F), + MK_64(0x6A54E920,0xFDE8DA69) + }; + +/* blkSize = 512 bits. hashSize = 128 bits */ +const u64b_t SKEIN_512_IV_128[] = + { + MK_64(0xA8BC7BF3,0x6FBF9F52), + MK_64(0x1E9872CE,0xBD1AF0AA), + MK_64(0x309B1790,0xB32190D3), + MK_64(0xBCFBB854,0x3F94805C), + MK_64(0x0DA61BCD,0x6E31B11B), + MK_64(0x1A18EBEA,0xD46A32E3), + MK_64(0xA2CC5B18,0xCE84AA82), + MK_64(0x6982AB28,0x9D46982D) + }; + +/* blkSize = 512 bits. hashSize = 160 bits */ +const u64b_t SKEIN_512_IV_160[] = + { + MK_64(0x28B81A2A,0xE013BD91), + MK_64(0xC2F11668,0xB5BDF78F), + MK_64(0x1760D8F3,0xF6A56F12), + MK_64(0x4FB74758,0x8239904F), + MK_64(0x21EDE07F,0x7EAF5056), + MK_64(0xD908922E,0x63ED70B8), + MK_64(0xB8EC76FF,0xECCB52FA), + MK_64(0x01A47BB8,0xA3F27A6E) + }; + +/* blkSize = 512 bits. hashSize = 224 bits */ +const u64b_t SKEIN_512_IV_224[] = + { + MK_64(0xCCD06162,0x48677224), + MK_64(0xCBA65CF3,0xA92339EF), + MK_64(0x8CCD69D6,0x52FF4B64), + MK_64(0x398AED7B,0x3AB890B4), + MK_64(0x0F59D1B1,0x457D2BD0), + MK_64(0x6776FE65,0x75D4EB3D), + MK_64(0x99FBC70E,0x997413E9), + MK_64(0x9E2CFCCF,0xE1C41EF7) + }; + +/* blkSize = 512 bits. 
hashSize = 256 bits */ +const u64b_t SKEIN_512_IV_256[] = + { + MK_64(0xCCD044A1,0x2FDB3E13), + MK_64(0xE8359030,0x1A79A9EB), + MK_64(0x55AEA061,0x4F816E6F), + MK_64(0x2A2767A4,0xAE9B94DB), + MK_64(0xEC06025E,0x74DD7683), + MK_64(0xE7A436CD,0xC4746251), + MK_64(0xC36FBAF9,0x393AD185), + MK_64(0x3EEDBA18,0x33EDFC13) + }; + +/* blkSize = 512 bits. hashSize = 384 bits */ +const u64b_t SKEIN_512_IV_384[] = + { + MK_64(0xA3F6C6BF,0x3A75EF5F), + MK_64(0xB0FEF9CC,0xFD84FAA4), + MK_64(0x9D77DD66,0x3D770CFE), + MK_64(0xD798CBF3,0xB468FDDA), + MK_64(0x1BC4A666,0x8A0E4465), + MK_64(0x7ED7D434,0xE5807407), + MK_64(0x548FC1AC,0xD4EC44D6), + MK_64(0x266E1754,0x6AA18FF8) + }; + +/* blkSize = 512 bits. hashSize = 512 bits */ +const u64b_t SKEIN_512_IV_512[] = + { + MK_64(0x4903ADFF,0x749C51CE), + MK_64(0x0D95DE39,0x9746DF03), + MK_64(0x8FD19341,0x27C79BCE), + MK_64(0x9A255629,0xFF352CB1), + MK_64(0x5DB62599,0xDF6CA7B0), + MK_64(0xEABE394C,0xA9D5C3F4), + MK_64(0x991112C7,0x1A75B523), + MK_64(0xAE18A40B,0x660FCC33) + }; + +/* blkSize = 1024 bits. hashSize = 384 bits */ +const u64b_t SKEIN1024_IV_384[] = + { + MK_64(0x5102B6B8,0xC1894A35), + MK_64(0xFEEBC9E3,0xFE8AF11A), + MK_64(0x0C807F06,0xE32BED71), + MK_64(0x60C13A52,0xB41A91F6), + MK_64(0x9716D35D,0xD4917C38), + MK_64(0xE780DF12,0x6FD31D3A), + MK_64(0x797846B6,0xC898303A), + MK_64(0xB172C2A8,0xB3572A3B), + MK_64(0xC9BC8203,0xA6104A6C), + MK_64(0x65909338,0xD75624F4), + MK_64(0x94BCC568,0x4B3F81A0), + MK_64(0x3EBBF51E,0x10ECFD46), + MK_64(0x2DF50F0B,0xEEB08542), + MK_64(0x3B5A6530,0x0DBC6516), + MK_64(0x484B9CD2,0x167BBCE1), + MK_64(0x2D136947,0xD4CBAFEA) + }; + +/* blkSize = 1024 bits. 
hashSize = 512 bits */ +const u64b_t SKEIN1024_IV_512[] = + { + MK_64(0xCAEC0E5D,0x7C1B1B18), + MK_64(0xA01B0E04,0x5F03E802), + MK_64(0x33840451,0xED912885), + MK_64(0x374AFB04,0xEAEC2E1C), + MK_64(0xDF25A0E2,0x813581F7), + MK_64(0xE4004093,0x8B12F9D2), + MK_64(0xA662D539,0xC2ED39B6), + MK_64(0xFA8B85CF,0x45D8C75A), + MK_64(0x8316ED8E,0x29EDE796), + MK_64(0x053289C0,0x2E9F91B8), + MK_64(0xC3F8EF1D,0x6D518B73), + MK_64(0xBDCEC3C4,0xD5EF332E), + MK_64(0x549A7E52,0x22974487), + MK_64(0x67070872,0x5B749816), + MK_64(0xB9CD28FB,0xF0581BD1), + MK_64(0x0E2940B8,0x15804974) + }; + +/* blkSize = 1024 bits. hashSize = 1024 bits */ +const u64b_t SKEIN1024_IV_1024[] = + { + MK_64(0xD593DA07,0x41E72355), + MK_64(0x15B5E511,0xAC73E00C), + MK_64(0x5180E5AE,0xBAF2C4F0), + MK_64(0x03BD41D3,0xFCBCAFAF), + MK_64(0x1CAEC6FD,0x1983A898), + MK_64(0x6E510B8B,0xCDD0589F), + MK_64(0x77E2BDFD,0xC6394ADA), + MK_64(0xC11E1DB5,0x24DCB0A3), + MK_64(0xD6D14AF9,0xC6329AB5), + MK_64(0x6A9B0BFC,0x6EB67E0D), + MK_64(0x9243C60D,0xCCFF1332), + MK_64(0x1A1F1DDE,0x743F02D4), + MK_64(0x0996753C,0x10ED0BB8), + MK_64(0x6572DD22,0xF2B4969A), + MK_64(0x61FD3062,0xD00A579A), + MK_64(0x1DE0536E,0x8682E539) + }; + +#endif /* _SKEIN_IV_H_ */ Index: sys/crypto/skein/skein_port.h =================================================================== --- /dev/null +++ sys/crypto/skein/skein_port.h @@ -0,0 +1,167 @@ +#ifndef _SKEIN_PORT_H_ +#define _SKEIN_PORT_H_ +/******************************************************************* +** +** Platform-specific definitions for Skein hash function. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +** Many thanks to Brian Gladman for his portable header files. +** +** To port Skein to an "unsupported" platform, change the definitions +** in this file appropriately. 
+** +********************************************************************/ + +#ifndef _KERNEL +#include +#endif + +#include "brg_types.h" /* get integer type definitions */ + +typedef unsigned int uint_t; /* native unsigned integer */ +typedef uint_8t u08b_t; /* 8-bit unsigned integer */ +typedef uint_64t u64b_t; /* 64-bit unsigned integer */ + +#ifndef RotL_64 +#define RotL_64(x,N) (((x) << (N)) | ((x) >> (64-(N)))) +#endif + +/* + * Skein is "natively" little-endian (unlike SHA-xxx), for optimal + * performance on x86 CPUs. The Skein code requires the following + * definitions for dealing with endianness: + * + * SKEIN_NEED_SWAP: 0 for little-endian, 1 for big-endian + * Skein_Put64_LSB_First + * Skein_Get64_LSB_First + * Skein_Swap64 + * + * If SKEIN_NEED_SWAP is defined at compile time, it is used here + * along with the portable versions of Put64/Get64/Swap64, which + * are slow in general. + * + * Otherwise, an "auto-detect" of endianness is attempted below. + * If the default handling doesn't work well, the user may insert + * platform-specific code instead (e.g., for big-endian CPUs). + * + */ +#ifndef SKEIN_NEED_SWAP /* compile-time "override" for endianness? */ + +#include "brg_endian.h" /* get endianness selection */ +#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN + /* here for big-endian CPUs */ +#define SKEIN_NEED_SWAP (1) +#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN + /* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */ +#define SKEIN_NEED_SWAP (0) +#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */ +#define Skein_Put64_LSB_First(dst08,src64,bCnt) memcpy(dst08,src64,bCnt) +#define Skein_Get64_LSB_First(dst64,src08,wCnt) memcpy(dst64,src08,8*(wCnt)) +#endif +#else +#error "Skein needs endianness setting!" +#endif + +#endif /* ifndef SKEIN_NEED_SWAP */ + +/* + ****************************************************************** + * Provide any definitions still needed. 
+ ****************************************************************** + */ +#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */ +#if SKEIN_NEED_SWAP +#define Skein_Swap64(w64) \ + ( (( ((u64b_t)(w64)) & 0xFF) << 56) | \ + (((((u64b_t)(w64)) >> 8) & 0xFF) << 48) | \ + (((((u64b_t)(w64)) >>16) & 0xFF) << 40) | \ + (((((u64b_t)(w64)) >>24) & 0xFF) << 32) | \ + (((((u64b_t)(w64)) >>32) & 0xFF) << 24) | \ + (((((u64b_t)(w64)) >>40) & 0xFF) << 16) | \ + (((((u64b_t)(w64)) >>48) & 0xFF) << 8) | \ + (((((u64b_t)(w64)) >>56) & 0xFF) ) ) +#else +#define Skein_Swap64(w64) (w64) +#endif +#endif /* ifndef Skein_Swap64 */ + + +#ifndef Skein_Put64_LSB_First +void Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt) +#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ + { /* this version is fully portable (big-endian or little-endian), but slow */ + size_t n; + + for (n=0;n>3] >> (8*(n&7))); + } +#else + ; /* output only the function prototype */ +#endif +#endif /* ifndef Skein_Put64_LSB_First */ + + +#ifndef Skein_Get64_LSB_First +void Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt) +#ifdef SKEIN_PORT_CODE /* instantiate the function code here? 
*/
+    { /* this version is fully portable (big-endian or little-endian), but slow */
+    size_t n;                                     /* byte offset into src */
+
+    for (n=0;n<8*wCnt;n+=8)                       /* one 64-bit word per 8 input bytes */
+        dst[n/8] = (((u64b_t) src[n  ])      ) +
+                   (((u64b_t) src[n+1]) <<  8) +
+                   (((u64b_t) src[n+2]) << 16) +
+                   (((u64b_t) src[n+3]) << 24) +
+                   (((u64b_t) src[n+4]) << 32) +
+                   (((u64b_t) src[n+5]) << 40) +
+                   (((u64b_t) src[n+6]) << 48) +
+                   (((u64b_t) src[n+7]) << 56) ;
+    }
+#else
+    ;    /* output only the function prototype */
+#endif
+#endif   /* ifndef Skein_Get64_LSB_First */
+
+/* Start FreeBSD libmd shims */
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#ifndef Skein_256_Init
+#define Skein_256_Init		_libmd_SKEIN256_Init
+#define Skein_512_Init		_libmd_SKEIN512_Init
+#define Skein1024_Init		_libmd_SKEIN1024_Init	/* function is Skein1024_*: no '_' before 1024 */
+#endif
+#ifndef Skein_256_Update
+#define Skein_256_Update	_libmd_SKEIN256_Update
+#define Skein_512_Update	_libmd_SKEIN512_Update
+#define Skein1024_Update	_libmd_SKEIN1024_Update	/* function is Skein1024_*: no '_' before 1024 */
+#endif
+#ifndef Skein_256_Final
+#define Skein_256_Final		_libmd_SKEIN256_Final
+#define Skein_512_Final		_libmd_SKEIN512_Final
+#define Skein1024_Final		_libmd_SKEIN1024_Final
+#endif
+#ifndef Skein_256_End	/* NOTE(review): no Skein_*_End/_File/_FileChunk/_Data users are visible in this patch chunk; confirm the spellings below (incl. "Skein_1024_") against the skein*hl.c sources */
+#define Skein_256_End		_libmd_SKEIN256_End
+#define Skein_512_End		_libmd_SKEIN512_End
+#define Skein_1024_End		_libmd_SKEIN1024_End
+#endif
+#ifndef Skein_256_File
+#define Skein_256_File		_libmd_SKEIN256_File
+#define Skein_512_File		_libmd_SKEIN512_File
+#define Skein_1024_File		_libmd_SKEIN1024_File
+#endif
+#ifndef Skein_256_FileChunk
+#define Skein_256_FileChunk	_libmd_SKEIN256_FileChunk
+#define Skein_512_FileChunk	_libmd_SKEIN512_FileChunk
+#define Skein_1024_FileChunk	_libmd_SKEIN1024_FileChunk
+#endif
+#ifndef Skein_256_Data
+#define Skein_256_Data		_libmd_SKEIN256_Data
+#define Skein_512_Data		_libmd_SKEIN512_Data
+#define Skein_1024_Data		_libmd_SKEIN1024_Data
+#endif
+
+#endif   /* ifndef _SKEIN_PORT_H_ */
Index: sys/modules/crypto/Makefile
===================================================================
--- sys/modules/crypto/Makefile
+++ 
sys/modules/crypto/Makefile @@ -8,6 +8,7 @@ .PATH: ${.CURDIR}/../../crypto/rijndael .PATH: ${.CURDIR}/../../crypto/sha2 .PATH: ${.CURDIR}/../../crypto/siphash +.PATH: ${.CURDIR}/../../crypto/skein KMOD = crypto SRCS = crypto.c cryptodev_if.c @@ -17,6 +18,13 @@ SRCS += camellia.c camellia-api.c SRCS += des_ecb.c des_enc.c des_setkey.c SRCS += sha1.c sha256c.c sha512c.c +SRCS += skein.c skein_block.c +.if exists(${MACHINE_ARCH}/skein_block_asm.s) +.PATH: ${.CURDIR}/../../crypto/skein/${MACHINE_ARCH} +SRCS += skein_block_asm.s +CFLAGS += -DSKEIN_ASM -DSKEIN_USE_ASM=1792 # SKEIN_USE_ASM is a bitmask of the block sizes whose C block function is replaced with assembly: 256+512+1024 = 1792 +ACFLAGS += -DELF -Wa,--noexecstack +.endif SRCS += siphash.c SRCS += gmac.c gfmult.c SRCS += opt_param.h cryptodev_if.h bus_if.h device_if.h