diff --git a/lib/libmd/Makefile b/lib/libmd/Makefile --- a/lib/libmd/Makefile +++ b/lib/libmd/Makefile @@ -117,9 +117,12 @@ .endif .if ${USE_ASM_SOURCES} != 0 -.if exists(${MACHINE_ARCH}/sha.S) -SRCS+= sha.S +.if exists(${MACHINE_ARCH}/sha1block.S) +SRCS+= sha1block.S CFLAGS+= -DSHA1_ASM +.if exists(${MACHINE_ARCH}/sha1dispatch.c) +SRCS+= sha1dispatch.c +.endif .endif .if exists(${MACHINE_ARCH}/rmd160.S) SRCS+= rmd160.S @@ -135,7 +138,7 @@ # the assembly vs C versions, and skein_block needs to be rebuilt if it changes. skein_block.o skein_block.pico: Makefile .endif -.if exists(${MACHINE_ARCH}/sha.S) || exists(${MACHINE_ARCH}/rmd160.S) || exists(${MACHINE_ARCH}/skein_block_asm.S) +.if exists(${MACHINE_ARCH}/rmd160.S) || exists(${MACHINE_ARCH}/skein_block_asm.S) ACFLAGS+= -DELF -Wa,--noexecstack .endif .if ${MACHINE_CPUARCH} == "aarch64" diff --git a/lib/libmd/aarch64/sha1block.S b/lib/libmd/aarch64/sha1block.S new file mode 100644 --- /dev/null +++ b/lib/libmd/aarch64/sha1block.S @@ -0,0 +1,490 @@ +/*- + * Copyright (c) 2024 Robert Clausecker + * + * SPDX-License-Identifier: BSD-2-Clause + * + * sha1block_sha1 implementation based on sha1-arm.c, + * written and placed in public domain by Jeffrey Walton + * based on code from ARM, and by Johannes Schneiders, Skip + * Hovsmith and Barry O'Rourke for the mbedTLS project. + */ + +#include + +/* + * Scalar SHA1 implementation. + * + * Due to the ample register file available on AArch64, the w array is + * kept entirely in registers. The saved a-e variables are instead kept + * in memory as we don't have that much memory. + */ + + // sha1block(SHA1_CTX, buf, len) +ENTRY(_libmd_sha1block_scalar) +ctx .req x0 +buf .req x1 +len .req x2 +w .req sp +a .req w3 +b .req w4 +c .req w5 +d .req w6 +e .req w7 +k .req w8 +f .req w9 +tmp .req w10 +w_0 .req w11 +w_1 .req w12 +w_2 .req w13 +w_3 .req w14 +w_4 .req w15 +w_5 .req w16 +w_6 .req w17 +// w18 is the platform register +w_7 .req w19 +w_8 .req w20 +w_9 .req w21 +w_10 .req w22 +w_11 .req w23 +w_12 .req w24 +w_13 .req w25 +w_14 .req w26 +w_15 .req w27 + +.macro shuffle w_i, w_i3, w_i8, w_i14 + eor \w_i, \w_i, \w_i3 + eor tmp, \w_i8, \w_i14 + eor \w_i, \w_i, tmp // w[i-16] ^ w[i-14] ^ w[i-8] ^ w[i-3] + ror \w_i, \w_i, #31 // w[i] = ... 
ror #31 +.endm + +.macro func1 a, b, c, d, e + and f, \c, \b + bic tmp, \d, \b + orr f, f, tmp +.endm + +.macro func2 a, b, c, d, e + eor f, \b, \c + eor f, f, \d +.endm + +.macro func3 a, b, c, d, e + eor tmp, \b, \c + and f, \b, \c + and tmp, tmp, \d + orr f, f, tmp +.endm + +.macro func4 a, b, c, d, e + func2 \a, \b, \c, \d, \e +.endm + +.macro mix a, b, c, d, e, w_i + ror \b, \b, #2 + ror tmp, \a, #27 + add \e, \e, \w_i + add tmp, tmp, k + add \e, \e, f + add \e, \e, tmp // (a ror 27) + e + f + k + w[i] +.endm + +.macro round1 a, b, c, d, e, w_i + func1 \a, \b, \c, \d, \e + rev \w_i, \w_i + mix \a, \b, \c, \d, \e, \w_i +.endm + +.macro round func, a, b, c, d, e, w_i, w_i3, w_i8, w_i14 + shuffle \w_i, \w_i3, \w_i8, \w_i14 + \func \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, \w_i +.endm + +.macro round1x a, b, c, d, e, w_i, w_i3, w_i8, w_i14 + round func1, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14 +.endm + +.macro round2 a, b, c, d, e, w_i, w_i3, w_i8, w_i14 + round func2, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14 +.endm + +.macro round3 a, b, c, d, e, w_i, w_i3, w_i8, w_i14 + round func3, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14 +.endm + +.macro round4 a, b, c, d, e, w_i, w_i3, w_i8, w_i14 + round func4, \a, \b, \c, \d, \e, \w_i, \w_i3, \w_i8, \w_i14 +.endm + + ands len, len, #~63 // take length in multiples of block length + beq 1f // bail out if input empty + + sub sp, sp, #24+9*8 // allocate stack space + str x19, [sp, #24+0*8] + stp x20, x21, [sp, #24+1*8] + stp x22, x23, [sp, #24+3*8] + stp x24, x25, [sp, #24+5*8] + stp x26, x27, [sp, #24+7*8] + + ldp a, b, [ctx, #0] // load SHA1 state from context + ldp c, d, [ctx, #8] + ldr e, [ctx, #16] + +0: stp a, b, [sp, #0] // save old SHA1 state + stp c, d, [sp, #8] + str e, [sp, #16] + + movz k, #0x7999 // round constant 1 + movk k, #0x5a82, lsl #16 + + ldp w_0, w_1, [buf, #0*4] + round1 a, b, c, d, e, w_0 + round1 e, a, b, c, d, w_1 + + ldp w_2, w_3, [buf, #2*4] + round1 d, e, a, b, c, w_2 + round1 c, d, e, a, b, w_3 + + ldp w_4, w_5, [buf, #4*4] + round1 b, c, d, e, a, w_4 + round1 a, b, c, d, e, w_5 + + ldp w_6, w_7, [buf, #6*4] + round1 e, a, b, c, d, w_6 + round1 d, e, a, b, c, w_7 + + ldp w_8, w_9, [buf, #8*4] + round1 c, d, e, a, b, w_8 + round1 b, c, d, e, a, w_9 + + ldp w_10, w_11, [buf, #10*4] + round1 a, b, c, d, e, w_10 + round1 e, a, b, c, d, w_11 + + ldp w_12, w_13, [buf, #12*4] + round1 d, e, a, b, c, w_12 + round1 c, d, e, a, b, w_13 + + ldp w_14, w_15, [buf, #14*4] + round1 b, c, d, e, a, w_14 + round1 a, b, c, d, e, w_15 + + round1x e, a, b, c, d, w_0, w_13, w_8, w_2 + round1x d, e, a, b, c, w_1, w_14, w_9, w_3 + round1x c, d, e, a, b, w_2, w_15, w_10, w_4 + round1x b, c, d, e, a, w_3, w_0, w_11, w_5 + + movz k, #0xeba1 // round constant 2 + movk k, #0x6ed9, lsl #16 + + round2 a, b, c, d, e, w_4, w_1, w_12, w_6 + round2 e, a, b, c, d, w_5, w_2, w_13, w_7 + round2 d, e, a, b, c, w_6, w_3, w_14, w_8 + round2 c, d, e, a, b, w_7, w_4, w_15, w_9 + round2 b, c, d, e, a, w_8, w_5, w_0, w_10 + + round2 a, b, c, d, e, w_9, w_6, w_1, w_11 + round2 e, a, b, c, d, w_10, w_7, w_2, w_12 + round2 d, e, a, b, c, w_11, w_8, w_3, w_13 + round2 c, d, e, a, b, w_12, w_9, w_4, w_14 + round2 b, c, d, e, a, w_13, w_10, w_5, w_15 + + round2 a, b, c, d, e, w_14, w_11, w_6, w_0 + round2 e, a, b, c, d, w_15, w_12, w_7, w_1 + round2 d, e, a, b, c, w_0, w_13, w_8, w_2 + round2 c, d, e, a, b, w_1, w_14, w_9, w_3 + round2 b, c, d, e, a, w_2, w_15, w_10, w_4 + + round2 a, b, c, d, e, w_3, w_0, w_11, w_5 + round2 e, a, b, c, d, w_4, w_1, 
w_12, w_6 + round2 d, e, a, b, c, w_5, w_2, w_13, w_7 + round2 c, d, e, a, b, w_6, w_3, w_14, w_8 + round2 b, c, d, e, a, w_7, w_4, w_15, w_9 + + movz k, #0xbcdc // round constant 3 + movk k, #0x8f1b, lsl #16 + + round3 a, b, c, d, e, w_8, w_5, w_0, w_10 + round3 e, a, b, c, d, w_9, w_6, w_1, w_11 + round3 d, e, a, b, c, w_10, w_7, w_2, w_12 + round3 c, d, e, a, b, w_11, w_8, w_3, w_13 + round3 b, c, d, e, a, w_12, w_9, w_4, w_14 + + round3 a, b, c, d, e, w_13, w_10, w_5, w_15 + round3 e, a, b, c, d, w_14, w_11, w_6, w_0 + round3 d, e, a, b, c, w_15, w_12, w_7, w_1 + round3 c, d, e, a, b, w_0, w_13, w_8, w_2 + round3 b, c, d, e, a, w_1, w_14, w_9, w_3 + + round3 a, b, c, d, e, w_2, w_15, w_10, w_4 + round3 e, a, b, c, d, w_3, w_0, w_11, w_5 + round3 d, e, a, b, c, w_4, w_1, w_12, w_6 + round3 c, d, e, a, b, w_5, w_2, w_13, w_7 + round3 b, c, d, e, a, w_6, w_3, w_14, w_8 + + round3 a, b, c, d, e, w_7, w_4, w_15, w_9 + round3 e, a, b, c, d, w_8, w_5, w_0, w_10 + round3 d, e, a, b, c, w_9, w_6, w_1, w_11 + round3 c, d, e, a, b, w_10, w_7, w_2, w_12 + round3 b, c, d, e, a, w_11, w_8, w_3, w_13 + + movz k, #0xc1d6 // round constant 4 + movk k, #0xca62, lsl #16 + + round4 a, b, c, d, e, w_12, w_9, w_4, w_14 + round4 e, a, b, c, d, w_13, w_10, w_5, w_15 + round4 d, e, a, b, c, w_14, w_11, w_6, w_0 + round4 c, d, e, a, b, w_15, w_12, w_7, w_1 + round4 b, c, d, e, a, w_0, w_13, w_8, w_2 + + round4 a, b, c, d, e, w_1, w_14, w_9, w_3 + round4 e, a, b, c, d, w_2, w_15, w_10, w_4 + round4 d, e, a, b, c, w_3, w_0, w_11, w_5 + round4 c, d, e, a, b, w_4, w_1, w_12, w_6 + round4 b, c, d, e, a, w_5, w_2, w_13, w_7 + + round4 a, b, c, d, e, w_6, w_3, w_14, w_8 + round4 e, a, b, c, d, w_7, w_4, w_15, w_9 + round4 d, e, a, b, c, w_8, w_5, w_0, w_10 + round4 c, d, e, a, b, w_9, w_6, w_1, w_11 + round4 b, c, d, e, a, w_10, w_7, w_2, w_12 + + round4 a, b, c, d, e, w_11, w_8, w_3, w_13 + round4 e, a, b, c, d, w_12, w_9, w_4, w_14 + round4 d, e, a, b, c, w_13, w_10, w_5, w_15 + round4 c, d, e, a, b, w_14, w_11, w_6, w_0 + round4 b, c, d, e, a, w_15, w_12, w_7, w_1 + + ldp w_0, w_1, [sp, #0] // reload saved SHA1 state + ldp w_2, w_3, [sp, #8] + ldr w_4, [sp, #16] + + add a, a, w_0 + add b, b, w_1 + add c, c, w_2 + add d, d, w_3 + add e, e, w_4 + + add buf, buf, #64 + subs len, len, #64 + bhi 0b + + stp a, b, [ctx, #0] // write updated SHA1 state + stp c, d, [ctx, #8] + str e, [ctx, #16] + + ldr x19, [sp, #24+0*8] + ldp x20, x21, [sp, #24+1*8] + ldp x22, x23, [sp, #24+3*8] + ldp x24, x25, [sp, #24+5*8] + ldp x26, x27, [sp, #24+7*8] + add sp, sp, #24+9*8 + +1: ret +END(_libmd_sha1block_scalar) + +/* + * SHA1 implementation using the SHA1 instruction set extension. 
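+ *
+ * Each sha1c/sha1p/sha1m instruction below performs four rounds at
+ * once on the packed state in abcd, sha1h derives the rotated e input
+ * for the next group, and sha1su0/sha1su1 update the message schedule.
+ * A hedged C sketch of one such four-round group, using the ACLE
+ * intrinsics that correspond to these instructions (variable names are
+ * illustrative, not taken from this file; requires <arm_neon.h> and
+ * the sha2 target feature):
+ *
+ *	uint32x4_t wk = vaddq_u32(msg0, k0);	// w[i..i+3] + K
+ *	uint32_t e_next = vsha1h_u32(vgetq_lane_u32(abcd, 0));
+ *	abcd = vsha1cq_u32(abcd, e, wk);	// four rounds, Ch function
+ *	msg0 = vsha1su1q_u32(vsha1su0q_u32(msg0, msg1, msg2), msg3);
+ *
+ * The code below follows the same pattern, hand-scheduled.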
+ */ + + .arch_extension sha2 + + // sha1block(SHA1_CTX, buf, len) +ENTRY(_libmd_sha1block_sha1) + /* ctx, buf, len: same as for sha1block_scalar */ +kaddr .req x3 +abcd .req v0 +abcd_q .req q0 // alias for use with scalar instructions +abcd_s .req s0 +e0 .req s1 +e0_v .req v1 +e1 .req s2 +abcd_saved .req v3 +e0_saved .req v4 +tmp0 .req v5 +tmp1 .req v6 +msg0 .req v16 +msg1 .req v17 +msg2 .req v18 +msg3 .req v19 +k0 .req v20 +k1 .req v21 +k2 .req v22 +k3 .req v23 + + ands len, len, #~63 // take length in multiples of block length + beq 1f // bail out if input empty + + ldr abcd_q, [ctx, #0] + ldr e0, [ctx, #16] + + adrp kaddr, k1234 + add kaddr, kaddr, #:lo12:k1234 + ld4r {k0.4s, k1.4s, k2.4s, k3.4s}, [kaddr] + +0: mov abcd_saved.16b, abcd.16b + mov e0_saved.16b, e0_v.16b + + ld1 {msg0.4s, msg1.4s, msg2.4s, msg3.4s}, [buf], #64 + rev32 msg0.16b, msg0.16b + rev32 msg1.16b, msg1.16b + rev32 msg2.16b, msg2.16b + rev32 msg3.16b, msg3.16b + + add tmp0.4s, msg0.4s, k0.4s + add tmp1.4s, msg1.4s, k0.4s + + /* rounds 0--3 */ + sha1h e1, abcd_s + sha1c abcd_q, e0, tmp0.4s + add tmp0.4s, msg2.4s, k0.4s + sha1su0 msg0.4s, msg1.4s, msg2.4s + + /* rounds 4--7 */ + sha1h e0, abcd_s + sha1c abcd_q, e1, tmp1.4s + add tmp1.4s, msg3.4s, k0.4s + sha1su1 msg0.4s, msg3.4s + sha1su0 msg1.4s, msg2.4s, msg3.4s + + /* rounds 8--11 */ + sha1h e1, abcd_s + sha1c abcd_q, e0, tmp0.4s + add tmp0.4s, msg0.4s, k0.4s + sha1su1 msg1.4s, msg0.4s + sha1su0 msg2.4s, msg3.4s, msg0.4s + + /* rounds 12--15 */ + sha1h e0, abcd_s + sha1c abcd_q, e1, tmp1.4s + add tmp1.4s, msg1.4s, k1.4s + sha1su1 msg2.4s, msg1.4s + sha1su0 msg3.4s, msg0.4s, msg1.4s + + /* rounds 16--19 */ + sha1h e1, abcd_s + sha1c abcd_q, e0, tmp0.4s + add tmp0.4s, msg2.4s, k1.4s + sha1su1 msg3.4s, msg2.4s + sha1su0 msg0.4s, msg1.4s, msg2.4s + + /* rounds 20--23 */ + sha1h e0, abcd_s + sha1p abcd_q, e1, tmp1.4s + add tmp1.4s, msg3.4s, k1.4s + sha1su1 msg0.4s, msg3.4s + sha1su0 msg1.4s, msg2.4s, msg3.4s + + /* rounds 24--27 */ + sha1h e1, abcd_s + sha1p abcd_q, e0, tmp0.4s + add tmp0.4s, msg0.4s, k1.4s + sha1su1 msg1.4s, msg0.4s + sha1su0 msg2.4s, msg3.4s, msg0.4s + + /* rounds 28--31 */ + sha1h e0, abcd_s + sha1p abcd_q, e1, tmp1.4s + add tmp1.4s, msg1.4s, k1.4s + sha1su1 msg2.4s, msg1.4s + sha1su0 msg3.4s, msg0.4s, msg1.4s + + /* rounds 32--35 */ + sha1h e1, abcd_s + sha1p abcd_q, e0, tmp0.4s + add tmp0.4s, msg2.4s, k2.4s + sha1su1 msg3.4s, msg2.4s + sha1su0 msg0.4s, msg1.4s, msg2.4s + + /* rounds 36--39 */ + sha1h e0, abcd_s + sha1p abcd_q, e1, tmp1.4s + add tmp1.4s, msg3.4s, k2.4s + sha1su1 msg0.4s, msg3.4s + sha1su0 msg1.4s, msg2.4s, msg3.4s + + /* rounds 40--43 */ + sha1h e1, abcd_s + sha1m abcd_q, e0, tmp0.4s + add tmp0.4s, msg0.4s, k2.4s + sha1su1 msg1.4s, msg0.4s + sha1su0 msg2.4s, msg3.4s, msg0.4s + + /* rounds 44--47 */ + sha1h e0, abcd_s + sha1m abcd_q, e1, tmp1.4s + add tmp1.4s, msg1.4s, k2.4s + sha1su1 msg2.4s, msg1.4s + sha1su0 msg3.4s, msg0.4s, msg1.4s + + /* rounds 48--51 */ + sha1h e1, abcd_s + sha1m abcd_q, e0, tmp0.4s + add tmp0.4s, msg2.4s, k2.4s + sha1su1 msg3.4s, msg2.4s + sha1su0 msg0.4s, msg1.4s, msg2.4s + + /* rounds 52--55 */ + sha1h e0, abcd_s + sha1m abcd_q, e1, tmp1.4s + add tmp1.4s, msg3.4s, k3.4s + sha1su1 msg0.4s, msg3.4s + sha1su0 msg1.4s, msg2.4s, msg3.4s + + /* rounds 56--59 */ + sha1h e1, abcd_s + sha1m abcd_q, e0, tmp0.4s + add tmp0.4s, msg0.4s, k3.4s + sha1su1 msg1.4s, msg0.4s + sha1su0 msg2.4s, msg3.4s, msg0.4s + + /* rounds 60--63 */ + sha1h e0, abcd_s + sha1p abcd_q, e1, tmp1.4s + add tmp1.4s, msg1.4s, k3.4s + sha1su1 
msg2.4s, msg1.4s + sha1su0 msg3.4s, msg0.4s, msg1.4s + + /* rounds 64--67 */ + sha1h e1, abcd_s + sha1p abcd_q, e0, tmp0.4s + add tmp0.4s, msg2.4s, k3.4s + sha1su1 msg3.4s, msg2.4s + sha1su0 msg0.4s, msg1.4s, msg2.4s + + /* rounds 68--71 */ + sha1h e0, abcd_s + sha1p abcd_q, e1, tmp1.4s + add tmp1.4s, msg3.4s, k3.4s + sha1su1 msg0.4s, msg3.4s + + /* rounds 72--75 */ + sha1h e1, abcd_s + sha1p abcd_q, e0, tmp0.4s + + /* rounds 76--79 */ + sha1h e0, abcd_s + sha1p abcd_q, e1, tmp1.4s + + add e0_v.4s, e0_v.4s, e0_saved.4s + add abcd.4s, abcd.4s, abcd_saved.4s + + subs len, len, #64 + bhi 0b + + str abcd_q, [ctx, #0] + str e0, [ctx, #16] + +1: ret +END(_libmd_sha1block_sha1) + + .section .rodata + .balign 16 +k1234: .4byte 0x5a827999 + .4byte 0x6ed9eba1 + .4byte 0x8f1bbcdc + .4byte 0xca62c1d6 + .size k1234, .-k1234 + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libmd/aarch64/sha1dispatch.c b/lib/libmd/aarch64/sha1dispatch.c new file mode 100644 --- /dev/null +++ b/lib/libmd/aarch64/sha1dispatch.c @@ -0,0 +1,24 @@ +/*- + * Copyright (c) 2024 Robert Clausecker + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include + +extern void _libmd_sha1block_scalar(SHA1_CTX *, const void *, size_t); +extern void _libmd_sha1block_sha1(SHA1_CTX *, const void *, size_t); + +DEFINE_IFUNC(, void, sha1_block, (SHA1_CTX *, const void *, size_t)) +{ + unsigned long hwcap = 0; + + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + + if (hwcap & HWCAP_SHA1) + return (_libmd_sha1block_sha1); + else + return (_libmd_sha1block_scalar); +} diff --git a/lib/libmd/amd64/sha1block.S b/lib/libmd/amd64/sha1block.S new file mode 100644 --- /dev/null +++ b/lib/libmd/amd64/sha1block.S @@ -0,0 +1,1851 @@ +/*- + * Copyright (c) 2013 The Go Authors. All rights reserved. + * Copyright (c) 2024 Robert Clausecker + * + * Adapted from Go's crypto/sha1/sha1block_amd64.s. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +/* + * SHA-1 block routine. See sha1c.c for C equivalent. 
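+ *
+ * For orientation, one round of the first type looks roughly like this
+ * in C (a sketch only, assuming a rol32() rotate-left helper; see
+ * sha1c.c for the authoritative version):
+ *
+ *	f = (b & c) | (~b & d);
+ *	e += rol32(a, 5) + f + 0x5a827999 + w[i];
+ *	b = rol32(b, 30);
+ *
+ * The other round types swap in a different boolean function and round
+ * constant.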
+ * + * There are 80 rounds of 4 types: + * - rounds 0-15 are type 1 and load data (round1 macro). + * - rounds 16-19 are type 1 and do not load data (round1x macro). + * - rounds 20-39 are type 2 and do not load data (round2 macro). + * - rounds 40-59 are type 3 and do not load data (round3 macro). + * - rounds 60-79 are type 4 and do not load data (round4 macro). + * + * Each round loads or shuffles the data, then computes a per-round + * function of b, c, d, and then mixes the result into and rotates the + * five registers a, b, c, d, e holding the intermediate results. + * + * The register rotation is implemented by rotating the arguments to + * the round macros instead of by explicit move instructions. + */ +.macro load index + mov (\index)*4(%rsi), %r10d + bswap %r10d + mov %r10d, (\index)*4(%rsp) +.endm + +.macro shuffle index + mov ((\index )&0xf)*4(%rsp), %r10d + xor ((\index- 3)&0xf)*4(%rsp), %r10d + xor ((\index- 8)&0xf)*4(%rsp), %r10d + xor ((\index-14)&0xf)*4(%rsp), %r10d + rol $1, %r10d + mov %r10d, ((\index)&0xf)*4(%rsp) +.endm + +.macro func1 a, b, c, d, e + mov \d, %r9d + xor \c, %r9d + and \b, %r9d + xor \d, %r9d +.endm + +.macro func2 a, b, c, d, e + mov \b, %r9d + xor \c, %r9d + xor \d, %r9d +.endm + +.macro func3 a, b, c, d, e + mov \b, %r8d + or \c, %r8d + and \d, %r8d + mov \b, %r9d + and \c, %r9d + or %r8d, %r9d +.endm + +.macro func4 a, b, c, d, e + func2 \a, \b, \c, \d, \e +.endm + +.macro mix a, b, c, d, e, const + rol $30, \b + add %r9d, \e + mov \a, %r8d + rol $5, %r8d + lea \const(\e, %r10d, 1), \e + add %r8d, \e +.endm + +.macro round1 a, b, c, d, e, index + load \index + func1 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0x5a827999 +.endm + +.macro round1x a, b, c, d, e, index + shuffle \index + func1 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0x5a827999 +.endm + +.macro round2 a, b, c, d, e, index + shuffle \index + func2 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0x6ed9eba1 +.endm + +.macro round3 a, b, c, d, e, index + shuffle \index + func3 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0x8f1bbcdc +.endm + +.macro round4 a, b, c, d, e, index + shuffle \index + func4 \a, \b, \c, \d, \e + mix \a, \b, \c, \d, \e, 0xca62c1d6 +.endm + + // sha1block(SHA1_CTX, buf, len) +ENTRY(_libmd_sha1block_scalar) + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + push %rdi // rdi: SHA1_CTX + sub $64+8, %rsp // 64 bytes for round keys + // plus alignment + + mov %rdi, %rbp + // rsi: buf + and $~63, %rdx // rdx: length in blocks + lea (%rsi, %rdx, 1), %rdi // rdi: end pointer + mov (%rbp), %eax // c->h0 + mov 4(%rbp), %ebx // c->h1 + mov 8(%rbp), %ecx // c->h2 + mov 12(%rbp), %edx // c->h3 + mov 16(%rbp), %ebp // c->h4 + + cmp %rsi, %rdi // any data to process? 
+ je .Lend + +.Lloop: mov %eax, %r11d + mov %ebx, %r12d + mov %ecx, %r13d + mov %edx, %r14d + mov %ebp, %r15d + + round1 %eax, %ebx, %ecx, %edx, %ebp, 0 + round1 %ebp, %eax, %ebx, %ecx, %edx, 1 + round1 %edx, %ebp, %eax, %ebx, %ecx, 2 + round1 %ecx, %edx, %ebp, %eax, %ebx, 3 + round1 %ebx, %ecx, %edx, %ebp, %eax, 4 + + round1 %eax, %ebx, %ecx, %edx, %ebp, 5 + round1 %ebp, %eax, %ebx, %ecx, %edx, 6 + round1 %edx, %ebp, %eax, %ebx, %ecx, 7 + round1 %ecx, %edx, %ebp, %eax, %ebx, 8 + round1 %ebx, %ecx, %edx, %ebp, %eax, 9 + + round1 %eax, %ebx, %ecx, %edx, %ebp, 10 + round1 %ebp, %eax, %ebx, %ecx, %edx, 11 + round1 %edx, %ebp, %eax, %ebx, %ecx, 12 + round1 %ecx, %edx, %ebp, %eax, %ebx, 13 + round1 %ebx, %ecx, %edx, %ebp, %eax, 14 + + round1 %eax, %ebx, %ecx, %edx, %ebp, 15 + round1x %ebp, %eax, %ebx, %ecx, %edx, 16 + round1x %edx, %ebp, %eax, %ebx, %ecx, 17 + round1x %ecx, %edx, %ebp, %eax, %ebx, 18 + round1x %ebx, %ecx, %edx, %ebp, %eax, 19 + + round2 %eax, %ebx, %ecx, %edx, %ebp, 20 + round2 %ebp, %eax, %ebx, %ecx, %edx, 21 + round2 %edx, %ebp, %eax, %ebx, %ecx, 22 + round2 %ecx, %edx, %ebp, %eax, %ebx, 23 + round2 %ebx, %ecx, %edx, %ebp, %eax, 24 + + round2 %eax, %ebx, %ecx, %edx, %ebp, 25 + round2 %ebp, %eax, %ebx, %ecx, %edx, 26 + round2 %edx, %ebp, %eax, %ebx, %ecx, 27 + round2 %ecx, %edx, %ebp, %eax, %ebx, 28 + round2 %ebx, %ecx, %edx, %ebp, %eax, 29 + + round2 %eax, %ebx, %ecx, %edx, %ebp, 30 + round2 %ebp, %eax, %ebx, %ecx, %edx, 31 + round2 %edx, %ebp, %eax, %ebx, %ecx, 32 + round2 %ecx, %edx, %ebp, %eax, %ebx, 33 + round2 %ebx, %ecx, %edx, %ebp, %eax, 34 + + round2 %eax, %ebx, %ecx, %edx, %ebp, 35 + round2 %ebp, %eax, %ebx, %ecx, %edx, 36 + round2 %edx, %ebp, %eax, %ebx, %ecx, 37 + round2 %ecx, %edx, %ebp, %eax, %ebx, 38 + round2 %ebx, %ecx, %edx, %ebp, %eax, 39 + + round3 %eax, %ebx, %ecx, %edx, %ebp, 40 + round3 %ebp, %eax, %ebx, %ecx, %edx, 41 + round3 %edx, %ebp, %eax, %ebx, %ecx, 42 + round3 %ecx, %edx, %ebp, %eax, %ebx, 43 + round3 %ebx, %ecx, %edx, %ebp, %eax, 44 + + round3 %eax, %ebx, %ecx, %edx, %ebp, 45 + round3 %ebp, %eax, %ebx, %ecx, %edx, 46 + round3 %edx, %ebp, %eax, %ebx, %ecx, 47 + round3 %ecx, %edx, %ebp, %eax, %ebx, 48 + round3 %ebx, %ecx, %edx, %ebp, %eax, 49 + + round3 %eax, %ebx, %ecx, %edx, %ebp, 50 + round3 %ebp, %eax, %ebx, %ecx, %edx, 51 + round3 %edx, %ebp, %eax, %ebx, %ecx, 52 + round3 %ecx, %edx, %ebp, %eax, %ebx, 53 + round3 %ebx, %ecx, %edx, %ebp, %eax, 54 + + round3 %eax, %ebx, %ecx, %edx, %ebp, 55 + round3 %ebp, %eax, %ebx, %ecx, %edx, 56 + round3 %edx, %ebp, %eax, %ebx, %ecx, 57 + round3 %ecx, %edx, %ebp, %eax, %ebx, 58 + round3 %ebx, %ecx, %edx, %ebp, %eax, 59 + + round4 %eax, %ebx, %ecx, %edx, %ebp, 60 + round4 %ebp, %eax, %ebx, %ecx, %edx, 61 + round4 %edx, %ebp, %eax, %ebx, %ecx, 62 + round4 %ecx, %edx, %ebp, %eax, %ebx, 63 + round4 %ebx, %ecx, %edx, %ebp, %eax, 64 + + round4 %eax, %ebx, %ecx, %edx, %ebp, 65 + round4 %ebp, %eax, %ebx, %ecx, %edx, 66 + round4 %edx, %ebp, %eax, %ebx, %ecx, 67 + round4 %ecx, %edx, %ebp, %eax, %ebx, 68 + round4 %ebx, %ecx, %edx, %ebp, %eax, 69 + + round4 %eax, %ebx, %ecx, %edx, %ebp, 70 + round4 %ebp, %eax, %ebx, %ecx, %edx, 71 + round4 %edx, %ebp, %eax, %ebx, %ecx, 72 + round4 %ecx, %edx, %ebp, %eax, %ebx, 73 + round4 %ebx, %ecx, %edx, %ebp, %eax, 74 + + round4 %eax, %ebx, %ecx, %edx, %ebp, 75 + round4 %ebp, %eax, %ebx, %ecx, %edx, 76 + round4 %edx, %ebp, %eax, %ebx, %ecx, 77 + round4 %ecx, %edx, %ebp, %eax, %ebx, 78 + round4 %ebx, %ecx, %edx, %ebp, %eax, 79 + + add %r11d, %eax + add %r12d, %ebx + add %r13d, %ecx + 
add %r14d, %edx + add %r15d, %ebp + + add $64, %rsi + cmp %rdi, %rsi + jb .Lloop + +.Lend: add $64+8, %rsp + pop %rdi // SHA1_CTX + mov %eax, (%rdi) + mov %ebx, 4(%rdi) + mov %ecx, 8(%rdi) + mov %edx, 12(%rdi) + mov %ebp, 16(%rdi) + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + ret +END(_libmd_sha1block_scalar) + +/* + * This is the implementation using AVX2, BMI1 and BMI2. It is based on: + * "SHA-1 implementation with Intel(R) AVX2 instruction set extensions" + * From http://software.intel.com/en-us/articles + * (look for improving-the-performance-of-the-secure-hash-algorithm-1) + * This implementation is 2x unrolled, and interleaves vector instructions, + * used to precompute W, with scalar computation of current round + * for optimal scheduling. + */ + + /* trivial helper macros */ +.macro update_hash a, tb, c, d, e + add (%r9), \a + mov \a, (%r9) + add 4(%r9), \tb + mov \tb, 4(%r9) + add 8(%r9), \c + mov \c, 8(%r9) + add 12(%r9), \d + mov \d, 12(%r9) + add 16(%r9), \e + mov \e, 16(%r9) +.endm + + /* help macros for recalc, which does precomputations */ +.macro precalc0 offset + vmovdqu \offset(%r10), %xmm0 +.endm + +.macro precalc1 offset + vinserti128 $1, \offset(%r13), %ymm0, %ymm0 +.endm + +.macro precalc2 yreg + vpshufb %ymm10, %ymm0, \yreg +.endm + +.macro precalc4 yreg, k_offset + vpaddd \k_offset(%r8), \yreg, %ymm0 +.endm + +.macro precalc7 offset + vmovdqu %ymm0, (\offset)*2(%r14) +.endm + +/* + * Message scheduling pre-compute for rounds 0-15 + * r13 is a pointer to the even 64-byte block + * r10 is a pointer to the odd 64-byte block + * r14 is a pointer to the temp buffer + * xmm0 is used as a temp register + * yreg is clobbered as part of the computation + * offset chooses a 16 byte chunk within a block + * r8 is a pointer to the constants block + * k_offset chooses K constants relevant to this round + * xmm10 holds the swap mask + */ +.macro precalc00_15 offset, yreg + precalc0 \offset + precalc1 \offset + precalc2 \yreg + precalc4 \yreg, 0 + precalc7 \offset +.endm + + /* helper macros for precalc16_31 */ +.macro precalc16 reg_sub16, reg_sub12, reg_sub4, reg + vpalignr $8, \reg_sub16, \reg_sub12, \reg // w[i - 14] + vpsrldq $4, \reg_sub4, %ymm0 // w[i - 3] +.endm + +.macro precalc17 reg_sub16, reg_sub8, reg + vpxor \reg_sub8, \reg, \reg + vpxor \reg_sub16, %ymm0, %ymm0 +.endm + +.macro precalc18 reg + vpxor %ymm0, \reg, \reg + vpslldq $12, \reg, %ymm9 +.endm + +.macro precalc19 reg + vpslld $1, \reg, %ymm0 + vpsrld $31, \reg, \reg + .endm + +.macro precalc20 reg + vpor \reg, %ymm0, %ymm0 + vpslld $2, %ymm9, \reg +.endm + +.macro precalc21 reg + vpsrld $30, %ymm9, %ymm9 + vpxor \reg, %ymm0, %ymm0 +.endm + +.macro precalc23 reg, k_offset, offset + vpxor %ymm9, %ymm0, \reg + vpaddd \k_offset(%r8), \reg, %ymm0 + vmovdqu %ymm0, (\offset)(%r14) +.endm + +/* + * Message scheduling pre-compute for rounds 16-31 + * calculating last 32 w[i] values in 8 XMM registers + * pre-calculate K+w[i] values and store to mem + * for later load by ALU add instruction. + * "brute force" vectorization for rounds 16-31 only + * due to w[i]->w[i-3] dependency. 
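+ *
+ * The recurrence being vectorized is the usual SHA-1 message schedule,
+ * in scalar C (a sketch, with rol32() a rotate-left helper):
+ *
+ *	w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
+ *
+ * the w[i-3] term is what forces the more laborious approach for these
+ * rounds.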
+ + clobbers 5 input ymm registers REG_SUB* + * uses xmm0 and xmm9 as temp registers + * As always, r8 is a pointer to constants block + * and r14 is a pointer to temp buffer + */ +.macro precalc16_31 reg, reg_sub4, reg_sub8, reg_sub12, reg_sub16, k_offset, offset + precalc16 \reg_sub16, \reg_sub12, \reg_sub4, \reg + precalc17 \reg_sub16, \reg_sub8, \reg + precalc18 \reg + precalc19 \reg + precalc20 \reg + precalc21 \reg + precalc23 \reg, \k_offset, \offset +.endm + + /* helper macros for precalc_32_79 */ +.macro precalc32 reg_sub8, reg_sub4 + vpalignr $8, \reg_sub8, \reg_sub4, %ymm0 +.endm + +.macro precalc33 reg_sub28, reg + vpxor \reg_sub28, \reg, \reg +.endm + +.macro precalc34 reg_sub16 + vpxor \reg_sub16, %ymm0, %ymm0 +.endm + +.macro precalc35 reg + vpxor %ymm0, \reg, \reg +.endm + +.macro precalc36 reg + vpslld $2, \reg, %ymm0 +.endm + +.macro precalc37 reg + vpsrld $30, \reg, \reg + vpor \reg, %ymm0, \reg +.endm + +.macro precalc39 reg, k_offset, offset + vpaddd \k_offset(%r8), \reg, %ymm0 + vmovdqu %ymm0, \offset(%r14) +.endm + +.macro precalc32_79 reg, reg_sub4, reg_sub8, reg_sub16, reg_sub28, k_offset, offset + precalc32 \reg_sub8, \reg_sub4 + precalc33 \reg_sub28, \reg + precalc34 \reg_sub16 + precalc35 \reg + precalc36 \reg + precalc37 \reg + precalc39 \reg, \k_offset, \offset +.endm + +.macro precalc + precalc00_15 0x00, %ymm15 + precalc00_15 0x10, %ymm14 + precalc00_15 0x20, %ymm13 + precalc00_15 0x30, %ymm12 + precalc16_31 %ymm8, %ymm12, %ymm13, %ymm14, %ymm15, 0x00, 0x080 + precalc16_31 %ymm7, %ymm8, %ymm12, %ymm13, %ymm14, 0x20, 0x0a0 + precalc16_31 %ymm5, %ymm7, %ymm8, %ymm12, %ymm13, 0x20, 0x0c0 + precalc16_31 %ymm3, %ymm5, %ymm7, %ymm8, %ymm12, 0x20, 0x0e0 + precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x20, 0x100 + precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x20, 0x120 + precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x40, 0x140 + precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x40, 0x160 + precalc32_79 %ymm8, %ymm12, %ymm13, %ymm15, %ymm7, 0x40, 0x180 + precalc32_79 %ymm7, %ymm8, %ymm12, %ymm14, %ymm5, 0x40, 0x1a0 + precalc32_79 %ymm5, %ymm7, %ymm8, %ymm13, %ymm3, 0x40, 0x1c0 + precalc32_79 %ymm3, %ymm5, %ymm7, %ymm12, %ymm15, 0x60, 0x1e0 + precalc32_79 %ymm15, %ymm3, %ymm5, %ymm8, %ymm14, 0x60, 0x200 + precalc32_79 %ymm14, %ymm15, %ymm3, %ymm7, %ymm13, 0x60, 0x220 + precalc32_79 %ymm13, %ymm14, %ymm15, %ymm5, %ymm12, 0x60, 0x240 + precalc32_79 %ymm12, %ymm13, %ymm14, %ymm3, %ymm8, 0x60, 0x260 +.endm + +/* + * Macros calculating individual rounds have general form + * calc_round_pre + precalc_round + calc_round_post + * calc_round_{pre,post} macros follow + */ +.macro calc_f1_pre offset, reg_a, reg_b, reg_c, reg_e + add \offset(%r15), \reg_e + andn \reg_c, \reg_a, %ebp + add \reg_b, \reg_e // add F from the previous round + rorx $0x1b, \reg_a, %r12d + rorx $2, \reg_a, \reg_b // for the next round +.endm + +/* + * Calculate F for the next round + */ +.macro calc_f1_post reg_a, reg_b, reg_e + and \reg_b, \reg_a // b & c + xor %ebp, \reg_a // F1 = (b&c) ^ (~b&d) + add %r12d, \reg_e +.endm + +/* + * Registers are cyclically rotated: + * edx -> eax -> edi -> esi -> ebx -> ecx + */ +.macro calc0 + mov %esi, %ebx // precalculate first round + rorx $2, %esi, %esi + andn %eax, %ebx, %ebp + and %edi, %ebx + xor %ebp, %ebx + calc_f1_pre 0x0, %ecx, %ebx, %edi, %edx + precalc0 0x80 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc1 + calc_f1_pre 0x4, %edx, %ecx, %esi, %eax + precalc1 0x80 + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc2 + 
calc_f1_pre 0x8, %eax, %edx, %ebx, %edi + precalc2 %ymm15 + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc3 + calc_f1_pre 0xc, %edi, %eax, %ecx, %esi + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc4 + calc_f1_pre 0x20, %esi, %edi, %edx, %ebx + precalc4 %ymm15, 0x0 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc5 + calc_f1_pre 0x24, %ebx, %esi, %eax, %ecx + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc6 + calc_f1_pre 0x28, %ecx, %ebx, %edi, %edx + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc7 + calc_f1_pre 0x2c, %edx, %ecx, %esi, %eax + precalc7 0x0 + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc8 + calc_f1_pre 0x40, %eax, %edx, %ebx, %edi + precalc0 0x90 + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc9 + calc_f1_pre 0x44, %edi, %eax, %ecx, %esi + precalc1 0x90 + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc10 + calc_f1_pre 0x48, %esi, %edi, %edx, %ebx + precalc2 %ymm14 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc11 + calc_f1_pre 0x4c, %ebx, %esi, %eax, %ecx + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc12 + calc_f1_pre 0x60, %ecx, %ebx, %edi, %edx + precalc4 %ymm14, 0 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc13 + calc_f1_pre 0x64, %edx, %ecx, %esi, %eax + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc14 + calc_f1_pre 0x68, %eax, %edx, %ebx, %edi + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc15 + calc_f1_pre 0x6c, %edi, %eax, %ecx, %esi + precalc7 0x10 + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc16 + calc_f1_pre 0x80, %esi, %edi, %edx, %ebx + precalc0 0xa0 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc17 + calc_f1_pre 0x84, %ebx, %esi, %eax, %ecx + precalc1 0xa0 + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc18 + calc_f1_pre 0x88, %ecx, %ebx, %edi, %edx + precalc2 %ymm13 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc_f2_pre offset, reg_a, reg_b, reg_e + add \offset(%r15), \reg_e + add \reg_b, \reg_e // add F from the previous round + rorx $0x1b, \reg_a, %r12d + rorx $2, \reg_a, \reg_b // for next round +.endm + +.macro calc_f2_post reg_a, reg_b, reg_c, reg_e + xor \reg_b, \reg_a + add %r12d, \reg_e + xor \reg_c, \reg_a +.endm + +.macro calc19 + calc_f2_pre 0x8c, %edx, %ecx, %eax + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc20 + calc_f2_pre 0xa0, %eax, %edx, %edi + precalc4 %ymm13, 0x0 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc21 + calc_f2_pre 0xa4, %edi, %eax, %esi + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc22 + calc_f2_pre 0xa8, %esi, %edi, %ebx + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc23 + calc_f2_pre 0xac, %ebx, %esi, %ecx + precalc7 0x20 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc24 + calc_f2_pre 0xc0, %ecx, %ebx, %edx + precalc0 0xb0 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc25 + calc_f2_pre 0xc4, %edx, %ecx, %eax + precalc1 0xb0 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc26 + calc_f2_pre 0xc8, %eax, %edx, %edi + precalc2 %ymm12 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc27 + calc_f2_pre 0xcc, %edi, %eax, %esi + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc28 + calc_f2_pre 0xe0, %esi, %edi, %ebx + precalc4 %ymm12, 0x0 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc29 + calc_f2_pre 0xe4, %ebx, %esi, %ecx + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc30 + calc_f2_pre 0xe8, %ecx, %ebx, %edx + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc31 + calc_f2_pre 0xec, %edx, %ecx, %eax + precalc7 0x30 + 
calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc32 + calc_f2_pre 0x100, %eax, %edx, %edi + precalc16 %ymm15, %ymm14, %ymm12, %ymm8 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc33 + calc_f2_pre 0x104, %edi, %eax, %esi + precalc17 %ymm15, %ymm13, %ymm8 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc34 + calc_f2_pre 0x108, %esi, %edi, %ebx + precalc18 %ymm8 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc35 + calc_f2_pre 0x10c, %ebx, %esi, %ecx + precalc19 %ymm8 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc36 + calc_f2_pre 0x120, %ecx, %ebx, %edx + precalc20 %ymm8 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc37 + calc_f2_pre 0x124, %edx, %ecx, %eax + precalc21 %ymm8 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc38 + calc_f2_pre 0x128, %eax, %edx, %edi + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc_f3_pre offset, reg_e + add \offset(%r15), \reg_e +.endm + +.macro calc_f3_post reg_a, reg_b, reg_c, reg_e, reg_tb + add \reg_tb, \reg_e // add F from the previous round + mov \reg_b, %ebp + or \reg_a, %ebp + rorx $0x1b, \reg_a, %r12d + rorx $2, \reg_a, \reg_tb + and \reg_c, %ebp // calculate F for the next round + and \reg_b, \reg_a + or %ebp, \reg_a + add %r12d, \reg_e +.endm + +.macro calc39 + calc_f3_pre 0x12c, %esi + precalc23 %ymm8, 0x0, 0x80 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc40 + calc_f3_pre 0x140, %ebx + precalc16 %ymm14, %ymm13, %ymm8, %ymm7 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc41 + calc_f3_pre 0x144, %ecx + precalc17 %ymm14, %ymm12, %ymm7 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc42 + calc_f3_pre 0x148, %edx + precalc18 %ymm7 + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc43 + calc_f3_pre 0x14c, %eax + precalc19 %ymm7 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc44 + calc_f3_pre 0x160, %edi + precalc20 %ymm7 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc45 + calc_f3_pre 0x164, %esi + precalc21 %ymm7 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc46 + calc_f3_pre 0x168, %ebx + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc47 + calc_f3_pre 0x16c, %ecx + vpxor %ymm9, %ymm0, %ymm7 + vpaddd 0x20(%r8), %ymm7, %ymm0 + vmovdqu %ymm0, 0xa0(%r14) + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc48 + calc_f3_pre 0x180, %edx + precalc16 %ymm13, %ymm12, %ymm7, %ymm5 + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc49 + calc_f3_pre 0x184, %eax + precalc17 %ymm13, %ymm8, %ymm5 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc50 + calc_f3_pre 0x188, %edi + precalc18 %ymm5 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc51 + calc_f3_pre 0x18c, %esi + precalc19 %ymm5 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc52 + calc_f3_pre 0x1a0, %ebx + precalc20 %ymm5 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc53 + calc_f3_pre 0x1a4, %ecx + precalc21 %ymm5 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc54 + calc_f3_pre 0x1a8, %edx + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc55 + calc_f3_pre 0x1ac, %eax + precalc23 %ymm5, 0x20, 0xc0 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc56 + calc_f3_pre 0x1c0, %edi + precalc16 %ymm12, %ymm8, %ymm5, %ymm3 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc57 + calc_f3_pre 0x1c4, %esi + precalc17 %ymm12, %ymm7, %ymm3 + calc_f3_post 
%edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc58 + calc_f3_pre 0x1c8, %ebx + precalc18 %ymm3 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc59 + calc_f2_pre 0x1cc, %ebx, %esi, %ecx + precalc19 %ymm3 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc60 + calc_f2_pre 0x1e0, %ecx, %ebx, %edx + precalc20 %ymm3 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc61 + calc_f2_pre 0x1e4, %edx, %ecx, %eax + precalc21 %ymm3 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc62 + calc_f2_pre 0x1e8, %eax, %edx, %edi + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc63 + calc_f2_pre 0x1ec, %edi, %eax, %esi + precalc23 %ymm3, 0x20, 0xe0 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc64 + calc_f2_pre 0x200, %esi, %edi, %ebx + precalc32 %ymm5, %ymm3 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc65 + calc_f2_pre 0x204, %ebx, %esi, %ecx + precalc33 %ymm14, %ymm15 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc66 + calc_f2_pre 0x208, %ecx, %ebx, %edx + precalc34 %ymm8 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc67 + calc_f2_pre 0x20c, %edx, %ecx, %eax + precalc35 %ymm15 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc68 + calc_f2_pre 0x220, %eax, %edx, %edi + precalc36 %ymm15 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc69 + calc_f2_pre 0x224, %edi, %eax, %esi + precalc37 %ymm15 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc70 + calc_f2_pre 0x228, %esi, %edi, %ebx + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc71 + calc_f2_pre 0x22c, %ebx, %esi, %ecx + precalc39 %ymm15, 0x20, 0x100 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc72 + calc_f2_pre 0x240, %ecx, %ebx, %edx + precalc32 %ymm3, %ymm15 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc73 + calc_f2_pre 0x244, %edx, %ecx, %eax + precalc33 %ymm13, %ymm14 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc74 + calc_f2_pre 0x248, %eax, %edx, %edi + precalc34 %ymm7 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc75 + calc_f2_pre 0x24c, %edi, %eax, %esi + precalc35 %ymm14 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc76 + calc_f2_pre 0x260, %esi, %edi, %ebx + precalc36 %ymm14 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc77 + calc_f2_pre 0x264, %ebx, %esi, %ecx + precalc37 %ymm14 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc78 + calc_f2_pre 0x268, %ecx, %ebx, %edx + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc79 + add 0x26c(%r15), %eax + add %ecx, %eax + rorx $0x1b, %edx, %r12d + precalc39 %ymm14, 0x20, 0x120 + add %r12d, %eax +.endm + +/* + * Similar to calc0 + */ +.macro calc80 + mov %ecx, %edx // precalculate first round + rorx $2, %ecx, %ecx + andn %esi, %edx, %ebp + and %ebx, %edx + xor %ebp, %edx + calc_f1_pre 0x10, %eax, %edx, %ebx, %edi + precalc32 %ymm15, %ymm14 + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc81 + calc_f1_pre 0x14, %edi, %eax, %ecx, %esi + precalc33 %ymm12, %ymm13 + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc82 + calc_f1_pre 0x18, %esi, %edi, %edx, %ebx + precalc34 %ymm5 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc83 + calc_f1_pre 0x1c, %ebx, %esi, %eax, %ecx + precalc35 %ymm13 + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc84 + calc_f1_pre 0x30, %ecx, %ebx, %edi, %edx + precalc36 %ymm13 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc85 + calc_f1_pre 0x34, %edx, %ecx, %esi, %eax + precalc37 %ymm13 + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc86 
+ calc_f1_pre 0x38, %eax, %edx, %ebx, %edi + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc87 + calc_f1_pre 0x3c, %edi, %eax, %ecx, %esi + precalc39 %ymm13, 0x40, 0x140 + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc88 + calc_f1_pre 0x50, %esi, %edi, %edx, %ebx + precalc32 %ymm14, %ymm13 + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc89 + calc_f1_pre 0x54, %ebx, %esi, %eax, %ecx + precalc33 %ymm8, %ymm12 + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc90 + calc_f1_pre 0x58, %ecx, %ebx, %edi, %edx + precalc34 %ymm3 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc91 + calc_f1_pre 0x5c, %edx, %ecx, %esi, %eax + precalc35 %ymm12 + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc92 + calc_f1_pre 0x70, %eax, %edx, %ebx, %edi + precalc36 %ymm12 + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc93 + calc_f1_pre 0x74, %edi, %eax, %ecx, %esi + precalc37 %ymm12 + calc_f1_post %edi, %edx, %esi +.endm + +.macro calc94 + calc_f1_pre 0x78, %esi, %edi, %edx, %ebx + calc_f1_post %esi, %eax, %ebx +.endm + +.macro calc95 + calc_f1_pre 0x7c, %ebx, %esi, %eax, %ecx + precalc39 %ymm12, 0x40, 0x160 + calc_f1_post %ebx, %edi, %ecx +.endm + +.macro calc96 + calc_f1_pre 0x90, %ecx, %ebx, %edi, %edx + precalc32 %ymm13, %ymm12 + calc_f1_post %ecx, %esi, %edx +.endm + +.macro calc97 + calc_f1_pre 0x94, %edx, %ecx, %esi, %eax + precalc33 %ymm7, %ymm8 + calc_f1_post %edx, %ebx, %eax +.endm + +.macro calc98 + calc_f1_pre 0x98, %eax, %edx, %ebx, %edi + precalc34 %ymm15 + calc_f1_post %eax, %ecx, %edi +.endm + +.macro calc99 + calc_f2_pre 0x9c, %edi, %eax, %esi + precalc35 %ymm8 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc100 + calc_f2_pre 0xb0, %esi, %edi, %ebx + precalc36 %ymm8 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc101 + calc_f2_pre 0xb4, %ebx, %esi, %ecx + precalc37 %ymm8 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc102 + calc_f2_pre 0xb8, %ecx, %ebx, %edx + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc103 + calc_f2_pre 0xbc, %edx, %ecx, %eax + precalc39 %ymm8, 0x40, 0x180 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc104 + calc_f2_pre 0xd0, %eax, %edx, %edi + precalc32 %ymm12, %ymm8 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc105 + calc_f2_pre 0xd4, %edi, %eax, %esi + precalc33 %ymm5, %ymm7 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc106 + calc_f2_pre 0xd8, %esi, %edi, %ebx + precalc34 %ymm14 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc107 + calc_f2_pre 0xdc, %ebx, %esi, %ecx + precalc35 %ymm7 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc108 + calc_f2_pre 0xf0, %ecx, %ebx, %edx + precalc36 %ymm7 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc109 + calc_f2_pre 0xf4, %edx, %ecx, %eax + precalc37 %ymm7 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc110 + calc_f2_pre 0xf8, %eax, %edx, %edi + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc111 + calc_f2_pre 0xfc, %edi, %eax, %esi + precalc39 %ymm7, 0x40, 0x1a0 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc112 + calc_f2_pre 0x110, %esi, %edi, %ebx + precalc32 %ymm8, %ymm7 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc113 + calc_f2_pre 0x114, %ebx, %esi, %ecx + precalc33 %ymm3, %ymm5 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc114 + calc_f2_pre 0x118, %ecx, %ebx, %edx + precalc34 %ymm13 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc115 + calc_f2_pre 0x11c, %edx, %ecx, %eax + precalc35 %ymm5 + calc_f2_post %edx, %ebx, %esi, 
%eax +.endm + +.macro calc116 + calc_f2_pre 0x130, %eax, %edx, %edi + precalc37 %ymm5 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc117 + calc_f2_pre 0x134, %edi, %eax, %esi + precalc37 %ymm5 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc118 + calc_f2_pre 0x138, %esi, %edi, %ebx + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc119 + calc_f3_pre 0x13c, %ecx + precalc39 %ymm5, 0x40, 0x1c0 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc120 + calc_f3_pre 0x150, %edx + precalc32 %ymm7, %ymm5 + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc121 + calc_f3_pre 0x154, %eax + precalc33 %ymm15, %ymm3 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc122 + calc_f3_pre 0x158, %edi + precalc34 %ymm12 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc123 + calc_f3_pre 0x15c, %esi + precalc35 %ymm3 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc124 + calc_f3_pre 0x170, %ebx + precalc36 %ymm3 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc125 + calc_f3_pre 0x174, %ecx + precalc37 %ymm3 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc126 + calc_f3_pre 0x178, %edx + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc127 + calc_f3_pre 0x17c, %eax + precalc39 %ymm3, 0x60, 0x1e0 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc128 + calc_f3_pre 0x190, %edi + precalc32 %ymm5, %ymm3 + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc129 + calc_f3_pre 0x194, %esi + precalc33 %ymm14, %ymm15 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc130 + calc_f3_pre 0x198, %ebx + precalc34 %ymm8 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc131 + calc_f3_pre 0x19c, %ecx + precalc35 %ymm15 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc132 + calc_f3_pre 0x1b0, %edx + precalc36 %ymm15 + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc133 + calc_f3_pre 0x1b4, %eax + precalc37 %ymm15 + calc_f3_post %edx, %ebx, %esi, %eax, %ecx +.endm + +.macro calc134 + calc_f3_pre 0x1b8, %edi + calc_f3_post %eax, %ecx, %ebx, %edi, %edx +.endm + +.macro calc135 + calc_f3_pre 0x1bc, %esi + precalc39 %ymm15, 0x60, 0x200 + calc_f3_post %edi, %edx, %ecx, %esi, %eax +.endm + +.macro calc136 + calc_f3_pre 0x1d0, %ebx + precalc32 %ymm3, %ymm15 + calc_f3_post %esi, %eax, %edx, %ebx, %edi +.endm + +.macro calc137 + calc_f3_pre 0x1d4, %ecx + precalc33 %ymm13, %ymm14 + calc_f3_post %ebx, %edi, %eax, %ecx, %esi +.endm + +.macro calc138 + calc_f3_pre 0x1d8, %edx + precalc34 %ymm7 + calc_f3_post %ecx, %esi, %edi, %edx, %ebx +.endm + +.macro calc139 + calc_f2_pre 0x1cc, %edx, %ecx, %eax + precalc35 %ymm14 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc140 + calc_f2_pre 0x1f0, %eax, %edx, %edi + precalc36 %ymm14 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc141 + calc_f2_pre 0x1f4, %edi, %eax, %esi + precalc37 %ymm14 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc142 + calc_f2_pre 0x1f8, %esi, %edi, %ebx + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc143 + calc_f2_pre 0x1fc, %ebx, %esi, %ecx + precalc39 %ymm14, 0x60, 0x220 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc144 + calc_f2_pre 0x210, %ecx, %ebx, %edx + precalc32 %ymm15, %ymm14 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc145 + calc_f2_pre 0x214, %edx, %ecx, %eax + precalc33 %ymm12, %ymm13 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc146 + calc_f2_pre 
0x218, %eax, %edx, %edi + precalc34 %ymm5 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc147 + calc_f2_pre 0x21c, %edi, %eax, %esi + precalc35 %ymm13 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc148 + calc_f2_pre 0x230, %esi, %edi, %ebx + precalc36 %ymm13 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc149 + calc_f2_pre 0x234, %ebx, %esi, %ecx + precalc37 %ymm13 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc150 + calc_f2_pre 0x238, %ecx, %ebx, %edx + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc151 + calc_f2_pre 0x23c, %edx, %ecx, %eax + precalc39 %ymm13, 0x60, 0x240 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc152 + calc_f2_pre 0x250, %eax, %edx, %edi + precalc32 %ymm14, %ymm13 + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc153 + calc_f2_pre 0x254, %edi, %eax, %esi + precalc33 %ymm8, %ymm12 + calc_f2_post %edi, %edx, %ecx, %esi +.endm + +.macro calc154 + calc_f2_pre 0x258, %esi, %edi, %ebx + precalc34 %ymm3 + calc_f2_post %esi, %eax, %edx, %ebx +.endm + +.macro calc155 + calc_f2_pre 0x25c, %ebx, %esi, %ecx + precalc35 %ymm12 + calc_f2_post %ebx, %edi, %eax, %ecx +.endm + +.macro calc156 + calc_f2_pre 0x270, %ecx, %ebx, %edx + precalc36 %ymm12 + calc_f2_post %ecx, %esi, %edi, %edx +.endm + +.macro calc157 + calc_f2_pre 0x274, %edx, %ecx, %eax + precalc37 %ymm12 + calc_f2_post %edx, %ebx, %esi, %eax +.endm + +.macro calc158 + calc_f2_pre 0x278, %eax, %edx, %edi + calc_f2_post %eax, %ecx, %ebx, %edi +.endm + +.macro calc159 + add 0x27c(%r15), %esi + add %eax, %esi + rorx $0x1b, %edi, %r12d + precalc39 %ymm12, 0x60, 0x260 + add %r12d, %esi +.endm + + // sha1block(SHA1_CTX, buf, len) +ENTRY(_libmd_sha1block_avx2) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + sub $1408+8, %rsp + + and $~63, %rdx + lea k_xmm_ar(%rip), %r8 + mov %rdi, %r9 + mov %rsi, %r10 + lea 64(%rsi), %r13 + lea 64(%rsi, %rdx), %r11 + cmp %r11, %r13 + cmovae %r8, %r13 + vmovdqu bswap_shufb_ctl(%rip), %ymm10 + + mov (%r9), %ecx + mov 4(%r9), %esi + mov 8(%r9), %edi + mov 12(%r9), %eax + mov 16(%r9), %edx + mov %rsp, %r14 + lea 2*4*80+32(%rsp), %r15 + precalc // precalc WK for first 2 blocks + xchg %r14, %r15 + + // this is unrolled +.Loop: cmp %r8, %r10 // we use the value of R8 (set below) + // as a signal of the last block + jne .Lbegin + add $1408+8, %rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + vzeroupper + ret + +.Lbegin: + calc0 + calc1 + calc2 + calc3 + calc4 + calc5 + calc6 + calc7 + calc8 + calc9 + calc10 + calc11 + calc12 + calc13 + calc14 + calc15 + calc16 + calc17 + calc18 + calc19 + calc20 + calc21 + calc22 + calc23 + calc24 + calc25 + calc26 + calc27 + calc28 + calc29 + calc30 + calc31 + calc32 + calc33 + calc34 + calc35 + calc36 + calc37 + calc38 + calc39 + calc40 + calc41 + calc42 + calc43 + calc44 + calc45 + calc46 + calc47 + calc48 + calc49 + calc50 + calc51 + calc52 + calc53 + calc54 + calc55 + calc56 + calc57 + calc58 + calc59 + + add $128, %r10 // move to the next even-64-byte block + cmp %r11, %r10 // is the current block the last one? + cmovae %r10, %r8 // signal the last iteration smartly + + calc60 + calc61 + calc62 + calc63 + calc64 + calc65 + calc66 + calc67 + calc68 + calc69 + calc70 + calc71 + calc72 + calc73 + calc74 + calc75 + calc76 + calc77 + calc78 + calc79 + + update_hash %eax, %edx, %ebx, %esi, %edi + cmp %r8, %r10 // is the current block the last one? 
+ je .Loop + mov %edx, %ecx + + calc80 + calc81 + calc82 + calc83 + calc84 + calc85 + calc86 + calc87 + calc88 + calc89 + calc90 + calc91 + calc92 + calc93 + calc94 + calc95 + calc96 + calc97 + calc98 + calc99 + calc100 + calc101 + calc102 + calc103 + calc104 + calc105 + calc106 + calc107 + calc108 + calc109 + calc110 + calc111 + calc112 + calc113 + calc114 + calc115 + calc116 + calc117 + calc118 + calc119 + calc120 + calc121 + calc122 + calc123 + calc124 + calc125 + calc126 + calc127 + calc128 + calc129 + calc130 + calc131 + calc132 + calc133 + calc134 + calc135 + calc136 + calc137 + calc138 + calc139 + + add $128, %r13 // move to the next even-64-byte block + cmp %r11, %r13 // is the current block the last one? + cmovae %r8, %r10 + + calc140 + calc141 + calc142 + calc143 + calc144 + calc145 + calc146 + calc147 + calc148 + calc149 + calc150 + calc151 + calc152 + calc153 + calc154 + calc155 + calc156 + calc157 + calc158 + calc159 + + update_hash %esi, %edi, %edx, %ecx, %ebx + mov %esi, %r12d // reset state for AVX2 reg permutation + mov %edi, %esi + mov %edx, %edi + mov %ebx, %edx + mov %ecx, %eax + mov %r12d, %ecx + xchg %r14, %r15 + jmp .Loop +END(_libmd_sha1block_avx2) + + .section .rodata + .balign 32 +k_xmm_ar: + .fill 8, 4, 0x5a827999 + .fill 8, 4, 0x6ed9eba1 + .fill 8, 4, 0x8f1bbcdc + .fill 8, 4, 0xca62c1d6 + .size k_xmm_ar, .-k_xmm_ar + +bswap_shufb_ctl: + .4byte 0x00010203 + .4byte 0x04050607 + .4byte 0x08090a0b + .4byte 0x0c0d0e0f + .4byte 0x00010203 + .4byte 0x04050607 + .4byte 0x08090a0b + .4byte 0x0c0d0e0f + .size bswap_shufb_ctl, .-bswap_shufb_ctl + + /* + * SHA1 implementation using the Intel SHA extensions (SHANI). + * + * Imlemented according to the Intel white paper + * + * S. Gulley, V. Gopal, K. Yap, W. Feghali, J. Guilford, + * G. Wolrich: "Intel SHA Extensions: new instruction supporting + * the Secure Hash Algorithm on IntelĀ® architecture processors", + * July 2013. + */ + // sha1block(SHA1_CTX, buf, len) +ENTRY(_libmd_sha1block_shani) + and $~63, %rdx // round length to block-size multiple + lea (%rsi, %rdx, 1), %rcx // end pointer + test %rdx, %rdx // nothing to do? 
+ je 1f // if so, terminate immediately + + movdqu (%rdi), %xmm6 // h0, h1, h2, h3 + pxor %xmm7, %xmm7 + pshufd $0x1b, %xmm6, %xmm6 // h3, h2, h1, h0 + pinsrd $3, 16(%rdi), %xmm7 // h4 in the highest word of xmm7 + movdqu shuf_mask(%rip), %xmm4 + + // main loop +0: movdqa %xmm6, %xmm8 // stash ABCD + movdqa %xmm7, %xmm9 // stash E + + // rounds 0--3 + movdqu 0*16(%rsi), %xmm0 // load first message block + pshufb %xmm4, %xmm0 // and byte-swap + paddd %xmm0, %xmm7 // E += w[0] + movdqa %xmm6, %xmm5 // E' = A + sha1rnds4 $0, %xmm7, %xmm6 // perform rounds 0--3 + + // rounds 4--7 + movdqu 1*16(%rsi), %xmm1 + pshufb %xmm4, %xmm1 + sha1nexte %xmm1, %xmm5 + movdqa %xmm6, %xmm7 + sha1rnds4 $0, %xmm5, %xmm6 + sha1msg1 %xmm1, %xmm0 + + // rounds 8--11 + movdqu 2*16(%rsi), %xmm2 + pshufb %xmm4, %xmm2 + sha1nexte %xmm2, %xmm7 + movdqa %xmm6, %xmm5 + sha1rnds4 $0, %xmm7, %xmm6 + sha1msg1 %xmm2, %xmm1 + pxor %xmm2, %xmm0 + +.macro midround msg3, msg0, msg1, msg2, e1, e0, k + sha1nexte \msg3, \e1 + movdqa %xmm6, \e0 + sha1msg2 \msg3, \msg0 + sha1rnds4 $\k, \e1, %xmm6 + sha1msg1 \msg3, \msg2 + pxor \msg3, \msg1 +.endm + + movdqu 3*16(%rsi), %xmm3 // load third message block + pshufb %xmm4, %xmm3 + + add $4*16, %rsi + + midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 0 // 12--15 + midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 0 // 16--19 + midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1 // 20--23 + midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 1 // 24--27 + midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 1 // 28--31 + midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 1 // 32--35 + midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1 // 36--39 + midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2 // 40--43 + midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 2 // 44--47 + midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 2 // 48--51 + midround %xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 2 // 52--55 + midround %xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2 // 56--59 + midround %xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 3 // 60--63 + midround %xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 3 // 64--67 + + // rounds 68--71 + sha1nexte %xmm1, %xmm5 + movdqa %xmm6, %xmm7 + sha1msg2 %xmm1, %xmm2 + sha1rnds4 $3, %xmm5, %xmm6 + pxor %xmm1, %xmm3 + + // rounds 72--75 + sha1nexte %xmm2, %xmm7 + movdqa %xmm6, %xmm5 + sha1msg2 %xmm2, %xmm3 + sha1rnds4 $3, %xmm7, %xmm6 + + // rounds 76--79 + sha1nexte %xmm3, %xmm5 + movdqa %xmm6, %xmm7 + sha1rnds4 $3, %xmm5, %xmm6 + + sha1nexte %xmm9, %xmm7 // add saved E + paddd %xmm8, %xmm6 // add saved ABCD + + cmp %rsi, %rcx // end reached? + jne 0b + + pshufd $0x1b, %xmm6, %xmm6 // restore order of h0--h3 + movdqu %xmm6, (%rdi) // write h0--h3 + pextrd $3, %xmm7, 16(%rdi) // write h4 +1: ret +END(_libmd_sha1block_shani) + + .section .rodata + .balign 16 +shuf_mask: + .8byte 0x08090a0b0c0d0e0f + .8byte 0x0001020304050607 + .size shuf_mask, .-shuf_mask + + .section .note.GNU-stack,"",%progbits diff --git a/lib/libmd/amd64/sha1dispatch.c b/lib/libmd/amd64/sha1dispatch.c new file mode 100644 --- /dev/null +++ b/lib/libmd/amd64/sha1dispatch.c @@ -0,0 +1,77 @@ +/*- + * Copyright (c) 2016 The Go Authors. All rights reserved. + * Copyright (c) 2024 Robert Clausecker + * + * Adapted from Go's crypto/sha1/sha1block_amd64.go. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include + +extern void _libmd_sha1block_scalar(SHA1_CTX *, const void *, size_t); +extern void _libmd_sha1block_avx2(SHA1_CTX *, const void *, size_t); +extern void _libmd_sha1block_shani(SHA1_CTX *, const void *, size_t); +static void sha1block_avx2_wrapper(SHA1_CTX *, const void *, size_t); + +#define AVX2_STDEXT_NEEDED \ + (CPUID_STDEXT_BMI1 | CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2) + +DEFINE_UIFUNC(, void, sha1_block, (SHA1_CTX *, const void *, size_t)) +{ + if (cpu_stdext_feature & CPUID_STDEXT_SHA) + return (_libmd_sha1block_shani); + if ((cpu_stdext_feature & AVX2_STDEXT_NEEDED) == AVX2_STDEXT_NEEDED) + return (sha1block_avx2_wrapper); + else + return (_libmd_sha1block_scalar); +} + +static void +sha1block_avx2_wrapper(SHA1_CTX *c, const void *data, size_t len) +{ + if (len >= 256) { + /* + * sha1block_avx2 calculates sha1 for 2 block per iteration. + * It also interleaves the precalculation for next the block. + * So it may read up-to 192 bytes past the end of p. + * We may add checks inside sha1block_avx2, but this will + * just turn it into a copy of sha1block_scalar, + * so call it directly, instead. 
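+		 *
+		 * Worked example (illustrative): for len = 320, safe_len
+		 * starts at 192, which is not a multiple of 128 and is
+		 * reduced to 128; sha1block_avx2 then handles the first
+		 * two blocks and sha1block_scalar the remaining three,
+		 * so the over-read stays within the caller's buffer.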
+ */ + size_t safe_len = len - 128; + + if (safe_len % 128 != 0) + safe_len -= 64; + + _libmd_sha1block_avx2(c, data, safe_len); + _libmd_sha1block_scalar(c, data + safe_len, len - safe_len); + } else + _libmd_sha1block_scalar(c, data, len); +} diff --git a/lib/libmd/i386/sha.S b/lib/libmd/i386/sha.S deleted file mode 100644 --- a/lib/libmd/i386/sha.S +++ /dev/null @@ -1,1951 +0,0 @@ -/* -*- Fundamental -*- Emacs' assembler mode hoses this file */ -#ifndef PIC -/* Run the C pre-processor over this file with one of the following defined - * ELF - elf object files, - * OUT - a.out object files, - * BSDI - BSDI style a.out object files - * SOL - Solaris style elf - */ - -#define TYPE(a,b) .type a,b -#define SIZE(a,b) .size a,b - -#if defined(OUT) || defined(BSDI) -#define sha1_block_x86 _sha1_block_x86 - -#endif - -#ifdef OUT -#define OK 1 -#define ALIGN 4 -#endif - -#ifdef BSDI -#define OK 1 -#define ALIGN 4 -#undef SIZE -#undef TYPE -#define SIZE(a,b) -#define TYPE(a,b) -#endif - -#if defined(ELF) || defined(SOL) -#define OK 1 -#define ALIGN 4 -#endif - -#ifndef OK -You need to define one of -ELF - elf systems - linux-elf, NetBSD and DG-UX -OUT - a.out systems - linux-a.out and FreeBSD -SOL - solaris systems, which are elf with strange comment lines -BSDI - a.out with a very primative version of as. -#endif - -/* Let the Assembler begin :-) */ - /* Don't even think of reading this code */ - /* It was automatically generated by sha1-586.pl */ - /* Which is a perl program used to generate the x86 assember for */ - /* any of elf, a.out, BSDI,Win32, or Solaris */ - /* eric */ - - .file "sha1-586.s" - .version "01.01" -gcc2_compiled.: -.text - .p2align ALIGN -.globl sha1_block_x86 - TYPE(sha1_block_x86,@function) -sha1_block_x86: - pushl %esi - pushl %ebp - movl 20(%esp), %eax - movl 16(%esp), %esi - addl %esi, %eax - movl 12(%esp), %ebp - pushl %ebx - subl $64, %eax - pushl %edi - movl 4(%ebp), %ebx - subl $72, %esp - movl 12(%ebp), %edx - movl 16(%ebp), %edi - movl 8(%ebp), %ecx - movl %eax, 68(%esp) - /* First we need to setup the X array */ - movl (%esi), %eax -.L000start: - /* First, load the words onto the stack in network byte order */ -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, (%esp) - movl 4(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 4(%esp) - movl 8(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 8(%esp) - movl 12(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 12(%esp) - movl 16(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 16(%esp) - movl 20(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 20(%esp) - movl 24(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 24(%esp) - movl 28(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 28(%esp) - movl 32(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 32(%esp) - movl 36(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 36(%esp) - movl 40(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 40(%esp) - movl 44(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 44(%esp) - movl 48(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 48(%esp) - movl 52(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 52(%esp) - movl 56(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 56(%esp) - movl 60(%esi), %eax -.byte 15 -.byte 200 /* bswapl %eax */ - movl %eax, 60(%esp) - /* We now have the X array on the stack */ - /* starting at sp-4 */ - movl 
%esi, 64(%esp) - - /* Start processing */ - movl (%ebp), %eax - /* 00_15 0 */ - movl %ecx, %esi - movl %eax, %ebp - xorl %edx, %esi - roll $5, %ebp - andl %ebx, %esi - addl %edi, %ebp -.byte 209 -.byte 203 /* rorl $1 %ebx */ - movl (%esp), %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - xorl %edx, %esi - leal 1518500249(%ebp,%edi,1),%ebp - movl %ebx, %edi - addl %ebp, %esi - xorl %ecx, %edi - movl %esi, %ebp - andl %eax, %edi - roll $5, %ebp - addl %edx, %ebp - movl 4(%esp), %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - xorl %ecx, %edi -.byte 209 -.byte 200 /* rorl $1 %eax */ - leal 1518500249(%ebp,%edx,1),%ebp - addl %ebp, %edi - /* 00_15 2 */ - movl %eax, %edx - movl %edi, %ebp - xorl %ebx, %edx - roll $5, %ebp - andl %esi, %edx - addl %ecx, %ebp -.byte 209 -.byte 206 /* rorl $1 %esi */ - movl 8(%esp), %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - xorl %ebx, %edx - leal 1518500249(%ebp,%ecx,1),%ebp - movl %esi, %ecx - addl %ebp, %edx - xorl %eax, %ecx - movl %edx, %ebp - andl %edi, %ecx - roll $5, %ebp - addl %ebx, %ebp - movl 12(%esp), %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - xorl %eax, %ecx -.byte 209 -.byte 207 /* rorl $1 %edi */ - leal 1518500249(%ebp,%ebx,1),%ebp - addl %ebp, %ecx - /* 00_15 4 */ - movl %edi, %ebx - movl %ecx, %ebp - xorl %esi, %ebx - roll $5, %ebp - andl %edx, %ebx - addl %eax, %ebp -.byte 209 -.byte 202 /* rorl $1 %edx */ - movl 16(%esp), %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - xorl %esi, %ebx - leal 1518500249(%ebp,%eax,1),%ebp - movl %edx, %eax - addl %ebp, %ebx - xorl %edi, %eax - movl %ebx, %ebp - andl %ecx, %eax - roll $5, %ebp - addl %esi, %ebp - movl 20(%esp), %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - xorl %edi, %eax -.byte 209 -.byte 201 /* rorl $1 %ecx */ - leal 1518500249(%ebp,%esi,1),%ebp - addl %ebp, %eax - /* 00_15 6 */ - movl %ecx, %esi - movl %eax, %ebp - xorl %edx, %esi - roll $5, %ebp - andl %ebx, %esi - addl %edi, %ebp -.byte 209 -.byte 203 /* rorl $1 %ebx */ - movl 24(%esp), %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - xorl %edx, %esi - leal 1518500249(%ebp,%edi,1),%ebp - movl %ebx, %edi - addl %ebp, %esi - xorl %ecx, %edi - movl %esi, %ebp - andl %eax, %edi - roll $5, %ebp - addl %edx, %ebp - movl 28(%esp), %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - xorl %ecx, %edi -.byte 209 -.byte 200 /* rorl $1 %eax */ - leal 1518500249(%ebp,%edx,1),%ebp - addl %ebp, %edi - /* 00_15 8 */ - movl %eax, %edx - movl %edi, %ebp - xorl %ebx, %edx - roll $5, %ebp - andl %esi, %edx - addl %ecx, %ebp -.byte 209 -.byte 206 /* rorl $1 %esi */ - movl 32(%esp), %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - xorl %ebx, %edx - leal 1518500249(%ebp,%ecx,1),%ebp - movl %esi, %ecx - addl %ebp, %edx - xorl %eax, %ecx - movl %edx, %ebp - andl %edi, %ecx - roll $5, %ebp - addl %ebx, %ebp - movl 36(%esp), %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - xorl %eax, %ecx -.byte 209 -.byte 207 /* rorl $1 %edi */ - leal 1518500249(%ebp,%ebx,1),%ebp - addl %ebp, %ecx - /* 00_15 10 */ - movl %edi, %ebx - movl %ecx, %ebp - xorl %esi, %ebx - roll $5, %ebp - andl %edx, %ebx - addl %eax, %ebp -.byte 209 -.byte 202 /* rorl $1 %edx */ - movl 40(%esp), %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - xorl %esi, %ebx - leal 1518500249(%ebp,%eax,1),%ebp - movl %edx, %eax - addl %ebp, %ebx - xorl %edi, %eax - movl %ebx, %ebp - andl %ecx, %eax - roll $5, %ebp - addl %esi, %ebp - movl 44(%esp), %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - xorl %edi, %eax -.byte 209 -.byte 201 /* rorl $1 %ecx */ - leal 1518500249(%ebp,%esi,1),%ebp - addl %ebp, %eax - /* 00_15 12 
*/ - movl %ecx, %esi - movl %eax, %ebp - xorl %edx, %esi - roll $5, %ebp - andl %ebx, %esi - addl %edi, %ebp -.byte 209 -.byte 203 /* rorl $1 %ebx */ - movl 48(%esp), %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - xorl %edx, %esi - leal 1518500249(%ebp,%edi,1),%ebp - movl %ebx, %edi - addl %ebp, %esi - xorl %ecx, %edi - movl %esi, %ebp - andl %eax, %edi - roll $5, %ebp - addl %edx, %ebp - movl 52(%esp), %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - xorl %ecx, %edi -.byte 209 -.byte 200 /* rorl $1 %eax */ - leal 1518500249(%ebp,%edx,1),%ebp - addl %ebp, %edi - /* 00_15 14 */ - movl %eax, %edx - movl %edi, %ebp - xorl %ebx, %edx - roll $5, %ebp - andl %esi, %edx - addl %ecx, %ebp -.byte 209 -.byte 206 /* rorl $1 %esi */ - movl 56(%esp), %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - xorl %ebx, %edx - leal 1518500249(%ebp,%ecx,1),%ebp - movl %esi, %ecx - addl %ebp, %edx - xorl %eax, %ecx - movl %edx, %ebp - andl %edi, %ecx - roll $5, %ebp - addl %ebx, %ebp - movl 60(%esp), %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - xorl %eax, %ecx -.byte 209 -.byte 207 /* rorl $1 %edi */ - leal 1518500249(%ebp,%ebx,1),%ebp - addl %ebp, %ecx - /* 16_19 16 */ - nop - movl (%esp), %ebp - movl 8(%esp), %ebx - xorl %ebp, %ebx - movl 32(%esp), %ebp - xorl %ebp, %ebx - movl 52(%esp), %ebp - xorl %ebp, %ebx - movl %edi, %ebp -.byte 209 -.byte 195 /* roll $1 %ebx */ - xorl %esi, %ebp - movl %ebx, (%esp) - andl %edx, %ebp - leal 1518500249(%ebx,%eax,1),%ebx - xorl %esi, %ebp - movl %ecx, %eax - addl %ebp, %ebx - roll $5, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - addl %eax, %ebx - movl 4(%esp), %eax - movl 12(%esp), %ebp - xorl %ebp, %eax - movl 36(%esp), %ebp - xorl %ebp, %eax - movl 56(%esp), %ebp -.byte 209 -.byte 202 /* rorl $1 %edx */ - xorl %ebp, %eax -.byte 209 -.byte 192 /* roll $1 %eax */ - movl %edx, %ebp - xorl %edi, %ebp - movl %eax, 4(%esp) - andl %ecx, %ebp - leal 1518500249(%eax,%esi,1),%eax - xorl %edi, %ebp - movl %ebx, %esi - roll $5, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %esi, %eax -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %ebp, %eax - /* 16_19 18 */ - movl 8(%esp), %ebp - movl 16(%esp), %esi - xorl %ebp, %esi - movl 40(%esp), %ebp - xorl %ebp, %esi - movl 60(%esp), %ebp - xorl %ebp, %esi - movl %ecx, %ebp -.byte 209 -.byte 198 /* roll $1 %esi */ - xorl %edx, %ebp - movl %esi, 8(%esp) - andl %ebx, %ebp - leal 1518500249(%esi,%edi,1),%esi - xorl %edx, %ebp - movl %eax, %edi - addl %ebp, %esi - roll $5, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %edi, %esi - movl 12(%esp), %edi - movl 20(%esp), %ebp - xorl %ebp, %edi - movl 44(%esp), %ebp - xorl %ebp, %edi - movl (%esp), %ebp -.byte 209 -.byte 203 /* rorl $1 %ebx */ - xorl %ebp, %edi -.byte 209 -.byte 199 /* roll $1 %edi */ - movl %ebx, %ebp - xorl %ecx, %ebp - movl %edi, 12(%esp) - andl %eax, %ebp - leal 1518500249(%edi,%edx,1),%edi - xorl %ecx, %ebp - movl %esi, %edx - roll $5, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %edx, %edi -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %ebp, %edi - /* 20_39 20 */ - movl 16(%esp), %edx - movl 24(%esp), %ebp - xorl %ebp, %edx - movl 48(%esp), %ebp - xorl %ebp, %edx - movl 4(%esp), %ebp - xorl %ebp, %edx - movl %esi, %ebp -.byte 209 -.byte 194 /* roll $1 %edx */ - xorl %eax, %ebp - movl %edx, 16(%esp) - xorl %ebx, %ebp - leal 1859775393(%edx,%ecx,1),%edx - movl %edi, %ecx - roll $5, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ebp, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ecx, %edx - /* 20_39 21 */ - movl 20(%esp), %ecx - movl 
28(%esp), %ebp - xorl %ebp, %ecx - movl 52(%esp), %ebp - xorl %ebp, %ecx - movl 8(%esp), %ebp - xorl %ebp, %ecx - movl %edi, %ebp -.byte 209 -.byte 193 /* roll $1 %ecx */ - xorl %esi, %ebp - movl %ecx, 20(%esp) - xorl %eax, %ebp - leal 1859775393(%ecx,%ebx,1),%ecx - movl %edx, %ebx - roll $5, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebp, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebx, %ecx - /* 20_39 22 */ - movl 24(%esp), %ebx - movl 32(%esp), %ebp - xorl %ebp, %ebx - movl 56(%esp), %ebp - xorl %ebp, %ebx - movl 12(%esp), %ebp - xorl %ebp, %ebx - movl %edx, %ebp -.byte 209 -.byte 195 /* roll $1 %ebx */ - xorl %edi, %ebp - movl %ebx, 24(%esp) - xorl %esi, %ebp - leal 1859775393(%ebx,%eax,1),%ebx - movl %ecx, %eax - roll $5, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - addl %ebp, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - addl %eax, %ebx - /* 20_39 23 */ - movl 28(%esp), %eax - movl 36(%esp), %ebp - xorl %ebp, %eax - movl 60(%esp), %ebp - xorl %ebp, %eax - movl 16(%esp), %ebp - xorl %ebp, %eax - movl %ecx, %ebp -.byte 209 -.byte 192 /* roll $1 %eax */ - xorl %edx, %ebp - movl %eax, 28(%esp) - xorl %edi, %ebp - leal 1859775393(%eax,%esi,1),%eax - movl %ebx, %esi - roll $5, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %ebp, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %esi, %eax - /* 20_39 24 */ - movl 32(%esp), %esi - movl 40(%esp), %ebp - xorl %ebp, %esi - movl (%esp), %ebp - xorl %ebp, %esi - movl 20(%esp), %ebp - xorl %ebp, %esi - movl %ebx, %ebp -.byte 209 -.byte 198 /* roll $1 %esi */ - xorl %ecx, %ebp - movl %esi, 32(%esp) - xorl %edx, %ebp - leal 1859775393(%esi,%edi,1),%esi - movl %eax, %edi - roll $5, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %ebp, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %edi, %esi - /* 20_39 25 */ - movl 36(%esp), %edi - movl 44(%esp), %ebp - xorl %ebp, %edi - movl 4(%esp), %ebp - xorl %ebp, %edi - movl 24(%esp), %ebp - xorl %ebp, %edi - movl %eax, %ebp -.byte 209 -.byte 199 /* roll $1 %edi */ - xorl %ebx, %ebp - movl %edi, 36(%esp) - xorl %ecx, %ebp - leal 1859775393(%edi,%edx,1),%edi - movl %esi, %edx - roll $5, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %ebp, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %edx, %edi - /* 20_39 26 */ - movl 40(%esp), %edx - movl 48(%esp), %ebp - xorl %ebp, %edx - movl 8(%esp), %ebp - xorl %ebp, %edx - movl 28(%esp), %ebp - xorl %ebp, %edx - movl %esi, %ebp -.byte 209 -.byte 194 /* roll $1 %edx */ - xorl %eax, %ebp - movl %edx, 40(%esp) - xorl %ebx, %ebp - leal 1859775393(%edx,%ecx,1),%edx - movl %edi, %ecx - roll $5, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ebp, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ecx, %edx - /* 20_39 27 */ - movl 44(%esp), %ecx - movl 52(%esp), %ebp - xorl %ebp, %ecx - movl 12(%esp), %ebp - xorl %ebp, %ecx - movl 32(%esp), %ebp - xorl %ebp, %ecx - movl %edi, %ebp -.byte 209 -.byte 193 /* roll $1 %ecx */ - xorl %esi, %ebp - movl %ecx, 44(%esp) - xorl %eax, %ebp - leal 1859775393(%ecx,%ebx,1),%ecx - movl %edx, %ebx - roll $5, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebp, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebx, %ecx - /* 20_39 28 */ - movl 48(%esp), %ebx - movl 56(%esp), %ebp - xorl %ebp, %ebx - movl 16(%esp), %ebp - xorl %ebp, %ebx - movl 36(%esp), %ebp - xorl %ebp, %ebx - movl %edx, %ebp -.byte 209 -.byte 195 /* roll $1 %ebx */ - xorl %edi, %ebp - movl %ebx, 48(%esp) - xorl %esi, %ebp - leal 1859775393(%ebx,%eax,1),%ebx - movl %ecx, %eax - roll $5, %eax -.byte 209 
-.byte 202 /* rorl $1 %edx */ - addl %ebp, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - addl %eax, %ebx - /* 20_39 29 */ - movl 52(%esp), %eax - movl 60(%esp), %ebp - xorl %ebp, %eax - movl 20(%esp), %ebp - xorl %ebp, %eax - movl 40(%esp), %ebp - xorl %ebp, %eax - movl %ecx, %ebp -.byte 209 -.byte 192 /* roll $1 %eax */ - xorl %edx, %ebp - movl %eax, 52(%esp) - xorl %edi, %ebp - leal 1859775393(%eax,%esi,1),%eax - movl %ebx, %esi - roll $5, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %ebp, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %esi, %eax - /* 20_39 30 */ - movl 56(%esp), %esi - movl (%esp), %ebp - xorl %ebp, %esi - movl 24(%esp), %ebp - xorl %ebp, %esi - movl 44(%esp), %ebp - xorl %ebp, %esi - movl %ebx, %ebp -.byte 209 -.byte 198 /* roll $1 %esi */ - xorl %ecx, %ebp - movl %esi, 56(%esp) - xorl %edx, %ebp - leal 1859775393(%esi,%edi,1),%esi - movl %eax, %edi - roll $5, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %ebp, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %edi, %esi - /* 20_39 31 */ - movl 60(%esp), %edi - movl 4(%esp), %ebp - xorl %ebp, %edi - movl 28(%esp), %ebp - xorl %ebp, %edi - movl 48(%esp), %ebp - xorl %ebp, %edi - movl %eax, %ebp -.byte 209 -.byte 199 /* roll $1 %edi */ - xorl %ebx, %ebp - movl %edi, 60(%esp) - xorl %ecx, %ebp - leal 1859775393(%edi,%edx,1),%edi - movl %esi, %edx - roll $5, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %ebp, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %edx, %edi - /* 20_39 32 */ - movl (%esp), %edx - movl 8(%esp), %ebp - xorl %ebp, %edx - movl 32(%esp), %ebp - xorl %ebp, %edx - movl 52(%esp), %ebp - xorl %ebp, %edx - movl %esi, %ebp -.byte 209 -.byte 194 /* roll $1 %edx */ - xorl %eax, %ebp - movl %edx, (%esp) - xorl %ebx, %ebp - leal 1859775393(%edx,%ecx,1),%edx - movl %edi, %ecx - roll $5, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ebp, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ecx, %edx - /* 20_39 33 */ - movl 4(%esp), %ecx - movl 12(%esp), %ebp - xorl %ebp, %ecx - movl 36(%esp), %ebp - xorl %ebp, %ecx - movl 56(%esp), %ebp - xorl %ebp, %ecx - movl %edi, %ebp -.byte 209 -.byte 193 /* roll $1 %ecx */ - xorl %esi, %ebp - movl %ecx, 4(%esp) - xorl %eax, %ebp - leal 1859775393(%ecx,%ebx,1),%ecx - movl %edx, %ebx - roll $5, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebp, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebx, %ecx - /* 20_39 34 */ - movl 8(%esp), %ebx - movl 16(%esp), %ebp - xorl %ebp, %ebx - movl 40(%esp), %ebp - xorl %ebp, %ebx - movl 60(%esp), %ebp - xorl %ebp, %ebx - movl %edx, %ebp -.byte 209 -.byte 195 /* roll $1 %ebx */ - xorl %edi, %ebp - movl %ebx, 8(%esp) - xorl %esi, %ebp - leal 1859775393(%ebx,%eax,1),%ebx - movl %ecx, %eax - roll $5, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - addl %ebp, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - addl %eax, %ebx - /* 20_39 35 */ - movl 12(%esp), %eax - movl 20(%esp), %ebp - xorl %ebp, %eax - movl 44(%esp), %ebp - xorl %ebp, %eax - movl (%esp), %ebp - xorl %ebp, %eax - movl %ecx, %ebp -.byte 209 -.byte 192 /* roll $1 %eax */ - xorl %edx, %ebp - movl %eax, 12(%esp) - xorl %edi, %ebp - leal 1859775393(%eax,%esi,1),%eax - movl %ebx, %esi - roll $5, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %ebp, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %esi, %eax - /* 20_39 36 */ - movl 16(%esp), %esi - movl 24(%esp), %ebp - xorl %ebp, %esi - movl 48(%esp), %ebp - xorl %ebp, %esi - movl 4(%esp), %ebp - xorl %ebp, %esi - movl %ebx, %ebp -.byte 209 -.byte 198 /* roll $1 %esi 
*/ - xorl %ecx, %ebp - movl %esi, 16(%esp) - xorl %edx, %ebp - leal 1859775393(%esi,%edi,1),%esi - movl %eax, %edi - roll $5, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %ebp, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %edi, %esi - /* 20_39 37 */ - movl 20(%esp), %edi - movl 28(%esp), %ebp - xorl %ebp, %edi - movl 52(%esp), %ebp - xorl %ebp, %edi - movl 8(%esp), %ebp - xorl %ebp, %edi - movl %eax, %ebp -.byte 209 -.byte 199 /* roll $1 %edi */ - xorl %ebx, %ebp - movl %edi, 20(%esp) - xorl %ecx, %ebp - leal 1859775393(%edi,%edx,1),%edi - movl %esi, %edx - roll $5, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %ebp, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %edx, %edi - /* 20_39 38 */ - movl 24(%esp), %edx - movl 32(%esp), %ebp - xorl %ebp, %edx - movl 56(%esp), %ebp - xorl %ebp, %edx - movl 12(%esp), %ebp - xorl %ebp, %edx - movl %esi, %ebp -.byte 209 -.byte 194 /* roll $1 %edx */ - xorl %eax, %ebp - movl %edx, 24(%esp) - xorl %ebx, %ebp - leal 1859775393(%edx,%ecx,1),%edx - movl %edi, %ecx - roll $5, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ebp, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ecx, %edx - /* 20_39 39 */ - movl 28(%esp), %ecx - movl 36(%esp), %ebp - xorl %ebp, %ecx - movl 60(%esp), %ebp - xorl %ebp, %ecx - movl 16(%esp), %ebp - xorl %ebp, %ecx - movl %edi, %ebp -.byte 209 -.byte 193 /* roll $1 %ecx */ - xorl %esi, %ebp - movl %ecx, 28(%esp) - xorl %eax, %ebp - leal 1859775393(%ecx,%ebx,1),%ecx - movl %edx, %ebx - roll $5, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebp, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebx, %ecx - /* 40_59 40 */ - movl 32(%esp), %ebx - movl 40(%esp), %ebp - xorl %ebp, %ebx - movl (%esp), %ebp - xorl %ebp, %ebx - movl 20(%esp), %ebp - xorl %ebp, %ebx - movl %edx, %ebp -.byte 209 -.byte 195 /* roll $1 %ebx */ - orl %edi, %ebp - movl %ebx, 32(%esp) - andl %esi, %ebp - leal 2400959708(%ebx,%eax,1),%ebx - movl %edx, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - andl %edi, %eax - orl %eax, %ebp - movl %ecx, %eax - roll $5, %eax - addl %eax, %ebp - movl 36(%esp), %eax - addl %ebp, %ebx - movl 44(%esp), %ebp - xorl %ebp, %eax - movl 4(%esp), %ebp - xorl %ebp, %eax - movl 24(%esp), %ebp -.byte 209 -.byte 202 /* rorl $1 %edx */ - xorl %ebp, %eax -.byte 209 -.byte 192 /* roll $1 %eax */ - movl %ecx, %ebp - movl %eax, 36(%esp) - orl %edx, %ebp - leal 2400959708(%eax,%esi,1),%eax - movl %ecx, %esi - andl %edi, %ebp - andl %edx, %esi - orl %esi, %ebp - movl %ebx, %esi - roll $5, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %esi, %ebp -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %ebp, %eax - /* 40_59 41 */ - /* 40_59 42 */ - movl 40(%esp), %esi - movl 48(%esp), %ebp - xorl %ebp, %esi - movl 8(%esp), %ebp - xorl %ebp, %esi - movl 28(%esp), %ebp - xorl %ebp, %esi - movl %ebx, %ebp -.byte 209 -.byte 198 /* roll $1 %esi */ - orl %ecx, %ebp - movl %esi, 40(%esp) - andl %edx, %ebp - leal 2400959708(%esi,%edi,1),%esi - movl %ebx, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - andl %ecx, %edi - orl %edi, %ebp - movl %eax, %edi - roll $5, %edi - addl %edi, %ebp - movl 44(%esp), %edi - addl %ebp, %esi - movl 52(%esp), %ebp - xorl %ebp, %edi - movl 12(%esp), %ebp - xorl %ebp, %edi - movl 32(%esp), %ebp -.byte 209 -.byte 203 /* rorl $1 %ebx */ - xorl %ebp, %edi -.byte 209 -.byte 199 /* roll $1 %edi */ - movl %eax, %ebp - movl %edi, 44(%esp) - orl %ebx, %ebp - leal 2400959708(%edi,%edx,1),%edi - movl %eax, %edx - andl %ecx, %ebp - andl %ebx, %edx - orl %edx, %ebp - movl %esi, %edx - 
roll $5, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %edx, %ebp -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %ebp, %edi - /* 40_59 43 */ - /* 40_59 44 */ - movl 48(%esp), %edx - movl 56(%esp), %ebp - xorl %ebp, %edx - movl 16(%esp), %ebp - xorl %ebp, %edx - movl 36(%esp), %ebp - xorl %ebp, %edx - movl %esi, %ebp -.byte 209 -.byte 194 /* roll $1 %edx */ - orl %eax, %ebp - movl %edx, 48(%esp) - andl %ebx, %ebp - leal 2400959708(%edx,%ecx,1),%edx - movl %esi, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - andl %eax, %ecx - orl %ecx, %ebp - movl %edi, %ecx - roll $5, %ecx - addl %ecx, %ebp - movl 52(%esp), %ecx - addl %ebp, %edx - movl 60(%esp), %ebp - xorl %ebp, %ecx - movl 20(%esp), %ebp - xorl %ebp, %ecx - movl 40(%esp), %ebp -.byte 209 -.byte 206 /* rorl $1 %esi */ - xorl %ebp, %ecx -.byte 209 -.byte 193 /* roll $1 %ecx */ - movl %edi, %ebp - movl %ecx, 52(%esp) - orl %esi, %ebp - leal 2400959708(%ecx,%ebx,1),%ecx - movl %edi, %ebx - andl %eax, %ebp - andl %esi, %ebx - orl %ebx, %ebp - movl %edx, %ebx - roll $5, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebx, %ebp -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebp, %ecx - /* 40_59 45 */ - /* 40_59 46 */ - movl 56(%esp), %ebx - movl (%esp), %ebp - xorl %ebp, %ebx - movl 24(%esp), %ebp - xorl %ebp, %ebx - movl 44(%esp), %ebp - xorl %ebp, %ebx - movl %edx, %ebp -.byte 209 -.byte 195 /* roll $1 %ebx */ - orl %edi, %ebp - movl %ebx, 56(%esp) - andl %esi, %ebp - leal 2400959708(%ebx,%eax,1),%ebx - movl %edx, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - andl %edi, %eax - orl %eax, %ebp - movl %ecx, %eax - roll $5, %eax - addl %eax, %ebp - movl 60(%esp), %eax - addl %ebp, %ebx - movl 4(%esp), %ebp - xorl %ebp, %eax - movl 28(%esp), %ebp - xorl %ebp, %eax - movl 48(%esp), %ebp -.byte 209 -.byte 202 /* rorl $1 %edx */ - xorl %ebp, %eax -.byte 209 -.byte 192 /* roll $1 %eax */ - movl %ecx, %ebp - movl %eax, 60(%esp) - orl %edx, %ebp - leal 2400959708(%eax,%esi,1),%eax - movl %ecx, %esi - andl %edi, %ebp - andl %edx, %esi - orl %esi, %ebp - movl %ebx, %esi - roll $5, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %esi, %ebp -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %ebp, %eax - /* 40_59 47 */ - /* 40_59 48 */ - movl (%esp), %esi - movl 8(%esp), %ebp - xorl %ebp, %esi - movl 32(%esp), %ebp - xorl %ebp, %esi - movl 52(%esp), %ebp - xorl %ebp, %esi - movl %ebx, %ebp -.byte 209 -.byte 198 /* roll $1 %esi */ - orl %ecx, %ebp - movl %esi, (%esp) - andl %edx, %ebp - leal 2400959708(%esi,%edi,1),%esi - movl %ebx, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - andl %ecx, %edi - orl %edi, %ebp - movl %eax, %edi - roll $5, %edi - addl %edi, %ebp - movl 4(%esp), %edi - addl %ebp, %esi - movl 12(%esp), %ebp - xorl %ebp, %edi - movl 36(%esp), %ebp - xorl %ebp, %edi - movl 56(%esp), %ebp -.byte 209 -.byte 203 /* rorl $1 %ebx */ - xorl %ebp, %edi -.byte 209 -.byte 199 /* roll $1 %edi */ - movl %eax, %ebp - movl %edi, 4(%esp) - orl %ebx, %ebp - leal 2400959708(%edi,%edx,1),%edi - movl %eax, %edx - andl %ecx, %ebp - andl %ebx, %edx - orl %edx, %ebp - movl %esi, %edx - roll $5, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %edx, %ebp -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %ebp, %edi - /* 40_59 49 */ - /* 40_59 50 */ - movl 8(%esp), %edx - movl 16(%esp), %ebp - xorl %ebp, %edx - movl 40(%esp), %ebp - xorl %ebp, %edx - movl 60(%esp), %ebp - xorl %ebp, %edx - movl %esi, %ebp -.byte 209 -.byte 194 /* roll $1 %edx */ - orl %eax, %ebp - movl %edx, 8(%esp) - andl %ebx, %ebp - leal 2400959708(%edx,%ecx,1),%edx - movl 
%esi, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - andl %eax, %ecx - orl %ecx, %ebp - movl %edi, %ecx - roll $5, %ecx - addl %ecx, %ebp - movl 12(%esp), %ecx - addl %ebp, %edx - movl 20(%esp), %ebp - xorl %ebp, %ecx - movl 44(%esp), %ebp - xorl %ebp, %ecx - movl (%esp), %ebp -.byte 209 -.byte 206 /* rorl $1 %esi */ - xorl %ebp, %ecx -.byte 209 -.byte 193 /* roll $1 %ecx */ - movl %edi, %ebp - movl %ecx, 12(%esp) - orl %esi, %ebp - leal 2400959708(%ecx,%ebx,1),%ecx - movl %edi, %ebx - andl %eax, %ebp - andl %esi, %ebx - orl %ebx, %ebp - movl %edx, %ebx - roll $5, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebx, %ebp -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebp, %ecx - /* 40_59 51 */ - /* 40_59 52 */ - movl 16(%esp), %ebx - movl 24(%esp), %ebp - xorl %ebp, %ebx - movl 48(%esp), %ebp - xorl %ebp, %ebx - movl 4(%esp), %ebp - xorl %ebp, %ebx - movl %edx, %ebp -.byte 209 -.byte 195 /* roll $1 %ebx */ - orl %edi, %ebp - movl %ebx, 16(%esp) - andl %esi, %ebp - leal 2400959708(%ebx,%eax,1),%ebx - movl %edx, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - andl %edi, %eax - orl %eax, %ebp - movl %ecx, %eax - roll $5, %eax - addl %eax, %ebp - movl 20(%esp), %eax - addl %ebp, %ebx - movl 28(%esp), %ebp - xorl %ebp, %eax - movl 52(%esp), %ebp - xorl %ebp, %eax - movl 8(%esp), %ebp -.byte 209 -.byte 202 /* rorl $1 %edx */ - xorl %ebp, %eax -.byte 209 -.byte 192 /* roll $1 %eax */ - movl %ecx, %ebp - movl %eax, 20(%esp) - orl %edx, %ebp - leal 2400959708(%eax,%esi,1),%eax - movl %ecx, %esi - andl %edi, %ebp - andl %edx, %esi - orl %esi, %ebp - movl %ebx, %esi - roll $5, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %esi, %ebp -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %ebp, %eax - /* 40_59 53 */ - /* 40_59 54 */ - movl 24(%esp), %esi - movl 32(%esp), %ebp - xorl %ebp, %esi - movl 56(%esp), %ebp - xorl %ebp, %esi - movl 12(%esp), %ebp - xorl %ebp, %esi - movl %ebx, %ebp -.byte 209 -.byte 198 /* roll $1 %esi */ - orl %ecx, %ebp - movl %esi, 24(%esp) - andl %edx, %ebp - leal 2400959708(%esi,%edi,1),%esi - movl %ebx, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - andl %ecx, %edi - orl %edi, %ebp - movl %eax, %edi - roll $5, %edi - addl %edi, %ebp - movl 28(%esp), %edi - addl %ebp, %esi - movl 36(%esp), %ebp - xorl %ebp, %edi - movl 60(%esp), %ebp - xorl %ebp, %edi - movl 16(%esp), %ebp -.byte 209 -.byte 203 /* rorl $1 %ebx */ - xorl %ebp, %edi -.byte 209 -.byte 199 /* roll $1 %edi */ - movl %eax, %ebp - movl %edi, 28(%esp) - orl %ebx, %ebp - leal 2400959708(%edi,%edx,1),%edi - movl %eax, %edx - andl %ecx, %ebp - andl %ebx, %edx - orl %edx, %ebp - movl %esi, %edx - roll $5, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %edx, %ebp -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %ebp, %edi - /* 40_59 55 */ - /* 40_59 56 */ - movl 32(%esp), %edx - movl 40(%esp), %ebp - xorl %ebp, %edx - movl (%esp), %ebp - xorl %ebp, %edx - movl 20(%esp), %ebp - xorl %ebp, %edx - movl %esi, %ebp -.byte 209 -.byte 194 /* roll $1 %edx */ - orl %eax, %ebp - movl %edx, 32(%esp) - andl %ebx, %ebp - leal 2400959708(%edx,%ecx,1),%edx - movl %esi, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - andl %eax, %ecx - orl %ecx, %ebp - movl %edi, %ecx - roll $5, %ecx - addl %ecx, %ebp - movl 36(%esp), %ecx - addl %ebp, %edx - movl 44(%esp), %ebp - xorl %ebp, %ecx - movl 4(%esp), %ebp - xorl %ebp, %ecx - movl 24(%esp), %ebp -.byte 209 -.byte 206 /* rorl $1 %esi */ - xorl %ebp, %ecx -.byte 209 -.byte 193 /* roll $1 %ecx */ - movl %edi, %ebp - movl %ecx, 36(%esp) - orl %esi, %ebp - leal 
2400959708(%ecx,%ebx,1),%ecx - movl %edi, %ebx - andl %eax, %ebp - andl %esi, %ebx - orl %ebx, %ebp - movl %edx, %ebx - roll $5, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebx, %ebp -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebp, %ecx - /* 40_59 57 */ - /* 40_59 58 */ - movl 40(%esp), %ebx - movl 48(%esp), %ebp - xorl %ebp, %ebx - movl 8(%esp), %ebp - xorl %ebp, %ebx - movl 28(%esp), %ebp - xorl %ebp, %ebx - movl %edx, %ebp -.byte 209 -.byte 195 /* roll $1 %ebx */ - orl %edi, %ebp - movl %ebx, 40(%esp) - andl %esi, %ebp - leal 2400959708(%ebx,%eax,1),%ebx - movl %edx, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - andl %edi, %eax - orl %eax, %ebp - movl %ecx, %eax - roll $5, %eax - addl %eax, %ebp - movl 44(%esp), %eax - addl %ebp, %ebx - movl 52(%esp), %ebp - xorl %ebp, %eax - movl 12(%esp), %ebp - xorl %ebp, %eax - movl 32(%esp), %ebp -.byte 209 -.byte 202 /* rorl $1 %edx */ - xorl %ebp, %eax -.byte 209 -.byte 192 /* roll $1 %eax */ - movl %ecx, %ebp - movl %eax, 44(%esp) - orl %edx, %ebp - leal 2400959708(%eax,%esi,1),%eax - movl %ecx, %esi - andl %edi, %ebp - andl %edx, %esi - orl %esi, %ebp - movl %ebx, %esi - roll $5, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %esi, %ebp -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %ebp, %eax - /* 40_59 59 */ - /* 20_39 60 */ - movl 48(%esp), %esi - movl 56(%esp), %ebp - xorl %ebp, %esi - movl 16(%esp), %ebp - xorl %ebp, %esi - movl 36(%esp), %ebp - xorl %ebp, %esi - movl %ebx, %ebp -.byte 209 -.byte 198 /* roll $1 %esi */ - xorl %ecx, %ebp - movl %esi, 48(%esp) - xorl %edx, %ebp - leal 3395469782(%esi,%edi,1),%esi - movl %eax, %edi - roll $5, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %ebp, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %edi, %esi - /* 20_39 61 */ - movl 52(%esp), %edi - movl 60(%esp), %ebp - xorl %ebp, %edi - movl 20(%esp), %ebp - xorl %ebp, %edi - movl 40(%esp), %ebp - xorl %ebp, %edi - movl %eax, %ebp -.byte 209 -.byte 199 /* roll $1 %edi */ - xorl %ebx, %ebp - movl %edi, 52(%esp) - xorl %ecx, %ebp - leal 3395469782(%edi,%edx,1),%edi - movl %esi, %edx - roll $5, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %ebp, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %edx, %edi - /* 20_39 62 */ - movl 56(%esp), %edx - movl (%esp), %ebp - xorl %ebp, %edx - movl 24(%esp), %ebp - xorl %ebp, %edx - movl 44(%esp), %ebp - xorl %ebp, %edx - movl %esi, %ebp -.byte 209 -.byte 194 /* roll $1 %edx */ - xorl %eax, %ebp - movl %edx, 56(%esp) - xorl %ebx, %ebp - leal 3395469782(%edx,%ecx,1),%edx - movl %edi, %ecx - roll $5, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ebp, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ecx, %edx - /* 20_39 63 */ - movl 60(%esp), %ecx - movl 4(%esp), %ebp - xorl %ebp, %ecx - movl 28(%esp), %ebp - xorl %ebp, %ecx - movl 48(%esp), %ebp - xorl %ebp, %ecx - movl %edi, %ebp -.byte 209 -.byte 193 /* roll $1 %ecx */ - xorl %esi, %ebp - movl %ecx, 60(%esp) - xorl %eax, %ebp - leal 3395469782(%ecx,%ebx,1),%ecx - movl %edx, %ebx - roll $5, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebp, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebx, %ecx - /* 20_39 64 */ - movl (%esp), %ebx - movl 8(%esp), %ebp - xorl %ebp, %ebx - movl 32(%esp), %ebp - xorl %ebp, %ebx - movl 52(%esp), %ebp - xorl %ebp, %ebx - movl %edx, %ebp -.byte 209 -.byte 195 /* roll $1 %ebx */ - xorl %edi, %ebp - movl %ebx, (%esp) - xorl %esi, %ebp - leal 3395469782(%ebx,%eax,1),%ebx - movl %ecx, %eax - roll $5, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - addl %ebp, %eax 
-.byte 209 -.byte 202 /* rorl $1 %edx */ - addl %eax, %ebx - /* 20_39 65 */ - movl 4(%esp), %eax - movl 12(%esp), %ebp - xorl %ebp, %eax - movl 36(%esp), %ebp - xorl %ebp, %eax - movl 56(%esp), %ebp - xorl %ebp, %eax - movl %ecx, %ebp -.byte 209 -.byte 192 /* roll $1 %eax */ - xorl %edx, %ebp - movl %eax, 4(%esp) - xorl %edi, %ebp - leal 3395469782(%eax,%esi,1),%eax - movl %ebx, %esi - roll $5, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %ebp, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %esi, %eax - /* 20_39 66 */ - movl 8(%esp), %esi - movl 16(%esp), %ebp - xorl %ebp, %esi - movl 40(%esp), %ebp - xorl %ebp, %esi - movl 60(%esp), %ebp - xorl %ebp, %esi - movl %ebx, %ebp -.byte 209 -.byte 198 /* roll $1 %esi */ - xorl %ecx, %ebp - movl %esi, 8(%esp) - xorl %edx, %ebp - leal 3395469782(%esi,%edi,1),%esi - movl %eax, %edi - roll $5, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %ebp, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %edi, %esi - /* 20_39 67 */ - movl 12(%esp), %edi - movl 20(%esp), %ebp - xorl %ebp, %edi - movl 44(%esp), %ebp - xorl %ebp, %edi - movl (%esp), %ebp - xorl %ebp, %edi - movl %eax, %ebp -.byte 209 -.byte 199 /* roll $1 %edi */ - xorl %ebx, %ebp - movl %edi, 12(%esp) - xorl %ecx, %ebp - leal 3395469782(%edi,%edx,1),%edi - movl %esi, %edx - roll $5, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %ebp, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %edx, %edi - /* 20_39 68 */ - movl 16(%esp), %edx - movl 24(%esp), %ebp - xorl %ebp, %edx - movl 48(%esp), %ebp - xorl %ebp, %edx - movl 4(%esp), %ebp - xorl %ebp, %edx - movl %esi, %ebp -.byte 209 -.byte 194 /* roll $1 %edx */ - xorl %eax, %ebp - movl %edx, 16(%esp) - xorl %ebx, %ebp - leal 3395469782(%edx,%ecx,1),%edx - movl %edi, %ecx - roll $5, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ebp, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ecx, %edx - /* 20_39 69 */ - movl 20(%esp), %ecx - movl 28(%esp), %ebp - xorl %ebp, %ecx - movl 52(%esp), %ebp - xorl %ebp, %ecx - movl 8(%esp), %ebp - xorl %ebp, %ecx - movl %edi, %ebp -.byte 209 -.byte 193 /* roll $1 %ecx */ - xorl %esi, %ebp - movl %ecx, 20(%esp) - xorl %eax, %ebp - leal 3395469782(%ecx,%ebx,1),%ecx - movl %edx, %ebx - roll $5, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebp, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebx, %ecx - /* 20_39 70 */ - movl 24(%esp), %ebx - movl 32(%esp), %ebp - xorl %ebp, %ebx - movl 56(%esp), %ebp - xorl %ebp, %ebx - movl 12(%esp), %ebp - xorl %ebp, %ebx - movl %edx, %ebp -.byte 209 -.byte 195 /* roll $1 %ebx */ - xorl %edi, %ebp - movl %ebx, 24(%esp) - xorl %esi, %ebp - leal 3395469782(%ebx,%eax,1),%ebx - movl %ecx, %eax - roll $5, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - addl %ebp, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - addl %eax, %ebx - /* 20_39 71 */ - movl 28(%esp), %eax - movl 36(%esp), %ebp - xorl %ebp, %eax - movl 60(%esp), %ebp - xorl %ebp, %eax - movl 16(%esp), %ebp - xorl %ebp, %eax - movl %ecx, %ebp -.byte 209 -.byte 192 /* roll $1 %eax */ - xorl %edx, %ebp - movl %eax, 28(%esp) - xorl %edi, %ebp - leal 3395469782(%eax,%esi,1),%eax - movl %ebx, %esi - roll $5, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %ebp, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %esi, %eax - /* 20_39 72 */ - movl 32(%esp), %esi - movl 40(%esp), %ebp - xorl %ebp, %esi - movl (%esp), %ebp - xorl %ebp, %esi - movl 20(%esp), %ebp - xorl %ebp, %esi - movl %ebx, %ebp -.byte 209 -.byte 198 /* roll $1 %esi */ - xorl %ecx, %ebp - movl %esi, 32(%esp) 
- xorl %edx, %ebp - leal 3395469782(%esi,%edi,1),%esi - movl %eax, %edi - roll $5, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %ebp, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %edi, %esi - /* 20_39 73 */ - movl 36(%esp), %edi - movl 44(%esp), %ebp - xorl %ebp, %edi - movl 4(%esp), %ebp - xorl %ebp, %edi - movl 24(%esp), %ebp - xorl %ebp, %edi - movl %eax, %ebp -.byte 209 -.byte 199 /* roll $1 %edi */ - xorl %ebx, %ebp - movl %edi, 36(%esp) - xorl %ecx, %ebp - leal 3395469782(%edi,%edx,1),%edi - movl %esi, %edx - roll $5, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %ebp, %edx -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %edx, %edi - /* 20_39 74 */ - movl 40(%esp), %edx - movl 48(%esp), %ebp - xorl %ebp, %edx - movl 8(%esp), %ebp - xorl %ebp, %edx - movl 28(%esp), %ebp - xorl %ebp, %edx - movl %esi, %ebp -.byte 209 -.byte 194 /* roll $1 %edx */ - xorl %eax, %ebp - movl %edx, 40(%esp) - xorl %ebx, %ebp - leal 3395469782(%edx,%ecx,1),%edx - movl %edi, %ecx - roll $5, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ebp, %ecx -.byte 209 -.byte 206 /* rorl $1 %esi */ - addl %ecx, %edx - /* 20_39 75 */ - movl 44(%esp), %ecx - movl 52(%esp), %ebp - xorl %ebp, %ecx - movl 12(%esp), %ebp - xorl %ebp, %ecx - movl 32(%esp), %ebp - xorl %ebp, %ecx - movl %edi, %ebp -.byte 209 -.byte 193 /* roll $1 %ecx */ - xorl %esi, %ebp - movl %ecx, 44(%esp) - xorl %eax, %ebp - leal 3395469782(%ecx,%ebx,1),%ecx - movl %edx, %ebx - roll $5, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebp, %ebx -.byte 209 -.byte 207 /* rorl $1 %edi */ - addl %ebx, %ecx - /* 20_39 76 */ - movl 48(%esp), %ebx - movl 56(%esp), %ebp - xorl %ebp, %ebx - movl 16(%esp), %ebp - xorl %ebp, %ebx - movl 36(%esp), %ebp - xorl %ebp, %ebx - movl %edx, %ebp -.byte 209 -.byte 195 /* roll $1 %ebx */ - xorl %edi, %ebp - movl %ebx, 48(%esp) - xorl %esi, %ebp - leal 3395469782(%ebx,%eax,1),%ebx - movl %ecx, %eax - roll $5, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - addl %ebp, %eax -.byte 209 -.byte 202 /* rorl $1 %edx */ - addl %eax, %ebx - /* 20_39 77 */ - movl 52(%esp), %eax - movl 60(%esp), %ebp - xorl %ebp, %eax - movl 20(%esp), %ebp - xorl %ebp, %eax - movl 40(%esp), %ebp - xorl %ebp, %eax - movl %ecx, %ebp -.byte 209 -.byte 192 /* roll $1 %eax */ - xorl %edx, %ebp - movl %eax, 52(%esp) - xorl %edi, %ebp - leal 3395469782(%eax,%esi,1),%eax - movl %ebx, %esi - roll $5, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %ebp, %esi -.byte 209 -.byte 201 /* rorl $1 %ecx */ - addl %esi, %eax - /* 20_39 78 */ - movl 56(%esp), %esi - movl (%esp), %ebp - xorl %ebp, %esi - movl 24(%esp), %ebp - xorl %ebp, %esi - movl 44(%esp), %ebp - xorl %ebp, %esi - movl %ebx, %ebp -.byte 209 -.byte 198 /* roll $1 %esi */ - xorl %ecx, %ebp - movl %esi, 56(%esp) - xorl %edx, %ebp - leal 3395469782(%esi,%edi,1),%esi - movl %eax, %edi - roll $5, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %ebp, %edi -.byte 209 -.byte 203 /* rorl $1 %ebx */ - addl %edi, %esi - /* 20_39 79 */ - movl 60(%esp), %edi - movl 4(%esp), %ebp - xorl %ebp, %edi - movl 28(%esp), %ebp - xorl %ebp, %edi - movl 48(%esp), %ebp - xorl %ebp, %edi - movl %eax, %ebp -.byte 209 -.byte 199 /* roll $1 %edi */ - xorl %ebx, %ebp - movl %edi, 60(%esp) - xorl %ecx, %ebp - leal 3395469782(%edi,%edx,1),%edi - movl %esi, %edx - roll $5, %edx - addl %ebp, %edx - movl 92(%esp), %ebp -.byte 209 -.byte 200 /* rorl $1 %eax */ - addl %edx, %edi -.byte 209 -.byte 200 /* rorl $1 %eax */ - /* End processing */ - - movl 12(%ebp), %edx - addl %ebx, %edx - movl 4(%ebp), 
%ebx - addl %esi, %ebx - movl %eax, %esi - movl (%ebp), %eax - movl %edx, 12(%ebp) - addl %edi, %eax - movl 16(%ebp), %edi - addl %ecx, %edi - movl 8(%ebp), %ecx - addl %esi, %ecx - movl %eax, (%ebp) - movl 64(%esp), %esi - movl %ecx, 8(%ebp) - addl $64, %esi - movl 68(%esp), %eax - movl %edi, 16(%ebp) - cmpl %esi, %eax - movl %ebx, 4(%ebp) - jb .L001end - movl (%esi), %eax - jmp .L000start -.L001end: - addl $72, %esp - popl %edi - popl %ebx - popl %ebp - popl %esi - ret -.sha1_block_x86_end: - SIZE(sha1_block_x86,.sha1_block_x86_end-sha1_block_x86) -.ident "desasm.pl" -#endif diff --git a/lib/libmd/sha1c.c b/lib/libmd/sha1c.c --- a/lib/libmd/sha1c.c +++ b/lib/libmd/sha1c.c @@ -1,476 +1,244 @@ -/* crypto/sha/sha1dgst.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. +/*- + * Copyright (c) 2009 The Go Authors. All rights reserved. + * Copyright (c) 2024 Robert Clausecker + * + * Adapted from Go's crypto/sha1/sha1.go. * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the routines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include - -#include +#include +#include +#include #include +#include +#include -#if 0 -#include /* we use the __ variants of bit-sized types */ +#ifdef SHA1_ASM +extern void sha1_block(SHA1_CTX *, const void *, size_t); +#else +static void sha1_block(SHA1_CTX *, const void *, size_t); #endif -#include -#undef SHA_0 -#define SHA_1 -#include "sha.h" -#include "sha_locl.h" +#define INIT0 0x67452301 +#define INIT1 0xEFCDAB89 +#define INIT2 0x98BADCFE +#define INIT3 0x10325476 +#define INIT4 0xC3D2E1F0 -/* - * The assembly-language code is not position-independent, so don't - * try to use it in a shared library. 
- */ -#ifdef PIC -#undef SHA1_ASM -#endif +#define K0 0x5A827999 +#define K1 0x6ED9EBA1 +#define K2 0x8F1BBCDC +#define K3 0xCA62C1D6 -static char *SHA1_version="SHA1 part of SSLeay 0.9.0b 11-Oct-1998"; +void +SHA1_Init(SHA1_CTX *c) +{ + c->h0 = INIT0; + c->h1 = INIT1; + c->h2 = INIT2; + c->h3 = INIT3; + c->h4 = INIT4; + c->Nl = 0; + c->Nh = 0; + c->num = 0; +} -/* Implemented from SHA-1 document - The Secure Hash Algorithm - */ +void +SHA1_Update(SHA1_CTX *c, const void *data, size_t len) +{ + uint64_t nn; + const char *p = data; -#define INIT_DATA_h0 (unsigned long)0x67452301L -#define INIT_DATA_h1 (unsigned long)0xefcdab89L -#define INIT_DATA_h2 (unsigned long)0x98badcfeL -#define INIT_DATA_h3 (unsigned long)0x10325476L -#define INIT_DATA_h4 (unsigned long)0xc3d2e1f0L - -#define K_00_19 0x5a827999L -#define K_20_39 0x6ed9eba1L -#define K_40_59 0x8f1bbcdcL -#define K_60_79 0xca62c1d6L - -#ifndef NOPROTO -# ifdef SHA1_ASM - void sha1_block_x86(SHA_CTX *c, const u_int32_t *p, int num); -# define sha1_block sha1_block_x86 -# else - void sha1_block(SHA_CTX *c, const u_int32_t *p, int num); -# endif -#else -# ifdef SHA1_ASM - void sha1_block_x86(); -# define sha1_block sha1_block_x86 -# else - void sha1_block(); -# endif -#endif + nn = (uint64_t)c->Nl | (uint64_t)c->Nh << 32; + nn += len; + c->Nl = (uint32_t)nn; + c->Nh = (uint32_t)(nn >> 32); + if (c->num > 0) { + size_t n = SHA_CBLOCK - c->num; -#if BYTE_ORDER == LITTLE_ENDIAN && defined(SHA1_ASM) -# define M_c2nl c2l -# define M_p_c2nl p_c2l -# define M_c2nl_p c2l_p -# define M_p_c2nl_p p_c2l_p -# define M_nl2c l2c -#else -# define M_c2nl c2nl -# define M_p_c2nl p_c2nl -# define M_c2nl_p c2nl_p -# define M_p_c2nl_p p_c2nl_p -# define M_nl2c nl2c -#endif + if (n > len) + n = len; + + memcpy((char *)c->data + c->num, p, n); + c->num += n; + if (c->num == SHA_CBLOCK) { + sha1_block(c, (void *)c->data, SHA_CBLOCK); + c->num = 0; + } + + p += n; + len -= n; + } + + if (len >= SHA_CBLOCK) { + size_t n = len & ~(size_t)(SHA_CBLOCK - 1); + + sha1_block(c, p, n); + p += n; + len -= n; + } -void SHA1_Init(SHA_CTX *c) - { - c->h0=INIT_DATA_h0; - c->h1=INIT_DATA_h1; - c->h2=INIT_DATA_h2; - c->h3=INIT_DATA_h3; - c->h4=INIT_DATA_h4; - c->Nl=0; - c->Nh=0; - c->num=0; + if (len > 0) { + memcpy(c->data, p, len); + c->num = len; } +} void -SHA1_Update(SHA_CTX *c, const void *in, size_t len) +SHA1_Final(unsigned char *md, SHA1_CTX *c) { - u_int32_t *p; - int ew,ec,sw,sc; - u_int32_t l; - const unsigned char *data = in; - - if (len == 0) return; - - l=(c->Nl+(len<<3))&0xffffffffL; - if (l < c->Nl) /* overflow */ - c->Nh++; - c->Nh+=(len>>29); - c->Nl=l; - - if (c->num != 0) - { - p=c->data; - sw=c->num>>2; - sc=c->num&0x03; - - if ((c->num+len) >= SHA_CBLOCK) - { - l= p[sw]; - M_p_c2nl(data,l,sc); - p[sw++]=l; - for (; swnum); - - sha1_block(c,p,64); - c->num=0; - /* drop through and do the rest */ - } - else - { - c->num+=(int)len; - if ((sc+len) < 4) /* ugly, add char's to a word */ - { - l= p[sw]; - M_p_c2nl_p(data,l,sc,len); - p[sw]=l; - } - else - { - ew=(c->num>>2); - ec=(c->num&0x03); - l= p[sw]; - M_p_c2nl(data,l,sc); - p[sw++]=l; - for (; sw < ew; sw++) - { M_c2nl(data,l); p[sw]=l; } - if (ec) - { - M_c2nl_p(data,l,ec); - p[sw]=l; - } - } - return; - } + uint64_t len; + size_t t; + unsigned char tmp[SHA_CBLOCK + sizeof(uint64_t)] = {0x80, 0}; + + len = (uint64_t)c->Nl | (uint64_t)c->Nh << 32; + t = 64 + 56 - c->Nl % 64; + if (t > 64) + t -= 64; + + /* length in bits */ + len <<= 3; + be64enc(tmp + t, len); + SHA1_Update(c, tmp, t + 8); + 
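
The padding computation above picks t so that the total byte count becomes congruent to 56 modulo 64; appending the 8-byte big-endian bit length then fills the final block exactly, which is why the assertion that follows expects c->num to be zero afterwards. A minimal standalone check of that arithmetic (illustration only, not part of the patch):

#include <assert.h>
#include <stdint.h>

/*
 * Illustration only: verifies the SHA1_Final padding length for every
 * possible residue of the byte count modulo the 64-byte block size.
 */
int
main(void)
{
	for (uint32_t nl = 0; nl < 64; nl++) {
		uint32_t t = 64 + 56 - nl % 64;

		if (t > 64)
			t -= 64;

		assert(t >= 1 && t <= 64);      /* room for the 0x80 byte */
		assert((nl + t) % 64 == 56);    /* leaves 8 bytes for the length */
		assert((nl + t + 8) % 64 == 0); /* final block comes out full */
	}

	return (0);
}
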
assert(c->num == 0); + + be32enc(md + 0, c->h0); + be32enc(md + 4, c->h1); + be32enc(md + 8, c->h2); + be32enc(md + 12, c->h3); + be32enc(md + 16, c->h4); + + explicit_bzero(c, sizeof(*c)); +} + +#ifndef SHA1_ASM +static void +/* invariant: len is a multiple of SHA_CBLOCK */ +sha1_block(SHA1_CTX *c, const void *data, size_t len) +{ + uint32_t w[16]; + uint32_t h0 = c->h0, h1 = c->h1, h2 = c->h2, h3 = c->h3, h4 = c->h4; + const char *p = data; + + while (len >= SHA_CBLOCK) { + size_t i; + uint32_t a = h0, b = h1, c = h2, d = h3, e = h4; + uint32_t f, t, tmp; + +# pragma unroll + for (i = 0; i < 16; i++) + w[i] = be32dec(p + 4*i); + +# pragma unroll + for (i = 0; i < 16; i++) { + f = b & c | ~b & d; + t = (a << 5 | a >> 32 - 5) + f + e + w[i & 0xf] + K0; + e = d; + d = c; + c = b << 30 | b >> 32 - 30; + b = a; + a = t; } - /* We can only do the following code for assember, the reason - * being that the sha1_block 'C' version changes the values - * in the 'data' array. The assember code avoids this and - * copies it to a local array. I should be able to do this for - * the C version as well.... - */ -#if 1 -#if BYTE_ORDER == BIG_ENDIAN || defined(SHA1_ASM) - if ((((unsigned int)data)%sizeof(u_int32_t)) == 0) - { - sw=len/SHA_CBLOCK; - if (sw) - { - sw*=SHA_CBLOCK; - sha1_block(c,(u_int32_t *)data,sw); - data+=sw; - len-=sw; - } + +# pragma unroll + for (; i < 20; i++) { + tmp = w[i - 3 & 0xf] ^ w[i - 8 & 0xf] ^ w[i - 14 & 0xf] ^ w[i & 0xf]; + w[i & 0xf] = tmp << 1 | tmp >> 32 - 1; + + f = b & c | ~b & d; + t = (a << 5 | a >> 32 - 5) + f + e + w[i & 0xf] + K0; + e = d; + d = c; + c = b << 30 | b >> 32 - 30; + b = a; + a = t; } -#endif -#endif - /* we now can process the input data in blocks of SHA_CBLOCK - * chars and save the leftovers to c->data. */ - p=c->data; - while (len >= SHA_CBLOCK) - { -#if BYTE_ORDER == BIG_ENDIAN || BYTE_ORDER == LITTLE_ENDIAN - if (p != (u_int32_t *)data) - memcpy(p,data,SHA_CBLOCK); - data+=SHA_CBLOCK; -# if BYTE_ORDER == LITTLE_ENDIAN -# ifndef SHA1_ASM /* Will not happen */ - for (sw=(SHA_LBLOCK/4); sw; sw--) - { - Endian_Reverse32(p[0]); - Endian_Reverse32(p[1]); - Endian_Reverse32(p[2]); - Endian_Reverse32(p[3]); - p+=4; - } - p=c->data; -# endif -# endif -#else - for (sw=(SHA_BLOCK/4); sw; sw--) - { - M_c2nl(data,l); *(p++)=l; - M_c2nl(data,l); *(p++)=l; - M_c2nl(data,l); *(p++)=l; - M_c2nl(data,l); *(p++)=l; - } - p=c->data; -#endif - sha1_block(c,p,64); - len-=SHA_CBLOCK; + +# pragma unroll + for (; i < 40; i++) { + tmp = w[i - 3 & 0xf] ^ w[i - 8 & 0xf] ^ w[i - 14 & 0xf] ^ w[i & 0xf]; + w[i & 0xf] = tmp << 1 | tmp >> 32 - 1; + + f = b ^ c ^ d; + t = (a << 5 | a >> 32 - 5) + f + e + w[i & 0xf] + K1; + e = d; + d = c; + c = b << 30 | b >> 32 - 30; + b = a; + a = t; } - ec=(int)len; - c->num=ec; - ew=(ec>>2); - ec&=0x03; - - for (sw=0; sw < ew; sw++) - { M_c2nl(data,l); p[sw]=l; } - M_c2nl_p(data,l,ec); - p[sw]=l; - } -static void SHA1_Transform(SHA_CTX *c, unsigned char *b) - { - u_int32_t p[16]; -#if BYTE_ORDER != BIG_ENDIAN - u_int32_t *q; - int i; -#endif +# pragma unroll + for (; i < 60; i++) { + tmp = w[i - 3 & 0xf] ^ w[i - 8 & 0xf] ^ w[i - 14 & 0xf] ^ w[i & 0xf]; + w[i & 0xf] = tmp << 1 | tmp >> 32 - 1; + + f = (b | c) & d | b & c; + t = (a << 5 | a >> 32 - 5) + f + e + w[i & 0xf] + K2; + e = d; + d = c; + c = b << 30 | b >> 32 - 30; + b = a; + a = t; + } -#if BYTE_ORDER == BIG_ENDIAN || BYTE_ORDER == LITTLE_ENDIAN - memcpy(p,b,64); -#if BYTE_ORDER == LITTLE_ENDIAN - q=p; - for (i=(SHA_LBLOCK/4); i; i--) - { - Endian_Reverse32(q[0]); - 
Endian_Reverse32(q[1]); - Endian_Reverse32(q[2]); - Endian_Reverse32(q[3]); - q+=4; +# pragma unroll + for (; i < 80; i++) { + tmp = w[i - 3 & 0xf] ^ w[i - 8 & 0xf] ^ w[i - 14 & 0xf] ^ w[i & 0xf]; + w[i & 0xf] = tmp << 1 | tmp >> 32 - 1; + + f = b ^ c ^ d; + t = (a << 5 | a >> 32 - 5) + f + e + w[i & 0xf] + K3; + e = d; + d = c; + c = b << 30 | b >> 32 - 30; + b = a; + a = t; } -#endif -#else - q=p; - for (i=(SHA_LBLOCK/4); i; i--) - { - u_int32_t l; - c2nl(b,l); *(q++)=l; - c2nl(b,l); *(q++)=l; - c2nl(b,l); *(q++)=l; - c2nl(b,l); *(q++)=l; - } -#endif - sha1_block(c,p,64); - } -#ifndef SHA1_ASM + h0 += a; + h1 += b; + h2 += c; + h3 += d; + h4 += e; -void -sha1_block(SHA_CTX *c, const u_int32_t *W, int num) -{ - u_int32_t A,B,C,D,E,T; - u_int32_t X[16]; - - A=c->h0; - B=c->h1; - C=c->h2; - D=c->h3; - E=c->h4; - - for (;;) - { - BODY_00_15( 0,A,B,C,D,E,T,W); - BODY_00_15( 1,T,A,B,C,D,E,W); - BODY_00_15( 2,E,T,A,B,C,D,W); - BODY_00_15( 3,D,E,T,A,B,C,W); - BODY_00_15( 4,C,D,E,T,A,B,W); - BODY_00_15( 5,B,C,D,E,T,A,W); - BODY_00_15( 6,A,B,C,D,E,T,W); - BODY_00_15( 7,T,A,B,C,D,E,W); - BODY_00_15( 8,E,T,A,B,C,D,W); - BODY_00_15( 9,D,E,T,A,B,C,W); - BODY_00_15(10,C,D,E,T,A,B,W); - BODY_00_15(11,B,C,D,E,T,A,W); - BODY_00_15(12,A,B,C,D,E,T,W); - BODY_00_15(13,T,A,B,C,D,E,W); - BODY_00_15(14,E,T,A,B,C,D,W); - BODY_00_15(15,D,E,T,A,B,C,W); - BODY_16_19(16,C,D,E,T,A,B,W,W,W,W); - BODY_16_19(17,B,C,D,E,T,A,W,W,W,W); - BODY_16_19(18,A,B,C,D,E,T,W,W,W,W); - BODY_16_19(19,T,A,B,C,D,E,W,W,W,X); - - BODY_20_31(20,E,T,A,B,C,D,W,W,W,X); - BODY_20_31(21,D,E,T,A,B,C,W,W,W,X); - BODY_20_31(22,C,D,E,T,A,B,W,W,W,X); - BODY_20_31(23,B,C,D,E,T,A,W,W,W,X); - BODY_20_31(24,A,B,C,D,E,T,W,W,X,X); - BODY_20_31(25,T,A,B,C,D,E,W,W,X,X); - BODY_20_31(26,E,T,A,B,C,D,W,W,X,X); - BODY_20_31(27,D,E,T,A,B,C,W,W,X,X); - BODY_20_31(28,C,D,E,T,A,B,W,W,X,X); - BODY_20_31(29,B,C,D,E,T,A,W,W,X,X); - BODY_20_31(30,A,B,C,D,E,T,W,X,X,X); - BODY_20_31(31,T,A,B,C,D,E,W,X,X,X); - BODY_32_39(32,E,T,A,B,C,D,X); - BODY_32_39(33,D,E,T,A,B,C,X); - BODY_32_39(34,C,D,E,T,A,B,X); - BODY_32_39(35,B,C,D,E,T,A,X); - BODY_32_39(36,A,B,C,D,E,T,X); - BODY_32_39(37,T,A,B,C,D,E,X); - BODY_32_39(38,E,T,A,B,C,D,X); - BODY_32_39(39,D,E,T,A,B,C,X); - - BODY_40_59(40,C,D,E,T,A,B,X); - BODY_40_59(41,B,C,D,E,T,A,X); - BODY_40_59(42,A,B,C,D,E,T,X); - BODY_40_59(43,T,A,B,C,D,E,X); - BODY_40_59(44,E,T,A,B,C,D,X); - BODY_40_59(45,D,E,T,A,B,C,X); - BODY_40_59(46,C,D,E,T,A,B,X); - BODY_40_59(47,B,C,D,E,T,A,X); - BODY_40_59(48,A,B,C,D,E,T,X); - BODY_40_59(49,T,A,B,C,D,E,X); - BODY_40_59(50,E,T,A,B,C,D,X); - BODY_40_59(51,D,E,T,A,B,C,X); - BODY_40_59(52,C,D,E,T,A,B,X); - BODY_40_59(53,B,C,D,E,T,A,X); - BODY_40_59(54,A,B,C,D,E,T,X); - BODY_40_59(55,T,A,B,C,D,E,X); - BODY_40_59(56,E,T,A,B,C,D,X); - BODY_40_59(57,D,E,T,A,B,C,X); - BODY_40_59(58,C,D,E,T,A,B,X); - BODY_40_59(59,B,C,D,E,T,A,X); - - BODY_60_79(60,A,B,C,D,E,T,X); - BODY_60_79(61,T,A,B,C,D,E,X); - BODY_60_79(62,E,T,A,B,C,D,X); - BODY_60_79(63,D,E,T,A,B,C,X); - BODY_60_79(64,C,D,E,T,A,B,X); - BODY_60_79(65,B,C,D,E,T,A,X); - BODY_60_79(66,A,B,C,D,E,T,X); - BODY_60_79(67,T,A,B,C,D,E,X); - BODY_60_79(68,E,T,A,B,C,D,X); - BODY_60_79(69,D,E,T,A,B,C,X); - BODY_60_79(70,C,D,E,T,A,B,X); - BODY_60_79(71,B,C,D,E,T,A,X); - BODY_60_79(72,A,B,C,D,E,T,X); - BODY_60_79(73,T,A,B,C,D,E,X); - BODY_60_79(74,E,T,A,B,C,D,X); - BODY_60_79(75,D,E,T,A,B,C,X); - BODY_60_79(76,C,D,E,T,A,B,X); - BODY_60_79(77,B,C,D,E,T,A,X); - BODY_60_79(78,A,B,C,D,E,T,X); - BODY_60_79(79,T,A,B,C,D,E,X); - - c->h0=(c->h0+E)&0xffffffffL; - 
c->h1=(c->h1+T)&0xffffffffL; - c->h2=(c->h2+A)&0xffffffffL; - c->h3=(c->h3+B)&0xffffffffL; - c->h4=(c->h4+C)&0xffffffffL; - - num-=64; - if (num <= 0) break; - - A=c->h0; - B=c->h1; - C=c->h2; - D=c->h3; - E=c->h4; - - W+=16; - } + p += SHA_CBLOCK; + len -= SHA_CBLOCK; } -#endif -void SHA1_Final(unsigned char *md, SHA_CTX *c) - { - int i,j; - u_int32_t l; - u_int32_t *p; - static unsigned char end[4]={0x80,0x00,0x00,0x00}; - unsigned char *cp=end; - - /* c->num should definitly have room for at least one more byte. */ - p=c->data; - j=c->num; - i=j>>2; -#ifdef PURIFY - if ((j&0x03) == 0) p[i]=0; + c->h0 = h0; + c->h1 = h1; + c->h2 = h2; + c->h3 = h3; + c->h4 = h4; +} #endif - l=p[i]; - M_p_c2nl(cp,l,j&0x03); - p[i]=l; - i++; - /* i is the next 'undefined word' */ - if (c->num >= SHA_LAST_BLOCK) - { - for (; iNh; - p[SHA_LBLOCK-1]=c->Nl; -#if BYTE_ORDER == LITTLE_ENDIAN && defined(SHA1_ASM) - Endian_Reverse32(p[SHA_LBLOCK-2]); - Endian_Reverse32(p[SHA_LBLOCK-1]); -#endif - sha1_block(c,p,64); - cp=md; - l=c->h0; nl2c(l,cp); - l=c->h1; nl2c(l,cp); - l=c->h2; nl2c(l,cp); - l=c->h3; nl2c(l,cp); - l=c->h4; nl2c(l,cp); - - /* Clear the context state */ - explicit_bzero(&c, sizeof(c)); - } #ifdef WEAK_REFS /* When building libmd, provide weak references. Note: this is not