diff --git a/include/os/freebsd/spl/sys/ia32/asm_linkage.h b/include/os/freebsd/spl/sys/ia32/asm_linkage.h index bbbd22030213..058d600007af 100644 --- a/include/os/freebsd/spl/sys/ia32/asm_linkage.h +++ b/include/os/freebsd/spl/sys/ia32/asm_linkage.h @@ -1,178 +1,178 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H #define RET ret /* Tell compiler to call assembler like Unix */ #undef ASMABI #define ASMABI __attribute__((sysv_abi)) #define ENDBR #define SECTION_TEXT .text #define SECTION_STATIC .data #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. */ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. */ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. 
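/*
 * Illustrative sketch (not part of the patch): one way the CPTRSHIFT/
 * CPTRSIZE/CPTRMASK constants above can be used to index an array of C
 * pointers from a .S file (with _ASM defined).  The function name
 * example_ptr_load is hypothetical; it assumes the SysV AMD64 ABI with
 * %rdi = void **array and %rsi = index, returning array[index] in %rax.
 */
ENTRY_NP(example_ptr_load)
	mov	%rsi, %rax
	shl	$CPTRSHIFT, %rax	/* index -> byte offset (index * CPTRSIZE) */
	mov	(%rdi, %rax), %rax	/* load array[index] */
	RET
SET_SIZE(example_ptr_load)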
*/ #define ENTRY(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ x: #define ENTRY_ALIGN(x, a) \ .text; \ - .align a; \ + .balign a; \ .globl x; \ x: /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ x:; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ x:; \ y: /* * SET_SIZE trails a function and set the size for the ELF symbol table. */ #define SET_SIZE(x) #define SET_OBJ(x) #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/include/os/linux/spl/sys/ia32/asm_linkage.h b/include/os/linux/spl/sys/ia32/asm_linkage.h index 2864d9455129..3aaa4af5dab8 100644 --- a/include/os/linux/spl/sys/ia32/asm_linkage.h +++ b/include/os/linux/spl/sys/ia32/asm_linkage.h @@ -1,212 +1,212 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H #if defined(_KERNEL) && defined(__linux__) #include #endif #ifndef ENDBR #if defined(__ELF__) && defined(__CET__) && defined(__has_include) /* CSTYLED */ #if __has_include() #include #ifdef _CET_ENDBR #define ENDBR _CET_ENDBR #endif /* _CET_ENDBR */ #endif /* */ #endif /* __ELF__ && __CET__ && __has_include */ #endif /* !ENDBR */ #ifndef ENDBR #define ENDBR #endif #ifndef RET #define RET ret #endif /* You can set to nothing on Unix platforms */ #undef ASMABI #define ASMABI __attribute__((sysv_abi)) #define SECTION_TEXT .text #define SECTION_STATIC .section .rodata #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. 
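/*
 * Illustrative sketch (not part of the patch): the reason for switching
 * from .align to .balign.  In GNU as the operand of .align is a byte count
 * on some targets (for example x86 ELF) but a power-of-two exponent on
 * others (for example arm), while .balign always takes a byte count and
 * .p2align always takes an exponent.  The labels below are hypothetical.
 */
	.text
	.balign	16			/* unambiguous: 16-byte boundary */
aligned_to_16:
	nop
	.p2align	4		/* same effect, written as 2^4 bytes */
also_aligned_to_16:
	nop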
*/ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. */ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. */ #undef ENTRY #define ENTRY(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: #define ENTRY_ALIGN(x, a) \ .text; \ - .align a; \ + .balign a; \ .globl x; \ .type x, @function; \ x: #define FUNCTION(x) \ .type x, @function; \ x: /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ x:; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ x:; \ y: /* * SET_SIZE trails a function and set the size for the ELF symbol table. */ #define SET_SIZE(x) \ .size x, [.-x] #define SET_OBJ(x) .type x, @object #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h b/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h index 3b4beecc5d34..9964f183cc68 100644 --- a/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h +++ b/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h @@ -1,184 +1,184 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
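/*
 * Illustrative sketch (not part of the patch): typical use of the macros
 * above in a hand-written .S file.  ENTRY_NP aligns the entry point,
 * exports it and marks it @function; SET_SIZE records the function length
 * for the ELF symbol table.  The function name spl_example_zero and the
 * include path are assumptions based on how other files in the tree use
 * this header.
 */
#define	_ASM
#include <sys/asm_linkage.h>

	SECTION_TEXT
ENTRY_NP(spl_example_zero)
	ENDBR				/* IBT landing pad when CET is enabled */
	xor	%eax, %eax		/* return 0 */
	RET
SET_SIZE(spl_example_zero)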
*/ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H #if defined(__linux__) && defined(CONFIG_SLS) #define RET ret; int3 #else #define RET ret #endif /* Tell compiler to call assembler like Unix */ #undef ASMABI #define ASMABI __attribute__((sysv_abi)) #define ENDBR #define SECTION_TEXT .text #define SECTION_STATIC .data #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. */ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. */ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. */ #define ENTRY(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ x: #define ENTRY_ALIGN(x, a) \ .text; \ - .align a; \ + .balign a; \ .globl x; \ x: #define FUNCTION(x) \ .type x, @function; \ x: /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ x:; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ x:; \ y: /* * SET_SIZE trails a function and set the size for the ELF symbol table. */ #define SET_SIZE(x) #define SET_OBJ(x) #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/lib/libspl/include/os/linux/sys/ia32/asm_linkage.h b/lib/libspl/include/os/linux/sys/ia32/asm_linkage.h index 76765dd040cf..f07596123341 100644 --- a/lib/libspl/include/os/linux/sys/ia32/asm_linkage.h +++ b/lib/libspl/include/os/linux/sys/ia32/asm_linkage.h @@ -1,211 +1,211 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). 
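/*
 * Illustrative sketch (not part of the patch): what the RET macro above
 * expands to.  With CONFIG_SLS (the kernel's straight-line-speculation
 * mitigation) every return becomes "ret; int3" so the CPU cannot
 * speculatively execute past the ret; otherwise it is a plain "ret".
 * The label name is hypothetical.
 */
example_return_zero:
	xor	%eax, %eax
	RET				/* "ret" or, under CONFIG_SLS, "ret; int3" */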
* You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H #if defined(_KERNEL) && defined(__linux__) #include #endif #ifndef ENDBR #if defined(__ELF__) && defined(__CET__) && defined(__has_include) /* CSTYLED */ #if __has_include() #include #ifdef _CET_ENDBR #define ENDBR _CET_ENDBR #endif /* _CET_ENDBR */ #endif /* */ #endif /* __ELF__ && __CET__ && __has_include */ #endif /* !ENDBR */ #ifndef ENDBR #define ENDBR #endif #ifndef RET #define RET ret #endif /* You can set to nothing on Unix platforms */ #undef ASMABI #define ASMABI __attribute__((sysv_abi)) #define SECTION_TEXT .text #define SECTION_STATIC .section .rodata #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. */ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. */ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. 
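/*
 * Illustrative sketch (not part of the patch): how ENDBR is meant to be
 * used.  When the toolchain provides cet.h and CET/IBT is enabled,
 * _CET_ENDBR expands to an endbr64/endbr32 instruction, which must be the
 * first instruction at any indirect-branch target; otherwise ENDBR expands
 * to nothing.  The function name is hypothetical.
 */
ENTRY_NP(example_ibt_target)
	ENDBR				/* endbr64 under CET/IBT, empty otherwise */
	mov	$1, %eax
	RET
SET_SIZE(example_ibt_target)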
*/ #undef ENTRY #define ENTRY(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: #define ENTRY_ALIGN(x, a) \ .text; \ - .align a; \ + .balign a; \ .globl x; \ .type x, @function; \ x: #define FUNCTION(x) \ .type x, @function; \ x: /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ x:; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ - .align ASM_ENTRY_ALIGN; \ + .balign ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ x:; \ y: /* * SET_SIZE trails a function and set the size for the ELF symbol table. */ #define SET_SIZE(x) \ .size x, [.-x] #define SET_OBJ(x) .type x, @object #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/module/icp/asm-x86_64/aes/aes_aesni.S b/module/icp/asm-x86_64/aes/aes_aesni.S index f622235bd15b..4f3fe3ec65d6 100644 --- a/module/icp/asm-x86_64/aes/aes_aesni.S +++ b/module/icp/asm-x86_64/aes/aes_aesni.S @@ -1,748 +1,748 @@ /* * ==================================================================== * Written by Intel Corporation for the OpenSSL project to add support * for Intel AES-NI instructions. Rights for redistribution and usage * in source and binary forms are granted according to the OpenSSL * license. * * Author: Huang Ying * Vinodh Gopal * Kahraman Akdemir * * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD) * instructions that are going to be introduced in the next generation * of Intel processor, as of 2009. These instructions enable fast and * secure data encryption and decryption, using the Advanced Encryption * Standard (AES), defined by FIPS Publication number 197. The * architecture introduces six instructions that offer full hardware * support for AES. Four of them support high performance data * encryption and decryption, and the other two instructions support * the AES key expansion procedure. * ==================================================================== */ /* * ==================================================================== * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. All advertising materials mentioning features or use of this * software must display the following acknowledgment: * "This product includes software developed by the OpenSSL Project * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" * * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to * endorse or promote products derived from this software without * prior written permission. For written permission, please contact * openssl-core@openssl.org. * * 5. 
Products derived from this software may not be called "OpenSSL" * nor may "OpenSSL" appear in their names without prior written * permission of the OpenSSL Project. * * 6. Redistributions of any form whatsoever must retain the following * acknowledgment: * "This product includes software developed by the OpenSSL Project * for use in the OpenSSL Toolkit (http://www.openssl.org/)" * * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. * ==================================================================== */ /* * ==================================================================== * OpenSolaris OS modifications * * This source originates as files aes-intel.S and eng_aesni_asm.pl, in * patches sent sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by * Huang Ying of Intel to the openssl-dev mailing list under the subject * of "Add support to Intel AES-NI instruction set for x86_64 platform". * * This OpenSolaris version has these major changes from the original source: * * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function * definitions for lint. * * 2. Formatted code, added comments, and added #includes and #defines. * * 3. If bit CR0.TS is set, clear and set the TS bit, after and before * calling kpreempt_disable() and kpreempt_enable(). * If the TS bit is not set, Save and restore %xmm registers at the beginning * and end of function calls (%xmm* registers are not saved and restored by * during kernel thread preemption). * * 4. Renamed functions, reordered parameters, and changed return value * to match OpenSolaris: * * OpenSSL interface: * int intel_AES_set_encrypt_key(const unsigned char *userKey, * const int bits, AES_KEY *key); * int intel_AES_set_decrypt_key(const unsigned char *userKey, * const int bits, AES_KEY *key); * Return values for above are non-zero on error, 0 on success. * * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, * const AES_KEY *key); * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, * const AES_KEY *key); * typedef struct aes_key_st { * unsigned int rd_key[4 *(AES_MAXNR + 1)]; * int rounds; * unsigned int pad[3]; * } AES_KEY; * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules * (ks32) instead of 64-bit (ks64). * Number of rounds (aka round count) is at offset 240 of AES_KEY. * * OpenSolaris OS interface (#ifdefs removed for readability): * int rijndael_key_setup_dec_intel(uint32_t rk[], * const uint32_t cipherKey[], uint64_t keyBits); * int rijndael_key_setup_enc_intel(uint32_t rk[], * const uint32_t cipherKey[], uint64_t keyBits); * Return values for above are 0 on error, number of rounds on success. 
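/*
 * Illustrative sketch (not part of the patch): calling the OpenSolaris-style
 * key setup described above from assembly under the SysV AMD64 ABI
 * (P1 = %rdi = rk[], P2 = %rsi = cipherKey[], P3 = %rdx = keyBits).
 * The labels my_ks, my_key and key_setup_failed are hypothetical, and in
 * kernel context the caller must also have disabled preemption.
 */
	lea	my_ks(%rip), %rdi	/* uint32_t rk[]: key schedule output */
	lea	my_key(%rip), %rsi	/* const uint32_t cipherKey[] */
	mov	$256, %edx		/* uint64_t keyBits */
	call	rijndael_key_setup_enc_intel
	test	%rax, %rax		/* 0 = error, else number of rounds (14) */
	jz	key_setup_failed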
* * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4]); * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4]); * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]; * uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t; * * typedef union { * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)]; * } aes_ks_t; * typedef struct aes_key { * aes_ks_t encr_ks, decr_ks; * long double align128; * int flags, nr, type; * } aes_key_t; * * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, * ct is crypto text, and MAX_AES_NR is 14. * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. * * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary. * * ==================================================================== */ #if defined(lint) || defined(__lint) #include void aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]) { (void) rk, (void) Nr, (void) pt, (void) ct; } void aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]) { (void) rk, (void) Nr, (void) ct, (void) pt; } int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], uint64_t keyBits) { (void) rk, (void) cipherKey, (void) keyBits; return (0); } int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], uint64_t keyBits) { (void) rk, (void) cipherKey, (void) keyBits; return (0); } #elif defined(HAVE_AES) /* guard by instruction set */ #define _ASM #include /* * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(), * _key_expansion_256a(), _key_expansion_256b() * * Helper functions called by rijndael_key_setup_inc_intel(). * Also used indirectly by rijndael_key_setup_dec_intel(). * * Input: * %xmm0 User-provided cipher key * %xmm1 Round constant * Output: * (%rcx) AES key */ ENTRY_NP2(_key_expansion_128, _key_expansion_256a) _key_expansion_128_local: _key_expansion_256a_local: pshufd $0b11111111, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 movups %xmm0, (%rcx) add $0x10, %rcx RET nop SET_SIZE(_key_expansion_128) SET_SIZE(_key_expansion_256a) ENTRY_NP(_key_expansion_192a) _key_expansion_192a_local: pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 movups %xmm2, %xmm5 movups %xmm2, %xmm6 pslldq $4, %xmm5 pshufd $0b11111111, %xmm0, %xmm3 pxor %xmm3, %xmm2 pxor %xmm5, %xmm2 movups %xmm0, %xmm1 shufps $0b01000100, %xmm0, %xmm6 movups %xmm6, (%rcx) shufps $0b01001110, %xmm2, %xmm1 movups %xmm1, 0x10(%rcx) add $0x20, %rcx RET SET_SIZE(_key_expansion_192a) ENTRY_NP(_key_expansion_192b) _key_expansion_192b_local: pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 movups %xmm2, %xmm5 pslldq $4, %xmm5 pshufd $0b11111111, %xmm0, %xmm3 pxor %xmm3, %xmm2 pxor %xmm5, %xmm2 movups %xmm0, (%rcx) add $0x10, %rcx RET SET_SIZE(_key_expansion_192b) ENTRY_NP(_key_expansion_256b) _key_expansion_256b_local: pshufd $0b10101010, %xmm1, %xmm1 shufps $0b00010000, %xmm2, %xmm4 pxor %xmm4, %xmm2 shufps $0b10001100, %xmm2, %xmm4 pxor %xmm4, %xmm2 pxor %xmm1, %xmm2 movups %xmm2, (%rcx) add $0x10, %rcx RET SET_SIZE(_key_expansion_256b) /* * rijndael_key_setup_enc_intel() * Expand the cipher key into the encryption key schedule. 
* * For kernel code, caller is responsible for ensuring kpreempt_disable() * has been called. This is because %xmm registers are not saved/restored. * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set * on entry. Otherwise, if TS is not set, save and restore %xmm registers * on the stack. * * OpenSolaris interface: * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], * uint64_t keyBits); * Return value is 0 on error, number of rounds on success. * * Original Intel OpenSSL interface: * int intel_AES_set_encrypt_key(const unsigned char *userKey, * const int bits, AES_KEY *key); * Return value is non-zero on error, 0 on success. */ #ifdef OPENSSL_INTERFACE #define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key #define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key #define USERCIPHERKEY rdi /* P1, 64 bits */ #define KEYSIZE32 esi /* P2, 32 bits */ #define KEYSIZE64 rsi /* P2, 64 bits */ #define AESKEY rdx /* P3, 64 bits */ #else /* OpenSolaris Interface */ #define AESKEY rdi /* P1, 64 bits */ #define USERCIPHERKEY rsi /* P2, 64 bits */ #define KEYSIZE32 edx /* P3, 32 bits */ #define KEYSIZE64 rdx /* P3, 64 bits */ #endif /* OPENSSL_INTERFACE */ #define ROUNDS32 KEYSIZE32 /* temp */ #define ROUNDS64 KEYSIZE64 /* temp */ #define ENDAESKEY USERCIPHERKEY /* temp */ ENTRY_NP(rijndael_key_setup_enc_intel) rijndael_key_setup_enc_intel_local: FRAME_BEGIN // NULL pointer sanity check test %USERCIPHERKEY, %USERCIPHERKEY jz .Lenc_key_invalid_param test %AESKEY, %AESKEY jz .Lenc_key_invalid_param movups (%USERCIPHERKEY), %xmm0 // user key (first 16 bytes) movups %xmm0, (%AESKEY) lea 0x10(%AESKEY), %rcx // key addr pxor %xmm4, %xmm4 // xmm4 is assumed 0 in _key_expansion_x cmp $256, %KEYSIZE32 jnz .Lenc_key192 // AES 256: 14 rounds in encryption key schedule #ifdef OPENSSL_INTERFACE mov $14, %ROUNDS32 movl %ROUNDS32, 240(%AESKEY) // key.rounds = 14 #endif /* OPENSSL_INTERFACE */ movups 0x10(%USERCIPHERKEY), %xmm2 // other user key (2nd 16 bytes) movups %xmm2, (%rcx) add $0x10, %rcx aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x1, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x2, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x4, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x8, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x10, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x20, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local #ifdef OPENSSL_INTERFACE xor %rax, %rax // return 0 (OK) #else /* Open Solaris Interface */ mov $14, %rax // return # rounds = 14 #endif FRAME_END RET -.align 4 +.balign 4 .Lenc_key192: cmp $192, %KEYSIZE32 jnz .Lenc_key128 // AES 192: 12 rounds in encryption key schedule #ifdef OPENSSL_INTERFACE mov $12, %ROUNDS32 movl %ROUNDS32, 240(%AESKEY) // key.rounds = 12 #endif /* OPENSSL_INTERFACE */ movq 0x10(%USERCIPHERKEY), %xmm2 // other user key aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key 
call _key_expansion_192a_local aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key call _key_expansion_192b_local aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key call _key_expansion_192a_local aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key call _key_expansion_192b_local aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key call _key_expansion_192a_local aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key call _key_expansion_192b_local aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key call _key_expansion_192a_local aeskeygenassist $0x80, %xmm2, %xmm1 // expand the key call _key_expansion_192b_local #ifdef OPENSSL_INTERFACE xor %rax, %rax // return 0 (OK) #else /* OpenSolaris Interface */ mov $12, %rax // return # rounds = 12 #endif FRAME_END RET -.align 4 +.balign 4 .Lenc_key128: cmp $128, %KEYSIZE32 jnz .Lenc_key_invalid_key_bits // AES 128: 10 rounds in encryption key schedule #ifdef OPENSSL_INTERFACE mov $10, %ROUNDS32 movl %ROUNDS32, 240(%AESKEY) // key.rounds = 10 #endif /* OPENSSL_INTERFACE */ aeskeygenassist $0x1, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x2, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x4, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x8, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x10, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x20, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x40, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x80, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x1b, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x36, %xmm0, %xmm1 // expand the key call _key_expansion_128_local #ifdef OPENSSL_INTERFACE xor %rax, %rax // return 0 (OK) #else /* OpenSolaris Interface */ mov $10, %rax // return # rounds = 10 #endif FRAME_END RET .Lenc_key_invalid_param: #ifdef OPENSSL_INTERFACE mov $-1, %rax // user key or AES key pointer is NULL FRAME_END RET #else /* FALLTHROUGH */ #endif /* OPENSSL_INTERFACE */ .Lenc_key_invalid_key_bits: #ifdef OPENSSL_INTERFACE mov $-2, %rax // keysize is invalid #else /* Open Solaris Interface */ xor %rax, %rax // a key pointer is NULL or invalid keysize #endif /* OPENSSL_INTERFACE */ FRAME_END RET SET_SIZE(rijndael_key_setup_enc_intel) /* * rijndael_key_setup_dec_intel() * Expand the cipher key into the decryption key schedule. * * For kernel code, caller is responsible for ensuring kpreempt_disable() * has been called. This is because %xmm registers are not saved/restored. * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set * on entry. Otherwise, if TS is not set, save and restore %xmm registers * on the stack. * * OpenSolaris interface: * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], * uint64_t keyBits); * Return value is 0 on error, number of rounds on success. * P1->P2, P2->P3, P3->P1 * * Original Intel OpenSSL interface: * int intel_AES_set_decrypt_key(const unsigned char *userKey, * const int bits, AES_KEY *key); * Return value is non-zero on error, 0 on success. 
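/*
 * Illustrative sketch (not part of the patch): the core idea behind the
 * decryption key setup that follows.  Apart from reordering, every round
 * key except the first and last is run through aesimc (InvMixColumns) so
 * that aesdec can use the keys directly.  Register choices and the numeric
 * label are hypothetical: %rcx = first inner round key, %rdx = one past
 * the last inner round key.
 */
1:	movups	(%rcx), %xmm0
	aesimc	%xmm0, %xmm0		/* convert one encryption round key */
	movups	%xmm0, (%rcx)
	add	$0x10, %rcx
	cmp	%rdx, %rcx
	jne	1b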
*/ ENTRY_NP(rijndael_key_setup_dec_intel) FRAME_BEGIN // Generate round keys used for encryption call rijndael_key_setup_enc_intel_local test %rax, %rax #ifdef OPENSSL_INTERFACE jnz .Ldec_key_exit // Failed if returned non-0 #else /* OpenSolaris Interface */ jz .Ldec_key_exit // Failed if returned 0 #endif /* OPENSSL_INTERFACE */ /* * Convert round keys used for encryption * to a form usable for decryption */ #ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */ mov %rax, %ROUNDS64 // set # rounds (10, 12, or 14) // (already set for OpenSSL) #endif lea 0x10(%AESKEY), %rcx // key addr shl $4, %ROUNDS32 add %AESKEY, %ROUNDS64 mov %ROUNDS64, %ENDAESKEY -.align 4 +.balign 4 .Ldec_key_reorder_loop: movups (%AESKEY), %xmm0 movups (%ROUNDS64), %xmm1 movups %xmm0, (%ROUNDS64) movups %xmm1, (%AESKEY) lea 0x10(%AESKEY), %AESKEY lea -0x10(%ROUNDS64), %ROUNDS64 cmp %AESKEY, %ROUNDS64 ja .Ldec_key_reorder_loop -.align 4 +.balign 4 .Ldec_key_inv_loop: movups (%rcx), %xmm0 // Convert an encryption round key to a form usable for decryption // with the "AES Inverse Mix Columns" instruction aesimc %xmm0, %xmm1 movups %xmm1, (%rcx) lea 0x10(%rcx), %rcx cmp %ENDAESKEY, %rcx jnz .Ldec_key_inv_loop .Ldec_key_exit: // OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error // OpenSSL: rax = 0 for OK, or non-zero for error FRAME_END RET SET_SIZE(rijndael_key_setup_dec_intel) /* * aes_encrypt_intel() * Encrypt a single block (in and out can overlap). * * For kernel code, caller is responsible for ensuring kpreempt_disable() * has been called. This is because %xmm registers are not saved/restored. * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set * on entry. Otherwise, if TS is not set, save and restore %xmm registers * on the stack. * * Temporary register usage: * %xmm0 State * %xmm1 Key * * Original OpenSolaris Interface: * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4]) * * Original Intel OpenSSL Interface: * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, * const AES_KEY *key) */ #ifdef OPENSSL_INTERFACE #define aes_encrypt_intel intel_AES_encrypt #define aes_decrypt_intel intel_AES_decrypt #define INP rdi /* P1, 64 bits */ #define OUTP rsi /* P2, 64 bits */ #define KEYP rdx /* P3, 64 bits */ /* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */ #define NROUNDS32 ecx /* temporary, 32 bits */ #define NROUNDS cl /* temporary, 8 bits */ #else /* OpenSolaris Interface */ #define KEYP rdi /* P1, 64 bits */ #define NROUNDS esi /* P2, 32 bits */ #define INP rdx /* P3, 64 bits */ #define OUTP rcx /* P4, 64 bits */ #endif /* OPENSSL_INTERFACE */ #define STATE xmm0 /* temporary, 128 bits */ #define KEY xmm1 /* temporary, 128 bits */ ENTRY_NP(aes_encrypt_intel) movups (%INP), %STATE // input movups (%KEYP), %KEY // key #ifdef OPENSSL_INTERFACE mov 240(%KEYP), %NROUNDS32 // round count #else /* OpenSolaris Interface */ /* Round count is already present as P2 in %rsi/%esi */ #endif /* OPENSSL_INTERFACE */ pxor %KEY, %STATE // round 0 lea 0x30(%KEYP), %KEYP cmp $12, %NROUNDS jb .Lenc128 lea 0x20(%KEYP), %KEYP je .Lenc192 // AES 256 lea 0x20(%KEYP), %KEYP movups -0x60(%KEYP), %KEY aesenc %KEY, %STATE movups -0x50(%KEYP), %KEY aesenc %KEY, %STATE -.align 4 +.balign 4 .Lenc192: // AES 192 and 256 movups -0x40(%KEYP), %KEY aesenc %KEY, %STATE movups -0x30(%KEYP), %KEY aesenc %KEY, %STATE -.align 4 +.balign 4 .Lenc128: // AES 128, 192, and 256 movups -0x20(%KEYP), %KEY aesenc %KEY, %STATE movups -0x10(%KEYP), %KEY aesenc %KEY, 
%STATE movups (%KEYP), %KEY aesenc %KEY, %STATE movups 0x10(%KEYP), %KEY aesenc %KEY, %STATE movups 0x20(%KEYP), %KEY aesenc %KEY, %STATE movups 0x30(%KEYP), %KEY aesenc %KEY, %STATE movups 0x40(%KEYP), %KEY aesenc %KEY, %STATE movups 0x50(%KEYP), %KEY aesenc %KEY, %STATE movups 0x60(%KEYP), %KEY aesenc %KEY, %STATE movups 0x70(%KEYP), %KEY aesenclast %KEY, %STATE // last round movups %STATE, (%OUTP) // output RET SET_SIZE(aes_encrypt_intel) /* * aes_decrypt_intel() * Decrypt a single block (in and out can overlap). * * For kernel code, caller is responsible for ensuring kpreempt_disable() * has been called. This is because %xmm registers are not saved/restored. * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set * on entry. Otherwise, if TS is not set, save and restore %xmm registers * on the stack. * * Temporary register usage: * %xmm0 State * %xmm1 Key * * Original OpenSolaris Interface: * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * * Original Intel OpenSSL Interface: * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, * const AES_KEY *key); */ ENTRY_NP(aes_decrypt_intel) movups (%INP), %STATE // input movups (%KEYP), %KEY // key #ifdef OPENSSL_INTERFACE mov 240(%KEYP), %NROUNDS32 // round count #else /* OpenSolaris Interface */ /* Round count is already present as P2 in %rsi/%esi */ #endif /* OPENSSL_INTERFACE */ pxor %KEY, %STATE // round 0 lea 0x30(%KEYP), %KEYP cmp $12, %NROUNDS jb .Ldec128 lea 0x20(%KEYP), %KEYP je .Ldec192 // AES 256 lea 0x20(%KEYP), %KEYP movups -0x60(%KEYP), %KEY aesdec %KEY, %STATE movups -0x50(%KEYP), %KEY aesdec %KEY, %STATE -.align 4 +.balign 4 .Ldec192: // AES 192 and 256 movups -0x40(%KEYP), %KEY aesdec %KEY, %STATE movups -0x30(%KEYP), %KEY aesdec %KEY, %STATE -.align 4 +.balign 4 .Ldec128: // AES 128, 192, and 256 movups -0x20(%KEYP), %KEY aesdec %KEY, %STATE movups -0x10(%KEYP), %KEY aesdec %KEY, %STATE movups (%KEYP), %KEY aesdec %KEY, %STATE movups 0x10(%KEYP), %KEY aesdec %KEY, %STATE movups 0x20(%KEYP), %KEY aesdec %KEY, %STATE movups 0x30(%KEYP), %KEY aesdec %KEY, %STATE movups 0x40(%KEYP), %KEY aesdec %KEY, %STATE movups 0x50(%KEYP), %KEY aesdec %KEY, %STATE movups 0x60(%KEYP), %KEY aesdec %KEY, %STATE movups 0x70(%KEYP), %KEY aesdeclast %KEY, %STATE // last round movups %STATE, (%OUTP) // output RET SET_SIZE(aes_decrypt_intel) #endif /* lint || __lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/aes/aes_amd64.S b/module/icp/asm-x86_64/aes/aes_amd64.S index d5cf4040fb93..c4870a28ead6 100644 --- a/module/icp/asm-x86_64/aes/aes_amd64.S +++ b/module/icp/asm-x86_64/aes/aes_amd64.S @@ -1,908 +1,908 @@ /* * --------------------------------------------------------------------------- * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. * * LICENSE TERMS * * The free distribution and use of this software is allowed (with or without * changes) provided that: * * 1. source code distributions include the above copyright notice, this * list of conditions and the following disclaimer; * * 2. binary distributions include the above copyright notice, this list * of conditions and the following disclaimer in their documentation; * * 3. the name of the copyright holder is not used to endorse products * built using this software without specific written permission. 
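/*
 * Illustrative sketch (not part of the patch): the round-count dispatch
 * used by aes_encrypt_intel/aes_decrypt_intel above.  The key pointer is
 * biased so that all key sizes share one tail of rounds: AES-256 runs four
 * extra leading rounds, AES-192 two, and AES-128 jumps straight to the
 * shared tail.  Register and label names here are hypothetical, and the
 * round keys are shown in registers rather than loaded with movups.
 */
	cmp	$12, %esi		/* %esi = round count (10, 12 or 14) */
	jb	.Lshared_tail		/* AES-128 */
	je	.Ltwo_extra		/* AES-192 */
	aesenc	%xmm1, %xmm0		/* AES-256 only */
	aesenc	%xmm2, %xmm0
.Ltwo_extra:
	aesenc	%xmm3, %xmm0		/* AES-192 and AES-256 */
	aesenc	%xmm4, %xmm0
.Lshared_tail:
	/* ...nine more aesenc rounds shared by all key sizes... */
	aesenclast %xmm5, %xmm0		/* final round uses aesenclast */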
* * DISCLAIMER * * This software is provided 'as is' with no explicit or implied warranties * in respect of its properties, including, but not limited to, correctness * and/or fitness for purpose. * --------------------------------------------------------------------------- * Issue 20/12/2007 * * I am grateful to Dag Arne Osvik for many discussions of the techniques that * can be used to optimise AES assembler code on AMD64/EM64T architectures. * Some of the techniques used in this implementation are the result of * suggestions made by him for which I am most grateful. * * An AES implementation for AMD64 processors using the YASM assembler. This * implementation provides only encryption, decryption and hence requires key * scheduling support in C. It uses 8k bytes of tables but its encryption and * decryption performance is very close to that obtained using large tables. * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions, * which are as follows: * ms windows gnu/linux/opensolaris os * * in_blk rcx rdi * out_blk rdx rsi * context (cx) r8 rdx * * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15 * registers rdi - on both * * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11 * registers - rdi on both * * The convention used here is that for gnu/linux/opensolaris os. * * This code provides the standard AES block size (128 bits, 16 bytes) and the * three standard AES key sizes (128, 192 and 256 bits). It has the same call * interface as my C implementation. It uses the Microsoft C AMD64 calling * conventions in which the three parameters are placed in rcx, rdx and r8 * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved. * * OpenSolaris Note: * Modified to use GNU/Linux/Solaris calling conventions. * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively. * * AES_RETURN aes_encrypt(const unsigned char in_blk[], * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/ * * AES_RETURN aes_decrypt(const unsigned char in_blk[], * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/ * * AES_RETURN aes_encrypt_key(const unsigned char key[], * const aes_encrypt_ctx cx[1])/ * * AES_RETURN aes_decrypt_key(const unsigned char key[], * const aes_decrypt_ctx cx[1])/ * * AES_RETURN aes_encrypt_key(const unsigned char key[], * unsigned int len, const aes_decrypt_ctx cx[1])/ * * AES_RETURN aes_decrypt_key(const unsigned char key[], * unsigned int len, const aes_decrypt_ctx cx[1])/ * * where is 128, 102 or 256. In the last two calls the length can be in * either bits or bytes. * * Comment in/out the following lines to obtain the desired subroutines. These * selections MUST match those in the C header file aesopt.h */ #define AES_REV_DKS /* define if key decryption schedule is reversed */ #define LAST_ROUND_TABLES /* define for the faster version using extra tables */ /* * The encryption key schedule has the following in memory layout where N is the * number of rounds (10, 12 or 14): * * lo: | input key (round 0) | / each round is four 32-bit words * | encryption round 1 | * | encryption round 2 | * .... 
* | encryption round N-1 | * hi: | encryption round N | * * The decryption key schedule is normally set up so that it has the same * layout as above by actually reversing the order of the encryption key * schedule in memory (this happens when AES_REV_DKS is set): * * lo: | decryption round 0 | = | encryption round N | * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] * .... .... * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] * hi: | decryption round N | = | input key (round 0) | * * with rounds except the first and last modified using inv_mix_column() * But if AES_REV_DKS is NOT set the order of keys is left as it is for * encryption so that it has to be accessed in reverse when used for * decryption (although the inverse mix column modifications are done) * * lo: | decryption round 0 | = | input key (round 0) | * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] * .... .... * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] * hi: | decryption round N | = | encryption round N | * * This layout is faster when the assembler key scheduling provided here * is used. * * End of user defines */ /* * --------------------------------------------------------------------------- * OpenSolaris OS modifications * * This source originates from Brian Gladman file aes_amd64.asm * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip * with these changes: * * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION, * AES_128, AES_192, AES_256, AES_VAR ifdefs. * * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define * * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef * * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax * (operands reversed, literals prefixed with "$", registers prefixed with "%", * and "[register+offset]", addressing changed to "offset(register)", * parenthesis in constant expressions "()" changed to square brackets "[]", * "." removed from local (numeric) labels, and other changes. * Examples: * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax * mov rax,(4*20h) mov $[4*0x20],%rax * mov rax,[ebx+20h] mov 0x20(%ebx),%rax * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax * * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function * definitions for lint. * * 6. Renamed functions and reordered parameters to match OpenSolaris: * Original Gladman interface: * int aes_encrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ * int aes_decrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t, * and a union type, inf., containing inf.l, a uint32_t and * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is * used and contains the key schedule length * 16 where key schedule length is * 10, 12, or 14 bytes. 
* * OpenSolaris OS interface: * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/ * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, * ct is crypto text, and MAX_AES_NR is 14. * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. */ #if defined(lint) || defined(__lint) #include void aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]) { (void) rk, (void) Nr, (void) pt, (void) ct; } void aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]) { (void) rk, (void) Nr, (void) pt, (void) ct; } #else #define _ASM #include #define KS_LENGTH 60 #define raxd eax #define rdxd edx #define rcxd ecx #define rbxd ebx #define rsid esi #define rdid edi #define raxb al #define rdxb dl #define rcxb cl #define rbxb bl #define rsib sil #define rdib dil // finite field multiplies by {02}, {04} and {08} #define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) #define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) #define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) // finite field multiplies required in table generation #define f3(x) ((f2(x)) ^ (x)) #define f9(x) ((f8(x)) ^ (x)) #define fb(x) ((f8(x)) ^ (f2(x)) ^ (x)) #define fd(x) ((f8(x)) ^ (f4(x)) ^ (x)) #define fe(x) ((f8(x)) ^ (f4(x)) ^ (f2(x))) // macros for expanding S-box data #define u8(x) (f2(x)), (x), (x), (f3(x)), (f2(x)), (x), (x), (f3(x)) #define v8(x) (fe(x)), (f9(x)), (fd(x)), (fb(x)), (fe(x)), (f9(x)), (fd(x)), (x) #define w8(x) (x), 0, 0, 0, (x), 0, 0, 0 #define enc_vals(x) \ .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \ .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \ .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \ .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \ .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \ .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \ .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \ .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \ .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \ .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \ .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \ .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \ .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \ .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \ .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \ .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \ .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \ .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \ .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \ .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \ .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \ .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \ .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \ .byte 
x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \ .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \ .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \ .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \ .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \ .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \ .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \ .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \ .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16) #define dec_vals(x) \ .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \ .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \ .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \ .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \ .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \ .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \ .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \ .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \ .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \ .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \ .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \ .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \ .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \ .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \ .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \ .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \ .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \ .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \ .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \ .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \ .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \ .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \ .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \ .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \ .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \ .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \ .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \ .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \ .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \ .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \ .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \ .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d) #define tptr %rbp /* table pointer */ #define kptr %r8 /* key schedule pointer */ #define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */ #define fk_ref(x, y) -16*x+fofs+4*y(kptr) #ifdef AES_REV_DKS #define rofs 128 #define ik_ref(x, y) -16*x+rofs+4*y(kptr) #else #define rofs -128 #define ik_ref(x, y) 16*x+rofs+4*y(kptr) #endif /* AES_REV_DKS */ #define tab_0(x) (tptr,x,8) #define tab_1(x) 3(tptr,x,8) #define tab_2(x) 2(tptr,x,8) #define tab_3(x) 1(tptr,x,8) #define tab_f(x) 1(tptr,x,8) #define tab_i(x) 7(tptr,x,8) #define 
ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \ mov fk_ref(round,0), p1; \ mov fk_ref(round,1), p2; \ mov fk_ref(round,2), p3; \ mov fk_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p4; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p2; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p1; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p3; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p2; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p4; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p3; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p1; \ \ mov p1, %eax; \ mov p2, %ebx; \ mov p3, %ecx; \ mov p4, %edx #ifdef LAST_ROUND_TABLES #define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ add $2048, tptr; \ mov fk_ref(round,0), p1; \ mov fk_ref(round,1), p2; \ mov fk_ref(round,2), p3; \ mov fk_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p4; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p2; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p1; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p3; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p2; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p4; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p3; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p1 #else #define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ mov fk_ref(round,0), p1; \ mov fk_ref(round,1), p2; \ mov fk_ref(round,2), p3; \ mov fk_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ xor %esi, p1; \ rol $8, %edi; \ xor %edi, p4; \ movzx %al, %esi; \ movzx %ah, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p3; \ xor %edi, p2; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ xor %esi, p2; \ rol $8, %edi; \ xor %edi, p1; \ movzx %bl, %esi; \ movzx %bh, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p4; \ xor %edi, p3; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ shr $16, %ecx; \ xor %esi, p3; \ rol $8, %edi; \ xor %edi, p2; \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p1; \ xor %edi, p4; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ shr $16, %edx; \ xor %esi, p4; \ rol $8, %edi; \ xor %edi, p3; \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p2; \ xor %edi, p1 #endif /* LAST_ROUND_TABLES */ #define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \ mov ik_ref(round,0), p1; \ mov ik_ref(round,1), p2; \ mov 
ik_ref(round,2), p3; \ mov ik_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p2; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p4; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p3; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p1; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p4; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p2; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p1; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p3; \ \ mov p1, %eax; \ mov p2, %ebx; \ mov p3, %ecx; \ mov p4, %edx #ifdef LAST_ROUND_TABLES #define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ add $2048, tptr; \ mov ik_ref(round,0), p1; \ mov ik_ref(round,1), p2; \ mov ik_ref(round,2), p3; \ mov ik_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p2; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p4; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p3; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p1; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p4; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p2; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p1; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p3 #else #define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ mov ik_ref(round,0), p1; \ mov ik_ref(round,1), p2; \ mov ik_ref(round,2), p3; \ mov ik_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %eax; \ xor %esi, p1; \ rol $8, %edi; \ xor %edi, p2; \ movzx %al, %esi; \ movzx %ah, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p3; \ xor %edi, p4; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %ebx; \ xor %esi, p2; \ rol $8, %edi; \ xor %edi, p3; \ movzx %bl, %esi; \ movzx %bh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p4; \ xor %edi, p1; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %ecx; \ xor %esi, p3; \ rol $8, %edi; \ xor %edi, p4; \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p1; \ xor %edi, p2; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %edx; \ xor %esi, p4; \ rol $8, %edi; \ xor %edi, p1; \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p2; \ xor %edi, p3 #endif /* LAST_ROUND_TABLES */ /* * OpenSolaris OS: * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * * Original interface: * int aes_encrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ */ SECTION_STATIC 
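A minimal C-side sketch of how this entry point (and aes_decrypt_amd64 further below) is called under the OpenSolaris interface documented above; it assumes the key schedule has already been expanded by the ICP key-setup code, and the wrapper name encrypt_block plus the use of a plain void * in place of aes_ks_t are illustrative only, not taken from this diff:

#include <stdint.h>
#include <string.h>

/* Assembly entry points from this file (SysV ABI via ASMABI). */
extern void aes_encrypt_amd64(const void *ks, int Nr,
    const uint32_t pt[4], uint32_t ct[4]);
extern void aes_decrypt_amd64(const void *ks, int Nr,
    const uint32_t pt[4], uint32_t ct[4]);

/* Hypothetical caller: ks is the expanded key schedule, Nr is 10, 12 or 14. */
static void encrypt_block(const void *ks, int Nr,
    const uint8_t in[16], uint8_t out[16])
{
	uint32_t pt[4], ct[4];

	memcpy(pt, in, sizeof (pt));
	aes_encrypt_amd64(ks, Nr, pt, ct);	/* P1=ks, P2=Nr, P3=pt, P4=ct */
	memcpy(out, ct, sizeof (ct));
}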
-.align 64 +.balign 64 enc_tab: enc_vals(u8) #ifdef LAST_ROUND_TABLES // Last Round Tables: enc_vals(w8) #endif ENTRY_NP(aes_encrypt_amd64) ENDBR #ifdef GLADMAN_INTERFACE // Original interface sub $[4*8], %rsp // gnu/linux/opensolaris binary interface mov %rsi, (%rsp) // output pointer (P2) mov %rdx, %r8 // context (P3) mov %rbx, 1*8(%rsp) // P1: input pointer in rdi mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) mov %r12, 3*8(%rsp) // P3: context in r8 movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 #else // OpenSolaris OS interface sub $(4*8), %rsp // Make room on stack to save registers mov %rcx, (%rsp) // Save output pointer (P4) on stack mov %rdi, %r8 // context (P1) mov %rdx, %rdi // P3: save input pointer shl $4, %esi // P2: esi byte key length * 16 mov %rbx, 1*8(%rsp) // Save registers mov %rbp, 2*8(%rsp) mov %r12, 3*8(%rsp) // P1: context in r8 // P2: byte key length * 16 in esi // P3: input pointer in rdi // P4: output pointer in (rsp) #endif /* GLADMAN_INTERFACE */ lea enc_tab(%rip), tptr sub $fofs, kptr // Load input block into registers mov (%rdi), %eax mov 1*4(%rdi), %ebx mov 2*4(%rdi), %ecx mov 3*4(%rdi), %edx xor fofs(kptr), %eax xor fofs+4(kptr), %ebx xor fofs+8(kptr), %ecx xor fofs+12(kptr), %edx lea (kptr,%rsi), kptr // Jump based on byte key length * 16: cmp $(10*16), %esi je 3f cmp $(12*16), %esi je 2f cmp $(14*16), %esi je 1f mov $-1, %rax // error jmp 4f // Perform normal forward rounds 1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13) ff_rnd(%r9d, %r10d, %r11d, %r12d, 12) 2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11) ff_rnd(%r9d, %r10d, %r11d, %r12d, 10) 3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9) ff_rnd(%r9d, %r10d, %r11d, %r12d, 8) ff_rnd(%r9d, %r10d, %r11d, %r12d, 7) ff_rnd(%r9d, %r10d, %r11d, %r12d, 6) ff_rnd(%r9d, %r10d, %r11d, %r12d, 5) ff_rnd(%r9d, %r10d, %r11d, %r12d, 4) ff_rnd(%r9d, %r10d, %r11d, %r12d, 3) ff_rnd(%r9d, %r10d, %r11d, %r12d, 2) ff_rnd(%r9d, %r10d, %r11d, %r12d, 1) fl_rnd(%r9d, %r10d, %r11d, %r12d, 0) // Copy results mov (%rsp), %rbx mov %r9d, (%rbx) mov %r10d, 4(%rbx) mov %r11d, 8(%rbx) mov %r12d, 12(%rbx) xor %rax, %rax 4: // Restore registers mov 1*8(%rsp), %rbx mov 2*8(%rsp), %rbp mov 3*8(%rsp), %r12 add $(4*8), %rsp RET SET_SIZE(aes_encrypt_amd64) /* * OpenSolaris OS: * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * * Original interface: * int aes_decrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ */ SECTION_STATIC -.align 64 +.balign 64 dec_tab: dec_vals(v8) #ifdef LAST_ROUND_TABLES // Last Round Tables: dec_vals(w8) #endif ENTRY_NP(aes_decrypt_amd64) ENDBR #ifdef GLADMAN_INTERFACE // Original interface sub $[4*8], %rsp // gnu/linux/opensolaris binary interface mov %rsi, (%rsp) // output pointer (P2) mov %rdx, %r8 // context (P3) mov %rbx, 1*8(%rsp) // P1: input pointer in rdi mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) mov %r12, 3*8(%rsp) // P3: context in r8 movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 #else // OpenSolaris OS interface sub $(4*8), %rsp // Make room on stack to save registers mov %rcx, (%rsp) // Save output pointer (P4) on stack mov %rdi, %r8 // context (P1) mov %rdx, %rdi // P3: save input pointer shl $4, %esi // P2: esi byte key length * 16 mov %rbx, 1*8(%rsp) // Save registers mov %rbp, 2*8(%rsp) mov %r12, 3*8(%rsp) // P1: context in r8 // P2: byte key length * 16 in esi // P3: input pointer in rdi // P4: output pointer in (rsp) #endif /* GLADMAN_INTERFACE */ lea dec_tab(%rip), tptr sub $rofs, kptr // Load input block 
into registers mov (%rdi), %eax mov 1*4(%rdi), %ebx mov 2*4(%rdi), %ecx mov 3*4(%rdi), %edx #ifdef AES_REV_DKS mov kptr, %rdi lea (kptr,%rsi), kptr #else lea (kptr,%rsi), %rdi #endif xor rofs(%rdi), %eax xor rofs+4(%rdi), %ebx xor rofs+8(%rdi), %ecx xor rofs+12(%rdi), %edx // Jump based on byte key length * 16: cmp $(10*16), %esi je 3f cmp $(12*16), %esi je 2f cmp $(14*16), %esi je 1f mov $-1, %rax // error jmp 4f // Perform normal inverse rounds 1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13) ii_rnd(%r9d, %r10d, %r11d, %r12d, 12) 2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11) ii_rnd(%r9d, %r10d, %r11d, %r12d, 10) 3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9) ii_rnd(%r9d, %r10d, %r11d, %r12d, 8) ii_rnd(%r9d, %r10d, %r11d, %r12d, 7) ii_rnd(%r9d, %r10d, %r11d, %r12d, 6) ii_rnd(%r9d, %r10d, %r11d, %r12d, 5) ii_rnd(%r9d, %r10d, %r11d, %r12d, 4) ii_rnd(%r9d, %r10d, %r11d, %r12d, 3) ii_rnd(%r9d, %r10d, %r11d, %r12d, 2) ii_rnd(%r9d, %r10d, %r11d, %r12d, 1) il_rnd(%r9d, %r10d, %r11d, %r12d, 0) // Copy results mov (%rsp), %rbx mov %r9d, (%rbx) mov %r10d, 4(%rbx) mov %r11d, 8(%rbx) mov %r12d, 12(%rbx) xor %rax, %rax 4: // Restore registers mov 1*8(%rsp), %rbx mov 2*8(%rsp), %rbp mov 3*8(%rsp), %r12 add $(4*8), %rsp RET SET_SIZE(aes_decrypt_amd64) #endif /* lint || __lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S index 75dd2c721f56..165492a0ed76 100644 --- a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -1,1259 +1,1259 @@ # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # # AES-NI-CTR+GHASH stitch. # # February 2013 # # OpenSSL GCM implementation is organized in such way that its # performance is rather close to the sum of its streamed components, # in the context parallelized AES-NI CTR and modulo-scheduled # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation # was observed to perform significantly better than the sum of the # components on contemporary CPUs, the effort was deemed impossible to # justify. This module is based on combination of Intel submissions, # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max # Locktyukhin of Intel Corp. who verified that it reduces shuffles # pressure with notable relative improvement, achieving 1.0 cycle per # byte processed with 128-bit key on Haswell processor, 0.74 - on # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled # measurements for favourable packet size, one divisible by 96. # Applications using the EVP interface will observe a few percent # worse performance.] # # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). 
# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf # Generated once from # https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl # and modified for ICP. Modification are kept at a bare minimum to ease later # upstream merges. #if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) #define _ASM #include /* Windows userland links with OpenSSL */ #if !defined (_WIN32) || defined (_KERNEL) .extern gcm_avx_can_use_movbe .text #ifdef HAVE_MOVBE -.align 32 +.balign 32 FUNCTION(_aesni_ctr32_ghash_6x) .cfi_startproc ENDBR vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 vmovdqu 0-128(%rcx),%xmm15 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpaddb %xmm2,%xmm11,%xmm12 vpaddb %xmm2,%xmm12,%xmm13 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm15,%xmm1,%xmm9 vmovdqu %xmm4,16+8(%rsp) jmp .Loop6x -.align 32 +.balign 32 .Loop6x: addl $100663296,%ebx jc .Lhandle_ctr32 vmovdqu 0-32(%r9),%xmm3 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm15,%xmm10,%xmm10 vpxor %xmm15,%xmm11,%xmm11 .Lresume_ctr32: vmovdqu %xmm1,(%r8) vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 xorq %r12,%r12 cmpq %r14,%r15 vaesenc %xmm2,%xmm9,%xmm9 vmovdqu 48+8(%rsp),%xmm0 vpxor %xmm15,%xmm13,%xmm13 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 vaesenc %xmm2,%xmm10,%xmm10 vpxor %xmm15,%xmm14,%xmm14 setnc %r12b vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vaesenc %xmm2,%xmm11,%xmm11 vmovdqu 16-32(%r9),%xmm3 negq %r12 vaesenc %xmm2,%xmm12,%xmm12 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 vpxor %xmm4,%xmm8,%xmm8 vaesenc %xmm2,%xmm13,%xmm13 vpxor %xmm5,%xmm1,%xmm4 andq $0x60,%r12 vmovups 32-128(%rcx),%xmm15 vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 vaesenc %xmm2,%xmm14,%xmm14 vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 leaq (%r14,%r12,1),%r14 vaesenc %xmm15,%xmm9,%xmm9 vpxor 16+8(%rsp),%xmm8,%xmm8 vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 vmovdqu 64+8(%rsp),%xmm0 vaesenc %xmm15,%xmm10,%xmm10 movbeq 88(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 80(%r14),%r12 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,32+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,40+8(%rsp) vmovdqu 48-32(%r9),%xmm5 vaesenc %xmm15,%xmm14,%xmm14 vmovups 48-128(%rcx),%xmm15 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm3,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 vaesenc %xmm15,%xmm11,%xmm11 vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 vmovdqu 80+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vpxor %xmm1,%xmm4,%xmm4 vmovdqu 64-32(%r9),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vmovups 64-128(%rcx),%xmm15 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 vaesenc %xmm15,%xmm10,%xmm10 movbeq 72(%r14),%r13 vpxor %xmm5,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 movbeq 64(%r14),%r12 vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 vmovdqu 96+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,48+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,56+8(%rsp) vpxor %xmm2,%xmm4,%xmm4 vmovdqu 96-32(%r9),%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vmovups 80-128(%rcx),%xmm15 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm5,%xmm6,%xmm6 
vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 vaesenc %xmm15,%xmm10,%xmm10 movbeq 56(%r14),%r13 vpxor %xmm1,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 vpxor 112+8(%rsp),%xmm8,%xmm8 vaesenc %xmm15,%xmm11,%xmm11 movbeq 48(%r14),%r12 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,64+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,72+8(%rsp) vpxor %xmm3,%xmm4,%xmm4 vmovdqu 112-32(%r9),%xmm3 vaesenc %xmm15,%xmm14,%xmm14 vmovups 96-128(%rcx),%xmm15 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 vaesenc %xmm15,%xmm10,%xmm10 movbeq 40(%r14),%r13 vpxor %xmm2,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 movbeq 32(%r14),%r12 vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,80+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,88+8(%rsp) vpxor %xmm5,%xmm6,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor %xmm1,%xmm6,%xmm6 vmovups 112-128(%rcx),%xmm15 vpslldq $8,%xmm6,%xmm5 vpxor %xmm2,%xmm4,%xmm4 vmovdqu 16(%r11),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm8,%xmm7,%xmm7 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm5,%xmm4,%xmm4 movbeq 24(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 16(%r14),%r12 vpalignr $8,%xmm4,%xmm4,%xmm0 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 movq %r13,96+8(%rsp) vaesenc %xmm15,%xmm12,%xmm12 movq %r12,104+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 vmovups 128-128(%rcx),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vmovups 144-128(%rcx),%xmm15 vaesenc %xmm1,%xmm10,%xmm10 vpsrldq $8,%xmm6,%xmm6 vaesenc %xmm1,%xmm11,%xmm11 vpxor %xmm6,%xmm7,%xmm7 vaesenc %xmm1,%xmm12,%xmm12 vpxor %xmm0,%xmm4,%xmm4 movbeq 8(%r14),%r13 vaesenc %xmm1,%xmm13,%xmm13 movbeq 0(%r14),%r12 vaesenc %xmm1,%xmm14,%xmm14 vmovups 160-128(%rcx),%xmm1 cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. jb .Lenc_tail vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 176-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 192-128(%rcx),%xmm1 cmpl $14,%ebp // ICP does not zero key schedule. 
jb .Lenc_tail vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 208-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 224-128(%rcx),%xmm1 jmp .Lenc_tail -.align 32 +.balign 32 .Lhandle_ctr32: vmovdqu (%r11),%xmm0 vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vmovdqu 0-32(%r9),%xmm3 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm15,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm15,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpshufb %xmm0,%xmm14,%xmm14 vpshufb %xmm0,%xmm1,%xmm1 jmp .Lresume_ctr32 -.align 32 +.balign 32 .Lenc_tail: vaesenc %xmm15,%xmm9,%xmm9 vmovdqu %xmm7,16+8(%rsp) vpalignr $8,%xmm4,%xmm4,%xmm8 vaesenc %xmm15,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 vpxor 0(%rdi),%xmm1,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 vpxor 16(%rdi),%xmm1,%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vpxor 32(%rdi),%xmm1,%xmm5 vaesenc %xmm15,%xmm13,%xmm13 vpxor 48(%rdi),%xmm1,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor 64(%rdi),%xmm1,%xmm7 vpxor 80(%rdi),%xmm1,%xmm3 vmovdqu (%r8),%xmm1 vaesenclast %xmm2,%xmm9,%xmm9 vmovdqu 32(%r11),%xmm2 vaesenclast %xmm0,%xmm10,%xmm10 vpaddb %xmm2,%xmm1,%xmm0 movq %r13,112+8(%rsp) leaq 96(%rdi),%rdi vaesenclast %xmm5,%xmm11,%xmm11 vpaddb %xmm2,%xmm0,%xmm5 movq %r12,120+8(%rsp) leaq 96(%rsi),%rsi vmovdqu 0-128(%rcx),%xmm15 vaesenclast %xmm6,%xmm12,%xmm12 vpaddb %xmm2,%xmm5,%xmm6 vaesenclast %xmm7,%xmm13,%xmm13 vpaddb %xmm2,%xmm6,%xmm7 vaesenclast %xmm3,%xmm14,%xmm14 vpaddb %xmm2,%xmm7,%xmm3 addq $0x60,%r10 subq $0x6,%rdx jc .L6x_done vmovups %xmm9,-96(%rsi) vpxor %xmm15,%xmm1,%xmm9 vmovups %xmm10,-80(%rsi) vmovdqa %xmm0,%xmm10 vmovups %xmm11,-64(%rsi) vmovdqa %xmm5,%xmm11 vmovups %xmm12,-48(%rsi) vmovdqa %xmm6,%xmm12 vmovups %xmm13,-32(%rsi) vmovdqa %xmm7,%xmm13 vmovups %xmm14,-16(%rsi) vmovdqa %xmm3,%xmm14 vmovdqu 32+8(%rsp),%xmm7 jmp .Loop6x .L6x_done: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 RET .cfi_endproc SET_SIZE(_aesni_ctr32_ghash_6x) #endif /* ifdef HAVE_MOVBE */ -.align 32 +.balign 32 FUNCTION(_aesni_ctr32_ghash_no_movbe_6x) .cfi_startproc ENDBR vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 vmovdqu 0-128(%rcx),%xmm15 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpaddb %xmm2,%xmm11,%xmm12 vpaddb %xmm2,%xmm12,%xmm13 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm15,%xmm1,%xmm9 vmovdqu %xmm4,16+8(%rsp) jmp .Loop6x_nmb -.align 32 +.balign 32 .Loop6x_nmb: addl $100663296,%ebx jc .Lhandle_ctr32_nmb vmovdqu 0-32(%r9),%xmm3 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm15,%xmm10,%xmm10 vpxor %xmm15,%xmm11,%xmm11 .Lresume_ctr32_nmb: vmovdqu %xmm1,(%r8) vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 xorq %r12,%r12 cmpq %r14,%r15 vaesenc %xmm2,%xmm9,%xmm9 vmovdqu 48+8(%rsp),%xmm0 vpxor %xmm15,%xmm13,%xmm13 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 vaesenc %xmm2,%xmm10,%xmm10 vpxor %xmm15,%xmm14,%xmm14 setnc %r12b vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vaesenc %xmm2,%xmm11,%xmm11 vmovdqu 16-32(%r9),%xmm3 negq %r12 vaesenc %xmm2,%xmm12,%xmm12 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 vpxor %xmm4,%xmm8,%xmm8 vaesenc %xmm2,%xmm13,%xmm13 
vpxor %xmm5,%xmm1,%xmm4 andq $0x60,%r12 vmovups 32-128(%rcx),%xmm15 vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 vaesenc %xmm2,%xmm14,%xmm14 vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 leaq (%r14,%r12,1),%r14 vaesenc %xmm15,%xmm9,%xmm9 vpxor 16+8(%rsp),%xmm8,%xmm8 vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 vmovdqu 64+8(%rsp),%xmm0 vaesenc %xmm15,%xmm10,%xmm10 movq 88(%r14),%r13 bswapq %r13 vaesenc %xmm15,%xmm11,%xmm11 movq 80(%r14),%r12 bswapq %r12 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,32+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,40+8(%rsp) vmovdqu 48-32(%r9),%xmm5 vaesenc %xmm15,%xmm14,%xmm14 vmovups 48-128(%rcx),%xmm15 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm3,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 vaesenc %xmm15,%xmm11,%xmm11 vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 vmovdqu 80+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vpxor %xmm1,%xmm4,%xmm4 vmovdqu 64-32(%r9),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vmovups 64-128(%rcx),%xmm15 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 vaesenc %xmm15,%xmm10,%xmm10 movq 72(%r14),%r13 bswapq %r13 vpxor %xmm5,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 movq 64(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 vmovdqu 96+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,48+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,56+8(%rsp) vpxor %xmm2,%xmm4,%xmm4 vmovdqu 96-32(%r9),%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vmovups 80-128(%rcx),%xmm15 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 vaesenc %xmm15,%xmm10,%xmm10 movq 56(%r14),%r13 bswapq %r13 vpxor %xmm1,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 vpxor 112+8(%rsp),%xmm8,%xmm8 vaesenc %xmm15,%xmm11,%xmm11 movq 48(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,64+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,72+8(%rsp) vpxor %xmm3,%xmm4,%xmm4 vmovdqu 112-32(%r9),%xmm3 vaesenc %xmm15,%xmm14,%xmm14 vmovups 96-128(%rcx),%xmm15 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 vaesenc %xmm15,%xmm10,%xmm10 movq 40(%r14),%r13 bswapq %r13 vpxor %xmm2,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 movq 32(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,80+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,88+8(%rsp) vpxor %xmm5,%xmm6,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor %xmm1,%xmm6,%xmm6 vmovups 112-128(%rcx),%xmm15 vpslldq $8,%xmm6,%xmm5 vpxor %xmm2,%xmm4,%xmm4 vmovdqu 16(%r11),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm8,%xmm7,%xmm7 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm5,%xmm4,%xmm4 movq 24(%r14),%r13 bswapq %r13 vaesenc %xmm15,%xmm11,%xmm11 movq 16(%r14),%r12 bswapq %r12 vpalignr $8,%xmm4,%xmm4,%xmm0 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 movq %r13,96+8(%rsp) vaesenc %xmm15,%xmm12,%xmm12 movq %r12,104+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 vmovups 128-128(%rcx),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vmovups 144-128(%rcx),%xmm15 vaesenc %xmm1,%xmm10,%xmm10 vpsrldq $8,%xmm6,%xmm6 vaesenc %xmm1,%xmm11,%xmm11 vpxor %xmm6,%xmm7,%xmm7 vaesenc %xmm1,%xmm12,%xmm12 vpxor %xmm0,%xmm4,%xmm4 movq 
8(%r14),%r13 bswapq %r13 vaesenc %xmm1,%xmm13,%xmm13 movq 0(%r14),%r12 bswapq %r12 vaesenc %xmm1,%xmm14,%xmm14 vmovups 160-128(%rcx),%xmm1 cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. jb .Lenc_tail_nmb vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 176-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 192-128(%rcx),%xmm1 cmpl $14,%ebp // ICP does not zero key schedule. jb .Lenc_tail_nmb vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 208-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 224-128(%rcx),%xmm1 jmp .Lenc_tail_nmb -.align 32 +.balign 32 .Lhandle_ctr32_nmb: vmovdqu (%r11),%xmm0 vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vmovdqu 0-32(%r9),%xmm3 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm15,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm15,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpshufb %xmm0,%xmm14,%xmm14 vpshufb %xmm0,%xmm1,%xmm1 jmp .Lresume_ctr32_nmb -.align 32 +.balign 32 .Lenc_tail_nmb: vaesenc %xmm15,%xmm9,%xmm9 vmovdqu %xmm7,16+8(%rsp) vpalignr $8,%xmm4,%xmm4,%xmm8 vaesenc %xmm15,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 vpxor 0(%rdi),%xmm1,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 vpxor 16(%rdi),%xmm1,%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vpxor 32(%rdi),%xmm1,%xmm5 vaesenc %xmm15,%xmm13,%xmm13 vpxor 48(%rdi),%xmm1,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor 64(%rdi),%xmm1,%xmm7 vpxor 80(%rdi),%xmm1,%xmm3 vmovdqu (%r8),%xmm1 vaesenclast %xmm2,%xmm9,%xmm9 vmovdqu 32(%r11),%xmm2 vaesenclast %xmm0,%xmm10,%xmm10 vpaddb %xmm2,%xmm1,%xmm0 movq %r13,112+8(%rsp) leaq 96(%rdi),%rdi vaesenclast %xmm5,%xmm11,%xmm11 vpaddb %xmm2,%xmm0,%xmm5 movq %r12,120+8(%rsp) leaq 96(%rsi),%rsi vmovdqu 0-128(%rcx),%xmm15 vaesenclast %xmm6,%xmm12,%xmm12 vpaddb %xmm2,%xmm5,%xmm6 vaesenclast %xmm7,%xmm13,%xmm13 vpaddb %xmm2,%xmm6,%xmm7 vaesenclast %xmm3,%xmm14,%xmm14 vpaddb %xmm2,%xmm7,%xmm3 addq $0x60,%r10 subq $0x6,%rdx jc .L6x_done_nmb vmovups %xmm9,-96(%rsi) vpxor %xmm15,%xmm1,%xmm9 vmovups %xmm10,-80(%rsi) vmovdqa %xmm0,%xmm10 vmovups %xmm11,-64(%rsi) vmovdqa %xmm5,%xmm11 vmovups %xmm12,-48(%rsi) vmovdqa %xmm6,%xmm12 vmovups %xmm13,-32(%rsi) vmovdqa %xmm7,%xmm13 vmovups %xmm14,-16(%rsi) vmovdqa %xmm3,%xmm14 vmovdqu 32+8(%rsp),%xmm7 jmp .Loop6x_nmb .L6x_done_nmb: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 RET .cfi_endproc SET_SIZE(_aesni_ctr32_ghash_no_movbe_6x) ENTRY_ALIGN(aesni_gcm_decrypt, 32) .cfi_startproc ENDBR xorq %r10,%r10 cmpq $0x60,%rdx jb .Lgcm_dec_abort leaq (%rsp),%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 pushq %r9 .cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq .Lbswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq 
$0xf80,%r15 vmovdqu (%r9),%xmm8 andq $-128,%rsp vmovdqu (%r11),%xmm0 leaq 128(%rcx),%rcx movq 32(%r9),%r9 leaq 32(%r9),%r9 movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. vpshufb %xmm0,%xmm8,%xmm8 andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc .Ldec_no_key_aliasing cmpq $768,%r15 jnc .Ldec_no_key_aliasing subq %r15,%rsp .Ldec_no_key_aliasing: vmovdqu 80(%rdi),%xmm7 leaq (%rdi),%r14 vmovdqu 64(%rdi),%xmm4 leaq -192(%rdi,%rdx,1),%r15 vmovdqu 48(%rdi),%xmm5 shrq $4,%rdx xorq %r10,%r10 vmovdqu 32(%rdi),%xmm6 vpshufb %xmm0,%xmm7,%xmm7 vmovdqu 16(%rdi),%xmm2 vpshufb %xmm0,%xmm4,%xmm4 vmovdqu (%rdi),%xmm3 vpshufb %xmm0,%xmm5,%xmm5 vmovdqu %xmm4,48(%rsp) vpshufb %xmm0,%xmm6,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm2,%xmm2 vmovdqu %xmm6,80(%rsp) vpshufb %xmm0,%xmm3,%xmm3 vmovdqu %xmm2,96(%rsp) vmovdqu %xmm3,112(%rsp) #ifdef HAVE_MOVBE #ifdef _KERNEL testl $1,gcm_avx_can_use_movbe(%rip) #else testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip) #endif jz 1f call _aesni_ctr32_ghash_6x jmp 2f 1: #endif call _aesni_ctr32_ghash_no_movbe_6x 2: vmovups %xmm9,-96(%rsi) vmovups %xmm10,-80(%rsi) vmovups %xmm11,-64(%rsi) vmovups %xmm12,-48(%rsi) vmovups %xmm13,-32(%rsi) vmovups %xmm14,-16(%rsi) vpshufb (%r11),%xmm8,%xmm8 movq -56(%rax),%r9 .cfi_restore %r9 vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lgcm_dec_abort: movq %r10,%rax RET .cfi_endproc SET_SIZE(aesni_gcm_decrypt) -.align 32 +.balign 32 FUNCTION(_aesni_ctr32_6x) .cfi_startproc ENDBR vmovdqu 0-128(%rcx),%xmm4 vmovdqu 32(%r11),%xmm2 leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. 
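	// %r13 = Nr - 2: it counts the vaesenc iterations of .Loop_ctr32 below;
	// the round-0 whitening (vpxor), the round Nr-1 vaesenc and the final
	// vaesenclast are handled outside of that loop.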
vmovups 16-128(%rcx),%xmm15 leaq 32-128(%rcx),%r12 vpxor %xmm4,%xmm1,%xmm9 addl $100663296,%ebx jc .Lhandle_ctr32_2 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddb %xmm2,%xmm11,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddb %xmm2,%xmm12,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 -.align 16 +.balign 16 .Loop_ctr32: vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vmovups (%r12),%xmm15 leaq 16(%r12),%r12 decl %r13d jnz .Loop_ctr32 vmovdqu (%r12),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor 0(%rdi),%xmm3,%xmm4 vaesenc %xmm15,%xmm10,%xmm10 vpxor 16(%rdi),%xmm3,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 vpxor 32(%rdi),%xmm3,%xmm6 vaesenc %xmm15,%xmm12,%xmm12 vpxor 48(%rdi),%xmm3,%xmm8 vaesenc %xmm15,%xmm13,%xmm13 vpxor 64(%rdi),%xmm3,%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vpxor 80(%rdi),%xmm3,%xmm3 leaq 96(%rdi),%rdi vaesenclast %xmm4,%xmm9,%xmm9 vaesenclast %xmm5,%xmm10,%xmm10 vaesenclast %xmm6,%xmm11,%xmm11 vaesenclast %xmm8,%xmm12,%xmm12 vaesenclast %xmm2,%xmm13,%xmm13 vaesenclast %xmm3,%xmm14,%xmm14 vmovups %xmm9,0(%rsi) vmovups %xmm10,16(%rsi) vmovups %xmm11,32(%rsi) vmovups %xmm12,48(%rsi) vmovups %xmm13,64(%rsi) vmovups %xmm14,80(%rsi) leaq 96(%rsi),%rsi RET -.align 32 +.balign 32 .Lhandle_ctr32_2: vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpshufb %xmm0,%xmm14,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpshufb %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 .cfi_endproc SET_SIZE(_aesni_ctr32_6x) ENTRY_ALIGN(aesni_gcm_encrypt, 32) .cfi_startproc ENDBR xorq %r10,%r10 cmpq $288,%rdx jb .Lgcm_enc_abort leaq (%rsp),%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 pushq %r9 .cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq .Lbswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq $0xf80,%r15 leaq 128(%rcx),%rcx vmovdqu (%r11),%xmm0 andq $-128,%rsp movl 504-128(%rcx),%ebp // ICP has an larger offset for rounds. 
andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc .Lenc_no_key_aliasing cmpq $768,%r15 jnc .Lenc_no_key_aliasing subq %r15,%rsp .Lenc_no_key_aliasing: leaq (%rsi),%r14 leaq -192(%rsi,%rdx,1),%r15 shrq $4,%rdx call _aesni_ctr32_6x vpshufb %xmm0,%xmm9,%xmm8 vpshufb %xmm0,%xmm10,%xmm2 vmovdqu %xmm8,112(%rsp) vpshufb %xmm0,%xmm11,%xmm4 vmovdqu %xmm2,96(%rsp) vpshufb %xmm0,%xmm12,%xmm5 vmovdqu %xmm4,80(%rsp) vpshufb %xmm0,%xmm13,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm14,%xmm7 vmovdqu %xmm6,48(%rsp) call _aesni_ctr32_6x vmovdqu (%r9),%xmm8 movq 32(%r9),%r9 leaq 32(%r9),%r9 subq $12,%rdx movq $192,%r10 vpshufb %xmm0,%xmm8,%xmm8 #ifdef HAVE_MOVBE #ifdef _KERNEL testl $1,gcm_avx_can_use_movbe(%rip) #else testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip) #endif jz 1f call _aesni_ctr32_ghash_6x jmp 2f 1: #endif call _aesni_ctr32_ghash_no_movbe_6x 2: vmovdqu 32(%rsp),%xmm7 vmovdqu (%r11),%xmm0 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm7,%xmm7,%xmm1 vmovdqu 32-32(%r9),%xmm15 vmovups %xmm9,-96(%rsi) vpshufb %xmm0,%xmm9,%xmm9 vpxor %xmm7,%xmm1,%xmm1 vmovups %xmm10,-80(%rsi) vpshufb %xmm0,%xmm10,%xmm10 vmovups %xmm11,-64(%rsi) vpshufb %xmm0,%xmm11,%xmm11 vmovups %xmm12,-48(%rsi) vpshufb %xmm0,%xmm12,%xmm12 vmovups %xmm13,-32(%rsi) vpshufb %xmm0,%xmm13,%xmm13 vmovups %xmm14,-16(%rsi) vpshufb %xmm0,%xmm14,%xmm14 vmovdqu %xmm9,16(%rsp) vmovdqu 48(%rsp),%xmm6 vmovdqu 16-32(%r9),%xmm0 vpunpckhqdq %xmm6,%xmm6,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 vpxor %xmm6,%xmm2,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vmovdqu 64(%rsp),%xmm9 vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm9,%xmm9,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 vpxor %xmm9,%xmm5,%xmm5 vpxor %xmm7,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vmovdqu 80(%rsp),%xmm1 vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm4,%xmm7,%xmm7 vpunpckhqdq %xmm1,%xmm1,%xmm4 vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpxor %xmm6,%xmm9,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 96(%rsp),%xmm2 vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm7,%xmm6,%xmm6 vpunpckhqdq %xmm2,%xmm2,%xmm7 vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 vpxor %xmm2,%xmm7,%xmm7 vpxor %xmm9,%xmm1,%xmm1 vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm5,%xmm4,%xmm4 vpxor 112(%rsp),%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 vmovdqu 112-32(%r9),%xmm0 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpxor %xmm6,%xmm5,%xmm5 vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 vpxor %xmm4,%xmm7,%xmm4 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm1 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 vpxor %xmm14,%xmm1,%xmm1 vpxor %xmm5,%xmm6,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 vmovdqu 32-32(%r9),%xmm15 vpxor %xmm2,%xmm8,%xmm7 vpxor %xmm4,%xmm9,%xmm6 vmovdqu 16-32(%r9),%xmm0 vpxor %xmm5,%xmm7,%xmm9 vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 vpxor %xmm9,%xmm6,%xmm6 vpunpckhqdq %xmm13,%xmm13,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 vpxor %xmm13,%xmm2,%xmm2 vpslldq $8,%xmm6,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vpxor %xmm9,%xmm5,%xmm8 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm6,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm12,%xmm12,%xmm9 vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 vpxor %xmm12,%xmm9,%xmm9 vpxor %xmm14,%xmm13,%xmm13 
vpalignr $8,%xmm8,%xmm8,%xmm14 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm11,%xmm11,%xmm1 vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 vpxor %xmm11,%xmm1,%xmm1 vpxor %xmm13,%xmm12,%xmm12 vxorps 16(%rsp),%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm9,%xmm9 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm10,%xmm10,%xmm2 vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 vpxor %xmm10,%xmm2,%xmm2 vpalignr $8,%xmm8,%xmm8,%xmm14 vpxor %xmm12,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm9,%xmm1,%xmm1 vxorps %xmm7,%xmm14,%xmm14 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 vmovdqu 112-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm11,%xmm10,%xmm10 vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 vpxor %xmm4,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 vpxor %xmm10,%xmm7,%xmm7 vpxor %xmm2,%xmm6,%xmm6 vpxor %xmm5,%xmm7,%xmm4 vpxor %xmm4,%xmm6,%xmm6 vpslldq $8,%xmm6,%xmm1 vmovdqu 16(%r11),%xmm3 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm1,%xmm5,%xmm8 vpxor %xmm6,%xmm7,%xmm7 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm2,%xmm8,%xmm8 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm7,%xmm2,%xmm2 vpxor %xmm2,%xmm8,%xmm8 vpshufb (%r11),%xmm8,%xmm8 movq -56(%rax),%r9 .cfi_restore %r9 vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lgcm_enc_abort: movq %r10,%rax RET .cfi_endproc SET_SIZE(aesni_gcm_encrypt) #endif /* !_WIN32 || _KERNEL */ /* Some utility routines */ /* * clear all fpu registers * void clear_fpu_regs_avx(void); */ ENTRY_ALIGN(clear_fpu_regs_avx, 32) vzeroall RET SET_SIZE(clear_fpu_regs_avx) /* * void gcm_xor_avx(const uint8_t *src, uint8_t *dst); * * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and * stores the result at `dst'. The XOR is performed using FPU registers, * so make sure FPU state is saved when running this in the kernel. */ ENTRY_ALIGN(gcm_xor_avx, 32) movdqu (%rdi), %xmm0 movdqu (%rsi), %xmm1 pxor %xmm1, %xmm0 movdqu %xmm0, (%rsi) RET SET_SIZE(gcm_xor_avx) /* * Toggle a boolean_t value atomically and return the new value. 
* boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); */ ENTRY_ALIGN(atomic_toggle_boolean_nv, 32) xorl %eax, %eax lock xorl $1, (%rdi) jz 1f movl $1, %eax 1: RET SET_SIZE(atomic_toggle_boolean_nv) SECTION_STATIC -.align 64 +.balign 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lpoly: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .Lone_msb: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .Ltwo_lsb: .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .Lone_lsb: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 +.balign 64 /* Mark the stack non-executable. */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif #endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S index eb9514e10cda..e40b3df32753 100644 --- a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S +++ b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S @@ -1,254 +1,254 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2009 Intel Corporation * All Rights Reserved. */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Accelerated GHASH implementation with Intel PCLMULQDQ-NI * instructions. This file contains an accelerated * Galois Field Multiplication implementation. * * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, * carry-less multiplication. More information about PCLMULQDQ can be * found at: * http://software.intel.com/en-us/articles/ * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ * */ /* * ==================================================================== * OpenSolaris OS modifications * * This source originates as file galois_hash_asm.c from * Intel Corporation dated September 21, 2009. * * This OpenSolaris version has these major changes from the original source: * * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function * definition for lint. * * 2. Formatted code, added comments, and added #includes and #defines. * * 3. If bit CR0.TS is set, clear and set the TS bit, after and before * calling kpreempt_disable() and kpreempt_enable(). * If the TS bit is not set, Save and restore %xmm registers at the beginning * and end of function calls (%xmm* registers are not saved and restored by * during kernel thread preemption). * * 4. Removed code to perform hashing. This is already done with C macro * GHASH in gcm.c. 
For better performance, this removed code should be * reintegrated in the future to replace the C GHASH macro. * * 5. Added code to byte swap 16-byte input and output. * * 6. Folded in comments from the original C source with embedded assembly * (SB_w_shift_xor.c) * * 7. Renamed function and reordered parameters to match OpenSolaris: * Intel interface: * void galois_hash_asm(unsigned char *hk, unsigned char *s, * unsigned char *d, int length) * OpenSolaris OS interface: * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); * ==================================================================== */ #if defined(lint) || defined(__lint) /* lint */ #include void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { (void) x_in, (void) y, (void) res; } #elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */ #define _ASM #include /* * Use this mask to byte-swap a 16-byte integer with the pshufb instruction */ // static uint8_t byte_swap16_mask[] = { // 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 }; .section .rodata -.align XMM_ALIGN +.balign XMM_ALIGN .Lbyte_swap16_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 /* * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); * * Perform a carry-less multiplication (that is, use XOR instead of the * multiply operator) on P1 and P2 and place the result in P3. * * Byte swap the input and the output. * * Note: x_in, y, and res all point to a block of 20-byte numbers * (an array of two 64-bit integers). * * Note2: For kernel code, caller is responsible for ensuring * kpreempt_disable() has been called. This is because %xmm registers are * not saved/restored. Clear and set the CR0.TS bit on entry and exit, * respectively, if TS is set on entry. Otherwise, if TS is not set, * save and restore %xmm registers on the stack. * * Note3: Original Intel definition: * void galois_hash_asm(unsigned char *hk, unsigned char *s, * unsigned char *d, int length) * * Note4: Register/parameter mapping: * Intel: * Parameter 1: %rcx (copied to %xmm0) hk or x_in * Parameter 2: %rdx (copied to %xmm1) s or y * Parameter 3: %rdi (result) d or res * OpenSolaris: * Parameter 1: %rdi (copied to %xmm0) x_in * Parameter 2: %rsi (copied to %xmm1) y * Parameter 3: %rdx (result) res */ ENTRY_NP(gcm_mul_pclmulqdq) // // Copy Parameters // movdqu (%rdi), %xmm0 // P1 movdqu (%rsi), %xmm1 // P2 // // Byte swap 16-byte input // lea .Lbyte_swap16_mask(%rip), %rax movups (%rax), %xmm10 pshufb %xmm10, %xmm0 pshufb %xmm10, %xmm1 // // Multiply with the hash key // movdqu %xmm0, %xmm3 pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0 movdqu %xmm0, %xmm4 pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1 movdqu %xmm0, %xmm5 pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0 movdqu %xmm0, %xmm6 pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1 pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0 movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5 psrldq $8, %xmm4 // shift by xmm4 64 bits to the right pslldq $8, %xmm5 // shift by xmm5 64 bits to the left pxor %xmm5, %xmm3 pxor %xmm4, %xmm6 // Register pair holds the result // of the carry-less multiplication of // xmm0 by xmm1. // We shift the result of the multiplication by one bit position // to the left to cope for the fact that the bits are reversed. 
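	// There is no full 128-bit bit shift instruction, so the 1-bit left
	// shift of the product held in xmm6:xmm3 is assembled per 32-bit lane:
	// pslld $1 shifts each lane, psrld $31 captures the bit carried out of
	// every lane, pslldq $4 moves those carries into the next lane up, and
	// psrldq $12 routes the carry out of the top lane of xmm3 into the low
	// lane of xmm6.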
movdqu %xmm3, %xmm7 movdqu %xmm6, %xmm8 pslld $1, %xmm3 pslld $1, %xmm6 psrld $31, %xmm7 psrld $31, %xmm8 movdqu %xmm7, %xmm9 pslldq $4, %xmm8 pslldq $4, %xmm7 psrldq $12, %xmm9 por %xmm7, %xmm3 por %xmm8, %xmm6 por %xmm9, %xmm6 // // First phase of the reduction // // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts // independently. movdqu %xmm3, %xmm7 movdqu %xmm3, %xmm8 movdqu %xmm3, %xmm9 pslld $31, %xmm7 // packed right shift shifting << 31 pslld $30, %xmm8 // packed right shift shifting << 30 pslld $25, %xmm9 // packed right shift shifting << 25 pxor %xmm8, %xmm7 // xor the shifted versions pxor %xmm9, %xmm7 movdqu %xmm7, %xmm8 pslldq $12, %xmm7 psrldq $4, %xmm8 pxor %xmm7, %xmm3 // first phase of the reduction complete // // Second phase of the reduction // // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these // shift operations. movdqu %xmm3, %xmm2 movdqu %xmm3, %xmm4 // packed left shifting >> 1 movdqu %xmm3, %xmm5 psrld $1, %xmm2 psrld $2, %xmm4 // packed left shifting >> 2 psrld $7, %xmm5 // packed left shifting >> 7 pxor %xmm4, %xmm2 // xor the shifted versions pxor %xmm5, %xmm2 pxor %xmm8, %xmm2 pxor %xmm2, %xmm3 pxor %xmm3, %xmm6 // the result is in xmm6 // // Byte swap 16-byte result // pshufb %xmm10, %xmm6 // %xmm10 has the swap mask // // Store the result // movdqu %xmm6, (%rdx) // P3 // // Return // RET SET_SIZE(gcm_mul_pclmulqdq) #endif /* lint || __lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/modes/ghash-x86_64.S b/module/icp/asm-x86_64/modes/ghash-x86_64.S index d48b4f2155cc..f62e056d4b64 100644 --- a/module/icp/asm-x86_64/modes/ghash-x86_64.S +++ b/module/icp/asm-x86_64/modes/ghash-x86_64.S @@ -1,720 +1,720 @@ # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # March, June 2010 # # The module implements "4-bit" GCM GHASH function and underlying # single multiplication operation in GF(2^128). "4-bit" means that # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH # function features so called "528B" variant utilizing additional # 256+16 bytes of per-key storage [+512 bytes shared table]. # Performance results are for this streamed GHASH subroutine and are # expressed in cycles per processed byte, less is better: # # gcc 3.4.x(*) assembler # # P4 28.6 14.0 +100% # Opteron 19.3 7.7 +150% # Core2 17.8 8.1(**) +120% # Atom 31.6 16.8 +88% # VIA Nano 21.8 10.1 +115% # # (*) comparison is not completely fair, because C results are # for vanilla "256B" implementation, while assembler results # are for "528B";-) # (**) it's mystery [to me] why Core2 result is not same as for # Opteron; # May 2010 # # Add PCLMULQDQ version performing at 2.02 cycles per processed byte. # See ghash-x86.pl for background information and details about coding # techniques. 
# # Special thanks to David Woodhouse for providing access to a # Westmere-based system on behalf of Intel Open Source Technology Centre. # December 2012 # # Overhaul: aggregate Karatsuba post-processing, improve ILP in # reduction_alg9, increase reduction aggregate factor to 4x. As for # the latter. ghash-x86.pl discusses that it makes lesser sense to # increase aggregate factor. Then why increase here? Critical path # consists of 3 independent pclmulqdq instructions, Karatsuba post- # processing and reduction. "On top" of this we lay down aggregated # multiplication operations, triplets of independent pclmulqdq's. As # issue rate for pclmulqdq is limited, it makes lesser sense to # aggregate more multiplications than it takes to perform remaining # non-multiplication operations. 2x is near-optimal coefficient for # contemporary Intel CPUs (therefore modest improvement coefficient), # but not for Bulldozer. Latter is because logical SIMD operations # are twice as slow in comparison to Intel, so that critical path is # longer. A CPU with higher pclmulqdq issue rate would also benefit # from higher aggregate factor... # # Westmere 1.78(+13%) # Sandy Bridge 1.80(+8%) # Ivy Bridge 1.80(+7%) # Haswell 0.55(+93%) (if system doesn't support AVX) # Broadwell 0.45(+110%)(if system doesn't support AVX) # Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) # Knights L 2.12(-) (if system doesn't support AVX) # Goldmont 1.08(+24%) # March 2013 # # ... 8x aggregate factor AVX code path is using reduction algorithm # suggested by Shay Gueron[1]. Even though contemporary AVX-capable # CPUs such as Sandy and Ivy Bridge can execute it, the code performs # sub-optimally in comparison to above mentioned version. But thanks # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that # it performs in 0.41 cycles per byte on Haswell processor, in # 0.29 on Broadwell, and in 0.36 on Skylake. # # Knights Landing achieves 1.09 cpb. # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # Generated once from # https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl # and modified for ICP. Modification are kept at a bare minimum to ease later # upstream merges. 
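For reference, the GHASH routines below all reduce to multiplication in GF(2^128) with the reflected bit order used by GCM. The following is a minimal, unoptimized C sketch of that multiplication (the plain shift-and-xor method from the GCM specification, not the table-driven or carry-less-multiply techniques this file actually uses); the ghash_block_t and gf128_mul_sketch names are illustrative only:

#include <stdint.h>

/* One 16-byte GHASH block, kept as two big-endian 64-bit halves. */
typedef struct { uint64_t hi, lo; } ghash_block_t;

/* x = x * y in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, GHASH bit order. */
static void gf128_mul_sketch(ghash_block_t *x, const ghash_block_t *y)
{
	ghash_block_t z = { 0, 0 };
	ghash_block_t v = *y;

	for (int i = 0; i < 128; i++) {
		/* Bit i of x, counting from the MSB of its first byte. */
		uint64_t bit = (i < 64) ?
		    (x->hi >> (63 - i)) & 1 : (x->lo >> (127 - i)) & 1;
		if (bit) {
			z.hi ^= v.hi;
			z.lo ^= v.lo;
		}
		/* v = v * alpha: shift right one bit, reduce with 0xE1||0^120. */
		uint64_t carry = v.lo & 1;
		v.lo = (v.lo >> 1) | (v.hi << 63);
		v.hi >>= 1;
		if (carry)
			v.hi ^= 0xe100000000000000ULL;
	}
	*x = z;
}

gcm_init_htab_avx below precomputes a table of powers of the hash key H in this field so that gcm_ghash_avx can fold eight input blocks per pass of its main loop.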
#if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) #define _ASM #include .text /* Windows userland links with OpenSSL */ #if !defined (_WIN32) || defined (_KERNEL) ENTRY_ALIGN(gcm_gmult_clmul, 16) .cfi_startproc ENDBR .L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 movdqu (%rsi),%xmm2 movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) RET .cfi_endproc SET_SIZE(gcm_gmult_clmul) #endif /* !_WIN32 || _KERNEL */ ENTRY_ALIGN(gcm_init_htab_avx, 32) .cfi_startproc ENDBR vzeroupper vmovdqu (%rsi),%xmm2 // KCF/ICP stores H in network byte order with the hi qword first // so we need to swap all bytes, not the 2 qwords. vmovdqu .Lbswap_mask(%rip),%xmm4 vpshufb %xmm4,%xmm2,%xmm2 vpshufd $255,%xmm2,%xmm4 vpsrlq $63,%xmm2,%xmm3 vpsllq $1,%xmm2,%xmm2 vpxor %xmm5,%xmm5,%xmm5 vpcmpgtd %xmm4,%xmm5,%xmm5 vpslldq $8,%xmm3,%xmm3 vpor %xmm3,%xmm2,%xmm2 vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm2,%xmm2,%xmm6 vmovdqa %xmm2,%xmm0 vpxor %xmm2,%xmm6,%xmm6 movq $4,%r10 jmp .Linit_start_avx -.align 32 +.balign 32 .Linit_loop_avx: vpalignr $8,%xmm3,%xmm4,%xmm5 vmovdqu %xmm5,-16(%rdi) vpunpckhqdq %xmm0,%xmm0,%xmm3 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 vpxor %xmm0,%xmm1,%xmm4 vpxor %xmm4,%xmm3,%xmm3 vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 vpxor %xmm3,%xmm4,%xmm4 vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $1,%xmm0,%xmm0 vpxor %xmm1,%xmm0,%xmm0 .Linit_start_avx: vmovdqa %xmm0,%xmm5 vpunpckhqdq %xmm0,%xmm0,%xmm3 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 vpxor %xmm0,%xmm1,%xmm4 vpxor %xmm4,%xmm3,%xmm3 vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 vpxor %xmm3,%xmm4,%xmm4 vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $1,%xmm0,%xmm0 vpxor %xmm1,%xmm0,%xmm0 vpshufd $78,%xmm5,%xmm3 vpshufd $78,%xmm0,%xmm4 vpxor %xmm5,%xmm3,%xmm3 vmovdqu %xmm5,0(%rdi) vpxor %xmm0,%xmm4,%xmm4 vmovdqu %xmm0,16(%rdi) leaq 48(%rdi),%rdi subq $1,%r10 jnz .Linit_loop_avx vpalignr $8,%xmm4,%xmm3,%xmm5 vmovdqu %xmm5,-16(%rdi) vzeroupper RET .cfi_endproc SET_SIZE(gcm_init_htab_avx) #if !defined (_WIN32) || defined (_KERNEL) 
ENTRY_ALIGN(gcm_gmult_avx, 32) .cfi_startproc ENDBR jmp .L_gmult_clmul .cfi_endproc SET_SIZE(gcm_gmult_avx) ENTRY_ALIGN(gcm_ghash_avx, 32) .cfi_startproc ENDBR vzeroupper vmovdqu (%rdi),%xmm10 leaq .L0x1c2_polynomial(%rip),%r10 leaq 64(%rsi),%rsi vmovdqu .Lbswap_mask(%rip),%xmm13 vpshufb %xmm13,%xmm10,%xmm10 cmpq $0x80,%rcx jb .Lshort_avx subq $0x80,%rcx vmovdqu 112(%rdx),%xmm14 vmovdqu 0-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm14 vmovdqu 32-64(%rsi),%xmm7 vpunpckhqdq %xmm14,%xmm14,%xmm9 vmovdqu 96(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm14,%xmm9,%xmm9 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 16-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vmovdqu 80(%rdx),%xmm14 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 48-64(%rsi),%xmm6 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 64(%rdx),%xmm15 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 80-64(%rsi),%xmm7 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vmovdqu 48(%rdx),%xmm14 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm4,%xmm1,%xmm1 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 96-64(%rsi),%xmm6 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 128-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 32(%rdx),%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vmovdqu 16(%rdx),%xmm14 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm4,%xmm1,%xmm1 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 144-64(%rsi),%xmm6 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 176-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu (%rdx),%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 160-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 leaq 128(%rdx),%rdx cmpq $0x80,%rcx jb .Ltail_avx vpxor %xmm10,%xmm15,%xmm15 subq $0x80,%rcx jmp .Loop8x_avx -.align 32 +.balign 32 .Loop8x_avx: vpunpckhqdq %xmm15,%xmm15,%xmm8 vmovdqu 112(%rdx),%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpxor %xmm15,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 vmovdqu 0-64(%rsi),%xmm6 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 vmovdqu 32-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 96(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm3,%xmm10,%xmm10 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vxorps %xmm4,%xmm11,%xmm11 vmovdqu 16-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm5,%xmm12,%xmm12 vxorps %xmm15,%xmm8,%xmm8 vmovdqu 80(%rdx),%xmm14 vpxor %xmm10,%xmm12,%xmm12 vpclmulqdq 
$0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm11,%xmm12,%xmm12 vpslldq $8,%xmm12,%xmm9 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vpsrldq $8,%xmm12,%xmm12 vpxor %xmm9,%xmm10,%xmm10 vmovdqu 48-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm14 vxorps %xmm12,%xmm11,%xmm11 vpxor %xmm1,%xmm4,%xmm4 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 80-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 64(%rdx),%xmm15 vpalignr $8,%xmm10,%xmm10,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm4,%xmm1,%xmm1 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vxorps %xmm15,%xmm8,%xmm8 vpxor %xmm5,%xmm2,%xmm2 vmovdqu 48(%rdx),%xmm14 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 96-64(%rsi),%xmm6 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 128-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 32(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm4,%xmm1,%xmm1 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vpxor %xmm5,%xmm2,%xmm2 vxorps %xmm12,%xmm10,%xmm10 vmovdqu 16(%rdx),%xmm14 vpalignr $8,%xmm10,%xmm10,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 144-64(%rsi),%xmm6 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 vxorps %xmm11,%xmm12,%xmm12 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 176-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu (%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 160-64(%rsi),%xmm6 vpxor %xmm12,%xmm15,%xmm15 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 vpxor %xmm10,%xmm15,%xmm15 leaq 128(%rdx),%rdx subq $0x80,%rcx jnc .Loop8x_avx addq $0x80,%rcx jmp .Ltail_no_xor_avx -.align 32 +.balign 32 .Lshort_avx: vmovdqu -16(%rdx,%rcx,1),%xmm14 leaq (%rdx,%rcx,1),%rdx vmovdqu 0-64(%rsi),%xmm6 vmovdqu 32-64(%rsi),%xmm7 vpshufb %xmm13,%xmm14,%xmm15 vmovdqa %xmm0,%xmm3 vmovdqa %xmm1,%xmm4 vmovdqa %xmm2,%xmm5 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -32(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 16-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -48(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 48-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu 80-64(%rsi),%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -64(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor 
%xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -80(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 96-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu 128-64(%rsi),%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -96(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -112(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 144-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovq 184-64(%rsi),%xmm7 subq $0x10,%rcx jmp .Ltail_avx -.align 32 +.balign 32 .Ltail_avx: vpxor %xmm10,%xmm15,%xmm15 .Ltail_no_xor_avx: vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu (%r10),%xmm12 vpxor %xmm0,%xmm3,%xmm10 vpxor %xmm1,%xmm4,%xmm11 vpxor %xmm2,%xmm5,%xmm5 vpxor %xmm10,%xmm5,%xmm5 vpxor %xmm11,%xmm5,%xmm5 vpslldq $8,%xmm5,%xmm9 vpsrldq $8,%xmm5,%xmm5 vpxor %xmm9,%xmm10,%xmm10 vpxor %xmm5,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 vpalignr $8,%xmm10,%xmm10,%xmm10 vpxor %xmm9,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 vpalignr $8,%xmm10,%xmm10,%xmm10 vpxor %xmm11,%xmm10,%xmm10 vpxor %xmm9,%xmm10,%xmm10 cmpq $0,%rcx jne .Lshort_avx vpshufb %xmm13,%xmm10,%xmm10 vmovdqu %xmm10,(%rdi) vzeroupper RET .cfi_endproc SET_SIZE(gcm_ghash_avx) #endif /* !_WIN32 || _KERNEL */ SECTION_STATIC -.align 64 +.balign 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .L7_mask: .long 7,0,7,0 .L7_mask_poly: .long 7,0,450,0 -.align 64 +.balign 64 SET_OBJ(.Lrem_4bit) .Lrem_4bit: .long 0,0,0,471859200,0,943718400,0,610271232 .long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 .long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 .long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 SET_OBJ(.Lrem_8bit) .Lrem_8bit: .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E .value 
0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 +.balign 64 /* Mark the stack non-executable. */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif #endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S index 321d5da461db..f3d701528459 100644 --- a/module/icp/asm-x86_64/sha2/sha256_impl.S +++ b/module/icp/asm-x86_64/sha2/sha256_impl.S @@ -1,2090 +1,2090 @@ /* * ==================================================================== * Written by Andy Polyakov for the OpenSSL * project. Rights for redistribution and usage in source and binary * forms are granted according to the OpenSSL license. * ==================================================================== * * sha256/512_block procedure for x86_64. * * 40% improvement over compiler-generated code on Opteron. On EM64T * sha256 was observed to run >80% faster and sha512 - >40%. No magical * tricks, just straight implementation... I really wonder why gcc * [being armed with inline assembler] fails to generate as fast code. * The only thing which is cool about this module is that it's very * same instruction sequence used for both SHA-256 and SHA-512. In * former case the instructions operate on 32-bit operands, while in * latter - on 64-bit ones. All I had to do is to get one flavor right, * the other one passed the test right away:-) * * sha256_block runs in ~1005 cycles on Opteron, which gives you * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock * frequency in GHz. sha512_block runs in ~1275 cycles, which results * in 128*1000/1275=100MBps per GHz. Is there room for improvement? * Well, if you compare it to IA-64 implementation, which maintains * X[16] in register bank[!], tends to 4 instructions per CPU clock * cycle and runs in 1003 cycles, 1275 is very good result for 3-way * issue Opteron pipeline and X[16] maintained in memory. 
So that *if* * there is a way to improve it, *then* the only way would be to try to * offload X[16] updates to SSE unit, but that would require "deeper" * loop unroll, which in turn would naturally cause size blow-up, not * to mention increased complexity! And once again, only *if* it's * actually possible to noticeably improve overall ILP, instruction * level parallelism, on a given CPU implementation in this case. * * Special note on Intel EM64T. While Opteron CPU exhibits perfect * performance ratio of 1.5 between 64- and 32-bit flavors [see above], * [currently available] EM64T CPUs apparently are far from it. On the * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit * sha256_block:-( This is presumably because 64-bit shifts/rotates * apparently are not atomic instructions, but implemented in microcode. */ /* * OpenSolaris OS modifications * * Sun elects to use this software under the BSD license. * * This source originates from OpenSSL file sha512-x86_64.pl at * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz * (presumably for future OpenSSL release 0.9.8h), with these changes: * * 1. Added perl "use strict" and declared variables. * * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. * * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) * assemblers). Replaced the .picmeup macro with assembler code. * * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", * at the beginning of SHA2_CTX (the next field is 8-byte aligned). */ /* * This file was generated by a perl script (sha512-x86_64.pl) that were * used to generate sha256 and sha512 variants from the same code base. * The comments from the original file have been pasted above. */ #if defined(lint) || defined(__lint) #include #include void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) { (void) ctx, (void) in, (void) num; } #else #define _ASM #include ENTRY_NP(SHA256TransformBlocks) .cfi_startproc ENDBR movq %rsp, %rax .cfi_def_cfa_register %rax push %rbx .cfi_offset %rbx,-16 push %rbp .cfi_offset %rbp,-24 push %r12 .cfi_offset %r12,-32 push %r13 .cfi_offset %r13,-40 push %r14 .cfi_offset %r14,-48 push %r15 .cfi_offset %r15,-56 mov %rsp,%rbp # copy %rsp shl $4,%rdx # num*16 sub $16*4+4*8,%rsp lea (%rsi,%rdx,4),%rdx # inp+num*16*4 and $-64,%rsp # align stack frame add $8,%rdi # Skip OpenSolaris field, "algotype" mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg mov %rbp,16*4+3*8(%rsp) # save copy of %rsp # echo ".cfi_cfa_expression %rsp+88,deref,+56" | # openssl/crypto/perlasm/x86_64-xlate.pl .cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x38 #.picmeup %rbp # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts # the address of the "next" instruction into the target register # (%rbp). 
This generates these 2 instructions: lea .Llea(%rip),%rbp #nop # .picmeup generates a nop for mod 8 alignment--not needed here .Llea: lea K256-.(%rbp),%rbp mov 4*0(%rdi),%eax mov 4*1(%rdi),%ebx mov 4*2(%rdi),%ecx mov 4*3(%rdi),%edx mov 4*4(%rdi),%r8d mov 4*5(%rdi),%r9d mov 4*6(%rdi),%r10d mov 4*7(%rdi),%r11d jmp .Lloop -.align 16 +.balign 16 .Lloop: xor %rdi,%rdi mov 4*0(%rsi),%r12d bswap %r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,0(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 4*1(%rsi),%r12d bswap %r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,4(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 4*2(%rsi),%r12d bswap %r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,8(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 4*3(%rsi),%r12d bswap %r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,12(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 4*4(%rsi),%r12d bswap %r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # 
(f^g)&e mov %r12d,16(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 4*5(%rsi),%r12d bswap %r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,20(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 4*6(%rsi),%r12d bswap %r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,24(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 4*7(%rsi),%r12d bswap %r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,28(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) mov 4*8(%rsi),%r12d bswap %r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,32(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # 
round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 4*9(%rsi),%r12d bswap %r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,36(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 4*10(%rsi),%r12d bswap %r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,40(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 4*11(%rsi),%r12d bswap %r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,44(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 4*12(%rsi),%r12d bswap %r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,48(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 4*13(%rsi),%r12d bswap %r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,52(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx 
ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 4*14(%rsi),%r12d bswap %r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,56(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 4*15(%rsi),%r12d bswap %r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,60(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) jmp .Lrounds_16_xx -.align 16 +.balign 16 .Lrounds_16_xx: mov 4(%rsp),%r13d mov 56(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 36(%rsp),%r12d add 0(%rsp),%r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,0(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 8(%rsp),%r13d mov 60(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 40(%rsp),%r12d add 4(%rsp),%r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,4(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d 
ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 12(%rsp),%r13d mov 0(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 44(%rsp),%r12d add 8(%rsp),%r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,8(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 16(%rsp),%r13d mov 4(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 48(%rsp),%r12d add 12(%rsp),%r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,12(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 20(%rsp),%r13d mov 8(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 52(%rsp),%r12d add 16(%rsp),%r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,16(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 24(%rsp),%r13d mov 12(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror 
$11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 56(%rsp),%r12d add 20(%rsp),%r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,20(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 28(%rsp),%r13d mov 16(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 60(%rsp),%r12d add 24(%rsp),%r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,24(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 32(%rsp),%r13d mov 20(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 0(%rsp),%r12d add 28(%rsp),%r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,28(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) mov 36(%rsp),%r13d mov 24(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 4(%rsp),%r12d add 32(%rsp),%r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,32(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # 
Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 40(%rsp),%r13d mov 28(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 8(%rsp),%r12d add 36(%rsp),%r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,36(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 44(%rsp),%r13d mov 32(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 12(%rsp),%r12d add 40(%rsp),%r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,40(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 48(%rsp),%r13d mov 36(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 16(%rsp),%r12d add 44(%rsp),%r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,44(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # 
Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 52(%rsp),%r13d mov 40(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 20(%rsp),%r12d add 48(%rsp),%r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,48(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 56(%rsp),%r13d mov 44(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 24(%rsp),%r12d add 52(%rsp),%r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,52(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 60(%rsp),%r13d mov 48(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 28(%rsp),%r12d add 56(%rsp),%r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,56(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 0(%rsp),%r13d mov 52(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 32(%rsp),%r12d add 60(%rsp),%r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d 
ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,60(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) cmp $64,%rdi jb .Lrounds_16_xx mov 16*4+0*8(%rsp),%rdi lea 16*4(%rsi),%rsi add 4*0(%rdi),%eax add 4*1(%rdi),%ebx add 4*2(%rdi),%ecx add 4*3(%rdi),%edx add 4*4(%rdi),%r8d add 4*5(%rdi),%r9d add 4*6(%rdi),%r10d add 4*7(%rdi),%r11d cmp 16*4+2*8(%rsp),%rsi mov %eax,4*0(%rdi) mov %ebx,4*1(%rdi) mov %ecx,4*2(%rdi) mov %edx,4*3(%rdi) mov %r8d,4*4(%rdi) mov %r9d,4*5(%rdi) mov %r10d,4*6(%rdi) mov %r11d,4*7(%rdi) jb .Lloop mov 16*4+3*8(%rsp),%rsp .cfi_def_cfa %rsp,56 pop %r15 .cfi_adjust_cfa_offset -8 .cfi_restore %r15 pop %r14 .cfi_adjust_cfa_offset -8 .cfi_restore %r14 pop %r13 .cfi_adjust_cfa_offset -8 .cfi_restore %r13 pop %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 pop %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp pop %rbx .cfi_adjust_cfa_offset -8 .cfi_restore %rbx RET .cfi_endproc SET_SIZE(SHA256TransformBlocks) .section .rodata -.align 64 +.balign 64 SET_OBJ(K256) K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 #endif /* !lint && !__lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S index 180f8e366060..520f5b6dab24 100644 --- a/module/icp/asm-x86_64/sha2/sha512_impl.S +++ b/module/icp/asm-x86_64/sha2/sha512_impl.S @@ -1,2116 +1,2116 @@ /* * ==================================================================== * Written by Andy Polyakov for the OpenSSL * project. Rights for redistribution and usage in source and binary * forms are granted according to the OpenSSL license. * ==================================================================== * * sha256/512_block procedure for x86_64. * * 40% improvement over compiler-generated code on Opteron. On EM64T * sha256 was observed to run >80% faster and sha512 - >40%. No magical * tricks, just straight implementation... I really wonder why gcc * [being armed with inline assembler] fails to generate as fast code. * The only thing which is cool about this module is that it's very * same instruction sequence used for both SHA-256 and SHA-512. 
In * former case the instructions operate on 32-bit operands, while in * latter - on 64-bit ones. All I had to do is to get one flavor right, * the other one passed the test right away:-) * * sha256_block runs in ~1005 cycles on Opteron, which gives you * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock * frequency in GHz. sha512_block runs in ~1275 cycles, which results * in 128*1000/1275=100MBps per GHz. Is there room for improvement? * Well, if you compare it to IA-64 implementation, which maintains * X[16] in register bank[!], tends to 4 instructions per CPU clock * cycle and runs in 1003 cycles, 1275 is very good result for 3-way * issue Opteron pipeline and X[16] maintained in memory. So that *if* * there is a way to improve it, *then* the only way would be to try to * offload X[16] updates to SSE unit, but that would require "deeper" * loop unroll, which in turn would naturally cause size blow-up, not * to mention increased complexity! And once again, only *if* it's * actually possible to noticeably improve overall ILP, instruction * level parallelism, on a given CPU implementation in this case. * * Special note on Intel EM64T. While Opteron CPU exhibits perfect * performance ratio of 1.5 between 64- and 32-bit flavors [see above], * [currently available] EM64T CPUs apparently are far from it. On the * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit * sha256_block:-( This is presumably because 64-bit shifts/rotates * apparently are not atomic instructions, but implemented in microcode. */ /* * OpenSolaris OS modifications * * Sun elects to use this software under the BSD license. * * This source originates from OpenSSL file sha512-x86_64.pl at * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz * (presumably for future OpenSSL release 0.9.8h), with these changes: * * 1. Added perl "use strict" and declared variables. * * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. * * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) * assemblers). Replaced the .picmeup macro with assembler code. * * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", * at the beginning of SHA2_CTX (the next field is 8-byte aligned). */ /* * This file was generated by a perl script (sha512-x86_64.pl) that were * used to generate sha256 and sha512 variants from the same code base. * The comments from the original file have been pasted above. 
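 *
 * (Illustrative aside, an inference spelled out for clarity rather than
 *  text from the original: since "algotype" is a 4-byte field and the
 *  following member is 8-byte aligned, the hash state begins at byte
 *  offset 8 of SHA2_CTX, which is why the code below executes
 *  "add $8,%rdi" before loading the eight state words from 8*0(%rdi)
 *  through 8*7(%rdi).)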
*/ #if defined(lint) || defined(__lint) #include #include void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) { (void) ctx, (void) in, (void) num; } #else #define _ASM #include ENTRY_NP(SHA512TransformBlocks) .cfi_startproc ENDBR movq %rsp, %rax .cfi_def_cfa_register %rax push %rbx .cfi_offset %rbx,-16 push %rbp .cfi_offset %rbp,-24 push %r12 .cfi_offset %r12,-32 push %r13 .cfi_offset %r13,-40 push %r14 .cfi_offset %r14,-48 push %r15 .cfi_offset %r15,-56 mov %rsp,%rbp # copy %rsp shl $4,%rdx # num*16 sub $16*8+4*8,%rsp lea (%rsi,%rdx,8),%rdx # inp+num*16*8 and $-64,%rsp # align stack frame add $8,%rdi # Skip OpenSolaris field, "algotype" mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg mov %rbp,16*8+3*8(%rsp) # save copy of %rsp # echo ".cfi_cfa_expression %rsp+152,deref,+56" | # openssl/crypto/perlasm/x86_64-xlate.pl .cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x38 #.picmeup %rbp # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts # the address of the "next" instruction into the target register # (%rbp). This generates these 2 instructions: lea .Llea(%rip),%rbp #nop # .picmeup generates a nop for mod 8 alignment--not needed here .Llea: lea K512-.(%rbp),%rbp mov 8*0(%rdi),%rax mov 8*1(%rdi),%rbx mov 8*2(%rdi),%rcx mov 8*3(%rdi),%rdx mov 8*4(%rdi),%r8 mov 8*5(%rdi),%r9 mov 8*6(%rdi),%r10 mov 8*7(%rdi),%r11 jmp .Lloop -.align 16 +.balign 16 .Lloop: xor %rdi,%rdi mov 8*0(%rsi),%r12 bswap %r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,0(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 8*1(%rsi),%r12 bswap %r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,8(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 8*2(%rsi),%r12 bswap %r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,16(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # 
Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 8*3(%rsi),%r12 bswap %r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,24(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 8*4(%rsi),%r12 bswap %r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,32(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 8*5(%rsi),%r12 bswap %r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,40(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 8*6(%rsi),%r12 bswap %r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,48(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 8*7(%rsi),%r12 bswap %r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,56(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # 
Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) mov 8*8(%rsi),%r12 bswap %r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,64(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 8*9(%rsi),%r12 bswap %r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,72(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 8*10(%rsi),%r12 bswap %r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,80(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 8*11(%rsi),%r12 bswap %r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,88(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 8*12(%rsi),%r12 bswap %r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,96(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # 
Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 8*13(%rsi),%r12 bswap %r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,104(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 8*14(%rsi),%r12 bswap %r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,112(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 8*15(%rsi),%r12 bswap %r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,120(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) jmp .Lrounds_16_xx -.align 16 +.balign 16 .Lrounds_16_xx: mov 8(%rsp),%r13 mov 112(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 72(%rsp),%r12 add 0(%rsp),%r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,0(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 16(%rsp),%r13 mov 120(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 80(%rsp),%r12 add 8(%rsp),%r12 mov %rdx,%r13 mov 
%rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,8(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 24(%rsp),%r13 mov 0(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 88(%rsp),%r12 add 16(%rsp),%r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,16(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 32(%rsp),%r13 mov 8(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 96(%rsp),%r12 add 24(%rsp),%r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,24(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 40(%rsp),%r13 mov 16(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 104(%rsp),%r12 add 32(%rsp),%r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,32(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # 
h+=Maj(a,b,c) mov 48(%rsp),%r13 mov 24(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 112(%rsp),%r12 add 40(%rsp),%r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,40(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 56(%rsp),%r13 mov 32(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 120(%rsp),%r12 add 48(%rsp),%r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,48(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 64(%rsp),%r13 mov 40(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 0(%rsp),%r12 add 56(%rsp),%r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,56(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) mov 72(%rsp),%r13 mov 48(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 8(%rsp),%r12 add 64(%rsp),%r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,64(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov 
%rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 80(%rsp),%r13 mov 56(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 16(%rsp),%r12 add 72(%rsp),%r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,72(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 88(%rsp),%r13 mov 64(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 24(%rsp),%r12 add 80(%rsp),%r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,80(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 96(%rsp),%r13 mov 72(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 32(%rsp),%r12 add 88(%rsp),%r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,88(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 104(%rsp),%r13 mov 80(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 40(%rsp),%r12 add 96(%rsp),%r12 mov %rax,%r13 
mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,96(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 112(%rsp),%r13 mov 88(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 48(%rsp),%r12 add 104(%rsp),%r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,104(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 120(%rsp),%r13 mov 96(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 56(%rsp),%r12 add 112(%rsp),%r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,112(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 0(%rsp),%r13 mov 104(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 64(%rsp),%r12 add 120(%rsp),%r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,120(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ 
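# Reference note (standard SHA-512 relations that the inline comments above abbreviate;
# this comment is explanatory only and is not part of the generated code):
#   T1 = W[round] + h + Sigma1(e) + Ch(e,f,g) + K[round];   d += T1;
#   h  = Sigma0(a) + T1 + Maj(a,b,c)
# with Sigma1(e) = ror(e,14)^ror(e,18)^ror(e,41) and Sigma0(a) = ror(a,28)^ror(a,34)^ror(a,39),
# which is why the code rotates by 14/18 then 23 more, and by 28/34 then 5 more.
# In .Lrounds_16_xx the message schedule kept at 0(%rsp)..120(%rsp) is extended in place as
#   W[i&15] += sigma0(W[(i+1)&15]) + sigma1(W[(i+14)&15]) + W[(i+9)&15]
# where sigma0(x) = ror(x,1)^ror(x,8)^shr(x,7) and sigma1(x) = ror(x,19)^ror(x,61)^shr(x,6).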
add %r14,%rax # h+=Maj(a,b,c) cmp $80,%rdi jb .Lrounds_16_xx mov 16*8+0*8(%rsp),%rdi lea 16*8(%rsi),%rsi add 8*0(%rdi),%rax add 8*1(%rdi),%rbx add 8*2(%rdi),%rcx add 8*3(%rdi),%rdx add 8*4(%rdi),%r8 add 8*5(%rdi),%r9 add 8*6(%rdi),%r10 add 8*7(%rdi),%r11 cmp 16*8+2*8(%rsp),%rsi mov %rax,8*0(%rdi) mov %rbx,8*1(%rdi) mov %rcx,8*2(%rdi) mov %rdx,8*3(%rdi) mov %r8,8*4(%rdi) mov %r9,8*5(%rdi) mov %r10,8*6(%rdi) mov %r11,8*7(%rdi) jb .Lloop mov 16*8+3*8(%rsp),%rsp .cfi_def_cfa %rsp,56 pop %r15 .cfi_adjust_cfa_offset -8 .cfi_restore %r15 pop %r14 .cfi_adjust_cfa_offset -8 .cfi_restore %r14 pop %r13 .cfi_adjust_cfa_offset -8 .cfi_restore %r13 pop %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 pop %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp pop %rbx .cfi_adjust_cfa_offset -8 .cfi_restore %rbx RET .cfi_endproc SET_SIZE(SHA512TransformBlocks) .section .rodata -.align 64 +.balign 64 SET_OBJ(K512) K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 #endif /* !lint && !__lint */ #if defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/module/lua/setjmp/setjmp_aarch64.S b/module/lua/setjmp/setjmp_aarch64.S index a5a9a85fd57e..040ef1821ab0 100644 --- a/module/lua/setjmp/setjmp_aarch64.S +++ b/module/lua/setjmp/setjmp_aarch64.S @@ -1,86 +1,86 @@ /*- * Copyright (c) 2014 Andrew Turner * Copyright (c) 2014-2015 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Andrew Turner * under sponsorship from the FreeBSD Foundation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #ifdef __aarch64__ #define ENTRY(sym) \ .text; \ .globl sym; \ - .align 2; \ + .balign 2; \ .type sym,#function; \ sym: #define END(sym) \ .size sym, . - sym ENTRY(setjmp) /* Store the stack pointer */ mov x8, sp str x8, [x0], #8 /* Store the general purpose registers and lr */ stp x19, x20, [x0], #16 stp x21, x22, [x0], #16 stp x23, x24, [x0], #16 stp x25, x26, [x0], #16 stp x27, x28, [x0], #16 stp x29, x30, [x0], #16 /* Return value */ mov x0, #0 ret END(setjmp) ENTRY(longjmp) /* Restore the stack pointer */ ldr x8, [x0], #8 mov sp, x8 /* Restore the general purpose registers and lr */ ldp x19, x20, [x0], #16 ldp x21, x22, [x0], #16 ldp x23, x24, [x0], #16 ldp x25, x26, [x0], #16 ldp x27, x28, [x0], #16 ldp x29, x30, [x0], #16 /* Load the return value */ mov x0, x1 ret END(longjmp) #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif #endif /* __aarch64__ */ diff --git a/module/lua/setjmp/setjmp_arm.S b/module/lua/setjmp/setjmp_arm.S index 78bc3e0b347d..0b18a96282cf 100644 --- a/module/lua/setjmp/setjmp_arm.S +++ b/module/lua/setjmp/setjmp_arm.S @@ -1,84 +1,84 @@ /*- * Copyright 2004-2014 Olivier Houchard * Copyright 2012-2014 Ian Lepore * Copyright 2013-2014 Andrew Turner * Copyright 2014 Svatopluk Kraus * Copyright 2014 Michal Meloun * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #if defined(__arm__) && !defined(__aarch64__) #if defined(__thumb2__) #define _FUNC_MODE .code 16; .thumb_func #else #define _FUNC_MODE .code 32 #endif #define ENTRY(x) \ .text; \ .syntax unified; \ - .align 2; \ + .balign 2; \ .global x; \ .type x,#function; \ _FUNC_MODE; \ x: #define END(x) \ .size x, . - x; #define RET bx lr /* * setjump + longjmp */ ENTRY(setjmp) #if defined(__thumb2__) mov ip, sp stmia r0, {r4-r12,r14} #else stmia r0, {r4-r14} #endif mov r0, #0x00000000 RET END(setjmp) ENTRY(longjmp) #if defined(__thumb2__) ldmia r0, {r4-r12,r14} mov sp, ip #else ldmia r0, {r4-r14} #endif mov r0, #0x00000001 RET END(longjmp) #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif #endif diff --git a/module/lua/setjmp/setjmp_i386.S b/module/lua/setjmp/setjmp_i386.S index 0d0adfc351ca..87f9cb08c292 100644 --- a/module/lua/setjmp/setjmp_i386.S +++ b/module/lua/setjmp/setjmp_i386.S @@ -1,69 +1,69 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ #define ENTRY(x) \ .text; \ - .align 8; \ + .balign 8; \ .globl x; \ .type x, @function; \ x: #define SET_SIZE(x) \ .size x, [.-x] /* * Setjmp and longjmp implement non-local gotos using state vectors * type label_t. 
*/ #ifdef __i386__ ENTRY(setjmp) /* save area is passed in eax */ movl %ebp, 0(%eax) /* save ebp */ movl %ebx, 4(%eax) /* save ebx */ movl %esi, 8(%eax) /* save esi */ movl %edi, 12(%eax) /* save edi */ movl %esp, 16(%eax) /* save esp */ movl (%esp), %ecx /* %eip (return address) */ movl %ecx, 20(%eax) /* save eip */ subl %eax, %eax /* return 0 */ ret SET_SIZE(setjmp) ENTRY(longjmp) /* save area is passed in eax */ movl 0(%eax), %ebp /* restore ebp */ movl 4(%eax), %ebx /* restore ebx */ movl 8(%eax), %esi /* restore esi */ movl 12(%eax), %edi /* restore edi */ movl 16(%eax), %esp /* restore esp */ movl 20(%eax), %ecx /* %eip (return address) */ addl $4, %esp /* pop ret adr */ jmp *%ecx /* indirect jump */ SET_SIZE(longjmp) #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif #endif /* __i386__ */ diff --git a/module/lua/setjmp/setjmp_ppc.S b/module/lua/setjmp/setjmp_ppc.S index 72aa5d5ab5b0..a035cd11b33b 100644 --- a/module/lua/setjmp/setjmp_ppc.S +++ b/module/lua/setjmp/setjmp_ppc.S @@ -1,165 +1,165 @@ /* $FreeBSD$ */ /* from: NetBSD: setjmp.S,v 1.1 1998/01/27 15:13:12 sakamoto Exp $ */ /* from: OpenBSD: setjmp.S,v 1.2 1996/12/28 06:22:18 rahnds Exp */ /* kernel version of this file, does not have signal goop */ /* int setjmp(jmp_buf env) */ #define _ASM #include #ifdef __powerpc64__ #if !defined(PPC64_ELF_ABI_v2) && !defined(PPC64_ELF_ABI_v1) #if defined(_CALL_ELF) && _CALL_ELF == 2 #define PPC64_ELF_ABI_v2 #endif /* _CALL_ELF */ #endif /* PPC64_ELF_ABI_ */ #endif /* __powerpc64__ */ #ifdef __powerpc64__ #define LD_REG ld #define ST_REG std #define REGWIDTH 8 #else #define LD_REG lwz #define ST_REG stw #define REGWIDTH 4 #endif /* __powerpc64__ */ #define JMP_r1 1*REGWIDTH #define JMP_r2 2*REGWIDTH #define JMP_r14 3*REGWIDTH #define JMP_r15 4*REGWIDTH #define JMP_r16 5*REGWIDTH #define JMP_r17 6*REGWIDTH #define JMP_r18 7*REGWIDTH #define JMP_r19 8*REGWIDTH #define JMP_r20 9*REGWIDTH #define JMP_r21 10*REGWIDTH #define JMP_r22 11*REGWIDTH #define JMP_r23 12*REGWIDTH #define JMP_r24 13*REGWIDTH #define JMP_r25 14*REGWIDTH #define JMP_r26 15*REGWIDTH #define JMP_r27 16*REGWIDTH #define JMP_r28 17*REGWIDTH #define JMP_r29 18*REGWIDTH #define JMP_r30 19*REGWIDTH #define JMP_r31 20*REGWIDTH #define JMP_lr 21*REGWIDTH #define JMP_cr 22*REGWIDTH #define JMP_ctr 23*REGWIDTH #define JMP_xer 24*REGWIDTH #ifdef __powerpc64__ #ifdef PPC64_ELF_ABI_v2 #define ENTRY(name) \ - .align 2 ; \ + .balign 2 ; \ .type name,@function; \ .weak name; \ name: #else /* PPC64_ELF_ABI_v1 */ #define XGLUE(a,b) a##b #define GLUE(a,b) XGLUE(a,b) #define ENTRY(name) \ - .align 2 ; \ + .balign 2 ; \ .weak name; \ .weak GLUE(.,name); \ .pushsection ".opd","aw"; \ name: \ .quad GLUE(.,name); \ .quad .TOC.@tocbase; \ .quad 0; \ .popsection; \ .type GLUE(.,name),@function; \ GLUE(.,name): #endif /* PPC64_ELF_ABI_v2 */ #else /* 32-bit */ #define ENTRY(name) \ .text; \ .p2align 4; \ .weak name; \ .type name,@function; \ name: #endif /* __powerpc64__ */ ENTRY(setjmp) ST_REG 31, JMP_r31(3) /* r1, r2, r14-r30 */ ST_REG 1, JMP_r1 (3) ST_REG 2, JMP_r2 (3) ST_REG 14, JMP_r14(3) ST_REG 15, JMP_r15(3) ST_REG 16, JMP_r16(3) ST_REG 17, JMP_r17(3) ST_REG 18, JMP_r18(3) ST_REG 19, JMP_r19(3) ST_REG 20, JMP_r20(3) ST_REG 21, JMP_r21(3) ST_REG 22, JMP_r22(3) ST_REG 23, JMP_r23(3) ST_REG 24, JMP_r24(3) ST_REG 25, JMP_r25(3) ST_REG 26, JMP_r26(3) ST_REG 27, JMP_r27(3) ST_REG 28, JMP_r28(3) ST_REG 29, JMP_r29(3) ST_REG 30, JMP_r30(3) /* cr, lr, ctr, xer */ mfcr 0 ST_REG 0, JMP_cr(3) mflr 0 ST_REG 0, JMP_lr(3) mfctr 0 ST_REG 0, 
JMP_ctr(3) mfxer 0 ST_REG 0, JMP_xer(3) /* f14-f31, fpscr */ li 3, 0 blr ENTRY(longjmp) LD_REG 31, JMP_r31(3) /* r1, r2, r14-r30 */ LD_REG 1, JMP_r1 (3) LD_REG 2, JMP_r2 (3) LD_REG 14, JMP_r14(3) LD_REG 15, JMP_r15(3) LD_REG 16, JMP_r16(3) LD_REG 17, JMP_r17(3) LD_REG 18, JMP_r18(3) LD_REG 19, JMP_r19(3) LD_REG 20, JMP_r20(3) LD_REG 21, JMP_r21(3) LD_REG 22, JMP_r22(3) LD_REG 23, JMP_r23(3) LD_REG 24, JMP_r24(3) LD_REG 25, JMP_r25(3) LD_REG 26, JMP_r26(3) LD_REG 27, JMP_r27(3) LD_REG 28, JMP_r28(3) LD_REG 29, JMP_r29(3) LD_REG 30, JMP_r30(3) /* cr, lr, ctr, xer */ LD_REG 0, JMP_cr(3) mtcr 0 LD_REG 0, JMP_lr(3) mtlr 0 LD_REG 0, JMP_ctr(3) mtctr 0 LD_REG 0, JMP_xer(3) mtxer 0 /* f14-f31, fpscr */ mr 3, 4 blr #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/lua/setjmp/setjmp_sparc64.S b/module/lua/setjmp/setjmp_sparc64.S index a37a71cbce33..e1099643de92 100644 --- a/module/lua/setjmp/setjmp_sparc64.S +++ b/module/lua/setjmp/setjmp_sparc64.S @@ -1,105 +1,105 @@ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Header: _setjmp.s,v 1.1 91/07/06 16:45:53 torek Exp */ #if defined(LIBC_SCCS) && !defined(lint) #if 0 .asciz "@(#)_setjmp.s 8.1 (Berkeley) 6/4/93" #else RCSID("$NetBSD: _setjmp.S,v 1.4 1998/10/08 02:27:59 eeh Exp $") #endif #endif /* LIBC_SCCS and not lint */ #define _JB_FP 0x0 #define _JB_PC 0x8 #define _JB_SP 0x10 .register %g2,#ignore .register %g3,#ignore #define ENTRY(x) \ .text ; \ - .align 32 ; \ + .balign 32 ; \ .globl x ; \ .type x,@function ; \ x: #define END(x) \ .size x, . - x /* * C library -- setjmp, longjmp * * longjmp(a,v) * will generate a "return(v?v:1)" from * the last call to * setjmp(a) * by restoring the previous context. 
*/ ENTRY(setjmp) stx %sp, [%o0 + _JB_SP] stx %o7, [%o0 + _JB_PC] stx %fp, [%o0 + _JB_FP] retl clr %o0 END(setjmp) ENTRY(longjmp) mov 1, %g1 movrnz %o1, %o1, %g1 mov %o0, %g2 ldx [%g2 + _JB_FP], %g3 1: cmp %fp, %g3 bl,a 1b restore be,a 2f ldx [%g2 + _JB_SP], %o0 .Lbotch: illtrap 2: cmp %o0, %sp bge,a 3f mov %o0, %sp b,a .Lbotch nop 3: ldx [%g2 + _JB_PC], %o7 retl mov %g1, %o0 END(longjmp) #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif
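The directive change running through all of these hunks swaps ".align" for ".balign". GNU as gives ".align N" a target-dependent meaning: on x86 ELF it requests an N-byte boundary, while on arm, aarch64 and several other back ends it requests a 2^N-byte boundary. ".balign N" always means an N-byte boundary and ".p2align N" always means 2^N bytes, so the replacement directive reads the same way across the i386, x86-64, arm, aarch64, powerpc and sparc64 files touched here. A minimal stand-alone sketch, not taken from the patch (the label name is illustrative only):

	.text
	.balign	16		/* always a 16-byte boundary, on every target */
	.p2align 4		/* equivalent request, expressed as 2^4 bytes */
	/* ".align 16" would be 16 bytes on x86 ELF, but 2^16 where .align is a power of two */
example_entry:			/* hypothetical label, for illustration only */
	ret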