diff --git a/module/icp/asm-x86_64/aes/aes_aesni.S b/module/icp/asm-x86_64/aes/aes_aesni.S index 4a80c62097ae..b0d9f03af2c8 100644 --- a/module/icp/asm-x86_64/aes/aes_aesni.S +++ b/module/icp/asm-x86_64/aes/aes_aesni.S @@ -1,748 +1,748 @@ /* * ==================================================================== * Written by Intel Corporation for the OpenSSL project to add support * for Intel AES-NI instructions. Rights for redistribution and usage * in source and binary forms are granted according to the OpenSSL * license. * * Author: Huang Ying * Vinodh Gopal * Kahraman Akdemir * * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD) * instructions that are going to be introduced in the next generation * of Intel processor, as of 2009. These instructions enable fast and * secure data encryption and decryption, using the Advanced Encryption * Standard (AES), defined by FIPS Publication number 197. The * architecture introduces six instructions that offer full hardware * support for AES. Four of them support high performance data * encryption and decryption, and the other two instructions support * the AES key expansion procedure. * ==================================================================== */ /* * ==================================================================== * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. All advertising materials mentioning features or use of this * software must display the following acknowledgment: * "This product includes software developed by the OpenSSL Project * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" * * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to * endorse or promote products derived from this software without * prior written permission. For written permission, please contact * openssl-core@openssl.org. * * 5. Products derived from this software may not be called "OpenSSL" * nor may "OpenSSL" appear in their names without prior written * permission of the OpenSSL Project. * * 6. Redistributions of any form whatsoever must retain the following * acknowledgment: * "This product includes software developed by the OpenSSL Project * for use in the OpenSSL Toolkit (http://www.openssl.org/)" * * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED * OF THE POSSIBILITY OF SUCH DAMAGE. * ==================================================================== */ /* * ==================================================================== * OpenSolaris OS modifications * * This source originates as files aes-intel.S and eng_aesni_asm.pl, in * patches sent sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by * Huang Ying of Intel to the openssl-dev mailing list under the subject * of "Add support to Intel AES-NI instruction set for x86_64 platform". * * This OpenSolaris version has these major changes from the original source: * * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function * definitions for lint. * * 2. Formatted code, added comments, and added #includes and #defines. * * 3. If bit CR0.TS is set, clear and set the TS bit, after and before * calling kpreempt_disable() and kpreempt_enable(). * If the TS bit is not set, Save and restore %xmm registers at the beginning * and end of function calls (%xmm* registers are not saved and restored by * during kernel thread preemption). * * 4. Renamed functions, reordered parameters, and changed return value * to match OpenSolaris: * * OpenSSL interface: * int intel_AES_set_encrypt_key(const unsigned char *userKey, * const int bits, AES_KEY *key); * int intel_AES_set_decrypt_key(const unsigned char *userKey, * const int bits, AES_KEY *key); * Return values for above are non-zero on error, 0 on success. * * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, * const AES_KEY *key); * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, * const AES_KEY *key); * typedef struct aes_key_st { * unsigned int rd_key[4 *(AES_MAXNR + 1)]; * int rounds; * unsigned int pad[3]; * } AES_KEY; * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules * (ks32) instead of 64-bit (ks64). * Number of rounds (aka round count) is at offset 240 of AES_KEY. * * OpenSolaris OS interface (#ifdefs removed for readability): * int rijndael_key_setup_dec_intel(uint32_t rk[], * const uint32_t cipherKey[], uint64_t keyBits); * int rijndael_key_setup_enc_intel(uint32_t rk[], * const uint32_t cipherKey[], uint64_t keyBits); * Return values for above are 0 on error, number of rounds on success. * * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4]); * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4]); * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]; * uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t; * * typedef union { * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)]; * } aes_ks_t; * typedef struct aes_key { * aes_ks_t encr_ks, decr_ks; * long double align128; * int flags, nr, type; * } aes_key_t; * * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, * ct is crypto text, and MAX_AES_NR is 14. * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. * * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary. * * ==================================================================== */ #if defined(lint) || defined(__lint) #include /* ARGSUSED */ void aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]) { } /* ARGSUSED */ void aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]) { } /* ARGSUSED */ int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], uint64_t keyBits) { return (0); } /* ARGSUSED */ int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], uint64_t keyBits) { return (0); } #elif defined(HAVE_AES) /* guard by instruction set */ #define _ASM #include /* * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(), * _key_expansion_256a(), _key_expansion_256b() * * Helper functions called by rijndael_key_setup_inc_intel(). * Also used indirectly by rijndael_key_setup_dec_intel(). * * Input: * %xmm0 User-provided cipher key * %xmm1 Round constant * Output: * (%rcx) AES key */ ENTRY_NP2(_key_expansion_128, _key_expansion_256a) _key_expansion_128_local: _key_expansion_256a_local: pshufd $0b11111111, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 movups %xmm0, (%rcx) add $0x10, %rcx - ret + RET nop SET_SIZE(_key_expansion_128) SET_SIZE(_key_expansion_256a) ENTRY_NP(_key_expansion_192a) _key_expansion_192a_local: pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 movups %xmm2, %xmm5 movups %xmm2, %xmm6 pslldq $4, %xmm5 pshufd $0b11111111, %xmm0, %xmm3 pxor %xmm3, %xmm2 pxor %xmm5, %xmm2 movups %xmm0, %xmm1 shufps $0b01000100, %xmm0, %xmm6 movups %xmm6, (%rcx) shufps $0b01001110, %xmm2, %xmm1 movups %xmm1, 0x10(%rcx) add $0x20, %rcx - ret + RET SET_SIZE(_key_expansion_192a) ENTRY_NP(_key_expansion_192b) _key_expansion_192b_local: pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 pxor %xmm4, %xmm0 shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 movups %xmm2, %xmm5 pslldq $4, %xmm5 pshufd $0b11111111, %xmm0, %xmm3 pxor %xmm3, %xmm2 pxor %xmm5, %xmm2 movups %xmm0, (%rcx) add $0x10, %rcx - ret + RET SET_SIZE(_key_expansion_192b) ENTRY_NP(_key_expansion_256b) _key_expansion_256b_local: pshufd $0b10101010, %xmm1, %xmm1 shufps $0b00010000, %xmm2, %xmm4 pxor %xmm4, %xmm2 shufps $0b10001100, %xmm2, %xmm4 pxor %xmm4, %xmm2 pxor %xmm1, %xmm2 movups %xmm2, (%rcx) add $0x10, %rcx - ret + RET SET_SIZE(_key_expansion_256b) /* * rijndael_key_setup_enc_intel() * Expand the cipher key into the encryption key schedule. * * For kernel code, caller is responsible for ensuring kpreempt_disable() * has been called. This is because %xmm registers are not saved/restored. * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set * on entry. Otherwise, if TS is not set, save and restore %xmm registers * on the stack. * * OpenSolaris interface: * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], * uint64_t keyBits); * Return value is 0 on error, number of rounds on success. * * Original Intel OpenSSL interface: * int intel_AES_set_encrypt_key(const unsigned char *userKey, * const int bits, AES_KEY *key); * Return value is non-zero on error, 0 on success. */ #ifdef OPENSSL_INTERFACE #define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key #define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key #define USERCIPHERKEY rdi /* P1, 64 bits */ #define KEYSIZE32 esi /* P2, 32 bits */ #define KEYSIZE64 rsi /* P2, 64 bits */ #define AESKEY rdx /* P3, 64 bits */ #else /* OpenSolaris Interface */ #define AESKEY rdi /* P1, 64 bits */ #define USERCIPHERKEY rsi /* P2, 64 bits */ #define KEYSIZE32 edx /* P3, 32 bits */ #define KEYSIZE64 rdx /* P3, 64 bits */ #endif /* OPENSSL_INTERFACE */ #define ROUNDS32 KEYSIZE32 /* temp */ #define ROUNDS64 KEYSIZE64 /* temp */ #define ENDAESKEY USERCIPHERKEY /* temp */ ENTRY_NP(rijndael_key_setup_enc_intel) rijndael_key_setup_enc_intel_local: FRAME_BEGIN // NULL pointer sanity check test %USERCIPHERKEY, %USERCIPHERKEY jz .Lenc_key_invalid_param test %AESKEY, %AESKEY jz .Lenc_key_invalid_param movups (%USERCIPHERKEY), %xmm0 // user key (first 16 bytes) movups %xmm0, (%AESKEY) lea 0x10(%AESKEY), %rcx // key addr pxor %xmm4, %xmm4 // xmm4 is assumed 0 in _key_expansion_x cmp $256, %KEYSIZE32 jnz .Lenc_key192 // AES 256: 14 rounds in encryption key schedule #ifdef OPENSSL_INTERFACE mov $14, %ROUNDS32 movl %ROUNDS32, 240(%AESKEY) // key.rounds = 14 #endif /* OPENSSL_INTERFACE */ movups 0x10(%USERCIPHERKEY), %xmm2 // other user key (2nd 16 bytes) movups %xmm2, (%rcx) add $0x10, %rcx aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x1, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x2, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x4, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x8, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x10, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local aeskeygenassist $0x20, %xmm0, %xmm1 call _key_expansion_256b_local aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key call _key_expansion_256a_local #ifdef OPENSSL_INTERFACE xor %rax, %rax // return 0 (OK) #else /* Open Solaris Interface */ mov $14, %rax // return # rounds = 14 #endif FRAME_END - ret + RET .align 4 .Lenc_key192: cmp $192, %KEYSIZE32 jnz .Lenc_key128 // AES 192: 12 rounds in encryption key schedule #ifdef OPENSSL_INTERFACE mov $12, %ROUNDS32 movl %ROUNDS32, 240(%AESKEY) // key.rounds = 12 #endif /* OPENSSL_INTERFACE */ movq 0x10(%USERCIPHERKEY), %xmm2 // other user key aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key call _key_expansion_192a_local aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key call _key_expansion_192b_local aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key call _key_expansion_192a_local aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key call _key_expansion_192b_local aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key call _key_expansion_192a_local aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key call _key_expansion_192b_local aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key call _key_expansion_192a_local aeskeygenassist $0x80, %xmm2, %xmm1 // expand the key call _key_expansion_192b_local #ifdef OPENSSL_INTERFACE xor %rax, %rax // return 0 (OK) #else /* OpenSolaris Interface */ mov $12, %rax // return # rounds = 12 #endif FRAME_END - ret + RET .align 4 .Lenc_key128: cmp $128, %KEYSIZE32 jnz .Lenc_key_invalid_key_bits // AES 128: 10 rounds in encryption key schedule #ifdef OPENSSL_INTERFACE mov $10, %ROUNDS32 movl %ROUNDS32, 240(%AESKEY) // key.rounds = 10 #endif /* OPENSSL_INTERFACE */ aeskeygenassist $0x1, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x2, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x4, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x8, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x10, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x20, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x40, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x80, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x1b, %xmm0, %xmm1 // expand the key call _key_expansion_128_local aeskeygenassist $0x36, %xmm0, %xmm1 // expand the key call _key_expansion_128_local #ifdef OPENSSL_INTERFACE xor %rax, %rax // return 0 (OK) #else /* OpenSolaris Interface */ mov $10, %rax // return # rounds = 10 #endif FRAME_END - ret + RET .Lenc_key_invalid_param: #ifdef OPENSSL_INTERFACE mov $-1, %rax // user key or AES key pointer is NULL FRAME_END - ret + RET #else /* FALLTHROUGH */ #endif /* OPENSSL_INTERFACE */ .Lenc_key_invalid_key_bits: #ifdef OPENSSL_INTERFACE mov $-2, %rax // keysize is invalid #else /* Open Solaris Interface */ xor %rax, %rax // a key pointer is NULL or invalid keysize #endif /* OPENSSL_INTERFACE */ FRAME_END - ret + RET SET_SIZE(rijndael_key_setup_enc_intel) /* * rijndael_key_setup_dec_intel() * Expand the cipher key into the decryption key schedule. * * For kernel code, caller is responsible for ensuring kpreempt_disable() * has been called. This is because %xmm registers are not saved/restored. * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set * on entry. Otherwise, if TS is not set, save and restore %xmm registers * on the stack. * * OpenSolaris interface: * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], * uint64_t keyBits); * Return value is 0 on error, number of rounds on success. * P1->P2, P2->P3, P3->P1 * * Original Intel OpenSSL interface: * int intel_AES_set_decrypt_key(const unsigned char *userKey, * const int bits, AES_KEY *key); * Return value is non-zero on error, 0 on success. */ ENTRY_NP(rijndael_key_setup_dec_intel) FRAME_BEGIN // Generate round keys used for encryption call rijndael_key_setup_enc_intel_local test %rax, %rax #ifdef OPENSSL_INTERFACE jnz .Ldec_key_exit // Failed if returned non-0 #else /* OpenSolaris Interface */ jz .Ldec_key_exit // Failed if returned 0 #endif /* OPENSSL_INTERFACE */ /* * Convert round keys used for encryption * to a form usable for decryption */ #ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */ mov %rax, %ROUNDS64 // set # rounds (10, 12, or 14) // (already set for OpenSSL) #endif lea 0x10(%AESKEY), %rcx // key addr shl $4, %ROUNDS32 add %AESKEY, %ROUNDS64 mov %ROUNDS64, %ENDAESKEY .align 4 .Ldec_key_reorder_loop: movups (%AESKEY), %xmm0 movups (%ROUNDS64), %xmm1 movups %xmm0, (%ROUNDS64) movups %xmm1, (%AESKEY) lea 0x10(%AESKEY), %AESKEY lea -0x10(%ROUNDS64), %ROUNDS64 cmp %AESKEY, %ROUNDS64 ja .Ldec_key_reorder_loop .align 4 .Ldec_key_inv_loop: movups (%rcx), %xmm0 // Convert an encryption round key to a form usable for decryption // with the "AES Inverse Mix Columns" instruction aesimc %xmm0, %xmm1 movups %xmm1, (%rcx) lea 0x10(%rcx), %rcx cmp %ENDAESKEY, %rcx jnz .Ldec_key_inv_loop .Ldec_key_exit: // OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error // OpenSSL: rax = 0 for OK, or non-zero for error FRAME_END - ret + RET SET_SIZE(rijndael_key_setup_dec_intel) /* * aes_encrypt_intel() * Encrypt a single block (in and out can overlap). * * For kernel code, caller is responsible for ensuring kpreempt_disable() * has been called. This is because %xmm registers are not saved/restored. * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set * on entry. Otherwise, if TS is not set, save and restore %xmm registers * on the stack. * * Temporary register usage: * %xmm0 State * %xmm1 Key * * Original OpenSolaris Interface: * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4]) * * Original Intel OpenSSL Interface: * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, * const AES_KEY *key) */ #ifdef OPENSSL_INTERFACE #define aes_encrypt_intel intel_AES_encrypt #define aes_decrypt_intel intel_AES_decrypt #define INP rdi /* P1, 64 bits */ #define OUTP rsi /* P2, 64 bits */ #define KEYP rdx /* P3, 64 bits */ /* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */ #define NROUNDS32 ecx /* temporary, 32 bits */ #define NROUNDS cl /* temporary, 8 bits */ #else /* OpenSolaris Interface */ #define KEYP rdi /* P1, 64 bits */ #define NROUNDS esi /* P2, 32 bits */ #define INP rdx /* P3, 64 bits */ #define OUTP rcx /* P4, 64 bits */ #endif /* OPENSSL_INTERFACE */ #define STATE xmm0 /* temporary, 128 bits */ #define KEY xmm1 /* temporary, 128 bits */ ENTRY_NP(aes_encrypt_intel) movups (%INP), %STATE // input movups (%KEYP), %KEY // key #ifdef OPENSSL_INTERFACE mov 240(%KEYP), %NROUNDS32 // round count #else /* OpenSolaris Interface */ /* Round count is already present as P2 in %rsi/%esi */ #endif /* OPENSSL_INTERFACE */ pxor %KEY, %STATE // round 0 lea 0x30(%KEYP), %KEYP cmp $12, %NROUNDS jb .Lenc128 lea 0x20(%KEYP), %KEYP je .Lenc192 // AES 256 lea 0x20(%KEYP), %KEYP movups -0x60(%KEYP), %KEY aesenc %KEY, %STATE movups -0x50(%KEYP), %KEY aesenc %KEY, %STATE .align 4 .Lenc192: // AES 192 and 256 movups -0x40(%KEYP), %KEY aesenc %KEY, %STATE movups -0x30(%KEYP), %KEY aesenc %KEY, %STATE .align 4 .Lenc128: // AES 128, 192, and 256 movups -0x20(%KEYP), %KEY aesenc %KEY, %STATE movups -0x10(%KEYP), %KEY aesenc %KEY, %STATE movups (%KEYP), %KEY aesenc %KEY, %STATE movups 0x10(%KEYP), %KEY aesenc %KEY, %STATE movups 0x20(%KEYP), %KEY aesenc %KEY, %STATE movups 0x30(%KEYP), %KEY aesenc %KEY, %STATE movups 0x40(%KEYP), %KEY aesenc %KEY, %STATE movups 0x50(%KEYP), %KEY aesenc %KEY, %STATE movups 0x60(%KEYP), %KEY aesenc %KEY, %STATE movups 0x70(%KEYP), %KEY aesenclast %KEY, %STATE // last round movups %STATE, (%OUTP) // output - ret + RET SET_SIZE(aes_encrypt_intel) /* * aes_decrypt_intel() * Decrypt a single block (in and out can overlap). * * For kernel code, caller is responsible for ensuring kpreempt_disable() * has been called. This is because %xmm registers are not saved/restored. * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set * on entry. Otherwise, if TS is not set, save and restore %xmm registers * on the stack. * * Temporary register usage: * %xmm0 State * %xmm1 Key * * Original OpenSolaris Interface: * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * * Original Intel OpenSSL Interface: * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, * const AES_KEY *key); */ ENTRY_NP(aes_decrypt_intel) movups (%INP), %STATE // input movups (%KEYP), %KEY // key #ifdef OPENSSL_INTERFACE mov 240(%KEYP), %NROUNDS32 // round count #else /* OpenSolaris Interface */ /* Round count is already present as P2 in %rsi/%esi */ #endif /* OPENSSL_INTERFACE */ pxor %KEY, %STATE // round 0 lea 0x30(%KEYP), %KEYP cmp $12, %NROUNDS jb .Ldec128 lea 0x20(%KEYP), %KEYP je .Ldec192 // AES 256 lea 0x20(%KEYP), %KEYP movups -0x60(%KEYP), %KEY aesdec %KEY, %STATE movups -0x50(%KEYP), %KEY aesdec %KEY, %STATE .align 4 .Ldec192: // AES 192 and 256 movups -0x40(%KEYP), %KEY aesdec %KEY, %STATE movups -0x30(%KEYP), %KEY aesdec %KEY, %STATE .align 4 .Ldec128: // AES 128, 192, and 256 movups -0x20(%KEYP), %KEY aesdec %KEY, %STATE movups -0x10(%KEYP), %KEY aesdec %KEY, %STATE movups (%KEYP), %KEY aesdec %KEY, %STATE movups 0x10(%KEYP), %KEY aesdec %KEY, %STATE movups 0x20(%KEYP), %KEY aesdec %KEY, %STATE movups 0x30(%KEYP), %KEY aesdec %KEY, %STATE movups 0x40(%KEYP), %KEY aesdec %KEY, %STATE movups 0x50(%KEYP), %KEY aesdec %KEY, %STATE movups 0x60(%KEYP), %KEY aesdec %KEY, %STATE movups 0x70(%KEYP), %KEY aesdeclast %KEY, %STATE // last round movups %STATE, (%OUTP) // output - ret + RET SET_SIZE(aes_decrypt_intel) #endif /* lint || __lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/aes/aes_amd64.S b/module/icp/asm-x86_64/aes/aes_amd64.S index 9db3a3179230..931d2480609c 100644 --- a/module/icp/asm-x86_64/aes/aes_amd64.S +++ b/module/icp/asm-x86_64/aes/aes_amd64.S @@ -1,906 +1,906 @@ /* * --------------------------------------------------------------------------- * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. * * LICENSE TERMS * * The free distribution and use of this software is allowed (with or without * changes) provided that: * * 1. source code distributions include the above copyright notice, this * list of conditions and the following disclaimer; * * 2. binary distributions include the above copyright notice, this list * of conditions and the following disclaimer in their documentation; * * 3. the name of the copyright holder is not used to endorse products * built using this software without specific written permission. * * DISCLAIMER * * This software is provided 'as is' with no explicit or implied warranties * in respect of its properties, including, but not limited to, correctness * and/or fitness for purpose. * --------------------------------------------------------------------------- * Issue 20/12/2007 * * I am grateful to Dag Arne Osvik for many discussions of the techniques that * can be used to optimise AES assembler code on AMD64/EM64T architectures. * Some of the techniques used in this implementation are the result of * suggestions made by him for which I am most grateful. * * An AES implementation for AMD64 processors using the YASM assembler. This * implementation provides only encryption, decryption and hence requires key * scheduling support in C. It uses 8k bytes of tables but its encryption and * decryption performance is very close to that obtained using large tables. * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions, * which are as follows: * ms windows gnu/linux/opensolaris os * * in_blk rcx rdi * out_blk rdx rsi * context (cx) r8 rdx * * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15 * registers rdi - on both * * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11 * registers - rdi on both * * The convention used here is that for gnu/linux/opensolaris os. * * This code provides the standard AES block size (128 bits, 16 bytes) and the * three standard AES key sizes (128, 192 and 256 bits). It has the same call * interface as my C implementation. It uses the Microsoft C AMD64 calling * conventions in which the three parameters are placed in rcx, rdx and r8 * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved. * * OpenSolaris Note: * Modified to use GNU/Linux/Solaris calling conventions. * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively. * * AES_RETURN aes_encrypt(const unsigned char in_blk[], * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/ * * AES_RETURN aes_decrypt(const unsigned char in_blk[], * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/ * * AES_RETURN aes_encrypt_key(const unsigned char key[], * const aes_encrypt_ctx cx[1])/ * * AES_RETURN aes_decrypt_key(const unsigned char key[], * const aes_decrypt_ctx cx[1])/ * * AES_RETURN aes_encrypt_key(const unsigned char key[], * unsigned int len, const aes_decrypt_ctx cx[1])/ * * AES_RETURN aes_decrypt_key(const unsigned char key[], * unsigned int len, const aes_decrypt_ctx cx[1])/ * * where is 128, 102 or 256. In the last two calls the length can be in * either bits or bytes. * * Comment in/out the following lines to obtain the desired subroutines. These * selections MUST match those in the C header file aesopt.h */ #define AES_REV_DKS /* define if key decryption schedule is reversed */ #define LAST_ROUND_TABLES /* define for the faster version using extra tables */ /* * The encryption key schedule has the following in memory layout where N is the * number of rounds (10, 12 or 14): * * lo: | input key (round 0) | / each round is four 32-bit words * | encryption round 1 | * | encryption round 2 | * .... * | encryption round N-1 | * hi: | encryption round N | * * The decryption key schedule is normally set up so that it has the same * layout as above by actually reversing the order of the encryption key * schedule in memory (this happens when AES_REV_DKS is set): * * lo: | decryption round 0 | = | encryption round N | * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] * .... .... * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] * hi: | decryption round N | = | input key (round 0) | * * with rounds except the first and last modified using inv_mix_column() * But if AES_REV_DKS is NOT set the order of keys is left as it is for * encryption so that it has to be accessed in reverse when used for * decryption (although the inverse mix column modifications are done) * * lo: | decryption round 0 | = | input key (round 0) | * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] * .... .... * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] * hi: | decryption round N | = | encryption round N | * * This layout is faster when the assembler key scheduling provided here * is used. * * End of user defines */ /* * --------------------------------------------------------------------------- * OpenSolaris OS modifications * * This source originates from Brian Gladman file aes_amd64.asm * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip * with these changes: * * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION, * AES_128, AES_192, AES_256, AES_VAR ifdefs. * * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define * * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef * * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax * (operands reversed, literals prefixed with "$", registers prefixed with "%", * and "[register+offset]", addressing changed to "offset(register)", * parenthesis in constant expressions "()" changed to square brackets "[]", * "." removed from local (numeric) labels, and other changes. * Examples: * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax * mov rax,(4*20h) mov $[4*0x20],%rax * mov rax,[ebx+20h] mov 0x20(%ebx),%rax * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax * * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function * definitions for lint. * * 6. Renamed functions and reordered parameters to match OpenSolaris: * Original Gladman interface: * int aes_encrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ * int aes_decrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t, * and a union type, inf., containing inf.l, a uint32_t and * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is * used and contains the key schedule length * 16 where key schedule length is * 10, 12, or 14 bytes. * * OpenSolaris OS interface: * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/ * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, * ct is crypto text, and MAX_AES_NR is 14. * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. */ #if defined(lint) || defined(__lint) #include /* ARGSUSED */ void aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]) { } /* ARGSUSED */ void aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]) { } #else #define _ASM #include #define KS_LENGTH 60 #define raxd eax #define rdxd edx #define rcxd ecx #define rbxd ebx #define rsid esi #define rdid edi #define raxb al #define rdxb dl #define rcxb cl #define rbxb bl #define rsib sil #define rdib dil // finite field multiplies by {02}, {04} and {08} #define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]] #define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]] #define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]] // finite field multiplies required in table generation #define f3(x) [[f2(x)] ^ [x]] #define f9(x) [[f8(x)] ^ [x]] #define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]] #define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]] #define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]] // macros for expanding S-box data #define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)] #define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x] #define w8(x) [x], 0, 0, 0, [x], 0, 0, 0 #define enc_vals(x) \ .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \ .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \ .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \ .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \ .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \ .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \ .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \ .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \ .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \ .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \ .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \ .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \ .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \ .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \ .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \ .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \ .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \ .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \ .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \ .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \ .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \ .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \ .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \ .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \ .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \ .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \ .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \ .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \ .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \ .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \ .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \ .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16) #define dec_vals(x) \ .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \ .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \ .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \ .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \ .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \ .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \ .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \ .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \ .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \ .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \ .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \ .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \ .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \ .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \ .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \ .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \ .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \ .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \ .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \ .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \ .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \ .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \ .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \ .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \ .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \ .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \ .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \ .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \ .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \ .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \ .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \ .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d) #define tptr %rbp /* table pointer */ #define kptr %r8 /* key schedule pointer */ #define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */ #define fk_ref(x, y) -16*x+fofs+4*y(kptr) #ifdef AES_REV_DKS #define rofs 128 #define ik_ref(x, y) -16*x+rofs+4*y(kptr) #else #define rofs -128 #define ik_ref(x, y) 16*x+rofs+4*y(kptr) #endif /* AES_REV_DKS */ #define tab_0(x) (tptr,x,8) #define tab_1(x) 3(tptr,x,8) #define tab_2(x) 2(tptr,x,8) #define tab_3(x) 1(tptr,x,8) #define tab_f(x) 1(tptr,x,8) #define tab_i(x) 7(tptr,x,8) #define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \ mov fk_ref(round,0), p1; \ mov fk_ref(round,1), p2; \ mov fk_ref(round,2), p3; \ mov fk_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p4; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p2; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p1; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p3; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p2; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p4; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p3; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p1; \ \ mov p1, %eax; \ mov p2, %ebx; \ mov p3, %ecx; \ mov p4, %edx #ifdef LAST_ROUND_TABLES #define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ add $2048, tptr; \ mov fk_ref(round,0), p1; \ mov fk_ref(round,1), p2; \ mov fk_ref(round,2), p3; \ mov fk_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p4; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p2; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p1; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p3; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p2; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p4; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p3; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p1 #else #define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ mov fk_ref(round,0), p1; \ mov fk_ref(round,1), p2; \ mov fk_ref(round,2), p3; \ mov fk_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ xor %esi, p1; \ rol $8, %edi; \ xor %edi, p4; \ movzx %al, %esi; \ movzx %ah, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p3; \ xor %edi, p2; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ xor %esi, p2; \ rol $8, %edi; \ xor %edi, p1; \ movzx %bl, %esi; \ movzx %bh, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p4; \ xor %edi, p3; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ shr $16, %ecx; \ xor %esi, p3; \ rol $8, %edi; \ xor %edi, p2; \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p1; \ xor %edi, p4; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ shr $16, %edx; \ xor %esi, p4; \ rol $8, %edi; \ xor %edi, p3; \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p2; \ xor %edi, p1 #endif /* LAST_ROUND_TABLES */ #define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \ mov ik_ref(round,0), p1; \ mov ik_ref(round,1), p2; \ mov ik_ref(round,2), p3; \ mov ik_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p2; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p4; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p3; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p1; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p4; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p2; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p1; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p3; \ \ mov p1, %eax; \ mov p2, %ebx; \ mov p3, %ecx; \ mov p4, %edx #ifdef LAST_ROUND_TABLES #define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ add $2048, tptr; \ mov ik_ref(round,0), p1; \ mov ik_ref(round,1), p2; \ mov ik_ref(round,2), p3; \ mov ik_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p2; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p4; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p3; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p1; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p4; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p2; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p1; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p3 #else #define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ mov ik_ref(round,0), p1; \ mov ik_ref(round,1), p2; \ mov ik_ref(round,2), p3; \ mov ik_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %eax; \ xor %esi, p1; \ rol $8, %edi; \ xor %edi, p2; \ movzx %al, %esi; \ movzx %ah, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p3; \ xor %edi, p4; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %ebx; \ xor %esi, p2; \ rol $8, %edi; \ xor %edi, p3; \ movzx %bl, %esi; \ movzx %bh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p4; \ xor %edi, p1; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %ecx; \ xor %esi, p3; \ rol $8, %edi; \ xor %edi, p4; \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p1; \ xor %edi, p2; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %edx; \ xor %esi, p4; \ rol $8, %edi; \ xor %edi, p1; \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p2; \ xor %edi, p3 #endif /* LAST_ROUND_TABLES */ /* * OpenSolaris OS: * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * * Original interface: * int aes_encrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ */ .data .align 64 enc_tab: enc_vals(u8) #ifdef LAST_ROUND_TABLES // Last Round Tables: enc_vals(w8) #endif ENTRY_NP(aes_encrypt_amd64) #ifdef GLADMAN_INTERFACE // Original interface sub $[4*8], %rsp // gnu/linux/opensolaris binary interface mov %rsi, (%rsp) // output pointer (P2) mov %rdx, %r8 // context (P3) mov %rbx, 1*8(%rsp) // P1: input pointer in rdi mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) mov %r12, 3*8(%rsp) // P3: context in r8 movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 #else // OpenSolaris OS interface sub $[4*8], %rsp // Make room on stack to save registers mov %rcx, (%rsp) // Save output pointer (P4) on stack mov %rdi, %r8 // context (P1) mov %rdx, %rdi // P3: save input pointer shl $4, %esi // P2: esi byte key length * 16 mov %rbx, 1*8(%rsp) // Save registers mov %rbp, 2*8(%rsp) mov %r12, 3*8(%rsp) // P1: context in r8 // P2: byte key length * 16 in esi // P3: input pointer in rdi // P4: output pointer in (rsp) #endif /* GLADMAN_INTERFACE */ lea enc_tab(%rip), tptr sub $fofs, kptr // Load input block into registers mov (%rdi), %eax mov 1*4(%rdi), %ebx mov 2*4(%rdi), %ecx mov 3*4(%rdi), %edx xor fofs(kptr), %eax xor fofs+4(kptr), %ebx xor fofs+8(kptr), %ecx xor fofs+12(kptr), %edx lea (kptr,%rsi), kptr // Jump based on byte key length * 16: cmp $[10*16], %esi je 3f cmp $[12*16], %esi je 2f cmp $[14*16], %esi je 1f mov $-1, %rax // error jmp 4f // Perform normal forward rounds 1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13) ff_rnd(%r9d, %r10d, %r11d, %r12d, 12) 2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11) ff_rnd(%r9d, %r10d, %r11d, %r12d, 10) 3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9) ff_rnd(%r9d, %r10d, %r11d, %r12d, 8) ff_rnd(%r9d, %r10d, %r11d, %r12d, 7) ff_rnd(%r9d, %r10d, %r11d, %r12d, 6) ff_rnd(%r9d, %r10d, %r11d, %r12d, 5) ff_rnd(%r9d, %r10d, %r11d, %r12d, 4) ff_rnd(%r9d, %r10d, %r11d, %r12d, 3) ff_rnd(%r9d, %r10d, %r11d, %r12d, 2) ff_rnd(%r9d, %r10d, %r11d, %r12d, 1) fl_rnd(%r9d, %r10d, %r11d, %r12d, 0) // Copy results mov (%rsp), %rbx mov %r9d, (%rbx) mov %r10d, 4(%rbx) mov %r11d, 8(%rbx) mov %r12d, 12(%rbx) xor %rax, %rax 4: // Restore registers mov 1*8(%rsp), %rbx mov 2*8(%rsp), %rbp mov 3*8(%rsp), %r12 add $[4*8], %rsp - ret + RET SET_SIZE(aes_encrypt_amd64) /* * OpenSolaris OS: * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * * Original interface: * int aes_decrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ */ .data .align 64 dec_tab: dec_vals(v8) #ifdef LAST_ROUND_TABLES // Last Round Tables: dec_vals(w8) #endif ENTRY_NP(aes_decrypt_amd64) #ifdef GLADMAN_INTERFACE // Original interface sub $[4*8], %rsp // gnu/linux/opensolaris binary interface mov %rsi, (%rsp) // output pointer (P2) mov %rdx, %r8 // context (P3) mov %rbx, 1*8(%rsp) // P1: input pointer in rdi mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) mov %r12, 3*8(%rsp) // P3: context in r8 movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 #else // OpenSolaris OS interface sub $[4*8], %rsp // Make room on stack to save registers mov %rcx, (%rsp) // Save output pointer (P4) on stack mov %rdi, %r8 // context (P1) mov %rdx, %rdi // P3: save input pointer shl $4, %esi // P2: esi byte key length * 16 mov %rbx, 1*8(%rsp) // Save registers mov %rbp, 2*8(%rsp) mov %r12, 3*8(%rsp) // P1: context in r8 // P2: byte key length * 16 in esi // P3: input pointer in rdi // P4: output pointer in (rsp) #endif /* GLADMAN_INTERFACE */ lea dec_tab(%rip), tptr sub $rofs, kptr // Load input block into registers mov (%rdi), %eax mov 1*4(%rdi), %ebx mov 2*4(%rdi), %ecx mov 3*4(%rdi), %edx #ifdef AES_REV_DKS mov kptr, %rdi lea (kptr,%rsi), kptr #else lea (kptr,%rsi), %rdi #endif xor rofs(%rdi), %eax xor rofs+4(%rdi), %ebx xor rofs+8(%rdi), %ecx xor rofs+12(%rdi), %edx // Jump based on byte key length * 16: cmp $[10*16], %esi je 3f cmp $[12*16], %esi je 2f cmp $[14*16], %esi je 1f mov $-1, %rax // error jmp 4f // Perform normal inverse rounds 1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13) ii_rnd(%r9d, %r10d, %r11d, %r12d, 12) 2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11) ii_rnd(%r9d, %r10d, %r11d, %r12d, 10) 3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9) ii_rnd(%r9d, %r10d, %r11d, %r12d, 8) ii_rnd(%r9d, %r10d, %r11d, %r12d, 7) ii_rnd(%r9d, %r10d, %r11d, %r12d, 6) ii_rnd(%r9d, %r10d, %r11d, %r12d, 5) ii_rnd(%r9d, %r10d, %r11d, %r12d, 4) ii_rnd(%r9d, %r10d, %r11d, %r12d, 3) ii_rnd(%r9d, %r10d, %r11d, %r12d, 2) ii_rnd(%r9d, %r10d, %r11d, %r12d, 1) il_rnd(%r9d, %r10d, %r11d, %r12d, 0) // Copy results mov (%rsp), %rbx mov %r9d, (%rbx) mov %r10d, 4(%rbx) mov %r11d, 8(%rbx) mov %r12d, 12(%rbx) xor %rax, %rax 4: // Restore registers mov 1*8(%rsp), %rbx mov 2*8(%rsp), %rbp mov 3*8(%rsp), %r12 add $[4*8], %rsp - ret + RET SET_SIZE(aes_decrypt_amd64) #endif /* lint || __lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S index dc71ae2c1c89..70e419c2e4ab 100644 --- a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -1,1261 +1,1261 @@ # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # # AES-NI-CTR+GHASH stitch. # # February 2013 # # OpenSSL GCM implementation is organized in such way that its # performance is rather close to the sum of its streamed components, # in the context parallelized AES-NI CTR and modulo-scheduled # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation # was observed to perform significantly better than the sum of the # components on contemporary CPUs, the effort was deemed impossible to # justify. This module is based on combination of Intel submissions, # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max # Locktyukhin of Intel Corp. who verified that it reduces shuffles # pressure with notable relative improvement, achieving 1.0 cycle per # byte processed with 128-bit key on Haswell processor, 0.74 - on # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled # measurements for favourable packet size, one divisible by 96. # Applications using the EVP interface will observe a few percent # worse performance.] # # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf # Generated once from # https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl # and modified for ICP. Modification are kept at a bare minimum to ease later # upstream merges. #if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) .extern gcm_avx_can_use_movbe .text #ifdef HAVE_MOVBE .type _aesni_ctr32_ghash_6x,@function .align 32 _aesni_ctr32_ghash_6x: .cfi_startproc vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 vmovdqu 0-128(%rcx),%xmm15 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpaddb %xmm2,%xmm11,%xmm12 vpaddb %xmm2,%xmm12,%xmm13 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm15,%xmm1,%xmm9 vmovdqu %xmm4,16+8(%rsp) jmp .Loop6x .align 32 .Loop6x: addl $100663296,%ebx jc .Lhandle_ctr32 vmovdqu 0-32(%r9),%xmm3 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm15,%xmm10,%xmm10 vpxor %xmm15,%xmm11,%xmm11 .Lresume_ctr32: vmovdqu %xmm1,(%r8) vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 xorq %r12,%r12 cmpq %r14,%r15 vaesenc %xmm2,%xmm9,%xmm9 vmovdqu 48+8(%rsp),%xmm0 vpxor %xmm15,%xmm13,%xmm13 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 vaesenc %xmm2,%xmm10,%xmm10 vpxor %xmm15,%xmm14,%xmm14 setnc %r12b vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vaesenc %xmm2,%xmm11,%xmm11 vmovdqu 16-32(%r9),%xmm3 negq %r12 vaesenc %xmm2,%xmm12,%xmm12 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 vpxor %xmm4,%xmm8,%xmm8 vaesenc %xmm2,%xmm13,%xmm13 vpxor %xmm5,%xmm1,%xmm4 andq $0x60,%r12 vmovups 32-128(%rcx),%xmm15 vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 vaesenc %xmm2,%xmm14,%xmm14 vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 leaq (%r14,%r12,1),%r14 vaesenc %xmm15,%xmm9,%xmm9 vpxor 16+8(%rsp),%xmm8,%xmm8 vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 vmovdqu 64+8(%rsp),%xmm0 vaesenc %xmm15,%xmm10,%xmm10 movbeq 88(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 80(%r14),%r12 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,32+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,40+8(%rsp) vmovdqu 48-32(%r9),%xmm5 vaesenc %xmm15,%xmm14,%xmm14 vmovups 48-128(%rcx),%xmm15 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm3,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 vaesenc %xmm15,%xmm11,%xmm11 vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 vmovdqu 80+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vpxor %xmm1,%xmm4,%xmm4 vmovdqu 64-32(%r9),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vmovups 64-128(%rcx),%xmm15 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 vaesenc %xmm15,%xmm10,%xmm10 movbeq 72(%r14),%r13 vpxor %xmm5,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 movbeq 64(%r14),%r12 vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 vmovdqu 96+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,48+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,56+8(%rsp) vpxor %xmm2,%xmm4,%xmm4 vmovdqu 96-32(%r9),%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vmovups 80-128(%rcx),%xmm15 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 vaesenc %xmm15,%xmm10,%xmm10 movbeq 56(%r14),%r13 vpxor %xmm1,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 vpxor 112+8(%rsp),%xmm8,%xmm8 vaesenc %xmm15,%xmm11,%xmm11 movbeq 48(%r14),%r12 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,64+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,72+8(%rsp) vpxor %xmm3,%xmm4,%xmm4 vmovdqu 112-32(%r9),%xmm3 vaesenc %xmm15,%xmm14,%xmm14 vmovups 96-128(%rcx),%xmm15 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 vaesenc %xmm15,%xmm10,%xmm10 movbeq 40(%r14),%r13 vpxor %xmm2,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 movbeq 32(%r14),%r12 vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,80+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,88+8(%rsp) vpxor %xmm5,%xmm6,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor %xmm1,%xmm6,%xmm6 vmovups 112-128(%rcx),%xmm15 vpslldq $8,%xmm6,%xmm5 vpxor %xmm2,%xmm4,%xmm4 vmovdqu 16(%r11),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm8,%xmm7,%xmm7 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm5,%xmm4,%xmm4 movbeq 24(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 16(%r14),%r12 vpalignr $8,%xmm4,%xmm4,%xmm0 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 movq %r13,96+8(%rsp) vaesenc %xmm15,%xmm12,%xmm12 movq %r12,104+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 vmovups 128-128(%rcx),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vmovups 144-128(%rcx),%xmm15 vaesenc %xmm1,%xmm10,%xmm10 vpsrldq $8,%xmm6,%xmm6 vaesenc %xmm1,%xmm11,%xmm11 vpxor %xmm6,%xmm7,%xmm7 vaesenc %xmm1,%xmm12,%xmm12 vpxor %xmm0,%xmm4,%xmm4 movbeq 8(%r14),%r13 vaesenc %xmm1,%xmm13,%xmm13 movbeq 0(%r14),%r12 vaesenc %xmm1,%xmm14,%xmm14 vmovups 160-128(%rcx),%xmm1 cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. jb .Lenc_tail vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 176-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 192-128(%rcx),%xmm1 cmpl $14,%ebp // ICP does not zero key schedule. jb .Lenc_tail vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 208-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 224-128(%rcx),%xmm1 jmp .Lenc_tail .align 32 .Lhandle_ctr32: vmovdqu (%r11),%xmm0 vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vmovdqu 0-32(%r9),%xmm3 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm15,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm15,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpshufb %xmm0,%xmm14,%xmm14 vpshufb %xmm0,%xmm1,%xmm1 jmp .Lresume_ctr32 .align 32 .Lenc_tail: vaesenc %xmm15,%xmm9,%xmm9 vmovdqu %xmm7,16+8(%rsp) vpalignr $8,%xmm4,%xmm4,%xmm8 vaesenc %xmm15,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 vpxor 0(%rdi),%xmm1,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 vpxor 16(%rdi),%xmm1,%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vpxor 32(%rdi),%xmm1,%xmm5 vaesenc %xmm15,%xmm13,%xmm13 vpxor 48(%rdi),%xmm1,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor 64(%rdi),%xmm1,%xmm7 vpxor 80(%rdi),%xmm1,%xmm3 vmovdqu (%r8),%xmm1 vaesenclast %xmm2,%xmm9,%xmm9 vmovdqu 32(%r11),%xmm2 vaesenclast %xmm0,%xmm10,%xmm10 vpaddb %xmm2,%xmm1,%xmm0 movq %r13,112+8(%rsp) leaq 96(%rdi),%rdi vaesenclast %xmm5,%xmm11,%xmm11 vpaddb %xmm2,%xmm0,%xmm5 movq %r12,120+8(%rsp) leaq 96(%rsi),%rsi vmovdqu 0-128(%rcx),%xmm15 vaesenclast %xmm6,%xmm12,%xmm12 vpaddb %xmm2,%xmm5,%xmm6 vaesenclast %xmm7,%xmm13,%xmm13 vpaddb %xmm2,%xmm6,%xmm7 vaesenclast %xmm3,%xmm14,%xmm14 vpaddb %xmm2,%xmm7,%xmm3 addq $0x60,%r10 subq $0x6,%rdx jc .L6x_done vmovups %xmm9,-96(%rsi) vpxor %xmm15,%xmm1,%xmm9 vmovups %xmm10,-80(%rsi) vmovdqa %xmm0,%xmm10 vmovups %xmm11,-64(%rsi) vmovdqa %xmm5,%xmm11 vmovups %xmm12,-48(%rsi) vmovdqa %xmm6,%xmm12 vmovups %xmm13,-32(%rsi) vmovdqa %xmm7,%xmm13 vmovups %xmm14,-16(%rsi) vmovdqa %xmm3,%xmm14 vmovdqu 32+8(%rsp),%xmm7 jmp .Loop6x .L6x_done: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 .byte 0xf3,0xc3 .cfi_endproc .size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x #endif /* ifdef HAVE_MOVBE */ .type _aesni_ctr32_ghash_no_movbe_6x,@function .align 32 _aesni_ctr32_ghash_no_movbe_6x: .cfi_startproc vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 vmovdqu 0-128(%rcx),%xmm15 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpaddb %xmm2,%xmm11,%xmm12 vpaddb %xmm2,%xmm12,%xmm13 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm15,%xmm1,%xmm9 vmovdqu %xmm4,16+8(%rsp) jmp .Loop6x_nmb .align 32 .Loop6x_nmb: addl $100663296,%ebx jc .Lhandle_ctr32_nmb vmovdqu 0-32(%r9),%xmm3 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm15,%xmm10,%xmm10 vpxor %xmm15,%xmm11,%xmm11 .Lresume_ctr32_nmb: vmovdqu %xmm1,(%r8) vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 xorq %r12,%r12 cmpq %r14,%r15 vaesenc %xmm2,%xmm9,%xmm9 vmovdqu 48+8(%rsp),%xmm0 vpxor %xmm15,%xmm13,%xmm13 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 vaesenc %xmm2,%xmm10,%xmm10 vpxor %xmm15,%xmm14,%xmm14 setnc %r12b vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vaesenc %xmm2,%xmm11,%xmm11 vmovdqu 16-32(%r9),%xmm3 negq %r12 vaesenc %xmm2,%xmm12,%xmm12 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 vpxor %xmm4,%xmm8,%xmm8 vaesenc %xmm2,%xmm13,%xmm13 vpxor %xmm5,%xmm1,%xmm4 andq $0x60,%r12 vmovups 32-128(%rcx),%xmm15 vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 vaesenc %xmm2,%xmm14,%xmm14 vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 leaq (%r14,%r12,1),%r14 vaesenc %xmm15,%xmm9,%xmm9 vpxor 16+8(%rsp),%xmm8,%xmm8 vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 vmovdqu 64+8(%rsp),%xmm0 vaesenc %xmm15,%xmm10,%xmm10 movq 88(%r14),%r13 bswapq %r13 vaesenc %xmm15,%xmm11,%xmm11 movq 80(%r14),%r12 bswapq %r12 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,32+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,40+8(%rsp) vmovdqu 48-32(%r9),%xmm5 vaesenc %xmm15,%xmm14,%xmm14 vmovups 48-128(%rcx),%xmm15 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm3,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 vaesenc %xmm15,%xmm11,%xmm11 vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 vmovdqu 80+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vpxor %xmm1,%xmm4,%xmm4 vmovdqu 64-32(%r9),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vmovups 64-128(%rcx),%xmm15 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 vaesenc %xmm15,%xmm10,%xmm10 movq 72(%r14),%r13 bswapq %r13 vpxor %xmm5,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 movq 64(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 vmovdqu 96+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,48+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,56+8(%rsp) vpxor %xmm2,%xmm4,%xmm4 vmovdqu 96-32(%r9),%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vmovups 80-128(%rcx),%xmm15 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 vaesenc %xmm15,%xmm10,%xmm10 movq 56(%r14),%r13 bswapq %r13 vpxor %xmm1,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 vpxor 112+8(%rsp),%xmm8,%xmm8 vaesenc %xmm15,%xmm11,%xmm11 movq 48(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,64+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,72+8(%rsp) vpxor %xmm3,%xmm4,%xmm4 vmovdqu 112-32(%r9),%xmm3 vaesenc %xmm15,%xmm14,%xmm14 vmovups 96-128(%rcx),%xmm15 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 vaesenc %xmm15,%xmm10,%xmm10 movq 40(%r14),%r13 bswapq %r13 vpxor %xmm2,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 movq 32(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,80+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,88+8(%rsp) vpxor %xmm5,%xmm6,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor %xmm1,%xmm6,%xmm6 vmovups 112-128(%rcx),%xmm15 vpslldq $8,%xmm6,%xmm5 vpxor %xmm2,%xmm4,%xmm4 vmovdqu 16(%r11),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm8,%xmm7,%xmm7 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm5,%xmm4,%xmm4 movq 24(%r14),%r13 bswapq %r13 vaesenc %xmm15,%xmm11,%xmm11 movq 16(%r14),%r12 bswapq %r12 vpalignr $8,%xmm4,%xmm4,%xmm0 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 movq %r13,96+8(%rsp) vaesenc %xmm15,%xmm12,%xmm12 movq %r12,104+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 vmovups 128-128(%rcx),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vmovups 144-128(%rcx),%xmm15 vaesenc %xmm1,%xmm10,%xmm10 vpsrldq $8,%xmm6,%xmm6 vaesenc %xmm1,%xmm11,%xmm11 vpxor %xmm6,%xmm7,%xmm7 vaesenc %xmm1,%xmm12,%xmm12 vpxor %xmm0,%xmm4,%xmm4 movq 8(%r14),%r13 bswapq %r13 vaesenc %xmm1,%xmm13,%xmm13 movq 0(%r14),%r12 bswapq %r12 vaesenc %xmm1,%xmm14,%xmm14 vmovups 160-128(%rcx),%xmm1 cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. jb .Lenc_tail_nmb vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 176-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 192-128(%rcx),%xmm1 cmpl $14,%ebp // ICP does not zero key schedule. jb .Lenc_tail_nmb vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 208-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 224-128(%rcx),%xmm1 jmp .Lenc_tail_nmb .align 32 .Lhandle_ctr32_nmb: vmovdqu (%r11),%xmm0 vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vmovdqu 0-32(%r9),%xmm3 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm15,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm15,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpshufb %xmm0,%xmm14,%xmm14 vpshufb %xmm0,%xmm1,%xmm1 jmp .Lresume_ctr32_nmb .align 32 .Lenc_tail_nmb: vaesenc %xmm15,%xmm9,%xmm9 vmovdqu %xmm7,16+8(%rsp) vpalignr $8,%xmm4,%xmm4,%xmm8 vaesenc %xmm15,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 vpxor 0(%rdi),%xmm1,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 vpxor 16(%rdi),%xmm1,%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vpxor 32(%rdi),%xmm1,%xmm5 vaesenc %xmm15,%xmm13,%xmm13 vpxor 48(%rdi),%xmm1,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor 64(%rdi),%xmm1,%xmm7 vpxor 80(%rdi),%xmm1,%xmm3 vmovdqu (%r8),%xmm1 vaesenclast %xmm2,%xmm9,%xmm9 vmovdqu 32(%r11),%xmm2 vaesenclast %xmm0,%xmm10,%xmm10 vpaddb %xmm2,%xmm1,%xmm0 movq %r13,112+8(%rsp) leaq 96(%rdi),%rdi vaesenclast %xmm5,%xmm11,%xmm11 vpaddb %xmm2,%xmm0,%xmm5 movq %r12,120+8(%rsp) leaq 96(%rsi),%rsi vmovdqu 0-128(%rcx),%xmm15 vaesenclast %xmm6,%xmm12,%xmm12 vpaddb %xmm2,%xmm5,%xmm6 vaesenclast %xmm7,%xmm13,%xmm13 vpaddb %xmm2,%xmm6,%xmm7 vaesenclast %xmm3,%xmm14,%xmm14 vpaddb %xmm2,%xmm7,%xmm3 addq $0x60,%r10 subq $0x6,%rdx jc .L6x_done_nmb vmovups %xmm9,-96(%rsi) vpxor %xmm15,%xmm1,%xmm9 vmovups %xmm10,-80(%rsi) vmovdqa %xmm0,%xmm10 vmovups %xmm11,-64(%rsi) vmovdqa %xmm5,%xmm11 vmovups %xmm12,-48(%rsi) vmovdqa %xmm6,%xmm12 vmovups %xmm13,-32(%rsi) vmovdqa %xmm7,%xmm13 vmovups %xmm14,-16(%rsi) vmovdqa %xmm3,%xmm14 vmovdqu 32+8(%rsp),%xmm7 jmp .Loop6x_nmb .L6x_done_nmb: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 .byte 0xf3,0xc3 .cfi_endproc .size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x .globl aesni_gcm_decrypt .type aesni_gcm_decrypt,@function .align 32 aesni_gcm_decrypt: .cfi_startproc xorq %r10,%r10 cmpq $0x60,%rdx jb .Lgcm_dec_abort leaq (%rsp),%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 pushq %r9 .cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq .Lbswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq $0xf80,%r15 vmovdqu (%r9),%xmm8 andq $-128,%rsp vmovdqu (%r11),%xmm0 leaq 128(%rcx),%rcx movq 32(%r9),%r9 leaq 32(%r9),%r9 movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. vpshufb %xmm0,%xmm8,%xmm8 andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc .Ldec_no_key_aliasing cmpq $768,%r15 jnc .Ldec_no_key_aliasing subq %r15,%rsp .Ldec_no_key_aliasing: vmovdqu 80(%rdi),%xmm7 leaq (%rdi),%r14 vmovdqu 64(%rdi),%xmm4 leaq -192(%rdi,%rdx,1),%r15 vmovdqu 48(%rdi),%xmm5 shrq $4,%rdx xorq %r10,%r10 vmovdqu 32(%rdi),%xmm6 vpshufb %xmm0,%xmm7,%xmm7 vmovdqu 16(%rdi),%xmm2 vpshufb %xmm0,%xmm4,%xmm4 vmovdqu (%rdi),%xmm3 vpshufb %xmm0,%xmm5,%xmm5 vmovdqu %xmm4,48(%rsp) vpshufb %xmm0,%xmm6,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm2,%xmm2 vmovdqu %xmm6,80(%rsp) vpshufb %xmm0,%xmm3,%xmm3 vmovdqu %xmm2,96(%rsp) vmovdqu %xmm3,112(%rsp) #ifdef HAVE_MOVBE #ifdef _KERNEL testl $1,gcm_avx_can_use_movbe(%rip) #else testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip) #endif jz 1f call _aesni_ctr32_ghash_6x jmp 2f 1: #endif call _aesni_ctr32_ghash_no_movbe_6x 2: vmovups %xmm9,-96(%rsi) vmovups %xmm10,-80(%rsi) vmovups %xmm11,-64(%rsi) vmovups %xmm12,-48(%rsi) vmovups %xmm13,-32(%rsi) vmovups %xmm14,-16(%rsi) vpshufb (%r11),%xmm8,%xmm8 movq -56(%rax),%r9 .cfi_restore %r9 vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lgcm_dec_abort: movq %r10,%rax .byte 0xf3,0xc3 .cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt .type _aesni_ctr32_6x,@function .align 32 _aesni_ctr32_6x: .cfi_startproc vmovdqu 0-128(%rcx),%xmm4 vmovdqu 32(%r11),%xmm2 leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. vmovups 16-128(%rcx),%xmm15 leaq 32-128(%rcx),%r12 vpxor %xmm4,%xmm1,%xmm9 addl $100663296,%ebx jc .Lhandle_ctr32_2 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddb %xmm2,%xmm11,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddb %xmm2,%xmm12,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 .align 16 .Loop_ctr32: vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vmovups (%r12),%xmm15 leaq 16(%r12),%r12 decl %r13d jnz .Loop_ctr32 vmovdqu (%r12),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor 0(%rdi),%xmm3,%xmm4 vaesenc %xmm15,%xmm10,%xmm10 vpxor 16(%rdi),%xmm3,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 vpxor 32(%rdi),%xmm3,%xmm6 vaesenc %xmm15,%xmm12,%xmm12 vpxor 48(%rdi),%xmm3,%xmm8 vaesenc %xmm15,%xmm13,%xmm13 vpxor 64(%rdi),%xmm3,%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vpxor 80(%rdi),%xmm3,%xmm3 leaq 96(%rdi),%rdi vaesenclast %xmm4,%xmm9,%xmm9 vaesenclast %xmm5,%xmm10,%xmm10 vaesenclast %xmm6,%xmm11,%xmm11 vaesenclast %xmm8,%xmm12,%xmm12 vaesenclast %xmm2,%xmm13,%xmm13 vaesenclast %xmm3,%xmm14,%xmm14 vmovups %xmm9,0(%rsi) vmovups %xmm10,16(%rsi) vmovups %xmm11,32(%rsi) vmovups %xmm12,48(%rsi) vmovups %xmm13,64(%rsi) vmovups %xmm14,80(%rsi) leaq 96(%rsi),%rsi .byte 0xf3,0xc3 .align 32 .Lhandle_ctr32_2: vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpshufb %xmm0,%xmm14,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpshufb %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 .cfi_endproc .size _aesni_ctr32_6x,.-_aesni_ctr32_6x .globl aesni_gcm_encrypt .type aesni_gcm_encrypt,@function .align 32 aesni_gcm_encrypt: .cfi_startproc xorq %r10,%r10 cmpq $288,%rdx jb .Lgcm_enc_abort leaq (%rsp),%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 pushq %r9 .cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq .Lbswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq $0xf80,%r15 leaq 128(%rcx),%rcx vmovdqu (%r11),%xmm0 andq $-128,%rsp movl 504-128(%rcx),%ebp // ICP has an larger offset for rounds. andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc .Lenc_no_key_aliasing cmpq $768,%r15 jnc .Lenc_no_key_aliasing subq %r15,%rsp .Lenc_no_key_aliasing: leaq (%rsi),%r14 leaq -192(%rsi,%rdx,1),%r15 shrq $4,%rdx call _aesni_ctr32_6x vpshufb %xmm0,%xmm9,%xmm8 vpshufb %xmm0,%xmm10,%xmm2 vmovdqu %xmm8,112(%rsp) vpshufb %xmm0,%xmm11,%xmm4 vmovdqu %xmm2,96(%rsp) vpshufb %xmm0,%xmm12,%xmm5 vmovdqu %xmm4,80(%rsp) vpshufb %xmm0,%xmm13,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm14,%xmm7 vmovdqu %xmm6,48(%rsp) call _aesni_ctr32_6x vmovdqu (%r9),%xmm8 movq 32(%r9),%r9 leaq 32(%r9),%r9 subq $12,%rdx movq $192,%r10 vpshufb %xmm0,%xmm8,%xmm8 #ifdef HAVE_MOVBE #ifdef _KERNEL testl $1,gcm_avx_can_use_movbe(%rip) #else testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip) #endif jz 1f call _aesni_ctr32_ghash_6x jmp 2f 1: #endif call _aesni_ctr32_ghash_no_movbe_6x 2: vmovdqu 32(%rsp),%xmm7 vmovdqu (%r11),%xmm0 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm7,%xmm7,%xmm1 vmovdqu 32-32(%r9),%xmm15 vmovups %xmm9,-96(%rsi) vpshufb %xmm0,%xmm9,%xmm9 vpxor %xmm7,%xmm1,%xmm1 vmovups %xmm10,-80(%rsi) vpshufb %xmm0,%xmm10,%xmm10 vmovups %xmm11,-64(%rsi) vpshufb %xmm0,%xmm11,%xmm11 vmovups %xmm12,-48(%rsi) vpshufb %xmm0,%xmm12,%xmm12 vmovups %xmm13,-32(%rsi) vpshufb %xmm0,%xmm13,%xmm13 vmovups %xmm14,-16(%rsi) vpshufb %xmm0,%xmm14,%xmm14 vmovdqu %xmm9,16(%rsp) vmovdqu 48(%rsp),%xmm6 vmovdqu 16-32(%r9),%xmm0 vpunpckhqdq %xmm6,%xmm6,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 vpxor %xmm6,%xmm2,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vmovdqu 64(%rsp),%xmm9 vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm9,%xmm9,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 vpxor %xmm9,%xmm5,%xmm5 vpxor %xmm7,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vmovdqu 80(%rsp),%xmm1 vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm4,%xmm7,%xmm7 vpunpckhqdq %xmm1,%xmm1,%xmm4 vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpxor %xmm6,%xmm9,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 96(%rsp),%xmm2 vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm7,%xmm6,%xmm6 vpunpckhqdq %xmm2,%xmm2,%xmm7 vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 vpxor %xmm2,%xmm7,%xmm7 vpxor %xmm9,%xmm1,%xmm1 vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm5,%xmm4,%xmm4 vpxor 112(%rsp),%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 vmovdqu 112-32(%r9),%xmm0 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpxor %xmm6,%xmm5,%xmm5 vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 vpxor %xmm4,%xmm7,%xmm4 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm1 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 vpxor %xmm14,%xmm1,%xmm1 vpxor %xmm5,%xmm6,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 vmovdqu 32-32(%r9),%xmm15 vpxor %xmm2,%xmm8,%xmm7 vpxor %xmm4,%xmm9,%xmm6 vmovdqu 16-32(%r9),%xmm0 vpxor %xmm5,%xmm7,%xmm9 vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 vpxor %xmm9,%xmm6,%xmm6 vpunpckhqdq %xmm13,%xmm13,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 vpxor %xmm13,%xmm2,%xmm2 vpslldq $8,%xmm6,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vpxor %xmm9,%xmm5,%xmm8 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm6,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm12,%xmm12,%xmm9 vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 vpxor %xmm12,%xmm9,%xmm9 vpxor %xmm14,%xmm13,%xmm13 vpalignr $8,%xmm8,%xmm8,%xmm14 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm11,%xmm11,%xmm1 vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 vpxor %xmm11,%xmm1,%xmm1 vpxor %xmm13,%xmm12,%xmm12 vxorps 16(%rsp),%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm9,%xmm9 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm10,%xmm10,%xmm2 vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 vpxor %xmm10,%xmm2,%xmm2 vpalignr $8,%xmm8,%xmm8,%xmm14 vpxor %xmm12,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm9,%xmm1,%xmm1 vxorps %xmm7,%xmm14,%xmm14 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 vmovdqu 112-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm11,%xmm10,%xmm10 vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 vpxor %xmm4,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 vpxor %xmm10,%xmm7,%xmm7 vpxor %xmm2,%xmm6,%xmm6 vpxor %xmm5,%xmm7,%xmm4 vpxor %xmm4,%xmm6,%xmm6 vpslldq $8,%xmm6,%xmm1 vmovdqu 16(%r11),%xmm3 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm1,%xmm5,%xmm8 vpxor %xmm6,%xmm7,%xmm7 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm2,%xmm8,%xmm8 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm7,%xmm2,%xmm2 vpxor %xmm2,%xmm8,%xmm8 vpshufb (%r11),%xmm8,%xmm8 movq -56(%rax),%r9 .cfi_restore %r9 vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lgcm_enc_abort: movq %r10,%rax .byte 0xf3,0xc3 .cfi_endproc .size aesni_gcm_encrypt,.-aesni_gcm_encrypt /* Some utility routines */ /* * clear all fpu registers * void clear_fpu_regs_avx(void); */ .globl clear_fpu_regs_avx .type clear_fpu_regs_avx,@function .align 32 clear_fpu_regs_avx: vzeroall - ret + RET .size clear_fpu_regs_avx,.-clear_fpu_regs_avx /* * void gcm_xor_avx(const uint8_t *src, uint8_t *dst); * * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and * stores the result at `dst'. The XOR is performed using FPU registers, * so make sure FPU state is saved when running this in the kernel. */ .globl gcm_xor_avx .type gcm_xor_avx,@function .align 32 gcm_xor_avx: movdqu (%rdi), %xmm0 movdqu (%rsi), %xmm1 pxor %xmm1, %xmm0 movdqu %xmm0, (%rsi) - ret + RET .size gcm_xor_avx,.-gcm_xor_avx /* * Toggle a boolean_t value atomically and return the new value. * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); */ .globl atomic_toggle_boolean_nv .type atomic_toggle_boolean_nv,@function .align 32 atomic_toggle_boolean_nv: xorl %eax, %eax lock xorl $1, (%rdi) jz 1f movl $1, %eax 1: - ret + RET .size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lpoly: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .Lone_msb: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .Ltwo_lsb: .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .Lone_lsb: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 /* Mark the stack non-executable. */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif #endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S index 59edc4c8d56c..df7f188ecdae 100644 --- a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S +++ b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S @@ -1,254 +1,254 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2009 Intel Corporation * All Rights Reserved. */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Accelerated GHASH implementation with Intel PCLMULQDQ-NI * instructions. This file contains an accelerated * Galois Field Multiplication implementation. * * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, * carry-less multiplication. More information about PCLMULQDQ can be * found at: * http://software.intel.com/en-us/articles/ * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ * */ /* * ==================================================================== * OpenSolaris OS modifications * * This source originates as file galois_hash_asm.c from * Intel Corporation dated September 21, 2009. * * This OpenSolaris version has these major changes from the original source: * * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function * definition for lint. * * 2. Formatted code, added comments, and added #includes and #defines. * * 3. If bit CR0.TS is set, clear and set the TS bit, after and before * calling kpreempt_disable() and kpreempt_enable(). * If the TS bit is not set, Save and restore %xmm registers at the beginning * and end of function calls (%xmm* registers are not saved and restored by * during kernel thread preemption). * * 4. Removed code to perform hashing. This is already done with C macro * GHASH in gcm.c. For better performance, this removed code should be * reintegrated in the future to replace the C GHASH macro. * * 5. Added code to byte swap 16-byte input and output. * * 6. Folded in comments from the original C source with embedded assembly * (SB_w_shift_xor.c) * * 7. Renamed function and reordered parameters to match OpenSolaris: * Intel interface: * void galois_hash_asm(unsigned char *hk, unsigned char *s, * unsigned char *d, int length) * OpenSolaris OS interface: * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); * ==================================================================== */ #if defined(lint) || defined(__lint) /* lint */ #include /* ARGSUSED */ void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { } #elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */ #define _ASM #include /* * Use this mask to byte-swap a 16-byte integer with the pshufb instruction */ // static uint8_t byte_swap16_mask[] = { // 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 }; .data .align XMM_ALIGN .Lbyte_swap16_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 /* * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); * * Perform a carry-less multiplication (that is, use XOR instead of the * multiply operator) on P1 and P2 and place the result in P3. * * Byte swap the input and the output. * * Note: x_in, y, and res all point to a block of 20-byte numbers * (an array of two 64-bit integers). * * Note2: For kernel code, caller is responsible for ensuring * kpreempt_disable() has been called. This is because %xmm registers are * not saved/restored. Clear and set the CR0.TS bit on entry and exit, * respectively, if TS is set on entry. Otherwise, if TS is not set, * save and restore %xmm registers on the stack. * * Note3: Original Intel definition: * void galois_hash_asm(unsigned char *hk, unsigned char *s, * unsigned char *d, int length) * * Note4: Register/parameter mapping: * Intel: * Parameter 1: %rcx (copied to %xmm0) hk or x_in * Parameter 2: %rdx (copied to %xmm1) s or y * Parameter 3: %rdi (result) d or res * OpenSolaris: * Parameter 1: %rdi (copied to %xmm0) x_in * Parameter 2: %rsi (copied to %xmm1) y * Parameter 3: %rdx (result) res */ ENTRY_NP(gcm_mul_pclmulqdq) // // Copy Parameters // movdqu (%rdi), %xmm0 // P1 movdqu (%rsi), %xmm1 // P2 // // Byte swap 16-byte input // lea .Lbyte_swap16_mask(%rip), %rax movups (%rax), %xmm10 pshufb %xmm10, %xmm0 pshufb %xmm10, %xmm1 // // Multiply with the hash key // movdqu %xmm0, %xmm3 pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0 movdqu %xmm0, %xmm4 pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1 movdqu %xmm0, %xmm5 pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0 movdqu %xmm0, %xmm6 pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1 pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0 movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5 psrldq $8, %xmm4 // shift by xmm4 64 bits to the right pslldq $8, %xmm5 // shift by xmm5 64 bits to the left pxor %xmm5, %xmm3 pxor %xmm4, %xmm6 // Register pair holds the result // of the carry-less multiplication of // xmm0 by xmm1. // We shift the result of the multiplication by one bit position // to the left to cope for the fact that the bits are reversed. movdqu %xmm3, %xmm7 movdqu %xmm6, %xmm8 pslld $1, %xmm3 pslld $1, %xmm6 psrld $31, %xmm7 psrld $31, %xmm8 movdqu %xmm7, %xmm9 pslldq $4, %xmm8 pslldq $4, %xmm7 psrldq $12, %xmm9 por %xmm7, %xmm3 por %xmm8, %xmm6 por %xmm9, %xmm6 // // First phase of the reduction // // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts // independently. movdqu %xmm3, %xmm7 movdqu %xmm3, %xmm8 movdqu %xmm3, %xmm9 pslld $31, %xmm7 // packed right shift shifting << 31 pslld $30, %xmm8 // packed right shift shifting << 30 pslld $25, %xmm9 // packed right shift shifting << 25 pxor %xmm8, %xmm7 // xor the shifted versions pxor %xmm9, %xmm7 movdqu %xmm7, %xmm8 pslldq $12, %xmm7 psrldq $4, %xmm8 pxor %xmm7, %xmm3 // first phase of the reduction complete // // Second phase of the reduction // // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these // shift operations. movdqu %xmm3, %xmm2 movdqu %xmm3, %xmm4 // packed left shifting >> 1 movdqu %xmm3, %xmm5 psrld $1, %xmm2 psrld $2, %xmm4 // packed left shifting >> 2 psrld $7, %xmm5 // packed left shifting >> 7 pxor %xmm4, %xmm2 // xor the shifted versions pxor %xmm5, %xmm2 pxor %xmm8, %xmm2 pxor %xmm2, %xmm3 pxor %xmm3, %xmm6 // the result is in xmm6 // // Byte swap 16-byte result // pshufb %xmm10, %xmm6 // %xmm10 has the swap mask // // Store the result // movdqu %xmm6, (%rdx) // P3 // // Return // - ret + RET SET_SIZE(gcm_mul_pclmulqdq) #endif /* lint || __lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S index 28b048d2db24..31da7f9767df 100644 --- a/module/icp/asm-x86_64/sha2/sha256_impl.S +++ b/module/icp/asm-x86_64/sha2/sha256_impl.S @@ -1,2089 +1,2089 @@ /* * ==================================================================== * Written by Andy Polyakov for the OpenSSL * project. Rights for redistribution and usage in source and binary * forms are granted according to the OpenSSL license. * ==================================================================== * * sha256/512_block procedure for x86_64. * * 40% improvement over compiler-generated code on Opteron. On EM64T * sha256 was observed to run >80% faster and sha512 - >40%. No magical * tricks, just straight implementation... I really wonder why gcc * [being armed with inline assembler] fails to generate as fast code. * The only thing which is cool about this module is that it's very * same instruction sequence used for both SHA-256 and SHA-512. In * former case the instructions operate on 32-bit operands, while in * latter - on 64-bit ones. All I had to do is to get one flavor right, * the other one passed the test right away:-) * * sha256_block runs in ~1005 cycles on Opteron, which gives you * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock * frequency in GHz. sha512_block runs in ~1275 cycles, which results * in 128*1000/1275=100MBps per GHz. Is there room for improvement? * Well, if you compare it to IA-64 implementation, which maintains * X[16] in register bank[!], tends to 4 instructions per CPU clock * cycle and runs in 1003 cycles, 1275 is very good result for 3-way * issue Opteron pipeline and X[16] maintained in memory. So that *if* * there is a way to improve it, *then* the only way would be to try to * offload X[16] updates to SSE unit, but that would require "deeper" * loop unroll, which in turn would naturally cause size blow-up, not * to mention increased complexity! And once again, only *if* it's * actually possible to noticeably improve overall ILP, instruction * level parallelism, on a given CPU implementation in this case. * * Special note on Intel EM64T. While Opteron CPU exhibits perfect * performance ratio of 1.5 between 64- and 32-bit flavors [see above], * [currently available] EM64T CPUs apparently are far from it. On the * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit * sha256_block:-( This is presumably because 64-bit shifts/rotates * apparently are not atomic instructions, but implemented in microcode. */ /* * OpenSolaris OS modifications * * Sun elects to use this software under the BSD license. * * This source originates from OpenSSL file sha512-x86_64.pl at * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz * (presumably for future OpenSSL release 0.9.8h), with these changes: * * 1. Added perl "use strict" and declared variables. * * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. * * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) * assemblers). Replaced the .picmeup macro with assembler code. * * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", * at the beginning of SHA2_CTX (the next field is 8-byte aligned). */ /* * This file was generated by a perl script (sha512-x86_64.pl) that were * used to generate sha256 and sha512 variants from the same code base. * The comments from the original file have been pasted above. */ #if defined(lint) || defined(__lint) #include #include /* ARGSUSED */ void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) { } #else #define _ASM #include ENTRY_NP(SHA256TransformBlocks) .cfi_startproc movq %rsp, %rax .cfi_def_cfa_register %rax push %rbx .cfi_offset %rbx,-16 push %rbp .cfi_offset %rbp,-24 push %r12 .cfi_offset %r12,-32 push %r13 .cfi_offset %r13,-40 push %r14 .cfi_offset %r14,-48 push %r15 .cfi_offset %r15,-56 mov %rsp,%rbp # copy %rsp shl $4,%rdx # num*16 sub $16*4+4*8,%rsp lea (%rsi,%rdx,4),%rdx # inp+num*16*4 and $-64,%rsp # align stack frame add $8,%rdi # Skip OpenSolaris field, "algotype" mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg mov %rbp,16*4+3*8(%rsp) # save copy of %rsp # echo ".cfi_cfa_expression %rsp+88,deref,+56" | # openssl/crypto/perlasm/x86_64-xlate.pl .cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x38 #.picmeup %rbp # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts # the address of the "next" instruction into the target register # (%rbp). This generates these 2 instructions: lea .Llea(%rip),%rbp #nop # .picmeup generates a nop for mod 8 alignment--not needed here .Llea: lea K256-.(%rbp),%rbp mov 4*0(%rdi),%eax mov 4*1(%rdi),%ebx mov 4*2(%rdi),%ecx mov 4*3(%rdi),%edx mov 4*4(%rdi),%r8d mov 4*5(%rdi),%r9d mov 4*6(%rdi),%r10d mov 4*7(%rdi),%r11d jmp .Lloop .align 16 .Lloop: xor %rdi,%rdi mov 4*0(%rsi),%r12d bswap %r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,0(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 4*1(%rsi),%r12d bswap %r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,4(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 4*2(%rsi),%r12d bswap %r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,8(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 4*3(%rsi),%r12d bswap %r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,12(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 4*4(%rsi),%r12d bswap %r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,16(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 4*5(%rsi),%r12d bswap %r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,20(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 4*6(%rsi),%r12d bswap %r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,24(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 4*7(%rsi),%r12d bswap %r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,28(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) mov 4*8(%rsi),%r12d bswap %r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,32(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 4*9(%rsi),%r12d bswap %r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,36(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 4*10(%rsi),%r12d bswap %r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,40(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 4*11(%rsi),%r12d bswap %r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,44(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 4*12(%rsi),%r12d bswap %r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,48(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 4*13(%rsi),%r12d bswap %r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,52(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 4*14(%rsi),%r12d bswap %r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,56(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 4*15(%rsi),%r12d bswap %r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,60(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: mov 4(%rsp),%r13d mov 56(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 36(%rsp),%r12d add 0(%rsp),%r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,0(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 8(%rsp),%r13d mov 60(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 40(%rsp),%r12d add 4(%rsp),%r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,4(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 12(%rsp),%r13d mov 0(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 44(%rsp),%r12d add 8(%rsp),%r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,8(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 16(%rsp),%r13d mov 4(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 48(%rsp),%r12d add 12(%rsp),%r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,12(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 20(%rsp),%r13d mov 8(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 52(%rsp),%r12d add 16(%rsp),%r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,16(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 24(%rsp),%r13d mov 12(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 56(%rsp),%r12d add 20(%rsp),%r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,20(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 28(%rsp),%r13d mov 16(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 60(%rsp),%r12d add 24(%rsp),%r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,24(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 32(%rsp),%r13d mov 20(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 0(%rsp),%r12d add 28(%rsp),%r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,28(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) mov 36(%rsp),%r13d mov 24(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 4(%rsp),%r12d add 32(%rsp),%r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,32(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 40(%rsp),%r13d mov 28(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 8(%rsp),%r12d add 36(%rsp),%r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,36(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 44(%rsp),%r13d mov 32(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 12(%rsp),%r12d add 40(%rsp),%r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,40(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 48(%rsp),%r13d mov 36(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 16(%rsp),%r12d add 44(%rsp),%r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,44(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 52(%rsp),%r13d mov 40(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 20(%rsp),%r12d add 48(%rsp),%r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,48(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 56(%rsp),%r13d mov 44(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 24(%rsp),%r12d add 52(%rsp),%r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,52(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 60(%rsp),%r13d mov 48(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 28(%rsp),%r12d add 56(%rsp),%r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,56(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 0(%rsp),%r13d mov 52(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 32(%rsp),%r12d add 60(%rsp),%r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,60(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) cmp $64,%rdi jb .Lrounds_16_xx mov 16*4+0*8(%rsp),%rdi lea 16*4(%rsi),%rsi add 4*0(%rdi),%eax add 4*1(%rdi),%ebx add 4*2(%rdi),%ecx add 4*3(%rdi),%edx add 4*4(%rdi),%r8d add 4*5(%rdi),%r9d add 4*6(%rdi),%r10d add 4*7(%rdi),%r11d cmp 16*4+2*8(%rsp),%rsi mov %eax,4*0(%rdi) mov %ebx,4*1(%rdi) mov %ecx,4*2(%rdi) mov %edx,4*3(%rdi) mov %r8d,4*4(%rdi) mov %r9d,4*5(%rdi) mov %r10d,4*6(%rdi) mov %r11d,4*7(%rdi) jb .Lloop mov 16*4+3*8(%rsp),%rsp .cfi_def_cfa %rsp,56 pop %r15 .cfi_adjust_cfa_offset -8 .cfi_restore %r15 pop %r14 .cfi_adjust_cfa_offset -8 .cfi_restore %r14 pop %r13 .cfi_adjust_cfa_offset -8 .cfi_restore %r13 pop %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 pop %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp pop %rbx .cfi_adjust_cfa_offset -8 .cfi_restore %rbx - ret + RET .cfi_endproc SET_SIZE(SHA256TransformBlocks) .data .align 64 .type K256,@object K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 #endif /* !lint && !__lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S index 746c85a98566..c2ba18538e33 100644 --- a/module/icp/asm-x86_64/sha2/sha512_impl.S +++ b/module/icp/asm-x86_64/sha2/sha512_impl.S @@ -1,2114 +1,2114 @@ /* * ==================================================================== * Written by Andy Polyakov for the OpenSSL * project. Rights for redistribution and usage in source and binary * forms are granted according to the OpenSSL license. * ==================================================================== * * sha256/512_block procedure for x86_64. * * 40% improvement over compiler-generated code on Opteron. On EM64T * sha256 was observed to run >80% faster and sha512 - >40%. No magical * tricks, just straight implementation... I really wonder why gcc * [being armed with inline assembler] fails to generate as fast code. * The only thing which is cool about this module is that it's very * same instruction sequence used for both SHA-256 and SHA-512. In * former case the instructions operate on 32-bit operands, while in * latter - on 64-bit ones. All I had to do is to get one flavor right, * the other one passed the test right away:-) * * sha256_block runs in ~1005 cycles on Opteron, which gives you * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock * frequency in GHz. sha512_block runs in ~1275 cycles, which results * in 128*1000/1275=100MBps per GHz. Is there room for improvement? * Well, if you compare it to IA-64 implementation, which maintains * X[16] in register bank[!], tends to 4 instructions per CPU clock * cycle and runs in 1003 cycles, 1275 is very good result for 3-way * issue Opteron pipeline and X[16] maintained in memory. So that *if* * there is a way to improve it, *then* the only way would be to try to * offload X[16] updates to SSE unit, but that would require "deeper" * loop unroll, which in turn would naturally cause size blow-up, not * to mention increased complexity! And once again, only *if* it's * actually possible to noticeably improve overall ILP, instruction * level parallelism, on a given CPU implementation in this case. * * Special note on Intel EM64T. While Opteron CPU exhibits perfect * performance ratio of 1.5 between 64- and 32-bit flavors [see above], * [currently available] EM64T CPUs apparently are far from it. On the * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit * sha256_block:-( This is presumably because 64-bit shifts/rotates * apparently are not atomic instructions, but implemented in microcode. */ /* * OpenSolaris OS modifications * * Sun elects to use this software under the BSD license. * * This source originates from OpenSSL file sha512-x86_64.pl at * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz * (presumably for future OpenSSL release 0.9.8h), with these changes: * * 1. Added perl "use strict" and declared variables. * * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. * * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) * assemblers). Replaced the .picmeup macro with assembler code. * * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", * at the beginning of SHA2_CTX (the next field is 8-byte aligned). */ /* * This file was generated by a perl script (sha512-x86_64.pl) that were * used to generate sha256 and sha512 variants from the same code base. * The comments from the original file have been pasted above. */ #if defined(lint) || defined(__lint) #include #include /* ARGSUSED */ void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) { } #else #define _ASM #include ENTRY_NP(SHA512TransformBlocks) .cfi_startproc movq %rsp, %rax .cfi_def_cfa_register %rax push %rbx .cfi_offset %rbx,-16 push %rbp .cfi_offset %rbp,-24 push %r12 .cfi_offset %r12,-32 push %r13 .cfi_offset %r13,-40 push %r14 .cfi_offset %r14,-48 push %r15 .cfi_offset %r15,-56 mov %rsp,%rbp # copy %rsp shl $4,%rdx # num*16 sub $16*8+4*8,%rsp lea (%rsi,%rdx,8),%rdx # inp+num*16*8 and $-64,%rsp # align stack frame add $8,%rdi # Skip OpenSolaris field, "algotype" mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg mov %rbp,16*8+3*8(%rsp) # save copy of %rsp # echo ".cfi_cfa_expression %rsp+152,deref,+56" | # openssl/crypto/perlasm/x86_64-xlate.pl .cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x38 #.picmeup %rbp # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts # the address of the "next" instruction into the target register # (%rbp). This generates these 2 instructions: lea .Llea(%rip),%rbp #nop # .picmeup generates a nop for mod 8 alignment--not needed here .Llea: lea K512-.(%rbp),%rbp mov 8*0(%rdi),%rax mov 8*1(%rdi),%rbx mov 8*2(%rdi),%rcx mov 8*3(%rdi),%rdx mov 8*4(%rdi),%r8 mov 8*5(%rdi),%r9 mov 8*6(%rdi),%r10 mov 8*7(%rdi),%r11 jmp .Lloop .align 16 .Lloop: xor %rdi,%rdi mov 8*0(%rsi),%r12 bswap %r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,0(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 8*1(%rsi),%r12 bswap %r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,8(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 8*2(%rsi),%r12 bswap %r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,16(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 8*3(%rsi),%r12 bswap %r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,24(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 8*4(%rsi),%r12 bswap %r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,32(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 8*5(%rsi),%r12 bswap %r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,40(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 8*6(%rsi),%r12 bswap %r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,48(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 8*7(%rsi),%r12 bswap %r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,56(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) mov 8*8(%rsi),%r12 bswap %r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,64(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 8*9(%rsi),%r12 bswap %r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,72(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 8*10(%rsi),%r12 bswap %r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,80(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 8*11(%rsi),%r12 bswap %r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,88(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 8*12(%rsi),%r12 bswap %r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,96(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 8*13(%rsi),%r12 bswap %r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,104(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 8*14(%rsi),%r12 bswap %r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,112(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 8*15(%rsi),%r12 bswap %r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,120(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: mov 8(%rsp),%r13 mov 112(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 72(%rsp),%r12 add 0(%rsp),%r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,0(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 16(%rsp),%r13 mov 120(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 80(%rsp),%r12 add 8(%rsp),%r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,8(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 24(%rsp),%r13 mov 0(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 88(%rsp),%r12 add 16(%rsp),%r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,16(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 32(%rsp),%r13 mov 8(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 96(%rsp),%r12 add 24(%rsp),%r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,24(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 40(%rsp),%r13 mov 16(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 104(%rsp),%r12 add 32(%rsp),%r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,32(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 48(%rsp),%r13 mov 24(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 112(%rsp),%r12 add 40(%rsp),%r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,40(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 56(%rsp),%r13 mov 32(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 120(%rsp),%r12 add 48(%rsp),%r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,48(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 64(%rsp),%r13 mov 40(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 0(%rsp),%r12 add 56(%rsp),%r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,56(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) mov 72(%rsp),%r13 mov 48(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 8(%rsp),%r12 add 64(%rsp),%r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,64(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 80(%rsp),%r13 mov 56(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 16(%rsp),%r12 add 72(%rsp),%r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,72(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 88(%rsp),%r13 mov 64(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 24(%rsp),%r12 add 80(%rsp),%r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,80(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 96(%rsp),%r13 mov 72(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 32(%rsp),%r12 add 88(%rsp),%r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,88(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 104(%rsp),%r13 mov 80(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 40(%rsp),%r12 add 96(%rsp),%r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,96(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 112(%rsp),%r13 mov 88(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 48(%rsp),%r12 add 104(%rsp),%r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,104(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 120(%rsp),%r13 mov 96(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 56(%rsp),%r12 add 112(%rsp),%r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,112(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 0(%rsp),%r13 mov 104(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 64(%rsp),%r12 add 120(%rsp),%r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,120(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) cmp $80,%rdi jb .Lrounds_16_xx mov 16*8+0*8(%rsp),%rdi lea 16*8(%rsi),%rsi add 8*0(%rdi),%rax add 8*1(%rdi),%rbx add 8*2(%rdi),%rcx add 8*3(%rdi),%rdx add 8*4(%rdi),%r8 add 8*5(%rdi),%r9 add 8*6(%rdi),%r10 add 8*7(%rdi),%r11 cmp 16*8+2*8(%rsp),%rsi mov %rax,8*0(%rdi) mov %rbx,8*1(%rdi) mov %rcx,8*2(%rdi) mov %rdx,8*3(%rdi) mov %r8,8*4(%rdi) mov %r9,8*5(%rdi) mov %r10,8*6(%rdi) mov %r11,8*7(%rdi) jb .Lloop mov 16*8+3*8(%rsp),%rsp .cfi_def_cfa %rsp,56 pop %r15 .cfi_adjust_cfa_offset -8 .cfi_restore %r15 pop %r14 .cfi_adjust_cfa_offset -8 .cfi_restore %r14 pop %r13 .cfi_adjust_cfa_offset -8 .cfi_restore %r13 pop %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 pop %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp pop %rbx .cfi_adjust_cfa_offset -8 .cfi_restore %rbx - ret + RET .cfi_endproc SET_SIZE(SHA512TransformBlocks) .data .align 64 .type K512,@object K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 #endif /* !lint && !__lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/include/sys/ia32/asm_linkage.h b/module/icp/include/sys/ia32/asm_linkage.h index f2dae7093b94..876e21e5f1b1 100644 --- a/module/icp/include/sys/ia32/asm_linkage.h +++ b/module/icp/include/sys/ia32/asm_linkage.h @@ -1,307 +1,313 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H #include #include +#if defined(__linux__) && defined(CONFIG_SLS) +#define RET ret; int3 +#else +#define RET ret +#endif + #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. */ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. */ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 #if defined(__amd64) #define SAVE_XMM_PROLOG(sreg, nreg) \ subq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp; \ movq %rsp, sreg #define RSTOR_XMM_EPILOG(sreg, nreg) \ addq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp #elif defined(__i386) #define SAVE_XMM_PROLOG(sreg, nreg) \ subl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; \ movl %esp, sreg; \ addl $XMM_ALIGN, sreg; \ andl $_BITNOT(XMM_ALIGN-1), sreg #define RSTOR_XMM_EPILOG(sreg, nreg) \ addl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; #endif /* __i386 */ /* * profiling causes definitions of the MCOUNT and RTMCOUNT * particular to the type */ #ifdef GPROF #define MCOUNT(x) \ pushl %ebp; \ movl %esp, %ebp; \ call _mcount; \ popl %ebp #endif /* GPROF */ #ifdef PROF #define MCOUNT(x) \ /* CSTYLED */ \ .lcomm .L_/**/x/**/1, 4, 4; \ pushl %ebp; \ movl %esp, %ebp; \ /* CSTYLED */ \ movl $.L_/**/x/**/1, %edx; \ call _mcount; \ popl %ebp #endif /* PROF */ /* * if we are not profiling, MCOUNT should be defined to nothing */ #if !defined(PROF) && !defined(GPROF) #define MCOUNT(x) #endif /* !defined(PROF) && !defined(GPROF) */ #define RTMCOUNT(x) MCOUNT(x) /* * Macro to define weak symbol aliases. These are similar to the ANSI-C * #pragma weak _name = name * except a compiler can determine type. The assembler must be told. Hence, * the second parameter must be the type of the symbol (i.e.: function,...) */ #define ANSI_PRAGMA_WEAK(sym, stype) \ /* CSTYLED */ \ .weak _/**/sym; \ /* CSTYLED */ \ .type _/**/sym, @stype; \ /* CSTYLED */ \ _/**/sym = sym /* * Like ANSI_PRAGMA_WEAK(), but for unrelated names, as in: * #pragma weak sym1 = sym2 */ #define ANSI_PRAGMA_WEAK2(sym1, sym2, stype) \ .weak sym1; \ .type sym1, @stype; \ sym1 = sym2 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. */ #define ENTRY(x) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: #define RTENTRY(x) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: RTMCOUNT(x) /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ /* CSTYLED */ \ x: ; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ /* CSTYLED */ \ x: ; \ y: /* * ALTENTRY provides for additional entry points. */ #define ALTENTRY(x) \ .globl x; \ .type x, @function; \ x: /* * DGDEF and DGDEF2 provide global data declarations. * * DGDEF provides a word aligned word of storage. * * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. This * implies this macro is best used for byte arrays. * * DGDEF3 allocates "sz" bytes of storage with "algn" alignment. */ #define DGDEF2(name, sz) \ .data; \ .globl name; \ .type name, @object; \ .size name, sz; \ name: #define DGDEF3(name, sz, algn) \ .data; \ .align algn; \ .globl name; \ .type name, @object; \ .size name, sz; \ name: #define DGDEF(name) DGDEF3(name, 4, 4) /* * SET_SIZE trails a function and set the size for the ELF symbol table. */ #define SET_SIZE(x) \ .size x, [.-x] /* * NWORD provides native word value. */ #if defined(__amd64) /*CSTYLED*/ #define NWORD quad #elif defined(__i386) #define NWORD long #endif /* __i386 */ #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/module/lua/setjmp/setjmp_x86_64.S b/module/lua/setjmp/setjmp_x86_64.S index a469cbad780e..34cf2c7dce93 100644 --- a/module/lua/setjmp/setjmp_x86_64.S +++ b/module/lua/setjmp/setjmp_x86_64.S @@ -1,77 +1,83 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ #define ENTRY(x) \ .text; \ .align 8; \ .globl x; \ .type x, @function; \ x: #define SET_SIZE(x) \ .size x, [.-x] +#if defined(__linux__) && defined(CONFIG_SLS) +#define RET ret; int3 +#else +#define RET ret +#endif + /* * Setjmp and longjmp implement non-local gotos using state vectors * type label_t. */ #ifdef __x86_64__ ENTRY(setjmp) movq %rsp, 0(%rdi) movq %rbp, 8(%rdi) movq %rbx, 16(%rdi) movq %r12, 24(%rdi) movq %r13, 32(%rdi) movq %r14, 40(%rdi) movq %r15, 48(%rdi) movq 0(%rsp), %rdx /* return address */ movq %rdx, 56(%rdi) /* rip */ xorl %eax, %eax /* return 0 */ - ret + RET SET_SIZE(setjmp) ENTRY(longjmp) movq 0(%rdi), %rsp movq 8(%rdi), %rbp movq 16(%rdi), %rbx movq 24(%rdi), %r12 movq 32(%rdi), %r13 movq 40(%rdi), %r14 movq 48(%rdi), %r15 movq 56(%rdi), %rdx /* return address */ movq %rdx, 0(%rsp) xorl %eax, %eax incl %eax /* return 1 */ - ret + RET SET_SIZE(longjmp) #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif #endif /* __x86_64__ */