Index: share/man/man4/sha_sse.4 =================================================================== --- /dev/null +++ share/man/man4/sha_sse.4 @@ -0,0 +1,81 @@ +.\" Copyright (c) 2017 Conrad Meyer +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd September 21, 2017 +.Dt SHA_SSE 4 +.Os +.Sh NAME +.Nm sha_sse +.Nd "driver for the SHA accelerator on x86 CPUs" +.Sh SYNOPSIS +To compile this driver into the kernel, +place the following lines in your +kernel configuration file: +.Bd -ragged -offset indent +.Cd "device crypto" +.Cd "device cryptodev" +.Cd "device sha_sse" +.Ed +.Pp +Alternatively, to load the driver as a +module at boot time, place the following line in +.Xr loader.conf 5 : +.Bd -literal -offset indent +sha_sse_load="YES" +.Ed +.Sh DESCRIPTION +Starting with the Intel Goldmont and AMD Ryzen microarchitectures, some x86 +processors implement a new set of SHA instructions. +The set of seven instructions accelerates the calculation of SHA1 and SHA256 +hashes. +.Pp +The processor capability is reported as SHA in the Structured Extended Features +line at boot. +The +.Nm +driver does not attach on systems that lack the required CPU capability. +.Pp +The +.Nm +driver registers itself to accelerate SHA operations for +.Xr crypto 4 . +.Sh SEE ALSO +.Xr crypto 4 , +.Xr intro 4 , +.Xr ipsec 4 , +.Xr crypto 9 +.Sh HISTORY +The +.Nm +driver first appeared in +.Fx 12.0 . +.Sh AUTHORS +.An -nosplit +The +.Nm +driver was written by +.An Conrad Meyer Aq Mt cem@FreeBSD.org . +The hash step intrinsics implementations were supplied by Intel. 
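The DESCRIPTION above says the driver registers with crypto(4) to accelerate SHA operations. For context only, a userland consumer would reach it through the cryptodev(4) ioctl interface roughly as in the following sketch; this is a hypothetical harness (error handling omitted, key and data arbitrary), not part of the patch:

/*
 * Hypothetical userland sketch (not part of this patch): exercise the SHA
 * support through /dev/crypto.  Error handling is omitted.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <crypto/cryptodev.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct session_op sop;
	struct crypt_op cop;
	char key[] = "an arbitrary hmac key";
	char msg[] = "hello, sha_sse";
	char mac[20];	/* SHA1-HMAC digest is 20 bytes */
	int fd, cfd;

	fd = open("/dev/crypto", O_RDWR);
	ioctl(fd, CRIOGET, &cfd);		/* get a per-process crypto fd */

	memset(&sop, 0, sizeof(sop));
	sop.mac = CRYPTO_SHA1_HMAC;		/* one of the algorithms sha_sse registers */
	sop.mackey = key;
	sop.mackeylen = sizeof(key) - 1;
	ioctl(cfd, CIOCGSESSION, &sop);		/* sop.ses now identifies the session */

	memset(&cop, 0, sizeof(cop));
	cop.ses = sop.ses;
	cop.op = COP_ENCRYPT;
	cop.src = msg;
	cop.len = strlen(msg);
	cop.mac = mac;				/* digest is written here */
	ioctl(cfd, CIOCCRYPT, &cop);

	for (size_t i = 0; i < sizeof(mac); i++)
		printf("%02x", (unsigned char)mac[i]);
	printf("\n");

	ioctl(cfd, CIOCFSESSION, &sop.ses);
	close(cfd);
	close(fd);
	return (0);
}

Whether the request is actually serviced by sha_sse0 or by cryptosoft0 depends on driver priorities at the time the session is created, which is what the cryptotest.py change at the end of this patch checks for.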
Index: sys/amd64/conf/NOTES =================================================================== --- sys/amd64/conf/NOTES +++ sys/amd64/conf/NOTES @@ -544,6 +544,7 @@ device padlock_rng # VIA Padlock RNG device rdrand_rng # Intel Bull Mountain RNG device aesni # AES-NI OpenCrypto module +device sha_sse # SHA SSE extension OpenCrypto module device ioat # Intel I/OAT DMA engine # Index: sys/conf/files.amd64 =================================================================== --- sys/conf/files.amd64 +++ sys/conf/files.amd64 @@ -182,6 +182,17 @@ crypto/blowfish/bf_enc.c optional crypto | ipsec | ipsec_support crypto/des/des_enc.c optional crypto | ipsec | \ ipsec_support | netsmb +intel_sha1.o optional sha_sse \ + dependency "$S/crypto/sha_sse/intel_sha1.c" \ + compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \ + no-implicit-rule \ + clean "intel_sha1.o" +intel_sha256.o optional sha_sse \ + dependency "$S/crypto/sha_sse/intel_sha256.c" \ + compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \ + no-implicit-rule \ + clean "intel_sha256.o" +crypto/sha_sse/sha_sse.c optional sha_sse crypto/via/padlock.c optional padlock crypto/via/padlock_cipher.c optional padlock crypto/via/padlock_hash.c optional padlock Index: sys/conf/files.i386 =================================================================== --- sys/conf/files.i386 +++ sys/conf/files.i386 @@ -132,6 +132,17 @@ no-implicit-rule \ clean "aesni_wrap.o" crypto/des/arch/i386/des_enc.S optional crypto | ipsec | ipsec_support | netsmb +intel_sha1.o optional sha_sse \ + dependency "$S/crypto/sha_sse/intel_sha1.c" \ + compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \ + no-implicit-rule \ + clean "intel_sha1.o" +intel_sha256.o optional sha_sse \ + dependency "$S/crypto/sha_sse/intel_sha256.c" \ + compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \ + no-implicit-rule \ + clean "intel_sha256.o" +crypto/sha_sse/sha_sse.c optional sha_sse crypto/via/padlock.c optional padlock crypto/via/padlock_cipher.c optional padlock crypto/via/padlock_hash.c optional padlock Index: sys/crypto/sha_sse/intel_sha1.c =================================================================== --- /dev/null +++ sys/crypto/sha_sse/intel_sha1.c @@ -0,0 +1,258 @@ +/******************************************************************************* +* Copyright (c) 2013, Intel Corporation +* +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: +* +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* * Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* * Neither the name of the Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived from +* this software without specific prior written permission. 
+*
+*
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************************
+*
+* Intel SHA Extensions optimized implementation of a SHA-1 update function
+*
+* The function takes a pointer to the current hash values, a pointer to the
+* input data, and a number of 64 byte blocks to process.  Once all blocks have
+* been processed, the digest pointer is updated with the resulting hash value.
+* The function only processes complete blocks; there is no functionality to
+* store partial blocks.  All message padding and hash value initialization must
+* be done outside the update function.
+*
+* The indented lines in the loop are instructions related to rounds processing.
+* The non-indented lines are instructions related to the message schedule.
+*
+* Author: Sean Gulley
+* Date: July 2013
+*
+********************************************************************************
+*
+* Example compiler command line:
+* icc intel_sha_extensions_sha1_intrinsic.c
+* gcc -msha -msse4 intel_sha_extensions_sha1_intrinsic.c
+*
+*******************************************************************************/
+#include <sys/cdefs.h>
+#include <sys/types.h>
+
+#include <immintrin.h>
+
+void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks) {
+	__m128i abcd, e0, e1;
+	__m128i abcd_save, e_save;
+	__m128i msg0, msg1, msg2, msg3;
+	__m128i shuf_mask, e_mask;
+
+#if 0
+	e_mask = _mm_set_epi64x(0xFFFFFFFF00000000ull, 0x0000000000000000ull);
+#else
+	(void)e_mask;
+	e0 = _mm_set_epi64x(0, 0);
+#endif
+	shuf_mask = _mm_set_epi64x(0x0001020304050607ull, 0x08090a0b0c0d0e0full);
+
+	// Load initial hash values
+	abcd = _mm_loadu_si128((__m128i*) digest);
+	e0 = _mm_insert_epi32(e0, *(digest+4), 3);
+	abcd = _mm_shuffle_epi32(abcd, 0x1B);
+#if 0
+	e0 = _mm_and_si128(e0, e_mask);
+#endif
+
+	while (num_blks > 0) {
+		// Save hash values for addition after rounds
+		abcd_save = abcd;
+		e_save = e0;
+
+		// Rounds 0-3
+		msg0 = _mm_loadu_si128((const __m128i*) data);
+		msg0 = _mm_shuffle_epi8(msg0, shuf_mask);
+		e0 = _mm_add_epi32(e0, msg0);
+		e1 = abcd;
+		abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+
+		// Rounds 4-7
+		msg1 = _mm_loadu_si128((const __m128i*) (data+16));
+		msg1 = _mm_shuffle_epi8(msg1, shuf_mask);
+		e1 = _mm_sha1nexte_epu32(e1, msg1);
+		e0 = abcd;
+		abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
+		msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+
+		// Rounds 8-11
+		msg2 = _mm_loadu_si128((const __m128i*) (data+32));
+		msg2 = _mm_shuffle_epi8(msg2, shuf_mask);
+		e0 = _mm_sha1nexte_epu32(e0, msg2);
+		e1 = abcd;
+		abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+		msg0 = _mm_xor_si128(msg0, msg2);
+
+		// Rounds 12-15
+		msg3 = _mm_loadu_si128((const __m128i*) (data+48));
+		msg3 = _mm_shuffle_epi8(msg3, shuf_mask);
+		e1 = _mm_sha1nexte_epu32(e1, msg3);
+		e0 =
abcd; + msg0 = _mm_sha1msg2_epu32(msg0, msg3); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 0); + msg2 = _mm_sha1msg1_epu32(msg2, msg3); + msg1 = _mm_xor_si128(msg1, msg3); + + // Rounds 16-19 + e0 = _mm_sha1nexte_epu32(e0, msg0); + e1 = abcd; + msg1 = _mm_sha1msg2_epu32(msg1, msg0); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 0); + msg3 = _mm_sha1msg1_epu32(msg3, msg0); + msg2 = _mm_xor_si128(msg2, msg0); + + // Rounds 20-23 + e1 = _mm_sha1nexte_epu32(e1, msg1); + e0 = abcd; + msg2 = _mm_sha1msg2_epu32(msg2, msg1); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 1); + msg0 = _mm_sha1msg1_epu32(msg0, msg1); + msg3 = _mm_xor_si128(msg3, msg1); + + // Rounds 24-27 + e0 = _mm_sha1nexte_epu32(e0, msg2); + e1 = abcd; + msg3 = _mm_sha1msg2_epu32(msg3, msg2); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 1); + msg1 = _mm_sha1msg1_epu32(msg1, msg2); + msg0 = _mm_xor_si128(msg0, msg2); + + // Rounds 28-31 + e1 = _mm_sha1nexte_epu32(e1, msg3); + e0 = abcd; + msg0 = _mm_sha1msg2_epu32(msg0, msg3); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 1); + msg2 = _mm_sha1msg1_epu32(msg2, msg3); + msg1 = _mm_xor_si128(msg1, msg3); + + // Rounds 32-35 + e0 = _mm_sha1nexte_epu32(e0, msg0); + e1 = abcd; + msg1 = _mm_sha1msg2_epu32(msg1, msg0); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 1); + msg3 = _mm_sha1msg1_epu32(msg3, msg0); + msg2 = _mm_xor_si128(msg2, msg0); + + // Rounds 36-39 + e1 = _mm_sha1nexte_epu32(e1, msg1); + e0 = abcd; + msg2 = _mm_sha1msg2_epu32(msg2, msg1); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 1); + msg0 = _mm_sha1msg1_epu32(msg0, msg1); + msg3 = _mm_xor_si128(msg3, msg1); + + // Rounds 40-43 + e0 = _mm_sha1nexte_epu32(e0, msg2); + e1 = abcd; + msg3 = _mm_sha1msg2_epu32(msg3, msg2); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 2); + msg1 = _mm_sha1msg1_epu32(msg1, msg2); + msg0 = _mm_xor_si128(msg0, msg2); + + // Rounds 44-47 + e1 = _mm_sha1nexte_epu32(e1, msg3); + e0 = abcd; + msg0 = _mm_sha1msg2_epu32(msg0, msg3); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 2); + msg2 = _mm_sha1msg1_epu32(msg2, msg3); + msg1 = _mm_xor_si128(msg1, msg3); + + // Rounds 48-51 + e0 = _mm_sha1nexte_epu32(e0, msg0); + e1 = abcd; + msg1 = _mm_sha1msg2_epu32(msg1, msg0); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 2); + msg3 = _mm_sha1msg1_epu32(msg3, msg0); + msg2 = _mm_xor_si128(msg2, msg0); + + // Rounds 52-55 + e1 = _mm_sha1nexte_epu32(e1, msg1); + e0 = abcd; + msg2 = _mm_sha1msg2_epu32(msg2, msg1); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 2); + msg0 = _mm_sha1msg1_epu32(msg0, msg1); + msg3 = _mm_xor_si128(msg3, msg1); + + // Rounds 56-59 + e0 = _mm_sha1nexte_epu32(e0, msg2); + e1 = abcd; + msg3 = _mm_sha1msg2_epu32(msg3, msg2); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 2); + msg1 = _mm_sha1msg1_epu32(msg1, msg2); + msg0 = _mm_xor_si128(msg0, msg2); + + // Rounds 60-63 + e1 = _mm_sha1nexte_epu32(e1, msg3); + e0 = abcd; + msg0 = _mm_sha1msg2_epu32(msg0, msg3); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 3); + msg2 = _mm_sha1msg1_epu32(msg2, msg3); + msg1 = _mm_xor_si128(msg1, msg3); + + // Rounds 64-67 + e0 = _mm_sha1nexte_epu32(e0, msg0); + e1 = abcd; + msg1 = _mm_sha1msg2_epu32(msg1, msg0); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 3); + msg3 = _mm_sha1msg1_epu32(msg3, msg0); + msg2 = _mm_xor_si128(msg2, msg0); + + // Rounds 68-71 + e1 = _mm_sha1nexte_epu32(e1, msg1); + e0 = abcd; + msg2 = _mm_sha1msg2_epu32(msg2, msg1); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 3); + msg3 = _mm_xor_si128(msg3, msg1); + + // Rounds 72-75 + e0 = _mm_sha1nexte_epu32(e0, msg2); + e1 = abcd; + msg3 = _mm_sha1msg2_epu32(msg3, msg2); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 3); + + // Rounds 76-79 + e1 
= _mm_sha1nexte_epu32(e1, msg3); + e0 = abcd; + abcd = _mm_sha1rnds4_epu32(abcd, e1, 3); + + // Add current hash values with previously saved + e0 = _mm_sha1nexte_epu32(e0, e_save); + abcd = _mm_add_epi32(abcd, abcd_save); + + data += 64; + num_blks--; + } + + abcd = _mm_shuffle_epi32(abcd, 0x1B); + _mm_store_si128((__m128i*) digest, abcd); + *(digest+4) = _mm_extract_epi32(e0, 3); +} + Index: sys/crypto/sha_sse/intel_sha256.c =================================================================== --- /dev/null +++ sys/crypto/sha_sse/intel_sha256.c @@ -0,0 +1,274 @@ +/******************************************************************************* +* Copyright (c) 2013, Intel Corporation +* +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: +* +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* * Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* * Neither the name of the Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived from +* this software without specific prior written permission. +* +* +* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +******************************************************************************** +* +* Intel SHA Extensions optimized implementation of a SHA-256 update function +* +* The function takes a pointer to the current hash values, a pointer to the +* input data, and a number of 64 byte blocks to process. Once all blocks have +* been processed, the digest pointer is updated with the resulting hash value. +* The function only processes complete blocks, there is no functionality to +* store partial blocks. All message padding and hash value initialization must +* be done outside the update function. +* +* The indented lines in the loop are instructions related to rounds processing. +* The non-indented lines are instructions related to the message schedule. 
+*
+* Author: Sean Gulley
+* Date: July 2013
+*
+********************************************************************************
+*
+* Example compiler command line:
+* icc intel_sha_extensions_sha256_intrinsic.c
+* gcc -msha -msse4 intel_sha_extensions_sha256_intrinsic.c
+*
+*******************************************************************************/
+#include <sys/cdefs.h>
+#include <sys/types.h>
+
+#include <immintrin.h>
+
+void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks) {
+	__m128i state0, state1;
+	__m128i msg;
+	__m128i msgtmp0, msgtmp1, msgtmp2, msgtmp3;
+	__m128i tmp;
+	__m128i shuf_mask;
+	__m128i abef_save, cdgh_save;
+
+	// Load initial hash values
+	// Need to reorder these appropriately
+	// DCBA, HGFE -> ABEF, CDGH
+	tmp = _mm_loadu_si128((__m128i*) digest);
+	state1 = _mm_loadu_si128((__m128i*) (digest+4));
+
+	tmp = _mm_shuffle_epi32(tmp, 0xB1);          // CDAB
+	state1 = _mm_shuffle_epi32(state1, 0x1B);    // EFGH
+	state0 = _mm_alignr_epi8(tmp, state1, 8);    // ABEF
+	state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH
+
+	shuf_mask = _mm_set_epi64x(0x0c0d0e0f08090a0bull, 0x0405060700010203ull);
+
+	while (num_blks > 0) {
+		// Save hash values for addition after rounds
+		abef_save = state0;
+		cdgh_save = state1;
+
+		// Rounds 0-3
+		msg = _mm_loadu_si128((const __m128i*) data);
+		msgtmp0 = _mm_shuffle_epi8(msg, shuf_mask);
+		msg = _mm_add_epi32(msgtmp0,
+		    _mm_set_epi64x(0xE9B5DBA5B5C0FBCFull, 0x71374491428A2F98ull));
+		state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+		msg = _mm_shuffle_epi32(msg, 0x0E);
+		state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+		// Rounds 4-7
+		msgtmp1 = _mm_loadu_si128((const __m128i*) (data+16));
+		msgtmp1 = _mm_shuffle_epi8(msgtmp1, shuf_mask);
+		msg = _mm_add_epi32(msgtmp1,
+		    _mm_set_epi64x(0xAB1C5ED5923F82A4ull, 0x59F111F13956C25Bull));
+		state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+		msg = _mm_shuffle_epi32(msg, 0x0E);
+		state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+		msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+		// Rounds 8-11
+		msgtmp2 = _mm_loadu_si128((const __m128i*) (data+32));
+		msgtmp2 = _mm_shuffle_epi8(msgtmp2, shuf_mask);
+		msg = _mm_add_epi32(msgtmp2,
+		    _mm_set_epi64x(0x550C7DC3243185BEull, 0x12835B01D807AA98ull));
+		state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+		msg = _mm_shuffle_epi32(msg, 0x0E);
+		state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+		msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+		// Rounds 12-15
+		msgtmp3 = _mm_loadu_si128((const __m128i*) (data+48));
+		msgtmp3 = _mm_shuffle_epi8(msgtmp3, shuf_mask);
+		msg = _mm_add_epi32(msgtmp3,
+		    _mm_set_epi64x(0xC19BF1749BDC06A7ull, 0x80DEB1FE72BE5D74ull));
+		state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+		tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+		msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+		msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+		msg = _mm_shuffle_epi32(msg, 0x0E);
+		state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+		msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+		// Rounds 16-19
+		msg = _mm_add_epi32(msgtmp0,
+		    _mm_set_epi64x(0x240CA1CC0FC19DC6ull, 0xEFBE4786E49B69C1ull));
+		state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+		tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+		msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+		msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+		msg = _mm_shuffle_epi32(msg, 0x0E);
+		state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+		msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+		// Rounds 20-23
+		msg = _mm_add_epi32(msgtmp1,
+		    _mm_set_epi64x(0x76F988DA5CB0A9DCull,
0x4A7484AA2DE92C6Full)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4); + msgtmp2 = _mm_add_epi32(msgtmp2, tmp); + msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1); + + // Rounds 24-27 + msg = _mm_add_epi32(msgtmp2, + _mm_set_epi64x(0xBF597FC7B00327C8ull, 0xA831C66D983E5152ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4); + msgtmp3 = _mm_add_epi32(msgtmp3, tmp); + msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2); + + // Rounds 28-31 + msg = _mm_add_epi32(msgtmp3, + _mm_set_epi64x(0x1429296706CA6351ull, 0xD5A79147C6E00BF3ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4); + msgtmp0 = _mm_add_epi32(msgtmp0, tmp); + msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3); + + // Rounds 32-35 + msg = _mm_add_epi32(msgtmp0, + _mm_set_epi64x(0x53380D134D2C6DFCull, 0x2E1B213827B70A85ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4); + msgtmp1 = _mm_add_epi32(msgtmp1, tmp); + msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0); + + // Rounds 36-39 + msg = _mm_add_epi32(msgtmp1, + _mm_set_epi64x(0x92722C8581C2C92Eull, 0x766A0ABB650A7354ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4); + msgtmp2 = _mm_add_epi32(msgtmp2, tmp); + msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1); + + // Rounds 40-43 + msg = _mm_add_epi32(msgtmp2, + _mm_set_epi64x(0xC76C51A3C24B8B70ull, 0xA81A664BA2BFE8A1ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4); + msgtmp3 = _mm_add_epi32(msgtmp3, tmp); + msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2); + + // Rounds 44-47 + msg = _mm_add_epi32(msgtmp3, + _mm_set_epi64x(0x106AA070F40E3585ull, 0xD6990624D192E819ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4); + msgtmp0 = _mm_add_epi32(msgtmp0, tmp); + msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3); + + // Rounds 48-51 + msg = _mm_add_epi32(msgtmp0, + _mm_set_epi64x(0x34B0BCB52748774Cull, 0x1E376C0819A4C116ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4); + msgtmp1 = _mm_add_epi32(msgtmp1, tmp); + msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0); + + 
// Rounds 52-55 + msg = _mm_add_epi32(msgtmp1, + _mm_set_epi64x(0x682E6FF35B9CCA4Full, 0x4ED8AA4A391C0CB3ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4); + msgtmp2 = _mm_add_epi32(msgtmp2, tmp); + msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Rounds 56-59 + msg = _mm_add_epi32(msgtmp2, + _mm_set_epi64x(0x8CC7020884C87814ull, 0x78A5636F748F82EEull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4); + msgtmp3 = _mm_add_epi32(msgtmp3, tmp); + msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Rounds 60-63 + msg = _mm_add_epi32(msgtmp3, + _mm_set_epi64x(0xC67178F2BEF9A3F7ull, 0xA4506CEB90BEFFFAull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Add current hash values with previously saved + state0 = _mm_add_epi32(state0, abef_save); + state1 = _mm_add_epi32(state1, cdgh_save); + + data += 64; + num_blks--; + } + + // Write hash values back in the correct order + tmp = _mm_shuffle_epi32(state0, 0x1B); // FEBA + state1 = _mm_shuffle_epi32(state1, 0xB1); // DCHG + state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA + state1 = _mm_alignr_epi8(state1, tmp, 8); // ABEF + + _mm_store_si128((__m128i*) digest, state0); + _mm_store_si128((__m128i*) (digest+4), state1); +} + Index: sys/crypto/sha_sse/sha_sse.h =================================================================== --- /dev/null +++ sys/crypto/sha_sse/sha_sse.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2017 Conrad Meyer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _CRYPTO__SHA_SSE_H_ +#define _CRYPTO__SHA_SSE_H_ + +/* + * Internal functions, implemented in intrinsics. 
+ */
+void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks);
+void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks);
+
+#endif /* _CRYPTO__SHA_SSE_H_ */
Index: sys/crypto/sha_sse/sha_sse.c
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/sha_sse.c
@@ -0,0 +1,613 @@
+/*-
+ * Copyright (c) 2005-2008 Pawel Jakub Dawidek
+ * Copyright (c) 2010 Konstantin Belousov
+ * Copyright (c) 2014 The FreeBSD Foundation
+ * Copyright (c) 2017 Conrad Meyer
+ * All rights reserved.
+ *
+ * Portions of this software were developed by John-Mark Gurney
+ * under sponsorship of the FreeBSD Foundation and
+ * Rubicon Communications, LLC (Netgate).
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <crypto/sha1.h>
+#include <crypto/sha2/sha256.h>
+#include <crypto/sha_sse/sha_sse.h>
+
+#include <opencrypto/cryptodev.h>
+#include <opencrypto/xform_auth.h>
+#include <cryptodev_if.h>
+
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+#if defined(__i386__)
+#include <machine/npx.h>
+#elif defined(__amd64__)
+#include <machine/fpu.h>
+#endif
+static struct mtx_padalign *ctx_mtx;
+static struct fpu_kern_ctx **ctx_fpu;
+
+struct sha_sse_softc {
+	int dieing;
+	int32_t cid;
+	uint32_t sid;
+	struct rwlock lock;
+	TAILQ_HEAD(sha_sse_sessions_head, sha_sse_session) sessions;
+};
+
+struct sha_sse_session {
+	/* Same as the SHA256 Blocksize.
*/ + uint8_t key[SHA1_HMAC_BLOCK_LEN] __aligned(16); + int algo; + int used; + int mlen; + uint32_t id; + TAILQ_ENTRY(sha_sse_session) next; +}; + +#define ACQUIRE_CTX(i, ctx) \ + do { \ + (i) = PCPU_GET(cpuid); \ + mtx_lock(&ctx_mtx[(i)]); \ + (ctx) = ctx_fpu[(i)]; \ + } while (0) +#define RELEASE_CTX(i, ctx) \ + do { \ + mtx_unlock(&ctx_mtx[(i)]); \ + (i) = -1; \ + (ctx) = NULL; \ + } while (0) + +static int sha_sse_newsession(device_t, uint32_t *sidp, struct cryptoini *cri); +static int sha_sse_freesession(device_t, uint64_t tid); +static void sha_sse_freesession_locked(struct sha_sse_softc *sc, + struct sha_sse_session *ses); +static int sha_sse_cipher_setup(struct sha_sse_session *ses, + struct cryptoini *encini); +static int sha_sse_cipher_process(struct sha_sse_session *ses, + struct cryptop *crp); + +MALLOC_DEFINE(M_SHA_SSE, "sha_sse_data", "SHA_SSE Data"); + +static void +sha_sse_identify(driver_t *drv, device_t parent) +{ + + /* NB: order 10 is so we get attached after h/w devices */ + if (device_find_child(parent, "sha_sse", -1) == NULL && + BUS_ADD_CHILD(parent, 10, "sha_sse", -1) == 0) + panic("sha_sse: could not attach"); +} + +static int +sha_sse_probe(device_t dev) +{ + + if ((cpu_stdext_feature & CPUID_STDEXT_SHA) == 0) { + device_printf(dev, "No SHA support.\n"); + return (EINVAL); + } + if ((cpu_feature2 & CPUID2_SSSE3) == 0) { + device_printf(dev, "No SSSE3 support.\n"); + return (EINVAL); + } + + device_set_desc_copy(dev, "SHA1,SHA2"); + return (0); +} + +static void +sha_sse_cleanctx(void) +{ + int i; + + /* XXX - no way to return driverid */ + CPU_FOREACH(i) { + if (ctx_fpu[i] != NULL) { + mtx_destroy(&ctx_mtx[i]); + fpu_kern_free_ctx(ctx_fpu[i]); + } + ctx_fpu[i] = NULL; + } + free(ctx_mtx, M_SHA_SSE); + ctx_mtx = NULL; + free(ctx_fpu, M_SHA_SSE); + ctx_fpu = NULL; +} + +static int +sha_sse_attach(device_t dev) +{ + struct sha_sse_softc *sc; + int i; + + sc = device_get_softc(dev); + sc->dieing = 0; + TAILQ_INIT(&sc->sessions); + sc->sid = 1; + + sc->cid = crypto_get_driverid(dev, CRYPTOCAP_F_HARDWARE | + CRYPTOCAP_F_SYNC); + if (sc->cid < 0) { + device_printf(dev, "Could not get crypto driver id.\n"); + return (ENOMEM); + } + + ctx_mtx = malloc(sizeof(*ctx_mtx) * (mp_maxid + 1), M_SHA_SSE, + M_WAITOK | M_ZERO); + ctx_fpu = malloc(sizeof(*ctx_fpu) * (mp_maxid + 1), M_SHA_SSE, + M_WAITOK | M_ZERO); + + CPU_FOREACH(i) { + ctx_fpu[i] = fpu_kern_alloc_ctx(0); + mtx_init(&ctx_mtx[i], "shafpumtx", NULL, MTX_DEF | MTX_NEW); + } + + rw_init(&sc->lock, "sha_sse_lock"); + crypto_register(sc->cid, CRYPTO_SHA1, 0, 0); + crypto_register(sc->cid, CRYPTO_SHA1_HMAC, 0, 0); + crypto_register(sc->cid, CRYPTO_SHA2_256_HMAC, 0, 0); + return (0); +} + +static int +sha_sse_detach(device_t dev) +{ + struct sha_sse_softc *sc; + struct sha_sse_session *ses; + + sc = device_get_softc(dev); + + rw_wlock(&sc->lock); + TAILQ_FOREACH(ses, &sc->sessions, next) { + if (ses->used) { + rw_wunlock(&sc->lock); + device_printf(dev, + "Cannot detach, sessions still active.\n"); + return (EBUSY); + } + } + sc->dieing = 1; + while ((ses = TAILQ_FIRST(&sc->sessions)) != NULL) { + TAILQ_REMOVE(&sc->sessions, ses, next); + free(ses, M_SHA_SSE); + } + rw_wunlock(&sc->lock); + crypto_unregister_all(sc->cid); + + rw_destroy(&sc->lock); + + sha_sse_cleanctx(); + + return (0); +} + +static int +sha_sse_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri) +{ + struct sha_sse_softc *sc; + struct sha_sse_session *ses; + struct cryptoini *encini; + int error; + + if (sidp == NULL || cri == NULL) { + 
CRYPTDEB("no sidp or cri"); + return (EINVAL); + } + + sc = device_get_softc(dev); + if (sc->dieing) + return (EINVAL); + + ses = NULL; + encini = NULL; + for (; cri != NULL; cri = cri->cri_next) { + switch (cri->cri_alg) { + case CRYPTO_SHA1: + case CRYPTO_SHA1_HMAC: + case CRYPTO_SHA2_256_HMAC: + if (encini != NULL) { + CRYPTDEB("encini already set"); + return (EINVAL); + } + encini = cri; + break; + default: + CRYPTDEB("unhandled algorithm"); + return (EINVAL); + } + } + if (encini == NULL) { + CRYPTDEB("no cipher"); + return (EINVAL); + } + + rw_wlock(&sc->lock); + if (sc->dieing) { + rw_wunlock(&sc->lock); + return (EINVAL); + } + /* + * Free sessions goes first, so if first session is used, we need to + * allocate one. + */ + ses = TAILQ_FIRST(&sc->sessions); + if (ses == NULL || ses->used) { + ses = malloc(sizeof(*ses), M_SHA_SSE, M_NOWAIT | M_ZERO); + if (ses == NULL) { + rw_wunlock(&sc->lock); + return (ENOMEM); + } + ses->id = sc->sid++; + } else { + TAILQ_REMOVE(&sc->sessions, ses, next); + } + ses->used = 1; + TAILQ_INSERT_TAIL(&sc->sessions, ses, next); + rw_wunlock(&sc->lock); + ses->algo = encini->cri_alg; + + error = sha_sse_cipher_setup(ses, encini); + if (error != 0) { + CRYPTDEB("setup failed"); + rw_wlock(&sc->lock); + sha_sse_freesession_locked(sc, ses); + rw_wunlock(&sc->lock); + return (error); + } + + *sidp = ses->id; + return (0); +} + +static void +sha_sse_freesession_locked(struct sha_sse_softc *sc, struct sha_sse_session *ses) +{ + uint32_t sid; + + rw_assert(&sc->lock, RA_WLOCKED); + + sid = ses->id; + TAILQ_REMOVE(&sc->sessions, ses, next); + explicit_bzero(ses, sizeof(*ses)); + ses->id = sid; + TAILQ_INSERT_HEAD(&sc->sessions, ses, next); +} + +static int +sha_sse_freesession(device_t dev, uint64_t tid) +{ + struct sha_sse_softc *sc; + struct sha_sse_session *ses; + uint32_t sid; + + sc = device_get_softc(dev); + sid = ((uint32_t)tid) & 0xffffffff; + rw_wlock(&sc->lock); + TAILQ_FOREACH_REVERSE(ses, &sc->sessions, sha_sse_sessions_head, next) { + if (ses->id == sid) + break; + } + if (ses == NULL) { + rw_wunlock(&sc->lock); + return (EINVAL); + } + sha_sse_freesession_locked(sc, ses); + rw_wunlock(&sc->lock); + return (0); +} + +static int +sha_sse_process(device_t dev, struct cryptop *crp, int hint __unused) +{ + struct sha_sse_softc *sc = device_get_softc(dev); + struct sha_sse_session *ses = NULL; + int error; + + error = 0; + + /* Sanity check. 
*/
+	if (crp == NULL)
+		return (EINVAL);
+
+	if (crp->crp_callback == NULL || crp->crp_desc == NULL) {
+		error = EINVAL;
+		goto out;
+	}
+
+	rw_rlock(&sc->lock);
+	TAILQ_FOREACH_REVERSE(ses, &sc->sessions, sha_sse_sessions_head, next) {
+		if (ses->id == (crp->crp_sid & 0xffffffff))
+			break;
+	}
+	rw_runlock(&sc->lock);
+	if (ses == NULL) {
+		error = EINVAL;
+		goto out;
+	}
+
+	error = sha_sse_cipher_process(ses, crp);
+	if (error != 0)
+		goto out;
+
+out:
+	crp->crp_etype = error;
+	crypto_done(crp);
+	return (error);
+}
+
+static device_method_t sha_sse_methods[] = {
+	DEVMETHOD(device_identify, sha_sse_identify),
+	DEVMETHOD(device_probe, sha_sse_probe),
+	DEVMETHOD(device_attach, sha_sse_attach),
+	DEVMETHOD(device_detach, sha_sse_detach),
+
+	DEVMETHOD(cryptodev_newsession, sha_sse_newsession),
+	DEVMETHOD(cryptodev_freesession, sha_sse_freesession),
+	DEVMETHOD(cryptodev_process, sha_sse_process),
+
+	{0, 0},
+};
+
+static driver_t sha_sse_driver = {
+	"sha_sse",
+	sha_sse_methods,
+	sizeof(struct sha_sse_softc),
+};
+static devclass_t sha_sse_devclass;
+
+DRIVER_MODULE(sha_sse, nexus, sha_sse_driver, sha_sse_devclass, 0, 0);
+MODULE_VERSION(sha_sse, 1);
+MODULE_DEPEND(sha_sse, crypto, 1, 1, 1);
+
+static int
+sha_sse_cipher_setup(struct sha_sse_session *ses, struct cryptoini *encini)
+{
+	int error, keylen;
+
+	error = 0;
+	keylen = encini->cri_klen / 8;
+	if (keylen > sizeof(ses->key)) {
+		printf("%s: keylen:%d ses->key:%zu\n", __func__, keylen, sizeof(ses->key));
+		error = EINVAL;
+		goto out;
+	}
+	if (ses->algo == CRYPTO_SHA1 && keylen > 0) {
+		printf("%s: algo:%d keylen:%d\n", __func__, ses->algo, keylen);
+		error = EINVAL;
+		goto out;
+	}
+
+	memcpy(ses->key, encini->cri_key, keylen);
+	ses->mlen = encini->cri_mlen;
+
+out:
+	return (error);
+}
+
+static int
+intel_sha1_update(void *vctx, const void *vdata, u_int datalen)
+{
+	struct sha1_ctxt *ctx = vctx;
+	const char *data = vdata;
+	size_t gaplen;
+	size_t gapstart;
+	size_t off;
+	size_t copysiz;
+	u_int blocks;
+
+	off = 0;
+	/* Do any aligned blocks without redundant copying. */
+	if (datalen >= 64 && ctx->count % 64 == 0) {
+		blocks = datalen / 64;
+		ctx->c.b64[0] += blocks * 64 * 8;
+		intel_sha1_step(ctx->h.b32, data + off, blocks);
+		off += blocks * 64;
+	}
+
+	while (off < datalen) {
+		gapstart = ctx->count % 64;
+		gaplen = 64 - gapstart;
+
+		copysiz = (gaplen < datalen - off) ?
gaplen : datalen - off; + bcopy(&data[off], &ctx->m.b8[gapstart], copysiz); + ctx->count += copysiz; + ctx->count %= 64; + ctx->c.b64[0] += copysiz * 8; + if (ctx->count % 64 == 0) + intel_sha1_step(ctx->h.b32, (void *)ctx->m.b8, 1); + off += copysiz; + } + return (0); +} + +static void +SHA1_Finalize_fn(void *digest, void *ctx) +{ + sha1_result(ctx, digest); +} + +static int +intel_sha256_update(void *vctx, const void *vdata, u_int len) +{ + SHA256_CTX *ctx = vctx; + uint64_t bitlen; + uint32_t r; + u_int blocks; + const unsigned char *src = vdata; + + /* Number of bytes left in the buffer from previous updates */ + r = (ctx->count >> 3) & 0x3f; + + /* Convert the length into a number of bits */ + bitlen = len << 3; + + /* Update number of bits */ + ctx->count += bitlen; + + /* Handle the case where we don't need to perform any transforms */ + if (len < 64 - r) { + memcpy(&ctx->buf[r], src, len); + return (0); + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, 64 - r); + intel_sha256_step(ctx->state, ctx->buf, 1); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks */ + if (len >= 64) { + blocks = len / 64; + intel_sha256_step(ctx->state, src, blocks); + src += blocks * 64; + len -= blocks * 64; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); + return (0); +} + +static void +SHA256_Finalize_fn(void *digest, void *ctx) +{ + SHA256_Final(digest, ctx); +} + +/* + * Compute the HASH( (key ^ xorbyte) || buf ) + */ +static void +hmac_internal(void *ctx, uint32_t *res, + int (*update)(void *, const void *, u_int), + void (*finalize)(void *, void *), uint8_t *key, uint8_t xorbyte, + const void *buf, size_t off, size_t buflen, int crpflags) +{ + size_t i; + + for (i = 0; i < 64; i++) + key[i] ^= xorbyte; + update(ctx, key, 64); + for (i = 0; i < 64; i++) + key[i] ^= xorbyte; + + crypto_apply(crpflags, __DECONST(void *, buf), off, buflen, + __DECONST(int (*)(void *, void *, u_int), update), ctx); + finalize(res, ctx); +} + +static int +sha_sse_cipher_process(struct sha_sse_session *ses, struct cryptop *crp) +{ + struct SHA256Context s2ctx; + struct sha1_ctxt sctx __aligned(16); + uint32_t res[8]; + struct fpu_kern_ctx *ctx; + struct cryptodesc *crd; + int error, kt, ctxidx, hashlen; + + error = 0; + + kt = is_fpu_kern_thread(0); + if (!kt) { + ACQUIRE_CTX(ctxidx, ctx); + error = fpu_kern_enter(curthread, ctx, + FPU_KERN_NORMAL|FPU_KERN_KTHR); + if (error != 0) + goto out2; + } + + crd = crp->crp_desc; + if (crd->crd_next != NULL || crd->crd_flags != 0) { + error = EINVAL; + goto out; + } + + switch (ses->algo) { + case CRYPTO_SHA1_HMAC: + hashlen = SHA1_HASH_LEN; + /* Inner hash: (K ^ IPAD) || data */ + sha1_init(&sctx); + hmac_internal(&sctx, res, intel_sha1_update, SHA1_Finalize_fn, + ses->key, 0x36, crp->crp_buf, crd->crd_skip, crd->crd_len, + crp->crp_flags); + /* Outer hash: (K ^ OPAD) || inner hash */ + sha1_init(&sctx); + hmac_internal(&sctx, res, intel_sha1_update, SHA1_Finalize_fn, + ses->key, 0x5C, res, 0, hashlen, 0); + break; + case CRYPTO_SHA1: + hashlen = SHA1_HASH_LEN; + sha1_init(&sctx); + crypto_apply(crp->crp_flags, crp->crp_buf, crd->crd_skip, + crd->crd_len, __DECONST(int (*)(void *, void *, u_int), + intel_sha1_update), &sctx); + sha1_result(&sctx, (void *)res); + break; + case CRYPTO_SHA2_256_HMAC: + hashlen = SHA2_256_HASH_LEN; + /* Inner hash: (K ^ IPAD) || data */ + SHA256_Init(&s2ctx); + hmac_internal(&s2ctx, res, intel_sha256_update, + SHA256_Finalize_fn, ses->key, 0x36, crp->crp_buf, + crd->crd_skip, crd->crd_len, 
crp->crp_flags);
+		/* Outer hash: (K ^ OPAD) || inner hash */
+		SHA256_Init(&s2ctx);
+		hmac_internal(&s2ctx, res, intel_sha256_update,
+		    SHA256_Finalize_fn, ses->key, 0x5C, res, 0, hashlen, 0);
+		break;
+	}
+
+	if (ses->mlen != 0 && ses->mlen < hashlen)
+		hashlen = ses->mlen;
+	crypto_copyback(crp->crp_flags, crp->crp_buf, crd->crd_inject, hashlen,
+	    (void *)res);
+
+out:
+	if (!kt) {
+		fpu_kern_leave(curthread, ctx);
+out2:
+		RELEASE_CTX(ctxidx, ctx);
+	}
+	return (error);
+}
Index: sys/i386/conf/NOTES
===================================================================
--- sys/i386/conf/NOTES
+++ sys/i386/conf/NOTES
@@ -839,6 +839,7 @@
 device		padlock_rng	# VIA Padlock RNG
 device		rdrand_rng	# Intel Bull Mountain RNG
 device		aesni		# AES-NI OpenCrypto module
+device		sha_sse		# SHA SSE extension OpenCrypto module
 
 #
 # Laptop/Notebook options:
Index: sys/modules/Makefile
===================================================================
--- sys/modules/Makefile
+++ sys/modules/Makefile
@@ -351,6 +351,7 @@
 	sge \
 	${_sgx} \
 	${_sgx_linux} \
+	${_sha_sse} \
 	siba_bwn \
 	siftr \
 	siis \
@@ -625,6 +626,7 @@
 _acpi=		acpi
 .if ${MK_CRYPT} != "no" || defined(ALL_MODULES)
 _aesni=		aesni
+_sha_sse=	sha_sse
 .endif
 _amd_ecc_inject=amd_ecc_inject
 _amdsbwd=	amdsbwd
Index: sys/modules/sha_sse/Makefile
===================================================================
--- /dev/null
+++ sys/modules/sha_sse/Makefile
@@ -0,0 +1,25 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/crypto/sha_sse
+.PATH: ${SRCTOP}/contrib/llvm/tools/clang/lib/Headers
+
+KMOD=	sha_sse
+SRCS=	sha_sse.c
+SRCS+=	device_if.h bus_if.h opt_bus.h cryptodev_if.h
+
+OBJS+=	intel_sha1.o intel_sha256.o
+
+# Remove -nostdinc so we can get the intrinsics.
+intel_sha1.o: intel_sha1.c
+	${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+	    -mmmx -msse -msse4 -msha ${.IMPSRC}
+	${CTFCONVERT_CMD}
+intel_sha256.o: intel_sha256.c
+	${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+	    -mmmx -msse -msse4 -msha ${.IMPSRC}
+	${CTFCONVERT_CMD}
+
+intel_sha1.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
+intel_sha256.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
+
+.include <bsd.kmod.mk>
Index: tests/sys/opencrypto/cryptotest.py
===================================================================
--- tests/sys/opencrypto/cryptotest.py
+++ tests/sys/opencrypto/cryptotest.py
@@ -46,7 +46,7 @@
 aesmodules = [ 'cryptosoft0', 'aesni0', 'ccr0' ]
 desmodules = [ 'cryptosoft0', ]
 
-shamodules = [ 'cryptosoft0', 'ccr0' ]
+shamodules = [ 'cryptosoft0', 'ccr0', 'sha_sse0' ]
 
 def GenTestCase(cname):
     try:
@@ -308,6 +308,7 @@
 cryptosoft = GenTestCase('cryptosoft0')
 aesni = GenTestCase('aesni0')
 ccr = GenTestCase('ccr0')
+sha_sse = GenTestCase('sha_sse0')
 
 if __name__ == '__main__':
     unittest.main()
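A closing note on the intel_sha1_step()/intel_sha256_step() contract spelled out in the Intel header comments: the functions consume only whole, pre-padded 64-byte blocks, so all padding and initial-value setup is the caller's responsibility (intel_sha1_update() and intel_sha256_update() above do exactly that buffering). The sketch below is a hypothetical standalone harness, not part of the patch, that pads the classic "abc" test vector into a single block and calls the SHA-1 step; built outside the kernel with the same -mmmx -msse -msse4 -msha flags the module Makefile uses, it should print the well-known digest a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d.

/*
 * Hypothetical harness for the one-block case: the caller pads the message
 * and seeds the digest; intel_sha1_step() only runs the compression loop.
 * Example build (file names assumed): cc -mmmx -msse -msse4 -msha \
 *     sha1_step_demo.c intel_sha1.c
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks);

int
main(void)
{
	/* Standard SHA-1 initial state, one word per digest element. */
	uint32_t digest[5] = {
		0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0
	};
	char block[64];

	/* "abc" padded to one 64-byte block: 0x80, zeros, 64-bit bit length. */
	memset(block, 0, sizeof(block));
	memcpy(block, "abc", 3);
	block[3] = (char)0x80;
	block[63] = 24;		/* 3 bytes * 8 = 24 bits, big-endian length */

	intel_sha1_step(digest, block, 1);

	/* Expect a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d. */
	for (int i = 0; i < 5; i++)
		printf("%08x ", digest[i]);
	printf("\n");
	return (0);
}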