Page MenuHomeFreeBSD

D12452.id33338.diff
No OneTemporary

D12452.id33338.diff

Index: share/man/man4/sha_sse.4
===================================================================
--- /dev/null
+++ share/man/man4/sha_sse.4
@@ -0,0 +1,81 @@
+.\" Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd September 21, 2017
+.Dt SHA_SSE 4
+.Os
+.Sh NAME
+.Nm sha_sse
+.Nd "driver for the SHA accelerator on x86 CPUs"
+.Sh SYNOPSIS
+To compile this driver into the kernel,
+place the following lines in your
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device crypto"
+.Cd "device cryptodev"
+.Cd "device sha_sse"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+sha_sse_load="YES"
+.Ed
+.Sh DESCRIPTION
+Starting with the Intel Goldmont and AMD Ryzen microarchitectures, some x86
+processors implement a new set of SHA instructions.
+The set of seven instructions accelerates the calculation of SHA1 and SHA256
+hashes.
+.Pp
+The processor capability is reported as SHA in the Structured Extended Features
+line at boot.
+The
+.Nm
+driver does not attach on systems that lack the required CPU capability.
+.Pp
+The
+.Nm
+driver registers itself to accelerate SHA operations for
+.Xr crypto 4 .
+.Sh SEE ALSO
+.Xr crypto 4 ,
+.Xr intro 4 ,
+.Xr ipsec 4 ,
+.Xr crypto 9
+.Sh HISTORY
+The
+.Nm
+driver first appeared in
+.Fx 12.0 .
+.Sh AUTHORS
+.An -nosplit
+The
+.Nm
+driver was written by
+.An Conrad Meyer Aq Mt cem@FreeBSD.org .
+The hash step intrinsics implementations were supplied by Intel.
Index: sys/amd64/conf/NOTES
===================================================================
--- sys/amd64/conf/NOTES
+++ sys/amd64/conf/NOTES
@@ -544,6 +544,7 @@
device padlock_rng # VIA Padlock RNG
device rdrand_rng # Intel Bull Mountain RNG
device aesni # AES-NI OpenCrypto module
+device sha_sse # SHA SSE extension OpenCrypto module
device ioat # Intel I/OAT DMA engine
#
Index: sys/conf/files.amd64
===================================================================
--- sys/conf/files.amd64
+++ sys/conf/files.amd64
@@ -182,6 +182,17 @@
crypto/blowfish/bf_enc.c optional crypto | ipsec | ipsec_support
crypto/des/des_enc.c optional crypto | ipsec | \
ipsec_support | netsmb
+intel_sha1.o optional sha_sse \
+ dependency "$S/crypto/sha_sse/intel_sha1.c" \
+ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+ no-implicit-rule \
+ clean "intel_sha1.o"
+intel_sha256.o optional sha_sse \
+ dependency "$S/crypto/sha_sse/intel_sha256.c" \
+ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+ no-implicit-rule \
+ clean "intel_sha256.o"
+crypto/sha_sse/sha_sse.c optional sha_sse
crypto/via/padlock.c optional padlock
crypto/via/padlock_cipher.c optional padlock
crypto/via/padlock_hash.c optional padlock
Index: sys/conf/files.i386
===================================================================
--- sys/conf/files.i386
+++ sys/conf/files.i386
@@ -132,6 +132,17 @@
no-implicit-rule \
clean "aesni_wrap.o"
crypto/des/arch/i386/des_enc.S optional crypto | ipsec | ipsec_support | netsmb
+intel_sha1.o optional sha_sse \
+ dependency "$S/crypto/sha_sse/intel_sha1.c" \
+ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+ no-implicit-rule \
+ clean "intel_sha1.o"
+intel_sha256.o optional sha_sse \
+ dependency "$S/crypto/sha_sse/intel_sha256.c" \
+ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+ no-implicit-rule \
+ clean "intel_sha256.o"
+crypto/sha_sse/sha_sse.c optional sha_sse
crypto/via/padlock.c optional padlock
crypto/via/padlock_cipher.c optional padlock
crypto/via/padlock_hash.c optional padlock
Index: sys/crypto/sha_sse/intel_sha1.c
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/intel_sha1.c
@@ -0,0 +1,258 @@
+/*******************************************************************************
+* Copyright (c) 2013, Intel Corporation
+*
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met:
+*
+* * Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* * Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* * Neither the name of the Intel Corporation nor the names of its
+* contributors may be used to endorse or promote products derived from
+* this software without specific prior written permission.
+*
+*
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************************
+*
+* Intel SHA Extensions optimized implementation of a SHA-1 update function
+*
+* The function takes a pointer to the current hash values, a pointer to the
+* input data, and a number of 64 byte blocks to process. Once all blocks have
+* been processed, the digest pointer is updated with the resulting hash value.
+* The function only processes complete blocks, there is no functionality to
+* store partial blocks. All message padding and hash value initialization must
+* be done outside the update function.
+*
+* The indented lines in the loop are instructions related to rounds processing.
+* The non-indented lines are instructions related to the message schedule.
+*
+* Author: Sean Gulley <sean.m.gulley@intel.com>
+* Date: July 2013
+*
+********************************************************************************
+*
+* Example compiler command line:
+* icc intel_sha_extensions_sha1_intrinsic.c
+* gcc -msha -msse4 intel_sha_extensions_sha1_intrinsic.c
+*
+*******************************************************************************/
+#include <sys/types.h>
+#include <immintrin.h>
+
+#include <crypto/sha_sse/sha_sse.h>
+
+void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks) {
+ __m128i abcd, e0, e1;
+ __m128i abcd_save, e_save;
+ __m128i msg0, msg1, msg2, msg3;
+ __m128i shuf_mask, e_mask;
+
+#if 0 /* disabled alternative: clear e0's unused lanes with a mask */
+ e_mask = _mm_set_epi64x(0xFFFFFFFF00000000ull, 0x0000000000000000ull);
+#else
+ (void)e_mask;
+ e0 = _mm_set_epi64x(0, 0);
+#endif
+ shuf_mask = _mm_set_epi64x(0x0001020304050607ull, 0x08090a0b0c0d0e0full); // reverses all 16 bytes: big-endian message words in reversed word order
+
+ // Load initial hash values: abcd = digest[0..3], e in lane 3 of e0
+ abcd = _mm_loadu_si128((__m128i*) digest);
+ e0 = _mm_insert_epi32(e0, *(digest+4), 3);
+ abcd = _mm_shuffle_epi32(abcd, 0x1B); // 0x1B reverses dword order into the lane layout the SHA instructions expect
+#if 0
+ e0 = _mm_and_si128(e0, e_mask);
+#endif
+
+ while (num_blks > 0) { // one 64-byte block per iteration
+ // Save hash values for addition after rounds
+ abcd_save = abcd;
+ e_save = e0;
+
+ // Rounds 0-3
+ msg0 = _mm_loadu_si128((const __m128i*) data);
+ msg0 = _mm_shuffle_epi8(msg0, shuf_mask);
+ e0 = _mm_add_epi32(e0, msg0);
+ e1 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+
+ // Rounds 4-7
+ msg1 = _mm_loadu_si128((const __m128i*) (data+16));
+ msg1 = _mm_shuffle_epi8(msg1, shuf_mask);
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+
+ // Rounds 8-11
+ msg2 = _mm_loadu_si128((const __m128i*) (data+32));
+ msg2 = _mm_shuffle_epi8(msg2, shuf_mask);
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ // Rounds 12-15
+ msg3 = _mm_loadu_si128((const __m128i*) (data+48));
+ msg3 = _mm_shuffle_epi8(msg3, shuf_mask);
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
+ msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+ msg1 = _mm_xor_si128(msg1, msg3);
+
+ // Rounds 16-19
+ e0 = _mm_sha1nexte_epu32(e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+ msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+ msg2 = _mm_xor_si128(msg2, msg0);
+
+ // Rounds 20-23
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ // Rounds 24-27
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ // Rounds 28-31
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
+ msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+ msg1 = _mm_xor_si128(msg1, msg3);
+
+ // Rounds 32-35
+ e0 = _mm_sha1nexte_epu32(e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
+ msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+ msg2 = _mm_xor_si128(msg2, msg0);
+
+ // Rounds 36-39
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ // Rounds 40-43
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ // Rounds 44-47
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
+ msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+ msg1 = _mm_xor_si128(msg1, msg3);
+
+ // Rounds 48-51
+ e0 = _mm_sha1nexte_epu32(e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
+ msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+ msg2 = _mm_xor_si128(msg2, msg0);
+
+ // Rounds 52-55
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ // Rounds 56-59
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ // Rounds 60-63
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+ msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+ msg1 = _mm_xor_si128(msg1, msg3);
+
+ // Rounds 64-67
+ e0 = _mm_sha1nexte_epu32(e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
+ msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+ msg2 = _mm_xor_si128(msg2, msg0);
+
+ // Rounds 68-71
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ // Rounds 72-75
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
+
+ // Rounds 76-79
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+
+ // Add current hash values with previously saved
+ e0 = _mm_sha1nexte_epu32(e0, e_save);
+ abcd = _mm_add_epi32(abcd, abcd_save);
+
+ data += 64;
+ num_blks--;
+ }
+
+ abcd = _mm_shuffle_epi32(abcd, 0x1B); // undo the load-time dword reversal
+ _mm_store_si128((__m128i*) digest, abcd); // NOTE(review): aligned store but unaligned load above -- assumes digest is 16-byte aligned; confirm callers
+ *(digest+4) = _mm_extract_epi32(e0, 3); // digest[4] = updated e
+}
+
Index: sys/crypto/sha_sse/intel_sha256.c
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/intel_sha256.c
@@ -0,0 +1,274 @@
+/*******************************************************************************
+* Copyright (c) 2013, Intel Corporation
+*
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met:
+*
+* * Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* * Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* * Neither the name of the Intel Corporation nor the names of its
+* contributors may be used to endorse or promote products derived from
+* this software without specific prior written permission.
+*
+*
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************************
+*
+* Intel SHA Extensions optimized implementation of a SHA-256 update function
+*
+* The function takes a pointer to the current hash values, a pointer to the
+* input data, and a number of 64 byte blocks to process. Once all blocks have
+* been processed, the digest pointer is updated with the resulting hash value.
+* The function only processes complete blocks, there is no functionality to
+* store partial blocks. All message padding and hash value initialization must
+* be done outside the update function.
+*
+* The indented lines in the loop are instructions related to rounds processing.
+* The non-indented lines are instructions related to the message schedule.
+*
+* Author: Sean Gulley <sean.m.gulley@intel.com>
+* Date: July 2013
+*
+********************************************************************************
+*
+* Example compiler command line:
+* icc intel_sha_extensions_sha256_intrinsic.c
+* gcc -msha -msse4 intel_sha_extensions_sha256_intrinsic.c
+*
+*******************************************************************************/
+#include <stdint.h>
+#include <immintrin.h>
+
+#include <crypto/sha_sse/sha_sse.h>
+
+void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks) {
+ __m128i state0, state1;
+ __m128i msg;
+ __m128i msgtmp0, msgtmp1, msgtmp2, msgtmp3;
+ __m128i tmp;
+ __m128i shuf_mask;
+ __m128i abef_save, cdgh_save;
+
+ // Load initial hash values
+ // Need to reorder these appropriately
+ // DCBA, HGFE -> ABEF, CDGH
+ tmp = _mm_loadu_si128((__m128i*) digest);
+ state1 = _mm_loadu_si128((__m128i*) (digest+4));
+
+ tmp = _mm_shuffle_epi32(tmp, 0xB1); // CDAB
+ state1 = _mm_shuffle_epi32(state1, 0x1B); // EFGH
+ state0 = _mm_alignr_epi8(tmp, state1, 8); // ABEF
+ state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH
+
+ shuf_mask = _mm_set_epi64x(0x0c0d0e0f08090a0bull, 0x0405060700010203ull); // byte-swap within each 32-bit word (big-endian message load)
+
+ while (num_blks > 0) { // one 64-byte block per iteration
+ // Save hash values for addition after rounds
+ abef_save = state0;
+ cdgh_save = state1;
+
+ // Rounds 0-3
+ msg = _mm_loadu_si128((const __m128i*) data);
+ msgtmp0 = _mm_shuffle_epi8(msg, shuf_mask);
+ msg = _mm_add_epi32(msgtmp0,
+ _mm_set_epi64x(0xE9B5DBA5B5C0FBCFull, 0x71374491428A2F98ull)); // SHA-256 round constants K[3]..K[0]
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+ // Rounds 4-7
+ msgtmp1 = _mm_loadu_si128((const __m128i*) (data+16));
+ msgtmp1 = _mm_shuffle_epi8(msgtmp1, shuf_mask);
+ msg = _mm_add_epi32(msgtmp1,
+ _mm_set_epi64x(0xAB1C5ED5923F82A4ull, 0x59F111F13956C25Bull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+ // Rounds 8-11
+ msgtmp2 = _mm_loadu_si128((const __m128i*) (data+32));
+ msgtmp2 = _mm_shuffle_epi8(msgtmp2, shuf_mask);
+ msg = _mm_add_epi32(msgtmp2,
+ _mm_set_epi64x(0x550C7DC3243185BEull, 0x12835B01D807AA98ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+ // Rounds 12-15
+ msgtmp3 = _mm_loadu_si128((const __m128i*) (data+48));
+ msgtmp3 = _mm_shuffle_epi8(msgtmp3, shuf_mask);
+ msg = _mm_add_epi32(msgtmp3,
+ _mm_set_epi64x(0xC19BF1749BDC06A7ull, 0x80DEB1FE72BE5D74ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+ msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+ msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+ // Rounds 16-19
+ msg = _mm_add_epi32(msgtmp0,
+ _mm_set_epi64x(0x240CA1CC0FC19DC6ull, 0xEFBE4786E49B69C1ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+ msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+ msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+ // Rounds 20-23
+ msg = _mm_add_epi32(msgtmp1,
+ _mm_set_epi64x(0x76F988DA5CB0A9DCull, 0x4A7484AA2DE92C6Full));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
+ msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
+ msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+ // Rounds 24-27
+ msg = _mm_add_epi32(msgtmp2,
+ _mm_set_epi64x(0xBF597FC7B00327C8ull, 0xA831C66D983E5152ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
+ msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
+ msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+ // Rounds 28-31
+ msg = _mm_add_epi32(msgtmp3,
+ _mm_set_epi64x(0x1429296706CA6351ull, 0xD5A79147C6E00BF3ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+ msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+ msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+ // Rounds 32-35
+ msg = _mm_add_epi32(msgtmp0,
+ _mm_set_epi64x(0x53380D134D2C6DFCull, 0x2E1B213827B70A85ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+ msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+ msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+ // Rounds 36-39
+ msg = _mm_add_epi32(msgtmp1,
+ _mm_set_epi64x(0x92722C8581C2C92Eull, 0x766A0ABB650A7354ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
+ msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
+ msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+ // Rounds 40-43
+ msg = _mm_add_epi32(msgtmp2,
+ _mm_set_epi64x(0xC76C51A3C24B8B70ull, 0xA81A664BA2BFE8A1ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
+ msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
+ msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+ // Rounds 44-47
+ msg = _mm_add_epi32(msgtmp3,
+ _mm_set_epi64x(0x106AA070F40E3585ull, 0xD6990624D192E819ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+ msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+ msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+ // Rounds 48-51
+ msg = _mm_add_epi32(msgtmp0,
+ _mm_set_epi64x(0x34B0BCB52748774Cull, 0x1E376C0819A4C116ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+ msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+ msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+ // Rounds 52-55
+ msg = _mm_add_epi32(msgtmp1,
+ _mm_set_epi64x(0x682E6FF35B9CCA4Full, 0x4ED8AA4A391C0CB3ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
+ msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
+ msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+ // Rounds 56-59
+ msg = _mm_add_epi32(msgtmp2,
+ _mm_set_epi64x(0x8CC7020884C87814ull, 0x78A5636F748F82EEull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
+ msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
+ msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+ // Rounds 60-63
+ msg = _mm_add_epi32(msgtmp3,
+ _mm_set_epi64x(0xC67178F2BEF9A3F7ull, 0xA4506CEB90BEFFFAull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+ // Add current hash values with previously saved
+ state0 = _mm_add_epi32(state0, abef_save);
+ state1 = _mm_add_epi32(state1, cdgh_save);
+
+ data += 64;
+ num_blks--;
+ }
+
+ // Write hash values back in the correct order
+ tmp = _mm_shuffle_epi32(state0, 0x1B); // FEBA
+ state1 = _mm_shuffle_epi32(state1, 0xB1); // DCHG
+ state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA
+ state1 = _mm_alignr_epi8(state1, tmp, 8); // HGFE (not ABEF: stored as digest[4..7] = E,F,G,H)
+
+ _mm_store_si128((__m128i*) digest, state0); // NOTE(review): aligned stores but unaligned loads above -- assumes digest is 16-byte aligned; confirm callers
+ _mm_store_si128((__m128i*) (digest+4), state1);
+}
+
Index: sys/crypto/sha_sse/sha_sse.h
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/sha_sse.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _CRYPTO__SHA_SSE_H_
+#define _CRYPTO__SHA_SSE_H_
+
+/*
+ * Internal functions, implemented in intrinsics.
+ */
+void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks);
+void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks);
+
+#endif /* _CRYPTO__SHA_SSE_H_ */
Index: sys/crypto/sha_sse/sha_sse.c
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/sha_sse.c
@@ -0,0 +1,606 @@
+/*-
+ * Copyright (c) 2005-2008 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
+ * Copyright (c) 2014 The FreeBSD Foundation
+ * Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed by John-Mark Gurney
+ * under sponsorship of the FreeBSD Foundation and
+ * Rubicon Communications, LLC (Netgate).
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+#include <sys/rwlock.h>
+#include <sys/bus.h>
+#include <sys/uio.h>
+#include <sys/mbuf.h>
+#include <sys/smp.h>
+
+#include <crypto/sha1.h>
+#include <crypto/sha2/sha256.h>
+#include <crypto/sha_sse/sha_sse.h>
+
+#include <opencrypto/cryptodev.h>
+#include <opencrypto/gmac.h>
+#include <cryptodev_if.h>
+
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+#if defined(__i386__)
+#include <machine/npx.h>
+#elif defined(__amd64__)
+#include <machine/fpu.h>
+#endif
+/* Per-CPU lock protecting the corresponding entry of ctx_fpu[]. */
+static struct mtx_padalign *ctx_mtx;
+/* Per-CPU FPU kernel contexts, allocated in attach, freed in cleanctx. */
+static struct fpu_kern_ctx **ctx_fpu;
+
+/* Per-device soft state: session list and crypto framework bookkeeping. */
+struct sha_sse_softc {
+ int dieing;
+ int32_t cid;
+ uint32_t sid;
+ struct rwlock lock;
+ TAILQ_HEAD(sha_sse_sessions_head, sha_sse_session) sessions;
+};
+
+/* One opencrypto session; recycled via the softc session list. */
+struct sha_sse_session {
+ /* Same as the SHA256 Blocksize. */
+ uint8_t key[SHA1_HMAC_BLOCK_LEN] __aligned(16);
+ int algo;
+ int used;
+ int mlen;
+ uint32_t id;
+ TAILQ_ENTRY(sha_sse_session) next;
+};
+
+/*
+ * Pin to the current CPU's FPU context: taking the per-CPU mutex
+ * prevents migration-induced sharing of the save area.
+ */
+#define ACQUIRE_CTX(i, ctx) \
+ do { \
+ (i) = PCPU_GET(cpuid); \
+ mtx_lock(&ctx_mtx[(i)]); \
+ (ctx) = ctx_fpu[(i)]; \
+ } while (0)
+#define RELEASE_CTX(i, ctx) \
+ do { \
+ mtx_unlock(&ctx_mtx[(i)]); \
+ (i) = -1; \
+ (ctx) = NULL; \
+ } while (0)
+
+static int sha_sse_newsession(device_t, uint32_t *sidp, struct cryptoini *cri);
+static int sha_sse_freesession(device_t, uint64_t tid);
+static void sha_sse_freesession_locked(struct sha_sse_softc *sc,
+ struct sha_sse_session *ses);
+static int sha_sse_cipher_setup(struct sha_sse_session *ses,
+ struct cryptoini *encini);
+static int sha_sse_cipher_process(struct sha_sse_session *ses,
+ struct cryptop *crp);
+
+MALLOC_DEFINE(M_SHA_SSE, "sha_sse_data", "SHA_SSE Data");
+
+/*
+ * Bus identify method: add a single "sha_sse" child under the parent
+ * bus if one is not already present.
+ */
+static void
+sha_sse_identify(driver_t *drv, device_t parent)
+{
+
+ /* NB: order 10 is so we get attached after h/w devices */
+ if (device_find_child(parent, "sha_sse", -1) == NULL &&
+ BUS_ADD_CHILD(parent, 10, "sha_sse", -1) == 0)
+ panic("sha_sse: could not attach");
+}
+
+/*
+ * Probe: succeed only on CPUs advertising both the SHA extensions
+ * (CPUID.7.EBX bit) and SSSE3, which the intrinsic block functions
+ * require.
+ */
+static int
+sha_sse_probe(device_t dev)
+{
+
+ if ((cpu_stdext_feature & CPUID_STDEXT_SHA) == 0) {
+ device_printf(dev, "No SHA support.\n");
+ return (EINVAL);
+ }
+ if ((cpu_feature2 & CPUID2_SSSE3) == 0) {
+ device_printf(dev, "No SSSE3 support.\n");
+ return (EINVAL);
+ }
+
+ device_set_desc_copy(dev, "SHA1,SHA2");
+ return (0);
+}
+
+/*
+ * Tear down the per-CPU FPU contexts and their mutexes, then release
+ * the backing arrays.  Called from detach; assumes no concurrent
+ * crypto requests remain.
+ */
+static void
+sha_sse_cleanctx(void)
+{
+ int i;
+
+ /* XXX - no way to return driverid */
+ CPU_FOREACH(i) {
+ if (ctx_fpu[i] != NULL) {
+ mtx_destroy(&ctx_mtx[i]);
+ fpu_kern_free_ctx(ctx_fpu[i]);
+ }
+ ctx_fpu[i] = NULL;
+ }
+ free(ctx_mtx, M_SHA_SSE);
+ ctx_mtx = NULL;
+ free(ctx_fpu, M_SHA_SSE);
+ ctx_fpu = NULL;
+}
+
+/*
+ * Attach: register with the crypto framework, allocate one FPU kernel
+ * context and mutex per CPU, and advertise the supported algorithms
+ * (SHA1, SHA1-HMAC, SHA256-HMAC).
+ */
+static int
+sha_sse_attach(device_t dev)
+{
+ struct sha_sse_softc *sc;
+ int i;
+
+ sc = device_get_softc(dev);
+ sc->dieing = 0;
+ TAILQ_INIT(&sc->sessions);
+ sc->sid = 1;
+
+ sc->cid = crypto_get_driverid(dev, CRYPTOCAP_F_HARDWARE |
+ CRYPTOCAP_F_SYNC);
+ if (sc->cid < 0) {
+ device_printf(dev, "Could not get crypto driver id.\n");
+ return (ENOMEM);
+ }
+
+ /* mp_maxid is the highest CPU id; arrays are indexed by cpuid. */
+ ctx_mtx = malloc(sizeof(*ctx_mtx) * (mp_maxid + 1), M_SHA_SSE,
+ M_WAITOK | M_ZERO);
+ ctx_fpu = malloc(sizeof(*ctx_fpu) * (mp_maxid + 1), M_SHA_SSE,
+ M_WAITOK | M_ZERO);
+
+ CPU_FOREACH(i) {
+ ctx_fpu[i] = fpu_kern_alloc_ctx(0);
+ mtx_init(&ctx_mtx[i], "shafpumtx", NULL, MTX_DEF | MTX_NEW);
+ }
+
+ rw_init(&sc->lock, "sha_sse_lock");
+ crypto_register(sc->cid, CRYPTO_SHA1, 0, 0);
+ crypto_register(sc->cid, CRYPTO_SHA1_HMAC, 0, 0);
+ crypto_register(sc->cid, CRYPTO_SHA2_256_HMAC, 0, 0);
+ return (0);
+}
+
+/*
+ * Detach: refuse while any session is in use; otherwise mark the
+ * device dying, free all (idle) sessions, unregister from the crypto
+ * framework, and release the per-CPU FPU state.
+ */
+static int
+sha_sse_detach(device_t dev)
+{
+ struct sha_sse_softc *sc;
+ struct sha_sse_session *ses;
+
+ sc = device_get_softc(dev);
+
+ rw_wlock(&sc->lock);
+ TAILQ_FOREACH(ses, &sc->sessions, next) {
+ if (ses->used) {
+ rw_wunlock(&sc->lock);
+ device_printf(dev,
+ "Cannot detach, sessions still active.\n");
+ return (EBUSY);
+ }
+ }
+ /* Block newsession() from handing out sessions from here on. */
+ sc->dieing = 1;
+ while ((ses = TAILQ_FIRST(&sc->sessions)) != NULL) {
+ TAILQ_REMOVE(&sc->sessions, ses, next);
+ free(ses, M_SHA_SSE);
+ }
+ rw_wunlock(&sc->lock);
+ crypto_unregister_all(sc->cid);
+
+ rw_destroy(&sc->lock);
+
+ sha_sse_cleanctx();
+
+ return (0);
+}
+
+/*
+ * cryptodev newsession method.  Validates that exactly one supported
+ * hash descriptor is requested, then recycles a free session from the
+ * head of the list (free sessions are kept first) or allocates a new
+ * one.  On success *sidp receives the session id.
+ */
+static int
+sha_sse_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
+{
+ struct sha_sse_softc *sc;
+ struct sha_sse_session *ses;
+ struct cryptoini *encini;
+ int error;
+
+ if (sidp == NULL || cri == NULL) {
+ CRYPTDEB("no sidp or cri");
+ return (EINVAL);
+ }
+
+ sc = device_get_softc(dev);
+ if (sc->dieing)
+ return (EINVAL);
+
+ ses = NULL;
+ encini = NULL;
+ /* Exactly one hash algorithm may be requested per session. */
+ for (; cri != NULL; cri = cri->cri_next) {
+ switch (cri->cri_alg) {
+ case CRYPTO_SHA1:
+ case CRYPTO_SHA1_HMAC:
+ case CRYPTO_SHA2_256_HMAC:
+ if (encini != NULL) {
+ CRYPTDEB("encini already set");
+ return (EINVAL);
+ }
+ encini = cri;
+ break;
+ default:
+ CRYPTDEB("unhandled algorithm");
+ return (EINVAL);
+ }
+ }
+ if (encini == NULL) {
+ CRYPTDEB("no cipher");
+ return (EINVAL);
+ }
+
+ rw_wlock(&sc->lock);
+ /* Re-check under the lock: detach may have raced us. */
+ if (sc->dieing) {
+ rw_wunlock(&sc->lock);
+ return (EINVAL);
+ }
+ /*
+ * Free sessions goes first, so if first session is used, we need to
+ * allocate one.
+ */
+ ses = TAILQ_FIRST(&sc->sessions);
+ if (ses == NULL || ses->used) {
+ ses = malloc(sizeof(*ses), M_SHA_SSE, M_NOWAIT | M_ZERO);
+ if (ses == NULL) {
+ rw_wunlock(&sc->lock);
+ return (ENOMEM);
+ }
+ ses->id = sc->sid++;
+ } else {
+ TAILQ_REMOVE(&sc->sessions, ses, next);
+ }
+ ses->used = 1;
+ /* Used sessions live at the tail; free ones at the head. */
+ TAILQ_INSERT_TAIL(&sc->sessions, ses, next);
+ rw_wunlock(&sc->lock);
+ ses->algo = encini->cri_alg;
+
+ error = sha_sse_cipher_setup(ses, encini);
+ if (error != 0) {
+ CRYPTDEB("setup failed");
+ rw_wlock(&sc->lock);
+ sha_sse_freesession_locked(sc, ses);
+ rw_wunlock(&sc->lock);
+ return (error);
+ }
+
+ *sidp = ses->id;
+ return (0);
+}
+
+/*
+ * Return a session to the free pool: scrub its contents (keys
+ * included) with explicit_bzero, keep only its id, and move it to the
+ * head of the list where newsession() looks for free entries.
+ * Caller must hold the softc write lock.
+ */
+static void
+sha_sse_freesession_locked(struct sha_sse_softc *sc, struct sha_sse_session *ses)
+{
+ uint32_t sid;
+
+ rw_assert(&sc->lock, RA_WLOCKED);
+
+ sid = ses->id;
+ TAILQ_REMOVE(&sc->sessions, ses, next);
+ explicit_bzero(ses, sizeof(*ses));
+ ses->id = sid;
+ TAILQ_INSERT_HEAD(&sc->sessions, ses, next);
+}
+
+/*
+ * cryptodev freesession method: look the session up by the low 32
+ * bits of the session id and recycle it.
+ */
+static int
+sha_sse_freesession(device_t dev, uint64_t tid)
+{
+ struct sha_sse_softc *sc;
+ struct sha_sse_session *ses;
+ uint32_t sid;
+
+ sc = device_get_softc(dev);
+ /* NOTE(review): the mask is redundant after the uint32_t cast. */
+ sid = ((uint32_t)tid) & 0xffffffff;
+ rw_wlock(&sc->lock);
+ /* Used sessions are at the tail, so search in reverse. */
+ TAILQ_FOREACH_REVERSE(ses, &sc->sessions, sha_sse_sessions_head, next) {
+ if (ses->id == sid)
+ break;
+ }
+ if (ses == NULL) {
+ rw_wunlock(&sc->lock);
+ return (EINVAL);
+ }
+ sha_sse_freesession_locked(sc, ses);
+ rw_wunlock(&sc->lock);
+ return (0);
+}
+
+/*
+ * cryptodev process method: find the session named by crp_sid and
+ * run the hash synchronously.  The completion status is reported via
+ * crp_etype and crypto_done() in all paths.
+ */
+static int
+sha_sse_process(device_t dev, struct cryptop *crp, int hint __unused)
+{
+ struct sha_sse_softc *sc = device_get_softc(dev);
+ struct sha_sse_session *ses = NULL;
+ int error;
+
+ error = 0;
+
+ /* Sanity check. */
+ if (crp == NULL)
+ return (EINVAL);
+
+ if (crp->crp_callback == NULL || crp->crp_desc == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ rw_rlock(&sc->lock);
+ TAILQ_FOREACH_REVERSE(ses, &sc->sessions, sha_sse_sessions_head, next) {
+ if (ses->id == (crp->crp_sid & 0xffffffff))
+ break;
+ }
+ rw_runlock(&sc->lock);
+ if (ses == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = sha_sse_cipher_process(ses, crp);
+ /* NOTE(review): this goto is redundant; it falls through to out. */
+ if (error != 0)
+ goto out;
+
+out:
+ crp->crp_etype = error;
+ crypto_done(crp);
+ return (error);
+}
+
+/* newbus device and opencrypto method dispatch table. */
+static device_method_t sha_sse_methods[] = {
+ DEVMETHOD(device_identify, sha_sse_identify),
+ DEVMETHOD(device_probe, sha_sse_probe),
+ DEVMETHOD(device_attach, sha_sse_attach),
+ DEVMETHOD(device_detach, sha_sse_detach),
+
+ DEVMETHOD(cryptodev_newsession, sha_sse_newsession),
+ DEVMETHOD(cryptodev_freesession, sha_sse_freesession),
+ DEVMETHOD(cryptodev_process, sha_sse_process),
+
+ {0, 0},
+};
+
+static driver_t sha_sse_driver = {
+ "sha_sse",
+ sha_sse_methods,
+ sizeof(struct sha_sse_softc),
+};
+static devclass_t sha_sse_devclass;
+
+/* Attach directly under nexus; depends on the opencrypto framework. */
+DRIVER_MODULE(sha_sse, nexus, sha_sse_driver, sha_sse_devclass, 0, 0);
+MODULE_VERSION(sha_sse, 1);
+MODULE_DEPEND(sha_sse, crypto, 1, 1, 1);
+
+/*
+ * Record the HMAC key (at most one hash block, 64 bytes) and the
+ * requested MAC truncation length.  Plain SHA1 takes no key.
+ */
+static int
+sha_sse_cipher_setup(struct sha_sse_session *ses, struct cryptoini *encini)
+{
+ int keylen;
+
+ /*
+ * NOTE(review): keylen is signed; the comparison against the
+ * unsigned sizeof() promotes a negative value to a huge unsigned
+ * one, which rejects it — subtle but apparently intentional.
+ */
+ keylen = encini->cri_klen / 8;
+ if (keylen > sizeof(ses->key))
+ return (EINVAL);
+ if (ses->algo == CRYPTO_SHA1 && keylen > 0)
+ return (EINVAL);
+
+ memcpy(ses->key, encini->cri_key, keylen);
+ ses->mlen = encini->cri_mlen;
+
+ return (0);
+}
+
+/*
+ * Incremental SHA-1 update on a struct sha1_ctxt, feeding whole
+ * 64-byte blocks straight to intel_sha1_step and staging partial
+ * blocks in the context buffer.  Returns 0 so it can serve as a
+ * crypto_apply() callback.
+ */
+static int
+intel_sha1_update(void *vctx, const void *vdata, u_int datalen)
+{
+ struct sha1_ctxt *ctx = vctx;
+ const char *data = vdata;
+ size_t gaplen;
+ size_t gapstart;
+ size_t off;
+ size_t copysiz;
+ u_int blocks;
+
+ off = 0;
+ /* Do any aligned blocks without redundant copying. */
+ if (datalen >= 64 && ctx->count % 64 == 0) {
+ blocks = datalen / 64;
+ /* Running bit count; c.b64[0] holds message length in bits. */
+ ctx->c.b64[0] += blocks * 64 * 8;
+ intel_sha1_step(ctx->h.b32, data + off, blocks);
+ off += blocks * 64;
+ }
+
+ /* Stage the remainder (or unaligned input) through ctx->m.b8. */
+ while (off < datalen) {
+ gapstart = ctx->count % 64;
+ gaplen = 64 - gapstart;
+
+ copysiz = (gaplen < datalen - off) ? gaplen : datalen - off;
+ bcopy(&data[off], &ctx->m.b8[gapstart], copysiz);
+ ctx->count += copysiz;
+ ctx->count %= 64;
+ ctx->c.b64[0] += copysiz * 8;
+ /* Buffer filled a whole block: hash it. */
+ if (ctx->count % 64 == 0)
+ intel_sha1_step(ctx->h.b32, (void *)ctx->m.b8, 1);
+ off += copysiz;
+ }
+ return (0);
+}
+
+/* Adapter matching the finalize(void *, void *) shape of hmac_internal. */
+static void
+SHA1_Finalize_fn(void *digest, void *ctx)
+{
+ sha1_result(ctx, digest);
+}
+
+/*
+ * Incremental SHA-256 update on a SHA256_CTX, feeding whole 64-byte
+ * blocks straight to intel_sha256_step and staging partial blocks in
+ * ctx->buf.  Returns 0 so it can serve as a crypto_apply() callback.
+ */
+static int
+intel_sha256_update(void *vctx, const void *vdata, u_int len)
+{
+ SHA256_CTX *ctx = vctx;
+ uint64_t bitlen;
+ uint32_t r;
+ u_int blocks;
+ const unsigned char *src = vdata;
+
+ /* Number of bytes left in the buffer from previous updates */
+ r = (ctx->count >> 3) & 0x3f;
+
+ /*
+ * Convert the length into a number of bits.  Widen before the
+ * shift: "len << 3" would be evaluated in 32 bits and truncate
+ * for inputs of 512MB or more.
+ */
+ bitlen = (uint64_t)len << 3;
+
+ /* Update number of bits */
+ ctx->count += bitlen;
+
+ /* Handle the case where we don't need to perform any transforms */
+ if (len < 64 - r) {
+ memcpy(&ctx->buf[r], src, len);
+ return (0);
+ }
+
+ /* Finish the current block */
+ memcpy(&ctx->buf[r], src, 64 - r);
+ intel_sha256_step(ctx->state, ctx->buf, 1);
+ src += 64 - r;
+ len -= 64 - r;
+
+ /* Perform complete blocks */
+ if (len >= 64) {
+ blocks = len / 64;
+ intel_sha256_step(ctx->state, src, blocks);
+ src += blocks * 64;
+ len -= blocks * 64;
+ }
+
+ /* Copy left over data into buffer */
+ memcpy(ctx->buf, src, len);
+ return (0);
+}
+
+/* Adapter matching the finalize(void *, void *) shape of hmac_internal. */
+static void
+SHA256_Finalize_fn(void *digest, void *ctx)
+{
+ SHA256_Final(digest, ctx);
+}
+
+/*
+ * Compute the HASH( (key ^ xorbyte) || buf )
+ *
+ * One half of an HMAC computation: 'key' must be a full 64-byte hash
+ * block (it is XOR-scrambled in place and restored before return).
+ * 'buf'/'off'/'buflen' select the message region walked with
+ * crypto_apply() under 'crpflags'; the digest lands in 'res'.
+ */
+static void
+hmac_internal(void *ctx, uint32_t *res,
+ int (*update)(void *, const void *, u_int),
+ void (*finalize)(void *, void *), uint8_t *key, uint8_t xorbyte,
+ const void *buf, size_t off, size_t buflen, int crpflags)
+{
+ size_t i;
+
+ /* Apply the ipad/opad byte, hash the padded key, then undo it. */
+ for (i = 0; i < 64; i++)
+ key[i] ^= xorbyte;
+ update(ctx, key, 64);
+ for (i = 0; i < 64; i++)
+ key[i] ^= xorbyte;
+
+ crypto_apply(crpflags, __DECONST(void *, buf), off, buflen,
+ __DECONST(int (*)(void *, void *, u_int), update), ctx);
+ finalize(res, ctx);
+}
+
+/*
+ * Run one hash request.  Enters an FPU kernel context (unless the
+ * calling thread already owns one), dispatches on the session
+ * algorithm, and copies the (possibly truncated) digest back into the
+ * request buffer at crd_inject.
+ *
+ * Note the paired labels: a failed fpu_kern_enter() skips
+ * fpu_kern_leave() via out2 but still releases the per-CPU context.
+ */
+static int
+sha_sse_cipher_process(struct sha_sse_session *ses, struct cryptop *crp)
+{
+ struct SHA256Context s2ctx;
+ struct sha1_ctxt sctx __aligned(16);
+ uint32_t res[8];
+ struct fpu_kern_ctx *ctx;
+ struct cryptodesc *crd;
+ int error, kt, ctxidx, hashlen;
+
+ error = 0;
+
+ kt = is_fpu_kern_thread(0);
+ if (!kt) {
+ ACQUIRE_CTX(ctxidx, ctx);
+ error = fpu_kern_enter(curthread, ctx,
+ FPU_KERN_NORMAL|FPU_KERN_KTHR);
+ if (error != 0)
+ goto out2;
+ }
+
+ /* Only single-descriptor, flagless requests are supported. */
+ crd = crp->crp_desc;
+ if (crd->crd_next != NULL || crd->crd_flags != 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /* ses->algo was validated in newsession; no default case needed. */
+ switch (ses->algo) {
+ case CRYPTO_SHA1_HMAC:
+ hashlen = SHA1_HASH_LEN;
+ /* Inner hash: (K ^ IPAD) || data */
+ sha1_init(&sctx);
+ hmac_internal(&sctx, res, intel_sha1_update, SHA1_Finalize_fn,
+ ses->key, 0x36, crp->crp_buf, crd->crd_skip, crd->crd_len,
+ crp->crp_flags);
+ /* Outer hash: (K ^ OPAD) || inner hash */
+ sha1_init(&sctx);
+ hmac_internal(&sctx, res, intel_sha1_update, SHA1_Finalize_fn,
+ ses->key, 0x5C, res, 0, hashlen, 0);
+ break;
+ case CRYPTO_SHA1:
+ hashlen = SHA1_HASH_LEN;
+ sha1_init(&sctx);
+ crypto_apply(crp->crp_flags, crp->crp_buf, crd->crd_skip,
+ crd->crd_len, __DECONST(int (*)(void *, void *, u_int),
+ intel_sha1_update), &sctx);
+ sha1_result(&sctx, (void *)res);
+ break;
+ case CRYPTO_SHA2_256_HMAC:
+ hashlen = SHA2_256_HASH_LEN;
+ /* Inner hash: (K ^ IPAD) || data */
+ SHA256_Init(&s2ctx);
+ hmac_internal(&s2ctx, res, intel_sha256_update,
+ SHA256_Finalize_fn, ses->key, 0x36, crp->crp_buf,
+ crd->crd_skip, crd->crd_len, crp->crp_flags);
+ /* Outer hash: (K ^ OPAD) || inner hash */
+ SHA256_Init(&s2ctx);
+ hmac_internal(&s2ctx, res, intel_sha256_update,
+ SHA256_Finalize_fn, ses->key, 0x5C, res, 0, hashlen, 0);
+ break;
+ }
+
+ /* Honor a caller-requested truncated MAC length. */
+ if (ses->mlen != 0 && ses->mlen < hashlen)
+ hashlen = ses->mlen;
+ crypto_copyback(crp->crp_flags, crp->crp_buf, crd->crd_inject, hashlen,
+ (void *)res);
+
+out:
+ if (!kt) {
+ fpu_kern_leave(curthread, ctx);
+out2:
+ RELEASE_CTX(ctxidx, ctx);
+ }
+ return (error);
+}
Index: sys/i386/conf/NOTES
===================================================================
--- sys/i386/conf/NOTES
+++ sys/i386/conf/NOTES
@@ -839,6 +839,7 @@
device padlock_rng # VIA Padlock RNG
device rdrand_rng # Intel Bull Mountain RNG
device aesni # AES-NI OpenCrypto module
+device sha_sse # SHA SSE extension OpenCrypto module
#
# Laptop/Notebook options:
Index: sys/modules/Makefile
===================================================================
--- sys/modules/Makefile
+++ sys/modules/Makefile
@@ -351,6 +351,7 @@
sge \
${_sgx} \
${_sgx_linux} \
+ ${_sha_sse} \
siba_bwn \
siftr \
siis \
@@ -625,6 +626,7 @@
_acpi= acpi
.if ${MK_CRYPT} != "no" || defined(ALL_MODULES)
_aesni= aesni
+_sha_sse= sha_sse
.endif
_amd_ecc_inject=amd_ecc_inject
_amdsbwd= amdsbwd
Index: sys/modules/sha_sse/Makefile
===================================================================
--- /dev/null
+++ sys/modules/sha_sse/Makefile
@@ -0,0 +1,25 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/crypto/sha_sse
+.PATH: ${SRCTOP}/contrib/llvm/tools/clang/lib/Headers
+
+KMOD= sha_sse
+SRCS= sha_sse.c
+SRCS+= device_if.h bus_if.h opt_bus.h cryptodev_if.h
+
+OBJS+= intel_sha1.o intel_sha256.o
+
+# Remove -nostdinc so we can get the intrinsics.
+intel_sha1.o: intel_sha1.c
+ ${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+ -mmmx -msse -msse4 -msha ${.IMPSRC}
+ ${CTFCONVERT_CMD}
+intel_sha256.o: intel_sha256.c
+ ${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+ -mmmx -msse -msse4 -msha ${.IMPSRC}
+ ${CTFCONVERT_CMD}
+
+intel_sha1.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
+intel_sha256.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
+
+.include <bsd.kmod.mk>
Index: tests/sys/opencrypto/cryptotest.py
===================================================================
--- tests/sys/opencrypto/cryptotest.py
+++ tests/sys/opencrypto/cryptotest.py
@@ -46,7 +46,7 @@
aesmodules = [ 'cryptosoft0', 'aesni0', 'ccr0' ]
desmodules = [ 'cryptosoft0', ]
-shamodules = [ 'cryptosoft0', 'ccr0' ]
+shamodules = [ 'cryptosoft0', 'ccr0', 'sha_sse0' ]
def GenTestCase(cname):
try:
@@ -308,6 +308,7 @@
cryptosoft = GenTestCase('cryptosoft0')
aesni = GenTestCase('aesni0')
ccr = GenTestCase('ccr0')
+sha_sse = GenTestCase('sha_sse0')
if __name__ == '__main__':
unittest.main()

File Metadata

Mime Type
text/plain
Expires
Sun, Apr 26, 5:42 AM (3 h, 19 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
32166156
Default Alt Text
D12452.id33338.diff (46 KB)

Event Timeline