D12452: aesni(4): Add support for x86 SHA intrinsics
D12452.id33338.diff (46 KB)
Index: share/man/man4/sha_sse.4
===================================================================
--- /dev/null
+++ share/man/man4/sha_sse.4
@@ -0,0 +1,81 @@
+.\" Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd September 21, 2017
+.Dt SHA_SSE 4
+.Os
+.Sh NAME
+.Nm sha_sse
+.Nd "driver for the SHA accelerator on x86 CPUs"
+.Sh SYNOPSIS
+To compile this driver into the kernel,
+place the following lines in your
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device crypto"
+.Cd "device cryptodev"
+.Cd "device sha_sse"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+sha_sse_load="YES"
+.Ed
+.Sh DESCRIPTION
+Starting with the Intel Goldmont and AMD Ryzen microarchitectures, some x86
+processors implement a new set of SHA instructions.
+The set of seven instructions accelerates the calculation of SHA1 and SHA256
+hashes.
+.Pp
+The processor capability is reported as SHA in the Structured Extended Features
+line at boot.
+The
+.Nm
+driver does not attach on systems that lack the required CPU capability.
+.Pp
+The
+.Nm
+driver registers itself to accelerate SHA operations for
+.Xr crypto 4 .
+.Sh SEE ALSO
+.Xr crypto 4 ,
+.Xr intro 4 ,
+.Xr ipsec 4 ,
+.Xr crypto 9
+.Sh HISTORY
+The
+.Nm
+driver first appeared in
+.Fx 12.0 .
+.Sh AUTHORS
+.An -nosplit
+The
+.Nm
+driver was written by
+.An Conrad Meyer Aq Mt cem@FreeBSD.org .
+The intrinsics-based hash step implementations were supplied by Intel.
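
The Structured Extended Features bit that the DESCRIPTION refers to can also be checked from userland. A minimal sketch, not part of the patch, assuming a GCC or Clang toolchain that provides <cpuid.h> (CPUID leaf 7, subleaf 0, EBX bit 29 is the SHA flag):

    #include <cpuid.h>
    #include <stdio.h>

    int
    main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID.(EAX=7, ECX=0):EBX bit 29 reports the SHA extensions. */
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
            return (1);
        printf("SHA extensions: %s\n", (ebx & (1u << 29)) ? "yes" : "no");
        return (0);
    }
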
Index: sys/amd64/conf/NOTES
===================================================================
--- sys/amd64/conf/NOTES
+++ sys/amd64/conf/NOTES
@@ -544,6 +544,7 @@
device padlock_rng # VIA Padlock RNG
device rdrand_rng # Intel Bull Mountain RNG
device aesni # AES-NI OpenCrypto module
+device sha_sse # SHA SSE extension OpenCrypto module
device ioat # Intel I/OAT DMA engine
#
Index: sys/conf/files.amd64
===================================================================
--- sys/conf/files.amd64
+++ sys/conf/files.amd64
@@ -182,6 +182,17 @@
crypto/blowfish/bf_enc.c optional crypto | ipsec | ipsec_support
crypto/des/des_enc.c optional crypto | ipsec | \
ipsec_support | netsmb
+intel_sha1.o optional sha_sse \
+ dependency "$S/crypto/sha_sse/intel_sha1.c" \
+ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+ no-implicit-rule \
+ clean "intel_sha1.o"
+intel_sha256.o optional sha_sse \
+ dependency "$S/crypto/sha_sse/intel_sha256.c" \
+ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+ no-implicit-rule \
+ clean "intel_sha256.o"
+crypto/sha_sse/sha_sse.c optional sha_sse
crypto/via/padlock.c optional padlock
crypto/via/padlock_cipher.c optional padlock
crypto/via/padlock_hash.c optional padlock
Index: sys/conf/files.i386
===================================================================
--- sys/conf/files.i386
+++ sys/conf/files.i386
@@ -132,6 +132,17 @@
no-implicit-rule \
clean "aesni_wrap.o"
crypto/des/arch/i386/des_enc.S optional crypto | ipsec | ipsec_support | netsmb
+intel_sha1.o optional sha_sse \
+ dependency "$S/crypto/sha_sse/intel_sha1.c" \
+ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+ no-implicit-rule \
+ clean "intel_sha1.o"
+intel_sha256.o optional sha_sse \
+ dependency "$S/crypto/sha_sse/intel_sha256.c" \
+ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+ no-implicit-rule \
+ clean "intel_sha256.o"
+crypto/sha_sse/sha_sse.c optional sha_sse
crypto/via/padlock.c optional padlock
crypto/via/padlock_cipher.c optional padlock
crypto/via/padlock_hash.c optional padlock
Index: sys/crypto/sha_sse/intel_sha1.c
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/intel_sha1.c
@@ -0,0 +1,258 @@
+/*******************************************************************************
+* Copyright (c) 2013, Intel Corporation
+*
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met:
+*
+* * Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* * Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* * Neither the name of the Intel Corporation nor the names of its
+* contributors may be used to endorse or promote products derived from
+* this software without specific prior written permission.
+*
+*
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************************
+*
+* Intel SHA Extensions optimized implementation of a SHA-1 update function
+*
+* The function takes a pointer to the current hash values, a pointer to the
+* input data, and a number of 64-byte blocks to process. Once all blocks have
+* been processed, the digest pointer is updated with the resulting hash value.
+* The function only processes complete blocks; there is no functionality to
+* store partial blocks. All message padding and hash value initialization must
+* be done outside the update function.
+*
+* The indented lines in the loop are instructions related to rounds processing.
+* The non-indented lines are instructions related to the message schedule.
+*
+* Author: Sean Gulley <sean.m.gulley@intel.com>
+* Date: July 2013
+*
+********************************************************************************
+*
+* Example compiler command line:
+* icc intel_sha_extensions_sha1_intrinsic.c
+* gcc -msha -msse4 intel_sha_extensions_sha1_intrinsic.c
+*
+*******************************************************************************/
+#include <sys/types.h>
+#include <immintrin.h>
+
+#include <crypto/sha_sse/sha_sse.h>
+
+void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks) {
+ __m128i abcd, e0, e1;
+ __m128i abcd_save, e_save;
+ __m128i msg0, msg1, msg2, msg3;
+ __m128i shuf_mask, e_mask;
+
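+	/*
+	 * e0 is zeroed up front, so the lane mask used by the original
+	 * Intel code (kept under #if 0 for reference) is unnecessary.
+	 */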
+#if 0
+ e_mask = _mm_set_epi64x(0xFFFFFFFF00000000ull, 0x0000000000000000ull);
+#else
+ (void)e_mask;
+ e0 = _mm_set_epi64x(0, 0);
+#endif
+ shuf_mask = _mm_set_epi64x(0x0001020304050607ull, 0x08090a0b0c0d0e0full);
+
+ // Load initial hash values
+ abcd = _mm_loadu_si128((__m128i*) digest);
+ e0 = _mm_insert_epi32(e0, *(digest+4), 3);
+ abcd = _mm_shuffle_epi32(abcd, 0x1B);
+#if 0
+ e0 = _mm_and_si128(e0, e_mask);
+#endif
+
+ while (num_blks > 0) {
+ // Save hash values for addition after rounds
+ abcd_save = abcd;
+ e_save = e0;
+
+ // Rounds 0-3
+ msg0 = _mm_loadu_si128((const __m128i*) data);
+ msg0 = _mm_shuffle_epi8(msg0, shuf_mask);
+ e0 = _mm_add_epi32(e0, msg0);
+ e1 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+
+ // Rounds 4-7
+ msg1 = _mm_loadu_si128((const __m128i*) (data+16));
+ msg1 = _mm_shuffle_epi8(msg1, shuf_mask);
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+
+ // Rounds 8-11
+ msg2 = _mm_loadu_si128((const __m128i*) (data+32));
+ msg2 = _mm_shuffle_epi8(msg2, shuf_mask);
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ // Rounds 12-15
+ msg3 = _mm_loadu_si128((const __m128i*) (data+48));
+ msg3 = _mm_shuffle_epi8(msg3, shuf_mask);
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
+ msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+ msg1 = _mm_xor_si128(msg1, msg3);
+
+ // Rounds 16-19
+ e0 = _mm_sha1nexte_epu32(e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+ msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+ msg2 = _mm_xor_si128(msg2, msg0);
+
+ // Rounds 20-23
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ // Rounds 24-27
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ // Rounds 28-31
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
+ msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+ msg1 = _mm_xor_si128(msg1, msg3);
+
+ // Rounds 32-35
+ e0 = _mm_sha1nexte_epu32(e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
+ msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+ msg2 = _mm_xor_si128(msg2, msg0);
+
+ // Rounds 36-39
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ // Rounds 40-43
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ // Rounds 44-47
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
+ msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+ msg1 = _mm_xor_si128(msg1, msg3);
+
+ // Rounds 48-51
+ e0 = _mm_sha1nexte_epu32(e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
+ msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+ msg2 = _mm_xor_si128(msg2, msg0);
+
+ // Rounds 52-55
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ // Rounds 56-59
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ // Rounds 60-63
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+ msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+ msg1 = _mm_xor_si128(msg1, msg3);
+
+ // Rounds 64-67
+ e0 = _mm_sha1nexte_epu32(e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
+ msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+ msg2 = _mm_xor_si128(msg2, msg0);
+
+ // Rounds 68-71
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ // Rounds 72-75
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
+
+ // Rounds 76-79
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+
+ // Add current hash values with previously saved
+ e0 = _mm_sha1nexte_epu32(e0, e_save);
+ abcd = _mm_add_epi32(abcd, abcd_save);
+
+ data += 64;
+ num_blks--;
+ }
+
+ abcd = _mm_shuffle_epi32(abcd, 0x1B);
+ _mm_store_si128((__m128i*) digest, abcd);
+ *(digest+4) = _mm_extract_epi32(e0, 3);
+}
+
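
As the header comment notes, callers pass the current hash state, the input pointer, and a count of complete 64-byte blocks; all padding happens outside. A minimal userland sketch, not part of the patch, hashing the pre-padded one-block message "abc" (compile together with intel_sha1.c as suggested above, e.g. cc -msha -msse4):

    #include <stdint.h>
    #include <stdio.h>

    void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks);

    int
    main(void)
    {
        /*
         * SHA-1 initial hash values (FIPS 180-4).  The step function
         * uses aligned stores, so the state must be 16-byte aligned.
         */
        uint32_t h[5] __attribute__((aligned(16))) = {
            0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0
        };
        /*
         * "abc" pre-padded to one 64-byte block: message, 0x80, zeros,
         * then the 64-bit big-endian bit length (24) in bytes 56-63.
         */
        uint8_t block[64] = { 'a', 'b', 'c', 0x80 };
        int i;

        block[63] = 24;
        intel_sha1_step(h, (const char *)block, 1);
        /* Expect a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d. */
        for (i = 0; i < 5; i++)
            printf("%08x", h[i]);
        printf("\n");
        return (0);
    }
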
Index: sys/crypto/sha_sse/intel_sha256.c
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/intel_sha256.c
@@ -0,0 +1,274 @@
+/*******************************************************************************
+* Copyright (c) 2013, Intel Corporation
+*
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met:
+*
+* * Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* * Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* * Neither the name of the Intel Corporation nor the names of its
+* contributors may be used to endorse or promote products derived from
+* this software without specific prior written permission.
+*
+*
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************************
+*
+* Intel SHA Extensions optimized implementation of a SHA-256 update function
+*
+* The function takes a pointer to the current hash values, a pointer to the
+* input data, and a number of 64-byte blocks to process. Once all blocks have
+* been processed, the digest pointer is updated with the resulting hash value.
+* The function only processes complete blocks; there is no functionality to
+* store partial blocks. All message padding and hash value initialization must
+* be done outside the update function.
+*
+* The indented lines in the loop are instructions related to rounds processing.
+* The non-indented lines are instructions related to the message schedule.
+*
+* Author: Sean Gulley <sean.m.gulley@intel.com>
+* Date: July 2013
+*
+********************************************************************************
+*
+* Example compiler command line:
+* icc intel_sha_extensions_sha256_intrinsic.c
+* gcc -msha -msse4 intel_sha_extensions_sha256_intrinsic.c
+*
+*******************************************************************************/
+#include <stdint.h>
+#include <immintrin.h>
+
+#include <crypto/sha_sse/sha_sse.h>
+
+void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks) {
+ __m128i state0, state1;
+ __m128i msg;
+ __m128i msgtmp0, msgtmp1, msgtmp2, msgtmp3;
+ __m128i tmp;
+ __m128i shuf_mask;
+ __m128i abef_save, cdgh_save;
+
+ // Load initial hash values
+ // Need to reorder these appropriately
+ // DCBA, HGFE -> ABEF, CDGH
+ tmp = _mm_loadu_si128((__m128i*) digest);
+ state1 = _mm_loadu_si128((__m128i*) (digest+4));
+
+ tmp = _mm_shuffle_epi32(tmp, 0xB1); // CDAB
+ state1 = _mm_shuffle_epi32(state1, 0x1B); // EFGH
+ state0 = _mm_alignr_epi8(tmp, state1, 8); // ABEF
+ state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH
+
+ shuf_mask = _mm_set_epi64x(0x0c0d0e0f08090a0bull, 0x0405060700010203ull);
+
+ while (num_blks > 0) {
+ // Save hash values for addition after rounds
+ abef_save = state0;
+ cdgh_save = state1;
+
+ // Rounds 0-3
+ msg = _mm_loadu_si128((const __m128i*) data);
+ msgtmp0 = _mm_shuffle_epi8(msg, shuf_mask);
+ msg = _mm_add_epi32(msgtmp0,
+ _mm_set_epi64x(0xE9B5DBA5B5C0FBCFull, 0x71374491428A2F98ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+ // Rounds 4-7
+ msgtmp1 = _mm_loadu_si128((const __m128i*) (data+16));
+ msgtmp1 = _mm_shuffle_epi8(msgtmp1, shuf_mask);
+ msg = _mm_add_epi32(msgtmp1,
+ _mm_set_epi64x(0xAB1C5ED5923F82A4ull, 0x59F111F13956C25Bull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+ // Rounds 8-11
+ msgtmp2 = _mm_loadu_si128((const __m128i*) (data+32));
+ msgtmp2 = _mm_shuffle_epi8(msgtmp2, shuf_mask);
+ msg = _mm_add_epi32(msgtmp2,
+ _mm_set_epi64x(0x550C7DC3243185BEull, 0x12835B01D807AA98ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+ // Rounds 12-15
+ msgtmp3 = _mm_loadu_si128((const __m128i*) (data+48));
+ msgtmp3 = _mm_shuffle_epi8(msgtmp3, shuf_mask);
+ msg = _mm_add_epi32(msgtmp3,
+ _mm_set_epi64x(0xC19BF1749BDC06A7ull, 0x80DEB1FE72BE5D74ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+ msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+ msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+ // Rounds 16-19
+ msg = _mm_add_epi32(msgtmp0,
+ _mm_set_epi64x(0x240CA1CC0FC19DC6ull, 0xEFBE4786E49B69C1ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+ msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+ msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+ // Rounds 20-23
+ msg = _mm_add_epi32(msgtmp1,
+ _mm_set_epi64x(0x76F988DA5CB0A9DCull, 0x4A7484AA2DE92C6Full));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
+ msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
+ msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+ // Rounds 24-27
+ msg = _mm_add_epi32(msgtmp2,
+ _mm_set_epi64x(0xBF597FC7B00327C8ull, 0xA831C66D983E5152ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
+ msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
+ msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+ // Rounds 28-31
+ msg = _mm_add_epi32(msgtmp3,
+ _mm_set_epi64x(0x1429296706CA6351ull, 0xD5A79147C6E00BF3ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+ msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+ msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+ // Rounds 32-35
+ msg = _mm_add_epi32(msgtmp0,
+ _mm_set_epi64x(0x53380D134D2C6DFCull, 0x2E1B213827B70A85ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+ msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+ msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+ // Rounds 36-39
+ msg = _mm_add_epi32(msgtmp1,
+ _mm_set_epi64x(0x92722C8581C2C92Eull, 0x766A0ABB650A7354ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
+ msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
+ msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+ // Rounds 40-43
+ msg = _mm_add_epi32(msgtmp2,
+ _mm_set_epi64x(0xC76C51A3C24B8B70ull, 0xA81A664BA2BFE8A1ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
+ msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
+ msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+ // Rounds 44-47
+ msg = _mm_add_epi32(msgtmp3,
+ _mm_set_epi64x(0x106AA070F40E3585ull, 0xD6990624D192E819ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+ msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+ msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+ // Rounds 48-51
+ msg = _mm_add_epi32(msgtmp0,
+ _mm_set_epi64x(0x34B0BCB52748774Cull, 0x1E376C0819A4C116ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+ msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+ msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+ // Rounds 52-55
+ msg = _mm_add_epi32(msgtmp1,
+ _mm_set_epi64x(0x682E6FF35B9CCA4Full, 0x4ED8AA4A391C0CB3ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
+ msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
+ msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+ // Rounds 56-59
+ msg = _mm_add_epi32(msgtmp2,
+ _mm_set_epi64x(0x8CC7020884C87814ull, 0x78A5636F748F82EEull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
+ msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
+ msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+ // Rounds 60-63
+ msg = _mm_add_epi32(msgtmp3,
+ _mm_set_epi64x(0xC67178F2BEF9A3F7ull, 0xA4506CEB90BEFFFAull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+ // Add current hash values with previously saved
+ state0 = _mm_add_epi32(state0, abef_save);
+ state1 = _mm_add_epi32(state1, cdgh_save);
+
+ data += 64;
+ num_blks--;
+ }
+
+ // Write hash values back in the correct order
+ tmp = _mm_shuffle_epi32(state0, 0x1B); // FEBA
+ state1 = _mm_shuffle_epi32(state1, 0xB1); // DCHG
+ state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA
+ state1 = _mm_alignr_epi8(state1, tmp, 8); // HGFE
+
+ _mm_store_si128((__m128i*) digest, state0);
+ _mm_store_si128((__m128i*) (digest+4), state1);
+}
+
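
The same contract applies here, with the eight 32-bit state words passed in H0..H7 order; the ABEF/CDGH lane reordering is internal to the function. A minimal sketch, not part of the patch, again using the pre-padded one-block message "abc":

    #include <stdint.h>
    #include <stdio.h>

    void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks);

    int
    main(void)
    {
        /*
         * SHA-256 initial hash values (FIPS 180-4) in H0..H7 order;
         * 16-byte aligned because the step function uses aligned stores.
         */
        uint32_t h[8] __attribute__((aligned(16))) = {
            0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
            0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
        };
        uint8_t block[64] = { 'a', 'b', 'c', 0x80 };
        int i;

        block[63] = 24;    /* 64-bit big-endian bit length of "abc" */
        intel_sha256_step(h, (const char *)block, 1);
        /*
         * Expect ba7816bf 8f01cfea 414140de 5dae2223 b00361a3
         * 96177a9c b410ff61 f20015ad.
         */
        for (i = 0; i < 8; i++)
            printf("%08x", h[i]);
        printf("\n");
        return (0);
    }
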
Index: sys/crypto/sha_sse/sha_sse.h
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/sha_sse.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _CRYPTO__SHA_SSE_H_
+#define _CRYPTO__SHA_SSE_H_
+
+/*
+ * Internal functions, implemented in intrinsics.
+ */
+void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks);
+void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks);
+
+#endif /* _CRYPTO__SHA_SSE_H_ */
Index: sys/crypto/sha_sse/sha_sse.c
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/sha_sse.c
@@ -0,0 +1,606 @@
+/*-
+ * Copyright (c) 2005-2008 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
+ * Copyright (c) 2014 The FreeBSD Foundation
+ * Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed by John-Mark Gurney
+ * under sponsorship of the FreeBSD Foundation and
+ * Rubicon Communications, LLC (Netgate).
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+#include <sys/rwlock.h>
+#include <sys/bus.h>
+#include <sys/uio.h>
+#include <sys/mbuf.h>
+#include <sys/smp.h>
+
+#include <crypto/sha1.h>
+#include <crypto/sha2/sha256.h>
+#include <crypto/sha_sse/sha_sse.h>
+
+#include <opencrypto/cryptodev.h>
+#include <opencrypto/gmac.h>
+#include <cryptodev_if.h>
+
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+#if defined(__i386__)
+#include <machine/npx.h>
+#elif defined(__amd64__)
+#include <machine/fpu.h>
+#endif
+static struct mtx_padalign *ctx_mtx;
+static struct fpu_kern_ctx **ctx_fpu;
+
+struct sha_sse_softc {
+ int dieing;
+ int32_t cid;
+ uint32_t sid;
+ struct rwlock lock;
+ TAILQ_HEAD(sha_sse_sessions_head, sha_sse_session) sessions;
+};
+
+struct sha_sse_session {
+ /* Same as the SHA256 Blocksize. */
+ uint8_t key[SHA1_HMAC_BLOCK_LEN] __aligned(16);
+ int algo;
+ int used;
+ int mlen;
+ uint32_t id;
+ TAILQ_ENTRY(sha_sse_session) next;
+};
+
+#define ACQUIRE_CTX(i, ctx) \
+ do { \
+ (i) = PCPU_GET(cpuid); \
+ mtx_lock(&ctx_mtx[(i)]); \
+ (ctx) = ctx_fpu[(i)]; \
+ } while (0)
+#define RELEASE_CTX(i, ctx) \
+ do { \
+ mtx_unlock(&ctx_mtx[(i)]); \
+ (i) = -1; \
+ (ctx) = NULL; \
+ } while (0)
+
+static int sha_sse_newsession(device_t, uint32_t *sidp, struct cryptoini *cri);
+static int sha_sse_freesession(device_t, uint64_t tid);
+static void sha_sse_freesession_locked(struct sha_sse_softc *sc,
+ struct sha_sse_session *ses);
+static int sha_sse_cipher_setup(struct sha_sse_session *ses,
+ struct cryptoini *encini);
+static int sha_sse_cipher_process(struct sha_sse_session *ses,
+ struct cryptop *crp);
+
+MALLOC_DEFINE(M_SHA_SSE, "sha_sse_data", "SHA_SSE Data");
+
+static void
+sha_sse_identify(driver_t *drv, device_t parent)
+{
+
+ /* NB: order 10 is so we get attached after h/w devices */
+ if (device_find_child(parent, "sha_sse", -1) == NULL &&
+ BUS_ADD_CHILD(parent, 10, "sha_sse", -1) == 0)
+ panic("sha_sse: could not attach");
+}
+
+static int
+sha_sse_probe(device_t dev)
+{
+
+ if ((cpu_stdext_feature & CPUID_STDEXT_SHA) == 0) {
+ device_printf(dev, "No SHA support.\n");
+ return (EINVAL);
+ }
+ if ((cpu_feature2 & CPUID2_SSSE3) == 0) {
+ device_printf(dev, "No SSSE3 support.\n");
+ return (EINVAL);
+ }
+
+ device_set_desc_copy(dev, "SHA1,SHA2");
+ return (0);
+}
+
+static void
+sha_sse_cleanctx(void)
+{
+ int i;
+
+ /* XXX - no way to return driverid */
+ CPU_FOREACH(i) {
+ if (ctx_fpu[i] != NULL) {
+ mtx_destroy(&ctx_mtx[i]);
+ fpu_kern_free_ctx(ctx_fpu[i]);
+ }
+ ctx_fpu[i] = NULL;
+ }
+ free(ctx_mtx, M_SHA_SSE);
+ ctx_mtx = NULL;
+ free(ctx_fpu, M_SHA_SSE);
+ ctx_fpu = NULL;
+}
+
+static int
+sha_sse_attach(device_t dev)
+{
+ struct sha_sse_softc *sc;
+ int i;
+
+ sc = device_get_softc(dev);
+ sc->dieing = 0;
+ TAILQ_INIT(&sc->sessions);
+ sc->sid = 1;
+
+ sc->cid = crypto_get_driverid(dev, CRYPTOCAP_F_HARDWARE |
+ CRYPTOCAP_F_SYNC);
+ if (sc->cid < 0) {
+ device_printf(dev, "Could not get crypto driver id.\n");
+ return (ENOMEM);
+ }
+
+ ctx_mtx = malloc(sizeof(*ctx_mtx) * (mp_maxid + 1), M_SHA_SSE,
+ M_WAITOK | M_ZERO);
+ ctx_fpu = malloc(sizeof(*ctx_fpu) * (mp_maxid + 1), M_SHA_SSE,
+ M_WAITOK | M_ZERO);
+
+ CPU_FOREACH(i) {
+ ctx_fpu[i] = fpu_kern_alloc_ctx(0);
+ mtx_init(&ctx_mtx[i], "shafpumtx", NULL, MTX_DEF | MTX_NEW);
+ }
+
+ rw_init(&sc->lock, "sha_sse_lock");
+ crypto_register(sc->cid, CRYPTO_SHA1, 0, 0);
+ crypto_register(sc->cid, CRYPTO_SHA1_HMAC, 0, 0);
+ crypto_register(sc->cid, CRYPTO_SHA2_256_HMAC, 0, 0);
+ return (0);
+}
+
+static int
+sha_sse_detach(device_t dev)
+{
+ struct sha_sse_softc *sc;
+ struct sha_sse_session *ses;
+
+ sc = device_get_softc(dev);
+
+ rw_wlock(&sc->lock);
+ TAILQ_FOREACH(ses, &sc->sessions, next) {
+ if (ses->used) {
+ rw_wunlock(&sc->lock);
+ device_printf(dev,
+ "Cannot detach, sessions still active.\n");
+ return (EBUSY);
+ }
+ }
+ sc->dieing = 1;
+ while ((ses = TAILQ_FIRST(&sc->sessions)) != NULL) {
+ TAILQ_REMOVE(&sc->sessions, ses, next);
+ free(ses, M_SHA_SSE);
+ }
+ rw_wunlock(&sc->lock);
+ crypto_unregister_all(sc->cid);
+
+ rw_destroy(&sc->lock);
+
+ sha_sse_cleanctx();
+
+ return (0);
+}
+
+static int
+sha_sse_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
+{
+ struct sha_sse_softc *sc;
+ struct sha_sse_session *ses;
+ struct cryptoini *encini;
+ int error;
+
+ if (sidp == NULL || cri == NULL) {
+ CRYPTDEB("no sidp or cri");
+ return (EINVAL);
+ }
+
+ sc = device_get_softc(dev);
+ if (sc->dieing)
+ return (EINVAL);
+
+ ses = NULL;
+ encini = NULL;
+ for (; cri != NULL; cri = cri->cri_next) {
+ switch (cri->cri_alg) {
+ case CRYPTO_SHA1:
+ case CRYPTO_SHA1_HMAC:
+ case CRYPTO_SHA2_256_HMAC:
+ if (encini != NULL) {
+ CRYPTDEB("encini already set");
+ return (EINVAL);
+ }
+ encini = cri;
+ break;
+ default:
+ CRYPTDEB("unhandled algorithm");
+ return (EINVAL);
+ }
+ }
+ if (encini == NULL) {
+ CRYPTDEB("no cipher");
+ return (EINVAL);
+ }
+
+ rw_wlock(&sc->lock);
+ if (sc->dieing) {
+ rw_wunlock(&sc->lock);
+ return (EINVAL);
+ }
+ /*
+	 * Free sessions go first, so if the first session is used, we need
+	 * to allocate a new one.
+ */
+ ses = TAILQ_FIRST(&sc->sessions);
+ if (ses == NULL || ses->used) {
+ ses = malloc(sizeof(*ses), M_SHA_SSE, M_NOWAIT | M_ZERO);
+ if (ses == NULL) {
+ rw_wunlock(&sc->lock);
+ return (ENOMEM);
+ }
+ ses->id = sc->sid++;
+ } else {
+ TAILQ_REMOVE(&sc->sessions, ses, next);
+ }
+ ses->used = 1;
+ TAILQ_INSERT_TAIL(&sc->sessions, ses, next);
+ rw_wunlock(&sc->lock);
+ ses->algo = encini->cri_alg;
+
+ error = sha_sse_cipher_setup(ses, encini);
+ if (error != 0) {
+ CRYPTDEB("setup failed");
+ rw_wlock(&sc->lock);
+ sha_sse_freesession_locked(sc, ses);
+ rw_wunlock(&sc->lock);
+ return (error);
+ }
+
+ *sidp = ses->id;
+ return (0);
+}
+
+static void
+sha_sse_freesession_locked(struct sha_sse_softc *sc, struct sha_sse_session *ses)
+{
+ uint32_t sid;
+
+ rw_assert(&sc->lock, RA_WLOCKED);
+
+ sid = ses->id;
+ TAILQ_REMOVE(&sc->sessions, ses, next);
+ explicit_bzero(ses, sizeof(*ses));
+ ses->id = sid;
+ TAILQ_INSERT_HEAD(&sc->sessions, ses, next);
+}
+
+static int
+sha_sse_freesession(device_t dev, uint64_t tid)
+{
+ struct sha_sse_softc *sc;
+ struct sha_sse_session *ses;
+ uint32_t sid;
+
+ sc = device_get_softc(dev);
+ sid = ((uint32_t)tid) & 0xffffffff;
+ rw_wlock(&sc->lock);
+ TAILQ_FOREACH_REVERSE(ses, &sc->sessions, sha_sse_sessions_head, next) {
+ if (ses->id == sid)
+ break;
+ }
+ if (ses == NULL) {
+ rw_wunlock(&sc->lock);
+ return (EINVAL);
+ }
+ sha_sse_freesession_locked(sc, ses);
+ rw_wunlock(&sc->lock);
+ return (0);
+}
+
+static int
+sha_sse_process(device_t dev, struct cryptop *crp, int hint __unused)
+{
+ struct sha_sse_softc *sc = device_get_softc(dev);
+ struct sha_sse_session *ses = NULL;
+ int error;
+
+ error = 0;
+
+ /* Sanity check. */
+ if (crp == NULL)
+ return (EINVAL);
+
+ if (crp->crp_callback == NULL || crp->crp_desc == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ rw_rlock(&sc->lock);
+ TAILQ_FOREACH_REVERSE(ses, &sc->sessions, sha_sse_sessions_head, next) {
+ if (ses->id == (crp->crp_sid & 0xffffffff))
+ break;
+ }
+ rw_runlock(&sc->lock);
+ if (ses == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = sha_sse_cipher_process(ses, crp);
+ if (error != 0)
+ goto out;
+
+out:
+ crp->crp_etype = error;
+ crypto_done(crp);
+ return (error);
+}
+
+static device_method_t sha_sse_methods[] = {
+ DEVMETHOD(device_identify, sha_sse_identify),
+ DEVMETHOD(device_probe, sha_sse_probe),
+ DEVMETHOD(device_attach, sha_sse_attach),
+ DEVMETHOD(device_detach, sha_sse_detach),
+
+ DEVMETHOD(cryptodev_newsession, sha_sse_newsession),
+ DEVMETHOD(cryptodev_freesession, sha_sse_freesession),
+ DEVMETHOD(cryptodev_process, sha_sse_process),
+
+ {0, 0},
+};
+
+static driver_t sha_sse_driver = {
+ "sha_sse",
+ sha_sse_methods,
+ sizeof(struct sha_sse_softc),
+};
+static devclass_t sha_sse_devclass;
+
+DRIVER_MODULE(sha_sse, nexus, sha_sse_driver, sha_sse_devclass, 0, 0);
+MODULE_VERSION(sha_sse, 1);
+MODULE_DEPEND(sha_sse, crypto, 1, 1, 1);
+
+static int
+sha_sse_cipher_setup(struct sha_sse_session *ses, struct cryptoini *encini)
+{
+ int keylen;
+
+ keylen = encini->cri_klen / 8;
+ if (keylen > sizeof(ses->key))
+ return (EINVAL);
+ if (ses->algo == CRYPTO_SHA1 && keylen > 0)
+ return (EINVAL);
+
+ memcpy(ses->key, encini->cri_key, keylen);
+ ses->mlen = encini->cri_mlen;
+
+ return (0);
+}
+
+static int
+intel_sha1_update(void *vctx, const void *vdata, u_int datalen)
+{
+ struct sha1_ctxt *ctx = vctx;
+ const char *data = vdata;
+ size_t gaplen;
+ size_t gapstart;
+ size_t off;
+ size_t copysiz;
+ u_int blocks;
+
+ off = 0;
+ /* Do any aligned blocks without redundant copying. */
+ if (datalen >= 64 && ctx->count % 64 == 0) {
+ blocks = datalen / 64;
+ ctx->c.b64[0] += blocks * 64 * 8;
+ intel_sha1_step(ctx->h.b32, data + off, blocks);
+ off += blocks * 64;
+ }
+
+ while (off < datalen) {
+ gapstart = ctx->count % 64;
+ gaplen = 64 - gapstart;
+
+ copysiz = (gaplen < datalen - off) ? gaplen : datalen - off;
+ bcopy(&data[off], &ctx->m.b8[gapstart], copysiz);
+ ctx->count += copysiz;
+ ctx->count %= 64;
+ ctx->c.b64[0] += copysiz * 8;
+ if (ctx->count % 64 == 0)
+ intel_sha1_step(ctx->h.b32, (void *)ctx->m.b8, 1);
+ off += copysiz;
+ }
+ return (0);
+}
+
+static void
+SHA1_Finalize_fn(void *digest, void *ctx)
+{
+ sha1_result(ctx, digest);
+}
+
+static int
+intel_sha256_update(void *vctx, const void *vdata, u_int len)
+{
+ SHA256_CTX *ctx = vctx;
+ uint64_t bitlen;
+ uint32_t r;
+ u_int blocks;
+ const unsigned char *src = vdata;
+
+ /* Number of bytes left in the buffer from previous updates */
+ r = (ctx->count >> 3) & 0x3f;
+
+ /* Convert the length into a number of bits */
+ bitlen = len << 3;
+
+ /* Update number of bits */
+ ctx->count += bitlen;
+
+ /* Handle the case where we don't need to perform any transforms */
+ if (len < 64 - r) {
+ memcpy(&ctx->buf[r], src, len);
+ return (0);
+ }
+
+ /* Finish the current block */
+ memcpy(&ctx->buf[r], src, 64 - r);
+ intel_sha256_step(ctx->state, ctx->buf, 1);
+ src += 64 - r;
+ len -= 64 - r;
+
+ /* Perform complete blocks */
+ if (len >= 64) {
+ blocks = len / 64;
+ intel_sha256_step(ctx->state, src, blocks);
+ src += blocks * 64;
+ len -= blocks * 64;
+ }
+
+ /* Copy left over data into buffer */
+ memcpy(ctx->buf, src, len);
+ return (0);
+}
+
+static void
+SHA256_Finalize_fn(void *digest, void *ctx)
+{
+ SHA256_Final(digest, ctx);
+}
+
+/*
+ * Compute the HASH( (key ^ xorbyte) || buf )
+ */
+static void
+hmac_internal(void *ctx, uint32_t *res,
+ int (*update)(void *, const void *, u_int),
+ void (*finalize)(void *, void *), uint8_t *key, uint8_t xorbyte,
+ const void *buf, size_t off, size_t buflen, int crpflags)
+{
+ size_t i;
+
+ for (i = 0; i < 64; i++)
+ key[i] ^= xorbyte;
+ update(ctx, key, 64);
+ for (i = 0; i < 64; i++)
+ key[i] ^= xorbyte;
+
+ crypto_apply(crpflags, __DECONST(void *, buf), off, buflen,
+ __DECONST(int (*)(void *, void *, u_int), update), ctx);
+ finalize(res, ctx);
+}
+
+static int
+sha_sse_cipher_process(struct sha_sse_session *ses, struct cryptop *crp)
+{
+ struct SHA256Context s2ctx;
+ struct sha1_ctxt sctx __aligned(16);
+ uint32_t res[8];
+ struct fpu_kern_ctx *ctx;
+ struct cryptodesc *crd;
+ int error, kt, ctxidx, hashlen;
+
+ error = 0;
+
+ kt = is_fpu_kern_thread(0);
+ if (!kt) {
+ ACQUIRE_CTX(ctxidx, ctx);
+ error = fpu_kern_enter(curthread, ctx,
+ FPU_KERN_NORMAL|FPU_KERN_KTHR);
+ if (error != 0)
+ goto out2;
+ }
+
+ crd = crp->crp_desc;
+ if (crd->crd_next != NULL || crd->crd_flags != 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ switch (ses->algo) {
+ case CRYPTO_SHA1_HMAC:
+ hashlen = SHA1_HASH_LEN;
+ /* Inner hash: (K ^ IPAD) || data */
+ sha1_init(&sctx);
+ hmac_internal(&sctx, res, intel_sha1_update, SHA1_Finalize_fn,
+ ses->key, 0x36, crp->crp_buf, crd->crd_skip, crd->crd_len,
+ crp->crp_flags);
+ /* Outer hash: (K ^ OPAD) || inner hash */
+ sha1_init(&sctx);
+ hmac_internal(&sctx, res, intel_sha1_update, SHA1_Finalize_fn,
+ ses->key, 0x5C, res, 0, hashlen, 0);
+ break;
+ case CRYPTO_SHA1:
+ hashlen = SHA1_HASH_LEN;
+ sha1_init(&sctx);
+ crypto_apply(crp->crp_flags, crp->crp_buf, crd->crd_skip,
+ crd->crd_len, __DECONST(int (*)(void *, void *, u_int),
+ intel_sha1_update), &sctx);
+ sha1_result(&sctx, (void *)res);
+ break;
+ case CRYPTO_SHA2_256_HMAC:
+ hashlen = SHA2_256_HASH_LEN;
+ /* Inner hash: (K ^ IPAD) || data */
+ SHA256_Init(&s2ctx);
+ hmac_internal(&s2ctx, res, intel_sha256_update,
+ SHA256_Finalize_fn, ses->key, 0x36, crp->crp_buf,
+ crd->crd_skip, crd->crd_len, crp->crp_flags);
+ /* Outer hash: (K ^ OPAD) || inner hash */
+ SHA256_Init(&s2ctx);
+ hmac_internal(&s2ctx, res, intel_sha256_update,
+ SHA256_Finalize_fn, ses->key, 0x5C, res, 0, hashlen, 0);
+ break;
+ }
+
+ if (ses->mlen != 0 && ses->mlen < hashlen)
+ hashlen = ses->mlen;
+ crypto_copyback(crp->crp_flags, crp->crp_buf, crd->crd_inject, hashlen,
+ (void *)res);
+
+out:
+ if (!kt) {
+ fpu_kern_leave(curthread, ctx);
+out2:
+ RELEASE_CTX(ctxidx, ctx);
+ }
+ return (error);
+}
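
The two hmac_internal() passes above compose the standard HMAC construction, HMAC(K, m) = H((K ^ opad) || H((K ^ ipad) || m)), with ipad = 0x36 and opad = 0x5C repeated to the 64-byte block size. A contiguous-buffer sketch, not part of the patch, using the same SHA256_* API the driver calls; it assumes the key is already zero-padded to 64 bytes, as ses->key is:

    #include <sys/types.h>
    #include <crypto/sha2/sha256.h>

    static void
    hmac_sha256_sketch(const uint8_t key[64], const void *msg, size_t msglen,
        uint8_t mac[SHA256_DIGEST_LENGTH])
    {
        SHA256_CTX ctx;
        uint8_t pad[64], inner[SHA256_DIGEST_LENGTH];
        size_t i;

        /* Inner hash: H((K ^ ipad) || msg), ipad = 0x36 repeated. */
        for (i = 0; i < 64; i++)
            pad[i] = key[i] ^ 0x36;
        SHA256_Init(&ctx);
        SHA256_Update(&ctx, pad, sizeof(pad));
        SHA256_Update(&ctx, msg, msglen);
        SHA256_Final(inner, &ctx);

        /* Outer hash: H((K ^ opad) || inner), opad = 0x5C repeated. */
        for (i = 0; i < 64; i++)
            pad[i] = key[i] ^ 0x5C;
        SHA256_Init(&ctx);
        SHA256_Update(&ctx, pad, sizeof(pad));
        SHA256_Update(&ctx, inner, sizeof(inner));
        SHA256_Final(mac, &ctx);
    }
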
Index: sys/i386/conf/NOTES
===================================================================
--- sys/i386/conf/NOTES
+++ sys/i386/conf/NOTES
@@ -839,6 +839,7 @@
device padlock_rng # VIA Padlock RNG
device rdrand_rng # Intel Bull Mountain RNG
device aesni # AES-NI OpenCrypto module
+device sha_sse # SHA SSE extension OpenCrypto module
#
# Laptop/Notebook options:
Index: sys/modules/Makefile
===================================================================
--- sys/modules/Makefile
+++ sys/modules/Makefile
@@ -351,6 +351,7 @@
sge \
${_sgx} \
${_sgx_linux} \
+ ${_sha_sse} \
siba_bwn \
siftr \
siis \
@@ -625,6 +626,7 @@
_acpi= acpi
.if ${MK_CRYPT} != "no" || defined(ALL_MODULES)
_aesni= aesni
+_sha_sse= sha_sse
.endif
_amd_ecc_inject=amd_ecc_inject
_amdsbwd= amdsbwd
Index: sys/modules/sha_sse/Makefile
===================================================================
--- /dev/null
+++ sys/modules/sha_sse/Makefile
@@ -0,0 +1,25 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/crypto/sha_sse
+.PATH: ${SRCTOP}/contrib/llvm/tools/clang/lib/Headers
+
+KMOD= sha_sse
+SRCS= sha_sse.c
+SRCS+= device_if.h bus_if.h opt_bus.h cryptodev_if.h
+
+OBJS+= intel_sha1.o intel_sha256.o
+
+# Remove -nostdinc so we can get the intrinsics, and replace -O2 with -O3.
+intel_sha1.o: intel_sha1.c
+ ${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+ -mmmx -msse -msse4 -msha ${.IMPSRC}
+ ${CTFCONVERT_CMD}
+intel_sha256.o: intel_sha256.c
+ ${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+ -mmmx -msse -msse4 -msha ${.IMPSRC}
+ ${CTFCONVERT_CMD}
+
+intel_sha1.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
+intel_sha256.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
+
+.include <bsd.kmod.mk>
Index: tests/sys/opencrypto/cryptotest.py
===================================================================
--- tests/sys/opencrypto/cryptotest.py
+++ tests/sys/opencrypto/cryptotest.py
@@ -46,7 +46,7 @@
aesmodules = [ 'cryptosoft0', 'aesni0', 'ccr0' ]
desmodules = [ 'cryptosoft0', ]
-shamodules = [ 'cryptosoft0', 'ccr0' ]
+shamodules = [ 'cryptosoft0', 'ccr0', 'sha_sse0' ]
def GenTestCase(cname):
try:
@@ -308,6 +308,7 @@
cryptosoft = GenTestCase('cryptosoft0')
aesni = GenTestCase('aesni0')
ccr = GenTestCase('ccr0')
+sha_sse = GenTestCase('sha_sse0')
if __name__ == '__main__':
unittest.main()