Page MenuHomeFreeBSD

D12452.id33338.diff
No OneTemporary

D12452.id33338.diff

Index: share/man/man4/sha_sse.4
===================================================================
--- /dev/null
+++ share/man/man4/sha_sse.4
@@ -0,0 +1,81 @@
+.\" Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd September 21, 2017
+.Dt SHA_SSE 4
+.Os
+.Sh NAME
+.Nm sha_sse
+.Nd "driver for the SHA accelerator on x86 CPUs"
+.Sh SYNOPSIS
+To compile this driver into the kernel,
+place the following lines in your
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device crypto"
+.Cd "device cryptodev"
+.Cd "device sha_sse"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+sha_sse_load="YES"
+.Ed
+.Sh DESCRIPTION
+Starting with the Intel Goldmont and AMD Ryzen microarchitectures, some x86
+processors implement a new set of SHA instructions.
+The set of seven instructions accelerates the calculation of SHA1 and SHA256
+hashes.
+.Pp
+The processor capability is reported as SHA in the Structured Extended Features
+line at boot.
+The
+.Nm
+driver does not attach on systems that lack the required CPU capability.
+.Pp
+The
+.Nm
+driver registers itself to accelerate SHA operations for
+.Xr crypto 4 .
+.Sh SEE ALSO
+.Xr crypto 4 ,
+.Xr intro 4 ,
+.Xr ipsec 4 ,
+.Xr crypto 9
+.Sh HISTORY
+The
+.Nm
+driver first appeared in
+.Fx 12.0 .
+.Sh AUTHORS
+.An -nosplit
+The
+.Nm
+driver was written by
+.An Conrad Meyer Aq Mt cem@FreeBSD.org .
+The hash step intrinsics implementations were supplied by Intel.
Index: sys/amd64/conf/NOTES
===================================================================
--- sys/amd64/conf/NOTES
+++ sys/amd64/conf/NOTES
@@ -544,6 +544,7 @@
device padlock_rng # VIA Padlock RNG
device rdrand_rng # Intel Bull Mountain RNG
device aesni # AES-NI OpenCrypto module
+device sha_sse # SHA SSE extension OpenCrypto module
device ioat # Intel I/OAT DMA engine
#
Index: sys/conf/files.amd64
===================================================================
--- sys/conf/files.amd64
+++ sys/conf/files.amd64
@@ -182,6 +182,17 @@
crypto/blowfish/bf_enc.c optional crypto | ipsec | ipsec_support
crypto/des/des_enc.c optional crypto | ipsec | \
ipsec_support | netsmb
+intel_sha1.o optional sha_sse \
+ dependency "$S/crypto/sha_sse/intel_sha1.c" \
+ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+ no-implicit-rule \
+ clean "intel_sha1.o"
+intel_sha256.o optional sha_sse \
+ dependency "$S/crypto/sha_sse/intel_sha256.c" \
+ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+ no-implicit-rule \
+ clean "intel_sha256.o"
+crypto/sha_sse/sha_sse.c optional sha_sse
crypto/via/padlock.c optional padlock
crypto/via/padlock_cipher.c optional padlock
crypto/via/padlock_hash.c optional padlock
Index: sys/conf/files.i386
===================================================================
--- sys/conf/files.i386
+++ sys/conf/files.i386
@@ -132,6 +132,17 @@
no-implicit-rule \
clean "aesni_wrap.o"
crypto/des/arch/i386/des_enc.S optional crypto | ipsec | ipsec_support | netsmb
+intel_sha1.o optional sha_sse \
+ dependency "$S/crypto/sha_sse/intel_sha1.c" \
+ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+ no-implicit-rule \
+ clean "intel_sha1.o"
+intel_sha256.o optional sha_sse \
+ dependency "$S/crypto/sha_sse/intel_sha256.c" \
+ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+ no-implicit-rule \
+ clean "intel_sha256.o"
+crypto/sha_sse/sha_sse.c optional sha_sse
crypto/via/padlock.c optional padlock
crypto/via/padlock_cipher.c optional padlock
crypto/via/padlock_hash.c optional padlock
Index: sys/crypto/sha_sse/intel_sha1.c
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/intel_sha1.c
@@ -0,0 +1,258 @@
+/*******************************************************************************
+* Copyright (c) 2013, Intel Corporation
+*
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met:
+*
+* * Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* * Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* * Neither the name of the Intel Corporation nor the names of its
+* contributors may be used to endorse or promote products derived from
+* this software without specific prior written permission.
+*
+*
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************************
+*
+* Intel SHA Extensions optimized implementation of a SHA-1 update function
+*
+* The function takes a pointer to the current hash values, a pointer to the
+* input data, and a number of 64 byte blocks to process. Once all blocks have
+* been processed, the digest pointer is updated with the resulting hash value.
+* The function only processes complete blocks, there is no functionality to
+* store partial blocks. All message padding and hash value initialization must
+* be done outside the update function.
+*
+* The indented lines in the loop are instructions related to rounds processing.
+* The non-indented lines are instructions related to the message schedule.
+*
+* Author: Sean Gulley <sean.m.gulley@intel.com>
+* Date: July 2013
+*
+********************************************************************************
+*
+* Example compiler command line:
+* icc intel_sha_extensions_sha1_intrinsic.c
+* gcc -msha -msse4 intel_sha_extensions_sha1_intrinsic.c
+*
+*******************************************************************************/
+#include <sys/types.h>
+#include <immintrin.h>
+
+#include <crypto/sha_sse/sha_sse.h>
+
+void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks) {
+ __m128i abcd, e0, e1;
+ __m128i abcd_save, e_save;
+ __m128i msg0, msg1, msg2, msg3;
+ __m128i shuf_mask, e_mask;
+
+#if 0 /* disabled alternative: clear e0's unused lanes with a mask */
+ e_mask = _mm_set_epi64x(0xFFFFFFFF00000000ull, 0x0000000000000000ull);
+#else
+ (void)e_mask;
+ e0 = _mm_set_epi64x(0, 0);
+#endif
+ shuf_mask = _mm_set_epi64x(0x0001020304050607ull, 0x08090a0b0c0d0e0full); // reverses all 16 bytes: big-endian message words in reversed word order
+
+ // Load initial hash values: abcd = digest[0..3], e in lane 3 of e0
+ abcd = _mm_loadu_si128((__m128i*) digest);
+ e0 = _mm_insert_epi32(e0, *(digest+4), 3);
+ abcd = _mm_shuffle_epi32(abcd, 0x1B); // 0x1B reverses dword order into the lane layout the SHA instructions expect
+#if 0
+ e0 = _mm_and_si128(e0, e_mask);
+#endif
+
+ while (num_blks > 0) { // one 64-byte block per iteration
+ // Save hash values for addition after rounds
+ abcd_save = abcd;
+ e_save = e0;
+
+ // Rounds 0-3
+ msg0 = _mm_loadu_si128((const __m128i*) data);
+ msg0 = _mm_shuffle_epi8(msg0, shuf_mask);
+ e0 = _mm_add_epi32(e0, msg0);
+ e1 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+
+ // Rounds 4-7
+ msg1 = _mm_loadu_si128((const __m128i*) (data+16));
+ msg1 = _mm_shuffle_epi8(msg1, shuf_mask);
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+
+ // Rounds 8-11
+ msg2 = _mm_loadu_si128((const __m128i*) (data+32));
+ msg2 = _mm_shuffle_epi8(msg2, shuf_mask);
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ // Rounds 12-15
+ msg3 = _mm_loadu_si128((const __m128i*) (data+48));
+ msg3 = _mm_shuffle_epi8(msg3, shuf_mask);
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
+ msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+ msg1 = _mm_xor_si128(msg1, msg3);
+
+ // Rounds 16-19
+ e0 = _mm_sha1nexte_epu32(e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+ msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+ msg2 = _mm_xor_si128(msg2, msg0);
+
+ // Rounds 20-23
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ // Rounds 24-27
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ // Rounds 28-31
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
+ msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+ msg1 = _mm_xor_si128(msg1, msg3);
+
+ // Rounds 32-35
+ e0 = _mm_sha1nexte_epu32(e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
+ msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+ msg2 = _mm_xor_si128(msg2, msg0);
+
+ // Rounds 36-39
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ // Rounds 40-43
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ // Rounds 44-47
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
+ msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+ msg1 = _mm_xor_si128(msg1, msg3);
+
+ // Rounds 48-51
+ e0 = _mm_sha1nexte_epu32(e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
+ msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+ msg2 = _mm_xor_si128(msg2, msg0);
+
+ // Rounds 52-55
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
+ msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ // Rounds 56-59
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
+ msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+ msg0 = _mm_xor_si128(msg0, msg2);
+
+ // Rounds 60-63
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+ msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+ msg1 = _mm_xor_si128(msg1, msg3);
+
+ // Rounds 64-67
+ e0 = _mm_sha1nexte_epu32(e0, msg0);
+ e1 = abcd;
+ msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
+ msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+ msg2 = _mm_xor_si128(msg2, msg0);
+
+ // Rounds 68-71
+ e1 = _mm_sha1nexte_epu32(e1, msg1);
+ e0 = abcd;
+ msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+ msg3 = _mm_xor_si128(msg3, msg1);
+
+ // Rounds 72-75
+ e0 = _mm_sha1nexte_epu32(e0, msg2);
+ e1 = abcd;
+ msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+ abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
+
+ // Rounds 76-79
+ e1 = _mm_sha1nexte_epu32(e1, msg3);
+ e0 = abcd;
+ abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+
+ // Add current hash values with previously saved
+ e0 = _mm_sha1nexte_epu32(e0, e_save);
+ abcd = _mm_add_epi32(abcd, abcd_save);
+
+ data += 64;
+ num_blks--;
+ }
+
+ abcd = _mm_shuffle_epi32(abcd, 0x1B); // undo the load-time dword reversal
+ _mm_store_si128((__m128i*) digest, abcd); // NOTE(review): aligned store but unaligned load above -- assumes digest is 16-byte aligned; confirm callers
+ *(digest+4) = _mm_extract_epi32(e0, 3); // digest[4] = updated e
+}
+
Index: sys/crypto/sha_sse/intel_sha256.c
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/intel_sha256.c
@@ -0,0 +1,274 @@
+/*******************************************************************************
+* Copyright (c) 2013, Intel Corporation
+*
+* All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met:
+*
+* * Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+*
+* * Redistributions in binary form must reproduce the above copyright
+* notice, this list of conditions and the following disclaimer in the
+* documentation and/or other materials provided with the
+* distribution.
+*
+* * Neither the name of the Intel Corporation nor the names of its
+* contributors may be used to endorse or promote products derived from
+* this software without specific prior written permission.
+*
+*
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************************
+*
+* Intel SHA Extensions optimized implementation of a SHA-256 update function
+*
+* The function takes a pointer to the current hash values, a pointer to the
+* input data, and a number of 64 byte blocks to process. Once all blocks have
+* been processed, the digest pointer is updated with the resulting hash value.
+* The function only processes complete blocks, there is no functionality to
+* store partial blocks. All message padding and hash value initialization must
+* be done outside the update function.
+*
+* The indented lines in the loop are instructions related to rounds processing.
+* The non-indented lines are instructions related to the message schedule.
+*
+* Author: Sean Gulley <sean.m.gulley@intel.com>
+* Date: July 2013
+*
+********************************************************************************
+*
+* Example compiler command line:
+* icc intel_sha_extensions_sha256_intrinsic.c
+* gcc -msha -msse4 intel_sha_extensions_sha256_intrinsic.c
+*
+*******************************************************************************/
+#include <stdint.h>
+#include <immintrin.h>
+
+#include <crypto/sha_sse/sha_sse.h>
+
+void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks) {
+ __m128i state0, state1;
+ __m128i msg;
+ __m128i msgtmp0, msgtmp1, msgtmp2, msgtmp3;
+ __m128i tmp;
+ __m128i shuf_mask;
+ __m128i abef_save, cdgh_save;
+
+ // Load initial hash values
+ // Need to reorder these appropriately
+ // DCBA, HGFE -> ABEF, CDGH
+ tmp = _mm_loadu_si128((__m128i*) digest);
+ state1 = _mm_loadu_si128((__m128i*) (digest+4));
+
+ tmp = _mm_shuffle_epi32(tmp, 0xB1); // CDAB
+ state1 = _mm_shuffle_epi32(state1, 0x1B); // EFGH
+ state0 = _mm_alignr_epi8(tmp, state1, 8); // ABEF
+ state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH
+
+ shuf_mask = _mm_set_epi64x(0x0c0d0e0f08090a0bull, 0x0405060700010203ull); // byte-swap within each 32-bit word (big-endian message load)
+
+ while (num_blks > 0) { // one 64-byte block per iteration
+ // Save hash values for addition after rounds
+ abef_save = state0;
+ cdgh_save = state1;
+
+ // Rounds 0-3
+ msg = _mm_loadu_si128((const __m128i*) data);
+ msgtmp0 = _mm_shuffle_epi8(msg, shuf_mask);
+ msg = _mm_add_epi32(msgtmp0,
+ _mm_set_epi64x(0xE9B5DBA5B5C0FBCFull, 0x71374491428A2F98ull)); // SHA-256 round constants K[3]..K[0]
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+ // Rounds 4-7
+ msgtmp1 = _mm_loadu_si128((const __m128i*) (data+16));
+ msgtmp1 = _mm_shuffle_epi8(msgtmp1, shuf_mask);
+ msg = _mm_add_epi32(msgtmp1,
+ _mm_set_epi64x(0xAB1C5ED5923F82A4ull, 0x59F111F13956C25Bull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+ // Rounds 8-11
+ msgtmp2 = _mm_loadu_si128((const __m128i*) (data+32));
+ msgtmp2 = _mm_shuffle_epi8(msgtmp2, shuf_mask);
+ msg = _mm_add_epi32(msgtmp2,
+ _mm_set_epi64x(0x550C7DC3243185BEull, 0x12835B01D807AA98ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+ // Rounds 12-15
+ msgtmp3 = _mm_loadu_si128((const __m128i*) (data+48));
+ msgtmp3 = _mm_shuffle_epi8(msgtmp3, shuf_mask);
+ msg = _mm_add_epi32(msgtmp3,
+ _mm_set_epi64x(0xC19BF1749BDC06A7ull, 0x80DEB1FE72BE5D74ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+ msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+ msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+ // Rounds 16-19
+ msg = _mm_add_epi32(msgtmp0,
+ _mm_set_epi64x(0x240CA1CC0FC19DC6ull, 0xEFBE4786E49B69C1ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+ msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+ msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+ // Rounds 20-23
+ msg = _mm_add_epi32(msgtmp1,
+ _mm_set_epi64x(0x76F988DA5CB0A9DCull, 0x4A7484AA2DE92C6Full));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
+ msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
+ msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+ // Rounds 24-27
+ msg = _mm_add_epi32(msgtmp2,
+ _mm_set_epi64x(0xBF597FC7B00327C8ull, 0xA831C66D983E5152ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
+ msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
+ msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+ // Rounds 28-31
+ msg = _mm_add_epi32(msgtmp3,
+ _mm_set_epi64x(0x1429296706CA6351ull, 0xD5A79147C6E00BF3ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+ msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+ msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+ // Rounds 32-35
+ msg = _mm_add_epi32(msgtmp0,
+ _mm_set_epi64x(0x53380D134D2C6DFCull, 0x2E1B213827B70A85ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+ msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+ msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+ // Rounds 36-39
+ msg = _mm_add_epi32(msgtmp1,
+ _mm_set_epi64x(0x92722C8581C2C92Eull, 0x766A0ABB650A7354ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
+ msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
+ msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+ // Rounds 40-43
+ msg = _mm_add_epi32(msgtmp2,
+ _mm_set_epi64x(0xC76C51A3C24B8B70ull, 0xA81A664BA2BFE8A1ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
+ msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
+ msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+ // Rounds 44-47
+ msg = _mm_add_epi32(msgtmp3,
+ _mm_set_epi64x(0x106AA070F40E3585ull, 0xD6990624D192E819ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+ msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+ msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+ // Rounds 48-51
+ msg = _mm_add_epi32(msgtmp0,
+ _mm_set_epi64x(0x34B0BCB52748774Cull, 0x1E376C0819A4C116ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+ msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+ msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+ // Rounds 52-55
+ msg = _mm_add_epi32(msgtmp1,
+ _mm_set_epi64x(0x682E6FF35B9CCA4Full, 0x4ED8AA4A391C0CB3ull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
+ msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
+ msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+ // Rounds 56-59
+ msg = _mm_add_epi32(msgtmp2,
+ _mm_set_epi64x(0x8CC7020884C87814ull, 0x78A5636F748F82EEull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
+ msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
+ msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+ // Rounds 60-63
+ msg = _mm_add_epi32(msgtmp3,
+ _mm_set_epi64x(0xC67178F2BEF9A3F7ull, 0xA4506CEB90BEFFFAull));
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+ // Add current hash values with previously saved
+ state0 = _mm_add_epi32(state0, abef_save);
+ state1 = _mm_add_epi32(state1, cdgh_save);
+
+ data += 64;
+ num_blks--;
+ }
+
+ // Write hash values back in the correct order
+ tmp = _mm_shuffle_epi32(state0, 0x1B); // FEBA
+ state1 = _mm_shuffle_epi32(state1, 0xB1); // DCHG
+ state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA
+ state1 = _mm_alignr_epi8(state1, tmp, 8); // HGFE (not ABEF: stored as digest[4..7] = E,F,G,H)
+
+ _mm_store_si128((__m128i*) digest, state0); // NOTE(review): aligned stores but unaligned loads above -- assumes digest is 16-byte aligned; confirm callers
+ _mm_store_si128((__m128i*) (digest+4), state1);
+}
+
Index: sys/crypto/sha_sse/sha_sse.h
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/sha_sse.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _CRYPTO__SHA_SSE_H_
+#define _CRYPTO__SHA_SSE_H_
+
+/*
+ * Internal functions, implemented in intrinsics.
+ */
+void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks);
+void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks);
+
+#endif /* _CRYPTO__SHA_SSE_H_ */
Index: sys/crypto/sha_sse/sha_sse.c
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/sha_sse.c
@@ -0,0 +1,606 @@
+/*-
+ * Copyright (c) 2005-2008 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
+ * Copyright (c) 2014 The FreeBSD Foundation
+ * Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions of this software were developed by John-Mark Gurney
+ * under sponsorship of the FreeBSD Foundation and
+ * Rubicon Communications, LLC (Netgate).
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+#include <sys/rwlock.h>
+#include <sys/bus.h>
+#include <sys/uio.h>
+#include <sys/mbuf.h>
+#include <sys/smp.h>
+
+#include <crypto/sha1.h>
+#include <crypto/sha2/sha256.h>
+#include <crypto/sha_sse/sha_sse.h>
+
+#include <opencrypto/cryptodev.h>
+#include <opencrypto/gmac.h>
+#include <cryptodev_if.h>
+
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+#if defined(__i386__)
+#include <machine/npx.h>
+#elif defined(__amd64__)
+#include <machine/fpu.h>
+#endif
+/* Per-CPU lock protecting the corresponding entry of ctx_fpu[]. */
+static struct mtx_padalign *ctx_mtx;
+/* Per-CPU FPU kernel contexts, allocated in attach, freed in cleanctx. */
+static struct fpu_kern_ctx **ctx_fpu;
+
+/* Per-device soft state: session list and crypto framework bookkeeping. */
+struct sha_sse_softc {
+ int dieing;
+ int32_t cid;
+ uint32_t sid;
+ struct rwlock lock;
+ TAILQ_HEAD(sha_sse_sessions_head, sha_sse_session) sessions;
+};
+
+/* One opencrypto session; recycled via the softc session list. */
+struct sha_sse_session {
+ /* Same as the SHA256 Blocksize. */
+ uint8_t key[SHA1_HMAC_BLOCK_LEN] __aligned(16);
+ int algo;
+ int used;
+ int mlen;
+ uint32_t id;
+ TAILQ_ENTRY(sha_sse_session) next;
+};
+
+/*
+ * Pin to the current CPU's FPU context: taking the per-CPU mutex
+ * prevents migration-induced sharing of the save area.
+ */
+#define ACQUIRE_CTX(i, ctx) \
+ do { \
+ (i) = PCPU_GET(cpuid); \
+ mtx_lock(&ctx_mtx[(i)]); \
+ (ctx) = ctx_fpu[(i)]; \
+ } while (0)
+#define RELEASE_CTX(i, ctx) \
+ do { \
+ mtx_unlock(&ctx_mtx[(i)]); \
+ (i) = -1; \
+ (ctx) = NULL; \
+ } while (0)
+
+static int sha_sse_newsession(device_t, uint32_t *sidp, struct cryptoini *cri);
+static int sha_sse_freesession(device_t, uint64_t tid);
+static void sha_sse_freesession_locked(struct sha_sse_softc *sc,
+ struct sha_sse_session *ses);
+static int sha_sse_cipher_setup(struct sha_sse_session *ses,
+ struct cryptoini *encini);
+static int sha_sse_cipher_process(struct sha_sse_session *ses,
+ struct cryptop *crp);
+
+MALLOC_DEFINE(M_SHA_SSE, "sha_sse_data", "SHA_SSE Data");
+
+/*
+ * Bus identify method: add a single "sha_sse" child under the parent
+ * bus if one is not already present.
+ */
+static void
+sha_sse_identify(driver_t *drv, device_t parent)
+{
+
+ /* NB: order 10 is so we get attached after h/w devices */
+ if (device_find_child(parent, "sha_sse", -1) == NULL &&
+ BUS_ADD_CHILD(parent, 10, "sha_sse", -1) == 0)
+ panic("sha_sse: could not attach");
+}
+
+/*
+ * Probe: succeed only on CPUs advertising both the SHA extensions
+ * (CPUID.7.EBX bit) and SSSE3, which the intrinsic block functions
+ * require.
+ */
+static int
+sha_sse_probe(device_t dev)
+{
+
+ if ((cpu_stdext_feature & CPUID_STDEXT_SHA) == 0) {
+ device_printf(dev, "No SHA support.\n");
+ return (EINVAL);
+ }
+ if ((cpu_feature2 & CPUID2_SSSE3) == 0) {
+ device_printf(dev, "No SSSE3 support.\n");
+ return (EINVAL);
+ }
+
+ device_set_desc_copy(dev, "SHA1,SHA2");
+ return (0);
+}
+
+/*
+ * Tear down the per-CPU FPU contexts and their mutexes, then release
+ * the backing arrays.  Called from detach; assumes no concurrent
+ * crypto requests remain.
+ */
+static void
+sha_sse_cleanctx(void)
+{
+ int i;
+
+ /* XXX - no way to return driverid */
+ CPU_FOREACH(i) {
+ if (ctx_fpu[i] != NULL) {
+ mtx_destroy(&ctx_mtx[i]);
+ fpu_kern_free_ctx(ctx_fpu[i]);
+ }
+ ctx_fpu[i] = NULL;
+ }
+ free(ctx_mtx, M_SHA_SSE);
+ ctx_mtx = NULL;
+ free(ctx_fpu, M_SHA_SSE);
+ ctx_fpu = NULL;
+}
+
+/*
+ * Attach: register with the crypto framework, allocate one FPU kernel
+ * context and mutex per CPU, and advertise the supported algorithms
+ * (SHA1, SHA1-HMAC, SHA256-HMAC).
+ */
+static int
+sha_sse_attach(device_t dev)
+{
+ struct sha_sse_softc *sc;
+ int i;
+
+ sc = device_get_softc(dev);
+ sc->dieing = 0;
+ TAILQ_INIT(&sc->sessions);
+ sc->sid = 1;
+
+ sc->cid = crypto_get_driverid(dev, CRYPTOCAP_F_HARDWARE |
+ CRYPTOCAP_F_SYNC);
+ if (sc->cid < 0) {
+ device_printf(dev, "Could not get crypto driver id.\n");
+ return (ENOMEM);
+ }
+
+ /* mp_maxid is the highest CPU id; arrays are indexed by cpuid. */
+ ctx_mtx = malloc(sizeof(*ctx_mtx) * (mp_maxid + 1), M_SHA_SSE,
+ M_WAITOK | M_ZERO);
+ ctx_fpu = malloc(sizeof(*ctx_fpu) * (mp_maxid + 1), M_SHA_SSE,
+ M_WAITOK | M_ZERO);
+
+ CPU_FOREACH(i) {
+ ctx_fpu[i] = fpu_kern_alloc_ctx(0);
+ mtx_init(&ctx_mtx[i], "shafpumtx", NULL, MTX_DEF | MTX_NEW);
+ }
+
+ rw_init(&sc->lock, "sha_sse_lock");
+ crypto_register(sc->cid, CRYPTO_SHA1, 0, 0);
+ crypto_register(sc->cid, CRYPTO_SHA1_HMAC, 0, 0);
+ crypto_register(sc->cid, CRYPTO_SHA2_256_HMAC, 0, 0);
+ return (0);
+}
+
+/*
+ * Detach: refuse while any session is in use; otherwise mark the
+ * device dying, free all (idle) sessions, unregister from the crypto
+ * framework, and release the per-CPU FPU state.
+ */
+static int
+sha_sse_detach(device_t dev)
+{
+ struct sha_sse_softc *sc;
+ struct sha_sse_session *ses;
+
+ sc = device_get_softc(dev);
+
+ rw_wlock(&sc->lock);
+ TAILQ_FOREACH(ses, &sc->sessions, next) {
+ if (ses->used) {
+ rw_wunlock(&sc->lock);
+ device_printf(dev,
+ "Cannot detach, sessions still active.\n");
+ return (EBUSY);
+ }
+ }
+ /* Block newsession() from handing out sessions from here on. */
+ sc->dieing = 1;
+ while ((ses = TAILQ_FIRST(&sc->sessions)) != NULL) {
+ TAILQ_REMOVE(&sc->sessions, ses, next);
+ free(ses, M_SHA_SSE);
+ }
+ rw_wunlock(&sc->lock);
+ crypto_unregister_all(sc->cid);
+
+ rw_destroy(&sc->lock);
+
+ sha_sse_cleanctx();
+
+ return (0);
+}
+
+/*
+ * cryptodev newsession method.  Validates that exactly one supported
+ * hash descriptor is requested, then recycles a free session from the
+ * head of the list (free sessions are kept first) or allocates a new
+ * one.  On success *sidp receives the session id.
+ */
+static int
+sha_sse_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri)
+{
+ struct sha_sse_softc *sc;
+ struct sha_sse_session *ses;
+ struct cryptoini *encini;
+ int error;
+
+ if (sidp == NULL || cri == NULL) {
+ CRYPTDEB("no sidp or cri");
+ return (EINVAL);
+ }
+
+ sc = device_get_softc(dev);
+ if (sc->dieing)
+ return (EINVAL);
+
+ ses = NULL;
+ encini = NULL;
+ /* Exactly one hash algorithm may be requested per session. */
+ for (; cri != NULL; cri = cri->cri_next) {
+ switch (cri->cri_alg) {
+ case CRYPTO_SHA1:
+ case CRYPTO_SHA1_HMAC:
+ case CRYPTO_SHA2_256_HMAC:
+ if (encini != NULL) {
+ CRYPTDEB("encini already set");
+ return (EINVAL);
+ }
+ encini = cri;
+ break;
+ default:
+ CRYPTDEB("unhandled algorithm");
+ return (EINVAL);
+ }
+ }
+ if (encini == NULL) {
+ CRYPTDEB("no cipher");
+ return (EINVAL);
+ }
+
+ rw_wlock(&sc->lock);
+ /* Re-check under the lock: detach may have raced us. */
+ if (sc->dieing) {
+ rw_wunlock(&sc->lock);
+ return (EINVAL);
+ }
+ /*
+ * Free sessions goes first, so if first session is used, we need to
+ * allocate one.
+ */
+ ses = TAILQ_FIRST(&sc->sessions);
+ if (ses == NULL || ses->used) {
+ ses = malloc(sizeof(*ses), M_SHA_SSE, M_NOWAIT | M_ZERO);
+ if (ses == NULL) {
+ rw_wunlock(&sc->lock);
+ return (ENOMEM);
+ }
+ ses->id = sc->sid++;
+ } else {
+ TAILQ_REMOVE(&sc->sessions, ses, next);
+ }
+ ses->used = 1;
+ /* Used sessions live at the tail; free ones at the head. */
+ TAILQ_INSERT_TAIL(&sc->sessions, ses, next);
+ rw_wunlock(&sc->lock);
+ ses->algo = encini->cri_alg;
+
+ error = sha_sse_cipher_setup(ses, encini);
+ if (error != 0) {
+ CRYPTDEB("setup failed");
+ rw_wlock(&sc->lock);
+ sha_sse_freesession_locked(sc, ses);
+ rw_wunlock(&sc->lock);
+ return (error);
+ }
+
+ *sidp = ses->id;
+ return (0);
+}
+
+/*
+ * Return a session to the free pool: scrub its contents (keys
+ * included) with explicit_bzero, keep only its id, and move it to the
+ * head of the list where newsession() looks for free entries.
+ * Caller must hold the softc write lock.
+ */
+static void
+sha_sse_freesession_locked(struct sha_sse_softc *sc, struct sha_sse_session *ses)
+{
+ uint32_t sid;
+
+ rw_assert(&sc->lock, RA_WLOCKED);
+
+ sid = ses->id;
+ TAILQ_REMOVE(&sc->sessions, ses, next);
+ explicit_bzero(ses, sizeof(*ses));
+ ses->id = sid;
+ TAILQ_INSERT_HEAD(&sc->sessions, ses, next);
+}
+
+/*
+ * cryptodev freesession method: look the session up by the low 32
+ * bits of the session id and recycle it.
+ */
+static int
+sha_sse_freesession(device_t dev, uint64_t tid)
+{
+ struct sha_sse_softc *sc;
+ struct sha_sse_session *ses;
+ uint32_t sid;
+
+ sc = device_get_softc(dev);
+ /* NOTE(review): the mask is redundant after the uint32_t cast. */
+ sid = ((uint32_t)tid) & 0xffffffff;
+ rw_wlock(&sc->lock);
+ /* Used sessions are at the tail, so search in reverse. */
+ TAILQ_FOREACH_REVERSE(ses, &sc->sessions, sha_sse_sessions_head, next) {
+ if (ses->id == sid)
+ break;
+ }
+ if (ses == NULL) {
+ rw_wunlock(&sc->lock);
+ return (EINVAL);
+ }
+ sha_sse_freesession_locked(sc, ses);
+ rw_wunlock(&sc->lock);
+ return (0);
+}
+
+/*
+ * cryptodev process method: find the session named by crp_sid and
+ * run the hash synchronously.  The completion status is reported via
+ * crp_etype and crypto_done() in all paths.
+ */
+static int
+sha_sse_process(device_t dev, struct cryptop *crp, int hint __unused)
+{
+ struct sha_sse_softc *sc = device_get_softc(dev);
+ struct sha_sse_session *ses = NULL;
+ int error;
+
+ error = 0;
+
+ /* Sanity check. */
+ if (crp == NULL)
+ return (EINVAL);
+
+ if (crp->crp_callback == NULL || crp->crp_desc == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ rw_rlock(&sc->lock);
+ TAILQ_FOREACH_REVERSE(ses, &sc->sessions, sha_sse_sessions_head, next) {
+ if (ses->id == (crp->crp_sid & 0xffffffff))
+ break;
+ }
+ rw_runlock(&sc->lock);
+ if (ses == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = sha_sse_cipher_process(ses, crp);
+ /* NOTE(review): this goto is redundant; it falls through to out. */
+ if (error != 0)
+ goto out;
+
+out:
+ crp->crp_etype = error;
+ crypto_done(crp);
+ return (error);
+}
+
+/* newbus device and opencrypto method dispatch table. */
+static device_method_t sha_sse_methods[] = {
+ DEVMETHOD(device_identify, sha_sse_identify),
+ DEVMETHOD(device_probe, sha_sse_probe),
+ DEVMETHOD(device_attach, sha_sse_attach),
+ DEVMETHOD(device_detach, sha_sse_detach),
+
+ DEVMETHOD(cryptodev_newsession, sha_sse_newsession),
+ DEVMETHOD(cryptodev_freesession, sha_sse_freesession),
+ DEVMETHOD(cryptodev_process, sha_sse_process),
+
+ {0, 0},
+};
+
+static driver_t sha_sse_driver = {
+ "sha_sse",
+ sha_sse_methods,
+ sizeof(struct sha_sse_softc),
+};
+static devclass_t sha_sse_devclass;
+
+/* Attach directly under nexus; depends on the opencrypto framework. */
+DRIVER_MODULE(sha_sse, nexus, sha_sse_driver, sha_sse_devclass, 0, 0);
+MODULE_VERSION(sha_sse, 1);
+MODULE_DEPEND(sha_sse, crypto, 1, 1, 1);
+
+/*
+ * Record the HMAC key (at most one hash block, 64 bytes) and the
+ * requested MAC truncation length.  Plain SHA1 takes no key.
+ */
+static int
+sha_sse_cipher_setup(struct sha_sse_session *ses, struct cryptoini *encini)
+{
+ int keylen;
+
+ /*
+ * NOTE(review): keylen is signed; the comparison against the
+ * unsigned sizeof() promotes a negative value to a huge unsigned
+ * one, which rejects it — subtle but apparently intentional.
+ */
+ keylen = encini->cri_klen / 8;
+ if (keylen > sizeof(ses->key))
+ return (EINVAL);
+ if (ses->algo == CRYPTO_SHA1 && keylen > 0)
+ return (EINVAL);
+
+ memcpy(ses->key, encini->cri_key, keylen);
+ ses->mlen = encini->cri_mlen;
+
+ return (0);
+}
+
+/*
+ * Incremental SHA-1 update on a struct sha1_ctxt, feeding whole
+ * 64-byte blocks straight to intel_sha1_step and staging partial
+ * blocks in the context buffer.  Returns 0 so it can serve as a
+ * crypto_apply() callback.
+ */
+static int
+intel_sha1_update(void *vctx, const void *vdata, u_int datalen)
+{
+ struct sha1_ctxt *ctx = vctx;
+ const char *data = vdata;
+ size_t gaplen;
+ size_t gapstart;
+ size_t off;
+ size_t copysiz;
+ u_int blocks;
+
+ off = 0;
+ /* Do any aligned blocks without redundant copying. */
+ if (datalen >= 64 && ctx->count % 64 == 0) {
+ blocks = datalen / 64;
+ /* Running bit count; c.b64[0] holds message length in bits. */
+ ctx->c.b64[0] += blocks * 64 * 8;
+ intel_sha1_step(ctx->h.b32, data + off, blocks);
+ off += blocks * 64;
+ }
+
+ /* Stage the remainder (or unaligned input) through ctx->m.b8. */
+ while (off < datalen) {
+ gapstart = ctx->count % 64;
+ gaplen = 64 - gapstart;
+
+ copysiz = (gaplen < datalen - off) ? gaplen : datalen - off;
+ bcopy(&data[off], &ctx->m.b8[gapstart], copysiz);
+ ctx->count += copysiz;
+ ctx->count %= 64;
+ ctx->c.b64[0] += copysiz * 8;
+ /* Buffer filled a whole block: hash it. */
+ if (ctx->count % 64 == 0)
+ intel_sha1_step(ctx->h.b32, (void *)ctx->m.b8, 1);
+ off += copysiz;
+ }
+ return (0);
+}
+
+/* Adapter matching the finalize(void *, void *) shape of hmac_internal. */
+static void
+SHA1_Finalize_fn(void *digest, void *ctx)
+{
+ sha1_result(ctx, digest);
+}
+
+/*
+ * Incremental SHA-256 update on a SHA256_CTX, feeding whole 64-byte
+ * blocks straight to intel_sha256_step and staging partial blocks in
+ * ctx->buf.  Returns 0 so it can serve as a crypto_apply() callback.
+ */
+static int
+intel_sha256_update(void *vctx, const void *vdata, u_int len)
+{
+ SHA256_CTX *ctx = vctx;
+ uint64_t bitlen;
+ uint32_t r;
+ u_int blocks;
+ const unsigned char *src = vdata;
+
+ /* Number of bytes left in the buffer from previous updates */
+ r = (ctx->count >> 3) & 0x3f;
+
+ /*
+ * Convert the length into a number of bits.  Widen before the
+ * shift: "len << 3" would be evaluated in 32 bits and truncate
+ * for inputs of 512MB or more.
+ */
+ bitlen = (uint64_t)len << 3;
+
+ /* Update number of bits */
+ ctx->count += bitlen;
+
+ /* Handle the case where we don't need to perform any transforms */
+ if (len < 64 - r) {
+ memcpy(&ctx->buf[r], src, len);
+ return (0);
+ }
+
+ /* Finish the current block */
+ memcpy(&ctx->buf[r], src, 64 - r);
+ intel_sha256_step(ctx->state, ctx->buf, 1);
+ src += 64 - r;
+ len -= 64 - r;
+
+ /* Perform complete blocks */
+ if (len >= 64) {
+ blocks = len / 64;
+ intel_sha256_step(ctx->state, src, blocks);
+ src += blocks * 64;
+ len -= blocks * 64;
+ }
+
+ /* Copy left over data into buffer */
+ memcpy(ctx->buf, src, len);
+ return (0);
+}
+
+/* Adapter matching the finalize(void *, void *) shape of hmac_internal. */
+static void
+SHA256_Finalize_fn(void *digest, void *ctx)
+{
+ SHA256_Final(digest, ctx);
+}
+
+/*
+ * Compute the HASH( (key ^ xorbyte) || buf )
+ *
+ * One half of an HMAC computation: 'key' must be a full 64-byte hash
+ * block (it is XOR-scrambled in place and restored before return).
+ * 'buf'/'off'/'buflen' select the message region walked with
+ * crypto_apply() under 'crpflags'; the digest lands in 'res'.
+ */
+static void
+hmac_internal(void *ctx, uint32_t *res,
+ int (*update)(void *, const void *, u_int),
+ void (*finalize)(void *, void *), uint8_t *key, uint8_t xorbyte,
+ const void *buf, size_t off, size_t buflen, int crpflags)
+{
+ size_t i;
+
+ /* Apply the ipad/opad byte, hash the padded key, then undo it. */
+ for (i = 0; i < 64; i++)
+ key[i] ^= xorbyte;
+ update(ctx, key, 64);
+ for (i = 0; i < 64; i++)
+ key[i] ^= xorbyte;
+
+ crypto_apply(crpflags, __DECONST(void *, buf), off, buflen,
+ __DECONST(int (*)(void *, void *, u_int), update), ctx);
+ finalize(res, ctx);
+}
+
+/*
+ * Run one hash request.  Enters an FPU kernel context (unless the
+ * calling thread already owns one), dispatches on the session
+ * algorithm, and copies the (possibly truncated) digest back into the
+ * request buffer at crd_inject.
+ *
+ * Note the paired labels: a failed fpu_kern_enter() skips
+ * fpu_kern_leave() via out2 but still releases the per-CPU context.
+ */
+static int
+sha_sse_cipher_process(struct sha_sse_session *ses, struct cryptop *crp)
+{
+ struct SHA256Context s2ctx;
+ struct sha1_ctxt sctx __aligned(16);
+ uint32_t res[8];
+ struct fpu_kern_ctx *ctx;
+ struct cryptodesc *crd;
+ int error, kt, ctxidx, hashlen;
+
+ error = 0;
+
+ kt = is_fpu_kern_thread(0);
+ if (!kt) {
+ ACQUIRE_CTX(ctxidx, ctx);
+ error = fpu_kern_enter(curthread, ctx,
+ FPU_KERN_NORMAL|FPU_KERN_KTHR);
+ if (error != 0)
+ goto out2;
+ }
+
+ /* Only single-descriptor, flagless requests are supported. */
+ crd = crp->crp_desc;
+ if (crd->crd_next != NULL || crd->crd_flags != 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /* ses->algo was validated in newsession; no default case needed. */
+ switch (ses->algo) {
+ case CRYPTO_SHA1_HMAC:
+ hashlen = SHA1_HASH_LEN;
+ /* Inner hash: (K ^ IPAD) || data */
+ sha1_init(&sctx);
+ hmac_internal(&sctx, res, intel_sha1_update, SHA1_Finalize_fn,
+ ses->key, 0x36, crp->crp_buf, crd->crd_skip, crd->crd_len,
+ crp->crp_flags);
+ /* Outer hash: (K ^ OPAD) || inner hash */
+ sha1_init(&sctx);
+ hmac_internal(&sctx, res, intel_sha1_update, SHA1_Finalize_fn,
+ ses->key, 0x5C, res, 0, hashlen, 0);
+ break;
+ case CRYPTO_SHA1:
+ hashlen = SHA1_HASH_LEN;
+ sha1_init(&sctx);
+ crypto_apply(crp->crp_flags, crp->crp_buf, crd->crd_skip,
+ crd->crd_len, __DECONST(int (*)(void *, void *, u_int),
+ intel_sha1_update), &sctx);
+ sha1_result(&sctx, (void *)res);
+ break;
+ case CRYPTO_SHA2_256_HMAC:
+ hashlen = SHA2_256_HASH_LEN;
+ /* Inner hash: (K ^ IPAD) || data */
+ SHA256_Init(&s2ctx);
+ hmac_internal(&s2ctx, res, intel_sha256_update,
+ SHA256_Finalize_fn, ses->key, 0x36, crp->crp_buf,
+ crd->crd_skip, crd->crd_len, crp->crp_flags);
+ /* Outer hash: (K ^ OPAD) || inner hash */
+ SHA256_Init(&s2ctx);
+ hmac_internal(&s2ctx, res, intel_sha256_update,
+ SHA256_Finalize_fn, ses->key, 0x5C, res, 0, hashlen, 0);
+ break;
+ }
+
+ /* Honor a caller-requested truncated MAC length. */
+ if (ses->mlen != 0 && ses->mlen < hashlen)
+ hashlen = ses->mlen;
+ crypto_copyback(crp->crp_flags, crp->crp_buf, crd->crd_inject, hashlen,
+ (void *)res);
+
+out:
+ if (!kt) {
+ fpu_kern_leave(curthread, ctx);
+out2:
+ RELEASE_CTX(ctxidx, ctx);
+ }
+ return (error);
+}
Index: sys/i386/conf/NOTES
===================================================================
--- sys/i386/conf/NOTES
+++ sys/i386/conf/NOTES
@@ -839,6 +839,7 @@
device padlock_rng # VIA Padlock RNG
device rdrand_rng # Intel Bull Mountain RNG
device aesni # AES-NI OpenCrypto module
+device sha_sse # SHA SSE extension OpenCrypto module
#
# Laptop/Notebook options:
Index: sys/modules/Makefile
===================================================================
--- sys/modules/Makefile
+++ sys/modules/Makefile
@@ -351,6 +351,7 @@
sge \
${_sgx} \
${_sgx_linux} \
+ ${_sha_sse} \
siba_bwn \
siftr \
siis \
@@ -625,6 +626,7 @@
_acpi= acpi
.if ${MK_CRYPT} != "no" || defined(ALL_MODULES)
_aesni= aesni
+_sha_sse= sha_sse
.endif
_amd_ecc_inject=amd_ecc_inject
_amdsbwd= amdsbwd
Index: sys/modules/sha_sse/Makefile
===================================================================
--- /dev/null
+++ sys/modules/sha_sse/Makefile
@@ -0,0 +1,25 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/crypto/sha_sse
+.PATH: ${SRCTOP}/contrib/llvm/tools/clang/lib/Headers
+
+KMOD= sha_sse
+SRCS= sha_sse.c
+SRCS+= device_if.h bus_if.h opt_bus.h cryptodev_if.h
+
+OBJS+= intel_sha1.o intel_sha256.o
+
+# Remove -nostdinc so we can get the intrinsics.
+intel_sha1.o: intel_sha1.c
+ ${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+ -mmmx -msse -msse4 -msha ${.IMPSRC}
+ ${CTFCONVERT_CMD}
+intel_sha256.o: intel_sha256.c
+ ${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+ -mmmx -msse -msse4 -msha ${.IMPSRC}
+ ${CTFCONVERT_CMD}
+
+intel_sha1.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
+intel_sha256.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
+
+.include <bsd.kmod.mk>
Index: tests/sys/opencrypto/cryptotest.py
===================================================================
--- tests/sys/opencrypto/cryptotest.py
+++ tests/sys/opencrypto/cryptotest.py
@@ -46,7 +46,7 @@
aesmodules = [ 'cryptosoft0', 'aesni0', 'ccr0' ]
desmodules = [ 'cryptosoft0', ]
-shamodules = [ 'cryptosoft0', 'ccr0' ]
+shamodules = [ 'cryptosoft0', 'ccr0', 'sha_sse0' ]
def GenTestCase(cname):
try:
@@ -308,6 +308,7 @@
cryptosoft = GenTestCase('cryptosoft0')
aesni = GenTestCase('aesni0')
ccr = GenTestCase('ccr0')
+sha_sse = GenTestCase('sha_sse0')
if __name__ == '__main__':
unittest.main()

File Metadata

Mime Type
text/plain
Expires
Sun, Apr 26, 5:42 AM (3 h, 19 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
32166156
Default Alt Text
D12452.id33338.diff (46 KB)

Event Timeline