Index: share/man/man4/sha_sse.4 =================================================================== --- /dev/null +++ share/man/man4/sha_sse.4 @@ -0,0 +1,81 @@ +.\" Copyright (c) 2017 Conrad Meyer +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd September 21, 2017 +.Dt SHA_SSE 4 +.Os +.Sh NAME +.Nm sha_sse +.Nd "driver for the SHA accelerator on x86 CPUs" +.Sh SYNOPSIS +To compile this driver into the kernel, +place the following lines in your +kernel configuration file: +.Bd -ragged -offset indent +.Cd "device crypto" +.Cd "device cryptodev" +.Cd "device sha_sse" +.Ed +.Pp +Alternatively, to load the driver as a +module at boot time, place the following line in +.Xr loader.conf 5 : +.Bd -literal -offset indent +sha_sse_load="YES" +.Ed +.Sh DESCRIPTION +Starting with the Intel Goldmont and AMD Ryzen microarchitectures, some x86 +processors implement a new set of SHA instructions. +The set of seven instructions accelerates the calculation of SHA1 and SHA256 +hashes. +.Pp +The processor capability is reported as SHA in the Structured Extended Features +line at boot. +The +.Nm +driver does not attach on systems that lack the required CPU capability. +.Pp +The +.Nm +driver registers itself to accelerate SHA operations for +.Xr crypto 4 . +.Sh SEE ALSO +.Xr crypto 4 , +.Xr intro 4 , +.Xr ipsec 4 , +.Xr crypto 9 +.Sh HISTORY +The +.Nm +driver first appeared in +.Fx 12.0 . +.Sh AUTHORS +.An -nosplit +The +.Nm +driver was written by +.An Conrad Meyer Aq Mt cem@FreeBSD.org . +The hash step intrinsics implementations were supplied by Intel. 
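The DESCRIPTION above says the driver registers with crypto(4) to accelerate SHA operations. For context only, a userland consumer would reach it through the cryptodev(4) ioctl interface roughly as in the following sketch; this is a hypothetical harness (error handling omitted, key and data arbitrary), not part of the patch:

/*
 * Hypothetical userland sketch (not part of this patch): exercise the SHA
 * support through /dev/crypto.  Error handling is omitted.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <crypto/cryptodev.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct session_op sop;
	struct crypt_op cop;
	char key[] = "an arbitrary hmac key";
	char msg[] = "hello, sha_sse";
	char mac[20];	/* SHA1-HMAC digest is 20 bytes */
	int fd, cfd;

	fd = open("/dev/crypto", O_RDWR);
	ioctl(fd, CRIOGET, &cfd);		/* get a per-process crypto fd */

	memset(&sop, 0, sizeof(sop));
	sop.mac = CRYPTO_SHA1_HMAC;		/* one of the algorithms sha_sse registers */
	sop.mackey = key;
	sop.mackeylen = sizeof(key) - 1;
	ioctl(cfd, CIOCGSESSION, &sop);		/* sop.ses now identifies the session */

	memset(&cop, 0, sizeof(cop));
	cop.ses = sop.ses;
	cop.op = COP_ENCRYPT;
	cop.src = msg;
	cop.len = strlen(msg);
	cop.mac = mac;				/* digest is written here */
	ioctl(cfd, CIOCCRYPT, &cop);

	for (size_t i = 0; i < sizeof(mac); i++)
		printf("%02x", (unsigned char)mac[i]);
	printf("\n");

	ioctl(cfd, CIOCFSESSION, &sop.ses);
	close(cfd);
	close(fd);
	return (0);
}

Whether the request is actually serviced by sha_sse0 or by cryptosoft0 depends on driver priorities at the time the session is created, which is what the cryptotest.py change at the end of this patch checks for.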
Index: sys/amd64/conf/NOTES =================================================================== --- sys/amd64/conf/NOTES +++ sys/amd64/conf/NOTES @@ -544,6 +544,7 @@ device padlock_rng # VIA Padlock RNG device rdrand_rng # Intel Bull Mountain RNG device aesni # AES-NI OpenCrypto module +device sha_sse # SHA SSE extension OpenCrypto module device ioat # Intel I/OAT DMA engine # Index: sys/conf/files.amd64 =================================================================== --- sys/conf/files.amd64 +++ sys/conf/files.amd64 @@ -182,6 +182,17 @@ crypto/blowfish/bf_enc.c optional crypto | ipsec | ipsec_support crypto/des/des_enc.c optional crypto | ipsec | \ ipsec_support | netsmb +intel_sha1.o optional sha_sse \ + dependency "$S/crypto/sha_sse/intel_sha1.c" \ + compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \ + no-implicit-rule \ + clean "intel_sha1.o" +intel_sha256.o optional sha_sse \ + dependency "$S/crypto/sha_sse/intel_sha256.c" \ + compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \ + no-implicit-rule \ + clean "intel_sha256.o" +crypto/sha_sse/sha_sse.c optional sha_sse crypto/via/padlock.c optional padlock crypto/via/padlock_cipher.c optional padlock crypto/via/padlock_hash.c optional padlock Index: sys/conf/files.i386 =================================================================== --- sys/conf/files.i386 +++ sys/conf/files.i386 @@ -132,6 +132,17 @@ no-implicit-rule \ clean "aesni_wrap.o" crypto/des/arch/i386/des_enc.S optional crypto | ipsec | ipsec_support | netsmb +intel_sha1.o optional sha_sse \ + dependency "$S/crypto/sha_sse/intel_sha1.c" \ + compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \ + no-implicit-rule \ + clean "intel_sha1.o" +intel_sha256.o optional sha_sse \ + dependency "$S/crypto/sha_sse/intel_sha256.c" \ + compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \ + no-implicit-rule \ + clean "intel_sha256.o" +crypto/sha_sse/sha_sse.c optional sha_sse crypto/via/padlock.c optional padlock crypto/via/padlock_cipher.c optional padlock crypto/via/padlock_hash.c optional padlock Index: sys/crypto/sha_sse/intel_sha1.c =================================================================== --- /dev/null +++ sys/crypto/sha_sse/intel_sha1.c @@ -0,0 +1,258 @@ +/******************************************************************************* +* Copyright (c) 2013, Intel Corporation +* +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: +* +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* * Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* * Neither the name of the Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived from +* this software without specific prior written permission. 
+*
+*
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************************
+*
+* Intel SHA Extensions optimized implementation of a SHA-1 update function
+*
+* The function takes a pointer to the current hash values, a pointer to the
+* input data, and a number of 64 byte blocks to process.  Once all blocks have
+* been processed, the digest pointer is updated with the resulting hash value.
+* The function only processes complete blocks; there is no functionality to
+* store partial blocks.  All message padding and hash value initialization must
+* be done outside the update function.
+*
+* The indented lines in the loop are instructions related to rounds processing.
+* The non-indented lines are instructions related to the message schedule.
+*
+* Author: Sean Gulley
+* Date: July 2013
+*
+********************************************************************************
+*
+* Example compiler command line:
+* icc intel_sha_extensions_sha1_intrinsic.c
+* gcc -msha -msse4 intel_sha_extensions_sha1_intrinsic.c
+*
+*******************************************************************************/
+#include <sys/cdefs.h>
+#include <sys/types.h>
+
+#include <immintrin.h>
+
+void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks) {
+	__m128i abcd, e0, e1;
+	__m128i abcd_save, e_save;
+	__m128i msg0, msg1, msg2, msg3;
+	__m128i shuf_mask, e_mask;
+
+#if 0
+	e_mask = _mm_set_epi64x(0xFFFFFFFF00000000ull, 0x0000000000000000ull);
+#else
+	(void)e_mask;
+	e0 = _mm_set_epi64x(0, 0);
+#endif
+	shuf_mask = _mm_set_epi64x(0x0001020304050607ull, 0x08090a0b0c0d0e0full);
+
+	// Load initial hash values
+	abcd = _mm_loadu_si128((__m128i*) digest);
+	e0 = _mm_insert_epi32(e0, *(digest+4), 3);
+	abcd = _mm_shuffle_epi32(abcd, 0x1B);
+#if 0
+	e0 = _mm_and_si128(e0, e_mask);
+#endif
+
+	while (num_blks > 0) {
+		// Save hash values for addition after rounds
+		abcd_save = abcd;
+		e_save = e0;
+
+		// Rounds 0-3
+		msg0 = _mm_loadu_si128((const __m128i*) data);
+		msg0 = _mm_shuffle_epi8(msg0, shuf_mask);
+		e0 = _mm_add_epi32(e0, msg0);
+		e1 = abcd;
+		abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+
+		// Rounds 4-7
+		msg1 = _mm_loadu_si128((const __m128i*) (data+16));
+		msg1 = _mm_shuffle_epi8(msg1, shuf_mask);
+		e1 = _mm_sha1nexte_epu32(e1, msg1);
+		e0 = abcd;
+		abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
+		msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+
+		// Rounds 8-11
+		msg2 = _mm_loadu_si128((const __m128i*) (data+32));
+		msg2 = _mm_shuffle_epi8(msg2, shuf_mask);
+		e0 = _mm_sha1nexte_epu32(e0, msg2);
+		e1 = abcd;
+		abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+		msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+		msg0 = _mm_xor_si128(msg0, msg2);
+
+		// Rounds 12-15
+		msg3 = _mm_loadu_si128((const __m128i*) (data+48));
+		msg3 = _mm_shuffle_epi8(msg3, shuf_mask);
+		e1 = _mm_sha1nexte_epu32(e1, msg3);
+		e0 =
abcd; + msg0 = _mm_sha1msg2_epu32(msg0, msg3); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 0); + msg2 = _mm_sha1msg1_epu32(msg2, msg3); + msg1 = _mm_xor_si128(msg1, msg3); + + // Rounds 16-19 + e0 = _mm_sha1nexte_epu32(e0, msg0); + e1 = abcd; + msg1 = _mm_sha1msg2_epu32(msg1, msg0); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 0); + msg3 = _mm_sha1msg1_epu32(msg3, msg0); + msg2 = _mm_xor_si128(msg2, msg0); + + // Rounds 20-23 + e1 = _mm_sha1nexte_epu32(e1, msg1); + e0 = abcd; + msg2 = _mm_sha1msg2_epu32(msg2, msg1); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 1); + msg0 = _mm_sha1msg1_epu32(msg0, msg1); + msg3 = _mm_xor_si128(msg3, msg1); + + // Rounds 24-27 + e0 = _mm_sha1nexte_epu32(e0, msg2); + e1 = abcd; + msg3 = _mm_sha1msg2_epu32(msg3, msg2); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 1); + msg1 = _mm_sha1msg1_epu32(msg1, msg2); + msg0 = _mm_xor_si128(msg0, msg2); + + // Rounds 28-31 + e1 = _mm_sha1nexte_epu32(e1, msg3); + e0 = abcd; + msg0 = _mm_sha1msg2_epu32(msg0, msg3); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 1); + msg2 = _mm_sha1msg1_epu32(msg2, msg3); + msg1 = _mm_xor_si128(msg1, msg3); + + // Rounds 32-35 + e0 = _mm_sha1nexte_epu32(e0, msg0); + e1 = abcd; + msg1 = _mm_sha1msg2_epu32(msg1, msg0); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 1); + msg3 = _mm_sha1msg1_epu32(msg3, msg0); + msg2 = _mm_xor_si128(msg2, msg0); + + // Rounds 36-39 + e1 = _mm_sha1nexte_epu32(e1, msg1); + e0 = abcd; + msg2 = _mm_sha1msg2_epu32(msg2, msg1); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 1); + msg0 = _mm_sha1msg1_epu32(msg0, msg1); + msg3 = _mm_xor_si128(msg3, msg1); + + // Rounds 40-43 + e0 = _mm_sha1nexte_epu32(e0, msg2); + e1 = abcd; + msg3 = _mm_sha1msg2_epu32(msg3, msg2); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 2); + msg1 = _mm_sha1msg1_epu32(msg1, msg2); + msg0 = _mm_xor_si128(msg0, msg2); + + // Rounds 44-47 + e1 = _mm_sha1nexte_epu32(e1, msg3); + e0 = abcd; + msg0 = _mm_sha1msg2_epu32(msg0, msg3); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 2); + msg2 = _mm_sha1msg1_epu32(msg2, msg3); + msg1 = _mm_xor_si128(msg1, msg3); + + // Rounds 48-51 + e0 = _mm_sha1nexte_epu32(e0, msg0); + e1 = abcd; + msg1 = _mm_sha1msg2_epu32(msg1, msg0); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 2); + msg3 = _mm_sha1msg1_epu32(msg3, msg0); + msg2 = _mm_xor_si128(msg2, msg0); + + // Rounds 52-55 + e1 = _mm_sha1nexte_epu32(e1, msg1); + e0 = abcd; + msg2 = _mm_sha1msg2_epu32(msg2, msg1); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 2); + msg0 = _mm_sha1msg1_epu32(msg0, msg1); + msg3 = _mm_xor_si128(msg3, msg1); + + // Rounds 56-59 + e0 = _mm_sha1nexte_epu32(e0, msg2); + e1 = abcd; + msg3 = _mm_sha1msg2_epu32(msg3, msg2); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 2); + msg1 = _mm_sha1msg1_epu32(msg1, msg2); + msg0 = _mm_xor_si128(msg0, msg2); + + // Rounds 60-63 + e1 = _mm_sha1nexte_epu32(e1, msg3); + e0 = abcd; + msg0 = _mm_sha1msg2_epu32(msg0, msg3); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 3); + msg2 = _mm_sha1msg1_epu32(msg2, msg3); + msg1 = _mm_xor_si128(msg1, msg3); + + // Rounds 64-67 + e0 = _mm_sha1nexte_epu32(e0, msg0); + e1 = abcd; + msg1 = _mm_sha1msg2_epu32(msg1, msg0); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 3); + msg3 = _mm_sha1msg1_epu32(msg3, msg0); + msg2 = _mm_xor_si128(msg2, msg0); + + // Rounds 68-71 + e1 = _mm_sha1nexte_epu32(e1, msg1); + e0 = abcd; + msg2 = _mm_sha1msg2_epu32(msg2, msg1); + abcd = _mm_sha1rnds4_epu32(abcd, e1, 3); + msg3 = _mm_xor_si128(msg3, msg1); + + // Rounds 72-75 + e0 = _mm_sha1nexte_epu32(e0, msg2); + e1 = abcd; + msg3 = _mm_sha1msg2_epu32(msg3, msg2); + abcd = _mm_sha1rnds4_epu32(abcd, e0, 3); + + // Rounds 76-79 + e1 
= _mm_sha1nexte_epu32(e1, msg3); + e0 = abcd; + abcd = _mm_sha1rnds4_epu32(abcd, e1, 3); + + // Add current hash values with previously saved + e0 = _mm_sha1nexte_epu32(e0, e_save); + abcd = _mm_add_epi32(abcd, abcd_save); + + data += 64; + num_blks--; + } + + abcd = _mm_shuffle_epi32(abcd, 0x1B); + _mm_store_si128((__m128i*) digest, abcd); + *(digest+4) = _mm_extract_epi32(e0, 3); +} + Index: sys/crypto/sha_sse/intel_sha256.c =================================================================== --- /dev/null +++ sys/crypto/sha_sse/intel_sha256.c @@ -0,0 +1,274 @@ +/******************************************************************************* +* Copyright (c) 2013, Intel Corporation +* +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: +* +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* +* * Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the +* distribution. +* +* * Neither the name of the Intel Corporation nor the names of its +* contributors may be used to endorse or promote products derived from +* this software without specific prior written permission. +* +* +* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +******************************************************************************** +* +* Intel SHA Extensions optimized implementation of a SHA-256 update function +* +* The function takes a pointer to the current hash values, a pointer to the +* input data, and a number of 64 byte blocks to process. Once all blocks have +* been processed, the digest pointer is updated with the resulting hash value. +* The function only processes complete blocks, there is no functionality to +* store partial blocks. All message padding and hash value initialization must +* be done outside the update function. +* +* The indented lines in the loop are instructions related to rounds processing. +* The non-indented lines are instructions related to the message schedule. 
+*
+* Author: Sean Gulley
+* Date: July 2013
+*
+********************************************************************************
+*
+* Example compiler command line:
+* icc intel_sha_extensions_sha256_intrinsic.c
+* gcc -msha -msse4 intel_sha_extensions_sha256_intrinsic.c
+*
+*******************************************************************************/
+#include <sys/cdefs.h>
+#include <sys/types.h>
+
+#include <immintrin.h>
+
+void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks) {
+	__m128i state0, state1;
+	__m128i msg;
+	__m128i msgtmp0, msgtmp1, msgtmp2, msgtmp3;
+	__m128i tmp;
+	__m128i shuf_mask;
+	__m128i abef_save, cdgh_save;
+
+	// Load initial hash values
+	// Need to reorder these appropriately
+	// DCBA, HGFE -> ABEF, CDGH
+	tmp = _mm_loadu_si128((__m128i*) digest);
+	state1 = _mm_loadu_si128((__m128i*) (digest+4));
+
+	tmp = _mm_shuffle_epi32(tmp, 0xB1);          // CDAB
+	state1 = _mm_shuffle_epi32(state1, 0x1B);    // EFGH
+	state0 = _mm_alignr_epi8(tmp, state1, 8);    // ABEF
+	state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH
+
+	shuf_mask = _mm_set_epi64x(0x0c0d0e0f08090a0bull, 0x0405060700010203ull);
+
+	while (num_blks > 0) {
+		// Save hash values for addition after rounds
+		abef_save = state0;
+		cdgh_save = state1;
+
+		// Rounds 0-3
+		msg = _mm_loadu_si128((const __m128i*) data);
+		msgtmp0 = _mm_shuffle_epi8(msg, shuf_mask);
+		msg = _mm_add_epi32(msgtmp0,
+		    _mm_set_epi64x(0xE9B5DBA5B5C0FBCFull, 0x71374491428A2F98ull));
+		state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+		msg = _mm_shuffle_epi32(msg, 0x0E);
+		state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+		// Rounds 4-7
+		msgtmp1 = _mm_loadu_si128((const __m128i*) (data+16));
+		msgtmp1 = _mm_shuffle_epi8(msgtmp1, shuf_mask);
+		msg = _mm_add_epi32(msgtmp1,
+		    _mm_set_epi64x(0xAB1C5ED5923F82A4ull, 0x59F111F13956C25Bull));
+		state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+		msg = _mm_shuffle_epi32(msg, 0x0E);
+		state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+		msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+		// Rounds 8-11
+		msgtmp2 = _mm_loadu_si128((const __m128i*) (data+32));
+		msgtmp2 = _mm_shuffle_epi8(msgtmp2, shuf_mask);
+		msg = _mm_add_epi32(msgtmp2,
+		    _mm_set_epi64x(0x550C7DC3243185BEull, 0x12835B01D807AA98ull));
+		state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+		msg = _mm_shuffle_epi32(msg, 0x0E);
+		state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+		msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+		// Rounds 12-15
+		msgtmp3 = _mm_loadu_si128((const __m128i*) (data+48));
+		msgtmp3 = _mm_shuffle_epi8(msgtmp3, shuf_mask);
+		msg = _mm_add_epi32(msgtmp3,
+		    _mm_set_epi64x(0xC19BF1749BDC06A7ull, 0x80DEB1FE72BE5D74ull));
+		state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+		tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+		msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+		msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+		msg = _mm_shuffle_epi32(msg, 0x0E);
+		state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+		msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+		// Rounds 16-19
+		msg = _mm_add_epi32(msgtmp0,
+		    _mm_set_epi64x(0x240CA1CC0FC19DC6ull, 0xEFBE4786E49B69C1ull));
+		state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+		tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+		msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+		msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+		msg = _mm_shuffle_epi32(msg, 0x0E);
+		state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+		msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+		// Rounds 20-23
+		msg = _mm_add_epi32(msgtmp1,
+		    _mm_set_epi64x(0x76F988DA5CB0A9DCull,
0x4A7484AA2DE92C6Full)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4); + msgtmp2 = _mm_add_epi32(msgtmp2, tmp); + msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1); + + // Rounds 24-27 + msg = _mm_add_epi32(msgtmp2, + _mm_set_epi64x(0xBF597FC7B00327C8ull, 0xA831C66D983E5152ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4); + msgtmp3 = _mm_add_epi32(msgtmp3, tmp); + msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2); + + // Rounds 28-31 + msg = _mm_add_epi32(msgtmp3, + _mm_set_epi64x(0x1429296706CA6351ull, 0xD5A79147C6E00BF3ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4); + msgtmp0 = _mm_add_epi32(msgtmp0, tmp); + msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3); + + // Rounds 32-35 + msg = _mm_add_epi32(msgtmp0, + _mm_set_epi64x(0x53380D134D2C6DFCull, 0x2E1B213827B70A85ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4); + msgtmp1 = _mm_add_epi32(msgtmp1, tmp); + msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0); + + // Rounds 36-39 + msg = _mm_add_epi32(msgtmp1, + _mm_set_epi64x(0x92722C8581C2C92Eull, 0x766A0ABB650A7354ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4); + msgtmp2 = _mm_add_epi32(msgtmp2, tmp); + msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1); + + // Rounds 40-43 + msg = _mm_add_epi32(msgtmp2, + _mm_set_epi64x(0xC76C51A3C24B8B70ull, 0xA81A664BA2BFE8A1ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4); + msgtmp3 = _mm_add_epi32(msgtmp3, tmp); + msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2); + + // Rounds 44-47 + msg = _mm_add_epi32(msgtmp3, + _mm_set_epi64x(0x106AA070F40E3585ull, 0xD6990624D192E819ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp3, msgtmp2, 4); + msgtmp0 = _mm_add_epi32(msgtmp0, tmp); + msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3); + + // Rounds 48-51 + msg = _mm_add_epi32(msgtmp0, + _mm_set_epi64x(0x34B0BCB52748774Cull, 0x1E376C0819A4C116ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp0, msgtmp3, 4); + msgtmp1 = _mm_add_epi32(msgtmp1, tmp); + msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0); + + 
// Rounds 52-55 + msg = _mm_add_epi32(msgtmp1, + _mm_set_epi64x(0x682E6FF35B9CCA4Full, 0x4ED8AA4A391C0CB3ull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp1, msgtmp0, 4); + msgtmp2 = _mm_add_epi32(msgtmp2, tmp); + msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Rounds 56-59 + msg = _mm_add_epi32(msgtmp2, + _mm_set_epi64x(0x8CC7020884C87814ull, 0x78A5636F748F82EEull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + tmp = _mm_alignr_epi8(msgtmp2, msgtmp1, 4); + msgtmp3 = _mm_add_epi32(msgtmp3, tmp); + msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Rounds 60-63 + msg = _mm_add_epi32(msgtmp3, + _mm_set_epi64x(0xC67178F2BEF9A3F7ull, 0xA4506CEB90BEFFFAull)); + state1 = _mm_sha256rnds2_epu32(state1, state0, msg); + msg = _mm_shuffle_epi32(msg, 0x0E); + state0 = _mm_sha256rnds2_epu32(state0, state1, msg); + + // Add current hash values with previously saved + state0 = _mm_add_epi32(state0, abef_save); + state1 = _mm_add_epi32(state1, cdgh_save); + + data += 64; + num_blks--; + } + + // Write hash values back in the correct order + tmp = _mm_shuffle_epi32(state0, 0x1B); // FEBA + state1 = _mm_shuffle_epi32(state1, 0xB1); // DCHG + state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA + state1 = _mm_alignr_epi8(state1, tmp, 8); // ABEF + + _mm_store_si128((__m128i*) digest, state0); + _mm_store_si128((__m128i*) (digest+4), state1); +} + Index: sys/crypto/sha_sse/sha_sse.h =================================================================== --- /dev/null +++ sys/crypto/sha_sse/sha_sse.h @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2017 Conrad Meyer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _CRYPTO__SHA_SSE_H_ +#define _CRYPTO__SHA_SSE_H_ + +/* + * Internal functions, implemented in intrinsics. 
+ */
+void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks);
+void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks);
+
+#endif /* _CRYPTO__SHA_SSE_H_ */
Index: sys/crypto/sha_sse/sha_sse.c
===================================================================
--- /dev/null
+++ sys/crypto/sha_sse/sha_sse.c
@@ -0,0 +1,613 @@
+/*-
+ * Copyright (c) 2005-2008 Pawel Jakub Dawidek
+ * Copyright (c) 2010 Konstantin Belousov
+ * Copyright (c) 2014 The FreeBSD Foundation
+ * Copyright (c) 2017 Conrad Meyer
+ * All rights reserved.
+ *
+ * Portions of this software were developed by John-Mark Gurney
+ * under sponsorship of the FreeBSD Foundation and
+ * Rubicon Communications, LLC (Netgate).
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/kobj.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/smp.h>
+#include <sys/systm.h>
+
+#include <crypto/sha1.h>
+#include <crypto/sha2/sha256.h>
+#include <crypto/sha_sse/sha_sse.h>
+
+#include <opencrypto/cryptodev.h>
+#include <opencrypto/xform_auth.h>
+#include <cryptodev_if.h>
+
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+#if defined(__i386__)
+#include <machine/npx.h>
+#elif defined(__amd64__)
+#include <machine/fpu.h>
+#endif
+static struct mtx_padalign *ctx_mtx;
+static struct fpu_kern_ctx **ctx_fpu;
+
+struct sha_sse_softc {
+	int dieing;
+	int32_t cid;
+	uint32_t sid;
+	struct rwlock lock;
+	TAILQ_HEAD(sha_sse_sessions_head, sha_sse_session) sessions;
+};
+
+struct sha_sse_session {
+	/* Same as the SHA256 Blocksize.
*/ + uint8_t key[SHA1_HMAC_BLOCK_LEN] __aligned(16); + int algo; + int used; + int mlen; + uint32_t id; + TAILQ_ENTRY(sha_sse_session) next; +}; + +#define ACQUIRE_CTX(i, ctx) \ + do { \ + (i) = PCPU_GET(cpuid); \ + mtx_lock(&ctx_mtx[(i)]); \ + (ctx) = ctx_fpu[(i)]; \ + } while (0) +#define RELEASE_CTX(i, ctx) \ + do { \ + mtx_unlock(&ctx_mtx[(i)]); \ + (i) = -1; \ + (ctx) = NULL; \ + } while (0) + +static int sha_sse_newsession(device_t, uint32_t *sidp, struct cryptoini *cri); +static int sha_sse_freesession(device_t, uint64_t tid); +static void sha_sse_freesession_locked(struct sha_sse_softc *sc, + struct sha_sse_session *ses); +static int sha_sse_cipher_setup(struct sha_sse_session *ses, + struct cryptoini *encini); +static int sha_sse_cipher_process(struct sha_sse_session *ses, + struct cryptop *crp); + +MALLOC_DEFINE(M_SHA_SSE, "sha_sse_data", "SHA_SSE Data"); + +static void +sha_sse_identify(driver_t *drv, device_t parent) +{ + + /* NB: order 10 is so we get attached after h/w devices */ + if (device_find_child(parent, "sha_sse", -1) == NULL && + BUS_ADD_CHILD(parent, 10, "sha_sse", -1) == 0) + panic("sha_sse: could not attach"); +} + +static int +sha_sse_probe(device_t dev) +{ + + if ((cpu_stdext_feature & CPUID_STDEXT_SHA) == 0) { + device_printf(dev, "No SHA support.\n"); + return (EINVAL); + } + if ((cpu_feature2 & CPUID2_SSSE3) == 0) { + device_printf(dev, "No SSSE3 support.\n"); + return (EINVAL); + } + + device_set_desc_copy(dev, "SHA1,SHA2"); + return (0); +} + +static void +sha_sse_cleanctx(void) +{ + int i; + + /* XXX - no way to return driverid */ + CPU_FOREACH(i) { + if (ctx_fpu[i] != NULL) { + mtx_destroy(&ctx_mtx[i]); + fpu_kern_free_ctx(ctx_fpu[i]); + } + ctx_fpu[i] = NULL; + } + free(ctx_mtx, M_SHA_SSE); + ctx_mtx = NULL; + free(ctx_fpu, M_SHA_SSE); + ctx_fpu = NULL; +} + +static int +sha_sse_attach(device_t dev) +{ + struct sha_sse_softc *sc; + int i; + + sc = device_get_softc(dev); + sc->dieing = 0; + TAILQ_INIT(&sc->sessions); + sc->sid = 1; + + sc->cid = crypto_get_driverid(dev, CRYPTOCAP_F_HARDWARE | + CRYPTOCAP_F_SYNC); + if (sc->cid < 0) { + device_printf(dev, "Could not get crypto driver id.\n"); + return (ENOMEM); + } + + ctx_mtx = malloc(sizeof(*ctx_mtx) * (mp_maxid + 1), M_SHA_SSE, + M_WAITOK | M_ZERO); + ctx_fpu = malloc(sizeof(*ctx_fpu) * (mp_maxid + 1), M_SHA_SSE, + M_WAITOK | M_ZERO); + + CPU_FOREACH(i) { + ctx_fpu[i] = fpu_kern_alloc_ctx(0); + mtx_init(&ctx_mtx[i], "shafpumtx", NULL, MTX_DEF | MTX_NEW); + } + + rw_init(&sc->lock, "sha_sse_lock"); + crypto_register(sc->cid, CRYPTO_SHA1, 0, 0); + crypto_register(sc->cid, CRYPTO_SHA1_HMAC, 0, 0); + crypto_register(sc->cid, CRYPTO_SHA2_256_HMAC, 0, 0); + return (0); +} + +static int +sha_sse_detach(device_t dev) +{ + struct sha_sse_softc *sc; + struct sha_sse_session *ses; + + sc = device_get_softc(dev); + + rw_wlock(&sc->lock); + TAILQ_FOREACH(ses, &sc->sessions, next) { + if (ses->used) { + rw_wunlock(&sc->lock); + device_printf(dev, + "Cannot detach, sessions still active.\n"); + return (EBUSY); + } + } + sc->dieing = 1; + while ((ses = TAILQ_FIRST(&sc->sessions)) != NULL) { + TAILQ_REMOVE(&sc->sessions, ses, next); + free(ses, M_SHA_SSE); + } + rw_wunlock(&sc->lock); + crypto_unregister_all(sc->cid); + + rw_destroy(&sc->lock); + + sha_sse_cleanctx(); + + return (0); +} + +static int +sha_sse_newsession(device_t dev, uint32_t *sidp, struct cryptoini *cri) +{ + struct sha_sse_softc *sc; + struct sha_sse_session *ses; + struct cryptoini *encini; + int error; + + if (sidp == NULL || cri == NULL) { + 
CRYPTDEB("no sidp or cri"); + return (EINVAL); + } + + sc = device_get_softc(dev); + if (sc->dieing) + return (EINVAL); + + ses = NULL; + encini = NULL; + for (; cri != NULL; cri = cri->cri_next) { + switch (cri->cri_alg) { + case CRYPTO_SHA1: + case CRYPTO_SHA1_HMAC: + case CRYPTO_SHA2_256_HMAC: + if (encini != NULL) { + CRYPTDEB("encini already set"); + return (EINVAL); + } + encini = cri; + break; + default: + CRYPTDEB("unhandled algorithm"); + return (EINVAL); + } + } + if (encini == NULL) { + CRYPTDEB("no cipher"); + return (EINVAL); + } + + rw_wlock(&sc->lock); + if (sc->dieing) { + rw_wunlock(&sc->lock); + return (EINVAL); + } + /* + * Free sessions goes first, so if first session is used, we need to + * allocate one. + */ + ses = TAILQ_FIRST(&sc->sessions); + if (ses == NULL || ses->used) { + ses = malloc(sizeof(*ses), M_SHA_SSE, M_NOWAIT | M_ZERO); + if (ses == NULL) { + rw_wunlock(&sc->lock); + return (ENOMEM); + } + ses->id = sc->sid++; + } else { + TAILQ_REMOVE(&sc->sessions, ses, next); + } + ses->used = 1; + TAILQ_INSERT_TAIL(&sc->sessions, ses, next); + rw_wunlock(&sc->lock); + ses->algo = encini->cri_alg; + + error = sha_sse_cipher_setup(ses, encini); + if (error != 0) { + CRYPTDEB("setup failed"); + rw_wlock(&sc->lock); + sha_sse_freesession_locked(sc, ses); + rw_wunlock(&sc->lock); + return (error); + } + + *sidp = ses->id; + return (0); +} + +static void +sha_sse_freesession_locked(struct sha_sse_softc *sc, struct sha_sse_session *ses) +{ + uint32_t sid; + + rw_assert(&sc->lock, RA_WLOCKED); + + sid = ses->id; + TAILQ_REMOVE(&sc->sessions, ses, next); + explicit_bzero(ses, sizeof(*ses)); + ses->id = sid; + TAILQ_INSERT_HEAD(&sc->sessions, ses, next); +} + +static int +sha_sse_freesession(device_t dev, uint64_t tid) +{ + struct sha_sse_softc *sc; + struct sha_sse_session *ses; + uint32_t sid; + + sc = device_get_softc(dev); + sid = ((uint32_t)tid) & 0xffffffff; + rw_wlock(&sc->lock); + TAILQ_FOREACH_REVERSE(ses, &sc->sessions, sha_sse_sessions_head, next) { + if (ses->id == sid) + break; + } + if (ses == NULL) { + rw_wunlock(&sc->lock); + return (EINVAL); + } + sha_sse_freesession_locked(sc, ses); + rw_wunlock(&sc->lock); + return (0); +} + +static int +sha_sse_process(device_t dev, struct cryptop *crp, int hint __unused) +{ + struct sha_sse_softc *sc = device_get_softc(dev); + struct sha_sse_session *ses = NULL; + int error; + + error = 0; + + /* Sanity check. 
*/
+	if (crp == NULL)
+		return (EINVAL);
+
+	if (crp->crp_callback == NULL || crp->crp_desc == NULL) {
+		error = EINVAL;
+		goto out;
+	}
+
+	rw_rlock(&sc->lock);
+	TAILQ_FOREACH_REVERSE(ses, &sc->sessions, sha_sse_sessions_head, next) {
+		if (ses->id == (crp->crp_sid & 0xffffffff))
+			break;
+	}
+	rw_runlock(&sc->lock);
+	if (ses == NULL) {
+		error = EINVAL;
+		goto out;
+	}
+
+	error = sha_sse_cipher_process(ses, crp);
+	if (error != 0)
+		goto out;
+
+out:
+	crp->crp_etype = error;
+	crypto_done(crp);
+	return (error);
+}
+
+static device_method_t sha_sse_methods[] = {
+	DEVMETHOD(device_identify, sha_sse_identify),
+	DEVMETHOD(device_probe, sha_sse_probe),
+	DEVMETHOD(device_attach, sha_sse_attach),
+	DEVMETHOD(device_detach, sha_sse_detach),
+
+	DEVMETHOD(cryptodev_newsession, sha_sse_newsession),
+	DEVMETHOD(cryptodev_freesession, sha_sse_freesession),
+	DEVMETHOD(cryptodev_process, sha_sse_process),
+
+	{0, 0},
+};
+
+static driver_t sha_sse_driver = {
+	"sha_sse",
+	sha_sse_methods,
+	sizeof(struct sha_sse_softc),
+};
+static devclass_t sha_sse_devclass;
+
+DRIVER_MODULE(sha_sse, nexus, sha_sse_driver, sha_sse_devclass, 0, 0);
+MODULE_VERSION(sha_sse, 1);
+MODULE_DEPEND(sha_sse, crypto, 1, 1, 1);
+
+static int
+sha_sse_cipher_setup(struct sha_sse_session *ses, struct cryptoini *encini)
+{
+	int error, keylen;
+
+	error = 0;
+	keylen = encini->cri_klen / 8;
+	if (keylen > sizeof(ses->key)) {
+		printf("%s: keylen:%d ses->key:%zu\n", __func__, keylen, sizeof(ses->key));
+		error = EINVAL;
+		goto out;
+	}
+	if (ses->algo == CRYPTO_SHA1 && keylen > 0) {
+		printf("%s: algo:%d keylen:%d\n", __func__, ses->algo, keylen);
+		error = EINVAL;
+		goto out;
+	}
+
+	memcpy(ses->key, encini->cri_key, keylen);
+	ses->mlen = encini->cri_mlen;
+
+out:
+	return (error);
+}
+
+static int
+intel_sha1_update(void *vctx, const void *vdata, u_int datalen)
+{
+	struct sha1_ctxt *ctx = vctx;
+	const char *data = vdata;
+	size_t gaplen;
+	size_t gapstart;
+	size_t off;
+	size_t copysiz;
+	u_int blocks;
+
+	off = 0;
+	/* Do any aligned blocks without redundant copying. */
+	if (datalen >= 64 && ctx->count % 64 == 0) {
+		blocks = datalen / 64;
+		ctx->c.b64[0] += blocks * 64 * 8;
+		intel_sha1_step(ctx->h.b32, data + off, blocks);
+		off += blocks * 64;
+	}
+
+	while (off < datalen) {
+		gapstart = ctx->count % 64;
+		gaplen = 64 - gapstart;
+
+		copysiz = (gaplen < datalen - off) ?
gaplen : datalen - off; + bcopy(&data[off], &ctx->m.b8[gapstart], copysiz); + ctx->count += copysiz; + ctx->count %= 64; + ctx->c.b64[0] += copysiz * 8; + if (ctx->count % 64 == 0) + intel_sha1_step(ctx->h.b32, (void *)ctx->m.b8, 1); + off += copysiz; + } + return (0); +} + +static void +SHA1_Finalize_fn(void *digest, void *ctx) +{ + sha1_result(ctx, digest); +} + +static int +intel_sha256_update(void *vctx, const void *vdata, u_int len) +{ + SHA256_CTX *ctx = vctx; + uint64_t bitlen; + uint32_t r; + u_int blocks; + const unsigned char *src = vdata; + + /* Number of bytes left in the buffer from previous updates */ + r = (ctx->count >> 3) & 0x3f; + + /* Convert the length into a number of bits */ + bitlen = len << 3; + + /* Update number of bits */ + ctx->count += bitlen; + + /* Handle the case where we don't need to perform any transforms */ + if (len < 64 - r) { + memcpy(&ctx->buf[r], src, len); + return (0); + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, 64 - r); + intel_sha256_step(ctx->state, ctx->buf, 1); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks */ + if (len >= 64) { + blocks = len / 64; + intel_sha256_step(ctx->state, src, blocks); + src += blocks * 64; + len -= blocks * 64; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); + return (0); +} + +static void +SHA256_Finalize_fn(void *digest, void *ctx) +{ + SHA256_Final(digest, ctx); +} + +/* + * Compute the HASH( (key ^ xorbyte) || buf ) + */ +static void +hmac_internal(void *ctx, uint32_t *res, + int (*update)(void *, const void *, u_int), + void (*finalize)(void *, void *), uint8_t *key, uint8_t xorbyte, + const void *buf, size_t off, size_t buflen, int crpflags) +{ + size_t i; + + for (i = 0; i < 64; i++) + key[i] ^= xorbyte; + update(ctx, key, 64); + for (i = 0; i < 64; i++) + key[i] ^= xorbyte; + + crypto_apply(crpflags, __DECONST(void *, buf), off, buflen, + __DECONST(int (*)(void *, void *, u_int), update), ctx); + finalize(res, ctx); +} + +static int +sha_sse_cipher_process(struct sha_sse_session *ses, struct cryptop *crp) +{ + struct SHA256Context s2ctx; + struct sha1_ctxt sctx __aligned(16); + uint32_t res[8]; + struct fpu_kern_ctx *ctx; + struct cryptodesc *crd; + int error, kt, ctxidx, hashlen; + + error = 0; + + kt = is_fpu_kern_thread(0); + if (!kt) { + ACQUIRE_CTX(ctxidx, ctx); + error = fpu_kern_enter(curthread, ctx, + FPU_KERN_NORMAL|FPU_KERN_KTHR); + if (error != 0) + goto out2; + } + + crd = crp->crp_desc; + if (crd->crd_next != NULL || crd->crd_flags != 0) { + error = EINVAL; + goto out; + } + + switch (ses->algo) { + case CRYPTO_SHA1_HMAC: + hashlen = SHA1_HASH_LEN; + /* Inner hash: (K ^ IPAD) || data */ + sha1_init(&sctx); + hmac_internal(&sctx, res, intel_sha1_update, SHA1_Finalize_fn, + ses->key, 0x36, crp->crp_buf, crd->crd_skip, crd->crd_len, + crp->crp_flags); + /* Outer hash: (K ^ OPAD) || inner hash */ + sha1_init(&sctx); + hmac_internal(&sctx, res, intel_sha1_update, SHA1_Finalize_fn, + ses->key, 0x5C, res, 0, hashlen, 0); + break; + case CRYPTO_SHA1: + hashlen = SHA1_HASH_LEN; + sha1_init(&sctx); + crypto_apply(crp->crp_flags, crp->crp_buf, crd->crd_skip, + crd->crd_len, __DECONST(int (*)(void *, void *, u_int), + intel_sha1_update), &sctx); + sha1_result(&sctx, (void *)res); + break; + case CRYPTO_SHA2_256_HMAC: + hashlen = SHA2_256_HASH_LEN; + /* Inner hash: (K ^ IPAD) || data */ + SHA256_Init(&s2ctx); + hmac_internal(&s2ctx, res, intel_sha256_update, + SHA256_Finalize_fn, ses->key, 0x36, crp->crp_buf, + crd->crd_skip, crd->crd_len, 
crp->crp_flags);
+		/* Outer hash: (K ^ OPAD) || inner hash */
+		SHA256_Init(&s2ctx);
+		hmac_internal(&s2ctx, res, intel_sha256_update,
+		    SHA256_Finalize_fn, ses->key, 0x5C, res, 0, hashlen, 0);
+		break;
+	}
+
+	if (ses->mlen != 0 && ses->mlen < hashlen)
+		hashlen = ses->mlen;
+	crypto_copyback(crp->crp_flags, crp->crp_buf, crd->crd_inject, hashlen,
+	    (void *)res);
+
+out:
+	if (!kt) {
+		fpu_kern_leave(curthread, ctx);
+out2:
+		RELEASE_CTX(ctxidx, ctx);
+	}
+	return (error);
+}
Index: sys/i386/conf/NOTES
===================================================================
--- sys/i386/conf/NOTES
+++ sys/i386/conf/NOTES
@@ -839,6 +839,7 @@
 device		padlock_rng	# VIA Padlock RNG
 device		rdrand_rng	# Intel Bull Mountain RNG
 device		aesni		# AES-NI OpenCrypto module
+device		sha_sse		# SHA SSE extension OpenCrypto module
 
 #
 # Laptop/Notebook options:
Index: sys/modules/Makefile
===================================================================
--- sys/modules/Makefile
+++ sys/modules/Makefile
@@ -351,6 +351,7 @@
 	sge \
 	${_sgx} \
 	${_sgx_linux} \
+	${_sha_sse} \
 	siba_bwn \
 	siftr \
 	siis \
@@ -625,6 +626,7 @@
 _acpi=		acpi
 .if ${MK_CRYPT} != "no" || defined(ALL_MODULES)
 _aesni=		aesni
+_sha_sse=	sha_sse
 .endif
 _amd_ecc_inject=amd_ecc_inject
 _amdsbwd=	amdsbwd
Index: sys/modules/sha_sse/Makefile
===================================================================
--- /dev/null
+++ sys/modules/sha_sse/Makefile
@@ -0,0 +1,25 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/crypto/sha_sse
+.PATH: ${SRCTOP}/contrib/llvm/tools/clang/lib/Headers
+
+KMOD=	sha_sse
+SRCS=	sha_sse.c
+SRCS+=	device_if.h bus_if.h opt_bus.h cryptodev_if.h
+
+OBJS+=	intel_sha1.o intel_sha256.o
+
+# Remove -nostdinc so we can get the intrinsics.
+intel_sha1.o: intel_sha1.c
+	${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+	    -mmmx -msse -msse4 -msha ${.IMPSRC}
+	${CTFCONVERT_CMD}
+intel_sha256.o: intel_sha256.c
+	${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+	    -mmmx -msse -msse4 -msha ${.IMPSRC}
+	${CTFCONVERT_CMD}
+
+intel_sha1.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
+intel_sha256.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
+
+.include <bsd.kmod.mk>
Index: tests/sys/opencrypto/cryptotest.py
===================================================================
--- tests/sys/opencrypto/cryptotest.py
+++ tests/sys/opencrypto/cryptotest.py
@@ -46,7 +46,7 @@
 aesmodules = [ 'cryptosoft0', 'aesni0', 'ccr0' ]
 desmodules = [ 'cryptosoft0', ]
 
-shamodules = [ 'cryptosoft0', 'ccr0' ]
+shamodules = [ 'cryptosoft0', 'ccr0', 'sha_sse0' ]
 
 def GenTestCase(cname):
     try:
@@ -308,6 +308,7 @@
 cryptosoft = GenTestCase('cryptosoft0')
 aesni = GenTestCase('aesni0')
 ccr = GenTestCase('ccr0')
+sha_sse = GenTestCase('sha_sse0')
 
 if __name__ == '__main__':
     unittest.main()
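A closing note on the intel_sha1_step()/intel_sha256_step() contract spelled out in the Intel header comments: the functions consume only whole, pre-padded 64-byte blocks, so all padding and initial-value setup is the caller's responsibility (intel_sha1_update() and intel_sha256_update() above do exactly that buffering). The sketch below is a hypothetical standalone harness, not part of the patch, that pads the classic "abc" test vector into a single block and calls the SHA-1 step; built outside the kernel with the same -mmmx -msse -msse4 -msha flags the module Makefile uses, it should print the well-known digest a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d.

/*
 * Hypothetical harness for the one-block case: the caller pads the message
 * and seeds the digest; intel_sha1_step() only runs the compression loop.
 * Example build (file names assumed): cc -mmmx -msse -msse4 -msha \
 *     sha1_step_demo.c intel_sha1.c
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks);

int
main(void)
{
	/* Standard SHA-1 initial state, one word per digest element. */
	uint32_t digest[5] = {
		0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0
	};
	char block[64];

	/* "abc" padded to one 64-byte block: 0x80, zeros, 64-bit bit length. */
	memset(block, 0, sizeof(block));
	memcpy(block, "abc", 3);
	block[3] = (char)0x80;
	block[63] = 24;		/* 3 bytes * 8 = 24 bits, big-endian length */

	intel_sha1_step(digest, block, 1);

	/* Expect a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d. */
	for (int i = 0; i < 5; i++)
		printf("%08x ", digest[i]);
	printf("\n");
	return (0);
}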