Index: head/share/man/man4/aesni.4
===================================================================
--- head/share/man/man4/aesni.4
+++ head/share/man/man4/aesni.4
@@ -24,12 +24,12 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd December 14, 2015
+.Dd September 26, 2017
 .Dt AESNI 4
 .Os
 .Sh NAME
 .Nm aesni
-.Nd "driver for the AES accelerator on Intel CPUs"
+.Nd "driver for the AES and SHA accelerator on x86 CPUs"
 .Sh SYNOPSIS
 To compile this driver into the kernel,
 place the following lines in your
@@ -47,8 +47,8 @@
 aesni_load="YES"
 .Ed
 .Sh DESCRIPTION
-Starting with some models of Core i5/i7, Intel processors implement
-a new set of instructions called AESNI.
+Starting with Intel Westmere and AMD Bulldozer, some x86 processors implement a
+new set of instructions called AESNI.
 The set of six instructions accelerates the calculation of the key
 schedule for key lengths of 128, 192, and 256 of the Advanced
 Encryption Standard (AES) symmetric cipher, and provides a hardware
@@ -56,13 +56,24 @@
 rounds.
 .Pp
 The processor capability is reported as AESNI in the Features2 line at boot.
+.Pp
+Starting with the Intel Goldmont and AMD Ryzen microarchitectures, some x86
+processors implement a new set of SHA instructions.
+The set of seven instructions accelerates the calculation of SHA1 and SHA256
+hashes.
+.Pp
+The processor capability is reported as SHA in the Structured Extended Features
+line at boot.
+.Pp
 The
 .Nm
-driver does not attach on systems that lack the required CPU capability.
+driver does not attach on systems that lack both CPU capabilities.
+On systems that support only one of the AESNI or SHA extensions, the driver
+will attach and support that one function.
 .Pp
 The
 .Nm
-driver registers itself to accelerate AES operations for
+driver registers itself to accelerate AES and SHA operations for
 .Xr crypto 4 .
 Besides speed, the advantage of using the
 .Nm
@@ -83,13 +94,18 @@
 .Nm
 driver first appeared in
 .Fx 9.0 .
+SHA support was added in
+.Fx 12.0 .
 .Sh AUTHORS
 .An -nosplit
 The
 .Nm
 driver was written by
-.An Konstantin Belousov Aq Mt kib@FreeBSD.org .
+.An Konstantin Belousov Aq Mt kib@FreeBSD.org
+and
+.An Conrad Meyer Aq Mt cem@FreeBSD.org .
 The key schedule calculation code was adopted from the sample provided
 by Intel and used in the analogous
 .Ox
 driver.
+The hash step intrinsics implementations were supplied by Intel.
Index: head/sys/conf/files.amd64
===================================================================
--- head/sys/conf/files.amd64
+++ head/sys/conf/files.amd64
@@ -182,6 +182,16 @@
 crypto/blowfish/bf_enc.c	optional	crypto | ipsec | ipsec_support
 crypto/des/des_enc.c		optional	crypto | ipsec | \
 	ipsec_support | netsmb
+intel_sha1.o			optional	aesni			\
+	dependency	"$S/crypto/aesni/intel_sha1.c"			\
+	compile-with	"${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+	no-implicit-rule						\
+	clean		"intel_sha1.o"
+intel_sha256.o			optional	aesni			\
+	dependency	"$S/crypto/aesni/intel_sha256.c"		\
+	compile-with	"${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+	no-implicit-rule						\
+	clean		"intel_sha256.o"
 crypto/via/padlock.c		optional	padlock
 crypto/via/padlock_cipher.c	optional	padlock
 crypto/via/padlock_hash.c	optional	padlock
Index: head/sys/conf/files.i386
===================================================================
--- head/sys/conf/files.i386
+++ head/sys/conf/files.i386
@@ -132,6 +132,16 @@
 	no-implicit-rule						\
 	clean		"aesni_wrap.o"
 crypto/des/arch/i386/des_enc.S	optional crypto | ipsec | ipsec_support | netsmb
+intel_sha1.o			optional	aesni			\
+	dependency	"$S/crypto/aesni/intel_sha1.c"			\
+	compile-with	"${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+	no-implicit-rule						\
+	clean		"intel_sha1.o"
+intel_sha256.o			optional	aesni			\
+	dependency	"$S/crypto/aesni/intel_sha256.c"		\
+	compile-with	"${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} -mmmx -msse -msse4 -msha ${.IMPSRC}" \
+	no-implicit-rule						\
+	clean		"intel_sha256.o"
 crypto/via/padlock.c		optional padlock
 crypto/via/padlock_cipher.c	optional padlock
 crypto/via/padlock_hash.c	optional padlock
Index: head/sys/crypto/aesni/aesni.h
===================================================================
--- head/sys/crypto/aesni/aesni.h
+++ head/sys/crypto/aesni/aesni.h
@@ -56,12 +56,16 @@
 	uint8_t enc_schedule[AES_SCHED_LEN] __aligned(16);
 	uint8_t dec_schedule[AES_SCHED_LEN] __aligned(16);
 	uint8_t xts_schedule[AES_SCHED_LEN] __aligned(16);
+	/* Same as the SHA256 Blocksize. */
+	uint8_t hmac_key[SHA1_HMAC_BLOCK_LEN] __aligned(16);
 	int algo;
 	int rounds;
 	/* uint8_t *ses_ictx; */
 	/* uint8_t *ses_octx; */
 	/* int ses_mlen; */
 	int used;
+	int auth_algo;
+	int mlen;
 	uint32_t id;
 	TAILQ_ENTRY(aesni_session) next;
 };
@@ -111,7 +115,5 @@
 
 int aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
     int keylen);
-uint8_t *aesni_cipher_alloc(struct cryptodesc *enccrd, struct cryptop *crp,
-    int *allocated);
 
 #endif /* _AESNI_H_ */
Index: head/sys/crypto/aesni/aesni.c
===================================================================
--- head/sys/crypto/aesni/aesni.c
+++ head/sys/crypto/aesni/aesni.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2005-2008 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * Copyright (c) 2010 Konstantin Belousov <kib@FreeBSD.org>
  * Copyright (c) 2014 The FreeBSD Foundation
+ * Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
  * All rights reserved.
  *
  * Portions of this software were developed by John-Mark Gurney
@@ -46,10 +47,24 @@
 #include <sys/uio.h>
 #include <sys/mbuf.h>
 #include <sys/smp.h>
+
 #include <crypto/aesni/aesni.h>
-#include <cryptodev_if.h>
+#include <crypto/aesni/sha_sse.h>
+#include <crypto/sha1.h>
+#include <crypto/sha2/sha256.h>
+
+#include <opencrypto/cryptodev.h>
 #include <opencrypto/gmac.h>
+#include <cryptodev_if.h>
 
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+#if defined(__i386__)
+#include <machine/npx.h>
+#elif defined(__amd64__)
+#include <machine/fpu.h>
+#endif
+
 static struct mtx_padalign *ctx_mtx;
 static struct fpu_kern_ctx **ctx_fpu;
 
@@ -57,6 +72,8 @@
 	int	dieing;
 	int32_t cid;
 	uint32_t sid;
+	bool	has_aes;
+	bool	has_sha;
 	TAILQ_HEAD(aesni_sessions_head, aesni_session) sessions;
 	struct rwlock lock;
 };
@@ -79,9 +96,13 @@
 static void aesni_freesession_locked(struct aesni_softc *sc,
     struct aesni_session *ses);
 static int aesni_cipher_setup(struct aesni_session *ses,
-    struct cryptoini *encini);
+    struct cryptoini *encini, struct cryptoini *authini);
 static int aesni_cipher_process(struct aesni_session *ses,
     struct cryptodesc *enccrd, struct cryptodesc *authcrd, struct cryptop *crp);
+static int aesni_cipher_crypt(struct aesni_session *ses,
+    struct cryptodesc *enccrd, struct cryptodesc *authcrd, struct cryptop *crp);
+static int aesni_cipher_mac(struct aesni_session *ses, struct cryptodesc *crd,
+    struct cryptop *crp);
 
 MALLOC_DEFINE(M_AESNI, "aesni_data", "AESNI Data");
 
@@ -95,21 +116,33 @@
 		panic("aesni: could not attach");
 }
 
+static void
+detect_cpu_features(bool *has_aes, bool *has_sha)
+{
+	/* The SHA step code uses SSE4.1 intrinsics (_mm_insert_epi32). */
+	*has_aes = ((cpu_feature2 & CPUID2_AESNI) != 0 &&
+	    (cpu_feature2 & CPUID2_SSE41) != 0);
+	*has_sha = ((cpu_stdext_feature & CPUID_STDEXT_SHA) != 0 &&
+	    (cpu_feature2 & CPUID2_SSE41) != 0);
+}
+
 static int
 aesni_probe(device_t dev)
 {
+	bool has_aes, has_sha;
 
-	if ((cpu_feature2 & CPUID2_AESNI) == 0) {
-		device_printf(dev, "No AESNI support.\n");
+	detect_cpu_features(&has_aes, &has_sha);
+	if (!has_aes && !has_sha) {
+		device_printf(dev, "No AES or SHA support.\n");
 		return (EINVAL);
-	}
+	} else if (has_aes && has_sha)
+		device_set_desc(dev,
+		    "AES-CBC,AES-XTS,AES-GCM,AES-ICM,SHA1,SHA256");
+	else if (has_aes)
+		device_set_desc(dev, "AES-CBC,AES-XTS,AES-GCM,AES-ICM");
+	else
+		device_set_desc(dev, "SHA1,SHA256");
 
-	if ((cpu_feature2 & CPUID2_SSE41) == 0) {
-		device_printf(dev, "No SSE4.1 support.\n");
-		return (EINVAL);
-	}
-
-	device_set_desc_copy(dev, "AES-CBC,AES-XTS,AES-GCM,AES-ICM");
 	return (0);
 }
 
@@ -161,13 +194,22 @@
 	}
 
 	rw_init(&sc->lock, "aesni_lock");
-	crypto_register(sc->cid, CRYPTO_AES_CBC, 0, 0);
-	crypto_register(sc->cid, CRYPTO_AES_ICM, 0, 0);
-	crypto_register(sc->cid, CRYPTO_AES_NIST_GCM_16, 0, 0);
-	crypto_register(sc->cid, CRYPTO_AES_128_NIST_GMAC, 0, 0);
-	crypto_register(sc->cid, CRYPTO_AES_192_NIST_GMAC, 0, 0);
-	crypto_register(sc->cid, CRYPTO_AES_256_NIST_GMAC, 0, 0);
-	crypto_register(sc->cid, CRYPTO_AES_XTS, 0, 0);
+
+	detect_cpu_features(&sc->has_aes, &sc->has_sha);
+	if (sc->has_aes) {
+		crypto_register(sc->cid, CRYPTO_AES_CBC, 0, 0);
+		crypto_register(sc->cid, CRYPTO_AES_ICM, 0, 0);
+		crypto_register(sc->cid, CRYPTO_AES_NIST_GCM_16, 0, 0);
+		crypto_register(sc->cid, CRYPTO_AES_128_NIST_GMAC, 0, 0);
+		crypto_register(sc->cid, CRYPTO_AES_192_NIST_GMAC, 0, 0);
+		crypto_register(sc->cid, CRYPTO_AES_256_NIST_GMAC, 0, 0);
+		crypto_register(sc->cid, CRYPTO_AES_XTS, 0, 0);
+	}
+	if (sc->has_sha) {
+		crypto_register(sc->cid, CRYPTO_SHA1, 0, 0);
+		crypto_register(sc->cid, CRYPTO_SHA1_HMAC, 0, 0);
+		crypto_register(sc->cid, CRYPTO_SHA2_256_HMAC, 0, 0);
+	}
 	return (0);
 }
 
@@ -208,7 +250,8 @@
 {
 	struct aesni_softc *sc;
 	struct aesni_session *ses;
-	struct cryptoini *encini;
+	struct cryptoini *encini, *authini;
+	bool gcm_hash, gcm;
 	int error;
 
 	if (sidp == NULL || cri == NULL) {
@@ -221,13 +264,20 @@
 		return (EINVAL);
 
 	ses = NULL;
+	authini = NULL;
 	encini = NULL;
+	gcm = false;
+	gcm_hash = false;
 	for (; cri != NULL; cri = cri->cri_next) {
 		switch (cri->cri_alg) {
+		case CRYPTO_AES_NIST_GCM_16:
+			gcm = true;
+			/* FALLTHROUGH */
 		case CRYPTO_AES_CBC:
 		case CRYPTO_AES_ICM:
 		case CRYPTO_AES_XTS:
-		case CRYPTO_AES_NIST_GCM_16:
+			if (!sc->has_aes)
+				goto unhandled;
 			if (encini != NULL) {
 				CRYPTDEB("encini already set");
 				return (EINVAL);
@@ -241,16 +291,35 @@
 			 * nothing to do here, maybe in the future cache some
 			 * values for GHASH
 			 */
+			gcm_hash = true;
 			break;
+		case CRYPTO_SHA1:
+		case CRYPTO_SHA1_HMAC:
+		case CRYPTO_SHA2_256_HMAC:
+			if (!sc->has_sha)
+				goto unhandled;
+			if (authini != NULL) {
+				CRYPTDEB("authini already set");
+				return (EINVAL);
+			}
+			authini = cri;
+			break;
 		default:
+unhandled:
 			CRYPTDEB("unhandled algorithm");
 			return (EINVAL);
 		}
 	}
-	if (encini == NULL) {
+	if (encini == NULL && authini == NULL) {
 		CRYPTDEB("no cipher");
 		return (EINVAL);
 	}
+	/*
+	 * GMAC algorithms are only supported with simultaneous GCM.  Likewise
+	 * GCM is not supported without GMAC.
+	 */
+	if (gcm_hash != gcm)
+		return (EINVAL);
 
 	rw_wlock(&sc->lock);
 	if (sc->dieing) {
@@ -275,9 +344,13 @@
 	ses->used = 1;
 	TAILQ_INSERT_TAIL(&sc->sessions, ses, next);
 	rw_wunlock(&sc->lock);
-	ses->algo = encini->cri_alg;
 
-	error = aesni_cipher_setup(ses, encini);
+	if (encini != NULL)
+		ses->algo = encini->cri_alg;
+	if (authini != NULL)
+		ses->auth_algo = authini->cri_alg;
+
+	error = aesni_cipher_setup(ses, encini, authini);
 	if (error != 0) {
 		CRYPTDEB("setup failed");
 		rw_wlock(&sc->lock);
@@ -299,7 +372,7 @@
 
 	sid = ses->id;
 	TAILQ_REMOVE(&sc->sessions, ses, next);
-	*ses = (struct aesni_session){};
+	explicit_bzero(ses, sizeof(*ses));
 	ses->id = sid;
 	TAILQ_INSERT_HEAD(&sc->sessions, ses, next);
 }
@@ -351,6 +424,9 @@
 
 	for (crd = crp->crp_desc; crd != NULL; crd = crd->crd_next) {
 		switch (crd->crd_alg) {
+		case CRYPTO_AES_NIST_GCM_16:
+			needauth = 1;
+			/* FALLTHROUGH */
 		case CRYPTO_AES_CBC:
 		case CRYPTO_AES_ICM:
 		case CRYPTO_AES_XTS:
@@ -361,24 +437,17 @@
 			enccrd = crd;
 			break;
 
-		case CRYPTO_AES_NIST_GCM_16:
-			if (enccrd != NULL) {
-				error = EINVAL;
-				goto out;
-			}
-			enccrd = crd;
-			needauth = 1;
-			break;
-
 		case CRYPTO_AES_128_NIST_GMAC:
 		case CRYPTO_AES_192_NIST_GMAC:
 		case CRYPTO_AES_256_NIST_GMAC:
+		case CRYPTO_SHA1:
+		case CRYPTO_SHA1_HMAC:
+		case CRYPTO_SHA2_256_HMAC:
 			if (authcrd != NULL) {
 				error = EINVAL;
 				goto out;
 			}
 			authcrd = crd;
-			needauth = 1;
 			break;
 
 		default:
@@ -387,14 +456,16 @@
 		}
 	}
 
-	if (enccrd == NULL || (needauth && authcrd == NULL)) {
+	if ((enccrd == NULL && authcrd == NULL) ||
+	    (needauth && authcrd == NULL)) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* CBC & XTS can only handle full blocks for now */
-	if ((enccrd->crd_alg == CRYPTO_AES_CBC || enccrd->crd_alg ==
-	    CRYPTO_AES_XTS) && (enccrd->crd_len % AES_BLOCK_LEN) != 0) {
+	if (enccrd != NULL && (enccrd->crd_alg == CRYPTO_AES_CBC ||
+	    enccrd->crd_alg == CRYPTO_AES_XTS) &&
+	    (enccrd->crd_len % AES_BLOCK_LEN) != 0) {
 		error = EINVAL;
 		goto out;
 	}
@@ -420,9 +491,9 @@
 	return (error);
 }
 
-uint8_t *
+static uint8_t *
 aesni_cipher_alloc(struct cryptodesc *enccrd, struct cryptop *crp,
-    int *allocated)
+    bool *allocated)
 {
 	struct mbuf *m;
 	struct uio *uio;
@@ -442,18 +513,18 @@
 		addr = (uint8_t *)iov->iov_base;
 	} else
 		addr = (uint8_t *)crp->crp_buf;
-	*allocated = 0;
+	*allocated = false;
 	addr += enccrd->crd_skip;
 	return (addr);
 
 alloc:
 	addr = malloc(enccrd->crd_len, M_AESNI, M_NOWAIT);
 	if (addr != NULL) {
-		*allocated = 1;
+		*allocated = true;
 		crypto_copydata(crp->crp_flags, crp->crp_buf, enccrd->crd_skip,
 		    enccrd->crd_len, addr);
 	} else
-		*allocated = 0;
+		*allocated = false;
 	return (addr);
 }
 
@@ -482,13 +553,28 @@
 MODULE_DEPEND(aesni, crypto, 1, 1, 1);
 
 static int
-aesni_cipher_setup(struct aesni_session *ses, struct cryptoini *encini)
+aesni_cipher_setup(struct aesni_session *ses, struct cryptoini *encini,
+    struct cryptoini *authini)
 {
 	struct fpu_kern_ctx *ctx;
-	int error;
-	int kt, ctxidx;
+	int kt, ctxidx, keylen, error;
 
-	kt = is_fpu_kern_thread(0);
+	switch (ses->auth_algo) {
+	case CRYPTO_SHA1:
+	case CRYPTO_SHA1_HMAC:
+	case CRYPTO_SHA2_256_HMAC:
+		if (authini->cri_klen % 8 != 0)
+			return (EINVAL);
+		keylen = authini->cri_klen / 8;
+		if (keylen > sizeof(ses->hmac_key))
+			return (EINVAL);
+		if (ses->auth_algo == CRYPTO_SHA1 && keylen > 0)
+			return (EINVAL);
+		memcpy(ses->hmac_key, authini->cri_key, keylen);
+		ses->mlen = authini->cri_mlen;
+	}
+
+	kt = is_fpu_kern_thread(0) || (encini == NULL);
 	if (!kt) {
 		ACQUIRE_CTX(ctxidx, ctx);
 		error = fpu_kern_enter(curthread, ctx,
@@ -497,8 +583,10 @@
 			goto out;
 	}
 
-	error = aesni_cipher_setup_common(ses, encini->cri_key,
-	    encini->cri_klen);
+	error = 0;
+	if (encini != NULL)
+		error = aesni_cipher_setup_common(ses, encini->cri_key,
+		    encini->cri_klen);
 
 	if (!kt) {
 		fpu_kern_leave(curthread, ctx);
@@ -508,52 +596,198 @@
 	return (error);
 }
 
+/* crypto_apply() callback: hash datalen bytes at vdata into SHA-1 ctx vctx. */
+static int
+intel_sha1_update(void *vctx, const void *vdata, u_int datalen)
+{
+	struct sha1_ctxt *ctx = vctx;
+	const char *data = vdata;
+	size_t gaplen, gapstart;
+	size_t off, copysiz;
+	u_int blocks;
+
+	off = 0;
+	/* Do any aligned blocks without redundant copying. */
+	if (datalen >= 64 && ctx->count % 64 == 0) {
+		blocks = datalen / 64;
+		/* Widen first so the bit count cannot wrap in 32 bits. */
+		ctx->c.b64[0] += (uint64_t)blocks * 64 * 8;
+		intel_sha1_step(ctx->h.b32, data + off, blocks);
+		off += blocks * 64;
+	}
+
+	while (off < datalen) {
+		gapstart = ctx->count % 64;
+		gaplen = 64 - gapstart;
+
+		copysiz = (gaplen < datalen - off) ? gaplen : datalen - off;
+		bcopy(&data[off], &ctx->m.b8[gapstart], copysiz);
+		ctx->count += copysiz;
+		ctx->count %= 64;
+		ctx->c.b64[0] += copysiz * 8;
+		if (ctx->count % 64 == 0)
+			intel_sha1_step(ctx->h.b32, (void *)ctx->m.b8, 1);
+		off += copysiz;
+	}
+	return (0);
+}
+
+static void
+SHA1_Finalize_fn(void *digest, void *ctx)
+{
+	sha1_result(ctx, digest);
+}
+
+/*
+ * crypto_apply() callback: absorb len bytes at vdata into the SHA-256 context
+ * vctx, hashing whole 64-byte blocks and buffering any remainder.
+ */
+static int
+intel_sha256_update(void *vctx, const void *vdata, u_int len)
+{
+	SHA256_CTX *ctx = vctx;
+	uint32_t r;
+	u_int blocks;
+	const unsigned char *src = vdata;
+
+	/* Number of bytes left in the buffer from previous updates */
+	r = (ctx->count >> 3) & 0x3f;
+
+	/* Update number of bits; widen len first so the shift cannot wrap */
+	ctx->count += (uint64_t)len << 3;
+
+	/* Handle the case where we don't need to perform any transforms */
+	if (len < 64 - r) {
+		memcpy(&ctx->buf[r], src, len);
+		return (0);
+	}
+
+	/* Finish the current block */
+	memcpy(&ctx->buf[r], src, 64 - r);
+	intel_sha256_step(ctx->state, ctx->buf, 1);
+	src += 64 - r;
+	len -= 64 - r;
+
+	/* Perform complete blocks */
+	if (len >= 64) {
+		blocks = len / 64;
+		intel_sha256_step(ctx->state, src, blocks);
+		src += blocks * 64;
+		len -= blocks * 64;
+	}
+
+	/* Copy left over data into buffer */
+	memcpy(ctx->buf, src, len);
+	return (0);
+}
+
+static void
+SHA256_Finalize_fn(void *digest, void *ctx)
+{
+	SHA256_Final(digest, ctx);
+}
+
 /*
- * authcrd contains the associated date.
+ * Compute the HASH( (key ^ xorbyte) || buf )
  */
+static void
+hmac_internal(void *ctx, uint32_t *res,
+	int (*update)(void *, const void *, u_int),
+	void (*finalize)(void *, void *), uint8_t *key, uint8_t xorbyte,
+	const void *buf, size_t off, size_t buflen, int crpflags)
+{
+	uint8_t pad[64];	/* local copy: never modify the caller's key */
+	size_t i;
+
+	for (i = 0; i < 64; i++)
+		pad[i] = key[i] ^ xorbyte;
+	update(ctx, pad, 64);
+	explicit_bzero(pad, sizeof(pad));
+
+	crypto_apply(crpflags, __DECONST(void *, buf), off, buflen,
+	    __DECONST(int (*)(void *, void *, u_int), update), ctx);
+	finalize(res, ctx);
+}
+
 static int
 aesni_cipher_process(struct aesni_session *ses, struct cryptodesc *enccrd,
     struct cryptodesc *authcrd, struct cryptop *crp)
 {
 	struct fpu_kern_ctx *ctx;
-	uint8_t iv[AES_BLOCK_LEN];
-	uint8_t tag[GMAC_DIGEST_LEN];
-	uint8_t *buf, *authbuf;
-	int error, allocated, authallocated;
-	int ivlen, encflag;
-	int kt, ctxidx;
+	int error, ctxidx;
+	bool kt;
 
-	encflag = (enccrd->crd_flags & CRD_F_ENCRYPT) == CRD_F_ENCRYPT;
+	if (enccrd != NULL) {
+		if ((enccrd->crd_alg == CRYPTO_AES_ICM ||
+		    enccrd->crd_alg == CRYPTO_AES_NIST_GCM_16) &&
+		    (enccrd->crd_flags & CRD_F_IV_EXPLICIT) == 0)
+			return (EINVAL);
+	}
 
-	if ((enccrd->crd_alg == CRYPTO_AES_ICM ||
-	    enccrd->crd_alg == CRYPTO_AES_NIST_GCM_16) &&
-	    (enccrd->crd_flags & CRD_F_IV_EXPLICIT) == 0)
-		return (EINVAL);
+	error = 0;
+	kt = is_fpu_kern_thread(0);
+	if (!kt) {
+		ACQUIRE_CTX(ctxidx, ctx);
+		error = fpu_kern_enter(curthread, ctx,
+		    FPU_KERN_NORMAL | FPU_KERN_KTHR);
+		if (error != 0)
+			goto out2;
+	}
 
+	/* Do work */
+	if (enccrd != NULL && authcrd != NULL) {
+		/* Perform the first operation */
+		if (crp->crp_desc == enccrd)
+			error = aesni_cipher_crypt(ses, enccrd, authcrd, crp);
+		else
+			error = aesni_cipher_mac(ses, authcrd, crp);
+		if (error != 0)
+			goto out;
+		/* Perform the second operation */
+		if (crp->crp_desc == enccrd)
+			error = aesni_cipher_mac(ses, authcrd, crp);
+		else
+			error = aesni_cipher_crypt(ses, enccrd, authcrd, crp);
+	} else if (enccrd != NULL)
+		error = aesni_cipher_crypt(ses, enccrd, authcrd, crp);
+	else
+		error = aesni_cipher_mac(ses, authcrd, crp);
+
+	if (error != 0)
+		goto out;
+
+out:
+	if (!kt) {
+		fpu_kern_leave(curthread, ctx);
+out2:
+		RELEASE_CTX(ctxidx, ctx);
+	}
+	return (error);
+}
+
+static int
+aesni_cipher_crypt(struct aesni_session *ses, struct cryptodesc *enccrd,
+	struct cryptodesc *authcrd, struct cryptop *crp)
+{
+	uint8_t iv[AES_BLOCK_LEN], tag[GMAC_DIGEST_LEN], *buf, *authbuf;
+	int error, ivlen;
+	bool encflag, allocated, authallocated;
+
 	buf = aesni_cipher_alloc(enccrd, crp, &allocated);
 	if (buf == NULL)
 		return (ENOMEM);
 
-	error = 0;
-	authbuf = NULL;
-	authallocated = 0;
-	if (authcrd != NULL) {
+	authallocated = false;
+	if (ses->algo == CRYPTO_AES_NIST_GCM_16 && authcrd != NULL) {
 		authbuf = aesni_cipher_alloc(authcrd, crp, &authallocated);
 		if (authbuf == NULL) {
 			error = ENOMEM;
-			goto out1;
+			goto out;
 		}
 	}
 
-	kt = is_fpu_kern_thread(0);
-	if (!kt) {
-		ACQUIRE_CTX(ctxidx, ctx);
-		error = fpu_kern_enter(curthread, ctx,
-		    FPU_KERN_NORMAL|FPU_KERN_KTHR);
-		if (error != 0)
-			goto out2;
-	}
-
+	error = 0;
+	encflag = (enccrd->crd_flags & CRD_F_ENCRYPT) == CRD_F_ENCRYPT;
 	if ((enccrd->crd_flags & CRD_F_KEY_EXPLICIT) != 0) {
 		error = aesni_cipher_setup_common(ses, enccrd->crd_key,
 		    enccrd->crd_klen);
@@ -561,7 +795,6 @@
 			goto out;
 	}
 
-	/* XXX - validate that enccrd and authcrd have/use same key? */
 	switch (enccrd->crd_alg) {
 	case CRYPTO_AES_CBC:
 	case CRYPTO_AES_ICM:
@@ -593,13 +826,6 @@
 			    enccrd->crd_inject, ivlen, iv);
 	}
 
-	if (authcrd != NULL && !encflag)
-		crypto_copydata(crp->crp_flags, crp->crp_buf,
-		    authcrd->crd_inject, GMAC_DIGEST_LEN, tag);
-	else
-		bzero(tag, sizeof tag);
-
-	/* Do work */
 	switch (ses->algo) {
 	case CRYPTO_AES_CBC:
 		if (encflag)
@@ -625,11 +851,21 @@
 			    iv);
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
-		if (encflag)
+		if (authcrd != NULL && !encflag)
+			crypto_copydata(crp->crp_flags, crp->crp_buf,
+			    authcrd->crd_inject, GMAC_DIGEST_LEN, tag);
+		else
+			bzero(tag, sizeof tag);
+
+		if (encflag) {
 			AES_GCM_encrypt(buf, buf, authbuf, iv, tag,
 			    enccrd->crd_len, authcrd->crd_len, ivlen,
 			    ses->enc_schedule, ses->rounds);
-		else {
+
+			if (authcrd != NULL)
+				crypto_copyback(crp->crp_flags, crp->crp_buf,
+				    authcrd->crd_inject, GMAC_DIGEST_LEN, tag);
+		} else {
 			if (!AES_GCM_decrypt(buf, buf, authbuf, iv, tag,
 			    enccrd->crd_len, authcrd->crd_len, ivlen,
 			    ses->enc_schedule, ses->rounds))
@@ -638,28 +874,83 @@
 		break;
 	}
 
+	/* If the data was processed in a temporary copy, write it back. */
 	if (allocated)
 		crypto_copyback(crp->crp_flags, crp->crp_buf, enccrd->crd_skip,
 		    enccrd->crd_len, buf);
 
-	if (!error && authcrd != NULL) {
-		crypto_copyback(crp->crp_flags, crp->crp_buf,
-		    authcrd->crd_inject, GMAC_DIGEST_LEN, tag);
-	}
-
 out:
-	if (!kt) {
-		fpu_kern_leave(curthread, ctx);
-out2:
-		RELEASE_CTX(ctxidx, ctx);
-	}
-
-out1:
 	if (allocated) {
-		bzero(buf, enccrd->crd_len);
+		explicit_bzero(buf, enccrd->crd_len);
 		free(buf, M_AESNI);
 	}
-	if (authallocated)
+	if (authallocated) {
+		explicit_bzero(authbuf, authcrd->crd_len);
 		free(authbuf, M_AESNI);
+	}
 	return (error);
+}
+
+static int
+aesni_cipher_mac(struct aesni_session *ses, struct cryptodesc *crd,
+    struct cryptop *crp)
+{
+	union {
+		struct SHA256Context sha2 __aligned(16);
+		struct sha1_ctxt sha1 __aligned(16);
+	} sctx;
+	uint32_t res[SHA2_256_HASH_LEN / sizeof(uint32_t)];
+	int hashlen;
+
+	if (crd->crd_flags != 0)
+		return (EINVAL);
+
+	switch (ses->auth_algo) {
+	case CRYPTO_SHA1_HMAC:
+		hashlen = SHA1_HASH_LEN;
+		/* Inner hash: (K ^ IPAD) || data */
+		sha1_init(&sctx.sha1);
+		hmac_internal(&sctx.sha1, res, intel_sha1_update,
+		    SHA1_Finalize_fn, ses->hmac_key, 0x36, crp->crp_buf,
+		    crd->crd_skip, crd->crd_len, crp->crp_flags);
+		/* Outer hash: (K ^ OPAD) || inner hash */
+		sha1_init(&sctx.sha1);
+		hmac_internal(&sctx.sha1, res, intel_sha1_update,
+		    SHA1_Finalize_fn, ses->hmac_key, 0x5C, res, 0, hashlen, 0);
+		break;
+	case CRYPTO_SHA1:
+		hashlen = SHA1_HASH_LEN;
+		sha1_init(&sctx.sha1);
+		crypto_apply(crp->crp_flags, crp->crp_buf, crd->crd_skip,
+		    crd->crd_len, __DECONST(int (*)(void *, void *, u_int),
+		    intel_sha1_update), &sctx.sha1);
+		sha1_result(&sctx.sha1, (void *)res);
+		break;
+	case CRYPTO_SHA2_256_HMAC:
+		hashlen = SHA2_256_HASH_LEN;
+		/* Inner hash: (K ^ IPAD) || data */
+		SHA256_Init(&sctx.sha2);
+		hmac_internal(&sctx.sha2, res, intel_sha256_update,
+		    SHA256_Finalize_fn, ses->hmac_key, 0x36, crp->crp_buf,
+		    crd->crd_skip, crd->crd_len, crp->crp_flags);
+		/* Outer hash: (K ^ OPAD) || inner hash */
+		SHA256_Init(&sctx.sha2);
+		hmac_internal(&sctx.sha2, res, intel_sha256_update,
+		    SHA256_Finalize_fn, ses->hmac_key, 0x5C, res, 0, hashlen,
+		    0);
+		break;
+	default:
+		/*
+		 * AES-GMAC authentication is verified while processing the
+		 * enccrd
+		 */
+		return (0);
+	}
+
+	if (ses->mlen != 0 && ses->mlen < hashlen)
+		hashlen = ses->mlen;
+
+	crypto_copyback(crp->crp_flags, crp->crp_buf, crd->crd_inject, hashlen,
+	    (void *)res);
+	return (0);
 }
Index: head/sys/crypto/aesni/intel_sha1.c
===================================================================
--- head/sys/crypto/aesni/intel_sha1.c
+++ head/sys/crypto/aesni/intel_sha1.c
@@ -0,0 +1,261 @@
+/*******************************************************************************
+* Copyright (c) 2013, Intel Corporation 
+* 
+* All rights reserved. 
+* 
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met: 
+* 
+* * Redistributions of source code must retain the above copyright
+*   notice, this list of conditions and the following disclaimer.  
+* 
+* * Redistributions in binary form must reproduce the above copyright
+*   notice, this list of conditions and the following disclaimer in the
+*   documentation and/or other materials provided with the
+*   distribution. 
+* 
+* * Neither the name of the Intel Corporation nor the names of its
+*   contributors may be used to endorse or promote products derived from
+*   this software without specific prior written permission. 
+* 
+* 
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************************
+*
+* Intel SHA Extensions optimized implementation of a SHA-1 update function 
+* 
+* The function takes a pointer to the current hash values, a pointer to the 
+* input data, and a number of 64 byte blocks to process.  Once all blocks have 
+* been processed, the digest pointer is  updated with the resulting hash value.
+* The function only processes complete blocks, there is no functionality to 
+* store partial blocks.  All message padding and hash value initialization must
+* be done outside the update function.  
+* 
+* The indented lines in the loop are instructions related to rounds processing.
+* The non-indented lines are instructions related to the message schedule.
+* 
+* Author: Sean Gulley <sean.m.gulley@intel.com>
+* Date:   July 2013
+*
+********************************************************************************
+*
+* Example compiler command line:
+* icc intel_sha_extensions_sha1_intrinsic.c
+* gcc -msha -msse4 intel_sha_extensions_sha1_intrinsic.c
+*
+*******************************************************************************/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <immintrin.h>
+
+#include <crypto/aesni/sha_sse.h>
+
+void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks) {
+   __m128i abcd, e0, e1;
+   __m128i abcd_save, e_save;
+   __m128i msg0, msg1, msg2, msg3;
+   __m128i shuf_mask, e_mask;
+
+#if 0
+   e_mask    = _mm_set_epi64x(0xFFFFFFFF00000000ull, 0x0000000000000000ull);
+#else
+   (void)e_mask;
+   e0        = _mm_set_epi64x(0, 0);
+#endif
+   shuf_mask = _mm_set_epi64x(0x0001020304050607ull, 0x08090a0b0c0d0e0full);
+
+   // Load initial hash values
+   abcd      = _mm_loadu_si128((__m128i*) digest);
+   e0        = _mm_insert_epi32(e0, *(digest+4), 3);
+   abcd      = _mm_shuffle_epi32(abcd, 0x1B);
+#if 0
+   e0        = _mm_and_si128(e0, e_mask);
+#endif
+
+   while (num_blks > 0) {
+      // Save hash values for addition after rounds
+      abcd_save = abcd;
+      e_save    = e0;
+
+      // Rounds 0-3
+      msg0 = _mm_loadu_si128((const __m128i*) data);
+      msg0 = _mm_shuffle_epi8(msg0, shuf_mask);
+         e0   = _mm_add_epi32(e0, msg0);
+         e1   = abcd;
+         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+
+      // Rounds 4-7
+      msg1 = _mm_loadu_si128((const __m128i*) (data+16));
+      msg1 = _mm_shuffle_epi8(msg1, shuf_mask);
+         e1   = _mm_sha1nexte_epu32(e1, msg1);
+         e0   = abcd;
+         abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
+      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+
+      // Rounds 8-11
+      msg2 = _mm_loadu_si128((const __m128i*) (data+32));
+      msg2 = _mm_shuffle_epi8(msg2, shuf_mask);
+         e0   = _mm_sha1nexte_epu32(e0, msg2);
+         e1   = abcd;
+         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+      msg0 = _mm_xor_si128(msg0, msg2);
+
+      // Rounds 12-15
+      msg3 = _mm_loadu_si128((const __m128i*) (data+48));
+      msg3 = _mm_shuffle_epi8(msg3, shuf_mask);
+         e1   = _mm_sha1nexte_epu32(e1, msg3);
+         e0   = abcd;
+      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+         abcd = _mm_sha1rnds4_epu32(abcd, e1, 0);
+      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+      msg1 = _mm_xor_si128(msg1, msg3);
+
+      // Rounds 16-19
+         e0   = _mm_sha1nexte_epu32(e0, msg0);
+         e1   = abcd;
+      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+         abcd = _mm_sha1rnds4_epu32(abcd, e0, 0);
+      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+      msg2 = _mm_xor_si128(msg2, msg0);
+
+      // Rounds 20-23
+         e1   = _mm_sha1nexte_epu32(e1, msg1);
+         e0   = abcd;
+      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
+      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+      msg3 = _mm_xor_si128(msg3, msg1);
+	
+      // Rounds 24-27
+         e0   = _mm_sha1nexte_epu32(e0, msg2);
+         e1   = abcd;
+      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+         abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
+      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+      msg0 = _mm_xor_si128(msg0, msg2);
+
+      // Rounds 28-31
+         e1   = _mm_sha1nexte_epu32(e1, msg3);
+         e0   = abcd;
+      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
+      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+      msg1 = _mm_xor_si128(msg1, msg3);
+
+      // Rounds 32-35
+         e0   = _mm_sha1nexte_epu32(e0, msg0);
+         e1   = abcd;
+      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+         abcd = _mm_sha1rnds4_epu32(abcd, e0, 1);
+      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+      msg2 = _mm_xor_si128(msg2, msg0);
+
+      // Rounds 36-39
+         e1   = _mm_sha1nexte_epu32(e1, msg1);
+         e0   = abcd;
+      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+         abcd = _mm_sha1rnds4_epu32(abcd, e1, 1);
+      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+      msg3 = _mm_xor_si128(msg3, msg1);
+	
+      // Rounds 40-43
+         e0   = _mm_sha1nexte_epu32(e0, msg2);
+         e1   = abcd;
+      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
+      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+      msg0 = _mm_xor_si128(msg0, msg2);
+
+      // Rounds 44-47
+         e1   = _mm_sha1nexte_epu32(e1, msg3);
+         e0   = abcd;
+      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+         abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
+      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+      msg1 = _mm_xor_si128(msg1, msg3);
+
+      // Rounds 48-51
+         e0   = _mm_sha1nexte_epu32(e0, msg0);
+         e1   = abcd;
+      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
+      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+      msg2 = _mm_xor_si128(msg2, msg0);
+
+      // Rounds 52-55
+         e1   = _mm_sha1nexte_epu32(e1, msg1);
+         e0   = abcd;
+      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+         abcd = _mm_sha1rnds4_epu32(abcd, e1, 2);
+      msg0 = _mm_sha1msg1_epu32(msg0, msg1);
+      msg3 = _mm_xor_si128(msg3, msg1);
+	
+      // Rounds 56-59
+         e0   = _mm_sha1nexte_epu32(e0, msg2);
+         e1   = abcd;
+      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+         abcd = _mm_sha1rnds4_epu32(abcd, e0, 2);
+      msg1 = _mm_sha1msg1_epu32(msg1, msg2);
+      msg0 = _mm_xor_si128(msg0, msg2);
+
+      // Rounds 60-63
+         e1   = _mm_sha1nexte_epu32(e1, msg3);
+         e0   = abcd;
+      msg0 = _mm_sha1msg2_epu32(msg0, msg3);
+         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+      msg2 = _mm_sha1msg1_epu32(msg2, msg3);
+      msg1 = _mm_xor_si128(msg1, msg3);
+
+      // Rounds 64-67
+         e0   = _mm_sha1nexte_epu32(e0, msg0);
+         e1   = abcd;
+      msg1 = _mm_sha1msg2_epu32(msg1, msg0);
+         abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
+      msg3 = _mm_sha1msg1_epu32(msg3, msg0);
+      msg2 = _mm_xor_si128(msg2, msg0);
+
+      // Rounds 68-71
+         e1   = _mm_sha1nexte_epu32(e1, msg1);
+         e0   = abcd;
+      msg2 = _mm_sha1msg2_epu32(msg2, msg1);
+         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+      msg3 = _mm_xor_si128(msg3, msg1);
+	
+      // Rounds 72-75
+         e0   = _mm_sha1nexte_epu32(e0, msg2);
+         e1   = abcd;
+      msg3 = _mm_sha1msg2_epu32(msg3, msg2);
+         abcd = _mm_sha1rnds4_epu32(abcd, e0, 3);
+
+      // Rounds 76-79
+         e1   = _mm_sha1nexte_epu32(e1, msg3);
+         e0   = abcd;
+         abcd = _mm_sha1rnds4_epu32(abcd, e1, 3);
+
+      // Add current hash values with previously saved
+      e0   = _mm_sha1nexte_epu32(e0, e_save);
+      abcd = _mm_add_epi32(abcd, abcd_save);
+
+      data += 64;
+      num_blks--;
+   }
+
+   abcd = _mm_shuffle_epi32(abcd, 0x1B);
+   _mm_store_si128((__m128i*) digest, abcd);
+   *(digest+4) = _mm_extract_epi32(e0, 3);
+}
+
Index: head/sys/crypto/aesni/intel_sha256.c
===================================================================
--- head/sys/crypto/aesni/intel_sha256.c
+++ head/sys/crypto/aesni/intel_sha256.c
@@ -0,0 +1,277 @@
+/*******************************************************************************
+* Copyright (c) 2013, Intel Corporation 
+* 
+* All rights reserved. 
+* 
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met: 
+* 
+* * Redistributions of source code must retain the above copyright
+*   notice, this list of conditions and the following disclaimer.  
+* 
+* * Redistributions in binary form must reproduce the above copyright
+*   notice, this list of conditions and the following disclaimer in the
+*   documentation and/or other materials provided with the
+*   distribution. 
+* 
+* * Neither the name of the Intel Corporation nor the names of its
+*   contributors may be used to endorse or promote products derived from
+*   this software without specific prior written permission. 
+* 
+* 
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+********************************************************************************
+*
+* Intel SHA Extensions optimized implementation of a SHA-256 update function 
+*
+* The function takes a pointer to the current hash values, a pointer to the
+* input data, and a number of 64-byte blocks to process.  Once all blocks have
+* been processed, the digest buffer is updated with the resulting hash value.
+* The function only processes complete blocks; there is no functionality to
+* store partial blocks.  All message padding and hash value initialization must
+* be done outside the update function.
+*
+* The indented lines in the loop are instructions related to rounds processing.
+* The non-indented lines are instructions related to the message schedule.
+*
+* Author: Sean Gulley <sean.m.gulley@intel.com>
+* Date:   July 2013
+*
+********************************************************************************
+*
+* Example compiler command line:
+* icc intel_sha_extensions_sha256_intrinsic.c
+* gcc -msha -msse4 intel_sha_extensions_sha256_intrinsic.c
+*
+*******************************************************************************/
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <immintrin.h>
+
+#include <crypto/aesni/sha_sse.h>
+
+void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks) {
+   __m128i state0, state1;
+   __m128i msg;
+   __m128i msgtmp0, msgtmp1, msgtmp2, msgtmp3;
+   __m128i tmp;
+   __m128i shuf_mask;
+   __m128i abef_save, cdgh_save;
+
+   // Load the eight running hash words (loadu tolerates an unaligned digest)
+   // and regroup them into the two-register layout the rounds operate on:
+   // DCBA, HGFE -> ABEF, CDGH
+   tmp    = _mm_loadu_si128((__m128i*) digest);
+   state1 = _mm_loadu_si128((__m128i*) (digest+4));
+
+   tmp    = _mm_shuffle_epi32(tmp, 0xB1);       // CDAB
+   state1 = _mm_shuffle_epi32(state1, 0x1B);    // EFGH
+   state0 = _mm_alignr_epi8(tmp, state1, 8);    // ABEF
+   state1 = _mm_blend_epi16(state1, tmp, 0xF0); // CDGH
+   // pshufb mask reversing the bytes of each 32-bit lane (big-endian words)
+   shuf_mask = _mm_set_epi64x(0x0c0d0e0f08090a0bull, 0x0405060700010203ull);
+
+   while (num_blks > 0) {
+      // Save hash values for addition after rounds
+      abef_save = state0;
+      cdgh_save = state1;
+
+      // Rounds 0-3
+      msg     = _mm_loadu_si128((const __m128i*) data);
+      msgtmp0 = _mm_shuffle_epi8(msg, shuf_mask);
+         msg    = _mm_add_epi32(msgtmp0,
+                  _mm_set_epi64x(0xE9B5DBA5B5C0FBCFull, 0x71374491428A2F98ull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+      // Rounds 4-7
+      msgtmp1 = _mm_loadu_si128((const __m128i*) (data+16));
+      msgtmp1 = _mm_shuffle_epi8(msgtmp1, shuf_mask);
+         msg    = _mm_add_epi32(msgtmp1,
+                  _mm_set_epi64x(0xAB1C5ED5923F82A4ull, 0x59F111F13956C25Bull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+      msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+      // Rounds 8-11
+      msgtmp2 = _mm_loadu_si128((const __m128i*) (data+32));
+      msgtmp2 = _mm_shuffle_epi8(msgtmp2, shuf_mask);
+         msg    = _mm_add_epi32(msgtmp2,
+                  _mm_set_epi64x(0x550C7DC3243185BEull, 0x12835B01D807AA98ull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+      msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+      // Rounds 12-15
+      msgtmp3 = _mm_loadu_si128((const __m128i*) (data+48));
+      msgtmp3 = _mm_shuffle_epi8(msgtmp3, shuf_mask);
+         msg    = _mm_add_epi32(msgtmp3,
+                  _mm_set_epi64x(0xC19BF1749BDC06A7ull, 0x80DEB1FE72BE5D74ull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+      tmp     = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+      msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+      msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+      msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+      // Rounds 16-19
+         msg    = _mm_add_epi32(msgtmp0,
+                  _mm_set_epi64x(0x240CA1CC0FC19DC6ull, 0xEFBE4786E49B69C1ull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+      tmp     = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+      msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+      msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+      msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+      // Rounds 20-23
+         msg    = _mm_add_epi32(msgtmp1,
+                  _mm_set_epi64x(0x76F988DA5CB0A9DCull, 0x4A7484AA2DE92C6Full));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+      tmp     = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
+      msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
+      msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+      msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+      // Rounds 24-27
+         msg    = _mm_add_epi32(msgtmp2,
+                  _mm_set_epi64x(0xBF597FC7B00327C8ull, 0xA831C66D983E5152ull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+      tmp     = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
+      msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
+      msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+      msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+      // Rounds 28-31
+         msg    = _mm_add_epi32(msgtmp3,
+                  _mm_set_epi64x(0x1429296706CA6351ull, 0xD5A79147C6E00BF3ull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+      tmp     = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+      msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+      msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+      msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+      // Rounds 32-35
+         msg    = _mm_add_epi32(msgtmp0,
+                  _mm_set_epi64x(0x53380D134D2C6DFCull, 0x2E1B213827B70A85ull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+      tmp     = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+      msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+      msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+      msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+      // Rounds 36-39
+         msg    = _mm_add_epi32(msgtmp1,
+                  _mm_set_epi64x(0x92722C8581C2C92Eull, 0x766A0ABB650A7354ull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+      tmp     = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
+      msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
+      msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+      msgtmp0 = _mm_sha256msg1_epu32(msgtmp0, msgtmp1);
+
+      // Rounds 40-43
+         msg    = _mm_add_epi32(msgtmp2,
+                  _mm_set_epi64x(0xC76C51A3C24B8B70ull, 0xA81A664BA2BFE8A1ull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+      tmp     = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
+      msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
+      msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+      msgtmp1 = _mm_sha256msg1_epu32(msgtmp1, msgtmp2);
+
+      // Rounds 44-47
+         msg    = _mm_add_epi32(msgtmp3,
+                  _mm_set_epi64x(0x106AA070F40E3585ull, 0xD6990624D192E819ull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+      tmp     = _mm_alignr_epi8(msgtmp3, msgtmp2, 4);
+      msgtmp0 = _mm_add_epi32(msgtmp0, tmp);
+      msgtmp0 = _mm_sha256msg2_epu32(msgtmp0, msgtmp3);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+      msgtmp2 = _mm_sha256msg1_epu32(msgtmp2, msgtmp3);
+
+      // Rounds 48-51
+         msg    = _mm_add_epi32(msgtmp0,
+                  _mm_set_epi64x(0x34B0BCB52748774Cull, 0x1E376C0819A4C116ull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+      tmp     = _mm_alignr_epi8(msgtmp0, msgtmp3, 4);
+      msgtmp1 = _mm_add_epi32(msgtmp1, tmp);
+      msgtmp1 = _mm_sha256msg2_epu32(msgtmp1, msgtmp0);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+      msgtmp3 = _mm_sha256msg1_epu32(msgtmp3, msgtmp0);
+
+      // Rounds 52-55
+         msg    = _mm_add_epi32(msgtmp1,
+                  _mm_set_epi64x(0x682E6FF35B9CCA4Full, 0x4ED8AA4A391C0CB3ull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+      tmp     = _mm_alignr_epi8(msgtmp1, msgtmp0, 4);
+      msgtmp2 = _mm_add_epi32(msgtmp2, tmp);
+      msgtmp2 = _mm_sha256msg2_epu32(msgtmp2, msgtmp1);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+      // Rounds 56-59
+         msg    = _mm_add_epi32(msgtmp2,
+                  _mm_set_epi64x(0x8CC7020884C87814ull, 0x78A5636F748F82EEull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+      tmp     = _mm_alignr_epi8(msgtmp2, msgtmp1, 4);
+      msgtmp3 = _mm_add_epi32(msgtmp3, tmp);
+      msgtmp3 = _mm_sha256msg2_epu32(msgtmp3, msgtmp2);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+      // Rounds 60-63
+         msg    = _mm_add_epi32(msgtmp3,
+                  _mm_set_epi64x(0xC67178F2BEF9A3F7ull, 0xA4506CEB90BEFFFAull));
+         state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+         msg    = _mm_shuffle_epi32(msg, 0x0E);
+         state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+
+      // Add current hash values with previously saved
+      state0 = _mm_add_epi32(state0, abef_save);
+      state1 = _mm_add_epi32(state1, cdgh_save);
+
+      data += 64;
+      num_blks--;
+   }
+
+   // Write hash values back in the correct order
+   tmp    = _mm_shuffle_epi32(state0, 0x1B);    // FEBA
+   state1 = _mm_shuffle_epi32(state1, 0xB1);    // DCHG
+   state0 = _mm_blend_epi16(tmp, state1, 0xF0); // DCBA
+   state1 = _mm_alignr_epi8(state1, tmp, 8);    // HGFE
+   // digest is a uint32_t *, so store unaligned to mirror the loadu above
+   _mm_storeu_si128((__m128i*) digest, state0);
+   _mm_storeu_si128((__m128i*) (digest+4), state1);
+}
+
Index: head/sys/crypto/aesni/sha_sse.h
===================================================================
--- head/sys/crypto/aesni/sha_sse.h
+++ head/sys/crypto/aesni/sha_sse.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2017 Conrad Meyer <cem@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _CRYPTO__SHA_SSE_H_
+#define _CRYPTO__SHA_SSE_H_
+
+/*
+ * Internal intrinsics-based functions; num_blks counts 64-byte blocks.
+ */
+void intel_sha1_step(uint32_t *digest, const char *data, uint32_t num_blks);
+void intel_sha256_step(uint32_t *digest, const char *data, uint32_t num_blks);
+
+#endif /* _CRYPTO__SHA_SSE_H_ */
Index: head/sys/modules/aesni/Makefile
===================================================================
--- head/sys/modules/aesni/Makefile
+++ head/sys/modules/aesni/Makefile
@@ -1,6 +1,7 @@
 # $FreeBSD$
 
 .PATH: ${SRCTOP}/sys/crypto/aesni
+.PATH: ${SRCTOP}/contrib/llvm/tools/clang/lib/Headers
 
 KMOD=	aesni
 SRCS=	aesni.c
@@ -8,6 +9,7 @@
 SRCS+=	device_if.h bus_if.h opt_bus.h cryptodev_if.h
 
 OBJS+=	aesni_ghash.o aesni_wrap.o
+OBJS+=	intel_sha1.o intel_sha256.o
 
 # Remove -nostdinc so we can get the intrinsics.
 aesni_ghash.o: aesni_ghash.c
@@ -21,8 +23,20 @@
 	     -mmmx -msse -msse4 -maes ${.IMPSRC}
 	${CTFCONVERT_CMD}
 
+intel_sha1.o: intel_sha1.c
+	${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+	     -mmmx -msse -msse4 -msha ${.IMPSRC}
+	${CTFCONVERT_CMD}
+
+intel_sha256.o: intel_sha256.c
+	${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${PROF} \
+	     -mmmx -msse -msse4 -msha ${.IMPSRC}
+	${CTFCONVERT_CMD}
+
 aesni_ghash.o: aesni.h
 aesni_wrap.o: aesni.h
+intel_sha1.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
+intel_sha256.o: sha_sse.h immintrin.h shaintrin.h tmmintrin.h xmmintrin.h
 
 .include <bsd.kmod.mk>
 
Index: head/tests/sys/opencrypto/cryptotest.py
===================================================================
--- head/tests/sys/opencrypto/cryptotest.py
+++ head/tests/sys/opencrypto/cryptotest.py
@@ -47,7 +47,7 @@
 
 aesmodules = [ 'cryptosoft0', 'aesni0', 'ccr0' ]
 desmodules = [ 'cryptosoft0', ]
-shamodules = [ 'cryptosoft0', 'ccr0' ]
+shamodules = [ 'cryptosoft0', 'aesni0', 'ccr0' ]
 
 def GenTestCase(cname):
 	try: