diff --git a/include/i386/Makefile b/include/i386/Makefile
--- a/include/i386/Makefile
+++ b/include/i386/Makefile
@@ -16,6 +16,7 @@
 INCS+=	\
 	counter.h \
 	md_var.h \
+	npx.h \
 	pcpu.h \
 	pcpu_aux.h
 INCSDIR=	${INCLUDEDIR}/i386
diff --git a/sys/i386/i386/npx.c b/sys/i386/i386/npx.c
--- a/sys/i386/i386/npx.c
+++ b/sys/i386/i386/npx.c
@@ -57,6 +57,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1522,6 +1523,31 @@
 	return ((curpcb->pcb_flags & PCB_KERNNPX_THR) != 0);
 }
 
+u_int
+fpu_kern_critical_enter(void)
+{
+	u_int res;
+
+	critical_enter();
+	if ((rcr0() & CR0_TS) != 0) {
+		res = 1;
+		fpu_enable();
+	} else
+		res = 0;
+
+	return (res);
+}
+
+void
+fpu_kern_critical_exit(u_int enter_result)
+{
+	CRITICAL_ASSERT(curthread);
+
+	if (enter_result != 0)
+		fpu_disable();
+	critical_exit();
+}
+
 /*
  * FPU save area alloc/free/init utility routines
  */
diff --git a/sys/i386/include/atomic.h b/sys/i386/include/atomic.h
--- a/sys/i386/include/atomic.h
+++ b/sys/i386/include/atomic.h
@@ -32,6 +32,7 @@
 
 #ifdef _KERNEL
 #include
+#include <machine/npx.h>
 #include
 #endif
 
@@ -455,6 +456,42 @@
 	return (res);
 }
 
+static __inline uint64_t __attribute__((__target__("sse2")))
+atomic_load_acq_64_sse2(const volatile uint64_t *p)
+{
+	char _Alignas(16) xmm_sav[16];
+	uint64_t res;
+	u_int fpu_res;
+	volatile __unused u_int res_half;
+
+	/*
+	 * Prefetch the target before entering the critical section, to
+	 * avoid paying a cache-miss penalty inside it.
+	 */
+	res_half = *(const volatile u_int *)p;
+
+	fpu_res = fpu_kern_critical_enter();
+
+	/*
+	 * All XMM registers are scratch (caller-saved) in the ABI, but the
+	 * kernel generally does not use them, and the userland XMM registers
+	 * may not have been saved yet.  Checking the PCB and branching on
+	 * that would likely cost more than unconditionally saving and
+	 * restoring the single temporary XMM register we use.
+	 */
+	__asm (
+	"	movdqa	%%xmm0, %0;	"
+	"	movq	%2, %%xmm0;	"
+	"	movq	%%xmm0, %1;	"
+	"	movdqa	%0, %%xmm0;	"
+	: "=m" (*xmm_sav),	/* 0 */
+	  "=m" (res)		/* 1 */
+	: "m" (*p));		/* 2 */
+
+	fpu_kern_critical_exit(fpu_res);
+	return (res);
+}
+
 static __inline void
 atomic_store_rel_64_i586(volatile uint64_t *p, uint64_t v)
 {
@@ -512,8 +549,10 @@
 
 	if ((cpu_feature & CPUID_CX8) == 0)
 		return (atomic_load_acq_64_i386(p));
-	else
+	else if ((cpu_feature & CPUID_SSE2) == 0)
 		return (atomic_load_acq_64_i586(p));
+	else
+		return (atomic_load_acq_64_sse2(p));
 }
 
 static __inline void
@@ -571,6 +610,21 @@
 	}
 }
 
+#else /* !_KERNEL */
+
+static __inline uint64_t __attribute__((__target__("sse2")))
+atomic_load_acq_64_sse2(const volatile uint64_t *p)
+{
+	uint64_t res;
+
+	__asm (
+	"	movq	%1, %0;	"
+	: "=x" (res)		/* 0 */
+	: "m" (*p));		/* 1 */
+
+	return (res);
+}
+
 #endif /* _KERNEL */
 
 ATOMIC_ASM(set, char, "orb %b1,%0", "iq", v);
diff --git a/sys/i386/include/npx.h b/sys/i386/include/npx.h
--- a/sys/i386/include/npx.h
+++ b/sys/i386/include/npx.h
@@ -78,6 +78,8 @@
 int	fpu_kern_leave(struct thread *td, struct fpu_kern_ctx *ctx);
 int	fpu_kern_thread(u_int flags);
 int	is_fpu_kern_thread(u_int flags);
+u_int	fpu_kern_critical_enter(void);
+void	fpu_kern_critical_exit(u_int enter_result);
 union savefpu *fpu_save_area_alloc(void);
 void	fpu_save_area_free(union savefpu *fsa);
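
The userland half of the change relies on a single SSE2 movq being a 64-bit single-copy-atomic load; the kernel variant only adds the critical-section and CR0.TS handling because XMM state cannot be assumed to be usable there. Below is an illustrative standalone sketch, not part of the patch: the file name and the helper name load64_sse2 are made up for the example, and the inline asm simply mirrors the !_KERNEL atomic_load_acq_64_sse2() hunk above. Build it 32-bit on an SSE2-capable machine, e.g. cc -m32 -O2 -o load64_sse2_test load64_sse2_test.c.

/*
 * Illustrative sketch only (not part of the patch): exercises the same
 * SSE2 movq-based 64-bit load used by the userland atomic_load_acq_64_sse2().
 */
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring the patch's userland inline asm. */
static __inline uint64_t __attribute__((__target__("sse2")))
load64_sse2(const volatile uint64_t *p)
{
	uint64_t res;

	/* A single movq from memory loads all 64 bits in one access. */
	__asm (
	"	movq	%1, %0;	"
	: "=x" (res)		/* 0 */
	: "m" (*p));		/* 1 */

	return (res);
}

int
main(void)
{
	volatile uint64_t v = 0x0123456789abcdefULL;

	printf("loaded 0x%016jx\n", (uintmax_t)load64_sse2(&v));
	return (0);
}

The expected output is the stored pattern read back in one piece, which is the property the i386 dispatcher gains by preferring the SSE2 path over the cmpxchg8b-based i586 fallback when CPUID_SSE2 is present.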