diff --git a/include/i386/Makefile b/include/i386/Makefile
--- a/include/i386/Makefile
+++ b/include/i386/Makefile
@@ -16,6 +16,7 @@
 INCS+=	\
 	counter.h \
 	md_var.h \
+	npx.h \
 	pcpu.h \
 	pcpu_aux.h
 INCSDIR=	${INCLUDEDIR}/i386
diff --git a/sys/i386/i386/npx.c b/sys/i386/i386/npx.c
--- a/sys/i386/i386/npx.c
+++ b/sys/i386/i386/npx.c
@@ -57,6 +57,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1522,6 +1523,31 @@
 	return ((curpcb->pcb_flags & PCB_KERNNPX_THR) != 0);
 }
 
+u_int
+fpu_kern_critical_enter(void)
+{
+	u_int res;
+
+	critical_enter();
+	if ((rcr0() & CR0_TS) != 0) {
+		res = 1;
+		fpu_enable();
+	} else
+		res = 0;
+
+	return (res);
+}
+
+void
+fpu_kern_critical_exit(u_int enter_result)
+{
+	CRITICAL_ASSERT(curthread);
+
+	if (enter_result != 0)
+		fpu_disable();
+	critical_exit();
+}
+
 /*
  * FPU save area alloc/free/init utility routines
  */
diff --git a/sys/i386/include/atomic.h b/sys/i386/include/atomic.h
--- a/sys/i386/include/atomic.h
+++ b/sys/i386/include/atomic.h
@@ -32,6 +32,7 @@
 
 #ifdef _KERNEL
 #include
+#include <machine/npx.h>
 #include
 #endif
 
@@ -455,6 +456,42 @@
 	return (res);
 }
 
+static __inline uint64_t __attribute__((__target__("sse2")))
+atomic_load_acq_64_sse2(const volatile uint64_t *p)
+{
+	char _Alignas(16) xmm_sav[16];
+	uint64_t res;
+	u_int fpu_res;
+	volatile __unused u_int res_half;
+
+	/*
+	 * Prefetch the target before entering the critical section, to
+	 * avoid paying a cache-miss penalty inside it.
+	 */
+	res_half = *(const volatile u_int *)p;
+
+	fpu_res = fpu_kern_critical_enter();
+
+	/*
+	 * All XMM registers are scratch (caller-saved) in the ABI, but the
+	 * kernel generally does not use them, and the userland XMM registers
+	 * may not have been saved yet.  Checking the PCB and branching on
+	 * that would likely cost more than unconditionally saving and
+	 * restoring the single temporary XMM register we use.
+	 */
+	__asm (
+	"	movdqa	%%xmm0, %0;	"
+	"	movq	%2, %%xmm0;	"
+	"	movq	%%xmm0, %1;	"
+	"	movdqa	%0, %%xmm0;	"
+	: "=m" (*xmm_sav),	/* 0 */
+	  "=m" (res)		/* 1 */
+	: "m" (*p));		/* 2 */
+
+	fpu_kern_critical_exit(fpu_res);
+	return (res);
+}
+
 static __inline void
 atomic_store_rel_64_i586(volatile uint64_t *p, uint64_t v)
 {
@@ -512,8 +549,10 @@
 
 	if ((cpu_feature & CPUID_CX8) == 0)
 		return (atomic_load_acq_64_i386(p));
-	else
+	else if ((cpu_feature & CPUID_SSE2) == 0)
 		return (atomic_load_acq_64_i586(p));
+	else
+		return (atomic_load_acq_64_sse2(p));
 }
 
 static __inline void
@@ -571,6 +610,21 @@
 	}
 }
 
+#else /* !_KERNEL */
+
+static __inline uint64_t __attribute__((__target__("sse2")))
+atomic_load_acq_64_sse2(const volatile uint64_t *p)
+{
+	uint64_t res;
+
+	__asm (
+	"	movq	%1, %0;	"
+	: "=x" (res)		/* 0 */
+	: "m" (*p));		/* 1 */
+
+	return (res);
+}
+
 #endif /* _KERNEL */
 
 ATOMIC_ASM(set, char, "orb %b1,%0", "iq", v);
diff --git a/sys/i386/include/npx.h b/sys/i386/include/npx.h
--- a/sys/i386/include/npx.h
+++ b/sys/i386/include/npx.h
@@ -78,6 +78,8 @@
 int	fpu_kern_leave(struct thread *td, struct fpu_kern_ctx *ctx);
 int	fpu_kern_thread(u_int flags);
 int	is_fpu_kern_thread(u_int flags);
+u_int	fpu_kern_critical_enter(void);
+void	fpu_kern_critical_exit(u_int enter_result);
 union savefpu *fpu_save_area_alloc(void);
 void	fpu_save_area_free(union savefpu *fsa);
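
The userland half of the change relies on a single SSE2 movq being a 64-bit single-copy-atomic load; the kernel variant only adds the critical-section and CR0.TS handling because XMM state cannot be assumed to be usable there. Below is an illustrative standalone sketch, not part of the patch: the file name and the helper name load64_sse2 are made up for the example, and the inline asm simply mirrors the !_KERNEL atomic_load_acq_64_sse2() hunk above. Build it 32-bit on an SSE2-capable machine, e.g. cc -m32 -O2 -o load64_sse2_test load64_sse2_test.c.

/*
 * Illustrative sketch only (not part of the patch): exercises the same
 * SSE2 movq-based 64-bit load used by the userland atomic_load_acq_64_sse2().
 */
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring the patch's userland inline asm. */
static __inline uint64_t __attribute__((__target__("sse2")))
load64_sse2(const volatile uint64_t *p)
{
	uint64_t res;

	/* A single movq from memory loads all 64 bits in one access. */
	__asm (
	"	movq	%1, %0;	"
	: "=x" (res)		/* 0 */
	: "m" (*p));		/* 1 */

	return (res);
}

int
main(void)
{
	volatile uint64_t v = 0x0123456789abcdefULL;

	printf("loaded 0x%016jx\n", (uintmax_t)load64_sse2(&v));
	return (0);
}

The expected output is the stored pattern read back in one piece, which is the property the i386 dispatcher gains by preferring the SSE2 path over the cmpxchg8b-based i586 fallback when CPUID_SSE2 is present.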