Index: head/sys/amd64/amd64/cpu_switch.S =================================================================== --- head/sys/amd64/amd64/cpu_switch.S +++ head/sys/amd64/amd64/cpu_switch.S @@ -87,7 +87,6 @@ ENTRY(cpu_switch) /* Switch to new thread. First, save context. */ movq TD_PCB(%rdi),%r8 - orl $PCB_FULL_IRET,PCB_FLAGS(%r8) movq (%rsp),%rax /* Hardware registers */ movq %r15,PCB_R15(%r8) @@ -99,6 +98,30 @@ movq %rbx,PCB_RBX(%r8) movq %rax,PCB_RIP(%r8) + testl $PCB_FULL_IRET,PCB_FLAGS(%r8) + jnz 2f + orl $PCB_FULL_IRET,PCB_FLAGS(%r8) + testl $TDP_KTHREAD,TD_PFLAGS(%rdi) + jnz 2f + testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) + jz 2f + movl %fs,%eax + cmpl $KUF32SEL,%eax + jne 1f + rdfsbaseq %rax + movq %rax,PCB_FSBASE(%r8) +1: movl %gs,%eax + cmpl $KUG32SEL,%eax + jne 2f + movq %rdx,%r12 + movl $MSR_KGSBASE,%ecx /* Read user gs base */ + rdmsr + shlq $32,%rdx + orq %rdx,%rax + movq %rax,PCB_GSBASE(%r8) + movq %r12,%rdx + +2: testl $PCB_DBREGS,PCB_FLAGS(%r8) jnz store_dr /* static predict not taken */ done_store_dr: Index: head/sys/amd64/amd64/exception.S =================================================================== --- head/sys/amd64/amd64/exception.S +++ head/sys/amd64/amd64/exception.S @@ -187,12 +187,13 @@ jz alltraps_pushregs_no_rdi sti alltraps_pushregs_no_rdi: - movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) + movq %rax,TF_RAX(%rsp) +alltraps_pushregs_no_rax: + movq %rsi,TF_RSI(%rsp) movq %rcx,TF_RCX(%rsp) movq %r8,TF_R8(%rsp) movq %r9,TF_R9(%rsp) - movq %rax,TF_RAX(%rsp) movq %rbx,TF_RBX(%rsp) movq %rbp,TF_RBP(%rsp) movq %r10,TF_R10(%rsp) @@ -326,31 +327,53 @@ prot_addrf: movq $0,TF_ADDR(%rsp) movq %rdi,TF_RDI(%rsp) /* free up a GP register */ + movq %rax,TF_RAX(%rsp) + movq %rdx,TF_RDX(%rsp) + movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) leaq doreti_iret(%rip),%rdi cmpq %rdi,TF_RIP(%rsp) - je 1f /* kernel but with user gsbase!! */ + je 5f /* kernel but with user gsbase!! */ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ - jz 2f /* already running with kernel GS.base */ -1: swapgs -2: movq PCPU(CURPCB),%rdi - orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) /* always full iret from GPF */ - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) + jz 6f /* already running with kernel GS.base */ + testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) + jz 2f + cmpw $KUF32SEL,TF_FS(%rsp) + jne 1f + rdfsbaseq %rax +1: cmpw $KUG32SEL,TF_GS(%rsp) + jne 2f + rdgsbaseq %rdx +2: swapgs + movq PCPU(CURPCB),%rdi + testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) + jz 4f + cmpw $KUF32SEL,TF_FS(%rsp) + jne 3f + movq %rax,PCB_FSBASE(%rdi) +3: cmpw $KUG32SEL,TF_GS(%rsp) + jne 4f + movq %rdx,PCB_GSBASE(%rdi) +4: orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) /* always full iret from GPF */ movw %es,TF_ES(%rsp) movw %ds,TF_DS(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) - jz alltraps_pushregs_no_rdi + jz alltraps_pushregs_no_rax sti - jmp alltraps_pushregs_no_rdi + jmp alltraps_pushregs_no_rax +5: swapgs +6: movq PCPU(CURPCB),%rdi + jmp 4b + /* * Fast syscall entry point. We enter here with just our new %cs/%ss set, * and the new privilige level. We are still running on the old user stack * pointer. We have to juggle a few things around to find our stack etc. * swapgs gives us access to our PCPU space only. * - * We do not support invoking this from a custom %cs or %ss (e.g. using - * entries from an LDT). + * We do not support invoking this from a custom segment registers, + * esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT. */ IDTVEC(fast_syscall) swapgs @@ -503,6 +526,23 @@ nmi_fromuserspace: incl %ebx swapgs + testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) + jz 2f + movq PCPU(CURPCB),%rdi + testq %rdi,%rdi + jz 2f + cmpw $KUF32SEL,TF_FS(%rsp) + jne 1f + rdfsbaseq %rax + movq %rax,PCB_FSBASE(%rdi) +1: cmpw $KUG32SEL,TF_GS(%rsp) + jne 2f + movl $MSR_KGSBASE,%ecx + rdmsr + shlq $32,%rdx + orq %rdx,%rax + movq %rax,PCB_GSBASE(%rdi) +2: /* Note: this label is also used by ddb and gdb: */ nmi_calltrap: FAKE_MCOUNT(TF_RIP(%rsp)) @@ -705,6 +745,7 @@ jz ld_regs testl $PCB_FULL_IRET,PCB_FLAGS(%r8) jz ld_regs + andl $~PCB_FULL_IRET,PCB_FLAGS(%r8) testl $TF_HASSEGS,TF_FLAGS(%rsp) je set_segs Index: head/sys/amd64/amd64/machdep.c =================================================================== --- head/sys/amd64/amd64/machdep.c +++ head/sys/amd64/amd64/machdep.c @@ -372,6 +372,7 @@ sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); + update_pcb_bases(pcb); sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase; sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase; bzero(sf.sf_uc.uc_mcontext.mc_spare, @@ -442,7 +443,6 @@ regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; - set_pcb_flags(pcb, PCB_FULL_IRET); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } @@ -548,6 +548,7 @@ return (ret); } bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); + update_pcb_bases(pcb); pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; @@ -559,7 +560,6 @@ #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); - set_pcb_flags(pcb, PCB_FULL_IRET); return (EJUSTRETURN); } @@ -587,11 +587,11 @@ else mtx_unlock(&dt_lock); + update_pcb_bases(pcb); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; - set_pcb_flags(pcb, PCB_FULL_IRET); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; @@ -2135,6 +2135,7 @@ mcp->mc_flags = tp->tf_flags; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp, NULL, 0); + update_pcb_bases(pcb); mcp->mc_fsbase = pcb->pcb_fsbase; mcp->mc_gsbase = pcb->pcb_gsbase; mcp->mc_xfpustate = 0; @@ -2205,11 +2206,11 @@ tp->tf_fs = mcp->mc_fs; tp->tf_gs = mcp->mc_gs; } + set_pcb_flags(pcb, PCB_FULL_IRET); if (mcp->mc_flags & _MC_HASBASES) { pcb->pcb_fsbase = mcp->mc_fsbase; pcb->pcb_gsbase = mcp->mc_gsbase; } - set_pcb_flags(pcb, PCB_FULL_IRET); return (0); } @@ -2478,6 +2479,71 @@ * None of the breakpoints are in user space. */ return 0; +} + +/* + * The pcb_flags is only modified by current thread, or by other threads + * when current thread is stopped. However, current thread may change it + * from the interrupt context in cpu_switch(), or in the trap handler. + * When we read-modify-write pcb_flags from C sources, compiler may generate + * code that is not atomic regarding the interrupt handler. If a trap or + * interrupt happens and any flag is modified from the handler, it can be + * clobbered with the cached value later. Therefore, we implement setting + * and clearing flags with single-instruction functions, which do not race + * with possible modification of the flags from the trap or interrupt context, + * because traps and interrupts are executed only on instruction boundary. + */ +void +set_pcb_flags_raw(struct pcb *pcb, const u_int flags) +{ + + __asm __volatile("orl %1,%0" + : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) + : "cc", "memory"); + +} + +/* + * The support for RDFSBASE, WRFSBASE and similar instructions for %gs + * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into + * pcb if user space modified the bases. We must save on the context + * switch or if the return to usermode happens through the doreti. + * + * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, + * which have a consequence that the base MSRs must be saved each time + * the PCB_FULL_IRET flag is set. We disable interrupts to sync with + * context switches. + */ +void +set_pcb_flags(struct pcb *pcb, const u_int flags) +{ + register_t r; + + if (curpcb == pcb && + (flags & PCB_FULL_IRET) != 0 && + (pcb->pcb_flags & PCB_FULL_IRET) == 0 && + (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) { + r = intr_disable(); + if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { + if (rfs() == _ufssel) + pcb->pcb_fsbase = rdfsbase(); + if (rgs() == _ugssel) + pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); + } + set_pcb_flags_raw(pcb, flags); + intr_restore(r); + } else { + set_pcb_flags_raw(pcb, flags); + } +} + +void +clear_pcb_flags(struct pcb *pcb, const u_int flags) +{ + + __asm __volatile("andl %1,%0" + : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) + : "cc", "memory"); } #ifdef KDB Index: head/sys/amd64/amd64/ptrace_machdep.c =================================================================== --- head/sys/amd64/amd64/ptrace_machdep.c +++ head/sys/amd64/amd64/ptrace_machdep.c @@ -117,15 +117,17 @@ static void cpu_ptrace_setbase(struct thread *td, int req, register_t r) { + struct pcb *pcb; + pcb = td->td_pcb; + set_pcb_flags(pcb, PCB_FULL_IRET); if (req == PT_SETFSBASE) { - td->td_pcb->pcb_fsbase = r; + pcb->pcb_fsbase = r; td->td_frame->tf_fs = _ufssel; } else { - td->td_pcb->pcb_gsbase = r; + pcb->pcb_gsbase = r; td->td_frame->tf_gs = _ugssel; } - set_pcb_flags(td->td_pcb, PCB_FULL_IRET); } #ifdef COMPAT_FREEBSD32 @@ -136,6 +138,7 @@ cpu32_ptrace(struct thread *td, int req, void *addr, int data) { struct savefpu *fpstate; + struct pcb *pcb; uint32_t r; int error; @@ -167,8 +170,10 @@ error = EINVAL; break; } - r = req == PT_GETFSBASE ? td->td_pcb->pcb_fsbase : - td->td_pcb->pcb_gsbase; + pcb = td->td_pcb; + if (td == curthread) + update_pcb_bases(pcb); + r = req == PT_GETFSBASE ? pcb->pcb_fsbase : pcb->pcb_gsbase; error = copyout(&r, addr, sizeof(r)); break; @@ -197,6 +202,7 @@ cpu_ptrace(struct thread *td, int req, void *addr, int data) { register_t *r, rv; + struct pcb *pcb; int error; #ifdef COMPAT_FREEBSD32 @@ -221,8 +227,10 @@ case PT_GETFSBASE: case PT_GETGSBASE: - r = req == PT_GETFSBASE ? &td->td_pcb->pcb_fsbase : - &td->td_pcb->pcb_gsbase; + pcb = td->td_pcb; + if (td == curthread) + update_pcb_bases(pcb); + r = req == PT_GETFSBASE ? &pcb->pcb_fsbase : &pcb->pcb_gsbase; error = copyout(r, addr, sizeof(*r)); break; Index: head/sys/amd64/amd64/sys_machdep.c =================================================================== --- head/sys/amd64/amd64/sys_machdep.c +++ head/sys/amd64/amd64/sys_machdep.c @@ -254,39 +254,45 @@ error = amd64_set_ioperm(td, &iargs); break; case I386_GET_FSBASE: + update_pcb_bases(pcb); i386base = pcb->pcb_fsbase; error = copyout(&i386base, uap->parms, sizeof(i386base)); break; case I386_SET_FSBASE: error = copyin(uap->parms, &i386base, sizeof(i386base)); if (!error) { + set_pcb_flags(pcb, PCB_FULL_IRET); pcb->pcb_fsbase = i386base; td->td_frame->tf_fs = _ufssel; update_gdt_fsbase(td, i386base); } break; case I386_GET_GSBASE: + update_pcb_bases(pcb); i386base = pcb->pcb_gsbase; error = copyout(&i386base, uap->parms, sizeof(i386base)); break; case I386_SET_GSBASE: error = copyin(uap->parms, &i386base, sizeof(i386base)); if (!error) { + set_pcb_flags(pcb, PCB_FULL_IRET); pcb->pcb_gsbase = i386base; td->td_frame->tf_gs = _ugssel; update_gdt_gsbase(td, i386base); } break; case AMD64_GET_FSBASE: - error = copyout(&pcb->pcb_fsbase, uap->parms, sizeof(pcb->pcb_fsbase)); + update_pcb_bases(pcb); + error = copyout(&pcb->pcb_fsbase, uap->parms, + sizeof(pcb->pcb_fsbase)); break; case AMD64_SET_FSBASE: error = copyin(uap->parms, &a64base, sizeof(a64base)); if (!error) { if (a64base < VM_MAXUSER_ADDRESS) { - pcb->pcb_fsbase = a64base; set_pcb_flags(pcb, PCB_FULL_IRET); + pcb->pcb_fsbase = a64base; td->td_frame->tf_fs = _ufssel; } else error = EINVAL; @@ -294,15 +300,17 @@ break; case AMD64_GET_GSBASE: - error = copyout(&pcb->pcb_gsbase, uap->parms, sizeof(pcb->pcb_gsbase)); + update_pcb_bases(pcb); + error = copyout(&pcb->pcb_gsbase, uap->parms, + sizeof(pcb->pcb_gsbase)); break; case AMD64_SET_GSBASE: error = copyin(uap->parms, &a64base, sizeof(a64base)); if (!error) { if (a64base < VM_MAXUSER_ADDRESS) { - pcb->pcb_gsbase = a64base; set_pcb_flags(pcb, PCB_FULL_IRET); + pcb->pcb_gsbase = a64base; td->td_frame->tf_gs = _ugssel; } else error = EINVAL; Index: head/sys/amd64/amd64/vm_machdep.c =================================================================== --- head/sys/amd64/amd64/vm_machdep.c +++ head/sys/amd64/amd64/vm_machdep.c @@ -238,7 +238,7 @@ pcb2->pcb_tssp = NULL; /* New segment registers. */ - set_pcb_flags(pcb2, PCB_FULL_IRET); + set_pcb_flags_raw(pcb2, PCB_FULL_IRET); /* Copy the LDT, if necessary. */ mdp1 = &td1->td_proc->p_md; @@ -439,7 +439,7 @@ pcb2->pcb_save = get_pcb_user_save_pcb(pcb2); bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save, cpu_max_ext_state_size); - set_pcb_flags(pcb2, PCB_FULL_IRET); + set_pcb_flags_raw(pcb2, PCB_FULL_IRET); /* * Create a new fresh stack for the new thread. Index: head/sys/amd64/include/asmacros.h =================================================================== --- head/sys/amd64/include/asmacros.h +++ head/sys/amd64/include/asmacros.h @@ -177,7 +177,12 @@ movw %es,TF_ES(%rsp) ; \ movw %ds,TF_DS(%rsp) ; \ movl $TF_HASSEGS,TF_FLAGS(%rsp) ; \ - cld + cld ; \ + testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel ? */ \ + jz 2f ; /* yes, leave PCB_FULL_IRET alone */ \ + movq PCPU(CURPCB),%r8 ; \ + andl $~PCB_FULL_IRET,PCB_FLAGS(%r8) ; \ +2: #define POP_FRAME \ movq TF_RDI(%rsp),%rdi ; \ Index: head/sys/amd64/include/pcb.h =================================================================== --- head/sys/amd64/include/pcb.h +++ head/sys/amd64/include/pcb.h @@ -119,40 +119,15 @@ #ifdef _KERNEL struct trapframe; -/* - * The pcb_flags is only modified by current thread, or by other threads - * when current thread is stopped. However, current thread may change it - * from the interrupt context in cpu_switch(), or in the trap handler. - * When we read-modify-write pcb_flags from C sources, compiler may generate - * code that is not atomic regarding the interrupt handler. If a trap or - * interrupt happens and any flag is modified from the handler, it can be - * clobbered with the cached value later. Therefore, we implement setting - * and clearing flags with single-instruction functions, which do not race - * with possible modification of the flags from the trap or interrupt context, - * because traps and interrupts are executed only on instruction boundary. - */ -static __inline void -set_pcb_flags(struct pcb *pcb, const u_int flags) -{ - - __asm __volatile("orl %1,%0" - : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) - : "cc"); -} - -static __inline void -clear_pcb_flags(struct pcb *pcb, const u_int flags) -{ - - __asm __volatile("andl %1,%0" - : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) - : "cc"); -} - +void clear_pcb_flags(struct pcb *pcb, const u_int flags); void makectx(struct trapframe *, struct pcb *); +void set_pcb_flags(struct pcb *pcb, const u_int flags); +void set_pcb_flags_raw(struct pcb *pcb, const u_int flags); int savectx(struct pcb *) __returns_twice; void resumectx(struct pcb *); +/* Ensure that pcb_gsbase and pcb_fsbase are up to date */ +#define update_pcb_bases(pcb) set_pcb_flags((pcb), PCB_FULL_IRET) #endif #endif /* _AMD64_PCB_H_ */ Index: head/sys/sys/param.h =================================================================== --- head/sys/sys/param.h +++ head/sys/sys/param.h @@ -58,7 +58,7 @@ * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1200040 /* Master, propagated to newvers */ +#define __FreeBSD_version 1200041 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, @@ -83,6 +83,7 @@ #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAP_GUARD 1200035 +#define P_OSREL_WRFSBASE 1200041 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif