Index: sys/amd64/amd64/genassym.c =================================================================== --- sys/amd64/amd64/genassym.c +++ sys/amd64/amd64/genassym.c @@ -156,8 +156,6 @@ ASSYM(PCB_GS32SD, offsetof(struct pcb, pcb_gs32sd)); ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); -ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct savefpu)); -ASSYM(PCB_USERFPU, sizeof(struct pcb)); ASSYM(PCB_EFER, offsetof(struct pcb, pcb_efer)); ASSYM(PCB_STAR, offsetof(struct pcb, pcb_star)); ASSYM(PCB_LSTAR, offsetof(struct pcb, pcb_lstar)); Index: sys/amd64/amd64/sys_machdep.c =================================================================== --- sys/amd64/amd64/sys_machdep.c +++ sys/amd64/amd64/sys_machdep.c @@ -319,7 +319,7 @@ fpugetregs(td); error = copyout((char *)(get_pcb_user_save_td(td) + 1), a64xfpu.addr, a64xfpu.len); - return (error); + break; default: error = EINVAL; Index: sys/amd64/amd64/vm_machdep.c =================================================================== --- sys/amd64/amd64/vm_machdep.c +++ sys/amd64/amd64/vm_machdep.c @@ -127,7 +127,7 @@ void * alloc_fpusave(int flags) { - struct pcb *res; + void *res; struct savefpu_ymm *sf; res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags); Index: sys/i386/i386/genassym.c =================================================================== --- sys/i386/i386/genassym.c +++ sys/i386/i386/genassym.c @@ -144,7 +144,6 @@ ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); -ASSYM(PCB_USERFPU, offsetof(struct pcb, pcb_user_save)); ASSYM(PCB_PSL, offsetof(struct pcb, pcb_psl)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); @@ -154,7 +153,6 @@ ASSYM(PCB_VM86, offsetof(struct pcb, pcb_vm86)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); -ASSYM(PCB_SAVEFPU_SIZE, sizeof(union savefpu)); ASSYM(PCB_ONFAULT, 
offsetof(struct pcb, pcb_onfault)); ASSYM(PCB_SIZE, sizeof(struct pcb)); Index: sys/i386/i386/initcpu.c =================================================================== --- sys/i386/i386/initcpu.c +++ sys/i386/i386/initcpu.c @@ -102,6 +102,7 @@ #endif u_int cpu_clflush_line_size = 32; u_int cpu_stdext_feature; +u_int cpu_max_ext_state_size; u_int cpu_mon_mwait_flags; /* MONITOR/MWAIT flags (CPUID.05H.ECX) */ u_int cpu_mon_min_size; /* MONITOR minimum range size, bytes */ u_int cpu_mon_max_size; /* MONITOR minimum range size, bytes */ Index: sys/i386/i386/locore.s =================================================================== --- sys/i386/i386/locore.s +++ sys/i386/i386/locore.s @@ -302,17 +302,14 @@ begin: /* set up bootstrap stack */ movl proc0kstack,%eax /* location of in-kernel stack */ - /* bootstrap stack end location */ - leal (KSTACK_PAGES*PAGE_SIZE-PCB_SIZE)(%eax),%esp - xorl %ebp,%ebp /* mark end of frames */ + /* + * Only use bottom page for init386(). init386() calculates the + * PCB + FPU save area size and returns the true top of stack. + */ + leal PAGE_SIZE(%eax),%esp -#ifdef PAE - movl IdlePDPT,%esi -#else - movl IdlePTD,%esi -#endif - movl %esi,(KSTACK_PAGES*PAGE_SIZE-PCB_SIZE+PCB_CR3)(%eax) + xorl %ebp,%ebp /* mark end of frames */ pushl physfree /* value of first for init386(first) */ call init386 /* wire 386 chip for unix operation */ @@ -324,6 +321,9 @@ */ addl $4,%esp + /* Switch to true top of stack. 
*/ + movl %eax,%esp + call mi_startup /* autoconfiguration, mountroot etc */ /* NOTREACHED */ addl $0,%esp /* for db_numargs() again */ Index: sys/i386/i386/machdep.c =================================================================== --- sys/i386/i386/machdep.c +++ sys/i386/i386/machdep.c @@ -181,7 +181,7 @@ /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); -extern void init386(int first); +extern register_t init386(int first); extern void dblfault_handler(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) @@ -193,8 +193,10 @@ static void cpu_startup(void *); static void fpstate_drop(struct thread *td); -static void get_fpcontext(struct thread *td, mcontext_t *mcp); -static int set_fpcontext(struct thread *td, const mcontext_t *mcp); +static void get_fpcontext(struct thread *td, mcontext_t *mcp, + char *xfpusave, size_t xfpusave_len); +static int set_fpcontext(struct thread *td, const mcontext_t *mcp, + char *xfpustate, size_t xfpustate_len); #ifdef CPU_ENABLE_SSE static void set_fpregs_xmm(struct save87 *, struct savexmm *); static void fill_fpregs_xmm(struct savexmm *, struct save87 *); @@ -363,7 +365,7 @@ * Send an interrupt to process. * * Stack is set up to allow sigcode stored - * at top to call routine, followed by kcall + * at top to call routine, followed by call * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user @@ -642,6 +644,8 @@ char *sp; struct trapframe *regs; struct segment_descriptor *sdp; + char *xfpusave; + size_t xfpusave_len; int sig; int oonstack; @@ -666,6 +670,14 @@ regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); + if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) { + xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu); + xfpusave = __builtin_alloca(xfpusave_len); + } else { + xfpusave_len = 0; + xfpusave = NULL; + } + /* Save user context. 
*/ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; @@ -676,7 +688,7 @@ sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ - get_fpcontext(td, &sf.sf_uc.uc_mcontext); + get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); /* * Unconditionally fill the fsbase and gsbase into the mcontext. @@ -687,7 +699,6 @@ sdp = &td->td_pcb->pcb_gsd; sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; - sf.sf_uc.uc_mcontext.mc_flags = 0; bzero(sf.sf_uc.uc_mcontext.mc_spare2, sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); @@ -695,13 +706,19 @@ /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { - sp = td->td_sigstk.ss_sp + - td->td_sigstk.ss_size - sizeof(struct sigframe); + sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size; #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else - sp = (char *)regs->tf_esp - sizeof(struct sigframe); + sp = (char *)regs->tf_esp - 128; + if (xfpusave != NULL) { + sp -= xfpusave_len; + sp = (char *)((unsigned int)sp & ~0x3F); + sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; + } + sp -= sizeof(struct sigframe); + /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned int)sp & ~0xF); @@ -762,7 +779,10 @@ /* * Copy the sigframe out to the user's stack. 
*/ - if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { + if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || + (xfpusave != NULL && copyout(xfpusave, + (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) + != 0)) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif @@ -1022,11 +1042,16 @@ } */ *uap; { ucontext_t uc; + struct proc *p; struct trapframe *regs; ucontext_t *ucp; + char *xfpustate; + size_t xfpustate_len; int cs, eflags, error, ret; ksiginfo_t ksi; + p = td->td_proc; + error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); @@ -1101,7 +1126,30 @@ return (EINVAL); } - ret = set_fpcontext(td, &ucp->uc_mcontext); + if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { + xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; + if (xfpustate_len > cpu_max_ext_state_size - + sizeof(union savefpu)) { + uprintf( + "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", + p->p_pid, td->td_name, xfpustate_len); + return (EINVAL); + } + xfpustate = __builtin_alloca(xfpustate_len); + error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, + xfpustate, xfpustate_len); + if (error != 0) { + uprintf( + "pid %d (%s): sigreturn copying xfpustate failed\n", + p->p_pid, td->td_name); + return (error); + } + } else { + xfpustate = NULL; + xfpustate_len = 0; + } + ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, + xfpustate_len); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); @@ -1599,17 +1647,9 @@ */ reset_dbregs(); } - pcb->pcb_flags &= ~PCB_DBREGS; + pcb->pcb_flags &= ~PCB_DBREGS; } - /* - * Initialize the math emulator (if any) for the current process. - * Actually, just clear the bit that says that the emulator has - * been initialized. Initialization is delayed until the process - * traps to the emulator (if it is done at all) mainly because - * emulators don't provide an entry point for initialization. 
- */ - td->td_pcb->pcb_flags &= ~FP_SOFTFP; pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; /* @@ -2861,14 +2901,14 @@ #ifdef XEN #define MTOPSIZE (1<<(14 + PAGE_SHIFT)) -void +register_t init386(first) int first; { unsigned long gdtmachpfn; int error, gsel_tss, metadata_missing, x, pa; - size_t kstack0_sz; struct pcpu *pc; + struct xstate_hdr *xhdr; struct callback_register event = { .type = CALLBACKTYPE_event, .address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback }, @@ -2880,8 +2920,6 @@ thread0.td_kstack = proc0kstack; thread0.td_kstack_pages = KSTACK_PAGES; - kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; - thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1; /* * This may be done better later if it gets more high level @@ -2961,7 +2999,6 @@ PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); - PCPU_SET(curpcb, thread0.td_pcb); /* * Initialize mutexes. @@ -3043,15 +3080,6 @@ initializecpu(); /* Initialize CPU registers */ initializecpucache(); - /* make an initial tss so cpu can get interrupt stack on syscall! */ - /* Note: -16 is so we can grow the trapframe if we came from vm86 */ - PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + - kstack0_sz - sizeof(struct pcb) - 16); - PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); - gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); - HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), - PCPU_GET(common_tss.tss_esp0)); - /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); @@ -3079,6 +3107,30 @@ /* now running on new page tables, configured,and u/iom is accessible */ msgbufinit(msgbufp, msgbufsize); +#ifdef DEV_NPX + npxinit(true); +#endif + /* + * Set up thread0 pcb after npxinit calculated pcb + fpu save + * area size. Zero out the extended state header in fpu save + * area. 
+ */ + thread0.td_pcb = get_pcb_td(&thread0); + bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); + if (use_xsave) { + xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + + 1); + xhdr->xstate_bv = xsave_mask; + } + PCPU_SET(curpcb, thread0.td_pcb); + /* make an initial tss so cpu can get interrupt stack on syscall! */ + /* Note: -16 is so we can grow the trapframe if we came from vm86 */ + PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16); + PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); + gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); + HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), + PCPU_GET(common_tss.tss_esp0)); + /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); @@ -3097,22 +3149,23 @@ thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1]; cpu_probe_amdc1e(); + + /* Location of kernel stack for locore */ + return ((register_t)thread0.td_pcb); } #else -void +register_t init386(first) int first; { struct gate_descriptor *gdp; int gsel_tss, metadata_missing, x, pa; - size_t kstack0_sz; struct pcpu *pc; + struct xstate_hdr *xhdr; thread0.td_kstack = proc0kstack; thread0.td_kstack_pages = KSTACK_PAGES; - kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; - thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1; /* * This may be done better later if it gets more high level @@ -3173,7 +3226,6 @@ first += DPCPU_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); - PCPU_SET(curpcb, thread0.td_pcb); /* * Initialize mutexes. @@ -3328,17 +3380,6 @@ initializecpu(); /* Initialize CPU registers */ initializecpucache(); - /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ - /* Note: -16 is so we can grow the trapframe if we came from vm86 */ - PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + - kstack0_sz - sizeof(struct pcb) - 16); - PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); - gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); - PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); - PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); - PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); - ltr(gsel_tss); - /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); @@ -3366,6 +3407,31 @@ /* now running on new page tables, configured,and u/iom is accessible */ msgbufinit(msgbufp, msgbufsize); +#ifdef DEV_NPX + npxinit(true); +#endif + /* + * Set up thread0 pcb after npxinit calculated pcb + fpu save + * area size. Zero out the extended state header in fpu save + * area. + */ + thread0.td_pcb = get_pcb_td(&thread0); + bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); + if (use_xsave) { + xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + + 1); + xhdr->xstate_bv = xsave_mask; + } + PCPU_SET(curpcb, thread0.td_pcb); + /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ + /* Note: -16 is so we can grow the trapframe if we came from vm86 */ + PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16); + PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); + gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); + PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); + PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); + PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); + ltr(gsel_tss); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; @@ -3404,6 +3470,9 @@ #ifdef FDT x86_init_fdt(); #endif + + /* Location of kernel stack for locore */ + return ((register_t)thread0.td_pcb); } #endif @@ -3684,11 +3753,11 @@ #endif #ifdef CPU_ENABLE_SSE if (cpu_fxsr) - fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm, + fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm, (struct save87 *)fpregs); else #endif /* CPU_ENABLE_SSE */ - bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs, + bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs, sizeof(*fpregs)); return (0); } @@ -3700,10 +3769,10 @@ #ifdef CPU_ENABLE_SSE if (cpu_fxsr) set_fpregs_xmm((struct save87 *)fpregs, - &td->td_pcb->pcb_user_save.sv_xmm); + &get_pcb_user_save_td(td)->sv_xmm); else #endif /* CPU_ENABLE_SSE */ - bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87, + bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87, sizeof(*fpregs)); #ifdef DEV_NPX npxuserinited(td); @@ -3749,12 +3818,14 @@ mcp->mc_esp = tp->tf_esp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); - get_fpcontext(td, mcp); + get_fpcontext(td, mcp, NULL, 0); sdp = &td->td_pcb->pcb_fsd; mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sdp = &td->td_pcb->pcb_gsd; mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; mcp->mc_flags = 0; + mcp->mc_xfpustate = 0; + mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); return (0); } @@ -3769,6 +3840,7 @@ set_mcontext(struct thread *td, const mcontext_t *mcp) { struct trapframe *tp; + char *xfpustate; int eflags, ret; tp = td->td_frame; @@ -3776,30 
+3848,43 @@ return (EINVAL); eflags = (mcp->mc_eflags & PSL_USERCHANGE) | (tp->tf_eflags & ~PSL_USERCHANGE); - if ((ret = set_fpcontext(td, mcp)) == 0) { - tp->tf_fs = mcp->mc_fs; - tp->tf_es = mcp->mc_es; - tp->tf_ds = mcp->mc_ds; - tp->tf_edi = mcp->mc_edi; - tp->tf_esi = mcp->mc_esi; - tp->tf_ebp = mcp->mc_ebp; - tp->tf_ebx = mcp->mc_ebx; - tp->tf_edx = mcp->mc_edx; - tp->tf_ecx = mcp->mc_ecx; - tp->tf_eax = mcp->mc_eax; - tp->tf_eip = mcp->mc_eip; - tp->tf_eflags = eflags; - tp->tf_esp = mcp->mc_esp; - tp->tf_ss = mcp->mc_ss; - td->td_pcb->pcb_gs = mcp->mc_gs; - ret = 0; - } - return (ret); + if (mcp->mc_flags & _MC_HASFPXSTATE) { + if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - + sizeof(union savefpu)) + return (EINVAL); + xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); + ret = copyin((void *)mcp->mc_xfpustate, xfpustate, + mcp->mc_xfpustate_len); + if (ret != 0) + return (ret); + } else + xfpustate = NULL; + ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); + if (ret != 0) + return (ret); + tp->tf_fs = mcp->mc_fs; + tp->tf_es = mcp->mc_es; + tp->tf_ds = mcp->mc_ds; + tp->tf_edi = mcp->mc_edi; + tp->tf_esi = mcp->mc_esi; + tp->tf_ebp = mcp->mc_ebp; + tp->tf_ebx = mcp->mc_ebx; + tp->tf_edx = mcp->mc_edx; + tp->tf_ecx = mcp->mc_ecx; + tp->tf_eax = mcp->mc_eax; + tp->tf_eip = mcp->mc_eip; + tp->tf_eflags = eflags; + tp->tf_esp = mcp->mc_esp; + tp->tf_ss = mcp->mc_ss; + td->td_pcb->pcb_gs = mcp->mc_gs; + return (0); } static void -get_fpcontext(struct thread *td, mcontext_t *mcp) +get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, + size_t xfpusave_len) { + size_t max_len, len; #ifndef DEV_NPX mcp->mc_fpformat = _MC_FPFMT_NODEV; @@ -3807,37 +3892,54 @@ bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate)); #else mcp->mc_ownedfp = npxgetregs(td); - bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate[0], + bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = npxformat(); + if (!use_xsave 
|| xfpusave_len == 0) + return; + max_len = cpu_max_ext_state_size - sizeof(union savefpu); + len = xfpusave_len; + if (len > max_len) { + len = max_len; /* clamp; excess tail is zeroed, not copied */ + bzero(xfpusave + max_len, xfpusave_len - max_len); + } + mcp->mc_flags |= _MC_HASFPXSTATE; + mcp->mc_xfpustate_len = len; + bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); #endif } static int -set_fpcontext(struct thread *td, const mcontext_t *mcp) +set_fpcontext(struct thread *td, const mcontext_t *mcp, char *xfpustate, + size_t xfpustate_len) { + union savefpu *fpstate; + int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_387 && mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); - else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) + else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); - else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || + error = 0; + } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { #ifdef DEV_NPX + fpstate = (union savefpu *)&mcp->mc_fpstate; #ifdef CPU_ENABLE_SSE if (cpu_fxsr) - ((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env.
- en_mxcsr &= cpu_mxcsr_mask; + fpstate->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask; #endif - npxsetregs(td, (union savefpu *)&mcp->mc_fpstate); + error = npxsetregs(td, fpstate, xfpustate, xfpustate_len); +#else + error = EINVAL; #endif } else return (EINVAL); - return (0); + return (error); } static void Index: sys/i386/i386/mp_machdep.c =================================================================== --- sys/i386/i386/mp_machdep.c +++ sys/i386/i386/mp_machdep.c @@ -749,7 +749,7 @@ initializecpu(); /* set up FPU state on the AP */ - npxinit(); + npxinit(false); if (cpu_ops.cpu_init) cpu_ops.cpu_init(); @@ -1512,11 +1512,11 @@ cpu = PCPU_GET(cpuid); if (savectx(&susppcbs[cpu]->sp_pcb)) { - npxsuspend(&susppcbs[cpu]->sp_fpususpend); + npxsuspend(susppcbs[cpu]->sp_fpususpend); wbinvd(); CPU_SET_ATOMIC(cpu, &suspended_cpus); } else { - npxresume(&susppcbs[cpu]->sp_fpususpend); + npxresume(susppcbs[cpu]->sp_fpususpend); pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); Index: sys/i386/i386/ptrace_machdep.c =================================================================== --- sys/i386/i386/ptrace_machdep.c +++ sys/i386/i386/ptrace_machdep.c @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -41,6 +42,47 @@ #define CPU_ENABLE_SSE #endif +#ifdef CPU_ENABLE_SSE +static int +cpu_ptrace_xstate(struct thread *td, int req, void *addr, int data) +{ + char *savefpu; + int error; + + if (!use_xsave) + return (EOPNOTSUPP); + + switch (req) { + case PT_GETXSTATE: + npxgetregs(td); + savefpu = (char *)(get_pcb_user_save_td(td) + 1); + error = copyout(savefpu, addr, + cpu_max_ext_state_size - sizeof(union savefpu)); + break; + + case PT_SETXSTATE: + if (data > cpu_max_ext_state_size - sizeof(union savefpu)) { + error = EINVAL; + break; + } + savefpu = malloc(data, M_TEMP, M_WAITOK); + error = copyin(addr, savefpu, data); + if (error == 0) { + npxgetregs(td); + error = npxsetxstate(td, savefpu, data); + } + free(savefpu, M_TEMP); + break; + + 
default: + error = EINVAL; + break; + } + + return (error); +} +#endif + int cpu_ptrace(struct thread *td, int req, void *addr, int data) { @@ -51,7 +93,7 @@ if (!cpu_fxsr) return (EINVAL); - fpstate = &td->td_pcb->pcb_user_save.sv_xmm; + fpstate = &get_pcb_user_save_td(td)->sv_xmm; switch (req) { case PT_GETXMMREGS: npxgetregs(td); @@ -64,6 +106,11 @@ fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask; break; + case PT_GETXSTATE: + case PT_SETXSTATE: + error = cpu_ptrace_xstate(td, req, addr, data); + break; + default: return (EINVAL); } Index: sys/i386/i386/sys_machdep.c =================================================================== --- sys/i386/i386/sys_machdep.c +++ sys/i386/i386/sys_machdep.c @@ -105,6 +105,7 @@ union { struct i386_ldt_args largs; struct i386_ioperm_args iargs; + struct i386_get_xfpustate xfpu; } kargs; uint32_t base; struct segment_descriptor sd, *sdp; @@ -126,6 +127,7 @@ case I386_SET_FSBASE: case I386_GET_GSBASE: case I386_SET_GSBASE: + case I386_GET_XFPUSTATE: break; case I386_SET_IOPERM: @@ -154,6 +156,11 @@ if (kargs.largs.num > MAX_LD || kargs.largs.num <= 0) return (EINVAL); break; + case I386_GET_XFPUSTATE: + if ((error = copyin(uap->parms, &kargs.xfpu, + sizeof(struct i386_get_xfpustate))) != 0) + return (error); + break; default: break; } @@ -270,6 +277,14 @@ load_gs(GSEL(GUGS_SEL, SEL_UPL)); } break; + case I386_GET_XFPUSTATE: + if (kargs.xfpu.len > cpu_max_ext_state_size - + sizeof(union savefpu)) + return (EINVAL); + npxgetregs(td); + error = copyout((char *)(get_pcb_user_save_td(td) + 1), + kargs.xfpu.addr, kargs.xfpu.len); + break; default: error = EINVAL; break; Index: sys/i386/i386/trap.c =================================================================== --- sys/i386/i386/trap.c +++ sys/i386/i386/trap.c @@ -1150,7 +1150,7 @@ KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", syscallname(td->td_proc, sa.code))); - KASSERT(td->td_pcb->pcb_save == &td->td_pcb->pcb_user_save, + 
KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, sa.code))); Index: sys/i386/i386/vm86bios.s =================================================================== --- sys/i386/i386/vm86bios.s +++ sys/i386/i386/vm86bios.s @@ -69,8 +69,6 @@ movl PCPU(CURTHREAD),%ecx cmpl %ecx,PCPU(FPCURTHREAD) /* do we need to save fp? */ jne 1f - testl %ecx,%ecx - je 1f /* no curproc/npxproc */ pushl %edx movl TD_PCB(%ecx),%ecx pushl PCB_SAVEFPU(%ecx) Index: sys/i386/i386/vm_machdep.c =================================================================== --- sys/i386/i386/vm_machdep.c +++ sys/i386/i386/vm_machdep.c @@ -118,8 +118,50 @@ static volatile u_int cpu_reset_proxy_active; #endif -extern int _ucodesel, _udatasel; +union savefpu * +get_pcb_user_save_td(struct thread *td) +{ + vm_offset_t p; + + p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE - + cpu_max_ext_state_size; + KASSERT((p % 64) == 0, ("Unaligned pcb_user_save area")); + return ((union savefpu *)p); +} + +union savefpu * +get_pcb_user_save_pcb(struct pcb *pcb) +{ + vm_offset_t p; + + p = (vm_offset_t)(pcb + 1); + return ((union savefpu *)p); +} + +struct pcb * +get_pcb_td(struct thread *td) +{ + vm_offset_t p; + p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE - + cpu_max_ext_state_size - sizeof(struct pcb); + return ((struct pcb *)p); +} + +void * +alloc_fpusave(int flags) +{ + void *res; + struct savefpu_ymm *sf; + + res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags); + if (res != NULL && use_xsave) { /* M_NOWAIT malloc may fail */ + sf = (struct savefpu_ymm *)res; + bzero(&sf->sv_xstate.sx_hd, sizeof(sf->sv_xstate.sx_hd)); + sf->sv_xstate.sx_hd.xstate_bv = xsave_mask; + } + return (res); +} /* * Finish a fork operation, with process p2 nearly set up.
* Copy and update the pcb, set up the stack so that the child @@ -169,15 +211,16 @@ #endif /* Point the pcb to the top of the stack */ - pcb2 = (struct pcb *)(td2->td_kstack + - td2->td_kstack_pages * PAGE_SIZE) - 1; + pcb2 = get_pcb_td(td2); td2->td_pcb = pcb2; /* Copy td1's pcb */ bcopy(td1->td_pcb, pcb2, sizeof(*pcb2)); /* Properly initialize pcb_save */ - pcb2->pcb_save = &pcb2->pcb_user_save; + pcb2->pcb_save = get_pcb_user_save_pcb(pcb2); + bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2), + cpu_max_ext_state_size); /* Point mdproc and then copy over td1's contents */ mdp2 = &p2->p_md; @@ -354,12 +397,18 @@ void cpu_thread_alloc(struct thread *td) { - - td->td_pcb = (struct pcb *)(td->td_kstack + - td->td_kstack_pages * PAGE_SIZE) - 1; - td->td_frame = (struct trapframe *)((caddr_t)td->td_pcb - 16) - 1; - td->td_pcb->pcb_ext = NULL; - td->td_pcb->pcb_save = &td->td_pcb->pcb_user_save; + struct pcb *pcb; + struct xstate_hdr *xhdr; + + td->td_pcb = pcb = get_pcb_td(td); + td->td_frame = (struct trapframe *)((caddr_t)pcb - 16) - 1; + pcb->pcb_ext = NULL; + pcb->pcb_save = get_pcb_user_save_pcb(pcb); + if (use_xsave) { + xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1); + bzero(xhdr, sizeof(*xhdr)); + xhdr->xstate_bv = xsave_mask; + } } void @@ -427,7 +476,9 @@ bcopy(td0->td_pcb, pcb2, sizeof(*pcb2)); pcb2->pcb_flags &= ~(PCB_NPXINITDONE | PCB_NPXUSERINITDONE | PCB_KERNNPX); - pcb2->pcb_save = &pcb2->pcb_user_save; + pcb2->pcb_save = get_pcb_user_save_pcb(pcb2); + bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save, + cpu_max_ext_state_size); /* * Create a new fresh stack for the new thread. 
Index: sys/i386/include/cpufunc.h =================================================================== --- sys/i386/include/cpufunc.h +++ sys/i386/include/cpufunc.h @@ -457,6 +457,25 @@ return (data); } +static __inline uint64_t +rxcr(u_int reg) +{ + u_int low, high; + + __asm __volatile("xgetbv" : "=a" (low), "=d" (high) : "c" (reg)); + return (low | ((uint64_t)high << 32)); +} + +static __inline void +load_xcr(u_int reg, uint64_t val) +{ + u_int low, high; + + low = val; + high = val >> 32; + __asm __volatile("xsetbv" : : "c" (reg), "a" (low), "d" (high)); +} + /* * Global TLB flush (except for thise for pages marked PG_G) */ Index: sys/i386/include/md_var.h =================================================================== --- sys/i386/include/md_var.h +++ sys/i386/include/md_var.h @@ -52,6 +52,7 @@ extern u_int cpu_fxsr; extern u_int cpu_high; extern u_int cpu_id; +extern u_int cpu_max_ext_state_size; extern u_int cpu_mxcsr_mask; extern u_int cpu_procinfo; extern u_int cpu_procinfo2; @@ -76,14 +77,21 @@ extern uint32_t *vm_page_dump; extern int vm_page_dump_size; extern int workaround_erratum383; +extern int _udatasel; +extern int _ucodesel; +extern int use_xsave; +extern uint64_t xsave_mask; typedef void alias_for_inthand_t(u_int cs, u_int ef, u_int esp, u_int ss); +struct pcb; +union savefpu; struct thread; struct reg; struct fpreg; struct dbreg; struct dumperinfo; +void *alloc_fpusave(int flags); void bcopyb(const void *from, void *to, size_t len); void busdma_swi(void); void cpu_setregs(void); @@ -114,5 +122,8 @@ void setidt(int idx, alias_for_inthand_t *func, int typ, int dpl, int selec); int user_dbreg_trap(void); int minidumpsys(struct dumperinfo *); +union savefpu *get_pcb_user_save_td(struct thread *td); +union savefpu *get_pcb_user_save_pcb(struct pcb *pcb); +struct pcb *get_pcb_td(struct thread *td); #endif /* !_MACHINE_MD_VAR_H_ */ Index: sys/i386/include/npx.h =================================================================== --- 
sys/i386/include/npx.h +++ sys/i386/include/npx.h @@ -45,17 +45,24 @@ #ifdef _KERNEL +struct fpu_kern_ctx; + #define PCB_USER_FPU(pcb) (((pcb)->pcb_flags & PCB_KERNNPX) == 0) +#define XSAVE_AREA_ALIGN 64 + int npxdna(void); void npxdrop(void); void npxexit(struct thread *td); int npxformat(void); int npxgetregs(struct thread *td); -void npxinit(void); +void npxinit(bool bsp); void npxresume(union savefpu *addr); void npxsave(union savefpu *addr); -void npxsetregs(struct thread *td, union savefpu *addr); +int npxsetregs(struct thread *td, union savefpu *addr, + char *xfpustate, size_t xfpustate_size); +int npxsetxstate(struct thread *td, char *xfpustate, + size_t xfpustate_size); void npxsuspend(union savefpu *addr); int npxtrap_x87(void); int npxtrap_sse(void); @@ -68,8 +75,12 @@ int fpu_kern_thread(u_int flags); int is_fpu_kern_thread(u_int flags); +union savefpu *fpu_save_area_alloc(void); +void fpu_save_area_free(union savefpu *fsa); +void fpu_save_area_reset(union savefpu *fsa); + /* - * Flags for fpu_kern_enter() and fpu_kern_thread(). + * Flags for fpu_kern_alloc_ctx(), fpu_kern_enter() and fpu_kern_thread(). 
*/ #define FPU_KERN_NORMAL 0x0000 #define FPU_KERN_NOWAIT 0x0001 Index: sys/i386/include/pcb.h =================================================================== --- sys/i386/include/pcb.h +++ sys/i386/include/pcb.h @@ -45,17 +45,23 @@ #include struct pcb { - int pcb_cr0; - int pcb_cr2; - int pcb_cr3; - int pcb_cr4; int pcb_edi; int pcb_esi; int pcb_ebp; int pcb_esp; int pcb_ebx; int pcb_eip; - + struct segment_descriptor pcb_fsd; + struct segment_descriptor pcb_gsd; + int pcb_ds; + int pcb_es; + int pcb_fs; + int pcb_gs; + int pcb_ss; + int pcb_cr0; + int pcb_cr2; + int pcb_cr3; + int pcb_cr4; int pcb_dr0; int pcb_dr1; int pcb_dr2; @@ -63,38 +69,35 @@ int pcb_dr6; int pcb_dr7; - union savefpu pcb_user_save; - uint16_t pcb_initial_npxcw; + struct region_descriptor pcb_gdt; + struct region_descriptor pcb_idt; + uint16_t pcb_ldt; + uint16_t pcb_tr; + u_int pcb_flags; -#define FP_SOFTFP 0x01 /* process using software fltng pnt emulator */ #define PCB_DBREGS 0x02 /* process using debug registers */ #define PCB_NPXINITDONE 0x08 /* fpu state is initialized */ #define PCB_VM86CALL 0x10 /* in vm86 call */ #define PCB_NPXUSERINITDONE 0x20 /* user fpu state is initialized */ #define PCB_KERNNPX 0x40 /* kernel uses npx */ + uint16_t pcb_initial_npxcw; + caddr_t pcb_onfault; /* copyin/out fault recovery */ - int pcb_ds; - int pcb_es; - int pcb_fs; - int pcb_gs; - int pcb_ss; - struct segment_descriptor pcb_fsd; - struct segment_descriptor pcb_gsd; struct pcb_ext *pcb_ext; /* optional pcb extension */ int pcb_psl; /* process status long */ u_long pcb_vm86[2]; /* vm86bios scratch space */ union savefpu *pcb_save; - struct region_descriptor pcb_gdt; - struct region_descriptor pcb_idt; - uint16_t pcb_ldt; - uint16_t pcb_tr; + uint32_t pcb_pad[10]; }; +/* Per-CPU state saved during suspend and resume. 
*/ struct susppcb { struct pcb sp_pcb; - union savefpu sp_fpususpend; + + /* fpu context for suspend/resume */ + void *sp_fpususpend; }; #ifdef _KERNEL Index: sys/i386/isa/npx.c =================================================================== --- sys/i386/isa/npx.c +++ sys/i386/isa/npx.c @@ -55,6 +55,7 @@ #include #endif #include +#include #include #include @@ -99,7 +100,40 @@ #ifdef CPU_ENABLE_SSE #define fxrstor(addr) __asm __volatile("fxrstor %0" : : "m" (*(addr))) #define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) +#define ldmxcsr(csr) __asm __volatile("ldmxcsr %0" : : "m" (csr)) #define stmxcsr(addr) __asm __volatile("stmxcsr %0" : : "m" (*(addr))) + +static __inline void +xrstor(char *addr, uint64_t mask) +{ + uint32_t low, hi; + + low = mask; + hi = mask >> 32; + __asm __volatile("xrstor %0" : : "m" (*addr), "a" (low), "d" (hi)); +} + +static __inline void +xsave(char *addr, uint64_t mask) +{ + uint32_t low, hi; + + low = mask; + hi = mask >> 32; + __asm __volatile("xsave %0" : "=m" (*addr) : "a" (low), "d" (hi) : + "memory"); +} + +static __inline void +xsaveopt(char *addr, uint64_t mask) +{ + uint32_t low, hi; + + low = mask; + hi = mask >> 32; + __asm __volatile("xsaveopt %0" : "=m" (*addr) : "a" (low), "d" (hi) : + "memory"); +} #endif #else /* !(__GNUCLIKE_ASM && !lint) */ @@ -114,7 +148,11 @@ #ifdef CPU_ENABLE_SSE void fxsave(caddr_t addr); void fxrstor(caddr_t addr); +void ldmxcsr(u_int csr); void stmxcsr(u_int *csr); +void xrstor(char *addr, uint64_t mask); +void xsave(char *addr, uint64_t mask); +void xsaveopt(char *addr, uint64_t mask); #endif #endif /* __GNUCLIKE_ASM && !lint */ @@ -151,25 +189,42 @@ (savefpu)->sv_87.sv_env.en_cw = (value) #endif /* CPU_ENABLE_SSE */ -typedef u_char bool_t; - #ifdef CPU_ENABLE_SSE +CTASSERT(sizeof(union savefpu) == 512); +CTASSERT(sizeof(struct xstate_hdr) == 64); +CTASSERT(sizeof(struct savefpu_ymm) == 832); + +/* + * This requirement is to make it easier for asm code to calculate + * offset 
of the fpu save area from the pcb address. FPU save area + * must be 64-byte aligned. + */ +CTASSERT(sizeof(struct pcb) % XSAVE_AREA_ALIGN == 0); + static void fpu_clean_state(void); #endif static void fpusave(union savefpu *); static void fpurstor(union savefpu *); -static int npx_attach(device_t dev); -static void npx_identify(driver_t *driver, device_t parent); -static int npx_probe(device_t dev); int hw_float; SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD, &hw_float, 0, "Floating point instructions executed in hardware"); +int use_xsave; +uint64_t xsave_mask; +static uma_zone_t fpu_save_area_zone; +static union savefpu *npx_initialstate; + +struct xsave_area_elm_descr { + u_int offset; + u_int size; +} *xsave_area_desc; + +static int use_xsaveopt; + static volatile u_int npx_traps_while_probing; -static union savefpu npx_initialstate; alias_for_inthand_t probetrap; __asm(" \n\ @@ -184,32 +239,14 @@ "); /* - * Identify routine. Create a connection point on our parent for probing. - */ -static void -npx_identify(driver, parent) - driver_t *driver; - device_t parent; -{ - device_t child; - - child = BUS_ADD_CHILD(parent, 0, "npx", 0); - if (child == NULL) - panic("npx_identify"); -} - -/* - * Probe routine. Set flags to tell npxattach() what to do. Set up an - * interrupt handler if npx needs to use interrupts. + * Determine if an FPU is present and how to use it. */ static int -npx_probe(device_t dev) +npx_probe(void) { struct gate_descriptor save_idt_npxtrap; u_short control, status; - device_set_desc(dev, "math processor"); - /* * Modern CPUs all have an FPU that uses the INT16 interface * and provide a simple way to verify that, so handle the @@ -217,8 +254,7 @@ */ if (cpu_feature & CPUID_FPU) { hw_float = 1; - device_quiet(dev); - return (0); + return (1); } save_idt_npxtrap = idt[IDT_MF]; @@ -272,7 +308,7 @@ * accelerator board. 
*/ hw_float = 1; - return (0); + return (1); #endif npx_traps_while_probing = 0; fp_divide_by_0(); @@ -283,7 +319,7 @@ hw_float = 1; goto cleanup; } - device_printf(dev, + printf( "FPU does not use exception 16 for error reporting\n"); goto cleanup; } @@ -293,80 +329,205 @@ * Probe failed. Floating point simply won't work. * Notify user and disable FPU/MMX/SSE instruction execution. */ - device_printf(dev, "WARNING: no FPU!\n"); + printf("WARNING: no FPU!\n"); __asm __volatile("smsw %%ax; orb %0,%%al; lmsw %%ax" : : "n" (CR0_EM | CR0_MP) : "ax"); cleanup: idt[IDT_MF] = save_idt_npxtrap; - return (hw_float ? 0 : ENXIO); + return (hw_float); } /* - * Attach routine - announce which it is, and wire into system + * Enable XSAVE if supported and allowed by user. + * Calculate the xsave_mask. */ -static int -npx_attach(device_t dev) +static void +npxinit_bsp1(void) { + u_int cp[4]; + uint64_t xsave_mask_user; - npxinit(); - critical_enter(); - stop_emulating(); - fpusave(&npx_initialstate); - start_emulating(); -#ifdef CPU_ENABLE_SSE - if (cpu_fxsr) { - if (npx_initialstate.sv_xmm.sv_env.en_mxcsr_mask) - cpu_mxcsr_mask = - npx_initialstate.sv_xmm.sv_env.en_mxcsr_mask; - else - cpu_mxcsr_mask = 0xFFBF; - bzero(npx_initialstate.sv_xmm.sv_fp, - sizeof(npx_initialstate.sv_xmm.sv_fp)); - bzero(npx_initialstate.sv_xmm.sv_xmm, - sizeof(npx_initialstate.sv_xmm.sv_xmm)); - /* XXX might need even more zeroing. 
*/ - } else -#endif - bzero(npx_initialstate.sv_87.sv_ac, - sizeof(npx_initialstate.sv_87.sv_ac)); - critical_exit(); + if (cpu_fxsr && (cpu_feature2 & CPUID2_XSAVE) != 0) { + use_xsave = 1; + TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave); + } + if (!use_xsave) + return; - return (0); + cpuid_count(0xd, 0x0, cp); + xsave_mask = XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE; + if ((cp[0] & xsave_mask) != xsave_mask) + panic("CPU0 does not support X87 or SSE: %x", cp[0]); + xsave_mask = ((uint64_t)cp[3] << 32) | cp[0]; + xsave_mask_user = xsave_mask; + TUNABLE_QUAD_FETCH("hw.xsave_mask", &xsave_mask_user); + xsave_mask_user |= XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE; + xsave_mask &= xsave_mask_user; + if ((xsave_mask & XFEATURE_AVX512) != XFEATURE_AVX512) + xsave_mask &= ~XFEATURE_AVX512; + if ((xsave_mask & XFEATURE_MPX) != XFEATURE_MPX) + xsave_mask &= ~XFEATURE_MPX; + + cpuid_count(0xd, 0x1, cp); + if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) + use_xsaveopt = 1; +} + +/* + * Calculate the fpu save area size. + */ +static void +npxinit_bsp2(void) +{ + u_int cp[4]; + + if (use_xsave) { + cpuid_count(0xd, 0x0, cp); + cpu_max_ext_state_size = cp[1]; + + /* + * Reload the cpu_feature2, since we enabled OSXSAVE. + */ + do_cpuid(1, cp); + cpu_feature2 = cp[2]; + } else + cpu_max_ext_state_size = sizeof(union savefpu); } /* * Initialize floating point unit. */ void -npxinit(void) +npxinit(bool bsp) { static union savefpu dummy; register_t saveintr; + u_int mxcsr; u_short control; - if (!hw_float) - return; + if (bsp) { + if (!npx_probe()) + return; + npxinit_bsp1(); + } + + if (use_xsave) { + load_cr4(rcr4() | CR4_XSAVE); + load_xcr(XCR0, xsave_mask); + } + + /* + * XCR0 shall be set up before CPU can report the save area size. + */ + if (bsp) + npxinit_bsp2(); + /* * fninit has the same h/w bugs as fnsave. Use the detoxified - * fnsave to throw away any junk in the fpu. npxsave() initializes - * the fpu and sets fpcurthread = NULL as important side effects. 
+ * fnsave to throw away any junk in the fpu. fpusave() initializes + * the fpu. * * It is too early for critical_enter() to work on AP. */ saveintr = intr_disable(); - npxsave(&dummy); stop_emulating(); #ifdef CPU_ENABLE_SSE - /* XXX npxsave() doesn't actually initialize the fpu in the SSE case. */ if (cpu_fxsr) fninit(); + else #endif + fnsave(&dummy); control = __INITIAL_NPXCW__; fldcw(control); +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) { + mxcsr = __INITIAL_MXCSR__; + ldmxcsr(mxcsr); + } +#endif + start_emulating(); + intr_restore(saveintr); +} + +/* + * On the boot CPU we generate a clean state that is used to + * initialize the floating point unit when it is first used by a + * process. + */ +static void +npxinitstate(void *arg __unused) +{ + register_t saveintr; + int cp[4], i, max_ext_n; + + if (!hw_float) + return; + + npx_initialstate = malloc(cpu_max_ext_state_size, M_DEVBUF, + M_WAITOK | M_ZERO); + saveintr = intr_disable(); + stop_emulating(); + + fpusave(npx_initialstate); +#ifdef CPU_ENABLE_SSE + if (cpu_fxsr) { + if (npx_initialstate->sv_xmm.sv_env.en_mxcsr_mask) + cpu_mxcsr_mask = + npx_initialstate->sv_xmm.sv_env.en_mxcsr_mask; + else + cpu_mxcsr_mask = 0xFFBF; + + /* + * The fninit instruction does not modify XMM + * registers. The fpusave call dumped the garbage + * contained in the registers after reset to the + * initial state saved. Clear XMM registers file + * image to make the startup program state and signal + * handler XMM register content predictable. + */ + bzero(npx_initialstate->sv_xmm.sv_fp, + sizeof(npx_initialstate->sv_xmm.sv_fp)); + bzero(npx_initialstate->sv_xmm.sv_xmm, + sizeof(npx_initialstate->sv_xmm.sv_xmm)); + } else +#endif + bzero(npx_initialstate->sv_87.sv_ac, + sizeof(npx_initialstate->sv_87.sv_ac)); + + /* + * Create a table describing the layout of the CPU Extended + * Save Area. 
+ */ + if (use_xsave) { + if (xsave_mask >> 32 != 0) + max_ext_n = fls(xsave_mask >> 32) + 32; + else + max_ext_n = fls(xsave_mask); + xsave_area_desc = malloc(max_ext_n * sizeof(struct + xsave_area_elm_descr), M_DEVBUF, M_WAITOK | M_ZERO); + /* x87 state */ + xsave_area_desc[0].offset = 0; + xsave_area_desc[0].size = 160; + /* XMM */ + xsave_area_desc[1].offset = 160; + xsave_area_desc[1].size = 288 - 160; + + for (i = 2; i < max_ext_n; i++) { + cpuid_count(0xd, i, cp); + xsave_area_desc[i].offset = cp[1]; + xsave_area_desc[i].size = cp[0]; + } + } + + fpu_save_area_zone = uma_zcreate("FPU_save_area", + cpu_max_ext_state_size, NULL, NULL, NULL, NULL, + XSAVE_AREA_ALIGN - 1, 0); + start_emulating(); intr_restore(saveintr); } +SYSINIT(npxinitstate, SI_SUB_DRIVERS, SI_ORDER_ANY, npxinitstate, NULL); /* * Free coprocessor (if we have it). @@ -377,8 +538,12 @@ { critical_enter(); - if (curthread == PCPU_GET(fpcurthread)) - npxsave(curpcb->pcb_save); + if (curthread == PCPU_GET(fpcurthread)) { + stop_emulating(); + fpusave(curpcb->pcb_save); + start_emulating(); + PCPU_SET(fpcurthread, NULL); + } critical_exit(); #ifdef NPX_DEBUG if (hw_float) { @@ -683,7 +848,7 @@ } stop_emulating(); /* - * Record new context early in case frstor causes an IRQ13. + * Record new context early in case frstor causes a trap. */ PCPU_SET(fpcurthread, curthread); @@ -697,28 +862,20 @@ * This is the first time this thread has used the FPU or * the PCB doesn't contain a clean FPU state. Explicitly * load an initial state. + * + * We prefer to restore the state from the actual save + * area in PCB instead of directly loading from + * npx_initialstate, to ignite the XSAVEOPT + * tracking engine. 
*/ - fpurstor(&npx_initialstate); + bcopy(npx_initialstate, curpcb->pcb_save, cpu_max_ext_state_size); + fpurstor(curpcb->pcb_save); if (curpcb->pcb_initial_npxcw != __INITIAL_NPXCW__) fldcw(curpcb->pcb_initial_npxcw); curpcb->pcb_flags |= PCB_NPXINITDONE; if (PCB_USER_FPU(curpcb)) curpcb->pcb_flags |= PCB_NPXUSERINITDONE; } else { - /* - * The following fpurstor() may cause an IRQ13 when the - * state being restored has a pending error. The error will - * appear to have been triggered by the current (npx) user - * instruction even when that instruction is a no-wait - * instruction that should not trigger an error (e.g., - * fnclex). On at least one 486 system all of the no-wait - * instructions are broken the same as frstor, so our - * treatment does not amplify the breakage. On at least - * one 386/Cyrix 387 system, fnclex works correctly while - * frstor and fnsave are broken, so our treatment breaks - * fnclex if it is the first FPU instruction after a context - * switch. - */ fpurstor(curpcb->pcb_save); } critical_exit(); @@ -727,27 +884,12 @@ } /* - * Wrapper for fnsave instruction, partly to handle hardware bugs. When npx - * exceptions are reported via IRQ13, spurious IRQ13's may be triggered by - * no-wait npx instructions. See the Intel application note AP-578 for - * details. This doesn't cause any additional complications here. IRQ13's - * are inherently asynchronous unless the CPU is frozen to deliver them -- - * one that started in userland may be delivered many instructions later, - * after the process has entered the kernel. It may even be delivered after - * the fnsave here completes. A spurious IRQ13 for the fnsave is handled in - * the same way as a very-late-arriving non-spurious IRQ13 from user mode: - * it is normally ignored at first because we set fpcurthread to NULL; it is - * normally retriggered in npxdna() after return to user mode. + * Wrapper for fpusave() called from context switch routines. 
* * npxsave() must be called with interrupts disabled, so that it clears * fpcurthread atomically with saving the state. We require callers to do the * disabling, since most callers need to disable interrupts anyway to call * npxsave() atomically with checking fpcurthread. - * - * A previous version of npxsave() went to great lengths to excecute fnsave - * with interrupts enabled in case executing it froze the CPU. This case - * can't happen, at least for Intel CPU/NPX's. Spurious IRQ13's don't imply - * spurious freezes. */ void npxsave(addr) @@ -755,8 +897,13 @@ { stop_emulating(); - fpusave(addr); - +#ifdef CPU_ENABLE_SSE + /* xsaveopt() is only provided when CPU_ENABLE_SSE is defined. */ + if (use_xsaveopt) + xsaveopt((char *)addr, xsave_mask); + else +#endif + fpusave(addr); start_emulating(); PCPU_SET(fpcurthread, NULL); } @@ -773,11 +917,11 @@ if (!hw_float) return; if (PCPU_GET(fpcurthread) == NULL) { - *addr = npx_initialstate; + bcopy(npx_initialstate, addr, cpu_max_ext_state_size); return; } cr0 = rcr0(); - clts(); + stop_emulating(); fpusave(addr); load_cr0(cr0); } @@ -791,8 +935,7 @@ return; cr0 = rcr0(); - clts(); - npxinit(); + npxinit(false); stop_emulating(); fpurstor(addr); load_cr0(cr0); @@ -829,21 +972,24 @@ npxgetregs(struct thread *td) { struct pcb *pcb; + uint64_t *xstate_bv, bit; + char *sa; + int max_ext_n, i, owned; if (!hw_float) return (_MC_FPOWNED_NONE); pcb = td->td_pcb; if ((pcb->pcb_flags & PCB_NPXINITDONE) == 0) { - bcopy(&npx_initialstate, &pcb->pcb_user_save, - sizeof(npx_initialstate)); - SET_FPU_CW(&pcb->pcb_user_save, pcb->pcb_initial_npxcw); + bcopy(npx_initialstate, get_pcb_user_save_pcb(pcb), + cpu_max_ext_state_size); + SET_FPU_CW(get_pcb_user_save_pcb(pcb), pcb->pcb_initial_npxcw); npxuserinited(td); return (_MC_FPOWNED_PCB); } critical_enter(); if (td == PCPU_GET(fpcurthread)) { - fpusave(&pcb->pcb_user_save); + fpusave(get_pcb_user_save_pcb(pcb)); #ifdef CPU_ENABLE_SSE if (!cpu_fxsr) #endif @@ -853,12 +999,34 @@ * starts with a clean state next time.
*/ npxdrop(); - critical_exit(); - return (_MC_FPOWNED_FPU); + owned = _MC_FPOWNED_FPU; } else { - critical_exit(); - return (_MC_FPOWNED_PCB); + owned = _MC_FPOWNED_PCB; } + critical_exit(); + if (use_xsave) { + /* + * Handle partially saved state. + */ + sa = (char *)get_pcb_user_save_pcb(pcb); + xstate_bv = (uint64_t *)(sa + sizeof(union savefpu) + + offsetof(struct xstate_hdr, xstate_bv)); + if (xsave_mask >> 32 != 0) + max_ext_n = fls(xsave_mask >> 32) + 32; + else + max_ext_n = fls(xsave_mask); + for (i = 0; i < max_ext_n; i++) { + bit = 1ULL << i; + if ((xsave_mask & bit) == 0 || (*xstate_bv & bit) != 0) + continue; + bcopy((char *)npx_initialstate + + xsave_area_desc[i].offset, + sa + xsave_area_desc[i].offset, + xsave_area_desc[i].size); + *xstate_bv |= bit; + } + } + return (owned); } void @@ -872,34 +1040,79 @@ pcb->pcb_flags |= PCB_NPXUSERINITDONE; } +int +npxsetxstate(struct thread *td, char *xfpustate, size_t xfpustate_size) +{ + struct xstate_hdr *hdr, *ehdr; + size_t len, max_len; + uint64_t bv; + + /* XXXKIB should we clear all extended state in xstate_bv instead ? */ + if (xfpustate == NULL) + return (0); + if (!use_xsave) + return (EOPNOTSUPP); -void -npxsetregs(struct thread *td, union savefpu *addr) + len = xfpustate_size; + if (len < sizeof(struct xstate_hdr)) + return (EINVAL); + max_len = cpu_max_ext_state_size - sizeof(union savefpu); + if (len > max_len) + return (EINVAL); + + ehdr = (struct xstate_hdr *)xfpustate; + bv = ehdr->xstate_bv; + + /* + * Avoid #gp. 
+ */ + if (bv & ~xsave_mask) + return (EINVAL); + + hdr = (struct xstate_hdr *)(get_pcb_user_save_td(td) + 1); + + hdr->xstate_bv = bv; + bcopy(xfpustate + sizeof(struct xstate_hdr), + (char *)(hdr + 1), len - sizeof(struct xstate_hdr)); + + return (0); +} + +int +npxsetregs(struct thread *td, union savefpu *addr, char *xfpustate, + size_t xfpustate_size) { struct pcb *pcb; + int error; if (!hw_float) - return; + return (ENXIO); pcb = td->td_pcb; critical_enter(); if (td == PCPU_GET(fpcurthread) && PCB_USER_FPU(pcb)) { + error = npxsetxstate(td, xfpustate, xfpustate_size); + if (error != 0) { + critical_exit(); + return (error); + } #ifdef CPU_ENABLE_SSE if (!cpu_fxsr) #endif fnclex(); /* As in npxdrop(). */ - if (((uintptr_t)addr & 0xf) != 0) { - bcopy(addr, &pcb->pcb_user_save, sizeof(*addr)); - fpurstor(&pcb->pcb_user_save); - } else - fpurstor(addr); + bcopy(addr, get_pcb_user_save_td(td), sizeof(*addr)); + fpurstor(get_pcb_user_save_td(td)); critical_exit(); pcb->pcb_flags |= PCB_NPXUSERINITDONE | PCB_NPXINITDONE; } else { critical_exit(); - bcopy(addr, &pcb->pcb_user_save, sizeof(*addr)); + error = npxsetxstate(td, xfpustate, xfpustate_size); + if (error != 0) + return (error); + bcopy(addr, get_pcb_user_save_td(td), sizeof(*addr)); npxuserinited(td); } + return (0); } static void @@ -908,7 +1121,9 @@ { #ifdef CPU_ENABLE_SSE - if (cpu_fxsr) + if (use_xsave) + xsave((char *)addr, xsave_mask); + else if (cpu_fxsr) fxsave(addr); else #endif @@ -954,40 +1169,15 @@ { #ifdef CPU_ENABLE_SSE - if (cpu_fxsr) + if (use_xsave) + xrstor((char *)addr, xsave_mask); + else if (cpu_fxsr) fxrstor(addr); else #endif frstor(addr); } -static device_method_t npx_methods[] = { - /* Device interface */ - DEVMETHOD(device_identify, npx_identify), - DEVMETHOD(device_probe, npx_probe), - DEVMETHOD(device_attach, npx_attach), - DEVMETHOD(device_detach, bus_generic_detach), - DEVMETHOD(device_shutdown, bus_generic_shutdown), - DEVMETHOD(device_suspend, bus_generic_suspend), - 
DEVMETHOD(device_resume, bus_generic_resume), - - { 0, 0 } -}; - -static driver_t npx_driver = { - "npx", - npx_methods, - 1, /* no softc */ -}; - -static devclass_t npx_devclass; - -/* - * We prefer to attach to the root nexus so that the usual case (exception 16) - * doesn't describe the processor as being `on isa'. - */ -DRIVER_MODULE(npx, nexus, npx_driver, npx_devclass, 0, 0); - #ifdef DEV_ISA /* * This sucks up the legacy ISA support assignments from PNPBIOS/ACPI. @@ -1042,8 +1232,6 @@ static MALLOC_DEFINE(M_FPUKERN_CTX, "fpukern_ctx", "Kernel contexts for FPU state"); -#define XSAVE_AREA_ALIGN 64 - #define FPU_KERN_CTX_NPXINITDONE 0x01 #define FPU_KERN_CTX_DUMMY 0x02 @@ -1060,7 +1248,7 @@ size_t sz; sz = sizeof(struct fpu_kern_ctx) + XSAVE_AREA_ALIGN + - sizeof(union savefpu); + cpu_max_ext_state_size; res = malloc(sz, M_FPUKERN_CTX, ((flags & FPU_KERN_NOWAIT) ? M_NOWAIT : M_WAITOK) | M_ZERO); return (res); @@ -1094,8 +1282,8 @@ return (0); } pcb = td->td_pcb; - KASSERT(!PCB_USER_FPU(pcb) || pcb->pcb_save == &pcb->pcb_user_save, - ("mangled pcb_save")); + KASSERT(!PCB_USER_FPU(pcb) || pcb->pcb_save == + get_pcb_user_save_pcb(pcb), ("mangled pcb_save")); ctx->flags = 0; if ((pcb->pcb_flags & PCB_NPXINITDONE) != 0) ctx->flags |= FPU_KERN_CTX_NPXINITDONE; @@ -1120,7 +1308,7 @@ npxdrop(); critical_exit(); pcb->pcb_save = ctx->prev; - if (pcb->pcb_save == &pcb->pcb_user_save) { + if (pcb->pcb_save == get_pcb_user_save_pcb(pcb)) { if ((pcb->pcb_flags & PCB_NPXUSERINITDONE) != 0) pcb->pcb_flags |= PCB_NPXINITDONE; else @@ -1144,7 +1332,7 @@ pcb = curpcb; KASSERT((curthread->td_pflags & TDP_KTHREAD) != 0, ("Only kthread may use fpu_kern_thread")); - KASSERT(curpcb->pcb_save == &curpcb->pcb_user_save, + KASSERT(curpcb->pcb_save == get_pcb_user_save_pcb(curpcb), ("mangled pcb_save")); KASSERT(PCB_USER_FPU(curpcb), ("recursive call")); @@ -1160,3 +1348,27 @@ return (0); return ((curpcb->pcb_flags & PCB_KERNNPX) != 0); } + +/* + * FPU save area alloc/free/init utility 
routines + */ +union savefpu * +fpu_save_area_alloc(void) +{ + + return (uma_zalloc(fpu_save_area_zone, 0)); +} + +void +fpu_save_area_free(union savefpu *fsa) +{ + + uma_zfree(fpu_save_area_zone, fsa); +} + +void +fpu_save_area_reset(union savefpu *fsa) +{ + + bcopy(npx_initialstate, fsa, cpu_max_ext_state_size); +} Index: sys/i386/linux/linux_ptrace.c =================================================================== --- sys/i386/linux/linux_ptrace.c +++ sys/i386/linux/linux_ptrace.c @@ -225,7 +225,7 @@ PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (cpu_fxsr == 0 || (td->td_proc->p_flag & P_INMEM) == 0) return (EIO); - bcopy(&td->td_pcb->pcb_user_save.sv_xmm, fpxregs, sizeof(*fpxregs)); + bcopy(&get_pcb_user_save_td(td)->sv_xmm, fpxregs, sizeof(*fpxregs)); return (0); } @@ -236,7 +236,7 @@ PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (cpu_fxsr == 0 || (td->td_proc->p_flag & P_INMEM) == 0) return (EIO); - bcopy(fpxregs, &td->td_pcb->pcb_user_save.sv_xmm, sizeof(*fpxregs)); + bcopy(fpxregs, &get_pcb_user_save_td(td)->sv_xmm, sizeof(*fpxregs)); return (0); } #endif Index: sys/i386/linux/linux_sysvec.c =================================================================== --- sys/i386/linux/linux_sysvec.c +++ sys/i386/linux/linux_sysvec.c @@ -400,7 +400,6 @@ -extern int _ucodesel, _udatasel; extern unsigned long linux_sznonrtsigcode; static void Index: sys/i386/svr4/svr4_machdep.c =================================================================== --- sys/i386/svr4/svr4_machdep.c +++ sys/i386/svr4/svr4_machdep.c @@ -65,7 +65,6 @@ extern int svr4_szsigcode; extern char svr4_sigcode[]; -extern int _udatasel, _ucodesel; static void svr4_getsiginfo(union svr4_siginfo *, int, u_long, caddr_t); Index: sys/x86/acpica/acpi_wakeup.c =================================================================== --- sys/x86/acpica/acpi_wakeup.c +++ sys/x86/acpica/acpi_wakeup.c @@ -211,7 +211,7 @@ #ifdef __amd64__ fpususpend(susppcbs[0]->sp_fpususpend); #elif defined(DEV_NPX) - 
npxsuspend(&susppcbs[0]->sp_fpususpend); + npxsuspend(susppcbs[0]->sp_fpususpend); #endif #ifdef SMP if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) { @@ -248,7 +248,7 @@ #ifdef __amd64__ fpuresume(susppcbs[0]->sp_fpususpend); #elif defined(DEV_NPX) - npxresume(&susppcbs[0]->sp_fpususpend); + npxresume(susppcbs[0]->sp_fpususpend); #endif } @@ -327,9 +327,7 @@ susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK); for (i = 0; i < mp_ncpus; i++) { susppcbs[i] = malloc(sizeof(**susppcbs), M_DEVBUF, M_WAITOK); -#ifdef __amd64__ susppcbs[i]->sp_fpususpend = alloc_fpusave(M_WAITOK); -#endif } return (wakeaddr);