Index: stable/10/sys/amd64/amd64/fpu.c =================================================================== --- stable/10/sys/amd64/amd64/fpu.c (revision 272539) +++ stable/10/sys/amd64/amd64/fpu.c (revision 272540) @@ -1,1044 +1,1048 @@ /*- * Copyright (c) 1990 William Jolitz. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)npx.c 7.2 (Berkeley) 5/12/91 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Floating point support. */ #if defined(__GNUCLIKE_ASM) && !defined(lint) #define fldcw(cw) __asm __volatile("fldcw %0" : : "m" (cw)) #define fnclex() __asm __volatile("fnclex") #define fninit() __asm __volatile("fninit") #define fnstcw(addr) __asm __volatile("fnstcw %0" : "=m" (*(addr))) #define fnstsw(addr) __asm __volatile("fnstsw %0" : "=am" (*(addr))) #define fxrstor(addr) __asm __volatile("fxrstor %0" : : "m" (*(addr))) #define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) #define ldmxcsr(csr) __asm __volatile("ldmxcsr %0" : : "m" (csr)) #define stmxcsr(addr) __asm __volatile("stmxcsr %0" : : "m" (*(addr))) static __inline void xrstor(char *addr, uint64_t mask) { uint32_t low, hi; low = mask; hi = mask >> 32; __asm __volatile("xrstor %0" : : "m" (*addr), "a" (low), "d" (hi)); } static __inline void xsave(char *addr, uint64_t mask) { uint32_t low, hi; low = mask; hi = mask >> 32; __asm __volatile("xsave %0" : "=m" (*addr) : "a" (low), "d" (hi) : "memory"); } #else /* !(__GNUCLIKE_ASM && !lint) */ void fldcw(u_short cw); void fnclex(void); void fninit(void); void fnstcw(caddr_t addr); void fnstsw(caddr_t addr); void fxsave(caddr_t addr); void fxrstor(caddr_t addr); void ldmxcsr(u_int csr); void stmxcsr(u_int *csr); void xrstor(char *addr, uint64_t mask); void xsave(char *addr, uint64_t mask); #endif /* __GNUCLIKE_ASM && !lint */ #define start_emulating() load_cr0(rcr0() | CR0_TS) #define stop_emulating() clts() CTASSERT(sizeof(struct savefpu) == 512); CTASSERT(sizeof(struct xstate_hdr) == 64); CTASSERT(sizeof(struct savefpu_ymm) == 832); /* * This requirement is to make it easier for asm code to calculate * offset of the fpu save area from the pcb address. FPU save area * must be 64-byte aligned. */ CTASSERT(sizeof(struct pcb) % XSAVE_AREA_ALIGN == 0); static void fpu_clean_state(void); SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD, NULL, 1, "Floating point instructions executed in hardware"); int use_xsave; /* non-static for cpu_switch.S */ uint64_t xsave_mask; /* the same */ static uma_zone_t fpu_save_area_zone; static struct savefpu *fpu_initialstate; struct xsave_area_elm_descr { u_int offset; u_int size; } *xsave_area_desc; void fpusave(void *addr) { if (use_xsave) xsave((char *)addr, xsave_mask); else fxsave((char *)addr); } void fpurestore(void *addr) { if (use_xsave) xrstor((char *)addr, xsave_mask); else fxrstor((char *)addr); } void fpususpend(void *addr) { u_long cr0; cr0 = rcr0(); stop_emulating(); fpusave(addr); load_cr0(cr0); } void fpuresume(void *addr) { u_long cr0; cr0 = rcr0(); stop_emulating(); fninit(); if (use_xsave) load_xcr(XCR0, xsave_mask); fpurestore(addr); load_cr0(cr0); } /* * Enable XSAVE if supported and allowed by user. * Calculate the xsave_mask. */ static void fpuinit_bsp1(void) { u_int cp[4]; uint64_t xsave_mask_user; if ((cpu_feature2 & CPUID2_XSAVE) != 0) { use_xsave = 1; TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave); } if (!use_xsave) return; cpuid_count(0xd, 0x0, cp); xsave_mask = XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE; if ((cp[0] & xsave_mask) != xsave_mask) panic("CPU0 does not support X87 or SSE: %x", cp[0]); xsave_mask = ((uint64_t)cp[3] << 32) | cp[0]; xsave_mask_user = xsave_mask; TUNABLE_ULONG_FETCH("hw.xsave_mask", &xsave_mask_user); xsave_mask_user |= XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE; xsave_mask &= xsave_mask_user; if ((xsave_mask & XFEATURE_AVX512) != XFEATURE_AVX512) xsave_mask &= ~XFEATURE_AVX512; if ((xsave_mask & XFEATURE_MPX) != XFEATURE_MPX) xsave_mask &= ~XFEATURE_MPX; cpuid_count(0xd, 0x1, cp); if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) { /* * Patch the XSAVE instruction in the cpu_switch code * to XSAVEOPT. We assume that XSAVE encoding used * REX byte, and set the bit 4 of the r/m byte. */ ctx_switch_xsave[3] |= 0x10; } } /* * Calculate the fpu save area size. */ static void fpuinit_bsp2(void) { u_int cp[4]; if (use_xsave) { cpuid_count(0xd, 0x0, cp); cpu_max_ext_state_size = cp[1]; /* * Reload the cpu_feature2, since we enabled OSXSAVE. */ do_cpuid(1, cp); cpu_feature2 = cp[2]; } else cpu_max_ext_state_size = sizeof(struct savefpu); } /* * Initialize the floating point unit. */ void fpuinit(void) { register_t saveintr; u_int mxcsr; u_short control; if (IS_BSP()) fpuinit_bsp1(); if (use_xsave) { load_cr4(rcr4() | CR4_XSAVE); load_xcr(XCR0, xsave_mask); } /* * XCR0 shall be set up before CPU can report the save area size. */ if (IS_BSP()) fpuinit_bsp2(); /* * It is too early for critical_enter() to work on AP. */ saveintr = intr_disable(); stop_emulating(); fninit(); control = __INITIAL_FPUCW__; fldcw(control); mxcsr = __INITIAL_MXCSR__; ldmxcsr(mxcsr); start_emulating(); intr_restore(saveintr); } /* * On the boot CPU we generate a clean state that is used to * initialize the floating point unit when it is first used by a * process. */ static void fpuinitstate(void *arg __unused) { register_t saveintr; int cp[4], i, max_ext_n; fpu_initialstate = malloc(cpu_max_ext_state_size, M_DEVBUF, M_WAITOK | M_ZERO); saveintr = intr_disable(); stop_emulating(); fpusave(fpu_initialstate); if (fpu_initialstate->sv_env.en_mxcsr_mask) cpu_mxcsr_mask = fpu_initialstate->sv_env.en_mxcsr_mask; else cpu_mxcsr_mask = 0xFFBF; /* * The fninit instruction does not modify XMM registers. The * fpusave call dumped the garbage contained in the registers * after reset to the initial state saved. Clear XMM * registers file image to make the startup program state and * signal handler XMM register content predictable. */ bzero(&fpu_initialstate->sv_xmm[0], sizeof(struct xmmacc)); /* * Create a table describing the layout of the CPU Extended * Save Area. */ if (use_xsave) { max_ext_n = flsl(xsave_mask); xsave_area_desc = malloc(max_ext_n * sizeof(struct xsave_area_elm_descr), M_DEVBUF, M_WAITOK | M_ZERO); /* x87 state */ xsave_area_desc[0].offset = 0; xsave_area_desc[0].size = 160; /* XMM */ xsave_area_desc[1].offset = 160; xsave_area_desc[1].size = 288 - 160; for (i = 2; i < max_ext_n; i++) { cpuid_count(0xd, i, cp); xsave_area_desc[i].offset = cp[1]; xsave_area_desc[i].size = cp[0]; } } fpu_save_area_zone = uma_zcreate("FPU_save_area", cpu_max_ext_state_size, NULL, NULL, NULL, NULL, XSAVE_AREA_ALIGN - 1, 0); start_emulating(); intr_restore(saveintr); } SYSINIT(fpuinitstate, SI_SUB_DRIVERS, SI_ORDER_ANY, fpuinitstate, NULL); /* * Free coprocessor (if we have it). */ void fpuexit(struct thread *td) { critical_enter(); if (curthread == PCPU_GET(fpcurthread)) { stop_emulating(); fpusave(curpcb->pcb_save); start_emulating(); - PCPU_SET(fpcurthread, 0); + PCPU_SET(fpcurthread, NULL); } critical_exit(); } int fpuformat() { return (_MC_FPFMT_XMM); } /* * The following mechanism is used to ensure that the FPE_... value * that is passed as a trapcode to the signal handler of the user * process does not have more than one bit set. * * Multiple bits may be set if the user process modifies the control * word while a status word bit is already set. While this is a sign * of bad coding, we have no choise than to narrow them down to one * bit, since we must not send a trapcode that is not exactly one of * the FPE_ macros. * * The mechanism has a static table with 127 entries. Each combination * of the 7 FPU status word exception bits directly translates to a * position in this table, where a single FPE_... value is stored. * This FPE_... value stored there is considered the "most important" * of the exception bits and will be sent as the signal code. The * precedence of the bits is based upon Intel Document "Numerical * Applications", Chapter "Special Computational Situations". * * The macro to choose one of these values does these steps: 1) Throw * away status word bits that cannot be masked. 2) Throw away the bits * currently masked in the control word, assuming the user isn't * interested in them anymore. 3) Reinsert status word bit 7 (stack * fault) if it is set, which cannot be masked but must be presered. * 4) Use the remaining bits to point into the trapcode table. * * The 6 maskable bits in order of their preference, as stated in the * above referenced Intel manual: * 1 Invalid operation (FP_X_INV) * 1a Stack underflow * 1b Stack overflow * 1c Operand of unsupported format * 1d SNaN operand. * 2 QNaN operand (not an exception, irrelavant here) * 3 Any other invalid-operation not mentioned above or zero divide * (FP_X_INV, FP_X_DZ) * 4 Denormal operand (FP_X_DNML) * 5 Numeric over/underflow (FP_X_OFL, FP_X_UFL) * 6 Inexact result (FP_X_IMP) */ static char fpetable[128] = { 0, FPE_FLTINV, /* 1 - INV */ FPE_FLTUND, /* 2 - DNML */ FPE_FLTINV, /* 3 - INV | DNML */ FPE_FLTDIV, /* 4 - DZ */ FPE_FLTINV, /* 5 - INV | DZ */ FPE_FLTDIV, /* 6 - DNML | DZ */ FPE_FLTINV, /* 7 - INV | DNML | DZ */ FPE_FLTOVF, /* 8 - OFL */ FPE_FLTINV, /* 9 - INV | OFL */ FPE_FLTUND, /* A - DNML | OFL */ FPE_FLTINV, /* B - INV | DNML | OFL */ FPE_FLTDIV, /* C - DZ | OFL */ FPE_FLTINV, /* D - INV | DZ | OFL */ FPE_FLTDIV, /* E - DNML | DZ | OFL */ FPE_FLTINV, /* F - INV | DNML | DZ | OFL */ FPE_FLTUND, /* 10 - UFL */ FPE_FLTINV, /* 11 - INV | UFL */ FPE_FLTUND, /* 12 - DNML | UFL */ FPE_FLTINV, /* 13 - INV | DNML | UFL */ FPE_FLTDIV, /* 14 - DZ | UFL */ FPE_FLTINV, /* 15 - INV | DZ | UFL */ FPE_FLTDIV, /* 16 - DNML | DZ | UFL */ FPE_FLTINV, /* 17 - INV | DNML | DZ | UFL */ FPE_FLTOVF, /* 18 - OFL | UFL */ FPE_FLTINV, /* 19 - INV | OFL | UFL */ FPE_FLTUND, /* 1A - DNML | OFL | UFL */ FPE_FLTINV, /* 1B - INV | DNML | OFL | UFL */ FPE_FLTDIV, /* 1C - DZ | OFL | UFL */ FPE_FLTINV, /* 1D - INV | DZ | OFL | UFL */ FPE_FLTDIV, /* 1E - DNML | DZ | OFL | UFL */ FPE_FLTINV, /* 1F - INV | DNML | DZ | OFL | UFL */ FPE_FLTRES, /* 20 - IMP */ FPE_FLTINV, /* 21 - INV | IMP */ FPE_FLTUND, /* 22 - DNML | IMP */ FPE_FLTINV, /* 23 - INV | DNML | IMP */ FPE_FLTDIV, /* 24 - DZ | IMP */ FPE_FLTINV, /* 25 - INV | DZ | IMP */ FPE_FLTDIV, /* 26 - DNML | DZ | IMP */ FPE_FLTINV, /* 27 - INV | DNML | DZ | IMP */ FPE_FLTOVF, /* 28 - OFL | IMP */ FPE_FLTINV, /* 29 - INV | OFL | IMP */ FPE_FLTUND, /* 2A - DNML | OFL | IMP */ FPE_FLTINV, /* 2B - INV | DNML | OFL | IMP */ FPE_FLTDIV, /* 2C - DZ | OFL | IMP */ FPE_FLTINV, /* 2D - INV | DZ | OFL | IMP */ FPE_FLTDIV, /* 2E - DNML | DZ | OFL | IMP */ FPE_FLTINV, /* 2F - INV | DNML | DZ | OFL | IMP */ FPE_FLTUND, /* 30 - UFL | IMP */ FPE_FLTINV, /* 31 - INV | UFL | IMP */ FPE_FLTUND, /* 32 - DNML | UFL | IMP */ FPE_FLTINV, /* 33 - INV | DNML | UFL | IMP */ FPE_FLTDIV, /* 34 - DZ | UFL | IMP */ FPE_FLTINV, /* 35 - INV | DZ | UFL | IMP */ FPE_FLTDIV, /* 36 - DNML | DZ | UFL | IMP */ FPE_FLTINV, /* 37 - INV | DNML | DZ | UFL | IMP */ FPE_FLTOVF, /* 38 - OFL | UFL | IMP */ FPE_FLTINV, /* 39 - INV | OFL | UFL | IMP */ FPE_FLTUND, /* 3A - DNML | OFL | UFL | IMP */ FPE_FLTINV, /* 3B - INV | DNML | OFL | UFL | IMP */ FPE_FLTDIV, /* 3C - DZ | OFL | UFL | IMP */ FPE_FLTINV, /* 3D - INV | DZ | OFL | UFL | IMP */ FPE_FLTDIV, /* 3E - DNML | DZ | OFL | UFL | IMP */ FPE_FLTINV, /* 3F - INV | DNML | DZ | OFL | UFL | IMP */ FPE_FLTSUB, /* 40 - STK */ FPE_FLTSUB, /* 41 - INV | STK */ FPE_FLTUND, /* 42 - DNML | STK */ FPE_FLTSUB, /* 43 - INV | DNML | STK */ FPE_FLTDIV, /* 44 - DZ | STK */ FPE_FLTSUB, /* 45 - INV | DZ | STK */ FPE_FLTDIV, /* 46 - DNML | DZ | STK */ FPE_FLTSUB, /* 47 - INV | DNML | DZ | STK */ FPE_FLTOVF, /* 48 - OFL | STK */ FPE_FLTSUB, /* 49 - INV | OFL | STK */ FPE_FLTUND, /* 4A - DNML | OFL | STK */ FPE_FLTSUB, /* 4B - INV | DNML | OFL | STK */ FPE_FLTDIV, /* 4C - DZ | OFL | STK */ FPE_FLTSUB, /* 4D - INV | DZ | OFL | STK */ FPE_FLTDIV, /* 4E - DNML | DZ | OFL | STK */ FPE_FLTSUB, /* 4F - INV | DNML | DZ | OFL | STK */ FPE_FLTUND, /* 50 - UFL | STK */ FPE_FLTSUB, /* 51 - INV | UFL | STK */ FPE_FLTUND, /* 52 - DNML | UFL | STK */ FPE_FLTSUB, /* 53 - INV | DNML | UFL | STK */ FPE_FLTDIV, /* 54 - DZ | UFL | STK */ FPE_FLTSUB, /* 55 - INV | DZ | UFL | STK */ FPE_FLTDIV, /* 56 - DNML | DZ | UFL | STK */ FPE_FLTSUB, /* 57 - INV | DNML | DZ | UFL | STK */ FPE_FLTOVF, /* 58 - OFL | UFL | STK */ FPE_FLTSUB, /* 59 - INV | OFL | UFL | STK */ FPE_FLTUND, /* 5A - DNML | OFL | UFL | STK */ FPE_FLTSUB, /* 5B - INV | DNML | OFL | UFL | STK */ FPE_FLTDIV, /* 5C - DZ | OFL | UFL | STK */ FPE_FLTSUB, /* 5D - INV | DZ | OFL | UFL | STK */ FPE_FLTDIV, /* 5E - DNML | DZ | OFL | UFL | STK */ FPE_FLTSUB, /* 5F - INV | DNML | DZ | OFL | UFL | STK */ FPE_FLTRES, /* 60 - IMP | STK */ FPE_FLTSUB, /* 61 - INV | IMP | STK */ FPE_FLTUND, /* 62 - DNML | IMP | STK */ FPE_FLTSUB, /* 63 - INV | DNML | IMP | STK */ FPE_FLTDIV, /* 64 - DZ | IMP | STK */ FPE_FLTSUB, /* 65 - INV | DZ | IMP | STK */ FPE_FLTDIV, /* 66 - DNML | DZ | IMP | STK */ FPE_FLTSUB, /* 67 - INV | DNML | DZ | IMP | STK */ FPE_FLTOVF, /* 68 - OFL | IMP | STK */ FPE_FLTSUB, /* 69 - INV | OFL | IMP | STK */ FPE_FLTUND, /* 6A - DNML | OFL | IMP | STK */ FPE_FLTSUB, /* 6B - INV | DNML | OFL | IMP | STK */ FPE_FLTDIV, /* 6C - DZ | OFL | IMP | STK */ FPE_FLTSUB, /* 6D - INV | DZ | OFL | IMP | STK */ FPE_FLTDIV, /* 6E - DNML | DZ | OFL | IMP | STK */ FPE_FLTSUB, /* 6F - INV | DNML | DZ | OFL | IMP | STK */ FPE_FLTUND, /* 70 - UFL | IMP | STK */ FPE_FLTSUB, /* 71 - INV | UFL | IMP | STK */ FPE_FLTUND, /* 72 - DNML | UFL | IMP | STK */ FPE_FLTSUB, /* 73 - INV | DNML | UFL | IMP | STK */ FPE_FLTDIV, /* 74 - DZ | UFL | IMP | STK */ FPE_FLTSUB, /* 75 - INV | DZ | UFL | IMP | STK */ FPE_FLTDIV, /* 76 - DNML | DZ | UFL | IMP | STK */ FPE_FLTSUB, /* 77 - INV | DNML | DZ | UFL | IMP | STK */ FPE_FLTOVF, /* 78 - OFL | UFL | IMP | STK */ FPE_FLTSUB, /* 79 - INV | OFL | UFL | IMP | STK */ FPE_FLTUND, /* 7A - DNML | OFL | UFL | IMP | STK */ FPE_FLTSUB, /* 7B - INV | DNML | OFL | UFL | IMP | STK */ FPE_FLTDIV, /* 7C - DZ | OFL | UFL | IMP | STK */ FPE_FLTSUB, /* 7D - INV | DZ | OFL | UFL | IMP | STK */ FPE_FLTDIV, /* 7E - DNML | DZ | OFL | UFL | IMP | STK */ FPE_FLTSUB, /* 7F - INV | DNML | DZ | OFL | UFL | IMP | STK */ }; /* * Read the FP status and control words, then generate si_code value * for SIGFPE. The error code chosen will be one of the * FPE_... macros. It will be sent as the second argument to old * BSD-style signal handlers and as "siginfo_t->si_code" (second * argument) to SA_SIGINFO signal handlers. * * Some time ago, we cleared the x87 exceptions with FNCLEX there. * Clearing exceptions was necessary mainly to avoid IRQ13 bugs. The * usermode code which understands the FPU hardware enough to enable * the exceptions, can also handle clearing the exception state in the * handler. The only consequence of not clearing the exception is the * rethrow of the SIGFPE on return from the signal handler and * reexecution of the corresponding instruction. * * For XMM traps, the exceptions were never cleared. */ int fputrap_x87(void) { struct savefpu *pcb_save; u_short control, status; critical_enter(); /* * Interrupt handling (for another interrupt) may have pushed the * state to memory. Fetch the relevant parts of the state from * wherever they are. */ if (PCPU_GET(fpcurthread) != curthread) { pcb_save = curpcb->pcb_save; control = pcb_save->sv_env.en_cw; status = pcb_save->sv_env.en_sw; } else { fnstcw(&control); fnstsw(&status); } critical_exit(); return (fpetable[status & ((~control & 0x3f) | 0x40)]); } int fputrap_sse(void) { u_int mxcsr; critical_enter(); if (PCPU_GET(fpcurthread) != curthread) mxcsr = curpcb->pcb_save->sv_env.en_mxcsr; else stmxcsr(&mxcsr); critical_exit(); return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]); } /* - * Implement device not available (DNA) exception + * Device Not Available (DNA, #NM) exception handler. * - * It would be better to switch FP context here (if curthread != fpcurthread) - * and not necessarily for every context switch, but it is too hard to - * access foreign pcb's. + * It would be better to switch FP context here (if curthread != + * fpcurthread) and not necessarily for every context switch, but it + * is too hard to access foreign pcb's. */ - -static int err_count = 0; - void fpudna(void) { + /* + * This handler is entered with interrupts enabled, so context + * switches may occur before critical_enter() is executed. If + * a context switch occurs, then when we regain control, our + * state will have been completely restored. The CPU may + * change underneath us, but the only part of our context that + * lives in the CPU is CR0.TS and that will be "restored" by + * setting it on the new CPU. + */ critical_enter(); + if (PCPU_GET(fpcurthread) == curthread) { - printf("fpudna: fpcurthread == curthread %d times\n", - ++err_count); + printf("fpudna: fpcurthread == curthread\n"); stop_emulating(); critical_exit(); return; } if (PCPU_GET(fpcurthread) != NULL) { - printf("fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n", - PCPU_GET(fpcurthread), - PCPU_GET(fpcurthread)->td_proc->p_pid, - curthread, curthread->td_proc->p_pid); - panic("fpudna"); + panic("fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n", + PCPU_GET(fpcurthread), PCPU_GET(fpcurthread)->td_tid, + curthread, curthread->td_tid); } stop_emulating(); /* * Record new context early in case frstor causes a trap. */ PCPU_SET(fpcurthread, curthread); fpu_clean_state(); if ((curpcb->pcb_flags & PCB_FPUINITDONE) == 0) { /* * This is the first time this thread has used the FPU or * the PCB doesn't contain a clean FPU state. Explicitly * load an initial state. * * We prefer to restore the state from the actual save * area in PCB instead of directly loading from * fpu_initialstate, to ignite the XSAVEOPT * tracking engine. */ bcopy(fpu_initialstate, curpcb->pcb_save, cpu_max_ext_state_size); fpurestore(curpcb->pcb_save); if (curpcb->pcb_initial_fpucw != __INITIAL_FPUCW__) fldcw(curpcb->pcb_initial_fpucw); if (PCB_USER_FPU(curpcb)) set_pcb_flags(curpcb, PCB_FPUINITDONE | PCB_USERFPUINITDONE); else set_pcb_flags(curpcb, PCB_FPUINITDONE); } else fpurestore(curpcb->pcb_save); critical_exit(); } void fpudrop() { struct thread *td; td = PCPU_GET(fpcurthread); KASSERT(td == curthread, ("fpudrop: fpcurthread != curthread")); CRITICAL_ASSERT(td); PCPU_SET(fpcurthread, NULL); clear_pcb_flags(td->td_pcb, PCB_FPUINITDONE); start_emulating(); } /* * Get the user state of the FPU into pcb->pcb_user_save without * dropping ownership (if possible). It returns the FPU ownership * status. */ int fpugetregs(struct thread *td) { struct pcb *pcb; uint64_t *xstate_bv, bit; char *sa; int max_ext_n, i, owned; pcb = td->td_pcb; if ((pcb->pcb_flags & PCB_USERFPUINITDONE) == 0) { bcopy(fpu_initialstate, get_pcb_user_save_pcb(pcb), cpu_max_ext_state_size); get_pcb_user_save_pcb(pcb)->sv_env.en_cw = pcb->pcb_initial_fpucw; fpuuserinited(td); return (_MC_FPOWNED_PCB); } critical_enter(); if (td == PCPU_GET(fpcurthread) && PCB_USER_FPU(pcb)) { fpusave(get_pcb_user_save_pcb(pcb)); owned = _MC_FPOWNED_FPU; } else { owned = _MC_FPOWNED_PCB; } critical_exit(); if (use_xsave) { /* * Handle partially saved state. */ sa = (char *)get_pcb_user_save_pcb(pcb); xstate_bv = (uint64_t *)(sa + sizeof(struct savefpu) + offsetof(struct xstate_hdr, xstate_bv)); max_ext_n = flsl(xsave_mask); for (i = 0; i < max_ext_n; i++) { bit = 1ULL << i; if ((xsave_mask & bit) == 0 || (*xstate_bv & bit) != 0) continue; bcopy((char *)fpu_initialstate + xsave_area_desc[i].offset, sa + xsave_area_desc[i].offset, xsave_area_desc[i].size); *xstate_bv |= bit; } } return (owned); } void fpuuserinited(struct thread *td) { struct pcb *pcb; pcb = td->td_pcb; if (PCB_USER_FPU(pcb)) set_pcb_flags(pcb, PCB_FPUINITDONE | PCB_USERFPUINITDONE); else set_pcb_flags(pcb, PCB_FPUINITDONE); } int fpusetxstate(struct thread *td, char *xfpustate, size_t xfpustate_size) { struct xstate_hdr *hdr, *ehdr; size_t len, max_len; uint64_t bv; /* XXXKIB should we clear all extended state in xstate_bv instead ? */ if (xfpustate == NULL) return (0); if (!use_xsave) return (EOPNOTSUPP); len = xfpustate_size; if (len < sizeof(struct xstate_hdr)) return (EINVAL); max_len = cpu_max_ext_state_size - sizeof(struct savefpu); if (len > max_len) return (EINVAL); ehdr = (struct xstate_hdr *)xfpustate; bv = ehdr->xstate_bv; /* * Avoid #gp. */ if (bv & ~xsave_mask) return (EINVAL); hdr = (struct xstate_hdr *)(get_pcb_user_save_td(td) + 1); hdr->xstate_bv = bv; bcopy(xfpustate + sizeof(struct xstate_hdr), (char *)(hdr + 1), len - sizeof(struct xstate_hdr)); return (0); } /* * Set the state of the FPU. */ int fpusetregs(struct thread *td, struct savefpu *addr, char *xfpustate, size_t xfpustate_size) { struct pcb *pcb; int error; pcb = td->td_pcb; critical_enter(); if (td == PCPU_GET(fpcurthread) && PCB_USER_FPU(pcb)) { error = fpusetxstate(td, xfpustate, xfpustate_size); if (error != 0) { critical_exit(); return (error); } bcopy(addr, get_pcb_user_save_td(td), sizeof(*addr)); fpurestore(get_pcb_user_save_td(td)); critical_exit(); set_pcb_flags(pcb, PCB_FPUINITDONE | PCB_USERFPUINITDONE); } else { critical_exit(); error = fpusetxstate(td, xfpustate, xfpustate_size); if (error != 0) return (error); bcopy(addr, get_pcb_user_save_td(td), sizeof(*addr)); fpuuserinited(td); } return (0); } /* * On AuthenticAMD processors, the fxrstor instruction does not restore * the x87's stored last instruction pointer, last data pointer, and last * opcode values, except in the rare case in which the exception summary * (ES) bit in the x87 status word is set to 1. * * In order to avoid leaking this information across processes, we clean * these values by performing a dummy load before executing fxrstor(). */ static void fpu_clean_state(void) { static float dummy_variable = 0.0; u_short status; /* * Clear the ES bit in the x87 status word if it is currently * set, in order to avoid causing a fault in the upcoming load. */ fnstsw(&status); if (status & 0x80) fnclex(); /* * Load the dummy variable into the x87 stack. This mangles * the x87 stack, but we don't care since we're about to call * fxrstor() anyway. */ __asm __volatile("ffree %%st(7); flds %0" : : "m" (dummy_variable)); } /* * This really sucks. We want the acpi version only, but it requires * the isa_if.h file in order to get the definitions. */ #include "opt_isa.h" #ifdef DEV_ISA #include /* * This sucks up the legacy ISA support assignments from PNPBIOS/ACPI. */ static struct isa_pnp_id fpupnp_ids[] = { { 0x040cd041, "Legacy ISA coprocessor support" }, /* PNP0C04 */ { 0 } }; static int fpupnp_probe(device_t dev) { int result; result = ISA_PNP_PROBE(device_get_parent(dev), dev, fpupnp_ids); if (result <= 0) device_quiet(dev); return (result); } static int fpupnp_attach(device_t dev) { return (0); } static device_method_t fpupnp_methods[] = { /* Device interface */ DEVMETHOD(device_probe, fpupnp_probe), DEVMETHOD(device_attach, fpupnp_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), { 0, 0 } }; static driver_t fpupnp_driver = { "fpupnp", fpupnp_methods, 1, /* no softc */ }; static devclass_t fpupnp_devclass; DRIVER_MODULE(fpupnp, acpi, fpupnp_driver, fpupnp_devclass, 0, 0); #endif /* DEV_ISA */ static MALLOC_DEFINE(M_FPUKERN_CTX, "fpukern_ctx", "Kernel contexts for FPU state"); #define FPU_KERN_CTX_FPUINITDONE 0x01 #define FPU_KERN_CTX_DUMMY 0x02 /* avoided save for the kern thread */ struct fpu_kern_ctx { struct savefpu *prev; uint32_t flags; char hwstate1[]; }; struct fpu_kern_ctx * fpu_kern_alloc_ctx(u_int flags) { struct fpu_kern_ctx *res; size_t sz; sz = sizeof(struct fpu_kern_ctx) + XSAVE_AREA_ALIGN + cpu_max_ext_state_size; res = malloc(sz, M_FPUKERN_CTX, ((flags & FPU_KERN_NOWAIT) ? M_NOWAIT : M_WAITOK) | M_ZERO); return (res); } void fpu_kern_free_ctx(struct fpu_kern_ctx *ctx) { /* XXXKIB clear the memory ? */ free(ctx, M_FPUKERN_CTX); } static struct savefpu * fpu_kern_ctx_savefpu(struct fpu_kern_ctx *ctx) { vm_offset_t p; p = (vm_offset_t)&ctx->hwstate1; p = roundup2(p, XSAVE_AREA_ALIGN); return ((struct savefpu *)p); } int fpu_kern_enter(struct thread *td, struct fpu_kern_ctx *ctx, u_int flags) { struct pcb *pcb; if ((flags & FPU_KERN_KTHR) != 0 && is_fpu_kern_thread(0)) { ctx->flags = FPU_KERN_CTX_DUMMY; return (0); } pcb = td->td_pcb; KASSERT(!PCB_USER_FPU(pcb) || pcb->pcb_save == get_pcb_user_save_pcb(pcb), ("mangled pcb_save")); ctx->flags = 0; if ((pcb->pcb_flags & PCB_FPUINITDONE) != 0) ctx->flags |= FPU_KERN_CTX_FPUINITDONE; fpuexit(td); ctx->prev = pcb->pcb_save; pcb->pcb_save = fpu_kern_ctx_savefpu(ctx); set_pcb_flags(pcb, PCB_KERNFPU); clear_pcb_flags(pcb, PCB_FPUINITDONE); return (0); } int fpu_kern_leave(struct thread *td, struct fpu_kern_ctx *ctx) { struct pcb *pcb; if (is_fpu_kern_thread(0) && (ctx->flags & FPU_KERN_CTX_DUMMY) != 0) return (0); KASSERT((ctx->flags & FPU_KERN_CTX_DUMMY) == 0, ("dummy ctx")); pcb = td->td_pcb; critical_enter(); if (curthread == PCPU_GET(fpcurthread)) fpudrop(); critical_exit(); pcb->pcb_save = ctx->prev; if (pcb->pcb_save == get_pcb_user_save_pcb(pcb)) { if ((pcb->pcb_flags & PCB_USERFPUINITDONE) != 0) { set_pcb_flags(pcb, PCB_FPUINITDONE); clear_pcb_flags(pcb, PCB_KERNFPU); } else clear_pcb_flags(pcb, PCB_FPUINITDONE | PCB_KERNFPU); } else { if ((ctx->flags & FPU_KERN_CTX_FPUINITDONE) != 0) set_pcb_flags(pcb, PCB_FPUINITDONE); else clear_pcb_flags(pcb, PCB_FPUINITDONE); KASSERT(!PCB_USER_FPU(pcb), ("unpaired fpu_kern_leave")); } return (0); } int fpu_kern_thread(u_int flags) { KASSERT((curthread->td_pflags & TDP_KTHREAD) != 0, ("Only kthread may use fpu_kern_thread")); KASSERT(curpcb->pcb_save == get_pcb_user_save_pcb(curpcb), ("mangled pcb_save")); KASSERT(PCB_USER_FPU(curpcb), ("recursive call")); set_pcb_flags(curpcb, PCB_KERNFPU); return (0); } int is_fpu_kern_thread(u_int flags) { if ((curthread->td_pflags & TDP_KTHREAD) == 0) return (0); return ((curpcb->pcb_flags & PCB_KERNFPU) != 0); } /* * FPU save area alloc/free/init utility routines */ struct savefpu * fpu_save_area_alloc(void) { return (uma_zalloc(fpu_save_area_zone, 0)); } void fpu_save_area_free(struct savefpu *fsa) { uma_zfree(fpu_save_area_zone, fsa); } void fpu_save_area_reset(struct savefpu *fsa) { bcopy(fpu_initialstate, fsa, cpu_max_ext_state_size); } Index: stable/10/sys/amd64/amd64/trap.c =================================================================== --- stable/10/sys/amd64/amd64/trap.c (revision 272539) +++ stable/10/sys/amd64/amd64/trap.c (revision 272540) @@ -1,999 +1,999 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * AMD64 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_kdtrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #ifdef KDTRACE_HOOKS #include #endif extern void trap(struct trapframe *frame); extern void syscall(struct trapframe *frame); void dblfault_handler(struct trapframe *frame); static int trap_pfault(struct trapframe *, int); static void trap_fatal(struct trapframe *, vm_offset_t); #define MAX_TRAP_MSG 32 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ "SIMD floating-point exception", /* 29 T_XMMFLT */ "reserved (unknown) fault", /* 30 T_RESERVED */ "", /* 31 unused (reserved) */ "DTrace pid return trap", /* 32 T_DTRACE_RET */ }; #ifdef KDB static int kdb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RW, &kdb_on_nmi, 0, "Go to KDB on NMI"); TUNABLE_INT("machdep.kdb_on_nmi", &kdb_on_nmi); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); TUNABLE_INT("machdep.panic_on_nmi", &panic_on_nmi); static int prot_fault_translation; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW, &prot_fault_translation, 0, "Select signal to deliver on protection fault"); static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { #ifdef KDTRACE_HOOKS struct reg regs; #endif struct thread *td = curthread; struct proc *p = td->td_proc; int i = 0, ucode = 0, code; u_int type; register_t addr = 0; ksiginfo_t ksi; PCPU_INC(cnt.v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } #endif /* SMP */ #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); goto out; } #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI. If the PMC module is * active, pass the 'rip' value to the PMC module's interrupt * handler. A return value of '1' from the handler means that * the NMI was handled by it and we can return immediately. */ if (type == T_NMI && pmc_intr && (*pmc_intr)(PCPU_GET(cpuid), frame)) goto out; #endif if (type == T_MCHK) { mca_intr(); goto out; } #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. */ if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type)) goto out; #endif if ((frame->tf_rflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (ISPL(frame->tf_cs) == SEL_UPL) uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We shouldn't enable interrupts while holding a * spin lock. */ if (td->td_md.md_spinlock_count == 0) enable_intr(); } } code = frame->tf_err; if (ISPL(frame->tf_cs) == SEL_UPL) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_rip; if (td->td_ucred != p->p_ucred) cred_update_thread(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ i = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ enable_intr(); #ifdef KDTRACE_HOOKS if (type == T_BPTFLT) { fill_frame_regs(frame, ®s); if (dtrace_pid_probe_ptr != NULL && dtrace_pid_probe_ptr(®s) == 0) goto out; } #endif frame->tf_rflags &= ~PSL_T; i = SIGTRAP; ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT); break; case T_ARITHTRAP: /* arithmetic trap */ ucode = fputrap_x87(); if (ucode == -1) goto userout; i = SIGFPE; break; case T_PROTFLT: /* general protection fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_STKFLT: /* stack fault */ case T_SEGNPFLT: /* segment not present fault */ i = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: i = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: i = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ addr = frame->tf_addr; i = trap_pfault(frame, TRUE); if (i == -1) goto userout; if (i == 0) goto user; if (i == SIGSEGV) ucode = SEGV_MAPERR; else { if (prot_fault_translation == 0) { /* * Autodetect. * This check also covers the images * without the ABI-tag ELF note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && p->p_osrel >= P_OSREL_SIGSEGV) { i = SIGSEGV; ucode = SEGV_ACCERR; } else { i = SIGBUS; ucode = BUS_PAGE_FAULT; } } else if (prot_fault_translation == 1) { /* * Always compat mode. */ i = SIGBUS; ucode = BUS_PAGE_FAULT; } else { /* * Always SIGSEGV mode. */ i = SIGSEGV; ucode = SEGV_ACCERR; } } break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef KDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (kdb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap(type, 0, frame); } #endif /* KDB */ goto userout; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: /* transparent fault (due to context switch "late") */ KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); fpudna(); goto userout; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = fputrap_sse(); if (ucode == -1) goto userout; i = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: enable_intr(); fill_frame_regs(frame, ®s); if (dtrace_return_probe_ptr != NULL && dtrace_return_probe_ptr(®s) == 0) goto out; break; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(frame, FALSE); goto out; case T_DNA: KASSERT(!PCB_USER_FPU(td->td_pcb), ("Unregistered use of FPU in kernel")); fpudna(); goto out; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* - * XXXKIB for now disable any FPU traps in kernel - * handler registration seems to be overkill + * For now, supporting kernel handler + * registration for FPU traps is overkill. */ trap_fatal(frame, 0); goto out; case T_STKFLT: /* stack fault */ break; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %rip's and %rsp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame->tf_rip == (long)doreti_iret) { frame->tf_rip = (long)doreti_iret_fault; goto out; } if (frame->tf_rip == (long)ld_ds) { frame->tf_rip = (long)ds_load_fault; goto out; } if (frame->tf_rip == (long)ld_es) { frame->tf_rip = (long)es_load_fault; goto out; } if (frame->tf_rip == (long)ld_fs) { frame->tf_rip = (long)fs_load_fault; goto out; } if (frame->tf_rip == (long)ld_gs) { frame->tf_rip = (long)gs_load_fault; goto out; } if (frame->tf_rip == (long)ld_gsbase) { frame->tf_rip = (long)gsbase_load_fault; goto out; } if (frame->tf_rip == (long)ld_fsbase) { frame->tf_rip = (long)fsbase_load_fault; goto out; } if (curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_rflags & PSL_NT) { frame->tf_rflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap()) { /* * Reset breakpoint bits because the * processor doesn't */ /* XXX check upper bits here */ load_dr6(rdr6() & 0xfffffff0); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB if (kdb_trap(type, 0, frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef KDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (kdb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap(type, 0, frame); } #endif /* KDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALLTHROUGH */ #endif /* DEV_ISA */ } trap_fatal(frame, 0); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = i; ksi.ksi_code = ucode; ksi.ksi_trapno = type; ksi.ksi_addr = (void *)addr; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %lx code %d type %d " "addr 0x%lx rsp 0x%lx rip 0x%lx " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr, frame->tf_rsp, frame->tf_rip, fubyte((void *)(frame->tf_rip + 0)), fubyte((void *)(frame->tf_rip + 1)), fubyte((void *)(frame->tf_rip + 2)), fubyte((void *)(frame->tf_rip + 3)), fubyte((void *)(frame->tf_rip + 4)), fubyte((void *)(frame->tf_rip + 5)), fubyte((void *)(frame->tf_rip + 6)), fubyte((void *)(frame->tf_rip + 7))); } KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); userout: out: return; } static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm; vm_map_t map; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; vm_offset_t eva = frame->tf_addr; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } va = trunc_page(eva); if (va >= VM_MIN_KERNEL_ADDRESS) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. If either * p or p->p_vmspace is NULL, then the fault is fatal. */ if (p == NULL || (vm = p->p_vmspace) == NULL) goto nogo; map = &vm->vm_map; /* * When accessing a usermode address, kernel must be * ready to accept the page fault, and provide a * handling routine. Since accessing the address * without the handler is a bug, do not try to handle * it normally, and panic immediately. */ if (!usermode && (td->td_intr_nesting_level != 0 || curpcb->pcb_onfault == NULL)) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; return (0); } if ((td->td_pflags & TDP_DEVMEMIO) != 0) { KASSERT(curpcb->pcb_onfault != NULL, ("/dev/mem without pcb_onfault")); frame->tf_rip = (long)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, ss; u_int type; long esp; struct soft_segment_descriptor softseg; char *msg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)], &softseg); if (type <= MAX_TRAP_MSG) msg = trap_msg[type]; else msg = "UNKNOWN"; printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s %s%s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_I ? "instruction" : "data", code & PGEX_RSV ? " rsv" : "", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%lx:0x%lx\n", frame->tf_cs & 0xffff, frame->tf_rip); if (ISPL(frame->tf_cs) == SEL_UPL) { ss = frame->tf_ss & 0xffff; esp = frame->tf_rsp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (long)&frame->tf_rsp; } printf("stack pointer = 0x%x:0x%lx\n", ss, esp); printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_rflags & PSL_T) printf("trace trap, "); if (frame->tf_rflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_rflags & PSL_NT) printf("nested task, "); if (frame->tf_rflags & PSL_RF) printf("resume, "); printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curthread->td_name ? curthread->td_name : ""); } else { printf("Idle\n"); } #ifdef KDB if (debugger_on_panic || kdb_active) if (kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). */ void dblfault_handler(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault\n"); printf("rip = 0x%lx\n", frame->tf_rip); printf("rsp = 0x%lx\n", frame->tf_rsp); printf("rbp = 0x%lx\n", frame->tf_rbp); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; register_t *argp; caddr_t params; int reg, regcnt, error; p = td->td_proc; frame = td->td_frame; reg = 0; regcnt = 6; params = (caddr_t)frame->tf_rsp + sizeof(register_t); sa->code = frame->tf_rax; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = frame->tf_rdi; reg++; regcnt--; } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]), ("Too many syscall arguments!")); error = 0; argp = &frame->tf_rdi; argp += reg; bcopy(argp, sa->args, sizeof(sa->args[0]) * regcnt); if (sa->narg > regcnt) { KASSERT(params != NULL, ("copyin args with no params!")); error = copyin(params, &sa->args[regcnt], (sa->narg - regcnt) * sizeof(sa->args[0])); } if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; } return (error); } #include "../../kern/subr_syscall.c" /* * System call handler for native binaries. The trap frame is already * set up by the assembler trampoline and a pointer to it is saved in * td_frame. */ void amd64_syscall(struct thread *td, int traced) { struct syscall_args sa; int error; ksiginfo_t ksi; #ifdef DIAGNOSTIC if (ISPL(td->td_frame->tf_cs) != SEL_UPL) { panic("syscall"); /* NOT REACHED */ } #endif error = syscallenter(td, &sa); /* * Traced syscall. */ if (__predict_false(traced)) { td->td_frame->tf_rflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)td->td_frame->tf_rip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returing with kernel FPU ctx leaked", syscallname(td->td_proc, sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, sa.code))); syscallret(td, error, &sa); /* * If the user-supplied value of %rip is not a canonical * address, then some CPUs will trigger a ring 0 #GP during * the sysret instruction. However, the fault handler would * execute in ring 0 with the user's %gs and %rsp which would * not be safe. Instead, use the full return path which * catches the problem safely. */ if (td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS) set_pcb_flags(td->td_pcb, PCB_FULL_IRET); } Index: stable/10 =================================================================== --- stable/10 (revision 272539) +++ stable/10 (revision 272540) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r271747,271924