Index: head/sys/amd64/amd64/db_trace.c =================================================================== --- head/sys/amd64/amd64/db_trace.c +++ head/sys/amd64/amd64/db_trace.c @@ -200,6 +200,7 @@ if (name != NULL) { if (strcmp(name, "calltrap") == 0 || strcmp(name, "fork_trampoline") == 0 || + strcmp(name, "mchk_calltrap") == 0 || strcmp(name, "nmi_calltrap") == 0 || strcmp(name, "Xdblfault") == 0) frame_type = TRAP; Index: head/sys/amd64/amd64/exception.S =================================================================== --- head/sys/amd64/amd64/exception.S +++ head/sys/amd64/amd64/exception.S @@ -141,7 +141,6 @@ TRAP ill, T_PRIVINFLT TRAP dna, T_DNA TRAP fpusegm, T_FPOPFLT - TRAP mchk, T_MCHK TRAP rsvd, T_RESERVED TRAP fpu, T_ARITHTRAP TRAP xmm, T_XMMFLT @@ -663,6 +662,103 @@ cli nocallchain: #endif + testl %ebx,%ebx /* %ebx == 0 => return to userland */ + jnz doreti_exit + /* + * Put back the preserved MSR_GSBASE value. + */ + movl $MSR_GSBASE,%ecx + movq %r12,%rdx + movl %edx,%eax + shrq $32,%rdx + wrmsr + movq %r13,%cr3 + RESTORE_REGS + addq $TF_RIP,%rsp + jmp doreti_iret + +/* + * MC# handling is similar to NMI. + * + * As with NMIs, machine check exceptions do not respect RFLAGS.IF and + * can occur at any time with a GS.base value that does not correspond + * to the privilege level in CS. + * + * Machine checks are not unblocked by iretq, but it is best to run + * the handler with interrupts disabled since the exception may have + * interrupted a critical section. + * + * The MC# handler runs on its own stack (tss_ist3). The canonical + * GS.base value for the processor is stored just above the bottom of + * its MC# stack. For exceptions taken from kernel mode, the current + * value in the processor's GS.base is saved at entry to C-preserved + * register %r12, the canonical value for GS.base is then loaded into + * the processor, and the saved value is restored at exit time. For + * exceptions taken from user mode, the cheaper 'SWAPGS' instructions + * are used for swapping GS.base. + */ + +IDTVEC(mchk) + subq $TF_RIP,%rsp + movl $(T_MCHK),TF_TRAPNO(%rsp) + movq $0,TF_ADDR(%rsp) + movq $0,TF_ERR(%rsp) + movq %rdi,TF_RDI(%rsp) + movq %rsi,TF_RSI(%rsp) + movq %rdx,TF_RDX(%rsp) + movq %rcx,TF_RCX(%rsp) + movq %r8,TF_R8(%rsp) + movq %r9,TF_R9(%rsp) + movq %rax,TF_RAX(%rsp) + movq %rbx,TF_RBX(%rsp) + movq %rbp,TF_RBP(%rsp) + movq %r10,TF_R10(%rsp) + movq %r11,TF_R11(%rsp) + movq %r12,TF_R12(%rsp) + movq %r13,TF_R13(%rsp) + movq %r14,TF_R14(%rsp) + movq %r15,TF_R15(%rsp) + SAVE_SEGS + movl $TF_HASSEGS,TF_FLAGS(%rsp) + cld + xorl %ebx,%ebx + testb $SEL_RPL_MASK,TF_CS(%rsp) + jnz mchk_fromuserspace + /* + * We've interrupted the kernel. Preserve GS.base in %r12 + * and %cr3 in %r13. + */ + movl $MSR_GSBASE,%ecx + rdmsr + movq %rax,%r12 + shlq $32,%rdx + orq %rdx,%r12 + /* Retrieve and load the canonical value for GS.base. */ + movq TF_SIZE(%rsp),%rdx + movl %edx,%eax + shrq $32,%rdx + wrmsr + movq %cr3,%r13 + movq PCPU(KCR3),%rax + cmpq $~0,%rax + je mchk_calltrap + movq %rax,%cr3 + jmp mchk_calltrap +mchk_fromuserspace: + incl %ebx + swapgs + movq %cr3,%r13 + movq PCPU(KCR3),%rax + cmpq $~0,%rax + je 1f + movq %rax,%cr3 +1: +/* Note: this label is also used by ddb and gdb: */ +mchk_calltrap: + FAKE_MCOUNT(TF_RIP(%rsp)) + movq %rsp,%rdi + call mca_intr + MEXITCOUNT testl %ebx,%ebx /* %ebx == 0 => return to userland */ jnz doreti_exit /* Index: head/sys/amd64/amd64/machdep.c =================================================================== --- head/sys/amd64/amd64/machdep.c +++ head/sys/amd64/amd64/machdep.c @@ -662,7 +662,7 @@ struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[PAGE_SIZE] __aligned(16); - +static char mce0_stack[PAGE_SIZE] __aligned(16); static char nmi0_stack[PAGE_SIZE] __aligned(16); CTASSERT(sizeof(struct nmi_pcpu) == 16); @@ -819,7 +819,7 @@ IDTVEC(div_pti), IDTVEC(dbg_pti), IDTVEC(bpt_pti), IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti), IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti), - IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(mchk_pti), + IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti), IDTVEC(xmm_pti), #ifdef KDTRACE_HOOKS @@ -1658,8 +1658,7 @@ SEL_KPL, 0); setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_MC, pti ? &IDTVEC(mchk_pti) : &IDTVEC(mchk), SDT_SYSIGT, - SEL_KPL, 0); + setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3); setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); #ifdef KDTRACE_HOOKS @@ -1704,6 +1703,14 @@ np->np_pcpu = (register_t) pc; common_tss[0].tss_ist2 = (long) np; + /* + * MC# stack, runs on ist3. The pcpu pointer is stored just + * above the start of the ist3 stack. + */ + np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1; + np->np_pcpu = (register_t) pc; + common_tss[0].tss_ist3 = (long) np; + /* Set the IO permission bitmap (empty due to tss seg limit) */ common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; Index: head/sys/amd64/amd64/mp_machdep.c =================================================================== --- head/sys/amd64/amd64/mp_machdep.c +++ head/sys/amd64/amd64/mp_machdep.c @@ -87,6 +87,7 @@ /* Temporary variables for init_secondary() */ char *doublefault_stack; +char *mce_stack; char *nmi_stack; /* @@ -212,6 +213,10 @@ np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1; common_tss[cpu].tss_ist2 = (long) np; + /* The MC# stack runs on IST3. */ + np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1; + common_tss[cpu].tss_ist3 = (long) np; + /* Prepare private GDT */ gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu]; for (x = 0; x < NGDT; x++) { @@ -250,8 +255,13 @@ PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : 0; /* Save the per-cpu pointer for use by the NMI handler. */ + np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1; np->np_pcpu = (register_t) pc; + /* Save the per-cpu pointer for use by the MC# handler. */ + np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1; + np->np_pcpu = (register_t) pc; + wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */ @@ -346,6 +356,8 @@ kstack_pages * PAGE_SIZE, M_WAITOK | M_ZERO); doublefault_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO); + mce_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE, + M_WAITOK | M_ZERO); nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO); dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE, Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c +++ head/sys/amd64/amd64/pmap.c @@ -7596,6 +7596,9 @@ /* NMI stack IST 2 */ va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); + /* MC# stack IST 3 */ + va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu); + pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); } pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, (vm_offset_t)etext, true); Index: head/sys/amd64/amd64/trap.c =================================================================== --- head/sys/amd64/amd64/trap.c +++ head/sys/amd64/amd64/trap.c @@ -220,11 +220,6 @@ #endif } - if (type == T_MCHK) { - mca_intr(); - return; - } - if ((frame->tf_rflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled Index: head/sys/amd64/include/intr_machdep.h =================================================================== --- head/sys/amd64/include/intr_machdep.h +++ head/sys/amd64/include/intr_machdep.h @@ -139,7 +139,7 @@ /* * The following data structure holds per-cpu data, and is placed just - * above the top of the space used for the NMI stack. + * above the top of the space used for the NMI and MC# stacks. */ struct nmi_pcpu { register_t np_pcpu;