diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 114d52e5a8f2..8d73717b03b3 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -1,307 +1,257 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * Copyright (c) 2014-2018 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: vector.s, 386BSD 0.1 unknown origin * $FreeBSD$ */ /* * Interrupt entry points for external interrupts triggered by I/O APICs * as well as IPI handlers. */ #include "opt_smp.h" #include "assym.inc" #include #include #include #ifdef SMP #define LK lock ; #else #define LK #endif .text SUPERALIGN_TEXT /* End Of Interrupt to APIC */ as_lapic_eoi: cmpl $0,x2apic_mode jne 1f movq lapic_map,%rax movl $0,LA_EOI(%rax) ret 1: movl $MSR_APIC_EOI,%ecx xorl %eax,%eax xorl %edx,%edx wrmsr ret /* * I/O Interrupt Entry Point. Rather than having one entry point for * each interrupt source, we use one entry point for each 32-bit word * in the ISR. The handler determines the highest bit set in the ISR, * translates that into a vector, and passes the vector to the * lapic_handle_intr() function. */ .macro ISR_VEC index, vec_name INTR_HANDLER \vec_name FAKE_MCOUNT(TF_RIP(%rsp)) cmpl $0,x2apic_mode je 1f movl $(MSR_APIC_ISR0 + \index),%ecx rdmsr jmp 2f 1: movq lapic_map, %rdx /* pointer to local APIC */ movl LA_ISR + 16 * (\index)(%rdx), %eax /* load ISR */ 2: bsrl %eax, %eax /* index of highest set bit in ISR */ jz 3f addl $(32 * \index),%eax movq %rsp, %rsi movl %eax, %edi /* pass the IRQ */ call lapic_handle_intr 3: MEXITCOUNT jmp doreti .endm /* * Handle "spurious INTerrupts". * Notes: * This is different than the "spurious INTerrupt" generated by an * 8259 PIC for missing INTs. See the APIC documentation for details. * This routine should NOT do an 'EOI' cycle. */ .text SUPERALIGN_TEXT IDTVEC(spuriousint) /* No EOI cycle used here */ jmp doreti_iret ISR_VEC 1, apic_isr1 ISR_VEC 2, apic_isr2 ISR_VEC 3, apic_isr3 ISR_VEC 4, apic_isr4 ISR_VEC 5, apic_isr5 ISR_VEC 6, apic_isr6 ISR_VEC 7, apic_isr7 /* * Local APIC periodic timer handler. */ INTR_HANDLER timerint FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call lapic_handle_timer MEXITCOUNT jmp doreti /* * Local APIC CMCI handler. */ INTR_HANDLER cmcint FAKE_MCOUNT(TF_RIP(%rsp)) call lapic_handle_cmc MEXITCOUNT jmp doreti /* * Local APIC error interrupt handler. */ INTR_HANDLER errorint FAKE_MCOUNT(TF_RIP(%rsp)) call lapic_handle_error MEXITCOUNT jmp doreti #ifdef XENHVM /* * Xen event channel upcall interrupt handler. * Only used when the hypervisor supports direct vector callbacks. */ INTR_HANDLER xen_intr_upcall FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call xen_intr_handle_upcall MEXITCOUNT jmp doreti #endif #ifdef SMP /* * Global address space TLB shootdown. */ .text SUPERALIGN_TEXT -invltlb_ret: - call as_lapic_eoi - jmp ld_regs - - SUPERALIGN_TEXT - INTR_HANDLER invltlb - call invltlb_handler - jmp invltlb_ret - - INTR_HANDLER invltlb_pcid - call invltlb_pcid_handler - jmp invltlb_ret - - INTR_HANDLER invltlb_invpcid_nopti - call invltlb_invpcid_handler - jmp invltlb_ret - - INTR_HANDLER invltlb_invpcid_pti - call invltlb_invpcid_pti_handler - jmp invltlb_ret - -/* - * Single page TLB shootdown - */ - INTR_HANDLER invlpg - call invlpg_handler - jmp invltlb_ret - - INTR_HANDLER invlpg_invpcid - call invlpg_invpcid_handler - jmp invltlb_ret - - INTR_HANDLER invlpg_pcid - call invlpg_pcid_handler - jmp invltlb_ret - -/* - * Page range TLB shootdown. - */ - INTR_HANDLER invlrng - call invlrng_handler - jmp invltlb_ret - - INTR_HANDLER invlrng_invpcid - call invlrng_invpcid_handler - jmp invltlb_ret - - INTR_HANDLER invlrng_pcid - call invlrng_pcid_handler - jmp invltlb_ret - /* - * Invalidate cache. + * IPI handler for cache and TLB shootdown */ - INTR_HANDLER invlcache - call invlcache_handler - jmp invltlb_ret + INTR_HANDLER invlop + call invlop_handler + call as_lapic_eoi + jmp ld_regs /* * Handler for IPIs sent via the per-cpu IPI bitmap. */ INTR_HANDLER ipi_intr_bitmap_handler call as_lapic_eoi FAKE_MCOUNT(TF_RIP(%rsp)) call ipi_bitmap_handler MEXITCOUNT jmp doreti /* * Executed by a CPU when it receives an IPI_STOP from another CPU. */ INTR_HANDLER cpustop call as_lapic_eoi call cpustop_handler jmp doreti /* * Executed by a CPU when it receives an IPI_SUSPEND from another CPU. */ INTR_HANDLER cpususpend call cpususpend_handler call as_lapic_eoi jmp doreti /* * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU. * * - Calls the generic rendezvous action function. */ INTR_HANDLER rendezvous #ifdef COUNT_IPIS movl PCPU(CPUID), %eax movq ipi_rendezvous_counts(,%rax,8), %rax incq (%rax) #endif call smp_rendezvous_action call as_lapic_eoi jmp doreti /* * IPI handler whose purpose is to interrupt the CPU with minimum overhead. * This is used by bhyve to force a host cpu executing in guest context to * trap into the hypervisor. * * This handler is different from other IPI handlers in the following aspects: * * 1. It doesn't push a trapframe on the stack. * * This implies that a DDB backtrace involving 'justreturn' will skip the * function that was interrupted by this handler. * * 2. It doesn't 'swapgs' when userspace is interrupted. * * The 'justreturn' handler does not access any pcpu data so it is not an * issue. Moreover the 'justreturn' handler can only be interrupted by an NMI * whose handler already doesn't trust GS.base when kernel code is interrupted. */ .text SUPERALIGN_TEXT IDTVEC(justreturn) pushq %rax pushq %rcx pushq %rdx call as_lapic_eoi popq %rdx popq %rcx popq %rax jmp doreti_iret INTR_HANDLER justreturn1 call as_lapic_eoi jmp doreti #endif /* SMP */ diff --git a/sys/amd64/amd64/db_interface.c b/sys/amd64/amd64/db_interface.c index 4645169af562..e35248b07663 100644 --- a/sys/amd64/amd64/db_interface.c +++ b/sys/amd64/amd64/db_interface.c @@ -1,111 +1,110 @@ /*- * Mach Operating System * Copyright (c) 1991,1990 Carnegie Mellon University * All Rights Reserved. * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); /* * Interface to new debugger. */ #include #include #include #include #include #include #include #include /* * Read bytes from kernel address space for debugger. */ int db_read_bytes(vm_offset_t addr, size_t size, char *data) { jmp_buf jb; void *prev_jb; char *src; int ret; prev_jb = kdb_jmpbuf(jb); ret = setjmp(jb); if (ret == 0) { src = (char *)addr; while (size-- > 0) *data++ = *src++; } (void)kdb_jmpbuf(prev_jb); return (ret); } /* * Write bytes to kernel address space for debugger. * We need to disable write protection temporarily so we can write * things (such as break points) that might be in write-protected * memory. */ int db_write_bytes(vm_offset_t addr, size_t size, char *data) { jmp_buf jb; void *prev_jb; char *dst; bool old_wp; int ret; old_wp = false; prev_jb = kdb_jmpbuf(jb); ret = setjmp(jb); if (ret == 0) { old_wp = disable_wp(); dst = (char *)addr; while (size-- > 0) *dst++ = *data++; } restore_wp(old_wp); (void)kdb_jmpbuf(prev_jb); return (ret); } void db_show_mdpcpu(struct pcpu *pc) { db_printf("self = %p\n", pc->pc_prvspace); db_printf("curpmap = %p\n", pc->pc_curpmap); db_printf("tssp = %p\n", pc->pc_tssp); db_printf("rsp0 = 0x%lx\n", pc->pc_rsp0); db_printf("kcr3 = 0x%lx\n", pc->pc_kcr3); db_printf("ucr3 = 0x%lx\n", pc->pc_ucr3); db_printf("scr3 = 0x%lx\n", pc->pc_saved_ucr3); db_printf("gs32p = %p\n", pc->pc_gs32p); db_printf("ldt = %p\n", pc->pc_ldt); db_printf("tss = %p\n", pc->pc_tss); - db_printf("tlb gen = %u\n", pc->pc_smp_tlb_done); } diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 4fb846d75948..1a07080c5daf 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1,2814 +1,2815 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_pci.h" #include "opt_platform.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_ATPIC #include #else #include #endif #include #include #include /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); /* * The PTI trampoline stack needs enough space for a hardware trapframe and a * couple of scratch registers, as well as the trapframe left behind after an * iret fault. */ CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) - offsetof(struct pti_frame, pti_rip)); extern u_int64_t hammer_time(u_int64_t, u_int64_t); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup(void *); static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len); static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Preload data parse function */ static caddr_t native_parse_preload_data(u_int64_t); /* Native function to fetch and parse the e820 map */ static void native_parse_memmap(caddr_t, vm_paddr_t *, int *); /* Default init_ops implementation. */ struct init_ops init_ops = { .parse_preload_data = native_parse_preload_data, .early_clock_source_init = i8254_init, .early_delay = i8254_delay, .parse_memmap = native_parse_memmap, #ifdef SMP .mp_bootaddress = mp_bootaddress, .start_all_aps = native_start_all_aps, #endif #ifdef DEV_PCI .msi_init = msi_init, #endif }; /* * Physical address of the EFI System Table. Stashed from the metadata hints * passed into the kernel and used by the EFI code to call runtime services. */ vm_paddr_t efi_systbl_phys; /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; int cold = 1; long Maxmem = 0; long realmem = 0; struct kva_md_info kmi; static struct trapframe proc0_tf; struct region_descriptor r_idt; struct pcpu *__pcpu; struct pcpu temp_bsp_pcpu; struct mtx icu_lock; struct mem_range_softc mem_range_softc; struct mtx dt_lock; /* lock for GDT and LDT */ void (*vmm_resume_p)(void); static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. */ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_free_count())) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_free_count()), ptoa((uintmax_t)vm_free_count()) / 1048576); #ifdef DEV_PCI if (bootverbose && intel_graphics_stolen_base != 0) printf("intel stolen mem: base %#jx size %ju MB\n", (uintmax_t)intel_graphics_stolen_base, (uintmax_t)intel_graphics_stolen_size / 1024 / 1024); #endif /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } static void late_ifunc_resolve(void *dummy __unused) { link_elf_late_ireloc(); } SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL); /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by call * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct pcb *pcb; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; char *xfpusave; size_t xfpusave_len; int sig; int oonstack; td = curthread; pcb = td->td_pcb; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu); xfpusave = __builtin_alloca(xfpusave_len); } else { xfpusave_len = 0; xfpusave = NULL; } /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); update_pcb_bases(pcb); sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase; sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase; bzero(sf.sf_uc.uc_mcontext.mc_spare, sizeof(sf.sf_uc.uc_mcontext.mc_spare)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_rsp - 128; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned long)sp & ~0x3Ful); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned long)sp & ~0xFul); /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */ bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ } else { /* Old FreeBSD-style arguments. */ regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) != 0)) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = p->p_sysent->sv_sigcode_base; regs->tf_rflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_ss = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct pcb *pcb; struct proc *p; struct trapframe *regs; ucontext_t *ucp; char *xfpustate; size_t xfpustate_len; long rflags; int cs, error, ret; ksiginfo_t ksi; pcb = td->td_pcb; p = td->td_proc; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) { uprintf("pid %d (%s): sigreturn copyin failed\n", p->p_pid, td->td_name); return (error); } ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; rflags = ucp->uc_mcontext.mc_rflags; /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(rflags, regs->tf_rflags)) { uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, td->td_name, rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) { uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", p->p_pid, td->td_name, xfpustate_len); return (EINVAL); } xfpustate = __builtin_alloca(xfpustate_len); error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, xfpustate, xfpustate_len); if (error != 0) { uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", p->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); if (ret != 0) { uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", p->p_pid, td->td_name, ret); return (ret); } bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); update_pcb_bases(pcb); pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sys_sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) { struct trapframe *regs; struct pcb *pcb; register_t saved_rflags; regs = td->td_frame; pcb = td->td_pcb; if (td->td_proc->p_md.md_ldt != NULL) user_ldt_free(td); update_pcb_bases(pcb); pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; clear_pcb_flags(pcb, PCB_32BIT); pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; saved_rflags = regs->tf_rflags & PSL_T; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = imgp->entry_addr; regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; regs->tf_rdi = stack; /* argv */ regs->tf_rflags = PSL_USER | saved_rflags; regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _ufssel; regs->tf_gs = _ugssel; regs->tf_flags = TF_HASSEGS; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } clear_pcb_flags(pcb, PCB_DBREGS); } /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } void cpu_setregs(void) { register_t cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the * BSP. See the comments there about why we set them. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); } /* * Initialize amd64 and configure to run kernel */ /* * Initialize segments & interrupt table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[PAGE_SIZE] __aligned(16); static char mce0_stack[PAGE_SIZE] __aligned(16); static char nmi0_stack[PAGE_SIZE] __aligned(16); static char dbg0_stack[PAGE_SIZE] __aligned(16); CTASSERT(sizeof(struct nmi_pcpu) == 16); /* * Software prototypes -- in more palatable form. * * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same * slots as corresponding segments for i386 kernel. */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GNULL2_SEL 1 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUFS32_SEL 2 32 bit %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS32_SEL 3 32 bit %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GUCODE32_SEL 6 32 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 32/64 bit Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 8 64 bit Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_long = 1, .ssd_def32 = 0, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1, .ssd_type = SDT_SYSTSS, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Actually, the TSS is a system descriptor which is double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 LDT Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 12 LDT Descriptor, double size */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_long = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT"); void setidt(int idx, inthand_t *func, int typ, int dpl, int ist) { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (uintptr_t)func; ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); ip->gd_ist = ist; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((uintptr_t)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault), IDTVEC(div_pti), IDTVEC(bpt_pti), IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti), IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti), IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti), IDTVEC(xmm_pti), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti), #endif #ifdef XENHVM IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti), #endif IDTVEC(fast_syscall), IDTVEC(fast_syscall32), IDTVEC(fast_syscall_pti); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. */ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } /* Show privileged registers. */ DB_SHOW_COMMAND(sysregs, db_show_sysregs) { struct { uint16_t limit; uint64_t base; } __packed idtr, gdtr; uint16_t ldt, tr; __asm __volatile("sidt %0" : "=m" (idtr)); db_printf("idtr\t0x%016lx/%04x\n", (u_long)idtr.base, (u_int)idtr.limit); __asm __volatile("sgdt %0" : "=m" (gdtr)); db_printf("gdtr\t0x%016lx/%04x\n", (u_long)gdtr.base, (u_int)gdtr.limit); __asm __volatile("sldt %0" : "=r" (ldt)); db_printf("ldtr\t0x%04x\n", ldt); __asm __volatile("str %0" : "=r" (tr)); db_printf("tr\t0x%04x\n", tr); db_printf("cr0\t0x%016lx\n", rcr0()); db_printf("cr2\t0x%016lx\n", rcr2()); db_printf("cr3\t0x%016lx\n", rcr3()); db_printf("cr4\t0x%016lx\n", rcr4()); if (rcr4() & CR4_XSAVE) db_printf("xcr0\t0x%016lx\n", rxcr(0)); db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER)); if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) db_printf("FEATURES_CTL\t%016lx\n", rdmsr(MSR_IA32_FEATURE_CONTROL)); db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT)); db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE)); } DB_SHOW_COMMAND(dbregs, db_show_dbregs) { db_printf("dr0\t0x%016lx\n", rdr0()); db_printf("dr1\t0x%016lx\n", rdr1()); db_printf("dr2\t0x%016lx\n", rdr2()); db_printf("dr3\t0x%016lx\n", rdr3()); db_printf("dr6\t0x%016lx\n", rdr6()); db_printf("dr7\t0x%016lx\n", rdr7()); } #endif void sdtossd(sd, ssd) struct user_segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_long = sd->sd_long; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void ssdtosd(ssd, sd) struct soft_segment_descriptor *ssd; struct user_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_long = ssd->ssd_long; sd->sd_def32 = ssd->ssd_def32; sd->sd_gran = ssd->ssd_gran; } void ssdtosyssd(ssd, sd) struct soft_segment_descriptor *ssd; struct system_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_gran = ssd->ssd_gran; } #if !defined(DEV_ATPIC) && defined(DEV_ISA) #include #include /* * Return a bitmap of the current interrupt requests. This is 8259-specific * and is only suitable for use at probe time. * This is only here to pacify sio. It is NOT FATAL if this doesn't work. * It shouldn't be here. There should probably be an APIC centric * implementation in the apic driver code, if at all. */ intrmask_t isa_irq_pending(void) { u_char irr1; u_char irr2; irr1 = inb(IO_ICU1); irr2 = inb(IO_ICU2); return ((irr2 << 8) | irr1); } #endif u_int basemem; static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) { int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. * * NB: physmap_idx points to the next free slot. */ insert_idx = physmap_idx; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYS_AVAIL_ENTRIES) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = (physmap_idx - 2); i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } void bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap, *smapend; smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016lx len=%016lx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) continue; if (!add_physmap_entry(smap->base, smap->length, physmap, physmap_idx)) break; } } static void add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap, int *physmap_idx) { struct efi_md *map, *p; const char *type; size_t efisz; int ndesc, i; static const char *types[] = { "Reserved", "LoaderCode", "LoaderData", "BootServicesCode", "BootServicesData", "RuntimeServicesCode", "RuntimeServicesData", "ConventionalMemory", "UnusableMemory", "ACPIReclaimMemory", "ACPIMemoryNVS", "MemoryMappedIO", "MemoryMappedIOPortSpace", "PalCode", "PersistentMemory" }; /* * Memory map data provided by UEFI via the GetMemoryMap * Boot Services API. */ efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; map = (struct efi_md *)((uint8_t *)efihdr + efisz); if (efihdr->descriptor_size == 0) return; ndesc = efihdr->memory_size / efihdr->descriptor_size; if (boothowto & RB_VERBOSE) printf("%23s %12s %12s %8s %4s\n", "Type", "Physical", "Virtual", "#Pages", "Attr"); for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p, efihdr->descriptor_size)) { if (boothowto & RB_VERBOSE) { if (p->md_type < nitems(types)) type = types[p->md_type]; else type = ""; printf("%23s %012lx %12p %08lx ", type, p->md_phys, p->md_virt, p->md_pages); if (p->md_attr & EFI_MD_ATTR_UC) printf("UC "); if (p->md_attr & EFI_MD_ATTR_WC) printf("WC "); if (p->md_attr & EFI_MD_ATTR_WT) printf("WT "); if (p->md_attr & EFI_MD_ATTR_WB) printf("WB "); if (p->md_attr & EFI_MD_ATTR_UCE) printf("UCE "); if (p->md_attr & EFI_MD_ATTR_WP) printf("WP "); if (p->md_attr & EFI_MD_ATTR_RP) printf("RP "); if (p->md_attr & EFI_MD_ATTR_XP) printf("XP "); if (p->md_attr & EFI_MD_ATTR_NV) printf("NV "); if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE) printf("MORE_RELIABLE "); if (p->md_attr & EFI_MD_ATTR_RO) printf("RO "); if (p->md_attr & EFI_MD_ATTR_RT) printf("RUNTIME"); printf("\n"); } switch (p->md_type) { case EFI_MD_TYPE_CODE: case EFI_MD_TYPE_DATA: case EFI_MD_TYPE_BS_CODE: case EFI_MD_TYPE_BS_DATA: case EFI_MD_TYPE_FREE: /* * We're allowed to use any entry with these types. */ break; default: continue; } if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE), physmap, physmap_idx)) break; } } static char bootmethod[16] = ""; SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, "System firmware boot method"); static void native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx) { struct bios_smap *smap; struct efi_map_header *efihdr; u_int32_t size; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes smap. */ efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); smap = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (efihdr == NULL && smap == NULL) panic("No BIOS smap or EFI map info from loader!"); if (efihdr != NULL) { add_efi_map_entries(efihdr, physmap, physmap_idx); strlcpy(bootmethod, "UEFI", sizeof(bootmethod)); } else { size = *((u_int32_t *)smap - 1); bios_add_smap_entries(smap, size, physmap, physmap_idx); strlcpy(bootmethod, "BIOS", sizeof(bootmethod)); } } #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(caddr_t kmdp, u_int64_t first) { int i, physmap_idx, pa_indx, da_indx; vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES]; u_long physmem_start, physmem_tunable, memtest; pt_entry_t *pte; quad_t dcons_addr, dcons_size; int page_counter; /* * Tell the physical memory allocator about pages used to store * the kernel and preloaded data. See kmem_bootstrap_free(). */ vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first)); bzero(physmap, sizeof(physmap)); physmap_idx = 0; init_ops.parse_memmap(kmdp, physmap, &physmap_idx); physmap_idx -= 2; /* * Find the 'base memory' segment for SMP */ basemem = 0; for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] <= 0xA0000) { basemem = physmap[i + 1] / 1024; break; } } if (basemem == 0 || basemem > 640) { if (bootverbose) printf( "Memory map doesn't contain a basemem segment, faking it"); basemem = 640; } /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * The boot memory test is disabled by default, as it takes a * significant amount of time on large-memory systems, and is * unfriendly to virtual machines as it unnecessarily touches all * pages. * * A general name is used as the code may be extended to support * additional tests beyond the current "page present" test. */ memtest = 0; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); /* * Don't allow MAXMEM or hw.physmem to extend the amount of memory * in the system. */ if (Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * Make hole for "AP -> long mode" bootstrap code. The * mp_bootaddress vector is only available when the kernel * is configured to support APs and APs for the system start * in real mode mode (e.g. SMP bare metal). */ if (init_ops.mp_bootaddress) init_ops.mp_bootaddress(physmap, &physmap_idx); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(&first); /* * Size up each available chunk of physical memory. * * XXX Some BIOSes corrupt low 64KB between suspend and resume. * By default, mask off the first 16 pages unless we appear to be * running in a VM. */ physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT; TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start); if (physmap[0] < physmem_start) { if (physmem_start < PAGE_SIZE) physmap[0] = PAGE_SIZE; else if (physmem_start >= physmap[1]) physmap[0] = round_page(physmap[1] - PAGE_SIZE); else physmap[0] = round_page(physmem_start); } pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ page_counter = 0; if (memtest != 0) printf("Testing system memory"); for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= (vm_paddr_t)kernphys && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* * Print a "." every GB to show we're making * progress. */ page_counter++; if ((page_counter % PAGES_PER_GB) == 0) printf("."); /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ENTRIES) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == PHYS_AVAIL_ENTRIES) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); if (memtest != 0) printf("\n"); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. */ msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]); } static caddr_t native_parse_preload_data(u_int64_t modulep) { caddr_t kmdp; char *envp; #ifdef DDB vm_offset_t ksym_start; vm_offset_t ksym_end; #endif preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE); preload_bootstrap_relocate(KERNBASE); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); if (envp != NULL) envp += KERNBASE; init_static_kenv(envp, 0); #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); db_fetch_ksymtab(ksym_start, ksym_end, 0); #endif efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t); return (kmdp); } static void amd64_kdb_init(void) { kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } /* Set up the fast syscall stuff */ void amd64_conf_fast_syscall(void) { uint64_t msr; msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) : (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC); } void amd64_bsp_pcpu_init1(struct pcpu *pc) { struct user_segment_descriptor *gdt; PCPU_SET(prvspace, pc); gdt = *PCPU_PTR(gdt); PCPU_SET(curthread, &thread0); PCPU_SET(tssp, PCPU_PTR(common_tss)); PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]); PCPU_SET(fs32p, &gdt[GUFS32_SEL]); PCPU_SET(gs32p, &gdt[GUGS32_SEL]); + PCPU_SET(smp_tlb_gen, 1); } void amd64_bsp_pcpu_init2(uint64_t rsp0) { PCPU_SET(rsp0, rsp0); PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful); PCPU_SET(curpcb, thread0.td_pcb); } void amd64_bsp_ist_init(struct pcpu *pc) { struct nmi_pcpu *np; struct amd64tss *tssp; tssp = &pc->pc_common_tss; /* doublefault stack space, runs on ist1 */ np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist1 = (long)np; /* * NMI stack, runs on ist2. The pcpu pointer is stored just * above the start of the ist2 stack. */ np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist2 = (long)np; /* * MC# stack, runs on ist3. The pcpu pointer is stored just * above the start of the ist3 stack. */ np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist3 = (long)np; /* * DB# stack, runs on ist4. */ np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1; np->np_pcpu = (register_t)pc; tssp->tss_ist4 = (long)np; } u_int64_t hammer_time(u_int64_t modulep, u_int64_t physfree) { caddr_t kmdp; int gsel_tss, x; struct pcpu *pc; struct xstate_hdr *xhdr; u_int64_t rsp0; char *env; struct user_segment_descriptor *gdt; struct region_descriptor r_gdt; size_t kstack0_sz; int late_console; TSRAW(&thread0, TS_ENTER, __func__, NULL); kmdp = init_ops.parse_preload_data(modulep); physfree += ucode_load_bsp(physfree + KERNBASE); physfree = roundup2(physfree, PAGE_SIZE); identify_cpu1(); identify_hypervisor(); identify_cpu_fixup_bsp(); identify_cpu2(); initializecpucache(); /* * Check for pti, pcid, and invpcid before ifuncs are * resolved, to correctly select the implementation for * pmap_activate_sw_mode(). */ pti = pti_get_default(); TUNABLE_INT_FETCH("vm.pmap.pti", &pti); TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) != 0; } else { pmap_pcid_enabled = 0; } link_elf_ireloc(kmdp); /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup0(&proc0, &thread0); /* Init basic tunables, hz etc */ init_param1(); thread0.td_kstack = physfree + KERNBASE; thread0.td_kstack_pages = kstack_pages; kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; bzero((void *)thread0.td_kstack, kstack0_sz); physfree += kstack0_sz; /* * Initialize enough of thread0 for delayed invalidation to * work very early. Rely on thread0.td_base_pri * zero-initialization, it is reset to PVM at proc0_init(). */ pmap_thread_init_invl_gen(&thread0); pc = &temp_bsp_pcpu; pcpu_init(pc, 0, sizeof(struct pcpu)); gdt = &temp_bsp_pcpu.pc_gdt[0]; /* * make gdt memory segments */ for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1) ssdtosd(&gdt_segs[x], &gdt[x]); } gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss; ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (long)gdt; lgdt(&r_gdt); wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ dpcpu_init((void *)(physfree + KERNBASE), 0); physfree += DPCPU_SIZE; amd64_bsp_pcpu_init1(pc); /* Non-late cninit() and printf() can be moved up to here. */ /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3); setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) : &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); #endif #ifdef XENHVM setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) : &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0); #endif r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; lidt(&r_idt); /* * Initialize the clock before the console so that console * initialization can use DELAY(). */ clock_init(); /* * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4) * transition). * Once bootblocks have updated, we can test directly for * efi_systbl != NULL here... */ if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP) != NULL) vty_set_preferred(VTY_VT); TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable); TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable); TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable); TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable); TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush", &syscall_ret_l1d_flush_mode); TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable); TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable); TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable); TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable", &x86_rngds_mitg_enable); finishidentcpu(); /* Final stage of CPU initialization */ initializecpu(); /* Initialize CPU registers */ amd64_bsp_ist_init(pc); /* Set the IO permission bitmap (empty due to tss seg limit) */ pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); amd64_conf_fast_syscall(); /* * We initialize the PCB pointer early so that exception * handlers will work. Also set up td_critnest to short-cut * the page fault handler. */ cpu_max_ext_state_size = sizeof(struct savefpu); set_top_of_stack_td(&thread0); thread0.td_pcb = get_pcb_td(&thread0); thread0.td_critnest = 1; /* * The console and kdb should be initialized even earlier than here, * but some console drivers don't work until after getmemsize(). * Default to late console initialization to support these drivers. * This loses mainly printf()s in getmemsize() and early debugging. */ late_console = 1; TUNABLE_INT_FETCH("debug.late_console", &late_console); if (!late_console) { cninit(); amd64_kdb_init(); } getmemsize(kmdp, physfree); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ #ifdef DEV_PCI /* This call might adjust phys_avail[]. */ pci_early_quirks(); #endif if (late_console) cninit(); #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); #endif #else #error "have you forgotten the isa device?" #endif if (late_console) amd64_kdb_init(); msgbufinit(msgbufp, msgbufsize); fpuinit(); /* * Reinitialize thread0's stack base now that the xsave area size is * known. Set up thread0's pcb save area after fpuinit calculated fpu * save area size. Zero out the extended state header in fpu save area. */ set_top_of_stack_td(&thread0); thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0); bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size); if (use_xsave) { xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 1); xhdr->xstate_bv = xsave_mask; } /* make an initial tss so cpu can get interrupt stack on syscall! */ rsp0 = thread0.td_md.md_stack_base; /* Ensure the stack is aligned to 16 bytes */ rsp0 &= ~0xFul; PCPU_PTR(common_tss)->tss_rsp0 = rsp0; amd64_bsp_pcpu_init2(rsp0); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); _ufssel = GSEL(GUFS32_SEL, SEL_UPL); _ugssel = GSEL(GUGS32_SEL, SEL_UPL); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; thread0.td_frame = &proc0_tf; env = kern_getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); cpu_probe_amdc1e(); kcsan_cpu_init(0); #ifdef FDT x86_init_fdt(); #endif thread0.td_critnest = 0; TSEXIT(); /* Location of kernel stack for locore */ return (thread0.td_md.md_stack_base); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) return (0); smapattr = (uint32_t *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP_XATTR); count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase); error = 0; for (i = 0; i < count; i++) { smap.base = smapbase[i].base; smap.length = smapbase[i].length; smap.type = smapbase[i].type; if (smapattr != NULL) smap.xattr = smapattr[i]; else smap.xattr = 0; error = SYSCTL_OUT(req, &smap, sizeof(smap)); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); static int efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct efi_map_header *efihdr; caddr_t kmdp; uint32_t efisize; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr == NULL) return (0); efisize = *((uint32_t *)efihdr - 1); return (SYSCTL_OUT(req, efihdr, efisize)); } SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map"); void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; critical_enter(); } else td->td_md.md_spinlock_count++; } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) { critical_exit(); intr_restore(flags); } } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_r12 = tf->tf_r12; pcb->pcb_r13 = tf->tf_r13; pcb->pcb_r14 = tf->tf_r14; pcb->pcb_r15 = tf->tf_r15; pcb->pcb_rbp = tf->tf_rbp; pcb->pcb_rbx = tf->tf_rbx; pcb->pcb_rip = tf->tf_rip; pcb->pcb_rsp = tf->tf_rsp; } int ptrace_set_pc(struct thread *td, unsigned long addr) { td->td_frame->tf_rip = addr; set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } int ptrace_single_step(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if ((td->td_frame->tf_rflags & PSL_T) == 0) { td->td_frame->tf_rflags |= PSL_T; td->td_dbgflags |= TDB_STEP; } return (0); } int ptrace_clear_single_step(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); td->td_frame->tf_rflags &= ~PSL_T; td->td_dbgflags &= ~TDB_STEP; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; tp = td->td_frame; return (fill_frame_regs(tp, regs)); } int fill_frame_regs(struct trapframe *tp, struct reg *regs) { regs->r_r15 = tp->tf_r15; regs->r_r14 = tp->tf_r14; regs->r_r13 = tp->tf_r13; regs->r_r12 = tp->tf_r12; regs->r_r11 = tp->tf_r11; regs->r_r10 = tp->tf_r10; regs->r_r9 = tp->tf_r9; regs->r_r8 = tp->tf_r8; regs->r_rdi = tp->tf_rdi; regs->r_rsi = tp->tf_rsi; regs->r_rbp = tp->tf_rbp; regs->r_rbx = tp->tf_rbx; regs->r_rdx = tp->tf_rdx; regs->r_rcx = tp->tf_rcx; regs->r_rax = tp->tf_rax; regs->r_rip = tp->tf_rip; regs->r_cs = tp->tf_cs; regs->r_rflags = tp->tf_rflags; regs->r_rsp = tp->tf_rsp; regs->r_ss = tp->tf_ss; if (tp->tf_flags & TF_HASSEGS) { regs->r_ds = tp->tf_ds; regs->r_es = tp->tf_es; regs->r_fs = tp->tf_fs; regs->r_gs = tp->tf_gs; } else { regs->r_ds = 0; regs->r_es = 0; regs->r_fs = 0; regs->r_gs = 0; } regs->r_err = 0; regs->r_trapno = 0; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; register_t rflags; tp = td->td_frame; rflags = regs->r_rflags & 0xffffffff; if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_r15 = regs->r_r15; tp->tf_r14 = regs->r_r14; tp->tf_r13 = regs->r_r13; tp->tf_r12 = regs->r_r12; tp->tf_r11 = regs->r_r11; tp->tf_r10 = regs->r_r10; tp->tf_r9 = regs->r_r9; tp->tf_r8 = regs->r_r8; tp->tf_rdi = regs->r_rdi; tp->tf_rsi = regs->r_rsi; tp->tf_rbp = regs->r_rbp; tp->tf_rbx = regs->r_rbx; tp->tf_rdx = regs->r_rdx; tp->tf_rcx = regs->r_rcx; tp->tf_rax = regs->r_rax; tp->tf_rip = regs->r_rip; tp->tf_cs = regs->r_cs; tp->tf_rflags = rflags; tp->tf_rsp = regs->r_rsp; tp->tf_ss = regs->r_ss; if (0) { /* XXXKIB */ tp->tf_ds = regs->r_ds; tp->tf_es = regs->r_es; tp->tf_fs = regs->r_fs; tp->tf_gs = regs->r_gs; tp->tf_flags = TF_HASSEGS; } set_pcb_flags(td->td_pcb, PCB_FULL_IRET); return (0); } /* XXX check all this stuff! */ /* externalize from sv_xmm */ static void fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) { struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* pcb -> fpregs */ bzero(fpregs, sizeof(*fpregs)); /* FPU control/status */ penv_fpreg->en_cw = penv_xmm->en_cw; penv_fpreg->en_sw = penv_xmm->en_sw; penv_fpreg->en_tw = penv_xmm->en_tw; penv_fpreg->en_opcode = penv_xmm->en_opcode; penv_fpreg->en_rip = penv_xmm->en_rip; penv_fpreg->en_rdp = penv_xmm->en_rdp; penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); } /* internalize from fpregs into sv_xmm */ static void set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) { struct envxmm *penv_xmm = &sv_xmm->sv_env; struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; int i; /* fpregs -> pcb */ /* FPU control/status */ penv_xmm->en_cw = penv_fpreg->en_cw; penv_xmm->en_sw = penv_fpreg->en_sw; penv_xmm->en_tw = penv_fpreg->en_tw; penv_xmm->en_opcode = penv_fpreg->en_opcode; penv_xmm->en_rip = penv_fpreg->en_rip; penv_xmm->en_rdp = penv_fpreg->en_rdp; penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); } /* externalize from td->pcb */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { KASSERT(td == curthread || TD_IS_SUSPENDED(td) || P_SHOULDSTOP(td->td_proc), ("not suspended thread %p", td)); fpugetregs(td); fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs); return (0); } /* internalize to td->pcb */ int set_fpregs(struct thread *td, struct fpreg *fpregs) { critical_enter(); set_fpregs_xmm(fpregs, get_pcb_user_save_td(td)); fpuuserinited(td); critical_exit(); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct pcb *pcb; struct trapframe *tp; pcb = td->td_pcb; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_rsp); PROC_UNLOCK(curthread->td_proc); mcp->mc_r15 = tp->tf_r15; mcp->mc_r14 = tp->tf_r14; mcp->mc_r13 = tp->tf_r13; mcp->mc_r12 = tp->tf_r12; mcp->mc_r11 = tp->tf_r11; mcp->mc_r10 = tp->tf_r10; mcp->mc_r9 = tp->tf_r9; mcp->mc_r8 = tp->tf_r8; mcp->mc_rdi = tp->tf_rdi; mcp->mc_rsi = tp->tf_rsi; mcp->mc_rbp = tp->tf_rbp; mcp->mc_rbx = tp->tf_rbx; mcp->mc_rcx = tp->tf_rcx; mcp->mc_rflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_rax = 0; mcp->mc_rdx = 0; mcp->mc_rflags &= ~PSL_C; } else { mcp->mc_rax = tp->tf_rax; mcp->mc_rdx = tp->tf_rdx; } mcp->mc_rip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; mcp->mc_rsp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_ds = tp->tf_ds; mcp->mc_es = tp->tf_es; mcp->mc_fs = tp->tf_fs; mcp->mc_gs = tp->tf_gs; mcp->mc_flags = tp->tf_flags; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp, NULL, 0); update_pcb_bases(pcb); mcp->mc_fsbase = pcb->pcb_fsbase; mcp->mc_gsbase = pcb->pcb_gsbase; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, mcontext_t *mcp) { struct pcb *pcb; struct trapframe *tp; char *xfpustate; long rflags; int ret; pcb = td->td_pcb; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp) || (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) return (EINVAL); rflags = (mcp->mc_rflags & PSL_USERCHANGE) | (tp->tf_rflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(struct savefpu)) return (EINVAL); xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); ret = copyin((void *)mcp->mc_xfpustate, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); } else xfpustate = NULL; ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); tp->tf_r15 = mcp->mc_r15; tp->tf_r14 = mcp->mc_r14; tp->tf_r13 = mcp->mc_r13; tp->tf_r12 = mcp->mc_r12; tp->tf_r11 = mcp->mc_r11; tp->tf_r10 = mcp->mc_r10; tp->tf_r9 = mcp->mc_r9; tp->tf_r8 = mcp->mc_r8; tp->tf_rdi = mcp->mc_rdi; tp->tf_rsi = mcp->mc_rsi; tp->tf_rbp = mcp->mc_rbp; tp->tf_rbx = mcp->mc_rbx; tp->tf_rdx = mcp->mc_rdx; tp->tf_rcx = mcp->mc_rcx; tp->tf_rax = mcp->mc_rax; tp->tf_rip = mcp->mc_rip; tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_rsp; tp->tf_ss = mcp->mc_ss; tp->tf_flags = mcp->mc_flags; if (tp->tf_flags & TF_HASSEGS) { tp->tf_ds = mcp->mc_ds; tp->tf_es = mcp->mc_es; tp->tf_fs = mcp->mc_fs; tp->tf_gs = mcp->mc_gs; } set_pcb_flags(pcb, PCB_FULL_IRET); if (mcp->mc_flags & _MC_HASBASES) { pcb->pcb_fsbase = mcp->mc_fsbase; pcb->pcb_gsbase = mcp->mc_gsbase; } return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len) { size_t max_len, len; mcp->mc_ownedfp = fpugetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = fpuformat(); if (!use_xsave || xfpusave_len == 0) return; max_len = cpu_max_ext_state_size - sizeof(struct savefpu); len = xfpusave_len; if (len > max_len) { len = max_len; bzero(xfpusave + max_len, len - max_len); } mcp->mc_flags |= _MC_HASFPXSTATE; mcp->mc_xfpustate_len = len; bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); } static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len) { int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, xfpustate, xfpustate_len); } else return (EINVAL); return (error); } void fpstate_drop(struct thread *td) { KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); critical_enter(); if (PCPU_GET(fpcurthread) == td) fpudrop(); /* * XXX force a full drop of the fpu. The above only drops it if we * owned it. * * XXX I don't much like fpugetuserregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of fpugetuserregs()... perhaps we just * have too many layers. */ clear_pcb_flags(curthread->td_pcb, PCB_FPUINITDONE | PCB_USERFPUINITDONE); critical_exit(); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[8] = 0; dbregs->dr[9] = 0; dbregs->dr[10] = 0; dbregs->dr[11] = 0; dbregs->dr[12] = 0; dbregs->dr[13] = 0; dbregs->dr[14] = 0; dbregs->dr[15] = 0; return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP or a general protection fault right here. * Upper bits of dr6 and dr7 must not be set */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (td->td_frame->tf_cs == _ucode32sel && DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) return (EINVAL); } if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || (dbregs->dr[7] & 0xffffffff00000000ul) != 0) return (EINVAL); pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; set_pcb_flags(pcb, PCB_DBREGS); } return (0); } void reset_dbregs(void) { load_dr7(0); /* Turn off the control bits first */ load_dr0(0); load_dr1(0); load_dr2(0); load_dr3(0); load_dr6(0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(register_t dr6) { u_int64_t dr7; u_int64_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; bp = dr6 & DBREG_DR6_BMASK; if (bp == 0) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } /* * The pcb_flags is only modified by current thread, or by other threads * when current thread is stopped. However, current thread may change it * from the interrupt context in cpu_switch(), or in the trap handler. * When we read-modify-write pcb_flags from C sources, compiler may generate * code that is not atomic regarding the interrupt handler. If a trap or * interrupt happens and any flag is modified from the handler, it can be * clobbered with the cached value later. Therefore, we implement setting * and clearing flags with single-instruction functions, which do not race * with possible modification of the flags from the trap or interrupt context, * because traps and interrupts are executed only on instruction boundary. */ void set_pcb_flags_raw(struct pcb *pcb, const u_int flags) { __asm __volatile("orl %1,%0" : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) : "cc", "memory"); } /* * The support for RDFSBASE, WRFSBASE and similar instructions for %gs * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into * pcb if user space modified the bases. We must save on the context * switch or if the return to usermode happens through the doreti. * * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, * which have a consequence that the base MSRs must be saved each time * the PCB_FULL_IRET flag is set. We disable interrupts to sync with * context switches. */ static void set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags) { register_t r; if (curpcb == pcb && (flags & PCB_FULL_IRET) != 0 && (pcb->pcb_flags & PCB_FULL_IRET) == 0) { r = intr_disable(); if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { if (rfs() == _ufssel) pcb->pcb_fsbase = rdfsbase(); if (rgs() == _ugssel) pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); } set_pcb_flags_raw(pcb, flags); intr_restore(r); } else { set_pcb_flags_raw(pcb, flags); } } DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int)) { return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ? set_pcb_flags_fsgsbase : set_pcb_flags_raw); } void clear_pcb_flags(struct pcb *pcb, const u_int flags) { __asm __volatile("andl %1,%0" : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) : "cc", "memory"); } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ #undef memset #undef memmove #undef memcpy void *memset_std(void *buf, int c, size_t len); void *memset_erms(void *buf, int c, size_t len); void *memmove_std(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src, size_t len); void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src, size_t len); #ifdef KCSAN /* * These fail to build as ifuncs when used with KCSAN. */ void * memset(void *buf, int c, size_t len) { return (memset_std(buf, c, len)); } void * memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len) { return (memmove_std(dst, src, len)); } void * memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len) { return (memcpy_std(dst, src, len)); } #else DEFINE_IFUNC(, void *, memset, (void *, int, size_t)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memset_erms : memset_std); } DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull, size_t)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memmove_erms : memmove_std); } DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? memcpy_erms : memcpy_std); } #endif void pagezero_std(void *addr); void pagezero_erms(void *addr); DEFINE_IFUNC(, void , pagezero, (void *)) { return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? pagezero_erms : pagezero_std); } diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 7085a9b1c540..f5de90484384 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -1,753 +1,1115 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1996, by Steve Passe * Copyright (c) 2003, by Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_sched.h" #include "opt_smp.h" #include #include #include #include #include #ifdef GPROF #include #endif +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_ACPI #include #include #endif #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) #define GiB(v) (v ## ULL << 30) #define AP_BOOTPT_SZ (PAGE_SIZE * 3) /* Temporary variables for init_secondary() */ char *doublefault_stack; char *mce_stack; char *nmi_stack; char *dbg_stack; /* * Local data and functions. */ static int start_ap(int apic_id); static bool is_kernel_paddr(vm_paddr_t pa) { return (pa >= trunc_2mpage(btext - KERNBASE) && pa < round_page(_end - KERNBASE)); } static bool is_mpboot_good(vm_paddr_t start, vm_paddr_t end) { return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem); } /* * Calculate usable address in base memory for AP trampoline code. */ void mp_bootaddress(vm_paddr_t *physmap, unsigned int *physmap_idx) { vm_paddr_t start, end; unsigned int i; bool allocated; alloc_ap_trampoline(physmap, physmap_idx); /* * Find a memory region big enough below the 4GB boundary to * store the initial page tables. Region must be mapped by * the direct map. * * Note that it needs to be aligned to a page boundary. */ allocated = false; for (i = *physmap_idx; i <= *physmap_idx; i -= 2) { /* * First, try to chomp at the start of the physmap region. * Kernel binary might claim it already. */ start = round_page(physmap[i]); end = start + AP_BOOTPT_SZ; if (start < end && end <= physmap[i + 1] && is_mpboot_good(start, end) && !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) { allocated = true; physmap[i] = end; break; } /* * Second, try to chomp at the end. Again, check * against kernel. */ end = trunc_page(physmap[i + 1]); start = end - AP_BOOTPT_SZ; if (start < end && start >= physmap[i] && is_mpboot_good(start, end) && !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) { allocated = true; physmap[i + 1] = start; break; } } if (allocated) { mptramp_pagetables = start; if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) { memmove(&physmap[i], &physmap[i + 2], sizeof(*physmap) * (*physmap_idx - i + 2)); *physmap_idx -= 2; } } else { mptramp_pagetables = trunc_page(boot_address) - AP_BOOTPT_SZ; if (bootverbose) printf( "Cannot find enough space for the initial AP page tables, placing them at %#x", mptramp_pagetables); } } /* * Initialize the IPI handlers and start up the AP's. */ void cpu_mp_start(void) { int i; /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) { cpu_apic_ids[i] = -1; } - /* Install an inter-CPU IPI for TLB invalidation */ - if (pmap_pcid_enabled) { - if (invpcid_works) { - setidt(IPI_INVLTLB, pti ? - IDTVEC(invltlb_invpcid_pti_pti) : - IDTVEC(invltlb_invpcid_nopti), SDT_SYSIGT, - SEL_KPL, 0); - setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_invpcid_pti) : - IDTVEC(invlpg_invpcid), SDT_SYSIGT, SEL_KPL, 0); - setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_invpcid_pti) : - IDTVEC(invlrng_invpcid), SDT_SYSIGT, SEL_KPL, 0); - } else { - setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pcid_pti) : - IDTVEC(invltlb_pcid), SDT_SYSIGT, SEL_KPL, 0); - setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pcid_pti) : - IDTVEC(invlpg_pcid), SDT_SYSIGT, SEL_KPL, 0); - setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pcid_pti) : - IDTVEC(invlrng_pcid), SDT_SYSIGT, SEL_KPL, 0); - } - } else { - setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pti) : IDTVEC(invltlb), - SDT_SYSIGT, SEL_KPL, 0); - setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pti) : IDTVEC(invlpg), - SDT_SYSIGT, SEL_KPL, 0); - setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pti) : IDTVEC(invlrng), - SDT_SYSIGT, SEL_KPL, 0); - } - - /* Install an inter-CPU IPI for cache invalidation. */ - setidt(IPI_INVLCACHE, pti ? IDTVEC(invlcache_pti) : IDTVEC(invlcache), + /* Install an inter-CPU IPI for for cache and TLB invalidations. */ + setidt(IPI_INVLOP, pti ? IDTVEC(invlop_pti) : IDTVEC(invlop), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for all-CPU rendezvous */ setidt(IPI_RENDEZVOUS, pti ? IDTVEC(rendezvous_pti) : IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0); /* Install generic inter-CPU IPI handler */ setidt(IPI_BITMAP_VECTOR, pti ? IDTVEC(ipi_intr_bitmap_handler_pti) : IDTVEC(ipi_intr_bitmap_handler), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU stop/restart */ setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU suspend/resume */ setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0); /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { boot_cpu_id = PCPU_GET(apic_id); cpu_info[boot_cpu_id].cpu_bsp = 1; } else KASSERT(boot_cpu_id == PCPU_GET(apic_id), ("BSP's APIC ID doesn't match boot_cpu_id")); /* Probe logical/physical core configuration. */ topo_probe(); assign_cpu_ids(); /* Start each Application Processor */ init_ops.start_all_aps(); set_interrupt_apic_ids(); #if defined(DEV_ACPI) && MAXMEMDOM > 1 acpi_pxm_set_cpu_locality(); #endif } /* * AP CPU's call this to initialize themselves. */ void init_secondary(void) { struct pcpu *pc; struct nmi_pcpu *np; struct user_segment_descriptor *gdt; struct region_descriptor ap_gdt; u_int64_t cr0; int cpu, gsel_tss, x; /* Set by the startup code for us to use */ cpu = bootAP; /* Update microcode before doing anything else. */ ucode_load_ap(cpu); /* Get per-cpu data and save */ pc = &__pcpu[cpu]; /* prime data page for it to use */ pcpu_init(pc, cpu, sizeof(struct pcpu)); dpcpu_init(dpcpu, cpu); pc->pc_apic_id = cpu_apic_ids[cpu]; pc->pc_prvspace = pc; pc->pc_curthread = 0; pc->pc_tssp = &pc->pc_common_tss; pc->pc_rsp0 = 0; pc->pc_pti_rsp0 = (((vm_offset_t)&pc->pc_pti_stack + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful); gdt = pc->pc_gdt; pc->pc_tss = (struct system_segment_descriptor *)&gdt[GPROC0_SEL]; pc->pc_fs32p = &gdt[GUFS32_SEL]; pc->pc_gs32p = &gdt[GUGS32_SEL]; pc->pc_ldt = (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]; /* See comment in pmap_bootstrap(). */ pc->pc_pcid_next = PMAP_PCID_KERN + 2; pc->pc_pcid_gen = 1; + pc->pc_smp_tlb_gen = 1; + /* Init tss */ pc->pc_common_tss = __pcpu[0].pc_common_tss; pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; pc->pc_common_tss.tss_rsp0 = 0; /* The doublefault stack runs on IST1. */ np = ((struct nmi_pcpu *)&doublefault_stack[PAGE_SIZE]) - 1; np->np_pcpu = (register_t)pc; pc->pc_common_tss.tss_ist1 = (long)np; /* The NMI stack runs on IST2. */ np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1; np->np_pcpu = (register_t)pc; pc->pc_common_tss.tss_ist2 = (long)np; /* The MC# stack runs on IST3. */ np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1; np->np_pcpu = (register_t)pc; pc->pc_common_tss.tss_ist3 = (long)np; /* The DB# stack runs on IST4. */ np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1; np->np_pcpu = (register_t)pc; pc->pc_common_tss.tss_ist4 = (long)np; /* Prepare private GDT */ gdt_segs[GPROC0_SEL].ssd_base = (long)&pc->pc_common_tss; for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != GPROC0_SEL + 1 && x != GUSERLDT_SEL && x != GUSERLDT_SEL + 1) ssdtosd(&gdt_segs[x], &gdt[x]); } ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; ap_gdt.rd_base = (u_long)gdt; lgdt(&ap_gdt); /* does magic intra-segment return */ wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */ fix_cpuid(); lidt(&r_idt); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); /* * Set to a known state: * Set by mpboot.s: CR0_PG, CR0_PE * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM */ cr0 = rcr0(); cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); load_cr0(cr0); amd64_conf_fast_syscall(); /* signal our startup to the BSP. */ mp_naps++; /* Spin until the BSP releases the AP's. */ while (atomic_load_acq_int(&aps_ready) == 0) ia32_pause(); init_secondary_tail(); } /******************************************************************* * local functions and data */ #ifdef NUMA static void mp_realloc_pcpu(int cpuid, int domain) { vm_page_t m; vm_offset_t oa, na; oa = (vm_offset_t)&__pcpu[cpuid]; if (_vm_phys_domain(pmap_kextract(oa)) == domain) return; m = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); if (m == NULL) return; na = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagecopy((void *)oa, (void *)na); pmap_qenter((vm_offset_t)&__pcpu[cpuid], &m, 1); /* XXX old pcpu page leaked. */ } #endif /* * start each AP in our list */ int native_start_all_aps(void) { u_int64_t *pt4, *pt3, *pt2; u_int32_t mpbioswarmvec; int apic_id, cpu, domain, i; u_char mpbiosreason; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); /* copy the AP 1st level boot code */ bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size); /* Locate the page tables, they'll be below the trampoline */ pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables); pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); /* Create the initial 1GB replicated page tables */ for (i = 0; i < 512; i++) { /* Each slot of the level 4 pages points to the same level 3 page */ pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); pt4[i] |= PG_V | PG_RW | PG_U; /* Each slot of the level 3 pages points to the same level 2 page */ pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); pt3[i] |= PG_V | PG_RW | PG_U; /* The level 2 page slots are mapped with 2MB pages for 1GB. */ pt2[i] = i * (2 * 1024 * 1024); pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; } /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); /* setup a vector to our boot code */ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ /* Relocate pcpu areas to the correct domain. */ #ifdef NUMA if (vm_ndomains > 1) for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; domain = acpi_pxm_get_cpu_locality(apic_id); mp_realloc_pcpu(cpu, domain); } #endif /* start each AP */ domain = 0; for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; #ifdef NUMA if (vm_ndomains > 1) domain = acpi_pxm_get_cpu_locality(apic_id); #endif /* allocate and set up an idle stack data page */ bootstacks[cpu] = (void *)kmem_malloc(kstack_pages * PAGE_SIZE, M_WAITOK | M_ZERO); doublefault_stack = (char *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO); mce_stack = (char *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO); nmi_stack = (char *)kmem_malloc_domainset( DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO); dbg_stack = (char *)kmem_malloc_domainset( DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO); dpcpu = (void *)kmem_malloc_domainset(DOMAINSET_PREF(domain), DPCPU_SIZE, M_WAITOK | M_ZERO); bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 8; bootAP = cpu; /* attempt to start the Application Processor */ if (!start_ap(apic_id)) { /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; panic("AP #%d (PHY# %d) failed!", cpu, apic_id); } CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ } /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); /* number of APs actually started */ return (mp_naps); } /* * This function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It isn't pretty, * but it seems to work. */ static int start_ap(int apic_id) { int vector, ms; int cpus; /* calculate the vector */ vector = (boot_address >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_naps; ipi_startup(apic_id, vector); /* Wait up to 5 seconds for it to start. */ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ DELAY(1000); } return 0; /* return FAILURE */ } +/* + * Flush the TLB on other CPU's + */ + +/* + * Invalidation request. PCPU pc_smp_tlb_op uses u_int instead of the + * enum to avoid both namespace and ABI issues (with enums). + */ +enum invl_op_codes { + INVL_OP_TLB = 1, + INVL_OP_TLB_INVPCID = 2, + INVL_OP_TLB_INVPCID_PTI = 3, + INVL_OP_TLB_PCID = 4, + INVL_OP_PGRNG = 5, + INVL_OP_PGRNG_INVPCID = 6, + INVL_OP_PGRNG_PCID = 7, + INVL_OP_PG = 8, + INVL_OP_PG_INVPCID = 9, + INVL_OP_PG_PCID = 10, + INVL_OP_CACHE = 11, +}; + +/* + * These variables are initialized at startup to reflect how each of + * the different kinds of invalidations should be performed on the + * current machine and environment. + */ +static enum invl_op_codes invl_op_tlb; +static enum invl_op_codes invl_op_pgrng; +static enum invl_op_codes invl_op_pg; + +/* + * Scoreboard of IPI completion notifications from target to IPI initiator. + * + * Each CPU can initiate shootdown IPI independently from other CPUs. + * Initiator enters critical section, then fills its local PCPU + * shootdown info (pc_smp_tlb_ vars), then clears scoreboard generation + * at location (cpu, my_cpuid) for each target cpu. After that IPI is + * sent to all targets which scan for zeroed scoreboard generation + * words. Upon finding such word the shootdown data is read from + * corresponding cpu' pcpu, and generation is set. Meantime initiator + * loops waiting for all zeroed generations in scoreboard to update. + */ +static uint32_t *invl_scoreboard; + +static void +invl_scoreboard_init(void *arg __unused) +{ + u_int i; + + invl_scoreboard = malloc(sizeof(uint32_t) * (mp_maxid + 1) * + (mp_maxid + 1), M_DEVBUF, M_WAITOK); + for (i = 0; i < (mp_maxid + 1) * (mp_maxid + 1); i++) + invl_scoreboard[i] = 1; + + if (pmap_pcid_enabled) { + if (invpcid_works) { + if (pti) + invl_op_tlb = INVL_OP_TLB_INVPCID_PTI; + else + invl_op_tlb = INVL_OP_TLB_INVPCID; + invl_op_pgrng = INVL_OP_PGRNG_INVPCID; + invl_op_pg = INVL_OP_PG_INVPCID; + } else { + invl_op_tlb = INVL_OP_TLB_PCID; + invl_op_pgrng = INVL_OP_PGRNG_PCID; + invl_op_pg = INVL_OP_PG_PCID; + } + } else { + invl_op_tlb = INVL_OP_TLB; + invl_op_pgrng = INVL_OP_PGRNG; + invl_op_pg = INVL_OP_PG; + } +} +SYSINIT(invl_ops, SI_SUB_SMP, SI_ORDER_FIRST, invl_scoreboard_init, NULL); + +static uint32_t * +invl_scoreboard_getcpu(u_int cpu) +{ + return (invl_scoreboard + cpu * (mp_maxid + 1)); +} + +static uint32_t * +invl_scoreboard_slot(u_int cpu) +{ + return (invl_scoreboard_getcpu(cpu) + PCPU_GET(cpuid)); +} + +/* + * Used by pmap to request cache or TLB invalidation on local and + * remote processors. Mask provides the set of remote CPUs which are + * to be signalled with the invalidation IPI. As an optimization, the + * curcpu_cb callback is invoked on the calling CPU while waiting for + * remote CPUs to complete the operation. + * + * The callback function is called unconditionally on the caller's + * underlying processor, even when this processor is not set in the + * mask. So, the callback function must be prepared to handle such + * spurious invocations. + * + * Interrupts must be enabled when calling the function with smp + * started, to avoid deadlock with other IPIs that are protected with + * smp_ipi_mtx spinlock at the initiator side. + */ +static void +smp_targeted_tlb_shootdown(cpuset_t mask, pmap_t pmap, vm_offset_t addr1, + vm_offset_t addr2, smp_invl_cb_t curcpu_cb, enum invl_op_codes op) +{ + cpuset_t other_cpus, mask1; + uint32_t generation, *p_cpudone; + int cpu; + + /* + * It is not necessary to signal other CPUs while booting or + * when in the debugger. + */ + if (kdb_active || KERNEL_PANICKED() || !smp_started) { + curcpu_cb(pmap, addr1, addr2); + return; + } + + sched_pin(); + + /* + * Check for other cpus. Return if none. + */ + if (CPU_ISFULLSET(&mask)) { + if (mp_ncpus <= 1) + goto nospinexit; + } else { + CPU_CLR(PCPU_GET(cpuid), &mask); + if (CPU_EMPTY(&mask)) + goto nospinexit; + } + + /* + * Initiator must have interrupts enabled, which prevents + * non-invalidation IPIs, that takes smp_ipi_mtx spinlock, + * from deadlocking with as. On the other hand, preemption + * must be disabled to pin initiator to the instance of the + * pcpu pc_smp_tlb data and scoreboard line. + */ + KASSERT((read_rflags() & PSL_I) != 0, + ("smp_targeted_tlb_shootdown: interrupts disabled")); + critical_enter(); + + PCPU_SET(smp_tlb_addr1, addr1); + PCPU_SET(smp_tlb_addr2, addr2); + PCPU_SET(smp_tlb_pmap, pmap); + generation = PCPU_GET(smp_tlb_gen); + if (++generation == 0) + generation = 1; + PCPU_SET(smp_tlb_gen, generation); + PCPU_SET(smp_tlb_op, op); + /* Fence between filling smp_tlb fields and clearing scoreboard. */ + atomic_thread_fence_rel(); + + mask1 = mask; + while ((cpu = CPU_FFS(&mask1)) != 0) { + cpu--; + CPU_CLR(cpu, &mask1); + KASSERT(*invl_scoreboard_slot(cpu) != 0, + ("IPI scoreboard is zero, initiator %d target %d", + PCPU_GET(cpuid), cpu)); + *invl_scoreboard_slot(cpu) = 0; + } + + /* + * IPI acts as a fence between writing to the scoreboard above + * (zeroing slot) and reading from it below (wait for + * acknowledge). + */ + if (CPU_ISFULLSET(&mask)) { + ipi_all_but_self(IPI_INVLOP); + other_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + } else { + other_cpus = mask; + while ((cpu = CPU_FFS(&mask)) != 0) { + cpu--; + CPU_CLR(cpu, &mask); + CTR3(KTR_SMP, "%s: cpu: %d invl ipi op: %x", __func__, + cpu, op); + ipi_send_cpu(cpu, IPI_INVLOP); + } + } + curcpu_cb(pmap, addr1, addr2); + while ((cpu = CPU_FFS(&other_cpus)) != 0) { + cpu--; + CPU_CLR(cpu, &other_cpus); + p_cpudone = invl_scoreboard_slot(cpu); + while (atomic_load_int(p_cpudone) != generation) + ia32_pause(); + } + critical_exit(); + sched_unpin(); + return; + +nospinexit: + curcpu_cb(pmap, addr1, addr2); + sched_unpin(); +} + +void +smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb) +{ + smp_targeted_tlb_shootdown(mask, pmap, 0, 0, curcpu_cb, invl_op_tlb); +#ifdef COUNT_XINVLTLB_HITS + ipi_global++; +#endif +} + +void +smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap, + smp_invl_cb_t curcpu_cb) +{ + smp_targeted_tlb_shootdown(mask, pmap, addr, 0, curcpu_cb, invl_op_pg); +#ifdef COUNT_XINVLTLB_HITS + ipi_page++; +#endif +} + +void +smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2, + pmap_t pmap, smp_invl_cb_t curcpu_cb) +{ + smp_targeted_tlb_shootdown(mask, pmap, addr1, addr2, curcpu_cb, + invl_op_pgrng); +#ifdef COUNT_XINVLTLB_HITS + ipi_range++; + ipi_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif +} + void -invltlb_invpcid_handler(void) +smp_cache_flush(smp_invl_cb_t curcpu_cb) +{ + smp_targeted_tlb_shootdown(all_cpus, NULL, 0, 0, curcpu_cb, + INVL_OP_CACHE); +} + +/* + * Handlers for TLB related IPIs + */ +static void +invltlb_handler(pmap_t smp_tlb_pmap) +{ +#ifdef COUNT_XINVLTLB_HITS + xhits_gbl[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + if (smp_tlb_pmap == kernel_pmap) + invltlb_glob(); + else + invltlb(); +} + +static void +invltlb_invpcid_handler(pmap_t smp_tlb_pmap) { struct invpcid_descr d; - uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ - generation = smp_tlb_generation; d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB : INVPCID_CTX); - PCPU_SET(smp_tlb_done, generation); } -void -invltlb_invpcid_pti_handler(void) +static void +invltlb_invpcid_pti_handler(pmap_t smp_tlb_pmap) { struct invpcid_descr d; - uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ - generation = smp_tlb_generation; d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; d.pad = 0; d.addr = 0; if (smp_tlb_pmap == kernel_pmap) { /* * This invalidation actually needs to clear kernel * mappings from the TLB in the current pmap, but * since we were asked for the flush in the kernel * pmap, achieve it by performing global flush. */ invpcid(&d, INVPCID_CTXGLOB); } else { invpcid(&d, INVPCID_CTX); d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); } - PCPU_SET(smp_tlb_done, generation); } -void -invltlb_pcid_handler(void) +static void +invltlb_pcid_handler(pmap_t smp_tlb_pmap) { uint64_t kcr3, ucr3; - uint32_t generation, pcid; + uint32_t pcid; #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ - generation = smp_tlb_generation; /* Overlap with serialization */ if (smp_tlb_pmap == kernel_pmap) { invltlb_glob(); } else { /* * The current pmap might not be equal to * smp_tlb_pmap. The clearing of the pm_gen in * pmap_invalidate_all() takes care of TLB * invalidation when switching to the pmap on this * CPU. */ if (PCPU_GET(curpmap) == smp_tlb_pmap) { pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; kcr3 = smp_tlb_pmap->pm_cr3 | pcid; ucr3 = smp_tlb_pmap->pm_ucr3; if (ucr3 != PMAP_NO_CR3) { ucr3 |= PMAP_PCID_USER_PT | pcid; pmap_pti_pcid_invalidate(ucr3, kcr3); } else load_cr3(kcr3); } } - PCPU_SET(smp_tlb_done, generation); } -void -invlpg_invpcid_handler(void) +static void +invlpg_handler(vm_offset_t smp_tlb_addr1) +{ +#ifdef COUNT_XINVLTLB_HITS + xhits_pg[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + invlpg(smp_tlb_addr1); +} + +static void +invlpg_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1) { struct invpcid_descr d; - uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ - generation = smp_tlb_generation; /* Overlap with serialization */ invlpg(smp_tlb_addr1); if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) { d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = smp_tlb_addr1; invpcid(&d, INVPCID_ADDR); } - PCPU_SET(smp_tlb_done, generation); } -void -invlpg_pcid_handler(void) +static void +invlpg_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1) { uint64_t kcr3, ucr3; - uint32_t generation; uint32_t pcid; #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ - generation = smp_tlb_generation; /* Overlap with serialization */ invlpg(smp_tlb_addr1); if (smp_tlb_pmap == PCPU_GET(curpmap) && (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) { pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, smp_tlb_addr1); } - PCPU_SET(smp_tlb_done, generation); } -void -invlrng_invpcid_handler(void) +static void +invlrng_handler(vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2) +{ + vm_offset_t addr, addr2; + +#ifdef COUNT_XINVLTLB_HITS + xhits_rng[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + addr = smp_tlb_addr1; + addr2 = smp_tlb_addr2; + do { + invlpg(addr); + addr += PAGE_SIZE; + } while (addr < addr2); +} + +static void +invlrng_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1, + vm_offset_t smp_tlb_addr2) { struct invpcid_descr d; vm_offset_t addr, addr2; - uint32_t generation; #ifdef COUNT_XINVLTLB_HITS xhits_rng[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; addr2 = smp_tlb_addr2; - generation = smp_tlb_generation; /* Overlap with serialization */ do { invlpg(addr); addr += PAGE_SIZE; } while (addr < addr2); if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) { d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = smp_tlb_addr1; do { invpcid(&d, INVPCID_ADDR); d.addr += PAGE_SIZE; } while (d.addr < addr2); } - PCPU_SET(smp_tlb_done, generation); } -void -invlrng_pcid_handler(void) +static void +invlrng_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1, + vm_offset_t smp_tlb_addr2) { vm_offset_t addr, addr2; uint64_t kcr3, ucr3; - uint32_t generation; uint32_t pcid; #ifdef COUNT_XINVLTLB_HITS xhits_rng[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; addr2 = smp_tlb_addr2; - generation = smp_tlb_generation; /* Overlap with serialization */ do { invlpg(addr); addr += PAGE_SIZE; } while (addr < addr2); if (smp_tlb_pmap == PCPU_GET(curpmap) && (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) { pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, smp_tlb_addr1, addr2); } - PCPU_SET(smp_tlb_done, generation); +} + +static void +invlcache_handler(void) +{ +#ifdef COUNT_IPIS + (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + wbinvd(); +} + +static void +invlop_handler_one_req(enum invl_op_codes smp_tlb_op, pmap_t smp_tlb_pmap, + vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2) +{ + switch (smp_tlb_op) { + case INVL_OP_TLB: + invltlb_handler(smp_tlb_pmap); + break; + case INVL_OP_TLB_INVPCID: + invltlb_invpcid_handler(smp_tlb_pmap); + break; + case INVL_OP_TLB_INVPCID_PTI: + invltlb_invpcid_pti_handler(smp_tlb_pmap); + break; + case INVL_OP_TLB_PCID: + invltlb_pcid_handler(smp_tlb_pmap); + break; + case INVL_OP_PGRNG: + invlrng_handler(smp_tlb_addr1, smp_tlb_addr2); + break; + case INVL_OP_PGRNG_INVPCID: + invlrng_invpcid_handler(smp_tlb_pmap, smp_tlb_addr1, + smp_tlb_addr2); + break; + case INVL_OP_PGRNG_PCID: + invlrng_pcid_handler(smp_tlb_pmap, smp_tlb_addr1, + smp_tlb_addr2); + break; + case INVL_OP_PG: + invlpg_handler(smp_tlb_addr1); + break; + case INVL_OP_PG_INVPCID: + invlpg_invpcid_handler(smp_tlb_pmap, smp_tlb_addr1); + break; + case INVL_OP_PG_PCID: + invlpg_pcid_handler(smp_tlb_pmap, smp_tlb_addr1); + break; + case INVL_OP_CACHE: + invlcache_handler(); + break; + default: + __assert_unreachable(); + break; + } +} + +void +invlop_handler(void) +{ + struct pcpu *initiator_pc; + pmap_t smp_tlb_pmap; + vm_offset_t smp_tlb_addr1, smp_tlb_addr2; + u_int initiator_cpu_id; + enum invl_op_codes smp_tlb_op; + uint32_t *scoreboard, smp_tlb_gen; + + scoreboard = invl_scoreboard_getcpu(PCPU_GET(cpuid)); + for (;;) { + for (initiator_cpu_id = 0; initiator_cpu_id <= mp_maxid; + initiator_cpu_id++) { + if (scoreboard[initiator_cpu_id] == 0) + break; + } + if (initiator_cpu_id > mp_maxid) + break; + initiator_pc = cpuid_to_pcpu[initiator_cpu_id]; + + /* + * This acquire fence and its corresponding release + * fence in smp_targeted_tlb_shootdown(), is between + * reading zero scoreboard slot and accessing PCPU of + * initiator for pc_smp_tlb values. + */ + atomic_thread_fence_acq(); + smp_tlb_pmap = initiator_pc->pc_smp_tlb_pmap; + smp_tlb_addr1 = initiator_pc->pc_smp_tlb_addr1; + smp_tlb_addr2 = initiator_pc->pc_smp_tlb_addr2; + smp_tlb_op = initiator_pc->pc_smp_tlb_op; + smp_tlb_gen = initiator_pc->pc_smp_tlb_gen; + + /* + * Ensure that we do not make our scoreboard + * notification visible to the initiator until the + * pc_smp_tlb values are read. The corresponding + * fence is implicitly provided by the barrier in the + * IPI send operation before the APIC ICR register + * write. + * + * As an optimization, the request is acknowledged + * before the actual invalidation is performed. It is + * safe because target CPU cannot return to userspace + * before handler finishes. Only NMI can preempt the + * handler, but NMI would see the kernel handler frame + * and not touch not-invalidated user page table. + */ + atomic_thread_fence_acq(); + atomic_store_int(&scoreboard[initiator_cpu_id], smp_tlb_gen); + + invlop_handler_one_req(smp_tlb_op, smp_tlb_pmap, smp_tlb_addr1, + smp_tlb_addr2); + } } diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h index b7b546ed2b6d..22c6ed40aa20 100644 --- a/sys/amd64/include/pcpu.h +++ b/sys/amd64/include/pcpu.h @@ -1,312 +1,317 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_PCPU_H_ #define _MACHINE_PCPU_H_ #ifndef _SYS_CDEFS_H_ #error "sys/cdefs.h is a prerequisite for this file" #endif #include #include #define PC_PTI_STACK_SZ 16 struct monitorbuf { int idle_state; /* Used by cpu_idle_mwait. */ int stop_state; /* Used by cpustop_handler. */ char padding[128 - (2 * sizeof(int))]; }; _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line"); /* * The SMP parts are setup in pmap.c and locore.s for the BSP, and * mp_machdep.c sets up the data for the AP's to "see" when they awake. * The reason for doing it via a struct is so that an array of pointers * to each CPU's data can be set up for things like "check curproc on all * other processors" */ #define PCPU_MD_FIELDS \ struct monitorbuf pc_monitorbuf __aligned(128); /* cache line */\ struct pcpu *pc_prvspace; /* Self-reference */ \ struct pmap *pc_curpmap; \ struct amd64tss *pc_tssp; /* TSS segment active on CPU */ \ void *pc_pad0; \ uint64_t pc_kcr3; \ uint64_t pc_ucr3; \ uint64_t pc_saved_ucr3; \ register_t pc_rsp0; \ register_t pc_scratch_rsp; /* User %rsp in syscall */ \ register_t pc_scratch_rax; \ u_int pc_apic_id; \ u_int pc_acpi_id; /* ACPI CPU id */ \ /* Pointer to the CPU %fs descriptor */ \ struct user_segment_descriptor *pc_fs32p; \ /* Pointer to the CPU %gs descriptor */ \ struct user_segment_descriptor *pc_gs32p; \ /* Pointer to the CPU LDT descriptor */ \ struct system_segment_descriptor *pc_ldt; \ /* Pointer to the CPU TSS descriptor */ \ struct system_segment_descriptor *pc_tss; \ uint64_t pc_pm_save_cnt; \ u_int pc_cmci_mask; /* MCx banks for CMCI */ \ uint64_t pc_dbreg[16]; /* ddb debugging regs */ \ uint64_t pc_pti_stack[PC_PTI_STACK_SZ]; \ register_t pc_pti_rsp0; \ int pc_dbreg_cmd; /* ddb debugging reg cmd */ \ u_int pc_vcpu_id; /* Xen vCPU ID */ \ uint32_t pc_pcid_next; \ uint32_t pc_pcid_gen; \ - uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \ + uint32_t pc_unused; \ uint32_t pc_ibpb_set; \ void *pc_mds_buf; \ void *pc_mds_buf64; \ uint32_t pc_pad[2]; \ uint8_t pc_mds_tmp[64]; \ u_int pc_ipi_bitmap; \ struct amd64tss pc_common_tss; \ struct user_segment_descriptor pc_gdt[NGDT]; \ - char __pad[2956] /* pad to UMA_PCPU_ALLOC_SIZE */ + void *pc_smp_tlb_pmap; \ + uint64_t pc_smp_tlb_addr1; \ + uint64_t pc_smp_tlb_addr2; \ + uint32_t pc_smp_tlb_gen; \ + u_int pc_smp_tlb_op; \ + char __pad[2924] /* pad to UMA_PCPU_ALLOC_SIZE */ #define PC_DBREG_CMD_NONE 0 #define PC_DBREG_CMD_LOAD 1 #ifdef _KERNEL #define MONITOR_STOPSTATE_RUNNING 0 #define MONITOR_STOPSTATE_STOPPED 1 #if defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF) /* * Evaluates to the byte offset of the per-cpu variable name. */ #define __pcpu_offset(name) \ __offsetof(struct pcpu, name) /* * Evaluates to the type of the per-cpu variable name. */ #define __pcpu_type(name) \ __typeof(((struct pcpu *)0)->name) /* * Evaluates to the address of the per-cpu variable name. */ #define __PCPU_PTR(name) __extension__ ({ \ __pcpu_type(name) *__p; \ \ __asm __volatile("movq %%gs:%1,%0; addq %2,%0" \ : "=r" (__p) \ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))), \ "i" (__pcpu_offset(name))); \ \ __p; \ }) /* * Evaluates to the value of the per-cpu variable name. */ #define __PCPU_GET(name) __extension__ ({ \ __pcpu_type(name) __res; \ struct __s { \ u_char __b[MIN(sizeof(__pcpu_type(name)), 8)]; \ } __s; \ \ if (sizeof(__res) == 1 || sizeof(__res) == 2 || \ sizeof(__res) == 4 || sizeof(__res) == 8) { \ __asm __volatile("mov %%gs:%1,%0" \ : "=r" (__s) \ : "m" (*(struct __s *)(__pcpu_offset(name)))); \ *(struct __s *)(void *)&__res = __s; \ } else { \ __res = *__PCPU_PTR(name); \ } \ __res; \ }) /* * Adds the value to the per-cpu counter name. The implementation * must be atomic with respect to interrupts. */ #define __PCPU_ADD(name, val) do { \ __pcpu_type(name) __val; \ struct __s { \ u_char __b[MIN(sizeof(__pcpu_type(name)), 8)]; \ } __s; \ \ __val = (val); \ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \ sizeof(__val) == 4 || sizeof(__val) == 8) { \ __s = *(struct __s *)(void *)&__val; \ __asm __volatile("add %1,%%gs:%0" \ : "=m" (*(struct __s *)(__pcpu_offset(name))) \ : "r" (__s)); \ } else \ *__PCPU_PTR(name) += __val; \ } while (0) /* * Increments the value of the per-cpu counter name. The implementation * must be atomic with respect to interrupts. */ #define __PCPU_INC(name) do { \ CTASSERT(sizeof(__pcpu_type(name)) == 1 || \ sizeof(__pcpu_type(name)) == 2 || \ sizeof(__pcpu_type(name)) == 4 || \ sizeof(__pcpu_type(name)) == 8); \ if (sizeof(__pcpu_type(name)) == 1) { \ __asm __volatile("incb %%gs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 2) { \ __asm __volatile("incw %%gs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 4) { \ __asm __volatile("incl %%gs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } else if (sizeof(__pcpu_type(name)) == 8) { \ __asm __volatile("incq %%gs:%0" \ : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\ : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\ } \ } while (0) /* * Sets the value of the per-cpu variable name to value val. */ #define __PCPU_SET(name, val) { \ __pcpu_type(name) __val; \ struct __s { \ u_char __b[MIN(sizeof(__pcpu_type(name)), 8)]; \ } __s; \ \ __val = (val); \ if (sizeof(__val) == 1 || sizeof(__val) == 2 || \ sizeof(__val) == 4 || sizeof(__val) == 8) { \ __s = *(struct __s *)(void *)&__val; \ __asm __volatile("mov %1,%%gs:%0" \ : "=m" (*(struct __s *)(__pcpu_offset(name))) \ : "r" (__s)); \ } else { \ *__PCPU_PTR(name) = __val; \ } \ } #define get_pcpu() __extension__ ({ \ struct pcpu *__pc; \ \ __asm __volatile("movq %%gs:%1,%0" \ : "=r" (__pc) \ : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace)))); \ __pc; \ }) #define PCPU_GET(member) __PCPU_GET(pc_ ## member) #define PCPU_ADD(member, val) __PCPU_ADD(pc_ ## member, val) #define PCPU_INC(member) __PCPU_INC(pc_ ## member) #define PCPU_PTR(member) __PCPU_PTR(pc_ ## member) #define PCPU_SET(member, val) __PCPU_SET(pc_ ## member, val) #define IS_BSP() (PCPU_GET(cpuid) == 0) #define zpcpu_offset_cpu(cpu) ((uintptr_t)&__pcpu[0] + UMA_PCPU_ALLOC_SIZE * cpu) #define zpcpu_base_to_offset(base) (void *)((uintptr_t)(base) - (uintptr_t)&__pcpu[0]) #define zpcpu_offset_to_base(base) (void *)((uintptr_t)(base) + (uintptr_t)&__pcpu[0]) #define zpcpu_sub_protected(base, n) do { \ ZPCPU_ASSERT_PROTECTED(); \ zpcpu_sub(base, n); \ } while (0) #define zpcpu_set_protected(base, n) do { \ __typeof(*base) __n = (n); \ ZPCPU_ASSERT_PROTECTED(); \ switch (sizeof(*base)) { \ case 4: \ __asm __volatile("movl\t%1,%%gs:(%0)" \ : : "r" (base), "ri" (__n) : "memory", "cc"); \ break; \ case 8: \ __asm __volatile("movq\t%1,%%gs:(%0)" \ : : "r" (base), "ri" (__n) : "memory", "cc"); \ break; \ default: \ *zpcpu_get(base) = __n; \ } \ } while (0); #define zpcpu_add(base, n) do { \ __typeof(*base) __n = (n); \ CTASSERT(sizeof(*base) == 4 || sizeof(*base) == 8); \ switch (sizeof(*base)) { \ case 4: \ __asm __volatile("addl\t%1,%%gs:(%0)" \ : : "r" (base), "ri" (__n) : "memory", "cc"); \ break; \ case 8: \ __asm __volatile("addq\t%1,%%gs:(%0)" \ : : "r" (base), "ri" (__n) : "memory", "cc"); \ break; \ } \ } while (0) #define zpcpu_add_protected(base, n) do { \ ZPCPU_ASSERT_PROTECTED(); \ zpcpu_add(base, n); \ } while (0) #define zpcpu_sub(base, n) do { \ __typeof(*base) __n = (n); \ CTASSERT(sizeof(*base) == 4 || sizeof(*base) == 8); \ switch (sizeof(*base)) { \ case 4: \ __asm __volatile("subl\t%1,%%gs:(%0)" \ : : "r" (base), "ri" (__n) : "memory", "cc"); \ break; \ case 8: \ __asm __volatile("subq\t%1,%%gs:(%0)" \ : : "r" (base), "ri" (__n) : "memory", "cc"); \ break; \ } \ } while (0); #else /* !__GNUCLIKE_ASM || !__GNUCLIKE___TYPEOF */ #error "this file needs to be ported to your compiler" #endif /* __GNUCLIKE_ASM && __GNUCLIKE___TYPEOF */ #endif /* _KERNEL */ #endif /* !_MACHINE_PCPU_H_ */ diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h index 2ecfe62cf9fb..d5b5fa9c5b81 100644 --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -1,67 +1,47 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * $FreeBSD$ * */ #ifndef _MACHINE_SMP_H_ #define _MACHINE_SMP_H_ #ifdef _KERNEL #ifdef SMP #ifndef LOCORE #include /* global symbols in mpboot.S */ extern char mptramp_start[]; extern u_int32_t mptramp_pagetables; /* IPI handlers */ inthand_t IDTVEC(justreturn), /* interrupt CPU with minimum overhead */ IDTVEC(justreturn1_pti), - IDTVEC(invltlb_pti), - IDTVEC(invltlb_pcid_pti), - IDTVEC(invltlb_pcid), /* TLB shootdowns - global, pcid */ - IDTVEC(invltlb_invpcid_pti_pti), - IDTVEC(invltlb_invpcid_nopti), - IDTVEC(invlpg_pti), - IDTVEC(invlpg_invpcid_pti), - IDTVEC(invlpg_invpcid), - IDTVEC(invlpg_pcid_pti), - IDTVEC(invlpg_pcid), - IDTVEC(invlrng_pti), - IDTVEC(invlrng_invpcid_pti), - IDTVEC(invlrng_invpcid), - IDTVEC(invlrng_pcid_pti), - IDTVEC(invlrng_pcid), - IDTVEC(invlcache_pti), + IDTVEC(invlop_pti), + IDTVEC(invlop), IDTVEC(ipi_intr_bitmap_handler_pti), IDTVEC(cpustop_pti), IDTVEC(cpususpend_pti), IDTVEC(rendezvous_pti); -void invltlb_pcid_handler(void); -void invltlb_invpcid_handler(void); -void invltlb_invpcid_pti_handler(void); -void invlpg_invpcid_handler(void); -void invlpg_pcid_handler(void); -void invlrng_invpcid_handler(void); -void invlrng_pcid_handler(void); +void invlop_handler(void); int native_start_all_aps(void); void mp_bootaddress(vm_paddr_t *, unsigned int *); #endif /* !LOCORE */ #endif /* SMP */ #endif /* _KERNEL */ #endif /* _MACHINE_SMP_H_ */ diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c index 953e34d2962c..663fd98e5c8a 100644 --- a/sys/i386/i386/mp_machdep.c +++ b/sys/i386/i386/mp_machdep.c @@ -1,469 +1,689 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include "opt_apic.h" #include "opt_cpu.h" #include "opt_kstack_pages.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #if !defined(lint) #if !defined(SMP) #error How did you get here? #endif #ifndef DEV_APIC #error The apic device is required for SMP, add "device apic" to your config file. #endif #endif /* not lint */ #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_ACPI #include #include #endif #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (PMAP_MAP_LOW + 0x0467) #define WARMBOOT_SEG (PMAP_MAP_LOW + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) /* * this code MUST be enabled here and in mpboot.s. * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. * #define CHECK_POINTS */ #if defined(CHECK_POINTS) #define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) #define CHECK_INIT(D); \ CHECK_WRITE(0x34, (D)); \ CHECK_WRITE(0x35, (D)); \ CHECK_WRITE(0x36, (D)); \ CHECK_WRITE(0x37, (D)); \ CHECK_WRITE(0x38, (D)); \ CHECK_WRITE(0x39, (D)); #define CHECK_PRINT(S); \ printf("%s: %d, %d, %d, %d, %d, %d\n", \ (S), \ CHECK_READ(0x34), \ CHECK_READ(0x35), \ CHECK_READ(0x36), \ CHECK_READ(0x37), \ CHECK_READ(0x38), \ CHECK_READ(0x39)); #else /* CHECK_POINTS */ #define CHECK_INIT(D) #define CHECK_PRINT(S) #define CHECK_WRITE(A, D) #endif /* CHECK_POINTS */ /* * Local data and functions. */ static void install_ap_tramp(void); static int start_all_aps(void); static int start_ap(int apic_id); static char *ap_copyout_buf; static char *ap_tramp_stack_base; /* * Initialize the IPI handlers and start up the AP's. */ void cpu_mp_start(void) { int i; /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) { cpu_apic_ids[i] = -1; } /* Install an inter-CPU IPI for TLB invalidation */ setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for cache invalidation. */ setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for all-CPU rendezvous */ setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install generic inter-CPU IPI handler */ setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for CPU stop/restart */ setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for CPU suspend/resume */ setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { boot_cpu_id = PCPU_GET(apic_id); cpu_info[boot_cpu_id].cpu_bsp = 1; } else KASSERT(boot_cpu_id == PCPU_GET(apic_id), ("BSP's APIC ID doesn't match boot_cpu_id")); /* Probe logical/physical core configuration. */ topo_probe(); assign_cpu_ids(); /* Start each Application Processor */ start_all_aps(); set_interrupt_apic_ids(); #if defined(DEV_ACPI) && MAXMEMDOM > 1 acpi_pxm_set_cpu_locality(); #endif } /* * AP CPU's call this to initialize themselves. */ void init_secondary(void) { struct pcpu *pc; struct i386tss *common_tssp; struct region_descriptor r_gdt, r_idt; int gsel_tss, myid, x; u_int cr0; /* bootAP is set in start_ap() to our ID. */ myid = bootAP; /* Update microcode before doing anything else. */ ucode_load_ap(myid); /* Get per-cpu data */ pc = &__pcpu[myid]; /* prime data page for it to use */ pcpu_init(pc, myid, sizeof(struct pcpu)); dpcpu_init(dpcpu, myid); pc->pc_apic_id = cpu_apic_ids[myid]; pc->pc_prvspace = pc; pc->pc_curthread = 0; pc->pc_common_tssp = common_tssp = &(__pcpu[0].pc_common_tssp)[myid]; fix_cpuid(); gdt_segs[GPRIV_SEL].ssd_base = (int)pc; gdt_segs[GPROC0_SEL].ssd_base = (int)common_tssp; gdt_segs[GLDT_SEL].ssd_base = (int)ldt; for (x = 0; x < NGDT; x++) { ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) &gdt[myid * NGDT]; lgdt(&r_gdt); /* does magic intra-segment return */ r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1; r_idt.rd_base = (int)idt; lidt(&r_idt); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); PCPU_SET(trampstk, (uintptr_t)ap_tramp_stack_base + TRAMP_STACK_SZ - VM86_STACK_SPACE); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; common_tssp->tss_esp0 = PCPU_GET(trampstk); common_tssp->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); common_tssp->tss_ioopt = sizeof(struct i386tss) << 16; PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); ltr(gsel_tss); PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd); PCPU_SET(copyout_buf, ap_copyout_buf); /* * Set to a known state: * Set by mpboot.s: CR0_PG, CR0_PE * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM */ cr0 = rcr0(); cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); load_cr0(cr0); CHECK_WRITE(0x38, 5); /* signal our startup to the BSP. */ mp_naps++; CHECK_WRITE(0x39, 6); /* Spin until the BSP releases the AP's. */ while (atomic_load_acq_int(&aps_ready) == 0) ia32_pause(); /* BSP may have changed PTD while we were waiting */ invltlb(); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); #endif init_secondary_tail(); } /* * start each AP in our list */ #define TMPMAP_START 1 static int start_all_aps(void) { u_char mpbiosreason; u_int32_t mpbioswarmvec; int apic_id, cpu; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); pmap_remap_lower(true); /* install the AP 1st level boot code */ install_ap_tramp(); /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); /* take advantage of the P==V mapping for PTD[0] for AP boot */ /* start each AP */ for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; /* allocate and set up a boot stack data page */ bootstacks[cpu] = (char *)kmem_malloc(kstack_pages * PAGE_SIZE, M_WAITOK | M_ZERO); dpcpu = (void *)kmem_malloc(DPCPU_SIZE, M_WAITOK | M_ZERO); /* setup a vector to our boot code */ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 4; bootAP = cpu; ap_tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT); ap_copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT); /* attempt to start the Application Processor */ CHECK_INIT(99); /* setup checkpoints */ if (!start_ap(apic_id)) { printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); CHECK_PRINT("trace"); /* show checkpoints */ /* better panic as the AP may be running loose */ printf("panic y/n? [y] "); if (cngetc() != 'n') panic("bye-bye"); } CHECK_PRINT("trace"); /* show checkpoints */ CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ } pmap_remap_lower(false); /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); /* number of APs actually started */ return mp_naps; } /* * load the 1st level AP boot code into base memory. */ /* targets for relocation */ extern void bigJump(void); extern void bootCodeSeg(void); extern void bootDataSeg(void); extern void MPentry(void); extern u_int MP_GDT; extern u_int mp_gdtbase; static void install_ap_tramp(void) { int x; int size = *(int *) ((u_long) & bootMP_size); vm_offset_t va = boot_address; u_char *src = (u_char *) ((u_long) bootMP); u_char *dst = (u_char *) va; u_int boot_base = (u_int) bootMP; u_int8_t *dst8; u_int16_t *dst16; u_int32_t *dst32; KASSERT (size <= PAGE_SIZE, ("'size' do not fit into PAGE_SIZE, as expected.")); pmap_kenter(va, boot_address); pmap_invalidate_page (kernel_pmap, va); for (x = 0; x < size; ++x) *dst++ = *src++; /* * modify addresses in code we just moved to basemem. unfortunately we * need fairly detailed info about mpboot.s for this to work. changes * to mpboot.s might require changes here. */ /* boot code is located in KERNEL space */ dst = (u_char *) va; /* modify the lgdt arg */ dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); *dst32 = boot_address + ((u_int) & MP_GDT - boot_base); /* modify the ljmp target for MPentry() */ dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); *dst32 = (u_int)MPentry; /* modify the target for boot code segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_address & 0xffff; *dst8 = ((u_int) boot_address >> 16) & 0xff; /* modify the target for boot data segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_address & 0xffff; *dst8 = ((u_int) boot_address >> 16) & 0xff; } /* * This function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It isn't pretty, * but it seems to work. */ static int start_ap(int apic_id) { int vector, ms; int cpus; /* calculate the vector */ vector = (boot_address >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_naps; ipi_startup(apic_id, vector); /* Wait up to 5 seconds for it to start. */ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ DELAY(1000); } return 0; /* return FAILURE */ } + +/* + * Flush the TLB on other CPU's + */ + +/* Variables needed for SMP tlb shootdown. */ +vm_offset_t smp_tlb_addr1, smp_tlb_addr2; +pmap_t smp_tlb_pmap; +volatile uint32_t smp_tlb_generation; + +/* + * Used by pmap to request cache or TLB invalidation on local and + * remote processors. Mask provides the set of remote CPUs which are + * to be signalled with the invalidation IPI, specified by vector. As + * an optimization, the curcpu_cb callback is invoked on the calling + * CPU while waiting for remote CPUs to complete the operation. + * + * The callback function is called unconditionally on the caller's + * underlying processor, even when this processor is not set in the + * mask. So, the callback function must be prepared to handle such + * spurious invocations. + */ +static void +smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, + vm_offset_t addr1, vm_offset_t addr2, smp_invl_cb_t curcpu_cb) +{ + cpuset_t other_cpus; + volatile uint32_t *p_cpudone; + uint32_t generation; + int cpu; + + /* + * It is not necessary to signal other CPUs while booting or + * when in the debugger. + */ + if (kdb_active || KERNEL_PANICKED() || !smp_started) { + curcpu_cb(pmap, addr1, addr2); + return; + } + + sched_pin(); + + /* + * Check for other cpus. Return if none. + */ + if (CPU_ISFULLSET(&mask)) { + if (mp_ncpus <= 1) + goto nospinexit; + } else { + CPU_CLR(PCPU_GET(cpuid), &mask); + if (CPU_EMPTY(&mask)) + goto nospinexit; + } + + KASSERT((read_eflags() & PSL_I) != 0, + ("smp_targeted_tlb_shootdown: interrupts disabled")); + mtx_lock_spin(&smp_ipi_mtx); + smp_tlb_addr1 = addr1; + smp_tlb_addr2 = addr2; + smp_tlb_pmap = pmap; + generation = ++smp_tlb_generation; + if (CPU_ISFULLSET(&mask)) { + ipi_all_but_self(vector); + other_cpus = all_cpus; + CPU_CLR(PCPU_GET(cpuid), &other_cpus); + } else { + other_cpus = mask; + while ((cpu = CPU_FFS(&mask)) != 0) { + cpu--; + CPU_CLR(cpu, &mask); + CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, + cpu, vector); + ipi_send_cpu(cpu, vector); + } + } + curcpu_cb(pmap, addr1, addr2); + while ((cpu = CPU_FFS(&other_cpus)) != 0) { + cpu--; + CPU_CLR(cpu, &other_cpus); + p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done; + while (*p_cpudone != generation) + ia32_pause(); + } + mtx_unlock_spin(&smp_ipi_mtx); + sched_unpin(); + return; + +nospinexit: + curcpu_cb(pmap, addr1, addr2); + sched_unpin(); +} + +void +smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb) +{ + smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0, curcpu_cb); +#ifdef COUNT_XINVLTLB_HITS + ipi_global++; +#endif +} + +void +smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap, + smp_invl_cb_t curcpu_cb) +{ + smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0, curcpu_cb); +#ifdef COUNT_XINVLTLB_HITS + ipi_page++; +#endif +} + +void +smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2, + pmap_t pmap, smp_invl_cb_t curcpu_cb) +{ + smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, addr2, + curcpu_cb); +#ifdef COUNT_XINVLTLB_HITS + ipi_range++; + ipi_range_size += (addr2 - addr1) / PAGE_SIZE; +#endif +} + +void +smp_cache_flush(smp_invl_cb_t curcpu_cb) +{ + smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 0, 0, + curcpu_cb); +} + +/* + * Handlers for TLB related IPIs + */ +void +invltlb_handler(void) +{ + uint32_t generation; + +#ifdef COUNT_XINVLTLB_HITS + xhits_gbl[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + /* + * Reading the generation here allows greater parallelism + * since invalidating the TLB is a serializing operation. + */ + generation = smp_tlb_generation; + if (smp_tlb_pmap == kernel_pmap) + invltlb_glob(); + PCPU_SET(smp_tlb_done, generation); +} + +void +invlpg_handler(void) +{ + uint32_t generation; + +#ifdef COUNT_XINVLTLB_HITS + xhits_pg[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + generation = smp_tlb_generation; /* Overlap with serialization */ + if (smp_tlb_pmap == kernel_pmap) + invlpg(smp_tlb_addr1); + PCPU_SET(smp_tlb_done, generation); +} + +void +invlrng_handler(void) +{ + vm_offset_t addr, addr2; + uint32_t generation; + +#ifdef COUNT_XINVLTLB_HITS + xhits_rng[PCPU_GET(cpuid)]++; +#endif /* COUNT_XINVLTLB_HITS */ +#ifdef COUNT_IPIS + (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + addr = smp_tlb_addr1; + addr2 = smp_tlb_addr2; + generation = smp_tlb_generation; /* Overlap with serialization */ + if (smp_tlb_pmap == kernel_pmap) { + do { + invlpg(addr); + addr += PAGE_SIZE; + } while (addr < addr2); + } + + PCPU_SET(smp_tlb_done, generation); +} + +void +invlcache_handler(void) +{ + uint32_t generation; + +#ifdef COUNT_IPIS + (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; +#endif /* COUNT_IPIS */ + + /* + * Reading the generation here allows greater parallelism + * since wbinvd is a serializing instruction. Without the + * temporary, we'd wait for wbinvd to complete, then the read + * would execute, then the dependent write, which must then + * complete before return from interrupt. + */ + generation = smp_tlb_generation; + wbinvd(); + PCPU_SET(smp_tlb_done, generation); +} diff --git a/sys/i386/include/smp.h b/sys/i386/include/smp.h index 4fcb55a41996..395695d3ecd2 100644 --- a/sys/i386/include/smp.h +++ b/sys/i386/include/smp.h @@ -1,37 +1,48 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * $FreeBSD$ * */ #ifndef _MACHINE_SMP_H_ #define _MACHINE_SMP_H_ #ifdef _KERNEL #ifdef SMP #ifndef LOCORE #include #include #include #include #include #include +inthand_t + IDTVEC(invltlb), /* TLB shootdowns - global */ + IDTVEC(invlpg), /* TLB shootdowns - 1 page */ + IDTVEC(invlrng), /* TLB shootdowns - page range */ + IDTVEC(invlcache); /* Write back and invalidate cache */ + /* functions in mpboot.s */ void bootMP(void); +void invltlb_handler(void); +void invlpg_handler(void); +void invlrng_handler(void); +void invlcache_handler(void); + #endif /* !LOCORE */ #endif /* SMP */ #endif /* _KERNEL */ #endif /* _MACHINE_SMP_H_ */ diff --git a/sys/x86/include/apicvar.h b/sys/x86/include/apicvar.h index de85cf9198fd..866dafe6dca4 100644 --- a/sys/x86/include/apicvar.h +++ b/sys/x86/include/apicvar.h @@ -1,491 +1,492 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _X86_APICVAR_H_ #define _X86_APICVAR_H_ /* * Local && I/O APIC variable definitions. */ /* * Layout of local APIC interrupt vectors: * * 0xff (255) +-------------+ * | | 15 (Spurious / IPIs / Local Interrupts) * 0xf0 (240) +-------------+ * | | 14 (I/O Interrupts / Timer) * 0xe0 (224) +-------------+ * | | 13 (I/O Interrupts) * 0xd0 (208) +-------------+ * | | 12 (I/O Interrupts) * 0xc0 (192) +-------------+ * | | 11 (I/O Interrupts) * 0xb0 (176) +-------------+ * | | 10 (I/O Interrupts) * 0xa0 (160) +-------------+ * | | 9 (I/O Interrupts) * 0x90 (144) +-------------+ * | | 8 (I/O Interrupts / System Calls) * 0x80 (128) +-------------+ * | | 7 (I/O Interrupts) * 0x70 (112) +-------------+ * | | 6 (I/O Interrupts) * 0x60 (96) +-------------+ * | | 5 (I/O Interrupts) * 0x50 (80) +-------------+ * | | 4 (I/O Interrupts) * 0x40 (64) +-------------+ * | | 3 (I/O Interrupts) * 0x30 (48) +-------------+ * | | 2 (ATPIC Interrupts) * 0x20 (32) +-------------+ * | | 1 (Exceptions, traps, faults, etc.) * 0x10 (16) +-------------+ * | | 0 (Exceptions, traps, faults, etc.) * 0x00 (0) +-------------+ * * Note: 0x80 needs to be handled specially and not allocated to an * I/O device! */ #define xAPIC_MAX_APIC_ID 0xfe #define xAPIC_ID_ALL 0xff #define MAX_APIC_ID 0x200 #define APIC_ID_ALL 0xffffffff #define IOAPIC_MAX_ID xAPIC_MAX_APIC_ID /* I/O Interrupts are used for external devices such as ISA, PCI, etc. */ #define APIC_IO_INTS (IDT_IO_INTS + 16) #define APIC_NUM_IOINTS 191 /* The timer interrupt is used for clock handling and drives hardclock, etc. */ #define APIC_TIMER_INT (APIC_IO_INTS + APIC_NUM_IOINTS) /* ********************* !!! WARNING !!! ****************************** * Each local apic has an interrupt receive fifo that is two entries deep * for each interrupt priority class (higher 4 bits of interrupt vector). * Once the fifo is full the APIC can no longer receive interrupts for this * class and sending IPIs from other CPUs will be blocked. * To avoid deadlocks there should be no more than two IPI interrupts * pending at the same time. * Currently this is guaranteed by dividing the IPIs in two groups that have * each at most one IPI interrupt pending. The first group is protected by the * smp_ipi_mtx and waits for the completion of the IPI (Only one IPI user * at a time) The second group uses a single interrupt and a bitmap to avoid * redundant IPI interrupts. */ /* Interrupts for local APIC LVT entries other than the timer. */ #define APIC_LOCAL_INTS 240 #define APIC_ERROR_INT APIC_LOCAL_INTS #define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1) #define APIC_CMC_INT (APIC_LOCAL_INTS + 2) #define APIC_IPI_INTS (APIC_LOCAL_INTS + 3) #define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */ -#define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */ +#define IPI_INVLOP (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs, amd64 */ +#define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs, i386 */ #define IPI_INVLPG (APIC_IPI_INTS + 2) #define IPI_INVLRNG (APIC_IPI_INTS + 3) #define IPI_INVLCACHE (APIC_IPI_INTS + 4) /* Vector to handle bitmap based IPIs */ #define IPI_BITMAP_VECTOR (APIC_IPI_INTS + 5) /* IPIs handled by IPI_BITMAP_VECTOR */ #define IPI_AST 0 /* Generate software trap. */ #define IPI_PREEMPT 1 #define IPI_HARDCLOCK 2 #define IPI_TRACE 3 /* Collect stack trace. */ #define IPI_BITMAP_LAST IPI_TRACE #define IPI_IS_BITMAPED(x) ((x) <= IPI_BITMAP_LAST) #define IPI_STOP (APIC_IPI_INTS + 6) /* Stop CPU until restarted. */ #define IPI_SUSPEND (APIC_IPI_INTS + 7) /* Suspend CPU until restarted. */ #define IPI_DYN_FIRST (APIC_IPI_INTS + 8) #define IPI_DYN_LAST (254) /* IPIs allocated at runtime */ /* * IPI_STOP_HARD does not need to occupy a slot in the IPI vector space since * it is delivered using an NMI anyways. */ #define IPI_NMI_FIRST 255 #define IPI_STOP_HARD 255 /* Stop CPU with a NMI. */ /* * The spurious interrupt can share the priority class with the IPIs since * it is not a normal interrupt. (Does not use the APIC's interrupt fifo) */ #define APIC_SPURIOUS_INT 255 #ifndef LOCORE #define APIC_IPI_DEST_SELF -1 #define APIC_IPI_DEST_ALL -2 #define APIC_IPI_DEST_OTHERS -3 #define APIC_BUS_UNKNOWN -1 #define APIC_BUS_ISA 0 #define APIC_BUS_EISA 1 #define APIC_BUS_PCI 2 #define APIC_BUS_MAX APIC_BUS_PCI #define IRQ_EXTINT -1 #define IRQ_NMI -2 #define IRQ_SMI -3 #define IRQ_DISABLED -4 /* * An APIC enumerator is a pseudo bus driver that enumerates APIC's including * CPU's and I/O APIC's. */ struct apic_enumerator { const char *apic_name; int (*apic_probe)(void); int (*apic_probe_cpus)(void); int (*apic_setup_local)(void); int (*apic_setup_io)(void); SLIST_ENTRY(apic_enumerator) apic_next; }; inthand_t IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3), IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6), IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint), IDTVEC(spuriousint), IDTVEC(timerint), IDTVEC(apic_isr1_pti), IDTVEC(apic_isr2_pti), IDTVEC(apic_isr3_pti), IDTVEC(apic_isr4_pti), IDTVEC(apic_isr5_pti), IDTVEC(apic_isr6_pti), IDTVEC(apic_isr7_pti), IDTVEC(cmcint_pti), IDTVEC(errorint_pti), IDTVEC(spuriousint_pti), IDTVEC(timerint_pti); extern vm_paddr_t lapic_paddr; extern int *apic_cpuids; void apic_register_enumerator(struct apic_enumerator *enumerator); void *ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase); int ioapic_disable_pin(void *cookie, u_int pin); int ioapic_get_vector(void *cookie, u_int pin); void ioapic_register(void *cookie); int ioapic_remap_vector(void *cookie, u_int pin, int vector); int ioapic_set_bus(void *cookie, u_int pin, int bus_type); int ioapic_set_extint(void *cookie, u_int pin); int ioapic_set_nmi(void *cookie, u_int pin); int ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol); int ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger); int ioapic_set_smi(void *cookie, u_int pin); /* * Struct containing pointers to APIC functions whose * implementation is run time selectable. */ struct apic_ops { void (*create)(u_int, int); void (*init)(vm_paddr_t); void (*xapic_mode)(void); bool (*is_x2apic)(void); void (*setup)(int); void (*dump)(const char *); void (*disable)(void); void (*eoi)(void); int (*id)(void); int (*intr_pending)(u_int); void (*set_logical_id)(u_int, u_int, u_int); u_int (*cpuid)(u_int); /* Vectors */ u_int (*alloc_vector)(u_int, u_int); u_int (*alloc_vectors)(u_int, u_int *, u_int, u_int); void (*enable_vector)(u_int, u_int); void (*disable_vector)(u_int, u_int); void (*free_vector)(u_int, u_int, u_int); /* PMC */ int (*enable_pmc)(void); void (*disable_pmc)(void); void (*reenable_pmc)(void); /* CMC */ void (*enable_cmc)(void); /* AMD ELVT */ int (*enable_mca_elvt)(void); /* IPI */ void (*ipi_raw)(register_t, u_int); void (*ipi_vectored)(u_int, int); int (*ipi_wait)(int); int (*ipi_alloc)(inthand_t *ipifunc); void (*ipi_free)(int vector); /* LVT */ int (*set_lvt_mask)(u_int, u_int, u_char); int (*set_lvt_mode)(u_int, u_int, u_int32_t); int (*set_lvt_polarity)(u_int, u_int, enum intr_polarity); int (*set_lvt_triggermode)(u_int, u_int, enum intr_trigger); }; extern struct apic_ops apic_ops; static inline void lapic_create(u_int apic_id, int boot_cpu) { apic_ops.create(apic_id, boot_cpu); } static inline void lapic_init(vm_paddr_t addr) { apic_ops.init(addr); } static inline void lapic_xapic_mode(void) { apic_ops.xapic_mode(); } static inline bool lapic_is_x2apic(void) { return (apic_ops.is_x2apic()); } static inline void lapic_setup(int boot) { apic_ops.setup(boot); } static inline void lapic_dump(const char *str) { apic_ops.dump(str); } static inline void lapic_disable(void) { apic_ops.disable(); } static inline void lapic_eoi(void) { apic_ops.eoi(); } static inline int lapic_id(void) { return (apic_ops.id()); } static inline int lapic_intr_pending(u_int vector) { return (apic_ops.intr_pending(vector)); } /* XXX: UNUSED */ static inline void lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) { apic_ops.set_logical_id(apic_id, cluster, cluster_id); } static inline u_int apic_cpuid(u_int apic_id) { return (apic_ops.cpuid(apic_id)); } static inline u_int apic_alloc_vector(u_int apic_id, u_int irq) { return (apic_ops.alloc_vector(apic_id, irq)); } static inline u_int apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) { return (apic_ops.alloc_vectors(apic_id, irqs, count, align)); } static inline void apic_enable_vector(u_int apic_id, u_int vector) { apic_ops.enable_vector(apic_id, vector); } static inline void apic_disable_vector(u_int apic_id, u_int vector) { apic_ops.disable_vector(apic_id, vector); } static inline void apic_free_vector(u_int apic_id, u_int vector, u_int irq) { apic_ops.free_vector(apic_id, vector, irq); } static inline int lapic_enable_pmc(void) { return (apic_ops.enable_pmc()); } static inline void lapic_disable_pmc(void) { apic_ops.disable_pmc(); } static inline void lapic_reenable_pmc(void) { apic_ops.reenable_pmc(); } static inline void lapic_enable_cmc(void) { apic_ops.enable_cmc(); } static inline int lapic_enable_mca_elvt(void) { return (apic_ops.enable_mca_elvt()); } static inline void lapic_ipi_raw(register_t icrlo, u_int dest) { apic_ops.ipi_raw(icrlo, dest); } static inline void lapic_ipi_vectored(u_int vector, int dest) { apic_ops.ipi_vectored(vector, dest); } static inline int lapic_ipi_wait(int delay) { return (apic_ops.ipi_wait(delay)); } static inline int lapic_ipi_alloc(inthand_t *ipifunc) { return (apic_ops.ipi_alloc(ipifunc)); } static inline void lapic_ipi_free(int vector) { return (apic_ops.ipi_free(vector)); } static inline int lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked) { return (apic_ops.set_lvt_mask(apic_id, lvt, masked)); } static inline int lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode) { return (apic_ops.set_lvt_mode(apic_id, lvt, mode)); } static inline int lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol) { return (apic_ops.set_lvt_polarity(apic_id, lvt, pol)); } static inline int lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger) { return (apic_ops.set_lvt_triggermode(apic_id, lvt, trigger)); } void lapic_handle_cmc(void); void lapic_handle_error(void); void lapic_handle_intr(int vector, struct trapframe *frame); void lapic_handle_timer(struct trapframe *frame); int ioapic_get_rid(u_int apic_id, uint16_t *ridp); extern int x2apic_mode; extern int lapic_eoi_suppression; #ifdef _SYS_SYSCTL_H_ SYSCTL_DECL(_hw_apic); #endif #endif /* !LOCORE */ #endif /* _X86_APICVAR_H_ */ diff --git a/sys/x86/include/x86_smp.h b/sys/x86/include/x86_smp.h index 1a0ef8fbcf78..d5535a602bcb 100644 --- a/sys/x86/include/x86_smp.h +++ b/sys/x86/include/x86_smp.h @@ -1,120 +1,112 @@ /*- * SPDX-License-Identifier: Beerware * * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * $FreeBSD$ * */ #ifndef _X86_X86_SMP_H_ #define _X86_X86_SMP_H_ #include #include #include #include #include struct pmap; /* global data in mp_x86.c */ extern int mp_naps; extern int boot_cpu_id; extern struct pcb stoppcbs[]; extern int cpu_apic_ids[]; extern int bootAP; extern void *dpcpu; extern char *bootSTK; extern void *bootstacks[]; extern unsigned int boot_address; extern unsigned int bootMP_size; extern volatile int aps_ready; extern struct mtx ap_boot_mtx; extern int cpu_logical; extern int cpu_cores; extern volatile uint32_t smp_tlb_generation; extern struct pmap *smp_tlb_pmap; extern vm_offset_t smp_tlb_addr1, smp_tlb_addr2; extern u_int xhits_gbl[]; extern u_int xhits_pg[]; extern u_int xhits_rng[]; extern u_int ipi_global; extern u_int ipi_page; extern u_int ipi_range; extern u_int ipi_range_size; extern int nmi_kdb_lock; extern int nmi_is_broadcast; struct cpu_info { int cpu_present:1; int cpu_bsp:1; int cpu_disabled:1; int cpu_hyperthread:1; }; extern struct cpu_info *cpu_info; /* * Set if MWAIT does not reliably wake when the MONITORed address is written. */ extern bool mwait_cpustop_broken; #ifdef COUNT_IPIS extern u_long *ipi_invltlb_counts[MAXCPU]; extern u_long *ipi_invlrng_counts[MAXCPU]; extern u_long *ipi_invlpg_counts[MAXCPU]; extern u_long *ipi_invlcache_counts[MAXCPU]; extern u_long *ipi_rendezvous_counts[MAXCPU]; #endif /* IPI handlers */ inthand_t - IDTVEC(invltlb), /* TLB shootdowns - global */ - IDTVEC(invlpg), /* TLB shootdowns - 1 page */ - IDTVEC(invlrng), /* TLB shootdowns - page range */ - IDTVEC(invlcache), /* Write back and invalidate cache */ IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ IDTVEC(cpustop), /* CPU stops & waits to be restarted */ IDTVEC(cpususpend), /* CPU suspends & waits to be resumed */ IDTVEC(rendezvous); /* handle CPU rendezvous */ typedef void (*smp_invl_cb_t)(struct pmap *, vm_offset_t addr1, vm_offset_t addr2); /* functions in x86_mp.c */ void assign_cpu_ids(void); void cpu_add(u_int apic_id, char boot_cpu); void cpustop_handler(void); void cpususpend_handler(void); void alloc_ap_trampoline(vm_paddr_t *physmap, unsigned int *physmap_idx); void init_secondary_tail(void); -void invltlb_handler(void); -void invlpg_handler(void); -void invlrng_handler(void); -void invlcache_handler(void); void init_secondary(void); void ipi_startup(int apic_id, int vector); void ipi_all_but_self(u_int ipi); void ipi_bitmap_handler(struct trapframe frame); void ipi_cpu(int cpu, u_int ipi); int ipi_nmi_handler(void); void ipi_selected(cpuset_t cpus, u_int ipi); void set_interrupt_apic_ids(void); void smp_cache_flush(smp_invl_cb_t curcpu_cb); void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, struct pmap *pmap, smp_invl_cb_t curcpu_cb); void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva, vm_offset_t endva, struct pmap *pmap, smp_invl_cb_t curcpu_cb); void smp_masked_invltlb(cpuset_t mask, struct pmap *pmap, smp_invl_cb_t curcpu_cb); void mem_range_AP_init(void); void topo_probe(void); void ipi_send_cpu(int cpu, u_int ipi); #endif diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c index 85f27d639b69..bc1d211a27fd 100644 --- a/sys/x86/x86/mp_x86.c +++ b/sys/x86/x86/mp_x86.c @@ -1,1877 +1,1642 @@ /*- * Copyright (c) 1996, by Steve Passe * Copyright (c) 2003, by Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifdef __i386__ #include "opt_apic.h" #endif #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #include "opt_stack.h" #include #include #include #include /* cngetc() */ #include #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items"); /* lock region used by kernel profiling */ int mcount_lock; int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; int bootAP; /* Free these after use */ void *bootstacks[MAXCPU]; void *dpcpu; struct pcb stoppcbs[MAXCPU]; struct susppcb **susppcbs; #ifdef COUNT_IPIS /* Interrupt counts. */ static u_long *ipi_preempt_counts[MAXCPU]; static u_long *ipi_ast_counts[MAXCPU]; u_long *ipi_invltlb_counts[MAXCPU]; u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif /* Default cpu_ops implementation. */ struct cpu_ops cpu_ops; /* * Local data and functions. */ static volatile cpuset_t ipi_stop_nmi_pending; volatile cpuset_t resuming_cpus; volatile cpuset_t toresume_cpus; /* used to hold the AP's until we are ready to release them */ struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info *cpu_info; int *apic_cpuids; int cpu_apic_ids[MAXCPU]; _Static_assert(MAXCPU <= MAX_APIC_ID, "MAXCPU cannot be larger that MAX_APIC_ID"); _Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID, "xAPIC_MAX_APIC_ID cannot be larger that MAX_APIC_ID"); static void release_aps(void *dummy); static void cpustop_handler_post(u_int cpu); static int hyperthreading_allowed = 1; SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); static int hyperthreading_intr_allowed = 0; SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN, &hyperthreading_intr_allowed, 0, "Allow interrupts on HTT logical CPUs"); static struct topo_node topo_root; static int pkg_id_shift; static int node_id_shift; static int core_id_shift; static int disabled_cpus; struct cache_info { int id_shift; int present; } static caches[MAX_CACHE_LEVELS]; unsigned int boot_address; static bool stop_mwait = false; SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0, "Use MONITOR/MWAIT when stopping CPU, if available"); #define MiB(v) (v ## ULL << 20) void mem_range_AP_init(void) { if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) mem_range_softc.mr_op->initAP(&mem_range_softc); } /* * Round up to the next power of two, if necessary, and then * take log2. * Returns -1 if argument is zero. */ static __inline int mask_width(u_int x) { return (fls(x << (1 - powerof2(x))) - 1); } /* * Add a cache level to the cache topology description. */ static int add_deterministic_cache(int type, int level, int share_count) { if (type == 0) return (0); if (type > 3) { printf("unexpected cache type %d\n", type); return (1); } if (type == 2) /* ignore instruction cache */ return (1); if (level == 0 || level > MAX_CACHE_LEVELS) { printf("unexpected cache level %d\n", type); return (1); } if (caches[level - 1].present) { printf("WARNING: multiple entries for L%u data cache\n", level); printf("%u => %u\n", caches[level - 1].id_shift, mask_width(share_count)); } caches[level - 1].id_shift = mask_width(share_count); caches[level - 1].present = 1; if (caches[level - 1].id_shift > pkg_id_shift) { printf("WARNING: L%u data cache covers more " "APIC IDs than a package (%u > %u)\n", level, caches[level - 1].id_shift, pkg_id_shift); caches[level - 1].id_shift = pkg_id_shift; } if (caches[level - 1].id_shift < core_id_shift) { printf("WARNING: L%u data cache covers fewer " "APIC IDs than a core (%u < %u)\n", level, caches[level - 1].id_shift, core_id_shift); caches[level - 1].id_shift = core_id_shift; } return (1); } /* * Determine topology of processing units and caches for AMD CPUs. * See: * - AMD CPUID Specification (Publication # 25481) * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) * - BKDG For AMD Family 10h Processors (Publication # 31116) * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301) * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751) * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945) */ static void topo_probe_amd(void) { u_int p[4]; uint64_t v; int level; int nodes_per_socket; int share_count; int type; int i; /* No multi-core capability. */ if ((amd_feature2 & AMDID2_CMP) == 0) return; /* For families 10h and newer. */ pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> AMDID_COREID_SIZE_SHIFT; /* For 0Fh family. */ if (pkg_id_shift == 0) pkg_id_shift = mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); /* * Families prior to 16h define the following value as * cores per compute unit and we don't really care about the AMD * compute units at the moment. Perhaps we should treat them as * cores and cores within the compute units as hardware threads, * but that's up for debate. * Later families define the value as threads per compute unit, * so we are following AMD's nomenclature here. */ if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 && CPUID_TO_FAMILY(cpu_id) >= 0x16) { cpuid_count(0x8000001e, 0, p); share_count = ((p[1] >> 8) & 0xff) + 1; core_id_shift = mask_width(share_count); /* * For Zen (17h), gather Nodes per Processor. Each node is a * Zeppelin die; TR and EPYC CPUs will have multiple dies per * package. Communication latency between dies is higher than * within them. */ nodes_per_socket = ((p[2] >> 8) & 0x7) + 1; node_id_shift = pkg_id_shift - mask_width(nodes_per_socket); } if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { for (i = 0; ; i++) { cpuid_count(0x8000001d, i, p); type = p[0] & 0x1f; level = (p[0] >> 5) & 0x7; share_count = 1 + ((p[0] >> 14) & 0xfff); if (!add_deterministic_cache(type, level, share_count)) break; } } else { if (cpu_exthigh >= 0x80000005) { cpuid_count(0x80000005, 0, p); if (((p[2] >> 24) & 0xff) != 0) { caches[0].id_shift = 0; caches[0].present = 1; } } if (cpu_exthigh >= 0x80000006) { cpuid_count(0x80000006, 0, p); if (((p[2] >> 16) & 0xffff) != 0) { caches[1].id_shift = 0; caches[1].present = 1; } if (((p[3] >> 18) & 0x3fff) != 0) { nodes_per_socket = 1; if ((amd_feature2 & AMDID2_NODE_ID) != 0) { /* * Handle multi-node processors that * have multiple chips, each with its * own L3 cache, on the same die. */ v = rdmsr(0xc001100c); nodes_per_socket = 1 + ((v >> 3) & 0x7); } caches[2].id_shift = pkg_id_shift - mask_width(nodes_per_socket); caches[2].present = 1; } } } } /* * Determine topology of processing units for Intel CPUs * using CPUID Leaf 1 and Leaf 4, if supported. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS */ static void topo_probe_intel_0x4(void) { u_int p[4]; int max_cores; int max_logical; /* Both zero and one here mean one logical processor per package. */ max_logical = (cpu_feature & CPUID_HTT) != 0 ? (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; if (max_logical <= 1) return; if (cpu_high >= 0x4) { cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; } else max_cores = 1; core_id_shift = mask_width(max_logical/max_cores); KASSERT(core_id_shift >= 0, ("intel topo: max_cores > max_logical\n")); pkg_id_shift = core_id_shift + mask_width(max_cores); } /* * Determine topology of processing units for Intel CPUs * using CPUID Leaf 11, if supported. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS */ static void topo_probe_intel_0xb(void) { u_int p[4]; int bits; int type; int i; /* Fall back if CPU leaf 11 doesn't really exist. */ cpuid_count(0x0b, 0, p); if (p[1] == 0) { topo_probe_intel_0x4(); return; } /* We only support three levels for now. */ for (i = 0; ; i++) { cpuid_count(0x0b, i, p); bits = p[0] & 0x1f; type = (p[2] >> 8) & 0xff; if (type == 0) break; /* TODO: check for duplicate (re-)assignment */ if (type == CPUID_TYPE_SMT) core_id_shift = bits; else if (type == CPUID_TYPE_CORE) pkg_id_shift = bits; else printf("unknown CPU level type %d\n", type); } if (pkg_id_shift < core_id_shift) { printf("WARNING: core covers more APIC IDs than a package\n"); core_id_shift = pkg_id_shift; } } /* * Determine topology of caches for Intel CPUs. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 Architectures Software Developer’s Manual * Volume 2A: Instruction Set Reference, A-M, * CPUID instruction */ static void topo_probe_intel_caches(void) { u_int p[4]; int level; int share_count; int type; int i; if (cpu_high < 0x4) { /* * Available cache level and sizes can be determined * via CPUID leaf 2, but that requires a huge table of hardcoded * values, so for now just assume L1 and L2 caches potentially * shared only by HTT processing units, if HTT is present. */ caches[0].id_shift = pkg_id_shift; caches[0].present = 1; caches[1].id_shift = pkg_id_shift; caches[1].present = 1; return; } for (i = 0; ; i++) { cpuid_count(0x4, i, p); type = p[0] & 0x1f; level = (p[0] >> 5) & 0x7; share_count = 1 + ((p[0] >> 14) & 0xfff); if (!add_deterministic_cache(type, level, share_count)) break; } } /* * Determine topology of processing units and caches for Intel CPUs. * See: * - Intel 64 Architecture Processor Topology Enumeration */ static void topo_probe_intel(void) { /* * Note that 0x1 <= cpu_high < 4 case should be * compatible with topo_probe_intel_0x4() logic when * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) * or it should trigger the fallback otherwise. */ if (cpu_high >= 0xb) topo_probe_intel_0xb(); else if (cpu_high >= 0x1) topo_probe_intel_0x4(); topo_probe_intel_caches(); } /* * Topology information is queried only on BSP, on which this * code runs and for which it can query CPUID information. * Then topology is extrapolated on all packages using an * assumption that APIC ID to hardware component ID mapping is * homogenious. * That doesn't necesserily imply that the topology is uniform. */ void topo_probe(void) { static int cpu_topo_probed = 0; struct x86_topo_layer { int type; int subtype; int id_shift; } topo_layers[MAX_CACHE_LEVELS + 4]; struct topo_node *parent; struct topo_node *node; int layer; int nlayers; int node_id; int i; if (cpu_topo_probed) return; CPU_ZERO(&logical_cpus_mask); if (mp_ncpus <= 1) ; /* nothing */ else if (cpu_vendor_id == CPU_VENDOR_AMD || cpu_vendor_id == CPU_VENDOR_HYGON) topo_probe_amd(); else if (cpu_vendor_id == CPU_VENDOR_INTEL) topo_probe_intel(); KASSERT(pkg_id_shift >= core_id_shift, ("bug in APIC topology discovery")); nlayers = 0; bzero(topo_layers, sizeof(topo_layers)); topo_layers[nlayers].type = TOPO_TYPE_PKG; topo_layers[nlayers].id_shift = pkg_id_shift; if (bootverbose) printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; if (pkg_id_shift > node_id_shift && node_id_shift != 0) { topo_layers[nlayers].type = TOPO_TYPE_GROUP; topo_layers[nlayers].id_shift = node_id_shift; if (bootverbose) printf("Node ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; } /* * Consider all caches to be within a package/chip * and "in front" of all sub-components like * cores and hardware threads. */ for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { if (caches[i].present) { if (node_id_shift != 0) KASSERT(caches[i].id_shift <= node_id_shift, ("bug in APIC topology discovery")); KASSERT(caches[i].id_shift <= pkg_id_shift, ("bug in APIC topology discovery")); KASSERT(caches[i].id_shift >= core_id_shift, ("bug in APIC topology discovery")); topo_layers[nlayers].type = TOPO_TYPE_CACHE; topo_layers[nlayers].subtype = i + 1; topo_layers[nlayers].id_shift = caches[i].id_shift; if (bootverbose) printf("L%u cache ID shift: %u\n", topo_layers[nlayers].subtype, topo_layers[nlayers].id_shift); nlayers++; } } if (pkg_id_shift > core_id_shift) { topo_layers[nlayers].type = TOPO_TYPE_CORE; topo_layers[nlayers].id_shift = core_id_shift; if (bootverbose) printf("Core ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; } topo_layers[nlayers].type = TOPO_TYPE_PU; topo_layers[nlayers].id_shift = 0; nlayers++; topo_init_root(&topo_root); for (i = 0; i <= max_apic_id; ++i) { if (!cpu_info[i].cpu_present) continue; parent = &topo_root; for (layer = 0; layer < nlayers; ++layer) { node_id = i >> topo_layers[layer].id_shift; parent = topo_add_node_by_hwid(parent, node_id, topo_layers[layer].type, topo_layers[layer].subtype); } } parent = &topo_root; for (layer = 0; layer < nlayers; ++layer) { node_id = boot_cpu_id >> topo_layers[layer].id_shift; node = topo_find_node_by_hwid(parent, node_id, topo_layers[layer].type, topo_layers[layer].subtype); topo_promote_child(node); parent = node; } cpu_topo_probed = 1; } /* * Assign logical CPU IDs to local APICs. */ void assign_cpu_ids(void) { struct topo_node *node; u_int smt_mask; int nhyper; smt_mask = (1u << core_id_shift) - 1; /* * Assign CPU IDs to local APIC IDs and disable any CPUs * beyond MAXCPU. CPU 0 is always assigned to the BSP. */ mp_ncpus = 0; nhyper = 0; TOPO_FOREACH(node, &topo_root) { if (node->type != TOPO_TYPE_PU) continue; if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) cpu_info[node->hwid].cpu_hyperthread = 1; if (resource_disabled("lapic", node->hwid)) { if (node->hwid != boot_cpu_id) cpu_info[node->hwid].cpu_disabled = 1; else printf("Cannot disable BSP, APIC ID = %d\n", node->hwid); } if (!hyperthreading_allowed && cpu_info[node->hwid].cpu_hyperthread) cpu_info[node->hwid].cpu_disabled = 1; if (mp_ncpus >= MAXCPU) cpu_info[node->hwid].cpu_disabled = 1; if (cpu_info[node->hwid].cpu_disabled) { disabled_cpus++; continue; } if (cpu_info[node->hwid].cpu_hyperthread) nhyper++; cpu_apic_ids[mp_ncpus] = node->hwid; apic_cpuids[node->hwid] = mp_ncpus; topo_set_pu_id(node, mp_ncpus); mp_ncpus++; } KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); mp_ncores = mp_ncpus - nhyper; smp_threads_per_core = mp_ncpus / mp_ncores; } /* * Print various information about the SMP system hardware and setup. */ void cpu_mp_announce(void) { struct topo_node *node; const char *hyperthread; struct topo_analysis topology; printf("FreeBSD/SMP: "); if (topo_analyze(&topo_root, 1, &topology)) { printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); if (topology.entities[TOPO_LEVEL_GROUP] > 1) printf(" x %d groups", topology.entities[TOPO_LEVEL_GROUP]); if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) printf(" x %d cache groups", topology.entities[TOPO_LEVEL_CACHEGROUP]); if (topology.entities[TOPO_LEVEL_CORE] > 0) printf(" x %d core(s)", topology.entities[TOPO_LEVEL_CORE]); if (topology.entities[TOPO_LEVEL_THREAD] > 1) printf(" x %d hardware threads", topology.entities[TOPO_LEVEL_THREAD]); } else { printf("Non-uniform topology"); } printf("\n"); if (disabled_cpus) { printf("FreeBSD/SMP Online: "); if (topo_analyze(&topo_root, 0, &topology)) { printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); if (topology.entities[TOPO_LEVEL_GROUP] > 1) printf(" x %d groups", topology.entities[TOPO_LEVEL_GROUP]); if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) printf(" x %d cache groups", topology.entities[TOPO_LEVEL_CACHEGROUP]); if (topology.entities[TOPO_LEVEL_CORE] > 0) printf(" x %d core(s)", topology.entities[TOPO_LEVEL_CORE]); if (topology.entities[TOPO_LEVEL_THREAD] > 1) printf(" x %d hardware threads", topology.entities[TOPO_LEVEL_THREAD]); } else { printf("Non-uniform topology"); } printf("\n"); } if (!bootverbose) return; TOPO_FOREACH(node, &topo_root) { switch (node->type) { case TOPO_TYPE_PKG: printf("Package HW ID = %u\n", node->hwid); break; case TOPO_TYPE_CORE: printf("\tCore HW ID = %u\n", node->hwid); break; case TOPO_TYPE_PU: if (cpu_info[node->hwid].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; if (node->subtype == 0) printf("\t\tCPU (AP%s): APIC ID: %u" "(disabled)\n", hyperthread, node->hwid); else if (node->id == 0) printf("\t\tCPU0 (BSP): APIC ID: %u\n", node->hwid); else printf("\t\tCPU%u (AP%s): APIC ID: %u\n", node->id, hyperthread, node->hwid); break; default: /* ignored */ break; } } } /* * Add a scheduling group, a group of logical processors sharing * a particular cache (and, thus having an affinity), to the scheduling * topology. * This function recursively works on lower level caches. */ static void x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) { struct topo_node *node; int nchildren; int ncores; int i; KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE || root->type == TOPO_TYPE_GROUP, ("x86topo_add_sched_group: bad type: %u", root->type)); CPU_COPY(&root->cpuset, &cg_root->cg_mask); cg_root->cg_count = root->cpu_count; if (root->type == TOPO_TYPE_SYSTEM) cg_root->cg_level = CG_SHARE_NONE; else cg_root->cg_level = root->subtype; /* * Check how many core nodes we have under the given root node. * If we have multiple logical processors, but not multiple * cores, then those processors must be hardware threads. */ ncores = 0; node = root; while (node != NULL) { if (node->type != TOPO_TYPE_CORE) { node = topo_next_node(root, node); continue; } ncores++; node = topo_next_nonchild_node(root, node); } if (cg_root->cg_level != CG_SHARE_NONE && root->cpu_count > 1 && ncores < 2) cg_root->cg_flags = CG_FLAG_SMT; /* * Find out how many cache nodes we have under the given root node. * We ignore cache nodes that cover all the same processors as the * root node. Also, we do not descend below found cache nodes. * That is, we count top-level "non-redundant" caches under the root * node. */ nchildren = 0; node = root; while (node != NULL) { if ((node->type != TOPO_TYPE_GROUP && node->type != TOPO_TYPE_CACHE) || (root->type != TOPO_TYPE_SYSTEM && CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { node = topo_next_node(root, node); continue; } nchildren++; node = topo_next_nonchild_node(root, node); } cg_root->cg_child = smp_topo_alloc(nchildren); cg_root->cg_children = nchildren; /* * Now find again the same cache nodes as above and recursively * build scheduling topologies for them. */ node = root; i = 0; while (node != NULL) { if ((node->type != TOPO_TYPE_GROUP && node->type != TOPO_TYPE_CACHE) || (root->type != TOPO_TYPE_SYSTEM && CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { node = topo_next_node(root, node); continue; } cg_root->cg_child[i].cg_parent = cg_root; x86topo_add_sched_group(node, &cg_root->cg_child[i]); i++; node = topo_next_nonchild_node(root, node); } } /* * Build the MI scheduling topology from the discovered hardware topology. */ struct cpu_group * cpu_topo(void) { struct cpu_group *cg_root; if (mp_ncpus <= 1) return (smp_topo_none()); cg_root = smp_topo_alloc(1); x86topo_add_sched_group(&topo_root, cg_root); return (cg_root); } static void cpu_alloc(void *dummy __unused) { /* * Dynamically allocate the arrays that depend on the * maximum APIC ID. */ cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS, M_WAITOK | M_ZERO); apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS, M_WAITOK | M_ZERO); } SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL); /* * Add a logical CPU to the topology. */ void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > max_apic_id) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %u claims to be BSP, but CPU %u already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (bootverbose) printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { /* * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). * If there were no calls to cpu_add() assume this is a UP system. */ if (mp_ncpus == 0) mp_ncpus = 1; } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ CPU_SETOF(0, &all_cpus); return (mp_ncpus > 1); } /* Allocate memory for the AP trampoline. */ void alloc_ap_trampoline(vm_paddr_t *physmap, unsigned int *physmap_idx) { unsigned int i; bool allocated; allocated = false; for (i = *physmap_idx; i <= *physmap_idx; i -= 2) { /* * Find a memory region big enough and below the 1MB boundary * for the trampoline code. * NB: needs to be page aligned. */ if (physmap[i] >= MiB(1) || (trunc_page(physmap[i + 1]) - round_page(physmap[i])) < round_page(bootMP_size)) continue; allocated = true; /* * Try to steal from the end of the region to mimic previous * behaviour, else fallback to steal from the start. */ if (physmap[i + 1] < MiB(1)) { boot_address = trunc_page(physmap[i + 1]); if ((physmap[i + 1] - boot_address) < bootMP_size) boot_address -= round_page(bootMP_size); physmap[i + 1] = boot_address; } else { boot_address = round_page(physmap[i]); physmap[i] = boot_address + round_page(bootMP_size); } if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) { memmove(&physmap[i], &physmap[i + 2], sizeof(*physmap) * (*physmap_idx - i + 2)); *physmap_idx -= 2; } break; } if (!allocated) { boot_address = basemem * 1024 - bootMP_size; if (bootverbose) printf( "Cannot find enough space for the boot trampoline, placing it at %#x", boot_address); } } /* * AP CPU's call this to initialize themselves. */ void init_secondary_tail(void) { u_int cpuid; pmap_activate_boot(vmspace_pmap(proc0.p_vmspace)); /* * On real hardware, switch to x2apic mode if possible. Do it * after aps_ready was signalled, to avoid manipulating the * mode while BSP might still want to send some IPI to us * (second startup IPI is ignored on modern hardware etc). */ lapic_xapic_mode(); /* Initialize the PAT MSR. */ pmap_init_pat(); /* set up CPU registers and state */ cpu_setregs(); /* set up SSE/NX */ initializecpu(); /* set up FPU state on the AP */ #ifdef __amd64__ fpuinit(); #else npxinit(false); #endif if (cpu_ops.cpu_init) cpu_ops.cpu_init(); /* A quick check from sanity claus */ cpuid = PCPU_GET(cpuid); if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", cpuid); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); mtx_lock_spin(&ap_boot_mtx); mca_init(); /* Init local apic for irq's */ lapic_setup(1); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); if (bootverbose) printf("SMP: AP CPU #%d Launched!\n", cpuid); else printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "", cpuid, smp_cpus == mp_ncpus ? "\n" : " "); /* Determine if we are a logical CPU. */ if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) CPU_SET(cpuid, &logical_cpus_mask); if (bootverbose) lapic_dump("AP"); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); } #ifdef __amd64__ /* * Enable global pages TLB extension * This also implicitly flushes the TLB */ load_cr4(rcr4() | CR4_PGE); if (pmap_pcid_enabled) load_cr4(rcr4() | CR4_PCIDE); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); #endif mtx_unlock_spin(&ap_boot_mtx); /* Wait until all the AP's are up. */ while (atomic_load_acq_int(&smp_started) == 0) ia32_pause(); #ifndef EARLY_AP_STARTUP /* Start per-CPU event timers. */ cpu_initclocks_ap(); #endif kcsan_cpu_init(cpuid); /* * Assert that smp_after_idle_runnable condition is reasonable. */ MPASS(PCPU_GET(curpcb) == NULL); sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } static void smp_after_idle_runnable(void *arg __unused) { struct pcpu *pc; int cpu; for (cpu = 1; cpu < mp_ncpus; cpu++) { pc = pcpu_find(cpu); while (atomic_load_ptr(&pc->pc_curpcb) == NULL) cpu_spinwait(); kmem_free((vm_offset_t)bootstacks[cpu], kstack_pages * PAGE_SIZE); } } SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY, smp_after_idle_runnable, NULL); /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. * We also do not tell it about the BSP since it tells itself about * the BSP internally to work with UP kernels and on UP machines. */ void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_bsp) continue; if (cpu_info[apic_id].cpu_disabled) continue; /* Don't let hyperthreads service interrupts. */ if (cpu_info[apic_id].cpu_hyperthread && !hyperthreading_intr_allowed) continue; intr_add_cpu(i); } } #ifdef COUNT_XINVLTLB_HITS u_int xhits_gbl[MAXCPU]; u_int xhits_pg[MAXCPU]; u_int xhits_rng[MAXCPU]; static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, sizeof(xhits_gbl), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, sizeof(xhits_pg), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, sizeof(xhits_rng), "IU", ""); u_int ipi_global; u_int ipi_page; u_int ipi_range; u_int ipi_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 0, ""); #endif /* COUNT_XINVLTLB_HITS */ /* * Init and startup IPI. */ void ipi_startup(int apic_id, int vector) { /* * This attempts to follow the algorithm described in the * Intel Multiprocessor Specification v1.4 in section B.4. * For each IPI, we allow the local APIC ~20us to deliver the * IPI. If that times out, we panic. */ /* * first we do an INIT IPI: this INIT IPI might be run, resetting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); lapic_ipi_wait(100); /* Explicitly deassert the INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); DELAY(10000); /* wait ~10mS */ /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver first STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver second STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ } /* * Send an IPI to specified CPU handling the bitmap logic. */ void ipi_send_cpu(int cpu, u_int ipi) { u_int bitmap, old, new; u_int *cpu_bitmap; KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (IPI_IS_BITMAPED(ipi)) { bitmap = 1 << ipi; ipi = IPI_BITMAP_VECTOR; cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap; old = *cpu_bitmap; for (;;) { if ((old & bitmap) == bitmap) break; new = old | bitmap; if (atomic_fcmpset_int(cpu_bitmap, &old, new)) break; } if (old) return; } lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); } void ipi_bitmap_handler(struct trapframe frame) { struct trapframe *oldframe; struct thread *td; int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; td = curthread; ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]-> pc_ipi_bitmap); /* * sched_preempt() must be called to clear the pending preempt * IPI to enable delivery of further preempts. However, the * critical section will cause extra scheduler lock thrashing * when used unconditionally. Only critical_enter() if * hardclock must also run, which requires the section entry. */ if (ipi_bitmap & (1 << IPI_HARDCLOCK)) critical_enter(); td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = &frame; #if defined(STACK) || defined(DDB) if (ipi_bitmap & (1 << IPI_TRACE)) stack_capture_intr(); #endif if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(td); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { #ifdef COUNT_IPIS (*ipi_hardclock_counts[cpu])++; #endif hardclockintr(); } td->td_intr_frame = oldframe; td->td_intr_nesting_level--; if (ipi_bitmap & (1 << IPI_HARDCLOCK)) critical_exit(); } /* * send an IPI to a set of cpus. */ void ipi_selected(cpuset_t cpus, u_int ipi) { int cpu; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); while ((cpu = CPU_FFS(&cpus)) != 0) { cpu--; CPU_CLR(cpu, &cpus); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { cpuset_t other_cpus; other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); if (IPI_IS_BITMAPED(ipi)) { ipi_selected(other_cpus, ipi); return; } /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } int ipi_nmi_handler(void) { u_int cpuid; /* * As long as there is not a simple way to know about a NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. */ cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) return (1); CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); cpustop_handler(); return (0); } int nmi_kdb_lock; void nmi_call_kdb_smp(u_int type, struct trapframe *frame) { int cpu; bool call_post; cpu = PCPU_GET(cpuid); if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) { nmi_call_kdb(cpu, type, frame); call_post = false; } else { savectx(&stoppcbs[cpu]); CPU_SET_ATOMIC(cpu, &stopped_cpus); while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) ia32_pause(); call_post = true; } atomic_store_rel_int(&nmi_kdb_lock, 0); if (call_post) cpustop_handler_post(cpu); } /* * Handle an IPI_STOP by saving our current context and spinning (or mwaiting, * if available) until we are resumed. */ void cpustop_handler(void) { struct monitorbuf *mb; u_int cpu; bool use_mwait; cpu = PCPU_GET(cpuid); savectx(&stoppcbs[cpu]); use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 && !mwait_cpustop_broken); if (use_mwait) { mb = PCPU_PTR(monitorbuf); atomic_store_int(&mb->stop_state, MONITOR_STOPSTATE_STOPPED); } /* Indicate that we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) { if (use_mwait) { cpu_monitor(mb, 0, 0); if (atomic_load_int(&mb->stop_state) == MONITOR_STOPSTATE_STOPPED) cpu_mwait(0, MWAIT_C1); continue; } ia32_pause(); /* * Halt non-BSP CPUs on panic -- we're never going to need them * again, and might as well save power / release resources * (e.g., overprovisioned VM infrastructure). */ while (__predict_false(!IS_BSP() && KERNEL_PANICKED())) halt(); } cpustop_handler_post(cpu); } static void cpustop_handler_post(u_int cpu) { CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); /* * We don't broadcast TLB invalidations to other CPUs when they are * stopped. Hence, we clear the TLB before resuming. */ invltlb_glob(); #if defined(__amd64__) && defined(DDB) amd64_db_resume_dbreg(); #endif if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * Handle an IPI_SUSPEND by saving our current context and spinning until we * are resumed. */ void cpususpend_handler(void) { u_int cpu; mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); cpu = PCPU_GET(cpuid); if (savectx(&susppcbs[cpu]->sp_pcb)) { #ifdef __amd64__ fpususpend(susppcbs[cpu]->sp_fpususpend); #else npxsuspend(susppcbs[cpu]->sp_fpususpend); #endif /* * suspended_cpus is cleared shortly after each AP is restarted * by a Startup IPI, so that the BSP can proceed to restarting * the next AP. * * resuming_cpus gets cleared when the AP completes * initialization after having been released by the BSP. * resuming_cpus is probably not the best name for the * variable, because it is actually a set of processors that * haven't resumed yet and haven't necessarily started resuming. * * Note that suspended_cpus is meaningful only for ACPI suspend * as it's not really used for Xen suspend since the APs are * automatically restored to the running state and the correct * context. For the same reason resumectx is never called in * that case. */ CPU_SET_ATOMIC(cpu, &suspended_cpus); CPU_SET_ATOMIC(cpu, &resuming_cpus); /* * Invalidate the cache after setting the global status bits. * The last AP to set its bit may end up being an Owner of the * corresponding cache line in MOESI protocol. The AP may be * stopped before the cache line is written to the main memory. */ wbinvd(); } else { #ifdef __amd64__ fpuresume(susppcbs[cpu]->sp_fpususpend); #else npxresume(susppcbs[cpu]->sp_fpususpend); #endif pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); /* Indicate that we have restarted and restored the context. */ CPU_CLR_ATOMIC(cpu, &suspended_cpus); } /* Wait for resume directive */ while (!CPU_ISSET(cpu, &toresume_cpus)) ia32_pause(); /* Re-apply microcode updates. */ ucode_reload(); #ifdef __i386__ /* Finish removing the identity mapping of low memory for this AP. */ invltlb_glob(); #endif if (cpu_ops.cpu_resume) cpu_ops.cpu_resume(); #ifdef __amd64__ if (vmm_resume_p) vmm_resume_p(); #endif /* Resume MCA and local APIC */ lapic_xapic_mode(); mca_resume(); lapic_setup(0); /* Indicate that we are resumed */ CPU_CLR_ATOMIC(cpu, &resuming_cpus); CPU_CLR_ATOMIC(cpu, &suspended_cpus); CPU_CLR_ATOMIC(cpu, &toresume_cpus); } - -void -invlcache_handler(void) -{ - uint32_t generation; - -#ifdef COUNT_IPIS - (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; -#endif /* COUNT_IPIS */ - - /* - * Reading the generation here allows greater parallelism - * since wbinvd is a serializing instruction. Without the - * temporary, we'd wait for wbinvd to complete, then the read - * would execute, then the dependent write, which must then - * complete before return from interrupt. - */ - generation = smp_tlb_generation; - wbinvd(); - PCPU_SET(smp_tlb_done, generation); -} - /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); #ifdef COUNT_IPIS /* * Setup interrupt counters for IPI handlers. */ static void mp_ipi_intrcnt(void *dummy) { char buf[64]; int i; CPU_FOREACH(i) { snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); intrcnt_add(buf, &ipi_invltlb_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); intrcnt_add(buf, &ipi_invlrng_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); intrcnt_add(buf, &ipi_invlpg_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); intrcnt_add(buf, &ipi_invlcache_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:preempt", i); intrcnt_add(buf, &ipi_preempt_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:ast", i); intrcnt_add(buf, &ipi_ast_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); intrcnt_add(buf, &ipi_rendezvous_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); intrcnt_add(buf, &ipi_hardclock_counts[i]); } } SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); #endif - -/* - * Flush the TLB on other CPU's - */ - -/* Variables needed for SMP tlb shootdown. */ -vm_offset_t smp_tlb_addr1, smp_tlb_addr2; -pmap_t smp_tlb_pmap; -volatile uint32_t smp_tlb_generation; - -#ifdef __amd64__ -#define read_eflags() read_rflags() -#endif - -/* - * Used by pmap to request invalidation of TLB or cache on local and - * remote processors. Mask provides the set of remote CPUs which are - * to be signalled with the IPI specified by vector. The curcpu_cb - * callback is invoked on the calling CPU while waiting for remote - * CPUs to complete the operation. - * - * The callback function is called unconditionally on the caller's - * underlying processor, even when this processor is not set in the - * mask. So, the callback function must be prepared to handle such - * spurious invocations. - */ -static void -smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, - vm_offset_t addr1, vm_offset_t addr2, smp_invl_cb_t curcpu_cb) -{ - cpuset_t other_cpus; - volatile uint32_t *p_cpudone; - uint32_t generation; - int cpu; - - /* - * It is not necessary to signal other CPUs while booting or - * when in the debugger. - */ - if (kdb_active || KERNEL_PANICKED() || !smp_started) { - curcpu_cb(pmap, addr1, addr2); - return; - } - - sched_pin(); - - /* - * Check for other cpus. Return if none. - */ - if (CPU_ISFULLSET(&mask)) { - if (mp_ncpus <= 1) - goto nospinexit; - } else { - CPU_CLR(PCPU_GET(cpuid), &mask); - if (CPU_EMPTY(&mask)) - goto nospinexit; - } - - if (!(read_eflags() & PSL_I)) - panic("%s: interrupts disabled", __func__); - mtx_lock_spin(&smp_ipi_mtx); - smp_tlb_addr1 = addr1; - smp_tlb_addr2 = addr2; - smp_tlb_pmap = pmap; - generation = ++smp_tlb_generation; - if (CPU_ISFULLSET(&mask)) { - ipi_all_but_self(vector); - other_cpus = all_cpus; - CPU_CLR(PCPU_GET(cpuid), &other_cpus); - } else { - other_cpus = mask; - while ((cpu = CPU_FFS(&mask)) != 0) { - cpu--; - CPU_CLR(cpu, &mask); - CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, - cpu, vector); - ipi_send_cpu(cpu, vector); - } - } - curcpu_cb(pmap, addr1, addr2); - while ((cpu = CPU_FFS(&other_cpus)) != 0) { - cpu--; - CPU_CLR(cpu, &other_cpus); - p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done; - while (*p_cpudone != generation) - ia32_pause(); - } - mtx_unlock_spin(&smp_ipi_mtx); - sched_unpin(); - return; - -nospinexit: - curcpu_cb(pmap, addr1, addr2); - sched_unpin(); -} - -void -smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb) -{ - - smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0, curcpu_cb); -#ifdef COUNT_XINVLTLB_HITS - ipi_global++; -#endif -} - -void -smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap, - smp_invl_cb_t curcpu_cb) -{ - - smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0, curcpu_cb); -#ifdef COUNT_XINVLTLB_HITS - ipi_page++; -#endif -} - -void -smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2, - pmap_t pmap, smp_invl_cb_t curcpu_cb) -{ - - smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, addr2, - curcpu_cb); -#ifdef COUNT_XINVLTLB_HITS - ipi_range++; - ipi_range_size += (addr2 - addr1) / PAGE_SIZE; -#endif -} - -void -smp_cache_flush(smp_invl_cb_t curcpu_cb) -{ - - smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 0, 0, - curcpu_cb); -} - -/* - * Handlers for TLB related IPIs - */ -void -invltlb_handler(void) -{ - uint32_t generation; - -#ifdef COUNT_XINVLTLB_HITS - xhits_gbl[PCPU_GET(cpuid)]++; -#endif /* COUNT_XINVLTLB_HITS */ -#ifdef COUNT_IPIS - (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; -#endif /* COUNT_IPIS */ - - /* - * Reading the generation here allows greater parallelism - * since invalidating the TLB is a serializing operation. - */ - generation = smp_tlb_generation; - if (smp_tlb_pmap == kernel_pmap) - invltlb_glob(); -#ifdef __amd64__ - else - invltlb(); -#endif - PCPU_SET(smp_tlb_done, generation); -} - -void -invlpg_handler(void) -{ - uint32_t generation; - -#ifdef COUNT_XINVLTLB_HITS - xhits_pg[PCPU_GET(cpuid)]++; -#endif /* COUNT_XINVLTLB_HITS */ -#ifdef COUNT_IPIS - (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; -#endif /* COUNT_IPIS */ - - generation = smp_tlb_generation; /* Overlap with serialization */ -#ifdef __i386__ - if (smp_tlb_pmap == kernel_pmap) -#endif - invlpg(smp_tlb_addr1); - PCPU_SET(smp_tlb_done, generation); -} - -void -invlrng_handler(void) -{ - vm_offset_t addr, addr2; - uint32_t generation; - -#ifdef COUNT_XINVLTLB_HITS - xhits_rng[PCPU_GET(cpuid)]++; -#endif /* COUNT_XINVLTLB_HITS */ -#ifdef COUNT_IPIS - (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; -#endif /* COUNT_IPIS */ - - addr = smp_tlb_addr1; - addr2 = smp_tlb_addr2; - generation = smp_tlb_generation; /* Overlap with serialization */ -#ifdef __i386__ - if (smp_tlb_pmap == kernel_pmap) -#endif - do { - invlpg(addr); - addr += PAGE_SIZE; - } while (addr < addr2); - - PCPU_SET(smp_tlb_done, generation); -} diff --git a/sys/x86/xen/xen_apic.c b/sys/x86/xen/xen_apic.c index 8bf2158dbec2..7d23f0a50417 100644 --- a/sys/x86/xen/xen_apic.c +++ b/sys/x86/xen/xen_apic.c @@ -1,627 +1,573 @@ /* * Copyright (c) 2014 Roger Pau Monné * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*--------------------------------- Macros -----------------------------------*/ #define XEN_APIC_UNSUPPORTED \ panic("%s: not available in Xen PV port.", __func__) /*--------------------------- Forward Declarations ---------------------------*/ #ifdef SMP static driver_filter_t xen_smp_rendezvous_action; +#ifdef __amd64__ +static driver_filter_t xen_invlop; +#else static driver_filter_t xen_invltlb; static driver_filter_t xen_invlpg; static driver_filter_t xen_invlrng; static driver_filter_t xen_invlcache; +#endif static driver_filter_t xen_ipi_bitmap_handler; static driver_filter_t xen_cpustop_handler; static driver_filter_t xen_cpususpend_handler; #endif /*---------------------------------- Macros ----------------------------------*/ #define IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS) /*--------------------------------- Xen IPIs ---------------------------------*/ #ifdef SMP struct xen_ipi_handler { driver_filter_t *filter; const char *description; }; static struct xen_ipi_handler xen_ipis[] = { [IPI_TO_IDX(IPI_RENDEZVOUS)] = { xen_smp_rendezvous_action, "r" }, +#ifdef __amd64__ + [IPI_TO_IDX(IPI_INVLOP)] = { xen_invlop, "itlb"}, +#else [IPI_TO_IDX(IPI_INVLTLB)] = { xen_invltlb, "itlb"}, [IPI_TO_IDX(IPI_INVLPG)] = { xen_invlpg, "ipg" }, [IPI_TO_IDX(IPI_INVLRNG)] = { xen_invlrng, "irg" }, [IPI_TO_IDX(IPI_INVLCACHE)] = { xen_invlcache, "ic" }, +#endif [IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler, "b" }, [IPI_TO_IDX(IPI_STOP)] = { xen_cpustop_handler, "st" }, [IPI_TO_IDX(IPI_SUSPEND)] = { xen_cpususpend_handler, "sp" }, }; #endif /*------------------------------- Per-CPU Data -------------------------------*/ #ifdef SMP DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]); #endif /*------------------------------- Xen PV APIC --------------------------------*/ static void xen_pv_lapic_create(u_int apic_id, int boot_cpu) { #ifdef SMP cpu_add(apic_id, boot_cpu); #endif } static void xen_pv_lapic_init(vm_paddr_t addr) { } static void xen_pv_lapic_setup(int boot) { } static void xen_pv_lapic_dump(const char *str) { printf("cpu%d %s XEN PV LAPIC\n", PCPU_GET(cpuid), str); } static void xen_pv_lapic_disable(void) { } static bool xen_pv_lapic_is_x2apic(void) { return (false); } static void xen_pv_lapic_eoi(void) { XEN_APIC_UNSUPPORTED; } static int xen_pv_lapic_id(void) { return (PCPU_GET(apic_id)); } static int xen_pv_lapic_intr_pending(u_int vector) { XEN_APIC_UNSUPPORTED; return (0); } static u_int xen_pv_apic_cpuid(u_int apic_id) { #ifdef SMP return (apic_cpuids[apic_id]); #else return (0); #endif } static u_int xen_pv_apic_alloc_vector(u_int apic_id, u_int irq) { XEN_APIC_UNSUPPORTED; return (0); } static u_int xen_pv_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) { XEN_APIC_UNSUPPORTED; return (0); } static void xen_pv_apic_disable_vector(u_int apic_id, u_int vector) { XEN_APIC_UNSUPPORTED; } static void xen_pv_apic_enable_vector(u_int apic_id, u_int vector) { XEN_APIC_UNSUPPORTED; } static void xen_pv_apic_free_vector(u_int apic_id, u_int vector, u_int irq) { XEN_APIC_UNSUPPORTED; } static void xen_pv_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) { XEN_APIC_UNSUPPORTED; } static int xen_pv_lapic_enable_pmc(void) { XEN_APIC_UNSUPPORTED; return (0); } static void xen_pv_lapic_disable_pmc(void) { XEN_APIC_UNSUPPORTED; } static void xen_pv_lapic_reenable_pmc(void) { XEN_APIC_UNSUPPORTED; } static void xen_pv_lapic_enable_cmc(void) { } #ifdef SMP static void xen_pv_lapic_ipi_raw(register_t icrlo, u_int dest) { XEN_APIC_UNSUPPORTED; } #define PCPU_ID_GET(id, field) (pcpu_find(id)->pc_##field) static void send_nmi(int dest) { unsigned int cpu; /* * NMIs are not routed over event channels, and instead delivered as on * native using the exception vector (#2). Triggering them can be done * using the local APIC, or an hypercall as a shortcut like it's done * below. */ switch(dest) { case APIC_IPI_DEST_SELF: HYPERVISOR_vcpu_op(VCPUOP_send_nmi, PCPU_GET(vcpu_id), NULL); break; case APIC_IPI_DEST_ALL: CPU_FOREACH(cpu) HYPERVISOR_vcpu_op(VCPUOP_send_nmi, PCPU_ID_GET(cpu, vcpu_id), NULL); break; case APIC_IPI_DEST_OTHERS: CPU_FOREACH(cpu) if (cpu != PCPU_GET(cpuid)) HYPERVISOR_vcpu_op(VCPUOP_send_nmi, PCPU_ID_GET(cpu, vcpu_id), NULL); break; default: HYPERVISOR_vcpu_op(VCPUOP_send_nmi, PCPU_ID_GET(apic_cpuid(dest), vcpu_id), NULL); break; } } #undef PCPU_ID_GET static void xen_pv_lapic_ipi_vectored(u_int vector, int dest) { xen_intr_handle_t *ipi_handle; int ipi_idx, to_cpu, self; if (vector >= IPI_NMI_FIRST) { send_nmi(dest); return; } ipi_idx = IPI_TO_IDX(vector); if (ipi_idx >= nitems(xen_ipis)) panic("IPI out of range"); switch(dest) { case APIC_IPI_DEST_SELF: ipi_handle = DPCPU_GET(ipi_handle); xen_intr_signal(ipi_handle[ipi_idx]); break; case APIC_IPI_DEST_ALL: CPU_FOREACH(to_cpu) { ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); xen_intr_signal(ipi_handle[ipi_idx]); } break; case APIC_IPI_DEST_OTHERS: self = PCPU_GET(cpuid); CPU_FOREACH(to_cpu) { if (to_cpu != self) { ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); xen_intr_signal(ipi_handle[ipi_idx]); } } break; default: to_cpu = apic_cpuid(dest); ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); xen_intr_signal(ipi_handle[ipi_idx]); break; } } static int xen_pv_lapic_ipi_wait(int delay) { XEN_APIC_UNSUPPORTED; return (0); } #endif /* SMP */ static int xen_pv_lapic_ipi_alloc(inthand_t *ipifunc) { XEN_APIC_UNSUPPORTED; return (-1); } static void xen_pv_lapic_ipi_free(int vector) { XEN_APIC_UNSUPPORTED; } static int xen_pv_lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked) { XEN_APIC_UNSUPPORTED; return (0); } static int xen_pv_lapic_set_lvt_mode(u_int apic_id, u_int lvt, uint32_t mode) { XEN_APIC_UNSUPPORTED; return (0); } static int xen_pv_lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol) { XEN_APIC_UNSUPPORTED; return (0); } static int xen_pv_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger) { XEN_APIC_UNSUPPORTED; return (0); } /* Xen apic_ops implementation */ struct apic_ops xen_apic_ops = { .create = xen_pv_lapic_create, .init = xen_pv_lapic_init, .xapic_mode = xen_pv_lapic_disable, .is_x2apic = xen_pv_lapic_is_x2apic, .setup = xen_pv_lapic_setup, .dump = xen_pv_lapic_dump, .disable = xen_pv_lapic_disable, .eoi = xen_pv_lapic_eoi, .id = xen_pv_lapic_id, .intr_pending = xen_pv_lapic_intr_pending, .set_logical_id = xen_pv_lapic_set_logical_id, .cpuid = xen_pv_apic_cpuid, .alloc_vector = xen_pv_apic_alloc_vector, .alloc_vectors = xen_pv_apic_alloc_vectors, .enable_vector = xen_pv_apic_enable_vector, .disable_vector = xen_pv_apic_disable_vector, .free_vector = xen_pv_apic_free_vector, .enable_pmc = xen_pv_lapic_enable_pmc, .disable_pmc = xen_pv_lapic_disable_pmc, .reenable_pmc = xen_pv_lapic_reenable_pmc, .enable_cmc = xen_pv_lapic_enable_cmc, #ifdef SMP .ipi_raw = xen_pv_lapic_ipi_raw, .ipi_vectored = xen_pv_lapic_ipi_vectored, .ipi_wait = xen_pv_lapic_ipi_wait, #endif .ipi_alloc = xen_pv_lapic_ipi_alloc, .ipi_free = xen_pv_lapic_ipi_free, .set_lvt_mask = xen_pv_lapic_set_lvt_mask, .set_lvt_mode = xen_pv_lapic_set_lvt_mode, .set_lvt_polarity = xen_pv_lapic_set_lvt_polarity, .set_lvt_triggermode = xen_pv_lapic_set_lvt_triggermode, }; #ifdef SMP /*---------------------------- XEN PV IPI Handlers ---------------------------*/ /* * These are C clones of the ASM functions found in apic_vector. */ static int xen_ipi_bitmap_handler(void *arg) { struct trapframe *frame; frame = arg; ipi_bitmap_handler(*frame); return (FILTER_HANDLED); } static int xen_smp_rendezvous_action(void *arg) { #ifdef COUNT_IPIS (*ipi_rendezvous_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ smp_rendezvous_action(); return (FILTER_HANDLED); } -static int -xen_invltlb(void *arg) -{ - - invltlb_handler(); - return (FILTER_HANDLED); -} - #ifdef __amd64__ static int -xen_invltlb_invpcid(void *arg) -{ - - invltlb_invpcid_handler(); - return (FILTER_HANDLED); -} - -static int -xen_invltlb_pcid(void *arg) +xen_invlop(void *arg) { - invltlb_pcid_handler(); + invlop_handler(); return (FILTER_HANDLED); } -static int -xen_invltlb_invpcid_pti(void *arg) -{ - - invltlb_invpcid_pti_handler(); - return (FILTER_HANDLED); -} +#else /* __i386__ */ static int -xen_invlpg_invpcid_handler(void *arg) -{ - - invlpg_invpcid_handler(); - return (FILTER_HANDLED); -} - -static int -xen_invlpg_pcid_handler(void *arg) -{ - - invlpg_pcid_handler(); - return (FILTER_HANDLED); -} - -static int -xen_invlrng_invpcid_handler(void *arg) -{ - - invlrng_invpcid_handler(); - return (FILTER_HANDLED); -} - -static int -xen_invlrng_pcid_handler(void *arg) +xen_invltlb(void *arg) { - invlrng_pcid_handler(); + invltlb_handler(); return (FILTER_HANDLED); } -#endif static int xen_invlpg(void *arg) { invlpg_handler(); return (FILTER_HANDLED); } static int xen_invlrng(void *arg) { invlrng_handler(); return (FILTER_HANDLED); } static int xen_invlcache(void *arg) { invlcache_handler(); return (FILTER_HANDLED); } +#endif /* __amd64__ */ static int xen_cpustop_handler(void *arg) { cpustop_handler(); return (FILTER_HANDLED); } static int xen_cpususpend_handler(void *arg) { cpususpend_handler(); return (FILTER_HANDLED); } /*----------------------------- XEN PV IPI setup -----------------------------*/ /* * Those functions are provided outside of the Xen PV APIC implementation * so PVHVM guests can also use PV IPIs without having an actual Xen PV APIC, * because on PVHVM there's an emulated LAPIC provided by Xen. */ static void xen_cpu_ipi_init(int cpu) { xen_intr_handle_t *ipi_handle; const struct xen_ipi_handler *ipi; int idx, rc; ipi_handle = DPCPU_ID_GET(cpu, ipi_handle); for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) { if (ipi->filter == NULL) { ipi_handle[idx] = NULL; continue; } rc = xen_intr_alloc_and_bind_ipi(cpu, ipi->filter, INTR_TYPE_TTY, &ipi_handle[idx]); if (rc != 0) panic("Unable to allocate a XEN IPI port"); xen_intr_describe(ipi_handle[idx], "%s", ipi->description); } } static void xen_setup_cpus(void) { int i; if (!xen_vector_callback_enabled) return; -#ifdef __amd64__ - if (pmap_pcid_enabled) { - if (pti) - xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter = - invpcid_works ? xen_invltlb_invpcid_pti : - xen_invltlb_pcid; - else - xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter = - invpcid_works ? xen_invltlb_invpcid : - xen_invltlb_pcid; - xen_ipis[IPI_TO_IDX(IPI_INVLPG)].filter = invpcid_works ? - xen_invlpg_invpcid_handler : xen_invlpg_pcid_handler; - xen_ipis[IPI_TO_IDX(IPI_INVLRNG)].filter = invpcid_works ? - xen_invlrng_invpcid_handler : xen_invlrng_pcid_handler; - } -#endif CPU_FOREACH(i) xen_cpu_ipi_init(i); /* Set the xen pv ipi ops to replace the native ones */ if (xen_hvm_domain()) apic_ops.ipi_vectored = xen_pv_lapic_ipi_vectored; } /* Switch to using PV IPIs as soon as the vcpu_id is set. */ SYSINIT(xen_setup_cpus, SI_SUB_SMP, SI_ORDER_SECOND, xen_setup_cpus, NULL); #endif /* SMP */