diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 114d52e5a8f2..8d73717b03b3 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -1,307 +1,257 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 2014-2018 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by
  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: vector.s, 386BSD 0.1 unknown origin
  * $FreeBSD$
  */
 
 /*
  * Interrupt entry points for external interrupts triggered by I/O APICs
  * as well as IPI handlers.
  */
 
 #include "opt_smp.h"
 
 #include "assym.inc"
 
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 #include <x86/apicreg.h>
 
 #ifdef SMP
 #define LK	lock ;
 #else
 #define LK
 #endif
 
 	.text
 	SUPERALIGN_TEXT
 	/* End Of Interrupt to APIC */
 as_lapic_eoi:
 	cmpl	$0,x2apic_mode
 	jne	1f
 	movq	lapic_map,%rax
 	movl	$0,LA_EOI(%rax)
 	ret
 1:
 	movl	$MSR_APIC_EOI,%ecx
 	xorl	%eax,%eax
 	xorl	%edx,%edx
 	wrmsr
 	ret
 
 /*
  * I/O Interrupt Entry Point.  Rather than having one entry point for
  * each interrupt source, we use one entry point for each 32-bit word
  * in the ISR.  The handler determines the highest bit set in the ISR,
  * translates that into a vector, and passes the vector to the
  * lapic_handle_intr() function.
  */
 	.macro	ISR_VEC	index, vec_name
 	INTR_HANDLER	\vec_name
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	cmpl	$0,x2apic_mode
 	je	1f
 	movl	$(MSR_APIC_ISR0 + \index),%ecx
 	rdmsr
 	jmp	2f
 1:
 	movq	lapic_map, %rdx		/* pointer to local APIC */
 	movl	LA_ISR + 16 * (\index)(%rdx), %eax	/* load ISR */
 2:
 	bsrl	%eax, %eax	/* index of highest set bit in ISR */
 	jz	3f
 	addl	$(32 * \index),%eax
 	movq	%rsp, %rsi
 	movl	%eax, %edi	/* pass the IRQ */
 	call	lapic_handle_intr
 3:
 	MEXITCOUNT
 	jmp	doreti
 	.endm
 
 /*
  * Handle "spurious INTerrupts".
  * Notes:
  *  This is different than the "spurious INTerrupt" generated by an
  *   8259 PIC for missing INTs.  See the APIC documentation for details.
  *  This routine should NOT do an 'EOI' cycle.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(spuriousint)
 	/* No EOI cycle used here */
 	jmp	doreti_iret
 
 	ISR_VEC	1, apic_isr1
 	ISR_VEC	2, apic_isr2
 	ISR_VEC	3, apic_isr3
 	ISR_VEC	4, apic_isr4
 	ISR_VEC	5, apic_isr5
 	ISR_VEC	6, apic_isr6
 	ISR_VEC	7, apic_isr7
 
 /*
  * Local APIC periodic timer handler.
  */
 	INTR_HANDLER	timerint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	lapic_handle_timer
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Local APIC CMCI handler.
  */
 	INTR_HANDLER cmcint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_cmc
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Local APIC error interrupt handler.
  */
 	INTR_HANDLER errorint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_error
 	MEXITCOUNT
 	jmp	doreti
 
 #ifdef XENHVM
 /*
  * Xen event channel upcall interrupt handler.
  * Only used when the hypervisor supports direct vector callbacks.
  */
 	INTR_HANDLER xen_intr_upcall
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	xen_intr_handle_upcall
 	MEXITCOUNT
 	jmp	doreti
 #endif
 
 #ifdef SMP
 /*
  * Global address space TLB shootdown.
  */
 	.text
 
 	SUPERALIGN_TEXT
-invltlb_ret:
-	call	as_lapic_eoi
-	jmp	ld_regs
-
-	SUPERALIGN_TEXT
-	INTR_HANDLER invltlb
-	call	invltlb_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invltlb_pcid
-	call	invltlb_pcid_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invltlb_invpcid_nopti
-	call	invltlb_invpcid_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invltlb_invpcid_pti
-	call	invltlb_invpcid_pti_handler
-	jmp	invltlb_ret
-
-/*
- * Single page TLB shootdown
- */
-	INTR_HANDLER invlpg
-	call	invlpg_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invlpg_invpcid
-	call	invlpg_invpcid_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invlpg_pcid
-	call	invlpg_pcid_handler
-	jmp	invltlb_ret
-
-/*
- * Page range TLB shootdown.
- */
-	INTR_HANDLER invlrng
-	call	invlrng_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invlrng_invpcid
-	call	invlrng_invpcid_handler
-	jmp	invltlb_ret
-
-	INTR_HANDLER invlrng_pcid
-	call	invlrng_pcid_handler
-	jmp	invltlb_ret
-
 /*
- * Invalidate cache.
+ * Common IPI handler for cache and TLB shootdown requests.
  */
-	INTR_HANDLER invlcache
-	call	invlcache_handler
-	jmp	invltlb_ret
+	INTR_HANDLER invlop
+	call	invlop_handler		/* do the shootdown work */
+	call	as_lapic_eoi		/* EOI only after the handler returns */
+	jmp	ld_regs			/* exits via ld_regs, not doreti */
 
 /*
  * Handler for IPIs sent via the per-cpu IPI bitmap.
  */
 	INTR_HANDLER ipi_intr_bitmap_handler
 	call	as_lapic_eoi
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	ipi_bitmap_handler
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives an IPI_STOP from another CPU.
  */
 	INTR_HANDLER cpustop
 	call	as_lapic_eoi
 	call	cpustop_handler
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
  */
 	INTR_HANDLER cpususpend
 	call	cpususpend_handler
 	call	as_lapic_eoi
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU.
  *
  * - Calls the generic rendezvous action function.
  */
 	INTR_HANDLER rendezvous
 #ifdef COUNT_IPIS
 	movl	PCPU(CPUID), %eax
 	movq	ipi_rendezvous_counts(,%rax,8), %rax
 	incq	(%rax)
 #endif
 	call	smp_rendezvous_action
 	call	as_lapic_eoi
 	jmp	doreti
 
 /*
  * IPI handler whose purpose is to interrupt the CPU with minimum overhead.
  * This is used by bhyve to force a host cpu executing in guest context to
  * trap into the hypervisor.
  *
  * This handler is different from other IPI handlers in the following aspects:
  *
  * 1. It doesn't push a trapframe on the stack.
  *
  * This implies that a DDB backtrace involving 'justreturn' will skip the
  * function that was interrupted by this handler.
  *
  * 2. It doesn't 'swapgs' when userspace is interrupted.
  *
  * The 'justreturn' handler does not access any pcpu data so it is not an
  * issue. Moreover the 'justreturn' handler can only be interrupted by an NMI
  * whose handler already doesn't trust GS.base when kernel code is interrupted.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(justreturn)
 	pushq	%rax
 	pushq	%rcx
 	pushq	%rdx
 	call	as_lapic_eoi
 	popq	%rdx
 	popq	%rcx
 	popq	%rax
 	jmp	doreti_iret
 
 	INTR_HANDLER	justreturn1
 	call	as_lapic_eoi
 	jmp	doreti
 
 #endif /* SMP */
diff --git a/sys/amd64/amd64/db_interface.c b/sys/amd64/amd64/db_interface.c
index 4645169af562..e35248b07663 100644
--- a/sys/amd64/amd64/db_interface.c
+++ b/sys/amd64/amd64/db_interface.c
@@ -1,111 +1,110 @@
 /*-
  * Mach Operating System
  * Copyright (c) 1991,1990 Carnegie Mellon University
  * All Rights Reserved.
  *
  * Permission to use, copy, modify and distribute this software and its
  * documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Interface to new debugger.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kdb.h>
 #include <sys/pcpu.h>
 
 #include <machine/cpufunc.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
 #include <ddb/ddb.h>
 
 /*
  * Read bytes from kernel address space for debugger.
  */
 int
 db_read_bytes(vm_offset_t addr, size_t size, char *data)
 {
 	jmp_buf jb;
 	void *prev_jb;
 	char *src;
 	int ret;
 
 	prev_jb = kdb_jmpbuf(jb);
 	ret = setjmp(jb);
 	if (ret == 0) {
 		src = (char *)addr;
 		while (size-- > 0)
 			*data++ = *src++;
 	}
 	(void)kdb_jmpbuf(prev_jb);
 	return (ret);
 }
 
 /*
  * Write bytes to kernel address space for debugger.
  * We need to disable write protection temporarily so we can write
  * things (such as break points) that might be in write-protected
  * memory.
  */
 int
 db_write_bytes(vm_offset_t addr, size_t size, char *data)
 {
 	jmp_buf jb;
 	void *prev_jb;
 	char *dst;
 	bool old_wp;
 	int ret;
 
 	old_wp = false;
 	prev_jb = kdb_jmpbuf(jb);
 	ret = setjmp(jb);
 	if (ret == 0) {
 		old_wp = disable_wp();
 		dst = (char *)addr;
 		while (size-- > 0)
 			*dst++ = *data++;
 	}
 	restore_wp(old_wp);
 	(void)kdb_jmpbuf(prev_jb);
 	return (ret);
 }
 
 void
 db_show_mdpcpu(struct pcpu *pc)
 {
 
 	db_printf("self         = %p\n", pc->pc_prvspace);
 	db_printf("curpmap      = %p\n", pc->pc_curpmap);
 	db_printf("tssp         = %p\n", pc->pc_tssp);
 	db_printf("rsp0         = 0x%lx\n", pc->pc_rsp0);
 	db_printf("kcr3         = 0x%lx\n", pc->pc_kcr3);
 	db_printf("ucr3         = 0x%lx\n", pc->pc_ucr3);
 	db_printf("scr3         = 0x%lx\n", pc->pc_saved_ucr3);
 	db_printf("gs32p        = %p\n", pc->pc_gs32p);
 	db_printf("ldt          = %p\n", pc->pc_ldt);
 	db_printf("tss          = %p\n", pc->pc_tss);
-	db_printf("tlb gen      = %u\n", pc->pc_smp_tlb_done);
 }
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 4fb846d75948..1a07080c5daf 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1,2814 +1,2815 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_atpic.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_pci.h"
 #include "opt_platform.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/csan.h>
 #include <sys/efi.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 #include <vm/vm_phys.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <net/netisr.h>
 
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/mp_watchdog.h>
 #include <machine/pc/bios.h>
 #include <machine/pcb.h>
 #include <machine/proc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #include <machine/trap.h>
 #include <machine/tss.h>
 #include <x86/ucode.h>
 #include <x86/ifunc.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #ifdef FDT
 #include <x86/fdt.h>
 #endif
 
 #ifdef DEV_ATPIC
 #include <x86/isa/icu.h>
 #else
 #include <x86/apicvar.h>
 #endif
 
 #include <isa/isareg.h>
 #include <isa/rtc.h>
 #include <x86/init.h>
 
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
 /*
  * The PTI trampoline stack needs enough space for a hardware trapframe and a
  * couple of scratch registers, as well as the trapframe left behind after an
  * iret fault.
  */
 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
     offsetof(struct pti_frame, pti_rip));
 
 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
 
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 
 static void cpu_startup(void *);
 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpusave, size_t xfpusave_len);
 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpustate, size_t xfpustate_len);
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 /* Preload data parse function */
 static caddr_t native_parse_preload_data(u_int64_t);
 
 /* Native function to fetch and parse the e820 map */
 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
 
 /* Default init_ops implementation. */
 struct init_ops init_ops = {
 	.parse_preload_data =	native_parse_preload_data,
 	.early_clock_source_init =	i8254_init,
 	.early_delay =			i8254_delay,
 	.parse_memmap =			native_parse_memmap,
 #ifdef SMP
 	.mp_bootaddress =		mp_bootaddress,
 	.start_all_aps =		native_start_all_aps,
 #endif
 #ifdef DEV_PCI
 	.msi_init =			msi_init,
 #endif
 };
 
 /*
  * Physical address of the EFI System Table. Stashed from the metadata hints
  * passed into the kernel and used by the EFI code to call runtime services.
  */
 vm_paddr_t efi_systbl_phys;
 
 /* Intel ICH registers */
 #define ICH_PMBASE	0x400
 #define ICH_SMI_EN	ICH_PMBASE + 0x30
 
 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
 
 int cold = 1;
 
 long Maxmem = 0;
 long realmem = 0;
 
 struct kva_md_info kmi;
 
 static struct trapframe proc0_tf;
 struct region_descriptor r_idt;
 
 struct pcpu *__pcpu;
 struct pcpu temp_bsp_pcpu;
 
 struct mtx icu_lock;
 
 struct mem_range_softc mem_range_softc;
 
 struct mtx dt_lock;	/* lock for GDT and LDT */
 
 void (*vmm_resume_p)(void);
 
 static void
 cpu_startup(dummy)
 	void *dummy;
 {
 	uintmax_t memsize;
 	char *sysenv;
 
 	/*
 	 * On MacBooks, we need to disallow the legacy USB circuit to
 	 * generate an SMI# because this can cause several problems,
 	 * namely: incorrect CPU frequency detection and failure to
 	 * start the APs.
 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
 	 * Enable register) of the Intel ICH LPC Interface Bridge. 
 	 */
 	sysenv = kern_getenv("smbios.system.product");
 	if (sysenv != NULL) {
 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
 			if (bootverbose)
 				printf("Disabling LEGACY_USB_EN bit on "
 				    "Intel ICH.\n");
 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
 		}
 		freeenv(sysenv);
 	}
 
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
 	startrtclock();
 	printcpuinfo();
 
 	/*
 	 * Display physical memory if SMBIOS reports reasonable amount.
 	 */
 	memsize = 0;
 	sysenv = kern_getenv("smbios.memory.enabled");
 	if (sysenv != NULL) {
 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
 		freeenv(sysenv);
 	}
 	if (memsize < ptoa((uintmax_t)vm_free_count()))
 		memsize = ptoa((uintmax_t)Maxmem);
 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
 	realmem = atop(memsize);
 
 	/*
 	 * Display any holes after the first chunk of extended memory.
 	 */
 	if (bootverbose) {
 		int indx;
 
 		printf("Physical memory chunk(s):\n");
 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
 			vm_paddr_t size;
 
 			size = phys_avail[indx + 1] - phys_avail[indx];
 			printf(
 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
 			    (uintmax_t)phys_avail[indx],
 			    (uintmax_t)phys_avail[indx + 1] - 1,
 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ju (%ju MB)\n",
 	    ptoa((uintmax_t)vm_free_count()),
 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
 #ifdef DEV_PCI
 	if (bootverbose && intel_graphics_stolen_base != 0)
 		printf("intel stolen mem: base %#jx size %ju MB\n",
 		    (uintmax_t)intel_graphics_stolen_base,
 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
 #endif
 
 	/*
 	 * Set up buffers, so they can be used to read disk labels.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 
 	cpu_setregs();
 }
 
 static void
 late_ifunc_resolve(void *dummy __unused)
 {
 	link_elf_late_ireloc();
 }
 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by call
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct pcb *pcb;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	char *xfpusave;
 	size_t xfpusave_len;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	pcb = td->td_pcb;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 		xfpusave = __builtin_alloca(xfpusave_len);
 	} else {
 		xfpusave_len = 0;
 		xfpusave = NULL;
 	}
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
 	fpstate_drop(td);
 	update_pcb_bases(pcb);
 	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
 	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
 	bzero(sf.sf_uc.uc_mcontext.mc_spare,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		sp = (char *)regs->tf_rsp - 128;
 	if (xfpusave != NULL) {
 		sp -= xfpusave_len;
 		sp = (char *)((unsigned long)sp & ~0x3Ful);
 		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
 	}
 	sp -= sizeof(struct sigframe);
 	/* Align to 16 bytes. */
 	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
 
 	/* Build the argument list for the signal handler. */
 	regs->tf_rdi = sig;			/* arg 1 in %rdi */
 	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
 	    (xfpusave != NULL && copyout(xfpusave,
 	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
 	    != 0)) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	regs->tf_rsp = (long)sfp;
 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_ss = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  *
  * MPSAFE
  */
 int
 sys_sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
 		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
 	struct pcb *pcb;
 	struct proc *p;
 	struct trapframe *regs;
 	ucontext_t *ucp;
 	char *xfpustate;
 	size_t xfpustate_len;
 	long rflags;
 	int cs, error, ret;
 	ksiginfo_t ksi;
 
 	pcb = td->td_pcb;
 	p = td->td_proc;
 
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0) {
 		uprintf("pid %d (%s): sigreturn copyin failed\n",
 		    p->p_pid, td->td_name);
 		return (error);
 	}
 	ucp = &uc;
 	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
 		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
 		    td->td_name, ucp->uc_mcontext.mc_flags);
 		return (EINVAL);
 	}
 	regs = td->td_frame;
 	rflags = ucp->uc_mcontext.mc_rflags;
 	/*
 	 * Don't allow users to change privileged or reserved flags.
 	 */
 	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
 		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
 		    td->td_name, rflags);
 		return (EINVAL);
 	}
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 	cs = ucp->uc_mcontext.mc_cs;
 	if (!CS_SECURE(cs)) {
 		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
 		    td->td_name, cs);
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 
 	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
 		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
 		if (xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu)) {
 			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
 			    p->p_pid, td->td_name, xfpustate_len);
 			return (EINVAL);
 		}
 		xfpustate = __builtin_alloca(xfpustate_len);
 		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
 		    xfpustate, xfpustate_len);
 		if (error != 0) {
 			uprintf(
 	"pid %d (%s): sigreturn copying xfpustate failed\n",
 			    p->p_pid, td->td_name);
 			return (error);
 		}
 	} else {
 		xfpustate = NULL;
 		xfpustate_len = 0;
 	}
 	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
 	if (ret != 0) {
 		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
 		    p->p_pid, td->td_name, ret);
 		return (ret);
 	}
 	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
 	update_pcb_bases(pcb);
 	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
 	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
 
 #if defined(COMPAT_43)
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
  
 	return sys_sigreturn(td, (struct sigreturn_args *)uap);
 }
 #endif
 
 /*
  * Reset registers to default values on exec.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
 {
 	struct trapframe *regs;
 	struct pcb *pcb;
 	register_t saved_rflags;
 
 	regs = td->td_frame;
 	pcb = td->td_pcb;
 
 	if (td->td_proc->p_md.md_ldt != NULL)
 		user_ldt_free(td);
 
 	update_pcb_bases(pcb);
 	pcb->pcb_fsbase = 0;
 	pcb->pcb_gsbase = 0;
 	clear_pcb_flags(pcb, PCB_32BIT);
 	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
 
 	saved_rflags = regs->tf_rflags & PSL_T;
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_rip = imgp->entry_addr;
 	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
 	regs->tf_rdi = stack;		/* argv */
 	regs->tf_rflags = PSL_USER | saved_rflags;
 	regs->tf_ss = _udatasel;
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 
 	/*
 	 * Reset the hardware debug registers if they were in use.
 	 * They won't have any meaning for the newly exec'd process.
 	 */
 	if (pcb->pcb_flags & PCB_DBREGS) {
 		pcb->pcb_dr0 = 0;
 		pcb->pcb_dr1 = 0;
 		pcb->pcb_dr2 = 0;
 		pcb->pcb_dr3 = 0;
 		pcb->pcb_dr6 = 0;
 		pcb->pcb_dr7 = 0;
 		if (pcb == curpcb) {
 			/*
 			 * Clear the debug registers on the running
 			 * CPU, otherwise they will end up affecting
 			 * the next process we switch to.
 			 */
 			reset_dbregs();
 		}
 		clear_pcb_flags(pcb, PCB_DBREGS);
 	}
 
 	/*
 	 * Drop the FP state if we hold it, so that the process gets a
 	 * clean FP state if it uses the FPU again.
 	 */
 	fpstate_drop(td);
 }
 
 void
 cpu_setregs(void)
 {
 	register_t cr0;
 
 	cr0 = rcr0();
 	/*
 	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
 	 * BSP.  See the comments there about why we set them.
 	 */
 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
 	load_cr0(cr0);
 }
 
 /*
  * Initialize amd64 and configure to run kernel
  */
 
 /*
  * Initialize segments & interrupt table
  */
 static struct gate_descriptor idt0[NIDT];
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
 static char mce0_stack[PAGE_SIZE] __aligned(16);
 static char nmi0_stack[PAGE_SIZE] __aligned(16);
 static char dbg0_stack[PAGE_SIZE] __aligned(16);
 CTASSERT(sizeof(struct nmi_pcpu) == 16);
 
 /*
  * Software prototypes -- in more palatable form.
  *
  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
  * slots as corresponding segments for i386 kernel.
  *
  * hammer_time() converts the memory-segment entries with ssdtosd() and
  * the GPROC0_SEL entry with ssdtosyssd() when it builds the boot
  * processor's GDT.  The array length is checked against NGDT below.
  */
 struct soft_segment_descriptor gdt_segs[] = {
 /* GNULL_SEL	0 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GNULL2_SEL	1 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUFS32_SEL	2 32 bit %fs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUGS32_SEL	3 32 bit %gs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GCODE_SEL	4 Code Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GDATA_SEL	5 Data Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
 	.ssd_type = SDT_SYSTSS,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* 10: Actually, the TSS is a system descriptor which is double size */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	11 LDT Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	12 LDT Descriptor, double size */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 };
 /* Catch a table that has drifted out of sync with the NGDT constant. */
 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
 
 /*
  * Program IDT slot 'idx' as a present gate that enters 'func' through the
  * kernel code selector.
  *
  *	idx	index of the slot in idt[]
  *	func	handler entry point
  *	typ	gate type, stored in gd_type
  *	dpl	privilege level, stored in gd_dpl
  *	ist	interrupt stack table index, stored in gd_ist
  */
 void
 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
 {
 	struct gate_descriptor *gate = &idt[idx];
 	const uintptr_t handler = (uintptr_t)func;
 
 	/*
 	 * The handler address is split across two fields: gd_looffset keeps
 	 * whatever low bits fit the field, gd_hioffset takes bits 16 and up.
 	 */
 	gate->gd_looffset = handler;
 	gate->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
 	gate->gd_ist = ist;
 	gate->gd_xx = 0;
 	gate->gd_type = typ;
 	gate->gd_dpl = dpl;
 	gate->gd_p = 1;
 	gate->gd_hioffset = handler >> 16;
 }
 
 /*
  * Entry points defined outside this file and referenced below when the IDT
  * and the fast-syscall MSRs are set up.  Where a "_pti" variant exists,
  * hammer_time() and amd64_conf_fast_syscall() install it instead of the
  * plain one when 'pti' is non-zero.
  */
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm), IDTVEC(dblfault),
 	IDTVEC(div_pti), IDTVEC(bpt_pti),
 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
 	IDTVEC(xmm_pti),
 #ifdef KDTRACE_HOOKS
 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
 #endif
 #ifdef XENHVM
 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
 #endif
 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
 	IDTVEC(fast_syscall_pti);
 
 #ifdef DDB
 /*
  * Display the index and function name of any IDT entries that don't use
  * the default 'rsvd' entry point.
  */
 DB_SHOW_COMMAND(idt, db_show_idt)
 {
 	struct gate_descriptor *ip;
 	int idx;
 	uintptr_t func;
 
 	ip = idt;
 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
 			db_printf("%3d\t", idx);
 			db_printsym(func, DB_STGY_PROC);
 			db_printf("\n");
 		}
 		ip++;
 	}
 }
 
 /*
  * Show privileged registers: the descriptor-table registers, the task
  * register, the control registers and a handful of MSRs.  Every value is
  * printed as a zero-padded hexadecimal number with a "0x" prefix.
  */
 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
 {
 	/* Memory image stored by sidt/sgdt: 16-bit limit, then 64-bit base. */
 	struct {
 		uint16_t limit;
 		uint64_t base;
 	} __packed idtr, gdtr;
 	uint16_t ldt, tr;
 
 	__asm __volatile("sidt %0" : "=m" (idtr));
 	db_printf("idtr\t0x%016lx/%04x\n",
 	    (u_long)idtr.base, (u_int)idtr.limit);
 	__asm __volatile("sgdt %0" : "=m" (gdtr));
 	db_printf("gdtr\t0x%016lx/%04x\n",
 	    (u_long)gdtr.base, (u_int)gdtr.limit);
 	__asm __volatile("sldt %0" : "=r" (ldt));
 	db_printf("ldtr\t0x%04x\n", ldt);
 	__asm __volatile("str %0" : "=r" (tr));
 	db_printf("tr\t0x%04x\n", tr);
 	db_printf("cr0\t0x%016lx\n", rcr0());
 	db_printf("cr2\t0x%016lx\n", rcr2());
 	db_printf("cr3\t0x%016lx\n", rcr3());
 	db_printf("cr4\t0x%016lx\n", rcr4());
 	/* xcr0 is only read when CR4 reports XSAVE as enabled. */
 	if (rcr4() & CR4_XSAVE)
 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
 	/* The feature-control MSR is only read on CPUs with VMX or SMX. */
 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
 		db_printf("FEATURES_CTL\t0x%016lx\n",
 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
 }
 
 /* Show the debug registers dr0-dr3, dr6 and dr7, one per line. */
 DB_SHOW_COMMAND(dbregs, db_show_dbregs)
 {
 /* Pastes the register number into both the label and the accessor name. */
 #define	DB_SHOW_DREG(n)	db_printf("dr" #n "\t0x%016lx\n", rdr##n())
 	DB_SHOW_DREG(0);
 	DB_SHOW_DREG(1);
 	DB_SHOW_DREG(2);
 	DB_SHOW_DREG(3);
 	DB_SHOW_DREG(6);
 	DB_SHOW_DREG(7);
 #undef DB_SHOW_DREG
 }
 #endif
 
 void
 sdtossd(sd, ssd)
 	struct user_segment_descriptor *sd;
 	struct soft_segment_descriptor *ssd;
 {
 
 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
 	ssd->ssd_type  = sd->sd_type;
 	ssd->ssd_dpl   = sd->sd_dpl;
 	ssd->ssd_p     = sd->sd_p;
 	ssd->ssd_long  = sd->sd_long;
 	ssd->ssd_def32 = sd->sd_def32;
 	ssd->ssd_gran  = sd->sd_gran;
 }
 
 /*
  * Pack a software segment descriptor into a hardware user segment
  * descriptor.
  *
  *	ssd	software descriptor to read
  *	sd	descriptor to fill in
  *
  * The base is split into its low 24 bits and the next 8 bits, the limit
  * into its low 16 bits and the next 4 bits; anything above is dropped.
  */
 void
 ssdtosd(struct soft_segment_descriptor *ssd,
     struct user_segment_descriptor *sd)
 {
 
 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 	sd->sd_type  = ssd->ssd_type;
 	sd->sd_dpl   = ssd->ssd_dpl;
 	sd->sd_p     = ssd->ssd_p;
 	sd->sd_long  = ssd->ssd_long;
 	sd->sd_def32 = ssd->ssd_def32;
 	sd->sd_gran  = ssd->ssd_gran;
 }
 
 /*
  * Pack a software segment descriptor into a hardware system segment
  * descriptor (used by hammer_time() for the TSS entry).
  *
  *	ssd	software descriptor to read
  *	sd	system descriptor to fill in
  *
  * Unlike ssdtosd(), the high part of the base is 40 bits wide, and the
  * ssd_long and ssd_def32 attributes are not copied.
  */
 void
 ssdtosyssd(struct soft_segment_descriptor *ssd,
     struct system_segment_descriptor *sd)
 {
 
 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 	sd->sd_type  = ssd->ssd_type;
 	sd->sd_dpl   = ssd->ssd_dpl;
 	sd->sd_p     = ssd->ssd_p;
 	sd->sd_gran  = ssd->ssd_gran;
 }
 
 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
 #include <isa/isavar.h>
 #include <isa/isareg.h>
 /*
  * Report the interrupt requests currently pending as a bitmap.  The code is
  * specific to the 8259 and is only meant to be used while probing.
  * It exists solely to keep sio happy; nothing breaks if it does not work.
  * It does not really belong here: if it is needed at all, an APIC-centric
  * version should live in the apic driver.
  *
  * Returns the byte read from IO_ICU1 in bits 0-7 and the byte read from
  * IO_ICU2 in bits 8-15.
  */
 intrmask_t
 isa_irq_pending(void)
 {
 	const int icu1_bits = inb(IO_ICU1);
 	const int icu2_bits = inb(IO_ICU2);
 
 	return ((icu2_bits << 8) | icu1_bits);
 }
 #endif
 
 /*
  * Size of the base memory segment in kilobytes.  getmemsize() derives it
  * from the memory map and falls back to 640 when no plausible segment is
  * found.
  */
 u_int basemem;
 
 /*
  * Add the region [base, base + length) to physmap[], an array of base/bound
  * pairs kept sorted by address.
  *
  *	base		start address of the region
  *	length		size of the region in bytes
  *	physmap		array of PHYS_AVAIL_ENTRIES slots (pairs of base, bound)
  *	physmap_idxp	in/out: index of the next free slot in physmap[]
  *
  * A region adjacent to an existing entry is merged into it; a zero-length
  * region or one overlapping an existing entry is ignored.  Otherwise a new
  * pair is inserted and *physmap_idxp advances by two.
  *
  * Returns 1 when the caller may keep adding regions (including when this
  * region was ignored) and 0 when the array is full.  On a 0 return
  * *physmap_idxp is left untouched, so it still describes only pairs that
  * were actually stored.
  */
 static int
 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
     int *physmap_idxp)
 {
 	int i, insert_idx, physmap_idx;
 
 	physmap_idx = *physmap_idxp;
 
 	if (length == 0)
 		return (1);
 
 	/*
 	 * Find insertion point while checking for overlap.  Start off by
 	 * assuming the new entry will be added to the end.
 	 *
 	 * NB: physmap_idx points to the next free slot.
 	 */
 	insert_idx = physmap_idx;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (base < physmap[i + 1]) {
 			if (base + length <= physmap[i]) {
 				insert_idx = i;
 				break;
 			}
 			if (boothowto & RB_VERBOSE)
 				printf(
 		    "Overlapping memory regions, ignoring second region\n");
 			return (1);
 		}
 	}
 
 	/* See if we can prepend to the next entry. */
 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
 		physmap[insert_idx] = base;
 		return (1);
 	}
 
 	/* See if we can append to the previous entry. */
 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
 		physmap[insert_idx - 1] += length;
 		return (1);
 	}
 
 	/*
 	 * A new pair is needed.  Check for room before publishing the new
 	 * index: advancing *physmap_idxp on failure would make the caller
 	 * count a pair that was never written, and a later call would scan
 	 * past the end of the array.  The last pair is deliberately left
 	 * unused, exactly as before.
 	 */
 	if (physmap_idx + 2 >= PHYS_AVAIL_ENTRIES) {
 		printf(
 		"Too many segments in the physical address map, giving up\n");
 		return (0);
 	}
 	physmap_idx += 2;
 	*physmap_idxp = physmap_idx;
 
 	/*
 	 * Move the last 'N' entries down to make room for the new
 	 * entry if needed.
 	 */
 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
 		physmap[i] = physmap[i - 2];
 		physmap[i + 1] = physmap[i - 1];
 	}
 
 	/* Insert the new entry. */
 	physmap[insert_idx] = base;
 	physmap[insert_idx + 1] = base + length;
 	return (1);
 }
 
 /*
  * Feed a BIOS system memory map into physmap[].
  *
  *	smapbase	first SMAP record
  *	smapsize	size of the whole map in bytes
  *	physmap		base/bound array to fill
  *	physmap_idx	in/out: next free slot in physmap[]
  *
  * Each record is printed when booting verbosely.  Only records of type
  * SMAP_TYPE_MEMORY are added, and the walk stops as soon as
  * add_physmap_entry() reports that it cannot take any more.
  */
 void
 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
                       vm_paddr_t *physmap, int *physmap_idx)
 {
 	struct bios_smap *rec, *limit;
 
 	limit = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 	rec = smapbase;
 	while (rec < limit) {
 		if ((boothowto & RB_VERBOSE) != 0)
 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
 			    rec->type, rec->base, rec->length);
 
 		if (rec->type == SMAP_TYPE_MEMORY &&
 		    add_physmap_entry(rec->base, rec->length, physmap,
 		    physmap_idx) == 0)
 			break;
 		rec++;
 	}
 }
 
 /*
  * Feed the UEFI memory map (as returned by the GetMemoryMap Boot Services
  * call and passed on by the loader) into physmap[].
  *
  *	efihdr		map header; the descriptors follow it
  *	physmap		base/bound array to fill
  *	physmap_idx	in/out: next free slot in physmap[]
  *
  * When booting verbosely every descriptor is printed with its type name
  * and attribute flags.  Only loader code/data, boot services code/data and
  * free memory are added; the walk stops early once add_physmap_entry()
  * reports that it cannot take any more.
  */
 static void
 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
     int *physmap_idx)
 {
 	/* Printable names, indexed by md_type. */
 	static const char * const type_names[] = {
 		"Reserved",
 		"LoaderCode",
 		"LoaderData",
 		"BootServicesCode",
 		"BootServicesData",
 		"RuntimeServicesCode",
 		"RuntimeServicesData",
 		"ConventionalMemory",
 		"UnusableMemory",
 		"ACPIReclaimMemory",
 		"ACPIMemoryNVS",
 		"MemoryMappedIO",
 		"MemoryMappedIOPortSpace",
 		"PalCode",
 		"PersistentMemory"
 	};
 	/* Attribute bits and the text shown for each, in print order. */
 	static const struct {
 		uint64_t	bit;
 		const char	*label;
 	} attr_names[] = {
 		{ EFI_MD_ATTR_UC,		"UC " },
 		{ EFI_MD_ATTR_WC,		"WC " },
 		{ EFI_MD_ATTR_WT,		"WT " },
 		{ EFI_MD_ATTR_WB,		"WB " },
 		{ EFI_MD_ATTR_UCE,		"UCE " },
 		{ EFI_MD_ATTR_WP,		"WP " },
 		{ EFI_MD_ATTR_RP,		"RP " },
 		{ EFI_MD_ATTR_XP,		"XP " },
 		{ EFI_MD_ATTR_NV,		"NV " },
 		{ EFI_MD_ATTR_MORE_RELIABLE,	"MORE_RELIABLE " },
 		{ EFI_MD_ATTR_RO,		"RO " },
 		{ EFI_MD_ATTR_RT,		"RUNTIME" },
 	};
 	struct efi_md *first, *md;
 	const char *name;
 	size_t hdrsz;
 	u_int a;
 	int count, n;
 
 	/* Without a descriptor size the map cannot be walked. */
 	if (efihdr->descriptor_size == 0)
 		return;
 
 	/* The descriptors start after the header, rounded up to 16 bytes. */
 	hdrsz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
 	first = (struct efi_md *)((uint8_t *)efihdr + hdrsz);
 	count = efihdr->memory_size / efihdr->descriptor_size;
 
 	if ((boothowto & RB_VERBOSE) != 0)
 		printf("%23s %12s %12s %8s %4s\n",
 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
 
 	md = first;
 	for (n = 0; n < count; n++,
 	    md = efi_next_descriptor(md, efihdr->descriptor_size)) {
 		if ((boothowto & RB_VERBOSE) != 0) {
 			name = md->md_type < nitems(type_names) ?
 			    type_names[md->md_type] : "<INVALID>";
 			printf("%23s %012lx %12p %08lx ", name, md->md_phys,
 			    md->md_virt, md->md_pages);
 			for (a = 0; a < nitems(attr_names); a++) {
 				if ((md->md_attr & attr_names[a].bit) != 0)
 					printf("%s", attr_names[a].label);
 			}
 			printf("\n");
 		}
 
 		/* Skip everything the kernel is not allowed to use. */
 		if (md->md_type != EFI_MD_TYPE_CODE &&
 		    md->md_type != EFI_MD_TYPE_DATA &&
 		    md->md_type != EFI_MD_TYPE_BS_CODE &&
 		    md->md_type != EFI_MD_TYPE_BS_DATA &&
 		    md->md_type != EFI_MD_TYPE_FREE)
 			continue;
 
 		if (add_physmap_entry(md->md_phys, (md->md_pages * PAGE_SIZE),
 		    physmap, physmap_idx) == 0)
 			break;
 	}
 }
 
 /*
  * Name of the firmware interface the memory map came from.
  * native_parse_memmap() sets it to "UEFI" or "BIOS"; it is exported
  * read-only as the machdep.bootmethod sysctl.
  */
 static char bootmethod[16] = "";
 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
     "System firmware boot method");
 
 /*
  * Fill physmap[] from the memory map the loader attached to the kernel's
  * preload metadata, and record which firmware interface provided it.
  *
  *	kmdp		kernel preload metadata handle
  *	physmap		base/bound array to fill
  *	physmap_idx	in/out: next free slot in physmap[]
  *
  * A UEFI map takes precedence over a BIOS SMAP when both are present.
  * Panics when the loader supplied neither.
  */
 static void
 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
 {
 	struct efi_map_header *efi_map;
 	struct bios_smap *bios_map;
 	u_int32_t bios_map_size;
 
 	efi_map = (struct efi_map_header *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
 	bios_map = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (efi_map == NULL && bios_map == NULL)
 		panic("No BIOS smap or EFI map info from loader!");
 
 	if (efi_map != NULL) {
 		add_efi_map_entries(efi_map, physmap, physmap_idx);
 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
 		return;
 	}
 
 	/*
 	 * Memory map from INT 15:E820.
 	 *
 	 * subr_module.c says:
 	 * "Consumer may safely assume that size value precedes data."
 	 * ie: an int32_t immediately precedes smap.
 	 */
 	bios_map_size = *((u_int32_t *)bios_map - 1);
 	bios_add_smap_entries(bios_map, bios_map_size, physmap, physmap_idx);
 	strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
 }
 
 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
 
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
  * build the phys_avail array describing the actually-available memory.
  *
  * Total memory size may be set by the kernel environment variable
  * hw.physmem or the compile-time define MAXMEM.
  *
  *	kmdp	kernel preload metadata handle, passed to the memory map
  *		parser
  *	first	physical address just past the kernel and the data
  *		allocated so far; pages in [kernphys, first) are kept out
  *		of phys_avail[]
  *
  * Besides phys_avail[], this sets basemem, Maxmem, physmem, dump_avail[]
  * and msgbufp, and calls pmap_bootstrap().
  *
  * XXX first should be vm_paddr_t.
  */
 static void
 getmemsize(caddr_t kmdp, u_int64_t first)
 {
 	int i, physmap_idx, pa_indx, da_indx;
 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
 	u_long physmem_start, physmem_tunable, memtest;
 	pt_entry_t *pte;
 	quad_t dcons_addr, dcons_size;
 	int page_counter;
 
 	/*
 	 * Tell the physical memory allocator about pages used to store
 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
 	 */
 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
 
 	bzero(physmap, sizeof(physmap));
 	physmap_idx = 0;
 
 	/*
 	 * The parser leaves physmap_idx at the next free slot; step back so
 	 * that it indexes the base of the last pair from here on.
 	 */
 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
 	physmap_idx -= 2;
 
 	/*
 	 * Find the 'base memory' segment for SMP
 	 */
 	basemem = 0;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (physmap[i] <= 0xA0000) {
 			basemem = physmap[i + 1] / 1024;
 			break;
 		}
 	}
 	if (basemem == 0 || basemem > 640) {
 		if (bootverbose)
 			printf(
 		"Memory map doesn't contain a basemem segment, faking it\n");
 		basemem = 640;
 	}
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".  We may adjust this
 	 * based on ``hw.physmem'' and the results of the memory test.
 	 */
 	Maxmem = atop(physmap[physmap_idx + 1]);
 
 #ifdef MAXMEM
 	Maxmem = MAXMEM / 4;
 #endif
 
 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
 	/*
 	 * The boot memory test is disabled by default, as it takes a
 	 * significant amount of time on large-memory systems, and is
 	 * unfriendly to virtual machines as it unnecessarily touches all
 	 * pages.
 	 *
 	 * A general name is used as the code may be extended to support
 	 * additional tests beyond the current "page present" test.
 	 */
 	memtest = 0;
 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 
 	/*
 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
 	 * in the system.
 	 */
 	if (Maxmem > atop(physmap[physmap_idx + 1]))
 		Maxmem = atop(physmap[physmap_idx + 1]);
 
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
 
 	/*
 	 * Make hole for "AP -> long mode" bootstrap code.  The
 	 * mp_bootaddress vector is only available when the kernel
 	 * is configured to support APs and APs for the system start
 	 * in real mode mode (e.g. SMP bare metal).
 	 */
 	if (init_ops.mp_bootaddress)
 		init_ops.mp_bootaddress(physmap, &physmap_idx);
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(&first);
 
 	/*
 	 * Size up each available chunk of physical memory.
 	 *
 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
 	 * By default, mask off the first 16 pages unless we appear to be
 	 * running in a VM.
 	 */
 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
 	if (physmap[0] < physmem_start) {
 		if (physmem_start < PAGE_SIZE)
 			physmap[0] = PAGE_SIZE;
 		else if (physmem_start >= physmap[1])
 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
 		else
 			physmap[0] = round_page(physmem_start);
 	}
 	pa_indx = 0;
 	da_indx = 1;
 	phys_avail[pa_indx++] = physmap[0];
 	phys_avail[pa_indx] = physmap[0];
 	dump_avail[da_indx] = physmap[0];
 	pte = CMAP1;
 
 	/*
 	 * Get dcons buffer address
 	 */
 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 	    getenv_quad("dcons.size", &dcons_size) == 0)
 		dcons_addr = 0;
 
 	/*
 	 * physmap is in bytes, so when converting to page boundaries,
 	 * round up the start address and round down the end address.
 	 */
 	page_counter = 0;
 	if (memtest != 0)
 		printf("Testing system memory");
 	for (i = 0; i <= physmap_idx; i += 2) {
 		vm_paddr_t end;
 
 		end = ptoa((vm_paddr_t)Maxmem);
 		if (physmap[i + 1] < end)
 			end = trunc_page(physmap[i + 1]);
 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 			int tmp, page_bad, full;
 			int *ptr = (int *)CADDR1;
 
 			full = FALSE;
 			/*
 			 * block out kernel memory as not available.
 			 */
 			if (pa >= (vm_paddr_t)kernphys && pa < first)
 				goto do_dump_avail;
 
 			/*
 			 * block out dcons buffer
 			 */
 			if (dcons_addr > 0
 			    && pa >= trunc_page(dcons_addr)
 			    && pa < dcons_addr + dcons_size)
 				goto do_dump_avail;
 
 			page_bad = FALSE;
 			if (memtest == 0)
 				goto skip_memtest;
 
 			/*
 			 * Print a "." every GB to show we're making
 			 * progress.
 			 */
 			page_counter++;
 			if ((page_counter % PAGES_PER_GB) == 0)
 				printf(".");
 
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
 			invltlb();
 
 			tmp = *(int *)ptr;
 			/*
 			 * Test for alternating 1's and 0's
 			 */
 			*(volatile int *)ptr = 0xaaaaaaaa;
 			if (*(volatile int *)ptr != 0xaaaaaaaa)
 				page_bad = TRUE;
 			/*
 			 * Test for alternating 0's and 1's
 			 */
 			*(volatile int *)ptr = 0x55555555;
 			if (*(volatile int *)ptr != 0x55555555)
 				page_bad = TRUE;
 			/*
 			 * Test for all 1's
 			 */
 			*(volatile int *)ptr = 0xffffffff;
 			if (*(volatile int *)ptr != 0xffffffff)
 				page_bad = TRUE;
 			/*
 			 * Test for all 0's
 			 */
 			*(volatile int *)ptr = 0x0;
 			if (*(volatile int *)ptr != 0x0)
 				page_bad = TRUE;
 			/*
 			 * Restore original value.
 			 */
 			*(int *)ptr = tmp;
 
 skip_memtest:
 			/*
 			 * Adjust array of valid/good pages.
 			 */
 			if (page_bad == TRUE)
 				continue;
 			/*
 			 * If this good page is a continuation of the
 			 * previous set of good pages, then just increase
 			 * the end pointer. Otherwise start a new chunk.
 			 * Note that "end" points one higher than end,
 			 * making the range >= start and < end.
 			 * If we're also doing a speculative memory
 			 * test and we at or past the end, bump up Maxmem
 			 * so that we keep going. The first bad page
 			 * will terminate the loop.
 			 */
 			if (phys_avail[pa_indx] == pa) {
 				phys_avail[pa_indx] += PAGE_SIZE;
 			} else {
 				pa_indx++;
 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
 					printf(
 		"Too many holes in the physical address space, giving up\n");
 					pa_indx--;
 					full = TRUE;
 					goto do_dump_avail;
 				}
 				phys_avail[pa_indx++] = pa;	/* start */
 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 			}
 			physmem++;
 do_dump_avail:
 			/*
 			 * dump_avail[] is extended for every page visited,
 			 * including kernel and dcons pages that were kept
 			 * out of phys_avail[] above.
 			 */
 			if (dump_avail[da_indx] == pa) {
 				dump_avail[da_indx] += PAGE_SIZE;
 			} else {
 				da_indx++;
 				if (da_indx == PHYS_AVAIL_ENTRIES) {
 					da_indx--;
 					goto do_next;
 				}
 				dump_avail[da_indx++] = pa; /* start */
 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 			}
 do_next:
 			/* phys_avail[] ran out of room: stop this segment. */
 			if (full)
 				break;
 		}
 	}
 	/* Drop the temporary mapping used by the memory test. */
 	*pte = 0;
 	invltlb();
 	if (memtest != 0)
 		printf("\n");
 
 	/*
 	 * XXX
 	 * The last chunk must contain at least one page plus the message
 	 * buffer to avoid complicating other code (message buffer address
 	 * calculation, etc.).
 	 */
 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 		phys_avail[pa_indx--] = 0;
 		phys_avail[pa_indx--] = 0;
 	}
 
 	Maxmem = atop(phys_avail[pa_indx]);
 
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(msgbufsize);
 
 	/* Map the message buffer. */
 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
 }
 
 /*
  * Locate the loader-provided preload metadata and pull the early boot
  * parameters out of it.
  *
  *	modulep	address of the metadata as passed in at boot; KERNBASE is
  *		added to it to obtain the pointer used by the kernel
  *
  * Sets preload_metadata, boothowto, the static kernel environment and
  * efi_systbl_phys, and hands the symbol table bounds to DDB when it is
  * configured.  Returns the metadata handle of the kernel module itself.
  */
 static caddr_t
 native_parse_preload_data(u_int64_t modulep)
 {
 	caddr_t kmdp;
 	char *envp;
 #ifdef DDB
 	vm_offset_t ksym_start;
 	vm_offset_t ksym_end;
 #endif
 
 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
 	preload_bootstrap_relocate(KERNBASE);
 	/* The kernel may be registered under either type name. */
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
 	/* Offset the environment pointer by KERNBASE, like modulep above. */
 	if (envp != NULL)
 		envp += KERNBASE;
 	init_static_kenv(envp, 0);
 #ifdef DDB
 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
 	db_fetch_ksymtab(ksym_start, ksym_end, 0);
 #endif
 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
 
 	return (kmdp);
 }
 
 /*
  * Bring up the kernel debugger framework.  On kernels built with KDB,
  * also drop into the debugger right away when the RB_KDB boot flag is set.
  */
 static void
 amd64_kdb_init(void)
 {
 
 	kdb_init();
 #ifdef KDB
 	if ((boothowto & RB_KDB) != 0) {
 		kdb_enter(KDB_WHY_BOOTFLAGS,
 		    "Boot flags requested debugger");
 	}
 #endif
 }
 
 /* Set up the fast syscall stuff */
 void
 amd64_conf_fast_syscall(void)
 {
 	uint64_t msr;
 
 	msr = rdmsr(MSR_EFER) | EFER_SCE;
 	wrmsr(MSR_EFER, msr);
 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
 	    (u_int64_t)IDTVEC(fast_syscall));
 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
 	wrmsr(MSR_STAR, msr);
 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
 }
 
 /*
  * First stage of per-CPU data setup for the boot processor: record the
  * pcpu area itself, thread0 as the current thread, and pointers to the
  * TSS and to the TSS, LDT and 32-bit %fs/%gs slots of this CPU's GDT.
  *
  *	pc	per-CPU area of the boot processor
  */
 void
 amd64_bsp_pcpu_init1(struct pcpu *pc)
 {
 	struct user_segment_descriptor *gdt;
 
 	PCPU_SET(prvspace, pc);
 	gdt = *PCPU_PTR(gdt);
 	PCPU_SET(curthread, &thread0);
 	PCPU_SET(tssp, PCPU_PTR(common_tss));
 	/* The TSS and LDT slots hold system descriptors, hence the casts. */
 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
+	PCPU_SET(smp_tlb_gen, 1);
 }
 
 /*
  * Second stage of per-CPU data setup for the boot processor: record the
  * kernel stack pointer, the top of the PTI stack and thread0's pcb.
  *
  *	rsp0	kernel stack pointer to store in the per-CPU area
  */
 void
 amd64_bsp_pcpu_init2(uint64_t rsp0)
 {
 	vm_offset_t pti_top;
 
 	PCPU_SET(rsp0, rsp0);
 
 	/* End of the per-CPU PTI stack, aligned down to 16 bytes. */
 	pti_top = (vm_offset_t)PCPU_PTR(pti_stack) +
 	    PC_PTI_STACK_SZ * sizeof(uint64_t);
 	PCPU_SET(pti_rsp0, pti_top & ~0xful);
 
 	PCPU_SET(curpcb, thread0.td_pcb);
 }
 
 void
 amd64_bsp_ist_init(struct pcpu *pc)
 {
 	struct nmi_pcpu *np;
 	struct amd64tss *tssp;
 
 	tssp = &pc->pc_common_tss;
 
 	/* doublefault stack space, runs on ist1 */
 	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
 	np->np_pcpu = (register_t)pc;
 	tssp->tss_ist1 = (long)np;
 
 	/*
 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
 	 * above the start of the ist2 stack.
 	 */
 	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
 	np->np_pcpu = (register_t)pc;
 	tssp->tss_ist2 = (long)np;
 
 	/*
 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
 	 * above the start of the ist3 stack.
 	 */
 	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
 	np->np_pcpu = (register_t)pc;
 	tssp->tss_ist3 = (long)np;
 
 	/*
 	 * DB# stack, runs on ist4.
 	 */
 	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
 	np->np_pcpu = (register_t)pc;
 	tssp->tss_ist4 = (long)np;
 }
 
 /*
  * Early machine-dependent initialization of the boot processor.
  *
  *	modulep		address of the loader's preload metadata, handed to
  *			init_ops.parse_preload_data()
  *	physfree	first free physical address; advanced here as the
  *			microcode image, thread0's kernel stack and the
  *			dynamic per-CPU area are carved out
  *
  * In order, this identifies the CPU, builds and loads the GDT and IDT,
  * sets up the per-CPU area, TSS and fast system calls, sizes memory via
  * getmemsize(), initializes the console, debugger, message buffer and
  * FPU, and prepares thread0.
  *
  * Returns the stack base of thread0, which locore uses as the kernel
  * stack.
  */
 u_int64_t
 hammer_time(u_int64_t modulep, u_int64_t physfree)
 {
 	caddr_t kmdp;
 	int gsel_tss, x;
 	struct pcpu *pc;
 	struct xstate_hdr *xhdr;
 	u_int64_t rsp0;
 	char *env;
 	struct user_segment_descriptor *gdt;
 	struct region_descriptor r_gdt;
 	size_t kstack0_sz;
 	int late_console;
 
 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
 
 	kmdp = init_ops.parse_preload_data(modulep);
 
 	physfree += ucode_load_bsp(physfree + KERNBASE);
 	physfree = roundup2(physfree, PAGE_SIZE);
 
 	identify_cpu1();
 	identify_hypervisor();
 	identify_cpu_fixup_bsp();
 	identify_cpu2();
 	initializecpucache();
 
 	/*
 	 * Check for pti, pcid, and invpcid before ifuncs are
 	 * resolved, to correctly select the implementation for
 	 * pmap_activate_sw_mode().
 	 */
 	pti = pti_get_default();
 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
 		invpcid_works = (cpu_stdext_feature &
 		    CPUID_STDEXT_INVPCID) != 0;
 	} else {
 		pmap_pcid_enabled = 0;
 	}
 
 	link_elf_ireloc(kmdp);
 
 	/*
 	 * This may be done better later if it gets more high level
 	 * components in it. If so just link td->td_proc here.
 	 */
 	proc_linkup0(&proc0, &thread0);
 
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	/* Carve thread0's kernel stack out of the memory at physfree. */
 	thread0.td_kstack = physfree + KERNBASE;
 	thread0.td_kstack_pages = kstack_pages;
 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 	bzero((void *)thread0.td_kstack, kstack0_sz);
 	physfree += kstack0_sz;
 
 	/*
 	 * Initialize enough of thread0 for delayed invalidation to
 	 * work very early.  Rely on thread0.td_base_pri
 	 * zero-initialization, it is reset to PVM at proc0_init().
 	 */
 	pmap_thread_init_invl_gen(&thread0);
 
 	pc = &temp_bsp_pcpu;
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	gdt = &temp_bsp_pcpu.pc_gdt[0];
 
 	/*
 	 * make gdt memory segments
 	 */
 	for (x = 0; x < NGDT; x++) {
 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
 			ssdtosd(&gdt_segs[x], &gdt[x]);
 	}
 	/* The TSS entry is a system descriptor and is built separately. */
 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base = (long)gdt;
 	lgdt(&r_gdt);
 
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
 
 	dpcpu_init((void *)(physfree + KERNBASE), 0);
 	physfree += DPCPU_SIZE;
 	amd64_bsp_pcpu_init1(pc);
 	/* Non-late cninit() and printf() can be moved up to here. */
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 * 	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
 
 	/*
 	 * exceptions: point every vector at the reserved handler first,
 	 * then install the specific ones.
 	 */
 	for (x = 0; x < NIDT; x++)
 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
 		    SEL_KPL, 0);
 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
 	    SEL_UPL, 0);
 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
 	    SEL_UPL, 0);
 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
 	    SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
 	    SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
 	    SEL_KPL, 0);
 #ifdef KDTRACE_HOOKS
 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
 #endif
 #ifdef XENHVM
 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
 #endif
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (long) idt;
 	lidt(&r_idt);
 
 	/*
 	 * Initialize the clock before the console so that console
 	 * initialization can use DELAY().
 	 */
 	clock_init();
 
 	/*
 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
 	 * transition).
 	 * Once bootblocks have updated, we can test directly for
 	 * efi_systbl != NULL here...
 	 */
 	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
 	    != NULL)
 		vty_set_preferred(VTY_VT);
 
 	/*
 	 * Mitigation tunables.  Where two names are fetched into the same
 	 * variable, the later fetch wins when both are set.
 	 */
 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
 
 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
 
 	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
 	    &syscall_ret_l1d_flush_mode);
 
 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
 
 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
 
 	/*
 	 * The "rndgs" spelling is a historical typo; it is still honoured
 	 * so that existing loader.conf settings keep working.  "rngds"
 	 * matches the variable name.
 	 * NOTE(review): confirm it also matches the run-time sysctl name.
 	 */
 	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
 	    &x86_rngds_mitg_enable);
 	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
 	    &x86_rngds_mitg_enable);
 
 	finishidentcpu();	/* Final stage of CPU initialization */
 	initializecpu();	/* Initialize CPU registers */
 
 	amd64_bsp_ist_init(pc);
 
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
 	    IOPERM_BITMAP_SIZE;
 
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	ltr(gsel_tss);
 
 	amd64_conf_fast_syscall();
 
 	/*
 	 * We initialize the PCB pointer early so that exception
 	 * handlers will work.  Also set up td_critnest to short-cut
 	 * the page fault handler.
 	 */
 	cpu_max_ext_state_size = sizeof(struct savefpu);
 	set_top_of_stack_td(&thread0);
 	thread0.td_pcb = get_pcb_td(&thread0);
 	thread0.td_critnest = 1;
 
 	/*
 	 * The console and kdb should be initialized even earlier than here,
 	 * but some console drivers don't work until after getmemsize().
 	 * Default to late console initialization to support these drivers.
 	 * This loses mainly printf()s in getmemsize() and early debugging.
 	 */
 	late_console = 1;
 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
 	if (!late_console) {
 		cninit();
 		amd64_kdb_init();
 	}
 
 	getmemsize(kmdp, physfree);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured, and u/iom is accessible */
 
 #ifdef DEV_PCI
 	/* This call might adjust phys_avail[]. */
 	pci_early_quirks();
 #endif
 
 	if (late_console)
 		cninit();
 
 #ifdef DEV_ISA
 #ifdef DEV_ATPIC
 	elcr_probe();
 	atpic_startup();
 #else
 	/* Reset and mask the atpics and leave them shut down. */
 	atpic_reset();
 
 	/*
 	 * Point the ICU spurious interrupt vectors at the APIC spurious
 	 * interrupt handler.
 	 */
 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 #endif
 #else
 #error "have you forgotten the isa device?"
 #endif
 
 	if (late_console)
 		amd64_kdb_init();
 
 	msgbufinit(msgbufp, msgbufsize);
 	fpuinit();
 
 	/*
 	 * Reinitialize thread0's stack base now that the xsave area size is
 	 * known.  Set up thread0's pcb save area after fpuinit calculated fpu
 	 * save area size.  Zero out the extended state header in fpu save area.
 	 */
 	set_top_of_stack_td(&thread0);
 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
 	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
 	if (use_xsave) {
 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
 		    1);
 		xhdr->xstate_bv = xsave_mask;
 	}
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	rsp0 = thread0.td_md.md_stack_base;
 	/* Ensure the stack is aligned to 16 bytes */
 	rsp0 &= ~0xFul;
 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
 	amd64_bsp_pcpu_init2(rsp0);
 
 	/* transfer to user mode */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
 
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_ufssel);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_frame = &proc0_tf;
 
 	env = kern_getenv("kernelname");
 	if (env != NULL)
 		strlcpy(kernelname, env, sizeof(kernelname));
 
 	cpu_probe_amdc1e();
 
 	kcsan_cpu_init(0);
 
 #ifdef FDT
 	x86_init_fdt();
 #endif
 	/* Undo the td_critnest short-cut set before getmemsize(). */
 	thread0.td_critnest = 0;
 
 	TSEXIT();
 
 	/* Location of kernel stack for locore */
 	return (thread0.td_md.md_stack_base);
 }
 
 /*
  * Machine-dependent hook for initializing a per-CPU data structure.
  *
  * The only field written here is pc_acpi_id, which is set to the all-ones
  * value 0xffffffff.  The cpuid and size arguments are not used.
  * NOTE(review): the all-ones value presumably marks the ACPI id as "not yet
  * known" -- confirm against the code that later assigns pc_acpi_id.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 
 	pcpu->pc_acpi_id = 0xffffffff;
 }
 
 /*
  * Sysctl handler for machdep.smap.
  *
  * Looks up the BIOS SMAP table (and the optional table of extended
  * attributes) in the preload metadata attached to the kernel and emits one
  * struct bios_smap_xattr per SMAP entry.  When no extended attribute table
  * is present the xattr field of every record is reported as 0.
  *
  * Returns 0 when there is no SMAP metadata or when every record was copied
  * out, otherwise the error returned by the first SYSCTL_OUT() that failed.
  */
 static int
 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct bios_smap *smapbase;
 	struct bios_smap_xattr smap;
 	caddr_t kmdp;
 	uint32_t *smapattr;
 	int count, error, i;
 
 	/* Retrieve the system memory map from the loader. */
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase == NULL)
 		return (0);
 	smapattr = (uint32_t *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 	/* The 32-bit word preceding the table is used as its size in bytes. */
 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
 	error = 0;
 	for (i = 0; i < count; i++) {
 		smap.base = smapbase[i].base;
 		smap.length = smapbase[i].length;
 		smap.type = smapbase[i].type;
 		if (smapattr != NULL)
 			smap.xattr = smapattr[i];
 		else
 			smap.xattr = 0;
 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
 		/*
 		 * Stop at the first failure so that a later, successful
 		 * copy-out cannot hide the error from the caller.
 		 */
 		if (error != 0)
 			break;
 	}
 	return (error);
 }
 SYSCTL_PROC(_machdep, OID_AUTO, smap,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     smap_sysctl_handler, "S,bios_smap_xattr",
     "Raw BIOS SMAP data");
 
 /*
  * Sysctl handler for machdep.efi_map.
  *
  * Finds the EFI memory map record in the preload metadata attached to the
  * kernel and copies it out unmodified.  Returns 0 without emitting anything
  * when no such record exists, otherwise the result of SYSCTL_OUT().
  */
 static int
 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct efi_map_header *map;
 	caddr_t kern;
 	uint32_t maplen;
 
 	/* The kernel's module record may carry either type name. */
 	if ((kern = preload_search_by_type("elf kernel")) == NULL)
 		kern = preload_search_by_type("elf64 kernel");
 
 	map = (struct efi_map_header *)preload_search_info(kern,
 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
 	if (map != NULL) {
 		/* The 32-bit word preceding the record is the byte count. */
 		maplen = ((uint32_t *)map)[-1];
 		return (SYSCTL_OUT(req, map, maplen));
 	}
 	return (0);
 }
 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     efi_map_sysctl_handler, "S,efi_map_header",
     "Raw EFI Memory Map");
 
 /*
  * Enter a spinlock section on behalf of the current thread.
  *
  * The outermost entry disables interrupts, remembers the previous interrupt
  * state in the thread's machine-dependent area and enters a critical
  * section.  Nested entries only increase the nesting counter.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *self;
 	register_t saved;
 
 	self = curthread;
 	if (self->td_md.md_spinlock_count != 0) {
 		/* Already inside a spinlock section: just nest deeper. */
 		self->td_md.md_spinlock_count++;
 		return;
 	}
 
 	/* Interrupts go off before any of the bookkeeping is updated. */
 	saved = intr_disable();
 	self->td_md.md_spinlock_count = 1;
 	self->td_md.md_saved_flags = saved;
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section on behalf of the current thread.
  *
  * The nesting counter is decremented; when it reaches zero the critical
  * section is exited and the interrupt state recorded by the outermost
  * spinlock_enter() is restored.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *self;
 	register_t saved;
 
 	self = curthread;
 	/* Pick up the saved state before the counter is modified. */
 	saved = self->td_md.md_saved_flags;
 	if (--self->td_md.md_spinlock_count != 0)
 		return;
 
 	critical_exit();
 	intr_restore(saved);
 }
 
 /*
  * Build a PCB out of a trapframe.  kdb_trap() uses this so that a backtrace
  * can begin at the function that caused entry into the debugger: the
  * context is available in the trapframe, but the trace is driven from a
  * PCB.  Only the registers needed for a backtrace are filled in; the PCB
  * does not have to be complete.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	/* Where execution was, and its stack and frame. */
 	pcb->pcb_rip = tf->tf_rip;
 	pcb->pcb_rsp = tf->tf_rsp;
 	pcb->pcb_rbp = tf->tf_rbp;
 
 	/* Remaining registers tracked by the PCB. */
 	pcb->pcb_rbx = tf->tf_rbx;
 	pcb->pcb_r12 = tf->tf_r12;
 	pcb->pcb_r13 = tf->tf_r13;
 	pcb->pcb_r14 = tf->tf_r14;
 	pcb->pcb_r15 = tf->tf_r15;
 }
 
 /*
  * Set the instruction pointer saved in the thread's trapframe to addr and
  * set PCB_FULL_IRET on the thread's pcb.  Always returns 0.
  * NOTE(review): the flag presumably forces the full return-to-user path so
  * the modified frame is honoured -- confirm against the exception code.
  */
 int
 ptrace_set_pc(struct thread *td, unsigned long addr)
 {
 
 	td->td_frame->tf_rip = addr;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (0);
 }
 
 /*
  * Arrange for the thread to single-step: set PSL_T in the saved rflags and
  * record TDB_STEP in the thread's debug flags.  Nothing is changed when
  * PSL_T is already set.  The process lock must be held.  Always returns 0.
  */
 int
 ptrace_single_step(struct thread *td)
 {
 
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	if ((td->td_frame->tf_rflags & PSL_T) != 0)
 		return (0);
 
 	td->td_frame->tf_rflags |= PSL_T;
 	td->td_dbgflags |= TDB_STEP;
 	return (0);
 }
 
 /*
  * Cancel single-stepping for the thread: clear PSL_T in the saved rflags
  * and clear TDB_STEP in the thread's debug flags.  The process lock must be
  * held.  Always returns 0.
  */
 int
 ptrace_clear_single_step(struct thread *td)
 {
 
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	td->td_frame->tf_rflags &= ~PSL_T;
 	td->td_dbgflags &= ~TDB_STEP;
 	return (0);
 }
 
 /*
  * Export the general-purpose registers saved in the thread's trapframe
  * into *regs.  Returns whatever fill_frame_regs() returns.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 
 	return (fill_frame_regs(td->td_frame, regs));
 }
 
 /*
  * Export the registers held in a trapframe into *regs.
  *
  * The segment registers ds/es/fs/gs are taken from the frame only when it
  * carries TF_HASSEGS; otherwise they are reported as 0.  r_err and r_trapno
  * are always reported as 0.  Always returns 0.
  */
 int
 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 {
 
 	/* Legacy general-purpose registers. */
 	regs->r_rax = tp->tf_rax;
 	regs->r_rbx = tp->tf_rbx;
 	regs->r_rcx = tp->tf_rcx;
 	regs->r_rdx = tp->tf_rdx;
 	regs->r_rsi = tp->tf_rsi;
 	regs->r_rdi = tp->tf_rdi;
 	regs->r_rbp = tp->tf_rbp;
 
 	/* Extended general-purpose registers. */
 	regs->r_r8  = tp->tf_r8;
 	regs->r_r9  = tp->tf_r9;
 	regs->r_r10 = tp->tf_r10;
 	regs->r_r11 = tp->tf_r11;
 	regs->r_r12 = tp->tf_r12;
 	regs->r_r13 = tp->tf_r13;
 	regs->r_r14 = tp->tf_r14;
 	regs->r_r15 = tp->tf_r15;
 
 	/* Interrupt return frame. */
 	regs->r_rip = tp->tf_rip;
 	regs->r_cs = tp->tf_cs;
 	regs->r_rflags = tp->tf_rflags;
 	regs->r_rsp = tp->tf_rsp;
 	regs->r_ss = tp->tf_ss;
 
 	/* Data segment selectors, if the frame recorded them. */
 	if ((tp->tf_flags & TF_HASSEGS) == 0) {
 		regs->r_ds = 0;
 		regs->r_es = 0;
 		regs->r_fs = 0;
 		regs->r_gs = 0;
 	} else {
 		regs->r_ds = tp->tf_ds;
 		regs->r_es = tp->tf_es;
 		regs->r_fs = tp->tf_fs;
 		regs->r_gs = tp->tf_gs;
 	}
 
 	regs->r_trapno = 0;
 	regs->r_err = 0;
 	return (0);
 }
 
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *tp;
 	register_t rflags;
 
 	tp = td->td_frame;
 	rflags = regs->r_rflags & 0xffffffff;
 	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
 		return (EINVAL);
 	tp->tf_r15 = regs->r_r15;
 	tp->tf_r14 = regs->r_r14;
 	tp->tf_r13 = regs->r_r13;
 	tp->tf_r12 = regs->r_r12;
 	tp->tf_r11 = regs->r_r11;
 	tp->tf_r10 = regs->r_r10;
 	tp->tf_r9  = regs->r_r9;
 	tp->tf_r8  = regs->r_r8;
 	tp->tf_rdi = regs->r_rdi;
 	tp->tf_rsi = regs->r_rsi;
 	tp->tf_rbp = regs->r_rbp;
 	tp->tf_rbx = regs->r_rbx;
 	tp->tf_rdx = regs->r_rdx;
 	tp->tf_rcx = regs->r_rcx;
 	tp->tf_rax = regs->r_rax;
 	tp->tf_rip = regs->r_rip;
 	tp->tf_cs = regs->r_cs;
 	tp->tf_rflags = rflags;
 	tp->tf_rsp = regs->r_rsp;
 	tp->tf_ss = regs->r_ss;
 	if (0) {	/* XXXKIB */
 		tp->tf_ds = regs->r_ds;
 		tp->tf_es = regs->r_es;
 		tp->tf_fs = regs->r_fs;
 		tp->tf_gs = regs->r_gs;
 		tp->tf_flags = TF_HASSEGS;
 	}
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (0);
 }
 
 /* XXX check all this stuff! */
 /*
  * Externalize: convert the FPU save area sv_xmm into the exported
  * struct fpreg layout.  *fpregs is zeroed first, so anything not copied
  * below reads back as zero.
  */
 static void
 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
 {
 	struct envxmm *dst, *src;
 	int n;
 
 	bzero(fpregs, sizeof(*fpregs));
 	dst = (struct envxmm *)&fpregs->fpr_env;
 	src = &sv_xmm->sv_env;
 
 	/* Control, status and tag words. */
 	dst->en_cw = src->en_cw;
 	dst->en_sw = src->en_sw;
 	dst->en_tw = src->en_tw;
 
 	/* Last opcode and instruction/operand pointers. */
 	dst->en_opcode = src->en_opcode;
 	dst->en_rip = src->en_rip;
 	dst->en_rdp = src->en_rdp;
 
 	/* MXCSR and its mask. */
 	dst->en_mxcsr = src->en_mxcsr;
 	dst->en_mxcsr_mask = src->en_mxcsr_mask;
 
 	/* Eight FPU registers, 10 bytes of each are copied. */
 	for (n = 0; n < 8; n++) {
 		bcopy(sv_xmm->sv_fp[n].fp_acc.fp_bytes, fpregs->fpr_acc[n],
 		    10);
 	}
 
 	/* Sixteen SSE registers, 16 bytes each. */
 	for (n = 0; n < 16; n++)
 		bcopy(sv_xmm->sv_xmm[n].xmm_bytes, fpregs->fpr_xacc[n], 16);
 }
 
 /*
  * Internalize: load the FPU save area sv_xmm from the exported
  * struct fpreg layout.  The MXCSR mask supplied by the caller is ANDed
  * with cpu_mxcsr_mask before it is stored.
  */
 static void
 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
 {
 	struct envxmm *dst, *src;
 	int n;
 
 	dst = &sv_xmm->sv_env;
 	src = (struct envxmm *)&fpregs->fpr_env;
 
 	/* Control, status and tag words. */
 	dst->en_cw = src->en_cw;
 	dst->en_sw = src->en_sw;
 	dst->en_tw = src->en_tw;
 
 	/* Last opcode and instruction/operand pointers. */
 	dst->en_opcode = src->en_opcode;
 	dst->en_rip = src->en_rip;
 	dst->en_rdp = src->en_rdp;
 
 	/* MXCSR, and its mask limited to what cpu_mxcsr_mask allows. */
 	dst->en_mxcsr = src->en_mxcsr;
 	dst->en_mxcsr_mask = src->en_mxcsr_mask & cpu_mxcsr_mask;
 
 	/* Eight FPU registers, 10 bytes of each are copied. */
 	for (n = 0; n < 8; n++) {
 		bcopy(fpregs->fpr_acc[n], sv_xmm->sv_fp[n].fp_acc.fp_bytes,
 		    10);
 	}
 
 	/* Sixteen SSE registers, 16 bytes each. */
 	for (n = 0; n < 16; n++)
 		bcopy(fpregs->fpr_xacc[n], sv_xmm->sv_xmm[n].xmm_bytes, 16);
 }
 
 /*
  * Externalize from td->pcb: export the thread's user FPU state into
  * *fpregs.
  *
  * The thread must be the current thread, be suspended, or belong to a
  * process that should stop (asserted below).  fpugetregs() is called
  * before the user save area is converted.
  * NOTE(review): fpugetregs() presumably makes the save area current with
  * respect to the live FPU -- confirm in the FPU code.
  * Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 	    P_SHOULDSTOP(td->td_proc),
 	    ("not suspended thread %p", td));
 	fpugetregs(td);
 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
 	return (0);
 }
 
 /*
  * Internalize to td->pcb: load the thread's user FPU save area from
  * *fpregs.
  *
  * The conversion and the following fpuuserinited() call are done inside a
  * critical section.
  * NOTE(review): fpuuserinited() presumably marks the user FPU state as
  * initialized -- confirm in the FPU code.
  * Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	critical_enter();
 	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
 	fpuuserinited(td);
 	critical_exit();
 	return (0);
 }
 
 /*
  * Get machine context.
  *
  * Fills *mcp from the thread's trapframe, FPU state and pcb.  When flags
  * contains GET_MC_CLEAR_RET, rax and rdx are reported as 0 and the carry
  * bit (PSL_C) is cleared in the reported rflags.  No extended FPU state is
  * returned: mc_xfpustate and mc_xfpustate_len are set to 0.
  * Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct pcb *pcb;
 	struct trapframe *tp;
 
 	pcb = td->td_pcb;
 	tp = td->td_frame;
 	/* sigonstack() is evaluated with the current process locked. */
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
 	PROC_UNLOCK(curthread->td_proc);
 	mcp->mc_r15 = tp->tf_r15;
 	mcp->mc_r14 = tp->tf_r14;
 	mcp->mc_r13 = tp->tf_r13;
 	mcp->mc_r12 = tp->tf_r12;
 	mcp->mc_r11 = tp->tf_r11;
 	mcp->mc_r10 = tp->tf_r10;
 	mcp->mc_r9  = tp->tf_r9;
 	mcp->mc_r8  = tp->tf_r8;
 	mcp->mc_rdi = tp->tf_rdi;
 	mcp->mc_rsi = tp->tf_rsi;
 	mcp->mc_rbp = tp->tf_rbp;
 	mcp->mc_rbx = tp->tf_rbx;
 	mcp->mc_rcx = tp->tf_rcx;
 	mcp->mc_rflags = tp->tf_rflags;
 	if (flags & GET_MC_CLEAR_RET) {
 		/* Report a zero return value with the carry bit cleared. */
 		mcp->mc_rax = 0;
 		mcp->mc_rdx = 0;
 		mcp->mc_rflags &= ~PSL_C;
 	} else {
 		mcp->mc_rax = tp->tf_rax;
 		mcp->mc_rdx = tp->tf_rdx;
 	}
 	mcp->mc_rip = tp->tf_rip;
 	mcp->mc_cs = tp->tf_cs;
 	mcp->mc_rsp = tp->tf_rsp;
 	mcp->mc_ss = tp->tf_ss;
 	/*
 	 * The segment selectors are copied whether or not the frame has
 	 * TF_HASSEGS set; tf_flags is exported alongside them in mc_flags.
 	 */
 	mcp->mc_ds = tp->tf_ds;
 	mcp->mc_es = tp->tf_es;
 	mcp->mc_fs = tp->tf_fs;
 	mcp->mc_gs = tp->tf_gs;
 	/* mc_flags must be set before get_fpcontext(), which may OR into it. */
 	mcp->mc_flags = tp->tf_flags;
 	mcp->mc_len = sizeof(*mcp);
 	/* No buffer is passed, so only the legacy FPU state is captured. */
 	get_fpcontext(td, mcp, NULL, 0);
 	/* Refresh the pcb copies of the bases before reporting them. */
 	update_pcb_bases(pcb);
 	mcp->mc_fsbase = pcb->pcb_fsbase;
 	mcp->mc_gsbase = pcb->pcb_gsbase;
 	mcp->mc_xfpustate = 0;
 	mcp->mc_xfpustate_len = 0;
 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  *
  * Returns EINVAL when mc_len does not match sizeof(mcontext_t), when
  * mc_flags has bits outside _MC_FLAG_MASK, or when the extended FPU state
  * is larger than this CPU supports; returns the copyin() or
  * set_fpcontext() error if either fails.  The trapframe and pcb are only
  * modified after all of these checks have passed.  Returns 0 on success.
  */
 int
 set_mcontext(struct thread *td, mcontext_t *mcp)
 {
 	struct pcb *pcb;
 	struct trapframe *tp;
 	char *xfpustate;
 	long rflags;
 	int ret;
 
 	pcb = td->td_pcb;
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp) ||
 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
 		return (EINVAL);
 	/* Take only the PSL_USERCHANGE bits from the supplied rflags. */
 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
 	    (tp->tf_rflags & ~PSL_USERCHANGE);
 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
 		/*
 		 * The length is bounded before it is used to size the
 		 * on-stack buffer that receives the extended state.
 		 */
 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu))
 			return (EINVAL);
 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
 		    mcp->mc_xfpustate_len);
 		if (ret != 0)
 			return (ret);
 	} else
 		xfpustate = NULL;
 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
 	if (ret != 0)
 		return (ret);
 	tp->tf_r15 = mcp->mc_r15;
 	tp->tf_r14 = mcp->mc_r14;
 	tp->tf_r13 = mcp->mc_r13;
 	tp->tf_r12 = mcp->mc_r12;
 	tp->tf_r11 = mcp->mc_r11;
 	tp->tf_r10 = mcp->mc_r10;
 	tp->tf_r9  = mcp->mc_r9;
 	tp->tf_r8  = mcp->mc_r8;
 	tp->tf_rdi = mcp->mc_rdi;
 	tp->tf_rsi = mcp->mc_rsi;
 	tp->tf_rbp = mcp->mc_rbp;
 	tp->tf_rbx = mcp->mc_rbx;
 	tp->tf_rdx = mcp->mc_rdx;
 	tp->tf_rcx = mcp->mc_rcx;
 	tp->tf_rax = mcp->mc_rax;
 	tp->tf_rip = mcp->mc_rip;
 	tp->tf_rflags = rflags;
 	tp->tf_rsp = mcp->mc_rsp;
 	tp->tf_ss = mcp->mc_ss;
 	tp->tf_flags = mcp->mc_flags;
 	/* Segment selectors are taken only when the context carries them. */
 	if (tp->tf_flags & TF_HASSEGS) {
 		tp->tf_ds = mcp->mc_ds;
 		tp->tf_es = mcp->mc_es;
 		tp->tf_fs = mcp->mc_fs;
 		tp->tf_gs = mcp->mc_gs;
 	}
 	/*
 	 * PCB_FULL_IRET is set before the bases are overwritten below.
 	 * NOTE(review): set_pcb_flags() may itself store the current bases
 	 * into the pcb, so this order looks deliberate -- confirm.
 	 */
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 	if (mcp->mc_flags & _MC_HASBASES) {
 		pcb->pcb_fsbase = mcp->mc_fsbase;
 		pcb->pcb_gsbase = mcp->mc_gsbase;
 	}
 	return (0);
 }
 
 /*
  * Capture the FPU part of a machine context.
  *
  * The legacy save area of the thread is always copied into mc_fpstate,
  * together with the ownership indication returned by fpugetregs() and the
  * format reported by fpuformat().
  *
  * When XSAVE is in use and the caller supplied a buffer (xfpusave_len is
  * non-zero), the extended state that follows the legacy save area is
  * copied into xfpusave as well, _MC_HASFPXSTATE is set in mc_flags and
  * mc_xfpustate_len is set to the number of bytes copied.  If the buffer is
  * larger than the extended state, the unused tail of the buffer is zeroed
  * so that no uninitialized bytes are handed back to the caller.
  */
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
     size_t xfpusave_len)
 {
 	size_t max_len, len;
 
 	mcp->mc_ownedfp = fpugetregs(td);
 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 	    sizeof(mcp->mc_fpstate));
 	mcp->mc_fpformat = fpuformat();
 	if (!use_xsave || xfpusave_len == 0)
 		return;
 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 	len = xfpusave_len;
 	if (len > max_len) {
 		/*
 		 * Clear the part of the buffer beyond the extended state.
 		 * The size must come from the caller's length; using the
 		 * already clamped value would zero nothing.
 		 */
 		bzero(xfpusave + max_len, xfpusave_len - max_len);
 		len = max_len;
 	}
 	mcp->mc_flags |= _MC_HASFPXSTATE;
 	mcp->mc_xfpustate_len = len;
 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 }
 
 /*
  * Install the FPU part of a machine context.
  *
  * Returns 0 without doing anything for the _MC_FPFMT_NODEV format and
  * EINVAL for any format other than _MC_FPFMT_XMM.  For the XMM format the
  * action depends on mc_ownedfp: _MC_FPOWNED_NONE drops the thread's FPU
  * state, _MC_FPOWNED_FPU and _MC_FPOWNED_PCB load it through fpusetregs()
  * (whose result is returned), and any other value yields EINVAL.
  */
 static int
 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
     size_t xfpustate_len)
 {
 
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 
 	if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 		return (0);
 	}
 	if (mcp->mc_ownedfp != _MC_FPOWNED_FPU &&
 	    mcp->mc_ownedfp != _MC_FPOWNED_PCB)
 		return (EINVAL);
 
 	return (fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
 	    xfpustate, xfpustate_len));
 }
 
 /*
  * Discard the FPU state of a thread.
  *
  * Asserts that the thread's pcb is in user-FPU mode.  Inside a critical
  * section, fpudrop() is called if this CPU's fpcurthread is td, and then
  * PCB_FPUINITDONE and PCB_USERFPUINITDONE are cleared.
  * NOTE(review): the flags are cleared on curthread's pcb, while the checks
  * above use td -- presumably callers always pass curthread; confirm.
  */
 void
 fpstate_drop(struct thread *td)
 {
 
 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == td)
 		fpudrop();
 	/*
 	 * XXX force a full drop of the fpu.  The above only drops it if we
 	 * owned it.
 	 *
 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
 	 * have too many layers.
 	 */
 	clear_pcb_flags(curthread->td_pcb,
 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
 	critical_exit();
 }
 
 /*
  * Export debug register values into *dbregs.
  *
  * With td == NULL the values are read from the CPU's debug registers;
  * otherwise they come from the copies kept in the thread's pcb.  Only
  * dr0-dr3, dr6 and dr7 carry data; every other slot of the array is
  * reported as 0.  Always returns 0.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int slot;
 
 	if (td != NULL) {
 		pcb = td->td_pcb;
 		dbregs->dr[0] = pcb->pcb_dr0;
 		dbregs->dr[1] = pcb->pcb_dr1;
 		dbregs->dr[2] = pcb->pcb_dr2;
 		dbregs->dr[3] = pcb->pcb_dr3;
 		dbregs->dr[6] = pcb->pcb_dr6;
 		dbregs->dr[7] = pcb->pcb_dr7;
 	} else {
 		dbregs->dr[0] = rdr0();
 		dbregs->dr[1] = rdr1();
 		dbregs->dr[2] = rdr2();
 		dbregs->dr[3] = rdr3();
 		dbregs->dr[6] = rdr6();
 		dbregs->dr[7] = rdr7();
 	}
 
 	/* Slots 4, 5 and 8 through 15 hold no data. */
 	dbregs->dr[4] = 0;
 	dbregs->dr[5] = 0;
 	for (slot = 8; slot <= 15; slot++)
 		dbregs->dr[slot] = 0;
 	return (0);
 }
 
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
 		load_dr1(dbregs->dr[1]);
 		load_dr2(dbregs->dr[2]);
 		load_dr3(dbregs->dr[3]);
 		load_dr6(dbregs->dr[6]);
 		load_dr7(dbregs->dr[7]);
 	} else {
 		/*
 		 * Don't let an illegal value for dr7 get set.  Specifically,
 		 * check for undefined settings.  Setting these bit patterns
 		 * result in undefined behaviour and can lead to an unexpected
 		 * TRCTRAP or a general protection fault right here.
 		 * Upper bits of dr6 and dr7 must not be set
 		 */
 		for (i = 0; i < 4; i++) {
 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 			if (td->td_frame->tf_cs == _ucode32sel &&
 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
 				return (EINVAL);
 		}
 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 			return (EINVAL);
 
 		pcb = td->td_pcb;
 
 		/*
 		 * Don't let a process set a breakpoint that is not within the
 		 * process's address space.  If a process could do this, it
 		 * could halt the system by setting a breakpoint in the kernel
 		 * (if ddb was enabled).  Thus, we need to check to make sure
 		 * that no breakpoints are being enabled for addresses outside
 		 * process's address space.
 		 *
 		 * XXX - what about when the watched area of the user's
 		 * address space is written into from within the kernel
 		 * ... wouldn't that still cause a breakpoint to be generated
 		 * from within kernel mode?
 		 */
 
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 			/* dr0 is enabled */
 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 			/* dr1 is enabled */
 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 			/* dr2 is enabled */
 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 			/* dr3 is enabled */
 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 
 		pcb->pcb_dr0 = dbregs->dr[0];
 		pcb->pcb_dr1 = dbregs->dr[1];
 		pcb->pcb_dr2 = dbregs->dr[2];
 		pcb->pcb_dr3 = dbregs->dr[3];
 		pcb->pcb_dr6 = dbregs->dr[6];
 		pcb->pcb_dr7 = dbregs->dr[7];
 
 		set_pcb_flags(pcb, PCB_DBREGS);
 	}
 
 	return (0);
 }
 
 /*
  * Load zero into the debug registers of the current CPU: dr7 first, then
  * dr0-dr3 and finally dr6.
  */
 void
 reset_dbregs(void)
 {
 
 	load_dr7(0);	/* Turn off the control bits first */
 	load_dr0(0);
 	load_dr1(0);
 	load_dr2(0);
 	load_dr3(0);
 	load_dr6(0);
 }
 
 /*
  * Decide whether a debug trap was caused by a hardware breakpoint located
  * in user space.
  *
  * dr6 is the debug status value to examine.  Returns the number of
  * breakpoints flagged in dr6 if at least one of their addresses is below
  * VM_MAXUSER_ADDRESS, and 0 otherwise.
  */
 int
 user_dbreg_trap(register_t dr6)
 {
 	caddr_t hit_addr[4];	/* addresses of the flagged breakpoints */
 	u_int64_t control;
 	u_int64_t hits;		/* breakpoint bits extracted from dr6 */
 	int idx, nhits;
 
 	/* No breakpoint bit set: the debug registers did not cause this. */
 	hits = dr6 & DBREG_DR6_BMASK;
 	if (hits == 0)
 		return 0;
 
 	/*
 	 * With every GE and LE bit of dr7 clear, the hardware debug
 	 * registers cannot have caused the trap either.
 	 */
 	control = rdr7();
 	if ((control & 0x000000ff) == 0)
 		return 0;
 
 	/* Collect the address of each breakpoint that was flagged. */
 	nhits = 0;
 	if (hits & 0x01)
 		hit_addr[nhits++] = (caddr_t)rdr0();
 	if (hits & 0x02)
 		hit_addr[nhits++] = (caddr_t)rdr1();
 	if (hits & 0x04)
 		hit_addr[nhits++] = (caddr_t)rdr2();
 	if (hits & 0x08)
 		hit_addr[nhits++] = (caddr_t)rdr3();
 
 	/* One user-space address is enough to report the whole set. */
 	for (idx = 0; idx < nhits; idx++) {
 		if (hit_addr[idx] < (caddr_t)VM_MAXUSER_ADDRESS)
 			return nhits;
 	}
 
 	/* None of the breakpoints are in user space. */
 	return 0;
 }
 
 /*
  * The pcb_flags is only modified by current thread, or by other threads
  * when current thread is stopped.  However, current thread may change it
  * from the interrupt context in cpu_switch(), or in the trap handler.
  * When we read-modify-write pcb_flags from C sources, compiler may generate
  * code that is not atomic regarding the interrupt handler.  If a trap or
  * interrupt happens and any flag is modified from the handler, it can be
  * clobbered with the cached value later.  Therefore, we implement setting
  * and clearing flags with single-instruction functions, which do not race
  * with possible modification of the flags from the trap or interrupt context,
  * because traps and interrupts are executed only on instruction boundary.
  *
  * set_pcb_flags_raw() ORs the given flags into pcb->pcb_flags with a single
  * orl instruction.
  */
 void
 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
 {
 
 	__asm __volatile("orl %1,%0"
 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
 	    : "cc", "memory");
 
 }
 
 /*
  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
  * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
  * pcb if user space modified the bases.  We must save on the context
  * switch or if the return to usermode happens through the doreti.
  *
  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
  * which have a consequence that the base MSRs must be saved each time
  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
  * context switches.
  *
  * In every case the requested flags end up set through
  * set_pcb_flags_raw(); the bases are only saved when PCB_FULL_IRET is being
  * newly set on the current pcb.
  */
 static void
 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
 {
 	register_t r;
 
 	if (curpcb == pcb &&
 	    (flags & PCB_FULL_IRET) != 0 &&
 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
 		r = intr_disable();
 		/* Re-check now that interrupts can no longer change it. */
 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
 			/*
 			 * Each base is saved only while the matching
 			 * segment register holds the default user selector.
 			 * NOTE(review): the user %gs base is read from
 			 * MSR_KGSBASE, presumably because the kernel's own
 			 * base is the active one here -- confirm.
 			 */
 			if (rfs() == _ufssel)
 				pcb->pcb_fsbase = rdfsbase();
 			if (rgs() == _ugssel)
 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
 		}
 		set_pcb_flags_raw(pcb, flags);
 		intr_restore(r);
 	} else {
 		set_pcb_flags_raw(pcb, flags);
 	}
 }
 
 /*
  * Resolver for set_pcb_flags(): selects set_pcb_flags_fsgsbase when the CPU
  * reports CPUID_STDEXT_FSGSBASE in cpu_stdext_feature, and
  * set_pcb_flags_raw otherwise.
  */
 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
 }
 
 /*
  * Clear the given flags in pcb->pcb_flags.  The update is done by a single
  * andl instruction with the complement of flags, so it is not split across
  * an instruction boundary.
  */
 void
 clear_pcb_flags(struct pcb *pcb, const u_int flags)
 {
 
 	__asm __volatile("andl %1,%0"
 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
 	    : "cc", "memory");
 }
 
 #ifdef KDB
 
 /*
  * Provide inb() and outb() as functions.  They are normally only available as
  * inline functions, thus cannot be called from the debugger.
  */
 
 /* silence compiler warnings */
 u_char inb_(u_short);
 void outb_(u_short, u_char);
 
 /* Callable wrapper around inb(): returns the value read from port. */
 u_char
 inb_(u_short port)
 {
 	return inb(port);
 }
 
 /* Callable wrapper around outb(): writes data to port. */
 void
 outb_(u_short port, u_char data)
 {
 	outb(port, data);
 }
 
 #endif /* KDB */
 
 #undef memset
 #undef memmove
 #undef memcpy
 
 void	*memset_std(void *buf, int c, size_t len);
 void	*memset_erms(void *buf, int c, size_t len);
 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 
 /*
  * memset(), memmove() and memcpy() entry points.
  *
  * Under KCSAN they are plain functions that always forward to the *_std
  * implementations.  Otherwise each one is an ifunc whose resolver picks the
  * *_erms implementation when the CPU reports CPUID_STDEXT_ERMS in
  * cpu_stdext_feature, and the *_std implementation when it does not.
  */
 #ifdef KCSAN
 /*
  * These fail to build as ifuncs when used with KCSAN.
  */
 void *
 memset(void *buf, int c, size_t len)
 {
 
 	return (memset_std(buf, c, len));
 }
 
 void *
 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
 {
 
 	return (memmove_std(dst, src, len));
 }
 
 void *
 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
 {
 
 	return (memcpy_std(dst, src, len));
 }
 #else
 /* Resolver for memset(). */
 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 	    memset_erms : memset_std);
 }
 
 /* Resolver for memmove(). */
 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
     size_t))
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 	    memmove_erms : memmove_std);
 }
 
 /* Resolver for memcpy(). */
 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 	    memcpy_erms : memcpy_std);
 }
 #endif
 
 /*
  * pagezero() is an ifunc: its resolver picks pagezero_erms when the CPU
  * reports CPUID_STDEXT_ERMS in cpu_stdext_feature, and pagezero_std
  * otherwise.
  */
 void	pagezero_std(void *addr);
 void	pagezero_erms(void *addr);
 DEFINE_IFUNC(, void , pagezero, (void *))
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 	    pagezero_erms : pagezero_std);
 }
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index 7085a9b1c540..f5de90484384 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -1,753 +1,1115 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1996, by Steve Passe
  * Copyright (c) 2003, by Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. The name of the developer may NOT be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 #include "opt_sched.h"
 #include "opt_smp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cpuset.h>
 #include <sys/domainset.h>
 #ifdef GPROF 
 #include <sys/gmon.h>
 #endif
+#include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 
 #include <x86/apicreg.h>
 #include <machine/clock.h>
 #include <machine/cputypes.h>
 #include <machine/cpufunc.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/psl.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/tss.h>
 #include <x86/ucode.h>
 #include <machine/cpu.h>
 #include <x86/init.h>
 
 #ifdef DEV_ACPI
 #include <contrib/dev/acpica/include/acpi.h>
 #include <dev/acpica/acpivar.h>
 #endif
 
 #define WARMBOOT_TARGET		0
 #define WARMBOOT_OFF		(KERNBASE + 0x0467)
 #define WARMBOOT_SEG		(KERNBASE + 0x0469)
 
 #define CMOS_REG		(0x70)
 #define CMOS_DATA		(0x71)
 #define BIOS_RESET		(0x0f)
 #define BIOS_WARM		(0x0a)
 
 #define GiB(v)			(v ## ULL << 30)
 
 #define	AP_BOOTPT_SZ		(PAGE_SIZE * 3)
 
 /* Temporary variables for init_secondary()  */
 char *doublefault_stack;
 char *mce_stack;
 char *nmi_stack;
 char *dbg_stack;
 
 /*
  * Local data and functions.
  */
 
 static int	start_ap(int apic_id);
 
 /*
  * Report whether the physical address pa lies inside the range computed
  * from the kernel's btext and _end symbols: from btext - KERNBASE rounded
  * down to a 2MB boundary, up to _end - KERNBASE rounded up to a page
  * boundary.
  */
 static bool
 is_kernel_paddr(vm_paddr_t pa)
 {
 
 	if (pa < trunc_2mpage(btext - KERNBASE))
 		return (false);
 	return (pa < round_page(_end - KERNBASE));
 }
 
 /*
  * Report whether [start, end) is acceptable for the initial AP page
  * tables: AP_BOOTPT_SZ bytes starting at start must not extend past 4GB,
  * and the page containing end must be below Maxmem.
  */
 static bool
 is_mpboot_good(vm_paddr_t start, vm_paddr_t end)
 {
 
 	if (start + AP_BOOTPT_SZ > GiB(4))
 		return (false);
 	return (atop(end) < Maxmem);
 }
 
 /*
  * Calculate usable address in base memory for AP trampoline code.
  *
  * After the trampoline itself has been allocated, AP_BOOTPT_SZ bytes are
  * carved out of one of the physmap regions for the initial AP page tables
  * and their address is stored in mptramp_pagetables.
  *
  * physmap is an array of (start, end) pairs and *physmap_idx is the index
  * of the start of the last pair.  The region the page tables are taken
  * from is shrunk accordingly; if it becomes empty it is removed from the
  * array and *physmap_idx is decreased by 2.  When no region qualifies, the
  * page tables are placed directly below the page containing boot_address.
  */
 void
 mp_bootaddress(vm_paddr_t *physmap, unsigned int *physmap_idx)
 {
 	vm_paddr_t start, end;
 	unsigned int i;
 	bool allocated;
 
 	alloc_ap_trampoline(physmap, physmap_idx);
 
 	/*
 	 * Find a memory region big enough below the 4GB boundary to
 	 * store the initial page tables.  Region must be mapped by
 	 * the direct map.
 	 *
 	 * Note that it needs to be aligned to a page boundary.
 	 *
 	 * The regions are visited from the last pair downwards.  'i' is
 	 * unsigned, so stepping below zero wraps around to a value larger
 	 * than *physmap_idx, which terminates the loop.
 	 */
 	allocated = false;
 	for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
 		/*
 		 * First, try to chomp at the start of the physmap region.
 		 * Kernel binary might claim it already.
 		 */
 		start = round_page(physmap[i]);
 		end = start + AP_BOOTPT_SZ;
 		if (start < end && end <= physmap[i + 1] &&
 		    is_mpboot_good(start, end) &&
 		    !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
 			allocated = true;
 			physmap[i] = end;
 			break;
 		}
 
 		/*
 		 * Second, try to chomp at the end.  Again, check
 		 * against kernel.
 		 */
 		end = trunc_page(physmap[i + 1]);
 		start = end - AP_BOOTPT_SZ;
 		if (start < end && start >= physmap[i] &&
 		    is_mpboot_good(start, end) &&
 		    !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
 			allocated = true;
 			physmap[i + 1] = start;
 			break;
 		}
 	}
 	if (allocated) {
 		mptramp_pagetables = start;
 		if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
 			/*
 			 * The region is now empty: close the gap.  The
 			 * entries that follow pair i are physmap[i + 2]
 			 * through physmap[*physmap_idx + 1], which is
 			 * exactly *physmap_idx - i elements; copying more
 			 * would read past the last valid entry.
 			 */
 			memmove(&physmap[i], &physmap[i + 2],
 			    sizeof(*physmap) * (*physmap_idx - i));
 			*physmap_idx -= 2;
 		}
 	} else {
 		mptramp_pagetables = trunc_page(boot_address) - AP_BOOTPT_SZ;
 		if (bootverbose)
 			printf(
"Cannot find enough space for the initial AP page tables, placing them at %#x\n",
 			    mptramp_pagetables);
 	}
 }
 
 /*
  * Initialize the IPI handlers and start up the AP's.
  */
 void
 cpu_mp_start(void)
 {
 	int i;
 
 	/* Initialize the logical ID to APIC ID table. */
 	for (i = 0; i < MAXCPU; i++) {
 		cpu_apic_ids[i] = -1;
 	}
 
-	/* Install an inter-CPU IPI for TLB invalidation */
-	if (pmap_pcid_enabled) {
-		if (invpcid_works) {
-			setidt(IPI_INVLTLB, pti ?
-			    IDTVEC(invltlb_invpcid_pti_pti) :
-			    IDTVEC(invltlb_invpcid_nopti), SDT_SYSIGT,
-			    SEL_KPL, 0);
-			setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_invpcid_pti) :
-			    IDTVEC(invlpg_invpcid), SDT_SYSIGT, SEL_KPL, 0);
-			setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_invpcid_pti) :
-			    IDTVEC(invlrng_invpcid), SDT_SYSIGT, SEL_KPL, 0);
-		} else {
-			setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pcid_pti) :
-			    IDTVEC(invltlb_pcid), SDT_SYSIGT, SEL_KPL, 0);
-			setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pcid_pti) :
-			    IDTVEC(invlpg_pcid), SDT_SYSIGT, SEL_KPL, 0);
-			setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pcid_pti) :
-			    IDTVEC(invlrng_pcid), SDT_SYSIGT, SEL_KPL, 0);
-		}
-	} else {
-		setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pti) : IDTVEC(invltlb),
-		    SDT_SYSIGT, SEL_KPL, 0);
-		setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pti) : IDTVEC(invlpg),
-		    SDT_SYSIGT, SEL_KPL, 0);
-		setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pti) : IDTVEC(invlrng),
-		    SDT_SYSIGT, SEL_KPL, 0);
-	}
-
-	/* Install an inter-CPU IPI for cache invalidation. */
-	setidt(IPI_INVLCACHE, pti ? IDTVEC(invlcache_pti) : IDTVEC(invlcache),
+	/* Install an inter-CPU IPI for cache and TLB invalidations. */
+	setidt(IPI_INVLOP, pti ? IDTVEC(invlop_pti) : IDTVEC(invlop),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for all-CPU rendezvous */
 	setidt(IPI_RENDEZVOUS, pti ? IDTVEC(rendezvous_pti) :
 	    IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install generic inter-CPU IPI handler */
 	setidt(IPI_BITMAP_VECTOR, pti ? IDTVEC(ipi_intr_bitmap_handler_pti) :
 	    IDTVEC(ipi_intr_bitmap_handler), SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for CPU stop/restart */
 	setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for CPU suspend/resume */
 	setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Set boot_cpu_id if needed. */
 	if (boot_cpu_id == -1) {
 		boot_cpu_id = PCPU_GET(apic_id);
 		cpu_info[boot_cpu_id].cpu_bsp = 1;
 	} else
 		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
 		    ("BSP's APIC ID doesn't match boot_cpu_id"));
 
 	/* Probe logical/physical core configuration. */
 	topo_probe();
 
 	assign_cpu_ids();
 
 	/* Start each Application Processor */
 	init_ops.start_all_aps();
 
 	set_interrupt_apic_ids();
 
 #if defined(DEV_ACPI) && MAXMEMDOM > 1
 	acpi_pxm_set_cpu_locality();
 #endif
 }
 
 /*
  * AP CPU's call this to initialize themselves.
  */
 void
 init_secondary(void)
 {
 	struct pcpu *pc;
 	struct nmi_pcpu *np;
 	struct user_segment_descriptor *gdt;
 	struct region_descriptor ap_gdt;
 	u_int64_t cr0;
 	int cpu, gsel_tss, x;
 
 	/* Set by the startup code for us to use */
 	cpu = bootAP;
 
 	/* Update microcode before doing anything else. */
 	ucode_load_ap(cpu);
 
 	/* Get per-cpu data and save  */
 	pc = &__pcpu[cpu];
 
 	/* prime data page for it to use */
 	pcpu_init(pc, cpu, sizeof(struct pcpu));
 	dpcpu_init(dpcpu, cpu);
 	pc->pc_apic_id = cpu_apic_ids[cpu];
 	pc->pc_prvspace = pc;
 	pc->pc_curthread = 0;
 	pc->pc_tssp = &pc->pc_common_tss;
 	pc->pc_rsp0 = 0;
 	pc->pc_pti_rsp0 = (((vm_offset_t)&pc->pc_pti_stack +
 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
 	gdt = pc->pc_gdt;
 	pc->pc_tss = (struct system_segment_descriptor *)&gdt[GPROC0_SEL];
 	pc->pc_fs32p = &gdt[GUFS32_SEL];
 	pc->pc_gs32p = &gdt[GUGS32_SEL];
 	pc->pc_ldt = (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL];
 	/* See comment in pmap_bootstrap(). */
 	pc->pc_pcid_next = PMAP_PCID_KERN + 2;
 	pc->pc_pcid_gen = 1;
 
+	pc->pc_smp_tlb_gen = 1;
+
 	/* Init tss */
 	pc->pc_common_tss = __pcpu[0].pc_common_tss;
 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
 	    IOPERM_BITMAP_SIZE;
 	pc->pc_common_tss.tss_rsp0 = 0;
 
 	/* The doublefault stack runs on IST1. */
 	np = ((struct nmi_pcpu *)&doublefault_stack[PAGE_SIZE]) - 1;
 	np->np_pcpu = (register_t)pc;
 	pc->pc_common_tss.tss_ist1 = (long)np;
 
 	/* The NMI stack runs on IST2. */
 	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
 	np->np_pcpu = (register_t)pc;
 	pc->pc_common_tss.tss_ist2 = (long)np;
 
 	/* The MC# stack runs on IST3. */
 	np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
 	np->np_pcpu = (register_t)pc;
 	pc->pc_common_tss.tss_ist3 = (long)np;
 
 	/* The DB# stack runs on IST4. */
 	np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1;
 	np->np_pcpu = (register_t)pc;
 	pc->pc_common_tss.tss_ist4 = (long)np;
 
 	/* Prepare private GDT */
 	gdt_segs[GPROC0_SEL].ssd_base = (long)&pc->pc_common_tss;
 	for (x = 0; x < NGDT; x++) {
 		if (x != GPROC0_SEL && x != GPROC0_SEL + 1 &&
 		    x != GUSERLDT_SEL && x != GUSERLDT_SEL + 1)
 			ssdtosd(&gdt_segs[x], &gdt[x]);
 	}
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 	ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	ap_gdt.rd_base = (u_long)gdt;
 	lgdt(&ap_gdt);			/* does magic intra-segment return */
 
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */
 	fix_cpuid();
 
 	lidt(&r_idt);
 
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	ltr(gsel_tss);
 
 	/*
 	 * Set to a known state:
 	 * Set by mpboot.s: CR0_PG, CR0_PE
 	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
 	 */
 	cr0 = rcr0();
 	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
 	load_cr0(cr0);
 
 	amd64_conf_fast_syscall();
 
 	/* signal our startup to the BSP. */
 	mp_naps++;
 
 	/* Spin until the BSP releases the AP's. */
 	while (atomic_load_acq_int(&aps_ready) == 0)
 		ia32_pause();
 
 	init_secondary_tail();
 }
 
 /*******************************************************************
  * local functions and data
  */
 
 #ifdef NUMA
 static void
 mp_realloc_pcpu(int cpuid, int domain)
 {
 	vm_page_t m;
 	vm_offset_t oa, na;
 
 	oa = (vm_offset_t)&__pcpu[cpuid];
 	if (_vm_phys_domain(pmap_kextract(oa)) == domain)
 		return;
 	m = vm_page_alloc_domain(NULL, 0, domain,
 	    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
 	if (m == NULL)
 		return;
 	na = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	pagecopy((void *)oa, (void *)na);
 	pmap_qenter((vm_offset_t)&__pcpu[cpuid], &m, 1);
 	/* XXX old pcpu page leaked. */
 }
 #endif
 
 /*
  * start each AP in our list
  */
 int
 native_start_all_aps(void)
 {
 	u_int64_t *pt4, *pt3, *pt2;
 	u_int32_t mpbioswarmvec;
 	int apic_id, cpu, domain, i;
 	u_char mpbiosreason;
 
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
 	/* copy the AP 1st level boot code */
 	bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
 
 	/* Locate the page tables, they'll be below the trampoline */
 	pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
 	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
 	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
 
 	/* Create the initial 1GB replicated page tables */
 	for (i = 0; i < 512; i++) {
 		/* Each slot of the level 4 pages points to the same level 3 page */
 		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
 		pt4[i] |= PG_V | PG_RW | PG_U;
 
 		/* Each slot of the level 3 pages points to the same level 2 page */
 		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
 		pt3[i] |= PG_V | PG_RW | PG_U;
 
 		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
 		pt2[i] = i * (2 * 1024 * 1024);
 		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
 	}
 
 	/* save the current value of the warm-start vector */
 	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
 	outb(CMOS_REG, BIOS_RESET);
 	mpbiosreason = inb(CMOS_DATA);
 
 	/* setup a vector to our boot code */
 	*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
 	*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
 
 	/* Relocate pcpu areas to the correct domain. */
 #ifdef NUMA
 	if (vm_ndomains > 1)
 		for (cpu = 1; cpu < mp_ncpus; cpu++) {
 			apic_id = cpu_apic_ids[cpu];
 			domain = acpi_pxm_get_cpu_locality(apic_id);
 			mp_realloc_pcpu(cpu, domain);
 		}
 #endif
 
 	/* start each AP */
 	domain = 0;
 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
 		apic_id = cpu_apic_ids[cpu];
 #ifdef NUMA
 		if (vm_ndomains > 1)
 			domain = acpi_pxm_get_cpu_locality(apic_id);
 #endif
 		/* allocate and set up an idle stack data page */
 		bootstacks[cpu] = (void *)kmem_malloc(kstack_pages * PAGE_SIZE,
 		    M_WAITOK | M_ZERO);
 		doublefault_stack = (char *)kmem_malloc(PAGE_SIZE, M_WAITOK |
 		    M_ZERO);
 		mce_stack = (char *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO);
 		nmi_stack = (char *)kmem_malloc_domainset(
 		    DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO);
 		dbg_stack = (char *)kmem_malloc_domainset(
 		    DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO);
 		dpcpu = (void *)kmem_malloc_domainset(DOMAINSET_PREF(domain),
 		    DPCPU_SIZE, M_WAITOK | M_ZERO);
 
 		bootSTK = (char *)bootstacks[cpu] +
 		    kstack_pages * PAGE_SIZE - 8;
 		bootAP = cpu;
 
 		/* attempt to start the Application Processor */
 		if (!start_ap(apic_id)) {
 			/* restore the warmstart vector */
 			*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
 			panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
 		}
 
 		CPU_SET(cpu, &all_cpus);	/* record AP in CPU map */
 	}
 
 	/* restore the warmstart vector */
 	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
 
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, mpbiosreason);
 
 	/* number of APs actually started */
 	return (mp_naps);
 }
 
 
 /*
  * This function starts the AP (application processor) identified
  * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
  * to accomplish this.  This is necessary because of the nuances
  * of the different hardware we might encounter.  It isn't pretty,
  * but it seems to work.
  */
 static int
 start_ap(int apic_id)
 {
 	int vector, ms;
 	int cpus;
 
 	/* calculate the vector */
 	vector = (boot_address >> 12) & 0xff;
 
 	/* used as a watchpoint to signal AP startup */
 	cpus = mp_naps;
 
 	ipi_startup(apic_id, vector);
 
 	/* Wait up to 5 seconds for it to start. */
 	for (ms = 0; ms < 5000; ms++) {
 		if (mp_naps > cpus)
 			return 1;	/* return SUCCESS */
 		DELAY(1000);
 	}
 	return 0;		/* return FAILURE */
 }
 
+/*
+ * Flush the TLB on other CPUs
+ */
+
+/*
+ * Invalidation request.  PCPU pc_smp_tlb_op uses u_int instead of the
+ * enum to avoid both namespace and ABI issues (with enums).
+ */
+enum invl_op_codes {
+      INVL_OP_TLB		= 1,
+      INVL_OP_TLB_INVPCID	= 2,
+      INVL_OP_TLB_INVPCID_PTI	= 3,
+      INVL_OP_TLB_PCID		= 4,
+      INVL_OP_PGRNG		= 5,
+      INVL_OP_PGRNG_INVPCID	= 6,
+      INVL_OP_PGRNG_PCID	= 7,
+      INVL_OP_PG		= 8,
+      INVL_OP_PG_INVPCID	= 9,
+      INVL_OP_PG_PCID		= 10,
+      INVL_OP_CACHE		= 11,
+};
+
+/*
+ * These variables are initialized at startup to reflect how each of
+ * the different kinds of invalidations should be performed on the
+ * current machine and environment.
+ */
+static enum invl_op_codes invl_op_tlb;
+static enum invl_op_codes invl_op_pgrng;
+static enum invl_op_codes invl_op_pg;
+
+/*
+ * Scoreboard of IPI completion notifications from target to IPI initiator.
+ *
+ * Each CPU can initiate shootdown IPI independently from other CPUs.
+ * The initiator enters a critical section, fills its local PCPU
+ * shootdown info (pc_smp_tlb_ vars), then clears the scoreboard
+ * generation at location (cpu, my_cpuid) for each target cpu.  After
+ * that an IPI is sent to all targets, which scan for zeroed scoreboard
+ * generation words.  Upon finding such a word, the shootdown data is
+ * read from the corresponding cpu's pcpu and the generation is set.
+ * Meanwhile the initiator waits for all zeroed generations to update.
+ */
+static uint32_t *invl_scoreboard;
+
+static void
+invl_scoreboard_init(void *arg __unused)
+{
+	u_int i;
+
+	invl_scoreboard = malloc(sizeof(uint32_t) * (mp_maxid + 1) *
+	    (mp_maxid + 1), M_DEVBUF, M_WAITOK);
+	for (i = 0; i < (mp_maxid + 1) * (mp_maxid + 1); i++)
+		invl_scoreboard[i] = 1;
+
+	if (pmap_pcid_enabled) {
+		if (invpcid_works) {
+			if (pti)
+				invl_op_tlb = INVL_OP_TLB_INVPCID_PTI;
+			else
+				invl_op_tlb = INVL_OP_TLB_INVPCID;
+			invl_op_pgrng = INVL_OP_PGRNG_INVPCID;
+			invl_op_pg = INVL_OP_PG_INVPCID;
+		} else {
+			invl_op_tlb = INVL_OP_TLB_PCID;
+			invl_op_pgrng = INVL_OP_PGRNG_PCID;
+			invl_op_pg = INVL_OP_PG_PCID;
+		}
+	} else {
+		invl_op_tlb = INVL_OP_TLB;
+		invl_op_pgrng = INVL_OP_PGRNG;
+		invl_op_pg = INVL_OP_PG;
+	}
+}
+SYSINIT(invl_ops, SI_SUB_SMP, SI_ORDER_FIRST, invl_scoreboard_init, NULL);
+
+static uint32_t *
+invl_scoreboard_getcpu(u_int cpu)
+{
+	return (invl_scoreboard + cpu * (mp_maxid + 1));
+}
+
+static uint32_t *
+invl_scoreboard_slot(u_int cpu)
+{
+	return (invl_scoreboard_getcpu(cpu) + PCPU_GET(cpuid));
+}
+
+/*
+ * Used by pmap to request cache or TLB invalidation on local and
+ * remote processors.  Mask provides the set of remote CPUs which are
+ * to be signalled with the invalidation IPI.  As an optimization, the
+ * curcpu_cb callback is invoked on the calling CPU while waiting for
+ * remote CPUs to complete the operation.
+ *
+ * The callback function is called unconditionally on the caller's
+ * underlying processor, even when this processor is not set in the
+ * mask.  So, the callback function must be prepared to handle such
+ * spurious invocations.
+ *
+ * Interrupts must be enabled when calling the function with smp
+ * started, to avoid deadlock with other IPIs that are protected with
+ * smp_ipi_mtx spinlock at the initiator side.
+ */
+static void
+smp_targeted_tlb_shootdown(cpuset_t mask, pmap_t pmap, vm_offset_t addr1,
+    vm_offset_t addr2, smp_invl_cb_t curcpu_cb, enum invl_op_codes op)
+{
+	cpuset_t other_cpus, mask1;
+	uint32_t generation, *p_cpudone;
+	int cpu;
+
+	/*
+	 * It is not necessary to signal other CPUs while booting or
+	 * when in the debugger.
+	 */
+	if (kdb_active || KERNEL_PANICKED() || !smp_started) {
+		curcpu_cb(pmap, addr1, addr2);
+		return;
+	}
+
+	sched_pin();
+
+	/*
+	 * Check for other cpus.  Return if none.
+	 */
+	if (CPU_ISFULLSET(&mask)) {
+		if (mp_ncpus <= 1)
+			goto nospinexit;
+	} else {
+		CPU_CLR(PCPU_GET(cpuid), &mask);
+		if (CPU_EMPTY(&mask))
+			goto nospinexit;
+	}
+
+	/*
+	 * The initiator must have interrupts enabled, which prevents
+	 * non-invalidation IPIs that take the smp_ipi_mtx spinlock
+	 * from deadlocking with us.  On the other hand, preemption
+	 * must be disabled to pin the initiator to its instance of
+	 * the pcpu pc_smp_tlb data and scoreboard line.
+	 */
+	KASSERT((read_rflags() & PSL_I) != 0,
+	    ("smp_targeted_tlb_shootdown: interrupts disabled"));
+	critical_enter();
+
+	PCPU_SET(smp_tlb_addr1, addr1);
+	PCPU_SET(smp_tlb_addr2, addr2);
+	PCPU_SET(smp_tlb_pmap, pmap);
+	generation = PCPU_GET(smp_tlb_gen);
+	if (++generation == 0)
+		generation = 1;
+	PCPU_SET(smp_tlb_gen, generation);
+	PCPU_SET(smp_tlb_op, op);
+	/* Fence between filling smp_tlb fields and clearing scoreboard. */
+	atomic_thread_fence_rel();
+
+	mask1 = mask;
+	while ((cpu = CPU_FFS(&mask1)) != 0) {
+		cpu--;
+		CPU_CLR(cpu, &mask1);
+		KASSERT(*invl_scoreboard_slot(cpu) != 0,
+		    ("IPI scoreboard is zero, initiator %d target %d",
+		    PCPU_GET(cpuid), cpu));
+		*invl_scoreboard_slot(cpu) = 0;
+	}
+
+	/*
+	 * IPI acts as a fence between writing to the scoreboard above
+	 * (zeroing slot) and reading from it below (wait for
+	 * acknowledge).
+	 */
+	if (CPU_ISFULLSET(&mask)) {
+		ipi_all_but_self(IPI_INVLOP);
+		other_cpus = all_cpus;
+		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+	} else {
+		other_cpus = mask;
+		while ((cpu = CPU_FFS(&mask)) != 0) {
+			cpu--;
+			CPU_CLR(cpu, &mask);
+			CTR3(KTR_SMP, "%s: cpu: %d invl ipi op: %x", __func__,
+			    cpu, op);
+			ipi_send_cpu(cpu, IPI_INVLOP);
+		}
+	}
+	curcpu_cb(pmap, addr1, addr2);
+	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
+		cpu--;
+		CPU_CLR(cpu, &other_cpus);
+		p_cpudone = invl_scoreboard_slot(cpu);
+		while (atomic_load_int(p_cpudone) != generation)
+			ia32_pause();
+	}
+	critical_exit();
+	sched_unpin();
+	return;
+
+nospinexit:
+	curcpu_cb(pmap, addr1, addr2);
+	sched_unpin();
+}
+
+void
+smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(mask, pmap, 0, 0, curcpu_cb, invl_op_tlb);
+#ifdef COUNT_XINVLTLB_HITS
+	ipi_global++;
+#endif
+}
+
+void
+smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap,
+    smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(mask, pmap, addr, 0, curcpu_cb, invl_op_pg);
+#ifdef COUNT_XINVLTLB_HITS
+	ipi_page++;
+#endif
+}
+
+void
+smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
+    pmap_t pmap, smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(mask, pmap, addr1, addr2, curcpu_cb,
+	    invl_op_pgrng);
+#ifdef COUNT_XINVLTLB_HITS
+	ipi_range++;
+	ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
+#endif
+}
+
 void
-invltlb_invpcid_handler(void)
+smp_cache_flush(smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(all_cpus, NULL, 0, 0, curcpu_cb,
+	    INVL_OP_CACHE);
+}
+
+/*
+ * Handlers for TLB related IPIs
+ */
+static void
+invltlb_handler(pmap_t smp_tlb_pmap)
+{
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	if (smp_tlb_pmap == kernel_pmap)
+		invltlb_glob();
+	else
+		invltlb();
+}
+
+static void
+invltlb_invpcid_handler(pmap_t smp_tlb_pmap)
 {
 	struct invpcid_descr d;
-	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	generation = smp_tlb_generation;
 	d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	d.pad = 0;
 	d.addr = 0;
 	invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB :
 	    INVPCID_CTX);
-	PCPU_SET(smp_tlb_done, generation);
 }
 
-void
-invltlb_invpcid_pti_handler(void)
+static void
+invltlb_invpcid_pti_handler(pmap_t smp_tlb_pmap)
 {
 	struct invpcid_descr d;
-	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	generation = smp_tlb_generation;
 	d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	d.pad = 0;
 	d.addr = 0;
 	if (smp_tlb_pmap == kernel_pmap) {
 		/*
 		 * This invalidation actually needs to clear kernel
 		 * mappings from the TLB in the current pmap, but
 		 * since we were asked for the flush in the kernel
 		 * pmap, achieve it by performing global flush.
 		 */
 		invpcid(&d, INVPCID_CTXGLOB);
 	} else {
 		invpcid(&d, INVPCID_CTX);
 		d.pcid |= PMAP_PCID_USER_PT;
 		invpcid(&d, INVPCID_CTX);
 	}
-	PCPU_SET(smp_tlb_done, generation);
 }
 
-void
-invltlb_pcid_handler(void)
+static void
+invltlb_pcid_handler(pmap_t smp_tlb_pmap)
 {
 	uint64_t kcr3, ucr3;
-	uint32_t generation, pcid;
+	uint32_t pcid;
   
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	generation = smp_tlb_generation;	/* Overlap with serialization */
 	if (smp_tlb_pmap == kernel_pmap) {
 		invltlb_glob();
 	} else {
 		/*
 		 * The current pmap might not be equal to
 		 * smp_tlb_pmap.  The clearing of the pm_gen in
 		 * pmap_invalidate_all() takes care of TLB
 		 * invalidation when switching to the pmap on this
 		 * CPU.
 		 */
 		if (PCPU_GET(curpmap) == smp_tlb_pmap) {
 			pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 			kcr3 = smp_tlb_pmap->pm_cr3 | pcid;
 			ucr3 = smp_tlb_pmap->pm_ucr3;
 			if (ucr3 != PMAP_NO_CR3) {
 				ucr3 |= PMAP_PCID_USER_PT | pcid;
 				pmap_pti_pcid_invalidate(ucr3, kcr3);
 			} else
 				load_cr3(kcr3);
 		}
 	}
-	PCPU_SET(smp_tlb_done, generation);
 }
 
-void
-invlpg_invpcid_handler(void)
+static void
+invlpg_handler(vm_offset_t smp_tlb_addr1)
+{
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	invlpg(smp_tlb_addr1);
+}
+
+static void
+invlpg_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1)
 {
 	struct invpcid_descr d;
-	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_pg[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	generation = smp_tlb_generation;	/* Overlap with serialization */
 	invlpg(smp_tlb_addr1);
 	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
 		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
 		    PMAP_PCID_USER_PT;
 		d.pad = 0;
 		d.addr = smp_tlb_addr1;
 		invpcid(&d, INVPCID_ADDR);
 	}
-	PCPU_SET(smp_tlb_done, generation);
 }
 
-void
-invlpg_pcid_handler(void)
+static void
+invlpg_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1)
 {
 	uint64_t kcr3, ucr3;
-	uint32_t generation;
 	uint32_t pcid;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_pg[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	generation = smp_tlb_generation;	/* Overlap with serialization */
 	invlpg(smp_tlb_addr1);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
 		pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 		kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 		pmap_pti_pcid_invlpg(ucr3, kcr3, smp_tlb_addr1);
 	}
-	PCPU_SET(smp_tlb_done, generation);
 }
 
-void
-invlrng_invpcid_handler(void)
+static void
+invlrng_handler(vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2)
+{
+	vm_offset_t addr, addr2;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	addr = smp_tlb_addr1;
+	addr2 = smp_tlb_addr2;
+	do {
+		invlpg(addr);
+		addr += PAGE_SIZE;
+	} while (addr < addr2);
+}
+
+static void
+invlrng_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1,
+    vm_offset_t smp_tlb_addr2)
 {
 	struct invpcid_descr d;
 	vm_offset_t addr, addr2;
-	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_rng[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	addr = smp_tlb_addr1;
 	addr2 = smp_tlb_addr2;
-	generation = smp_tlb_generation;	/* Overlap with serialization */
 	do {
 		invlpg(addr);
 		addr += PAGE_SIZE;
 	} while (addr < addr2);
 	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
 		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
 		    PMAP_PCID_USER_PT;
 		d.pad = 0;
 		d.addr = smp_tlb_addr1;
 		do {
 			invpcid(&d, INVPCID_ADDR);
 			d.addr += PAGE_SIZE;
 		} while (d.addr < addr2);
 	}
-	PCPU_SET(smp_tlb_done, generation);
 }
 
-void
-invlrng_pcid_handler(void)
+static void
+invlrng_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1,
+    vm_offset_t smp_tlb_addr2)
 {
 	vm_offset_t addr, addr2;
 	uint64_t kcr3, ucr3;
-	uint32_t generation;
 	uint32_t pcid;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_rng[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	addr = smp_tlb_addr1;
 	addr2 = smp_tlb_addr2;
-	generation = smp_tlb_generation;	/* Overlap with serialization */
 	do {
 		invlpg(addr);
 		addr += PAGE_SIZE;
 	} while (addr < addr2);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
 		pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 		kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 		pmap_pti_pcid_invlrng(ucr3, kcr3, smp_tlb_addr1, addr2);
 	}
-	PCPU_SET(smp_tlb_done, generation);
+}
+
+static void
+invlcache_handler(void)
+{
+#ifdef COUNT_IPIS
+	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+	wbinvd();
+}
+
+static void
+invlop_handler_one_req(enum invl_op_codes smp_tlb_op, pmap_t smp_tlb_pmap,
+    vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2)
+{
+	switch (smp_tlb_op) {
+	case INVL_OP_TLB:
+		invltlb_handler(smp_tlb_pmap);
+		break;
+	case INVL_OP_TLB_INVPCID:
+		invltlb_invpcid_handler(smp_tlb_pmap);
+		break;
+	case INVL_OP_TLB_INVPCID_PTI:
+		invltlb_invpcid_pti_handler(smp_tlb_pmap);
+		break;
+	case INVL_OP_TLB_PCID:
+		invltlb_pcid_handler(smp_tlb_pmap);
+		break;
+	case INVL_OP_PGRNG:
+		invlrng_handler(smp_tlb_addr1, smp_tlb_addr2);
+		break;
+	case INVL_OP_PGRNG_INVPCID:
+		invlrng_invpcid_handler(smp_tlb_pmap, smp_tlb_addr1,
+		    smp_tlb_addr2);
+		break;
+	case INVL_OP_PGRNG_PCID:
+		invlrng_pcid_handler(smp_tlb_pmap, smp_tlb_addr1,
+		    smp_tlb_addr2);
+		break;
+	case INVL_OP_PG:
+		invlpg_handler(smp_tlb_addr1);
+		break;
+	case INVL_OP_PG_INVPCID:
+		invlpg_invpcid_handler(smp_tlb_pmap, smp_tlb_addr1);
+		break;
+	case INVL_OP_PG_PCID:
+		invlpg_pcid_handler(smp_tlb_pmap, smp_tlb_addr1);
+		break;
+	case INVL_OP_CACHE:
+		invlcache_handler();
+		break;
+	default:
+		__assert_unreachable();
+		break;
+	}
+}
+
+void
+invlop_handler(void)
+{
+	struct pcpu *initiator_pc;
+	pmap_t smp_tlb_pmap;
+	vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
+	u_int initiator_cpu_id;
+	enum invl_op_codes smp_tlb_op;
+	uint32_t *scoreboard, smp_tlb_gen;
+
+	scoreboard = invl_scoreboard_getcpu(PCPU_GET(cpuid));
+	for (;;) {
+		for (initiator_cpu_id = 0; initiator_cpu_id <= mp_maxid;
+		    initiator_cpu_id++) {
+			if (scoreboard[initiator_cpu_id] == 0)
+				break;
+		}
+		if (initiator_cpu_id > mp_maxid)
+			break;
+		initiator_pc = cpuid_to_pcpu[initiator_cpu_id];
+
+		/*
+		 * This acquire fence, paired with the release fence
+		 * in smp_targeted_tlb_shootdown(), sits between
+		 * reading the zeroed scoreboard slot and accessing
+		 * the initiator's PCPU for the pc_smp_tlb values.
+		 */
+		atomic_thread_fence_acq();
+		smp_tlb_pmap = initiator_pc->pc_smp_tlb_pmap;
+		smp_tlb_addr1 = initiator_pc->pc_smp_tlb_addr1;
+		smp_tlb_addr2 = initiator_pc->pc_smp_tlb_addr2;
+		smp_tlb_op = initiator_pc->pc_smp_tlb_op;
+		smp_tlb_gen = initiator_pc->pc_smp_tlb_gen;
+
+		/*
+		 * Ensure that we do not make our scoreboard
+		 * notification visible to the initiator until the
+		 * pc_smp_tlb values are read.  The corresponding
+		 * fence is implicitly provided by the barrier in the
+		 * IPI send operation before the APIC ICR register
+		 * write.
+		 *
+		 * As an optimization, the request is acknowledged
+		 * before the actual invalidation is performed.  This is
+		 * safe because the target CPU cannot return to userspace
+		 * before the handler finishes.  Only an NMI can preempt
+		 * the handler, but it would see the kernel handler frame
+		 * and not touch the not-yet-invalidated user page table.
+		 */
+		atomic_thread_fence_acq();
+		atomic_store_int(&scoreboard[initiator_cpu_id], smp_tlb_gen);
+
+		invlop_handler_one_req(smp_tlb_op, smp_tlb_pmap, smp_tlb_addr1,
+		    smp_tlb_addr2);
+	}
 }
diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h
index b7b546ed2b6d..22c6ed40aa20 100644
--- a/sys/amd64/include/pcpu.h
+++ b/sys/amd64/include/pcpu.h
@@ -1,312 +1,317 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) Peter Wemm <peter@netplex.com.au>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _MACHINE_PCPU_H_
 #define	_MACHINE_PCPU_H_
 
 #ifndef _SYS_CDEFS_H_
 #error "sys/cdefs.h is a prerequisite for this file"
 #endif
 
 #include <machine/segments.h>
 #include <machine/tss.h>
 
 #define	PC_PTI_STACK_SZ	16
 
 struct monitorbuf {
 	int idle_state;		/* Used by cpu_idle_mwait. */
 	int stop_state;		/* Used by cpustop_handler. */
 	char padding[128 - (2 * sizeof(int))];
 };
 _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line");
 
 /*
  * The SMP parts are setup in pmap.c and locore.s for the BSP, and
  * mp_machdep.c sets up the data for the AP's to "see" when they awake.
  * The reason for doing it via a struct is so that an array of pointers
  * to each CPU's data can be set up for things like "check curproc on all
  * other processors"
  */
 #define	PCPU_MD_FIELDS							\
 	struct monitorbuf pc_monitorbuf __aligned(128);	/* cache line */\
 	struct	pcpu *pc_prvspace;	/* Self-reference */		\
 	struct	pmap *pc_curpmap;					\
 	struct	amd64tss *pc_tssp;	/* TSS segment active on CPU */	\
 	void	*pc_pad0;						\
 	uint64_t pc_kcr3;						\
 	uint64_t pc_ucr3;						\
 	uint64_t pc_saved_ucr3;						\
 	register_t pc_rsp0;						\
 	register_t pc_scratch_rsp;	/* User %rsp in syscall */	\
 	register_t pc_scratch_rax;					\
 	u_int	pc_apic_id;						\
 	u_int   pc_acpi_id;		/* ACPI CPU id */		\
 	/* Pointer to the CPU %fs descriptor */				\
 	struct user_segment_descriptor	*pc_fs32p;			\
 	/* Pointer to the CPU %gs descriptor */				\
 	struct user_segment_descriptor	*pc_gs32p;			\
 	/* Pointer to the CPU LDT descriptor */				\
 	struct system_segment_descriptor *pc_ldt;			\
 	/* Pointer to the CPU TSS descriptor */				\
 	struct system_segment_descriptor *pc_tss;			\
 	uint64_t	pc_pm_save_cnt;					\
 	u_int	pc_cmci_mask;		/* MCx banks for CMCI */	\
 	uint64_t pc_dbreg[16];		/* ddb debugging regs */	\
 	uint64_t pc_pti_stack[PC_PTI_STACK_SZ];				\
 	register_t pc_pti_rsp0;						\
 	int pc_dbreg_cmd;		/* ddb debugging reg cmd */	\
 	u_int	pc_vcpu_id;		/* Xen vCPU ID */		\
 	uint32_t pc_pcid_next;						\
 	uint32_t pc_pcid_gen;						\
-	uint32_t pc_smp_tlb_done;	/* TLB op acknowledgement */	\
+	uint32_t pc_unused;						\
 	uint32_t pc_ibpb_set;						\
 	void	*pc_mds_buf;						\
 	void	*pc_mds_buf64;						\
 	uint32_t pc_pad[2];						\
 	uint8_t	pc_mds_tmp[64];						\
 	u_int 	pc_ipi_bitmap;						\
 	struct amd64tss pc_common_tss;					\
 	struct user_segment_descriptor pc_gdt[NGDT];			\
-	char	__pad[2956]		/* pad to UMA_PCPU_ALLOC_SIZE */
+	void	*pc_smp_tlb_pmap;					\
+	uint64_t pc_smp_tlb_addr1;					\
+	uint64_t pc_smp_tlb_addr2;					\
+	uint32_t pc_smp_tlb_gen;					\
+	u_int	pc_smp_tlb_op;						\
+	char	__pad[2924]		/* pad to UMA_PCPU_ALLOC_SIZE */
 
 #define	PC_DBREG_CMD_NONE	0
 #define	PC_DBREG_CMD_LOAD	1
 
 #ifdef _KERNEL
 
 #define MONITOR_STOPSTATE_RUNNING	0
 #define MONITOR_STOPSTATE_STOPPED	1
 
 #if defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF)
 
 /*
  * Evaluates to the byte offset of the per-cpu variable name.
  */
 #define	__pcpu_offset(name)						\
 	__offsetof(struct pcpu, name)
 
 /*
  * Evaluates to the type of the per-cpu variable name.
  */
 #define	__pcpu_type(name)						\
 	__typeof(((struct pcpu *)0)->name)
 
 /*
  * Evaluates to the address of the per-cpu variable name.
  */
 #define	__PCPU_PTR(name) __extension__ ({				\
 	__pcpu_type(name) *__p;						\
 									\
 	__asm __volatile("movq %%gs:%1,%0; addq %2,%0"			\
 	    : "=r" (__p)						\
 	    : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))),	\
 	      "i" (__pcpu_offset(name)));				\
 									\
 	__p;								\
 })
 
 /*
  * Evaluates to the value of the per-cpu variable name.
  */
 #define	__PCPU_GET(name) __extension__ ({				\
 	__pcpu_type(name) __res;					\
 	struct __s {							\
 		u_char	__b[MIN(sizeof(__pcpu_type(name)), 8)];		\
 	} __s;								\
 									\
 	if (sizeof(__res) == 1 || sizeof(__res) == 2 ||			\
 	    sizeof(__res) == 4 || sizeof(__res) == 8) {			\
 		__asm __volatile("mov %%gs:%1,%0"			\
 		    : "=r" (__s)					\
 		    : "m" (*(struct __s *)(__pcpu_offset(name))));	\
 		*(struct __s *)(void *)&__res = __s;			\
 	} else {							\
 		__res = *__PCPU_PTR(name);				\
 	}								\
 	__res;								\
 })
 
 /*
  * Adds the value to the per-cpu counter name.  The implementation
  * must be atomic with respect to interrupts.
  */
 #define	__PCPU_ADD(name, val) do {					\
 	__pcpu_type(name) __val;					\
 	struct __s {							\
 		u_char	__b[MIN(sizeof(__pcpu_type(name)), 8)];		\
 	} __s;								\
 									\
 	__val = (val);							\
 	if (sizeof(__val) == 1 || sizeof(__val) == 2 ||			\
 	    sizeof(__val) == 4 || sizeof(__val) == 8) {			\
 		__s = *(struct __s *)(void *)&__val;			\
 		__asm __volatile("add %1,%%gs:%0"			\
 		    : "=m" (*(struct __s *)(__pcpu_offset(name)))	\
 		    : "r" (__s));					\
 	} else								\
 		*__PCPU_PTR(name) += __val;				\
 } while (0)
 
 /*
  * Increments the value of the per-cpu counter name.  The implementation
  * must be atomic with respect to interrupts.
  */
 #define	__PCPU_INC(name) do {						\
 	CTASSERT(sizeof(__pcpu_type(name)) == 1 ||			\
 	    sizeof(__pcpu_type(name)) == 2 ||				\
 	    sizeof(__pcpu_type(name)) == 4 ||				\
 	    sizeof(__pcpu_type(name)) == 8);				\
 	if (sizeof(__pcpu_type(name)) == 1) {				\
 		__asm __volatile("incb %%gs:%0"				\
 		    : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
 		    : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
 	} else if (sizeof(__pcpu_type(name)) == 2) {			\
 		__asm __volatile("incw %%gs:%0"				\
 		    : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
 		    : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
 	} else if (sizeof(__pcpu_type(name)) == 4) {			\
 		__asm __volatile("incl %%gs:%0"				\
 		    : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
 		    : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
 	} else if (sizeof(__pcpu_type(name)) == 8) {			\
 		__asm __volatile("incq %%gs:%0"				\
 		    : "=m" (*(__pcpu_type(name) *)(__pcpu_offset(name)))\
 		    : "m" (*(__pcpu_type(name) *)(__pcpu_offset(name))));\
 	}								\
 } while (0)
 
 /*
  * Sets the value of the per-cpu variable name to value val.
  */
 #define	__PCPU_SET(name, val) {						\
 	__pcpu_type(name) __val;					\
 	struct __s {							\
 		u_char	__b[MIN(sizeof(__pcpu_type(name)), 8)];		\
 	} __s;								\
 									\
 	__val = (val);							\
 	if (sizeof(__val) == 1 || sizeof(__val) == 2 ||			\
 	    sizeof(__val) == 4 || sizeof(__val) == 8) {			\
 		__s = *(struct __s *)(void *)&__val;			\
 		__asm __volatile("mov %1,%%gs:%0"			\
 		    : "=m" (*(struct __s *)(__pcpu_offset(name)))	\
 		    : "r" (__s));					\
 	} else {							\
 		*__PCPU_PTR(name) = __val;				\
 	}								\
 }
 
 #define	get_pcpu() __extension__ ({					\
 	struct pcpu *__pc;						\
 									\
 	__asm __volatile("movq %%gs:%1,%0"				\
 	    : "=r" (__pc)						\
 	    : "m" (*(struct pcpu *)(__pcpu_offset(pc_prvspace))));	\
 	__pc;								\
 })
 
 #define	PCPU_GET(member)	__PCPU_GET(pc_ ## member)
 #define	PCPU_ADD(member, val)	__PCPU_ADD(pc_ ## member, val)
 #define	PCPU_INC(member)	__PCPU_INC(pc_ ## member)
 #define	PCPU_PTR(member)	__PCPU_PTR(pc_ ## member)
 #define	PCPU_SET(member, val)	__PCPU_SET(pc_ ## member, val)
 
 #define	IS_BSP()	(PCPU_GET(cpuid) == 0)
 
 #define zpcpu_offset_cpu(cpu)	((uintptr_t)&__pcpu[0] + UMA_PCPU_ALLOC_SIZE * cpu)
 #define zpcpu_base_to_offset(base) (void *)((uintptr_t)(base) - (uintptr_t)&__pcpu[0])
 #define zpcpu_offset_to_base(base) (void *)((uintptr_t)(base) + (uintptr_t)&__pcpu[0])
 
 #define zpcpu_sub_protected(base, n) do {				\
 	ZPCPU_ASSERT_PROTECTED();					\
 	zpcpu_sub(base, n);						\
 } while (0)
 
 #define zpcpu_set_protected(base, n) do {				\
 	__typeof(*base) __n = (n);					\
 	ZPCPU_ASSERT_PROTECTED();					\
 	switch (sizeof(*base)) {					\
 	case 4:								\
 		__asm __volatile("movl\t%1,%%gs:(%0)"			\
 		    : : "r" (base), "ri" (__n) : "memory", "cc");	\
 		break;							\
 	case 8:								\
 		__asm __volatile("movq\t%1,%%gs:(%0)"			\
 		    : : "r" (base), "ri" (__n) : "memory", "cc");	\
 		break;							\
 	default:							\
 		*zpcpu_get(base) = __n;					\
 	}								\
 } while (0);
 
 #define zpcpu_add(base, n) do {						\
 	__typeof(*base) __n = (n);					\
 	CTASSERT(sizeof(*base) == 4 || sizeof(*base) == 8);		\
 	switch (sizeof(*base)) {					\
 	case 4:								\
 		__asm __volatile("addl\t%1,%%gs:(%0)"			\
 		    : : "r" (base), "ri" (__n) : "memory", "cc");	\
 		break;							\
 	case 8:								\
 		__asm __volatile("addq\t%1,%%gs:(%0)"			\
 		    : : "r" (base), "ri" (__n) : "memory", "cc");	\
 		break;							\
 	}								\
 } while (0)
 
 #define zpcpu_add_protected(base, n) do {				\
 	ZPCPU_ASSERT_PROTECTED();					\
 	zpcpu_add(base, n);						\
 } while (0)
 
 #define zpcpu_sub(base, n) do {						\
 	__typeof(*base) __n = (n);					\
 	CTASSERT(sizeof(*base) == 4 || sizeof(*base) == 8);		\
 	switch (sizeof(*base)) {					\
 	case 4:								\
 		__asm __volatile("subl\t%1,%%gs:(%0)"			\
 		    : : "r" (base), "ri" (__n) : "memory", "cc");	\
 		break;							\
 	case 8:								\
 		__asm __volatile("subq\t%1,%%gs:(%0)"			\
 		    : : "r" (base), "ri" (__n) : "memory", "cc");	\
 		break;							\
 	}								\
 } while (0);
 
 #else /* !__GNUCLIKE_ASM || !__GNUCLIKE___TYPEOF */
 
 #error "this file needs to be ported to your compiler"
 
 #endif /* __GNUCLIKE_ASM && __GNUCLIKE___TYPEOF */
 
 #endif /* _KERNEL */
 
 #endif /* !_MACHINE_PCPU_H_ */
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 2ecfe62cf9fb..d5b5fa9c5b81 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -1,67 +1,47 @@
 /*-
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.org> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
  * $FreeBSD$
  *
  */
 
 #ifndef _MACHINE_SMP_H_
 #define _MACHINE_SMP_H_
 
 #ifdef _KERNEL
 
 #ifdef SMP
 
 #ifndef LOCORE
 
 #include <x86/x86_smp.h>
 
 /* global symbols in mpboot.S */
 extern char			mptramp_start[];
 extern u_int32_t		mptramp_pagetables;
 
 /* IPI handlers */
 inthand_t
 	IDTVEC(justreturn),	/* interrupt CPU with minimum overhead */
 	IDTVEC(justreturn1_pti),
-	IDTVEC(invltlb_pti),
-	IDTVEC(invltlb_pcid_pti),
-	IDTVEC(invltlb_pcid),	/* TLB shootdowns - global, pcid */
-	IDTVEC(invltlb_invpcid_pti_pti),
-	IDTVEC(invltlb_invpcid_nopti),
-	IDTVEC(invlpg_pti),
-	IDTVEC(invlpg_invpcid_pti),
-	IDTVEC(invlpg_invpcid),
-	IDTVEC(invlpg_pcid_pti),
-	IDTVEC(invlpg_pcid),
-	IDTVEC(invlrng_pti),
-	IDTVEC(invlrng_invpcid_pti),
-	IDTVEC(invlrng_invpcid),
-	IDTVEC(invlrng_pcid_pti),
-	IDTVEC(invlrng_pcid),
-	IDTVEC(invlcache_pti),
+	IDTVEC(invlop_pti),
+	IDTVEC(invlop),
 	IDTVEC(ipi_intr_bitmap_handler_pti),
 	IDTVEC(cpustop_pti),
 	IDTVEC(cpususpend_pti),
 	IDTVEC(rendezvous_pti);
 
-void	invltlb_pcid_handler(void);
-void	invltlb_invpcid_handler(void);
-void	invltlb_invpcid_pti_handler(void);
-void	invlpg_invpcid_handler(void);
-void	invlpg_pcid_handler(void);
-void	invlrng_invpcid_handler(void);
-void	invlrng_pcid_handler(void);
+void	invlop_handler(void);
 int	native_start_all_aps(void);
 void	mp_bootaddress(vm_paddr_t *, unsigned int *);
 
 #endif /* !LOCORE */
 #endif /* SMP */
 
 #endif /* _KERNEL */
 #endif /* _MACHINE_SMP_H_ */
diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c
index 953e34d2962c..663fd98e5c8a 100644
--- a/sys/i386/i386/mp_machdep.c
+++ b/sys/i386/i386/mp_machdep.c
@@ -1,469 +1,689 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1996, by Steve Passe
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. The name of the developer may NOT be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include "opt_apic.h"
 #include "opt_cpu.h"
 #include "opt_kstack_pages.h"
 #include "opt_pmap.h"
 #include "opt_sched.h"
 #include "opt_smp.h"
 
 #if !defined(lint)
 #if !defined(SMP)
 #error How did you get here?
 #endif
 
 #ifndef DEV_APIC
 #error The apic device is required for SMP, add "device apic" to your config file.
 #endif
 #endif /* not lint */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cons.h>	/* cngetc() */
 #include <sys/cpuset.h>
 #ifdef GPROF 
 #include <sys/gmon.h>
 #endif
+#include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
 #include <x86/apicreg.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/psl.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <x86/ucode.h>
 
 #ifdef DEV_ACPI
 #include <contrib/dev/acpica/include/acpi.h>
 #include <dev/acpica/acpivar.h>
 #endif
 
 #define WARMBOOT_TARGET		0
 #define WARMBOOT_OFF		(PMAP_MAP_LOW + 0x0467)
 #define WARMBOOT_SEG		(PMAP_MAP_LOW + 0x0469)
 
 #define CMOS_REG		(0x70)
 #define CMOS_DATA		(0x71)
 #define BIOS_RESET		(0x0f)
 #define BIOS_WARM		(0x0a)
 
 /*
  * this code MUST be enabled here and in mpboot.s.
  * it follows the very early stages of AP boot by placing values in CMOS ram.
  * it NORMALLY will never be needed and thus the primitive method for enabling.
  *
 #define CHECK_POINTS
  */
 
 #if defined(CHECK_POINTS)
 #define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
 #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))
 
 #define CHECK_INIT(D);				\
 	CHECK_WRITE(0x34, (D));			\
 	CHECK_WRITE(0x35, (D));			\
 	CHECK_WRITE(0x36, (D));			\
 	CHECK_WRITE(0x37, (D));			\
 	CHECK_WRITE(0x38, (D));			\
 	CHECK_WRITE(0x39, (D));
 
 #define CHECK_PRINT(S);				\
 	printf("%s: %d, %d, %d, %d, %d, %d\n",	\
 	   (S),					\
 	   CHECK_READ(0x34),			\
 	   CHECK_READ(0x35),			\
 	   CHECK_READ(0x36),			\
 	   CHECK_READ(0x37),			\
 	   CHECK_READ(0x38),			\
 	   CHECK_READ(0x39));
 
 #else				/* CHECK_POINTS */
 
 #define CHECK_INIT(D)
 #define CHECK_PRINT(S)
 #define CHECK_WRITE(A, D)
 
 #endif				/* CHECK_POINTS */
 
 /*
  * Local data and functions.
  */
 
 static void	install_ap_tramp(void);
 static int	start_all_aps(void);
 static int	start_ap(int apic_id);
 
 static char *ap_copyout_buf;
 static char *ap_tramp_stack_base;
 /*
  * Initialize the IPI handlers and start up the AP's.
  */
 void
 cpu_mp_start(void)
 {
 	int i;
 
 	/* Initialize the logical ID to APIC ID table. */
 	for (i = 0; i < MAXCPU; i++) {
 		cpu_apic_ids[i] = -1;
 	}
 
 	/* Install an inter-CPU IPI for TLB invalidation */
 	setidt(IPI_INVLTLB, IDTVEC(invltlb),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IPI_INVLPG, IDTVEC(invlpg),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IPI_INVLRNG, IDTVEC(invlrng),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install an inter-CPU IPI for cache invalidation. */
 	setidt(IPI_INVLCACHE, IDTVEC(invlcache),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install an inter-CPU IPI for all-CPU rendezvous */
 	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install generic inter-CPU IPI handler */
 	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install an inter-CPU IPI for CPU stop/restart */
 	setidt(IPI_STOP, IDTVEC(cpustop),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install an inter-CPU IPI for CPU suspend/resume */
 	setidt(IPI_SUSPEND, IDTVEC(cpususpend),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Set boot_cpu_id if needed. */
 	if (boot_cpu_id == -1) {
 		boot_cpu_id = PCPU_GET(apic_id);
 		cpu_info[boot_cpu_id].cpu_bsp = 1;
 	} else
 		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
 		    ("BSP's APIC ID doesn't match boot_cpu_id"));
 
 	/* Probe logical/physical core configuration. */
 	topo_probe();
 
 	assign_cpu_ids();
 
 	/* Start each Application Processor */
 	start_all_aps();
 
 	set_interrupt_apic_ids();
 
 #if defined(DEV_ACPI) && MAXMEMDOM > 1
 	acpi_pxm_set_cpu_locality();
 #endif
 }
 
 /*
  * AP CPU's call this to initialize themselves.
  */
 void
 init_secondary(void)
 {
 	struct pcpu *pc;
 	struct i386tss *common_tssp;
 	struct region_descriptor r_gdt, r_idt;
 	int gsel_tss, myid, x;
 	u_int cr0;
 
 	/* bootAP is set in start_ap() to our ID. */
 	myid = bootAP;
 
 	/* Update microcode before doing anything else. */
 	ucode_load_ap(myid);
 
 	/* Get per-cpu data */
 	pc = &__pcpu[myid];
 
 	/* prime data page for it to use */
 	pcpu_init(pc, myid, sizeof(struct pcpu));
 	dpcpu_init(dpcpu, myid);
 	pc->pc_apic_id = cpu_apic_ids[myid];
 	pc->pc_prvspace = pc;
 	pc->pc_curthread = 0;
 	pc->pc_common_tssp = common_tssp = &(__pcpu[0].pc_common_tssp)[myid];
 
 	fix_cpuid();
 
 	gdt_segs[GPRIV_SEL].ssd_base = (int)pc;
 	gdt_segs[GPROC0_SEL].ssd_base = (int)common_tssp;
 	gdt_segs[GLDT_SEL].ssd_base = (int)ldt;
 
 	for (x = 0; x < NGDT; x++) {
 		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
 	}
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base = (int) &gdt[myid * NGDT];
 	lgdt(&r_gdt);			/* does magic intra-segment return */
 
 	r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1;
 	r_idt.rd_base = (int)idt;
 	lidt(&r_idt);
 
 	lldt(_default_ldt);
 	PCPU_SET(currentldt, _default_ldt);
 
 	PCPU_SET(trampstk, (uintptr_t)ap_tramp_stack_base + TRAMP_STACK_SZ -
 	    VM86_STACK_SPACE);
 
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
 	common_tssp->tss_esp0 = PCPU_GET(trampstk);
 	common_tssp->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
 	common_tssp->tss_ioopt = sizeof(struct i386tss) << 16;
 	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
 	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
 	ltr(gsel_tss);
 
 	PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd);
 	PCPU_SET(copyout_buf, ap_copyout_buf);
 
 	/*
 	 * Set to a known state:
 	 * Set by mpboot.s: CR0_PG, CR0_PE
 	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
 	 */
 	cr0 = rcr0();
 	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
 	load_cr0(cr0);
 	CHECK_WRITE(0x38, 5);
 	
 	/* signal our startup to the BSP. */
 	mp_naps++;
 	CHECK_WRITE(0x39, 6);
 
 	/* Spin until the BSP releases the AP's. */
 	while (atomic_load_acq_int(&aps_ready) == 0)
 		ia32_pause();
 
 	/* BSP may have changed PTD while we were waiting */
 	invltlb();
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 	lidt(&r_idt);
 #endif
 
 	init_secondary_tail();
 }
 
 /*
  * start each AP in our list
  */
 #define TMPMAP_START 1
 static int
 start_all_aps(void)
 {
 	u_char mpbiosreason;
 	u_int32_t mpbioswarmvec;
 	int apic_id, cpu;
 
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
 	pmap_remap_lower(true);
 
 	/* install the AP 1st level boot code */
 	install_ap_tramp();
 
 	/* save the current value of the warm-start vector */
 	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
 	outb(CMOS_REG, BIOS_RESET);
 	mpbiosreason = inb(CMOS_DATA);
 
 	/* take advantage of the P==V mapping for PTD[0] for AP boot */
 
 	/* start each AP */
 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
 		apic_id = cpu_apic_ids[cpu];
 
 		/* allocate and set up a boot stack data page */
 		bootstacks[cpu] = (char *)kmem_malloc(kstack_pages * PAGE_SIZE,
 		    M_WAITOK | M_ZERO);
 		dpcpu = (void *)kmem_malloc(DPCPU_SIZE, M_WAITOK | M_ZERO);
 		/* setup a vector to our boot code */
 		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
 		*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
 		outb(CMOS_REG, BIOS_RESET);
 		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
 
 		bootSTK = (char *)bootstacks[cpu] + kstack_pages *
 		    PAGE_SIZE - 4;
 		bootAP = cpu;
 
 		ap_tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT);
 		ap_copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT);
 
 		/* attempt to start the Application Processor */
 		CHECK_INIT(99);	/* setup checkpoints */
 		if (!start_ap(apic_id)) {
 			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
 			CHECK_PRINT("trace");	/* show checkpoints */
 			/* better panic as the AP may be running loose */
 			printf("panic y/n? [y] ");
 			if (cngetc() != 'n')
 				panic("bye-bye");
 		}
 		CHECK_PRINT("trace");		/* show checkpoints */
 
 		CPU_SET(cpu, &all_cpus);	/* record AP in CPU map */
 	}
 
 	pmap_remap_lower(false);
 
 	/* restore the warmstart vector */
 	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
 
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, mpbiosreason);
 
 	/* number of APs actually started */
 	return mp_naps;
 }
 
 /*
  * load the 1st level AP boot code into base memory.
  */
 
 /* targets for relocation */
 extern void bigJump(void);
 extern void bootCodeSeg(void);
 extern void bootDataSeg(void);
 extern void MPentry(void);
 extern u_int MP_GDT;
 extern u_int mp_gdtbase;
 
 static void
 install_ap_tramp(void)
 {
 	int     x;
 	int     size = *(int *) ((u_long) & bootMP_size);
 	vm_offset_t va = boot_address;
 	u_char *src = (u_char *) ((u_long) bootMP);
 	u_char *dst = (u_char *) va;
 	u_int   boot_base = (u_int) bootMP;
 	u_int8_t *dst8;
 	u_int16_t *dst16;
 	u_int32_t *dst32;
 
 	KASSERT (size <= PAGE_SIZE,
 	    ("'size' do not fit into PAGE_SIZE, as expected."));
 	pmap_kenter(va, boot_address);
 	pmap_invalidate_page (kernel_pmap, va);
 	for (x = 0; x < size; ++x)
 		*dst++ = *src++;
 
 	/*
 	 * modify addresses in code we just moved to basemem. unfortunately we
 	 * need fairly detailed info about mpboot.s for this to work.  changes
 	 * to mpboot.s might require changes here.
 	 */
 
 	/* boot code is located in KERNEL space */
 	dst = (u_char *) va;
 
 	/* modify the lgdt arg */
 	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
 	*dst32 = boot_address + ((u_int) & MP_GDT - boot_base);
 
 	/* modify the ljmp target for MPentry() */
 	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
 	*dst32 = (u_int)MPentry;
 
 	/* modify the target for boot code segment */
 	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
 	dst8 = (u_int8_t *) (dst16 + 1);
 	*dst16 = (u_int) boot_address & 0xffff;
 	*dst8 = ((u_int) boot_address >> 16) & 0xff;
 
 	/* modify the target for boot data segment */
 	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
 	dst8 = (u_int8_t *) (dst16 + 1);
 	*dst16 = (u_int) boot_address & 0xffff;
 	*dst8 = ((u_int) boot_address >> 16) & 0xff;
 }
 
 /*
  * This function starts the AP (application processor) identified
  * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
  * to accomplish this.  This is necessary because of the nuances
  * of the different hardware we might encounter.  It isn't pretty,
  * but it seems to work.
  */
 static int
 start_ap(int apic_id)
 {
 	int vector, ms;
 	int cpus;
 
 	/* calculate the vector */
 	vector = (boot_address >> 12) & 0xff;
 
 	/* used as a watchpoint to signal AP startup */
 	cpus = mp_naps;
 
 	ipi_startup(apic_id, vector);
 
 	/* Wait up to 5 seconds for it to start. */
 	for (ms = 0; ms < 5000; ms++) {
 		if (mp_naps > cpus)
 			return 1;	/* return SUCCESS */
 		DELAY(1000);
 	}
 	return 0;		/* return FAILURE */
 }
+
+/*
+ * Flush the TLB on other CPUs
+ */
+
+/* Variables needed for SMP tlb shootdown. */
+vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
+pmap_t smp_tlb_pmap;
+volatile uint32_t smp_tlb_generation;
+
+/*
+ * Used by pmap to request cache or TLB invalidation on local and
+ * remote processors.  Mask provides the set of remote CPUs which are
+ * to be signalled with the invalidation IPI, specified by vector.  As
+ * an optimization, the curcpu_cb callback is invoked on the calling
+ * CPU while waiting for remote CPUs to complete the operation.
+ *
+ * The callback function is called unconditionally on the caller's
+ * underlying processor, even when this processor is not set in the
+ * mask.  So, the callback function must be prepared to handle such
+ * spurious invocations.
+ */
+static void
+smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
+    vm_offset_t addr1, vm_offset_t addr2, smp_invl_cb_t curcpu_cb)
+{
+	cpuset_t other_cpus;
+	volatile uint32_t *p_cpudone;
+	uint32_t generation;
+	int cpu;
+
+	/*
+	 * It is not necessary to signal other CPUs while booting, after
+	 * a panic, or when in the debugger; only the callback is run.
+	 */
+	if (kdb_active || KERNEL_PANICKED() || !smp_started) {
+		curcpu_cb(pmap, addr1, addr2);
+		return;
+	}
+
+	sched_pin();
+
+	/*
+	 * Check for other cpus.  If none, just run the local callback.
+	 */
+	if (CPU_ISFULLSET(&mask)) {
+		if (mp_ncpus <= 1)
+			goto nospinexit;
+	} else {
+		CPU_CLR(PCPU_GET(cpuid), &mask);
+		if (CPU_EMPTY(&mask))
+			goto nospinexit;
+	}
+
+	KASSERT((read_eflags() & PSL_I) != 0,
+	    ("smp_targeted_tlb_shootdown: interrupts disabled"));
+	mtx_lock_spin(&smp_ipi_mtx);
+	smp_tlb_addr1 = addr1;	/* request arguments read by the handlers */
+	smp_tlb_addr2 = addr2;
+	smp_tlb_pmap = pmap;
+	generation = ++smp_tlb_generation;	/* value targets will ack */
+	if (CPU_ISFULLSET(&mask)) {
+		ipi_all_but_self(vector);
+		other_cpus = all_cpus;
+		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
+	} else {
+		other_cpus = mask;
+		while ((cpu = CPU_FFS(&mask)) != 0) {
+			cpu--;	/* CPU_FFS() result is 1-based */
+			CPU_CLR(cpu, &mask);
+			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
+			    cpu, vector);
+			ipi_send_cpu(cpu, vector);
+		}
+	}
+	curcpu_cb(pmap, addr1, addr2);	/* runs while remote CPUs work */
+	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
+		cpu--;
+		CPU_CLR(cpu, &other_cpus);
+		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
+		while (*p_cpudone != generation)	/* wait for the ack */
+			ia32_pause();
+	}
+	mtx_unlock_spin(&smp_ipi_mtx);
+	sched_unpin();
+	return;
+
+nospinexit:	/* no remote CPUs to signal; smp_ipi_mtx was not taken */
+	curcpu_cb(pmap, addr1, addr2);
+	sched_unpin();
+}
+
+void
+smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0, curcpu_cb);
+#ifdef COUNT_XINVLTLB_HITS
+	ipi_global++;	/* per call, even when no IPI was sent */
+#endif /* COUNT_XINVLTLB_HITS */
+}
+
+void
+smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap,
+    smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0, curcpu_cb);
+#ifdef COUNT_XINVLTLB_HITS
+	ipi_page++;	/* per call, even when no IPI was sent */
+#endif /* COUNT_XINVLTLB_HITS */
+}
+
+void
+smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
+    pmap_t pmap, smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, addr2,
+	    curcpu_cb);
+#ifdef COUNT_XINVLTLB_HITS
+	ipi_range++;	/* per call, even when no IPI was sent */
+	ipi_range_size += (addr2 - addr1) / PAGE_SIZE;	/* in pages */
+#endif /* COUNT_XINVLTLB_HITS */
+}
+
+void
+smp_cache_flush(smp_invl_cb_t curcpu_cb)
+{
+	smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 0, 0,
+	    curcpu_cb);	/* no pmap or address range is passed */
+}
+
+/*
+ * Handlers for TLB and cache invalidation IPIs
+ */
+void
+invltlb_handler(void)
+{
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_gbl[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	/*
+	 * Reading the generation here allows greater parallelism
+	 * since invalidating the TLB is a serializing operation.
+	 */
+	generation = smp_tlb_generation;
+	if (smp_tlb_pmap == kernel_pmap)	/* no flush for other pmaps */
+		invltlb_glob();
+	PCPU_SET(smp_tlb_done, generation);	/* initiator spins on this */
+}
+
+void
+invlpg_handler(void)
+{
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_pg[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	if (smp_tlb_pmap == kernel_pmap)	/* no flush for other pmaps */
+		invlpg(smp_tlb_addr1);
+	PCPU_SET(smp_tlb_done, generation);	/* initiator spins on this */
+}
+
+void
+invlrng_handler(void)
+{
+	vm_offset_t addr, addr2;
+	uint32_t generation;
+
+#ifdef COUNT_XINVLTLB_HITS
+	xhits_rng[PCPU_GET(cpuid)]++;
+#endif /* COUNT_XINVLTLB_HITS */
+#ifdef COUNT_IPIS
+	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	addr = smp_tlb_addr1;
+	addr2 = smp_tlb_addr2;
+	generation = smp_tlb_generation;	/* Overlap with serialization */
+	if (smp_tlb_pmap == kernel_pmap) {	/* no flush for other pmaps */
+		do {	/* always flushes at least one page */
+			invlpg(addr);
+			addr += PAGE_SIZE;
+		} while (addr < addr2);	/* addr2 is exclusive */
+	}
+
+	PCPU_SET(smp_tlb_done, generation);	/* initiator spins on this */
+}
+
+void
+invlcache_handler(void)
+{
+	uint32_t generation;
+
+#ifdef COUNT_IPIS
+	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
+#endif /* COUNT_IPIS */
+
+	/*
+	 * Reading the generation here allows greater parallelism
+	 * since wbinvd is a serializing instruction.  Without the
+	 * temporary, we'd wait for wbinvd to complete, then the read
+	 * would execute, then the dependent write, which must then
+	 * complete before return from interrupt.
+	 */
+	generation = smp_tlb_generation;
+	wbinvd();
+	PCPU_SET(smp_tlb_done, generation);	/* initiator spins on this */
+}
diff --git a/sys/i386/include/smp.h b/sys/i386/include/smp.h
index 4fcb55a41996..395695d3ecd2 100644
--- a/sys/i386/include/smp.h
+++ b/sys/i386/include/smp.h
@@ -1,37 +1,48 @@
 /*-
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.org> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
  * $FreeBSD$
  *
  */
 
 #ifndef _MACHINE_SMP_H_
 #define _MACHINE_SMP_H_
 
 #ifdef _KERNEL
 
 #ifdef SMP
 
 #ifndef LOCORE
 
 #include <x86/x86_smp.h>
 
 #include <sys/bus.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #include <machine/pcb.h>
 
+inthand_t
+	IDTVEC(invltlb),	/* TLB shootdowns - global */
+	IDTVEC(invlpg),		/* TLB shootdowns - 1 page */
+	IDTVEC(invlrng),	/* TLB shootdowns - page range */
+	IDTVEC(invlcache);	/* Write back and invalidate cache */
+
 /* functions in mpboot.s */
 void bootMP(void);
 
+void	invltlb_handler(void);
+void	invlpg_handler(void);
+void	invlrng_handler(void);
+void	invlcache_handler(void);
+
 #endif /* !LOCORE */
 #endif /* SMP */
 
 #endif /* _KERNEL */
 #endif /* _MACHINE_SMP_H_ */
diff --git a/sys/x86/include/apicvar.h b/sys/x86/include/apicvar.h
index de85cf9198fd..866dafe6dca4 100644
--- a/sys/x86/include/apicvar.h
+++ b/sys/x86/include/apicvar.h
@@ -1,491 +1,492 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _X86_APICVAR_H_
 #define _X86_APICVAR_H_
 
 /*
  * Local && I/O APIC variable definitions.
  */
 
 /*
  * Layout of local APIC interrupt vectors:
  *
  *	0xff (255)  +-------------+
  *                  |             | 15 (Spurious / IPIs / Local Interrupts)
  *	0xf0 (240)  +-------------+
  *                  |             | 14 (I/O Interrupts / Timer)
  *	0xe0 (224)  +-------------+
  *                  |             | 13 (I/O Interrupts)
  *	0xd0 (208)  +-------------+
  *                  |             | 12 (I/O Interrupts)
  *	0xc0 (192)  +-------------+
  *                  |             | 11 (I/O Interrupts)
  *	0xb0 (176)  +-------------+
  *                  |             | 10 (I/O Interrupts)
  *	0xa0 (160)  +-------------+
  *                  |             | 9 (I/O Interrupts)
  *	0x90 (144)  +-------------+
  *                  |             | 8 (I/O Interrupts / System Calls)
  *	0x80 (128)  +-------------+
  *                  |             | 7 (I/O Interrupts)
  *	0x70 (112)  +-------------+
  *                  |             | 6 (I/O Interrupts)
  *	0x60 (96)   +-------------+
  *                  |             | 5 (I/O Interrupts)
  *	0x50 (80)   +-------------+
  *                  |             | 4 (I/O Interrupts)
  *	0x40 (64)   +-------------+
  *                  |             | 3 (I/O Interrupts)
  *	0x30 (48)   +-------------+
  *                  |             | 2 (ATPIC Interrupts)
  *	0x20 (32)   +-------------+
  *                  |             | 1 (Exceptions, traps, faults, etc.)
  *	0x10 (16)   +-------------+
  *                  |             | 0 (Exceptions, traps, faults, etc.)
  *	0x00 (0)    +-------------+
  *
  * Note: 0x80 needs to be handled specially and not allocated to an
  * I/O device!
  */
 
 #define	xAPIC_MAX_APIC_ID	0xfe
 #define	xAPIC_ID_ALL		0xff
 #define	MAX_APIC_ID		0x200
 #define	APIC_ID_ALL		0xffffffff
 
 #define	IOAPIC_MAX_ID		xAPIC_MAX_APIC_ID
 
 /* I/O Interrupts are used for external devices such as ISA, PCI, etc. */
 #define	APIC_IO_INTS	(IDT_IO_INTS + 16)
 #define	APIC_NUM_IOINTS	191
 
 /* The timer interrupt is used for clock handling and drives hardclock, etc. */
 #define	APIC_TIMER_INT	(APIC_IO_INTS + APIC_NUM_IOINTS)
 
 /*  
  ********************* !!! WARNING !!! ******************************
  * Each local apic has an interrupt receive fifo that is two entries deep
  * for each interrupt priority class (higher 4 bits of interrupt vector).
  * Once the fifo is full the APIC can no longer receive interrupts for this
  * class and sending IPIs from other CPUs will be blocked.
  * To avoid deadlocks there should be no more than two IPI interrupts
  * pending at the same time.
  * Currently this is guaranteed by dividing the IPIs in two groups that have 
  * each at most one IPI interrupt pending. The first group is protected by the
  * smp_ipi_mtx and waits for the completion of the IPI (Only one IPI user 
  * at a time) The second group uses a single interrupt and a bitmap to avoid
  * redundant IPI interrupts.
  */ 
 
 /* Interrupts for local APIC LVT entries other than the timer. */
 #define	APIC_LOCAL_INTS	240
 #define	APIC_ERROR_INT	APIC_LOCAL_INTS
 #define	APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
 #define	APIC_CMC_INT	(APIC_LOCAL_INTS + 2)
 #define	APIC_IPI_INTS	(APIC_LOCAL_INTS + 3)
 
 #define	IPI_RENDEZVOUS	(APIC_IPI_INTS)		/* Inter-CPU rendezvous. */
-#define	IPI_INVLTLB	(APIC_IPI_INTS + 1)	/* TLB Shootdown IPIs */
+#define	IPI_INVLOP	(APIC_IPI_INTS + 1)	/* TLB Shootdown IPIs, amd64 */
+#define	IPI_INVLTLB	(APIC_IPI_INTS + 1)	/* TLB Shootdown IPIs, i386 */
 #define	IPI_INVLPG	(APIC_IPI_INTS + 2)
 #define	IPI_INVLRNG	(APIC_IPI_INTS + 3)
 #define	IPI_INVLCACHE	(APIC_IPI_INTS + 4)
 /* Vector to handle bitmap based IPIs */
 #define	IPI_BITMAP_VECTOR	(APIC_IPI_INTS + 5) 
 
 /* IPIs handled by IPI_BITMAP_VECTOR */
 #define	IPI_AST		0 	/* Generate software trap. */
 #define IPI_PREEMPT     1
 #define IPI_HARDCLOCK   2
 #define	IPI_TRACE	3	/* Collect stack trace. */
 #define	IPI_BITMAP_LAST IPI_TRACE
 #define IPI_IS_BITMAPED(x) ((x) <= IPI_BITMAP_LAST)
 
 #define	IPI_STOP	(APIC_IPI_INTS + 6)	/* Stop CPU until restarted. */
 #define	IPI_SUSPEND	(APIC_IPI_INTS + 7)	/* Suspend CPU until restarted. */
 #define	IPI_DYN_FIRST	(APIC_IPI_INTS + 8)
 #define	IPI_DYN_LAST	(254)			/* IPIs allocated at runtime */
 
 /*
  * IPI_STOP_HARD does not need to occupy a slot in the IPI vector space since
  * it is delivered using an NMI anyways.
  */
 #define	IPI_NMI_FIRST	255
 #define	IPI_STOP_HARD	255			/* Stop CPU with a NMI. */
 
 /*
  * The spurious interrupt can share the priority class with the IPIs since
  * it is not a normal interrupt. (Does not use the APIC's interrupt fifo)
  */
 #define	APIC_SPURIOUS_INT 255
 
 #ifndef LOCORE
 
 #define	APIC_IPI_DEST_SELF	-1
 #define	APIC_IPI_DEST_ALL	-2
 #define	APIC_IPI_DEST_OTHERS	-3
 
 #define	APIC_BUS_UNKNOWN	-1
 #define	APIC_BUS_ISA		0
 #define	APIC_BUS_EISA		1
 #define	APIC_BUS_PCI		2
 #define	APIC_BUS_MAX		APIC_BUS_PCI
 
 #define	IRQ_EXTINT		-1
 #define	IRQ_NMI			-2
 #define	IRQ_SMI			-3
 #define	IRQ_DISABLED		-4
 
 /*
  * An APIC enumerator is a pseudo bus driver that enumerates APIC's including
  * CPU's and I/O APIC's.
  *
  * Enumerators are handed to apic_register_enumerator() and are chained
  * together through the apic_next list linkage.
  * NOTE(review): the names suggest the callbacks probe for the enumerator,
  * enumerate CPUs, and set up local and I/O APICs respectively; their exact
  * return-value contract is not visible in this header -- confirm against
  * the implementation before relying on it.
  */
 struct apic_enumerator {
 	const char *apic_name;
 	int (*apic_probe)(void);
 	int (*apic_probe_cpus)(void);
 	int (*apic_setup_local)(void);
 	int (*apic_setup_io)(void);
 	SLIST_ENTRY(apic_enumerator) apic_next;	/* singly-linked list linkage */
 };
 
 inthand_t
 	IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
 	IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
 	IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
 	IDTVEC(spuriousint), IDTVEC(timerint),
 	IDTVEC(apic_isr1_pti), IDTVEC(apic_isr2_pti), IDTVEC(apic_isr3_pti),
 	IDTVEC(apic_isr4_pti), IDTVEC(apic_isr5_pti), IDTVEC(apic_isr6_pti),
 	IDTVEC(apic_isr7_pti), IDTVEC(cmcint_pti), IDTVEC(errorint_pti),
 	IDTVEC(spuriousint_pti), IDTVEC(timerint_pti);
 
 extern vm_paddr_t lapic_paddr;
 extern int *apic_cpuids;
 
 void	apic_register_enumerator(struct apic_enumerator *enumerator);
 void	*ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase);
 int	ioapic_disable_pin(void *cookie, u_int pin);
 int	ioapic_get_vector(void *cookie, u_int pin);
 void	ioapic_register(void *cookie);
 int	ioapic_remap_vector(void *cookie, u_int pin, int vector);
 int	ioapic_set_bus(void *cookie, u_int pin, int bus_type);
 int	ioapic_set_extint(void *cookie, u_int pin);
 int	ioapic_set_nmi(void *cookie, u_int pin);
 int	ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol);
 int	ioapic_set_triggermode(void *cookie, u_int pin,
 	    enum intr_trigger trigger);
 int	ioapic_set_smi(void *cookie, u_int pin);
 
 /*
  * Struct containing pointers to APIC functions whose
  * implementation is run time selectable.
  *
  * Each member has a like-named static inline wrapper defined after this
  * structure (for example apic_ops.eoi is reached via lapic_eoi() and
  * apic_ops.alloc_vector via apic_alloc_vector()); the wrappers forward
  * their arguments unchanged to the global 'apic_ops' instance.
  */
 struct apic_ops {
 	void	(*create)(u_int, int);
 	void	(*init)(vm_paddr_t);
 	void	(*xapic_mode)(void);
 	bool	(*is_x2apic)(void);
 	void	(*setup)(int);
 	void	(*dump)(const char *);
 	void	(*disable)(void);
 	void	(*eoi)(void);
 	int	(*id)(void);
 	int	(*intr_pending)(u_int);
 	void	(*set_logical_id)(u_int, u_int, u_int);
 	u_int	(*cpuid)(u_int);
 
 	/* Vectors */
 	u_int	(*alloc_vector)(u_int, u_int);
 	u_int	(*alloc_vectors)(u_int, u_int *, u_int, u_int);
 	void	(*enable_vector)(u_int, u_int);
 	void	(*disable_vector)(u_int, u_int);
 	void	(*free_vector)(u_int, u_int, u_int);
 
 
 	/* PMC */
 	int	(*enable_pmc)(void);
 	void	(*disable_pmc)(void);
 	void	(*reenable_pmc)(void);
 
 	/* CMC */
 	void	(*enable_cmc)(void);
 
 	/* AMD ELVT */
 	int	(*enable_mca_elvt)(void);
 
 	/* IPI */
 	void	(*ipi_raw)(register_t, u_int);
 	void	(*ipi_vectored)(u_int, int);
 	int	(*ipi_wait)(int);
 	int	(*ipi_alloc)(inthand_t *ipifunc);
 	void	(*ipi_free)(int vector);
 
 	/* LVT */
 	int	(*set_lvt_mask)(u_int, u_int, u_char);
 	int	(*set_lvt_mode)(u_int, u_int, u_int32_t);
 	int	(*set_lvt_polarity)(u_int, u_int, enum intr_polarity);
 	int	(*set_lvt_triggermode)(u_int, u_int, enum intr_trigger);
 };
 
 extern struct apic_ops apic_ops;
 
 /*
  * Forward 'apic_id' and 'boot_cpu' to the run-time selected
  * apic_ops.create implementation.
  */
 static inline void
 lapic_create(u_int apic_id, int boot_cpu)
 {
 
 	apic_ops.create(apic_id, boot_cpu);
 }
 
 /* Forward 'addr' to the run-time selected apic_ops.init implementation. */
 static inline void
 lapic_init(vm_paddr_t addr)
 {
 
 	apic_ops.init(addr);
 }
 
 /* Call the run-time selected apic_ops.xapic_mode implementation. */
 static inline void
 lapic_xapic_mode(void)
 {
 
 	apic_ops.xapic_mode();
 }
 
 /*
  * Return whatever the run-time selected apic_ops.is_x2apic implementation
  * reports.
  */
 static inline bool
 lapic_is_x2apic(void)
 {
 
 	return (apic_ops.is_x2apic());
 }
 
 /* Forward 'boot' to the run-time selected apic_ops.setup implementation. */
 static inline void
 lapic_setup(int boot)
 {
 
 	apic_ops.setup(boot);
 }
 
 /* Forward 'str' to the run-time selected apic_ops.dump implementation. */
 static inline void
 lapic_dump(const char *str)
 {
 
 	apic_ops.dump(str);
 }
 
 /* Call the run-time selected apic_ops.disable implementation. */
 static inline void
 lapic_disable(void)
 {
 
 	apic_ops.disable();
 }
 
 /* Call the run-time selected apic_ops.eoi implementation. */
 static inline void
 lapic_eoi(void)
 {
 
 	apic_ops.eoi();
 }
 
 /* Return the value produced by the run-time selected apic_ops.id hook. */
 static inline int
 lapic_id(void)
 {
 
 	return (apic_ops.id());
 }
 
 /*
  * Forward 'vector' to the run-time selected apic_ops.intr_pending
  * implementation and return its result.
  */
 static inline int
 lapic_intr_pending(u_int vector)
 {
 
 	return (apic_ops.intr_pending(vector));
 }
 
 /* XXX: UNUSED */
 /*
  * Forward the three identifiers to the run-time selected
  * apic_ops.set_logical_id implementation.
  */
 static inline void
 lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
 {
 
 	apic_ops.set_logical_id(apic_id, cluster, cluster_id);
 }
 
 /*
  * Forward 'apic_id' to the run-time selected apic_ops.cpuid implementation
  * and return its result.
  */
 static inline u_int
 apic_cpuid(u_int apic_id)
 {
 
 	return (apic_ops.cpuid(apic_id));
 }
 
 /*
  * Forward 'apic_id' and 'irq' to the run-time selected
  * apic_ops.alloc_vector implementation and return its result.
  */
 static inline u_int
 apic_alloc_vector(u_int apic_id, u_int irq)
 {
 
 	return (apic_ops.alloc_vector(apic_id, irq));
 }
 
 /*
  * Forward the IRQ array, its 'count' and the requested 'align' to the
  * run-time selected apic_ops.alloc_vectors implementation and return its
  * result.
  */
 static inline u_int
 apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
 {
 
 	return (apic_ops.alloc_vectors(apic_id, irqs, count, align));
 }
 
 /*
  * Forward 'apic_id' and 'vector' to the run-time selected
  * apic_ops.enable_vector implementation.
  */
 static inline void
 apic_enable_vector(u_int apic_id, u_int vector)
 {
 
 	apic_ops.enable_vector(apic_id, vector);
 }
 
 /*
  * Forward 'apic_id' and 'vector' to the run-time selected
  * apic_ops.disable_vector implementation.
  */
 static inline void
 apic_disable_vector(u_int apic_id, u_int vector)
 {
 
 	apic_ops.disable_vector(apic_id, vector);
 }
 
 /*
  * Forward 'apic_id', 'vector' and 'irq' to the run-time selected
  * apic_ops.free_vector implementation.
  */
 static inline void
 apic_free_vector(u_int apic_id, u_int vector, u_int irq)
 {
 
 	apic_ops.free_vector(apic_id, vector, irq);
 }
 
 /*
  * Call the run-time selected apic_ops.enable_pmc implementation and return
  * its result.
  */
 static inline int
 lapic_enable_pmc(void)
 {
 
 	return (apic_ops.enable_pmc());
 }
 
 /* Call the run-time selected apic_ops.disable_pmc implementation. */
 static inline void
 lapic_disable_pmc(void)
 {
 
 	apic_ops.disable_pmc();
 }
 
 /* Call the run-time selected apic_ops.reenable_pmc implementation. */
 static inline void
 lapic_reenable_pmc(void)
 {
 
 	apic_ops.reenable_pmc();
 }
 
 /* Call the run-time selected apic_ops.enable_cmc implementation. */
 static inline void
 lapic_enable_cmc(void)
 {
 
 	apic_ops.enable_cmc();
 }
 
 /*
  * Call the run-time selected apic_ops.enable_mca_elvt implementation and
  * return its result.
  */
 static inline int
 lapic_enable_mca_elvt(void)
 {
 
 	return (apic_ops.enable_mca_elvt());
 }
 
 /*
  * Forward 'icrlo' and 'dest' to the run-time selected apic_ops.ipi_raw
  * implementation.
  */
 static inline void
 lapic_ipi_raw(register_t icrlo, u_int dest)
 {
 
 	apic_ops.ipi_raw(icrlo, dest);
 }
 
 /*
  * Forward 'vector' and 'dest' to the run-time selected
  * apic_ops.ipi_vectored implementation.
  */
 static inline void
 lapic_ipi_vectored(u_int vector, int dest)
 {
 
 	apic_ops.ipi_vectored(vector, dest);
 }
 
 /*
  * Forward 'delay' to the run-time selected apic_ops.ipi_wait
  * implementation and return its result.
  */
 static inline int
 lapic_ipi_wait(int delay)
 {
 
 	return (apic_ops.ipi_wait(delay));
 }
 
 /*
  * Forward the handler 'ipifunc' to the run-time selected
  * apic_ops.ipi_alloc implementation and return its result.
  */
 static inline int
 lapic_ipi_alloc(inthand_t *ipifunc)
 {
 
 	return (apic_ops.ipi_alloc(ipifunc));
 }
 
 /*
  * Forward 'vector' to the run-time selected apic_ops.ipi_free
  * implementation (presumably releasing a vector previously obtained from
  * lapic_ipi_alloc() -- confirm against the implementation).
  *
  * The hook returns void, so it is invoked as a plain statement: ISO C does
  * not allow "return expr;" in a function whose return type is void.
  */
 static inline void
 lapic_ipi_free(int vector)
 {
 
 	apic_ops.ipi_free(vector);
 }
 
 /*
  * Forward 'apic_id', 'lvt' and 'masked' to the run-time selected
  * apic_ops.set_lvt_mask implementation and return its result.
  */
 static inline int
 lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked)
 {
 
 	return (apic_ops.set_lvt_mask(apic_id, lvt, masked));
 }
 
 /*
  * Forward 'apic_id', 'lvt' and 'mode' to the run-time selected
  * apic_ops.set_lvt_mode implementation and return its result.
  */
 static inline int
 lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode)
 {
 
 	return (apic_ops.set_lvt_mode(apic_id, lvt, mode));
 }
 
 /*
  * Forward 'apic_id', 'lvt' and 'pol' to the run-time selected
  * apic_ops.set_lvt_polarity implementation and return its result.
  */
 static inline int
 lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
 {
 
 	return (apic_ops.set_lvt_polarity(apic_id, lvt, pol));
 }
 
 /*
  * Forward 'apic_id', 'lvt' and 'trigger' to the run-time selected
  * apic_ops.set_lvt_triggermode implementation and return its result.
  */
 static inline int
 lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger)
 {
 
 	return (apic_ops.set_lvt_triggermode(apic_id, lvt, trigger));
 }
 
 void	lapic_handle_cmc(void);
 void	lapic_handle_error(void);
 void	lapic_handle_intr(int vector, struct trapframe *frame);
 void	lapic_handle_timer(struct trapframe *frame);
 
 int	ioapic_get_rid(u_int apic_id, uint16_t *ridp);
 
 extern int x2apic_mode;
 extern int lapic_eoi_suppression;
 
 #ifdef _SYS_SYSCTL_H_
 SYSCTL_DECL(_hw_apic);
 #endif
 
 #endif /* !LOCORE */
 #endif /* _X86_APICVAR_H_ */
diff --git a/sys/x86/include/x86_smp.h b/sys/x86/include/x86_smp.h
index 1a0ef8fbcf78..d5535a602bcb 100644
--- a/sys/x86/include/x86_smp.h
+++ b/sys/x86/include/x86_smp.h
@@ -1,120 +1,112 @@
 /*-
  * SPDX-License-Identifier: Beerware
  *
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.org> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
  * $FreeBSD$
  *
  */
 
 #ifndef _X86_X86_SMP_H_
 #define	_X86_X86_SMP_H_
 
 #include <sys/bus.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #include <machine/pcb.h>
 
 struct pmap;
 
 /* global data in mp_x86.c */
 extern int mp_naps;
 extern int boot_cpu_id;
 extern struct pcb stoppcbs[];
 extern int cpu_apic_ids[];
 extern int bootAP;
 extern void *dpcpu;
 extern char *bootSTK;
 extern void *bootstacks[];
 extern unsigned int boot_address;
 extern unsigned int bootMP_size;
 extern volatile int aps_ready;
 extern struct mtx ap_boot_mtx;
 extern int cpu_logical;
 extern int cpu_cores;
 extern volatile uint32_t smp_tlb_generation;
 extern struct pmap *smp_tlb_pmap;
 extern vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 extern u_int xhits_gbl[];
 extern u_int xhits_pg[];
 extern u_int xhits_rng[];
 extern u_int ipi_global;
 extern u_int ipi_page;
 extern u_int ipi_range;
 extern u_int ipi_range_size;
 
 extern int nmi_kdb_lock;
 extern int nmi_is_broadcast;
 
 /*
  * Per-CPU boolean flags.
  *
  * The flags are unsigned one-bit fields so that a flag set to 1 reads back
  * as 1.  With a plain "int" one-bit field the only representable values on
  * compilers that treat it as signed are 0 and -1, which makes comparisons
  * such as "flag == 1" silently false.
  */
 struct cpu_info {
 	unsigned int	cpu_present:1;
 	unsigned int	cpu_bsp:1;
 	unsigned int	cpu_disabled:1;
 	unsigned int	cpu_hyperthread:1;
 };
 extern struct cpu_info *cpu_info;
 
 /*
  * Set if MWAIT does not reliably wake when the MONITORed address is written.
  */
 extern bool mwait_cpustop_broken;
 
 #ifdef COUNT_IPIS
 extern u_long *ipi_invltlb_counts[MAXCPU];
 extern u_long *ipi_invlrng_counts[MAXCPU];
 extern u_long *ipi_invlpg_counts[MAXCPU];
 extern u_long *ipi_invlcache_counts[MAXCPU];
 extern u_long *ipi_rendezvous_counts[MAXCPU];
 #endif
 
 /* IPI handlers */
 inthand_t
-	IDTVEC(invltlb),	/* TLB shootdowns - global */
-	IDTVEC(invlpg),		/* TLB shootdowns - 1 page */
-	IDTVEC(invlrng),	/* TLB shootdowns - page range */
-	IDTVEC(invlcache),	/* Write back and invalidate cache */
 	IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ 
 	IDTVEC(cpustop),	/* CPU stops & waits to be restarted */
 	IDTVEC(cpususpend),	/* CPU suspends & waits to be resumed */
 	IDTVEC(rendezvous);	/* handle CPU rendezvous */
 
 typedef void (*smp_invl_cb_t)(struct pmap *, vm_offset_t addr1,
     vm_offset_t addr2);
 
 /* functions in x86_mp.c */
 void	assign_cpu_ids(void);
 void	cpu_add(u_int apic_id, char boot_cpu);
 void	cpustop_handler(void);
 void	cpususpend_handler(void);
 void	alloc_ap_trampoline(vm_paddr_t *physmap, unsigned int *physmap_idx);
 void	init_secondary_tail(void);
-void	invltlb_handler(void);
-void	invlpg_handler(void);
-void	invlrng_handler(void);
-void	invlcache_handler(void);
 void	init_secondary(void);
 void	ipi_startup(int apic_id, int vector);
 void	ipi_all_but_self(u_int ipi);
 void 	ipi_bitmap_handler(struct trapframe frame);
 void	ipi_cpu(int cpu, u_int ipi);
 int	ipi_nmi_handler(void);
 void	ipi_selected(cpuset_t cpus, u_int ipi);
 void	set_interrupt_apic_ids(void);
 void	smp_cache_flush(smp_invl_cb_t curcpu_cb);
 void	smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, struct pmap *pmap,
 	    smp_invl_cb_t curcpu_cb);
 void	smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva,
 	    vm_offset_t endva, struct pmap *pmap, smp_invl_cb_t curcpu_cb);
 void	smp_masked_invltlb(cpuset_t mask, struct pmap *pmap,
 	    smp_invl_cb_t curcpu_cb);
 void	mem_range_AP_init(void);
 void	topo_probe(void);
 void	ipi_send_cpu(int cpu, u_int ipi);
 
 #endif
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index 85f27d639b69..bc1d211a27fd 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1,1877 +1,1642 @@
 /*-
  * Copyright (c) 1996, by Steve Passe
  * Copyright (c) 2003, by Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. The name of the developer may NOT be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifdef __i386__
 #include "opt_apic.h"
 #endif
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 #include "opt_pmap.h"
 #include "opt_sched.h"
 #include "opt_smp.h"
 #include "opt_stack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cons.h>	/* cngetc() */
 #include <sys/cpuset.h>
 #include <sys/csan.h>
 #ifdef GPROF 
 #include <sys/gmon.h>
 #endif
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 
 #include <x86/apicreg.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/psl.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/stack.h>
 #include <x86/ucode.h>
 
 static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items");
 
 /* lock region used by kernel profiling */
 int	mcount_lock;
 
 int	mp_naps;		/* # of Applications processors */
 int	boot_cpu_id = -1;	/* designated BSP */
 
 /* AP uses this during bootstrap.  Do not staticize.  */
 char *bootSTK;
 int bootAP;
 
 /* Free these after use */
 void *bootstacks[MAXCPU];
 void *dpcpu;
 
 struct pcb stoppcbs[MAXCPU];
 struct susppcb **susppcbs;
 
 #ifdef COUNT_IPIS
 /* Interrupt counts. */
 static u_long *ipi_preempt_counts[MAXCPU];
 static u_long *ipi_ast_counts[MAXCPU];
 u_long *ipi_invltlb_counts[MAXCPU];
 u_long *ipi_invlrng_counts[MAXCPU];
 u_long *ipi_invlpg_counts[MAXCPU];
 u_long *ipi_invlcache_counts[MAXCPU];
 u_long *ipi_rendezvous_counts[MAXCPU];
 static u_long *ipi_hardclock_counts[MAXCPU];
 #endif
 
 /* Default cpu_ops implementation. */
 struct cpu_ops cpu_ops;
 
 /*
  * Local data and functions.
  */
 
 static volatile cpuset_t ipi_stop_nmi_pending;
 
 volatile cpuset_t resuming_cpus;
 volatile cpuset_t toresume_cpus;
 
 /* used to hold the AP's until we are ready to release them */
 struct mtx ap_boot_mtx;
 
 /* Set to 1 once we're ready to let the APs out of the pen. */
 volatile int aps_ready = 0;
 
 /*
  * Store data from cpu_add() until later in the boot when we actually setup
  * the APs.
  */
 struct cpu_info *cpu_info;
 int *apic_cpuids;
 int cpu_apic_ids[MAXCPU];
 /*
  * Compile-time checks that neither the CPU count limit nor the xAPIC ID
  * limit exceeds MAX_APIC_ID.
  */
 _Static_assert(MAXCPU <= MAX_APIC_ID,
     "MAXCPU cannot be larger than MAX_APIC_ID");
 _Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID,
     "xAPIC_MAX_APIC_ID cannot be larger than MAX_APIC_ID");
 
 static void	release_aps(void *dummy);
 static void	cpustop_handler_post(u_int cpu);
 
 static int	hyperthreading_allowed = 1;
 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
 	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
 
 static int	hyperthreading_intr_allowed = 0;
 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN,
 	&hyperthreading_intr_allowed, 0,
 	"Allow interrupts on HTT logical CPUs");
 
 static struct topo_node topo_root;
 
 static int pkg_id_shift;
 static int node_id_shift;
 static int core_id_shift;
 static int disabled_cpus;
 
 /*
  * Description of one data/unified cache level.  caches[level - 1] holds the
  * entry for cache level 'level'; entries are filled in by
  * add_deterministic_cache() and the vendor-specific probe routines.
  */
 struct cache_info {
 	int	id_shift;	/* APIC ID is shifted right by this to get the cache's ID */
 	int	present;	/* non-zero once this level has been recorded */
 };
 /*
  * Declared separately from the type so that the storage class comes first;
  * placing "static" after the struct specifier is an obsolescent form.
  */
 static struct cache_info caches[MAX_CACHE_LEVELS];
 
 unsigned int boot_address;
 
 static bool stop_mwait = false;
 SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0,
     "Use MONITOR/MWAIT when stopping CPU, if available");
 
 #define MiB(v)	(v ## ULL << 20)
 
 /*
  * Run the memory-range driver's initAP hook on mem_range_softc.  Nothing
  * happens when no operations table is attached or when the table does not
  * supply an initAP hook.
  */
 void
 mem_range_AP_init(void)
 {
 
 	if (!mem_range_softc.mr_op || !mem_range_softc.mr_op->initAP)
 		return;
 
 	mem_range_softc.mr_op->initAP(&mem_range_softc);
 }
 
 /*
  * Compute log2 of the argument after rounding it up to a power of two.
  * A value that already is a power of two is used unchanged; any other
  * value is doubled first so that its highest set bit moves one position up.
  * Returns -1 if argument is zero.
  */
 static __inline int
 mask_width(u_int x)
 {
 	u_int rounded;
 
 	rounded = powerof2(x) ? x : x << 1;
 	return (fls(rounded) - 1);
 }
 
 /*
  * Add a cache level to the cache topology description.
  *
  * 'type', 'level' and 'share_count' are the fields the callers decode from
  * one sub-leaf of a deterministic-cache CPUID leaf.  The entry is stored in
  * caches[level - 1] with its ID shift derived from 'share_count' and then
  * clamped to the range [core_id_shift, pkg_id_shift].
  *
  * Returns 0 only when 'type' is 0, which the callers use to stop iterating
  * over sub-leaves.  Returns 1 in every other case, including entries that
  * are ignored (instruction caches) or rejected (unexpected type or level),
  * so that the caller continues with the next sub-leaf.
  */
 static int
 add_deterministic_cache(int type, int level, int share_count)
 {
 
 	if (type == 0)
 		return (0);
 	if (type > 3) {
 		printf("unexpected cache type %d\n", type);
 		return (1);
 	}
 	if (type == 2) /* ignore instruction cache */
 		return (1);
 	if (level == 0 || level > MAX_CACHE_LEVELS) {
 		/* Report the offending level, not the (valid) cache type. */
 		printf("unexpected cache level %d\n", level);
 		return (1);
 	}
 
 	/* A later entry for the same level overrides the earlier one. */
 	if (caches[level - 1].present) {
 		printf("WARNING: multiple entries for L%u data cache\n", level);
 		printf("%u => %u\n", caches[level - 1].id_shift,
 		    mask_width(share_count));
 	}
 	caches[level - 1].id_shift = mask_width(share_count);
 	caches[level - 1].present = 1;
 
 	/* Clamp the cache's span so that it nests between core and package. */
 	if (caches[level - 1].id_shift > pkg_id_shift) {
 		printf("WARNING: L%u data cache covers more "
 		    "APIC IDs than a package (%u > %u)\n", level,
 		    caches[level - 1].id_shift, pkg_id_shift);
 		caches[level - 1].id_shift = pkg_id_shift;
 	}
 	if (caches[level - 1].id_shift < core_id_shift) {
 		printf("WARNING: L%u data cache covers fewer "
 		    "APIC IDs than a core (%u < %u)\n", level,
 		    caches[level - 1].id_shift, core_id_shift);
 		caches[level - 1].id_shift = core_id_shift;
 	}
 
 	return (1);
 }
 
 /*
  * Determine topology of processing units and caches for AMD CPUs.
  * See:
  *  - AMD CPUID Specification (Publication # 25481)
  *  - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
  *  - BKDG For AMD Family 10h Processors (Publication # 31116)
  *  - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
  *  - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
  *  - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
  *
  * On return pkg_id_shift, core_id_shift, node_id_shift and the caches[]
  * table have been updated with whatever could be determined; the function
  * leaves them untouched when the CPU reports no multi-core capability.
  */
 static void
 topo_probe_amd(void)
 {
 	u_int p[4];
 	uint64_t v;
 	int level;
 	int nodes_per_socket;
 	int share_count;
 	int type;
 	int i;
 
 	/* No multi-core capability. */
 	if ((amd_feature2 & AMDID2_CMP) == 0)
 		return;
 
 	/* For families 10h and newer. */
 	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
 	    AMDID_COREID_SIZE_SHIFT;
 
 	/* For 0Fh family. */
 	if (pkg_id_shift == 0)
 		pkg_id_shift =
 		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
 
 	/*
 	 * Families prior to 16h define the following value as
 	 * cores per compute unit and we don't really care about the AMD
 	 * compute units at the moment.  Perhaps we should treat them as
 	 * cores and cores within the compute units as hardware threads,
 	 * but that's up for debate.
 	 * Later families define the value as threads per compute unit,
 	 * so we are following AMD's nomenclature here.
 	 */
 	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
 	    CPUID_TO_FAMILY(cpu_id) >= 0x16) {
 		cpuid_count(0x8000001e, 0, p);
 		share_count = ((p[1] >> 8) & 0xff) + 1;
 		core_id_shift = mask_width(share_count);
 
 		/*
 		 * For Zen (17h), gather Nodes per Processor.  Each node is a
 		 * Zeppelin die; TR and EPYC CPUs will have multiple dies per
 		 * package.  Communication latency between dies is higher than
 		 * within them.
 		 */
 		nodes_per_socket = ((p[2] >> 8) & 0x7) + 1;
 		node_id_shift = pkg_id_shift - mask_width(nodes_per_socket);
 	}
 
 	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
 		/*
 		 * Walk the sub-leaves of leaf 0x8000001d and record each
 		 * one; add_deterministic_cache() returns 0 for the
 		 * terminating entry (type 0), which ends the loop.
 		 */
 		for (i = 0; ; i++) {
 			cpuid_count(0x8000001d, i, p);
 			type = p[0] & 0x1f;
 			level = (p[0] >> 5) & 0x7;
 			share_count = 1 + ((p[0] >> 14) & 0xfff);
 
 			if (!add_deterministic_cache(type, level, share_count))
 				break;
 		}
 	} else {
 		/*
 		 * No topology extension: fall back to leaves 0x80000005 and
 		 * 0x80000006.  The first two cache levels are recorded with
 		 * an ID shift of 0 when the corresponding field is non-zero.
 		 */
 		if (cpu_exthigh >= 0x80000005) {
 			cpuid_count(0x80000005, 0, p);
 			if (((p[2] >> 24) & 0xff) != 0) {
 				caches[0].id_shift = 0;
 				caches[0].present = 1;
 			}
 		}
 		if (cpu_exthigh >= 0x80000006) {
 			cpuid_count(0x80000006, 0, p);
 			if (((p[2] >> 16) & 0xffff) != 0) {
 				caches[1].id_shift = 0;
 				caches[1].present = 1;
 			}
 			if (((p[3] >> 18) & 0x3fff) != 0) {
 				nodes_per_socket = 1;
 				if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
 					/*
 					 * Handle multi-node processors that
 					 * have multiple chips, each with its
 					 * own L3 cache, on the same die.
 					 */
 					v = rdmsr(0xc001100c);
 					nodes_per_socket = 1 + ((v >> 3) & 0x7);
 				}
 				/* The third level spans one node of the package. */
 				caches[2].id_shift =
 				    pkg_id_shift - mask_width(nodes_per_socket);
 				caches[2].present = 1;
 			}
 		}
 	}
 }
 
 /*
  * Derive the core and package APIC ID shifts for Intel CPUs from
  * CPUID Leaf 1 and, when available, Leaf 4.
  * See:
  *  - Intel 64 Architecture Processor Topology Enumeration
  *  - Intel 64 and IA-32 Architectures Software Developer's Manual,
  *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
  *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
  *
  * core_id_shift and pkg_id_shift are left untouched when the package
  * reports at most one logical processor.
  */
 static void
 topo_probe_intel_0x4(void)
 {
 	u_int regs[4];
 	int ncores;
 	int nlogical;
 
 	/* Both zero and one here mean one logical processor per package. */
 	nlogical = 1;
 	if ((cpu_feature & CPUID_HTT) != 0)
 		nlogical = (cpu_procinfo & CPUID_HTT_CORES) >> 16;
 	if (nlogical <= 1)
 		return;
 
 	/* Without Leaf 4 assume a single core per package. */
 	ncores = 1;
 	if (cpu_high >= 0x4) {
 		cpuid_count(0x04, 0, regs);
 		ncores = ((regs[0] >> 26) & 0x3f) + 1;
 	}
 
 	core_id_shift = mask_width(nlogical/ncores);
 	KASSERT(core_id_shift >= 0,
 	    ("intel topo: max_cores > max_logical\n"));
 	pkg_id_shift = core_id_shift + mask_width(ncores);
 }
 
 /*
  * Derive the core and package APIC ID shifts for Intel CPUs from
  * CPUID Leaf 11, deferring to the Leaf 1/Leaf 4 method when Leaf 11
  * turns out not to be implemented.
  * See:
  *  - Intel 64 Architecture Processor Topology Enumeration
  *  - Intel 64 and IA-32 Architectures Software Developer's Manual,
  *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
  *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
  */
 static void
 topo_probe_intel_0xb(void)
 {
 	u_int regs[4];
 	int level_type;
 	int shift;
 	int subleaf;
 
 	/* Fall back if CPU leaf 11 doesn't really exist. */
 	cpuid_count(0x0b, 0, regs);
 	if (regs[1] == 0) {
 		topo_probe_intel_0x4();
 		return;
 	}
 
 	/* We only support three levels for now. */
 	subleaf = 0;
 	for (;;) {
 		cpuid_count(0x0b, subleaf, regs);
 
 		/* A level type of zero terminates the enumeration. */
 		level_type = (regs[2] >> 8) & 0xff;
 		if (level_type == 0)
 			break;
 		shift = regs[0] & 0x1f;
 
 		/* TODO: check for duplicate (re-)assignment */
 		if (level_type == CPUID_TYPE_SMT)
 			core_id_shift = shift;
 		else if (level_type == CPUID_TYPE_CORE)
 			pkg_id_shift = shift;
 		else
 			printf("unknown CPU level type %d\n", level_type);
 
 		subleaf++;
 	}
 
 	/* Keep the invariant that a core never spans more than a package. */
 	if (pkg_id_shift < core_id_shift) {
 		printf("WARNING: core covers more APIC IDs than a package\n");
 		core_id_shift = pkg_id_shift;
 	}
 }
 
 /*
  * Fill in the cache topology table for Intel CPUs.
  * See:
  *  - Intel 64 Architecture Processor Topology Enumeration
  *  - Intel 64 and IA-32 Architectures Software Developer's Manual
  *    Volume 2A: Instruction Set Reference, A-M,
  *    CPUID instruction
  */
 static void
 topo_probe_intel_caches(void)
 {
 	u_int regs[4];
 	int cache_level;
 	int cache_type;
 	int nsharing;
 	int subleaf;
 
 	if (cpu_high < 0x4) {
 		/*
 		 * Available cache level and sizes can be determined
 		 * via CPUID leaf 2, but that requires a huge table of hardcoded
 		 * values, so for now just assume L1 and L2 caches potentially
 		 * shared only by HTT processing units, if HTT is present.
 		 */
 		caches[0].present = 1;
 		caches[0].id_shift = pkg_id_shift;
 		caches[1].present = 1;
 		caches[1].id_shift = pkg_id_shift;
 		return;
 	}
 
 	/*
 	 * Decode successive sub-leaves of leaf 4 until
 	 * add_deterministic_cache() reports the terminating entry.
 	 */
 	subleaf = 0;
 	do {
 		cpuid_count(0x4, subleaf, regs);
 		subleaf++;
 
 		cache_type = regs[0] & 0x1f;
 		cache_level = (regs[0] >> 5) & 0x7;
 		nsharing = 1 + ((regs[0] >> 14) & 0xfff);
 	} while (add_deterministic_cache(cache_type, cache_level, nsharing));
 }
 
 /*
  * Probe both the processing-unit and the cache topology on Intel CPUs.
  * See:
  *  - Intel 64 Architecture Processor Topology Enumeration
  */
 static void
 topo_probe_intel(void)
 {
 
 	/*
 	 * Prefer Leaf 11 when the CPU advertises it, otherwise use the
 	 * Leaf 1/Leaf 4 method.
 	 *
 	 * Note that 0x1 <= cpu_high < 4 case should be
 	 * compatible with topo_probe_intel_0x4() logic when
 	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
 	 * or it should trigger the fallback otherwise.
 	 */
 	if (cpu_high >= 0xb) {
 		topo_probe_intel_0xb();
 	} else if (cpu_high >= 0x1) {
 		topo_probe_intel_0x4();
 	}
 
 	/* The cache probe runs regardless of which method was used above. */
 	topo_probe_intel_caches();
 }
 
 /*
  * Topology information is queried only on BSP, on which this
  * code runs and for which it can query CPUID information.
  * Then topology is extrapolated on all packages using an
  * assumption that APIC ID to hardware component ID mapping is
  * homogeneous.
  * That doesn't necessarily imply that the topology is uniform.
  */
 void
 topo_probe(void)
 {
 	static int cpu_topo_probed = 0;
 	struct x86_topo_layer {
 		int type;
 		int subtype;
 		int id_shift;
 	} topo_layers[MAX_CACHE_LEVELS + 4];
 	struct topo_node *parent;
 	struct topo_node *node;
 	int layer;
 	int nlayers;
 	int node_id;
 	int i;
 
 	if (cpu_topo_probed)
 		return;
 
 	CPU_ZERO(&logical_cpus_mask);
 
 	if (mp_ncpus <= 1)
 		; /* nothing */
 	else if (cpu_vendor_id == CPU_VENDOR_AMD ||
 	    cpu_vendor_id == CPU_VENDOR_HYGON)
 		topo_probe_amd();
 	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
 		topo_probe_intel();
 
 	KASSERT(pkg_id_shift >= core_id_shift,
 	    ("bug in APIC topology discovery"));
 
 	nlayers = 0;
 	bzero(topo_layers, sizeof(topo_layers));
 
 	topo_layers[nlayers].type = TOPO_TYPE_PKG;
 	topo_layers[nlayers].id_shift = pkg_id_shift;
 	if (bootverbose)
 		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
 	nlayers++;
 
 	if (pkg_id_shift > node_id_shift && node_id_shift != 0) {
 		topo_layers[nlayers].type = TOPO_TYPE_GROUP;
 		topo_layers[nlayers].id_shift = node_id_shift;
 		if (bootverbose)
 			printf("Node ID shift: %u\n",
 			    topo_layers[nlayers].id_shift);
 		nlayers++;
 	}
 
 	/*
 	 * Consider all caches to be within a package/chip
 	 * and "in front" of all sub-components like
 	 * cores and hardware threads.
 	 */
 	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
 		if (caches[i].present) {
 			if (node_id_shift != 0)
 				KASSERT(caches[i].id_shift <= node_id_shift,
 					("bug in APIC topology discovery"));
 			KASSERT(caches[i].id_shift <= pkg_id_shift,
 				("bug in APIC topology discovery"));
 			KASSERT(caches[i].id_shift >= core_id_shift,
 				("bug in APIC topology discovery"));
 
 			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
 			topo_layers[nlayers].subtype = i + 1;
 			topo_layers[nlayers].id_shift = caches[i].id_shift;
 			if (bootverbose)
 				printf("L%u cache ID shift: %u\n",
 				    topo_layers[nlayers].subtype,
 				    topo_layers[nlayers].id_shift);
 			nlayers++;
 		}
 	}
 
 	if (pkg_id_shift > core_id_shift) {
 		topo_layers[nlayers].type = TOPO_TYPE_CORE;
 		topo_layers[nlayers].id_shift = core_id_shift;
 		if (bootverbose)
 			printf("Core ID shift: %u\n",
 			    topo_layers[nlayers].id_shift);
 		nlayers++;
 	}
 
 	topo_layers[nlayers].type = TOPO_TYPE_PU;
 	topo_layers[nlayers].id_shift = 0;
 	nlayers++;
 
 	topo_init_root(&topo_root);
 	for (i = 0; i <= max_apic_id; ++i) {
 		if (!cpu_info[i].cpu_present)
 			continue;
 
 		parent = &topo_root;
 		for (layer = 0; layer < nlayers; ++layer) {
 			node_id = i >> topo_layers[layer].id_shift;
 			parent = topo_add_node_by_hwid(parent, node_id,
 			    topo_layers[layer].type,
 			    topo_layers[layer].subtype);
 		}
 	}
 
 	parent = &topo_root;
 	for (layer = 0; layer < nlayers; ++layer) {
 		node_id = boot_cpu_id >> topo_layers[layer].id_shift;
 		node = topo_find_node_by_hwid(parent, node_id,
 		    topo_layers[layer].type,
 		    topo_layers[layer].subtype);
 		topo_promote_child(node);
 		parent = node;
 	}
 
 	cpu_topo_probed = 1;
 }
 
 /*
  * Assign logical CPU IDs to local APICs.
  *
  * Walks every PU node of the topology tree, decides whether the CPU is
  * disabled, and gives each remaining CPU the next logical ID.  On return
  * mp_ncpus, mp_ncores and smp_threads_per_core reflect the enabled CPUs.
  */
 void
 assign_cpu_ids(void)
 {
 	struct topo_node *node;
 	u_int smt_mask;
 	int nhyper;
 
 	/* Mask of the APIC ID bits below the core ID. */
 	smt_mask = (1u << core_id_shift) - 1;
 
 	/*
 	 * Assign CPU IDs to local APIC IDs and disable any CPUs
 	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
 	 */
 	mp_ncpus = 0;
 	nhyper = 0;
 	TOPO_FOREACH(node, &topo_root) {
 		if (node->type != TOPO_TYPE_PU)
 			continue;
 
 		/*
 		 * A PU whose sub-core APIC ID bits differ from the BSP's
 		 * is treated as a hyperthread.
 		 */
 		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
 			cpu_info[node->hwid].cpu_hyperthread = 1;
 
 		/* Honor a "lapic" resource disable request, except for the BSP. */
 		if (resource_disabled("lapic", node->hwid)) {
 			if (node->hwid != boot_cpu_id)
 				cpu_info[node->hwid].cpu_disabled = 1;
 			else
 				printf("Cannot disable BSP, APIC ID = %d\n",
 				    node->hwid);
 		}
 
 		if (!hyperthreading_allowed &&
 		    cpu_info[node->hwid].cpu_hyperthread)
 			cpu_info[node->hwid].cpu_disabled = 1;
 
 		/* No logical IDs are left once MAXCPU CPUs have been assigned. */
 		if (mp_ncpus >= MAXCPU)
 			cpu_info[node->hwid].cpu_disabled = 1;
 
 		if (cpu_info[node->hwid].cpu_disabled) {
 			disabled_cpus++;
 			continue;
 		}
 
 		if (cpu_info[node->hwid].cpu_hyperthread)
 			nhyper++;
 
 		/* Record the mapping in both directions and in the tree node. */
 		cpu_apic_ids[mp_ncpus] = node->hwid;
 		apic_cpuids[node->hwid] = mp_ncpus;
 		topo_set_pu_id(node, mp_ncpus);
 		mp_ncpus++;
 	}
 
 	KASSERT(mp_maxid >= mp_ncpus - 1,
 	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
 	    mp_ncpus));
 
 	/* Cores are the enabled CPUs that were not flagged as hyperthreads. */
 	mp_ncores = mp_ncpus - nhyper;
 	smp_threads_per_core = mp_ncpus / mp_ncores;
 }
 
 /*
  * Print various information about the SMP system hardware and setup.
  */
 void
 cpu_mp_announce(void)
 {
 	struct topo_node *node;
 	const char *hyperthread;
 	struct topo_analysis topology;
 
 	printf("FreeBSD/SMP: ");
 	if (topo_analyze(&topo_root, 1, &topology)) {
 		printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]);
 		if (topology.entities[TOPO_LEVEL_GROUP] > 1)
 			printf(" x %d groups",
 			    topology.entities[TOPO_LEVEL_GROUP]);
 		if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
 			printf(" x %d cache groups",
 			    topology.entities[TOPO_LEVEL_CACHEGROUP]);
 		if (topology.entities[TOPO_LEVEL_CORE] > 0)
 			printf(" x %d core(s)",
 			    topology.entities[TOPO_LEVEL_CORE]);
 		if (topology.entities[TOPO_LEVEL_THREAD] > 1)
 			printf(" x %d hardware threads",
 			    topology.entities[TOPO_LEVEL_THREAD]);
 	} else {
 		printf("Non-uniform topology");
 	}
 	printf("\n");
 
 	if (disabled_cpus) {
 		printf("FreeBSD/SMP Online: ");
 		if (topo_analyze(&topo_root, 0, &topology)) {
 			printf("%d package(s)",
 			    topology.entities[TOPO_LEVEL_PKG]);
 			if (topology.entities[TOPO_LEVEL_GROUP] > 1)
 				printf(" x %d groups",
 				    topology.entities[TOPO_LEVEL_GROUP]);
 			if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
 				printf(" x %d cache groups",
 				    topology.entities[TOPO_LEVEL_CACHEGROUP]);
 			if (topology.entities[TOPO_LEVEL_CORE] > 0)
 				printf(" x %d core(s)",
 				    topology.entities[TOPO_LEVEL_CORE]);
 			if (topology.entities[TOPO_LEVEL_THREAD] > 1)
 				printf(" x %d hardware threads",
 				    topology.entities[TOPO_LEVEL_THREAD]);
 		} else {
 			printf("Non-uniform topology");
 		}
 		printf("\n");
 	}
 
 	if (!bootverbose)
 		return;
 
 	TOPO_FOREACH(node, &topo_root) {
 		switch (node->type) {
 		case TOPO_TYPE_PKG:
 			printf("Package HW ID = %u\n", node->hwid);
 			break;
 		case TOPO_TYPE_CORE:
 			printf("\tCore HW ID = %u\n", node->hwid);
 			break;
 		case TOPO_TYPE_PU:
 			if (cpu_info[node->hwid].cpu_hyperthread)
 				hyperthread = "/HT";
 			else
 				hyperthread = "";
 
 			if (node->subtype == 0)
 				printf("\t\tCPU (AP%s): APIC ID: %u"
 				    "(disabled)\n", hyperthread, node->hwid);
 			else if (node->id == 0)
 				printf("\t\tCPU0 (BSP): APIC ID: %u\n",
 				    node->hwid);
 			else
 				printf("\t\tCPU%u (AP%s): APIC ID: %u\n",
 				    node->id, hyperthread, node->hwid);
 			break;
 		default:
 			/* ignored */
 			break;
 		}
 	}
 }
 
 /*
  * Add a scheduling group, a group of logical processors sharing
  * a particular cache (and, thus having an affinity), to the scheduling
  * topology.
  * This function recursively works on lower level caches.
  *
  * root is the topology node (system, cache or group) being described and
  * cg_root is the already-allocated cpu_group that is filled in for it.
  * Child groups are allocated here with smp_topo_alloc().
  */
 static void
 x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
 {
 	struct topo_node *node;
 	int nchildren;
 	int ncores;
 	int i;
 
 	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE ||
 	    root->type == TOPO_TYPE_GROUP,
 	    ("x86topo_add_sched_group: bad type: %u", root->type));
 	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
 	cg_root->cg_count = root->cpu_count;
 	/* The system root shares nothing; otherwise the level is the node subtype. */
 	if (root->type == TOPO_TYPE_SYSTEM)
 		cg_root->cg_level = CG_SHARE_NONE;
 	else
 		cg_root->cg_level = root->subtype;
 
 	/*
 	 * Check how many core nodes we have under the given root node.
 	 * If we have multiple logical processors, but not multiple
 	 * cores, then those processors must be hardware threads.
 	 */
 	ncores = 0;
 	node = root;
 	while (node != NULL) {
 		if (node->type != TOPO_TYPE_CORE) {
 			node = topo_next_node(root, node);
 			continue;
 		}
 
 		/* Count the core and do not descend into it. */
 		ncores++;
 		node = topo_next_nonchild_node(root, node);
 	}
 
 	if (cg_root->cg_level != CG_SHARE_NONE &&
 	    root->cpu_count > 1 && ncores < 2)
 		cg_root->cg_flags = CG_FLAG_SMT;
 
 	/*
 	 * Find out how many cache nodes we have under the given root node.
 	 * We ignore cache nodes that cover all the same processors as the
 	 * root node.  Also, we do not descend below found cache nodes.
 	 * That is, we count top-level "non-redundant" caches under the root
 	 * node.
 	 * Group nodes are counted by the same rule as cache nodes.
 	 */
 	nchildren = 0;
 	node = root;
 	while (node != NULL) {
 		if ((node->type != TOPO_TYPE_GROUP &&
 		    node->type != TOPO_TYPE_CACHE) ||
 		    (root->type != TOPO_TYPE_SYSTEM &&
 		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
 			node = topo_next_node(root, node);
 			continue;
 		}
 		nchildren++;
 		node = topo_next_nonchild_node(root, node);
 	}
 
 	cg_root->cg_child = smp_topo_alloc(nchildren);
 	cg_root->cg_children = nchildren;
 
 	/*
 	 * Now find again the same cache nodes as above and recursively
 	 * build scheduling topologies for them.
 	 * The selection test must stay identical to the counting loop so
 	 * that exactly nchildren entries of cg_child are filled in.
 	 */
 	node = root;
 	i = 0;
 	while (node != NULL) {
 		if ((node->type != TOPO_TYPE_GROUP &&
 		    node->type != TOPO_TYPE_CACHE) ||
 		    (root->type != TOPO_TYPE_SYSTEM &&
 		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
 			node = topo_next_node(root, node);
 			continue;
 		}
 		cg_root->cg_child[i].cg_parent = cg_root;
 		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
 		i++;
 		node = topo_next_nonchild_node(root, node);
 	}
 }
 
 /*
  * Produce the machine-independent scheduling topology from the hardware
  * topology tree.  With at most one CPU the result of smp_topo_none() is
  * returned; otherwise a freshly allocated root group is filled in from
  * topo_root.
  */
 struct cpu_group *
 cpu_topo(void)
 {
 	struct cpu_group *top;
 
 	if (mp_ncpus > 1) {
 		top = smp_topo_alloc(1);
 		x86topo_add_sched_group(&topo_root, top);
 	} else {
 		top = smp_topo_none();
 	}
 	return (top);
 }
 
 /*
  * SYSINIT hook: allocate the zero-filled cpu_info and apic_cpuids arrays,
  * each with one slot per APIC ID in the range 0..max_apic_id.
  */
 static void
 cpu_alloc(void *dummy __unused)
 {
 
 	/* Both arrays are indexed by APIC ID, hence max_apic_id + 1 entries. */
 	cpu_info = malloc((max_apic_id + 1) * sizeof(cpu_info[0]), M_CPUS,
 	    M_ZERO | M_WAITOK);
 	apic_cpuids = malloc((max_apic_id + 1) * sizeof(apic_cpuids[0]),
 	    M_CPUS, M_ZERO | M_WAITOK);
 }
 SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL);
 
 /*
  * Add a logical CPU to the topology.
  *
  * apic_id is the local APIC ID of the CPU; it must not exceed
  * max_apic_id, otherwise the kernel panics.  A non-zero boot_cpu marks
  * the CPU as the BSP and records its APIC ID in boot_cpu_id.
  */
 void
 cpu_add(u_int apic_id, char boot_cpu)
 {
 
 	/* apic_id is unsigned, so format it with %u; panic() does not return. */
 	if (apic_id > max_apic_id)
 		panic("SMP: APIC ID %u too high", apic_id);
 	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice",
 	    apic_id));
 	cpu_info[apic_id].cpu_present = 1;
 	if (boot_cpu) {
 		/* Only one CPU may claim to be the BSP. */
 		KASSERT(boot_cpu_id == -1,
 		    ("CPU %u claims to be BSP, but CPU %u already is", apic_id,
 		    boot_cpu_id));
 		boot_cpu_id = apic_id;
 		cpu_info[apic_id].cpu_bsp = 1;
 	}
 	if (bootverbose)
 		printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" :
 		    "AP");
 }
 
 /*
  * Make sure mp_ncpus is at least 1.
  */
 void
 cpu_mp_setmaxid(void)
 {
 
 	/*
 	 * mp_ncpus is expected to have been established before this runs.
 	 * A value of zero is taken to mean a UP system.
 	 * NOTE(review): the CPU count used to be described as being set by
 	 * cpu_add(), but cpu_add() does not touch mp_ncpus or mp_maxid --
 	 * confirm where they are initialized.
 	 */
 	if (mp_ncpus != 0)
 		return;
 	mp_ncpus = 1;
 }
 
 /*
  * Report whether more than one CPU is configured.
  * Returns 1 when mp_ncpus > 1 and 0 otherwise.
  */
 int
 cpu_mp_probe(void)
 {
 
 	/*
 	 * CPU 0 (the BSP) is unconditionally placed in all_cpus so that
 	 * the mbuf init code works correctly.
 	 */
 	CPU_SETOF(0, &all_cpus);
 
 	if (mp_ncpus > 1)
 		return (1);
 	return (0);
 }
 
 /*
  * Allocate memory for the AP trampoline.
  *
  * physmap is used as an array of (start, end) pairs and *physmap_idx is
  * the index of the last pair's start.  The pairs are scanned from the
  * last one down to index 0 for a region that starts below 1MB and has
  * room for round_page(bootMP_size) bytes of page-aligned space.  The
  * chosen space is carved out of the region and its address stored in
  * boot_address.  If the region becomes empty it is removed from physmap
  * and *physmap_idx is decremented by 2.
  *
  * When no region qualifies, boot_address falls back to the top of base
  * memory (basemem * 1024 - bootMP_size).
  */
 void
 alloc_ap_trampoline(vm_paddr_t *physmap, unsigned int *physmap_idx)
 {
 	unsigned int i;
 	bool allocated;
 
 	allocated = false;
 	/*
 	 * i is unsigned: stepping below 0 wraps to a value greater than
 	 * *physmap_idx, which is what terminates the loop.
 	 */
 	for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
 		/*
 		 * Find a memory region big enough and below the 1MB boundary
 		 * for the trampoline code.
 		 * NB: needs to be page aligned.
 		 */
 		if (physmap[i] >= MiB(1) ||
 		    (trunc_page(physmap[i + 1]) - round_page(physmap[i])) <
 		    round_page(bootMP_size))
 			continue;
 
 		allocated = true;
 		/*
 		 * Try to steal from the end of the region to mimic previous
 		 * behaviour, else fallback to steal from the start.
 		 */
 		if (physmap[i + 1] < MiB(1)) {
 			boot_address = trunc_page(physmap[i + 1]);
 			if ((physmap[i + 1] - boot_address) < bootMP_size)
 				boot_address -= round_page(bootMP_size);
 			physmap[i + 1] = boot_address;
 		} else {
 			boot_address = round_page(physmap[i]);
 			physmap[i] = boot_address + round_page(bootMP_size);
 		}
 		/* Drop the region from the map if nothing is left of it. */
 		if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
 			memmove(&physmap[i], &physmap[i + 2],
 			    sizeof(*physmap) * (*physmap_idx - i + 2));
 			*physmap_idx -= 2;
 		}
 		break;
 	}
 
 	if (!allocated) {
 		boot_address = basemem * 1024 - bootMP_size;
 		/* Terminate the message so the next console line starts clean. */
 		if (bootverbose)
 			printf(
 "Cannot find enough space for the boot trampoline, placing it at %#x\n",
 			    boot_address);
 	}
 }
 
 /*
  * AP CPUs call this to initialize themselves.
  *
  * Performs the per-CPU hardware setup, registers the AP as launched
  * under ap_boot_mtx, waits until smp_started is set, and then enters the
  * scheduler via sched_throw().  This function does not return.
  */
 void
 init_secondary_tail(void)
 {
 	u_int cpuid;
 
 	pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));
 
 	/*
 	 * On real hardware, switch to x2apic mode if possible.  Do it
 	 * after aps_ready was signalled, to avoid manipulating the
 	 * mode while BSP might still want to send some IPI to us
 	 * (second startup IPI is ignored on modern hardware etc).
 	 */
 	lapic_xapic_mode();
 
 	/* Initialize the PAT MSR. */
 	pmap_init_pat();
 
 	/* set up CPU registers and state */
 	cpu_setregs();
 
 	/* set up SSE/NX */
 	initializecpu();
 
 	/* set up FPU state on the AP */
 #ifdef __amd64__
 	fpuinit();
 #else
 	npxinit(false);
 #endif
 
 	/* Optional platform-specific CPU init hook. */
 	if (cpu_ops.cpu_init)
 		cpu_ops.cpu_init();
 
 	/*
 	 * Sanity check: the APIC ID recorded in this CPU's per-CPU data
 	 * must match the ID reported by the local APIC.
 	 */
 	cpuid = PCPU_GET(cpuid);
 	if (PCPU_GET(apic_id) != lapic_id()) {
 		printf("SMP: cpuid = %d\n", cpuid);
 		printf("SMP: actual apic_id = %d\n", lapic_id());
 		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
 		panic("cpuid mismatch! boom!!");
 	}
 
 	/* Initialize curthread. */
 	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
 	PCPU_SET(curthread, PCPU_GET(idlethread));
 
 	/* Everything up to the matching unlock runs on one AP at a time. */
 	mtx_lock_spin(&ap_boot_mtx);
 
 	mca_init();
 
 	/* Init local apic for irq's */
 	lapic_setup(1);
 
 	/* Set memory range attributes for this CPU to match the BSP */
 	mem_range_AP_init();
 
 	smp_cpus++;
 
 	/*
 	 * Announce the launch.  Without bootverbose the CPU numbers are
 	 * printed on a single line: the prefix is emitted when smp_cpus
 	 * reaches 2 and the newline when it reaches mp_ncpus.
 	 */
 	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
 	if (bootverbose)
 		printf("SMP: AP CPU #%d Launched!\n", cpuid);
 	else
 		printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "",
 		    cpuid, smp_cpus == mp_ncpus ? "\n" : " ");
 
 	/* Determine if we are a logical CPU. */
 	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
 		CPU_SET(cpuid, &logical_cpus_mask);
 
 	if (bootverbose)
 		lapic_dump("AP");
 
 	/* The last AP to arrive releases everyone spinning on smp_started. */
 	if (smp_cpus == mp_ncpus) {
 		/* enable IPI's, tlb shootdown, freezes etc */
 		atomic_store_rel_int(&smp_started, 1);
 	}
 
 #ifdef __amd64__
 	/*
 	 * Enable global pages TLB extension
 	 * This also implicitly flushes the TLB
 	 */
 	load_cr4(rcr4() | CR4_PGE);
 	if (pmap_pcid_enabled)
 		load_cr4(rcr4() | CR4_PCIDE);
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_ufssel);
 #endif
 
 	mtx_unlock_spin(&ap_boot_mtx);
 
 	/* Wait until all the AP's are up. */
 	while (atomic_load_acq_int(&smp_started) == 0)
 		ia32_pause();
 
 #ifndef EARLY_AP_STARTUP
 	/* Start per-CPU event timers. */
 	cpu_initclocks_ap();
 #endif
 
 	kcsan_cpu_init(cpuid);
 
 	/*
 	 * Assert that smp_after_idle_runnable condition is reasonable.
 	 * smp_after_idle_runnable() waits for pc_curpcb to become non-NULL
 	 * before freeing this CPU's boot stack, so it must still be NULL
 	 * here.
 	 */
 	MPASS(PCPU_GET(curpcb) == NULL);
 
 	sched_throw(NULL);
 
 	panic("scheduler returned us to %s", __func__);
 	/* NOTREACHED */
 }
 
 /*
  * SYSINIT hook: for every CPU from 1 up to mp_ncpus - 1, spin until that
  * CPU's pc_curpcb is non-NULL and then free its boot stack.
  */
 static void
 smp_after_idle_runnable(void *arg __unused)
 {
 	struct pcpu *ap_pcpu;
 	int ap;
 
 	ap = 1;
 	while (ap < mp_ncpus) {
 		ap_pcpu = pcpu_find(ap);
 
 		/* Wait for the AP to stop using its boot stack. */
 		for (;;) {
 			if (atomic_load_ptr(&ap_pcpu->pc_curpcb) != NULL)
 				break;
 			cpu_spinwait();
 		}
 
 		kmem_free((vm_offset_t)bootstacks[ap], kstack_pages *
 		    PAGE_SIZE);
 		ap++;
 	}
 }
 SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
     smp_after_idle_runnable, NULL);
 
 /*
  * Register with the interrupt code every CPU that should receive IRQs.
  * A CPU that is not passed to intr_add_cpu() here is simply never
  * offered to the I/O APIC code.  The BSP is skipped because that code
  * handles the BSP on its own, which keeps UP kernels and UP machines
  * working.
  */
 void
 set_interrupt_apic_ids(void)
 {
 	u_int cpu, id;
 
 	for (cpu = 0; cpu < MAXCPU; cpu++) {
 		id = cpu_apic_ids[cpu];
 
 		/* Skip unused slots, the BSP and disabled CPUs. */
 		if (id == -1 || cpu_info[id].cpu_bsp ||
 		    cpu_info[id].cpu_disabled)
 			continue;
 
 		/* Hyperthreads service interrupts only when allowed to. */
 		if (!hyperthreading_intr_allowed &&
 		    cpu_info[id].cpu_hyperthread)
 			continue;
 
 		intr_add_cpu(cpu);
 	}
 }
 
 
 #ifdef COUNT_XINVLTLB_HITS
 /*
  * Optional TLB-invalidation statistics, exported read/write under the
  * debug.xhits sysctl node.
  *
  * The xhits_* arrays hold one counter per CPU (MAXCPU entries each).
  * NOTE(review): presumably they are bumped by the global / single-page /
  * range invalidation IPI handlers on the receiving CPU -- confirm against
  * the handlers.
  */
 u_int xhits_gbl[MAXCPU];
 u_int xhits_pg[MAXCPU];
 u_int xhits_rng[MAXCPU];
 static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "");
 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
     sizeof(xhits_gbl), "IU", "");
 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
     sizeof(xhits_pg), "IU", "");
 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
     sizeof(xhits_rng), "IU", "");
 
 /*
  * System-wide counters, one per kind of invalidation request, plus the
  * accumulated size of range requests.
  * NOTE(review): presumably incremented on the sending side -- confirm
  * against the smp_masked_invl*() routines.
  */
 u_int ipi_global;
 u_int ipi_page;
 u_int ipi_range;
 u_int ipi_range_size;
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
     0, "");
 #endif /* COUNT_XINVLTLB_HITS */
 
 /*
  * Init and startup IPI.
  *
  * Sends the INIT / INIT-deassert / STARTUP / STARTUP sequence to the
  * local APIC identified by apic_id.  vector is OR-ed into both STARTUP
  * IPIs.  Panics if either STARTUP IPI is not reported as delivered.
  */
 void
 ipi_startup(int apic_id, int vector)
 {
 
 	/*
 	 * This attempts to follow the algorithm described in the
 	 * Intel Multiprocessor Specification v1.4 in section B.4.
 	 * For each IPI, we allow the local APIC ~20us to deliver the
 	 * IPI.  If that times out, we panic.
 	 * NOTE(review): the code below passes 100 to lapic_ipi_wait() and
 	 * ignores its result for the INIT IPI -- confirm the units of that
 	 * argument and whether the "~20us" figure is still accurate.
 	 */
 
 	/*
 	 * first we do an INIT IPI: this INIT IPI might be run, resetting
 	 * and running the target CPU. OR this INIT IPI might be latched (P5
 	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
 	 * ignored.
 	 */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
 	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
 	lapic_ipi_wait(100);
 
 	/* Explicitly deassert the INIT IPI. */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
 	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
 	    apic_id);
 
 	DELAY(10000);		/* wait ~10mS */
 
 	/*
 	 * next we do a STARTUP IPI: the previous INIT IPI might still be
 	 * latched, (P5 bug) this 1st STARTUP would then terminate
 	 * immediately, and the previously started INIT IPI would continue. OR
 	 * the previous INIT IPI has already run. and this STARTUP IPI will
 	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
 	 * will run.
 	 */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
 	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
 	    vector, apic_id);
 	if (!lapic_ipi_wait(100))
 		panic("Failed to deliver first STARTUP IPI to APIC %d",
 		    apic_id);
 	DELAY(200);		/* wait ~200uS */
 
 	/*
 	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
 	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
 	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
 	 * recognized after hardware RESET or INIT IPI.
 	 */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
 	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
 	    vector, apic_id);
 	if (!lapic_ipi_wait(100))
 		panic("Failed to deliver second STARTUP IPI to APIC %d",
 		    apic_id);
 
 	DELAY(200);		/* wait ~200uS */
 }
 
 /*
  * Send an IPI to specified CPU handling the bitmap logic.
  *
  * cpu is a logical CPU ID that must have an APIC ID assigned.  For IPIs
  * for which IPI_IS_BITMAPED() is true, the request is recorded as a bit
  * in the target's pc_ipi_bitmap and IPI_BITMAP_VECTOR is sent instead;
  * other IPIs are sent directly as the given vector.
  */
 void
 ipi_send_cpu(int cpu, u_int ipi)
 {
 	u_int bitmap, old, new;
 	u_int *cpu_bitmap;
 
 	KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1,
 	    ("IPI to non-existent CPU %d", cpu));
 
 	if (IPI_IS_BITMAPED(ipi)) {
 		bitmap = 1 << ipi;
 		ipi = IPI_BITMAP_VECTOR;
 		cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap;
 		old = *cpu_bitmap;
 		/*
 		 * Atomically OR our bit into the target's bitmap.  The loop
 		 * ends once the bit is observed set or our compare-and-set
 		 * succeeds; a failed atomic_fcmpset_int() refreshes 'old'.
 		 */
 		for (;;) {
 			if ((old & bitmap) == bitmap)
 				break;
 			new = old | bitmap;
 			if (atomic_fcmpset_int(cpu_bitmap, &old, new))
 				break;
 		}
 		/*
 		 * If any bit was already set, no new interrupt is sent.
 		 * NOTE(review): presumably an IPI_BITMAP_VECTOR is already
 		 * pending for that CPU and will pick up the new bit --
 		 * confirm against ipi_bitmap_handler().
 		 */
 		if (old)
 			return;
 	}
 	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
 }
 
 /*
  * Handle a bitmap IPI on the current CPU.
  *
  * Atomically fetches and clears this CPU's pc_ipi_bitmap and then runs
  * the action for each bit that was set (stack trace capture, preemption,
  * AST, hardclock).  frame is the interrupted trap frame; it is installed
  * as the thread's td_intr_frame for the duration of the handler.
  */
 void
 ipi_bitmap_handler(struct trapframe frame)
 {
 	struct trapframe *oldframe;
 	struct thread *td;
 	int cpu = PCPU_GET(cpuid);
 	u_int ipi_bitmap;
 
 	td = curthread;
 	ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]->
 	    pc_ipi_bitmap);
 
 	/*
 	 * sched_preempt() must be called to clear the pending preempt
 	 * IPI to enable delivery of further preempts.  However, the
 	 * critical section will cause extra scheduler lock thrashing
 	 * when used unconditionally.  Only critical_enter() if
 	 * hardclock must also run, which requires the section entry.
 	 */
 	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
 		critical_enter();
 
 	/* Mark interrupt context and remember the previous frame. */
 	td->td_intr_nesting_level++;
 	oldframe = td->td_intr_frame;
 	td->td_intr_frame = &frame;
 #if defined(STACK) || defined(DDB)
 	if (ipi_bitmap & (1 << IPI_TRACE))
 		stack_capture_intr();
 #endif
 	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
 #ifdef COUNT_IPIS
 		(*ipi_preempt_counts[cpu])++;
 #endif
 		sched_preempt(td);
 	}
 	if (ipi_bitmap & (1 << IPI_AST)) {
 #ifdef COUNT_IPIS
 		(*ipi_ast_counts[cpu])++;
 #endif
 		/* Nothing to do for AST */
 	}
 	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
 #ifdef COUNT_IPIS
 		(*ipi_hardclock_counts[cpu])++;
 #endif
 		hardclockintr();
 	}
 	/* Undo the interrupt-context bookkeeping in reverse order. */
 	td->td_intr_frame = oldframe;
 	td->td_intr_nesting_level--;
 	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
 		critical_exit();
 }
 
 /*
  * Deliver an IPI to every CPU contained in the given set.
  * The set is passed by value and consumed while iterating.
  */
 void
 ipi_selected(cpuset_t cpus, u_int ipi)
 {
 	int bit, target;
 
 	/*
 	 * IPI_STOP_HARD is delivered as an NMI, and the trap handler
 	 * cannot tell the source of an NMI by itself.  Publish the set
 	 * of receiving CPUs so that it can.
 	 */
 	if (ipi == IPI_STOP_HARD)
 		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
 
 	/* CPU_FFS() is 1-based and returns 0 once the set is empty. */
 	for (bit = CPU_FFS(&cpus); bit != 0; bit = CPU_FFS(&cpus)) {
 		target = bit - 1;
 		CPU_CLR(target, &cpus);
 		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, target, ipi);
 		ipi_send_cpu(target, ipi);
 	}
 }
 
 /*
  * Deliver an IPI to one CPU, identified by its logical CPU ID.
  */
 void
 ipi_cpu(int cpu, u_int ipi)
 {
 
 	if (ipi == IPI_STOP_HARD) {
 		/*
 		 * IPI_STOP_HARD is delivered as an NMI, and the trap
 		 * handler cannot tell the source of an NMI by itself.
 		 * Flag the receiving CPU so that it can.
 		 */
 		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
 	}
 
 	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
 	ipi_send_cpu(cpu, ipi);
 }
 
 /*
  * Deliver an IPI to every CPU in all_cpus other than the calling one.
  * Bitmapped IPIs are sent one CPU at a time through ipi_selected();
  * all others use the APIC "all but self" destination.
  */
 void
 ipi_all_but_self(u_int ipi)
 {
 	cpuset_t others;
 
 	others = all_cpus;
 	CPU_CLR(PCPU_GET(cpuid), &others);
 
 	if (!IPI_IS_BITMAPED(ipi)) {
 		/*
 		 * IPI_STOP_HARD is delivered as an NMI, and the trap
 		 * handler cannot tell the source of an NMI by itself.
 		 * Publish the set of receiving CPUs so that it can.
 		 */
 		if (ipi == IPI_STOP_HARD)
 			CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &others);
 
 		CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
 		lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
 	} else {
 		ipi_selected(others, ipi);
 	}
 }
 
 int
 ipi_nmi_handler(void)
 {
 	u_int cpuid;
 
 	/*
 	 * As long as there is not a simple way to know about a NMI's
 	 * source, if the bitmask for the current CPU is present in
 	 * the global pending bitword an IPI_STOP_HARD has been issued
 	 * and should be handled.
 	 */
 	cpuid = PCPU_GET(cpuid);
 	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
 		return (1);
 
 	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
 	cpustop_handler();
 	return (0);
 }
 
 /* Lock word used by nmi_call_kdb_smp(): 0 when free, 1 when held. */
 int nmi_kdb_lock;
 
 /*
  * Serialize NMI entry into the debugger across CPUs.
  *
  * The CPU that wins nmi_kdb_lock calls nmi_call_kdb() with the given
  * type and frame.  Any other CPU saves its context, marks itself in
  * stopped_cpus, spins until it can take the lock, and afterwards runs
  * cpustop_handler_post().  In both cases the lock is released before
  * returning.
  */
 void
 nmi_call_kdb_smp(u_int type, struct trapframe *frame)
 {
 	int cpu;
 	bool call_post;
 
 	cpu = PCPU_GET(cpuid);
 	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
 		nmi_call_kdb(cpu, type, frame);
 		call_post = false;
 	} else {
 		/* Lost the race: behave like a stopped CPU until the lock frees up. */
 		savectx(&stoppcbs[cpu]);
 		CPU_SET_ATOMIC(cpu, &stopped_cpus);
 		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
 			ia32_pause();
 		call_post = true;
 	}
 	atomic_store_rel_int(&nmi_kdb_lock, 0);
 	if (call_post)
 		cpustop_handler_post(cpu);
 }
 
 /*
  * Handle an IPI_STOP by saving our current context and spinning (or mwaiting,
  * if available) until we are resumed.
  *
  * The CPU announces itself in stopped_cpus and waits for its bit to
  * appear in started_cpus, then finishes via cpustop_handler_post().
  */
 void
 cpustop_handler(void)
 {
 	struct monitorbuf *mb;
 	u_int cpu;
 	bool use_mwait;
 
 	cpu = PCPU_GET(cpuid);
 
 	savectx(&stoppcbs[cpu]);
 
 	/* mb is only set, and only used, when use_mwait is true. */
 	use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 &&
 	    !mwait_cpustop_broken);
 	if (use_mwait) {
 		mb = PCPU_PTR(monitorbuf);
 		atomic_store_int(&mb->stop_state,
 		    MONITOR_STOPSTATE_STOPPED);
 	}
 
 	/* Indicate that we are stopped */
 	CPU_SET_ATOMIC(cpu, &stopped_cpus);
 
 	/* Wait for restart */
 	while (!CPU_ISSET(cpu, &started_cpus)) {
 		if (use_mwait) {
 			/*
 			 * Arm the monitor first and re-check stop_state
 			 * before mwait.
 			 * NOTE(review): presumably this closes the window in
 			 * which a wakeup write could be missed -- confirm.
 			 */
 			cpu_monitor(mb, 0, 0);
 			if (atomic_load_int(&mb->stop_state) ==
 			    MONITOR_STOPSTATE_STOPPED)
 				cpu_mwait(0, MWAIT_C1);
 			continue;
 		}
 
 		ia32_pause();
 
 		/*
 		 * Halt non-BSP CPUs on panic -- we're never going to need them
 		 * again, and might as well save power / release resources
 		 * (e.g., overprovisioned VM infrastructure).
 		 */
 		while (__predict_false(!IS_BSP() && KERNEL_PANICKED()))
 			halt();
 	}
 
 	cpustop_handler_post(cpu);
 }
 
 /*
  * Finish resuming a stopped CPU: drop its bits from started_cpus and
  * stopped_cpus, flush its TLB, restore debug registers where configured,
  * and on CPU 0 run and clear any registered cpustop_restartfunc.
  */
 static void
 cpustop_handler_post(u_int cpu)
 {
 
 	CPU_CLR_ATOMIC(cpu, &started_cpus);
 	CPU_CLR_ATOMIC(cpu, &stopped_cpus);
 
 	/*
 	 * TLB invalidations are not broadcast to CPUs while they are
 	 * stopped, so the TLB has to be cleared before resuming.
 	 */
 	invltlb_glob();
 
 #if defined(__amd64__) && defined(DDB)
 	amd64_db_resume_dbreg();
 #endif
 
 	/* The one-shot restart function only ever runs on CPU 0. */
 	if (cpu != 0 || cpustop_restartfunc == NULL)
 		return;
 	cpustop_restartfunc();
 	cpustop_restartfunc = NULL;
 }
 
 /*
  * Handle an IPI_SUSPEND by saving our current context and spinning until we
  * are resumed.
  *
  * savectx() selects one of two paths: a non-zero return takes the
  * suspend path (FPU state saved, CPU marked suspended), a zero return
  * takes the resume path (FPU state and CPU configuration restored).
  * Both paths then wait for this CPU's bit in toresume_cpus.
  */
 void
 cpususpend_handler(void)
 {
 	u_int cpu;
 
 	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
 
 	cpu = PCPU_GET(cpuid);
 	if (savectx(&susppcbs[cpu]->sp_pcb)) {
 #ifdef __amd64__
 		fpususpend(susppcbs[cpu]->sp_fpususpend);
 #else
 		npxsuspend(susppcbs[cpu]->sp_fpususpend);
 #endif
 		/*
 		 * suspended_cpus is cleared shortly after each AP is restarted
 		 * by a Startup IPI, so that the BSP can proceed to restarting
 		 * the next AP.
 		 *
 		 * resuming_cpus gets cleared when the AP completes
 		 * initialization after having been released by the BSP.
 		 * resuming_cpus is probably not the best name for the
 		 * variable, because it is actually a set of processors that
 		 * haven't resumed yet and haven't necessarily started resuming.
 		 *
 		 * Note that suspended_cpus is meaningful only for ACPI suspend
 		 * as it's not really used for Xen suspend since the APs are
 		 * automatically restored to the running state and the correct
 		 * context.  For the same reason resumectx is never called in
 		 * that case.
 		 */
 		CPU_SET_ATOMIC(cpu, &suspended_cpus);
 		CPU_SET_ATOMIC(cpu, &resuming_cpus);
 
 		/*
 		 * Invalidate the cache after setting the global status bits.
 		 * The last AP to set its bit may end up being an Owner of the
 		 * corresponding cache line in MOESI protocol.  The AP may be
 		 * stopped before the cache line is written to the main memory.
 		 */
 		wbinvd();
 	} else {
 		/* Resume path: restore FPU state and per-CPU configuration. */
 #ifdef __amd64__
 		fpuresume(susppcbs[cpu]->sp_fpususpend);
 #else
 		npxresume(susppcbs[cpu]->sp_fpususpend);
 #endif
 		pmap_init_pat();
 		initializecpu();
 		PCPU_SET(switchtime, 0);
 		PCPU_SET(switchticks, ticks);
 
 		/* Indicate that we have restarted and restored the context. */
 		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 	}
 
 	/* Wait for resume directive */
 	while (!CPU_ISSET(cpu, &toresume_cpus))
 		ia32_pause();
 
 	/* Re-apply microcode updates. */
 	ucode_reload();
 
 #ifdef __i386__
 	/* Finish removing the identity mapping of low memory for this AP. */
 	invltlb_glob();
 #endif
 
 	/* Optional platform and hypervisor resume hooks. */
 	if (cpu_ops.cpu_resume)
 		cpu_ops.cpu_resume();
 #ifdef __amd64__
 	if (vmm_resume_p)
 		vmm_resume_p();
 #endif
 
 	/* Resume MCA and local APIC */
 	lapic_xapic_mode();
 	mca_resume();
 	lapic_setup(0);
 
 	/* Indicate that we are resumed */
 	CPU_CLR_ATOMIC(cpu, &resuming_cpus);
 	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
 }
 
-
-void
-invlcache_handler(void)
-{
-	uint32_t generation;
-
-#ifdef COUNT_IPIS
-	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
-#endif /* COUNT_IPIS */
-
-	/*
-	 * Reading the generation here allows greater parallelism
-	 * since wbinvd is a serializing instruction.  Without the
-	 * temporary, we'd wait for wbinvd to complete, then the read
-	 * would execute, then the dependent write, which must then
-	 * complete before return from interrupt.
-	 */
-	generation = smp_tlb_generation;
-	wbinvd();
-	PCPU_SET(smp_tlb_done, generation);
-}
-
 /*
  * This is called once the rest of the system is up and running and we're
  * ready to let the AP's out of the pen.
  *
  * Sets aps_ready and then waits until smp_started becomes non-zero.
  * Nothing is done when only one CPU is configured.
  */
 static void
 release_aps(void *dummy __unused)
 {
 
 	if (mp_ncpus == 1)
 		return;
 	atomic_store_rel_int(&aps_ready, 1);
 	/*
 	 * smp_started is published with atomic_store_rel_int() by the last
 	 * AP to come up; read it with the matching acquire load, as the APs
 	 * themselves do while waiting.
 	 */
 	while (atomic_load_acq_int(&smp_started) == 0)
 		ia32_pause();
 }
 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
 
 #ifdef COUNT_IPIS
 /*
  * Register one interrupt counter per CPU for each kind of IPI.
  * Counters are named "cpu<N>:<kind>".
  */
 static void
 mp_ipi_intrcnt(void *dummy)
 {
 	char name[64];
 	int cpu;
 
 /* Format the counter name for this CPU and register the counter slot. */
 #define	MP_IPI_INTRCNT_ADD(kind, counts) do {				\
 	snprintf(name, sizeof(name), "cpu%d:" kind, cpu);		\
 	intrcnt_add(name, &(counts)[cpu]);				\
 } while (0)
 
 	CPU_FOREACH(cpu) {
 		MP_IPI_INTRCNT_ADD("invltlb", ipi_invltlb_counts);
 		MP_IPI_INTRCNT_ADD("invlrng", ipi_invlrng_counts);
 		MP_IPI_INTRCNT_ADD("invlpg", ipi_invlpg_counts);
 		MP_IPI_INTRCNT_ADD("invlcache", ipi_invlcache_counts);
 		MP_IPI_INTRCNT_ADD("preempt", ipi_preempt_counts);
 		MP_IPI_INTRCNT_ADD("ast", ipi_ast_counts);
 		MP_IPI_INTRCNT_ADD("rendezvous", ipi_rendezvous_counts);
 		MP_IPI_INTRCNT_ADD("hardclock", ipi_hardclock_counts);
 	}
 
 #undef MP_IPI_INTRCNT_ADD
 }
 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
 #endif
-
-/*
- * Flush the TLB on other CPU's
- */
-
-/* Variables needed for SMP tlb shootdown. */
-vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
-pmap_t smp_tlb_pmap;
-volatile uint32_t smp_tlb_generation;
-
-#ifdef __amd64__
-#define	read_eflags() read_rflags()
-#endif
-
-/*
- * Used by pmap to request invalidation of TLB or cache on local and
- * remote processors.  Mask provides the set of remote CPUs which are
- * to be signalled with the IPI specified by vector.  The curcpu_cb
- * callback is invoked on the calling CPU while waiting for remote
- * CPUs to complete the operation.
- *
- * The callback function is called unconditionally on the caller's
- * underlying processor, even when this processor is not set in the
- * mask.  So, the callback function must be prepared to handle such
- * spurious invocations.
- */
-static void
-smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
-    vm_offset_t addr1, vm_offset_t addr2, smp_invl_cb_t curcpu_cb)
-{
-	cpuset_t other_cpus;
-	volatile uint32_t *p_cpudone;
-	uint32_t generation;
-	int cpu;
-
-	/*
-	 * It is not necessary to signal other CPUs while booting or
-	 * when in the debugger.
-	 */
-	if (kdb_active || KERNEL_PANICKED() || !smp_started) {
-		curcpu_cb(pmap, addr1, addr2);
-		return;
-	}
-
-	sched_pin();
-
-	/*
-	 * Check for other cpus.  Return if none.
-	 */
-	if (CPU_ISFULLSET(&mask)) {
-		if (mp_ncpus <= 1)
-			goto nospinexit;
-	} else {
-		CPU_CLR(PCPU_GET(cpuid), &mask);
-		if (CPU_EMPTY(&mask))
-			goto nospinexit;
-	}
-
-	if (!(read_eflags() & PSL_I))
-		panic("%s: interrupts disabled", __func__);
-	mtx_lock_spin(&smp_ipi_mtx);
-	smp_tlb_addr1 = addr1;
-	smp_tlb_addr2 = addr2;
-	smp_tlb_pmap = pmap;
-	generation = ++smp_tlb_generation;
-	if (CPU_ISFULLSET(&mask)) {
-		ipi_all_but_self(vector);
-		other_cpus = all_cpus;
-		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
-	} else {
-		other_cpus = mask;
-		while ((cpu = CPU_FFS(&mask)) != 0) {
-			cpu--;
-			CPU_CLR(cpu, &mask);
-			CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
-			    cpu, vector);
-			ipi_send_cpu(cpu, vector);
-		}
-	}
-	curcpu_cb(pmap, addr1, addr2);
-	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
-		cpu--;
-		CPU_CLR(cpu, &other_cpus);
-		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
-		while (*p_cpudone != generation)
-			ia32_pause();
-	}
-	mtx_unlock_spin(&smp_ipi_mtx);
-	sched_unpin();
-	return;
-
-nospinexit:
-	curcpu_cb(pmap, addr1, addr2);
-	sched_unpin();
-}
-
-void
-smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb)
-{
-
-	smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0, curcpu_cb);
-#ifdef COUNT_XINVLTLB_HITS
-	ipi_global++;
-#endif
-}
-
-void
-smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap,
-    smp_invl_cb_t curcpu_cb)
-{
-
-	smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0, curcpu_cb);
-#ifdef COUNT_XINVLTLB_HITS
-	ipi_page++;
-#endif
-}
-
-void
-smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
-    pmap_t pmap, smp_invl_cb_t curcpu_cb)
-{
-
-	smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, addr2,
-	    curcpu_cb);
-#ifdef COUNT_XINVLTLB_HITS
-	ipi_range++;
-	ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
-#endif
-}
-
-void
-smp_cache_flush(smp_invl_cb_t curcpu_cb)
-{
-
-	smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 0, 0,
-	    curcpu_cb);
-}
-
-/*
- * Handlers for TLB related IPIs
- */
-void
-invltlb_handler(void)
-{
-	uint32_t generation;
-  
-#ifdef COUNT_XINVLTLB_HITS
-	xhits_gbl[PCPU_GET(cpuid)]++;
-#endif /* COUNT_XINVLTLB_HITS */
-#ifdef COUNT_IPIS
-	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
-#endif /* COUNT_IPIS */
-
-	/*
-	 * Reading the generation here allows greater parallelism
-	 * since invalidating the TLB is a serializing operation.
-	 */
-	generation = smp_tlb_generation;
-	if (smp_tlb_pmap == kernel_pmap)
-		invltlb_glob();
-#ifdef __amd64__
-	else
-		invltlb();
-#endif
-	PCPU_SET(smp_tlb_done, generation);
-}
-
-void
-invlpg_handler(void)
-{
-	uint32_t generation;
-
-#ifdef COUNT_XINVLTLB_HITS
-	xhits_pg[PCPU_GET(cpuid)]++;
-#endif /* COUNT_XINVLTLB_HITS */
-#ifdef COUNT_IPIS
-	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
-#endif /* COUNT_IPIS */
-
-	generation = smp_tlb_generation;	/* Overlap with serialization */
-#ifdef __i386__
-	if (smp_tlb_pmap == kernel_pmap)
-#endif
-		invlpg(smp_tlb_addr1);
-	PCPU_SET(smp_tlb_done, generation);
-}
-
-void
-invlrng_handler(void)
-{
-	vm_offset_t addr, addr2;
-	uint32_t generation;
-
-#ifdef COUNT_XINVLTLB_HITS
-	xhits_rng[PCPU_GET(cpuid)]++;
-#endif /* COUNT_XINVLTLB_HITS */
-#ifdef COUNT_IPIS
-	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
-#endif /* COUNT_IPIS */
-
-	addr = smp_tlb_addr1;
-	addr2 = smp_tlb_addr2;
-	generation = smp_tlb_generation;	/* Overlap with serialization */
-#ifdef __i386__
-	if (smp_tlb_pmap == kernel_pmap)
-#endif
-		do {
-			invlpg(addr);
-			addr += PAGE_SIZE;
-		} while (addr < addr2);
-
-	PCPU_SET(smp_tlb_done, generation);
-}
diff --git a/sys/x86/xen/xen_apic.c b/sys/x86/xen/xen_apic.c
index 8bf2158dbec2..7d23f0a50417 100644
--- a/sys/x86/xen/xen_apic.c
+++ b/sys/x86/xen/xen_apic.c
@@ -1,627 +1,573 @@
 /*
  * Copyright (c) 2014 Roger Pau Monné <roger.pau@citrix.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
 #include <sys/systm.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/cpufunc.h>
 #include <machine/cpu.h>
 #include <machine/intr_machdep.h>
 #include <machine/md_var.h>
 #include <machine/smp.h>
 
 #include <x86/apicreg.h>
 #include <x86/apicvar.h>
 
 #include <xen/xen-os.h>
 #include <xen/features.h>
 #include <xen/gnttab.h>
 #include <xen/hypervisor.h>
 #include <xen/hvm.h>
 #include <xen/xen_intr.h>
 
 #include <xen/interface/vcpu.h>
 
 /*--------------------------------- Macros -----------------------------------*/
 
 #define XEN_APIC_UNSUPPORTED \
 	panic("%s: not available in Xen PV port.", __func__)
 
 
 /*--------------------------- Forward Declarations ---------------------------*/
 #ifdef SMP
 static driver_filter_t xen_smp_rendezvous_action;
+#ifdef __amd64__
+static driver_filter_t xen_invlop;
+#else
 static driver_filter_t xen_invltlb;
 static driver_filter_t xen_invlpg;
 static driver_filter_t xen_invlrng;
 static driver_filter_t xen_invlcache;
+#endif
 static driver_filter_t xen_ipi_bitmap_handler;
 static driver_filter_t xen_cpustop_handler;
 static driver_filter_t xen_cpususpend_handler;
 #endif
 
 /*---------------------------------- Macros ----------------------------------*/
 #define	IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS)
 
 /*--------------------------------- Xen IPIs ---------------------------------*/
 #ifdef SMP
 struct xen_ipi_handler
 {
 	driver_filter_t	*filter;
 	const char	*description;
 };
 
 static struct xen_ipi_handler xen_ipis[] = 
 {
 	[IPI_TO_IDX(IPI_RENDEZVOUS)]	= { xen_smp_rendezvous_action,	"r"   },
+#ifdef __amd64__
+	[IPI_TO_IDX(IPI_INVLOP)]	= { xen_invlop,			"itlb"},
+#else
 	[IPI_TO_IDX(IPI_INVLTLB)]	= { xen_invltlb,		"itlb"},
 	[IPI_TO_IDX(IPI_INVLPG)]	= { xen_invlpg,			"ipg" },
 	[IPI_TO_IDX(IPI_INVLRNG)]	= { xen_invlrng,		"irg" },
 	[IPI_TO_IDX(IPI_INVLCACHE)]	= { xen_invlcache,		"ic"  },
+#endif
 	[IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler,	"b"   },
 	[IPI_TO_IDX(IPI_STOP)]		= { xen_cpustop_handler,	"st"  },
 	[IPI_TO_IDX(IPI_SUSPEND)]	= { xen_cpususpend_handler,	"sp"  },
 };
 #endif
 
 /*------------------------------- Per-CPU Data -------------------------------*/
 #ifdef SMP
 DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]);
 #endif
 
 /*------------------------------- Xen PV APIC --------------------------------*/
 
 static void
 xen_pv_lapic_create(u_int apic_id, int boot_cpu)
 {
 #ifdef SMP
 	cpu_add(apic_id, boot_cpu);
 #endif
 }
 
 static void
 xen_pv_lapic_init(vm_paddr_t addr)
 {
 
 }
 
 static void
 xen_pv_lapic_setup(int boot)
 {
 
 }
 
 static void
 xen_pv_lapic_dump(const char *str)
 {
 
 	printf("cpu%d %s XEN PV LAPIC\n", PCPU_GET(cpuid), str);
 }
 
 static void
 xen_pv_lapic_disable(void)
 {
 
 }
 
 static bool
 xen_pv_lapic_is_x2apic(void)
 {
 
 	return (false);
 }
 
 static void
 xen_pv_lapic_eoi(void)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static int
 xen_pv_lapic_id(void)
 {
 
 	return (PCPU_GET(apic_id));
 }
 
 static int
 xen_pv_lapic_intr_pending(u_int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static u_int
 xen_pv_apic_cpuid(u_int apic_id)
 {
 #ifdef SMP
 	return (apic_cpuids[apic_id]);
 #else
 	return (0);
 #endif
 }
 
 static u_int
 xen_pv_apic_alloc_vector(u_int apic_id, u_int irq)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static u_int
 xen_pv_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static void
 xen_pv_apic_disable_vector(u_int apic_id, u_int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_apic_enable_vector(u_int apic_id, u_int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_apic_free_vector(u_int apic_id, u_int vector, u_int irq)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static int
 xen_pv_lapic_enable_pmc(void)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static void
 xen_pv_lapic_disable_pmc(void)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_lapic_reenable_pmc(void)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_lapic_enable_cmc(void)
 {
 
 }
 
 #ifdef SMP
 static void
 xen_pv_lapic_ipi_raw(register_t icrlo, u_int dest)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 #define PCPU_ID_GET(id, field) (pcpu_find(id)->pc_##field)
 static void
 send_nmi(int dest)
 {
 	unsigned int cpu;
 
 	/*
 	 * NMIs are not routed over event channels, and instead delivered as on
 	 * native using the exception vector (#2). Triggering them can be done
 	 * using the local APIC, or an hypercall as a shortcut like it's done
 	 * below.
 	 */
 	switch(dest) {
 	case APIC_IPI_DEST_SELF:
 		HYPERVISOR_vcpu_op(VCPUOP_send_nmi, PCPU_GET(vcpu_id), NULL);
 		break;
 	case APIC_IPI_DEST_ALL:
 		CPU_FOREACH(cpu)
 			HYPERVISOR_vcpu_op(VCPUOP_send_nmi,
 			    PCPU_ID_GET(cpu, vcpu_id), NULL);
 		break;
 	case APIC_IPI_DEST_OTHERS:
 		CPU_FOREACH(cpu)
 			if (cpu != PCPU_GET(cpuid))
 				HYPERVISOR_vcpu_op(VCPUOP_send_nmi,
 				    PCPU_ID_GET(cpu, vcpu_id), NULL);
 		break;
 	default:
 		HYPERVISOR_vcpu_op(VCPUOP_send_nmi,
 		    PCPU_ID_GET(apic_cpuid(dest), vcpu_id), NULL);
 		break;
 	}
 }
 #undef PCPU_ID_GET
 
 static void
 xen_pv_lapic_ipi_vectored(u_int vector, int dest)
 {
 	xen_intr_handle_t *ipi_handle;
 	int ipi_idx, to_cpu, self;
 
 	if (vector >= IPI_NMI_FIRST) {
 		send_nmi(dest);
 		return;
 	}
 
 	ipi_idx = IPI_TO_IDX(vector);
 	if (ipi_idx >= nitems(xen_ipis))
 		panic("IPI out of range");
 
 	switch(dest) {
 	case APIC_IPI_DEST_SELF:
 		ipi_handle = DPCPU_GET(ipi_handle);
 		xen_intr_signal(ipi_handle[ipi_idx]);
 		break;
 	case APIC_IPI_DEST_ALL:
 		CPU_FOREACH(to_cpu) {
 			ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
 			xen_intr_signal(ipi_handle[ipi_idx]);
 		}
 		break;
 	case APIC_IPI_DEST_OTHERS:
 		self = PCPU_GET(cpuid);
 		CPU_FOREACH(to_cpu) {
 			if (to_cpu != self) {
 				ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
 				xen_intr_signal(ipi_handle[ipi_idx]);
 			}
 		}
 		break;
 	default:
 		to_cpu = apic_cpuid(dest);
 		ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
 		xen_intr_signal(ipi_handle[ipi_idx]);
 		break;
 	}
 }
 
 static int
 xen_pv_lapic_ipi_wait(int delay)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 #endif	/* SMP */
 
 static int
 xen_pv_lapic_ipi_alloc(inthand_t *ipifunc)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (-1);
 }
 
 static void
 xen_pv_lapic_ipi_free(int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static int
 xen_pv_lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static int
 xen_pv_lapic_set_lvt_mode(u_int apic_id, u_int lvt, uint32_t mode)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static int
 xen_pv_lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static int
 xen_pv_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
     enum intr_trigger trigger)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 /* Xen apic_ops implementation */
 struct apic_ops xen_apic_ops = {
 	.create			= xen_pv_lapic_create,
 	.init			= xen_pv_lapic_init,
 	.xapic_mode		= xen_pv_lapic_disable,
 	.is_x2apic		= xen_pv_lapic_is_x2apic,
 	.setup			= xen_pv_lapic_setup,
 	.dump			= xen_pv_lapic_dump,
 	.disable		= xen_pv_lapic_disable,
 	.eoi			= xen_pv_lapic_eoi,
 	.id			= xen_pv_lapic_id,
 	.intr_pending		= xen_pv_lapic_intr_pending,
 	.set_logical_id		= xen_pv_lapic_set_logical_id,
 	.cpuid			= xen_pv_apic_cpuid,
 	.alloc_vector		= xen_pv_apic_alloc_vector,
 	.alloc_vectors		= xen_pv_apic_alloc_vectors,
 	.enable_vector		= xen_pv_apic_enable_vector,
 	.disable_vector		= xen_pv_apic_disable_vector,
 	.free_vector		= xen_pv_apic_free_vector,
 	.enable_pmc		= xen_pv_lapic_enable_pmc,
 	.disable_pmc		= xen_pv_lapic_disable_pmc,
 	.reenable_pmc		= xen_pv_lapic_reenable_pmc,
 	.enable_cmc		= xen_pv_lapic_enable_cmc,
 #ifdef SMP
 	.ipi_raw		= xen_pv_lapic_ipi_raw,
 	.ipi_vectored		= xen_pv_lapic_ipi_vectored,
 	.ipi_wait		= xen_pv_lapic_ipi_wait,
 #endif
 	.ipi_alloc		= xen_pv_lapic_ipi_alloc,
 	.ipi_free		= xen_pv_lapic_ipi_free,
 	.set_lvt_mask		= xen_pv_lapic_set_lvt_mask,
 	.set_lvt_mode		= xen_pv_lapic_set_lvt_mode,
 	.set_lvt_polarity	= xen_pv_lapic_set_lvt_polarity,
 	.set_lvt_triggermode	= xen_pv_lapic_set_lvt_triggermode,
 };
 
 #ifdef SMP
 /*---------------------------- XEN PV IPI Handlers ---------------------------*/
 /*
  * These are C clones of the ASM functions found in apic_vector.
  */
 static int
 xen_ipi_bitmap_handler(void *arg)
 {
 	struct trapframe *frame;
 
 	frame = arg;
 	ipi_bitmap_handler(*frame);
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_smp_rendezvous_action(void *arg)
 {
 #ifdef COUNT_IPIS
 	(*ipi_rendezvous_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	smp_rendezvous_action();
 	return (FILTER_HANDLED);
 }
 
-static int
-xen_invltlb(void *arg)
-{
-
-	invltlb_handler();
-	return (FILTER_HANDLED);
-}
-
 #ifdef __amd64__
 static int
-xen_invltlb_invpcid(void *arg)
-{
-
-	invltlb_invpcid_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_invltlb_pcid(void *arg)
+xen_invlop(void *arg)
 {
 
-	invltlb_pcid_handler();
+	invlop_handler();
 	return (FILTER_HANDLED);
 }
 
-static int
-xen_invltlb_invpcid_pti(void *arg)
-{
-
-	invltlb_invpcid_pti_handler();
-	return (FILTER_HANDLED);
-}
+#else /* __i386__ */
 
 static int
-xen_invlpg_invpcid_handler(void *arg)
-{
-
-	invlpg_invpcid_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_invlpg_pcid_handler(void *arg)
-{
-
-	invlpg_pcid_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_invlrng_invpcid_handler(void *arg)
-{
-
-	invlrng_invpcid_handler();
-	return (FILTER_HANDLED);
-}
-
-static int
-xen_invlrng_pcid_handler(void *arg)
+xen_invltlb(void *arg)
 {
 
-	invlrng_pcid_handler();
+	invltlb_handler();
 	return (FILTER_HANDLED);
 }
-#endif
 
 static int
 xen_invlpg(void *arg)
 {
 
 	invlpg_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_invlrng(void *arg)
 {
 
 	invlrng_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_invlcache(void *arg)
 {
 
 	invlcache_handler();
 	return (FILTER_HANDLED);
 }
+#endif /* __amd64__ */
 
 static int
 xen_cpustop_handler(void *arg)
 {
 
 	cpustop_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_cpususpend_handler(void *arg)
 {
 
 	cpususpend_handler();
 	return (FILTER_HANDLED);
 }
 
 /*----------------------------- XEN PV IPI setup -----------------------------*/
 /*
  * Those functions are provided outside of the Xen PV APIC implementation
  * so PVHVM guests can also use PV IPIs without having an actual Xen PV APIC,
  * because on PVHVM there's an emulated LAPIC provided by Xen.
  */
 static void
 xen_cpu_ipi_init(int cpu)
 {
 	xen_intr_handle_t *ipi_handle;
 	const struct xen_ipi_handler *ipi;
 	int idx, rc;
 
 	ipi_handle = DPCPU_ID_GET(cpu, ipi_handle);
 
 	for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) {
 
 		if (ipi->filter == NULL) {
 			ipi_handle[idx] = NULL;
 			continue;
 		}
 
 		rc = xen_intr_alloc_and_bind_ipi(cpu, ipi->filter,
 		    INTR_TYPE_TTY, &ipi_handle[idx]);
 		if (rc != 0)
 			panic("Unable to allocate a XEN IPI port");
 		xen_intr_describe(ipi_handle[idx], "%s", ipi->description);
 	}
 }
 
 static void
 xen_setup_cpus(void)
 {
 	int i;
 
 	if (!xen_vector_callback_enabled)
 		return;
 
-#ifdef __amd64__
-	if (pmap_pcid_enabled) {
-		if (pti)
-			xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter =
-			    invpcid_works ? xen_invltlb_invpcid_pti :
-			    xen_invltlb_pcid;
-		else
-			xen_ipis[IPI_TO_IDX(IPI_INVLTLB)].filter =
-			    invpcid_works ? xen_invltlb_invpcid :
-			    xen_invltlb_pcid;
-		xen_ipis[IPI_TO_IDX(IPI_INVLPG)].filter = invpcid_works ?
-		    xen_invlpg_invpcid_handler : xen_invlpg_pcid_handler;
-		xen_ipis[IPI_TO_IDX(IPI_INVLRNG)].filter = invpcid_works ?
-		    xen_invlrng_invpcid_handler : xen_invlrng_pcid_handler;
-	}
-#endif
 	CPU_FOREACH(i)
 		xen_cpu_ipi_init(i);
 
 	/* Set the xen pv ipi ops to replace the native ones */
 	if (xen_hvm_domain())
 		apic_ops.ipi_vectored = xen_pv_lapic_ipi_vectored;
 }
 
 /* Switch to using PV IPIs as soon as the vcpu_id is set. */
 SYSINIT(xen_setup_cpus, SI_SUB_SMP, SI_ORDER_SECOND, xen_setup_cpus, NULL);
 #endif /* SMP */