Index: head/share/man/man9/swi.9
===================================================================
--- head/share/man/man9/swi.9	(revision 363526)
+++ head/share/man/man9/swi.9	(revision 363527)
@@ -1,249 +1,256 @@
 .\" Copyright (c) 2000-2001 John H. Baldwin <jhb@FreeBSD.org>
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd April 19, 2012
+.Dd July 25, 2020
 .Dt SWI 9
 .Os
 .Sh NAME
 .Nm swi_add ,
 .Nm swi_remove ,
 .Nm swi_sched
 .Nd register and schedule software interrupt handlers
 .Sh SYNOPSIS
 .In sys/param.h
 .In sys/bus.h
 .In sys/interrupt.h
 .Vt "extern struct intr_event *tty_intr_event" ;
 .Vt "extern struct intr_event *clk_intr_event" ;
 .Vt "extern void *vm_ih" ;
 .Ft int
 .Fo swi_add
 .Fa "struct intr_event **eventp"
 .Fa "const char *name"
 .Fa "driver_intr_t handler"
 .Fa "void *arg"
 .Fa "int pri"
 .Fa "enum intr_type flags"
 .Fa "void **cookiep"
 .Fc
 .Ft int
 .Fn swi_remove "void *cookie"
 .Ft void
 .Fn swi_sched "void *cookie" "int flags"
 .Sh DESCRIPTION
 These functions are used to register and schedule software interrupt handlers.
 Software interrupt handlers are attached to a software interrupt thread, just
 as hardware interrupt handlers are attached to a hardware interrupt thread.
 Multiple handlers can be attached to the same thread.
 Software interrupt handlers can be used to queue up less critical processing
 inside of hardware interrupt handlers so that the work can be done at a later
 time.
 Software interrupt threads are different from other kernel threads in that they
 are treated as an interrupt thread.
 This means that time spent executing these threads is counted as interrupt
 time, and that they can be run via a lightweight context switch.
 .Pp
 The
 .Fn swi_add
 function is used to add a new software interrupt handler to a specified
 interrupt event.
 The
 .Fa eventp
 argument is an optional pointer to a
 .Vt struct intr_event
 pointer.
 If this argument points to an existing event that holds a list of
 interrupt handlers, then this handler will be attached to that event.
 Otherwise a new event will be created, and if
 .Fa eventp
 is not
 .Dv NULL ,
 then the pointer at that address to will be modified to point to the
 newly created event.
 The
 .Fa name
 argument is used to associate a name with a specific handler.
 This name is appended to the name of the software interrupt thread that this
 handler is attached to.
 The
 .Fa handler
 argument is the function that will be executed when the handler is scheduled
 to run.
 The
 .Fa arg
 parameter will be passed in as the only parameter to
 .Fa handler
 when the function is executed.
 The
 .Fa pri
 value specifies the priority of this interrupt handler relative to other
 software interrupt handlers.
 If an interrupt event is created, then this value is used as the vector,
 and the
 .Fa flags
 argument is used to specify the attributes of a handler such as
 .Dv INTR_MPSAFE .
 The
 .Fa cookiep
 argument points to a
 .Vt void *
 cookie.
 This cookie will be set to a value that uniquely identifies this handler,
 and is used to schedule the handler for execution later on.
 .Pp
 The
 .Fn swi_remove
 function is used to teardown an interrupt handler pointed to by the
 .Fa cookie
 argument.
 It detaches the interrupt handler from the associated interrupt event
 and frees its memory.
 .Pp
 The
 .Fn swi_sched
 function is used to schedule an interrupt handler and its associated thread to
 run.
 The
 .Fa cookie
 argument specifies which software interrupt handler should be scheduled to run.
 The
 .Fa flags
 argument specifies how and when the handler should be run and is a mask of one
 or more of the following flags:
-.Bl -tag -width SWI_DELAY
+.Bl -tag -width SWI_FROMNMI
 .It Dv SWI_DELAY
 Specifies that the kernel should mark the specified handler as needing to run,
 but the kernel should not schedule the software interrupt thread to run.
 Instead,
 .Fa handler
 will be executed the next time that the software interrupt thread runs after
 being scheduled by another event.
 Attaching a handler to the clock software interrupt thread and using this flag
 when scheduling a software interrupt handler can be used to implement the
 functionality performed by
 .Fn setdelayed
 in earlier versions of
 .Fx .
+.It Dv SWI_FROMNMI
+Specifies that
+.Fn swi_sched
+is called from NMI context and should be careful about used KPIs.
+On platforms allowing IPI sending from NMI context it immediately wakes
+.Va clk_intr_event
+via the IPI, otherwise it works just like SWI_DELAY.
 .El
 .Pp
 The
 .Va tty_intr_event
 and
 .Va clk_intr_event
 variables contain pointers to the software interrupt handlers for the tty and
 clock software interrupts, respectively.
 .Va tty_intr_event
 is used to hang tty software interrupt handlers off of the same thread.
 .Va clk_intr_event
 is used to hang delayed handlers off of the clock software interrupt thread so
 that the functionality of
 .Fn setdelayed
 can be obtained in conjunction with
 .Dv SWI_DELAY .
 The
 .Va vm_ih
 handler cookie is used to schedule software interrupt threads to run for the
 VM subsystem.
 .Sh RETURN VALUES
 The
 .Fn swi_add
 and
 .Fn swi_remove
 functions return zero on success and non-zero on failure.
 .Sh ERRORS
 The
 .Fn swi_add
 function will fail if:
 .Bl -tag -width Er
 .It Bq Er EAGAIN
 The system-imposed limit on the total
 number of processes under execution would be exceeded.
 The limit is given by the
 .Xr sysctl 3
 MIB variable
 .Dv KERN_MAXPROC .
 .It Bq Er EINVAL
 The
 .Fa flags
 argument specifies
 .Dv INTR_ENTROPY .
 .It Bq Er EINVAL
 The
 .Fa eventp
 argument points to a hardware interrupt thread.
 .It Bq Er EINVAL
 Either of the
 .Fa name
 or
 .Fa handler
 arguments are
 .Dv NULL .
 .It Bq Er EINVAL
 The
 .Dv INTR_EXCL
 flag is specified and the interrupt event pointed to by
 .Fa eventp
 already has at least one handler, or the interrupt event already has an
 exclusive handler.
 .El
 .Pp
 The
 .Fn swi_remove
 function will fail if:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 A software interrupt handler pointed to by
 .Fa cookie
 is
 .Dv NULL .
 .El
 .Sh SEE ALSO
 .Xr ithread 9 ,
 .Xr taskqueue 9
 .Sh HISTORY
 The
 .Fn swi_add
 and
 .Fn swi_sched
 functions first appeared in
 .Fx 5.0 .
 They replaced the
 .Fn register_swi
 function which appeared in
 .Fx 3.0
 and the
 .Fn setsoft* ,
 and
 .Fn schedsoft*
 functions which date back to at least
 .Bx 4.4 .
 The
 .Fn swi_remove
 function first appeared in
 .Fx 6.1 .
 .Sh BUGS
 Most of the global variables described in this manual page should not be
 global, or at the very least should not be declared in
 .In sys/interrupt.h .
Index: head/sys/amd64/amd64/apic_vector.S
===================================================================
--- head/sys/amd64/amd64/apic_vector.S	(revision 363526)
+++ head/sys/amd64/amd64/apic_vector.S	(revision 363527)
@@ -1,257 +1,267 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 2014-2018 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by
  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: vector.s, 386BSD 0.1 unknown origin
  * $FreeBSD$
  */
 
 /*
  * Interrupt entry points for external interrupts triggered by I/O APICs
  * as well as IPI handlers.
  */
 
 #include "opt_smp.h"
 
 #include "assym.inc"
 
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 #include <x86/apicreg.h>
 
 #ifdef SMP
 #define LK	lock ;
 #else
 #define LK
 #endif
 
 	.text
 	SUPERALIGN_TEXT
 	/* End Of Interrupt to APIC */
 as_lapic_eoi:
 	cmpl	$0,x2apic_mode
 	jne	1f
 	movq	lapic_map,%rax
 	movl	$0,LA_EOI(%rax)
 	ret
 1:
 	movl	$MSR_APIC_EOI,%ecx
 	xorl	%eax,%eax
 	xorl	%edx,%edx
 	wrmsr
 	ret
 
 /*
  * I/O Interrupt Entry Point.  Rather than having one entry point for
  * each interrupt source, we use one entry point for each 32-bit word
  * in the ISR.  The handler determines the highest bit set in the ISR,
  * translates that into a vector, and passes the vector to the
  * lapic_handle_intr() function.
  */
 	.macro	ISR_VEC	index, vec_name
 	INTR_HANDLER	\vec_name
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	cmpl	$0,x2apic_mode
 	je	1f
 	movl	$(MSR_APIC_ISR0 + \index),%ecx
 	rdmsr
 	jmp	2f
 1:
 	movq	lapic_map, %rdx		/* pointer to local APIC */
 	movl	LA_ISR + 16 * (\index)(%rdx), %eax	/* load ISR */
 2:
 	bsrl	%eax, %eax	/* index of highest set bit in ISR */
 	jz	3f
 	addl	$(32 * \index),%eax
 	movq	%rsp, %rsi
 	movl	%eax, %edi	/* pass the IRQ */
 	call	lapic_handle_intr
 3:
 	MEXITCOUNT
 	jmp	doreti
 	.endm
 
 /*
  * Handle "spurious INTerrupts".
  * Notes:
  *  This is different than the "spurious INTerrupt" generated by an
  *   8259 PIC for missing INTs.  See the APIC documentation for details.
  *  This routine should NOT do an 'EOI' cycle.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(spuriousint)
 	/* No EOI cycle used here */
 	jmp	doreti_iret
 
 	ISR_VEC	1, apic_isr1
 	ISR_VEC	2, apic_isr2
 	ISR_VEC	3, apic_isr3
 	ISR_VEC	4, apic_isr4
 	ISR_VEC	5, apic_isr5
 	ISR_VEC	6, apic_isr6
 	ISR_VEC	7, apic_isr7
 
 /*
  * Local APIC periodic timer handler.
  */
 	INTR_HANDLER	timerint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	lapic_handle_timer
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Local APIC CMCI handler.
  */
 	INTR_HANDLER cmcint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_cmc
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Local APIC error interrupt handler.
  */
 	INTR_HANDLER errorint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_error
 	MEXITCOUNT
 	jmp	doreti
 
 #ifdef XENHVM
 /*
  * Xen event channel upcall interrupt handler.
  * Only used when the hypervisor supports direct vector callbacks.
  */
 	INTR_HANDLER xen_intr_upcall
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	xen_intr_handle_upcall
 	MEXITCOUNT
 	jmp	doreti
 #endif
 
 #ifdef SMP
 /*
  * Global address space TLB shootdown.
  */
 	.text
 
 	SUPERALIGN_TEXT
 /*
  * IPI handler for cache and TLB shootdown
  */
 	INTR_HANDLER invlop
 	call	invlop_handler
 	call	as_lapic_eoi
 	jmp	ld_regs
 
 /*
  * Handler for IPIs sent via the per-cpu IPI bitmap.
  */
 	INTR_HANDLER ipi_intr_bitmap_handler
 	call	as_lapic_eoi
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	ipi_bitmap_handler
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives an IPI_STOP from another CPU.
  */
 	INTR_HANDLER cpustop
 	call	as_lapic_eoi
 	call	cpustop_handler
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
  */
 	INTR_HANDLER cpususpend
 	call	cpususpend_handler
 	call	as_lapic_eoi
 	jmp	doreti
 
 /*
+ * Executed by a CPU when it receives an IPI_SWI.
+ */
+	INTR_HANDLER ipi_swi
+	call	as_lapic_eoi
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	call	ipi_swi_handler
+	MEXITCOUNT
+	jmp	doreti
+
+/*
  * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU.
  *
  * - Calls the generic rendezvous action function.
  */
 	INTR_HANDLER rendezvous
 #ifdef COUNT_IPIS
 	movl	PCPU(CPUID), %eax
 	movq	ipi_rendezvous_counts(,%rax,8), %rax
 	incq	(%rax)
 #endif
 	call	smp_rendezvous_action
 	call	as_lapic_eoi
 	jmp	doreti
 
 /*
  * IPI handler whose purpose is to interrupt the CPU with minimum overhead.
  * This is used by bhyve to force a host cpu executing in guest context to
  * trap into the hypervisor.
  *
  * This handler is different from other IPI handlers in the following aspects:
  *
  * 1. It doesn't push a trapframe on the stack.
  *
  * This implies that a DDB backtrace involving 'justreturn' will skip the
  * function that was interrupted by this handler.
  *
  * 2. It doesn't 'swapgs' when userspace is interrupted.
  *
  * The 'justreturn' handler does not access any pcpu data so it is not an
  * issue. Moreover the 'justreturn' handler can only be interrupted by an NMI
  * whose handler already doesn't trust GS.base when kernel code is interrupted.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(justreturn)
 	pushq	%rax
 	pushq	%rcx
 	pushq	%rdx
 	call	as_lapic_eoi
 	popq	%rdx
 	popq	%rcx
 	popq	%rax
 	jmp	doreti_iret
 
 	INTR_HANDLER	justreturn1
 	call	as_lapic_eoi
 	jmp	doreti
 
 #endif /* SMP */
Index: head/sys/amd64/amd64/mp_machdep.c
===================================================================
--- head/sys/amd64/amd64/mp_machdep.c	(revision 363526)
+++ head/sys/amd64/amd64/mp_machdep.c	(revision 363527)
@@ -1,1111 +1,1115 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1996, by Steve Passe
  * Copyright (c) 2003, by Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. The name of the developer may NOT be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 #include "opt_sched.h"
 #include "opt_smp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cpuset.h>
 #include <sys/domainset.h>
 #ifdef GPROF 
 #include <sys/gmon.h>
 #endif
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_phys.h>
 
 #include <x86/apicreg.h>
 #include <machine/clock.h>
 #include <machine/cputypes.h>
 #include <machine/cpufunc.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/psl.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/tss.h>
 #include <x86/ucode.h>
 #include <machine/cpu.h>
 #include <x86/init.h>
 
 #ifdef DEV_ACPI
 #include <contrib/dev/acpica/include/acpi.h>
 #include <dev/acpica/acpivar.h>
 #endif
 
 #define WARMBOOT_TARGET		0
 #define WARMBOOT_OFF		(KERNBASE + 0x0467)
 #define WARMBOOT_SEG		(KERNBASE + 0x0469)
 
 #define CMOS_REG		(0x70)
 #define CMOS_DATA		(0x71)
 #define BIOS_RESET		(0x0f)
 #define BIOS_WARM		(0x0a)
 
 #define GiB(v)			(v ## ULL << 30)
 
 #define	AP_BOOTPT_SZ		(PAGE_SIZE * 3)
 
 /* Temporary variables for init_secondary()  */
 char *doublefault_stack;
 char *mce_stack;
 char *nmi_stack;
 char *dbg_stack;
 
 /*
  * Local data and functions.
  */
 
 static int	start_ap(int apic_id);
 
 static bool
 is_kernel_paddr(vm_paddr_t pa)
 {
 
 	return (pa >= trunc_2mpage(btext - KERNBASE) &&
 	   pa < round_page(_end - KERNBASE));
 }
 
 static bool
 is_mpboot_good(vm_paddr_t start, vm_paddr_t end)
 {
 
 	return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem);
 }
 
 /*
  * Calculate usable address in base memory for AP trampoline code.
  */
 void
 mp_bootaddress(vm_paddr_t *physmap, unsigned int *physmap_idx)
 {
 	vm_paddr_t start, end;
 	unsigned int i;
 	bool allocated;
 
 	alloc_ap_trampoline(physmap, physmap_idx);
 
 	/*
 	 * Find a memory region big enough below the 4GB boundary to
 	 * store the initial page tables.  Region must be mapped by
 	 * the direct map.
 	 *
 	 * Note that it needs to be aligned to a page boundary.
 	 */
 	allocated = false;
 	for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
 		/*
 		 * First, try to chomp at the start of the physmap region.
 		 * Kernel binary might claim it already.
 		 */
 		start = round_page(physmap[i]);
 		end = start + AP_BOOTPT_SZ;
 		if (start < end && end <= physmap[i + 1] &&
 		    is_mpboot_good(start, end) &&
 		    !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
 			allocated = true;
 			physmap[i] = end;
 			break;
 		}
 
 		/*
 		 * Second, try to chomp at the end.  Again, check
 		 * against kernel.
 		 */
 		end = trunc_page(physmap[i + 1]);
 		start = end - AP_BOOTPT_SZ;
 		if (start < end && start >= physmap[i] &&
 		    is_mpboot_good(start, end) &&
 		    !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
 			allocated = true;
 			physmap[i + 1] = start;
 			break;
 		}
 	}
 	if (allocated) {
 		mptramp_pagetables = start;
 		if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
 			memmove(&physmap[i], &physmap[i + 2],
 			    sizeof(*physmap) * (*physmap_idx - i + 2));
 			*physmap_idx -= 2;
 		}
 	} else {
 		mptramp_pagetables = trunc_page(boot_address) - AP_BOOTPT_SZ;
 		if (bootverbose)
 			printf(
 "Cannot find enough space for the initial AP page tables, placing them at %#x",
 			    mptramp_pagetables);
 	}
 }
 
 /*
  * Initialize the IPI handlers and start up the AP's.
  */
 void
 cpu_mp_start(void)
 {
 	int i;
 
 	/* Initialize the logical ID to APIC ID table. */
 	for (i = 0; i < MAXCPU; i++) {
 		cpu_apic_ids[i] = -1;
 	}
 
 	/* Install an inter-CPU IPI for cache and TLB invalidations. */
 	setidt(IPI_INVLOP, pti ? IDTVEC(invlop_pti) : IDTVEC(invlop),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for all-CPU rendezvous */
 	setidt(IPI_RENDEZVOUS, pti ? IDTVEC(rendezvous_pti) :
 	    IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install generic inter-CPU IPI handler */
 	setidt(IPI_BITMAP_VECTOR, pti ? IDTVEC(ipi_intr_bitmap_handler_pti) :
 	    IDTVEC(ipi_intr_bitmap_handler), SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for CPU stop/restart */
 	setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for CPU suspend/resume */
 	setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend),
 	    SDT_SYSIGT, SEL_KPL, 0);
 
+	/* Install an IPI for calling delayed SWI */
+	setidt(IPI_SWI, pti ? IDTVEC(ipi_swi_pti) : IDTVEC(ipi_swi),
+	    SDT_SYSIGT, SEL_KPL, 0);
+
 	/* Set boot_cpu_id if needed. */
 	if (boot_cpu_id == -1) {
 		boot_cpu_id = PCPU_GET(apic_id);
 		cpu_info[boot_cpu_id].cpu_bsp = 1;
 	} else
 		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
 		    ("BSP's APIC ID doesn't match boot_cpu_id"));
 
 	/* Probe logical/physical core configuration. */
 	topo_probe();
 
 	assign_cpu_ids();
 
 	/* Start each Application Processor */
 	init_ops.start_all_aps();
 
 	set_interrupt_apic_ids();
 
 #if defined(DEV_ACPI) && MAXMEMDOM > 1
 	acpi_pxm_set_cpu_locality();
 #endif
 }
 
 /*
  * AP CPU's call this to initialize themselves.
  */
 void
 init_secondary(void)
 {
 	struct pcpu *pc;
 	struct nmi_pcpu *np;
 	struct user_segment_descriptor *gdt;
 	struct region_descriptor ap_gdt;
 	u_int64_t cr0;
 	int cpu, gsel_tss, x;
 
 	/* Set by the startup code for us to use */
 	cpu = bootAP;
 
 	/* Update microcode before doing anything else. */
 	ucode_load_ap(cpu);
 
 	/* Get per-cpu data and save  */
 	pc = &__pcpu[cpu];
 
 	/* prime data page for it to use */
 	pcpu_init(pc, cpu, sizeof(struct pcpu));
 	dpcpu_init(dpcpu, cpu);
 	pc->pc_apic_id = cpu_apic_ids[cpu];
 	pc->pc_prvspace = pc;
 	pc->pc_curthread = 0;
 	pc->pc_tssp = &pc->pc_common_tss;
 	pc->pc_rsp0 = 0;
 	pc->pc_pti_rsp0 = (((vm_offset_t)&pc->pc_pti_stack +
 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
 	gdt = pc->pc_gdt;
 	pc->pc_tss = (struct system_segment_descriptor *)&gdt[GPROC0_SEL];
 	pc->pc_fs32p = &gdt[GUFS32_SEL];
 	pc->pc_gs32p = &gdt[GUGS32_SEL];
 	pc->pc_ldt = (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL];
 	pc->pc_ucr3_load_mask = PMAP_UCR3_NOMASK;
 	/* See comment in pmap_bootstrap(). */
 	pc->pc_pcid_next = PMAP_PCID_KERN + 2;
 	pc->pc_pcid_gen = 1;
 
 	pc->pc_smp_tlb_gen = 1;
 
 	/* Init tss */
 	pc->pc_common_tss = __pcpu[0].pc_common_tss;
 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
 	    IOPERM_BITMAP_SIZE;
 	pc->pc_common_tss.tss_rsp0 = 0;
 
 	/* The doublefault stack runs on IST1. */
 	np = ((struct nmi_pcpu *)&doublefault_stack[PAGE_SIZE]) - 1;
 	np->np_pcpu = (register_t)pc;
 	pc->pc_common_tss.tss_ist1 = (long)np;
 
 	/* The NMI stack runs on IST2. */
 	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
 	np->np_pcpu = (register_t)pc;
 	pc->pc_common_tss.tss_ist2 = (long)np;
 
 	/* The MC# stack runs on IST3. */
 	np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
 	np->np_pcpu = (register_t)pc;
 	pc->pc_common_tss.tss_ist3 = (long)np;
 
 	/* The DB# stack runs on IST4. */
 	np = ((struct nmi_pcpu *) &dbg_stack[PAGE_SIZE]) - 1;
 	np->np_pcpu = (register_t)pc;
 	pc->pc_common_tss.tss_ist4 = (long)np;
 
 	/* Prepare private GDT */
 	gdt_segs[GPROC0_SEL].ssd_base = (long)&pc->pc_common_tss;
 	for (x = 0; x < NGDT; x++) {
 		if (x != GPROC0_SEL && x != GPROC0_SEL + 1 &&
 		    x != GUSERLDT_SEL && x != GUSERLDT_SEL + 1)
 			ssdtosd(&gdt_segs[x], &gdt[x]);
 	}
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 	ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	ap_gdt.rd_base = (u_long)gdt;
 	lgdt(&ap_gdt);			/* does magic intra-segment return */
 
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */
 	fix_cpuid();
 
 	lidt(&r_idt);
 
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	ltr(gsel_tss);
 
 	/*
 	 * Set to a known state:
 	 * Set by mpboot.s: CR0_PG, CR0_PE
 	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
 	 */
 	cr0 = rcr0();
 	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
 	load_cr0(cr0);
 
 	amd64_conf_fast_syscall();
 
 	/* signal our startup to the BSP. */
 	mp_naps++;
 
 	/* Spin until the BSP releases the AP's. */
 	while (atomic_load_acq_int(&aps_ready) == 0)
 		ia32_pause();
 
 	init_secondary_tail();
 }
 
 /*******************************************************************
  * local functions and data
  */
 
 #ifdef NUMA
 static void
 mp_realloc_pcpu(int cpuid, int domain)
 {
 	vm_page_t m;
 	vm_offset_t oa, na;
 
 	oa = (vm_offset_t)&__pcpu[cpuid];
 	if (_vm_phys_domain(pmap_kextract(oa)) == domain)
 		return;
 	m = vm_page_alloc_domain(NULL, 0, domain,
 	    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
 	if (m == NULL)
 		return;
 	na = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	pagecopy((void *)oa, (void *)na);
 	pmap_qenter((vm_offset_t)&__pcpu[cpuid], &m, 1);
 	/* XXX old pcpu page leaked. */
 }
 #endif
 
 /*
  * start each AP in our list
  */
 int
 native_start_all_aps(void)
 {
 	u_int64_t *pt4, *pt3, *pt2;
 	u_int32_t mpbioswarmvec;
 	int apic_id, cpu, domain, i;
 	u_char mpbiosreason;
 
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
 	/* copy the AP 1st level boot code */
 	bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
 
 	/* Locate the page tables, they'll be below the trampoline */
 	pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
 	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
 	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
 
 	/* Create the initial 1GB replicated page tables */
 	for (i = 0; i < 512; i++) {
 		/* Each slot of the level 4 pages points to the same level 3 page */
 		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
 		pt4[i] |= PG_V | PG_RW | PG_U;
 
 		/* Each slot of the level 3 pages points to the same level 2 page */
 		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
 		pt3[i] |= PG_V | PG_RW | PG_U;
 
 		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
 		pt2[i] = i * (2 * 1024 * 1024);
 		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
 	}
 
 	/* save the current value of the warm-start vector */
 	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
 	outb(CMOS_REG, BIOS_RESET);
 	mpbiosreason = inb(CMOS_DATA);
 
 	/* setup a vector to our boot code */
 	*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
 	*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
 
 	/* Relocate pcpu areas to the correct domain. */
 #ifdef NUMA
 	if (vm_ndomains > 1)
 		for (cpu = 1; cpu < mp_ncpus; cpu++) {
 			apic_id = cpu_apic_ids[cpu];
 			domain = acpi_pxm_get_cpu_locality(apic_id);
 			mp_realloc_pcpu(cpu, domain);
 		}
 #endif
 
 	/* start each AP */
 	domain = 0;
 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
 		apic_id = cpu_apic_ids[cpu];
 #ifdef NUMA
 		if (vm_ndomains > 1)
 			domain = acpi_pxm_get_cpu_locality(apic_id);
 #endif
 		/* allocate and set up an idle stack data page */
 		bootstacks[cpu] = (void *)kmem_malloc(kstack_pages * PAGE_SIZE,
 		    M_WAITOK | M_ZERO);
 		doublefault_stack = (char *)kmem_malloc(PAGE_SIZE, M_WAITOK |
 		    M_ZERO);
 		mce_stack = (char *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO);
 		nmi_stack = (char *)kmem_malloc_domainset(
 		    DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO);
 		dbg_stack = (char *)kmem_malloc_domainset(
 		    DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO);
 		dpcpu = (void *)kmem_malloc_domainset(DOMAINSET_PREF(domain),
 		    DPCPU_SIZE, M_WAITOK | M_ZERO);
 
 		bootSTK = (char *)bootstacks[cpu] +
 		    kstack_pages * PAGE_SIZE - 8;
 		bootAP = cpu;
 
 		/* attempt to start the Application Processor */
 		if (!start_ap(apic_id)) {
 			/* restore the warmstart vector */
 			*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
 			panic("AP #%d (PHY# %d) failed!", cpu, apic_id);
 		}
 
 		CPU_SET(cpu, &all_cpus);	/* record AP in CPU map */
 	}
 
 	/* restore the warmstart vector */
 	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
 
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, mpbiosreason);
 
 	/* number of APs actually started */
 	return (mp_naps);
 }
 
 
 /*
  * This function starts the AP (application processor) identified
  * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
  * to accomplish this.  This is necessary because of the nuances
  * of the different hardware we might encounter.  It isn't pretty,
  * but it seems to work.
  */
 static int
 start_ap(int apic_id)
 {
 	int vector, ms;
 	int cpus;
 
 	/* calculate the vector */
 	vector = (boot_address >> 12) & 0xff;
 
 	/* used as a watchpoint to signal AP startup */
 	cpus = mp_naps;
 
 	ipi_startup(apic_id, vector);
 
 	/* Wait up to 5 seconds for it to start. */
 	for (ms = 0; ms < 5000; ms++) {
 		if (mp_naps > cpus)
 			return 1;	/* return SUCCESS */
 		DELAY(1000);
 	}
 	return 0;		/* return FAILURE */
 }
 
 /*
  * Flush the TLB on other CPU's
  */
 
 /*
  * Invalidation request.  PCPU pc_smp_tlb_op uses u_int instead of the
  * enum to avoid both namespace and ABI issues (with enums).
  */
 enum invl_op_codes {
       INVL_OP_TLB		= 1,
       INVL_OP_TLB_INVPCID	= 2,
       INVL_OP_TLB_INVPCID_PTI	= 3,
       INVL_OP_TLB_PCID		= 4,
       INVL_OP_PGRNG		= 5,
       INVL_OP_PGRNG_INVPCID	= 6,
       INVL_OP_PGRNG_PCID	= 7,
       INVL_OP_PG		= 8,
       INVL_OP_PG_INVPCID	= 9,
       INVL_OP_PG_PCID		= 10,
       INVL_OP_CACHE		= 11,
 };
 
 /*
  * These variables are initialized at startup to reflect how each of
  * the different kinds of invalidations should be performed on the
  * current machine and environment.
  */
 static enum invl_op_codes invl_op_tlb;
 static enum invl_op_codes invl_op_pgrng;
 static enum invl_op_codes invl_op_pg;
 
 /*
  * Scoreboard of IPI completion notifications from target to IPI initiator.
  *
  * Each CPU can initiate shootdown IPI independently from other CPUs.
  * Initiator enters critical section, then fills its local PCPU
  * shootdown info (pc_smp_tlb_ vars), then clears scoreboard generation
  * at location (cpu, my_cpuid) for each target cpu.  After that IPI is
  * sent to all targets which scan for zeroed scoreboard generation
  * words.  Upon finding such word the shootdown data is read from
  * corresponding cpu's pcpu, and generation is set.  Meantime initiator
  * loops waiting for all zeroed generations in scoreboard to update.
  */
 static uint32_t *invl_scoreboard;
 
 static void
 invl_scoreboard_init(void *arg __unused)
 {
 	u_int i;
 
 	invl_scoreboard = malloc(sizeof(uint32_t) * (mp_maxid + 1) *
 	    (mp_maxid + 1), M_DEVBUF, M_WAITOK);
 	for (i = 0; i < (mp_maxid + 1) * (mp_maxid + 1); i++)
 		invl_scoreboard[i] = 1;
 
 	if (pmap_pcid_enabled) {
 		if (invpcid_works) {
 			if (pti)
 				invl_op_tlb = INVL_OP_TLB_INVPCID_PTI;
 			else
 				invl_op_tlb = INVL_OP_TLB_INVPCID;
 			invl_op_pgrng = INVL_OP_PGRNG_INVPCID;
 			invl_op_pg = INVL_OP_PG_INVPCID;
 		} else {
 			invl_op_tlb = INVL_OP_TLB_PCID;
 			invl_op_pgrng = INVL_OP_PGRNG_PCID;
 			invl_op_pg = INVL_OP_PG_PCID;
 		}
 	} else {
 		invl_op_tlb = INVL_OP_TLB;
 		invl_op_pgrng = INVL_OP_PGRNG;
 		invl_op_pg = INVL_OP_PG;
 	}
 }
 SYSINIT(invl_ops, SI_SUB_SMP, SI_ORDER_FIRST, invl_scoreboard_init, NULL);
 
 static uint32_t *
 invl_scoreboard_getcpu(u_int cpu)
 {
 	return (invl_scoreboard + cpu * (mp_maxid + 1));
 }
 
 static uint32_t *
 invl_scoreboard_slot(u_int cpu)
 {
 	return (invl_scoreboard_getcpu(cpu) + PCPU_GET(cpuid));
 }
 
 /*
  * Used by pmap to request cache or TLB invalidation on local and
  * remote processors.  Mask provides the set of remote CPUs which are
  * to be signalled with the invalidation IPI.  As an optimization, the
  * curcpu_cb callback is invoked on the calling CPU while waiting for
  * remote CPUs to complete the operation.
  *
  * The callback function is called unconditionally on the caller's
  * underlying processor, even when this processor is not set in the
  * mask.  So, the callback function must be prepared to handle such
  * spurious invocations.
  *
  * Interrupts must be enabled when calling the function with smp
  * started, to avoid deadlock with other IPIs that are protected with
  * smp_ipi_mtx spinlock at the initiator side.
  */
 static void
 smp_targeted_tlb_shootdown(cpuset_t mask, pmap_t pmap, vm_offset_t addr1,
     vm_offset_t addr2, smp_invl_cb_t curcpu_cb, enum invl_op_codes op)
 {
 	cpuset_t other_cpus, mask1;
 	uint32_t generation, *p_cpudone;
 	int cpu;
 
 	/*
 	 * It is not necessary to signal other CPUs while booting or
 	 * when in the debugger.
 	 */
 	if (kdb_active || KERNEL_PANICKED() || !smp_started) {
 		curcpu_cb(pmap, addr1, addr2);
 		return;
 	}
 
 	sched_pin();
 
 	/*
 	 * Check for other cpus.  Return if none.
 	 */
 	if (CPU_ISFULLSET(&mask)) {
 		if (mp_ncpus <= 1)
 			goto nospinexit;
 	} else {
 		CPU_CLR(PCPU_GET(cpuid), &mask);
 		if (CPU_EMPTY(&mask))
 			goto nospinexit;
 	}
 
 	/*
 	 * Initiator must have interrupts enabled, which prevents
 	 * non-invalidation IPIs that take smp_ipi_mtx spinlock,
 	 * from deadlocking with us.  On the other hand, preemption
 	 * must be disabled to pin initiator to the instance of the
 	 * pcpu pc_smp_tlb data and scoreboard line.
 	 */
 	KASSERT((read_rflags() & PSL_I) != 0,
 	    ("smp_targeted_tlb_shootdown: interrupts disabled"));
 	critical_enter();
 
 	PCPU_SET(smp_tlb_addr1, addr1);
 	PCPU_SET(smp_tlb_addr2, addr2);
 	PCPU_SET(smp_tlb_pmap, pmap);
 	generation = PCPU_GET(smp_tlb_gen);
 	if (++generation == 0)
 		generation = 1;
 	PCPU_SET(smp_tlb_gen, generation);
 	PCPU_SET(smp_tlb_op, op);
 	/* Fence between filling smp_tlb fields and clearing scoreboard. */
 	atomic_thread_fence_rel();
 
 	mask1 = mask;
 	while ((cpu = CPU_FFS(&mask1)) != 0) {
 		cpu--;
 		CPU_CLR(cpu, &mask1);
 		KASSERT(*invl_scoreboard_slot(cpu) != 0,
 		    ("IPI scoreboard is zero, initiator %d target %d",
 		    PCPU_GET(cpuid), cpu));
 		*invl_scoreboard_slot(cpu) = 0;
 	}
 
 	/*
 	 * IPI acts as a fence between writing to the scoreboard above
 	 * (zeroing slot) and reading from it below (wait for
 	 * acknowledgment).
 	 */
 	if (CPU_ISFULLSET(&mask)) {
 		ipi_all_but_self(IPI_INVLOP);
 		other_cpus = all_cpus;
 		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 	} else {
 		other_cpus = mask;
 		ipi_selected(mask, IPI_INVLOP);
 	}
 	curcpu_cb(pmap, addr1, addr2);
 	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
 		cpu--;
 		CPU_CLR(cpu, &other_cpus);
 		p_cpudone = invl_scoreboard_slot(cpu);
 		while (atomic_load_int(p_cpudone) != generation)
 			ia32_pause();
 	}
 	critical_exit();
 	sched_unpin();
 	return;
 
 nospinexit:
 	curcpu_cb(pmap, addr1, addr2);
 	sched_unpin();
 }
 
 void
 smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb)
 {
 	smp_targeted_tlb_shootdown(mask, pmap, 0, 0, curcpu_cb, invl_op_tlb);
 #ifdef COUNT_XINVLTLB_HITS
 	ipi_global++;
 #endif
 }
 
 void
 smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap,
     smp_invl_cb_t curcpu_cb)
 {
 	smp_targeted_tlb_shootdown(mask, pmap, addr, 0, curcpu_cb, invl_op_pg);
 #ifdef COUNT_XINVLTLB_HITS
 	ipi_page++;
 #endif
 }
 
 void
 smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
     pmap_t pmap, smp_invl_cb_t curcpu_cb)
 {
 	smp_targeted_tlb_shootdown(mask, pmap, addr1, addr2, curcpu_cb,
 	    invl_op_pgrng);
 #ifdef COUNT_XINVLTLB_HITS
 	ipi_range++;
 	ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
 #endif
 }
 
 void
 smp_cache_flush(smp_invl_cb_t curcpu_cb)
 {
 	smp_targeted_tlb_shootdown(all_cpus, NULL, 0, 0, curcpu_cb,
 	    INVL_OP_CACHE);
 }
 
 /*
  * Handlers for TLB related IPIs
  */
 static void
 invltlb_handler(pmap_t smp_tlb_pmap)
 {
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	if (smp_tlb_pmap == kernel_pmap)
 		invltlb_glob();
 	else
 		invltlb();
 }
 
 static void
 invltlb_invpcid_handler(pmap_t smp_tlb_pmap)
 {
 	struct invpcid_descr d;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	d.pad = 0;
 	d.addr = 0;
 	invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB :
 	    INVPCID_CTX);
 }
 
 static void
 invltlb_invpcid_pti_handler(pmap_t smp_tlb_pmap)
 {
 	struct invpcid_descr d;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 	d.pad = 0;
 	d.addr = 0;
 	if (smp_tlb_pmap == kernel_pmap) {
 		/*
 		 * This invalidation actually needs to clear kernel
 		 * mappings from the TLB in the current pmap, but
 		 * since we were asked for the flush in the kernel
 		 * pmap, achieve it by performing global flush.
 		 */
 		invpcid(&d, INVPCID_CTXGLOB);
 	} else {
 		invpcid(&d, INVPCID_CTX);
 		if (smp_tlb_pmap == PCPU_GET(curpmap))
 			PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 	}
 }
 
 static void
 invltlb_pcid_handler(pmap_t smp_tlb_pmap)
 {
 	uint32_t pcid;
   
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	if (smp_tlb_pmap == kernel_pmap) {
 		invltlb_glob();
 	} else {
 		/*
 		 * The current pmap might not be equal to
 		 * smp_tlb_pmap.  The clearing of the pm_gen in
 		 * pmap_invalidate_all() takes care of TLB
 		 * invalidation when switching to the pmap on this
 		 * CPU.
 		 */
 		if (smp_tlb_pmap == PCPU_GET(curpmap)) {
 			pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 			load_cr3(smp_tlb_pmap->pm_cr3 | pcid);
 			if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3)
 				PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 		}
 	}
 }
 
 static void
 invlpg_handler(vm_offset_t smp_tlb_addr1)
 {
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_pg[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	invlpg(smp_tlb_addr1);
 }
 
 static void
 invlpg_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1)
 {
 	struct invpcid_descr d;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_pg[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	invlpg(smp_tlb_addr1);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
 	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
 		    PMAP_PCID_USER_PT;
 		d.pad = 0;
 		d.addr = smp_tlb_addr1;
 		invpcid(&d, INVPCID_ADDR);
 	}
 }
 
 static void
 invlpg_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1)
 {
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_pg[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	invlpg(smp_tlb_addr1);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3 &&
 	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 		pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 		kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 		pmap_pti_pcid_invlpg(ucr3, kcr3, smp_tlb_addr1);
 	}
 }
 
 static void
 invlrng_handler(vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2)
 {
 	vm_offset_t addr, addr2;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_rng[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	addr = smp_tlb_addr1;
 	addr2 = smp_tlb_addr2;
 	do {
 		invlpg(addr);
 		addr += PAGE_SIZE;
 	} while (addr < addr2);
 }
 
 static void
 invlrng_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1,
     vm_offset_t smp_tlb_addr2)
 {
 	struct invpcid_descr d;
 	vm_offset_t addr, addr2;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_rng[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	addr = smp_tlb_addr1;
 	addr2 = smp_tlb_addr2;
 	do {
 		invlpg(addr);
 		addr += PAGE_SIZE;
 	} while (addr < addr2);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
 	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
 		    PMAP_PCID_USER_PT;
 		d.pad = 0;
 		d.addr = smp_tlb_addr1;
 		do {
 			invpcid(&d, INVPCID_ADDR);
 			d.addr += PAGE_SIZE;
 		} while (d.addr < addr2);
 	}
 }
 
 static void
 invlrng_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1,
     vm_offset_t smp_tlb_addr2)
 {
 	vm_offset_t addr, addr2;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_rng[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	addr = smp_tlb_addr1;
 	addr2 = smp_tlb_addr2;
 	do {
 		invlpg(addr);
 		addr += PAGE_SIZE;
 	} while (addr < addr2);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3 &&
 	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 		pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 		kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 		pmap_pti_pcid_invlrng(ucr3, kcr3, smp_tlb_addr1, addr2);
 	}
 }
 
 static void
 invlcache_handler(void)
 {
 #ifdef COUNT_IPIS
 	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 	wbinvd();
 }
 
 static void
 invlop_handler_one_req(enum invl_op_codes smp_tlb_op, pmap_t smp_tlb_pmap,
     vm_offset_t smp_tlb_addr1, vm_offset_t smp_tlb_addr2)
 {
 	switch (smp_tlb_op) {
 	case INVL_OP_TLB:
 		invltlb_handler(smp_tlb_pmap);
 		break;
 	case INVL_OP_TLB_INVPCID:
 		invltlb_invpcid_handler(smp_tlb_pmap);
 		break;
 	case INVL_OP_TLB_INVPCID_PTI:
 		invltlb_invpcid_pti_handler(smp_tlb_pmap);
 		break;
 	case INVL_OP_TLB_PCID:
 		invltlb_pcid_handler(smp_tlb_pmap);
 		break;
 	case INVL_OP_PGRNG:
 		invlrng_handler(smp_tlb_addr1, smp_tlb_addr2);
 		break;
 	case INVL_OP_PGRNG_INVPCID:
 		invlrng_invpcid_handler(smp_tlb_pmap, smp_tlb_addr1,
 		    smp_tlb_addr2);
 		break;
 	case INVL_OP_PGRNG_PCID:
 		invlrng_pcid_handler(smp_tlb_pmap, smp_tlb_addr1,
 		    smp_tlb_addr2);
 		break;
 	case INVL_OP_PG:
 		invlpg_handler(smp_tlb_addr1);
 		break;
 	case INVL_OP_PG_INVPCID:
 		invlpg_invpcid_handler(smp_tlb_pmap, smp_tlb_addr1);
 		break;
 	case INVL_OP_PG_PCID:
 		invlpg_pcid_handler(smp_tlb_pmap, smp_tlb_addr1);
 		break;
 	case INVL_OP_CACHE:
 		invlcache_handler();
 		break;
 	default:
 		__assert_unreachable();
 		break;
 	}
 }
 
 void
 invlop_handler(void)
 {
 	struct pcpu *initiator_pc;
 	pmap_t smp_tlb_pmap;
 	vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 	u_int initiator_cpu_id;
 	enum invl_op_codes smp_tlb_op;
 	uint32_t *scoreboard, smp_tlb_gen;
 
 	scoreboard = invl_scoreboard_getcpu(PCPU_GET(cpuid));
 	for (;;) {
 		for (initiator_cpu_id = 0; initiator_cpu_id <= mp_maxid;
 		    initiator_cpu_id++) {
 			if (scoreboard[initiator_cpu_id] == 0)
 				break;
 		}
 		if (initiator_cpu_id > mp_maxid)
 			break;
 		initiator_pc = cpuid_to_pcpu[initiator_cpu_id];
 
 		/*
 		 * This acquire fence and its corresponding release
 		 * fence in smp_targeted_tlb_shootdown() is between
 		 * reading zero scoreboard slot and accessing PCPU of
 		 * initiator for pc_smp_tlb values.
 		 */
 		atomic_thread_fence_acq();
 		smp_tlb_pmap = initiator_pc->pc_smp_tlb_pmap;
 		smp_tlb_addr1 = initiator_pc->pc_smp_tlb_addr1;
 		smp_tlb_addr2 = initiator_pc->pc_smp_tlb_addr2;
 		smp_tlb_op = initiator_pc->pc_smp_tlb_op;
 		smp_tlb_gen = initiator_pc->pc_smp_tlb_gen;
 
 		/*
 		 * Ensure that we do not make our scoreboard
 		 * notification visible to the initiator until the
 		 * pc_smp_tlb values are read.  The corresponding
 		 * fence is implicitly provided by the barrier in the
 		 * IPI send operation before the APIC ICR register
 		 * write.
 		 *
 		 * As an optimization, the request is acknowledged
 		 * before the actual invalidation is performed.  It is
 		 * safe because target CPU cannot return to userspace
 		 * before handler finishes. Only NMI can preempt the
 		 * handler, but NMI would see the kernel handler frame
 		 * and not touch not-invalidated user page table.
 		 */
 		atomic_thread_fence_acq();
 		atomic_store_int(&scoreboard[initiator_cpu_id], smp_tlb_gen);
 
 		invlop_handler_one_req(smp_tlb_op, smp_tlb_pmap, smp_tlb_addr1,
 		    smp_tlb_addr2);
 	}
 }
Index: head/sys/amd64/include/smp.h
===================================================================
--- head/sys/amd64/include/smp.h	(revision 363526)
+++ head/sys/amd64/include/smp.h	(revision 363527)
@@ -1,47 +1,48 @@
 /*-
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.org> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
  * $FreeBSD$
  *
  */
 
 #ifndef _MACHINE_SMP_H_
 #define _MACHINE_SMP_H_
 
 #ifdef _KERNEL
 
 #ifdef SMP
 
 #ifndef LOCORE
 
 #include <x86/x86_smp.h>
 
 /* global symbols in mpboot.S */
 extern char			mptramp_start[];
 extern u_int32_t		mptramp_pagetables;
 
 /* IPI handlers */
 inthand_t
 	IDTVEC(justreturn),	/* interrupt CPU with minimum overhead */
 	IDTVEC(justreturn1_pti),
 	IDTVEC(invlop_pti),
 	IDTVEC(invlop),
 	IDTVEC(ipi_intr_bitmap_handler_pti),
+	IDTVEC(ipi_swi_pti),
 	IDTVEC(cpustop_pti),
 	IDTVEC(cpususpend_pti),
 	IDTVEC(rendezvous_pti);
 
 void	invlop_handler(void);
 int	native_start_all_aps(void);
 void	mp_bootaddress(vm_paddr_t *, unsigned int *);
 
 #endif /* !LOCORE */
 #endif /* SMP */
 
 #endif /* _KERNEL */
 #endif /* _MACHINE_SMP_H_ */
Index: head/sys/i386/i386/apic_vector.s
===================================================================
--- head/sys/i386/i386/apic_vector.s	(revision 363526)
+++ head/sys/i386/i386/apic_vector.s	(revision 363527)
@@ -1,333 +1,350 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: vector.s, 386BSD 0.1 unknown origin
  * $FreeBSD$
  */
 
 /*
  * Interrupt entry points for external interrupts triggered by I/O APICs
  * as well as IPI handlers.
  */
 
 #include "opt_smp.h"
 
 #include <machine/asmacros.h>
 #include <machine/psl.h>
 #include <machine/specialreg.h>
 #include <x86/apicreg.h>
 
 #include "assym.inc"
 
 	.text
 	SUPERALIGN_TEXT
 	/* End Of Interrupt to APIC */
 as_lapic_eoi:
 	cmpl	$0,x2apic_mode
 	jne	1f
 	movl	lapic_map,%eax
 	movl	$0,LA_EOI(%eax)
 	ret
 1:
 	movl	$MSR_APIC_EOI,%ecx
 	xorl	%eax,%eax
 	xorl	%edx,%edx
 	wrmsr
 	ret
 
 /*
  * I/O Interrupt Entry Point.  Rather than having one entry point for
  * each interrupt source, we use one entry point for each 32-bit word
  * in the ISR.  The handler determines the highest bit set in the ISR,
  * translates that into a vector, and passes the vector to the
  * lapic_handle_intr() function.
  */
 	.macro	ISR_VEC	index, vec_name
 	.text
 	SUPERALIGN_TEXT
 	.globl	X\()\vec_name\()_pti, X\()\vec_name
 
 X\()\vec_name\()_pti:
 X\()\vec_name:
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	FAKE_MCOUNT(TF_EIP(%esp))
 	cmpl	$0,x2apic_mode
 	je	2f
 	movl	$(MSR_APIC_ISR0 + \index),%ecx
 	rdmsr
 	jmp	3f
 2:
 	movl	lapic_map, %edx		/* pointer to local APIC */
 	movl	LA_ISR + 16 * \index(%edx), %eax	/* load ISR */
 3:
 	bsrl	%eax, %eax	/* index of highest set bit in ISR */
 	jz	4f
 	addl	$(32 * \index),%eax
 	pushl	%esp
 	pushl	%eax		/* pass the IRQ */
 	movl	$lapic_handle_intr, %eax
 	call	*%eax
 	addl	$8, %esp	/* discard parameter */
 4:
 	MEXITCOUNT
 	jmp	doreti
 	.endm
 
 /*
  * Handle "spurious INTerrupts".
  * Notes:
  *  This is different than the "spurious INTerrupt" generated by an
  *   8259 PIC for missing INTs.  See the APIC documentation for details.
  *  This routine should NOT do an 'EOI' cycle.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(spuriousint)
 
 	/* No EOI cycle used here */
 
 	iret
 
 	ISR_VEC	1, apic_isr1
 	ISR_VEC	2, apic_isr2
 	ISR_VEC	3, apic_isr3
 	ISR_VEC	4, apic_isr4
 	ISR_VEC	5, apic_isr5
 	ISR_VEC	6, apic_isr6
 	ISR_VEC	7, apic_isr7
 
 /*
  * Local APIC periodic timer handler.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(timerint_pti)
 IDTVEC(timerint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	FAKE_MCOUNT(TF_EIP(%esp))
 	pushl	%esp
 	movl	$lapic_handle_timer, %eax
 	call	*%eax
 	add	$4, %esp
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Local APIC CMCI handler.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(cmcint_pti)
 IDTVEC(cmcint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	FAKE_MCOUNT(TF_EIP(%esp))
 	movl	$lapic_handle_cmc, %eax
 	call	*%eax
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Local APIC error interrupt handler.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(errorint_pti)
 IDTVEC(errorint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	FAKE_MCOUNT(TF_EIP(%esp))
 	movl	$lapic_handle_error, %eax
 	call	*%eax
 	MEXITCOUNT
 	jmp	doreti
 
 #ifdef XENHVM
 /*
  * Xen event channel upcall interrupt handler.
  * Only used when the hypervisor supports direct vector callbacks.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(xen_intr_upcall)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	FAKE_MCOUNT(TF_EIP(%esp))
 	pushl	%esp
 	movl	$xen_intr_handle_upcall, %eax
 	call	*%eax
 	add	$4, %esp
 	MEXITCOUNT
 	jmp	doreti
 #endif
 
 #ifdef SMP
 /*
  * Global address space TLB shootdown.
  */
 	.text
 	SUPERALIGN_TEXT
 invltlb_ret:
 	call	as_lapic_eoi
 	jmp	doreti
 
 	SUPERALIGN_TEXT
 IDTVEC(invltlb)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	movl	$invltlb_handler, %eax
 	call	*%eax
 	jmp	invltlb_ret
 
 /*
  * Single page TLB shootdown
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(invlpg)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	movl	$invlpg_handler, %eax
 	call	*%eax
 	jmp	invltlb_ret
 
 /*
  * Page range TLB shootdown.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(invlrng)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	movl	$invlrng_handler, %eax
 	call	*%eax
 	jmp	invltlb_ret
 
 /*
  * Invalidate cache.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(invlcache)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	movl	$invlcache_handler, %eax
 	call	*%eax
 	jmp	invltlb_ret
 
 /*
  * Handler for IPIs sent via the per-cpu IPI bitmap.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(ipi_intr_bitmap_handler)	
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	call	as_lapic_eoi
 	FAKE_MCOUNT(TF_EIP(%esp))
 	movl	$ipi_bitmap_handler, %eax
 	call	*%eax
 	MEXITCOUNT
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives an IPI_STOP from another CPU.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(cpustop)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	call	as_lapic_eoi
 	movl	$cpustop_handler, %eax
 	call	*%eax
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(cpususpend)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 	call	as_lapic_eoi
 	movl	$cpususpend_handler, %eax
 	call	*%eax
 	jmp	doreti
 
 /*
+ * Executed by a CPU when it receives an IPI_SWI.
+ */
+	.text
+	SUPERALIGN_TEXT
+IDTVEC(ipi_swi)
+	PUSH_FRAME
+	SET_KERNEL_SREGS
+	cld
+	KENTER
+	call	as_lapic_eoi
+	FAKE_MCOUNT(TF_EIP(%esp))
+	movl	$ipi_swi_handler, %eax
+	call	*%eax
+	MEXITCOUNT
+	jmp	doreti
+
+/*
  * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU.
  *
  * - Calls the generic rendezvous action function.
  */
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(rendezvous)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
 	cld
 	KENTER
 #ifdef COUNT_IPIS
 	movl	PCPU(CPUID), %eax
 	movl	ipi_rendezvous_counts(,%eax,4), %eax
 	incl	(%eax)
 #endif
 	movl	$smp_rendezvous_action, %eax
 	call	*%eax
 	call	as_lapic_eoi
 	jmp	doreti
 	
 #endif /* SMP */
Index: head/sys/i386/i386/mp_machdep.c
===================================================================
--- head/sys/i386/i386/mp_machdep.c	(revision 363526)
+++ head/sys/i386/i386/mp_machdep.c	(revision 363527)
@@ -1,684 +1,688 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1996, by Steve Passe
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. The name of the developer may NOT be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include "opt_apic.h"
 #include "opt_cpu.h"
 #include "opt_kstack_pages.h"
 #include "opt_pmap.h"
 #include "opt_sched.h"
 #include "opt_smp.h"
 
 #if !defined(lint)
 #if !defined(SMP)
 #error How did you get here?
 #endif
 
 #ifndef DEV_APIC
 #error The apic device is required for SMP, add "device apic" to your config file.
 #endif
 #endif /* not lint */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cons.h>	/* cngetc() */
 #include <sys/cpuset.h>
 #ifdef GPROF 
 #include <sys/gmon.h>
 #endif
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
 #include <x86/apicreg.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/psl.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <x86/ucode.h>
 
 #ifdef DEV_ACPI
 #include <contrib/dev/acpica/include/acpi.h>
 #include <dev/acpica/acpivar.h>
 #endif
 
 #define WARMBOOT_TARGET		0
 #define WARMBOOT_OFF		(PMAP_MAP_LOW + 0x0467)
 #define WARMBOOT_SEG		(PMAP_MAP_LOW + 0x0469)
 
 #define CMOS_REG		(0x70)
 #define CMOS_DATA		(0x71)
 #define BIOS_RESET		(0x0f)
 #define BIOS_WARM		(0x0a)
 
 /*
  * this code MUST be enabled here and in mpboot.s.
  * it follows the very early stages of AP boot by placing values in CMOS ram.
  * it NORMALLY will never be needed and thus the primitive method for enabling.
  *
 #define CHECK_POINTS
  */
 
 #if defined(CHECK_POINTS)
 #define CHECK_READ(A)	 (outb(CMOS_REG, (A)), inb(CMOS_DATA))
 #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D)))
 
 #define CHECK_INIT(D);				\
 	CHECK_WRITE(0x34, (D));			\
 	CHECK_WRITE(0x35, (D));			\
 	CHECK_WRITE(0x36, (D));			\
 	CHECK_WRITE(0x37, (D));			\
 	CHECK_WRITE(0x38, (D));			\
 	CHECK_WRITE(0x39, (D));
 
 #define CHECK_PRINT(S);				\
 	printf("%s: %d, %d, %d, %d, %d, %d\n",	\
 	   (S),					\
 	   CHECK_READ(0x34),			\
 	   CHECK_READ(0x35),			\
 	   CHECK_READ(0x36),			\
 	   CHECK_READ(0x37),			\
 	   CHECK_READ(0x38),			\
 	   CHECK_READ(0x39));
 
 #else				/* CHECK_POINTS */
 
 #define CHECK_INIT(D)
 #define CHECK_PRINT(S)
 #define CHECK_WRITE(A, D)
 
 #endif				/* CHECK_POINTS */
 
 /*
  * Local data and functions.
  */
 
 static void	install_ap_tramp(void);
 static int	start_all_aps(void);
 static int	start_ap(int apic_id);
 
 static char *ap_copyout_buf;
 static char *ap_tramp_stack_base;
 /*
  * Initialize the IPI handlers and start up the AP's.
  */
 void
 cpu_mp_start(void)
 {
 	int i;
 
 	/* Initialize the logical ID to APIC ID table. */
 	for (i = 0; i < MAXCPU; i++) {
 		cpu_apic_ids[i] = -1;
 	}
 
 	/* Install an inter-CPU IPI for TLB invalidation */
 	setidt(IPI_INVLTLB, IDTVEC(invltlb),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IPI_INVLPG, IDTVEC(invlpg),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IPI_INVLRNG, IDTVEC(invlrng),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install an inter-CPU IPI for cache invalidation. */
 	setidt(IPI_INVLCACHE, IDTVEC(invlcache),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install an inter-CPU IPI for all-CPU rendezvous */
 	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install generic inter-CPU IPI handler */
 	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install an inter-CPU IPI for CPU stop/restart */
 	setidt(IPI_STOP, IDTVEC(cpustop),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
 	/* Install an inter-CPU IPI for CPU suspend/resume */
 	setidt(IPI_SUSPEND, IDTVEC(cpususpend),
 	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
 
+	/* Install an IPI for calling delayed SWI */
+	setidt(IPI_SWI, IDTVEC(ipi_swi),
+	       SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
+
 	/* Set boot_cpu_id if needed. */
 	if (boot_cpu_id == -1) {
 		boot_cpu_id = PCPU_GET(apic_id);
 		cpu_info[boot_cpu_id].cpu_bsp = 1;
 	} else
 		KASSERT(boot_cpu_id == PCPU_GET(apic_id),
 		    ("BSP's APIC ID doesn't match boot_cpu_id"));
 
 	/* Probe logical/physical core configuration. */
 	topo_probe();
 
 	assign_cpu_ids();
 
 	/* Start each Application Processor */
 	start_all_aps();
 
 	set_interrupt_apic_ids();
 
 #if defined(DEV_ACPI) && MAXMEMDOM > 1
 	acpi_pxm_set_cpu_locality();
 #endif
 }
 
 /*
  * AP CPU's call this to initialize themselves.
  */
 void
 init_secondary(void)
 {
 	struct pcpu *pc;
 	struct i386tss *common_tssp;
 	struct region_descriptor r_gdt, r_idt;
 	int gsel_tss, myid, x;
 	u_int cr0;
 
 	/* bootAP is set in start_ap() to our ID. */
 	myid = bootAP;
 
 	/* Update microcode before doing anything else. */
 	ucode_load_ap(myid);
 
 	/* Get per-cpu data */
 	pc = &__pcpu[myid];
 
 	/* prime data page for it to use */
 	pcpu_init(pc, myid, sizeof(struct pcpu));
 	dpcpu_init(dpcpu, myid);
 	pc->pc_apic_id = cpu_apic_ids[myid];
 	pc->pc_prvspace = pc;
 	pc->pc_curthread = 0;
 	pc->pc_common_tssp = common_tssp = &(__pcpu[0].pc_common_tssp)[myid];
 
 	fix_cpuid();
 
 	gdt_segs[GPRIV_SEL].ssd_base = (int)pc;
 	gdt_segs[GPROC0_SEL].ssd_base = (int)common_tssp;
 	gdt_segs[GLDT_SEL].ssd_base = (int)ldt;
 
 	for (x = 0; x < NGDT; x++) {
 		ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd);
 	}
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base = (int) &gdt[myid * NGDT];
 	lgdt(&r_gdt);			/* does magic intra-segment return */
 
 	r_idt.rd_limit = sizeof(struct gate_descriptor) * NIDT - 1;
 	r_idt.rd_base = (int)idt;
 	lidt(&r_idt);
 
 	lldt(_default_ldt);
 	PCPU_SET(currentldt, _default_ldt);
 
 	PCPU_SET(trampstk, (uintptr_t)ap_tramp_stack_base + TRAMP_STACK_SZ -
 	    VM86_STACK_SPACE);
 
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS;
 	common_tssp->tss_esp0 = PCPU_GET(trampstk);
 	common_tssp->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
 	common_tssp->tss_ioopt = sizeof(struct i386tss) << 16;
 	PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd);
 	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
 	ltr(gsel_tss);
 
 	PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd);
 	PCPU_SET(copyout_buf, ap_copyout_buf);
 
 	/*
 	 * Set to a known state:
 	 * Set by mpboot.s: CR0_PG, CR0_PE
 	 * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM
 	 */
 	cr0 = rcr0();
 	cr0 &= ~(CR0_CD | CR0_NW | CR0_EM);
 	load_cr0(cr0);
 	CHECK_WRITE(0x38, 5);
 	
 	/* signal our startup to the BSP. */
 	mp_naps++;
 	CHECK_WRITE(0x39, 6);
 
 	/* Spin until the BSP releases the AP's. */
 	while (atomic_load_acq_int(&aps_ready) == 0)
 		ia32_pause();
 
 	/* BSP may have changed PTD while we were waiting */
 	invltlb();
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 	lidt(&r_idt);
 #endif
 
 	init_secondary_tail();
 }
 
 /*
  * start each AP in our list
  */
 #define TMPMAP_START 1
 static int
 start_all_aps(void)
 {
 	u_char mpbiosreason;
 	u_int32_t mpbioswarmvec;
 	int apic_id, cpu;
 
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
 	pmap_remap_lower(true);
 
 	/* install the AP 1st level boot code */
 	install_ap_tramp();
 
 	/* save the current value of the warm-start vector */
 	mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
 	outb(CMOS_REG, BIOS_RESET);
 	mpbiosreason = inb(CMOS_DATA);
 
 	/* take advantage of the P==V mapping for PTD[0] for AP boot */
 
 	/* start each AP */
 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
 		apic_id = cpu_apic_ids[cpu];
 
 		/* allocate and set up a boot stack data page */
 		bootstacks[cpu] = (char *)kmem_malloc(kstack_pages * PAGE_SIZE,
 		    M_WAITOK | M_ZERO);
 		dpcpu = (void *)kmem_malloc(DPCPU_SIZE, M_WAITOK | M_ZERO);
 		/* setup a vector to our boot code */
 		*((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
 		*((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
 		outb(CMOS_REG, BIOS_RESET);
 		outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
 
 		bootSTK = (char *)bootstacks[cpu] + kstack_pages *
 		    PAGE_SIZE - 4;
 		bootAP = cpu;
 
 		ap_tramp_stack_base = pmap_trm_alloc(TRAMP_STACK_SZ, M_NOWAIT);
 		ap_copyout_buf = pmap_trm_alloc(TRAMP_COPYOUT_SZ, M_NOWAIT);
 
 		/* attempt to start the Application Processor */
 		CHECK_INIT(99);	/* setup checkpoints */
 		if (!start_ap(apic_id)) {
 			printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id);
 			CHECK_PRINT("trace");	/* show checkpoints */
 			/* better panic as the AP may be running loose */
 			printf("panic y/n? [y] ");
 			if (cngetc() != 'n')
 				panic("bye-bye");
 		}
 		CHECK_PRINT("trace");		/* show checkpoints */
 
 		CPU_SET(cpu, &all_cpus);	/* record AP in CPU map */
 	}
 
 	pmap_remap_lower(false);
 
 	/* restore the warmstart vector */
 	*(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec;
 
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, mpbiosreason);
 
 	/* number of APs actually started */
 	return mp_naps;
 }
 
 /*
  * load the 1st level AP boot code into base memory.
  */
 
 /* targets for relocation */
 extern void bigJump(void);
 extern void bootCodeSeg(void);
 extern void bootDataSeg(void);
 extern void MPentry(void);
 extern u_int MP_GDT;
 extern u_int mp_gdtbase;
 
 static void
 install_ap_tramp(void)
 {
 	int     x;
 	int     size = *(int *) ((u_long) & bootMP_size);
 	vm_offset_t va = boot_address;
 	u_char *src = (u_char *) ((u_long) bootMP);
 	u_char *dst = (u_char *) va;
 	u_int   boot_base = (u_int) bootMP;
 	u_int8_t *dst8;
 	u_int16_t *dst16;
 	u_int32_t *dst32;
 
 	KASSERT (size <= PAGE_SIZE,
 	    ("'size' do not fit into PAGE_SIZE, as expected."));
 	pmap_kenter(va, boot_address);
 	pmap_invalidate_page (kernel_pmap, va);
 	for (x = 0; x < size; ++x)
 		*dst++ = *src++;
 
 	/*
 	 * modify addresses in code we just moved to basemem. unfortunately we
 	 * need fairly detailed info about mpboot.s for this to work.  changes
 	 * to mpboot.s might require changes here.
 	 */
 
 	/* boot code is located in KERNEL space */
 	dst = (u_char *) va;
 
 	/* modify the lgdt arg */
 	dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base));
 	*dst32 = boot_address + ((u_int) & MP_GDT - boot_base);
 
 	/* modify the ljmp target for MPentry() */
 	dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1);
 	*dst32 = (u_int)MPentry;
 
 	/* modify the target for boot code segment */
 	dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base));
 	dst8 = (u_int8_t *) (dst16 + 1);
 	*dst16 = (u_int) boot_address & 0xffff;
 	*dst8 = ((u_int) boot_address >> 16) & 0xff;
 
 	/* modify the target for boot data segment */
 	dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base));
 	dst8 = (u_int8_t *) (dst16 + 1);
 	*dst16 = (u_int) boot_address & 0xffff;
 	*dst8 = ((u_int) boot_address >> 16) & 0xff;
 }
 
 /*
  * This function starts the AP (application processor) identified
  * by the APIC ID 'physicalCpu'.  It does quite a "song and dance"
  * to accomplish this.  This is necessary because of the nuances
  * of the different hardware we might encounter.  It isn't pretty,
  * but it seems to work.
  */
 static int
 start_ap(int apic_id)
 {
 	int vector, ms;
 	int cpus;
 
 	/* calculate the vector */
 	vector = (boot_address >> 12) & 0xff;
 
 	/* used as a watchpoint to signal AP startup */
 	cpus = mp_naps;
 
 	ipi_startup(apic_id, vector);
 
 	/* Wait up to 5 seconds for it to start. */
 	for (ms = 0; ms < 5000; ms++) {
 		if (mp_naps > cpus)
 			return 1;	/* return SUCCESS */
 		DELAY(1000);
 	}
 	return 0;		/* return FAILURE */
 }
 
 /*
  * Flush the TLB on other CPU's
  */
 
 /* Variables needed for SMP tlb shootdown. */
 vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 pmap_t smp_tlb_pmap;
 volatile uint32_t smp_tlb_generation;
 
 /*
  * Used by pmap to request cache or TLB invalidation on local and
  * remote processors.  Mask provides the set of remote CPUs which are
  * to be signalled with the invalidation IPI.  Vector specifies which
  * invalidation IPI is used.  As an optimization, the curcpu_cb
  * callback is invoked on the calling CPU while waiting for remote
  * CPUs to complete the operation.
  *
  * The callback function is called unconditionally on the caller's
  * underlying processor, even when this processor is not set in the
  * mask.  So, the callback function must be prepared to handle such
  * spurious invocations.
  */
 static void
 smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
     vm_offset_t addr1, vm_offset_t addr2, smp_invl_cb_t curcpu_cb)
 {
 	cpuset_t other_cpus;
 	volatile uint32_t *p_cpudone;
 	uint32_t generation;
 	int cpu;
 
 	/*
 	 * It is not necessary to signal other CPUs while booting or
 	 * when in the debugger.
 	 */
 	if (kdb_active || KERNEL_PANICKED() || !smp_started) {
 		curcpu_cb(pmap, addr1, addr2);
 		return;
 	}
 
 	sched_pin();
 
 	/*
 	 * Check for other cpus.  Return if none.
 	 */
 	if (CPU_ISFULLSET(&mask)) {
 		if (mp_ncpus <= 1)
 			goto nospinexit;
 	} else {
 		CPU_CLR(PCPU_GET(cpuid), &mask);
 		if (CPU_EMPTY(&mask))
 			goto nospinexit;
 	}
 
 	KASSERT((read_eflags() & PSL_I) != 0,
 	    ("smp_targeted_tlb_shootdown: interrupts disabled"));
 	mtx_lock_spin(&smp_ipi_mtx);
 	smp_tlb_addr1 = addr1;
 	smp_tlb_addr2 = addr2;
 	smp_tlb_pmap = pmap;
 	generation = ++smp_tlb_generation;
 	if (CPU_ISFULLSET(&mask)) {
 		ipi_all_but_self(vector);
 		other_cpus = all_cpus;
 		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 	} else {
 		other_cpus = mask;
 		ipi_selected(mask, vector);
 	}
 	curcpu_cb(pmap, addr1, addr2);
 	while ((cpu = CPU_FFS(&other_cpus)) != 0) {
 		cpu--;
 		CPU_CLR(cpu, &other_cpus);
 		p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
 		while (*p_cpudone != generation)
 			ia32_pause();
 	}
 	mtx_unlock_spin(&smp_ipi_mtx);
 	sched_unpin();
 	return;
 
 nospinexit:
 	curcpu_cb(pmap, addr1, addr2);
 	sched_unpin();
 }
 
 void
 smp_masked_invltlb(cpuset_t mask, pmap_t pmap, smp_invl_cb_t curcpu_cb)
 {
 	smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0, curcpu_cb);
 #ifdef COUNT_XINVLTLB_HITS
 	ipi_global++;
 #endif
 }
 
 void
 smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, pmap_t pmap,
     smp_invl_cb_t curcpu_cb)
 {
 	smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0, curcpu_cb);
 #ifdef COUNT_XINVLTLB_HITS
 	ipi_page++;
 #endif
 }
 
 void
 smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2,
     pmap_t pmap, smp_invl_cb_t curcpu_cb)
 {
 	smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, addr2,
 	    curcpu_cb);
 #ifdef COUNT_XINVLTLB_HITS
 	ipi_range++;
 	ipi_range_size += (addr2 - addr1) / PAGE_SIZE;
 #endif
 }
 
 void
 smp_cache_flush(smp_invl_cb_t curcpu_cb)
 {
 	smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 0, 0,
 	    curcpu_cb);
 }
 
 /*
  * Handlers for TLB related IPIs
  */
 void
 invltlb_handler(void)
 {
 	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_gbl[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	/*
 	 * Reading the generation here allows greater parallelism
 	 * since invalidating the TLB is a serializing operation.
 	 */
 	generation = smp_tlb_generation;
 	if (smp_tlb_pmap == kernel_pmap)
 		invltlb_glob();
 	PCPU_SET(smp_tlb_done, generation);
 }
 
 void
 invlpg_handler(void)
 {
 	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_pg[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	generation = smp_tlb_generation;	/* Overlap with serialization */
 	if (smp_tlb_pmap == kernel_pmap)
 		invlpg(smp_tlb_addr1);
 	PCPU_SET(smp_tlb_done, generation);
 }
 
 void
 invlrng_handler(void)
 {
 	vm_offset_t addr, addr2;
 	uint32_t generation;
 
 #ifdef COUNT_XINVLTLB_HITS
 	xhits_rng[PCPU_GET(cpuid)]++;
 #endif /* COUNT_XINVLTLB_HITS */
 #ifdef COUNT_IPIS
 	(*ipi_invlrng_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	addr = smp_tlb_addr1;
 	addr2 = smp_tlb_addr2;
 	generation = smp_tlb_generation;	/* Overlap with serialization */
 	if (smp_tlb_pmap == kernel_pmap) {
 		do {
 			invlpg(addr);
 			addr += PAGE_SIZE;
 		} while (addr < addr2);
 	}
 
 	PCPU_SET(smp_tlb_done, generation);
 }
 
 void
 invlcache_handler(void)
 {
 	uint32_t generation;
 
 #ifdef COUNT_IPIS
 	(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	/*
 	 * Reading the generation here allows greater parallelism
 	 * since wbinvd is a serializing instruction.  Without the
 	 * temporary, we'd wait for wbinvd to complete, then the read
 	 * would execute, then the dependent write, which must then
 	 * complete before return from interrupt.
 	 */
 	generation = smp_tlb_generation;
 	wbinvd();
 	PCPU_SET(smp_tlb_done, generation);
 }
Index: head/sys/kern/kern_clock.c
===================================================================
--- head/sys/kern/kern_clock.c	(revision 363526)
+++ head/sys/kern/kern_clock.c	(revision 363527)
@@ -1,833 +1,834 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_kdb.h"
 #include "opt_device_polling.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_ntp.h"
 #include "opt_watchdog.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
 #include <sys/epoch.h>
 #include <sys/eventhandler.h>
 #include <sys/gtaskqueue.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <sys/sysctl.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 #include <sys/limits.h>
 #include <sys/timetc.h>
 
 #ifdef GPROF
 #include <sys/gmon.h>
 #endif
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DEFINE( , , clock, hard);
 PMC_SOFT_DEFINE( , , clock, stat);
 PMC_SOFT_DEFINE_EX( , , clock, prof, \
     cpu_startprofclock, cpu_stopprofclock);
 #endif
 
 #ifdef DEVICE_POLLING
 extern void hardclock_device_poll(void);
 #endif /* DEVICE_POLLING */
 
 static void initclocks(void *dummy);
 SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL);
 
 /* Spin-lock protecting profiling statistics. */
 static struct mtx time_lock;
 
 SDT_PROVIDER_DECLARE(sched);
 SDT_PROBE_DEFINE2(sched, , , tick, "struct thread *", "struct proc *");
 
 static int
 sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	long cp_time[CPUSTATES];
 #ifdef SCTL_MASK32
 	int i;
 	unsigned int cp_time32[CPUSTATES];
 #endif
 
 	read_cpu_time(cp_time);
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		if (!req->oldptr)
 			return SYSCTL_OUT(req, 0, sizeof(cp_time32));
 		for (i = 0; i < CPUSTATES; i++)
 			cp_time32[i] = (unsigned int)cp_time[i];
 		error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
 	} else
 #endif
 	{
 		if (!req->oldptr)
 			return SYSCTL_OUT(req, 0, sizeof(cp_time));
 		error = SYSCTL_OUT(req, cp_time, sizeof(cp_time));
 	}
 	return error;
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0,0, sysctl_kern_cp_time, "LU", "CPU time statistics");
 
 static long empty[CPUSTATES];
 
 static int
 sysctl_kern_cp_times(SYSCTL_HANDLER_ARGS)
 {
 	struct pcpu *pcpu;
 	int error;
 	int c;
 	long *cp_time;
 #ifdef SCTL_MASK32
 	unsigned int cp_time32[CPUSTATES];
 	int i;
 #endif
 
 	if (!req->oldptr) {
 #ifdef SCTL_MASK32
 		if (req->flags & SCTL_MASK32)
 			return SYSCTL_OUT(req, 0, sizeof(cp_time32) * (mp_maxid + 1));
 		else
 #endif
 			return SYSCTL_OUT(req, 0, sizeof(long) * CPUSTATES * (mp_maxid + 1));
 	}
 	for (error = 0, c = 0; error == 0 && c <= mp_maxid; c++) {
 		if (!CPU_ABSENT(c)) {
 			pcpu = pcpu_find(c);
 			cp_time = pcpu->pc_cp_time;
 		} else {
 			cp_time = empty;
 		}
 #ifdef SCTL_MASK32
 		if (req->flags & SCTL_MASK32) {
 			for (i = 0; i < CPUSTATES; i++)
 				cp_time32[i] = (unsigned int)cp_time[i];
 			error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
 		} else
 #endif
 			error = SYSCTL_OUT(req, cp_time, sizeof(long) * CPUSTATES);
 	}
 	return error;
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, cp_times, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0,0, sysctl_kern_cp_times, "LU", "per-CPU time statistics");
 
 #ifdef DEADLKRES
 static const char *blessed[] = {
 	"getblk",
 	"so_snd_sx",
 	"so_rcv_sx",
 	NULL
 };
 static int slptime_threshold = 1800;
 static int blktime_threshold = 900;
 static int sleepfreq = 3;
 
 static void
 deadlres_td_on_lock(struct proc *p, struct thread *td, int blkticks)
 {
 	int tticks;
 
 	sx_assert(&allproc_lock, SX_LOCKED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	/*
 	 * The thread should be blocked on a turnstile, simply check
 	 * if the turnstile channel is in good state.
 	 */
 	MPASS(td->td_blocked != NULL);
 
 	tticks = ticks - td->td_blktick;
 	if (tticks > blkticks)
 		/*
 		 * Accordingly with provided thresholds, this thread is stuck
 		 * for too long on a turnstile.
 		 */
 		panic("%s: possible deadlock detected for %p (%s), "
 		    "blocked for %d ticks\n", __func__,
 		    td, sched_tdname(td), tticks);
 }
 
 static void
 deadlres_td_sleep_q(struct proc *p, struct thread *td, int slpticks)
 {
 	const void *wchan;
 	int i, slptype, tticks;
 
 	sx_assert(&allproc_lock, SX_LOCKED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	/*
 	 * Check if the thread is sleeping on a lock, otherwise skip the check.
 	 * Drop the thread lock in order to avoid a LOR with the sleepqueue
 	 * spinlock.
 	 */
 	wchan = td->td_wchan;
 	tticks = ticks - td->td_slptick;
 	slptype = sleepq_type(wchan);
 	if ((slptype == SLEEPQ_SX || slptype == SLEEPQ_LK) &&
 	    tticks > slpticks) {
 
 		/*
 		 * Accordingly with provided thresholds, this thread is stuck
 		 * for too long on a sleepqueue.
 		 * However, being on a sleepqueue, we might still check for the
 		 * blessed list.
 		 */
 		for (i = 0; blessed[i] != NULL; i++)
 			if (!strcmp(blessed[i], td->td_wmesg))
 				return;
 
 		panic("%s: possible deadlock detected for %p (%s), "
 		    "blocked for %d ticks\n", __func__,
 		    td, sched_tdname(td), tticks);
 	}
 }
 
 static void
 deadlkres(void)
 {
 	struct proc *p;
 	struct thread *td;
 	int blkticks, slpticks, tryl;
 
 	tryl = 0;
 	for (;;) {
 		blkticks = blktime_threshold * hz;
 		slpticks = slptime_threshold * hz;
 
 		/*
 		 * Avoid to sleep on the sx_lock in order to avoid a
 		 * possible priority inversion problem leading to
 		 * starvation.
 		 * If the lock can't be held after 100 tries, panic.
 		 */
 		if (!sx_try_slock(&allproc_lock)) {
 			if (tryl > 100)
 				panic("%s: possible deadlock detected "
 				    "on allproc_lock\n", __func__);
 			tryl++;
 			pause("allproc", sleepfreq * hz);
 			continue;
 		}
 		tryl = 0;
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			FOREACH_THREAD_IN_PROC(p, td) {
 				thread_lock(td);
 				if (TD_ON_LOCK(td))
 					deadlres_td_on_lock(p, td,
 					    blkticks);
 				else if (TD_IS_SLEEPING(td))
 					deadlres_td_sleep_q(p, td,
 					    slpticks);
 				thread_unlock(td);
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 
 		/* Sleep for sleepfreq seconds. */
 		pause("-", sleepfreq * hz);
 	}
 }
 
 static struct kthread_desc deadlkres_kd = {
 	"deadlkres",
 	deadlkres,
 	(struct thread **)NULL
 };
 
 SYSINIT(deadlkres, SI_SUB_CLOCKS, SI_ORDER_ANY, kthread_start, &deadlkres_kd);
 
 static SYSCTL_NODE(_debug, OID_AUTO, deadlkres, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Deadlock resolver");
 SYSCTL_INT(_debug_deadlkres, OID_AUTO, slptime_threshold, CTLFLAG_RW,
     &slptime_threshold, 0,
     "Number of seconds within is valid to sleep on a sleepqueue");
 SYSCTL_INT(_debug_deadlkres, OID_AUTO, blktime_threshold, CTLFLAG_RW,
     &blktime_threshold, 0,
     "Number of seconds within is valid to block on a turnstile");
 SYSCTL_INT(_debug_deadlkres, OID_AUTO, sleepfreq, CTLFLAG_RW, &sleepfreq, 0,
     "Number of seconds between any deadlock resolver thread run");
 #endif	/* DEADLKRES */
 
 void
 read_cpu_time(long *cp_time)
 {
 	struct pcpu *pc;
 	int i, j;
 
 	/* Sum up global cp_time[]. */
 	bzero(cp_time, sizeof(long) * CPUSTATES);
 	CPU_FOREACH(i) {
 		pc = pcpu_find(i);
 		for (j = 0; j < CPUSTATES; j++)
 			cp_time[j] += pc->pc_cp_time[j];
 	}
 }
 
 #include <sys/watchdog.h>
 
 static int watchdog_ticks;
 static int watchdog_enabled;
 static void watchdog_fire(void);
 static void watchdog_config(void *, u_int, int *);
 
 static void
 watchdog_attach(void)
 {
 	EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0);
 }
 
 /*
  * Clock handling routines.
  *
  * This code is written to operate with two timers that run independently of
  * each other.
  *
  * The main timer, running hz times per second, is used to trigger interval
  * timers, timeouts and rescheduling as needed.
  *
  * The second timer handles kernel and user profiling,
  * and does resource use estimation.  If the second timer is programmable,
  * it is randomized to avoid aliasing between the two clocks.  For example,
  * the randomization prevents an adversary from always giving up the cpu
  * just before its quantum expires.  Otherwise, it would never accumulate
  * cpu ticks.  The mean frequency of the second timer is stathz.
  *
  * If no second timer exists, stathz will be zero; in this case we drive
  * profiling and statistics off the main clock.  This WILL NOT be accurate;
  * do not do it unless absolutely necessary.
  *
  * The statistics clock may (or may not) be run at a higher rate while
  * profiling.  This profile clock runs at profhz.  We require that profhz
  * be an integral multiple of stathz.
  *
  * If the statistics clock is running fast, it must be divided by the ratio
  * profhz/stathz for statistics.  (For profiling, every tick counts.)
  *
  * Time-of-day is maintained using a "timecounter", which may or may
  * not be related to the hardware generating the above mentioned
  * interrupts.
  */
 
 int	stathz;
 int	profhz;
 int	profprocs;
 volatile int	ticks;
 int	psratio;
 
 DPCPU_DEFINE_STATIC(int, pcputicks);	/* Per-CPU version of ticks. */
 #ifdef DEVICE_POLLING
 static int devpoll_run = 0;
 #endif
 
 /*
  * Initialize clock frequencies and start both clocks running.
  */
 /* ARGSUSED*/
 static void
 initclocks(void *dummy)
 {
 	int i;
 
 	/*
 	 * Set divisors to 1 (normal case) and let the machine-specific
 	 * code do its bit.
 	 */
 	mtx_init(&time_lock, "time lock", NULL, MTX_DEF);
 	cpu_initclocks();
 
 	/*
 	 * Compute profhz/stathz, and fix profhz if needed.
 	 */
 	i = stathz ? stathz : hz;
 	if (profhz == 0)
 		profhz = i;
 	psratio = profhz / i;
 
 #ifdef SW_WATCHDOG
 	/* Enable hardclock watchdog now, even if a hardware watchdog exists. */
 	watchdog_attach();
 #else
 	/* Volunteer to run a software watchdog. */
 	if (wdog_software_attach == NULL)
 		wdog_software_attach = watchdog_attach;
 #endif
 }
 
 static __noinline void
 hardclock_itimer(struct thread *td, struct pstats *pstats, int cnt, int usermode)
 {
 	struct proc *p;
 	int flags;
 
 	flags = 0;
 	p = td->td_proc;
 	if (usermode &&
 	    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
 		PROC_ITIMLOCK(p);
 		if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL],
 		    tick * cnt) == 0)
 			flags |= TDF_ALRMPEND | TDF_ASTPENDING;
 		PROC_ITIMUNLOCK(p);
 	}
 	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
 		PROC_ITIMLOCK(p);
 		if (itimerdecr(&pstats->p_timer[ITIMER_PROF],
 		    tick * cnt) == 0)
 			flags |= TDF_PROFPEND | TDF_ASTPENDING;
 		PROC_ITIMUNLOCK(p);
 	}
 	if (flags != 0) {
 		thread_lock(td);
 		td->td_flags |= flags;
 		thread_unlock(td);
 	}
 }
 
 void
 hardclock(int cnt, int usermode)
 {
 	struct pstats *pstats;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	int *t = DPCPU_PTR(pcputicks);
 	int global, i, newticks;
 
 	/*
 	 * Update per-CPU and possibly global ticks values.
 	 */
 	*t += cnt;
 	global = ticks;
 	do {
 		newticks = *t - global;
 		if (newticks <= 0) {
 			if (newticks < -1)
 				*t = global - 1;
 			newticks = 0;
 			break;
 		}
 	} while (!atomic_fcmpset_int(&ticks, &global, *t));
 
 	/*
 	 * Run current process's virtual and profile time, as needed.
 	 */
 	pstats = p->p_stats;
 	if (__predict_false(
 	    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) ||
 	    timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)))
 		hardclock_itimer(td, pstats, cnt, usermode);
 
 #ifdef	HWPMC_HOOKS
 	if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
 		PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
 	if (td->td_intr_frame != NULL)
 		PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame);
 #endif
 	/* We are in charge to handle this tick duty. */
 	if (newticks > 0) {
 		tc_ticktock(newticks);
 #ifdef DEVICE_POLLING
 		/* Dangerous and no need to call these things concurrently. */
 		if (atomic_cmpset_acq_int(&devpoll_run, 0, 1)) {
 			/* This is very short and quick. */
 			hardclock_device_poll();
 			atomic_store_rel_int(&devpoll_run, 0);
 		}
 #endif /* DEVICE_POLLING */
 		if (watchdog_enabled > 0) {
 			i = atomic_fetchadd_int(&watchdog_ticks, -newticks);
 			if (i > 0 && i <= newticks)
 				watchdog_fire();
 		}
+		intr_event_handle(clk_intr_event, NULL);
 	}
 	if (curcpu == CPU_FIRST())
 		cpu_tick_calibration();
 	if (__predict_false(DPCPU_GET(epoch_cb_count)))
 		GROUPTASK_ENQUEUE(DPCPU_PTR(epoch_cb_task));
 }
 
 void
 hardclock_sync(int cpu)
 {
 	int *t;
 	KASSERT(!CPU_ABSENT(cpu), ("Absent CPU %d", cpu));
 	t = DPCPU_ID_PTR(cpu, pcputicks);
 
 	*t = ticks;
 }
 
 /*
  * Compute number of ticks in the specified amount of time.
  */
 int
 tvtohz(struct timeval *tv)
 {
 	unsigned long ticks;
 	long sec, usec;
 
 	/*
 	 * If the number of usecs in the whole seconds part of the time
 	 * difference fits in a long, then the total number of usecs will
 	 * fit in an unsigned long.  Compute the total and convert it to
 	 * ticks, rounding up and adding 1 to allow for the current tick
 	 * to expire.  Rounding also depends on unsigned long arithmetic
 	 * to avoid overflow.
 	 *
 	 * Otherwise, if the number of ticks in the whole seconds part of
 	 * the time difference fits in a long, then convert the parts to
 	 * ticks separately and add, using similar rounding methods and
 	 * overflow avoidance.  This method would work in the previous
 	 * case but it is slightly slower and assumes that hz is integral.
 	 *
 	 * Otherwise, round the time difference down to the maximum
 	 * representable value.
 	 *
 	 * If ints have 32 bits, then the maximum value for any timeout in
 	 * 10ms ticks is 248 days.
 	 */
 	sec = tv->tv_sec;
 	usec = tv->tv_usec;
 	if (usec < 0) {
 		sec--;
 		usec += 1000000;
 	}
 	if (sec < 0) {
 #ifdef DIAGNOSTIC
 		if (usec > 0) {
 			sec++;
 			usec -= 1000000;
 		}
 		printf("tvotohz: negative time difference %ld sec %ld usec\n",
 		       sec, usec);
 #endif
 		ticks = 1;
 	} else if (sec <= LONG_MAX / 1000000)
 		ticks = howmany(sec * 1000000 + (unsigned long)usec, tick) + 1;
 	else if (sec <= LONG_MAX / hz)
 		ticks = sec * hz
 			+ howmany((unsigned long)usec, tick) + 1;
 	else
 		ticks = LONG_MAX;
 	if (ticks > INT_MAX)
 		ticks = INT_MAX;
 	return ((int)ticks);
 }
 
 /*
  * Start profiling on a process.
  *
  * Kernel profiling passes proc0 which never exits and hence
  * keeps the profile clock running constantly.
  */
 void
 startprofclock(struct proc *p)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (p->p_flag & P_STOPPROF)
 		return;
 	if ((p->p_flag & P_PROFIL) == 0) {
 		p->p_flag |= P_PROFIL;
 		mtx_lock(&time_lock);
 		if (++profprocs == 1)
 			cpu_startprofclock();
 		mtx_unlock(&time_lock);
 	}
 }
 
 /*
  * Stop profiling on a process.
  */
 void
 stopprofclock(struct proc *p)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (p->p_flag & P_PROFIL) {
 		if (p->p_profthreads != 0) {
 			while (p->p_profthreads != 0) {
 				p->p_flag |= P_STOPPROF;
 				msleep(&p->p_profthreads, &p->p_mtx, PPAUSE,
 				    "stopprof", 0);
 			}
 		}
 		if ((p->p_flag & P_PROFIL) == 0)
 			return;
 		p->p_flag &= ~P_PROFIL;
 		mtx_lock(&time_lock);
 		if (--profprocs == 0)
 			cpu_stopprofclock();
 		mtx_unlock(&time_lock);
 	}
 }
 
 /*
  * Statistics clock.  Updates rusage information and calls the scheduler
  * to adjust priorities of the active thread.
  *
  * This should be called by all active processors.
  */
 void
 statclock(int cnt, int usermode)
 {
 	struct rusage *ru;
 	struct vmspace *vm;
 	struct thread *td;
 	struct proc *p;
 	long rss;
 	long *cp_time;
 	uint64_t runtime, new_switchtime;
 
 	td = curthread;
 	p = td->td_proc;
 
 	cp_time = (long *)PCPU_PTR(cp_time);
 	if (usermode) {
 		/*
 		 * Charge the time as appropriate.
 		 */
 		td->td_uticks += cnt;
 		if (p->p_nice > NZERO)
 			cp_time[CP_NICE] += cnt;
 		else
 			cp_time[CP_USER] += cnt;
 	} else {
 		/*
 		 * Came from kernel mode, so we were:
 		 * - handling an interrupt,
 		 * - doing syscall or trap work on behalf of the current
 		 *   user process, or
 		 * - spinning in the idle loop.
 		 * Whichever it is, charge the time as appropriate.
 		 * Note that we charge interrupts to the current process,
 		 * regardless of whether they are ``for'' that process,
 		 * so that we know how much of its real time was spent
 		 * in ``non-process'' (i.e., interrupt) work.
 		 */
 		if ((td->td_pflags & TDP_ITHREAD) ||
 		    td->td_intr_nesting_level >= 2) {
 			td->td_iticks += cnt;
 			cp_time[CP_INTR] += cnt;
 		} else {
 			td->td_pticks += cnt;
 			td->td_sticks += cnt;
 			if (!TD_IS_IDLETHREAD(td))
 				cp_time[CP_SYS] += cnt;
 			else
 				cp_time[CP_IDLE] += cnt;
 		}
 	}
 
 	/* Update resource usage integrals and maximums. */
 	MPASS(p->p_vmspace != NULL);
 	vm = p->p_vmspace;
 	ru = &td->td_ru;
 	ru->ru_ixrss += pgtok(vm->vm_tsize) * cnt;
 	ru->ru_idrss += pgtok(vm->vm_dsize) * cnt;
 	ru->ru_isrss += pgtok(vm->vm_ssize) * cnt;
 	rss = pgtok(vmspace_resident_count(vm));
 	if (ru->ru_maxrss < rss)
 		ru->ru_maxrss = rss;
 	KTR_POINT2(KTR_SCHED, "thread", sched_tdname(td), "statclock",
 	    "prio:%d", td->td_priority, "stathz:%d", (stathz)?stathz:hz);
 	SDT_PROBE2(sched, , , tick, td, td->td_proc);
 	thread_lock_flags(td, MTX_QUIET);
 
 	/*
 	 * Compute the amount of time during which the current
 	 * thread was running, and add that to its total so far.
 	 */
 	new_switchtime = cpu_ticks();
 	runtime = new_switchtime - PCPU_GET(switchtime);
 	td->td_runtime += runtime;
 	td->td_incruntime += runtime;
 	PCPU_SET(switchtime, new_switchtime);
 
 	sched_clock(td, cnt);
 	thread_unlock(td);
 #ifdef HWPMC_HOOKS
 	if (td->td_intr_frame != NULL)
 		PMC_SOFT_CALL_TF( , , clock, stat, td->td_intr_frame);
 #endif
 }
 
 void
 profclock(int cnt, int usermode, uintfptr_t pc)
 {
 	struct thread *td;
 #ifdef GPROF
 	struct gmonparam *g;
 	uintfptr_t i;
 #endif
 
 	td = curthread;
 	if (usermode) {
 		/*
 		 * Came from user mode; CPU was in user state.
 		 * If this process is being profiled, record the tick.
 		 * if there is no related user location yet, don't
 		 * bother trying to count it.
 		 */
 		if (td->td_proc->p_flag & P_PROFIL)
 			addupc_intr(td, pc, cnt);
 	}
 #ifdef GPROF
 	else {
 		/*
 		 * Kernel statistics are just like addupc_intr, only easier.
 		 */
 		g = &_gmonparam;
 		if (g->state == GMON_PROF_ON && pc >= g->lowpc) {
 			i = PC_TO_I(g, pc);
 			if (i < g->textsize) {
 				KCOUNT(g, i) += cnt;
 			}
 		}
 	}
 #endif
 #ifdef HWPMC_HOOKS
 	if (td->td_intr_frame != NULL)
 		PMC_SOFT_CALL_TF( , , clock, prof, td->td_intr_frame);
 #endif
 }
 
 /*
  * Return information about system clocks.
  */
 static int
 sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
 {
 	struct clockinfo clkinfo;
 	/*
 	 * Construct clockinfo structure.
 	 */
 	bzero(&clkinfo, sizeof(clkinfo));
 	clkinfo.hz = hz;
 	clkinfo.tick = tick;
 	clkinfo.profhz = profhz;
 	clkinfo.stathz = stathz ? stathz : hz;
 	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
 }
 
 SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate,
 	CTLTYPE_STRUCT|CTLFLAG_RD|CTLFLAG_MPSAFE,
 	0, 0, sysctl_kern_clockrate, "S,clockinfo",
 	"Rate and period of various kernel clocks");
 
 static void
 watchdog_config(void *unused __unused, u_int cmd, int *error)
 {
 	u_int u;
 
 	u = cmd & WD_INTERVAL;
 	if (u >= WD_TO_1SEC) {
 		watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz;
 		watchdog_enabled = 1;
 		*error = 0;
 	} else {
 		watchdog_enabled = 0;
 	}
 }
 
 /*
  * Handle a watchdog timeout by dumping interrupt information and
  * then either dropping to DDB or panicking.
  */
 static void
 watchdog_fire(void)
 {
 	int nintr;
 	uint64_t inttotal;
 	u_long *curintr;
 	char *curname;
 
 	curintr = intrcnt;
 	curname = intrnames;
 	inttotal = 0;
 	nintr = sintrcnt / sizeof(u_long);
 
 	printf("interrupt                   total\n");
 	while (--nintr >= 0) {
 		if (*curintr)
 			printf("%-12s %20lu\n", curname, *curintr);
 		curname += strlen(curname) + 1;
 		inttotal += *curintr++;
 	}
 	printf("Total        %20ju\n", (uintmax_t)inttotal);
 
 #if defined(KDB) && !defined(KDB_UNATTENDED)
 	kdb_backtrace();
 	kdb_enter(KDB_WHY_WATCHDOG, "watchdog timeout");
 #else
 	panic("watchdog timeout");
 #endif
 }
Index: head/sys/kern/kern_intr.c
===================================================================
--- head/sys/kern/kern_intr.c	(revision 363526)
+++ head/sys/kern/kern_intr.c	(revision 363527)
@@ -1,1649 +1,1669 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1997, Stefan Esser <se@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_kstack_usage_prof.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/cpuset.h>
 #include <sys/rtprio.h>
 #include <sys/systm.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/epoch.h>
 #include <sys/random.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/unistd.h>
 #include <sys/vmmeter.h>
 #include <machine/atomic.h>
 #include <machine/cpu.h>
 #include <machine/md_var.h>
+#include <machine/smp.h>
 #include <machine/stdarg.h>
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 /*
  * Describe an interrupt thread.  There is one of these per interrupt event.
  */
 struct intr_thread {
 	struct intr_event *it_event;
 	struct thread *it_thread;	/* Kernel thread. */
 	int	it_flags;		/* (j) IT_* flags. */
 	int	it_need;		/* Needs service. */
 };
 
 /* Interrupt thread flags kept in it_flags */
 #define	IT_DEAD		0x000001	/* Thread is waiting to exit. */
 #define	IT_WAIT		0x000002	/* Thread is waiting for completion. */
 
 struct	intr_entropy {
 	struct	thread *td;
 	uintptr_t event;
 };
 
+struct	intr_event *clk_intr_event;
 struct	intr_event *tty_intr_event;
 void	*vm_ih;
 struct proc *intrproc;
 
 static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads");
 
 static int intr_storm_threshold = 0;
 SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RWTUN,
     &intr_storm_threshold, 0,
     "Number of consecutive interrupts before storm protection is enabled");
 static int intr_epoch_batch = 1000;
 SYSCTL_INT(_hw, OID_AUTO, intr_epoch_batch, CTLFLAG_RWTUN, &intr_epoch_batch,
     0, "Maximum interrupt handler executions without re-entering epoch(9)");
 static TAILQ_HEAD(, intr_event) event_list =
     TAILQ_HEAD_INITIALIZER(event_list);
 static struct mtx event_lock;
 MTX_SYSINIT(intr_event_list, &event_lock, "intr event list", MTX_DEF);
 
 static void	intr_event_update(struct intr_event *ie);
 static int	intr_event_schedule_thread(struct intr_event *ie);
 static struct intr_thread *ithread_create(const char *name);
 static void	ithread_destroy(struct intr_thread *ithread);
 static void	ithread_execute_handlers(struct proc *p, 
 		    struct intr_event *ie);
 static void	ithread_loop(void *);
 static void	ithread_update(struct intr_thread *ithd);
 static void	start_softintr(void *);
 
 /* Map an interrupt type to an ithread priority. */
 u_char
 intr_priority(enum intr_type flags)
 {
 	u_char pri;
 
 	flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET |
 	    INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV);
 	switch (flags) {
 	case INTR_TYPE_TTY:
 		pri = PI_TTY;
 		break;
 	case INTR_TYPE_BIO:
 		pri = PI_DISK;
 		break;
 	case INTR_TYPE_NET:
 		pri = PI_NET;
 		break;
 	case INTR_TYPE_CAM:
 		pri = PI_DISK;
 		break;
 	case INTR_TYPE_AV:
 		pri = PI_AV;
 		break;
 	case INTR_TYPE_CLK:
 		pri = PI_REALTIME;
 		break;
 	case INTR_TYPE_MISC:
 		pri = PI_DULL;          /* don't care */
 		break;
 	default:
 		/* We didn't specify an interrupt level. */
 		panic("intr_priority: no interrupt type in flags");
 	}
 
 	return pri;
 }
 
 /*
  * Update an ithread based on the associated intr_event.
  */
 static void
 ithread_update(struct intr_thread *ithd)
 {
 	struct intr_event *ie;
 	struct thread *td;
 	u_char pri;
 
 	ie = ithd->it_event;
 	td = ithd->it_thread;
 	mtx_assert(&ie->ie_lock, MA_OWNED);
 
 	/* Determine the overall priority of this event. */
 	if (CK_SLIST_EMPTY(&ie->ie_handlers))
 		pri = PRI_MAX_ITHD;
 	else
 		pri = CK_SLIST_FIRST(&ie->ie_handlers)->ih_pri;
 
 	/* Update name and priority. */
 	strlcpy(td->td_name, ie->ie_fullname, sizeof(td->td_name));
 #ifdef KTR
 	sched_clear_tdname(td);
 #endif
 	thread_lock(td);
 	sched_prio(td, pri);
 	thread_unlock(td);
 }
 
 /*
  * Regenerate the full name of an interrupt event and update its priority.
  */
 static void
 intr_event_update(struct intr_event *ie)
 {
 	struct intr_handler *ih;
 	char *last;
 	int missed, space, flags;
 
 	/* Start off with no entropy and just the name of the event. */
 	mtx_assert(&ie->ie_lock, MA_OWNED);
 	strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
 	flags = 0;
 	missed = 0;
 	space = 1;
 
 	/* Run through all the handlers updating values. */
 	CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		if (strlen(ie->ie_fullname) + strlen(ih->ih_name) + 1 <
 		    sizeof(ie->ie_fullname)) {
 			strcat(ie->ie_fullname, " ");
 			strcat(ie->ie_fullname, ih->ih_name);
 			space = 0;
 		} else
 			missed++;
 		flags |= ih->ih_flags;
 	}
 	ie->ie_hflags = flags;
 
 	/*
 	 * If there is only one handler and its name is too long, just copy in
 	 * as much of the end of the name (includes the unit number) as will
 	 * fit.  Otherwise, we have multiple handlers and not all of the names
 	 * will fit.  Add +'s to indicate missing names.  If we run out of room
 	 * and still have +'s to add, change the last character from a + to a *.
 	 */
 	if (missed == 1 && space == 1) {
 		ih = CK_SLIST_FIRST(&ie->ie_handlers);
 		missed = strlen(ie->ie_fullname) + strlen(ih->ih_name) + 2 -
 		    sizeof(ie->ie_fullname);
 		strcat(ie->ie_fullname, (missed == 0) ? " " : "-");
 		strcat(ie->ie_fullname, &ih->ih_name[missed]);
 		missed = 0;
 	}
 	last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2];
 	while (missed-- > 0) {
 		if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) {
 			if (*last == '+') {
 				*last = '*';
 				break;
 			} else
 				*last = '+';
 		} else if (space) {
 			strcat(ie->ie_fullname, " +");
 			space = 0;
 		} else
 			strcat(ie->ie_fullname, "+");
 	}
 
 	/*
 	 * If this event has an ithread, update it's priority and
 	 * name.
 	 */
 	if (ie->ie_thread != NULL)
 		ithread_update(ie->ie_thread);
 	CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname);
 }
 
 int
 intr_event_create(struct intr_event **event, void *source, int flags, int irq,
     void (*pre_ithread)(void *), void (*post_ithread)(void *),
     void (*post_filter)(void *), int (*assign_cpu)(void *, int),
     const char *fmt, ...)
 {
 	struct intr_event *ie;
 	va_list ap;
 
 	/* The only valid flag during creation is IE_SOFT. */
 	if ((flags & ~IE_SOFT) != 0)
 		return (EINVAL);
 	ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO);
 	ie->ie_source = source;
 	ie->ie_pre_ithread = pre_ithread;
 	ie->ie_post_ithread = post_ithread;
 	ie->ie_post_filter = post_filter;
 	ie->ie_assign_cpu = assign_cpu;
 	ie->ie_flags = flags;
 	ie->ie_irq = irq;
 	ie->ie_cpu = NOCPU;
 	CK_SLIST_INIT(&ie->ie_handlers);
 	mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF);
 
 	va_start(ap, fmt);
 	vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap);
 	va_end(ap);
 	strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
 	mtx_lock(&event_lock);
 	TAILQ_INSERT_TAIL(&event_list, ie, ie_list);
 	mtx_unlock(&event_lock);
 	if (event != NULL)
 		*event = ie;
 	CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name);
 	return (0);
 }
 
 /*
  * Bind an interrupt event to the specified CPU.  Note that not all
  * platforms support binding an interrupt to a CPU.  For those
  * platforms this request will fail.  Using a cpu id of NOCPU unbinds
  * the interrupt event.
  */
 static int
 _intr_event_bind(struct intr_event *ie, int cpu, bool bindirq, bool bindithread)
 {
 	lwpid_t id;
 	int error;
 
 	/* Need a CPU to bind to. */
 	if (cpu != NOCPU && CPU_ABSENT(cpu))
 		return (EINVAL);
 
 	if (ie->ie_assign_cpu == NULL)
 		return (EOPNOTSUPP);
 
 	error = priv_check(curthread, PRIV_SCHED_CPUSET_INTR);
 	if (error)
 		return (error);
 
 	/*
 	 * If we have any ithreads try to set their mask first to verify
 	 * permissions, etc.
 	 */
 	if (bindithread) {
 		mtx_lock(&ie->ie_lock);
 		if (ie->ie_thread != NULL) {
 			id = ie->ie_thread->it_thread->td_tid;
 			mtx_unlock(&ie->ie_lock);
 			error = cpuset_setithread(id, cpu);
 			if (error)
 				return (error);
 		} else
 			mtx_unlock(&ie->ie_lock);
 	}
 	if (bindirq)
 		error = ie->ie_assign_cpu(ie->ie_source, cpu);
 	if (error) {
 		if (bindithread) {
 			mtx_lock(&ie->ie_lock);
 			if (ie->ie_thread != NULL) {
 				cpu = ie->ie_cpu;
 				id = ie->ie_thread->it_thread->td_tid;
 				mtx_unlock(&ie->ie_lock);
 				(void)cpuset_setithread(id, cpu);
 			} else
 				mtx_unlock(&ie->ie_lock);
 		}
 		return (error);
 	}
 
 	if (bindirq) {
 		mtx_lock(&ie->ie_lock);
 		ie->ie_cpu = cpu;
 		mtx_unlock(&ie->ie_lock);
 	}
 
 	return (error);
 }
 
 /*
  * Bind an interrupt event to the specified CPU.  For supported platforms, any
  * associated ithreads as well as the primary interrupt context will be bound
  * to the specificed CPU.
  */
 int
 intr_event_bind(struct intr_event *ie, int cpu)
 {
 
 	return (_intr_event_bind(ie, cpu, true, true));
 }
 
 /*
  * Bind an interrupt event to the specified CPU, but do not bind associated
  * ithreads.
  */
 int
 intr_event_bind_irqonly(struct intr_event *ie, int cpu)
 {
 
 	return (_intr_event_bind(ie, cpu, true, false));
 }
 
 /*
  * Bind an interrupt event's ithread to the specified CPU.
  */
 int
 intr_event_bind_ithread(struct intr_event *ie, int cpu)
 {
 
 	return (_intr_event_bind(ie, cpu, false, true));
 }
 
 /*
  * Bind an interrupt event's ithread to the specified cpuset.
  */
 int
 intr_event_bind_ithread_cpuset(struct intr_event *ie, cpuset_t *cs)
 {
 	lwpid_t id;
 
 	mtx_lock(&ie->ie_lock);
 	if (ie->ie_thread != NULL) {
 		id = ie->ie_thread->it_thread->td_tid;
 		mtx_unlock(&ie->ie_lock);
 		return (cpuset_setthread(id, cs));
 	} else {
 		mtx_unlock(&ie->ie_lock);
 	}
 	return (ENODEV);
 }
 
 static struct intr_event *
 intr_lookup(int irq)
 {
 	struct intr_event *ie;
 
 	mtx_lock(&event_lock);
 	TAILQ_FOREACH(ie, &event_list, ie_list)
 		if (ie->ie_irq == irq &&
 		    (ie->ie_flags & IE_SOFT) == 0 &&
 		    CK_SLIST_FIRST(&ie->ie_handlers) != NULL)
 			break;
 	mtx_unlock(&event_lock);
 	return (ie);
 }
 
 int
 intr_setaffinity(int irq, int mode, void *m)
 {
 	struct intr_event *ie;
 	cpuset_t *mask;
 	int cpu, n;
 
 	mask = m;
 	cpu = NOCPU;
 	/*
 	 * If we're setting all cpus we can unbind.  Otherwise make sure
 	 * only one cpu is in the set.
 	 */
 	if (CPU_CMP(cpuset_root, mask)) {
 		for (n = 0; n < CPU_SETSIZE; n++) {
 			if (!CPU_ISSET(n, mask))
 				continue;
 			if (cpu != NOCPU)
 				return (EINVAL);
 			cpu = n;
 		}
 	}
 	ie = intr_lookup(irq);
 	if (ie == NULL)
 		return (ESRCH);
 	switch (mode) {
 	case CPU_WHICH_IRQ:
 		return (intr_event_bind(ie, cpu));
 	case CPU_WHICH_INTRHANDLER:
 		return (intr_event_bind_irqonly(ie, cpu));
 	case CPU_WHICH_ITHREAD:
 		return (intr_event_bind_ithread(ie, cpu));
 	default:
 		return (EINVAL);
 	}
 }
 
 int
 intr_getaffinity(int irq, int mode, void *m)
 {
 	struct intr_event *ie;
 	struct thread *td;
 	struct proc *p;
 	cpuset_t *mask;
 	lwpid_t id;
 	int error;
 
 	mask = m;
 	ie = intr_lookup(irq);
 	if (ie == NULL)
 		return (ESRCH);
 
 	error = 0;
 	CPU_ZERO(mask);
 	switch (mode) {
 	case CPU_WHICH_IRQ:
 	case CPU_WHICH_INTRHANDLER:
 		mtx_lock(&ie->ie_lock);
 		if (ie->ie_cpu == NOCPU)
 			CPU_COPY(cpuset_root, mask);
 		else
 			CPU_SET(ie->ie_cpu, mask);
 		mtx_unlock(&ie->ie_lock);
 		break;
 	case CPU_WHICH_ITHREAD:
 		mtx_lock(&ie->ie_lock);
 		if (ie->ie_thread == NULL) {
 			mtx_unlock(&ie->ie_lock);
 			CPU_COPY(cpuset_root, mask);
 		} else {
 			id = ie->ie_thread->it_thread->td_tid;
 			mtx_unlock(&ie->ie_lock);
 			error = cpuset_which(CPU_WHICH_TID, id, &p, &td, NULL);
 			if (error != 0)
 				return (error);
 			CPU_COPY(&td->td_cpuset->cs_mask, mask);
 			PROC_UNLOCK(p);
 		}
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 int
 intr_event_destroy(struct intr_event *ie)
 {
 
 	mtx_lock(&event_lock);
 	mtx_lock(&ie->ie_lock);
 	if (!CK_SLIST_EMPTY(&ie->ie_handlers)) {
 		mtx_unlock(&ie->ie_lock);
 		mtx_unlock(&event_lock);
 		return (EBUSY);
 	}
 	TAILQ_REMOVE(&event_list, ie, ie_list);
 #ifndef notyet
 	if (ie->ie_thread != NULL) {
 		ithread_destroy(ie->ie_thread);
 		ie->ie_thread = NULL;
 	}
 #endif
 	mtx_unlock(&ie->ie_lock);
 	mtx_unlock(&event_lock);
 	mtx_destroy(&ie->ie_lock);
 	free(ie, M_ITHREAD);
 	return (0);
 }
 
 static struct intr_thread *
 ithread_create(const char *name)
 {
 	struct intr_thread *ithd;
 	struct thread *td;
 	int error;
 
 	ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO);
 
 	error = kproc_kthread_add(ithread_loop, ithd, &intrproc,
 		    &td, RFSTOPPED | RFHIGHPID,
 		    0, "intr", "%s", name);
 	if (error)
 		panic("kproc_create() failed with %d", error);
 	thread_lock(td);
 	sched_class(td, PRI_ITHD);
 	TD_SET_IWAIT(td);
 	thread_unlock(td);
 	td->td_pflags |= TDP_ITHREAD;
 	ithd->it_thread = td;
 	CTR2(KTR_INTR, "%s: created %s", __func__, name);
 	return (ithd);
 }
 
 static void
 ithread_destroy(struct intr_thread *ithread)
 {
 	struct thread *td;
 
 	CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name);
 	td = ithread->it_thread;
 	thread_lock(td);
 	ithread->it_flags |= IT_DEAD;
 	if (TD_AWAITING_INTR(td)) {
 		TD_CLR_IWAIT(td);
 		sched_add(td, SRQ_INTR);
 	} else
 		thread_unlock(td);
 }
 
 int
 intr_event_add_handler(struct intr_event *ie, const char *name,
     driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
     enum intr_type flags, void **cookiep)
 {
 	struct intr_handler *ih, *temp_ih;
 	struct intr_handler **prevptr;
 	struct intr_thread *it;
 
 	if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
 		return (EINVAL);
 
 	/* Allocate and populate an interrupt handler structure. */
 	ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
 	ih->ih_filter = filter;
 	ih->ih_handler = handler;
 	ih->ih_argument = arg;
 	strlcpy(ih->ih_name, name, sizeof(ih->ih_name));
 	ih->ih_event = ie;
 	ih->ih_pri = pri;
 	if (flags & INTR_EXCL)
 		ih->ih_flags = IH_EXCLUSIVE;
 	if (flags & INTR_MPSAFE)
 		ih->ih_flags |= IH_MPSAFE;
 	if (flags & INTR_ENTROPY)
 		ih->ih_flags |= IH_ENTROPY;
 	if (flags & INTR_TYPE_NET)
 		ih->ih_flags |= IH_NET;
 
 	/* We can only have one exclusive handler in a event. */
 	mtx_lock(&ie->ie_lock);
 	if (!CK_SLIST_EMPTY(&ie->ie_handlers)) {
 		if ((flags & INTR_EXCL) ||
 		    (CK_SLIST_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) {
 			mtx_unlock(&ie->ie_lock);
 			free(ih, M_ITHREAD);
 			return (EINVAL);
 		}
 	}
 
 	/* Create a thread if we need one. */
 	while (ie->ie_thread == NULL && handler != NULL) {
 		if (ie->ie_flags & IE_ADDING_THREAD)
 			msleep(ie, &ie->ie_lock, 0, "ithread", 0);
 		else {
 			ie->ie_flags |= IE_ADDING_THREAD;
 			mtx_unlock(&ie->ie_lock);
 			it = ithread_create("intr: newborn");
 			mtx_lock(&ie->ie_lock);
 			ie->ie_flags &= ~IE_ADDING_THREAD;
 			ie->ie_thread = it;
 			it->it_event = ie;
 			ithread_update(it);
 			wakeup(ie);
 		}
 	}
 
 	/* Add the new handler to the event in priority order. */
 	CK_SLIST_FOREACH_PREVPTR(temp_ih, prevptr, &ie->ie_handlers, ih_next) {
 		if (temp_ih->ih_pri > ih->ih_pri)
 			break;
 	}
 	CK_SLIST_INSERT_PREVPTR(prevptr, temp_ih, ih, ih_next);
 
 	intr_event_update(ie);
 
 	CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name,
 	    ie->ie_name);
 	mtx_unlock(&ie->ie_lock);
 
 	if (cookiep != NULL)
 		*cookiep = ih;
 	return (0);
 }
 
 /*
  * Append a description preceded by a ':' to the name of the specified
  * interrupt handler.
  */
 int
 intr_event_describe_handler(struct intr_event *ie, void *cookie,
     const char *descr)
 {
 	struct intr_handler *ih;
 	size_t space;
 	char *start;
 
 	mtx_lock(&ie->ie_lock);
 #ifdef INVARIANTS
 	CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		if (ih == cookie)
 			break;
 	}
 	if (ih == NULL) {
 		mtx_unlock(&ie->ie_lock);
 		panic("handler %p not found in interrupt event %p", cookie, ie);
 	}
 #endif
 	ih = cookie;
 
 	/*
 	 * Look for an existing description by checking for an
 	 * existing ":".  This assumes device names do not include
 	 * colons.  If one is found, prepare to insert the new
 	 * description at that point.  If one is not found, find the
 	 * end of the name to use as the insertion point.
 	 */
 	start = strchr(ih->ih_name, ':');
 	if (start == NULL)
 		start = strchr(ih->ih_name, 0);
 
 	/*
 	 * See if there is enough remaining room in the string for the
 	 * description + ":".  The "- 1" leaves room for the trailing
 	 * '\0'.  The "+ 1" accounts for the colon.
 	 */
 	space = sizeof(ih->ih_name) - (start - ih->ih_name) - 1;
 	if (strlen(descr) + 1 > space) {
 		mtx_unlock(&ie->ie_lock);
 		return (ENOSPC);
 	}
 
 	/* Append a colon followed by the description. */
 	*start = ':';
 	strcpy(start + 1, descr);
 	intr_event_update(ie);
 	mtx_unlock(&ie->ie_lock);
 	return (0);
 }
 
 /*
  * Return the ie_source field from the intr_event an intr_handler is
  * associated with.
  */
 void *
 intr_handler_source(void *cookie)
 {
 	struct intr_handler *ih;
 	struct intr_event *ie;
 
 	ih = (struct intr_handler *)cookie;
 	if (ih == NULL)
 		return (NULL);
 	ie = ih->ih_event;
 	KASSERT(ie != NULL,
 	    ("interrupt handler \"%s\" has a NULL interrupt event",
 	    ih->ih_name));
 	return (ie->ie_source);
 }
 
 /*
  * If intr_event_handle() is running in the ISR context at the time of the call,
  * then wait for it to complete.
  */
 static void
 intr_event_barrier(struct intr_event *ie)
 {
 	int phase;
 
 	mtx_assert(&ie->ie_lock, MA_OWNED);
 	phase = ie->ie_phase;
 
 	/*
 	 * Switch phase to direct future interrupts to the other active counter.
 	 * Make sure that any preceding stores are visible before the switch.
 	 */
 	KASSERT(ie->ie_active[!phase] == 0, ("idle phase has activity"));
 	atomic_store_rel_int(&ie->ie_phase, !phase);
 
 	/*
 	 * This code cooperates with wait-free iteration of ie_handlers
 	 * in intr_event_handle.
 	 * Make sure that the removal and the phase update are not reordered
 	 * with the active count check.
 	 * Note that no combination of acquire and release fences can provide
 	 * that guarantee as Store->Load sequences can always be reordered.
 	 */
 	atomic_thread_fence_seq_cst();
 
 	/*
 	 * Now wait on the inactive phase.
 	 * The acquire fence is needed so that that all post-barrier accesses
 	 * are after the check.
 	 */
 	while (ie->ie_active[phase] > 0)
 		cpu_spinwait();
 	atomic_thread_fence_acq();
 }
 
 static void
 intr_handler_barrier(struct intr_handler *handler)
 {
 	struct intr_event *ie;
 
 	ie = handler->ih_event;
 	mtx_assert(&ie->ie_lock, MA_OWNED);
 	KASSERT((handler->ih_flags & IH_DEAD) == 0,
 	    ("update for a removed handler"));
 
 	if (ie->ie_thread == NULL) {
 		intr_event_barrier(ie);
 		return;
 	}
 	if ((handler->ih_flags & IH_CHANGED) == 0) {
 		handler->ih_flags |= IH_CHANGED;
 		intr_event_schedule_thread(ie);
 	}
 	while ((handler->ih_flags & IH_CHANGED) != 0)
 		msleep(handler, &ie->ie_lock, 0, "ih_barr", 0);
 }
 
 /*
  * Sleep until an ithread finishes executing an interrupt handler.
  *
  * XXX Doesn't currently handle interrupt filters or fast interrupt
  * handlers.  This is intended for compatibility with linux drivers
  * only.  Do not use in BSD code.
  */
 void
 _intr_drain(int irq)
 {
 	struct intr_event *ie;
 	struct intr_thread *ithd;
 	struct thread *td;
 
 	ie = intr_lookup(irq);
 	if (ie == NULL)
 		return;
 	if (ie->ie_thread == NULL)
 		return;
 	ithd = ie->ie_thread;
 	td = ithd->it_thread;
 	/*
 	 * We set the flag and wait for it to be cleared to avoid
 	 * long delays with potentially busy interrupt handlers
 	 * were we to only sample TD_AWAITING_INTR() every tick.
 	 */
 	thread_lock(td);
 	if (!TD_AWAITING_INTR(td)) {
 		ithd->it_flags |= IT_WAIT;
 		while (ithd->it_flags & IT_WAIT) {
 			thread_unlock(td);
 			pause("idrain", 1);
 			thread_lock(td);
 		}
 	}
 	thread_unlock(td);
 	return;
 }
 
 int
 intr_event_remove_handler(void *cookie)
 {
 	struct intr_handler *handler = (struct intr_handler *)cookie;
 	struct intr_event *ie;
 	struct intr_handler *ih;
 	struct intr_handler **prevptr;
 #ifdef notyet
 	int dead;
 #endif
 
 	if (handler == NULL)
 		return (EINVAL);
 	ie = handler->ih_event;
 	KASSERT(ie != NULL,
 	    ("interrupt handler \"%s\" has a NULL interrupt event",
 	    handler->ih_name));
 
 	mtx_lock(&ie->ie_lock);
 	CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
 	    ie->ie_name);
 	CK_SLIST_FOREACH_PREVPTR(ih, prevptr, &ie->ie_handlers, ih_next) {
 		if (ih == handler)
 			break;
 	}
 	if (ih == NULL) {
 		panic("interrupt handler \"%s\" not found in "
 		    "interrupt event \"%s\"", handler->ih_name, ie->ie_name);
 	}
 
 	/*
 	 * If there is no ithread, then directly remove the handler.  Note that
 	 * intr_event_handle() iterates ie_handlers in a lock-less fashion, so
 	 * care needs to be taken to keep ie_handlers consistent and to free
 	 * the removed handler only when ie_handlers is quiescent.
 	 */
 	if (ie->ie_thread == NULL) {
 		CK_SLIST_REMOVE_PREVPTR(prevptr, ih, ih_next);
 		intr_event_barrier(ie);
 		intr_event_update(ie);
 		mtx_unlock(&ie->ie_lock);
 		free(handler, M_ITHREAD);
 		return (0);
 	}
 
 	/*
 	 * Let the interrupt thread do the job.
 	 * The interrupt source is disabled when the interrupt thread is
 	 * running, so it does not have to worry about interaction with
 	 * intr_event_handle().
 	 */
 	KASSERT((handler->ih_flags & IH_DEAD) == 0,
 	    ("duplicate handle remove"));
 	handler->ih_flags |= IH_DEAD;
 	intr_event_schedule_thread(ie);
 	while (handler->ih_flags & IH_DEAD)
 		msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
 	intr_event_update(ie);
 
 #ifdef notyet
 	/*
 	 * XXX: This could be bad in the case of ppbus(8).  Also, I think
 	 * this could lead to races of stale data when servicing an
 	 * interrupt.
 	 */
 	dead = 1;
 	CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		if (ih->ih_handler != NULL) {
 			dead = 0;
 			break;
 		}
 	}
 	if (dead) {
 		ithread_destroy(ie->ie_thread);
 		ie->ie_thread = NULL;
 	}
 #endif
 	mtx_unlock(&ie->ie_lock);
 	free(handler, M_ITHREAD);
 	return (0);
 }
 
 int
 intr_event_suspend_handler(void *cookie)
 {
 	struct intr_handler *handler = (struct intr_handler *)cookie;
 	struct intr_event *ie;
 
 	if (handler == NULL)
 		return (EINVAL);
 	ie = handler->ih_event;
 	KASSERT(ie != NULL,
 	    ("interrupt handler \"%s\" has a NULL interrupt event",
 	    handler->ih_name));
 	mtx_lock(&ie->ie_lock);
 	handler->ih_flags |= IH_SUSP;
 	intr_handler_barrier(handler);
 	mtx_unlock(&ie->ie_lock);
 	return (0);
 }
 
 int
 intr_event_resume_handler(void *cookie)
 {
 	struct intr_handler *handler = (struct intr_handler *)cookie;
 	struct intr_event *ie;
 
 	if (handler == NULL)
 		return (EINVAL);
 	ie = handler->ih_event;
 	KASSERT(ie != NULL,
 	    ("interrupt handler \"%s\" has a NULL interrupt event",
 	    handler->ih_name));
 
 	/*
 	 * intr_handler_barrier() acts not only as a barrier,
 	 * it also allows to check for any pending interrupts.
 	 */
 	mtx_lock(&ie->ie_lock);
 	handler->ih_flags &= ~IH_SUSP;
 	intr_handler_barrier(handler);
 	mtx_unlock(&ie->ie_lock);
 	return (0);
 }
 
 static int
 intr_event_schedule_thread(struct intr_event *ie)
 {
 	struct intr_entropy entropy;
 	struct intr_thread *it;
 	struct thread *td;
 	struct thread *ctd;
 
 	/*
 	 * If no ithread or no handlers, then we have a stray interrupt.
 	 */
 	if (ie == NULL || CK_SLIST_EMPTY(&ie->ie_handlers) ||
 	    ie->ie_thread == NULL)
 		return (EINVAL);
 
 	ctd = curthread;
 	it = ie->ie_thread;
 	td = it->it_thread;
 
 	/*
 	 * If any of the handlers for this ithread claim to be good
 	 * sources of entropy, then gather some.
 	 */
 	if (ie->ie_hflags & IH_ENTROPY) {
 		entropy.event = (uintptr_t)ie;
 		entropy.td = ctd;
 		random_harvest_queue(&entropy, sizeof(entropy), RANDOM_INTERRUPT);
 	}
 
 	KASSERT(td->td_proc != NULL, ("ithread %s has no process", ie->ie_name));
 
 	/*
 	 * Set it_need to tell the thread to keep running if it is already
 	 * running.  Then, lock the thread and see if we actually need to
 	 * put it on the runqueue.
 	 *
 	 * Use store_rel to arrange that the store to ih_need in
 	 * swi_sched() is before the store to it_need and prepare for
 	 * transfer of this order to loads in the ithread.
 	 */
 	atomic_store_rel_int(&it->it_need, 1);
 	thread_lock(td);
 	if (TD_AWAITING_INTR(td)) {
 		CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, td->td_proc->p_pid,
 		    td->td_name);
 		TD_CLR_IWAIT(td);
 		sched_add(td, SRQ_INTR);
 	} else {
 		CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
 		    __func__, td->td_proc->p_pid, td->td_name, it->it_need, td->td_state);
 		thread_unlock(td);
 	}
 
 	return (0);
 }
 
 /*
  * Allow interrupt event binding for software interrupt handlers -- a no-op,
  * since interrupts are generated in software rather than being directed by
  * a PIC.
  */
 static int
 swi_assign_cpu(void *arg, int cpu)
 {
 
 	return (0);
 }
 
 /*
  * Add a software interrupt handler to a specified event.  If a given event
  * is not specified, then a new event is created.
  */
 int
 swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler,
 	    void *arg, int pri, enum intr_type flags, void **cookiep)
 {
 	struct intr_event *ie;
-	int error;
+	int error = 0;
 
 	if (flags & INTR_ENTROPY)
 		return (EINVAL);
 
 	ie = (eventp != NULL) ? *eventp : NULL;
 
 	if (ie != NULL) {
 		if (!(ie->ie_flags & IE_SOFT))
 			return (EINVAL);
 	} else {
 		error = intr_event_create(&ie, NULL, IE_SOFT, 0,
 		    NULL, NULL, NULL, swi_assign_cpu, "swi%d:", pri);
 		if (error)
 			return (error);
 		if (eventp != NULL)
 			*eventp = ie;
 	}
-	error = intr_event_add_handler(ie, name, NULL, handler, arg,
-	    PI_SWI(pri), flags, cookiep);
+	if (handler != NULL) {
+		error = intr_event_add_handler(ie, name, NULL, handler, arg,
+		    PI_SWI(pri), flags, cookiep);
+	}
 	return (error);
 }
 
 /*
  * Schedule a software interrupt thread.
  */
 void
 swi_sched(void *cookie, int flags)
 {
 	struct intr_handler *ih = (struct intr_handler *)cookie;
 	struct intr_event *ie = ih->ih_event;
 	struct intr_entropy entropy;
 	int error __unused;
 
 	CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name,
 	    ih->ih_need);
 
-	entropy.event = (uintptr_t)ih;
-	entropy.td = curthread;
-	random_harvest_queue(&entropy, sizeof(entropy), RANDOM_SWI);
+	if ((flags & SWI_FROMNMI) == 0) {
+		entropy.event = (uintptr_t)ih;
+		entropy.td = curthread;
+		random_harvest_queue(&entropy, sizeof(entropy), RANDOM_SWI);
+	}
 
 	/*
 	 * Set ih_need for this handler so that if the ithread is already
 	 * running it will execute this handler on the next pass.  Otherwise,
 	 * it will execute it the next time it runs.
 	 */
 	ih->ih_need = 1;
 
-	if (!(flags & SWI_DELAY)) {
+	if (flags & SWI_DELAY)
+		return;
+
+	if (flags & SWI_FROMNMI) {
+#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
+		KASSERT(ie == clk_intr_event,
+		    ("SWI_FROMNMI used not with clk_intr_event"));
+		ipi_self_from_nmi(IPI_SWI);
+#endif
+	} else {
 		VM_CNT_INC(v_soft);
 		error = intr_event_schedule_thread(ie);
 		KASSERT(error == 0, ("stray software interrupt"));
 	}
 }
 
 /*
  * Remove a software interrupt handler.  Currently this code does not
  * remove the associated interrupt event if it becomes empty.  Calling code
  * may do so manually via intr_event_destroy(), but that's not really
  * an optimal interface.
  */
 int
 swi_remove(void *cookie)
 {
 
 	return (intr_event_remove_handler(cookie));
 }
 
 static void
 intr_event_execute_handlers(struct proc *p, struct intr_event *ie)
 {
 	struct intr_handler *ih, *ihn, *ihp;
 
 	ihp = NULL;
 	CK_SLIST_FOREACH_SAFE(ih, &ie->ie_handlers, ih_next, ihn) {
 		/*
 		 * If this handler is marked for death, remove it from
 		 * the list of handlers and wake up the sleeper.
 		 */
 		if (ih->ih_flags & IH_DEAD) {
 			mtx_lock(&ie->ie_lock);
 			if (ihp == NULL)
 				CK_SLIST_REMOVE_HEAD(&ie->ie_handlers, ih_next);
 			else
 				CK_SLIST_REMOVE_AFTER(ihp, ih_next);
 			ih->ih_flags &= ~IH_DEAD;
 			wakeup(ih);
 			mtx_unlock(&ie->ie_lock);
 			continue;
 		}
 
 		/*
 		 * Now that we know that the current element won't be removed
 		 * update the previous element.
 		 */
 		ihp = ih;
 
 		if ((ih->ih_flags & IH_CHANGED) != 0) {
 			mtx_lock(&ie->ie_lock);
 			ih->ih_flags &= ~IH_CHANGED;
 			wakeup(ih);
 			mtx_unlock(&ie->ie_lock);
 		}
 
 		/* Skip filter only handlers */
 		if (ih->ih_handler == NULL)
 			continue;
 
 		/* Skip suspended handlers */
 		if ((ih->ih_flags & IH_SUSP) != 0)
 			continue;
 
 		/*
 		 * For software interrupt threads, we only execute
 		 * handlers that have their need flag set.  Hardware
 		 * interrupt threads always invoke all of their handlers.
 		 *
 		 * ih_need can only be 0 or 1.  Failed cmpset below
 		 * means that there is no request to execute handlers,
 		 * so a retry of the cmpset is not needed.
 		 */
 		if ((ie->ie_flags & IE_SOFT) != 0 &&
 		    atomic_cmpset_int(&ih->ih_need, 1, 0) == 0)
 			continue;
 
 		/* Execute this handler. */
 		CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
 		    __func__, p->p_pid, (void *)ih->ih_handler, 
 		    ih->ih_argument, ih->ih_name, ih->ih_flags);
 
 		if (!(ih->ih_flags & IH_MPSAFE))
 			mtx_lock(&Giant);
 		ih->ih_handler(ih->ih_argument);
 		if (!(ih->ih_flags & IH_MPSAFE))
 			mtx_unlock(&Giant);
 	}
 }
 
 static void
 ithread_execute_handlers(struct proc *p, struct intr_event *ie)
 {
 
 	/* Interrupt handlers should not sleep. */
 	if (!(ie->ie_flags & IE_SOFT))
 		THREAD_NO_SLEEPING();
 	intr_event_execute_handlers(p, ie);
 	if (!(ie->ie_flags & IE_SOFT))
 		THREAD_SLEEPING_OK();
 
 	/*
 	 * Interrupt storm handling:
 	 *
 	 * If this interrupt source is currently storming, then throttle
 	 * it to only fire the handler once  per clock tick.
 	 *
 	 * If this interrupt source is not currently storming, but the
 	 * number of back to back interrupts exceeds the storm threshold,
 	 * then enter storming mode.
 	 */
 	if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold &&
 	    !(ie->ie_flags & IE_SOFT)) {
 		/* Report the message only once every second. */
 		if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) {
 			printf(
 	"interrupt storm detected on \"%s\"; throttling interrupt source\n",
 			    ie->ie_name);
 		}
 		pause("istorm", 1);
 	} else
 		ie->ie_count++;
 
 	/*
 	 * Now that all the handlers have had a chance to run, reenable
 	 * the interrupt source.
 	 */
 	if (ie->ie_post_ithread != NULL)
 		ie->ie_post_ithread(ie->ie_source);
 }
 
 /*
  * This is the main code for interrupt threads.
  */
 static void
 ithread_loop(void *arg)
 {
 	struct epoch_tracker et;
 	struct intr_thread *ithd;
 	struct intr_event *ie;
 	struct thread *td;
 	struct proc *p;
 	int wake, epoch_count;
 	bool needs_epoch;
 
 	td = curthread;
 	p = td->td_proc;
 	ithd = (struct intr_thread *)arg;
 	KASSERT(ithd->it_thread == td,
 	    ("%s: ithread and proc linkage out of sync", __func__));
 	ie = ithd->it_event;
 	ie->ie_count = 0;
 	wake = 0;
 
 	/*
 	 * As long as we have interrupts outstanding, go through the
 	 * list of handlers, giving each one a go at it.
 	 */
 	for (;;) {
 		/*
 		 * If we are an orphaned thread, then just die.
 		 */
 		if (ithd->it_flags & IT_DEAD) {
 			CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__,
 			    p->p_pid, td->td_name);
 			free(ithd, M_ITHREAD);
 			kthread_exit();
 		}
 
 		/*
 		 * Service interrupts.  If another interrupt arrives while
 		 * we are running, it will set it_need to note that we
 		 * should make another pass.
 		 *
 		 * The load_acq part of the following cmpset ensures
 		 * that the load of ih_need in ithread_execute_handlers()
 		 * is ordered after the load of it_need here.
 		 */
 		needs_epoch =
 		    (atomic_load_int(&ie->ie_hflags) & IH_NET) != 0;
 		if (needs_epoch) {
 			epoch_count = 0;
 			NET_EPOCH_ENTER(et);
 		}
 		while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0) {
 			ithread_execute_handlers(p, ie);
 			if (needs_epoch &&
 			    ++epoch_count >= intr_epoch_batch) {
 				NET_EPOCH_EXIT(et);
 				epoch_count = 0;
 				NET_EPOCH_ENTER(et);
 			}
 		}
 		if (needs_epoch)
 			NET_EPOCH_EXIT(et);
 		WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread");
 		mtx_assert(&Giant, MA_NOTOWNED);
 
 		/*
 		 * Processed all our interrupts.  Now get the sched
 		 * lock.  This may take a while and it_need may get
 		 * set again, so we have to check it again.
 		 */
 		thread_lock(td);
 		if (atomic_load_acq_int(&ithd->it_need) == 0 &&
 		    (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) {
 			TD_SET_IWAIT(td);
 			ie->ie_count = 0;
 			mi_switch(SW_VOL | SWT_IWAIT);
 		} else {
 			if (ithd->it_flags & IT_WAIT) {
 				wake = 1;
 				ithd->it_flags &= ~IT_WAIT;
 			}
 			thread_unlock(td);
 		}
 		if (wake) {
 			wakeup(ithd);
 			wake = 0;
 		}
 	}
 }
 
 /*
  * Main interrupt handling body.
  *
  * Input:
  * o ie:                        the event connected to this interrupt.
  * o frame:                     some archs (i.e. i386) pass a frame to some.
  *                              handlers as their main argument.
  * Return value:
  * o 0:                         everything ok.
  * o EINVAL:                    stray interrupt.
  */
 int
 intr_event_handle(struct intr_event *ie, struct trapframe *frame)
 {
 	struct intr_handler *ih;
 	struct trapframe *oldframe;
 	struct thread *td;
 	int phase;
 	int ret;
 	bool filter, thread;
 
 	td = curthread;
 
 #ifdef KSTACK_USAGE_PROF
 	intr_prof_stack_use(td, frame);
 #endif
 
 	/* An interrupt with no event or handlers is a stray interrupt. */
 	if (ie == NULL || CK_SLIST_EMPTY(&ie->ie_handlers))
 		return (EINVAL);
 
 	/*
 	 * Execute fast interrupt handlers directly.
 	 * To support clock handlers, if a handler registers
 	 * with a NULL argument, then we pass it a pointer to
 	 * a trapframe as its argument.
 	 */
 	td->td_intr_nesting_level++;
 	filter = false;
 	thread = false;
 	ret = 0;
 	critical_enter();
 	oldframe = td->td_intr_frame;
 	td->td_intr_frame = frame;
 
 	phase = ie->ie_phase;
 	atomic_add_int(&ie->ie_active[phase], 1);
 
 	/*
 	 * This fence is required to ensure that no later loads are
 	 * re-ordered before the ie_active store.
 	 */
 	atomic_thread_fence_seq_cst();
 
 	CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		if ((ih->ih_flags & IH_SUSP) != 0)
 			continue;
+		if ((ie->ie_flags & IE_SOFT) != 0 && ih->ih_need == 0)
+			continue;
 		if (ih->ih_filter == NULL) {
 			thread = true;
 			continue;
 		}
 		CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__,
 		    ih->ih_filter, ih->ih_argument == NULL ? frame :
 		    ih->ih_argument, ih->ih_name);
 		if (ih->ih_argument == NULL)
 			ret = ih->ih_filter(frame);
 		else
 			ret = ih->ih_filter(ih->ih_argument);
 		KASSERT(ret == FILTER_STRAY ||
 		    ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 &&
 		    (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0),
 		    ("%s: incorrect return value %#x from %s", __func__, ret,
 		    ih->ih_name));
 		filter = filter || ret == FILTER_HANDLED;
 
 		/*
 		 * Wrapper handler special handling:
 		 *
 		 * in some particular cases (like pccard and pccbb),
 		 * the _real_ device handler is wrapped in a couple of
 		 * functions - a filter wrapper and an ithread wrapper.
 		 * In this case (and just in this case), the filter wrapper
 		 * could ask the system to schedule the ithread and mask
 		 * the interrupt source if the wrapped handler is composed
 		 * of just an ithread handler.
 		 *
 		 * TODO: write a generic wrapper to avoid people rolling
 		 * their own.
 		 */
 		if (!thread) {
 			if (ret == FILTER_SCHEDULE_THREAD)
 				thread = true;
 		}
 	}
 	atomic_add_rel_int(&ie->ie_active[phase], -1);
 
 	td->td_intr_frame = oldframe;
 
 	if (thread) {
 		if (ie->ie_pre_ithread != NULL)
 			ie->ie_pre_ithread(ie->ie_source);
 	} else {
 		if (ie->ie_post_filter != NULL)
 			ie->ie_post_filter(ie->ie_source);
 	}
 
 	/* Schedule the ithread if needed. */
 	if (thread) {
 		int error __unused;
 
 		error =  intr_event_schedule_thread(ie);
 		KASSERT(error == 0, ("bad stray interrupt"));
 	}
 	critical_exit();
 	td->td_intr_nesting_level--;
 #ifdef notyet
 	/* The interrupt is not aknowledged by any filter and has no ithread. */
 	if (!thread && !filter)
 		return (EINVAL);
 #endif
 	return (0);
 }
 
 #ifdef DDB
 /*
  * Dump details about an interrupt handler
  */
 static void
 db_dump_intrhand(struct intr_handler *ih)
 {
 	int comma;
 
 	db_printf("\t%-10s ", ih->ih_name);
 	switch (ih->ih_pri) {
 	case PI_REALTIME:
 		db_printf("CLK ");
 		break;
 	case PI_AV:
 		db_printf("AV  ");
 		break;
 	case PI_TTY:
 		db_printf("TTY ");
 		break;
 	case PI_NET:
 		db_printf("NET ");
 		break;
 	case PI_DISK:
 		db_printf("DISK");
 		break;
 	case PI_DULL:
 		db_printf("DULL");
 		break;
 	default:
 		if (ih->ih_pri >= PI_SOFT)
 			db_printf("SWI ");
 		else
 			db_printf("%4u", ih->ih_pri);
 		break;
 	}
 	db_printf(" ");
 	if (ih->ih_filter != NULL) {
 		db_printf("[F]");
 		db_printsym((uintptr_t)ih->ih_filter, DB_STGY_PROC);
 	}
 	if (ih->ih_handler != NULL) {
 		if (ih->ih_filter != NULL)
 			db_printf(",");
 		db_printf("[H]");
 		db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC);
 	}
 	db_printf("(%p)", ih->ih_argument);
 	if (ih->ih_need ||
 	    (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD |
 	    IH_MPSAFE)) != 0) {
 		db_printf(" {");
 		comma = 0;
 		if (ih->ih_flags & IH_EXCLUSIVE) {
 			if (comma)
 				db_printf(", ");
 			db_printf("EXCL");
 			comma = 1;
 		}
 		if (ih->ih_flags & IH_ENTROPY) {
 			if (comma)
 				db_printf(", ");
 			db_printf("ENTROPY");
 			comma = 1;
 		}
 		if (ih->ih_flags & IH_DEAD) {
 			if (comma)
 				db_printf(", ");
 			db_printf("DEAD");
 			comma = 1;
 		}
 		if (ih->ih_flags & IH_MPSAFE) {
 			if (comma)
 				db_printf(", ");
 			db_printf("MPSAFE");
 			comma = 1;
 		}
 		if (ih->ih_need) {
 			if (comma)
 				db_printf(", ");
 			db_printf("NEED");
 		}
 		db_printf("}");
 	}
 	db_printf("\n");
 }
 
 /*
  * Dump details about a event.
  */
 void
 db_dump_intr_event(struct intr_event *ie, int handlers)
 {
 	struct intr_handler *ih;
 	struct intr_thread *it;
 	int comma;
 
 	db_printf("%s ", ie->ie_fullname);
 	it = ie->ie_thread;
 	if (it != NULL)
 		db_printf("(pid %d)", it->it_thread->td_proc->p_pid);
 	else
 		db_printf("(no thread)");
 	if ((ie->ie_flags & (IE_SOFT | IE_ADDING_THREAD)) != 0 ||
 	    (it != NULL && it->it_need)) {
 		db_printf(" {");
 		comma = 0;
 		if (ie->ie_flags & IE_SOFT) {
 			db_printf("SOFT");
 			comma = 1;
 		}
 		if (ie->ie_flags & IE_ADDING_THREAD) {
 			if (comma)
 				db_printf(", ");
 			db_printf("ADDING_THREAD");
 			comma = 1;
 		}
 		if (it != NULL && it->it_need) {
 			if (comma)
 				db_printf(", ");
 			db_printf("NEED");
 		}
 		db_printf("}");
 	}
 	db_printf("\n");
 
 	if (handlers)
 		CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next)
 		    db_dump_intrhand(ih);
 }
 
 /*
  * Dump data about interrupt handlers
  */
 DB_SHOW_COMMAND(intr, db_show_intr)
 {
 	struct intr_event *ie;
 	int all, verbose;
 
 	verbose = strchr(modif, 'v') != NULL;
 	all = strchr(modif, 'a') != NULL;
 	TAILQ_FOREACH(ie, &event_list, ie_list) {
 		if (!all && CK_SLIST_EMPTY(&ie->ie_handlers))
 			continue;
 		db_dump_intr_event(ie, verbose);
 		if (db_pager_quit)
 			break;
 	}
 }
 #endif /* DDB */
 
 /*
  * Start standard software interrupt threads
  */
 static void
 start_softintr(void *dummy)
 {
 
+	if (swi_add(&clk_intr_event, "clk", NULL, NULL, SWI_CLOCK,
+	    INTR_MPSAFE, NULL))
+		panic("died while creating clk swi ithread");
 	if (swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, INTR_MPSAFE, &vm_ih))
 		panic("died while creating vm swi ithread");
 }
 SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr,
     NULL);
 
 /*
  * Sysctls used by systat and others: hw.intrnames and hw.intrcnt.
  * The data for this machine dependent, and the declarations are in machine
  * dependent code.  The layout of intrnames and intrcnt however is machine
  * independent.
  *
  * We do not know the length of intrcnt and intrnames at compile time, so
  * calculate things at run time.
  */
 static int
 sysctl_intrnames(SYSCTL_HANDLER_ARGS)
 {
 	return (sysctl_handle_opaque(oidp, intrnames, sintrnames, req));
 }
 
 SYSCTL_PROC(_hw, OID_AUTO, intrnames,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0,
     sysctl_intrnames, "",
     "Interrupt Names");
 
 static int
 sysctl_intrcnt(SYSCTL_HANDLER_ARGS)
 {
 #ifdef SCTL_MASK32
 	uint32_t *intrcnt32;
 	unsigned i;
 	int error;
 
 	if (req->flags & SCTL_MASK32) {
 		if (!req->oldptr)
 			return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req));
 		intrcnt32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT);
 		if (intrcnt32 == NULL)
 			return (ENOMEM);
 		for (i = 0; i < sintrcnt / sizeof (u_long); i++)
 			intrcnt32[i] = intrcnt[i];
 		error = sysctl_handle_opaque(oidp, intrcnt32, sintrcnt / 2, req);
 		free(intrcnt32, M_TEMP);
 		return (error);
 	}
 #endif
 	return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req));
 }
 
 SYSCTL_PROC(_hw, OID_AUTO, intrcnt,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0,
     sysctl_intrcnt, "",
     "Interrupt Counts");
 
 #ifdef DDB
 /*
  * DDB command to dump the interrupt statistics.
  */
 DB_SHOW_COMMAND(intrcnt, db_show_intrcnt)
 {
 	u_long *i;
 	char *cp;
 	u_int j;
 
 	cp = intrnames;
 	j = 0;
 	for (i = intrcnt; j < (sintrcnt / sizeof(u_long)) && !db_pager_quit;
 	    i++, j++) {
 		if (*cp == '\0')
 			break;
 		if (*i != 0)
 			db_printf("%s\t%lu\n", cp, *i);
 		cp += strlen(cp) + 1;
 	}
 }
 #endif
Index: head/sys/sys/interrupt.h
===================================================================
--- head/sys/sys/interrupt.h	(revision 363526)
+++ head/sys/sys/interrupt.h	(revision 363527)
@@ -1,203 +1,205 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1997, Stefan Esser <se@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_INTERRUPT_H_
 #define _SYS_INTERRUPT_H_
 
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/ck.h>
 
 struct intr_event;
 struct intr_thread;
 struct trapframe;
 
 /*
  * Describe a hardware interrupt handler.
  *
  * Multiple interrupt handlers for a specific event can be chained
  * together.
  */
 struct intr_handler {
 	driver_filter_t	*ih_filter;	/* Filter handler function. */
 	driver_intr_t	*ih_handler;	/* Threaded handler function. */
 	void		*ih_argument;	/* Argument to pass to handlers. */
 	int		 ih_flags;
 	char		 ih_name[MAXCOMLEN + 1]; /* Name of handler. */
 	struct intr_event *ih_event;	/* Event we are connected to. */
 	int		 ih_need;	/* Needs service. */
 	CK_SLIST_ENTRY(intr_handler) ih_next; /* Next handler for this event. */
 	u_char		 ih_pri;	/* Priority of this handler. */
 };
 
 /* Interrupt handle flags kept in ih_flags */
 #define	IH_NET		0x00000001	/* Network. */
 #define	IH_EXCLUSIVE	0x00000002	/* Exclusive interrupt. */
 #define	IH_ENTROPY	0x00000004	/* Device is a good entropy source. */
 #define	IH_DEAD		0x00000008	/* Handler should be removed. */
 #define	IH_SUSP		0x00000010	/* Device is powered down. */
 #define	IH_CHANGED	0x40000000	/* Handler state is changed. */
 #define	IH_MPSAFE	0x80000000	/* Handler does not need Giant. */
 
 /*
  * Describe an interrupt event.  An event holds a list of handlers.
  * The 'pre_ithread', 'post_ithread', 'post_filter', and 'assign_cpu'
  * hooks are used to invoke MD code for certain operations.
  *
  * The 'pre_ithread' hook is called when an interrupt thread for
  * handlers without filters is scheduled.  It is responsible for
  * ensuring that 1) the system won't be swamped with an interrupt
  * storm from the associated source while the ithread runs and 2) the
  * current CPU is able to receive interrupts from other interrupt
  * sources.  The first is usually accomplished by disabling
  * level-triggered interrupts until the ithread completes.  The second
  * is accomplished on some platforms by acknowledging the interrupt
  * via an EOI.
  *
  * The 'post_ithread' hook is invoked when an ithread finishes.  It is
  * responsible for ensuring that the associated interrupt source will
  * trigger an interrupt when it is asserted in the future.  Usually
  * this is implemented by enabling a level-triggered interrupt that
  * was previously disabled via the 'pre_ithread' hook.
  *
  * The 'post_filter' hook is invoked when a filter handles an
  * interrupt.  It is responsible for ensuring that the current CPU is
  * able to receive interrupts again.  On some platforms this is done
  * by acknowledging the interrupts via an EOI.
  *
  * The 'assign_cpu' hook is used to bind an interrupt source to a
  * specific CPU.  If the interrupt cannot be bound, this function may
  * return an error.
  *
  * Note that device drivers may also use interrupt events to manage
  * multiplexing interrupt interrupt handler into handlers for child
  * devices.  In that case, the above hooks are not used.  The device
  * can create an event for its interrupt resource and register child
  * event handlers with that event.  It can then use
  * intr_event_execute_handlers() to execute non-filter handlers.
  * Currently filter handlers are not supported by this, but that can
  * be added by splitting out the filter loop from intr_event_handle()
  * if desired.
  */
 struct intr_event {
 	TAILQ_ENTRY(intr_event) ie_list;
 	CK_SLIST_HEAD(, intr_handler) ie_handlers; /* Interrupt handlers. */
 	char		ie_name[MAXCOMLEN + 1]; /* Individual event name. */
 	char		ie_fullname[MAXCOMLEN + 1];
 	struct mtx	ie_lock;
 	void		*ie_source;	/* Cookie used by MD code. */
 	struct intr_thread *ie_thread;	/* Thread we are connected to. */
 	void		(*ie_pre_ithread)(void *);
 	void		(*ie_post_ithread)(void *);
 	void		(*ie_post_filter)(void *);
 	int		(*ie_assign_cpu)(void *, int);
 	int		ie_flags;
 	int		ie_hflags;	/* Cumulative flags of all handlers. */
 	int		ie_count;	/* Loop counter. */
 	int		ie_warncnt;	/* Rate-check interrupt storm warns. */
 	struct timeval	ie_warntm;
 	int		ie_irq;		/* Physical irq number if !SOFT. */
 	int		ie_cpu;		/* CPU this event is bound to. */
 	volatile int	ie_phase;	/* Switched to establish a barrier. */
 	volatile int	ie_active[2];	/* Filters in ISR context. */
 };
 
 /* Interrupt event flags kept in ie_flags. */
 #define	IE_SOFT		0x000001	/* Software interrupt. */
 #define	IE_ADDING_THREAD 0x000004	/* Currently building an ithread. */
 
-/* Flags to pass to sched_swi. */
+/* Flags to pass to swi_sched. */
+#define	SWI_FROMNMI	0x1
 #define	SWI_DELAY	0x2
 
 /*
  * Software interrupt numbers in priority order.  The priority determines
  * the priority of the corresponding interrupt thread.
  */
 #define	SWI_TTY		0
 #define	SWI_NET		1
 #define	SWI_CAMBIO	2
 #define	SWI_VM		3
 #define	SWI_CLOCK	4
 #define	SWI_TQ_FAST	5
 #define	SWI_TQ		6
 #define	SWI_TQ_GIANT	6
 
 struct proc;
 
+extern struct	intr_event *clk_intr_event;
 extern struct	intr_event *tty_intr_event;
 extern void	*vm_ih;
 
 /* Counts and names for statistics (defined in MD code). */
 #if defined(__amd64__) || defined(__i386__) || defined(__powerpc__)
 extern u_long 	*intrcnt;	/* counts for for each device and stray */
 extern char 	*intrnames;	/* string table containing device names */
 #else
 extern u_long 	intrcnt[];	/* counts for for each device and stray */
 extern char 	intrnames[];	/* string table containing device names */
 #endif
 extern size_t	sintrcnt;	/* size of intrcnt table */
 extern size_t	sintrnames;	/* size of intrnames table */
 
 #ifdef DDB
 void	db_dump_intr_event(struct intr_event *ie, int handlers);
 #endif
 u_char	intr_priority(enum intr_type flags);
 int	intr_event_add_handler(struct intr_event *ie, const char *name,
 	    driver_filter_t filter, driver_intr_t handler, void *arg, 
 	    u_char pri, enum intr_type flags, void **cookiep);	    
 int	intr_event_bind(struct intr_event *ie, int cpu);
 int	intr_event_bind_irqonly(struct intr_event *ie, int cpu);
 int	intr_event_bind_ithread(struct intr_event *ie, int cpu);
 struct _cpuset;
 int	intr_event_bind_ithread_cpuset(struct intr_event *ie,
 	    struct _cpuset *mask);
 int	intr_event_create(struct intr_event **event, void *source,
 	    int flags, int irq, void (*pre_ithread)(void *),
 	    void (*post_ithread)(void *), void (*post_filter)(void *),
 	    int (*assign_cpu)(void *, int), const char *fmt, ...)
 	    __printflike(9, 10);
 int	intr_event_describe_handler(struct intr_event *ie, void *cookie,
 	    const char *descr);
 int	intr_event_destroy(struct intr_event *ie);
 int	intr_event_handle(struct intr_event *ie, struct trapframe *frame);
 int	intr_event_remove_handler(void *cookie);
 int	intr_event_suspend_handler(void *cookie);
 int	intr_event_resume_handler(void *cookie);
 int	intr_getaffinity(int irq, int mode, void *mask);
 void	*intr_handler_source(void *cookie);
 int	intr_setaffinity(int irq, int mode, void *mask);
 void	_intr_drain(int irq);  /* Linux compat only. */
 int	swi_add(struct intr_event **eventp, const char *name,
 	    driver_intr_t handler, void *arg, int pri, enum intr_type flags,
 	    void **cookiep);
 void	swi_sched(void *cookie, int flags);
 int	swi_remove(void *cookie);
 
 #endif
Index: head/sys/x86/include/apicvar.h
===================================================================
--- head/sys/x86/include/apicvar.h	(revision 363526)
+++ head/sys/x86/include/apicvar.h	(revision 363527)
@@ -1,492 +1,493 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _X86_APICVAR_H_
 #define _X86_APICVAR_H_
 
 /*
  * Local && I/O APIC variable definitions.
  */
 
 /*
  * Layout of local APIC interrupt vectors:
  *
  *	0xff (255)  +-------------+
  *                  |             | 15 (Spurious / IPIs / Local Interrupts)
  *	0xf0 (240)  +-------------+
  *                  |             | 14 (I/O Interrupts / Timer)
  *	0xe0 (224)  +-------------+
  *                  |             | 13 (I/O Interrupts)
  *	0xd0 (208)  +-------------+
  *                  |             | 12 (I/O Interrupts)
  *	0xc0 (192)  +-------------+
  *                  |             | 11 (I/O Interrupts)
  *	0xb0 (176)  +-------------+
  *                  |             | 10 (I/O Interrupts)
  *	0xa0 (160)  +-------------+
  *                  |             | 9 (I/O Interrupts)
  *	0x90 (144)  +-------------+
  *                  |             | 8 (I/O Interrupts / System Calls)
  *	0x80 (128)  +-------------+
  *                  |             | 7 (I/O Interrupts)
  *	0x70 (112)  +-------------+
  *                  |             | 6 (I/O Interrupts)
  *	0x60 (96)   +-------------+
  *                  |             | 5 (I/O Interrupts)
  *	0x50 (80)   +-------------+
  *                  |             | 4 (I/O Interrupts)
  *	0x40 (64)   +-------------+
  *                  |             | 3 (I/O Interrupts)
  *	0x30 (48)   +-------------+
  *                  |             | 2 (ATPIC Interrupts)
  *	0x20 (32)   +-------------+
  *                  |             | 1 (Exceptions, traps, faults, etc.)
  *	0x10 (16)   +-------------+
  *                  |             | 0 (Exceptions, traps, faults, etc.)
  *	0x00 (0)    +-------------+
  *
  * Note: 0x80 needs to be handled specially and not allocated to an
  * I/O device!
  */
 
 #define	xAPIC_MAX_APIC_ID	0xfe
 #define	xAPIC_ID_ALL		0xff
 #define	MAX_APIC_ID		0x200
 #define	APIC_ID_ALL		0xffffffff
 
 #define	IOAPIC_MAX_ID		xAPIC_MAX_APIC_ID
 
 /* I/O Interrupts are used for external devices such as ISA, PCI, etc. */
 #define	APIC_IO_INTS	(IDT_IO_INTS + 16)
 #define	APIC_NUM_IOINTS	191
 
 /* The timer interrupt is used for clock handling and drives hardclock, etc. */
 #define	APIC_TIMER_INT	(APIC_IO_INTS + APIC_NUM_IOINTS)
 
 /*  
  ********************* !!! WARNING !!! ******************************
  * Each local apic has an interrupt receive fifo that is two entries deep
  * for each interrupt priority class (higher 4 bits of interrupt vector).
  * Once the fifo is full the APIC can no longer receive interrupts for this
  * class and sending IPIs from other CPUs will be blocked.
  * To avoid deadlocks there should be no more than two IPI interrupts
  * pending at the same time.
  * Currently this is guaranteed by dividing the IPIs in two groups that have 
  * each at most one IPI interrupt pending. The first group is protected by the
  * smp_ipi_mtx and waits for the completion of the IPI (Only one IPI user 
  * at a time) The second group uses a single interrupt and a bitmap to avoid
  * redundant IPI interrupts.
  */ 
 
 /* Interrupts for local APIC LVT entries other than the timer. */
 #define	APIC_LOCAL_INTS	240
 #define	APIC_ERROR_INT	APIC_LOCAL_INTS
 #define	APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
 #define	APIC_CMC_INT	(APIC_LOCAL_INTS + 2)
 #define	APIC_IPI_INTS	(APIC_LOCAL_INTS + 3)
 
 #define	IPI_RENDEZVOUS	(APIC_IPI_INTS)		/* Inter-CPU rendezvous. */
 #define	IPI_INVLOP	(APIC_IPI_INTS + 1)	/* TLB Shootdown IPIs, amd64 */
 #define	IPI_INVLTLB	(APIC_IPI_INTS + 1)	/* TLB Shootdown IPIs, i386 */
 #define	IPI_INVLPG	(APIC_IPI_INTS + 2)
 #define	IPI_INVLRNG	(APIC_IPI_INTS + 3)
 #define	IPI_INVLCACHE	(APIC_IPI_INTS + 4)
 /* Vector to handle bitmap based IPIs */
 #define	IPI_BITMAP_VECTOR	(APIC_IPI_INTS + 5) 
 
 /* IPIs handled by IPI_BITMAP_VECTOR */
 #define	IPI_AST		0 	/* Generate software trap. */
 #define IPI_PREEMPT     1
 #define IPI_HARDCLOCK   2
 #define	IPI_TRACE	3	/* Collect stack trace. */
 #define	IPI_BITMAP_LAST IPI_TRACE
 #define IPI_IS_BITMAPED(x) ((x) <= IPI_BITMAP_LAST)
 
 #define	IPI_STOP	(APIC_IPI_INTS + 6)	/* Stop CPU until restarted. */
 #define	IPI_SUSPEND	(APIC_IPI_INTS + 7)	/* Suspend CPU until restarted. */
-#define	IPI_DYN_FIRST	(APIC_IPI_INTS + 8)
+#define	IPI_SWI		(APIC_IPI_INTS + 8)	/* Run clk_intr_event. */
+#define	IPI_DYN_FIRST	(APIC_IPI_INTS + 9)
 #define	IPI_DYN_LAST	(254)			/* IPIs allocated at runtime */
 
 /*
  * IPI_STOP_HARD does not need to occupy a slot in the IPI vector space since
  * it is delivered using an NMI anyways.
  */
 #define	IPI_NMI_FIRST	255
 #define	IPI_STOP_HARD	255			/* Stop CPU with a NMI. */
 
 /*
  * The spurious interrupt can share the priority class with the IPIs since
  * it is not a normal interrupt. (Does not use the APIC's interrupt fifo)
  */
 #define	APIC_SPURIOUS_INT 255
 
 #ifndef LOCORE
 
 #define	APIC_IPI_DEST_SELF	-1
 #define	APIC_IPI_DEST_ALL	-2
 #define	APIC_IPI_DEST_OTHERS	-3
 
 #define	APIC_BUS_UNKNOWN	-1
 #define	APIC_BUS_ISA		0
 #define	APIC_BUS_EISA		1
 #define	APIC_BUS_PCI		2
 #define	APIC_BUS_MAX		APIC_BUS_PCI
 
 #define	IRQ_EXTINT		-1
 #define	IRQ_NMI			-2
 #define	IRQ_SMI			-3
 #define	IRQ_DISABLED		-4
 
 /*
  * An APIC enumerator is a pseudo bus driver that enumerates APIC's including
  * CPU's and I/O APIC's.
  */
 struct apic_enumerator {
 	const char *apic_name;
 	int (*apic_probe)(void);
 	int (*apic_probe_cpus)(void);
 	int (*apic_setup_local)(void);
 	int (*apic_setup_io)(void);
 	SLIST_ENTRY(apic_enumerator) apic_next;
 };
 
 inthand_t
 	IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
 	IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
 	IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
 	IDTVEC(spuriousint), IDTVEC(timerint),
 	IDTVEC(apic_isr1_pti), IDTVEC(apic_isr2_pti), IDTVEC(apic_isr3_pti),
 	IDTVEC(apic_isr4_pti), IDTVEC(apic_isr5_pti), IDTVEC(apic_isr6_pti),
 	IDTVEC(apic_isr7_pti), IDTVEC(cmcint_pti), IDTVEC(errorint_pti),
 	IDTVEC(spuriousint_pti), IDTVEC(timerint_pti);
 
 extern vm_paddr_t lapic_paddr;
 extern int *apic_cpuids;
 
 void	apic_register_enumerator(struct apic_enumerator *enumerator);
 void	*ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase);
 int	ioapic_disable_pin(void *cookie, u_int pin);
 int	ioapic_get_vector(void *cookie, u_int pin);
 void	ioapic_register(void *cookie);
 int	ioapic_remap_vector(void *cookie, u_int pin, int vector);
 int	ioapic_set_bus(void *cookie, u_int pin, int bus_type);
 int	ioapic_set_extint(void *cookie, u_int pin);
 int	ioapic_set_nmi(void *cookie, u_int pin);
 int	ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol);
 int	ioapic_set_triggermode(void *cookie, u_int pin,
 	    enum intr_trigger trigger);
 int	ioapic_set_smi(void *cookie, u_int pin);
 
 /*
  * Struct containing pointers to APIC functions whose
  * implementation is run time selectable.
  */
 struct apic_ops {
 	void	(*create)(u_int, int);
 	void	(*init)(vm_paddr_t);
 	void	(*xapic_mode)(void);
 	bool	(*is_x2apic)(void);
 	void	(*setup)(int);
 	void	(*dump)(const char *);
 	void	(*disable)(void);
 	void	(*eoi)(void);
 	int	(*id)(void);
 	int	(*intr_pending)(u_int);
 	void	(*set_logical_id)(u_int, u_int, u_int);
 	u_int	(*cpuid)(u_int);
 
 	/* Vectors */
 	u_int	(*alloc_vector)(u_int, u_int);
 	u_int	(*alloc_vectors)(u_int, u_int *, u_int, u_int);
 	void	(*enable_vector)(u_int, u_int);
 	void	(*disable_vector)(u_int, u_int);
 	void	(*free_vector)(u_int, u_int, u_int);
 
 
 	/* PMC */
 	int	(*enable_pmc)(void);
 	void	(*disable_pmc)(void);
 	void	(*reenable_pmc)(void);
 
 	/* CMC */
 	void	(*enable_cmc)(void);
 
 	/* AMD ELVT */
 	int	(*enable_mca_elvt)(void);
 
 	/* IPI */
 	void	(*ipi_raw)(register_t, u_int);
 	void	(*ipi_vectored)(u_int, int);
 	int	(*ipi_wait)(int);
 	int	(*ipi_alloc)(inthand_t *ipifunc);
 	void	(*ipi_free)(int vector);
 
 	/* LVT */
 	int	(*set_lvt_mask)(u_int, u_int, u_char);
 	int	(*set_lvt_mode)(u_int, u_int, u_int32_t);
 	int	(*set_lvt_polarity)(u_int, u_int, enum intr_polarity);
 	int	(*set_lvt_triggermode)(u_int, u_int, enum intr_trigger);
 };
 
 extern struct apic_ops apic_ops;
 
 static inline void
 lapic_create(u_int apic_id, int boot_cpu)
 {
 
 	apic_ops.create(apic_id, boot_cpu);
 }
 
 static inline void
 lapic_init(vm_paddr_t addr)
 {
 
 	apic_ops.init(addr);
 }
 
 static inline void
 lapic_xapic_mode(void)
 {
 
 	apic_ops.xapic_mode();
 }
 
 static inline bool
 lapic_is_x2apic(void)
 {
 
 	return (apic_ops.is_x2apic());
 }
 
 static inline void
 lapic_setup(int boot)
 {
 
 	apic_ops.setup(boot);
 }
 
 static inline void
 lapic_dump(const char *str)
 {
 
 	apic_ops.dump(str);
 }
 
 static inline void
 lapic_disable(void)
 {
 
 	apic_ops.disable();
 }
 
 static inline void
 lapic_eoi(void)
 {
 
 	apic_ops.eoi();
 }
 
 static inline int
 lapic_id(void)
 {
 
 	return (apic_ops.id());
 }
 
 static inline int
 lapic_intr_pending(u_int vector)
 {
 
 	return (apic_ops.intr_pending(vector));
 }
 
 /* XXX: UNUSED */
 static inline void
 lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
 {
 
 	apic_ops.set_logical_id(apic_id, cluster, cluster_id);
 }
 
 static inline u_int
 apic_cpuid(u_int apic_id)
 {
 
 	return (apic_ops.cpuid(apic_id));
 }
 
 static inline u_int
 apic_alloc_vector(u_int apic_id, u_int irq)
 {
 
 	return (apic_ops.alloc_vector(apic_id, irq));
 }
 
 static inline u_int
 apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
 {
 
 	return (apic_ops.alloc_vectors(apic_id, irqs, count, align));
 }
 
 static inline void
 apic_enable_vector(u_int apic_id, u_int vector)
 {
 
 	apic_ops.enable_vector(apic_id, vector);
 }
 
 static inline void
 apic_disable_vector(u_int apic_id, u_int vector)
 {
 
 	apic_ops.disable_vector(apic_id, vector);
 }
 
 static inline void
 apic_free_vector(u_int apic_id, u_int vector, u_int irq)
 {
 
 	apic_ops.free_vector(apic_id, vector, irq);
 }
 
 static inline int
 lapic_enable_pmc(void)
 {
 
 	return (apic_ops.enable_pmc());
 }
 
 static inline void
 lapic_disable_pmc(void)
 {
 
 	apic_ops.disable_pmc();
 }
 
 static inline void
 lapic_reenable_pmc(void)
 {
 
 	apic_ops.reenable_pmc();
 }
 
 static inline void
 lapic_enable_cmc(void)
 {
 
 	apic_ops.enable_cmc();
 }
 
 static inline int
 lapic_enable_mca_elvt(void)
 {
 
 	return (apic_ops.enable_mca_elvt());
 }
 
 static inline void
 lapic_ipi_raw(register_t icrlo, u_int dest)
 {
 
 	apic_ops.ipi_raw(icrlo, dest);
 }
 
 static inline void
 lapic_ipi_vectored(u_int vector, int dest)
 {
 
 	apic_ops.ipi_vectored(vector, dest);
 }
 
 static inline int
 lapic_ipi_wait(int delay)
 {
 
 	return (apic_ops.ipi_wait(delay));
 }
 
 static inline int
 lapic_ipi_alloc(inthand_t *ipifunc)
 {
 
 	return (apic_ops.ipi_alloc(ipifunc));
 }
 
 static inline void
 lapic_ipi_free(int vector)
 {
 
 	return (apic_ops.ipi_free(vector));
 }
 
 static inline int
 lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked)
 {
 
 	return (apic_ops.set_lvt_mask(apic_id, lvt, masked));
 }
 
 static inline int
 lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode)
 {
 
 	return (apic_ops.set_lvt_mode(apic_id, lvt, mode));
 }
 
 static inline int
 lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
 {
 
 	return (apic_ops.set_lvt_polarity(apic_id, lvt, pol));
 }
 
 static inline int
 lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger)
 {
 
 	return (apic_ops.set_lvt_triggermode(apic_id, lvt, trigger));
 }
 
 void	lapic_handle_cmc(void);
 void	lapic_handle_error(void);
 void	lapic_handle_intr(int vector, struct trapframe *frame);
 void	lapic_handle_timer(struct trapframe *frame);
 
 int	ioapic_get_rid(u_int apic_id, uint16_t *ridp);
 
 extern int x2apic_mode;
 extern int lapic_eoi_suppression;
 
 #ifdef _SYS_SYSCTL_H_
 SYSCTL_DECL(_hw_apic);
 #endif
 
 #endif /* !LOCORE */
 #endif /* _X86_APICVAR_H_ */
Index: head/sys/x86/include/x86_smp.h
===================================================================
--- head/sys/x86/include/x86_smp.h	(revision 363526)
+++ head/sys/x86/include/x86_smp.h	(revision 363527)
@@ -1,112 +1,114 @@
 /*-
  * SPDX-License-Identifier: Beerware
  *
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.org> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
  * $FreeBSD$
  *
  */
 
 #ifndef _X86_X86_SMP_H_
 #define	_X86_X86_SMP_H_
 
 #include <sys/bus.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #include <machine/pcb.h>
 
 struct pmap;
 
 /* global data in mp_x86.c */
 extern int mp_naps;
 extern int boot_cpu_id;
 extern struct pcb stoppcbs[];
 extern int cpu_apic_ids[];
 extern int bootAP;
 extern void *dpcpu;
 extern char *bootSTK;
 extern void *bootstacks[];
 extern unsigned int boot_address;
 extern unsigned int bootMP_size;
 extern volatile int aps_ready;
 extern struct mtx ap_boot_mtx;
 extern int cpu_logical;
 extern int cpu_cores;
 extern volatile uint32_t smp_tlb_generation;
 extern struct pmap *smp_tlb_pmap;
 extern vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
 extern u_int xhits_gbl[];
 extern u_int xhits_pg[];
 extern u_int xhits_rng[];
 extern u_int ipi_global;
 extern u_int ipi_page;
 extern u_int ipi_range;
 extern u_int ipi_range_size;
 
 extern int nmi_kdb_lock;
 extern int nmi_is_broadcast;
 
 struct cpu_info {
 	int	cpu_present:1;
 	int	cpu_bsp:1;
 	int	cpu_disabled:1;
 	int	cpu_hyperthread:1;
 };
 extern struct cpu_info *cpu_info;
 
 /*
  * Set if MWAIT does not reliably wake when the MONITORed address is written.
  */
 extern bool mwait_cpustop_broken;
 
 #ifdef COUNT_IPIS
 extern u_long *ipi_invltlb_counts[MAXCPU];
 extern u_long *ipi_invlrng_counts[MAXCPU];
 extern u_long *ipi_invlpg_counts[MAXCPU];
 extern u_long *ipi_invlcache_counts[MAXCPU];
 extern u_long *ipi_rendezvous_counts[MAXCPU];
 #endif
 
 /* IPI handlers */
 inthand_t
 	IDTVEC(ipi_intr_bitmap_handler), /* Bitmap based IPIs */ 
+	IDTVEC(ipi_swi),	/* Runs delayed SWI */
 	IDTVEC(cpustop),	/* CPU stops & waits to be restarted */
 	IDTVEC(cpususpend),	/* CPU suspends & waits to be resumed */
 	IDTVEC(rendezvous);	/* handle CPU rendezvous */
 
 typedef void (*smp_invl_cb_t)(struct pmap *, vm_offset_t addr1,
     vm_offset_t addr2);
 
 /* functions in x86_mp.c */
 void	assign_cpu_ids(void);
 void	cpu_add(u_int apic_id, char boot_cpu);
 void	cpustop_handler(void);
 void	cpususpend_handler(void);
 void	alloc_ap_trampoline(vm_paddr_t *physmap, unsigned int *physmap_idx);
 void	init_secondary_tail(void);
 void	init_secondary(void);
 void	ipi_startup(int apic_id, int vector);
 void	ipi_all_but_self(u_int ipi);
 void 	ipi_bitmap_handler(struct trapframe frame);
 void	ipi_cpu(int cpu, u_int ipi);
 int	ipi_nmi_handler(void);
+void	ipi_swi_handler(struct trapframe frame);
 void	ipi_selected(cpuset_t cpus, u_int ipi);
 void	ipi_self_from_nmi(u_int vector);
 void	set_interrupt_apic_ids(void);
 void	smp_cache_flush(smp_invl_cb_t curcpu_cb);
 void	smp_masked_invlpg(cpuset_t mask, vm_offset_t addr, struct pmap *pmap,
 	    smp_invl_cb_t curcpu_cb);
 void	smp_masked_invlpg_range(cpuset_t mask, vm_offset_t startva,
 	    vm_offset_t endva, struct pmap *pmap, smp_invl_cb_t curcpu_cb);
 void	smp_masked_invltlb(cpuset_t mask, struct pmap *pmap,
 	    smp_invl_cb_t curcpu_cb);
 void	mem_range_AP_init(void);
 void	topo_probe(void);
 
 #endif
Index: head/sys/x86/x86/mp_x86.c
===================================================================
--- head/sys/x86/x86/mp_x86.c	(revision 363526)
+++ head/sys/x86/x86/mp_x86.c	(revision 363527)
@@ -1,1669 +1,1680 @@
 /*-
  * Copyright (c) 1996, by Steve Passe
  * Copyright (c) 2003, by Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. The name of the developer may NOT be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifdef __i386__
 #include "opt_apic.h"
 #endif
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 #include "opt_pmap.h"
 #include "opt_sched.h"
 #include "opt_smp.h"
 #include "opt_stack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cons.h>	/* cngetc() */
 #include <sys/cpuset.h>
 #include <sys/csan.h>
 #ifdef GPROF 
 #include <sys/gmon.h>
 #endif
+#include <sys/interrupt.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 
 #include <x86/apicreg.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/psl.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/stack.h>
 #include <x86/ucode.h>
 
 static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items");
 
 /* lock region used by kernel profiling */
 int	mcount_lock;
 
 int	mp_naps;		/* # of Applications processors */
 int	boot_cpu_id = -1;	/* designated BSP */
 
 /* AP uses this during bootstrap.  Do not staticize.  */
 char *bootSTK;
 int bootAP;
 
 /* Free these after use */
 void *bootstacks[MAXCPU];
 void *dpcpu;
 
 struct pcb stoppcbs[MAXCPU];
 struct susppcb **susppcbs;
 
 #ifdef COUNT_IPIS
 /* Interrupt counts. */
 static u_long *ipi_preempt_counts[MAXCPU];
 static u_long *ipi_ast_counts[MAXCPU];
 u_long *ipi_invltlb_counts[MAXCPU];
 u_long *ipi_invlrng_counts[MAXCPU];
 u_long *ipi_invlpg_counts[MAXCPU];
 u_long *ipi_invlcache_counts[MAXCPU];
 u_long *ipi_rendezvous_counts[MAXCPU];
 static u_long *ipi_hardclock_counts[MAXCPU];
 #endif
 
 /* Default cpu_ops implementation. */
 struct cpu_ops cpu_ops;
 
 /*
  * Local data and functions.
  */
 
 static volatile cpuset_t ipi_stop_nmi_pending;
 
 volatile cpuset_t resuming_cpus;
 volatile cpuset_t toresume_cpus;
 
 /* used to hold the AP's until we are ready to release them */
 struct mtx ap_boot_mtx;
 
 /* Set to 1 once we're ready to let the APs out of the pen. */
 volatile int aps_ready = 0;
 
 /*
  * Store data from cpu_add() until later in the boot when we actually setup
  * the APs.
  */
 struct cpu_info *cpu_info;
 int *apic_cpuids;
 int cpu_apic_ids[MAXCPU];
 _Static_assert(MAXCPU <= MAX_APIC_ID,
     "MAXCPU cannot be larger that MAX_APIC_ID");
 _Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID,
     "xAPIC_MAX_APIC_ID cannot be larger that MAX_APIC_ID");
 
 static void	release_aps(void *dummy);
 static void	cpustop_handler_post(u_int cpu);
 
 static int	hyperthreading_allowed = 1;
 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
 	&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
 
 static int	hyperthreading_intr_allowed = 0;
 SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN,
 	&hyperthreading_intr_allowed, 0,
 	"Allow interrupts on HTT logical CPUs");
 
 static struct topo_node topo_root;
 
 static int pkg_id_shift;
 static int node_id_shift;
 static int core_id_shift;
 static int disabled_cpus;
 
 struct cache_info {
 	int	id_shift;
 	int	present;
 } static caches[MAX_CACHE_LEVELS];
 
 unsigned int boot_address;
 
 static bool stop_mwait = false;
 SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0,
     "Use MONITOR/MWAIT when stopping CPU, if available");
 
 #define MiB(v)	(v ## ULL << 20)
 
 void
 mem_range_AP_init(void)
 {
 
 	if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
 		mem_range_softc.mr_op->initAP(&mem_range_softc);
 }
 
 /*
  * Round up to the next power of two, if necessary, and then
  * take log2.
  * Returns -1 if argument is zero.
  */
 static __inline int
 mask_width(u_int x)
 {
 
 	return (fls(x << (1 - powerof2(x))) - 1);
 }
 
 /*
  * Add a cache level to the cache topology description.
  */
 static int
 add_deterministic_cache(int type, int level, int share_count)
 {
 
 	if (type == 0)
 		return (0);
 	if (type > 3) {
 		printf("unexpected cache type %d\n", type);
 		return (1);
 	}
 	if (type == 2) /* ignore instruction cache */
 		return (1);
 	if (level == 0 || level > MAX_CACHE_LEVELS) {
 		printf("unexpected cache level %d\n", type);
 		return (1);
 	}
 
 	if (caches[level - 1].present) {
 		printf("WARNING: multiple entries for L%u data cache\n", level);
 		printf("%u => %u\n", caches[level - 1].id_shift,
 		    mask_width(share_count));
 	}
 	caches[level - 1].id_shift = mask_width(share_count);
 	caches[level - 1].present = 1;
 
 	if (caches[level - 1].id_shift > pkg_id_shift) {
 		printf("WARNING: L%u data cache covers more "
 		    "APIC IDs than a package (%u > %u)\n", level,
 		    caches[level - 1].id_shift, pkg_id_shift);
 		caches[level - 1].id_shift = pkg_id_shift;
 	}
 	if (caches[level - 1].id_shift < core_id_shift) {
 		printf("WARNING: L%u data cache covers fewer "
 		    "APIC IDs than a core (%u < %u)\n", level,
 		    caches[level - 1].id_shift, core_id_shift);
 		caches[level - 1].id_shift = core_id_shift;
 	}
 
 	return (1);
 }
 
 /*
  * Determine topology of processing units and caches for AMD CPUs.
  * See:
  *  - AMD CPUID Specification (Publication # 25481)
  *  - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
  *  - BKDG For AMD Family 10h Processors (Publication # 31116)
  *  - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
  *  - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
  *  - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
  */
 static void
 topo_probe_amd(void)
 {
 	u_int p[4];
 	uint64_t v;
 	int level;
 	int nodes_per_socket;
 	int share_count;
 	int type;
 	int i;
 
 	/* No multi-core capability. */
 	if ((amd_feature2 & AMDID2_CMP) == 0)
 		return;
 
 	/* For families 10h and newer. */
 	pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
 	    AMDID_COREID_SIZE_SHIFT;
 
 	/* For 0Fh family. */
 	if (pkg_id_shift == 0)
 		pkg_id_shift =
 		    mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
 
 	/*
 	 * Families prior to 16h define the following value as
 	 * cores per compute unit and we don't really care about the AMD
 	 * compute units at the moment.  Perhaps we should treat them as
 	 * cores and cores within the compute units as hardware threads,
 	 * but that's up for debate.
 	 * Later families define the value as threads per compute unit,
 	 * so we are following AMD's nomenclature here.
 	 */
 	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
 	    CPUID_TO_FAMILY(cpu_id) >= 0x16) {
 		cpuid_count(0x8000001e, 0, p);
 		share_count = ((p[1] >> 8) & 0xff) + 1;
 		core_id_shift = mask_width(share_count);
 
 		/*
 		 * For Zen (17h), gather Nodes per Processor.  Each node is a
 		 * Zeppelin die; TR and EPYC CPUs will have multiple dies per
 		 * package.  Communication latency between dies is higher than
 		 * within them.
 		 */
 		nodes_per_socket = ((p[2] >> 8) & 0x7) + 1;
 		node_id_shift = pkg_id_shift - mask_width(nodes_per_socket);
 	}
 
 	if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
 		for (i = 0; ; i++) {
 			cpuid_count(0x8000001d, i, p);
 			type = p[0] & 0x1f;
 			level = (p[0] >> 5) & 0x7;
 			share_count = 1 + ((p[0] >> 14) & 0xfff);
 
 			if (!add_deterministic_cache(type, level, share_count))
 				break;
 		}
 	} else {
 		if (cpu_exthigh >= 0x80000005) {
 			cpuid_count(0x80000005, 0, p);
 			if (((p[2] >> 24) & 0xff) != 0) {
 				caches[0].id_shift = 0;
 				caches[0].present = 1;
 			}
 		}
 		if (cpu_exthigh >= 0x80000006) {
 			cpuid_count(0x80000006, 0, p);
 			if (((p[2] >> 16) & 0xffff) != 0) {
 				caches[1].id_shift = 0;
 				caches[1].present = 1;
 			}
 			if (((p[3] >> 18) & 0x3fff) != 0) {
 				nodes_per_socket = 1;
 				if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
 					/*
 					 * Handle multi-node processors that
 					 * have multiple chips, each with its
 					 * own L3 cache, on the same die.
 					 */
 					v = rdmsr(0xc001100c);
 					nodes_per_socket = 1 + ((v >> 3) & 0x7);
 				}
 				caches[2].id_shift =
 				    pkg_id_shift - mask_width(nodes_per_socket);
 				caches[2].present = 1;
 			}
 		}
 	}
 }
 
 /*
  * Determine topology of processing units for Intel CPUs
  * using CPUID Leaf 1 and Leaf 4, if supported.
  * See:
  *  - Intel 64 Architecture Processor Topology Enumeration
  *  - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual,
  *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
  *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
  */
 static void
 topo_probe_intel_0x4(void)
 {
 	u_int p[4];
 	int max_cores;
 	int max_logical;
 
 	/* Both zero and one here mean one logical processor per package. */
 	max_logical = (cpu_feature & CPUID_HTT) != 0 ?
 	    (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
 	if (max_logical <= 1)
 		return;
 
 	if (cpu_high >= 0x4) {
 		cpuid_count(0x04, 0, p);
 		max_cores = ((p[0] >> 26) & 0x3f) + 1;
 	} else
 		max_cores = 1;
 
 	core_id_shift = mask_width(max_logical/max_cores);
 	KASSERT(core_id_shift >= 0,
 	    ("intel topo: max_cores > max_logical\n"));
 	pkg_id_shift = core_id_shift + mask_width(max_cores);
 }
 
 /*
  * Determine topology of processing units for Intel CPUs
  * using CPUID Leaf 11, if supported.
  * See:
  *  - Intel 64 Architecture Processor Topology Enumeration
  *  - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual,
  *    Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
  *    FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
  */
 static void
 topo_probe_intel_0xb(void)
 {
 	u_int p[4];
 	int bits;
 	int type;
 	int i;
 
 	/* Fall back if CPU leaf 11 doesn't really exist. */
 	cpuid_count(0x0b, 0, p);
 	if (p[1] == 0) {
 		topo_probe_intel_0x4();
 		return;
 	}
 
 	/* We only support three levels for now. */
 	for (i = 0; ; i++) {
 		cpuid_count(0x0b, i, p);
 
 		bits = p[0] & 0x1f;
 		type = (p[2] >> 8) & 0xff;
 
 		if (type == 0)
 			break;
 
 		/* TODO: check for duplicate (re-)assignment */
 		if (type == CPUID_TYPE_SMT)
 			core_id_shift = bits;
 		else if (type == CPUID_TYPE_CORE)
 			pkg_id_shift = bits;
 		else
 			printf("unknown CPU level type %d\n", type);
 	}
 
 	if (pkg_id_shift < core_id_shift) {
 		printf("WARNING: core covers more APIC IDs than a package\n");
 		core_id_shift = pkg_id_shift;
 	}
 }
 
 /*
  * Determine topology of caches for Intel CPUs.
  * See:
  *  - Intel 64 Architecture Processor Topology Enumeration
  *  - Intel 64 and IA-32 Architectures Software Developer’s Manual
  *    Volume 2A: Instruction Set Reference, A-M,
  *    CPUID instruction
  */
 static void
 topo_probe_intel_caches(void)
 {
 	u_int p[4];
 	int level;
 	int share_count;
 	int type;
 	int i;
 
 	if (cpu_high < 0x4) {
 		/*
 		 * Available cache level and sizes can be determined
 		 * via CPUID leaf 2, but that requires a huge table of hardcoded
 		 * values, so for now just assume L1 and L2 caches potentially
 		 * shared only by HTT processing units, if HTT is present.
 		 */
 		caches[0].id_shift = pkg_id_shift;
 		caches[0].present = 1;
 		caches[1].id_shift = pkg_id_shift;
 		caches[1].present = 1;
 		return;
 	}
 
 	for (i = 0; ; i++) {
 		cpuid_count(0x4, i, p);
 		type = p[0] & 0x1f;
 		level = (p[0] >> 5) & 0x7;
 		share_count = 1 + ((p[0] >> 14) & 0xfff);
 
 		if (!add_deterministic_cache(type, level, share_count))
 			break;
 	}
 }
 
 /*
  * Determine topology of processing units and caches for Intel CPUs.
  * See:
  *  - Intel 64 Architecture Processor Topology Enumeration
  */
 static void
 topo_probe_intel(void)
 {
 
 	/*
 	 * Note that 0x1 <= cpu_high < 4 case should be
 	 * compatible with topo_probe_intel_0x4() logic when
 	 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
 	 * or it should trigger the fallback otherwise.
 	 */
 	if (cpu_high >= 0xb)
 		topo_probe_intel_0xb();
 	else if (cpu_high >= 0x1)
 		topo_probe_intel_0x4();
 
 	topo_probe_intel_caches();
 }
 
 /*
  * Topology information is queried only on BSP, on which this
  * code runs and for which it can query CPUID information.
  * Then topology is extrapolated on all packages using an
  * assumption that APIC ID to hardware component ID mapping is
  * homogenious.
  * That doesn't necesserily imply that the topology is uniform.
  */
 void
 topo_probe(void)
 {
 	static int cpu_topo_probed = 0;
 	struct x86_topo_layer {
 		int type;
 		int subtype;
 		int id_shift;
 	} topo_layers[MAX_CACHE_LEVELS + 4];
 	struct topo_node *parent;
 	struct topo_node *node;
 	int layer;
 	int nlayers;
 	int node_id;
 	int i;
 
 	if (cpu_topo_probed)
 		return;
 
 	CPU_ZERO(&logical_cpus_mask);
 
 	if (mp_ncpus <= 1)
 		; /* nothing */
 	else if (cpu_vendor_id == CPU_VENDOR_AMD ||
 	    cpu_vendor_id == CPU_VENDOR_HYGON)
 		topo_probe_amd();
 	else if (cpu_vendor_id == CPU_VENDOR_INTEL)
 		topo_probe_intel();
 
 	KASSERT(pkg_id_shift >= core_id_shift,
 	    ("bug in APIC topology discovery"));
 
 	nlayers = 0;
 	bzero(topo_layers, sizeof(topo_layers));
 
 	topo_layers[nlayers].type = TOPO_TYPE_PKG;
 	topo_layers[nlayers].id_shift = pkg_id_shift;
 	if (bootverbose)
 		printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
 	nlayers++;
 
 	if (pkg_id_shift > node_id_shift && node_id_shift != 0) {
 		topo_layers[nlayers].type = TOPO_TYPE_GROUP;
 		topo_layers[nlayers].id_shift = node_id_shift;
 		if (bootverbose)
 			printf("Node ID shift: %u\n",
 			    topo_layers[nlayers].id_shift);
 		nlayers++;
 	}
 
 	/*
 	 * Consider all caches to be within a package/chip
 	 * and "in front" of all sub-components like
 	 * cores and hardware threads.
 	 */
 	for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
 		if (caches[i].present) {
 			if (node_id_shift != 0)
 				KASSERT(caches[i].id_shift <= node_id_shift,
 					("bug in APIC topology discovery"));
 			KASSERT(caches[i].id_shift <= pkg_id_shift,
 				("bug in APIC topology discovery"));
 			KASSERT(caches[i].id_shift >= core_id_shift,
 				("bug in APIC topology discovery"));
 
 			topo_layers[nlayers].type = TOPO_TYPE_CACHE;
 			topo_layers[nlayers].subtype = i + 1;
 			topo_layers[nlayers].id_shift = caches[i].id_shift;
 			if (bootverbose)
 				printf("L%u cache ID shift: %u\n",
 				    topo_layers[nlayers].subtype,
 				    topo_layers[nlayers].id_shift);
 			nlayers++;
 		}
 	}
 
 	if (pkg_id_shift > core_id_shift) {
 		topo_layers[nlayers].type = TOPO_TYPE_CORE;
 		topo_layers[nlayers].id_shift = core_id_shift;
 		if (bootverbose)
 			printf("Core ID shift: %u\n",
 			    topo_layers[nlayers].id_shift);
 		nlayers++;
 	}
 
 	topo_layers[nlayers].type = TOPO_TYPE_PU;
 	topo_layers[nlayers].id_shift = 0;
 	nlayers++;
 
 	topo_init_root(&topo_root);
 	for (i = 0; i <= max_apic_id; ++i) {
 		if (!cpu_info[i].cpu_present)
 			continue;
 
 		parent = &topo_root;
 		for (layer = 0; layer < nlayers; ++layer) {
 			node_id = i >> topo_layers[layer].id_shift;
 			parent = topo_add_node_by_hwid(parent, node_id,
 			    topo_layers[layer].type,
 			    topo_layers[layer].subtype);
 		}
 	}
 
 	parent = &topo_root;
 	for (layer = 0; layer < nlayers; ++layer) {
 		node_id = boot_cpu_id >> topo_layers[layer].id_shift;
 		node = topo_find_node_by_hwid(parent, node_id,
 		    topo_layers[layer].type,
 		    topo_layers[layer].subtype);
 		topo_promote_child(node);
 		parent = node;
 	}
 
 	cpu_topo_probed = 1;
 }
 
 /*
  * Assign logical CPU IDs to local APICs.
  */
 void
 assign_cpu_ids(void)
 {
 	struct topo_node *node;
 	u_int smt_mask;
 	int nhyper;
 
 	smt_mask = (1u << core_id_shift) - 1;
 
 	/*
 	 * Assign CPU IDs to local APIC IDs and disable any CPUs
 	 * beyond MAXCPU.  CPU 0 is always assigned to the BSP.
 	 */
 	mp_ncpus = 0;
 	nhyper = 0;
 	TOPO_FOREACH(node, &topo_root) {
 		if (node->type != TOPO_TYPE_PU)
 			continue;
 
 		if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
 			cpu_info[node->hwid].cpu_hyperthread = 1;
 
 		if (resource_disabled("lapic", node->hwid)) {
 			if (node->hwid != boot_cpu_id)
 				cpu_info[node->hwid].cpu_disabled = 1;
 			else
 				printf("Cannot disable BSP, APIC ID = %d\n",
 				    node->hwid);
 		}
 
 		if (!hyperthreading_allowed &&
 		    cpu_info[node->hwid].cpu_hyperthread)
 			cpu_info[node->hwid].cpu_disabled = 1;
 
 		if (mp_ncpus >= MAXCPU)
 			cpu_info[node->hwid].cpu_disabled = 1;
 
 		if (cpu_info[node->hwid].cpu_disabled) {
 			disabled_cpus++;
 			continue;
 		}
 
 		if (cpu_info[node->hwid].cpu_hyperthread)
 			nhyper++;
 
 		cpu_apic_ids[mp_ncpus] = node->hwid;
 		apic_cpuids[node->hwid] = mp_ncpus;
 		topo_set_pu_id(node, mp_ncpus);
 		mp_ncpus++;
 	}
 
 	KASSERT(mp_maxid >= mp_ncpus - 1,
 	    ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
 	    mp_ncpus));
 
 	mp_ncores = mp_ncpus - nhyper;
 	smp_threads_per_core = mp_ncpus / mp_ncores;
 }
 
 /*
  * Print various information about the SMP system hardware and setup.
  */
 void
 cpu_mp_announce(void)
 {
 	struct topo_node *node;
 	const char *hyperthread;
 	struct topo_analysis topology;
 
 	printf("FreeBSD/SMP: ");
 	if (topo_analyze(&topo_root, 1, &topology)) {
 		printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]);
 		if (topology.entities[TOPO_LEVEL_GROUP] > 1)
 			printf(" x %d groups",
 			    topology.entities[TOPO_LEVEL_GROUP]);
 		if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
 			printf(" x %d cache groups",
 			    topology.entities[TOPO_LEVEL_CACHEGROUP]);
 		if (topology.entities[TOPO_LEVEL_CORE] > 0)
 			printf(" x %d core(s)",
 			    topology.entities[TOPO_LEVEL_CORE]);
 		if (topology.entities[TOPO_LEVEL_THREAD] > 1)
 			printf(" x %d hardware threads",
 			    topology.entities[TOPO_LEVEL_THREAD]);
 	} else {
 		printf("Non-uniform topology");
 	}
 	printf("\n");
 
 	if (disabled_cpus) {
 		printf("FreeBSD/SMP Online: ");
 		if (topo_analyze(&topo_root, 0, &topology)) {
 			printf("%d package(s)",
 			    topology.entities[TOPO_LEVEL_PKG]);
 			if (topology.entities[TOPO_LEVEL_GROUP] > 1)
 				printf(" x %d groups",
 				    topology.entities[TOPO_LEVEL_GROUP]);
 			if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
 				printf(" x %d cache groups",
 				    topology.entities[TOPO_LEVEL_CACHEGROUP]);
 			if (topology.entities[TOPO_LEVEL_CORE] > 0)
 				printf(" x %d core(s)",
 				    topology.entities[TOPO_LEVEL_CORE]);
 			if (topology.entities[TOPO_LEVEL_THREAD] > 1)
 				printf(" x %d hardware threads",
 				    topology.entities[TOPO_LEVEL_THREAD]);
 		} else {
 			printf("Non-uniform topology");
 		}
 		printf("\n");
 	}
 
 	if (!bootverbose)
 		return;
 
 	TOPO_FOREACH(node, &topo_root) {
 		switch (node->type) {
 		case TOPO_TYPE_PKG:
 			printf("Package HW ID = %u\n", node->hwid);
 			break;
 		case TOPO_TYPE_CORE:
 			printf("\tCore HW ID = %u\n", node->hwid);
 			break;
 		case TOPO_TYPE_PU:
 			if (cpu_info[node->hwid].cpu_hyperthread)
 				hyperthread = "/HT";
 			else
 				hyperthread = "";
 
 			if (node->subtype == 0)
 				printf("\t\tCPU (AP%s): APIC ID: %u"
 				    "(disabled)\n", hyperthread, node->hwid);
 			else if (node->id == 0)
 				printf("\t\tCPU0 (BSP): APIC ID: %u\n",
 				    node->hwid);
 			else
 				printf("\t\tCPU%u (AP%s): APIC ID: %u\n",
 				    node->id, hyperthread, node->hwid);
 			break;
 		default:
 			/* ignored */
 			break;
 		}
 	}
 }
 
 /*
  * Add a scheduling group, a group of logical processors sharing
  * a particular cache (and, thus having an affinity), to the scheduling
  * topology.
  * This function recursively works on lower level caches.
  */
 static void
 x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
 {
 	struct topo_node *node;
 	int nchildren;
 	int ncores;
 	int i;
 
 	KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE ||
 	    root->type == TOPO_TYPE_GROUP,
 	    ("x86topo_add_sched_group: bad type: %u", root->type));
 	CPU_COPY(&root->cpuset, &cg_root->cg_mask);
 	cg_root->cg_count = root->cpu_count;
 	if (root->type == TOPO_TYPE_SYSTEM)
 		cg_root->cg_level = CG_SHARE_NONE;
 	else
 		cg_root->cg_level = root->subtype;
 
 	/*
 	 * Check how many core nodes we have under the given root node.
 	 * If we have multiple logical processors, but not multiple
 	 * cores, then those processors must be hardware threads.
 	 */
 	ncores = 0;
 	node = root;
 	while (node != NULL) {
 		if (node->type != TOPO_TYPE_CORE) {
 			node = topo_next_node(root, node);
 			continue;
 		}
 
 		ncores++;
 		node = topo_next_nonchild_node(root, node);
 	}
 
 	if (cg_root->cg_level != CG_SHARE_NONE &&
 	    root->cpu_count > 1 && ncores < 2)
 		cg_root->cg_flags = CG_FLAG_SMT;
 
 	/*
 	 * Find out how many cache nodes we have under the given root node.
 	 * We ignore cache nodes that cover all the same processors as the
 	 * root node.  Also, we do not descend below found cache nodes.
 	 * That is, we count top-level "non-redundant" caches under the root
 	 * node.
 	 */
 	nchildren = 0;
 	node = root;
 	while (node != NULL) {
 		if ((node->type != TOPO_TYPE_GROUP &&
 		    node->type != TOPO_TYPE_CACHE) ||
 		    (root->type != TOPO_TYPE_SYSTEM &&
 		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
 			node = topo_next_node(root, node);
 			continue;
 		}
 		nchildren++;
 		node = topo_next_nonchild_node(root, node);
 	}
 
 	cg_root->cg_child = smp_topo_alloc(nchildren);
 	cg_root->cg_children = nchildren;
 
 	/*
 	 * Now find again the same cache nodes as above and recursively
 	 * build scheduling topologies for them.
 	 */
 	node = root;
 	i = 0;
 	while (node != NULL) {
 		if ((node->type != TOPO_TYPE_GROUP &&
 		    node->type != TOPO_TYPE_CACHE) ||
 		    (root->type != TOPO_TYPE_SYSTEM &&
 		    CPU_CMP(&node->cpuset, &root->cpuset) == 0)) {
 			node = topo_next_node(root, node);
 			continue;
 		}
 		cg_root->cg_child[i].cg_parent = cg_root;
 		x86topo_add_sched_group(node, &cg_root->cg_child[i]);
 		i++;
 		node = topo_next_nonchild_node(root, node);
 	}
 }
 
 /*
  * Build the MI scheduling topology from the discovered hardware topology.
  */
 struct cpu_group *
 cpu_topo(void)
 {
 	struct cpu_group *cg_root;
 
 	if (mp_ncpus <= 1)
 		return (smp_topo_none());
 
 	cg_root = smp_topo_alloc(1);
 	x86topo_add_sched_group(&topo_root, cg_root);
 	return (cg_root);
 }
 
 static void
 cpu_alloc(void *dummy __unused)
 {
 	/*
 	 * Dynamically allocate the arrays that depend on the
 	 * maximum APIC ID.
 	 */
 	cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS,
 	    M_WAITOK | M_ZERO);
 	apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS,
 	    M_WAITOK | M_ZERO);
 }
 SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL);
 
 /*
  * Add a logical CPU to the topology.
  */
 void
 cpu_add(u_int apic_id, char boot_cpu)
 {
 
 	if (apic_id > max_apic_id) {
 		panic("SMP: APIC ID %d too high", apic_id);
 		return;
 	}
 	KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice",
 	    apic_id));
 	cpu_info[apic_id].cpu_present = 1;
 	if (boot_cpu) {
 		KASSERT(boot_cpu_id == -1,
 		    ("CPU %u claims to be BSP, but CPU %u already is", apic_id,
 		    boot_cpu_id));
 		boot_cpu_id = apic_id;
 		cpu_info[apic_id].cpu_bsp = 1;
 	}
 	if (bootverbose)
 		printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" :
 		    "AP");
 }
 
 void
 cpu_mp_setmaxid(void)
 {
 
 	/*
 	 * mp_ncpus and mp_maxid should be already set by calls to cpu_add().
 	 * If there were no calls to cpu_add() assume this is a UP system.
 	 */
 	if (mp_ncpus == 0)
 		mp_ncpus = 1;
 }
 
 int
 cpu_mp_probe(void)
 {
 
 	/*
 	 * Always record BSP in CPU map so that the mbuf init code works
 	 * correctly.
 	 */
 	CPU_SETOF(0, &all_cpus);
 	return (mp_ncpus > 1);
 }
 
 /* Allocate memory for the AP trampoline. */
 void
 alloc_ap_trampoline(vm_paddr_t *physmap, unsigned int *physmap_idx)
 {
 	unsigned int i;
 	bool allocated;
 
 	allocated = false;
 	for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
 		/*
 		 * Find a memory region big enough and below the 1MB boundary
 		 * for the trampoline code.
 		 * NB: needs to be page aligned.
 		 */
 		if (physmap[i] >= MiB(1) ||
 		    (trunc_page(physmap[i + 1]) - round_page(physmap[i])) <
 		    round_page(bootMP_size))
 			continue;
 
 		allocated = true;
 		/*
 		 * Try to steal from the end of the region to mimic previous
 		 * behaviour, else fallback to steal from the start.
 		 */
 		if (physmap[i + 1] < MiB(1)) {
 			boot_address = trunc_page(physmap[i + 1]);
 			if ((physmap[i + 1] - boot_address) < bootMP_size)
 				boot_address -= round_page(bootMP_size);
 			physmap[i + 1] = boot_address;
 		} else {
 			boot_address = round_page(physmap[i]);
 			physmap[i] = boot_address + round_page(bootMP_size);
 		}
 		if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
 			memmove(&physmap[i], &physmap[i + 2],
 			    sizeof(*physmap) * (*physmap_idx - i + 2));
 			*physmap_idx -= 2;
 		}
 		break;
 	}
 
 	if (!allocated) {
 		boot_address = basemem * 1024 - bootMP_size;
 		if (bootverbose)
 			printf(
 "Cannot find enough space for the boot trampoline, placing it at %#x",
 			    boot_address);
 	}
 }
 
 /*
  * AP CPU's call this to initialize themselves.
  */
 void
 init_secondary_tail(void)
 {
 	u_int cpuid;
 
 	pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));
 
 	/*
 	 * On real hardware, switch to x2apic mode if possible.  Do it
 	 * after aps_ready was signalled, to avoid manipulating the
 	 * mode while BSP might still want to send some IPI to us
 	 * (second startup IPI is ignored on modern hardware etc).
 	 */
 	lapic_xapic_mode();
 
 	/* Initialize the PAT MSR. */
 	pmap_init_pat();
 
 	/* set up CPU registers and state */
 	cpu_setregs();
 
 	/* set up SSE/NX */
 	initializecpu();
 
 	/* set up FPU state on the AP */
 #ifdef __amd64__
 	fpuinit();
 #else
 	npxinit(false);
 #endif
 
 	if (cpu_ops.cpu_init)
 		cpu_ops.cpu_init();
 
 	/* A quick check from sanity claus */
 	cpuid = PCPU_GET(cpuid);
 	if (PCPU_GET(apic_id) != lapic_id()) {
 		printf("SMP: cpuid = %d\n", cpuid);
 		printf("SMP: actual apic_id = %d\n", lapic_id());
 		printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
 		panic("cpuid mismatch! boom!!");
 	}
 
 	/* Initialize curthread. */
 	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
 	PCPU_SET(curthread, PCPU_GET(idlethread));
 
 	mtx_lock_spin(&ap_boot_mtx);
 
 	mca_init();
 
 	/* Init local apic for irq's */
 	lapic_setup(1);
 
 	/* Set memory range attributes for this CPU to match the BSP */
 	mem_range_AP_init();
 
 	smp_cpus++;
 
 	CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
 	if (bootverbose)
 		printf("SMP: AP CPU #%d Launched!\n", cpuid);
 	else
 		printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "",
 		    cpuid, smp_cpus == mp_ncpus ? "\n" : " ");
 
 	/* Determine if we are a logical CPU. */
 	if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
 		CPU_SET(cpuid, &logical_cpus_mask);
 
 	if (bootverbose)
 		lapic_dump("AP");
 
 	if (smp_cpus == mp_ncpus) {
 		/* enable IPI's, tlb shootdown, freezes etc */
 		atomic_store_rel_int(&smp_started, 1);
 	}
 
 #ifdef __amd64__
 	/*
 	 * Enable global pages TLB extension
 	 * This also implicitly flushes the TLB 
 	 */
 	load_cr4(rcr4() | CR4_PGE);
 	if (pmap_pcid_enabled)
 		load_cr4(rcr4() | CR4_PCIDE);
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_ufssel);
 #endif
 
 	mtx_unlock_spin(&ap_boot_mtx);
 
 	/* Wait until all the AP's are up. */
 	while (atomic_load_acq_int(&smp_started) == 0)
 		ia32_pause();
 
 #ifndef EARLY_AP_STARTUP
 	/* Start per-CPU event timers. */
 	cpu_initclocks_ap();
 #endif
 
 	kcsan_cpu_init(cpuid);
 
 	/*
 	 * Assert that smp_after_idle_runnable condition is reasonable.
 	 */
 	MPASS(PCPU_GET(curpcb) == NULL);
 
 	sched_throw(NULL);
 
 	panic("scheduler returned us to %s", __func__);
 	/* NOTREACHED */
 }
 
 static void
 smp_after_idle_runnable(void *arg __unused)
 {
 	struct pcpu *pc;
 	int cpu;
 
 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
 		pc = pcpu_find(cpu);
 		while (atomic_load_ptr(&pc->pc_curpcb) == NULL)
 			cpu_spinwait();
 		kmem_free((vm_offset_t)bootstacks[cpu], kstack_pages *
 		    PAGE_SIZE);
 	}
 }
 SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
     smp_after_idle_runnable, NULL);
 
 /*
  * We tell the I/O APIC code about all the CPUs we want to receive
  * interrupts.  If we don't want certain CPUs to receive IRQs we
  * can simply not tell the I/O APIC code about them in this function.
  * We also do not tell it about the BSP since it tells itself about
  * the BSP internally to work with UP kernels and on UP machines.
  */
 void
 set_interrupt_apic_ids(void)
 {
 	u_int i, apic_id;
 
 	for (i = 0; i < MAXCPU; i++) {
 		apic_id = cpu_apic_ids[i];
 		if (apic_id == -1)
 			continue;
 		if (cpu_info[apic_id].cpu_bsp)
 			continue;
 		if (cpu_info[apic_id].cpu_disabled)
 			continue;
 
 		/* Don't let hyperthreads service interrupts. */
 		if (cpu_info[apic_id].cpu_hyperthread &&
 		    !hyperthreading_intr_allowed)
 			continue;
 
 		intr_add_cpu(i);
 	}
 }
 
 
 #ifdef COUNT_XINVLTLB_HITS
 u_int xhits_gbl[MAXCPU];
 u_int xhits_pg[MAXCPU];
 u_int xhits_rng[MAXCPU];
 static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "");
 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
     sizeof(xhits_gbl), "IU", "");
 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
     sizeof(xhits_pg), "IU", "");
 SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
     sizeof(xhits_rng), "IU", "");
 
 u_int ipi_global;
 u_int ipi_page;
 u_int ipi_range;
 u_int ipi_range_size;
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
 SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
     0, "");
 #endif /* COUNT_XINVLTLB_HITS */
 
 /*
  * Init and startup IPI.
  */
 void
 ipi_startup(int apic_id, int vector)
 {
 
 	/*
 	 * This attempts to follow the algorithm described in the
 	 * Intel Multiprocessor Specification v1.4 in section B.4.
 	 * For each IPI, we allow the local APIC ~20us to deliver the
 	 * IPI.  If that times out, we panic.
 	 */
 
 	/*
 	 * first we do an INIT IPI: this INIT IPI might be run, resetting
 	 * and running the target CPU. OR this INIT IPI might be latched (P5
 	 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
 	 * ignored.
 	 */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
 	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
 	lapic_ipi_wait(100);
 
 	/* Explicitly deassert the INIT IPI. */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
 	    APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
 	    apic_id);
 
 	DELAY(10000);		/* wait ~10mS */
 
 	/*
 	 * next we do a STARTUP IPI: the previous INIT IPI might still be
 	 * latched, (P5 bug) this 1st STARTUP would then terminate
 	 * immediately, and the previously started INIT IPI would continue. OR
 	 * the previous INIT IPI has already run. and this STARTUP IPI will
 	 * run. OR the previous INIT IPI was ignored. and this STARTUP IPI
 	 * will run.
 	 */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
 	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
 	    vector, apic_id);
 	if (!lapic_ipi_wait(100))
 		panic("Failed to deliver first STARTUP IPI to APIC %d",
 		    apic_id);
 	DELAY(200);		/* wait ~200uS */
 
 	/*
 	 * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
 	 * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
 	 * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
 	 * recognized after hardware RESET or INIT IPI.
 	 */
 	lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
 	    APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
 	    vector, apic_id);
 	if (!lapic_ipi_wait(100))
 		panic("Failed to deliver second STARTUP IPI to APIC %d",
 		    apic_id);
 
 	DELAY(200);		/* wait ~200uS */
 }
 
 static bool
 ipi_bitmap_set(int cpu, u_int ipi)
 {
 	u_int bitmap, old, new;
 	u_int *cpu_bitmap;
 
 	bitmap = 1 << ipi;
 	cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap;
 	old = *cpu_bitmap;
 	for (;;) {
 		if ((old & bitmap) != 0)
 			break;
 		new = old | bitmap;
 		if (atomic_fcmpset_int(cpu_bitmap, &old, new))
 			break;
 	}
 	return (old != 0);
 }
 
 /*
  * Send an IPI to specified CPU handling the bitmap logic.
  */
 static void
 ipi_send_cpu(int cpu, u_int ipi)
 {
 
 	KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1,
 	    ("IPI to non-existent CPU %d", cpu));
 
 	if (IPI_IS_BITMAPED(ipi)) {
 		if (ipi_bitmap_set(cpu, ipi))
 			return;
 		ipi = IPI_BITMAP_VECTOR;
 	}
 	lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
 }
 
 void
 ipi_bitmap_handler(struct trapframe frame)
 {
 	struct trapframe *oldframe;
 	struct thread *td;
 	int cpu = PCPU_GET(cpuid);
 	u_int ipi_bitmap;
 
 	td = curthread;
 	ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]->
 	    pc_ipi_bitmap);
 
 	/*
 	 * sched_preempt() must be called to clear the pending preempt
 	 * IPI to enable delivery of further preempts.  However, the
 	 * critical section will cause extra scheduler lock thrashing
 	 * when used unconditionally.  Only critical_enter() if
 	 * hardclock must also run, which requires the section entry.
 	 */
 	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
 		critical_enter();
 
 	td->td_intr_nesting_level++;
 	oldframe = td->td_intr_frame;
 	td->td_intr_frame = &frame;
 #if defined(STACK) || defined(DDB)
 	if (ipi_bitmap & (1 << IPI_TRACE))
 		stack_capture_intr();
 #endif
 	if (ipi_bitmap & (1 << IPI_PREEMPT)) {
 #ifdef COUNT_IPIS
 		(*ipi_preempt_counts[cpu])++;
 #endif
 		sched_preempt(td);
 	}
 	if (ipi_bitmap & (1 << IPI_AST)) {
 #ifdef COUNT_IPIS
 		(*ipi_ast_counts[cpu])++;
 #endif
 		/* Nothing to do for AST */
 	}
 	if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
 #ifdef COUNT_IPIS
 		(*ipi_hardclock_counts[cpu])++;
 #endif
 		hardclockintr();
 	}
 	td->td_intr_frame = oldframe;
 	td->td_intr_nesting_level--;
 	if (ipi_bitmap & (1 << IPI_HARDCLOCK))
 		critical_exit();
 }
 
 /*
  * send an IPI to a set of cpus.
  */
 void
 ipi_selected(cpuset_t cpus, u_int ipi)
 {
 	int cpu;
 
 	/*
 	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
 	 * of help in order to understand what is the source.
 	 * Set the mask of receiving CPUs for this purpose.
 	 */
 	if (ipi == IPI_STOP_HARD)
 		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
 
 	while ((cpu = CPU_FFS(&cpus)) != 0) {
 		cpu--;
 		CPU_CLR(cpu, &cpus);
 		CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
 		ipi_send_cpu(cpu, ipi);
 	}
 }
 
 /*
  * send an IPI to a specific CPU.
  */
 void
 ipi_cpu(int cpu, u_int ipi)
 {
 
 	/*
 	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
 	 * of help in order to understand what is the source.
 	 * Set the mask of receiving CPUs for this purpose.
 	 */
 	if (ipi == IPI_STOP_HARD)
 		CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
 
 	CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
 	ipi_send_cpu(cpu, ipi);
 }
 
 /*
  * send an IPI to all CPUs EXCEPT myself
  */
 void
 ipi_all_but_self(u_int ipi)
 {
 	cpuset_t other_cpus;
 	int cpu, c;
 
 	/*
 	 * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
 	 * of help in order to understand what is the source.
 	 * Set the mask of receiving CPUs for this purpose.
 	 */
 	if (ipi == IPI_STOP_HARD) {
 		other_cpus = all_cpus;
 		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 		CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
 	}
 
 	CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
 	if (IPI_IS_BITMAPED(ipi)) {
 		cpu = PCPU_GET(cpuid);
 		CPU_FOREACH(c) {
 			if (c != cpu)
 				ipi_bitmap_set(c, ipi);
 		}
 		ipi = IPI_BITMAP_VECTOR;
 	}
 	lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
 }
 
 void
 ipi_self_from_nmi(u_int vector)
 {
 
 	lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF);
 
 	/* Wait for IPI to finish. */
 	if (!lapic_ipi_wait(50000)) {
 		if (KERNEL_PANICKED())
 			return;
 		else
 			panic("APIC: IPI is stuck");
 	}
 }
 
 int
 ipi_nmi_handler(void)
 {
 	u_int cpuid;
 
 	/*
 	 * As long as there is not a simple way to know about a NMI's
 	 * source, if the bitmask for the current CPU is present in
 	 * the global pending bitword an IPI_STOP_HARD has been issued
 	 * and should be handled.
 	 */
 	cpuid = PCPU_GET(cpuid);
 	if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
 		return (1);
 
 	CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
 	cpustop_handler();
 	return (0);
 }
 
 int nmi_kdb_lock;
 
 void
 nmi_call_kdb_smp(u_int type, struct trapframe *frame)
 {
 	int cpu;
 	bool call_post;
 
 	cpu = PCPU_GET(cpuid);
 	if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
 		nmi_call_kdb(cpu, type, frame);
 		call_post = false;
 	} else {
 		savectx(&stoppcbs[cpu]);
 		CPU_SET_ATOMIC(cpu, &stopped_cpus);
 		while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
 			ia32_pause();
 		call_post = true;
 	}
 	atomic_store_rel_int(&nmi_kdb_lock, 0);
 	if (call_post)
 		cpustop_handler_post(cpu);
 }
 
 /*
  * Handle an IPI_STOP by saving our current context and spinning (or mwaiting,
  * if available) until we are resumed.
  */
 void
 cpustop_handler(void)
 {
 	struct monitorbuf *mb;
 	u_int cpu;
 	bool use_mwait;
 
 	cpu = PCPU_GET(cpuid);
 
 	savectx(&stoppcbs[cpu]);
 
 	use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 &&
 	    !mwait_cpustop_broken);
 	if (use_mwait) {
 		mb = PCPU_PTR(monitorbuf);
 		atomic_store_int(&mb->stop_state,
 		    MONITOR_STOPSTATE_STOPPED);
 	}
 
 	/* Indicate that we are stopped */
 	CPU_SET_ATOMIC(cpu, &stopped_cpus);
 
 	/* Wait for restart */
 	while (!CPU_ISSET(cpu, &started_cpus)) {
 		if (use_mwait) {
 			cpu_monitor(mb, 0, 0);
 			if (atomic_load_int(&mb->stop_state) ==
 			    MONITOR_STOPSTATE_STOPPED)
 				cpu_mwait(0, MWAIT_C1);
 			continue;
 		}
 
 		ia32_pause();
 
 		/*
 		 * Halt non-BSP CPUs on panic -- we're never going to need them
 		 * again, and might as well save power / release resources
 		 * (e.g., overprovisioned VM infrastructure).
 		 */
 		while (__predict_false(!IS_BSP() && KERNEL_PANICKED()))
 			halt();
 	}
 
 	cpustop_handler_post(cpu);
 }
 
 static void
 cpustop_handler_post(u_int cpu)
 {
 
 	CPU_CLR_ATOMIC(cpu, &started_cpus);
 	CPU_CLR_ATOMIC(cpu, &stopped_cpus);
 
 	/*
 	 * We don't broadcast TLB invalidations to other CPUs when they are
 	 * stopped. Hence, we clear the TLB before resuming.
 	 */
 	invltlb_glob();
 
 #if defined(__amd64__) && defined(DDB)
 	amd64_db_resume_dbreg();
 #endif
 
 	if (cpu == 0 && cpustop_restartfunc != NULL) {
 		cpustop_restartfunc();
 		cpustop_restartfunc = NULL;
 	}
 }
 
 /*
  * Handle an IPI_SUSPEND by saving our current context and spinning until we
  * are resumed.
  */
 void
 cpususpend_handler(void)
 {
 	u_int cpu;
 
 	mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
 
 	cpu = PCPU_GET(cpuid);
 	if (savectx(&susppcbs[cpu]->sp_pcb)) {
 #ifdef __amd64__
 		fpususpend(susppcbs[cpu]->sp_fpususpend);
 #else
 		npxsuspend(susppcbs[cpu]->sp_fpususpend);
 #endif
 		/*
 		 * suspended_cpus is cleared shortly after each AP is restarted
 		 * by a Startup IPI, so that the BSP can proceed to restarting
 		 * the next AP.
 		 *
 		 * resuming_cpus gets cleared when the AP completes
 		 * initialization after having been released by the BSP.
 		 * resuming_cpus is probably not the best name for the
 		 * variable, because it is actually a set of processors that
 		 * haven't resumed yet and haven't necessarily started resuming.
 		 *
 		 * Note that suspended_cpus is meaningful only for ACPI suspend
 		 * as it's not really used for Xen suspend since the APs are
 		 * automatically restored to the running state and the correct
 		 * context.  For the same reason resumectx is never called in
 		 * that case.
 		 */
 		CPU_SET_ATOMIC(cpu, &suspended_cpus);
 		CPU_SET_ATOMIC(cpu, &resuming_cpus);
 
 		/*
 		 * Invalidate the cache after setting the global status bits.
 		 * The last AP to set its bit may end up being an Owner of the
 		 * corresponding cache line in MOESI protocol.  The AP may be
 		 * stopped before the cache line is written to the main memory.
 		 */
 		wbinvd();
 	} else {
 #ifdef __amd64__
 		fpuresume(susppcbs[cpu]->sp_fpususpend);
 #else
 		npxresume(susppcbs[cpu]->sp_fpususpend);
 #endif
 		pmap_init_pat();
 		initializecpu();
 		PCPU_SET(switchtime, 0);
 		PCPU_SET(switchticks, ticks);
 
 		/* Indicate that we have restarted and restored the context. */
 		CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 	}
 
 	/* Wait for resume directive */
 	while (!CPU_ISSET(cpu, &toresume_cpus))
 		ia32_pause();
 
 	/* Re-apply microcode updates. */
 	ucode_reload();
 
 #ifdef __i386__
 	/* Finish removing the identity mapping of low memory for this AP. */
 	invltlb_glob();
 #endif
 
 	if (cpu_ops.cpu_resume)
 		cpu_ops.cpu_resume();
 #ifdef __amd64__
 	if (vmm_resume_p)
 		vmm_resume_p();
 #endif
 
 	/* Resume MCA and local APIC */
 	lapic_xapic_mode();
 	mca_resume();
 	lapic_setup(0);
 
 	/* Indicate that we are resumed */
 	CPU_CLR_ATOMIC(cpu, &resuming_cpus);
 	CPU_CLR_ATOMIC(cpu, &suspended_cpus);
 	CPU_CLR_ATOMIC(cpu, &toresume_cpus);
+}
+
+/*
+ * Handle an IPI_SWI by waking delayed SWI thread.
+ */
+void
+ipi_swi_handler(struct trapframe frame)
+{
+
+	intr_event_handle(clk_intr_event, &frame);
 }
 
 /*
  * This is called once the rest of the system is up and running and we're
  * ready to let the AP's out of the pen.
  */
 static void
 release_aps(void *dummy __unused)
 {
 
 	if (mp_ncpus == 1) 
 		return;
 	atomic_store_rel_int(&aps_ready, 1);
 	while (smp_started == 0)
 		ia32_pause();
 }
 SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
 
 #ifdef COUNT_IPIS
 /*
  * Setup interrupt counters for IPI handlers.
  */
 static void
 mp_ipi_intrcnt(void *dummy)
 {
 	char buf[64];
 	int i;
 
 	CPU_FOREACH(i) {
 		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
 		intrcnt_add(buf, &ipi_invltlb_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
 		intrcnt_add(buf, &ipi_invlrng_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
 		intrcnt_add(buf, &ipi_invlpg_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
 		intrcnt_add(buf, &ipi_invlcache_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
 		intrcnt_add(buf, &ipi_preempt_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
 		intrcnt_add(buf, &ipi_ast_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
 		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
 		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
 		intrcnt_add(buf, &ipi_hardclock_counts[i]);
 	}		
 }
 SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
 #endif
Index: head/sys/x86/xen/xen_apic.c
===================================================================
--- head/sys/x86/xen/xen_apic.c	(revision 363526)
+++ head/sys/x86/xen/xen_apic.c	(revision 363527)
@@ -1,573 +1,584 @@
 /*
  * Copyright (c) 2014 Roger Pau Monné <roger.pau@citrix.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
 #include <sys/systm.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/cpufunc.h>
 #include <machine/cpu.h>
 #include <machine/intr_machdep.h>
 #include <machine/md_var.h>
 #include <machine/smp.h>
 
 #include <x86/apicreg.h>
 #include <x86/apicvar.h>
 
 #include <xen/xen-os.h>
 #include <xen/features.h>
 #include <xen/gnttab.h>
 #include <xen/hypervisor.h>
 #include <xen/hvm.h>
 #include <xen/xen_intr.h>
 
 #include <xen/interface/vcpu.h>
 
 /*--------------------------------- Macros -----------------------------------*/
 
 #define XEN_APIC_UNSUPPORTED \
 	panic("%s: not available in Xen PV port.", __func__)
 
 
 /*--------------------------- Forward Declarations ---------------------------*/
 #ifdef SMP
 static driver_filter_t xen_smp_rendezvous_action;
 #ifdef __amd64__
 static driver_filter_t xen_invlop;
 #else
 static driver_filter_t xen_invltlb;
 static driver_filter_t xen_invlpg;
 static driver_filter_t xen_invlrng;
 static driver_filter_t xen_invlcache;
 #endif
 static driver_filter_t xen_ipi_bitmap_handler;
 static driver_filter_t xen_cpustop_handler;
 static driver_filter_t xen_cpususpend_handler;
+static driver_filter_t xen_ipi_swi_handler;
 #endif
 
 /*---------------------------------- Macros ----------------------------------*/
 #define	IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS)
 
 /*--------------------------------- Xen IPIs ---------------------------------*/
 #ifdef SMP
 struct xen_ipi_handler
 {
 	driver_filter_t	*filter;
 	const char	*description;
 };
 
 static struct xen_ipi_handler xen_ipis[] = 
 {
 	[IPI_TO_IDX(IPI_RENDEZVOUS)]	= { xen_smp_rendezvous_action,	"r"   },
 #ifdef __amd64__
 	[IPI_TO_IDX(IPI_INVLOP)]	= { xen_invlop,			"itlb"},
 #else
 	[IPI_TO_IDX(IPI_INVLTLB)]	= { xen_invltlb,		"itlb"},
 	[IPI_TO_IDX(IPI_INVLPG)]	= { xen_invlpg,			"ipg" },
 	[IPI_TO_IDX(IPI_INVLRNG)]	= { xen_invlrng,		"irg" },
 	[IPI_TO_IDX(IPI_INVLCACHE)]	= { xen_invlcache,		"ic"  },
 #endif
 	[IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler,	"b"   },
 	[IPI_TO_IDX(IPI_STOP)]		= { xen_cpustop_handler,	"st"  },
 	[IPI_TO_IDX(IPI_SUSPEND)]	= { xen_cpususpend_handler,	"sp"  },
+	[IPI_TO_IDX(IPI_SWI)]		= { xen_ipi_swi_handler,	"sw"  },
 };
 #endif
 
 /*------------------------------- Per-CPU Data -------------------------------*/
 #ifdef SMP
 DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]);
 #endif
 
 /*------------------------------- Xen PV APIC --------------------------------*/
 
 static void
 xen_pv_lapic_create(u_int apic_id, int boot_cpu)
 {
 #ifdef SMP
 	cpu_add(apic_id, boot_cpu);
 #endif
 }
 
 static void
 xen_pv_lapic_init(vm_paddr_t addr)
 {
 
 }
 
 static void
 xen_pv_lapic_setup(int boot)
 {
 
 }
 
 static void
 xen_pv_lapic_dump(const char *str)
 {
 
 	printf("cpu%d %s XEN PV LAPIC\n", PCPU_GET(cpuid), str);
 }
 
 static void
 xen_pv_lapic_disable(void)
 {
 
 }
 
 static bool
 xen_pv_lapic_is_x2apic(void)
 {
 
 	return (false);
 }
 
 static void
 xen_pv_lapic_eoi(void)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static int
 xen_pv_lapic_id(void)
 {
 
 	return (PCPU_GET(apic_id));
 }
 
 static int
 xen_pv_lapic_intr_pending(u_int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static u_int
 xen_pv_apic_cpuid(u_int apic_id)
 {
 #ifdef SMP
 	return (apic_cpuids[apic_id]);
 #else
 	return (0);
 #endif
 }
 
 static u_int
 xen_pv_apic_alloc_vector(u_int apic_id, u_int irq)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static u_int
 xen_pv_apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static void
 xen_pv_apic_disable_vector(u_int apic_id, u_int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_apic_enable_vector(u_int apic_id, u_int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_apic_free_vector(u_int apic_id, u_int vector, u_int irq)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static int
 xen_pv_lapic_enable_pmc(void)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static void
 xen_pv_lapic_disable_pmc(void)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_lapic_reenable_pmc(void)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static void
 xen_pv_lapic_enable_cmc(void)
 {
 
 }
 
 #ifdef SMP
 static void
 xen_pv_lapic_ipi_raw(register_t icrlo, u_int dest)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 #define PCPU_ID_GET(id, field) (pcpu_find(id)->pc_##field)
 static void
 send_nmi(int dest)
 {
 	unsigned int cpu;
 
 	/*
 	 * NMIs are not routed over event channels, and instead delivered as on
 	 * native using the exception vector (#2). Triggering them can be done
 	 * using the local APIC, or an hypercall as a shortcut like it's done
 	 * below.
 	 */
 	switch(dest) {
 	case APIC_IPI_DEST_SELF:
 		HYPERVISOR_vcpu_op(VCPUOP_send_nmi, PCPU_GET(vcpu_id), NULL);
 		break;
 	case APIC_IPI_DEST_ALL:
 		CPU_FOREACH(cpu)
 			HYPERVISOR_vcpu_op(VCPUOP_send_nmi,
 			    PCPU_ID_GET(cpu, vcpu_id), NULL);
 		break;
 	case APIC_IPI_DEST_OTHERS:
 		CPU_FOREACH(cpu)
 			if (cpu != PCPU_GET(cpuid))
 				HYPERVISOR_vcpu_op(VCPUOP_send_nmi,
 				    PCPU_ID_GET(cpu, vcpu_id), NULL);
 		break;
 	default:
 		HYPERVISOR_vcpu_op(VCPUOP_send_nmi,
 		    PCPU_ID_GET(apic_cpuid(dest), vcpu_id), NULL);
 		break;
 	}
 }
 #undef PCPU_ID_GET
 
 static void
 xen_pv_lapic_ipi_vectored(u_int vector, int dest)
 {
 	xen_intr_handle_t *ipi_handle;
 	int ipi_idx, to_cpu, self;
 
 	if (vector >= IPI_NMI_FIRST) {
 		send_nmi(dest);
 		return;
 	}
 
 	ipi_idx = IPI_TO_IDX(vector);
 	if (ipi_idx >= nitems(xen_ipis))
 		panic("IPI out of range");
 
 	switch(dest) {
 	case APIC_IPI_DEST_SELF:
 		ipi_handle = DPCPU_GET(ipi_handle);
 		xen_intr_signal(ipi_handle[ipi_idx]);
 		break;
 	case APIC_IPI_DEST_ALL:
 		CPU_FOREACH(to_cpu) {
 			ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
 			xen_intr_signal(ipi_handle[ipi_idx]);
 		}
 		break;
 	case APIC_IPI_DEST_OTHERS:
 		self = PCPU_GET(cpuid);
 		CPU_FOREACH(to_cpu) {
 			if (to_cpu != self) {
 				ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
 				xen_intr_signal(ipi_handle[ipi_idx]);
 			}
 		}
 		break;
 	default:
 		to_cpu = apic_cpuid(dest);
 		ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle);
 		xen_intr_signal(ipi_handle[ipi_idx]);
 		break;
 	}
 }
 
 static int
 xen_pv_lapic_ipi_wait(int delay)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 #endif	/* SMP */
 
 static int
 xen_pv_lapic_ipi_alloc(inthand_t *ipifunc)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (-1);
 }
 
 static void
 xen_pv_lapic_ipi_free(int vector)
 {
 
 	XEN_APIC_UNSUPPORTED;
 }
 
 static int
 xen_pv_lapic_set_lvt_mask(u_int apic_id, u_int lvt, u_char masked)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static int
 xen_pv_lapic_set_lvt_mode(u_int apic_id, u_int lvt, uint32_t mode)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static int
 xen_pv_lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 static int
 xen_pv_lapic_set_lvt_triggermode(u_int apic_id, u_int lvt,
     enum intr_trigger trigger)
 {
 
 	XEN_APIC_UNSUPPORTED;
 	return (0);
 }
 
 /* Xen apic_ops implementation */
 struct apic_ops xen_apic_ops = {
 	.create			= xen_pv_lapic_create,
 	.init			= xen_pv_lapic_init,
 	.xapic_mode		= xen_pv_lapic_disable,
 	.is_x2apic		= xen_pv_lapic_is_x2apic,
 	.setup			= xen_pv_lapic_setup,
 	.dump			= xen_pv_lapic_dump,
 	.disable		= xen_pv_lapic_disable,
 	.eoi			= xen_pv_lapic_eoi,
 	.id			= xen_pv_lapic_id,
 	.intr_pending		= xen_pv_lapic_intr_pending,
 	.set_logical_id		= xen_pv_lapic_set_logical_id,
 	.cpuid			= xen_pv_apic_cpuid,
 	.alloc_vector		= xen_pv_apic_alloc_vector,
 	.alloc_vectors		= xen_pv_apic_alloc_vectors,
 	.enable_vector		= xen_pv_apic_enable_vector,
 	.disable_vector		= xen_pv_apic_disable_vector,
 	.free_vector		= xen_pv_apic_free_vector,
 	.enable_pmc		= xen_pv_lapic_enable_pmc,
 	.disable_pmc		= xen_pv_lapic_disable_pmc,
 	.reenable_pmc		= xen_pv_lapic_reenable_pmc,
 	.enable_cmc		= xen_pv_lapic_enable_cmc,
 #ifdef SMP
 	.ipi_raw		= xen_pv_lapic_ipi_raw,
 	.ipi_vectored		= xen_pv_lapic_ipi_vectored,
 	.ipi_wait		= xen_pv_lapic_ipi_wait,
 #endif
 	.ipi_alloc		= xen_pv_lapic_ipi_alloc,
 	.ipi_free		= xen_pv_lapic_ipi_free,
 	.set_lvt_mask		= xen_pv_lapic_set_lvt_mask,
 	.set_lvt_mode		= xen_pv_lapic_set_lvt_mode,
 	.set_lvt_polarity	= xen_pv_lapic_set_lvt_polarity,
 	.set_lvt_triggermode	= xen_pv_lapic_set_lvt_triggermode,
 };
 
 #ifdef SMP
 /*---------------------------- XEN PV IPI Handlers ---------------------------*/
 /*
  * These are C clones of the ASM functions found in apic_vector.
  */
 static int
 xen_ipi_bitmap_handler(void *arg)
 {
 	struct trapframe *frame;
 
 	frame = arg;
 	ipi_bitmap_handler(*frame);
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_smp_rendezvous_action(void *arg)
 {
 #ifdef COUNT_IPIS
 	(*ipi_rendezvous_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
 	smp_rendezvous_action();
 	return (FILTER_HANDLED);
 }
 
 #ifdef __amd64__
 static int
 xen_invlop(void *arg)
 {
 
 	invlop_handler();
 	return (FILTER_HANDLED);
 }
 
 #else /* __i386__ */
 
 static int
 xen_invltlb(void *arg)
 {
 
 	invltlb_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_invlpg(void *arg)
 {
 
 	invlpg_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_invlrng(void *arg)
 {
 
 	invlrng_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_invlcache(void *arg)
 {
 
 	invlcache_handler();
 	return (FILTER_HANDLED);
 }
 #endif /* __amd64__ */
 
 static int
 xen_cpustop_handler(void *arg)
 {
 
 	cpustop_handler();
 	return (FILTER_HANDLED);
 }
 
 static int
 xen_cpususpend_handler(void *arg)
 {
 
 	cpususpend_handler();
+	return (FILTER_HANDLED);
+}
+
+static int
+xen_ipi_swi_handler(void *arg)
+{
+	struct trapframe *frame = arg;
+
+	ipi_swi_handler(*frame);
 	return (FILTER_HANDLED);
 }
 
 /*----------------------------- XEN PV IPI setup -----------------------------*/
 /*
  * Those functions are provided outside of the Xen PV APIC implementation
  * so PVHVM guests can also use PV IPIs without having an actual Xen PV APIC,
  * because on PVHVM there's an emulated LAPIC provided by Xen.
  */
 static void
 xen_cpu_ipi_init(int cpu)
 {
 	xen_intr_handle_t *ipi_handle;
 	const struct xen_ipi_handler *ipi;
 	int idx, rc;
 
 	ipi_handle = DPCPU_ID_GET(cpu, ipi_handle);
 
 	for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) {
 
 		if (ipi->filter == NULL) {
 			ipi_handle[idx] = NULL;
 			continue;
 		}
 
 		rc = xen_intr_alloc_and_bind_ipi(cpu, ipi->filter,
 		    INTR_TYPE_TTY, &ipi_handle[idx]);
 		if (rc != 0)
 			panic("Unable to allocate a XEN IPI port");
 		xen_intr_describe(ipi_handle[idx], "%s", ipi->description);
 	}
 }
 
 static void
 xen_setup_cpus(void)
 {
 	int i;
 
 	if (!xen_vector_callback_enabled)
 		return;
 
 	CPU_FOREACH(i)
 		xen_cpu_ipi_init(i);
 
 	/* Set the xen pv ipi ops to replace the native ones */
 	if (xen_hvm_domain())
 		apic_ops.ipi_vectored = xen_pv_lapic_ipi_vectored;
 }
 
 /* Switch to using PV IPIs as soon as the vcpu_id is set. */
 SYSINIT(xen_setup_cpus, SI_SUB_SMP, SI_ORDER_SECOND, xen_setup_cpus, NULL);
 #endif /* SMP */