diff --git a/sys/x86/isa/clock.c b/sys/x86/isa/clock.c index 8fa642295611..a0bf60159e78 100644 --- a/sys/x86/isa/clock.c +++ b/sys/x86/isa/clock.c @@ -1,665 +1,657 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990 The Regents of the University of California. * Copyright (c) 2010 Alexander Motin * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz and Don Ahn. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)clock.c 7.2 (Berkeley) 5/12/91 */ #include /* * Routines to handle clock hardware. 
*/ #ifdef __amd64__ #define DEV_APIC #else #include "opt_apic.h" #endif #include "opt_clock.h" #include "opt_isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_ISA #include #include #endif int clkintr_pending; #ifndef TIMER_FREQ #define TIMER_FREQ 1193182 #endif u_int i8254_freq = TIMER_FREQ; TUNABLE_INT("hw.i8254.freq", &i8254_freq); int i8254_max_count; static int i8254_timecounter = 1; static struct mtx clock_lock; static struct intsrc *i8254_intsrc; static uint16_t i8254_lastcount; static uint16_t i8254_offset; static int (*i8254_pending)(struct intsrc *); static int i8254_ticked; struct attimer_softc { int intr_en; int port_rid, intr_rid; struct resource *port_res; struct resource *intr_res; void *intr_handler; struct timecounter tc; struct eventtimer et; int mode; #define MODE_STOP 0 #define MODE_PERIODIC 1 #define MODE_ONESHOT 2 uint32_t period; }; static struct attimer_softc *attimer_sc = NULL; static int timer0_period = -2; static int timer0_mode = 0xffff; static int timer0_last = 0xffff; /* Values for timerX_state: */ #define RELEASED 0 #define RELEASE_PENDING 1 #define ACQUIRED 2 #define ACQUIRE_PENDING 3 static u_char timer2_state; static unsigned i8254_get_timecount(struct timecounter *tc); static void set_i8254_freq(int mode, uint32_t period); void clock_init(void) { /* Init the clock lock */ mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_NOPROFILE); /* Init the clock in order to use DELAY */ init_ops.early_clock_source_init(); tsc_init(); } static int clkintr(void *arg) { struct attimer_softc *sc = (struct attimer_softc *)arg; if (i8254_timecounter && sc->period != 0) { mtx_lock_spin(&clock_lock); if (i8254_ticked) i8254_ticked = 0; else { i8254_offset += i8254_max_count; i8254_lastcount = 0; } clkintr_pending = 0; mtx_unlock_spin(&clock_lock); } if (sc->et.et_active && sc->mode != MODE_STOP) sc->et.et_event_cb(&sc->et, sc->et.et_arg); return (FILTER_HANDLED); } int timer_spkr_acquire(void) { int mode; mode = TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT; if (timer2_state != RELEASED) return (-1); timer2_state = ACQUIRED; /* * This access to the timer registers is as atomic as possible * because it is a single instruction. We could do better if we * knew the rate. Use of splclock() limits glitches to 10-100us, * and this is probably good enough for timer2, so we aren't as * careful with it as with timer0. */ outb(TIMER_MODE, TIMER_SEL2 | (mode & 0x3f)); ppi_spkr_on(); /* enable counter2 output to speaker */ return (0); } int timer_spkr_release(void) { if (timer2_state != ACQUIRED) return (-1); timer2_state = RELEASED; outb(TIMER_MODE, TIMER_SEL2 | TIMER_SQWAVE | TIMER_16BIT); ppi_spkr_off(); /* disable counter2 output to speaker */ return (0); } void timer_spkr_setfreq(int freq) { freq = i8254_freq / freq; mtx_lock_spin(&clock_lock); outb(TIMER_CNTR2, freq & 0xff); outb(TIMER_CNTR2, freq >> 8); mtx_unlock_spin(&clock_lock); } static int getit(void) { int high, low; mtx_lock_spin(&clock_lock); /* Select timer0 and latch counter value. */ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); low = inb(TIMER_CNTR0); high = inb(TIMER_CNTR0); mtx_unlock_spin(&clock_lock); return ((high << 8) | low); } /* * Wait "n" microseconds. * Relies on timer 1 counting down from (i8254_freq / hz) * Note: timer had better have been programmed before this is first used! 
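 *
 * As a rough worked example with the default i8254_freq of 1193182 Hz:
 * DELAY(5) takes the fixed-point fast path below and computes
 * (5 * 39099 + 32767) >> 15 = 6 ticks, while DELAY(1000) takes the
 * long long path and computes (1000 * 1193182 + 999999) / 1000000 =
 * 1194 ticks; the "+ 999999" term rounds the tick count up so the
 * requested delay is never cut short.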
*/ void i8254_delay(int n) { int delta, prev_tick, tick, ticks_left; #ifdef DELAYDEBUG int getit_calls = 1; int n1; static int state = 0; if (state == 0) { state = 1; for (n1 = 1; n1 <= 10000000; n1 *= 10) DELAY(n1); state = 2; } if (state == 1) printf("DELAY(%d)...", n); #endif /* * Read the counter first, so that the rest of the setup overhead is * counted. Guess the initial overhead is 20 usec (on most systems it * takes about 1.5 usec for each of the i/o's in getit(). The loop * takes about 6 usec on a 486/33 and 13 usec on a 386/20. The * multiplications and divisions to scale the count take a while). * * However, if ddb is active then use a fake counter since reading * the i8254 counter involves acquiring a lock. ddb must not do * locking for many reasons, but it calls here for at least atkbd * input. */ #ifdef KDB if (kdb_active) prev_tick = 1; else #endif prev_tick = getit(); n -= 0; /* XXX actually guess no initial overhead */ /* * Calculate (n * (i8254_freq / 1e6)) without using floating point * and without any avoidable overflows. */ if (n <= 0) ticks_left = 0; else if (n < 256) /* * Use fixed point to avoid a slow division by 1000000. * 39099 = 1193182 * 2^15 / 10^6 rounded to nearest. * 2^15 is the first power of 2 that gives exact results * for n between 0 and 256. */ ticks_left = ((u_int)n * 39099 + (1 << 15) - 1) >> 15; else /* * Don't bother using fixed point, although gcc-2.7.2 * generates particularly poor code for the long long * division, since even the slow way will complete long * before the delay is up (unless we're interrupted). */ ticks_left = ((u_int)n * (long long)i8254_freq + 999999) / 1000000; while (ticks_left > 0) { #ifdef KDB if (kdb_active) { inb(0x84); tick = prev_tick - 1; if (tick <= 0) tick = i8254_max_count; } else #endif tick = getit(); #ifdef DELAYDEBUG ++getit_calls; #endif delta = prev_tick - tick; prev_tick = tick; if (delta < 0) { delta += i8254_max_count; /* * Guard against i8254_max_count being wrong. * This shouldn't happen in normal operation, * but it may happen if set_i8254_freq() is * traced. */ if (delta < 0) delta = 0; } ticks_left -= delta; } #ifdef DELAYDEBUG if (state == 1) printf(" %d calls to getit() at %d usec each\n", getit_calls, (n + 5) / getit_calls); #endif } static void set_i8254_freq(int mode, uint32_t period) { int new_count, new_mode; mtx_lock_spin(&clock_lock); if (mode == MODE_STOP) { if (i8254_timecounter) { mode = MODE_PERIODIC; new_count = 0x10000; } else new_count = -1; } else { new_count = min(((uint64_t)i8254_freq * period + 0x80000000LLU) >> 32, 0x10000); } if (new_count == timer0_period) goto out; i8254_max_count = ((new_count & ~0xffff) != 0) ? 0xffff : new_count; timer0_period = (mode == MODE_PERIODIC) ? 
new_count : -1; switch (mode) { case MODE_STOP: new_mode = TIMER_SEL0 | TIMER_INTTC | TIMER_16BIT; outb(TIMER_MODE, new_mode); outb(TIMER_CNTR0, 0); outb(TIMER_CNTR0, 0); break; case MODE_PERIODIC: new_mode = TIMER_SEL0 | TIMER_RATEGEN | TIMER_16BIT; outb(TIMER_MODE, new_mode); outb(TIMER_CNTR0, new_count & 0xff); outb(TIMER_CNTR0, new_count >> 8); break; case MODE_ONESHOT: if (new_count < 256 && timer0_last < 256) { new_mode = TIMER_SEL0 | TIMER_INTTC | TIMER_LSB; if (new_mode != timer0_mode) outb(TIMER_MODE, new_mode); outb(TIMER_CNTR0, new_count & 0xff); break; } new_mode = TIMER_SEL0 | TIMER_INTTC | TIMER_16BIT; if (new_mode != timer0_mode) outb(TIMER_MODE, new_mode); outb(TIMER_CNTR0, new_count & 0xff); outb(TIMER_CNTR0, new_count >> 8); break; default: panic("set_i8254_freq: unknown operational mode"); } timer0_mode = new_mode; timer0_last = new_count; out: mtx_unlock_spin(&clock_lock); } static void i8254_restore(void) { timer0_period = -2; timer0_mode = 0xffff; timer0_last = 0xffff; if (attimer_sc != NULL) set_i8254_freq(attimer_sc->mode, attimer_sc->period); else set_i8254_freq(MODE_STOP, 0); } /* This is separate from startrtclock() so that it can be called early. */ void i8254_init(void) { set_i8254_freq(MODE_STOP, 0); } void startrtclock(void) { start_TSC(); } void cpu_initclocks(void) { -#ifdef EARLY_AP_STARTUP struct thread *td; int i; td = curthread; tsc_calibrate(); #ifdef DEV_APIC lapic_calibrate_timer(); #endif cpu_initclocks_bsp(); CPU_FOREACH(i) { if (i == 0) continue; thread_lock(td); sched_bind(td, i); thread_unlock(td); cpu_initclocks_ap(); } thread_lock(td); if (sched_is_bound(td)) sched_unbind(td); thread_unlock(td); -#else - tsc_calibrate(); -#ifdef DEV_APIC - lapic_calibrate_timer(); -#endif - cpu_initclocks_bsp(); -#endif } static int sysctl_machdep_i8254_freq(SYSCTL_HANDLER_ARGS) { int error; u_int freq; /* * Use `i8254' instead of `timer' in external names because `timer' * is too generic. Should use it everywhere. */ freq = i8254_freq; error = sysctl_handle_int(oidp, &freq, 0, req); if (error == 0 && req->newptr != NULL) { i8254_freq = freq; if (attimer_sc != NULL) { set_i8254_freq(attimer_sc->mode, attimer_sc->period); attimer_sc->tc.tc_frequency = freq; } else { set_i8254_freq(MODE_STOP, 0); } } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, i8254_freq, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(u_int), sysctl_machdep_i8254_freq, "IU", "i8254 timer frequency"); static unsigned i8254_get_timecount(struct timecounter *tc) { device_t dev = (device_t)tc->tc_priv; struct attimer_softc *sc = device_get_softc(dev); register_t flags; uint16_t count; u_int high, low; if (sc->period == 0) return (i8254_max_count - getit()); #ifdef __amd64__ flags = read_rflags(); #else flags = read_eflags(); #endif mtx_lock_spin(&clock_lock); /* Select timer0 and latch counter value. 
*/ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); low = inb(TIMER_CNTR0); high = inb(TIMER_CNTR0); count = i8254_max_count - ((high << 8) | low); if (count < i8254_lastcount || (!i8254_ticked && (clkintr_pending || ((count < 20 || (!(flags & PSL_I) && count < i8254_max_count / 2u)) && i8254_pending != NULL && i8254_pending(i8254_intsrc))))) { i8254_ticked = 1; i8254_offset += i8254_max_count; } i8254_lastcount = count; count += i8254_offset; mtx_unlock_spin(&clock_lock); return (count); } static int attimer_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { device_t dev = (device_t)et->et_priv; struct attimer_softc *sc = device_get_softc(dev); if (period != 0) { sc->mode = MODE_PERIODIC; sc->period = period; } else { sc->mode = MODE_ONESHOT; sc->period = first; } if (!sc->intr_en) { i8254_intsrc->is_pic->pic_enable_source(i8254_intsrc); sc->intr_en = 1; } set_i8254_freq(sc->mode, sc->period); return (0); } static int attimer_stop(struct eventtimer *et) { device_t dev = (device_t)et->et_priv; struct attimer_softc *sc = device_get_softc(dev); sc->mode = MODE_STOP; sc->period = 0; set_i8254_freq(sc->mode, sc->period); return (0); } #ifdef DEV_ISA /* * Attach to the ISA PnP descriptors for the timer */ static struct isa_pnp_id attimer_ids[] = { { 0x0001d041 /* PNP0100 */, "AT timer" }, { 0 } }; static int attimer_probe(device_t dev) { int result; result = ISA_PNP_PROBE(device_get_parent(dev), dev, attimer_ids); /* ENOENT means no PnP-ID, device is hinted. */ if (result == ENOENT) { device_set_desc(dev, "AT timer"); return (BUS_PROBE_LOW_PRIORITY); } return (result); } static int attimer_attach(device_t dev) { struct attimer_softc *sc; rman_res_t s; int i; attimer_sc = sc = device_get_softc(dev); bzero(sc, sizeof(struct attimer_softc)); if (!(sc->port_res = bus_alloc_resource(dev, SYS_RES_IOPORT, &sc->port_rid, IO_TIMER1, IO_TIMER1 + 3, 4, RF_ACTIVE))) device_printf(dev,"Warning: Couldn't map I/O.\n"); i8254_intsrc = intr_lookup_source(0); if (i8254_intsrc != NULL) i8254_pending = i8254_intsrc->is_pic->pic_source_pending; resource_int_value(device_get_name(dev), device_get_unit(dev), "timecounter", &i8254_timecounter); set_i8254_freq(MODE_STOP, 0); if (i8254_timecounter) { sc->tc.tc_get_timecount = i8254_get_timecount; sc->tc.tc_counter_mask = 0xffff; sc->tc.tc_frequency = i8254_freq; sc->tc.tc_name = "i8254"; sc->tc.tc_quality = 0; sc->tc.tc_priv = dev; tc_init(&sc->tc); } if (resource_int_value(device_get_name(dev), device_get_unit(dev), "clock", &i) != 0 || i != 0) { sc->intr_rid = 0; while (bus_get_resource(dev, SYS_RES_IRQ, sc->intr_rid, &s, NULL) == 0 && s != 0) sc->intr_rid++; if (!(sc->intr_res = bus_alloc_resource(dev, SYS_RES_IRQ, &sc->intr_rid, 0, 0, 1, RF_ACTIVE))) { device_printf(dev,"Can't map interrupt.\n"); return (0); } /* Dirty hack, to make bus_setup_intr to not enable source. 
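 * Bumping is_handlers first keeps the count from reaching 1 inside
 * intr_add_handler(), so pic_enable_intr()/pic_enable_source() are not
 * called from there; the count is dropped back right after
 * bus_setup_intr(), pic_enable_intr() is called explicitly below, and
 * pic_enable_source() is deferred until attimer_start().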
*/ i8254_intsrc->is_handlers++; if ((bus_setup_intr(dev, sc->intr_res, INTR_MPSAFE | INTR_TYPE_CLK, (driver_filter_t *)clkintr, NULL, sc, &sc->intr_handler))) { device_printf(dev, "Can't setup interrupt.\n"); i8254_intsrc->is_handlers--; return (0); } i8254_intsrc->is_handlers--; i8254_intsrc->is_pic->pic_enable_intr(i8254_intsrc); sc->et.et_name = "i8254"; sc->et.et_flags = ET_FLAGS_PERIODIC; if (!i8254_timecounter) sc->et.et_flags |= ET_FLAGS_ONESHOT; sc->et.et_quality = 100; sc->et.et_frequency = i8254_freq; sc->et.et_min_period = (0x0002LLU << 32) / i8254_freq; sc->et.et_max_period = (0xfffeLLU << 32) / i8254_freq; sc->et.et_start = attimer_start; sc->et.et_stop = attimer_stop; sc->et.et_priv = dev; et_register(&sc->et); } return(0); } static int attimer_resume(device_t dev) { i8254_restore(); return (0); } static device_method_t attimer_methods[] = { /* Device interface */ DEVMETHOD(device_probe, attimer_probe), DEVMETHOD(device_attach, attimer_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, attimer_resume), { 0, 0 } }; static driver_t attimer_driver = { "attimer", attimer_methods, sizeof(struct attimer_softc), }; DRIVER_MODULE(attimer, isa, attimer_driver, 0, 0); DRIVER_MODULE(attimer, acpi, attimer_driver, 0, 0); ISA_PNP_INFO(attimer_ids); #endif /* DEV_ISA */ diff --git a/sys/x86/x86/intr_machdep.c b/sys/x86/x86/intr_machdep.c index b43fa790d264..458a0cb396bb 100644 --- a/sys/x86/x86/intr_machdep.c +++ b/sys/x86/x86/intr_machdep.c @@ -1,852 +1,791 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2003 John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Machine dependent interrupt code for x86. For x86, we have to * deal with different PICs. Thus, we use the passed in vector to lookup * an interrupt source associated with that vector. The interrupt source * describes which PIC the source belongs to and includes methods to handle * that source. 
*/ #include "opt_atpic.h" #include "opt_ddb.h" #include "opt_smp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifndef DEV_ATPIC #include #include #include #include #include #endif #include typedef void (*mask_fn)(void *); static int intrcnt_index; static struct intsrc **interrupt_sources; #ifdef SMP static struct intsrc **interrupt_sorted; static int intrbalance; SYSCTL_INT(_hw, OID_AUTO, intrbalance, CTLFLAG_RWTUN, &intrbalance, 0, "Interrupt auto-balance interval (seconds). Zero disables."); static struct timeout_task intrbalance_task; #endif static struct sx intrsrc_lock; static struct mtx intrpic_lock; static struct mtx intrcnt_lock; static TAILQ_HEAD(pics_head, pic) pics; u_int num_io_irqs; #if defined(SMP) && !defined(EARLY_AP_STARTUP) -static int assign_cpu; +#error EARLY_AP_STARTUP required on x86 #endif #define INTRNAME_LEN (MAXCOMLEN + 1) u_long *intrcnt; char *intrnames; size_t sintrcnt = sizeof(intrcnt); size_t sintrnames = sizeof(intrnames); int nintrcnt; static MALLOC_DEFINE(M_INTR, "intr", "Interrupt Sources"); static int intr_assign_cpu(void *arg, int cpu); static void intr_disable_src(void *arg); static void intr_init(void *__dummy); static int intr_pic_registered(struct pic *pic); static void intrcnt_setname(const char *name, int index); static void intrcnt_updatename(struct intsrc *is); static void intrcnt_register(struct intsrc *is); /* * SYSINIT levels for SI_SUB_INTR: * * SI_ORDER_FIRST: Initialize locks and pics TAILQ, xen_hvm_cpu_init * SI_ORDER_SECOND: Xen PICs * SI_ORDER_THIRD: Add I/O APIC PICs, alloc MSI and Xen IRQ ranges * SI_ORDER_FOURTH: Add 8259A PICs * SI_ORDER_FOURTH + 1: Finalize interrupt count and add interrupt sources * SI_ORDER_MIDDLE: SMP interrupt counters * SI_ORDER_ANY: Enable interrupts on BSP */ static int intr_pic_registered(struct pic *pic) { struct pic *p; TAILQ_FOREACH(p, &pics, pics) { if (p == pic) return (1); } return (0); } /* * Register a new interrupt controller (PIC). This is to support suspend * and resume where we suspend/resume controllers rather than individual * sources. This also allows controllers with no active sources (such as * 8259As in a system using the APICs) to participate in suspend and resume. */ int intr_register_pic(struct pic *pic) { int error; mtx_lock(&intrpic_lock); if (intr_pic_registered(pic)) error = EBUSY; else { TAILQ_INSERT_TAIL(&pics, pic, pics); error = 0; } mtx_unlock(&intrpic_lock); return (error); } /* * Allocate interrupt source arrays and register interrupt sources * once the number of interrupts is known. */ static void intr_init_sources(void *arg) { struct pic *pic; MPASS(num_io_irqs > 0); interrupt_sources = mallocarray(num_io_irqs, sizeof(*interrupt_sources), M_INTR, M_WAITOK | M_ZERO); #ifdef SMP interrupt_sorted = mallocarray(num_io_irqs, sizeof(*interrupt_sorted), M_INTR, M_WAITOK | M_ZERO); #endif /* * - 1 ??? dummy counter. * - 2 counters for each I/O interrupt. * - 1 counter for each CPU for lapic timer. * - 1 counter for each CPU for the Hyper-V vmbus driver. * - 8 counters for each CPU for IPI counters for SMP. 
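 *
 * For example, with illustrative figures of num_io_irqs = 24 and
 * mp_ncpus = 4 this works out to 1 + 24 * 2 + 4 * 2 = 57 counters,
 * plus another 8 * 4 = 32 (89 total) when COUNT_IPIS is configured
 * and more than one CPU is present.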
*/ nintrcnt = 1 + num_io_irqs * 2 + mp_ncpus * 2; #ifdef COUNT_IPIS if (mp_ncpus > 1) nintrcnt += 8 * mp_ncpus; #endif intrcnt = mallocarray(nintrcnt, sizeof(u_long), M_INTR, M_WAITOK | M_ZERO); intrnames = mallocarray(nintrcnt, INTRNAME_LEN, M_INTR, M_WAITOK | M_ZERO); sintrcnt = nintrcnt * sizeof(u_long); sintrnames = nintrcnt * INTRNAME_LEN; intrcnt_setname("???", 0); intrcnt_index = 1; /* * NB: intrpic_lock is not held here to avoid LORs due to * malloc() in intr_register_source(). However, we are still * single-threaded at this point in startup so the list of * PICs shouldn't change. */ TAILQ_FOREACH(pic, &pics, pics) { if (pic->pic_register_sources != NULL) pic->pic_register_sources(pic); } } SYSINIT(intr_init_sources, SI_SUB_INTR, SI_ORDER_FOURTH + 1, intr_init_sources, NULL); /* * Register a new interrupt source with the global interrupt system. * The global interrupts need to be disabled when this function is * called. */ int intr_register_source(struct intsrc *isrc) { int error, vector; KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC")); vector = isrc->is_pic->pic_vector(isrc); KASSERT(vector < num_io_irqs, ("IRQ %d too large (%u irqs)", vector, num_io_irqs)); if (interrupt_sources[vector] != NULL) return (EEXIST); error = intr_event_create(&isrc->is_event, isrc, 0, vector, intr_disable_src, (mask_fn)isrc->is_pic->pic_enable_source, (mask_fn)isrc->is_pic->pic_eoi_source, intr_assign_cpu, "irq%d:", vector); if (error) return (error); sx_xlock(&intrsrc_lock); if (interrupt_sources[vector] != NULL) { sx_xunlock(&intrsrc_lock); intr_event_destroy(isrc->is_event); return (EEXIST); } intrcnt_register(isrc); interrupt_sources[vector] = isrc; isrc->is_handlers = 0; sx_xunlock(&intrsrc_lock); return (0); } struct intsrc * intr_lookup_source(int vector) { if (vector < 0 || vector >= num_io_irqs) return (NULL); return (interrupt_sources[vector]); } int intr_add_handler(const char *name, int vector, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep, int domain) { struct intsrc *isrc; int error; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); error = intr_event_add_handler(isrc->is_event, name, filter, handler, arg, intr_priority(flags), flags, cookiep); if (error == 0) { sx_xlock(&intrsrc_lock); intrcnt_updatename(isrc); isrc->is_handlers++; if (isrc->is_handlers == 1) { isrc->is_domain = domain; isrc->is_pic->pic_enable_intr(isrc); isrc->is_pic->pic_enable_source(isrc); } sx_xunlock(&intrsrc_lock); } return (error); } int intr_remove_handler(void *cookie) { struct intsrc *isrc; int error; isrc = intr_handler_source(cookie); error = intr_event_remove_handler(cookie); if (error == 0) { sx_xlock(&intrsrc_lock); isrc->is_handlers--; if (isrc->is_handlers == 0) { isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI); isrc->is_pic->pic_disable_intr(isrc); } intrcnt_updatename(isrc); sx_xunlock(&intrsrc_lock); } return (error); } int intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol) { struct intsrc *isrc; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); return (isrc->is_pic->pic_config_intr(isrc, trig, pol)); } static void intr_disable_src(void *arg) { struct intsrc *isrc; isrc = arg; isrc->is_pic->pic_disable_source(isrc, PIC_EOI); } void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame) { struct intr_event *ie; int vector; /* * We count software interrupts when we process them. 
The * code here follows previous practice, but there's an * argument for counting hardware interrupts when they're * processed too. */ (*isrc->is_count)++; VM_CNT_INC(v_intr); ie = isrc->is_event; /* * XXX: We assume that IRQ 0 is only used for the ISA timer * device (clk). */ vector = isrc->is_pic->pic_vector(isrc); if (vector == 0) clkintr_pending = 1; /* * For stray interrupts, mask and EOI the source, bump the * stray count, and log the condition. */ if (intr_event_handle(ie, frame) != 0) { isrc->is_pic->pic_disable_source(isrc, PIC_EOI); (*isrc->is_straycount)++; if (*isrc->is_straycount < INTR_STRAY_LOG_MAX) log(LOG_ERR, "stray irq%d\n", vector); else if (*isrc->is_straycount == INTR_STRAY_LOG_MAX) log(LOG_CRIT, "too many stray irq %d's: not logging anymore\n", vector); } } void intr_resume(bool suspend_cancelled) { struct pic *pic; #ifndef DEV_ATPIC atpic_reset(); #endif mtx_lock(&intrpic_lock); TAILQ_FOREACH(pic, &pics, pics) { if (pic->pic_resume != NULL) pic->pic_resume(pic, suspend_cancelled); } mtx_unlock(&intrpic_lock); } void intr_suspend(void) { struct pic *pic; mtx_lock(&intrpic_lock); TAILQ_FOREACH_REVERSE(pic, &pics, pics_head, pics) { if (pic->pic_suspend != NULL) pic->pic_suspend(pic); } mtx_unlock(&intrpic_lock); } static int intr_assign_cpu(void *arg, int cpu) { #ifdef SMP struct intsrc *isrc; int error; -#ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); /* Nothing to do if there is only a single CPU. */ if (mp_ncpus > 1 && cpu != NOCPU) { -#else - /* - * Don't do anything during early boot. We will pick up the - * assignment once the APs are started. - */ - if (assign_cpu && cpu != NOCPU) { -#endif isrc = arg; sx_xlock(&intrsrc_lock); error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]); if (error == 0) isrc->is_cpu = cpu; sx_xunlock(&intrsrc_lock); } else error = 0; return (error); #else return (EOPNOTSUPP); #endif } static void intrcnt_setname(const char *name, int index) { snprintf(intrnames + INTRNAME_LEN * index, INTRNAME_LEN, "%-*s", INTRNAME_LEN - 1, name); } static void intrcnt_updatename(struct intsrc *is) { intrcnt_setname(is->is_event->ie_fullname, is->is_index); } static void intrcnt_register(struct intsrc *is) { char straystr[INTRNAME_LEN]; KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__)); mtx_lock_spin(&intrcnt_lock); MPASS(intrcnt_index + 2 <= nintrcnt); is->is_index = intrcnt_index; intrcnt_index += 2; snprintf(straystr, sizeof(straystr), "stray irq%d", is->is_pic->pic_vector(is)); intrcnt_updatename(is); is->is_count = &intrcnt[is->is_index]; intrcnt_setname(straystr, is->is_index + 1); is->is_straycount = &intrcnt[is->is_index + 1]; mtx_unlock_spin(&intrcnt_lock); } void intrcnt_add(const char *name, u_long **countp) { mtx_lock_spin(&intrcnt_lock); MPASS(intrcnt_index < nintrcnt); *countp = &intrcnt[intrcnt_index]; intrcnt_setname(name, intrcnt_index); intrcnt_index++; mtx_unlock_spin(&intrcnt_lock); } static void intr_init(void *dummy __unused) { TAILQ_INIT(&pics); mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF); sx_init(&intrsrc_lock, "intrsrc"); mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN); } SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL); static void intr_init_final(void *dummy __unused) { /* * Enable interrupts on the BSP after all of the interrupt * controllers are initialized. Device interrupts are still * disabled in the interrupt controllers until interrupt * handlers are registered. Interrupts are enabled on each AP * after their first context switch. 
*/ enable_intr(); } SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL); #ifndef DEV_ATPIC /* Initialize the two 8259A's to a known-good shutdown state. */ void atpic_reset(void) { outb(IO_ICU1, ICW1_RESET | ICW1_IC4); outb(IO_ICU1 + ICU_IMR_OFFSET, IDT_IO_INTS); outb(IO_ICU1 + ICU_IMR_OFFSET, IRQ_MASK(ICU_SLAVEID)); outb(IO_ICU1 + ICU_IMR_OFFSET, MASTER_MODE); outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff); outb(IO_ICU1, OCW3_SEL | OCW3_RR); outb(IO_ICU2, ICW1_RESET | ICW1_IC4); outb(IO_ICU2 + ICU_IMR_OFFSET, IDT_IO_INTS + 8); outb(IO_ICU2 + ICU_IMR_OFFSET, ICU_SLAVEID); outb(IO_ICU2 + ICU_IMR_OFFSET, SLAVE_MODE); outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff); outb(IO_ICU2, OCW3_SEL | OCW3_RR); } #endif /* Add a description to an active interrupt handler. */ int intr_describe(u_int vector, void *ih, const char *descr) { struct intsrc *isrc; int error; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); error = intr_event_describe_handler(isrc->is_event, ih, descr); if (error) return (error); intrcnt_updatename(isrc); return (0); } void intr_reprogram(void) { struct intsrc *is; u_int v; sx_xlock(&intrsrc_lock); for (v = 0; v < num_io_irqs; v++) { is = interrupt_sources[v]; if (is == NULL) continue; if (is->is_pic->pic_reprogram_pin != NULL) is->is_pic->pic_reprogram_pin(is); } sx_xunlock(&intrsrc_lock); } #ifdef DDB /* * Dump data about interrupt handlers */ DB_SHOW_COMMAND(irqs, db_show_irqs) { struct intsrc **isrc; u_int i; int verbose; if (strcmp(modif, "v") == 0) verbose = 1; else verbose = 0; isrc = interrupt_sources; for (i = 0; i < num_io_irqs && !db_pager_quit; i++, isrc++) if (*isrc != NULL) db_dump_intr_event((*isrc)->is_event, verbose); } #endif #ifdef SMP /* * Support for balancing interrupt sources across CPUs. For now we just * allocate CPUs round-robin. * * XXX If the system has a domain with without any usable CPUs (e.g., where all * APIC IDs are 256 or greater and we do not have an IOMMU) we use * intr_no_domain to fall back to assigning interrupts without regard for * domain. Once we can rely on the presence of an IOMMU on all x86 platforms * we can revert this. */ cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1); static int current_cpu[MAXMEMDOM]; static bool intr_no_domain; static void intr_init_cpus(void) { int i; for (i = 0; i < vm_ndomains; i++) { if (CPU_OVERLAP(&cpuset_domain[i], &intr_cpus) == 0) { intr_no_domain = true; printf("%s: unable to route interrupts to CPUs in domain %d\n", __func__, i); } current_cpu[i] = 0; if (intr_no_domain && i > 0) continue; if (!CPU_ISSET(current_cpu[i], &intr_cpus) || !CPU_ISSET(current_cpu[i], &cpuset_domain[i])) intr_next_cpu(i); } } /* * Return the CPU that the next interrupt source should use. For now * this just returns the next local APIC according to round-robin. */ u_int intr_next_cpu(int domain) { u_int apic_id; -#ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); if (mp_ncpus == 1) return (PCPU_GET(apic_id)); -#else - /* Leave all interrupts on the BSP during boot. */ - if (!assign_cpu) - return (PCPU_GET(apic_id)); -#endif if (intr_no_domain) domain = 0; mtx_lock_spin(&icu_lock); apic_id = cpu_apic_ids[current_cpu[domain]]; do { current_cpu[domain]++; if (current_cpu[domain] > mp_maxid) current_cpu[domain] = 0; } while (!CPU_ISSET(current_cpu[domain], &intr_cpus) || (!CPU_ISSET(current_cpu[domain], &cpuset_domain[domain]) && !intr_no_domain)); mtx_unlock_spin(&icu_lock); return (apic_id); } /* * Add a CPU to our mask of valid CPUs that can be destinations of * interrupts. 
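 * (intr_cpus is initialized above to CPUSET_T_INITIALIZER(0x1), i.e.
 * only the BSP, so additional CPUs become valid interrupt targets only
 * once they have been added through this function.)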
*/ void intr_add_cpu(u_int cpu) { if (cpu >= MAXCPU) panic("%s: Invalid CPU ID %u", __func__, cpu); if (bootverbose) printf("INTR: Adding local APIC %d as a target\n", cpu_apic_ids[cpu]); CPU_SET(cpu, &intr_cpus); } -#ifdef EARLY_AP_STARTUP static void intr_smp_startup(void *arg __unused) { intr_init_cpus(); return; } SYSINIT(intr_smp_startup, SI_SUB_SMP, SI_ORDER_SECOND, intr_smp_startup, NULL); -#else -/* - * Distribute all the interrupt sources among the available CPUs once the - * AP's have been launched. - */ -static void -intr_shuffle_irqs(void *arg __unused) -{ - struct intsrc *isrc; - u_int cpu, i; - - intr_init_cpus(); - /* Don't bother on UP. */ - if (mp_ncpus == 1) - return; - - /* Round-robin assign a CPU to each enabled source. */ - sx_xlock(&intrsrc_lock); - assign_cpu = 1; - for (i = 0; i < num_io_irqs; i++) { - isrc = interrupt_sources[i]; - if (isrc != NULL && isrc->is_handlers > 0) { - /* - * If this event is already bound to a CPU, - * then assign the source to that CPU instead - * of picking one via round-robin. Note that - * this is careful to only advance the - * round-robin if the CPU assignment succeeds. - */ - cpu = isrc->is_event->ie_cpu; - if (cpu == NOCPU) - cpu = current_cpu[isrc->is_domain]; - if (isrc->is_pic->pic_assign_cpu(isrc, - cpu_apic_ids[cpu]) == 0) { - isrc->is_cpu = cpu; - if (isrc->is_event->ie_cpu == NOCPU) - intr_next_cpu(isrc->is_domain); - } - } - } - sx_xunlock(&intrsrc_lock); -} -SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs, - NULL); -#endif - /* * TODO: Export this information in a non-MD fashion, integrate with vmstat -i. */ static int sysctl_hw_intrs(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct intsrc *isrc; u_int i; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sx_slock(&intrsrc_lock); for (i = 0; i < num_io_irqs; i++) { isrc = interrupt_sources[i]; if (isrc == NULL) continue; sbuf_printf(&sbuf, "%s:%d @cpu%d(domain%d): %ld\n", isrc->is_event->ie_fullname, isrc->is_index, isrc->is_cpu, isrc->is_domain, *isrc->is_count); } sx_sunlock(&intrsrc_lock); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } SYSCTL_PROC(_hw, OID_AUTO, intrs, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_hw_intrs, "A", "interrupt:number @cpu: count"); /* * Compare two, possibly NULL, entries in the interrupt source array * by load. */ static int intrcmp(const void *one, const void *two) { const struct intsrc *i1, *i2; i1 = *(const struct intsrc * const *)one; i2 = *(const struct intsrc * const *)two; if (i1 != NULL && i2 != NULL) return (*i1->is_count - *i2->is_count); if (i1 != NULL) return (1); if (i2 != NULL) return (-1); return (0); } /* * Balance IRQs across available CPUs according to load. */ static void intr_balance(void *dummy __unused, int pending __unused) { struct intsrc *isrc; int interval; u_int cpu; int i; interval = intrbalance; if (interval == 0) goto out; /* * Sort interrupts according to count. */ sx_xlock(&intrsrc_lock); memcpy(interrupt_sorted, interrupt_sources, num_io_irqs * sizeof(interrupt_sorted[0])); qsort(interrupt_sorted, num_io_irqs, sizeof(interrupt_sorted[0]), intrcmp); /* * Restart the scan from the same location to avoid moving in the * common case. */ intr_init_cpus(); /* * Assign round-robin from most loaded to least. 
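 * (interrupt_sorted was qsort()ed above in ascending *is_count order,
 * so the loop below walks it from the tail, i.e. busiest first, and
 * skips sources whose event is already bound to a specific CPU.)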
*/ for (i = num_io_irqs - 1; i >= 0; i--) { isrc = interrupt_sorted[i]; if (isrc == NULL || isrc->is_event->ie_cpu != NOCPU) continue; cpu = current_cpu[isrc->is_domain]; intr_next_cpu(isrc->is_domain); if (isrc->is_cpu != cpu && isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]) == 0) isrc->is_cpu = cpu; } sx_xunlock(&intrsrc_lock); out: taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, interval ? hz * interval : hz * 60); } static void intr_balance_init(void *dummy __unused) { TIMEOUT_TASK_INIT(taskqueue_thread, &intrbalance_task, 0, intr_balance, NULL); taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, hz); } SYSINIT(intr_balance_init, SI_SUB_SMP, SI_ORDER_ANY, intr_balance_init, NULL); #else /* * Always route interrupts to the current processor in the UP case. */ u_int intr_next_cpu(int domain) { return (PCPU_GET(apic_id)); } #endif diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c index b382368cc626..43fe4625cd08 100644 --- a/sys/x86/x86/local_apic.c +++ b/sys/x86/x86/local_apic.c @@ -1,2181 +1,2167 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1996, by Steve Passe * All rights reserved. * Copyright (c) 2003 John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Local APIC support on Pentium and later processors. */ #include #include "opt_atpic.h" #include "opt_hwpmc_hooks.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #ifdef __amd64__ #define SDT_APIC SDT_SYSIGT #define GSEL_APIC 0 #else #define SDT_APIC SDT_SYS386IGT #define GSEL_APIC GSEL(GCODE_SEL, SEL_KPL) #endif static MALLOC_DEFINE(M_LAPIC, "local_apic", "Local APIC items"); /* Sanity checks on IDT vectors. */ CTASSERT(APIC_IO_INTS + APIC_NUM_IOINTS == APIC_TIMER_INT); CTASSERT(APIC_TIMER_INT < APIC_LOCAL_INTS); CTASSERT(APIC_LOCAL_INTS == 240); CTASSERT(IPI_STOP < APIC_SPURIOUS_INT); /* * I/O interrupts use non-negative IRQ values. 
These values are used * to mark unused IDT entries or IDT entries reserved for a non-I/O * interrupt. */ #define IRQ_FREE -1 #define IRQ_TIMER -2 #define IRQ_SYSCALL -3 #define IRQ_DTRACE_RET -4 #define IRQ_EVTCHN -5 enum lat_timer_mode { LAT_MODE_UNDEF = 0, LAT_MODE_PERIODIC = 1, LAT_MODE_ONESHOT = 2, LAT_MODE_DEADLINE = 3, }; /* * Support for local APICs. Local APICs manage interrupts on each * individual processor as opposed to I/O APICs which receive interrupts * from I/O devices and then forward them on to the local APICs. * * Local APICs can also send interrupts to each other thus providing the * mechanism for IPIs. */ struct lvt { u_int lvt_edgetrigger:1; u_int lvt_activehi:1; u_int lvt_masked:1; u_int lvt_active:1; u_int lvt_mode:16; u_int lvt_vector:8; }; struct lapic { struct lvt la_lvts[APIC_LVT_MAX + 1]; struct lvt la_elvts[APIC_ELVT_MAX + 1]; u_int la_id:8; u_int la_cluster:4; u_int la_cluster_id:2; u_int la_present:1; u_long *la_timer_count; uint64_t la_timer_period; enum lat_timer_mode la_timer_mode; uint32_t lvt_timer_base; uint32_t lvt_timer_last; /* Include IDT_SYSCALL to make indexing easier. */ int la_ioint_irqs[APIC_NUM_IOINTS + 1]; } static *lapics; /* Global defaults for local APIC LVT entries. */ static struct lvt lvts[APIC_LVT_MAX + 1] = { { 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 }, /* LINT0: masked ExtINT */ { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }, /* LINT1: NMI */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT }, /* Timer */ { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT }, /* CMCI */ }; /* Global defaults for AMD local APIC ELVT entries. */ static struct lvt elvts[APIC_ELVT_MAX + 1] = { { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, { 1, 1, 1, 0, APIC_LVT_DM_FIXED, APIC_CMC_INT }, { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, { 1, 1, 1, 0, APIC_LVT_DM_FIXED, 0 }, }; static inthand_t *ioint_handlers[] = { NULL, /* 0 - 31 */ IDTVEC(apic_isr1), /* 32 - 63 */ IDTVEC(apic_isr2), /* 64 - 95 */ IDTVEC(apic_isr3), /* 96 - 127 */ IDTVEC(apic_isr4), /* 128 - 159 */ IDTVEC(apic_isr5), /* 160 - 191 */ IDTVEC(apic_isr6), /* 192 - 223 */ IDTVEC(apic_isr7), /* 224 - 255 */ }; static inthand_t *ioint_pti_handlers[] = { NULL, /* 0 - 31 */ IDTVEC(apic_isr1_pti), /* 32 - 63 */ IDTVEC(apic_isr2_pti), /* 64 - 95 */ IDTVEC(apic_isr3_pti), /* 96 - 127 */ IDTVEC(apic_isr4_pti), /* 128 - 159 */ IDTVEC(apic_isr5_pti), /* 160 - 191 */ IDTVEC(apic_isr6_pti), /* 192 - 223 */ IDTVEC(apic_isr7_pti), /* 224 - 255 */ }; static u_int32_t lapic_timer_divisors[] = { APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16, APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128 }; extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd); volatile char *lapic_map; vm_paddr_t lapic_paddr = DEFAULT_APIC_BASE; int x2apic_mode; int lapic_eoi_suppression; static int lapic_timer_tsc_deadline; static u_long lapic_timer_divisor, count_freq; static struct eventtimer lapic_et; #ifdef SMP static uint64_t lapic_ipi_wait_mult; static int __read_mostly lapic_ds_idle_timeout = 1000000; #endif unsigned int max_apic_id; SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "APIC options"); SYSCTL_INT(_hw_apic, OID_AUTO, x2apic_mode, CTLFLAG_RD, &x2apic_mode, 0, ""); SYSCTL_INT(_hw_apic, OID_AUTO, eoi_suppression, CTLFLAG_RD, &lapic_eoi_suppression, 0, ""); SYSCTL_INT(_hw_apic, OID_AUTO, timer_tsc_deadline, CTLFLAG_RD, &lapic_timer_tsc_deadline, 0, ""); #ifdef SMP 
SYSCTL_INT(_hw_apic, OID_AUTO, ds_idle_timeout, CTLFLAG_RWTUN, &lapic_ds_idle_timeout, 0, "timeout (in us) for APIC Delivery Status to become Idle (xAPIC only)"); #endif static void lapic_calibrate_initcount(struct lapic *la); /* * Use __nosanitizethread to exempt the LAPIC I/O accessors from KCSan * instrumentation. Otherwise, if x2APIC is not available, use of the global * lapic_map will generate a KCSan false positive. While the mapping is * shared among all CPUs, the physical access will always take place on the * local CPU's APIC, so there isn't in fact a race here. Furthermore, the * KCSan warning printf can cause a panic if issued during LAPIC access, * due to attempted recursive use of event timer resources. */ static uint32_t __nosanitizethread lapic_read32(enum LAPIC_REGISTERS reg) { uint32_t res; if (x2apic_mode) { res = rdmsr32(MSR_APIC_000 + reg); } else { res = *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL); } return (res); } static void __nosanitizethread lapic_write32(enum LAPIC_REGISTERS reg, uint32_t val) { if (x2apic_mode) { mfence(); lfence(); wrmsr(MSR_APIC_000 + reg, val); } else { *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val; } } static void __nosanitizethread lapic_write32_nofence(enum LAPIC_REGISTERS reg, uint32_t val) { if (x2apic_mode) { wrmsr(MSR_APIC_000 + reg, val); } else { *(volatile uint32_t *)(lapic_map + reg * LAPIC_MEM_MUL) = val; } } #ifdef SMP static uint64_t lapic_read_icr_lo(void) { return (lapic_read32(LAPIC_ICR_LO)); } static void lapic_write_icr(uint32_t vhi, uint32_t vlo) { register_t saveintr; uint64_t v; if (x2apic_mode) { v = ((uint64_t)vhi << 32) | vlo; mfence(); wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, v); } else { saveintr = intr_disable(); lapic_write32(LAPIC_ICR_HI, vhi); lapic_write32(LAPIC_ICR_LO, vlo); intr_restore(saveintr); } } static void lapic_write_icr_lo(uint32_t vlo) { if (x2apic_mode) { mfence(); wrmsr(MSR_APIC_000 + LAPIC_ICR_LO, vlo); } else { lapic_write32(LAPIC_ICR_LO, vlo); } } static void lapic_write_self_ipi(uint32_t vector) { KASSERT(x2apic_mode, ("SELF IPI write in xAPIC mode")); wrmsr(MSR_APIC_000 + LAPIC_SELF_IPI, vector); } #endif /* SMP */ static void lapic_enable_x2apic(void) { uint64_t apic_base; apic_base = rdmsr(MSR_APICBASE); apic_base |= APICBASE_X2APIC | APICBASE_ENABLED; wrmsr(MSR_APICBASE, apic_base); } bool lapic_is_x2apic(void) { uint64_t apic_base; apic_base = rdmsr(MSR_APICBASE); return ((apic_base & (APICBASE_X2APIC | APICBASE_ENABLED)) == (APICBASE_X2APIC | APICBASE_ENABLED)); } static void lapic_enable(void); static void lapic_resume(struct pic *pic, bool suspend_cancelled); static void lapic_timer_oneshot(struct lapic *); static void lapic_timer_oneshot_nointr(struct lapic *, uint32_t); static void lapic_timer_periodic(struct lapic *); static void lapic_timer_deadline(struct lapic *); static void lapic_timer_stop(struct lapic *); static void lapic_timer_set_divisor(u_int divisor); static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value); static int lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period); static int lapic_et_stop(struct eventtimer *et); static u_int apic_idt_to_irq(u_int apic_id, u_int vector); static void lapic_set_tpr(u_int vector); struct pic lapic_pic = { .pic_resume = lapic_resume }; static uint32_t lvt_mode_impl(struct lapic *la, struct lvt *lvt, u_int pin, uint32_t value) { value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM | APIC_LVT_VECTOR); if (lvt->lvt_edgetrigger == 0) value |= APIC_LVT_TM; if 
(lvt->lvt_activehi == 0) value |= APIC_LVT_IIPP_INTALO; if (lvt->lvt_masked) value |= APIC_LVT_M; value |= lvt->lvt_mode; switch (lvt->lvt_mode) { case APIC_LVT_DM_NMI: case APIC_LVT_DM_SMI: case APIC_LVT_DM_INIT: case APIC_LVT_DM_EXTINT: if (!lvt->lvt_edgetrigger && bootverbose) { printf("lapic%u: Forcing LINT%u to edge trigger\n", la->la_id, pin); value &= ~APIC_LVT_TM; } /* Use a vector of 0. */ break; case APIC_LVT_DM_FIXED: value |= lvt->lvt_vector; break; default: panic("bad APIC LVT delivery mode: %#x\n", value); } return (value); } static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value) { struct lvt *lvt; KASSERT(pin <= APIC_LVT_MAX, ("%s: pin %u out of range", __func__, pin)); if (la->la_lvts[pin].lvt_active) lvt = &la->la_lvts[pin]; else lvt = &lvts[pin]; return (lvt_mode_impl(la, lvt, pin, value)); } static uint32_t elvt_mode(struct lapic *la, u_int idx, uint32_t value) { struct lvt *elvt; KASSERT(idx <= APIC_ELVT_MAX, ("%s: idx %u out of range", __func__, idx)); elvt = &la->la_elvts[idx]; KASSERT(elvt->lvt_active, ("%s: ELVT%u is not active", __func__, idx)); KASSERT(elvt->lvt_edgetrigger, ("%s: ELVT%u is not edge triggered", __func__, idx)); KASSERT(elvt->lvt_activehi, ("%s: ELVT%u is not active high", __func__, idx)); return (lvt_mode_impl(la, elvt, idx, value)); } /* * Map the local APIC and setup necessary interrupt vectors. */ void lapic_init(vm_paddr_t addr) { #ifdef SMP uint64_t r, r1, r2, rx; #endif uint32_t ver; int i; bool arat; TSENTER(); /* * Enable x2APIC mode if possible. Map the local APIC * registers page. * * Keep the LAPIC registers page mapped uncached for x2APIC * mode too, to have direct map page attribute set to * uncached. This is needed to work around CPU errata present * on all Intel processors. */ KASSERT(trunc_page(addr) == addr, ("local APIC not aligned on a page boundary")); lapic_paddr = addr; lapic_map = pmap_mapdev(addr, PAGE_SIZE); if (x2apic_mode) { lapic_enable_x2apic(); lapic_map = NULL; } /* Setup the spurious interrupt handler. */ setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL, GSEL_APIC); /* Perform basic initialization of the BSP's local APIC. */ lapic_enable(); /* Set BSP's per-CPU local APIC ID. */ PCPU_SET(apic_id, lapic_id()); /* Local APIC timer interrupt. */ setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC); /* Local APIC error interrupt. */ setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC); /* XXX: Thermal interrupt */ /* Local APIC CMCI. */ setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint), SDT_APIC, SEL_KPL, GSEL_APIC); if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) { /* Set if APIC timer runs in C3. */ arat = (cpu_power_eax & CPUTPM1_ARAT); bzero(&lapic_et, sizeof(lapic_et)); lapic_et.et_name = "LAPIC"; lapic_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU; lapic_et.et_quality = 600; if (!arat) { lapic_et.et_flags |= ET_FLAGS_C3STOP; lapic_et.et_quality = 100; } if ((cpu_feature & CPUID_TSC) != 0 && (cpu_feature2 & CPUID2_TSCDLT) != 0 && tsc_is_invariant && tsc_freq != 0) { lapic_timer_tsc_deadline = 1; TUNABLE_INT_FETCH("hw.lapic_tsc_deadline", &lapic_timer_tsc_deadline); } lapic_et.et_frequency = 0; /* We don't know frequency yet, so trying to guess. 
*/ lapic_et.et_min_period = 0x00001000LL; lapic_et.et_max_period = SBT_1S; lapic_et.et_start = lapic_et_start; lapic_et.et_stop = lapic_et_stop; lapic_et.et_priv = NULL; et_register(&lapic_et); } /* * Set lapic_eoi_suppression after lapic_enable(), to not * enable suppression in the hardware prematurely. Note that * we by default enable suppression even when system only has * one IO-APIC, since EOI is broadcasted to all APIC agents, * including CPUs, otherwise. * * It seems that at least some KVM versions report * EOI_SUPPRESSION bit, but auto-EOI does not work. */ ver = lapic_read32(LAPIC_VERSION); if ((ver & APIC_VER_EOI_SUPPRESSION) != 0) { lapic_eoi_suppression = 1; if (vm_guest == VM_GUEST_KVM) { if (bootverbose) printf( "KVM -- disabling lapic eoi suppression\n"); lapic_eoi_suppression = 0; } TUNABLE_INT_FETCH("hw.lapic_eoi_suppression", &lapic_eoi_suppression); } #ifdef SMP #define LOOPS 1000 /* * Calibrate the busy loop waiting for IPI ack in xAPIC mode. * lapic_ipi_wait_mult contains the number of iterations which * approximately delay execution for 1 microsecond (the * argument to lapic_ipi_wait() is in microseconds). * * We assume that TSC is present and already measured. * Possible TSC frequency jumps are irrelevant to the * calibration loop below, the CPU clock management code is * not yet started, and we do not enter sleep states. */ KASSERT((cpu_feature & CPUID_TSC) != 0 && tsc_freq != 0, ("TSC not initialized")); if (!x2apic_mode) { r = rdtsc(); for (rx = 0; rx < LOOPS; rx++) { (void)lapic_read_icr_lo(); ia32_pause(); } r = rdtsc() - r; r1 = tsc_freq * LOOPS; r2 = r * 1000000; lapic_ipi_wait_mult = r1 >= r2 ? r1 / r2 : 1; if (bootverbose) { printf("LAPIC: ipi_wait() us multiplier %ju (r %ju " "tsc %ju)\n", (uintmax_t)lapic_ipi_wait_mult, (uintmax_t)r, (uintmax_t)tsc_freq); } } #undef LOOPS #endif /* SMP */ TSEXIT(); } /* * Create a local APIC instance. */ void lapic_create(u_int apic_id, int boot_cpu) { int i; if (apic_id > max_apic_id) { printf("APIC: Ignoring local APIC with ID %d\n", apic_id); if (boot_cpu) panic("Can't ignore BSP"); return; } KASSERT(!lapics[apic_id].la_present, ("duplicate local APIC %u", apic_id)); /* * Assume no local LVT overrides and a cluster of 0 and * intra-cluster ID of 0. 
*/ lapics[apic_id].la_present = 1; lapics[apic_id].la_id = apic_id; for (i = 0; i <= APIC_LVT_MAX; i++) { lapics[apic_id].la_lvts[i] = lvts[i]; lapics[apic_id].la_lvts[i].lvt_active = 0; } for (i = 0; i <= APIC_ELVT_MAX; i++) { lapics[apic_id].la_elvts[i] = elvts[i]; lapics[apic_id].la_elvts[i].lvt_active = 0; } for (i = 0; i <= APIC_NUM_IOINTS; i++) lapics[apic_id].la_ioint_irqs[i] = IRQ_FREE; lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL; lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] = IRQ_TIMER; #ifdef KDTRACE_HOOKS lapics[apic_id].la_ioint_irqs[IDT_DTRACE_RET - APIC_IO_INTS] = IRQ_DTRACE_RET; #endif #ifdef XENHVM lapics[apic_id].la_ioint_irqs[IDT_EVTCHN - APIC_IO_INTS] = IRQ_EVTCHN; #endif #ifdef SMP cpu_add(apic_id, boot_cpu); #endif } static inline uint32_t amd_read_ext_features(void) { uint32_t version; if (cpu_vendor_id != CPU_VENDOR_AMD && cpu_vendor_id != CPU_VENDOR_HYGON) return (0); version = lapic_read32(LAPIC_VERSION); if ((version & APIC_VER_AMD_EXT_SPACE) != 0) return (lapic_read32(LAPIC_EXT_FEATURES)); else return (0); } static inline uint32_t amd_read_elvt_count(void) { uint32_t extf; uint32_t count; extf = amd_read_ext_features(); count = (extf & APIC_EXTF_ELVT_MASK) >> APIC_EXTF_ELVT_SHIFT; count = min(count, APIC_ELVT_MAX + 1); return (count); } /* * Dump contents of local APIC registers */ void lapic_dump(const char* str) { uint32_t version; uint32_t maxlvt; uint32_t extf; int elvt_count; int i; version = lapic_read32(LAPIC_VERSION); maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; printf("cpu%d %s:\n", PCPU_GET(cpuid), str); printf(" ID: 0x%08x VER: 0x%08x LDR: 0x%08x DFR: 0x%08x", lapic_read32(LAPIC_ID), version, lapic_read32(LAPIC_LDR), x2apic_mode ? 0 : lapic_read32(LAPIC_DFR)); if ((cpu_feature2 & CPUID2_X2APIC) != 0) printf(" x2APIC: %d", x2apic_mode); printf("\n lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n", lapic_read32(LAPIC_LVT_LINT0), lapic_read32(LAPIC_LVT_LINT1), lapic_read32(LAPIC_TPR), lapic_read32(LAPIC_SVR)); printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x", lapic_read32(LAPIC_LVT_TIMER), lapic_read32(LAPIC_LVT_THERMAL), lapic_read32(LAPIC_LVT_ERROR)); if (maxlvt >= APIC_LVT_PMC) printf(" pmc: 0x%08x", lapic_read32(LAPIC_LVT_PCINT)); printf("\n"); if (maxlvt >= APIC_LVT_CMCI) printf(" cmci: 0x%08x\n", lapic_read32(LAPIC_LVT_CMCI)); extf = amd_read_ext_features(); if (extf != 0) { printf(" AMD ext features: 0x%08x", extf); elvt_count = amd_read_elvt_count(); for (i = 0; i < elvt_count; i++) printf("%s elvt%d: 0x%08x", (i % 4) ? "" : "\n ", i, lapic_read32(LAPIC_EXT_LVT0 + i)); printf("\n"); } } void lapic_xapic_mode(void) { register_t saveintr; saveintr = intr_disable(); if (x2apic_mode) lapic_enable_x2apic(); intr_restore(saveintr); } void lapic_setup(int boot) { struct lapic *la; uint32_t version; uint32_t maxlvt; register_t saveintr; int elvt_count; int i; saveintr = intr_disable(); la = &lapics[lapic_id()]; KASSERT(la->la_present, ("missing APIC structure")); version = lapic_read32(LAPIC_VERSION); maxlvt = (version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; /* Initialize the TPR to allow all interrupts. */ lapic_set_tpr(0); /* Setup spurious vector and enable the local APIC. */ lapic_enable(); /* Program LINT[01] LVT entries. */ lapic_write32(LAPIC_LVT_LINT0, lvt_mode(la, APIC_LVT_LINT0, lapic_read32(LAPIC_LVT_LINT0))); lapic_write32(LAPIC_LVT_LINT1, lvt_mode(la, APIC_LVT_LINT1, lapic_read32(LAPIC_LVT_LINT1))); /* Program the PMC LVT entry if present. 
*/ if (maxlvt >= APIC_LVT_PMC) { lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC, LAPIC_LVT_PCINT)); } /* * Program the timer LVT. Calibration is deferred until it is certain * that we have a reliable timecounter. */ la->lvt_timer_base = lvt_mode(la, APIC_LVT_TIMER, lapic_read32(LAPIC_LVT_TIMER)); la->lvt_timer_last = la->lvt_timer_base; lapic_write32(LAPIC_LVT_TIMER, la->lvt_timer_base); if (boot) la->la_timer_mode = LAT_MODE_UNDEF; else if (la->la_timer_mode != LAT_MODE_UNDEF) { KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor", lapic_id())); switch (la->la_timer_mode) { case LAT_MODE_PERIODIC: lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_periodic(la); break; case LAT_MODE_ONESHOT: lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_oneshot(la); break; case LAT_MODE_DEADLINE: lapic_timer_deadline(la); break; default: panic("corrupted la_timer_mode %p %d", la, la->la_timer_mode); } } /* Program error LVT and clear any existing errors. */ lapic_write32(LAPIC_LVT_ERROR, lvt_mode(la, APIC_LVT_ERROR, lapic_read32(LAPIC_LVT_ERROR))); lapic_write32(LAPIC_ESR, 0); /* XXX: Thermal LVT */ /* Program the CMCI LVT entry if present. */ if (maxlvt >= APIC_LVT_CMCI) { lapic_write32(LAPIC_LVT_CMCI, lvt_mode(la, APIC_LVT_CMCI, lapic_read32(LAPIC_LVT_CMCI))); } elvt_count = amd_read_elvt_count(); for (i = 0; i < elvt_count; i++) { if (la->la_elvts[i].lvt_active) lapic_write32(LAPIC_EXT_LVT0 + i, elvt_mode(la, i, lapic_read32(LAPIC_EXT_LVT0 + i))); } intr_restore(saveintr); } static void lapic_intrcnt(void *dummy __unused) { struct pcpu *pc; struct lapic *la; char buf[MAXCOMLEN + 1]; /* If there are no APICs, skip this function. */ if (lapics == NULL) return; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { la = &lapics[pc->pc_apic_id]; if (!la->la_present) continue; snprintf(buf, sizeof(buf), "cpu%d:timer", pc->pc_cpuid); intrcnt_add(buf, &la->la_timer_count); } } SYSINIT(lapic_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, lapic_intrcnt, NULL); void lapic_reenable_pmc(void) { #ifdef HWPMC_HOOKS uint32_t value; value = lapic_read32(LAPIC_LVT_PCINT); value &= ~APIC_LVT_M; lapic_write32(LAPIC_LVT_PCINT, value); #endif } #ifdef HWPMC_HOOKS static void lapic_update_pmc(void *dummy) { struct lapic *la; la = &lapics[lapic_id()]; lapic_write32(LAPIC_LVT_PCINT, lvt_mode(la, APIC_LVT_PMC, lapic_read32(LAPIC_LVT_PCINT))); } #endif void lapic_calibrate_timer(void) { struct lapic *la; register_t intr; #ifdef DEV_ATPIC /* Fail if the local APIC is not present. */ if (!x2apic_mode && lapic_map == NULL) return; #endif intr = intr_disable(); la = &lapics[lapic_id()]; lapic_calibrate_initcount(la); intr_restore(intr); if (lapic_timer_tsc_deadline && bootverbose) { printf("lapic: deadline tsc mode, Frequency %ju Hz\n", (uintmax_t)tsc_freq); } } int lapic_enable_pmc(void) { #ifdef HWPMC_HOOKS u_int32_t maxlvt; #ifdef DEV_ATPIC /* Fail if the local APIC is not present. */ if (!x2apic_mode && lapic_map == NULL) return (0); #endif /* Fail if the PMC LVT is not present. */ maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < APIC_LVT_PMC) return (0); lvts[APIC_LVT_PMC].lvt_masked = 0; -#ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); -#else -#ifdef SMP - /* - * If hwpmc was loaded at boot time then the APs may not be - * started yet. In that case, don't forward the request to - * them as they will program the lvt when they start. 
- */ - if (smp_started) - smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); - else -#endif - lapic_update_pmc(NULL); -#endif return (1); #else return (0); #endif } void lapic_disable_pmc(void) { #ifdef HWPMC_HOOKS u_int32_t maxlvt; #ifdef DEV_ATPIC /* Fail if the local APIC is not present. */ if (!x2apic_mode && lapic_map == NULL) return; #endif /* Fail if the PMC LVT is not present. */ maxlvt = (lapic_read32(LAPIC_VERSION) & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < APIC_LVT_PMC) return; lvts[APIC_LVT_PMC].lvt_masked = 1; #ifdef SMP /* The APs should always be started when hwpmc is unloaded. */ KASSERT(mp_ncpus == 1 || smp_started, ("hwpmc unloaded too early")); #endif smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); #endif } static int lapic_calibrate_initcount_cpuid_vm(void) { u_int regs[4]; uint64_t freq; /* Get value from CPUID leaf if possible. */ if (vm_guest == VM_GUEST_NO) return (false); if (hv_high < 0x40000010) return (false); do_cpuid(0x40000010, regs); freq = (uint64_t)(regs[1]) * 1000; /* Pick timer divisor. */ lapic_timer_divisor = 2; do { if (freq / lapic_timer_divisor < APIC_TIMER_MAX_COUNT) break; lapic_timer_divisor <<= 1; } while (lapic_timer_divisor <= 128); if (lapic_timer_divisor > 128) return (false); /* Record divided frequency. */ count_freq = freq / lapic_timer_divisor; return (count_freq != 0); } static uint64_t cb_lapic_getcount(void) { return (APIC_TIMER_MAX_COUNT - lapic_read32(LAPIC_CCR_TIMER)); } static void lapic_calibrate_initcount(struct lapic *la) { uint64_t freq; if (lapic_calibrate_initcount_cpuid_vm()) goto done; /* Calibrate the APIC timer frequency. */ lapic_timer_set_divisor(2); lapic_timer_oneshot_nointr(la, APIC_TIMER_MAX_COUNT); fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX); freq = clockcalib(cb_lapic_getcount, "lapic"); fpu_kern_leave(curthread, NULL); /* Pick a different divisor if necessary. 
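 * The calibration above ran with a divide-by-2 setting, so freq * 2
 * approximates the undivided timer input clock.  The loop below then picks
 * the smallest power-of-two divisor (2 through 128) for which the divided
 * rate stays under APIC_TIMER_MAX_COUNT; e.g. a 3 GHz input settles on the
 * initial divisor of 2, since 1.5e9 is well below 0xffffffff.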
*/ lapic_timer_divisor = 2; do { if (freq * 2 / lapic_timer_divisor < APIC_TIMER_MAX_COUNT) break; lapic_timer_divisor <<= 1; } while (lapic_timer_divisor <= 128); if (lapic_timer_divisor > 128) panic("lapic: Divisor too big"); count_freq = freq * 2 / lapic_timer_divisor; done: if (bootverbose) { printf("lapic: Divisor %lu, Frequency %lu Hz\n", lapic_timer_divisor, count_freq); } } static void lapic_change_mode(struct eventtimer *et, struct lapic *la, enum lat_timer_mode newmode) { if (la->la_timer_mode == newmode) return; switch (newmode) { case LAT_MODE_PERIODIC: lapic_timer_set_divisor(lapic_timer_divisor); et->et_frequency = count_freq; break; case LAT_MODE_DEADLINE: et->et_frequency = tsc_freq; break; case LAT_MODE_ONESHOT: lapic_timer_set_divisor(lapic_timer_divisor); et->et_frequency = count_freq; break; default: panic("lapic_change_mode %d", newmode); } la->la_timer_mode = newmode; et->et_min_period = (0x00000002LLU << 32) / et->et_frequency; et->et_max_period = (0xfffffffeLLU << 32) / et->et_frequency; } static int lapic_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { struct lapic *la; la = &lapics[PCPU_GET(apic_id)]; if (period != 0) { lapic_change_mode(et, la, LAT_MODE_PERIODIC); la->la_timer_period = ((uint32_t)et->et_frequency * period) >> 32; lapic_timer_periodic(la); } else if (lapic_timer_tsc_deadline) { lapic_change_mode(et, la, LAT_MODE_DEADLINE); la->la_timer_period = (et->et_frequency * first) >> 32; lapic_timer_deadline(la); } else { lapic_change_mode(et, la, LAT_MODE_ONESHOT); la->la_timer_period = ((uint32_t)et->et_frequency * first) >> 32; lapic_timer_oneshot(la); } return (0); } static int lapic_et_stop(struct eventtimer *et) { struct lapic *la; la = &lapics[PCPU_GET(apic_id)]; lapic_timer_stop(la); la->la_timer_mode = LAT_MODE_UNDEF; return (0); } void lapic_disable(void) { uint32_t value; /* Software disable the local APIC. */ value = lapic_read32(LAPIC_SVR); value &= ~APIC_SVR_SWEN; lapic_write32(LAPIC_SVR, value); } static void lapic_enable(void) { uint32_t value; /* Program the spurious vector to enable the local APIC. */ value = lapic_read32(LAPIC_SVR); value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS); value |= APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT; if (lapic_eoi_suppression) value |= APIC_SVR_EOI_SUPPRESSION; lapic_write32(LAPIC_SVR, value); } /* Reset the local APIC on the BSP during resume. */ static void lapic_resume(struct pic *pic, bool suspend_cancelled) { lapic_setup(0); } int lapic_id(void) { uint32_t v; KASSERT(x2apic_mode || lapic_map != NULL, ("local APIC is not mapped")); v = lapic_read32(LAPIC_ID); if (!x2apic_mode) v >>= APIC_ID_SHIFT; return (v); } int lapic_intr_pending(u_int vector) { uint32_t irr; /* * The IRR registers are an array of registers each of which * only describes 32 interrupts in the low 32 bits. Thus, we * divide the vector by 32 to get the register index. * Finally, we modulus the vector by 32 to determine the * individual bit to test. 
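 * For example, vector 0x4a (74) is found in LAPIC_IRR0 + 2 (74 / 32 == 2),
 * bit 10 (74 % 32 == 10).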
*/ irr = lapic_read32(LAPIC_IRR0 + vector / 32); return (irr & 1 << (vector % 32)); } void lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) { struct lapic *la; KASSERT(lapics[apic_id].la_present, ("%s: APIC %u doesn't exist", __func__, apic_id)); KASSERT(cluster <= APIC_MAX_CLUSTER, ("%s: cluster %u too big", __func__, cluster)); KASSERT(cluster_id <= APIC_MAX_INTRACLUSTER_ID, ("%s: intra cluster id %u too big", __func__, cluster_id)); la = &lapics[apic_id]; la->la_cluster = cluster; la->la_cluster_id = cluster_id; } int lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked) { if (pin > APIC_LVT_MAX) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_masked = masked; if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_masked = masked; lapics[apic_id].la_lvts[pin].lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u %s\n", pin, masked ? "masked" : "unmasked"); return (0); } int lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode) { struct lvt *lvt; if (pin > APIC_LVT_MAX) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvt = &lvts[pin]; if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lvt = &lapics[apic_id].la_lvts[pin]; lvt->lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } lvt->lvt_mode = mode; switch (mode) { case APIC_LVT_DM_NMI: case APIC_LVT_DM_SMI: case APIC_LVT_DM_INIT: case APIC_LVT_DM_EXTINT: lvt->lvt_edgetrigger = 1; lvt->lvt_activehi = 1; if (mode == APIC_LVT_DM_EXTINT) lvt->lvt_masked = 1; else lvt->lvt_masked = 0; break; default: panic("Unsupported delivery mode: 0x%x\n", mode); } if (bootverbose) { printf(" Routing "); switch (mode) { case APIC_LVT_DM_NMI: printf("NMI"); break; case APIC_LVT_DM_SMI: printf("SMI"); break; case APIC_LVT_DM_INIT: printf("INIT"); break; case APIC_LVT_DM_EXTINT: printf("ExtINT"); break; } printf(" -> LINT%u\n", pin); } return (0); } int lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol) { if (pin > APIC_LVT_MAX || pol == INTR_POLARITY_CONFORM) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH); if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_active = 1; lapics[apic_id].la_lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH); if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u polarity: %s\n", pin, pol == INTR_POLARITY_HIGH ? "high" : "low"); return (0); } int lapic_set_lvt_triggermode(u_int apic_id, u_int pin, enum intr_trigger trigger) { if (pin > APIC_LVT_MAX || trigger == INTR_TRIGGER_CONFORM) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE); if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE); lapics[apic_id].la_lvts[pin].lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u trigger: %s\n", pin, trigger == INTR_TRIGGER_EDGE ? "edge" : "level"); return (0); } /* * Adjust the TPR of the current CPU so that it blocks all interrupts below * the passed in vector. 
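 * With CHEAP_TPR the register is simply overwritten; otherwise a
 * read-modify-write below preserves the bits outside APIC_TPR_PRIO.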
*/ static void lapic_set_tpr(u_int vector) { #ifdef CHEAP_TPR lapic_write32(LAPIC_TPR, vector); #else uint32_t tpr; tpr = lapic_read32(LAPIC_TPR) & ~APIC_TPR_PRIO; tpr |= vector; lapic_write32(LAPIC_TPR, tpr); #endif } void lapic_eoi(void) { lapic_write32_nofence(LAPIC_EOI, 0); } void lapic_handle_intr(int vector, struct trapframe *frame) { struct intsrc *isrc; kasan_mark(frame, sizeof(*frame), sizeof(*frame), 0); kmsan_mark(&vector, sizeof(vector), KMSAN_STATE_INITED); kmsan_mark(frame, sizeof(*frame), KMSAN_STATE_INITED); trap_check_kstack(); isrc = intr_lookup_source(apic_idt_to_irq(PCPU_GET(apic_id), vector)); intr_execute_handlers(isrc, frame); } void lapic_handle_timer(struct trapframe *frame) { struct lapic *la; struct trapframe *oldframe; struct thread *td; /* Send EOI first thing. */ lapic_eoi(); kasan_mark(frame, sizeof(*frame), sizeof(*frame), 0); kmsan_mark(frame, sizeof(*frame), KMSAN_STATE_INITED); trap_check_kstack(); #if defined(SMP) && !defined(SCHED_ULE) /* * Don't do any accounting for the disabled HTT cores, since it * will provide misleading numbers for the userland. * * No locking is necessary here, since even if we lose the race * when hlt_cpus_mask changes it is not a big deal, really. * * Don't do that for ULE, since ULE doesn't consider hlt_cpus_mask * and unlike other schedulers it actually schedules threads to * those CPUs. */ if (CPU_ISSET(PCPU_GET(cpuid), &hlt_cpus_mask)) return; #endif /* Look up our local APIC structure for the tick counters. */ la = &lapics[PCPU_GET(apic_id)]; (*la->la_timer_count)++; critical_enter(); if (lapic_et.et_active) { td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = frame; lapic_et.et_event_cb(&lapic_et, lapic_et.et_arg); td->td_intr_frame = oldframe; td->td_intr_nesting_level--; } critical_exit(); } static void lapic_timer_set_divisor(u_int divisor) { KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor)); KASSERT(ffs(divisor) <= nitems(lapic_timer_divisors), ("lapic: invalid divisor %u", divisor)); lapic_write32(LAPIC_DCR_TIMER, lapic_timer_divisors[ffs(divisor) - 1]); } static void lapic_timer_oneshot(struct lapic *la) { uint32_t value; value = la->lvt_timer_base; value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_ONE_SHOT; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period); } static void lapic_timer_oneshot_nointr(struct lapic *la, uint32_t count) { uint32_t value; value = la->lvt_timer_base; value &= ~APIC_LVTT_TM; value |= APIC_LVTT_TM_ONE_SHOT | APIC_LVT_M; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); lapic_write32(LAPIC_ICR_TIMER, count); } static void lapic_timer_periodic(struct lapic *la) { uint32_t value; value = la->lvt_timer_base; value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_PERIODIC; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); lapic_write32(LAPIC_ICR_TIMER, la->la_timer_period); } static void lapic_timer_deadline(struct lapic *la) { uint32_t value; value = la->lvt_timer_base; value &= ~(APIC_LVTT_TM | APIC_LVT_M); value |= APIC_LVTT_TM_TSCDLT; if (value != la->lvt_timer_last) { la->lvt_timer_last = value; lapic_write32_nofence(LAPIC_LVT_TIMER, value); if (!x2apic_mode) mfence(); } wrmsr(MSR_TSC_DEADLINE, la->la_timer_period + rdtsc()); } static void lapic_timer_stop(struct lapic *la) { uint32_t value; if (la->la_timer_mode == LAT_MODE_DEADLINE) { wrmsr(MSR_TSC_DEADLINE, 0); mfence(); } else { value = la->lvt_timer_base; 
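/* Mask the timer LVT so no further timer interrupts are delivered. */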
value &= ~APIC_LVTT_TM; value |= APIC_LVT_M; la->lvt_timer_last = value; lapic_write32(LAPIC_LVT_TIMER, value); } } void lapic_handle_cmc(void) { trap_check_kstack(); lapic_eoi(); cmc_intr(); } /* * Called from the mca_init() to activate the CMC interrupt if this CPU is * responsible for monitoring any MC banks for CMC events. Since mca_init() * is called prior to lapic_setup() during boot, this just needs to unmask * this CPU's LVT_CMCI entry. */ void lapic_enable_cmc(void) { u_int apic_id; #ifdef DEV_ATPIC if (!x2apic_mode && lapic_map == NULL) return; #endif apic_id = PCPU_GET(apic_id); KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_masked = 0; lapics[apic_id].la_lvts[APIC_LVT_CMCI].lvt_active = 1; } int lapic_enable_mca_elvt(void) { u_int apic_id; uint32_t value; int elvt_count; #ifdef DEV_ATPIC if (lapic_map == NULL) return (-1); #endif apic_id = PCPU_GET(apic_id); KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); elvt_count = amd_read_elvt_count(); if (elvt_count <= APIC_ELVT_MCA) return (-1); value = lapic_read32(LAPIC_EXT_LVT0 + APIC_ELVT_MCA); if ((value & APIC_LVT_M) == 0) { if (bootverbose) printf("AMD MCE Thresholding Extended LVT is already active\n"); return (APIC_ELVT_MCA); } lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_masked = 0; lapics[apic_id].la_elvts[APIC_ELVT_MCA].lvt_active = 1; return (APIC_ELVT_MCA); } void lapic_handle_error(void) { uint32_t esr; trap_check_kstack(); /* * Read the contents of the error status register. Write to * the register first before reading from it to force the APIC * to update its value to indicate any errors that have * occurred since the previous write to the register. */ lapic_write32(LAPIC_ESR, 0); esr = lapic_read32(LAPIC_ESR); printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr); lapic_eoi(); } u_int apic_cpuid(u_int apic_id) { #ifdef SMP return apic_cpuids[apic_id]; #else return 0; #endif } /* Request a free IDT vector to be used by the specified IRQ. */ u_int apic_alloc_vector(u_int apic_id, u_int irq) { u_int vector; KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq)); /* * Search for a free vector. Currently we just use a very simple * algorithm to find the first free vector. */ mtx_lock_spin(&icu_lock); for (vector = 0; vector < APIC_NUM_IOINTS; vector++) { if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE) continue; lapics[apic_id].la_ioint_irqs[vector] = irq; mtx_unlock_spin(&icu_lock); return (vector + APIC_IO_INTS); } mtx_unlock_spin(&icu_lock); return (0); } /* * Request 'count' free contiguous IDT vectors to be used by 'count' * IRQs. 'count' must be a power of two and the vectors will be * aligned on a boundary of 'align'. If the request cannot be * satisfied, 0 is returned. */ u_int apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) { u_int first, run, vector; KASSERT(powerof2(count), ("bad count")); KASSERT(powerof2(align), ("bad align")); KASSERT(align >= count, ("align < count")); #ifdef INVARIANTS for (run = 0; run < count; run++) KASSERT(irqs[run] < num_io_irqs, ("Invalid IRQ %u at index %u", irqs[run], run)); #endif /* * Search for 'count' free vectors. As with apic_alloc_vector(), * this just uses a simple first fit algorithm. */ run = 0; first = 0; mtx_lock_spin(&icu_lock); for (vector = 0; vector < APIC_NUM_IOINTS; vector++) { /* Vector is in use, end run. 
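 * The search restarts at the next vector whose IDT slot
 * (vector + APIC_IO_INTS) is aligned to 'align'.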
*/ if (lapics[apic_id].la_ioint_irqs[vector] != IRQ_FREE) { run = 0; first = 0; continue; } /* Start a new run if run == 0 and vector is aligned. */ if (run == 0) { if (((vector + APIC_IO_INTS) & (align - 1)) != 0) continue; first = vector; } run++; /* Keep looping if the run isn't long enough yet. */ if (run < count) continue; /* Found a run, assign IRQs and return the first vector. */ for (vector = 0; vector < count; vector++) lapics[apic_id].la_ioint_irqs[first + vector] = irqs[vector]; mtx_unlock_spin(&icu_lock); return (first + APIC_IO_INTS); } mtx_unlock_spin(&icu_lock); printf("APIC: Couldn't find APIC vectors for %u IRQs\n", count); return (0); } /* * Enable a vector for a particular apic_id. Since all lapics share idt * entries and ioint_handlers this enables the vector on all lapics. lapics * which do not have the vector configured would report spurious interrupts * should it fire. */ void apic_enable_vector(u_int apic_id, u_int vector) { KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); KASSERT(ioint_handlers[vector / 32] != NULL, ("No ISR handler for vector %u", vector)); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32], SDT_APIC, SEL_KPL, GSEL_APIC); } void apic_disable_vector(u_int apic_id, u_int vector) { KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif KASSERT(ioint_handlers[vector / 32] != NULL, ("No ISR handler for vector %u", vector)); #ifdef notyet /* * We can not currently clear the idt entry because other cpus * may have a valid vector at this offset. */ setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APIC, SEL_KPL, GSEL_APIC); #endif } /* Release an APIC vector when it's no longer in use. */ void apic_free_vector(u_int apic_id, u_int vector, u_int irq) { struct thread *td; KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL && vector <= APIC_IO_INTS + APIC_NUM_IOINTS, ("Vector %u does not map to an IRQ line", vector)); KASSERT(irq < num_io_irqs, ("Invalid IRQ %u", irq)); KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] == irq, ("IRQ mismatch")); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif /* * Bind us to the cpu that owned the vector before freeing it so * we don't lose an interrupt delivery race. */ td = curthread; if (!rebooting) { thread_lock(td); if (sched_is_bound(td)) panic("apic_free_vector: Thread already bound.\n"); sched_bind(td, apic_cpuid(apic_id)); thread_unlock(td); } mtx_lock_spin(&icu_lock); lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = IRQ_FREE; mtx_unlock_spin(&icu_lock); if (!rebooting) { thread_lock(td); sched_unbind(td); thread_unlock(td); } } /* Map an IDT vector (APIC) to an IRQ (interrupt source). */ static u_int apic_idt_to_irq(u_int apic_id, u_int vector) { int irq; KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL && vector <= APIC_IO_INTS + APIC_NUM_IOINTS, ("Vector %u does not map to an IRQ line", vector)); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif irq = lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS]; if (irq < 0) irq = 0; return (irq); } #ifdef DDB /* * Dump data about APIC IDT vector mappings. 
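 * The "v" and "vv" modifiers of "show apic" request increasingly verbose
 * dumps of the interrupt events bound to each vector.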
*/ DB_SHOW_COMMAND_FLAGS(apic, db_show_apic, DB_CMD_MEMSAFE) { struct intsrc *isrc; int i, verbose; u_int apic_id; u_int irq; if (strcmp(modif, "vv") == 0) verbose = 2; else if (strcmp(modif, "v") == 0) verbose = 1; else verbose = 0; for (apic_id = 0; apic_id <= max_apic_id; apic_id++) { if (lapics[apic_id].la_present == 0) continue; db_printf("Interrupts bound to lapic %u\n", apic_id); for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) { irq = lapics[apic_id].la_ioint_irqs[i]; if (irq == IRQ_FREE || irq == IRQ_SYSCALL) continue; #ifdef KDTRACE_HOOKS if (irq == IRQ_DTRACE_RET) continue; #endif #ifdef XENHVM if (irq == IRQ_EVTCHN) continue; #endif db_printf("vec 0x%2x -> ", i + APIC_IO_INTS); if (irq == IRQ_TIMER) db_printf("lapic timer\n"); else if (irq < num_io_irqs) { isrc = intr_lookup_source(irq); if (isrc == NULL || verbose == 0) db_printf("IRQ %u\n", irq); else db_dump_intr_event(isrc->is_event, verbose == 2); } else db_printf("IRQ %u ???\n", irq); } } } static void dump_mask(const char *prefix, uint32_t v, int base) { int i, first; first = 1; for (i = 0; i < 32; i++) if (v & (1 << i)) { if (first) { db_printf("%s:", prefix); first = 0; } db_printf(" %02x", base + i); } if (!first) db_printf("\n"); } /* Show info from the lapic regs for this CPU. */ DB_SHOW_COMMAND_FLAGS(lapic, db_show_lapic, DB_CMD_MEMSAFE) { uint32_t v; db_printf("lapic ID = %d\n", lapic_id()); v = lapic_read32(LAPIC_VERSION); db_printf("version = %d.%d\n", (v & APIC_VER_VERSION) >> 4, v & 0xf); db_printf("max LVT = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT); v = lapic_read32(LAPIC_SVR); db_printf("SVR = %02x (%s)\n", v & APIC_SVR_VECTOR, v & APIC_SVR_ENABLE ? "enabled" : "disabled"); db_printf("TPR = %02x\n", lapic_read32(LAPIC_TPR)); #define dump_field(prefix, regn, index) \ dump_mask(__XSTRING(prefix ## index), \ lapic_read32(LAPIC_ ## regn ## index), \ index * 32) db_printf("In-service Interrupts:\n"); dump_field(isr, ISR, 0); dump_field(isr, ISR, 1); dump_field(isr, ISR, 2); dump_field(isr, ISR, 3); dump_field(isr, ISR, 4); dump_field(isr, ISR, 5); dump_field(isr, ISR, 6); dump_field(isr, ISR, 7); db_printf("TMR Interrupts:\n"); dump_field(tmr, TMR, 0); dump_field(tmr, TMR, 1); dump_field(tmr, TMR, 2); dump_field(tmr, TMR, 3); dump_field(tmr, TMR, 4); dump_field(tmr, TMR, 5); dump_field(tmr, TMR, 6); dump_field(tmr, TMR, 7); db_printf("IRR Interrupts:\n"); dump_field(irr, IRR, 0); dump_field(irr, IRR, 1); dump_field(irr, IRR, 2); dump_field(irr, IRR, 3); dump_field(irr, IRR, 4); dump_field(irr, IRR, 5); dump_field(irr, IRR, 6); dump_field(irr, IRR, 7); #undef dump_field } #endif /* * APIC probing support code. This includes code to manage enumerators. */ static SLIST_HEAD(, apic_enumerator) enumerators = SLIST_HEAD_INITIALIZER(enumerators); static struct apic_enumerator *best_enum; void apic_register_enumerator(struct apic_enumerator *enumerator) { #ifdef INVARIANTS struct apic_enumerator *apic_enum; SLIST_FOREACH(apic_enum, &enumerators, apic_next) { if (apic_enum == enumerator) panic("%s: Duplicate register of %s", __func__, enumerator->apic_name); } #endif SLIST_INSERT_HEAD(&enumerators, enumerator, apic_next); } /* * We have to look for CPU's very, very early because certain subsystems * want to know how many CPU's we have extremely early on in the boot * process. */ static void apic_init(void *dummy __unused) { struct apic_enumerator *enumerator; int retval, best; /* We only support built in local APICs. */ if (!(cpu_feature & CPUID_APIC)) return; /* Don't probe if APIC mode is disabled. 
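 * (resource_disabled() honours the standard device hints, e.g. setting
 * hint.apic.0.disabled=1 from the loader.)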
*/ if (resource_disabled("apic", 0)) return; /* Probe all the enumerators to find the best match. */ best_enum = NULL; best = 0; SLIST_FOREACH(enumerator, &enumerators, apic_next) { retval = enumerator->apic_probe(); if (retval > 0) continue; if (best_enum == NULL || best < retval) { best_enum = enumerator; best = retval; } } if (best_enum == NULL) { if (bootverbose) printf("APIC: Could not find any APICs.\n"); #ifndef DEV_ATPIC panic("running without device atpic requires a local APIC"); #endif return; } if (bootverbose) printf("APIC: Using the %s enumerator.\n", best_enum->apic_name); #ifdef I686_CPU /* * To work around an errata, we disable the local APIC on some * CPUs during early startup. We need to turn the local APIC back * on on such CPUs now. */ ppro_reenable_apic(); #endif /* Probe the CPU's in the system. */ retval = best_enum->apic_probe_cpus(); if (retval != 0) printf("%s: Failed to probe CPUs: returned %d\n", best_enum->apic_name, retval); } SYSINIT(apic_init, SI_SUB_TUNABLES - 1, SI_ORDER_SECOND, apic_init, NULL); /* * Setup the local APIC. We have to do this prior to starting up the APs * in the SMP case. */ static void apic_setup_local(void *dummy __unused) { int retval; if (best_enum == NULL) return; lapics = malloc(sizeof(*lapics) * (max_apic_id + 1), M_LAPIC, M_WAITOK | M_ZERO); /* Initialize the local APIC. */ retval = best_enum->apic_setup_local(); if (retval != 0) printf("%s: Failed to setup the local APIC: returned %d\n", best_enum->apic_name, retval); } SYSINIT(apic_setup_local, SI_SUB_CPU, SI_ORDER_SECOND, apic_setup_local, NULL); /* * Setup the I/O APICs. */ static void apic_setup_io(void *dummy __unused) { int retval; if (best_enum == NULL) return; /* * Local APIC must be registered before other PICs and pseudo PICs * for proper suspend/resume order. */ intr_register_pic(&lapic_pic); retval = best_enum->apic_setup_io(); if (retval != 0) printf("%s: Failed to setup I/O APICs: returned %d\n", best_enum->apic_name, retval); /* * Finish setting up the local APIC on the BSP once we know * how to properly program the LINT pins. In particular, this * enables the EOI suppression mode, if LAPIC supports it and * user did not disable the mode. */ lapic_setup(1); if (bootverbose) lapic_dump("BSP"); /* Enable the MSI "pic". */ msi_init(); #ifdef XENHVM xen_intr_alloc_irqs(); #endif } SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_THIRD, apic_setup_io, NULL); #ifdef SMP /* * Inter Processor Interrupt functions. The lapic_ipi_*() functions are * private to the MD code. The public interface for the rest of the * kernel is defined in mp_machdep.c. */ /* * Wait delay microseconds for IPI to be sent. If delay is -1, we * wait forever. */ int lapic_ipi_wait(int delay) { uint64_t rx; /* LAPIC_ICR.APIC_DELSTAT_MASK is undefined in x2APIC mode */ if (x2apic_mode) return (1); for (rx = 0; delay == -1 || rx < lapic_ipi_wait_mult * delay; rx++) { if ((lapic_read_icr_lo() & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE) return (1); ia32_pause(); } return (0); } void lapic_ipi_raw(register_t icrlo, u_int dest) { uint32_t icrhi; /* XXX: Need more sanity checking of icrlo? 
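 * For now the asserts below only reject an unmapped APIC, an out-of-range
 * destination field and reserved ICR bits.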
*/ KASSERT(x2apic_mode || lapic_map != NULL, ("%s called too early", __func__)); KASSERT(x2apic_mode || (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, ("%s: invalid dest field", __func__)); KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0, ("%s: reserved bits set in ICR LO register", __func__)); if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) { if (x2apic_mode) icrhi = dest; else icrhi = dest << APIC_ID_SHIFT; lapic_write_icr(icrhi, icrlo); } else { lapic_write_icr_lo(icrlo); } } #ifdef DETECT_DEADLOCK #define AFTER_SPIN 50 #endif static void native_lapic_ipi_vectored(u_int vector, int dest) { register_t icrlo, destfield; KASSERT((vector & ~APIC_VECTOR_MASK) == 0, ("%s: invalid vector %d", __func__, vector)); destfield = 0; switch (dest) { case APIC_IPI_DEST_SELF: if (x2apic_mode && vector < IPI_NMI_FIRST) { lapic_write_self_ipi(vector); return; } icrlo = APIC_DEST_SELF; break; case APIC_IPI_DEST_ALL: icrlo = APIC_DEST_ALLISELF; break; case APIC_IPI_DEST_OTHERS: icrlo = APIC_DEST_ALLESELF; break; default: icrlo = 0; KASSERT(x2apic_mode || (dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, ("%s: invalid destination 0x%x", __func__, dest)); destfield = dest; } /* * NMI IPIs are just fake vectors used to send a NMI. Use special rules * regarding NMIs if passed, otherwise specify the vector. */ if (vector >= IPI_NMI_FIRST) icrlo |= APIC_DELMODE_NMI; else icrlo |= vector | APIC_DELMODE_FIXED; icrlo |= APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT; /* Wait for an earlier IPI to finish. */ if (!lapic_ipi_wait(lapic_ds_idle_timeout)) { if (KERNEL_PANICKED()) return; else panic("APIC: Previous IPI is stuck"); } lapic_ipi_raw(icrlo, destfield); #ifdef DETECT_DEADLOCK /* Wait for IPI to be delivered. */ if (!lapic_ipi_wait(AFTER_SPIN)) { #ifdef needsattention /* * XXX FIXME: * * The above function waits for the message to actually be * delivered. It breaks out after an arbitrary timeout * since the message should eventually be delivered (at * least in theory) and that if it wasn't we would catch * the failure with the check above when the next IPI is * sent. * * We could skip this wait entirely, EXCEPT it probably * protects us from other routines that assume that the * message was delivered and acted upon when this function * returns. */ printf("APIC: IPI might be stuck\n"); #else /* !needsattention */ /* Wait until mesage is sent without a timeout. */ while (lapic_read_icr_lo() & APIC_DELSTAT_PEND) ia32_pause(); #endif /* needsattention */ } #endif /* DETECT_DEADLOCK */ } void (*ipi_vectored)(u_int, int) = &native_lapic_ipi_vectored; #endif /* SMP */ /* * Since the IDT is shared by all CPUs the IPI slot update needs to be globally * visible. * * Consider the case where an IPI is generated immediately after allocation: * vector = lapic_ipi_alloc(ipifunc); * ipi_selected(other_cpus, vector); * * In xAPIC mode a write to ICR_LO has serializing semantics because the * APIC page is mapped as an uncached region. In x2APIC mode there is an * explicit 'mfence' before the ICR MSR is written. Therefore in both cases * the IDT slot update is globally visible before the IPI is delivered. 
*/ int lapic_ipi_alloc(inthand_t *ipifunc) { struct gate_descriptor *ip; long func; int idx, vector; KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti), ("invalid ipifunc %p", ipifunc)); vector = -1; mtx_lock_spin(&icu_lock); for (idx = IPI_DYN_FIRST; idx <= IPI_DYN_LAST; idx++) { ip = &idt[idx]; func = (ip->gd_hioffset << 16) | ip->gd_looffset; #ifdef __i386__ func -= setidt_disp; #endif if ((!pti && func == (uintptr_t)&IDTVEC(rsvd)) || (pti && func == (uintptr_t)&IDTVEC(rsvd_pti))) { vector = idx; setidt(vector, ipifunc, SDT_APIC, SEL_KPL, GSEL_APIC); break; } } mtx_unlock_spin(&icu_lock); return (vector); } void lapic_ipi_free(int vector) { struct gate_descriptor *ip; long func __diagused; KASSERT(vector >= IPI_DYN_FIRST && vector <= IPI_DYN_LAST, ("%s: invalid vector %d", __func__, vector)); mtx_lock_spin(&icu_lock); ip = &idt[vector]; func = (ip->gd_hioffset << 16) | ip->gd_looffset; #ifdef __i386__ func -= setidt_disp; #endif KASSERT(func != (uintptr_t)&IDTVEC(rsvd) && func != (uintptr_t)&IDTVEC(rsvd_pti), ("invalid idtfunc %#lx", func)); setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APIC, SEL_KPL, GSEL_APIC); mtx_unlock_spin(&icu_lock); } diff --git a/sys/x86/x86/mca.c b/sys/x86/x86/mca.c index dca6b935e4a3..b293fcedbd84 100644 --- a/sys/x86/x86/mca.c +++ b/sys/x86/x86/mca.c @@ -1,1565 +1,1561 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for x86 machine check architecture. */ #include #ifdef __amd64__ #define DEV_APIC #else #include "opt_apic.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Modes for mca_scan() */ enum scan_mode { POLLED, MCE, CMCI, }; #ifdef DEV_APIC /* * State maintained for each monitored MCx bank to control the * corrected machine check interrupt threshold. 
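 * max_threshold caches the largest threshold value the bank accepts and
 * last_intr records the time of the most recent CMCI; update_threshold()
 * uses both to throttle the interrupt rate.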
*/ struct cmc_state { int max_threshold; time_t last_intr; }; struct amd_et_state { int cur_threshold; time_t last_intr; }; #endif struct mca_internal { struct mca_record rec; STAILQ_ENTRY(mca_internal) link; }; struct mca_enumerator_ops { unsigned int (*ctl)(int); unsigned int (*status)(int); unsigned int (*addr)(int); unsigned int (*misc)(int); }; static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture"); static volatile int mca_count; /* Number of records stored. */ static int mca_banks; /* Number of per-CPU register banks. */ static int mca_maxcount = -1; /* Limit on records stored. (-1 = unlimited) */ static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Machine Check Architecture"); static int mca_enabled = 1; SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0, "Administrative toggle for machine check support"); static int log_corrected = 1; SYSCTL_INT(_hw_mca, OID_AUTO, log_corrected, CTLFLAG_RWTUN, &log_corrected, 0, "Log corrected errors to the console"); static int amd10h_L1TP = 1; SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0, "Administrative toggle for logging of level one TLB parity (L1TP) errors"); static int intel6h_HSD131; SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0, "Administrative toggle for logging of spurious corrected errors"); int workaround_erratum383; SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN, &workaround_erratum383, 0, "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?"); static STAILQ_HEAD(, mca_internal) mca_freelist; static int mca_freecount; static STAILQ_HEAD(, mca_internal) mca_records; static STAILQ_HEAD(, mca_internal) mca_pending; static int mca_ticks = 300; static struct taskqueue *mca_tq; static struct task mca_resize_task; static struct timeout_task mca_scan_task; static struct mtx mca_lock; static unsigned int mca_ia32_ctl_reg(int bank) { return (MSR_MC_CTL(bank)); } static unsigned int mca_ia32_status_reg(int bank) { return (MSR_MC_STATUS(bank)); } static unsigned int mca_ia32_addr_reg(int bank) { return (MSR_MC_ADDR(bank)); } static unsigned int mca_ia32_misc_reg(int bank) { return (MSR_MC_MISC(bank)); } static unsigned int mca_smca_ctl_reg(int bank) { return (MSR_SMCA_MC_CTL(bank)); } static unsigned int mca_smca_status_reg(int bank) { return (MSR_SMCA_MC_STATUS(bank)); } static unsigned int mca_smca_addr_reg(int bank) { return (MSR_SMCA_MC_ADDR(bank)); } static unsigned int mca_smca_misc_reg(int bank) { return (MSR_SMCA_MC_MISC(bank)); } static struct mca_enumerator_ops mca_msr_ops = { .ctl = mca_ia32_ctl_reg, .status = mca_ia32_status_reg, .addr = mca_ia32_addr_reg, .misc = mca_ia32_misc_reg }; #ifdef DEV_APIC static struct cmc_state **cmc_state; /* Indexed by cpuid, bank. */ static struct amd_et_state **amd_et_state; /* Indexed by cpuid, bank. */ static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */ static int amd_elvt = -1; static inline bool amd_thresholding_supported(void) { if (cpu_vendor_id != CPU_VENDOR_AMD && cpu_vendor_id != CPU_VENDOR_HYGON) return (false); /* * The RASCap register is wholly reserved in families 0x10-0x15 (through model 1F). * * It begins to be documented in family 0x15 model 30 and family 0x16, * but neither of these families documents the ScalableMca bit, which * supposedly defines the presence of this feature on family 0x17. 
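 * Hence the conservative checks below: families 0x10 through 0x16 are
 * assumed to support thresholding, while family 0x17 and later must
 * advertise ScalableMca in the RAS capability word.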
*/ if (CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16) return (true); if (CPUID_TO_FAMILY(cpu_id) >= 0x17) return ((amd_rascap & AMDRAS_SCALABLE_MCA) != 0); return (false); } #endif static inline bool cmci_supported(uint64_t mcg_cap) { /* * MCG_CAP_CMCI_P bit is reserved in AMD documentation. Until * it is defined, do not use it to check for CMCI support. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) return (false); return ((mcg_cap & MCG_CAP_CMCI_P) != 0); } static inline bool tes_supported(uint64_t mcg_cap) { /* * MCG_CAP_TES_P bit is reserved in AMD documentation. Until * it is defined, do not use it to check for TES support. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) return (false); return ((mcg_cap & MCG_CAP_TES_P) != 0); } static inline bool ser_supported(uint64_t mcg_cap) { return (tes_supported(mcg_cap) && (mcg_cap & MCG_CAP_SER_P) != 0); } static int sysctl_positive_int(SYSCTL_HANDLER_ARGS) { int error, value; value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); if (value <= 0) return (EINVAL); *(int *)arg1 = value; return (0); } static int sysctl_mca_records(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; struct mca_record record; struct mca_internal *rec; int i; if (namelen != 1) return (EINVAL); if (name[0] < 0 || name[0] >= mca_count) return (EINVAL); mtx_lock_spin(&mca_lock); if (name[0] >= mca_count) { mtx_unlock_spin(&mca_lock); return (EINVAL); } i = 0; STAILQ_FOREACH(rec, &mca_records, link) { if (i == name[0]) { record = rec->rec; break; } i++; } mtx_unlock_spin(&mca_lock); return (SYSCTL_OUT(req, &record, sizeof(record))); } static const char * mca_error_ttype(uint16_t mca_error) { switch ((mca_error & 0x000c) >> 2) { case 0: return ("I"); case 1: return ("D"); case 2: return ("G"); } return ("?"); } static const char * mca_error_level(uint16_t mca_error) { switch (mca_error & 0x0003) { case 0: return ("L0"); case 1: return ("L1"); case 2: return ("L2"); case 3: return ("LG"); } return ("L?"); } static const char * mca_error_request(uint16_t mca_error) { switch ((mca_error & 0x00f0) >> 4) { case 0x0: return ("ERR"); case 0x1: return ("RD"); case 0x2: return ("WR"); case 0x3: return ("DRD"); case 0x4: return ("DWR"); case 0x5: return ("IRD"); case 0x6: return ("PREFETCH"); case 0x7: return ("EVICT"); case 0x8: return ("SNOOP"); } return ("???"); } static const char * mca_error_mmtype(uint16_t mca_error) { switch ((mca_error & 0x70) >> 4) { case 0x0: return ("GEN"); case 0x1: return ("RD"); case 0x2: return ("WR"); case 0x3: return ("AC"); case 0x4: return ("MS"); } return ("???"); } static const char * mca_addres_mode(uint64_t mca_misc) { switch ((mca_misc & MC_MISC_ADDRESS_MODE) >> 6) { case 0x0: return ("Segment Offset"); case 0x1: return ("Linear Address"); case 0x2: return ("Physical Address"); case 0x3: return ("Memory Address"); case 0x7: return ("Generic"); } return ("???"); } static int mca_mute(const struct mca_record *rec) { /* * Skip spurious corrected parity errors generated by Intel Haswell- * and Broadwell-based CPUs (see HSD131, HSM142, HSW131 and BDM48 * erratum respectively), unless reporting is enabled. * Note that these errors also have been observed with the D0-stepping * of Haswell, while at least initially the CPU specification updates * suggested only the C0-stepping to be affected. Similarly, Celeron * 2955U with a CPU ID of 0x45 apparently are also concerned with the * same problem, with HSM142 only referring to 0x3c and 0x46. 
*/ if (cpu_vendor_id == CPU_VENDOR_INTEL && CPUID_TO_FAMILY(cpu_id) == 0x6 && (CPUID_TO_MODEL(cpu_id) == 0x3c || /* HSD131, HSM142, HSW131 */ CPUID_TO_MODEL(cpu_id) == 0x3d || /* BDM48 */ CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46) && /* HSM142 */ rec->mr_bank == 0 && (rec->mr_status & 0xa0000000ffffffff) == 0x80000000000f0005 && !intel6h_HSD131) return (1); return (0); } /* Dump details about a single machine check. */ static void mca_log(const struct mca_record *rec) { uint16_t mca_error; if (mca_mute(rec)) return; if (!log_corrected && (rec->mr_status & MC_STATUS_UC) == 0 && (!tes_supported(rec->mr_mcg_cap) || ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) != 0x2)) return; printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank, (long long)rec->mr_status); printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n", (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status); printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor, rec->mr_cpu_id, rec->mr_apic_id); printf("MCA: CPU %d ", rec->mr_cpu); if (rec->mr_status & MC_STATUS_UC) printf("UNCOR "); else { printf("COR "); if (cmci_supported(rec->mr_mcg_cap)) printf("(%lld) ", ((long long)rec->mr_status & MC_STATUS_COR_COUNT) >> 38); if (tes_supported(rec->mr_mcg_cap)) { switch ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) { case 0x1: printf("(Green) "); case 0x2: printf("(Yellow) "); } } } if (rec->mr_status & MC_STATUS_EN) printf("EN "); if (rec->mr_status & MC_STATUS_PCC) printf("PCC "); if (ser_supported(rec->mr_mcg_cap)) { if (rec->mr_status & MC_STATUS_S) printf("S "); if (rec->mr_status & MC_STATUS_AR) printf("AR "); } if (rec->mr_status & MC_STATUS_OVER) printf("OVER "); mca_error = rec->mr_status & MC_STATUS_MCA_ERROR; switch (mca_error) { /* Simple error codes. */ case 0x0000: printf("no error"); break; case 0x0001: printf("unclassified error"); break; case 0x0002: printf("ucode ROM parity error"); break; case 0x0003: printf("external error"); break; case 0x0004: printf("FRC error"); break; case 0x0005: printf("internal parity error"); break; case 0x0006: printf("SMM handler code access violation"); break; case 0x0400: printf("internal timer error"); break; case 0x0e0b: printf("generic I/O error"); if (rec->mr_cpu_vendor_id == CPU_VENDOR_INTEL && (rec->mr_status & MC_STATUS_MISCV)) { printf(" (pci%d:%d:%d:%d)", (int)((rec->mr_misc & MC_MISC_PCIE_SEG) >> 32), (int)((rec->mr_misc & MC_MISC_PCIE_BUS) >> 24), (int)((rec->mr_misc & MC_MISC_PCIE_SLOT) >> 19), (int)((rec->mr_misc & MC_MISC_PCIE_FUNC) >> 16)); } break; default: if ((mca_error & 0xfc00) == 0x0400) { printf("internal error %x", mca_error & 0x03ff); break; } /* Compound error codes. */ /* Memory hierarchy error. */ if ((mca_error & 0xeffc) == 0x000c) { printf("%s memory error", mca_error_level(mca_error)); break; } /* TLB error. */ if ((mca_error & 0xeff0) == 0x0010) { printf("%sTLB %s error", mca_error_ttype(mca_error), mca_error_level(mca_error)); break; } /* Memory controller error. */ if ((mca_error & 0xef80) == 0x0080) { printf("%s channel ", mca_error_mmtype(mca_error)); if ((mca_error & 0x000f) != 0x000f) printf("%d", mca_error & 0x000f); else printf("??"); printf(" memory error"); break; } /* Cache error. */ if ((mca_error & 0xef00) == 0x0100) { printf("%sCACHE %s %s error", mca_error_ttype(mca_error), mca_error_level(mca_error), mca_error_request(mca_error)); break; } /* Extended memory error. 
*/ if ((mca_error & 0xef80) == 0x0280) { printf("%s channel ", mca_error_mmtype(mca_error)); if ((mca_error & 0x000f) != 0x000f) printf("%d", mca_error & 0x000f); else printf("??"); printf(" extended memory error"); break; } /* Bus and/or Interconnect error. */ if ((mca_error & 0xe800) == 0x0800) { printf("BUS%s ", mca_error_level(mca_error)); switch ((mca_error & 0x0600) >> 9) { case 0: printf("Source"); break; case 1: printf("Responder"); break; case 2: printf("Observer"); break; default: printf("???"); break; } printf(" %s ", mca_error_request(mca_error)); switch ((mca_error & 0x000c) >> 2) { case 0: printf("Memory"); break; case 2: printf("I/O"); break; case 3: printf("Other"); break; default: printf("???"); break; } if (mca_error & 0x0100) printf(" timed out"); break; } printf("unknown error %x", mca_error); break; } printf("\n"); if (rec->mr_status & MC_STATUS_ADDRV) { printf("MCA: Address 0x%llx", (long long)rec->mr_addr); if (ser_supported(rec->mr_mcg_cap) && (rec->mr_status & MC_STATUS_MISCV)) { printf(" (Mode: %s, LSB: %d)", mca_addres_mode(rec->mr_misc), (int)(rec->mr_misc & MC_MISC_RA_LSB)); } printf("\n"); } if (rec->mr_status & MC_STATUS_MISCV) printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc); } static bool mca_is_mce(uint64_t mcg_cap, uint64_t status, bool *recoverablep) { /* Corrected error. */ if ((status & MC_STATUS_UC) == 0) return (0); /* Spurious MCA error. */ if ((status & MC_STATUS_EN) == 0) return (0); /* The processor does not support software error recovery. */ if (!ser_supported(mcg_cap)) { *recoverablep = false; return (1); } /* Context might have been corrupted. */ if (status & MC_STATUS_PCC) { *recoverablep = false; return (1); } /* Uncorrected software recoverable. */ if (status & MC_STATUS_S) { /* Action required vs optional. */ if (status & MC_STATUS_AR) *recoverablep = false; return (1); } /* Uncorrected no action required. */ return (0); } static int mca_check_status(enum scan_mode mode, uint64_t mcg_cap, int bank, struct mca_record *rec, bool *recoverablep) { uint64_t status; u_int p[4]; bool mce, recover; status = rdmsr(mca_msr_ops.status(bank)); if (!(status & MC_STATUS_VAL)) return (0); recover = *recoverablep; mce = mca_is_mce(mcg_cap, status, &recover); if (mce != (mode == MCE)) return (0); *recoverablep = recover; /* Save exception information. */ rec->mr_status = status; rec->mr_bank = bank; rec->mr_addr = 0; if (status & MC_STATUS_ADDRV) rec->mr_addr = rdmsr(mca_msr_ops.addr(bank)); rec->mr_misc = 0; if (status & MC_STATUS_MISCV) rec->mr_misc = rdmsr(mca_msr_ops.misc(bank)); rec->mr_tsc = rdtsc(); rec->mr_apic_id = PCPU_GET(apic_id); rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP); rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS); rec->mr_cpu_id = cpu_id; rec->mr_cpu_vendor_id = cpu_vendor_id; rec->mr_cpu = PCPU_GET(cpuid); /* * Clear machine check. Don't do this for uncorrectable * errors so that the BIOS can see them. */ if (!mce || recover) { wrmsr(mca_msr_ops.status(bank), 0); do_cpuid(0, p); } return (1); } static void mca_resize_freelist(void) { struct mca_internal *next, *rec; STAILQ_HEAD(, mca_internal) tmplist; int count, i, desired_max, desired_min; /* * Ensure we have at least one record for each bank and one * record per CPU, but no more than twice that amount. 
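 * For example, with 8 CPUs and 6 banks the free list is kept between 8 and
 * 16 preallocated records.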
*/ desired_min = imax(mp_ncpus, mca_banks); desired_max = imax(mp_ncpus, mca_banks) * 2; STAILQ_INIT(&tmplist); mtx_lock_spin(&mca_lock); while (mca_freecount > desired_max) { rec = STAILQ_FIRST(&mca_freelist); KASSERT(rec != NULL, ("mca_freecount is %d, but list is empty", mca_freecount)); STAILQ_REMOVE_HEAD(&mca_freelist, link); mca_freecount--; STAILQ_INSERT_TAIL(&tmplist, rec, link); } while (mca_freecount < desired_min) { count = desired_min - mca_freecount; mtx_unlock_spin(&mca_lock); for (i = 0; i < count; i++) { rec = malloc(sizeof(*rec), M_MCA, M_WAITOK); STAILQ_INSERT_TAIL(&tmplist, rec, link); } mtx_lock_spin(&mca_lock); STAILQ_CONCAT(&mca_freelist, &tmplist); mca_freecount += count; } mtx_unlock_spin(&mca_lock); STAILQ_FOREACH_SAFE(rec, &tmplist, link, next) free(rec, M_MCA); } static void mca_resize(void *context, int pending) { mca_resize_freelist(); } static void mca_record_entry(enum scan_mode mode, const struct mca_record *record) { struct mca_internal *rec; if (mode == POLLED) { rec = malloc(sizeof(*rec), M_MCA, M_WAITOK); mtx_lock_spin(&mca_lock); } else { mtx_lock_spin(&mca_lock); rec = STAILQ_FIRST(&mca_freelist); if (rec == NULL) { printf("MCA: Unable to allocate space for an event.\n"); mca_log(record); mtx_unlock_spin(&mca_lock); return; } STAILQ_REMOVE_HEAD(&mca_freelist, link); mca_freecount--; } rec->rec = *record; STAILQ_INSERT_TAIL(&mca_pending, rec, link); mtx_unlock_spin(&mca_lock); } #ifdef DEV_APIC /* * Update the interrupt threshold for a CMCI. The strategy is to use * a low trigger that interrupts as soon as the first event occurs. * However, if a steady stream of events arrive, the threshold is * increased until the interrupts are throttled to once every * cmc_throttle seconds or the periodic scan. If a periodic scan * finds that the threshold is too high, it is lowered. */ static int update_threshold(enum scan_mode mode, int valid, int last_intr, int count, int cur_threshold, int max_threshold) { u_int delta; int limit; delta = (u_int)(time_uptime - last_intr); limit = cur_threshold; /* * If an interrupt was received less than cmc_throttle seconds * since the previous interrupt and the count from the current * event is greater than or equal to the current threshold, * double the threshold up to the max. */ if (mode == CMCI && valid) { if (delta < cmc_throttle && count >= limit && limit < max_threshold) { limit = min(limit << 1, max_threshold); } return (limit); } /* * When the banks are polled, check to see if the threshold * should be lowered. */ if (mode != POLLED) return (limit); /* If a CMCI occurred recently, do nothing for now. */ if (delta < cmc_throttle) return (limit); /* * Compute a new limit based on the average rate of events per * cmc_throttle seconds since the last interrupt. */ if (valid) { limit = count * cmc_throttle / delta; if (limit <= 0) limit = 1; else if (limit > max_threshold) limit = max_threshold; } else { limit = 1; } return (limit); } static void cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec) { struct cmc_state *cc; uint64_t ctl; int cur_threshold, new_threshold; int count; /* Fetch the current limit for this bank. 
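 * The limit lives in the MC_CTL2_THRESHOLD field of the bank's MC_CTL2 MSR;
 * the corrected-error count comes from the status value saved in the record.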
*/ cc = &cmc_state[PCPU_GET(cpuid)][bank]; ctl = rdmsr(MSR_MC_CTL2(bank)); count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; cur_threshold = ctl & MC_CTL2_THRESHOLD; new_threshold = update_threshold(mode, valid, cc->last_intr, count, cur_threshold, cc->max_threshold); if (mode == CMCI && valid) cc->last_intr = time_uptime; if (new_threshold != cur_threshold) { ctl &= ~MC_CTL2_THRESHOLD; ctl |= new_threshold; wrmsr(MSR_MC_CTL2(bank), ctl); } } static void amd_thresholding_update(enum scan_mode mode, int bank, int valid) { struct amd_et_state *cc; uint64_t misc; int new_threshold; int count; cc = &amd_et_state[PCPU_GET(cpuid)][bank]; misc = rdmsr(mca_msr_ops.misc(bank)); count = (misc & MC_MISC_AMD_CNT_MASK) >> MC_MISC_AMD_CNT_SHIFT; count = count - (MC_MISC_AMD_CNT_MAX - cc->cur_threshold); new_threshold = update_threshold(mode, valid, cc->last_intr, count, cc->cur_threshold, MC_MISC_AMD_CNT_MAX); cc->cur_threshold = new_threshold; misc &= ~MC_MISC_AMD_CNT_MASK; misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold) << MC_MISC_AMD_CNT_SHIFT; misc &= ~MC_MISC_AMD_OVERFLOW; wrmsr(mca_msr_ops.misc(bank), misc); if (mode == CMCI && valid) cc->last_intr = time_uptime; } #endif /* * This scans all the machine check banks of the current CPU to see if * there are any machine checks. Any non-recoverable errors are * reported immediately via mca_log(). The current thread must be * pinned when this is called. The 'mode' parameter indicates if we * are being called from the MC exception handler, the CMCI handler, * or the periodic poller. */ static int mca_scan(enum scan_mode mode, bool *recoverablep) { struct mca_record rec; uint64_t mcg_cap; int count = 0, i, valid; mcg_cap = rdmsr(MSR_MCG_CAP); for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { #ifdef DEV_APIC /* * For a CMCI, only check banks this CPU is * responsible for. */ if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i)) continue; #endif valid = mca_check_status(mode, mcg_cap, i, &rec, recoverablep); if (valid) { count++; if (*recoverablep) mca_record_entry(mode, &rec); else mca_log(&rec); } #ifdef DEV_APIC /* * If this is a bank this CPU monitors via CMCI, * update the threshold. */ if (PCPU_GET(cmci_mask) & 1 << i) { if (cmc_state != NULL) cmci_update(mode, i, valid, &rec); else amd_thresholding_update(mode, i, valid); } #endif } return (count); } /* * Store a new record on the mca_records list while enforcing * mca_maxcount. */ static void mca_store_record(struct mca_internal *mca) { /* * If we are storing no records (mca_maxcount == 0), * we just free this record. * * If we are storing records (mca_maxcount != 0) and * we have free space on the list, store the record * and increment mca_count. * * If we are storing records and we do not have free * space on the list, store the new record at the * tail and free the oldest one from the head. */ if (mca_maxcount != 0) STAILQ_INSERT_TAIL(&mca_records, mca, link); if (mca_maxcount < 0 || mca_count < mca_maxcount) mca_count++; else { if (mca_maxcount != 0) { mca = STAILQ_FIRST(&mca_records); STAILQ_REMOVE_HEAD(&mca_records, link); } STAILQ_INSERT_TAIL(&mca_freelist, mca, link); mca_freecount++; } } /* * Do the work to process machine check records which have just been * gathered. Print any pending logs to the console. Queue them for storage. * Trigger a resizing of the free list. 
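 * When polling, the free list is resized inline; in interrupt context the
 * resize is deferred to the MCA taskqueue.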
*/ static void mca_process_records(enum scan_mode mode) { struct mca_internal *mca; mtx_lock_spin(&mca_lock); while ((mca = STAILQ_FIRST(&mca_pending)) != NULL) { STAILQ_REMOVE_HEAD(&mca_pending, link); mca_log(&mca->rec); mca_store_record(mca); } mtx_unlock_spin(&mca_lock); if (mode == POLLED) mca_resize_freelist(); else if (!cold) taskqueue_enqueue(mca_tq, &mca_resize_task); } /* * Scan the machine check banks on all CPUs by binding to each CPU in * turn. If any of the CPUs contained new machine check records, log * them to the console. */ static void mca_scan_cpus(void *context, int pending) { struct thread *td; int cpu; bool recoverable = true; mca_resize_freelist(); td = curthread; thread_lock(td); CPU_FOREACH(cpu) { sched_bind(td, cpu); thread_unlock(td); mca_scan(POLLED, &recoverable); thread_lock(td); sched_unbind(td); } thread_unlock(td); if (!STAILQ_EMPTY(&mca_pending)) mca_process_records(POLLED); taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task, mca_ticks * SBT_1S, 0, C_PREL(1)); } static int sysctl_mca_scan(SYSCTL_HANDLER_ARGS) { int error, i; i = 0; error = sysctl_handle_int(oidp, &i, 0, req); if (error) return (error); if (i) taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task, 0, 0, 0); return (0); } static int sysctl_mca_maxcount(SYSCTL_HANDLER_ARGS) { struct mca_internal *mca; int error, i; bool doresize; i = mca_maxcount; error = sysctl_handle_int(oidp, &i, 0, req); if (error || req->newptr == NULL) return (error); mtx_lock_spin(&mca_lock); mca_maxcount = i; doresize = false; if (mca_maxcount >= 0) while (mca_count > mca_maxcount) { mca = STAILQ_FIRST(&mca_records); STAILQ_REMOVE_HEAD(&mca_records, link); mca_count--; STAILQ_INSERT_TAIL(&mca_freelist, mca, link); mca_freecount++; doresize = true; } mtx_unlock_spin(&mca_lock); if (doresize && !cold) taskqueue_enqueue(mca_tq, &mca_resize_task); return (error); } static void mca_startup(void *dummy) { if (mca_banks <= 0) return; /* CMCIs during boot may have claimed items from the freelist. 
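 * Top the list back up here before the periodic scan is armed.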
*/ mca_resize_freelist(); taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq"); taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task, mca_ticks * SBT_1S, 0, C_PREL(1)); } -#ifdef EARLY_AP_STARTUP SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL); -#else -SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL); -#endif #ifdef DEV_APIC static void cmci_setup(void) { int i; cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA, M_WAITOK); for (i = 0; i <= mp_maxid; i++) cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks, M_MCA, M_WAITOK | M_ZERO); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &cmc_throttle, 0, sysctl_positive_int, "I", "Interval in seconds to throttle corrected MC interrupts"); } static void amd_thresholding_setup(void) { u_int i; amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state *), M_MCA, M_WAITOK); for (i = 0; i <= mp_maxid; i++) amd_et_state[i] = malloc(sizeof(struct amd_et_state) * mca_banks, M_MCA, M_WAITOK | M_ZERO); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &cmc_throttle, 0, sysctl_positive_int, "I", "Interval in seconds to throttle corrected MC interrupts"); } #endif static void mca_setup(uint64_t mcg_cap) { /* * On AMD Family 10h processors, unless logging of level one TLB * parity (L1TP) errors is disabled, enable the recommended workaround * for Erratum 383. */ if (cpu_vendor_id == CPU_VENDOR_AMD && CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP) workaround_erratum383 = 1; mca_banks = mcg_cap & MCG_CAP_COUNT; mtx_init(&mca_lock, "mca", NULL, MTX_SPIN); STAILQ_INIT(&mca_records); STAILQ_INIT(&mca_pending); mca_tq = taskqueue_create_fast("mca", M_WAITOK, taskqueue_thread_enqueue, &mca_tq); TIMEOUT_TASK_INIT(mca_tq, &mca_scan_task, 0, mca_scan_cpus, NULL); STAILQ_INIT(&mca_freelist); TASK_INIT(&mca_resize_task, 0, mca_resize, NULL); mca_resize_freelist(); SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0, "Record count"); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "maxcount", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &mca_maxcount, 0, sysctl_mca_maxcount, "I", "Maximum record count (-1 is unlimited)"); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "interval", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, &mca_ticks, 0, sysctl_positive_int, "I", "Periodic interval in seconds to scan for machine checks"); SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "records", CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mca_records, "Machine check records"); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_mca_scan, "I", "Force an immediate scan for machine checks"); #ifdef DEV_APIC if (cmci_supported(mcg_cap)) cmci_setup(); else if (amd_thresholding_supported()) amd_thresholding_setup(); #endif } #ifdef DEV_APIC /* * See if we should monitor CMCI for this bank. If CMCI_EN is already * set in MC_CTL2, then another CPU is responsible for this bank, so * ignore it. If CMCI_EN returns zero after being set, then this bank * does not support CMCI_EN. If this CPU sets CMCI_EN, then it should * now monitor this bank. 
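 * The largest supported threshold is probed by writing 0x7fff to the
 * threshold field and reading back the value that sticks.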
*/ static void cmci_monitor(int i) { struct cmc_state *cc; uint64_t ctl; KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid))); /* * It is possible for some APs to report CMCI support even if the BSP * does not, apparently due to a BIOS bug. */ if (cmc_state == NULL) { if (bootverbose) { printf( "AP %d (%d,%d) reports CMCI support but the BSP does not\n", PCPU_GET(cpuid), PCPU_GET(apic_id), PCPU_GET(acpi_id)); } return; } ctl = rdmsr(MSR_MC_CTL2(i)); if (ctl & MC_CTL2_CMCI_EN) /* Already monitored by another CPU. */ return; /* Set the threshold to one event for now. */ ctl &= ~MC_CTL2_THRESHOLD; ctl |= MC_CTL2_CMCI_EN | 1; wrmsr(MSR_MC_CTL2(i), ctl); ctl = rdmsr(MSR_MC_CTL2(i)); if (!(ctl & MC_CTL2_CMCI_EN)) /* This bank does not support CMCI. */ return; cc = &cmc_state[PCPU_GET(cpuid)][i]; /* Determine maximum threshold. */ ctl &= ~MC_CTL2_THRESHOLD; ctl |= 0x7fff; wrmsr(MSR_MC_CTL2(i), ctl); ctl = rdmsr(MSR_MC_CTL2(i)); cc->max_threshold = ctl & MC_CTL2_THRESHOLD; /* Start off with a threshold of 1. */ ctl &= ~MC_CTL2_THRESHOLD; ctl |= 1; wrmsr(MSR_MC_CTL2(i), ctl); /* Mark this bank as monitored. */ PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i); } /* * For resume, reset the threshold for any banks we monitor back to * one and throw away the timestamp of the last interrupt. */ static void cmci_resume(int i) { struct cmc_state *cc; uint64_t ctl; KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid))); /* See cmci_monitor(). */ if (cmc_state == NULL) return; /* Ignore banks not monitored by this CPU. */ if (!(PCPU_GET(cmci_mask) & 1 << i)) return; cc = &cmc_state[PCPU_GET(cpuid)][i]; cc->last_intr = 0; ctl = rdmsr(MSR_MC_CTL2(i)); ctl &= ~MC_CTL2_THRESHOLD; ctl |= MC_CTL2_CMCI_EN | 1; wrmsr(MSR_MC_CTL2(i), ctl); } /* * Apply an AMD ET configuration to the corresponding MSR. */ static void amd_thresholding_start(struct amd_et_state *cc, int bank) { uint64_t misc; KASSERT(amd_elvt >= 0, ("ELVT offset is not set")); misc = rdmsr(mca_msr_ops.misc(bank)); misc &= ~MC_MISC_AMD_INT_MASK; misc |= MC_MISC_AMD_INT_LVT; misc &= ~MC_MISC_AMD_LVT_MASK; misc |= (uint64_t)amd_elvt << MC_MISC_AMD_LVT_SHIFT; misc &= ~MC_MISC_AMD_CNT_MASK; misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold) << MC_MISC_AMD_CNT_SHIFT; misc &= ~MC_MISC_AMD_OVERFLOW; misc |= MC_MISC_AMD_CNTEN; wrmsr(mca_msr_ops.misc(bank), misc); } static void amd_thresholding_monitor(int i) { struct amd_et_state *cc; uint64_t misc; /* * Kludge: On 10h, banks after 4 are not thresholding but also may have * bogus Valid bits. Skip them. This is definitely fixed in 15h, but * I have not investigated whether it is fixed in earlier models. */ if (CPUID_TO_FAMILY(cpu_id) < 0x15 && i >= 5) return; /* The counter must be valid and present. */ misc = rdmsr(mca_msr_ops.misc(i)); if ((misc & (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP)) != (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP)) return; /* The register should not be locked. */ if ((misc & MC_MISC_AMD_LOCK) != 0) { if (bootverbose) printf("%s: 0x%jx: Bank %d: locked\n", __func__, (uintmax_t)misc, i); return; } /* * If counter is enabled then either the firmware or another CPU * has already claimed it. */ if ((misc & MC_MISC_AMD_CNTEN) != 0) { if (bootverbose) printf("%s: 0x%jx: Bank %d: already enabled\n", __func__, (uintmax_t)misc, i); return; } /* * Configure an Extended Interrupt LVT register for reporting * counter overflows if that feature is supported and the first * extended register is available. 
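 *
 * As amd_thresholding_start() above shows, the error counter in the
 * MISC register is preset to (MC_MISC_AMD_CNT_MAX - cur_threshold), so
 * it overflows, and the LVT interrupt fires, after cur_threshold
 * corrected errors.  With the initial cur_threshold of 1 that means an
 * interrupt on the very first error; if cur_threshold were later
 * raised to, say, 8 (a hypothetical value), the counter would start at
 * MC_MISC_AMD_CNT_MAX - 8 and take eight errors to wrap.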
*/ amd_elvt = lapic_enable_mca_elvt(); if (amd_elvt < 0) { printf("%s: Bank %d: lapic enable mca elvt failed: %d\n", __func__, i, amd_elvt); return; } cc = &amd_et_state[PCPU_GET(cpuid)][i]; cc->cur_threshold = 1; amd_thresholding_start(cc, i); /* Mark this bank as monitored. */ PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i); } static void amd_thresholding_resume(int i) { struct amd_et_state *cc; KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid))); /* Ignore banks not monitored by this CPU. */ if (!(PCPU_GET(cmci_mask) & 1 << i)) return; cc = &amd_et_state[PCPU_GET(cpuid)][i]; cc->last_intr = 0; cc->cur_threshold = 1; amd_thresholding_start(cc, i); } #endif /* * Initializes per-CPU machine check registers and enables corrected * machine check interrupts. */ static void _mca_init(int boot) { uint64_t mcg_cap; uint64_t ctl, mask; int i, skip, family; family = CPUID_TO_FAMILY(cpu_id); /* MCE is required. */ if (!mca_enabled || !(cpu_feature & CPUID_MCE)) return; if (cpu_feature & CPUID_MCA) { if (boot) PCPU_SET(cmci_mask, 0); mcg_cap = rdmsr(MSR_MCG_CAP); if (mcg_cap & MCG_CAP_CTL_P) /* Enable MCA features. */ wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE); if (IS_BSP() && boot) mca_setup(mcg_cap); /* * Disable logging of level one TLB parity (L1TP) errors by * the data cache as an alternative workaround for AMD Family * 10h Erratum 383. Unlike the recommended workaround, there * is no performance penalty to this workaround. However, * L1TP errors will go unreported. */ if (cpu_vendor_id == CPU_VENDOR_AMD && family == 0x10 && !amd10h_L1TP) { mask = rdmsr(MSR_MC0_CTL_MASK); if ((mask & (1UL << 5)) == 0) wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5)); } if (amd_rascap & AMDRAS_SCALABLE_MCA) { mca_msr_ops.ctl = mca_smca_ctl_reg; mca_msr_ops.status = mca_smca_status_reg; mca_msr_ops.addr = mca_smca_addr_reg; mca_msr_ops.misc = mca_smca_misc_reg; } /* Enable local MCE if supported. */ if (cpu_vendor_id == CPU_VENDOR_INTEL && (mcg_cap & MCG_CAP_LMCE_P) && (rdmsr(MSR_IA32_FEATURE_CONTROL) & IA32_FEATURE_CONTROL_LMCE_EN)) wrmsr(MSR_MCG_EXT_CTL, rdmsr(MSR_MCG_EXT_CTL) | 1); /* * The cmci_monitor() must not be executed * simultaneously by several CPUs. */ if (boot) mtx_lock_spin(&mca_lock); for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { /* By default enable logging of all errors. */ ctl = 0xffffffffffffffffUL; skip = 0; if (cpu_vendor_id == CPU_VENDOR_INTEL) { /* * For P6 models before Nehalem MC0_CTL is * always enabled and reserved. */ if (i == 0 && family == 0x6 && CPUID_TO_MODEL(cpu_id) < 0x1a) skip = 1; } else if (cpu_vendor_id == CPU_VENDOR_AMD) { /* BKDG for Family 10h: unset GartTblWkEn. */ if (i == MC_AMDNB_BANK && family >= 0xf && family < 0x17) ctl &= ~(1UL << 10); } if (!skip) wrmsr(mca_msr_ops.ctl(i), ctl); #ifdef DEV_APIC if (cmci_supported(mcg_cap)) { if (boot) cmci_monitor(i); else cmci_resume(i); } else if (amd_thresholding_supported()) { if (boot) amd_thresholding_monitor(i); else amd_thresholding_resume(i); } #endif /* Clear all errors. */ wrmsr(mca_msr_ops.status(i), 0); } if (boot) mtx_unlock_spin(&mca_lock); #ifdef DEV_APIC if (cmci_supported(mcg_cap) && PCPU_GET(cmci_mask) != 0 && boot) lapic_enable_cmc(); #endif } load_cr4(rcr4() | CR4_MCE); } /* Must be executed on each CPU during boot. */ void mca_init(void) { _mca_init(1); } /* Must be executed on each CPU during resume. */ void mca_resume(void) { _mca_init(0); } /* * The machine check registers for the BSP cannot be initialized until * the local APIC is initialized. This happens at SI_SUB_CPU, * SI_ORDER_SECOND. 
*/ static void mca_init_bsp(void *arg __unused) { mca_init(); } SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL); /* Called when a machine check exception fires. */ void mca_intr(void) { uint64_t mcg_status; int count; bool lmcs, recoverable; if (!(cpu_feature & CPUID_MCA)) { /* * Just print the values of the old Pentium registers * and panic. */ printf("MC Type: 0x%jx Address: 0x%jx\n", (uintmax_t)rdmsr(MSR_P5_MC_TYPE), (uintmax_t)rdmsr(MSR_P5_MC_ADDR)); panic("Machine check exception"); } /* Scan the banks and check for any non-recoverable errors. */ mcg_status = rdmsr(MSR_MCG_STATUS); recoverable = (mcg_status & MCG_STATUS_RIPV) != 0; lmcs = (cpu_vendor_id != CPU_VENDOR_INTEL || (mcg_status & MCG_STATUS_LMCS)); count = mca_scan(MCE, &recoverable); if (!recoverable) { /* * Only panic if the error was detected local to this CPU. * Some errors will assert a machine check on all CPUs, but * only certain CPUs will find a valid bank to log. */ while (!lmcs && count == 0) cpu_spinwait(); panic("Unrecoverable machine check exception"); } /* Clear MCIP. */ wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP); } #ifdef DEV_APIC /* Called for a CMCI (correctable machine check interrupt). */ void cmc_intr(void) { bool recoverable = true; /* * Serialize MCA bank scanning to prevent collisions from * sibling threads. * * If we found anything, log them to the console. */ if (mca_scan(CMCI, &recoverable) != 0) mca_process_records(CMCI); } #endif diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c index ddcb54b63d88..11b11471d736 100644 --- a/sys/x86/x86/mp_x86.c +++ b/sys/x86/x86/mp_x86.c @@ -1,1736 +1,1731 @@ /*- * Copyright (c) 1996, by Steve Passe * Copyright (c) 2003, by Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_acpi.h" #ifdef __i386__ #include "opt_apic.h" #endif #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_gdb.h" #include "opt_kstack_pages.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #include "opt_stack.h" #include #include #include #include #include /* cngetc() */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_ACPI #include #include #endif static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items"); int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; int bootAP; /* Free these after use */ void *bootstacks[MAXCPU]; void *dpcpu; struct susppcb **susppcbs; #ifdef COUNT_IPIS /* Interrupt counts. */ static u_long *ipi_preempt_counts[MAXCPU]; static u_long *ipi_ast_counts[MAXCPU]; u_long *ipi_invltlb_counts[MAXCPU]; u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif /* Default cpu_ops implementation. */ struct cpu_ops cpu_ops; /* * Local data and functions. */ static volatile cpuset_t ipi_stop_nmi_pending; volatile cpuset_t resuming_cpus; volatile cpuset_t toresume_cpus; /* used to hold the AP's until we are ready to release them */ struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info *cpu_info; int *apic_cpuids; int cpu_apic_ids[MAXCPU]; _Static_assert(MAXCPU <= MAX_APIC_ID, "MAXCPU cannot be larger that MAX_APIC_ID"); _Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID, "xAPIC_MAX_APIC_ID cannot be larger that MAX_APIC_ID"); static void release_aps(void *dummy); static void cpustop_handler_post(u_int cpu); static int hyperthreading_allowed = 1; SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); static int hyperthreading_intr_allowed = 0; SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN, &hyperthreading_intr_allowed, 0, "Allow interrupts on HTT logical CPUs"); static int intr_apic_id_limit = -1; SYSCTL_INT(_machdep, OID_AUTO, intr_apic_id_limit, CTLFLAG_RDTUN, &intr_apic_id_limit, 0, "Maximum permitted APIC ID for interrupt delivery (-1 is unlimited)"); static struct topo_node topo_root; static int pkg_id_shift; static int node_id_shift; static int core_id_shift; static int disabled_cpus; struct cache_info { int id_shift; int present; } static caches[MAX_CACHE_LEVELS]; static bool stop_mwait = false; SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0, "Use MONITOR/MWAIT when stopping CPU, if available"); void mem_range_AP_init(void) { if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) mem_range_softc.mr_op->initAP(&mem_range_softc); } /* * Round up to the next power of two, if necessary, and then * take log2. * Returns -1 if argument is zero. */ static __inline int mask_width(u_int x) { return (fls(x << (1 - powerof2(x))) - 1); } /* * Add a cache level to the cache topology description. 
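 *
 * share_count is the maximum number of logical CPUs that can share the
 * cache; mask_width() above turns it into the number of low APIC-ID
 * bits that vary among the sharers.  A few illustrative (hypothetical)
 * values:
 *
 *	mask_width(1)  == 0	private cache
 *	mask_width(12) == 4	rounded up to 16, log2(16) == 4
 *	mask_width(16) == 4
 *
 * so a cache with share_count 12 groups APIC IDs that agree in the
 * bits above bit 3, i.e. in blocks of 16.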
*/ static int add_deterministic_cache(int type, int level, int share_count) { if (type == 0) return (0); if (type > 3) { printf("unexpected cache type %d\n", type); return (1); } if (type == 2) /* ignore instruction cache */ return (1); if (level == 0 || level > MAX_CACHE_LEVELS) { printf("unexpected cache level %d\n", level); return (1); } if (caches[level - 1].present) { printf("WARNING: multiple entries for L%u data cache\n", level); printf("%u => %u\n", caches[level - 1].id_shift, mask_width(share_count)); } caches[level - 1].id_shift = mask_width(share_count); caches[level - 1].present = 1; if (caches[level - 1].id_shift > pkg_id_shift) { printf("WARNING: L%u data cache covers more " "APIC IDs than a package (%u > %u)\n", level, caches[level - 1].id_shift, pkg_id_shift); caches[level - 1].id_shift = pkg_id_shift; } if (caches[level - 1].id_shift < core_id_shift) { printf("WARNING: L%u data cache covers fewer " "APIC IDs than a core (%u < %u)\n", level, caches[level - 1].id_shift, core_id_shift); caches[level - 1].id_shift = core_id_shift; } return (1); } /* * Determine topology of processing units and caches for AMD CPUs. * See: * - AMD CPUID Specification (Publication # 25481) * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) * - BKDG For AMD Family 10h Processors (Publication # 31116) * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301) * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751) * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945) */ static void topo_probe_amd(void) { u_int p[4]; uint64_t v; int level; int nodes_per_socket; int share_count; int type; int i; /* No multi-core capability. */ if ((amd_feature2 & AMDID2_CMP) == 0) return; /* * XXX Lack of an AMD IOMMU driver prevents use of APIC IDs above * xAPIC_MAX_APIC_ID. This is a workaround so we boot and function on * AMD systems with high thread counts, albeit with reduced interrupt * performance. * * We should really set the limit to xAPIC_MAX_APIC_ID by default, and * have the IOMMU driver increase it. That way if a driver is present * but disabled, or is otherwise not able to route the interrupts, the * system can fall back to a functional state. That will require a more * substantial change though, including having the IOMMU initialize * earlier. */ if (intr_apic_id_limit == -1) intr_apic_id_limit = xAPIC_MAX_APIC_ID; /* For families 10h and newer. */ pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> AMDID_COREID_SIZE_SHIFT; /* For 0Fh family. */ if (pkg_id_shift == 0) pkg_id_shift = mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); /* * Families prior to 16h define the following value as * cores per compute unit and we don't really care about the AMD * compute units at the moment. Perhaps we should treat them as * cores and cores within the compute units as hardware threads, * but that's up for debate. * Later families define the value as threads per compute unit, * so we are following AMD's nomenclature here. */ if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 && CPUID_TO_FAMILY(cpu_id) >= 0x16) { cpuid_count(0x8000001e, 0, p); share_count = ((p[1] >> 8) & 0xff) + 1; core_id_shift = mask_width(share_count); /* * For Zen (17h), gather Nodes per Processor. Each node is a * Zeppelin die; TR and EPYC CPUs will have multiple dies per * package. Communication latency between dies is higher than * within them. 
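 *
 * As a hypothetical example (values not taken from any specific part):
 * with pkg_id_shift = 6 (64 APIC IDs per package) and CPUID leaf
 * 0x8000001e reporting 4 nodes per processor, the code below yields
 *
 *	node_id_shift = 6 - mask_width(4) = 6 - 2 = 4
 *
 * so APIC IDs that agree in bits [5:4] land on the same die.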
*/ nodes_per_socket = ((p[2] >> 8) & 0x7) + 1; node_id_shift = pkg_id_shift - mask_width(nodes_per_socket); } if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { for (i = 0; ; i++) { cpuid_count(0x8000001d, i, p); type = p[0] & 0x1f; level = (p[0] >> 5) & 0x7; share_count = 1 + ((p[0] >> 14) & 0xfff); if (!add_deterministic_cache(type, level, share_count)) break; } } else { if (cpu_exthigh >= 0x80000005) { cpuid_count(0x80000005, 0, p); if (((p[2] >> 24) & 0xff) != 0) { caches[0].id_shift = 0; caches[0].present = 1; } } if (cpu_exthigh >= 0x80000006) { cpuid_count(0x80000006, 0, p); if (((p[2] >> 16) & 0xffff) != 0) { caches[1].id_shift = 0; caches[1].present = 1; } if (((p[3] >> 18) & 0x3fff) != 0) { nodes_per_socket = 1; if ((amd_feature2 & AMDID2_NODE_ID) != 0) { /* * Handle multi-node processors that * have multiple chips, each with its * own L3 cache, on the same die. */ v = rdmsr(0xc001100c); nodes_per_socket = 1 + ((v >> 3) & 0x7); } caches[2].id_shift = pkg_id_shift - mask_width(nodes_per_socket); caches[2].present = 1; } } } } /* * Determine topology of processing units for Intel CPUs * using CPUID Leaf 1 and Leaf 4, if supported. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS */ static void topo_probe_intel_0x4(void) { u_int p[4]; int max_cores; int max_logical; /* Both zero and one here mean one logical processor per package. */ max_logical = (cpu_feature & CPUID_HTT) != 0 ? (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; if (max_logical <= 1) return; if (cpu_high >= 0x4) { cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; } else max_cores = 1; core_id_shift = mask_width(max_logical/max_cores); KASSERT(core_id_shift >= 0, ("intel topo: max_cores > max_logical\n")); pkg_id_shift = core_id_shift + mask_width(max_cores); } /* * Determine topology of processing units for Intel CPUs * using CPUID Leaf 1Fh or 0Bh, if supported. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS */ static void topo_probe_intel_0xb(void) { u_int leaf; u_int p[4] = { 0 }; int bits; int type; int i; /* Prefer leaf 1Fh (V2 Extended Topology Enumeration). */ if (cpu_high >= 0x1f) { leaf = 0x1f; cpuid_count(leaf, 0, p); } /* Fall back to leaf 0Bh (Extended Topology Enumeration). */ if (p[1] == 0) { leaf = 0x0b; cpuid_count(leaf, 0, p); } /* Fall back to leaf 04h (Deterministic Cache Parameters). */ if (p[1] == 0) { topo_probe_intel_0x4(); return; } /* We only support three levels for now. */ for (i = 0; ; i++) { cpuid_count(leaf, i, p); bits = p[0] & 0x1f; type = (p[2] >> 8) & 0xff; if (type == 0) break; if (type == CPUID_TYPE_SMT) core_id_shift = bits; else if (type == CPUID_TYPE_CORE) pkg_id_shift = bits; else if (bootverbose) printf("Topology level type %d shift: %d\n", type, bits); } if (pkg_id_shift < core_id_shift) { printf("WARNING: core covers more APIC IDs than a package\n"); core_id_shift = pkg_id_shift; } } /* * Determine topology of caches for Intel CPUs. 
* See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 Architectures Software Developer’s Manual * Volume 2A: Instruction Set Reference, A-M, * CPUID instruction */ static void topo_probe_intel_caches(void) { u_int p[4]; int level; int share_count; int type; int i; if (cpu_high < 0x4) { /* * Available cache level and sizes can be determined * via CPUID leaf 2, but that requires a huge table of hardcoded * values, so for now just assume L1 and L2 caches potentially * shared only by HTT processing units, if HTT is present. */ caches[0].id_shift = pkg_id_shift; caches[0].present = 1; caches[1].id_shift = pkg_id_shift; caches[1].present = 1; return; } for (i = 0; ; i++) { cpuid_count(0x4, i, p); type = p[0] & 0x1f; level = (p[0] >> 5) & 0x7; share_count = 1 + ((p[0] >> 14) & 0xfff); if (!add_deterministic_cache(type, level, share_count)) break; } } /* * Determine topology of processing units and caches for Intel CPUs. * See: * - Intel 64 Architecture Processor Topology Enumeration */ static void topo_probe_intel(void) { /* * Note that 0x1 <= cpu_high < 4 case should be * compatible with topo_probe_intel_0x4() logic when * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) * or it should trigger the fallback otherwise. */ if (cpu_high >= 0xb) topo_probe_intel_0xb(); else if (cpu_high >= 0x1) topo_probe_intel_0x4(); topo_probe_intel_caches(); } /* * Topology information is queried only on BSP, on which this * code runs and for which it can query CPUID information. * Then topology is extrapolated on all packages using an * assumption that APIC ID to hardware component ID mapping is * homogenious. * That doesn't necesserily imply that the topology is uniform. */ void topo_probe(void) { static int cpu_topo_probed = 0; struct x86_topo_layer { int type; int subtype; int id_shift; } topo_layers[MAX_CACHE_LEVELS + 5]; struct topo_node *parent; struct topo_node *node; int layer; int nlayers; int node_id; int i; #if defined(DEV_ACPI) && MAXMEMDOM > 1 int d, domain; #endif if (cpu_topo_probed) return; CPU_ZERO(&logical_cpus_mask); if (mp_ncpus <= 1) ; /* nothing */ else if (cpu_vendor_id == CPU_VENDOR_AMD || cpu_vendor_id == CPU_VENDOR_HYGON) topo_probe_amd(); else if (cpu_vendor_id == CPU_VENDOR_INTEL) topo_probe_intel(); KASSERT(pkg_id_shift >= core_id_shift, ("bug in APIC topology discovery")); nlayers = 0; bzero(topo_layers, sizeof(topo_layers)); topo_layers[nlayers].type = TOPO_TYPE_PKG; topo_layers[nlayers].id_shift = pkg_id_shift; if (bootverbose) printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; if (pkg_id_shift > node_id_shift && node_id_shift != 0) { topo_layers[nlayers].type = TOPO_TYPE_GROUP; topo_layers[nlayers].id_shift = node_id_shift; if (bootverbose) printf("Node ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; } /* * Consider all caches to be within a package/chip * and "in front" of all sub-components like * cores and hardware threads. 
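 *
 * For instance (hypothetical shifts, not from real hardware), a probe
 * might end up with the layer stack
 *
 *	PKG (id_shift 5) > L3 (3) > L2 (1) > L1 (1) > CORE (1) > PU (0)
 *
 * where every cache layer sits between the package and the core it is
 * attached to, as enforced by the clamping in
 * add_deterministic_cache().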
*/ for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { if (caches[i].present) { if (node_id_shift != 0) KASSERT(caches[i].id_shift <= node_id_shift, ("bug in APIC topology discovery")); KASSERT(caches[i].id_shift <= pkg_id_shift, ("bug in APIC topology discovery")); KASSERT(caches[i].id_shift >= core_id_shift, ("bug in APIC topology discovery")); topo_layers[nlayers].type = TOPO_TYPE_CACHE; topo_layers[nlayers].subtype = i + 1; topo_layers[nlayers].id_shift = caches[i].id_shift; if (bootverbose) printf("L%u cache ID shift: %u\n", topo_layers[nlayers].subtype, topo_layers[nlayers].id_shift); nlayers++; } } if (pkg_id_shift > core_id_shift) { topo_layers[nlayers].type = TOPO_TYPE_CORE; topo_layers[nlayers].id_shift = core_id_shift; if (bootverbose) printf("Core ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; } topo_layers[nlayers].type = TOPO_TYPE_PU; topo_layers[nlayers].id_shift = 0; nlayers++; #if defined(DEV_ACPI) && MAXMEMDOM > 1 if (vm_ndomains > 1) { for (layer = 0; layer < nlayers; ++layer) { for (i = 0; i <= max_apic_id; ++i) { if ((i & ((1 << topo_layers[layer].id_shift) - 1)) == 0) domain = -1; if (!cpu_info[i].cpu_present) continue; d = acpi_pxm_get_cpu_locality(i); if (domain >= 0 && domain != d) break; domain = d; } if (i > max_apic_id) break; } KASSERT(layer < nlayers, ("NUMA domain smaller than PU")); memmove(&topo_layers[layer+1], &topo_layers[layer], sizeof(*topo_layers) * (nlayers - layer)); topo_layers[layer].type = TOPO_TYPE_NODE; topo_layers[layer].subtype = CG_SHARE_NONE; nlayers++; } #endif topo_init_root(&topo_root); for (i = 0; i <= max_apic_id; ++i) { if (!cpu_info[i].cpu_present) continue; parent = &topo_root; for (layer = 0; layer < nlayers; ++layer) { #if defined(DEV_ACPI) && MAXMEMDOM > 1 if (topo_layers[layer].type == TOPO_TYPE_NODE) { node_id = acpi_pxm_get_cpu_locality(i); } else #endif node_id = i >> topo_layers[layer].id_shift; parent = topo_add_node_by_hwid(parent, node_id, topo_layers[layer].type, topo_layers[layer].subtype); } } parent = &topo_root; for (layer = 0; layer < nlayers; ++layer) { #if defined(DEV_ACPI) && MAXMEMDOM > 1 if (topo_layers[layer].type == TOPO_TYPE_NODE) node_id = acpi_pxm_get_cpu_locality(boot_cpu_id); else #endif node_id = boot_cpu_id >> topo_layers[layer].id_shift; node = topo_find_node_by_hwid(parent, node_id, topo_layers[layer].type, topo_layers[layer].subtype); topo_promote_child(node); parent = node; } cpu_topo_probed = 1; } /* * Assign logical CPU IDs to local APICs. */ void assign_cpu_ids(void) { struct topo_node *node; u_int smt_mask; int nhyper; smt_mask = (1u << core_id_shift) - 1; /* * Assign CPU IDs to local APIC IDs and disable any CPUs * beyond MAXCPU. CPU 0 is always assigned to the BSP. 
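 *
 * In the loop below, a PU is tagged cpu_hyperthread when its APIC ID
 * differs from the BSP's within smt_mask.  With a hypothetical
 * core_id_shift of 1 (two hardware threads per core), smt_mask is 0x1;
 * if the BSP is APIC ID 0, every odd APIC ID becomes an "extra" SMT
 * thread, which hyperthreading_allowed can then disable.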
*/ mp_ncpus = 0; nhyper = 0; TOPO_FOREACH(node, &topo_root) { if (node->type != TOPO_TYPE_PU) continue; if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) cpu_info[node->hwid].cpu_hyperthread = 1; if (resource_disabled("lapic", node->hwid)) { if (node->hwid != boot_cpu_id) cpu_info[node->hwid].cpu_disabled = 1; else printf("Cannot disable BSP, APIC ID = %d\n", node->hwid); } if (!hyperthreading_allowed && cpu_info[node->hwid].cpu_hyperthread) cpu_info[node->hwid].cpu_disabled = 1; if (mp_ncpus >= MAXCPU) cpu_info[node->hwid].cpu_disabled = 1; if (cpu_info[node->hwid].cpu_disabled) { disabled_cpus++; continue; } if (cpu_info[node->hwid].cpu_hyperthread) nhyper++; cpu_apic_ids[mp_ncpus] = node->hwid; apic_cpuids[node->hwid] = mp_ncpus; topo_set_pu_id(node, mp_ncpus); mp_ncpus++; } KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); mp_ncores = mp_ncpus - nhyper; smp_threads_per_core = mp_ncpus / mp_ncores; } /* * Print various information about the SMP system hardware and setup. */ void cpu_mp_announce(void) { struct topo_node *node; const char *hyperthread; struct topo_analysis topology; printf("FreeBSD/SMP: "); if (topo_analyze(&topo_root, 1, &topology)) { printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); if (topology.entities[TOPO_LEVEL_GROUP] > 1) printf(" x %d groups", topology.entities[TOPO_LEVEL_GROUP]); if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) printf(" x %d cache groups", topology.entities[TOPO_LEVEL_CACHEGROUP]); if (topology.entities[TOPO_LEVEL_CORE] > 0) printf(" x %d core(s)", topology.entities[TOPO_LEVEL_CORE]); if (topology.entities[TOPO_LEVEL_THREAD] > 1) printf(" x %d hardware threads", topology.entities[TOPO_LEVEL_THREAD]); } else { printf("Non-uniform topology"); } printf("\n"); if (disabled_cpus) { printf("FreeBSD/SMP Online: "); if (topo_analyze(&topo_root, 0, &topology)) { printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); if (topology.entities[TOPO_LEVEL_GROUP] > 1) printf(" x %d groups", topology.entities[TOPO_LEVEL_GROUP]); if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) printf(" x %d cache groups", topology.entities[TOPO_LEVEL_CACHEGROUP]); if (topology.entities[TOPO_LEVEL_CORE] > 0) printf(" x %d core(s)", topology.entities[TOPO_LEVEL_CORE]); if (topology.entities[TOPO_LEVEL_THREAD] > 1) printf(" x %d hardware threads", topology.entities[TOPO_LEVEL_THREAD]); } else { printf("Non-uniform topology"); } printf("\n"); } if (!bootverbose) return; TOPO_FOREACH(node, &topo_root) { switch (node->type) { case TOPO_TYPE_PKG: printf("Package HW ID = %u\n", node->hwid); break; case TOPO_TYPE_CORE: printf("\tCore HW ID = %u\n", node->hwid); break; case TOPO_TYPE_PU: if (cpu_info[node->hwid].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; if (node->subtype == 0) printf("\t\tCPU (AP%s): APIC ID: %u" "(disabled)\n", hyperthread, node->hwid); else if (node->id == 0) printf("\t\tCPU0 (BSP): APIC ID: %u\n", node->hwid); else printf("\t\tCPU%u (AP%s): APIC ID: %u\n", node->id, hyperthread, node->hwid); break; default: /* ignored */ break; } } } /* * Add a scheduling group, a group of logical processors sharing * a particular cache (and, thus having an affinity), to the scheduling * topology. * This function recursively works on lower level caches. 
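 *
 * As a rough, hypothetical illustration: for a single package with a
 * shared L3 and two SMT-2 cores with private L2s, the function builds
 *
 *	root (4 CPUs, cg_level raised to the L3 level)
 *	 +- L2 group (CPUs 0-1, CG_FLAG_SMT)
 *	 +- L2 group (CPUs 2-3, CG_FLAG_SMT)
 *
 * The L3 node itself is dropped because it covers exactly the same
 * CPUs as the root, and each L2 group gets CG_FLAG_SMT because it
 * holds more than one logical CPU but fewer than two cores.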
*/ static void x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) { struct topo_node *node; int nchildren; int ncores; int i; KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE || root->type == TOPO_TYPE_NODE || root->type == TOPO_TYPE_GROUP, ("x86topo_add_sched_group: bad type: %u", root->type)); CPU_COPY(&root->cpuset, &cg_root->cg_mask); cg_root->cg_count = root->cpu_count; if (root->type == TOPO_TYPE_CACHE) cg_root->cg_level = root->subtype; else cg_root->cg_level = CG_SHARE_NONE; if (root->type == TOPO_TYPE_NODE) cg_root->cg_flags = CG_FLAG_NODE; else cg_root->cg_flags = 0; /* * Check how many core nodes we have under the given root node. * If we have multiple logical processors, but not multiple * cores, then those processors must be hardware threads. */ ncores = 0; node = root; while (node != NULL) { if (node->type != TOPO_TYPE_CORE) { node = topo_next_node(root, node); continue; } ncores++; node = topo_next_nonchild_node(root, node); } if (cg_root->cg_level != CG_SHARE_NONE && root->cpu_count > 1 && ncores < 2) cg_root->cg_flags |= CG_FLAG_SMT; /* * Find out how many cache nodes we have under the given root node. * We ignore cache nodes that cover all the same processors as the * root node. Also, we do not descend below found cache nodes. * That is, we count top-level "non-redundant" caches under the root * node. */ nchildren = 0; node = root; while (node != NULL) { /* * When some APICs are disabled by tunables, nodes can end up * with an empty cpuset. Nodes with an empty cpuset will be * translated into cpu groups with empty cpusets. smp_topo_fill * will then set cg_first and cg_last to -1. This isn't * correctly handled in all functions. E.g. when * cpu_search_lowest and cpu_search_highest loop through all * cpus, they call CPU_ISSET on cpu -1 which ends up in a * general protection fault. * * We could fix the scheduler to handle empty cpu groups * correctly. Nevertheless, empty cpu groups are causing * overhead for no value. So, it makes more sense to just don't * create them. */ if (CPU_EMPTY(&node->cpuset)) { node = topo_next_node(root, node); continue; } if (CPU_CMP(&node->cpuset, &root->cpuset) == 0) { if (node->type == TOPO_TYPE_CACHE && cg_root->cg_level < node->subtype) cg_root->cg_level = node->subtype; if (node->type == TOPO_TYPE_NODE) cg_root->cg_flags |= CG_FLAG_NODE; node = topo_next_node(root, node); continue; } if (node->type != TOPO_TYPE_GROUP && node->type != TOPO_TYPE_NODE && node->type != TOPO_TYPE_CACHE) { node = topo_next_node(root, node); continue; } nchildren++; node = topo_next_nonchild_node(root, node); } /* * We are not interested in nodes including only one CPU each. */ if (nchildren == root->cpu_count) return; /* * We are not interested in nodes without children. */ cg_root->cg_children = nchildren; if (nchildren == 0) return; cg_root->cg_child = smp_topo_alloc(nchildren); /* * Now find again the same cache nodes as above and recursively * build scheduling topologies for them. */ node = root; i = 0; while (node != NULL) { if ((node->type != TOPO_TYPE_GROUP && node->type != TOPO_TYPE_NODE && node->type != TOPO_TYPE_CACHE) || CPU_CMP(&node->cpuset, &root->cpuset) == 0 || CPU_EMPTY(&node->cpuset)) { node = topo_next_node(root, node); continue; } cg_root->cg_child[i].cg_parent = cg_root; x86topo_add_sched_group(node, &cg_root->cg_child[i]); i++; node = topo_next_nonchild_node(root, node); } } /* * Build the MI scheduling topology from the discovered hardware topology. 
*/ struct cpu_group * cpu_topo(void) { struct cpu_group *cg_root; if (mp_ncpus <= 1) return (smp_topo_none()); cg_root = smp_topo_alloc(1); x86topo_add_sched_group(&topo_root, cg_root); return (cg_root); } static void cpu_alloc(void *dummy __unused) { /* * Dynamically allocate the arrays that depend on the * maximum APIC ID. */ cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS, M_WAITOK | M_ZERO); apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS, M_WAITOK | M_ZERO); } SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL); /* * Add a logical CPU to the topology. */ void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > max_apic_id) panic("SMP: APIC ID %d too high", apic_id); KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %u claims to be BSP, but CPU %u already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (bootverbose) printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { /* * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). * If there were no calls to cpu_add() assume this is a UP system. */ if (mp_ncpus == 0) mp_ncpus = 1; } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ CPU_SETOF(0, &all_cpus); return (mp_ncpus > 1); } /* * AP CPU's call this to initialize themselves. */ void init_secondary_tail(void) { u_int cpuid; pmap_activate_boot(vmspace_pmap(proc0.p_vmspace)); /* * On real hardware, switch to x2apic mode if possible. Do it * after aps_ready was signalled, to avoid manipulating the * mode while BSP might still want to send some IPI to us * (second startup IPI is ignored on modern hardware etc). */ lapic_xapic_mode(); /* Initialize the PAT MSR. */ pmap_init_pat(); /* set up CPU registers and state */ cpu_setregs(); /* set up SSE/NX */ initializecpu(); /* set up FPU state on the AP */ #ifdef __amd64__ fpuinit(); #else npxinit(false); #endif if (cpu_ops.cpu_init) cpu_ops.cpu_init(); /* A quick check from sanity claus */ cpuid = PCPU_GET(cpuid); if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", cpuid); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); schedinit_ap(); mtx_lock_spin(&ap_boot_mtx); mca_init(); /* Init local apic for irq's */ lapic_setup(1); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); if (bootverbose) printf("SMP: AP CPU #%d Launched!\n", cpuid); else printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "", cpuid, smp_cpus == mp_ncpus ? "\n" : " "); /* Determine if we are a logical CPU. */ if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) CPU_SET(cpuid, &logical_cpus_mask); if (bootverbose) lapic_dump("AP"); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); } #ifdef __amd64__ if (pmap_pcid_enabled) load_cr4(rcr4() | CR4_PCIDE); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); #endif mtx_unlock_spin(&ap_boot_mtx); /* Wait until all the AP's are up. 
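 * The last AP to launch publishes smp_started with
 * atomic_store_rel_int() above; the acquire load in the loop below
 * (and the spin on the same flag in release_aps() on the BSP)
 * therefore observes all of the per-CPU initialization done before
 * that store.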
*/ while (atomic_load_acq_int(&smp_started) == 0) ia32_pause(); -#ifndef EARLY_AP_STARTUP - /* Start per-CPU event timers. */ - cpu_initclocks_ap(); -#endif - kcsan_cpu_init(cpuid); sched_ap_entry(); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } static void smp_after_idle_runnable(void *arg __unused) { int cpu; if (mp_ncpus == 1) return; KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__)); /* * Wait for all APs to handle an interrupt. After that, we know that * the APs have entered the scheduler at least once, so the boot stacks * are safe to free. */ smp_rendezvous(smp_no_rendezvous_barrier, NULL, smp_no_rendezvous_barrier, NULL); for (cpu = 1; cpu < mp_ncpus; cpu++) { kmem_free(bootstacks[cpu], kstack_pages * PAGE_SIZE); } } SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY, smp_after_idle_runnable, NULL); /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. * We also do not tell it about the BSP since it tells itself about * the BSP internally to work with UP kernels and on UP machines. */ void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_bsp) continue; if (cpu_info[apic_id].cpu_disabled) continue; if (intr_apic_id_limit >= 0 && apic_id > intr_apic_id_limit) continue; /* Don't let hyperthreads service interrupts. */ if (cpu_info[apic_id].cpu_hyperthread && !hyperthreading_intr_allowed) continue; intr_add_cpu(i); } } #ifdef COUNT_XINVLTLB_HITS u_int xhits_gbl[MAXCPU]; u_int xhits_pg[MAXCPU]; u_int xhits_rng[MAXCPU]; static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, sizeof(xhits_gbl), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, sizeof(xhits_pg), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, sizeof(xhits_rng), "IU", ""); u_int ipi_global; u_int ipi_page; u_int ipi_range; u_int ipi_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 0, ""); #endif /* COUNT_XINVLTLB_HITS */ /* * Init and startup IPI. */ void ipi_startup(int apic_id, int vector) { /* * This attempts to follow the algorithm described in the * Intel Multiprocessor Specification v1.4 in section B.4. * For each IPI, we allow the local APIC ~20us to deliver the * IPI. If that times out, we panic. */ /* * first we do an INIT IPI: this INIT IPI might be run, resetting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); lapic_ipi_wait(100); /* Explicitly deassert the INIT IPI. 
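 * The explicit deassert below completes the level-triggered INIT half
 * of the MP-spec B.4 sequence described above.  The STARTUP IPIs that
 * follow encode, in their 8-bit vector, the physical page at which the
 * AP begins executing in real mode; as a purely hypothetical example,
 * a trampoline copied to 0x9000 would be started with vector 0x09.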
*/ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); DELAY(10000); /* wait ~10mS */ /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver first STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver second STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ } static bool ipi_bitmap_set(int cpu, u_int ipi) { u_int bitmap, old, new; u_int *cpu_bitmap; bitmap = 1 << ipi; cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap; old = *cpu_bitmap; for (;;) { if ((old & bitmap) != 0) break; new = old | bitmap; if (atomic_fcmpset_int(cpu_bitmap, &old, new)) break; } return (old != 0); } /* * Send an IPI to specified CPU handling the bitmap logic. */ static void ipi_send_cpu(int cpu, u_int ipi) { KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (IPI_IS_BITMAPED(ipi)) { if (ipi_bitmap_set(cpu, ipi)) return; ipi = IPI_BITMAP_VECTOR; } lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); } void ipi_bitmap_handler(struct trapframe frame) { struct trapframe *oldframe; struct thread *td; int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; kasan_mark(&frame, sizeof(frame), sizeof(frame), 0); td = curthread; ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]-> pc_ipi_bitmap); /* * sched_preempt() must be called to clear the pending preempt * IPI to enable delivery of further preempts. However, the * critical section will cause extra scheduler lock thrashing * when used unconditionally. Only critical_enter() if * hardclock must also run, which requires the section entry. */ if (ipi_bitmap & (1 << IPI_HARDCLOCK)) critical_enter(); td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = &frame; #if defined(STACK) || defined(DDB) if (ipi_bitmap & (1 << IPI_TRACE)) stack_capture_intr(); #endif if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(td); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { #ifdef COUNT_IPIS (*ipi_hardclock_counts[cpu])++; #endif hardclockintr(); } td->td_intr_frame = oldframe; td->td_intr_nesting_level--; if (ipi_bitmap & (1 << IPI_HARDCLOCK)) critical_exit(); } /* * send an IPI to a set of cpus. */ void ipi_selected(cpuset_t cpus, u_int ipi) { int cpu; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. 
*/ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); CPU_FOREACH_ISSET(cpu, &cpus) { CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { cpuset_t other_cpus; int cpu, c; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); } CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); if (IPI_IS_BITMAPED(ipi)) { cpu = PCPU_GET(cpuid); CPU_FOREACH(c) { if (c != cpu) ipi_bitmap_set(c, ipi); } ipi = IPI_BITMAP_VECTOR; } lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } void ipi_self_from_nmi(u_int vector) { lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF); /* Wait for IPI to finish. */ if (!lapic_ipi_wait(50000)) { if (KERNEL_PANICKED()) return; else panic("APIC: IPI is stuck"); } } int ipi_nmi_handler(void) { u_int cpuid; /* * As long as there is not a simple way to know about a NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. */ cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) return (1); CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); cpustop_handler(); return (0); } int nmi_kdb_lock; void nmi_call_kdb_smp(u_int type, struct trapframe *frame) { int cpu; bool call_post; cpu = PCPU_GET(cpuid); if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) { nmi_call_kdb(cpu, type, frame); call_post = false; } else { savectx(&stoppcbs[cpu]); CPU_SET_ATOMIC(cpu, &stopped_cpus); while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) ia32_pause(); call_post = true; } atomic_store_rel_int(&nmi_kdb_lock, 0); if (call_post) cpustop_handler_post(cpu); } /* * Handle an IPI_STOP by saving our current context and spinning (or mwaiting, * if available) until we are resumed. */ void cpustop_handler(void) { struct monitorbuf *mb; u_int cpu; bool use_mwait; cpu = PCPU_GET(cpuid); savectx(&stoppcbs[cpu]); use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 && !mwait_cpustop_broken); if (use_mwait) { mb = PCPU_PTR(monitorbuf); atomic_store_int(&mb->stop_state, MONITOR_STOPSTATE_STOPPED); } /* Indicate that we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) { if (use_mwait) { cpu_monitor(mb, 0, 0); if (atomic_load_int(&mb->stop_state) == MONITOR_STOPSTATE_STOPPED) cpu_mwait(0, MWAIT_C1); continue; } ia32_pause(); /* * Halt non-BSP CPUs on panic -- we're never going to need them * again, and might as well save power / release resources * (e.g., overprovisioned VM infrastructure). */ while (__predict_false(!IS_BSP() && KERNEL_PANICKED())) halt(); } cpustop_handler_post(cpu); } static void cpustop_handler_post(u_int cpu) { CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); /* * We don't broadcast TLB invalidations to other CPUs when they are * stopped. 
Hence, we clear the TLB before resuming. */ invltlb_glob(); #if defined(__amd64__) && (defined(DDB) || defined(GDB)) amd64_db_resume_dbreg(); #endif if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * Handle an IPI_SUSPEND by saving our current context and spinning until we * are resumed. */ void cpususpend_handler(void) { u_int cpu; mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); cpu = PCPU_GET(cpuid); if (savectx(&susppcbs[cpu]->sp_pcb)) { #ifdef __amd64__ fpususpend(susppcbs[cpu]->sp_fpususpend); #else npxsuspend(susppcbs[cpu]->sp_fpususpend); #endif /* * suspended_cpus is cleared shortly after each AP is restarted * by a Startup IPI, so that the BSP can proceed to restarting * the next AP. * * resuming_cpus gets cleared when the AP completes * initialization after having been released by the BSP. * resuming_cpus is probably not the best name for the * variable, because it is actually a set of processors that * haven't resumed yet and haven't necessarily started resuming. * * Note that suspended_cpus is meaningful only for ACPI suspend * as it's not really used for Xen suspend since the APs are * automatically restored to the running state and the correct * context. For the same reason resumectx is never called in * that case. */ CPU_SET_ATOMIC(cpu, &suspended_cpus); CPU_SET_ATOMIC(cpu, &resuming_cpus); /* * Invalidate the cache after setting the global status bits. * The last AP to set its bit may end up being an Owner of the * corresponding cache line in MOESI protocol. The AP may be * stopped before the cache line is written to the main memory. */ wbinvd(); } else { #ifdef __amd64__ fpuresume(susppcbs[cpu]->sp_fpususpend); #else npxresume(susppcbs[cpu]->sp_fpususpend); #endif pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); /* Indicate that we have restarted and restored the context. */ CPU_CLR_ATOMIC(cpu, &suspended_cpus); } /* Wait for resume directive */ while (!CPU_ISSET(cpu, &toresume_cpus)) ia32_pause(); /* Re-apply microcode updates. */ ucode_reload(); #ifdef __i386__ /* Finish removing the identity mapping of low memory for this AP. */ invltlb_glob(); #endif if (cpu_ops.cpu_resume) cpu_ops.cpu_resume(); #ifdef __amd64__ if (vmm_resume_p) vmm_resume_p(); #endif /* Resume MCA and local APIC */ lapic_xapic_mode(); mca_resume(); lapic_setup(0); /* Indicate that we are resumed */ CPU_CLR_ATOMIC(cpu, &resuming_cpus); CPU_CLR_ATOMIC(cpu, &suspended_cpus); CPU_CLR_ATOMIC(cpu, &toresume_cpus); } /* * Handle an IPI_SWI by waking delayed SWI thread. */ void ipi_swi_handler(struct trapframe frame) { intr_event_handle(clk_intr_event, &frame); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); #ifdef COUNT_IPIS /* * Setup interrupt counters for IPI handlers. 
 */
static void
mp_ipi_intrcnt(void *dummy)
{
	char buf[64];
	int i;

	CPU_FOREACH(i) {
		snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
		intrcnt_add(buf, &ipi_invltlb_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
		intrcnt_add(buf, &ipi_invlrng_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
		intrcnt_add(buf, &ipi_invlpg_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
		intrcnt_add(buf, &ipi_invlcache_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
		intrcnt_add(buf, &ipi_preempt_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:ast", i);
		intrcnt_add(buf, &ipi_ast_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
		intrcnt_add(buf, &ipi_rendezvous_counts[i]);
		snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
		intrcnt_add(buf, &ipi_hardclock_counts[i]);
	}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif