
0003-Add-CPU-core-performance-efficiency-score-variable-t.patch

Authored By
koinec_yahoo.co.jp
Fri, May 10, 12:32 PM
Size
118 KB
Referenced Files
None
Subscribers
None


From b313fd786984e705dd63a2efb2ac06413aafab68 Mon Sep 17 00:00:00 2001
From: Koine Yuusuke <koinec@yahoo.co.jp>
Date: Mon, 6 May 2024 11:30:00 +0900
Subject: [PATCH 3/7] Add a CPU core performance/efficiency score variable to
SMP's cpu_group struct for the coredirector driver. The ULE scheduler seems
to consult the "cpu_group" structure generated by the SMP code when deciding
which cores to assign tasks to, but it currently appears to treat the
performance of all cores as equal. To make that behaviour tunable, add a
score variable array for each CPU core group to the "cpu_group" structure.
---
sys/conf/options | 1 +
sys/kern/subr_smp.c | 8 +++++++-
sys/sys/smp.h | 13 +++++++++++++
sys/x86/x86/mp_x86.c | 8 ++++++++
4 files changed, 29 insertions(+), 1 deletion(-)
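
Before the diff itself, a rough sketch of the shape of the change, pieced
together from the hunks below. Only the names CPUGRP_SCORE, cg_score, and
CG_SCORE_DEFAULT appear in the patch; the element type, array length, and
default value here are assumptions (the smp.h hunk that actually declares the
member is truncated in this excerpt), so read this as an illustration rather
than the literal change:

#include <stdint.h>
#include <string.h>

#define MAXCPU			256	/* placeholder; the kernel config sets this */
#define CG_SCORE_DEFAULT	0x7f	/* assumed neutral per-core score */

/* Reduced stand-in for the kernel's struct cpu_group. */
struct cpu_group {
	/* ... existing members (cg_parent, cg_child, cg_mask, ...) ... */
	int8_t	cg_score[MAXCPU];	/* assumed: one score slot per CPU */
};

/* Mirrors the initialization pattern the patch adds in subr_smp.c. */
static void
cpu_group_score_init(struct cpu_group *cg)
{
	memset(cg->cg_score, CG_SCORE_DEFAULT, sizeof(cg->cg_score));
}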
diff --git a/sys/conf/options b/sys/conf/options
index fcab21ad7e78..fbf8d3c2b731 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -1,1025 +1,1026 @@
#
# On the handling of kernel options
#
# All kernel options should be listed in NOTES, with suitable
# descriptions. Negative options (options that make some code not
# compile) should be commented out; LINT (generated from NOTES) should
# compile as much code as possible. Try to structure option-using
# code so that a single option only switches code on, or only switches
# code off, to make it possible to have a full compile-test. If
# necessary, you can check for COMPILING_LINT to get maximum code
# coverage.
#
# All new options shall also be listed in either "conf/options" or
# "conf/options.<machine>". Options that affect a single source-file
# <xxx>.[c|s] should be directed into "opt_<xxx>.h", while options
# that affect multiple files should either go in "opt_global.h" if
# this is a kernel-wide option (used just about everywhere), or in
# "opt_<option-name-in-lower-case>.h" if it affects only some files.
# Note that the effect of listing only an option without a
# header-file-name in conf/options (and cousins) is that the last
# convention is followed.
#
# This handling scheme is not yet fully implemented.
#
#
# Format of this file:
# Option name filename
#
# If filename is missing, the default is
# opt_<name-of-option-in-lower-case>.h
AAC_DEBUG opt_aac.h
AACRAID_DEBUG opt_aacraid.h
AHC_ALLOW_MEMIO opt_aic7xxx.h
AHC_TMODE_ENABLE opt_aic7xxx.h
AHC_DUMP_EEPROM opt_aic7xxx.h
AHC_DEBUG opt_aic7xxx.h
AHC_DEBUG_OPTS opt_aic7xxx.h
AHC_REG_PRETTY_PRINT opt_aic7xxx.h
AHD_DEBUG opt_aic79xx.h
AHD_DEBUG_OPTS opt_aic79xx.h
AHD_TMODE_ENABLE opt_aic79xx.h
AHD_REG_PRETTY_PRINT opt_aic79xx.h
# Debugging options.
ALT_BREAK_TO_DEBUGGER opt_kdb.h
BREAK_TO_DEBUGGER opt_kdb.h
BUF_TRACKING opt_global.h
DDB
DDB_BUFR_SIZE opt_ddb.h
DDB_CAPTURE_DEFAULTBUFSIZE opt_ddb.h
DDB_CAPTURE_MAXBUFSIZE opt_ddb.h
DDB_CTF opt_ddb.h
DDB_NUMSYM opt_ddb.h
EARLY_PRINTF opt_global.h
FULL_BUF_TRACKING opt_global.h
GDB
KDB opt_global.h
KDB_TRACE opt_kdb.h
KDB_UNATTENDED opt_kdb.h
KLD_DEBUG opt_kld.h
NUM_CORE_FILES opt_global.h
QUEUE_MACRO_DEBUG_TRACE opt_global.h
QUEUE_MACRO_DEBUG_TRASH opt_global.h
SYSCTL_DEBUG opt_sysctl.h
TEXTDUMP_PREFERRED opt_ddb.h
TEXTDUMP_VERBOSE opt_ddb.h
TSLOG opt_global.h
TSLOG_PAGEZERO opt_global.h
TSLOGSIZE opt_global.h
# Miscellaneous options.
ALQ
ALTERA_SDCARD_FAST_SIM opt_altera_sdcard.h
ATSE_CFI_HACK opt_cfi.h
AUDIT opt_global.h
BOOTHOWTO opt_global.h
BOOTVERBOSE opt_global.h
CALLOUT_PROFILING
CAPABILITIES opt_capsicum.h
CAPABILITY_MODE opt_capsicum.h
CC_CDG opt_global.h
CC_CHD opt_global.h
CC_CUBIC opt_global.h
CC_DEFAULT opt_cc.h
CC_DCTCP opt_global.h
CC_HD opt_global.h
CC_HTCP opt_global.h
CC_NEWRENO opt_global.h
CC_VEGAS opt_global.h
COMPAT_43 opt_global.h
COMPAT_43TTY opt_global.h
COMPAT_FREEBSD4 opt_global.h
COMPAT_FREEBSD5 opt_global.h
COMPAT_FREEBSD6 opt_global.h
COMPAT_FREEBSD7 opt_global.h
COMPAT_FREEBSD9 opt_global.h
COMPAT_FREEBSD10 opt_global.h
COMPAT_FREEBSD11 opt_global.h
COMPAT_FREEBSD12 opt_global.h
COMPAT_FREEBSD13 opt_global.h
COMPAT_FREEBSD14 opt_global.h
COMPAT_LINUXKPI opt_dontuse.h
COMPILING_LINT opt_global.h
CY_PCI_FASTINTR
DEADLKRES opt_watchdog.h
EXPERIMENTAL opt_global.h
DIRECTIO
FILEMON opt_dontuse.h
FFCLOCK
FULL_PREEMPTION opt_sched.h
GZIO opt_gzio.h
IMGACT_BINMISC opt_dontuse.h
IPI_PREEMPTION opt_sched.h
GEOM_BDE opt_geom.h
GEOM_CACHE opt_geom.h
GEOM_CONCAT opt_geom.h
GEOM_ELI opt_geom.h
GEOM_GATE opt_geom.h
GEOM_JOURNAL opt_geom.h
GEOM_LABEL opt_geom.h
GEOM_LABEL_GPT opt_geom.h
GEOM_LINUX_LVM opt_geom.h
GEOM_MAP opt_geom.h
GEOM_MIRROR opt_geom.h
GEOM_MOUNTVER opt_geom.h
GEOM_MULTIPATH opt_geom.h
GEOM_NOP opt_geom.h
GEOM_PART_APM opt_geom.h
GEOM_PART_BSD opt_geom.h
GEOM_PART_BSD64 opt_geom.h
GEOM_PART_EBR opt_geom.h
GEOM_PART_GPT opt_geom.h
GEOM_PART_LDM opt_geom.h
GEOM_PART_MBR opt_geom.h
GEOM_RAID opt_geom.h
GEOM_RAID3 opt_geom.h
GEOM_SHSEC opt_geom.h
GEOM_STRIPE opt_geom.h
GEOM_UZIP opt_geom.h
GEOM_UZIP_DEBUG opt_geom.h
GEOM_VINUM opt_geom.h
GEOM_VIRSTOR opt_geom.h
GEOM_ZERO opt_geom.h
IFLIB opt_iflib.h
KDTRACE_HOOKS opt_global.h
KDTRACE_FRAME opt_kdtrace.h
KDTRACE_NO_MIB_SDT opt_global.h
KN_HASHSIZE opt_kqueue.h
KSTACK_MAX_PAGES
KSTACK_PAGES
KSTACK_USAGE_PROF
KTRACE
KTRACE_REQUEST_POOL opt_ktrace.h
LIBICONV
MAC opt_global.h
MAC_BIBA opt_dontuse.h
MAC_BSDEXTENDED opt_dontuse.h
MAC_DDB opt_dontuse.h
MAC_DEBUG opt_mac.h
MAC_IFOFF opt_dontuse.h
MAC_IPACL opt_dontuse.h
MAC_LOMAC opt_dontuse.h
MAC_MLS opt_dontuse.h
MAC_NONE opt_dontuse.h
MAC_NTPD opt_dontuse.h
MAC_PARTITION opt_dontuse.h
MAC_PORTACL opt_dontuse.h
MAC_PRIORITY opt_dontuse.h
MAC_SEEOTHERUIDS opt_dontuse.h
MAC_STATIC opt_mac.h
MAC_STUB opt_dontuse.h
MAC_TEST opt_dontuse.h
MAC_GRANTBYLABEL opt_dontuse.h
MAC_VERIEXEC opt_dontuse.h
MAC_VERIEXEC_DEBUG opt_mac.h
MAC_VERIEXEC_SHA1 opt_dontuse.h
MAC_VERIEXEC_SHA256 opt_dontuse.h
MAC_VERIEXEC_SHA384 opt_dontuse.h
MAC_VERIEXEC_SHA512 opt_dontuse.h
MD_ROOT opt_md.h
MD_ROOT_FSTYPE opt_md.h
MD_ROOT_READONLY opt_md.h
MD_ROOT_SIZE opt_md.h
MD_ROOT_MEM opt_md.h
MFI_DEBUG opt_mfi.h
MFI_DECODE_LOG opt_mfi.h
MPROF_BUFFERS opt_mprof.h
MPROF_HASH_SIZE opt_mprof.h
NEW_PCIB opt_global.h
NO_ADAPTIVE_MUTEXES opt_adaptive_mutexes.h
NO_ADAPTIVE_RWLOCKS
NO_ADAPTIVE_SX
NO_OBSOLETE_CODE opt_global.h
NO_SYSCTL_DESCR opt_global.h
NSWBUF_MIN opt_param.h
MBUF_PACKET_ZONE_DISABLE opt_global.h
PANIC_REBOOT_WAIT_TIME opt_panic.h
PCI_HP opt_pci.h
PCI_IOV opt_global.h
PPC_DEBUG opt_ppc.h
PPC_PROBE_CHIPSET opt_ppc.h
PPS_SYNC opt_ntp.h
PREEMPTION opt_sched.h
QUOTA
SCHED_4BSD opt_sched.h
SCHED_STATS opt_sched.h
SCHED_ULE opt_sched.h
SLEEPQUEUE_PROFILING
SLHCI_DEBUG opt_slhci.h
STACK opt_stack.h
SUIDDIR
MSGMNB opt_sysvipc.h
MSGMNI opt_sysvipc.h
MSGSEG opt_sysvipc.h
MSGSSZ opt_sysvipc.h
MSGTQL opt_sysvipc.h
SEMMNI opt_sysvipc.h
SEMMNS opt_sysvipc.h
SEMMNU opt_sysvipc.h
SEMMSL opt_sysvipc.h
SEMOPM opt_sysvipc.h
SEMUME opt_sysvipc.h
SHMALL opt_sysvipc.h
SHMMAX opt_sysvipc.h
SHMMAXPGS opt_sysvipc.h
SHMMIN opt_sysvipc.h
SHMMNI opt_sysvipc.h
SHMSEG opt_sysvipc.h
SYSVMSG opt_sysvipc.h
SYSVSEM opt_sysvipc.h
SYSVSHM opt_sysvipc.h
SW_WATCHDOG opt_watchdog.h
TCPHPTS
TCP_REQUEST_TRK opt_global.h
TCP_ACCOUNTING opt_global.h
TCP_BBR opt_inet.h
TCP_RACK opt_inet.h
#
# TCP SaD Detection is an experimental Sack attack Detection (SaD)
# algorithm that uses "normal" behaviour with SACKs to detect
# a possible attack. It is strictly experimental at this point.
#
TCP_SAD_DETECTION opt_inet.h
TURNSTILE_PROFILING
UMTX_PROFILING
UMTX_CHAINS opt_global.h
VERBOSE_SYSINIT
ZSTDIO opt_zstdio.h
# Sanitizers
COVERAGE opt_global.h
KASAN opt_global.h
KCOV
KCSAN opt_global.h
KMSAN opt_global.h
KUBSAN opt_global.h
# POSIX kernel options
P1003_1B_MQUEUE opt_posix.h
P1003_1B_SEMAPHORES opt_posix.h
_KPOSIX_PRIORITY_SCHEDULING opt_posix.h
# Do we want the config file compiled into the kernel?
INCLUDE_CONFIG_FILE opt_config.h
# Options for static filesystems. These should only be used at config
# time, since the corresponding lkms cannot work if there are any static
# dependencies. Unusability is enforced by hiding the defines for the
# options in a never-included header.
AUTOFS opt_dontuse.h
CD9660 opt_dontuse.h
EXT2FS opt_dontuse.h
FDESCFS opt_dontuse.h
FFS opt_dontuse.h
FUSEFS opt_dontuse.h
MSDOSFS opt_dontuse.h
NULLFS opt_dontuse.h
PROCFS opt_dontuse.h
PSEUDOFS opt_dontuse.h
SMBFS opt_dontuse.h
TARFS opt_dontuse.h
TMPFS opt_dontuse.h
UDF opt_dontuse.h
UNIONFS opt_dontuse.h
ZFS opt_dontuse.h
# Pseudofs debugging
PSEUDOFS_TRACE opt_pseudofs.h
# Tarfs debugging
TARFS_DEBUG opt_tarfs.h
# In-kernel GSS-API
KGSSAPI opt_kgssapi.h
KGSSAPI_DEBUG opt_kgssapi.h
# These static filesystems have one slightly bogus static dependency in
# sys/i386/i386/autoconf.c. If any of these filesystems are
# statically compiled into the kernel, code for mounting them as root
# filesystems will be enabled - but look below.
# NFSCL - client
# NFSD - server
NFSCL opt_nfs.h
NFSD opt_nfs.h
# filesystems and libiconv bridge
CD9660_ICONV opt_dontuse.h
MSDOSFS_ICONV opt_dontuse.h
UDF_ICONV opt_dontuse.h
# If you are following the conditions in the copyright,
# you can enable soft-updates, which will speed up a lot of things
# and make the system safer from crashes at the same time.
# Otherwise a STUB module will be compiled in.
SOFTUPDATES opt_ffs.h
# On small, embedded systems, it can be useful to turn off support for
# snapshots. It saves about 30-40k for a feature that would be lightly
# used, if it is used at all.
NO_FFS_SNAPSHOT opt_ffs.h
# Enabling this option turns on support for Access Control Lists in UFS,
# which can be used to support high security configurations. Depends on
# UFS_EXTATTR.
UFS_ACL opt_ufs.h
# Enabling this option turns on support for extended attributes in UFS-based
# filesystems, which can be used to support high security configurations
# as well as new filesystem features.
UFS_EXTATTR opt_ufs.h
UFS_EXTATTR_AUTOSTART opt_ufs.h
# Enable fast hash lookups for large directories on UFS-based filesystems.
UFS_DIRHASH opt_ufs.h
# Enable gjournal-based UFS journal.
UFS_GJOURNAL opt_ufs.h
# We plan to remove the static dependencies above, with a
# <filesystem>_ROOT option to control whether it is usable as root. This list
# allows these options to be present in config files already (though
# they won't make any difference yet).
NFS_ROOT opt_nfsroot.h
# SMB/CIFS requester
NETSMB opt_netsmb.h
# Enable debugnet(4) networking support.
DEBUGNET opt_global.h
# Enable netdump(4) client support.
NETDUMP opt_global.h
# Enable netgdb(4) support.
NETGDB opt_global.h
# Options used only in subr_param.c.
HZ opt_param.h
MAXFILES opt_param.h
NBUF opt_param.h
NSFBUFS opt_param.h
VM_BCACHE_SIZE_MAX opt_param.h
VM_SWZONE_SIZE_MAX opt_param.h
MAXUSERS
DFLDSIZ opt_param.h
MAXDSIZ opt_param.h
MAXSSIZ opt_param.h
# Generic SCSI options.
CAM_MAX_HIGHPOWER opt_cam.h
CAMDEBUG opt_cam.h
CAM_DEBUG_COMPILE opt_cam.h
CAM_DEBUG_DELAY opt_cam.h
CAM_DEBUG_BUS opt_cam.h
CAM_DEBUG_TARGET opt_cam.h
CAM_DEBUG_LUN opt_cam.h
CAM_DEBUG_FLAGS opt_cam.h
CAM_BOOT_DELAY opt_cam.h
CAM_IOSCHED_DYNAMIC opt_cam.h
CAM_IO_STATS opt_cam.h
CAM_TEST_FAILURE opt_cam.h
SCSI_DELAY opt_scsi.h
SCSI_NO_SENSE_STRINGS opt_scsi.h
SCSI_NO_OP_STRINGS opt_scsi.h
# Options used only in cam/ata/ata_da.c
ATA_STATIC_ID opt_ada.h
# Options used only in cam/scsi/scsi_cd.c
CHANGER_MIN_BUSY_SECONDS opt_cd.h
CHANGER_MAX_BUSY_SECONDS opt_cd.h
# Options used only in cam/scsi/scsi_da.c
DA_TRACK_REFS opt_da.h
# Options used only in cam/scsi/scsi_sa.c.
SA_IO_TIMEOUT opt_sa.h
SA_SPACE_TIMEOUT opt_sa.h
SA_REWIND_TIMEOUT opt_sa.h
SA_ERASE_TIMEOUT opt_sa.h
SA_1FM_AT_EOD opt_sa.h
# Options used only in cam/scsi/scsi_pt.c
SCSI_PT_DEFAULT_TIMEOUT opt_pt.h
# Options used only in cam/scsi/scsi_ses.c
SES_ENABLE_PASSTHROUGH opt_ses.h
# Options used in dev/sym/ (Symbios SCSI driver).
SYM_SETUP_SCSI_DIFF opt_sym.h #-HVD support for 825a, 875, 885
# disabled:0 (default), enabled:1
SYM_SETUP_PCI_PARITY opt_sym.h #-PCI parity checking
# disabled:0, enabled:1 (default)
SYM_SETUP_MAX_LUN opt_sym.h #-Number of LUNs supported
# default:8, range:[1..64]
# Options used only in dev/isp/*
ISP_TARGET_MODE opt_isp.h
ISP_FW_CRASH_DUMP opt_isp.h
ISP_DEFAULT_ROLES opt_isp.h
ISP_INTERNAL_TARGET opt_isp.h
ISP_FCTAPE_OFF opt_isp.h
# Net stuff.
ACCEPT_FILTER_DATA
ACCEPT_FILTER_DNS
ACCEPT_FILTER_HTTP
ACCEPT_FILTER_TLS
ALTQ opt_global.h
ALTQ_CBQ opt_altq.h
ALTQ_CDNR opt_altq.h
ALTQ_CODEL opt_altq.h
ALTQ_DEBUG opt_altq.h
ALTQ_HFSC opt_altq.h
ALTQ_FAIRQ opt_altq.h
ALTQ_NOPCC opt_altq.h
ALTQ_PRIQ opt_altq.h
ALTQ_RED opt_altq.h
ALTQ_RIO opt_altq.h
BOOTP opt_bootp.h
BOOTP_BLOCKSIZE opt_bootp.h
BOOTP_COMPAT opt_bootp.h
BOOTP_NFSROOT opt_bootp.h
BOOTP_NFSV3 opt_bootp.h
BOOTP_WIRED_TO opt_bootp.h
DEVICE_POLLING
DUMMYNET opt_ipdn.h
RATELIMIT opt_ratelimit.h
RATELIMIT_DEBUG opt_ratelimit.h
INET opt_inet.h
INET6 opt_inet6.h
STATS opt_global.h
IPDIVERT
IPFILTER opt_ipfilter.h
IPFILTER_DEFAULT_BLOCK opt_ipfilter.h
IPFILTER_LOG opt_ipfilter.h
IPFILTER_LOOKUP opt_ipfilter.h
IPFIREWALL opt_ipfw.h
IPFIREWALL_DEFAULT_TO_ACCEPT opt_ipfw.h
IPFIREWALL_NAT opt_ipfw.h
IPFIREWALL_NAT64 opt_ipfw.h
IPFIREWALL_NPTV6 opt_ipfw.h
IPFIREWALL_VERBOSE opt_ipfw.h
IPFIREWALL_VERBOSE_LIMIT opt_ipfw.h
IPFIREWALL_PMOD opt_ipfw.h
IPSEC opt_ipsec.h
IPSEC_DEBUG opt_ipsec.h
IPSEC_SUPPORT opt_ipsec.h
IPSTEALTH
KERN_TLS
KRPC
LIBALIAS
LIBMCHAIN
MBUF_PROFILING
MBUF_STRESS_TEST
MROUTING opt_mrouting.h
NFSLOCKD
NETLINK opt_global.h
PF_DEFAULT_TO_DROP opt_pf.h
ROUTE_MPATH opt_route.h
ROUTETABLES opt_route.h
FIB_ALGO opt_route.h
RSS opt_rss.h
SLIP_IFF_OPTS opt_slip.h
TCPPCAP opt_global.h
SIFTR
TCP_BLACKBOX opt_global.h
TCP_HHOOK opt_global.h
TCP_OFFLOAD opt_inet.h # Enable code to dispatch TCP offloading
TCP_RFC7413 opt_inet.h
TCP_RFC7413_MAX_KEYS opt_inet.h
TCP_RFC7413_MAX_PSKS opt_inet.h
TCP_SIGNATURE opt_ipsec.h
VLAN_ARRAY opt_vlan.h
XDR
XBONEHACK
#
# SCTP
#
SCTP opt_sctp.h
SCTP_SUPPORT opt_sctp.h
SCTP_DEBUG opt_sctp.h # Enable debug printfs
SCTP_LOCK_LOGGING opt_sctp.h # Log to KTR lock activity
SCTP_MBUF_LOGGING opt_sctp.h # Log to KTR general mbuf aloc/free
SCTP_MBCNT_LOGGING opt_sctp.h # Log to KTR mbcnt activity
SCTP_PACKET_LOGGING opt_sctp.h # Log to a packet buffer last N packets
SCTP_LTRACE_CHUNKS opt_sctp.h # Log to KTR chunks processed
SCTP_LTRACE_ERRORS opt_sctp.h # Log to KTR error returns.
SCTP_USE_PERCPU_STAT opt_sctp.h # Use per cpu stats.
SCTP_MCORE_INPUT opt_sctp.h # Have multiple input threads for input mbufs
SCTP_LOCAL_TRACE_BUF opt_sctp.h # Use tracebuffer exported via sysctl
SCTP_DETAILED_STR_STATS opt_sctp.h # Use per PR-SCTP policy stream stats
#
#
#
# Netgraph(4). Use option NETGRAPH to enable the base netgraph code.
# Each netgraph node type can either be compiled into the kernel
# or loaded dynamically. To get the former, include the corresponding
# option below. Each type has its own man page, e.g. ng_async(4).
NETGRAPH
NETGRAPH_DEBUG opt_netgraph.h
NETGRAPH_ASYNC opt_netgraph.h
NETGRAPH_BLUETOOTH opt_netgraph.h
NETGRAPH_BLUETOOTH_BT3C opt_netgraph.h
NETGRAPH_BLUETOOTH_H4 opt_netgraph.h
NETGRAPH_BLUETOOTH_HCI opt_netgraph.h
NETGRAPH_BLUETOOTH_L2CAP opt_netgraph.h
NETGRAPH_BLUETOOTH_SOCKET opt_netgraph.h
NETGRAPH_BLUETOOTH_UBT opt_netgraph.h
NETGRAPH_BLUETOOTH_UBTBCMFW opt_netgraph.h
NETGRAPH_BPF opt_netgraph.h
NETGRAPH_BRIDGE opt_netgraph.h
NETGRAPH_CAR opt_netgraph.h
NETGRAPH_CHECKSUM opt_netgraph.h
NETGRAPH_CISCO opt_netgraph.h
NETGRAPH_DEFLATE opt_netgraph.h
NETGRAPH_DEVICE opt_netgraph.h
NETGRAPH_ECHO opt_netgraph.h
NETGRAPH_EIFACE opt_netgraph.h
NETGRAPH_ETHER opt_netgraph.h
NETGRAPH_ETHER_ECHO opt_netgraph.h
NETGRAPH_FEC opt_netgraph.h
NETGRAPH_FRAME_RELAY opt_netgraph.h
NETGRAPH_GIF opt_netgraph.h
NETGRAPH_GIF_DEMUX opt_netgraph.h
NETGRAPH_HOLE opt_netgraph.h
NETGRAPH_IFACE opt_netgraph.h
NETGRAPH_IP_INPUT opt_netgraph.h
NETGRAPH_IPFW opt_netgraph.h
NETGRAPH_KSOCKET opt_netgraph.h
NETGRAPH_L2TP opt_netgraph.h
NETGRAPH_LMI opt_netgraph.h
NETGRAPH_MPPC_COMPRESSION opt_netgraph.h
NETGRAPH_MPPC_ENCRYPTION opt_netgraph.h
NETGRAPH_NAT opt_netgraph.h
NETGRAPH_NETFLOW opt_netgraph.h
NETGRAPH_ONE2MANY opt_netgraph.h
NETGRAPH_PATCH opt_netgraph.h
NETGRAPH_PIPE opt_netgraph.h
NETGRAPH_PPP opt_netgraph.h
NETGRAPH_PPPOE opt_netgraph.h
NETGRAPH_PPTPGRE opt_netgraph.h
NETGRAPH_PRED1 opt_netgraph.h
NETGRAPH_RFC1490 opt_netgraph.h
NETGRAPH_SOCKET opt_netgraph.h
NETGRAPH_SPLIT opt_netgraph.h
NETGRAPH_SPPP opt_netgraph.h
NETGRAPH_TAG opt_netgraph.h
NETGRAPH_TCPMSS opt_netgraph.h
NETGRAPH_TEE opt_netgraph.h
NETGRAPH_TTY opt_netgraph.h
NETGRAPH_UI opt_netgraph.h
NETGRAPH_VJC opt_netgraph.h
NETGRAPH_VLAN opt_netgraph.h
# DRM options
DRM_DEBUG opt_drm.h
TI_SF_BUF_JUMBO opt_ti.h
TI_JUMBO_HDRSPLIT opt_ti.h
# Misc debug flags. Most of these should probably be replaced with
# 'DEBUG', and then let people recompile just the interesting modules
# with 'make CC="cc -DDEBUG"'.
DEBUG_1284 opt_ppb_1284.h
LPT_DEBUG opt_lpt.h
PLIP_DEBUG opt_plip.h
LOCKF_DEBUG opt_debug_lockf.h
SI_DEBUG opt_debug_si.h
IFMEDIA_DEBUG opt_ifmedia.h
# Fb options
FB_DEBUG opt_fb.h
# ppbus related options
PERIPH_1284 opt_ppb_1284.h
DONTPROBE_1284 opt_ppb_1284.h
# smbus related options
ENABLE_ALART opt_intpm.h
# These cause changes all over the kernel
BLKDEV_IOSIZE opt_global.h
BURN_BRIDGES opt_global.h
DEBUG opt_global.h
DEBUG_LOCKS opt_global.h
DEBUG_VFS_LOCKS opt_global.h
DFLTPHYS opt_global.h
DIAGNOSTIC opt_global.h
INVARIANT_SUPPORT opt_global.h
INVARIANTS opt_global.h
KASSERT_PANIC_OPTIONAL opt_global.h
MAXCPU opt_global.h
MAXMEMDOM opt_global.h
MAXPHYS opt_maxphys.h
MCLSHIFT opt_global.h
MUTEX_NOINLINE opt_global.h
LOCK_PROFILING opt_global.h
MSIZE opt_global.h
REGRESSION opt_global.h
RWLOCK_NOINLINE opt_global.h
SX_NOINLINE opt_global.h
VFS_BIO_DEBUG opt_global.h
# These are VM related options
VM_KMEM_SIZE opt_vm.h
VM_KMEM_SIZE_SCALE opt_vm.h
VM_KMEM_SIZE_MAX opt_vm.h
VM_NRESERVLEVEL opt_vm.h
VM_LEVEL_0_ORDER opt_vm.h
NO_SWAPPING opt_vm.h
MALLOC_MAKE_FAILURES opt_vm.h
MALLOC_PROFILE opt_vm.h
MALLOC_DEBUG_MAXZONES opt_vm.h
# The MemGuard replacement allocator used for tamper-after-free detection
DEBUG_MEMGUARD opt_vm.h
# The RedZone malloc(9) protection
DEBUG_REDZONE opt_vm.h
# Standard SMP options
EARLY_AP_STARTUP opt_global.h
SMP opt_global.h
NUMA opt_global.h
+CPUGRP_SCORE opt_global.h
# Size of the kernel message buffer
MSGBUF_SIZE opt_msgbuf.h
# NFS options
NFS_MINATTRTIMO opt_nfs.h
NFS_MAXATTRTIMO opt_nfs.h
NFS_MINDIRATTRTIMO opt_nfs.h
NFS_MAXDIRATTRTIMO opt_nfs.h
NFS_DEBUG opt_nfs.h
# TMPFS options
TMPFS_PAGES_MINRESERVED opt_tmpfs.h
# Options for uart(4)
UART_PPS_ON_CTS opt_uart.h
UART_POLL_FREQ opt_uart.h
UART_DEV_TOLERANCE_PCT opt_uart.h
# options for bus/device framework
BUS_DEBUG opt_bus.h
# options for USB support
USB_DEBUG opt_usb.h
USB_HOST_ALIGN opt_usb.h
USB_REQ_DEBUG opt_usb.h
USB_TEMPLATE opt_usb.h
USB_VERBOSE opt_usb.h
USB_DMA_SINGLE_ALLOC opt_usb.h
USB_EHCI_BIG_ENDIAN_DESC opt_usb.h
U3G_DEBUG opt_u3g.h
UKBD_DFLT_KEYMAP opt_ukbd.h
UPLCOM_INTR_INTERVAL opt_uplcom.h
UVSCOM_DEFAULT_OPKTSIZE opt_uvscom.h
UVSCOM_INTR_INTERVAL opt_uvscom.h
# options for the Realtek rtwn driver
RTWN_DEBUG opt_rtwn.h
RTWN_WITHOUT_UCODE opt_rtwn.h
# Embedded system options
INIT_PATH
ROOTDEVNAME
FDC_DEBUG opt_fdc.h
PCFCLOCK_VERBOSE opt_pcfclock.h
PCFCLOCK_MAX_RETRIES opt_pcfclock.h
KTR opt_global.h
KTR_ALQ opt_ktr.h
KTR_MASK opt_ktr.h
KTR_CPUMASK opt_ktr.h
KTR_COMPILE opt_global.h
KTR_BOOT_ENTRIES opt_global.h
KTR_ENTRIES opt_global.h
KTR_VERBOSE opt_ktr.h
WITNESS opt_global.h
WITNESS_KDB opt_witness.h
WITNESS_NO_VNODE opt_witness.h
WITNESS_SKIPSPIN opt_witness.h
WITNESS_COUNT opt_witness.h
OPENSOLARIS_WITNESS opt_global.h
EPOCH_TRACE opt_global.h
# options for ACPI support
ACPI_DEBUG opt_acpi.h
ACPI_MAX_TASKS opt_acpi.h
ACPI_MAX_THREADS opt_acpi.h
DEV_ACPI opt_acpi.h
ACPI_EARLY_EPYC_WAR opt_acpi.h
# options for IOMMU support
IOMMU opt_iommu.h
# ISA support
DEV_ISA opt_isa.h
ISAPNP opt_dontuse.h
# various 'device presence' options.
DEV_BPF opt_bpf.h
DEV_CARP opt_carp.h
DEV_NETMAP opt_global.h
DEV_PCI opt_pci.h
DEV_PF opt_pf.h
DEV_PFLOG opt_pf.h
DEV_PFSYNC opt_pf.h
DEV_SPLASH opt_splash.h
DEV_VLAN opt_vlan.h
# bce driver
BCE_DEBUG opt_bce.h
BCE_NVRAM_WRITE_SUPPORT opt_bce.h
SOCKBUF_DEBUG opt_global.h
# options for hifn driver
HIFN_DEBUG opt_hifn.h
HIFN_RNDTEST opt_hifn.h
# options for safenet driver
SAFE_DEBUG opt_safe.h
SAFE_NO_RNG opt_safe.h
SAFE_RNDTEST opt_safe.h
# syscons/vt options
MAXCONS opt_syscons.h
SC_ALT_MOUSE_IMAGE opt_syscons.h
SC_CUT_SPACES2TABS opt_syscons.h
SC_CUT_SEPCHARS opt_syscons.h
SC_DEBUG_LEVEL opt_syscons.h
SC_DFLT_FONT opt_syscons.h
SC_DFLT_TERM opt_syscons.h
SC_DISABLE_KDBKEY opt_syscons.h
SC_DISABLE_REBOOT opt_syscons.h
SC_HISTORY_SIZE opt_syscons.h
SC_KERNEL_CONS_ATTR opt_syscons.h
SC_KERNEL_CONS_ATTRS opt_syscons.h
SC_KERNEL_CONS_REV_ATTR opt_syscons.h
SC_MOUSE_CHAR opt_syscons.h
SC_NO_CUTPASTE opt_syscons.h
SC_NO_FONT_LOADING opt_syscons.h
SC_NO_HISTORY opt_syscons.h
SC_NO_MODE_CHANGE opt_syscons.h
SC_NO_SUSPEND_VTYSWITCH opt_syscons.h
SC_NO_SYSMOUSE opt_syscons.h
SC_NO_TERM_DUMB opt_syscons.h
SC_NO_TERM_SC opt_syscons.h
SC_NO_TERM_TEKEN opt_syscons.h
SC_NORM_ATTR opt_syscons.h
SC_NORM_REV_ATTR opt_syscons.h
SC_PIXEL_MODE opt_syscons.h
SC_RENDER_DEBUG opt_syscons.h
SC_TWOBUTTON_MOUSE opt_syscons.h
VT_ALT_TO_ESC_HACK opt_syscons.h
VT_FB_MAX_WIDTH opt_syscons.h
VT_FB_MAX_HEIGHT opt_syscons.h
VT_MAXWINDOWS opt_syscons.h
VT_TWOBUTTON_MOUSE opt_syscons.h
DEV_SC opt_syscons.h
DEV_VT opt_syscons.h
# teken terminal emulator options
TEKEN_CONS25 opt_teken.h
TEKEN_UTF8 opt_teken.h
TERMINAL_KERN_ATTR opt_teken.h
TERMINAL_NORM_ATTR opt_teken.h
# options for printf
PRINTF_BUFR_SIZE opt_printf.h
BOOT_TAG opt_printf.h
BOOT_TAG_SZ opt_printf.h
# kbd options
KBD_DISABLE_KEYMAP_LOAD opt_kbd.h
KBD_INSTALL_CDEV opt_kbd.h
KBD_MAXRETRY opt_kbd.h
KBD_MAXWAIT opt_kbd.h
KBD_RESETDELAY opt_kbd.h
KBD_DELAY1 opt_kbd.h
KBD_DELAY2 opt_kbd.h
KBDIO_DEBUG opt_kbd.h
KBDMUX_DFLT_KEYMAP opt_kbdmux.h
# options for the Atheros driver
ATH_DEBUG opt_ath.h
ATH_TXBUF opt_ath.h
ATH_RXBUF opt_ath.h
ATH_DIAGAPI opt_ath.h
ATH_TX99_DIAG opt_ath.h
ATH_ENABLE_DFS opt_ath.h
ATH_EEPROM_FIRMWARE opt_ath.h
ATH_ENABLE_RADIOTAP_VENDOR_EXT opt_ath.h
ATH_DEBUG_ALQ opt_ath.h
ATH_KTR_INTR_DEBUG opt_ath.h
AH_DEBUG opt_ah.h
AH_ASSERT opt_ah.h
AH_DEBUG_ALQ opt_ah.h
AH_REGOPS_FUNC opt_ah.h
AH_WRITE_REGDOMAIN opt_ah.h
AH_DEBUG_COUNTRY opt_ah.h
AH_WRITE_EEPROM opt_ah.h
AH_PRIVATE_DIAG opt_ah.h
AH_NEED_DESC_SWAP opt_ah.h
AH_USE_INIPDGAIN opt_ah.h
AH_MAXCHAN opt_ah.h
AH_RXCFG_SDMAMW_4BYTES opt_ah.h
AH_INTERRUPT_DEBUGGING opt_ah.h
# AR5416 and later interrupt mitigation
# XXX do not use this for AR9130
AH_AR5416_INTERRUPT_MITIGATION opt_ah.h
# options for the Altera mSGDMA driver (altera_msgdma)
ALTERA_MSGDMA_DESC_STD opt_altera_msgdma.h
ALTERA_MSGDMA_DESC_EXT opt_altera_msgdma.h
ALTERA_MSGDMA_DESC_PF_STD opt_altera_msgdma.h
ALTERA_MSGDMA_DESC_PF_EXT opt_altera_msgdma.h
# options for the Broadcom BCM43xx driver (bwi)
BWI_DEBUG opt_bwi.h
BWI_DEBUG_VERBOSE opt_bwi.h
# options for the Broadcom BCM43xx driver (bwn)
BWN_DEBUG opt_bwn.h
BWN_GPL_PHY opt_bwn.h
BWN_USE_SIBA opt_bwn.h
# Options for the SIBA driver
SIBA_DEBUG opt_siba.h
# options for the Marvell 8335 wireless driver
MALO_DEBUG opt_malo.h
MALO_TXBUF opt_malo.h
MALO_RXBUF opt_malo.h
# options for the Marvell wireless driver
MWL_DEBUG opt_mwl.h
MWL_TXBUF opt_mwl.h
MWL_RXBUF opt_mwl.h
MWL_DIAGAPI opt_mwl.h
MWL_AGGR_SIZE opt_mwl.h
MWL_TX_NODROP opt_mwl.h
# Options for the Marvell NETA driver
MVNETA_MULTIQUEUE opt_mvneta.h
MVNETA_KTR opt_mvneta.h
# Options for the Intel 802.11ac wireless driver
IWM_DEBUG opt_iwm.h
# Options for the Intel 802.11n wireless driver
IWN_DEBUG opt_iwn.h
# Options for the Intel 3945ABG wireless driver
WPI_DEBUG opt_wpi.h
# dcons options
DCONS_BUF_SIZE opt_dcons.h
DCONS_POLL_HZ opt_dcons.h
DCONS_FORCE_CONSOLE opt_dcons.h
DCONS_FORCE_GDB opt_dcons.h
# HWPMC options
HWPMC_DEBUG opt_global.h
HWPMC_HOOKS
# 802.11 support layer
IEEE80211_DEBUG opt_wlan.h
IEEE80211_DEBUG_REFCNT opt_wlan.h
IEEE80211_SUPPORT_MESH opt_wlan.h
IEEE80211_SUPPORT_SUPERG opt_wlan.h
IEEE80211_SUPPORT_TDMA opt_wlan.h
IEEE80211_ALQ opt_wlan.h
IEEE80211_DFS_DEBUG opt_wlan.h
# 802.11 TDMA support
TDMA_SLOTLEN_DEFAULT opt_tdma.h
TDMA_SLOTCNT_DEFAULT opt_tdma.h
TDMA_BINTVAL_DEFAULT opt_tdma.h
TDMA_TXRATE_11B_DEFAULT opt_tdma.h
TDMA_TXRATE_11G_DEFAULT opt_tdma.h
TDMA_TXRATE_11A_DEFAULT opt_tdma.h
TDMA_TXRATE_TURBO_DEFAULT opt_tdma.h
TDMA_TXRATE_HALF_DEFAULT opt_tdma.h
TDMA_TXRATE_QUARTER_DEFAULT opt_tdma.h
TDMA_TXRATE_11NA_DEFAULT opt_tdma.h
TDMA_TXRATE_11NG_DEFAULT opt_tdma.h
# VideoMode
PICKMODE_DEBUG opt_videomode.h
# Network stack virtualization options
VIMAGE opt_global.h
VNET_DEBUG opt_global.h
# Common Flash Interface (CFI) options
CFI_SUPPORT_STRATAFLASH opt_cfi.h
CFI_ARMEDANDDANGEROUS opt_cfi.h
CFI_HARDWAREBYTESWAP opt_cfi.h
# Sound options
SND_DEBUG opt_snd.h
SND_DIAGNOSTIC opt_snd.h
SND_FEEDER_MULTIFORMAT opt_snd.h
SND_FEEDER_FULL_MULTIFORMAT opt_snd.h
SND_FEEDER_RATE_HP opt_snd.h
SND_PCM_64 opt_snd.h
SND_OLDSTEREO opt_snd.h
X86BIOS
# Flattened device tree options
FDT opt_platform.h
FDT_DTB_STATIC opt_platform.h
# OFED Infiniband stack
OFED opt_ofed.h
OFED_DEBUG_INIT opt_ofed.h
SDP opt_ofed.h
SDP_DEBUG opt_ofed.h
IPOIB opt_ofed.h
IPOIB_DEBUG opt_ofed.h
IPOIB_CM opt_ofed.h
# Resource Accounting
RACCT opt_global.h
RACCT_DEFAULT_TO_DISABLED opt_global.h
# Resource Limits
RCTL opt_global.h
# Random number generator(s)
# Alternative RNG algorithm.
RANDOM_FENESTRASX opt_global.h
# With this, no entropy processor is loaded, but the entropy
# harvesting infrastructure is present. This means an entropy
# processor may be loaded as a module.
RANDOM_LOADABLE opt_global.h
# This turns on high-rate and potentially expensive harvesting in
# the uma slab allocator.
RANDOM_ENABLE_UMA opt_global.h
RANDOM_ENABLE_ETHER opt_global.h
# This option turns the TPM into an entropy source.
TPM_HARVEST opt_tpm.h
# BHND(4) driver
BHND_LOGLEVEL opt_global.h
# GPIO and child devices
GPIO_SPI_DEBUG opt_gpio.h
# SPI devices
SPIGEN_LEGACY_CDEVNAME opt_spi.h
# etherswitch(4) driver
RTL8366_SOFT_RESET opt_etherswitch.h
# evdev protocol support
EVDEV_SUPPORT opt_evdev.h
EVDEV_DEBUG opt_evdev.h
UINPUT_DEBUG opt_evdev.h
# Hyper-V network driver
HN_DEBUG opt_hn.h
# CAM-based MMC stack
MMCCAM
# Encrypted kernel crash dumps
EKCD opt_ekcd.h
# NVME options
NVME_USE_NVD opt_nvme.h
# amdsbwd options
AMDSBWD_DEBUG opt_amdsbwd.h
# gcov support
GCOV opt_global.h
LINDEBUGFS
# options for HID support
HID_DEBUG opt_hid.h
IICHID_DEBUG opt_hid.h
IICHID_SAMPLING opt_hid.h
HKBD_DFLT_KEYMAP opt_hkbd.h
HIDRAW_MAKE_UHID_ALIAS opt_hid.h
# kenv options
# The early kernel environment (loader environment, config(8)-provided static)
# is typically cleared after the dynamic environment comes up to ensure that
# we're not inadvertently holding on to 'secret' values in these stale envs.
# This option is insecure except in controlled environments where the static
# environment's contents are known to be safe.
PRESERVE_EARLY_KENV opt_global.h
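
The one added line above, "CPUGRP_SCORE opt_global.h", is what ties the new
option into the build. Per the conventions at the top of this file, mapping an
option to opt_global.h makes it kernel-wide; in-tree that header is typically
pulled into every compilation unit by the build itself, although the hunks
below also include it explicitly. A kernel configuration would enable the
option with "options CPUGRP_SCORE", causing config(8) to emit
"#define CPUGRP_SCORE 1" into opt_global.h, and code then guards on the
define. A minimal sketch of the consumer side:

#include "opt_global.h"		/* generated by config(8) */

#if defined(CPUGRP_SCORE)
	/* score-aware topology code is compiled in */
#else
	/* stock behaviour: all cores treated as equal */
#endif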
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
index 1f9577fddf9c..ec6b753cdf75 100644
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -1,1352 +1,1358 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* This module holds the global variables and machine independent functions
* used for the kernel SMP support.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/bus.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include "opt_sched.h"
+#include "opt_global.h"
#ifdef SMP
MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");
volatile cpuset_t stopped_cpus;
volatile cpuset_t started_cpus;
volatile cpuset_t suspended_cpus;
cpuset_t hlt_cpus_mask;
cpuset_t logical_cpus_mask;
void (*cpustop_restartfunc)(void);
#endif
static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);
/* This is used in modules that need to work in both SMP and UP. */
cpuset_t all_cpus;
int mp_ncpus;
/* export this for libkvm consumers. */
int mp_maxcpus = MAXCPU;
volatile int smp_started;
u_int mp_maxid;
/* Array of CPU contexts saved during a panic. */
struct pcb *stoppcbs;
static SYSCTL_NODE(_kern, OID_AUTO, smp,
CTLFLAG_RD | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, NULL,
"Kernel SMP");
SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
"Max CPU ID.");
SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
0, "Max number of CPUs that the system was compiled for.");
SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE,
NULL, 0, sysctl_kern_smp_active, "I",
"Indicates system is running in SMP mode");
int smp_disabled = 0; /* has smp been disabled? */
SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
&smp_disabled, 0, "SMP has been disabled from the loader");
int smp_cpus = 1; /* how many CPUs are running */
SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
"Number of CPUs online");
int smp_threads_per_core = 1; /* how many SMT threads are running per core */
SYSCTL_INT(_kern_smp, OID_AUTO, threads_per_core, CTLFLAG_RD|CTLFLAG_CAPRD,
&smp_threads_per_core, 0, "Number of SMT threads online per core");
int mp_ncores = -1; /* how many physical cores running */
SYSCTL_INT(_kern_smp, OID_AUTO, cores, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_ncores, 0,
"Number of physical cores online");
int smp_topology = 0; /* Which topology we're using. */
SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0,
"Topology override setting; 0 is default provided by hardware.");
#ifdef SMP
/* Variables needed for SMP rendezvous. */
static volatile int smp_rv_ncpus;
static void (*volatile smp_rv_setup_func)(void *arg);
static void (*volatile smp_rv_action_func)(void *arg);
static void (*volatile smp_rv_teardown_func)(void *arg);
static void *volatile smp_rv_func_arg;
static volatile int smp_rv_waiters[4];
/*
* Shared mutex to restrict busywaits between smp_rendezvous() and
* smp(_targeted)_tlb_shootdown(). A deadlock occurs if both of these
* functions trigger at once and cause multiple CPUs to busywait with
* interrupts disabled.
*/
struct mtx smp_ipi_mtx;
/*
* Let the MD SMP code initialize mp_maxid very early if it can.
*/
static void
mp_setmaxid(void *dummy)
{
cpu_mp_setmaxid();
KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__));
KASSERT(mp_ncpus > 1 || mp_maxid == 0,
("%s: one CPU but mp_maxid is not zero", __func__));
KASSERT(mp_maxid >= mp_ncpus - 1,
("%s: counters out of sync: max %d, count %d", __func__,
mp_maxid, mp_ncpus));
cpusetsizemin = howmany(mp_maxid + 1, NBBY);
}
SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
/*
* Call the MD SMP initialization code.
*/
static void
mp_start(void *dummy)
{
mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
/* Probe for MP hardware. */
if (smp_disabled != 0 || cpu_mp_probe() == 0) {
mp_ncores = 1;
mp_ncpus = 1;
CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
return;
}
cpu_mp_start();
printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
mp_ncpus);
/* Provide a default for most architectures that don't have SMT/HTT. */
if (mp_ncores < 0)
mp_ncores = mp_ncpus;
stoppcbs = mallocarray(mp_maxid + 1, sizeof(struct pcb), M_DEVBUF,
M_WAITOK | M_ZERO);
cpu_mp_announce();
}
SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
void
forward_signal(struct thread *td)
{
int id;
/*
* signotify() has already set TDA_AST and TDA_SIG on td_ast for
* this thread, so all we need to do is poke it if it is currently
* executing so that it executes ast().
*/
THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT(TD_IS_RUNNING(td),
("forward_signal: thread is not TDS_RUNNING"));
CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
if (!smp_started || cold || KERNEL_PANICKED())
return;
/* No need to IPI ourself. */
if (td == curthread)
return;
id = td->td_oncpu;
if (id == NOCPU)
return;
ipi_cpu(id, IPI_AST);
}
/*
* When called the executing CPU will send an IPI to all other CPUs
* requesting that they halt execution.
*
* Usually (but not necessarily) called with 'other_cpus' as its arg.
*
* - Signals all CPUs in map to stop.
* - Waits for each to stop.
*
* Returns:
* -1: error
* 0: NA
* 1: ok
*
*/
#if defined(__amd64__) || defined(__i386__)
#define X86 1
#else
#define X86 0
#endif
static int
generic_stop_cpus(cpuset_t map, u_int type)
{
#ifdef KTR
char cpusetbuf[CPUSETBUFSIZ];
#endif
static volatile u_int stopping_cpu = NOCPU;
int i;
volatile cpuset_t *cpus;
KASSERT(
type == IPI_STOP || type == IPI_STOP_HARD
#if X86
|| type == IPI_SUSPEND
#endif
, ("%s: invalid stop type", __func__));
if (!smp_started)
return (0);
CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
cpusetobj_strprint(cpusetbuf, &map), type);
#if X86
/*
* When suspending, ensure there are no IPIs in progress.
* IPIs that have been issued, but not yet delivered (e.g.
* not pending on a vCPU when running under virtualization)
* will be lost, violating FreeBSD's assumption of reliable
* IPI delivery.
*/
if (type == IPI_SUSPEND)
mtx_lock_spin(&smp_ipi_mtx);
#endif
#if X86
if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
#endif
if (stopping_cpu != PCPU_GET(cpuid))
while (atomic_cmpset_int(&stopping_cpu, NOCPU,
PCPU_GET(cpuid)) == 0)
while (stopping_cpu != NOCPU)
cpu_spinwait(); /* spin */
/* send the stop IPI to all CPUs in map */
ipi_selected(map, type);
#if X86
}
#endif
#if X86
if (type == IPI_SUSPEND)
cpus = &suspended_cpus;
else
#endif
cpus = &stopped_cpus;
i = 0;
while (!CPU_SUBSET(cpus, &map)) {
/* spin */
cpu_spinwait();
i++;
if (i == 100000000) {
printf("timeout stopping cpus\n");
break;
}
}
#if X86
if (type == IPI_SUSPEND)
mtx_unlock_spin(&smp_ipi_mtx);
#endif
stopping_cpu = NOCPU;
return (1);
}
int
stop_cpus(cpuset_t map)
{
return (generic_stop_cpus(map, IPI_STOP));
}
int
stop_cpus_hard(cpuset_t map)
{
return (generic_stop_cpus(map, IPI_STOP_HARD));
}
#if X86
int
suspend_cpus(cpuset_t map)
{
return (generic_stop_cpus(map, IPI_SUSPEND));
}
#endif
/*
* Called by a CPU to restart stopped CPUs.
*
* Usually (but not necessarily) called with 'stopped_cpus' as its arg.
*
* - Signals all CPUs in map to restart.
* - Waits for each to restart.
*
* Returns:
* -1: error
* 0: NA
* 1: ok
*/
static int
generic_restart_cpus(cpuset_t map, u_int type)
{
#ifdef KTR
char cpusetbuf[CPUSETBUFSIZ];
#endif
volatile cpuset_t *cpus;
#if X86
KASSERT(type == IPI_STOP || type == IPI_STOP_HARD
|| type == IPI_SUSPEND, ("%s: invalid stop type", __func__));
if (!smp_started)
return (0);
CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));
if (type == IPI_SUSPEND)
cpus = &resuming_cpus;
else
cpus = &stopped_cpus;
/* signal other cpus to restart */
if (type == IPI_SUSPEND)
CPU_COPY_STORE_REL(&map, &toresume_cpus);
else
CPU_COPY_STORE_REL(&map, &started_cpus);
/*
* Wake up any CPUs stopped with MWAIT. From MI code we can't tell if
* MONITOR/MWAIT is enabled, but the potentially redundant writes are
* relatively inexpensive.
*/
if (type == IPI_STOP) {
struct monitorbuf *mb;
u_int id;
CPU_FOREACH(id) {
if (!CPU_ISSET(id, &map))
continue;
mb = &pcpu_find(id)->pc_monitorbuf;
atomic_store_int(&mb->stop_state,
MONITOR_STOPSTATE_RUNNING);
}
}
if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
/* wait for each to clear its bit */
while (CPU_OVERLAP(cpus, &map))
cpu_spinwait();
}
#else /* !X86 */
KASSERT(type == IPI_STOP || type == IPI_STOP_HARD,
("%s: invalid stop type", __func__));
if (!smp_started)
return (0);
CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));
cpus = &stopped_cpus;
/* signal other cpus to restart */
CPU_COPY_STORE_REL(&map, &started_cpus);
/* wait for each to clear its bit */
while (CPU_OVERLAP(cpus, &map))
cpu_spinwait();
#endif
return (1);
}
int
restart_cpus(cpuset_t map)
{
return (generic_restart_cpus(map, IPI_STOP));
}
#if X86
int
resume_cpus(cpuset_t map)
{
return (generic_restart_cpus(map, IPI_SUSPEND));
}
#endif
#undef X86
/*
* All-CPU rendezvous. CPUs are signalled, all execute the setup function
* (if specified), rendezvous, execute the action function (if specified),
* rendezvous again, execute the teardown function (if specified), and then
* resume.
*
* Note that the supplied external functions _must_ be reentrant and aware
* that they are running in parallel and in an unknown lock context.
*/
void
smp_rendezvous_action(void)
{
struct thread *td;
void *local_func_arg;
void (*local_setup_func)(void*);
void (*local_action_func)(void*);
void (*local_teardown_func)(void*);
#ifdef INVARIANTS
int owepreempt;
#endif
/* Ensure we have up-to-date values. */
atomic_add_acq_int(&smp_rv_waiters[0], 1);
while (smp_rv_waiters[0] < smp_rv_ncpus)
cpu_spinwait();
/* Fetch rendezvous parameters after acquire barrier. */
local_func_arg = smp_rv_func_arg;
local_setup_func = smp_rv_setup_func;
local_action_func = smp_rv_action_func;
local_teardown_func = smp_rv_teardown_func;
/*
* Use a nested critical section to prevent any preemptions
* from occurring during a rendezvous action routine.
* Specifically, if a rendezvous handler is invoked via an IPI
* and the interrupted thread was in the critical_exit()
* function after setting td_critnest to 0 but before
* performing a deferred preemption, this routine can be
* invoked with td_critnest set to 0 and td_owepreempt true.
* In that case, a critical_exit() during the rendezvous
* action would trigger a preemption which is not permitted in
* a rendezvous action. To fix this, wrap all of the
* rendezvous action handlers in a critical section. We
* cannot use a regular critical section however as having
* critical_exit() preempt from this routine would also be
* problematic (the preemption must not occur before the IPI
* has been acknowledged via an EOI). Instead, we
* intentionally ignore td_owepreempt when leaving the
* critical section. This should be harmless because we do
* not permit rendezvous action routines to schedule threads,
* and thus td_owepreempt should never transition from 0 to 1
* during this routine.
*/
td = curthread;
td->td_critnest++;
#ifdef INVARIANTS
owepreempt = td->td_owepreempt;
#endif
/*
* If requested, run a setup function before the main action
* function. Ensure all CPUs have completed the setup
* function before moving on to the action function.
*/
if (local_setup_func != smp_no_rendezvous_barrier) {
if (local_setup_func != NULL)
local_setup_func(local_func_arg);
atomic_add_int(&smp_rv_waiters[1], 1);
while (smp_rv_waiters[1] < smp_rv_ncpus)
cpu_spinwait();
}
if (local_action_func != NULL)
local_action_func(local_func_arg);
if (local_teardown_func != smp_no_rendezvous_barrier) {
/*
* Signal that the main action has been completed. If a
* full exit rendezvous is requested, then all CPUs will
* wait here until all CPUs have finished the main action.
*/
atomic_add_int(&smp_rv_waiters[2], 1);
while (smp_rv_waiters[2] < smp_rv_ncpus)
cpu_spinwait();
if (local_teardown_func != NULL)
local_teardown_func(local_func_arg);
}
/*
* Signal that the rendezvous is fully completed by this CPU.
* This means that no member of smp_rv_* pseudo-structure will be
* accessed by this target CPU after this point; in particular,
* memory pointed by smp_rv_func_arg.
*
* The release semantic ensures that all accesses performed by
* the current CPU are visible when smp_rendezvous_cpus()
* returns, by synchronizing with the
* atomic_load_acq_int(&smp_rv_waiters[3]).
*/
atomic_add_rel_int(&smp_rv_waiters[3], 1);
td->td_critnest--;
KASSERT(owepreempt == td->td_owepreempt,
("rendezvous action changed td_owepreempt"));
}
void
smp_rendezvous_cpus(cpuset_t map,
void (* setup_func)(void *),
void (* action_func)(void *),
void (* teardown_func)(void *),
void *arg)
{
int curcpumap, i, ncpus = 0;
/* See comments in the !SMP case. */
if (!smp_started) {
spinlock_enter();
if (setup_func != NULL)
setup_func(arg);
if (action_func != NULL)
action_func(arg);
if (teardown_func != NULL)
teardown_func(arg);
spinlock_exit();
return;
}
/*
* Make sure we come here with interrupts enabled. Otherwise we
* livelock if smp_ipi_mtx is owned by a thread which sent us an IPI.
*/
MPASS(curthread->td_md.md_spinlock_count == 0);
CPU_FOREACH(i) {
if (CPU_ISSET(i, &map))
ncpus++;
}
if (ncpus == 0)
panic("ncpus is 0 with non-zero map");
mtx_lock_spin(&smp_ipi_mtx);
/* Pass rendezvous parameters via global variables. */
smp_rv_ncpus = ncpus;
smp_rv_setup_func = setup_func;
smp_rv_action_func = action_func;
smp_rv_teardown_func = teardown_func;
smp_rv_func_arg = arg;
smp_rv_waiters[1] = 0;
smp_rv_waiters[2] = 0;
smp_rv_waiters[3] = 0;
atomic_store_rel_int(&smp_rv_waiters[0], 0);
/*
* Signal other processors, which will enter the IPI with
* interrupts off.
*/
curcpumap = CPU_ISSET(curcpu, &map);
CPU_CLR(curcpu, &map);
ipi_selected(map, IPI_RENDEZVOUS);
/* Check if the current CPU is in the map */
if (curcpumap != 0)
smp_rendezvous_action();
/*
* Ensure that the master CPU waits for all the other
* CPUs to finish the rendezvous, so that smp_rv_*
* pseudo-structure and the arg are guaranteed to not
* be in use.
*
* Load acquire synchronizes with the release add in
* smp_rendezvous_action(), which ensures that our caller sees
* all memory actions done by the called functions on other
* CPUs.
*/
while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
cpu_spinwait();
mtx_unlock_spin(&smp_ipi_mtx);
}
void
smp_rendezvous(void (* setup_func)(void *),
void (* action_func)(void *),
void (* teardown_func)(void *),
void *arg)
{
smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
}
static void
smp_topo_fill(struct cpu_group *cg)
{
int c;
for (c = 0; c < cg->cg_children; c++)
smp_topo_fill(&cg->cg_child[c]);
cg->cg_first = CPU_FFS(&cg->cg_mask) - 1;
cg->cg_last = CPU_FLS(&cg->cg_mask) - 1;
}
struct cpu_group *
smp_topo(void)
{
char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
static struct cpu_group *top = NULL;
/*
* The first call to smp_topo() is guaranteed to occur
* during the kernel boot while we are still single-threaded.
*/
if (top != NULL)
return (top);
/*
* Check for a fake topology request for debugging purposes.
*/
switch (smp_topology) {
case 1:
/* Dual core with no sharing. */
top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
break;
case 2:
/* No topology, all cpus are equal. */
top = smp_topo_none();
break;
case 3:
/* Dual core with shared L2. */
top = smp_topo_1level(CG_SHARE_L2, 2, 0);
break;
case 4:
/* quad core, shared l3 among each package, private l2. */
top = smp_topo_1level(CG_SHARE_L3, 4, 0);
break;
case 5:
/* quad core, 2 dualcore parts on each package share l2. */
top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
break;
case 6:
/* Single-core 2xHTT */
top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
break;
case 7:
/* quad core with a shared l3, 8 threads sharing L2. */
top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
CG_FLAG_SMT);
break;
default:
/* Default, ask the system what it wants. */
top = cpu_topo();
break;
}
/*
* Verify the returned topology.
*/
if (top->cg_count != mp_ncpus)
panic("Built bad topology at %p. CPU count %d != %d",
top, top->cg_count, mp_ncpus);
if (CPU_CMP(&top->cg_mask, &all_cpus))
panic("Built bad topology at %p. CPU mask (%s) != (%s)",
top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
cpusetobj_strprint(cpusetbuf2, &all_cpus));
/*
* Collapse nonsense levels that may be created out of convenience by
* the MD layers. They cause extra work in the search functions.
*/
while (top->cg_children == 1) {
top = &top->cg_child[0];
top->cg_parent = NULL;
}
smp_topo_fill(top);
return (top);
}
struct cpu_group *
smp_topo_alloc(u_int count)
{
static struct cpu_group *group = NULL;
static u_int index;
u_int curr;
if (group == NULL) {
group = mallocarray((mp_maxid + 1) * MAX_CACHE_LEVELS + 1,
sizeof(*group), M_DEVBUF, M_WAITOK | M_ZERO);
}
curr = index;
index += count;
return (&group[curr]);
}
struct cpu_group *
smp_topo_none(void)
{
struct cpu_group *top;
top = smp_topo_alloc(1);
top->cg_parent = NULL;
top->cg_child = NULL;
top->cg_mask = all_cpus;
top->cg_count = mp_ncpus;
top->cg_children = 0;
top->cg_level = CG_SHARE_NONE;
top->cg_flags = 0;
-
+#if defined(CPUGRP_SCORE)
+ memset(top->cg_score, CG_SCORE_DEFAULT, sizeof(top->cg_score));
+#endif
return (top);
}
static int
smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
int count, int flags, int start)
{
char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
cpuset_t mask;
int i;
CPU_ZERO(&mask);
for (i = 0; i < count; i++, start++)
CPU_SET(start, &mask);
child->cg_parent = parent;
child->cg_child = NULL;
child->cg_children = 0;
child->cg_level = share;
child->cg_count = count;
child->cg_flags = flags;
child->cg_mask = mask;
+#if defined(CPUGRP_SCORE)
+ memset(child->cg_score, CG_SCORE_DEFAULT, sizeof(child->cg_score));
+#endif
parent->cg_children++;
for (; parent != NULL; parent = parent->cg_parent) {
if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
panic("Duplicate children in %p. mask (%s) child (%s)",
parent,
cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
CPU_OR(&parent->cg_mask, &parent->cg_mask, &child->cg_mask);
parent->cg_count += child->cg_count;
}
return (start);
}
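
A reviewer-style note on the two memset() calls added above (in
smp_topo_none() and smp_topo_addleaf()): memset() stores its fill value into
every byte, so the pattern initializes each element to CG_SCORE_DEFAULT only
if cg_score's elements are one byte wide; with a wider element type, every
byte of each element receives the value. The definition of CG_SCORE_DEFAULT
is not visible in this excerpt, so the 0x7f below is an arbitrary stand-in
used purely to demonstrate the behaviour:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char	score8[4];
	int	score32[4];

	memset(score8, 0x7f, sizeof(score8));	/* each element == 0x7f */
	memset(score32, 0x7f, sizeof(score32));	/* each element == 0x7f7f7f7f */
	printf("%d %d\n", score8[0], score32[0]);	/* prints: 127 2139062143 */
	return (0);
}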
struct cpu_group *
smp_topo_1level(int share, int count, int flags)
{
struct cpu_group *child;
struct cpu_group *top;
int packages;
int cpu;
int i;
cpu = 0;
packages = mp_ncpus / count;
top = smp_topo_alloc(1 + packages);
top->cg_child = child = top + 1;
top->cg_level = CG_SHARE_NONE;
for (i = 0; i < packages; i++, child++)
cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
return (top);
}
struct cpu_group *
smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
int l1flags)
{
struct cpu_group *top;
struct cpu_group *l1g;
struct cpu_group *l2g;
int cpu;
int i;
int j;
cpu = 0;
top = smp_topo_alloc(1 + mp_ncpus / (l2count * l1count) +
mp_ncpus / l1count);
l2g = top + 1;
top->cg_child = l2g;
top->cg_level = CG_SHARE_NONE;
top->cg_children = mp_ncpus / (l2count * l1count);
l1g = l2g + top->cg_children;
for (i = 0; i < top->cg_children; i++, l2g++) {
l2g->cg_parent = top;
l2g->cg_child = l1g;
l2g->cg_level = l2share;
for (j = 0; j < l2count; j++, l1g++)
cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
l1flags, cpu);
}
return (top);
}
struct cpu_group *
smp_topo_find(struct cpu_group *top, int cpu)
{
struct cpu_group *cg;
cpuset_t mask;
int children;
int i;
CPU_SETOF(cpu, &mask);
cg = top;
for (;;) {
if (!CPU_OVERLAP(&cg->cg_mask, &mask))
return (NULL);
if (cg->cg_children == 0)
return (cg);
children = cg->cg_children;
for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
if (CPU_OVERLAP(&cg->cg_mask, &mask))
break;
}
return (NULL);
}
#else /* !SMP */
void
smp_rendezvous_cpus(cpuset_t map,
void (*setup_func)(void *),
void (*action_func)(void *),
void (*teardown_func)(void *),
void *arg)
{
/*
* In the !SMP case we just need to ensure the same initial conditions
* as the SMP case.
*/
spinlock_enter();
if (setup_func != NULL)
setup_func(arg);
if (action_func != NULL)
action_func(arg);
if (teardown_func != NULL)
teardown_func(arg);
spinlock_exit();
}
void
smp_rendezvous(void (*setup_func)(void *),
void (*action_func)(void *),
void (*teardown_func)(void *),
void *arg)
{
smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func,
arg);
}
/*
* Provide dummy SMP support for UP kernels. Modules that need to use SMP
* APIs will still work using this dummy support.
*/
static void
mp_setvariables_for_up(void *dummy)
{
mp_ncpus = 1;
mp_ncores = 1;
mp_maxid = PCPU_GET(cpuid);
CPU_SETOF(mp_maxid, &all_cpus);
KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
}
SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
mp_setvariables_for_up, NULL);
#endif /* SMP */
void
smp_no_rendezvous_barrier(void *dummy)
{
#ifdef SMP
KASSERT((!smp_started),("smp_no_rendezvous called and smp is started"));
#endif
}
void
smp_rendezvous_cpus_retry(cpuset_t map,
void (* setup_func)(void *),
void (* action_func)(void *),
void (* teardown_func)(void *),
void (* wait_func)(void *, int),
struct smp_rendezvous_cpus_retry_arg *arg)
{
int cpu;
CPU_COPY(&map, &arg->cpus);
/*
* Only one CPU to execute on.
*/
if (!smp_started) {
spinlock_enter();
if (setup_func != NULL)
setup_func(arg);
if (action_func != NULL)
action_func(arg);
if (teardown_func != NULL)
teardown_func(arg);
spinlock_exit();
return;
}
/*
* Execute an action on all specified CPUs while retrying until they
* all acknowledge completion.
*/
for (;;) {
smp_rendezvous_cpus(
arg->cpus,
setup_func,
action_func,
teardown_func,
arg);
if (CPU_EMPTY(&arg->cpus))
break;
CPU_FOREACH(cpu) {
if (!CPU_ISSET(cpu, &arg->cpus))
continue;
wait_func(arg, cpu);
}
}
}
void
smp_rendezvous_cpus_done(struct smp_rendezvous_cpus_retry_arg *arg)
{
CPU_CLR_ATOMIC(curcpu, &arg->cpus);
}
/*
* If (prio & PDROP) == 0:
* Wait for specified idle threads to switch once. This ensures that even
* preempted threads have cycled through the switch function once,
* exiting their codepaths. This allows us to change global pointers
* with no other synchronization.
* If (prio & PDROP) != 0:
* Force the specified CPUs to switch context at least once.
*/
int
quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
{
struct pcpu *pcpu;
u_int *gen;
int error;
int cpu;
error = 0;
if ((prio & PDROP) == 0) {
gen = mallocarray(sizeof(u_int), mp_maxid + 1, M_TEMP,
M_WAITOK);
for (cpu = 0; cpu <= mp_maxid; cpu++) {
if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
continue;
pcpu = pcpu_find(cpu);
gen[cpu] = pcpu->pc_idlethread->td_generation;
}
}
for (cpu = 0; cpu <= mp_maxid; cpu++) {
if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
continue;
pcpu = pcpu_find(cpu);
thread_lock(curthread);
sched_bind(curthread, cpu);
thread_unlock(curthread);
if ((prio & PDROP) != 0)
continue;
while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
error = tsleep(quiesce_cpus, prio & ~PDROP, wmesg, 1);
if (error != EWOULDBLOCK)
goto out;
error = 0;
}
}
out:
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
if ((prio & PDROP) == 0)
free(gen, M_TEMP);
return (error);
}
int
quiesce_all_cpus(const char *wmesg, int prio)
{
return quiesce_cpus(all_cpus, wmesg, prio);
}
/*
* Observe all CPUs not executing in critical section.
* We are not in one so the check for us is safe. If the found
* thread changes to something else we know the section was
* exited as well.
*/
void
quiesce_all_critical(void)
{
struct thread *td, *newtd;
struct pcpu *pcpu;
int cpu;
MPASS(curthread->td_critnest == 0);
CPU_FOREACH(cpu) {
pcpu = cpuid_to_pcpu[cpu];
td = pcpu->pc_curthread;
for (;;) {
if (td->td_critnest == 0)
break;
cpu_spinwait();
newtd = (struct thread *)
atomic_load_acq_ptr((void *)pcpu->pc_curthread);
if (td != newtd)
break;
}
}
}
static void
cpus_fence_seq_cst_issue(void *arg __unused)
{
atomic_thread_fence_seq_cst();
}
/*
* Send an IPI forcing a sequentially consistent fence.
*
* Allows replacement of an explicit fence with a compiler barrier.
* Trades a speedup during normal execution for a significant slowdown when
* the barrier is needed.
*/
void
cpus_fence_seq_cst(void)
{
#ifdef SMP
smp_rendezvous(
smp_no_rendezvous_barrier,
cpus_fence_seq_cst_issue,
smp_no_rendezvous_barrier,
NULL
);
#else
cpus_fence_seq_cst_issue(NULL);
#endif
}
/* Extra care is taken with this sysctl because the data type is volatile */
static int
sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
{
int error, active;
active = smp_started;
error = SYSCTL_OUT(req, &active, sizeof(active));
return (error);
}
#ifdef SMP
void
topo_init_node(struct topo_node *node)
{
bzero(node, sizeof(*node));
TAILQ_INIT(&node->children);
}
void
topo_init_root(struct topo_node *root)
{
topo_init_node(root);
root->type = TOPO_TYPE_SYSTEM;
}
/*
* Add a child node with the given ID under the given parent.
* Do nothing if there is already a child with that ID.
*/
struct topo_node *
topo_add_node_by_hwid(struct topo_node *parent, int hwid,
topo_node_type type, uintptr_t subtype)
{
struct topo_node *node;
TAILQ_FOREACH_REVERSE(node, &parent->children,
topo_children, siblings) {
if (node->hwid == hwid
&& node->type == type && node->subtype == subtype) {
return (node);
}
}
node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
topo_init_node(node);
node->parent = parent;
node->hwid = hwid;
node->type = type;
node->subtype = subtype;
TAILQ_INSERT_TAIL(&parent->children, node, siblings);
parent->nchildren++;
return (node);
}
/*
* Find a child node with the given ID under the given parent.
*/
struct topo_node *
topo_find_node_by_hwid(struct topo_node *parent, int hwid,
topo_node_type type, uintptr_t subtype)
{
struct topo_node *node;
TAILQ_FOREACH(node, &parent->children, siblings) {
if (node->hwid == hwid
&& node->type == type && node->subtype == subtype) {
return (node);
}
}
return (NULL);
}
/*
* Given a node, change the order of its parent's child nodes such
* that the node becomes the first child while preserving the cyclic
* order of the children. In other words, the given node is promoted
* by rotation.
*/
void
topo_promote_child(struct topo_node *child)
{
struct topo_node *next;
struct topo_node *node;
struct topo_node *parent;
parent = child->parent;
next = TAILQ_NEXT(child, siblings);
TAILQ_REMOVE(&parent->children, child, siblings);
TAILQ_INSERT_HEAD(&parent->children, child, siblings);
while (next != NULL) {
node = next;
next = TAILQ_NEXT(node, siblings);
TAILQ_REMOVE(&parent->children, node, siblings);
TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
child = node;
}
}
/*
* Iterate to the next node in the depth-first search (traversal) of
* the topology tree.
*/
struct topo_node *
topo_next_node(struct topo_node *top, struct topo_node *node)
{
struct topo_node *next;
if ((next = TAILQ_FIRST(&node->children)) != NULL)
return (next);
if ((next = TAILQ_NEXT(node, siblings)) != NULL)
return (next);
while (node != top && (node = node->parent) != top)
if ((next = TAILQ_NEXT(node, siblings)) != NULL)
return (next);
return (NULL);
}
/*
* Iterate to the next node in the depth-first search of the topology tree,
* but without descending below the current node.
*/
struct topo_node *
topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
{
struct topo_node *next;
if ((next = TAILQ_NEXT(node, siblings)) != NULL)
return (next);
while (node != top && (node = node->parent) != top)
if ((next = TAILQ_NEXT(node, siblings)) != NULL)
return (next);
return (NULL);
}
/*
* Assign the given ID to the given topology node that represents a logical
* processor.
*/
void
topo_set_pu_id(struct topo_node *node, cpuid_t id)
{
KASSERT(node->type == TOPO_TYPE_PU,
("topo_set_pu_id: wrong node type: %u", node->type));
KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
("topo_set_pu_id: cpuset already not empty"));
node->id = id;
CPU_SET(id, &node->cpuset);
node->cpu_count = 1;
node->subtype = 1;
while ((node = node->parent) != NULL) {
KASSERT(!CPU_ISSET(id, &node->cpuset),
("logical ID %u is already set in node %p", id, node));
CPU_SET(id, &node->cpuset);
node->cpu_count++;
}
}
static struct topology_spec {
topo_node_type type;
bool match_subtype;
uintptr_t subtype;
} topology_level_table[TOPO_LEVEL_COUNT] = {
[TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, },
[TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, },
[TOPO_LEVEL_CACHEGROUP] = {
.type = TOPO_TYPE_CACHE,
.match_subtype = true,
.subtype = CG_SHARE_L3,
},
[TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, },
[TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, },
};
static bool
topo_analyze_table(struct topo_node *root, int all, enum topo_level level,
struct topo_analysis *results)
{
struct topology_spec *spec;
struct topo_node *node;
int count;
if (level >= TOPO_LEVEL_COUNT)
return (true);
spec = &topology_level_table[level];
count = 0;
node = topo_next_node(root, root);
while (node != NULL) {
if (node->type != spec->type ||
(spec->match_subtype && node->subtype != spec->subtype)) {
node = topo_next_node(root, node);
continue;
}
if (!all && CPU_EMPTY(&node->cpuset)) {
node = topo_next_nonchild_node(root, node);
continue;
}
count++;
if (!topo_analyze_table(node, all, level + 1, results))
return (false);
node = topo_next_nonchild_node(root, node);
}
/* No explicit subgroups is essentially one subgroup. */
if (count == 0) {
count = 1;
if (!topo_analyze_table(root, all, level + 1, results))
return (false);
}
if (results->entities[level] == -1)
results->entities[level] = count;
else if (results->entities[level] != count)
return (false);
return (true);
}
/*
* Check if the topology is uniform, that is, each package has the same number
* of cores in it and each core has the same number of threads (logical
* processors) in it. If so, calculate the number of packages, the number of
* groups per package, the number of cachegroups per group, and the number of
 * logical processors per cachegroup. The 'all' parameter tells whether to
 * include administratively disabled logical processors in the analysis.
*/
int
topo_analyze(struct topo_node *topo_root, int all,
struct topo_analysis *results)
{
results->entities[TOPO_LEVEL_PKG] = -1;
results->entities[TOPO_LEVEL_CORE] = -1;
results->entities[TOPO_LEVEL_THREAD] = -1;
results->entities[TOPO_LEVEL_GROUP] = -1;
results->entities[TOPO_LEVEL_CACHEGROUP] = -1;
if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results))
return (0);
KASSERT(results->entities[TOPO_LEVEL_PKG] > 0,
("bug in topology or analysis"));
return (1);
}
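A hedged usage sketch, mirroring what cpu_mp_announce() in mp_x86.c does with the result and assuming the MD-built topo_root: on a uniform topology the entities[] counts multiply out to describe the whole machine.

	struct topo_analysis ta;

	if (topo_analyze(&topo_root, 1, &ta))
		printf("%d package(s) x %d core(s) x %d thread(s)\n",
		    ta.entities[TOPO_LEVEL_PKG],
		    ta.entities[TOPO_LEVEL_CORE],
		    ta.entities[TOPO_LEVEL_THREAD]);
	else
		printf("Non-uniform topology\n");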
#endif /* SMP */
diff --git a/sys/sys/smp.h b/sys/sys/smp.h
index 252dc9dc1cae..735cad0439a3 100644
--- a/sys/sys/smp.h
+++ b/sys/sys/smp.h
@@ -1,298 +1,311 @@
/*-
* SPDX-License-Identifier: Beerware
*
* ----------------------------------------------------------------------------
* "THE BEER-WARE LICENSE" (Revision 42):
* <phk@FreeBSD.org> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
* ----------------------------------------------------------------------------
*/
#ifndef _SYS_SMP_H_
#define _SYS_SMP_H_
#ifdef _KERNEL
#ifndef LOCORE
#include <sys/cpuset.h>
#include <sys/queue.h>
+#include "opt_global.h"
+
/*
* Types of nodes in the topological tree.
*/
typedef enum {
/* No node has this type; can be used in topo API calls. */
TOPO_TYPE_DUMMY,
/* Processing unit aka computing unit aka logical CPU. */
TOPO_TYPE_PU,
/* Physical subdivision of a package. */
TOPO_TYPE_CORE,
/* CPU L1/L2/L3 cache. */
TOPO_TYPE_CACHE,
/* Package aka chip, equivalent to socket. */
TOPO_TYPE_PKG,
/* NUMA node. */
TOPO_TYPE_NODE,
/* Other logical or physical grouping of PUs. */
	/* E.g. PUs on the same die, or PUs sharing an FPU. */
TOPO_TYPE_GROUP,
/* The whole system. */
TOPO_TYPE_SYSTEM
} topo_node_type;
/* Hardware identifier of a topology component. */
typedef unsigned int hwid_t;
/* Logical CPU identifier. */
typedef int cpuid_t;
/* A node in the topology. */
struct topo_node {
struct topo_node *parent;
TAILQ_HEAD(topo_children, topo_node) children;
TAILQ_ENTRY(topo_node) siblings;
cpuset_t cpuset;
topo_node_type type;
uintptr_t subtype;
hwid_t hwid;
cpuid_t id;
int nchildren;
int cpu_count;
};
/*
* Scheduling topology of a NUMA or SMP system.
*
* The top level topology is an array of pointers to groups. Each group
* contains a bitmask of cpus in its group or subgroups. It may also
* contain a pointer to an array of child groups.
*
 * The bitmasks at non-leaf groups may be used by consumers who support
* a smaller depth than the hardware provides.
*
* The topology may be omitted by systems where all CPUs are equal.
*/
+#if defined(CPUGRP_SCORE)
+#define CG_SCORE_CLASS_MAX 8
+#define CG_SCORE_CAPABILITY_MAX 2
+
+#define CG_SCORE_DEFAULT 0x80
+#endif
+
struct cpu_group {
struct cpu_group *cg_parent; /* Our parent group. */
struct cpu_group *cg_child; /* Optional children groups. */
cpuset_t cg_mask; /* Mask of cpus in this group. */
int32_t cg_count; /* Count of cpus in this group. */
int32_t cg_first; /* First cpu in this group. */
int32_t cg_last; /* Last cpu in this group. */
int16_t cg_children; /* Number of children groups. */
int8_t cg_level; /* Shared cache level. */
int8_t cg_flags; /* Traversal modifiers. */
+#if defined(CPUGRP_SCORE)
+ uint8_t cg_score[CG_SCORE_CLASS_MAX][CG_SCORE_CAPABILITY_MAX];
+ /* Performance/Efficiency Score from Intel HFI/ITD */
+#endif
};
typedef struct cpu_group *cpu_group_t;
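The new cg_score matrix gives every scheduling group CG_SCORE_CLASS_MAX x CG_SCORE_CAPABILITY_MAX one-byte scores, all preset to CG_SCORE_DEFAULT by x86topo_add_sched_group() in mp_x86.c below. A minimal accessor sketch; the indexing convention (capability 0 = performance, 1 = efficiency, classes as reported by Intel HFI/ITD) is an assumption of this sketch, not something the patch defines:

#if defined(CPUGRP_SCORE)
/* Hypothetical helper; the index semantics are assumed, not fixed here. */
static __inline uint8_t
cg_score_get(const struct cpu_group *cg, int class, int cap)
{
	KASSERT(class >= 0 && class < CG_SCORE_CLASS_MAX &&
	    cap >= 0 && cap < CG_SCORE_CAPABILITY_MAX,
	    ("cg_score_get: index out of range"));
	return (cg->cg_score[class][cap]);
}
#endif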
/*
* Defines common resources for CPUs in the group. The highest level
* resource should be used when multiple are shared.
*/
#define CG_SHARE_NONE 0
#define CG_SHARE_L1 1
#define CG_SHARE_L2 2
#define CG_SHARE_L3 3
#define MAX_CACHE_LEVELS CG_SHARE_L3
/*
* Behavior modifiers for load balancing and affinity.
*/
#define CG_FLAG_HTT 0x01 /* Schedule the alternate core last. */
#define CG_FLAG_SMT 0x02 /* New age htt, less crippled. */
#define CG_FLAG_THREAD (CG_FLAG_HTT | CG_FLAG_SMT) /* Any threading. */
#define CG_FLAG_NODE 0x04 /* NUMA node. */
/*
* Convenience routines for building and traversing topologies.
*/
#ifdef SMP
void topo_init_node(struct topo_node *node);
void topo_init_root(struct topo_node *root);
struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid,
topo_node_type type, uintptr_t subtype);
struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid,
topo_node_type type, uintptr_t subtype);
void topo_promote_child(struct topo_node *child);
struct topo_node * topo_next_node(struct topo_node *top,
struct topo_node *node);
struct topo_node * topo_next_nonchild_node(struct topo_node *top,
struct topo_node *node);
void topo_set_pu_id(struct topo_node *node, cpuid_t id);
enum topo_level {
TOPO_LEVEL_PKG = 0,
/*
* Some systems have useful sub-package core organizations. On these,
* a package has one or more subgroups. Each subgroup contains one or
* more cache groups (cores that share a last level cache).
*/
TOPO_LEVEL_GROUP,
TOPO_LEVEL_CACHEGROUP,
TOPO_LEVEL_CORE,
TOPO_LEVEL_THREAD,
TOPO_LEVEL_COUNT /* Must be last */
};
struct topo_analysis {
int entities[TOPO_LEVEL_COUNT];
};
int topo_analyze(struct topo_node *topo_root, int all,
struct topo_analysis *results);
#define TOPO_FOREACH(i, root) \
for (i = root; i != NULL; i = topo_next_node(root, i))
struct cpu_group *smp_topo(void);
struct cpu_group *smp_topo_alloc(u_int count);
struct cpu_group *smp_topo_none(void);
struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags);
struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share,
int l1count, int l1flags);
struct cpu_group *smp_topo_find(struct cpu_group *top, int cpu);
extern void (*cpustop_restartfunc)(void);
/* The suspend/resume cpusets are x86 only, but minimize ifdefs. */
extern volatile cpuset_t resuming_cpus; /* woken up cpus in suspend pen */
extern volatile cpuset_t started_cpus; /* cpus to let out of stop pen */
extern volatile cpuset_t stopped_cpus; /* cpus in stop pen */
extern volatile cpuset_t suspended_cpus; /* cpus [near] sleeping in susp pen */
extern volatile cpuset_t toresume_cpus; /* cpus to let out of suspend pen */
extern cpuset_t hlt_cpus_mask; /* XXX 'mask' is detail in old impl */
extern cpuset_t logical_cpus_mask;
#endif /* SMP */
extern u_int mp_maxid;
extern int mp_maxcpus;
extern int mp_ncores;
extern int mp_ncpus;
extern int smp_cpus;
extern volatile int smp_started;
extern int smp_threads_per_core;
extern cpuset_t all_cpus;
extern cpuset_t cpuset_domain[MAXMEMDOM]; /* CPUs in each NUMA domain. */
struct pcb;
extern struct pcb *stoppcbs;
/*
* Macro allowing us to determine whether a CPU is absent at any given
* time, thus permitting us to configure sparse maps of cpuid-dependent
* (per-CPU) structures.
*/
#define CPU_ABSENT(x_cpu) (!CPU_ISSET(x_cpu, &all_cpus))
/*
* Macros to iterate over non-absent CPUs. CPU_FOREACH() takes an
* integer iterator and iterates over the available set of CPUs.
* CPU_FIRST() returns the id of the first non-absent CPU. CPU_NEXT()
* returns the id of the next non-absent CPU. It will wrap back to
* CPU_FIRST() once the end of the list is reached. The iterators are
* currently implemented via inline functions.
*/
#define CPU_FOREACH(i) \
for ((i) = 0; (i) <= mp_maxid; (i)++) \
if (!CPU_ABSENT((i)))
static __inline int
cpu_first(void)
{
int i;
for (i = 0;; i++)
if (!CPU_ABSENT(i))
return (i);
}
static __inline int
cpu_next(int i)
{
for (;;) {
i++;
if ((u_int)i > mp_maxid)
i = 0;
if (!CPU_ABSENT(i))
return (i);
}
}
#define CPU_FIRST() cpu_first()
#define CPU_NEXT(i) cpu_next((i))
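For instance, counting the usable CPUs with the iterator above (a trivial sketch; after startup the count equals mp_ncpus):

	int cpu, n = 0;

	CPU_FOREACH(cpu)
		n++;			/* each non-absent CPU exactly once */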
#ifdef SMP
/*
* Machine dependent functions used to initialize MP support.
*
 * cpu_mp_probe() should check whether MP support is present and return
 * zero if it is not, or non-zero if it is. If MP support is present, then
 * cpu_mp_start() will be called so that MP can be enabled. This function
 * should do things such as starting up secondary processors. It should also
 * set up mp_ncpus, all_cpus, and smp_cpus. It should also ensure that
* smp_started is initialized at the appropriate time.
* Once cpu_mp_start() returns, machine independent MP startup code will be
* executed and a simple message will be output to the console. Finally,
* cpu_mp_announce() will be called so that machine dependent messages about
* the MP support may be output to the console if desired.
*
* The cpu_setmaxid() function is called very early during the boot process
* so that the MD code may set mp_maxid to provide an upper bound on CPU IDs
* that other subsystems may use. If a platform is not able to determine
* the exact maximum ID that early, then it may set mp_maxid to MAXCPU - 1.
*/
struct thread;
struct cpu_group *cpu_topo(void);
void cpu_mp_announce(void);
int cpu_mp_probe(void);
void cpu_mp_setmaxid(void);
void cpu_mp_start(void);
void forward_signal(struct thread *);
int restart_cpus(cpuset_t);
int stop_cpus(cpuset_t);
int stop_cpus_hard(cpuset_t);
#if defined(__amd64__) || defined(__i386__)
int suspend_cpus(cpuset_t);
int resume_cpus(cpuset_t);
#endif
void smp_rendezvous_action(void);
extern struct mtx smp_ipi_mtx;
#endif /* SMP */
int quiesce_all_cpus(const char *, int);
int quiesce_cpus(cpuset_t, const char *, int);
void quiesce_all_critical(void);
void cpus_fence_seq_cst(void);
void smp_no_rendezvous_barrier(void *);
void smp_rendezvous(void (*)(void *),
void (*)(void *),
void (*)(void *),
void *arg);
void smp_rendezvous_cpus(cpuset_t,
void (*)(void *),
void (*)(void *),
void (*)(void *),
void *arg);
struct smp_rendezvous_cpus_retry_arg {
cpuset_t cpus;
};
void smp_rendezvous_cpus_retry(cpuset_t,
void (*)(void *),
void (*)(void *),
void (*)(void *),
void (*)(void *, int),
struct smp_rendezvous_cpus_retry_arg *);
void smp_rendezvous_cpus_done(struct smp_rendezvous_cpus_retry_arg *);
#endif /* !LOCORE */
#endif /* _KERNEL */
#endif /* _SYS_SMP_H_ */
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index 1027c2c8972b..5d9a57c8febe 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1,1749 +1,1757 @@
/*-
* Copyright (c) 1996, by Steve Passe
* Copyright (c) 2003, by Peter Wemm
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. The name of the developer may NOT be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#include "opt_acpi.h"
#ifdef __i386__
#include "opt_apic.h"
#endif
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_gdb.h"
#include "opt_kstack_pages.h"
#include "opt_pmap.h"
#include "opt_sched.h"
#include "opt_smp.h"
#include "opt_stack.h"
+#include "opt_global.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bus.h>
#include <sys/cons.h> /* cngetc() */
#include <sys/cpuset.h>
#include <sys/csan.h>
#include <sys/interrupt.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <x86/apicreg.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
#include <machine/stack.h>
#include <x86/ucode.h>
#ifdef DEV_ACPI
#include <contrib/dev/acpica/include/acpi.h>
#include <dev/acpica/acpivar.h>
#endif
static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items");
int mp_naps; /* # of Application Processors */
int boot_cpu_id = -1; /* designated BSP */
/* AP uses this during bootstrap. Do not staticize. */
char *bootSTK;
int bootAP;
/* Free these after use */
void *bootstacks[MAXCPU];
void *dpcpu;
struct susppcb **susppcbs;
#ifdef COUNT_IPIS
/* Interrupt counts. */
static u_long *ipi_preempt_counts[MAXCPU];
static u_long *ipi_ast_counts[MAXCPU];
u_long *ipi_invltlb_counts[MAXCPU];
u_long *ipi_invlrng_counts[MAXCPU];
u_long *ipi_invlpg_counts[MAXCPU];
u_long *ipi_invlcache_counts[MAXCPU];
u_long *ipi_rendezvous_counts[MAXCPU];
static u_long *ipi_hardclock_counts[MAXCPU];
#endif
/* Default cpu_ops implementation. */
struct cpu_ops cpu_ops;
/*
* Local data and functions.
*/
static volatile cpuset_t ipi_stop_nmi_pending;
volatile cpuset_t resuming_cpus;
volatile cpuset_t toresume_cpus;
/* used to hold the AP's until we are ready to release them */
struct mtx ap_boot_mtx;
/* Set to 1 once we're ready to let the APs out of the pen. */
volatile int aps_ready = 0;
/*
 * Store data from cpu_add() until later in the boot when we actually set
 * up the APs.
*/
struct cpu_info *cpu_info;
int *apic_cpuids;
int cpu_apic_ids[MAXCPU];
_Static_assert(MAXCPU <= MAX_APIC_ID,
    "MAXCPU cannot be larger than MAX_APIC_ID");
_Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID,
    "xAPIC_MAX_APIC_ID cannot be larger than MAX_APIC_ID");
static void release_aps(void *dummy);
static void cpustop_handler_post(u_int cpu);
static int hyperthreading_allowed = 1;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN,
&hyperthreading_allowed, 0, "Use Intel HTT logical CPUs");
static int hyperthreading_intr_allowed = 0;
SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN,
&hyperthreading_intr_allowed, 0,
"Allow interrupts on HTT logical CPUs");
static int intr_apic_id_limit = -1;
SYSCTL_INT(_machdep, OID_AUTO, intr_apic_id_limit, CTLFLAG_RDTUN,
&intr_apic_id_limit, 0,
"Maximum permitted APIC ID for interrupt delivery (-1 is unlimited)");
static struct topo_node topo_root;
static int pkg_id_shift;
static int node_id_shift;
static int core_id_shift;
static int disabled_cpus;
struct cache_info {
int id_shift;
int present;
} static caches[MAX_CACHE_LEVELS];
static bool stop_mwait = false;
SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0,
"Use MONITOR/MWAIT when stopping CPU, if available");
void
mem_range_AP_init(void)
{
if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP)
mem_range_softc.mr_op->initAP(&mem_range_softc);
}
/*
* Round up to the next power of two, if necessary, and then
* take log2.
* Returns -1 if argument is zero.
*/
static __inline int
mask_width(u_int x)
{
return (fls(x << (1 - powerof2(x))) - 1);
}
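Worked examples: mask_width(1) == 0, mask_width(4) == 2, mask_width(6) == 3 (6 rounds up to 8), and mask_width(0) == -1 since fls(0) == 0. A self-contained userland check (on FreeBSD fls() lives in <strings.h>; powerof2() is borrowed from sys/param.h):

#include <assert.h>
#include <strings.h>			/* fls() */

#define powerof2(x)	((((x) - 1) & (x)) == 0)

static int
mask_width(unsigned int x)
{
	return (fls(x << (1 - powerof2(x))) - 1);
}

int
main(void)
{
	assert(mask_width(0) == -1);
	assert(mask_width(1) == 0);
	assert(mask_width(4) == 2);
	assert(mask_width(6) == 3);	/* rounded up to 8, then log2 */
	return (0);
}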
/*
* Add a cache level to the cache topology description.
*/
static int
add_deterministic_cache(int type, int level, int share_count)
{
if (type == 0)
return (0);
if (type > 3) {
printf("unexpected cache type %d\n", type);
return (1);
}
if (type == 2) /* ignore instruction cache */
return (1);
if (level == 0 || level > MAX_CACHE_LEVELS) {
printf("unexpected cache level %d\n", level);
return (1);
}
if (caches[level - 1].present) {
printf("WARNING: multiple entries for L%u data cache\n", level);
printf("%u => %u\n", caches[level - 1].id_shift,
mask_width(share_count));
}
caches[level - 1].id_shift = mask_width(share_count);
caches[level - 1].present = 1;
if (caches[level - 1].id_shift > pkg_id_shift) {
printf("WARNING: L%u data cache covers more "
"APIC IDs than a package (%u > %u)\n", level,
caches[level - 1].id_shift, pkg_id_shift);
caches[level - 1].id_shift = pkg_id_shift;
}
if (caches[level - 1].id_shift < core_id_shift) {
printf("WARNING: L%u data cache covers fewer "
"APIC IDs than a core (%u < %u)\n", level,
caches[level - 1].id_shift, core_id_shift);
caches[level - 1].id_shift = core_id_shift;
}
return (1);
}
/*
* Determine topology of processing units and caches for AMD CPUs.
* See:
* - AMD CPUID Specification (Publication # 25481)
* - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559)
* - BKDG For AMD Family 10h Processors (Publication # 31116)
* - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301)
* - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751)
* - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945)
*/
static void
topo_probe_amd(void)
{
u_int p[4];
uint64_t v;
int level;
int nodes_per_socket;
int share_count;
int type;
int i;
/* No multi-core capability. */
if ((amd_feature2 & AMDID2_CMP) == 0)
return;
/*
* XXX Lack of an AMD IOMMU driver prevents use of APIC IDs above
* xAPIC_MAX_APIC_ID. This is a workaround so we boot and function on
* AMD systems with high thread counts, albeit with reduced interrupt
* performance.
*
* We should really set the limit to xAPIC_MAX_APIC_ID by default, and
* have the IOMMU driver increase it. That way if a driver is present
* but disabled, or is otherwise not able to route the interrupts, the
* system can fall back to a functional state. That will require a more
* substantial change though, including having the IOMMU initialize
* earlier.
*/
if (intr_apic_id_limit == -1)
intr_apic_id_limit = xAPIC_MAX_APIC_ID;
/* For families 10h and newer. */
pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >>
AMDID_COREID_SIZE_SHIFT;
/* For 0Fh family. */
if (pkg_id_shift == 0)
pkg_id_shift =
mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1);
/*
* Families prior to 16h define the following value as
* cores per compute unit and we don't really care about the AMD
* compute units at the moment. Perhaps we should treat them as
* cores and cores within the compute units as hardware threads,
* but that's up for debate.
* Later families define the value as threads per compute unit,
* so we are following AMD's nomenclature here.
*/
if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 &&
CPUID_TO_FAMILY(cpu_id) >= 0x16) {
cpuid_count(0x8000001e, 0, p);
share_count = ((p[1] >> 8) & 0xff) + 1;
core_id_shift = mask_width(share_count);
/*
* For Zen (17h), gather Nodes per Processor. Each node is a
* Zeppelin die; TR and EPYC CPUs will have multiple dies per
* package. Communication latency between dies is higher than
* within them.
*/
nodes_per_socket = ((p[2] >> 8) & 0x7) + 1;
node_id_shift = pkg_id_shift - mask_width(nodes_per_socket);
}
if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) {
for (i = 0; ; i++) {
cpuid_count(0x8000001d, i, p);
type = p[0] & 0x1f;
level = (p[0] >> 5) & 0x7;
share_count = 1 + ((p[0] >> 14) & 0xfff);
if (!add_deterministic_cache(type, level, share_count))
break;
}
} else {
if (cpu_exthigh >= 0x80000005) {
cpuid_count(0x80000005, 0, p);
if (((p[2] >> 24) & 0xff) != 0) {
caches[0].id_shift = 0;
caches[0].present = 1;
}
}
if (cpu_exthigh >= 0x80000006) {
cpuid_count(0x80000006, 0, p);
if (((p[2] >> 16) & 0xffff) != 0) {
caches[1].id_shift = 0;
caches[1].present = 1;
}
if (((p[3] >> 18) & 0x3fff) != 0) {
nodes_per_socket = 1;
if ((amd_feature2 & AMDID2_NODE_ID) != 0) {
/*
* Handle multi-node processors that
* have multiple chips, each with its
* own L3 cache, on the same die.
*/
v = rdmsr(0xc001100c);
nodes_per_socket = 1 + ((v >> 3) & 0x7);
}
caches[2].id_shift =
pkg_id_shift - mask_width(nodes_per_socket);
caches[2].present = 1;
}
}
}
}
/*
* Determine topology of processing units for Intel CPUs
* using CPUID Leaf 1 and Leaf 4, if supported.
* See:
* - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer’s Manual,
* Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
* FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
*/
static void
topo_probe_intel_0x4(void)
{
u_int p[4];
int max_cores;
int max_logical;
/* Both zero and one here mean one logical processor per package. */
max_logical = (cpu_feature & CPUID_HTT) != 0 ?
(cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1;
if (max_logical <= 1)
return;
if (cpu_high >= 0x4) {
cpuid_count(0x04, 0, p);
max_cores = ((p[0] >> 26) & 0x3f) + 1;
} else
max_cores = 1;
core_id_shift = mask_width(max_logical/max_cores);
KASSERT(core_id_shift >= 0,
("intel topo: max_cores > max_logical\n"));
pkg_id_shift = core_id_shift + mask_width(max_cores);
}
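As a worked example with assumed CPUID values (not taken from any particular part): max_logical = 16 and max_cores = 8 yield

	core_id_shift = mask_width(16 / 8);		/* == 1 */
	pkg_id_shift = core_id_shift + mask_width(8);	/* == 4 */

so APIC ID bit 0 selects the SMT thread, bits 3..1 the core, and bits 4 and up the package.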
/*
* Determine topology of processing units for Intel CPUs
* using CPUID Leaf 1Fh or 0Bh, if supported.
* See:
* - Intel 64 Architecture Processor Topology Enumeration
 * - Intel 64 and IA-32 Architectures Software Developer’s Manual,
* Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS
* FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS
*/
static void
topo_probe_intel_0xb(void)
{
u_int leaf;
u_int p[4] = { 0 };
int bits;
int type;
int i;
/* Prefer leaf 1Fh (V2 Extended Topology Enumeration). */
if (cpu_high >= 0x1f) {
leaf = 0x1f;
cpuid_count(leaf, 0, p);
}
/* Fall back to leaf 0Bh (Extended Topology Enumeration). */
if (p[1] == 0) {
leaf = 0x0b;
cpuid_count(leaf, 0, p);
}
/* Fall back to leaf 04h (Deterministic Cache Parameters). */
if (p[1] == 0) {
topo_probe_intel_0x4();
return;
}
/* We only support three levels for now. */
for (i = 0; ; i++) {
cpuid_count(leaf, i, p);
bits = p[0] & 0x1f;
type = (p[2] >> 8) & 0xff;
if (type == 0)
break;
if (type == CPUID_TYPE_SMT)
core_id_shift = bits;
else if (type == CPUID_TYPE_CORE)
pkg_id_shift = bits;
else if (bootverbose)
printf("Topology level type %d shift: %d\n", type, bits);
}
if (pkg_id_shift < core_id_shift) {
printf("WARNING: core covers more APIC IDs than a package\n");
core_id_shift = pkg_id_shift;
}
}
/*
* Determine topology of caches for Intel CPUs.
* See:
* - Intel 64 Architecture Processor Topology Enumeration
* - Intel 64 and IA-32 Architectures Software Developer’s Manual
* Volume 2A: Instruction Set Reference, A-M,
* CPUID instruction
*/
static void
topo_probe_intel_caches(void)
{
u_int p[4];
int level;
int share_count;
int type;
int i;
if (cpu_high < 0x4) {
/*
* Available cache level and sizes can be determined
* via CPUID leaf 2, but that requires a huge table of hardcoded
* values, so for now just assume L1 and L2 caches potentially
* shared only by HTT processing units, if HTT is present.
*/
caches[0].id_shift = pkg_id_shift;
caches[0].present = 1;
caches[1].id_shift = pkg_id_shift;
caches[1].present = 1;
return;
}
for (i = 0; ; i++) {
cpuid_count(0x4, i, p);
type = p[0] & 0x1f;
level = (p[0] >> 5) & 0x7;
share_count = 1 + ((p[0] >> 14) & 0xfff);
if (!add_deterministic_cache(type, level, share_count))
break;
}
}
/*
* Determine topology of processing units and caches for Intel CPUs.
* See:
* - Intel 64 Architecture Processor Topology Enumeration
*/
static void
topo_probe_intel(void)
{
/*
* Note that 0x1 <= cpu_high < 4 case should be
* compatible with topo_probe_intel_0x4() logic when
* CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
* or it should trigger the fallback otherwise.
*/
if (cpu_high >= 0xb)
topo_probe_intel_0xb();
else if (cpu_high >= 0x1)
topo_probe_intel_0x4();
topo_probe_intel_caches();
}
/*
 * Topology information is queried only on the BSP, on which this
 * code runs and for which it can query CPUID information.
 * The topology is then extrapolated to all packages under the
 * assumption that the APIC ID to hardware component ID mapping is
 * homogeneous.
 * That doesn't necessarily imply that the topology is uniform.
*/
void
topo_probe(void)
{
static int cpu_topo_probed = 0;
struct x86_topo_layer {
int type;
int subtype;
int id_shift;
} topo_layers[MAX_CACHE_LEVELS + 5];
struct topo_node *parent;
struct topo_node *node;
int layer;
int nlayers;
int node_id;
int i;
#if defined(DEV_ACPI) && MAXMEMDOM > 1
int d, domain;
#endif
if (cpu_topo_probed)
return;
CPU_ZERO(&logical_cpus_mask);
if (mp_ncpus <= 1)
; /* nothing */
else if (cpu_vendor_id == CPU_VENDOR_AMD ||
cpu_vendor_id == CPU_VENDOR_HYGON)
topo_probe_amd();
else if (cpu_vendor_id == CPU_VENDOR_INTEL)
topo_probe_intel();
KASSERT(pkg_id_shift >= core_id_shift,
("bug in APIC topology discovery"));
nlayers = 0;
bzero(topo_layers, sizeof(topo_layers));
topo_layers[nlayers].type = TOPO_TYPE_PKG;
topo_layers[nlayers].id_shift = pkg_id_shift;
if (bootverbose)
printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift);
nlayers++;
if (pkg_id_shift > node_id_shift && node_id_shift != 0) {
topo_layers[nlayers].type = TOPO_TYPE_GROUP;
topo_layers[nlayers].id_shift = node_id_shift;
if (bootverbose)
printf("Node ID shift: %u\n",
topo_layers[nlayers].id_shift);
nlayers++;
}
/*
* Consider all caches to be within a package/chip
* and "in front" of all sub-components like
* cores and hardware threads.
*/
for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) {
if (caches[i].present) {
if (node_id_shift != 0)
KASSERT(caches[i].id_shift <= node_id_shift,
("bug in APIC topology discovery"));
KASSERT(caches[i].id_shift <= pkg_id_shift,
("bug in APIC topology discovery"));
KASSERT(caches[i].id_shift >= core_id_shift,
("bug in APIC topology discovery"));
topo_layers[nlayers].type = TOPO_TYPE_CACHE;
topo_layers[nlayers].subtype = i + 1;
topo_layers[nlayers].id_shift = caches[i].id_shift;
if (bootverbose)
printf("L%u cache ID shift: %u\n",
topo_layers[nlayers].subtype,
topo_layers[nlayers].id_shift);
nlayers++;
}
}
if (pkg_id_shift > core_id_shift) {
topo_layers[nlayers].type = TOPO_TYPE_CORE;
topo_layers[nlayers].id_shift = core_id_shift;
if (bootverbose)
printf("Core ID shift: %u\n",
topo_layers[nlayers].id_shift);
nlayers++;
}
topo_layers[nlayers].type = TOPO_TYPE_PU;
topo_layers[nlayers].id_shift = 0;
nlayers++;
#if defined(DEV_ACPI) && MAXMEMDOM > 1
if (vm_ndomains > 1) {
for (layer = 0; layer < nlayers; ++layer) {
for (i = 0; i <= max_apic_id; ++i) {
if ((i & ((1 << topo_layers[layer].id_shift) - 1)) == 0)
domain = -1;
if (!cpu_info[i].cpu_present)
continue;
d = acpi_pxm_get_cpu_locality(i);
if (domain >= 0 && domain != d)
break;
domain = d;
}
if (i > max_apic_id)
break;
}
KASSERT(layer < nlayers, ("NUMA domain smaller than PU"));
memmove(&topo_layers[layer+1], &topo_layers[layer],
sizeof(*topo_layers) * (nlayers - layer));
topo_layers[layer].type = TOPO_TYPE_NODE;
topo_layers[layer].subtype = CG_SHARE_NONE;
nlayers++;
}
#endif
topo_init_root(&topo_root);
for (i = 0; i <= max_apic_id; ++i) {
if (!cpu_info[i].cpu_present)
continue;
parent = &topo_root;
for (layer = 0; layer < nlayers; ++layer) {
#if defined(DEV_ACPI) && MAXMEMDOM > 1
if (topo_layers[layer].type == TOPO_TYPE_NODE) {
node_id = acpi_pxm_get_cpu_locality(i);
} else
#endif
node_id = i >> topo_layers[layer].id_shift;
parent = topo_add_node_by_hwid(parent, node_id,
topo_layers[layer].type,
topo_layers[layer].subtype);
}
}
parent = &topo_root;
for (layer = 0; layer < nlayers; ++layer) {
#if defined(DEV_ACPI) && MAXMEMDOM > 1
if (topo_layers[layer].type == TOPO_TYPE_NODE)
node_id = acpi_pxm_get_cpu_locality(boot_cpu_id);
else
#endif
node_id = boot_cpu_id >> topo_layers[layer].id_shift;
node = topo_find_node_by_hwid(parent, node_id,
topo_layers[layer].type,
topo_layers[layer].subtype);
topo_promote_child(node);
parent = node;
}
cpu_topo_probed = 1;
}
/*
* Assign logical CPU IDs to local APICs.
*/
void
assign_cpu_ids(void)
{
struct topo_node *node;
u_int smt_mask;
int nhyper;
smt_mask = (1u << core_id_shift) - 1;
/*
* Assign CPU IDs to local APIC IDs and disable any CPUs
* beyond MAXCPU. CPU 0 is always assigned to the BSP.
*/
mp_ncpus = 0;
nhyper = 0;
TOPO_FOREACH(node, &topo_root) {
if (node->type != TOPO_TYPE_PU)
continue;
if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask))
cpu_info[node->hwid].cpu_hyperthread = 1;
if (resource_disabled("lapic", node->hwid)) {
if (node->hwid != boot_cpu_id)
cpu_info[node->hwid].cpu_disabled = 1;
else
printf("Cannot disable BSP, APIC ID = %d\n",
node->hwid);
}
if (!hyperthreading_allowed &&
cpu_info[node->hwid].cpu_hyperthread)
cpu_info[node->hwid].cpu_disabled = 1;
if (mp_ncpus >= MAXCPU)
cpu_info[node->hwid].cpu_disabled = 1;
if (cpu_info[node->hwid].cpu_disabled) {
disabled_cpus++;
continue;
}
if (cpu_info[node->hwid].cpu_hyperthread)
nhyper++;
cpu_apic_ids[mp_ncpus] = node->hwid;
apic_cpuids[node->hwid] = mp_ncpus;
topo_set_pu_id(node, mp_ncpus);
mp_ncpus++;
}
KASSERT(mp_maxid >= mp_ncpus - 1,
("%s: counters out of sync: max %d, count %d", __func__, mp_maxid,
mp_ncpus));
mp_ncores = mp_ncpus - nhyper;
smp_threads_per_core = mp_ncpus / mp_ncores;
}
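For instance, with core_id_shift == 1 the mask computed above is 0x1; if the BSP has APIC ID 0, every odd APIC ID differs from the BSP in its SMT bits and is tagged as a hyperthread. A sketch of that test with assumed values:

	u_int smt_mask = (1u << 1) - 1;		/* core_id_shift == 1 */

	/* APIC ID 3 vs. BSP APIC ID 0: SMT bits differ, so it is an HT. */
	if ((3 & smt_mask) != (0 & smt_mask))
		cpu_info[3].cpu_hyperthread = 1;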
/*
* Print various information about the SMP system hardware and setup.
*/
void
cpu_mp_announce(void)
{
struct topo_node *node;
const char *hyperthread;
struct topo_analysis topology;
printf("FreeBSD/SMP: ");
if (topo_analyze(&topo_root, 1, &topology)) {
printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]);
if (topology.entities[TOPO_LEVEL_GROUP] > 1)
printf(" x %d groups",
topology.entities[TOPO_LEVEL_GROUP]);
if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
printf(" x %d cache groups",
topology.entities[TOPO_LEVEL_CACHEGROUP]);
if (topology.entities[TOPO_LEVEL_CORE] > 0)
printf(" x %d core(s)",
topology.entities[TOPO_LEVEL_CORE]);
if (topology.entities[TOPO_LEVEL_THREAD] > 1)
printf(" x %d hardware threads",
topology.entities[TOPO_LEVEL_THREAD]);
} else {
printf("Non-uniform topology");
}
printf("\n");
if (disabled_cpus) {
printf("FreeBSD/SMP Online: ");
if (topo_analyze(&topo_root, 0, &topology)) {
printf("%d package(s)",
topology.entities[TOPO_LEVEL_PKG]);
if (topology.entities[TOPO_LEVEL_GROUP] > 1)
printf(" x %d groups",
topology.entities[TOPO_LEVEL_GROUP]);
if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1)
printf(" x %d cache groups",
topology.entities[TOPO_LEVEL_CACHEGROUP]);
if (topology.entities[TOPO_LEVEL_CORE] > 0)
printf(" x %d core(s)",
topology.entities[TOPO_LEVEL_CORE]);
if (topology.entities[TOPO_LEVEL_THREAD] > 1)
printf(" x %d hardware threads",
topology.entities[TOPO_LEVEL_THREAD]);
} else {
printf("Non-uniform topology");
}
printf("\n");
}
if (!bootverbose)
return;
TOPO_FOREACH(node, &topo_root) {
switch (node->type) {
case TOPO_TYPE_PKG:
printf("Package HW ID = %u\n", node->hwid);
break;
case TOPO_TYPE_CORE:
printf("\tCore HW ID = %u\n", node->hwid);
break;
case TOPO_TYPE_PU:
if (cpu_info[node->hwid].cpu_hyperthread)
hyperthread = "/HT";
else
hyperthread = "";
if (node->subtype == 0)
				printf("\t\tCPU (AP%s): APIC ID: %u "
				    "(disabled)\n", hyperthread, node->hwid);
else if (node->id == 0)
printf("\t\tCPU0 (BSP): APIC ID: %u\n",
node->hwid);
else
printf("\t\tCPU%u (AP%s): APIC ID: %u\n",
node->id, hyperthread, node->hwid);
break;
default:
/* ignored */
break;
}
}
}
/*
* Add a scheduling group, a group of logical processors sharing
* a particular cache (and, thus having an affinity), to the scheduling
* topology.
* This function recursively works on lower level caches.
*/
static void
x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root)
{
struct topo_node *node;
int nchildren;
int ncores;
int i;
KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE ||
root->type == TOPO_TYPE_NODE || root->type == TOPO_TYPE_GROUP,
("x86topo_add_sched_group: bad type: %u", root->type));
CPU_COPY(&root->cpuset, &cg_root->cg_mask);
cg_root->cg_count = root->cpu_count;
if (root->type == TOPO_TYPE_CACHE)
cg_root->cg_level = root->subtype;
else
cg_root->cg_level = CG_SHARE_NONE;
if (root->type == TOPO_TYPE_NODE)
cg_root->cg_flags = CG_FLAG_NODE;
else
cg_root->cg_flags = 0;
+#if defined(CPUGRP_SCORE)
+ /*
+ * Set default performance/efficiency score.
+ */
+ memset(cg_root->cg_score, CG_SCORE_DEFAULT, sizeof(cg_root->cg_score));
+#endif
+
/*
* Check how many core nodes we have under the given root node.
* If we have multiple logical processors, but not multiple
* cores, then those processors must be hardware threads.
*/
ncores = 0;
node = root;
while (node != NULL) {
if (node->type != TOPO_TYPE_CORE) {
node = topo_next_node(root, node);
continue;
}
ncores++;
node = topo_next_nonchild_node(root, node);
}
if (cg_root->cg_level != CG_SHARE_NONE &&
root->cpu_count > 1 && ncores < 2)
cg_root->cg_flags |= CG_FLAG_SMT;
/*
* Find out how many cache nodes we have under the given root node.
* We ignore cache nodes that cover all the same processors as the
* root node. Also, we do not descend below found cache nodes.
* That is, we count top-level "non-redundant" caches under the root
* node.
*/
nchildren = 0;
node = root;
while (node != NULL) {
/*
* When some APICs are disabled by tunables, nodes can end up
* with an empty cpuset. Nodes with an empty cpuset will be
* translated into cpu groups with empty cpusets. smp_topo_fill
* will then set cg_first and cg_last to -1. This isn't
* correctly handled in all functions. E.g. when
* cpu_search_lowest and cpu_search_highest loop through all
* cpus, they call CPU_ISSET on cpu -1 which ends up in a
* general protection fault.
*
* We could fix the scheduler to handle empty cpu groups
* correctly. Nevertheless, empty cpu groups are causing
	 * overhead for no value. So it makes more sense to simply not
	 * create them.
*/
if (CPU_EMPTY(&node->cpuset)) {
node = topo_next_node(root, node);
continue;
}
if (CPU_CMP(&node->cpuset, &root->cpuset) == 0) {
if (node->type == TOPO_TYPE_CACHE &&
cg_root->cg_level < node->subtype)
cg_root->cg_level = node->subtype;
if (node->type == TOPO_TYPE_NODE)
cg_root->cg_flags |= CG_FLAG_NODE;
node = topo_next_node(root, node);
continue;
}
if (node->type != TOPO_TYPE_GROUP &&
node->type != TOPO_TYPE_NODE &&
node->type != TOPO_TYPE_CACHE) {
node = topo_next_node(root, node);
continue;
}
nchildren++;
node = topo_next_nonchild_node(root, node);
}
/*
* We are not interested in nodes including only one CPU each.
*/
if (nchildren == root->cpu_count)
return;
/*
* We are not interested in nodes without children.
*/
cg_root->cg_children = nchildren;
if (nchildren == 0)
return;
cg_root->cg_child = smp_topo_alloc(nchildren);
/*
* Now find again the same cache nodes as above and recursively
* build scheduling topologies for them.
*/
node = root;
i = 0;
while (node != NULL) {
if ((node->type != TOPO_TYPE_GROUP &&
node->type != TOPO_TYPE_NODE &&
node->type != TOPO_TYPE_CACHE) ||
CPU_CMP(&node->cpuset, &root->cpuset) == 0 ||
CPU_EMPTY(&node->cpuset)) {
node = topo_next_node(root, node);
continue;
}
cg_root->cg_child[i].cg_parent = cg_root;
x86topo_add_sched_group(node, &cg_root->cg_child[i]);
i++;
node = topo_next_nonchild_node(root, node);
}
}
/*
* Build the MI scheduling topology from the discovered hardware topology.
*/
struct cpu_group *
cpu_topo(void)
{
struct cpu_group *cg_root;
if (mp_ncpus <= 1)
return (smp_topo_none());
cg_root = smp_topo_alloc(1);
x86topo_add_sched_group(&topo_root, cg_root);
return (cg_root);
}
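Downstream, a consumer such as the coredirector driver this patch series targets could walk the resulting tree and consult the scores added here. A minimal sketch, assuming CPUGRP_SCORE is enabled and using the [0][0] slot as a placeholder index:

#if defined(CPUGRP_SCORE)
	struct cpu_group *top, *cg;
	int cpu;

	cpu = PCPU_GET(cpuid);
	top = smp_topo();
	cg = smp_topo_find(top, cpu);	/* deepest group containing 'cpu' */
	if (cg != NULL && cg->cg_score[0][0] != CG_SCORE_DEFAULT)
		printf("cpu%d: scored group of %d CPUs\n", cpu, cg->cg_count);
#endif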
static void
cpu_alloc(void *dummy __unused)
{
/*
* Dynamically allocate the arrays that depend on the
* maximum APIC ID.
*/
cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS,
M_WAITOK | M_ZERO);
apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS,
M_WAITOK | M_ZERO);
}
SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL);
/*
* Add a logical CPU to the topology.
*/
void
cpu_add(u_int apic_id, char boot_cpu)
{
if (apic_id > max_apic_id)
panic("SMP: APIC ID %d too high", apic_id);
KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice",
apic_id));
cpu_info[apic_id].cpu_present = 1;
if (boot_cpu) {
KASSERT(boot_cpu_id == -1,
("CPU %u claims to be BSP, but CPU %u already is", apic_id,
boot_cpu_id));
boot_cpu_id = apic_id;
cpu_info[apic_id].cpu_bsp = 1;
}
if (bootverbose)
printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" :
"AP");
}
void
cpu_mp_setmaxid(void)
{
/*
* mp_ncpus and mp_maxid should be already set by calls to cpu_add().
* If there were no calls to cpu_add() assume this is a UP system.
*/
if (mp_ncpus == 0)
mp_ncpus = 1;
}
int
cpu_mp_probe(void)
{
/*
* Always record BSP in CPU map so that the mbuf init code works
* correctly.
*/
CPU_SETOF(0, &all_cpus);
return (mp_ncpus > 1);
}
/*
 * AP CPUs call this to initialize themselves.
*/
void
init_secondary_tail(void)
{
u_int cpuid;
pmap_activate_boot(vmspace_pmap(proc0.p_vmspace));
/*
* On real hardware, switch to x2apic mode if possible. Do it
* after aps_ready was signalled, to avoid manipulating the
* mode while BSP might still want to send some IPI to us
* (second startup IPI is ignored on modern hardware etc).
*/
lapic_xapic_mode();
/* Initialize the PAT MSR. */
pmap_init_pat();
/* set up CPU registers and state */
cpu_setregs();
/* set up SSE/NX */
initializecpu();
/* set up FPU state on the AP */
#ifdef __amd64__
fpuinit();
#else
npxinit(false);
#endif
if (cpu_ops.cpu_init)
cpu_ops.cpu_init();
/* A quick check from sanity claus */
cpuid = PCPU_GET(cpuid);
if (PCPU_GET(apic_id) != lapic_id()) {
printf("SMP: cpuid = %d\n", cpuid);
printf("SMP: actual apic_id = %d\n", lapic_id());
printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id));
panic("cpuid mismatch! boom!!");
}
/* Initialize curthread. */
KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
PCPU_SET(curthread, PCPU_GET(idlethread));
schedinit_ap();
mtx_lock_spin(&ap_boot_mtx);
mca_init();
/* Init local apic for irq's */
lapic_setup(1);
/* Set memory range attributes for this CPU to match the BSP */
mem_range_AP_init();
smp_cpus++;
CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid);
if (bootverbose)
printf("SMP: AP CPU #%d Launched!\n", cpuid);
else
printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "",
cpuid, smp_cpus == mp_ncpus ? "\n" : " ");
/* Determine if we are a logical CPU. */
if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread)
CPU_SET(cpuid, &logical_cpus_mask);
if (bootverbose)
lapic_dump("AP");
if (smp_cpus == mp_ncpus) {
/* enable IPI's, tlb shootdown, freezes etc */
atomic_store_rel_int(&smp_started, 1);
}
#ifdef __amd64__
if (pmap_pcid_enabled)
load_cr4(rcr4() | CR4_PCIDE);
load_ds(_udatasel);
load_es(_udatasel);
load_fs(_ufssel);
#endif
mtx_unlock_spin(&ap_boot_mtx);
/* Wait until all the AP's are up. */
while (atomic_load_acq_int(&smp_started) == 0)
ia32_pause();
kcsan_cpu_init(cpuid);
sched_ap_entry();
panic("scheduler returned us to %s", __func__);
/* NOTREACHED */
}
static void
smp_after_idle_runnable(void *arg __unused)
{
int cpu;
if (mp_ncpus == 1)
return;
KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__));
/*
* Wait for all APs to handle an interrupt. After that, we know that
* the APs have entered the scheduler at least once, so the boot stacks
* are safe to free.
*/
smp_rendezvous(smp_no_rendezvous_barrier, NULL,
smp_no_rendezvous_barrier, NULL);
for (cpu = 1; cpu < mp_ncpus; cpu++) {
kmem_free(bootstacks[cpu], kstack_pages * PAGE_SIZE);
}
}
SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
smp_after_idle_runnable, NULL);
/*
 * We tell the I/O APIC code about all the CPUs that we want to receive
* interrupts. If we don't want certain CPUs to receive IRQs we
* can simply not tell the I/O APIC code about them in this function.
* We also do not tell it about the BSP since it tells itself about
* the BSP internally to work with UP kernels and on UP machines.
*/
void
set_interrupt_apic_ids(void)
{
u_int i, apic_id;
for (i = 0; i < MAXCPU; i++) {
apic_id = cpu_apic_ids[i];
if (apic_id == -1)
continue;
if (cpu_info[apic_id].cpu_bsp)
continue;
if (cpu_info[apic_id].cpu_disabled)
continue;
if (intr_apic_id_limit >= 0 && apic_id > intr_apic_id_limit)
continue;
/* Don't let hyperthreads service interrupts. */
if (cpu_info[apic_id].cpu_hyperthread &&
!hyperthreading_intr_allowed)
continue;
intr_add_cpu(i);
}
}
#ifdef COUNT_XINVLTLB_HITS
u_int xhits_gbl[MAXCPU];
u_int xhits_pg[MAXCPU];
u_int xhits_rng[MAXCPU];
static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl,
sizeof(xhits_gbl), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg,
sizeof(xhits_pg), "IU", "");
SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng,
sizeof(xhits_rng), "IU", "");
u_int ipi_global;
u_int ipi_page;
u_int ipi_range;
u_int ipi_range_size;
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, "");
SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size,
0, "");
#endif /* COUNT_XINVLTLB_HITS */
/*
* Init and startup IPI.
*/
void
ipi_startup(int apic_id, int vector)
{
/*
* This attempts to follow the algorithm described in the
* Intel Multiprocessor Specification v1.4 in section B.4.
* For each IPI, we allow the local APIC ~20us to deliver the
* IPI. If that times out, we panic.
*/
/*
* first we do an INIT IPI: this INIT IPI might be run, resetting
* and running the target CPU. OR this INIT IPI might be latched (P5
* bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be
* ignored.
*/
lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id);
lapic_ipi_wait(100);
/* Explicitly deassert the INIT IPI. */
lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL |
APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT,
apic_id);
	DELAY(10000); /* wait ~10 ms */
/*
* next we do a STARTUP IPI: the previous INIT IPI might still be
* latched, (P5 bug) this 1st STARTUP would then terminate
* immediately, and the previously started INIT IPI would continue. OR
	 * the previous INIT IPI has already run, and this STARTUP IPI will
	 * run. OR the previous INIT IPI was ignored, and this STARTUP IPI
	 * will run.
*/
lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
vector, apic_id);
if (!lapic_ipi_wait(100))
panic("Failed to deliver first STARTUP IPI to APIC %d",
apic_id);
	DELAY(200); /* wait ~200 us */
/*
* finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF
* the previous STARTUP IPI was cancelled by a latched INIT IPI. OR
* this STARTUP IPI will be ignored, as only ONE STARTUP IPI is
* recognized after hardware RESET or INIT IPI.
*/
lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE |
APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP |
vector, apic_id);
if (!lapic_ipi_wait(100))
panic("Failed to deliver second STARTUP IPI to APIC %d",
apic_id);
	DELAY(200); /* wait ~200 us */
}
static bool
ipi_bitmap_set(int cpu, u_int ipi)
{
u_int bitmap, old, new;
u_int *cpu_bitmap;
bitmap = 1 << ipi;
cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap;
old = *cpu_bitmap;
for (;;) {
if ((old & bitmap) != 0)
break;
new = old | bitmap;
if (atomic_fcmpset_int(cpu_bitmap, &old, new))
break;
}
return (old != 0);
}
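The loop above is a standard lockless read-modify-write: retry the compare-and-swap until the bit is observed set, and report whether any bit was already pending so the caller can skip the redundant vectored IPI. A userland model with C11 atomics (illustrative names):

#include <stdatomic.h>
#include <stdbool.h>

static bool
bitmap_set(atomic_uint *word, unsigned int ipi)
{
	unsigned int bit = 1u << ipi, old;

	old = atomic_load(word);
	while ((old & bit) == 0 &&
	    !atomic_compare_exchange_weak(word, &old, old | bit))
		;			/* 'old' is reloaded on each failure */
	return (old != 0);		/* true: an IPI was already pending */
}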
/*
* Send an IPI to specified CPU handling the bitmap logic.
*/
static void
ipi_send_cpu(int cpu, u_int ipi)
{
KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1,
("IPI to non-existent CPU %d", cpu));
if (IPI_IS_BITMAPED(ipi)) {
if (ipi_bitmap_set(cpu, ipi))
return;
ipi = IPI_BITMAP_VECTOR;
}
lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]);
}
void
ipi_bitmap_handler(struct trapframe frame)
{
struct trapframe *oldframe;
struct thread *td;
int cpu = PCPU_GET(cpuid);
u_int ipi_bitmap;
kasan_mark(&frame, sizeof(frame), sizeof(frame), 0);
td = curthread;
ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]->
pc_ipi_bitmap);
/*
* sched_preempt() must be called to clear the pending preempt
* IPI to enable delivery of further preempts. However, the
* critical section will cause extra scheduler lock thrashing
* when used unconditionally. Only critical_enter() if
* hardclock must also run, which requires the section entry.
*/
if (ipi_bitmap & (1 << IPI_HARDCLOCK))
critical_enter();
td->td_intr_nesting_level++;
oldframe = td->td_intr_frame;
td->td_intr_frame = &frame;
#if defined(STACK) || defined(DDB)
if (ipi_bitmap & (1 << IPI_TRACE))
stack_capture_intr();
#endif
if (ipi_bitmap & (1 << IPI_PREEMPT)) {
#ifdef COUNT_IPIS
(*ipi_preempt_counts[cpu])++;
#endif
sched_preempt(td);
}
if (ipi_bitmap & (1 << IPI_AST)) {
#ifdef COUNT_IPIS
(*ipi_ast_counts[cpu])++;
#endif
/* Nothing to do for AST */
}
if (ipi_bitmap & (1 << IPI_HARDCLOCK)) {
#ifdef COUNT_IPIS
(*ipi_hardclock_counts[cpu])++;
#endif
hardclockintr();
}
td->td_intr_frame = oldframe;
td->td_intr_nesting_level--;
if (ipi_bitmap & (1 << IPI_HARDCLOCK))
critical_exit();
}
/*
* send an IPI to a set of cpus.
*/
void
ipi_selected(cpuset_t cpus, u_int ipi)
{
int cpu;
/*
* IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
* of help in order to understand what is the source.
* Set the mask of receiving CPUs for this purpose.
*/
if (ipi == IPI_STOP_HARD)
CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus);
CPU_FOREACH_ISSET(cpu, &cpus) {
CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
ipi_send_cpu(cpu, ipi);
}
}
/*
* send an IPI to a specific CPU.
*/
void
ipi_cpu(int cpu, u_int ipi)
{
/*
* IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
* of help in order to understand what is the source.
* Set the mask of receiving CPUs for this purpose.
*/
if (ipi == IPI_STOP_HARD)
CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending);
CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi);
ipi_send_cpu(cpu, ipi);
}
/*
* send an IPI to all CPUs EXCEPT myself
*/
void
ipi_all_but_self(u_int ipi)
{
cpuset_t other_cpus;
int cpu, c;
/*
* IPI_STOP_HARD maps to a NMI and the trap handler needs a bit
* of help in order to understand what is the source.
* Set the mask of receiving CPUs for this purpose.
*/
if (ipi == IPI_STOP_HARD) {
other_cpus = all_cpus;
CPU_CLR(PCPU_GET(cpuid), &other_cpus);
CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus);
}
CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi);
if (IPI_IS_BITMAPED(ipi)) {
cpu = PCPU_GET(cpuid);
CPU_FOREACH(c) {
if (c != cpu)
ipi_bitmap_set(c, ipi);
}
ipi = IPI_BITMAP_VECTOR;
}
lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS);
}
void
ipi_self_from_nmi(u_int vector)
{
lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF);
/* Wait for IPI to finish. */
if (!lapic_ipi_wait(50000)) {
if (KERNEL_PANICKED())
return;
else
panic("APIC: IPI is stuck");
}
}
int
ipi_nmi_handler(void)
{
u_int cpuid;
/*
	 * As long as there is no simple way to know about an NMI's
* source, if the bitmask for the current CPU is present in
* the global pending bitword an IPI_STOP_HARD has been issued
* and should be handled.
*/
cpuid = PCPU_GET(cpuid);
if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending))
return (1);
CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending);
cpustop_handler();
return (0);
}
int nmi_kdb_lock;
void
nmi_call_kdb_smp(u_int type, struct trapframe *frame)
{
int cpu;
bool call_post;
cpu = PCPU_GET(cpuid);
if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) {
nmi_call_kdb(cpu, type, frame);
call_post = false;
} else {
savectx(&stoppcbs[cpu]);
CPU_SET_ATOMIC(cpu, &stopped_cpus);
while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1))
ia32_pause();
call_post = true;
}
atomic_store_rel_int(&nmi_kdb_lock, 0);
if (call_post)
cpustop_handler_post(cpu);
}
/*
* Handle an IPI_STOP by saving our current context and spinning (or mwaiting,
* if available) until we are resumed.
*/
void
cpustop_handler(void)
{
struct monitorbuf *mb;
u_int cpu;
bool use_mwait;
cpu = PCPU_GET(cpuid);
savectx(&stoppcbs[cpu]);
use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 &&
!mwait_cpustop_broken);
if (use_mwait) {
mb = PCPU_PTR(monitorbuf);
atomic_store_int(&mb->stop_state,
MONITOR_STOPSTATE_STOPPED);
}
/* Indicate that we are stopped */
CPU_SET_ATOMIC(cpu, &stopped_cpus);
/* Wait for restart */
while (!CPU_ISSET(cpu, &started_cpus)) {
if (use_mwait) {
cpu_monitor(mb, 0, 0);
if (atomic_load_int(&mb->stop_state) ==
MONITOR_STOPSTATE_STOPPED)
cpu_mwait(0, MWAIT_C1);
continue;
}
ia32_pause();
/*
* Halt non-BSP CPUs on panic -- we're never going to need them
* again, and might as well save power / release resources
* (e.g., overprovisioned VM infrastructure).
*/
while (__predict_false(!IS_BSP() && KERNEL_PANICKED()))
halt();
}
cpustop_handler_post(cpu);
}
static void
cpustop_handler_post(u_int cpu)
{
CPU_CLR_ATOMIC(cpu, &started_cpus);
CPU_CLR_ATOMIC(cpu, &stopped_cpus);
/*
* We don't broadcast TLB invalidations to other CPUs when they are
* stopped. Hence, we clear the TLB before resuming.
*/
invltlb_glob();
#if defined(__amd64__) && (defined(DDB) || defined(GDB))
amd64_db_resume_dbreg();
#endif
if (cpu == 0 && cpustop_restartfunc != NULL) {
cpustop_restartfunc();
cpustop_restartfunc = NULL;
}
}
/*
* Handle an IPI_SUSPEND by saving our current context and spinning until we
* are resumed.
*/
void
cpususpend_handler(void)
{
u_int cpu;
mtx_assert(&smp_ipi_mtx, MA_NOTOWNED);
cpu = PCPU_GET(cpuid);
#ifdef XENHVM
/*
* Some Xen guest types (PVH) expose a very minimal set of ACPI tables,
* and for example have no support for SCI. That leads to the suspend
* stacks not being allocated, and hence when attempting to perform a
* Xen triggered suspension FreeBSD will hit a #PF. Avoid saving the
* CPU and FPU contexts if the stacks are not allocated, as the
* hypervisor will already take care of this. Note that we could even
* do this for Xen triggered suspensions on guests that have full ACPI
* support, but doing so would introduce extra complexity.
*/
if (susppcbs == NULL) {
KASSERT(vm_guest == VM_GUEST_XEN, ("Missing suspend stack"));
CPU_SET_ATOMIC(cpu, &suspended_cpus);
CPU_SET_ATOMIC(cpu, &resuming_cpus);
} else
#endif
if (savectx(&susppcbs[cpu]->sp_pcb)) {
#ifdef __amd64__
fpususpend(susppcbs[cpu]->sp_fpususpend);
#else
npxsuspend(susppcbs[cpu]->sp_fpususpend);
#endif
/*
* suspended_cpus is cleared shortly after each AP is restarted
* by a Startup IPI, so that the BSP can proceed to restarting
* the next AP.
*
* resuming_cpus gets cleared when the AP completes
* initialization after having been released by the BSP.
* resuming_cpus is probably not the best name for the
* variable, because it is actually a set of processors that
* haven't resumed yet and haven't necessarily started resuming.
*
* Note that suspended_cpus is meaningful only for ACPI suspend
* as it's not really used for Xen suspend since the APs are
* automatically restored to the running state and the correct
* context. For the same reason resumectx is never called in
* that case.
*/
CPU_SET_ATOMIC(cpu, &suspended_cpus);
CPU_SET_ATOMIC(cpu, &resuming_cpus);
/*
* Invalidate the cache after setting the global status bits.
* The last AP to set its bit may end up being an Owner of the
* corresponding cache line in MOESI protocol. The AP may be
* stopped before the cache line is written to the main memory.
*/
wbinvd();
} else {
#ifdef __amd64__
fpuresume(susppcbs[cpu]->sp_fpususpend);
#else
npxresume(susppcbs[cpu]->sp_fpususpend);
#endif
pmap_init_pat();
initializecpu();
PCPU_SET(switchtime, 0);
PCPU_SET(switchticks, ticks);
/* Indicate that we have restarted and restored the context. */
CPU_CLR_ATOMIC(cpu, &suspended_cpus);
}
/* Wait for resume directive */
while (!CPU_ISSET(cpu, &toresume_cpus))
ia32_pause();
/* Re-apply microcode updates. */
ucode_reload();
#ifdef __i386__
/* Finish removing the identity mapping of low memory for this AP. */
invltlb_glob();
#endif
if (cpu_ops.cpu_resume)
cpu_ops.cpu_resume();
#ifdef __amd64__
if (vmm_resume_p)
vmm_resume_p();
#endif
/* Resume MCA and local APIC */
lapic_xapic_mode();
mca_resume();
lapic_setup(0);
/* Indicate that we are resumed */
CPU_CLR_ATOMIC(cpu, &resuming_cpus);
CPU_CLR_ATOMIC(cpu, &suspended_cpus);
CPU_CLR_ATOMIC(cpu, &toresume_cpus);
}
/*
* Handle an IPI_SWI by waking delayed SWI thread.
*/
void
ipi_swi_handler(struct trapframe frame)
{
intr_event_handle(clk_intr_event, &frame);
}
/*
* This is called once the rest of the system is up and running and we're
* ready to let the AP's out of the pen.
*/
static void
release_aps(void *dummy __unused)
{
if (mp_ncpus == 1)
return;
atomic_store_rel_int(&aps_ready, 1);
while (smp_started == 0)
ia32_pause();
}
SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL);
#ifdef COUNT_IPIS
/*
* Setup interrupt counters for IPI handlers.
*/
static void
mp_ipi_intrcnt(void *dummy)
{
char buf[64];
int i;
CPU_FOREACH(i) {
snprintf(buf, sizeof(buf), "cpu%d:invltlb", i);
intrcnt_add(buf, &ipi_invltlb_counts[i]);
snprintf(buf, sizeof(buf), "cpu%d:invlrng", i);
intrcnt_add(buf, &ipi_invlrng_counts[i]);
snprintf(buf, sizeof(buf), "cpu%d:invlpg", i);
intrcnt_add(buf, &ipi_invlpg_counts[i]);
snprintf(buf, sizeof(buf), "cpu%d:invlcache", i);
intrcnt_add(buf, &ipi_invlcache_counts[i]);
snprintf(buf, sizeof(buf), "cpu%d:preempt", i);
intrcnt_add(buf, &ipi_preempt_counts[i]);
snprintf(buf, sizeof(buf), "cpu%d:ast", i);
intrcnt_add(buf, &ipi_ast_counts[i]);
snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i);
intrcnt_add(buf, &ipi_rendezvous_counts[i]);
snprintf(buf, sizeof(buf), "cpu%d:hardclock", i);
intrcnt_add(buf, &ipi_hardclock_counts[i]);
}
}
SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
#endif
--
2.41.0
