diff --git a/share/man/man7/security.7 b/share/man/man7/security.7
index a3895fb8a607..71ecf1e682e6 100644
--- a/share/man/man7/security.7
+++ b/share/man/man7/security.7
@@ -1,1102 +1,1109 @@
 .\" Copyright (C) 1998 Matthew Dillon. All rights reserved.
 .\" Copyright (c) 2019 The FreeBSD Foundation, Inc.
 .\"
 .\" Parts of this documentation were written by
 .\" Konstantin Belousov <kib@FreeBSD.org> under sponsorship
 .\" from the FreeBSD Foundation.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd May 16, 2020
+.Dd June 11, 2020
 .Dt SECURITY 7
 .Os
 .Sh NAME
 .Nm security
 .Nd introduction to security under FreeBSD
 .Sh DESCRIPTION
 Security is a function that begins and ends with the system administrator.
 While all
 .Bx
 multi-user systems have some inherent security, the job of building and
 maintaining additional security mechanisms to keep users
 .Dq honest
 is probably
 one of the single largest undertakings of the sysadmin.
 Machines are
 only as secure as you make them, and security concerns are ever competing
 with the human necessity for convenience.
 .Ux
 systems,
 in general, are capable of running a huge number of simultaneous processes
 and many of these processes operate as servers \(em meaning that external
 entities can connect and talk to them.
 As yesterday's mini-computers and mainframes
 become today's desktops, and as computers become networked and internetworked,
 security becomes an ever bigger issue.
 .Pp
 Security is best implemented through a layered onion approach.
 In a nutshell,
 what you want to do is to create as many layers of security as are convenient
 and then carefully monitor the system for intrusions.
 .Pp
 System security also pertains to dealing with various forms of attacks,
 including attacks that attempt to crash or otherwise make a system unusable
 but do not attempt to break root.
 Security concerns can be split up into
 several categories:
 .Bl -enum -offset indent
 .It
 Denial of Service attacks (DoS)
 .It
 User account compromises
 .It
 Root compromise through accessible servers
 .It
 Root compromise via user accounts
 .It
 Backdoor creation
 .El
 .Pp
 A denial of service attack is an action that deprives the machine of needed
 resources.
 Typically, DoS attacks are brute-force mechanisms that attempt
 to crash or otherwise make a machine unusable by overwhelming its servers or
 network stack.
 Some DoS attacks try to take advantage of bugs in the
 networking stack to crash a machine with a single packet.
 The latter can
 only be fixed by applying a bug fix to the kernel.
 Attacks on servers can
 often be fixed by properly specifying options to limit the load the servers
 incur on the system under adverse conditions.
 Brute-force network attacks are harder to deal with.
 A spoofed-packet attack, for example, is
 nearly impossible to stop short of cutting your system off from the Internet.
 It may not be able to take your machine down, but it can fill up your Internet
 pipe.
 .Pp
 A user account compromise is even more common than a DoS attack.
 Many
 sysadmins still run standard
 .Xr telnetd 8
 and
 .Xr ftpd 8
 servers on their machines.
 These servers, by default, do not operate over encrypted
 connections.
 The result is that if you have any moderate-sized user base,
 one or more of your users logging into your system from a remote location
 (which is the most common and convenient way to log in to a system)
 will have his or her password sniffed.
 The attentive system administrator will analyze
 his remote access logs looking for suspicious source addresses
 even for successful logins.
 .Pp
 One must always assume that once an attacker has access to a user account,
 the attacker can break root.
 However, the reality is that in a well secured
 and maintained system, access to a user account does not necessarily give the
 attacker access to root.
 The distinction is important because without access
 to root the attacker cannot generally hide his tracks and may, at best, be
 able to do nothing more than mess with the user's files or crash the machine.
 User account compromises are very common because users tend not to take the
 precautions that sysadmins take.
 .Pp
 System administrators must keep in mind that there are potentially many ways
 to break root on a machine.
 The attacker may know the root password,
 the attacker
 may find a bug in a root-run server and be able to break root over a network
 connection to that server, or the attacker may know of a bug in an SUID-root
 program that allows the attacker to break root once he has broken into a
 user's account.
 If an attacker has found a way to break root on a machine,
 the attacker may not have a need to install a backdoor.
 Many of the root holes found and closed to date involve a considerable amount
 of work by the attacker to clean up after himself, so most attackers do install
 backdoors.
 This gives you a convenient way to detect the attacker.
 Making
 it impossible for an attacker to install a backdoor may actually be detrimental
 to your security because it will not close off the hole the attacker used to
 break in originally.
 .Pp
 Security remedies should always be implemented with a multi-layered
 .Dq onion peel
 approach and can be categorized as follows:
 .Bl -enum -offset indent
 .It
 Securing root and staff accounts
 .It
 Securing root \(em root-run servers and SUID/SGID binaries
 .It
 Securing user accounts
 .It
 Securing the password file
 .It
 Securing the kernel core, raw devices, and file systems
 .It
 Quick detection of inappropriate changes made to the system
 .It
 Paranoia
 .El
 .Sh SECURING THE ROOT ACCOUNT AND SECURING STAFF ACCOUNTS
 Do not bother securing staff accounts if you have not secured the root
 account.
 Most systems have a password assigned to the root account.
 The
 first thing you do is assume that the password is
 .Em always
 compromised.
 This does not mean that you should remove the password.
 The
 password is almost always necessary for console access to the machine.
 What it does mean is that you should not make it possible to use the password
 outside of the console or possibly even with a
 .Xr su 1
 utility.
 For example, make sure that your PTYs are specified as being
 .Dq Li insecure
 in the
 .Pa /etc/ttys
 file
 so that direct root logins via
 .Xr telnet 1
 are disallowed.
 If using
 other login services such as
 .Xr sshd 8 ,
 make sure that direct root logins are
 disabled there as well.
 Consider every access method \(em services such as
 .Xr ftp 1
 often fall through the cracks.
 Direct root logins should only be allowed
 via the system console.
 .Pp
 Of course, as a sysadmin you have to be able to get to root, so we open up
 a few holes.
 But we make sure these holes require additional password
 verification to operate.
 One way to make root accessible is to add appropriate
 staff accounts to the
 .Dq Li wheel
 group (in
 .Pa /etc/group ) .
 The staff members placed in the
 .Li wheel
 group are allowed to
 .Xr su 1
 to root.
 You should never give staff
 members native
 .Li wheel
 access by putting them in the
 .Li wheel
 group in their password entry.
 Staff accounts should be placed in a
 .Dq Li staff
 group, and then added to the
 .Li wheel
 group via the
 .Pa /etc/group
 file.
 Only those staff members who actually need to have root access
 should be placed in the
 .Li wheel
 group.
 It is also possible, when using an
 authentication method such as Kerberos, to use Kerberos's
 .Pa .k5login
 file in the root account to allow a
 .Xr ksu 1
 to root without having to place anyone at all in the
 .Li wheel
 group.
 This
 may be the better solution since the
 .Li wheel
 mechanism still allows an
 intruder to break root if the intruder has gotten hold of your password
 file and can break into a staff account.
 While having the
 .Li wheel
 mechanism
 is better than having nothing at all, it is not necessarily the safest
 option.
 .Pp
 An indirect way to secure the root account is to secure your staff accounts
 by using an alternative login access method and *'ing out the crypted password
 for the staff accounts.
 This way an intruder may be able to steal the password
 file but will not be able to break into any staff accounts or root, even if
 root has a crypted password associated with it (assuming, of course, that
 you have limited root access to the console).
 Staff members
 get into their staff accounts through a secure login mechanism such as
 .Xr kerberos 8
 or
 .Xr ssh 1
 using a private/public
 key pair.
 When you use something like Kerberos you generally must secure
 the machines which run the Kerberos servers and your desktop workstation.
 When you use a public/private key pair with SSH, you must generally secure
 the machine you are logging in
 .Em from
 (typically your workstation),
 but you can
 also add an additional layer of protection to the key pair by password
 protecting the keypair when you create it with
 .Xr ssh-keygen 1 .
 Being able
 to star-out the passwords for staff accounts also guarantees that staff
 members can only log in through secure access methods that you have set up.
 You can
 thus force all staff members to use secure, encrypted connections for
 all their sessions which closes an important hole used by many intruders: that
 of sniffing the network from an unrelated, less secure machine.
 .Pp
 The more indirect security mechanisms also assume that you are logging in
 from a more restrictive server to a less restrictive server.
 For example,
 if your main box is running all sorts of servers, your workstation should not
 be running any.
 In order for your workstation to be reasonably secure
 you should run as few servers as possible, up to and including no servers
 at all, and you should run a password-protected screen blanker.
 Of course, given physical access to
 a workstation, an attacker can break any sort of security you put on it.
 This is definitely a problem that you should consider but you should also
 consider the fact that the vast majority of break-ins occur remotely, over
 a network, from people who do not have physical access to your workstation or
 servers.
 .Pp
 Using something like Kerberos also gives you the ability to disable or
 change the password for a staff account in one place and have it immediately
 affect all the machines the staff member may have an account on.
 If a staff
 member's account gets compromised, the ability to instantly change his
 password on all machines should not be underrated.
 With discrete passwords, changing a password on N machines can be a mess.
 You can also impose
 re-passwording restrictions with Kerberos: not only can a Kerberos ticket
 be made to timeout after a while, but the Kerberos system can require that
 the user choose a new password after a certain period of time
 (say, once a month).
 .Sh SECURING ROOT \(em ROOT-RUN SERVERS AND SUID/SGID BINARIES
 The prudent sysadmin only runs the servers he needs to, no more, no less.
 Be aware that third party servers are often the most bug-prone.
 For example,
 running an old version of
 .Xr imapd 8
 or
 .Xr popper 8 Pq Pa ports/mail/popper
 is like giving a universal root
 ticket out to the entire world.
 Never run a server that you have not checked
 out carefully.
 Many servers do not need to be run as root.
 For example,
 the
 .Xr talkd 8 ,
 .Xr comsat 8 ,
 and
 .Xr fingerd 8
 daemons can be run in special user
 .Dq sandboxes .
 A sandbox is not perfect unless you go to a large amount of trouble, but the
 onion approach to security still stands: if someone is able to break in
 through a server running in a sandbox, they still have to break out of the
 sandbox.
 The more layers the attacker must break through, the lower the
 likelihood of his success.
 Root holes have historically been found in
 virtually every server ever run as root, including basic system servers.
 If you are running a machine through which people only log in via
 .Xr sshd 8
 and never log in via
 .Xr telnetd 8
 then turn off those services!
 .Pp
 .Fx
 now defaults to running
 .Xr talkd 8 ,
 .Xr comsat 8 ,
 and
 .Xr fingerd 8
 in a sandbox.
 Depending on whether you
 are installing a new system or upgrading an existing system, the special
 user accounts used by these sandboxes may not be installed.
 The prudent
 sysadmin would research and implement sandboxes for servers whenever possible.
 .Pp
 There are a number of other servers that typically do not run in sandboxes:
 .Xr sendmail 8 ,
 .Xr popper 8 ,
 .Xr imapd 8 ,
 .Xr ftpd 8 ,
 and others.
 There are alternatives to
 some of these, but installing them may require more work than you are willing
 to put in
 (the convenience factor strikes again).
 You may have to run these
 servers as root and rely on other mechanisms to detect break-ins that might
 occur through them.
 .Pp
 The other big potential root holes in a system are the SUID-root and SGID
 binaries installed on the system.
 Most of these binaries, such as
 .Xr su 1 ,
 reside in
 .Pa /bin , /sbin , /usr/bin ,
 or
 .Pa /usr/sbin .
 While nothing is 100% safe,
 the system-default SUID and SGID binaries can be considered reasonably safe.
 Still, root holes are occasionally found in these binaries.
 A root hole
 was found in Xlib in 1998 that made
 .Xr xterm 1 Pq Pa ports/x11/xterm
 (which is typically SUID)
 vulnerable.
 It is better to be safe than sorry and the prudent sysadmin will restrict SUID
 binaries that only staff should run to a special group that only staff can
 access, and get rid of
 .Pq Dq Li "chmod 000"
 any SUID binaries that nobody uses.
 A server with no display generally does not need an
 .Xr xterm 1
 binary.
 SGID binaries can be almost as dangerous.
 If an intruder can break an SGID-kmem binary the
 intruder might be able to read
 .Pa /dev/kmem
 and thus read the crypted password
 file, potentially compromising any passworded account.
 Alternatively an
 intruder who breaks group
 .Dq Li kmem
 can monitor keystrokes sent through PTYs,
 including PTYs used by users who log in through secure methods.
 An intruder
 that breaks the
 .Dq Li tty
 group can write to almost any user's TTY.
 If a user
 is running a terminal
 program or emulator with a keyboard-simulation feature, the intruder can
 potentially
 generate a data stream that causes the user's terminal to echo a command, which
 is then run as that user.
 .Sh SECURING USER ACCOUNTS
 User accounts are usually the most difficult to secure.
 While you can impose
 draconian access restrictions on your staff and *-out their passwords, you
 may not be able to do so with any general user accounts you might have.
 If
 you do have sufficient control then you may win out and be able to secure the
 user accounts properly.
 If not, you simply have to be more vigilant in your
 monitoring of those accounts.
 Use of SSH and Kerberos for user accounts is
 more problematic due to the extra administration and technical support
 required, but still a very good solution compared to a crypted password
 file.
 .Sh SECURING THE PASSWORD FILE
 The only surefire way is to *-out as many passwords as you can and
 use SSH or Kerberos for access to those accounts.
 Even though the
 crypted password file
 .Pq Pa /etc/spwd.db
 can only be read by root, it may
 be possible for an intruder to obtain read access to that file even if the
 attacker cannot obtain root-write access.
 .Pp
 Your security scripts should always check for and report changes to
 the password file
 (see
 .Sx CHECKING FILE INTEGRITY
 below).
 .Sh SECURING THE KERNEL CORE, RAW DEVICES, AND FILE SYSTEMS
 If an attacker breaks root he can do just about anything, but there
 are certain conveniences.
 For example, most modern kernels have a packet sniffing device driver built in.
 Under
 .Fx
 it is called
 the
 .Xr bpf 4
 device.
 An intruder will commonly attempt to run a packet sniffer
 on a compromised machine.
 You do not need to give the intruder the
 capability and most systems should not have the
 .Xr bpf 4
 device compiled in.
 .Pp
 But even if you turn off the
 .Xr bpf 4
 device, you still have
 .Pa /dev/mem
 and
 .Pa /dev/kmem
 to worry about.
 For that matter,
 the intruder can still write to raw disk devices.
 Also, there is another kernel feature called the module loader,
 .Xr kldload 8 .
 An enterprising intruder can use a KLD module to install
 his own
 .Xr bpf 4
 device or other sniffing device on a running kernel.
 To avoid these problems you have to run
 the kernel at a higher security level, at least level 1.
 The security level can be set with a
 .Xr sysctl 8
 on the
 .Va kern.securelevel
 variable.
 Once you have
 set the security level to 1, write access to raw devices will be denied and
 special
 .Xr chflags 1
 flags, such as
 .Cm schg ,
 will be enforced.
 You must also ensure
 that the
 .Cm schg
 flag is set on critical startup binaries, directories, and
 script files \(em everything that gets run
 up to the point where the security level is set.
 This might be overdoing it, and upgrading the system is much more
 difficult when you operate at a higher security level.
 You may compromise and
 run the system at a higher security level but not set the
 .Cm schg
 flag for every
 system file and directory under the sun.
 Another possibility is to simply
 mount
 .Pa /
 and
 .Pa /usr
 read-only.
 It should be noted that being too draconian in
 what you attempt to protect may prevent the all-important detection of an
 intrusion.
 .Pp
 The kernel runs with five different security levels.
 Any super-user process can raise the level, but no process
 can lower it.
 The security levels are:
 .Bl -tag -width flag
 .It Ic -1
 Permanently insecure mode \- always run the system in insecure mode.
 This is the default initial value.
 .It Ic 0
 Insecure mode \- immutable and append-only flags may be turned off.
 All devices may be read or written subject to their permissions.
 .It Ic 1
 Secure mode \- the system immutable and system append-only flags may not
 be turned off;
 disks for mounted file systems,
 .Pa /dev/mem
 and
 .Pa /dev/kmem
 may not be opened for writing;
 .Pa /dev/io
 (if your platform has it) may not be opened at all;
 kernel modules (see
 .Xr kld 4 )
 may not be loaded or unloaded.
 The kernel debugger may not be entered using the
 .Va debug.kdb.enter
 sysctl.
 A panic or trap cannot be forced using the
 .Va debug.kdb.panic
 and other sysctl's.
 .It Ic 2
 Highly secure mode \- same as secure mode, plus disks may not be
 opened for writing (except by
 .Xr mount 2 )
 whether mounted or not.
 This level precludes tampering with file systems by unmounting them,
 but also inhibits running
 .Xr newfs 8
 while the system is multi-user.
 .Pp
 In addition, kernel time changes are restricted to less than or equal to one
 second.
 Attempts to change the time by more than this will log the message
 .Dq Time adjustment clamped to +1 second .
 .It Ic 3
 Network secure mode \- same as highly secure mode, plus
 IP packet filter rules (see
 .Xr ipfw 8 ,
 .Xr ipfirewall 4
 and
 .Xr pfctl 8 )
 cannot be changed and
 .Xr dummynet 4
 or
 .Xr pf 4
 configuration cannot be adjusted.
 .El
 .Pp
 The security level can be configured with variables documented in
 .Xr rc.conf 5 .
 .Sh CHECKING FILE INTEGRITY: BINARIES, CONFIG FILES, ETC
 When it comes right down to it, you can only protect your core system
 configuration and control files so much before the convenience factor
 rears its ugly head.
 For example, using
 .Xr chflags 1
 to set the
 .Cm schg
 bit on most of the files in
 .Pa /
 and
 .Pa /usr
 is probably counterproductive because
 while it may protect the files, it also closes a detection window.
 The
 last layer of your security onion is perhaps the most important \(em detection.
 The rest of your security is pretty much useless (or, worse, presents you with
 a false sense of safety) if you cannot detect potential incursions.
 Half
 the job of the onion is to slow down the attacker rather than stop him
 in order to give the detection layer a chance to catch him in
 the act.
 .Pp
 The best way to detect an incursion is to look for modified, missing, or
 unexpected files.
 The best
 way to look for modified files is from another (often centralized)
 limited-access system.
 Writing your security scripts on the extra-secure limited-access system
 makes them mostly invisible to potential attackers, and this is important.
 In order to take maximum advantage you generally have to give the
 limited-access box significant access to the other machines in the business,
 usually either by doing a read-only NFS export of the other machines to the
 limited-access box, or by setting up SSH keypairs to allow the limited-access
 box to SSH to the other machines.
 Except for its network traffic, NFS is
 the least visible method \(em allowing you to monitor the file systems on each
 client box virtually undetected.
 If your
 limited-access server is connected to the client boxes through a switch,
 the NFS method is often the better choice.
 If your limited-access server
 is connected to the client boxes through a hub or through several layers
 of routing, the NFS method may be too insecure (network-wise) and using SSH
 may be the better choice even with the audit-trail tracks that SSH lays.
 .Pp
 Once you give a limited-access box at least read access to the client systems
 it is supposed to monitor, you must write scripts to do the actual
 monitoring.
 Given an NFS mount, you can write scripts out of simple system
 utilities such as
 .Xr find 1
 and
 .Xr md5 1 .
 It is best to physically
 .Xr md5 1
 the client-box files at least once a
 day, and to test control files such as those found in
 .Pa /etc
 and
 .Pa /usr/local/etc
 even more often.
 When mismatches are found relative to the base MD5
 information the limited-access machine knows is valid, it should scream at
 a sysadmin to go check it out.
 A good security script will also check for
 inappropriate SUID binaries and for new or deleted files on system partitions
 such as
 .Pa /
 and
 .Pa /usr .
 .Pp
 When using SSH rather than NFS, writing the security script is much more
 difficult.
 You essentially have to
 .Xr scp 1
 the scripts to the client box in order to run them, making them visible, and
 for safety you also need to
 .Xr scp 1
 the binaries (such as
 .Xr find 1 )
 that those scripts use.
 The
 .Xr sshd 8
 daemon on the client box may already be compromised.
 All in all,
 using SSH may be necessary when running over unsecure links, but it is also a
 lot harder to deal with.
 .Pp
 A good security script will also check for changes to user and staff members'
 access configuration files:
 .Pa .rhosts , .shosts , .ssh/authorized_keys
 and so forth, files that might fall outside the purview of the MD5 check.
 .Pp
 If you have a huge amount of user disk space it may take too long to run
 through every file on those partitions.
 In this case, setting mount
 flags to disallow SUID binaries on those partitions is a good
 idea.
 The
 .Cm nosuid
 option
 (see
 .Xr mount 8 )
 is what you want to look into.
 I would scan them anyway at least once a
 week, since the object of this layer is to detect a break-in whether or
 not the break-in is effective.
 .Pp
 Process accounting
 (see
 .Xr accton 8 )
 is a relatively low-overhead feature of
 the operating system which I recommend using as a post-break-in evaluation
 mechanism.
 It is especially useful in tracking down how an intruder has
 actually broken into a system, assuming the file is still intact after
 the break-in occurs.
 .Pp
 Finally, security scripts should process the log files and the logs themselves
 should be generated in as secure a manner as possible \(em remote syslog can be
 very useful.
 An intruder tries to cover his tracks, and log files are critical
 to the sysadmin trying to track down the time and method of the initial
 break-in.
 One way to keep a permanent record of the log files is to run
 the system console to a serial port and collect the information on a
 continuing basis through a secure machine monitoring the consoles.
 .Sh PARANOIA
 A little paranoia never hurts.
 As a rule, a sysadmin can add any number
 of security features as long as they do not affect convenience, and
 can add security features that do affect convenience with some added
 thought.
 Even more importantly, a security administrator should mix it up
 a bit \(em if you use recommendations such as those given by this manual
 page verbatim, you give away your methodologies to the prospective
 attacker who also has access to this manual page.
 .Sh SPECIAL SECTION ON DoS ATTACKS
 This section covers Denial of Service attacks.
 A DoS attack is typically a packet attack.
 While there is not much you can do about modern spoofed
 packet attacks that saturate your network, you can generally limit the damage
 by ensuring that the attacks cannot take down your servers.
 .Bl -enum -offset indent
 .It
 Limiting server forks
 .It
 Limiting springboard attacks (ICMP response attacks, ping broadcast, etc.)
 .It
 Kernel Route Cache
 .El
 .Pp
 A common DoS attack is against a forking server that attempts to cause the
 server to eat processes, file descriptors, and memory until the machine
 dies.
 The
 .Xr inetd 8
 server
 has several options to limit this sort of attack.
 It should be noted that while it is possible to prevent a machine from going
 down it is not generally possible to prevent a service from being disrupted
 by the attack.
 Read the
 .Xr inetd 8
 manual page carefully and pay specific attention
 to the
 .Fl c , C ,
 and
 .Fl R
 options.
 Note that spoofed-IP attacks will circumvent
 the
 .Fl C
 option to
 .Xr inetd 8 ,
 so typically a combination of options must be used.
 Some standalone servers have self-fork-limitation parameters.
 .Pp
 The
 .Xr sendmail 8
 daemon has its
 .Fl OMaxDaemonChildren
 option which tends to work much
 better than trying to use
 .Xr sendmail 8 Ns 's
 load limiting options due to the
 load lag.
 You should specify a
 .Va MaxDaemonChildren
 parameter when you start
 .Xr sendmail 8
 high enough to handle your expected load but not so high that the
 computer cannot handle that number of
 .Nm sendmail Ns 's
 without falling on its face.
 It is also prudent to run
 .Xr sendmail 8
 in
 .Dq queued
 mode
 .Pq Fl ODeliveryMode=queued
 and to run the daemon
 .Pq Dq Nm sendmail Fl bd
 separate from the queue-runs
 .Pq Dq Nm sendmail Fl q15m .
 If you still want real-time delivery you can run the queue
 at a much lower interval, such as
 .Fl q1m ,
 but be sure to specify a reasonable
 .Va MaxDaemonChildren
 option for that
 .Xr sendmail 8
 to prevent cascade failures.
 .Pp
 The
 .Xr syslogd 8
 daemon can be attacked directly and it is strongly recommended that you use
 the
 .Fl s
 option whenever possible, and the
 .Fl a
 option otherwise.
 .Pp
 You should also be fairly careful
 with connect-back services such as tcpwrapper's reverse-identd, which can
 be attacked directly.
 You generally do not want to use the reverse-ident
 feature of tcpwrappers for this reason.
 .Pp
 It is a very good idea to protect internal services from external access
 by firewalling them off at your border routers.
 The idea here is to prevent
 saturation attacks from outside your LAN, not so much to protect internal
 services from network-based root compromise.
 Always configure an exclusive
 firewall, i.e.,
 .So
 firewall everything
 .Em except
 ports A, B, C, D, and M-Z
 .Sc .
 This
 way you can firewall off all of your low ports except for certain specific
 services such as
 .Xr talkd 8 ,
 .Xr sendmail 8 ,
 and other internet-accessible services.
 If you try to configure the firewall the other
 way \(em as an inclusive or permissive firewall, there is a good chance that you
 will forget to
 .Dq close
 a couple of services or that you will add a new internal
 service and forget to update the firewall.
 You can still open up the
 high-numbered port range on the firewall to allow permissive-like operation
 without compromising your low ports.
 Also take note that
 .Fx
 allows you to
 control the range of port numbers used for dynamic binding via the various
 .Va net.inet.ip.portrange
 sysctl's
 .Pq Dq Li "sysctl net.inet.ip.portrange" ,
 which can also
 ease the complexity of your firewall's configuration.
 I usually use a normal
 first/last range of 4000 to 5000, and a hiport range of 49152 to 65535, then
 block everything under 4000 off in my firewall
 (except for certain specific
 internet-accessible ports, of course).
 .Pp
 Another common DoS attack is called a springboard attack \(em to attack a server
 in a manner that causes the server to generate responses which then overload
 the server, the local network, or some other machine.
 The most common attack
 of this nature is the ICMP PING BROADCAST attack.
 The attacker spoofs ping
 packets sent to your LAN's broadcast address with the source IP address set
 to the actual machine they wish to attack.
 If your border routers are not
 configured to stomp on pings to broadcast addresses, your LAN winds up
 generating sufficient responses to the spoofed source address to saturate the
 victim, especially when the attacker uses the same trick on several dozen
 broadcast addresses over several dozen different networks at once.
 Broadcast attacks of over a hundred and twenty megabits have been measured.
 A second common springboard attack is against the ICMP error reporting system.
 By
 constructing packets that generate ICMP error responses, an attacker can
 saturate a server's incoming network and cause the server to saturate its
 outgoing network with ICMP responses.
 This type of attack can also crash the
 server by running it out of
 .Vt mbuf Ns 's ,
 especially if the server cannot drain the
 ICMP responses it generates fast enough.
 The
 .Fx
 kernel has a new kernel
 compile option called
 .Dv ICMP_BANDLIM
 which limits the effectiveness of these
 sorts of attacks.
 The last major class of springboard attacks is related to
 certain internal
 .Xr inetd 8
 services such as the UDP echo service.
 An attacker
 simply spoofs a UDP packet with the source address being server A's echo port,
 and the destination address being server B's echo port, where server A and B
 are both on your LAN.
 The two servers then bounce this one packet back and
 forth between each other.
 The attacker can overload both servers and their
 LANs simply by injecting a few packets in this manner.
 Similar problems
 exist with the internal chargen port.
 A competent sysadmin will turn off all
 of these
 .Xr inetd 8 Ns -internal
 test services.
 .Sh ACCESS ISSUES WITH KERBEROS AND SSH
 There are a few issues with both Kerberos and SSH that need to be addressed
 if you intend to use them.
 Kerberos5 is an excellent authentication
 protocol but the kerberized
 .Xr telnet 1
 sucks rocks.
 There are bugs that make it unsuitable for dealing with binary streams.
 Also, by default
 Kerberos does not encrypt a session unless you use the
 .Fl x
 option.
 SSH encrypts everything by default.
 .Pp
 SSH works quite well in every respect except when it is set up to
 forward encryption keys.
 What this means is that if you have a secure workstation holding
 keys that give you access to the rest of the system, and you
 .Xr ssh 1
 to an
 unsecure machine, your keys become exposed.
 The actual keys themselves are
 not exposed, but
 .Xr ssh 1
 installs a forwarding port for the duration of your
 login and if an attacker has broken root on the unsecure machine he can utilize
 that port to use your keys to gain access to any other machine that your
 keys unlock.
 .Pp
 We recommend that you use SSH in combination with Kerberos whenever possible
 for staff logins.
 SSH can be compiled with Kerberos support.
 This reduces
 your reliance on potentially exposable SSH keys while at the same time
 protecting passwords via Kerberos.
 SSH keys
 should only be used for automated tasks from secure machines (something
 that Kerberos is unsuited to).
 We also recommend that you either turn off
 key-forwarding in the SSH configuration, or that you make use of the
 .Va from Ns = Ns Ar IP/DOMAIN
 option that SSH allows in its
 .Pa authorized_keys
 file to make the key only usable to entities logging in from specific
 machines.
 .Sh KNOBS AND TWEAKS
 .Fx
 provides several knobs and tweak handles that make some introspection
 information access more restricted.
 Some people consider this as improving system security, so the knobs are
 briefly listed here, together with controls which enable some mitigations
 of the hardware state leaks.
 .Pp
 Hardware mitigation sysctl knobs described below have been moved under
 .Pa machdep.mitigations ,
 with backwards-compatibility shims to accept the existing names.
 A future change will rationalize the sense of the individual sysctls
 (so that enabled / true always indicates that the mitigation is active).
 For that reason the previous names remain the canonical way to set the
 mitigations, and are documented here.
 Backwards compatibility shims for the interim sysctls under
 .Pa machdep.mitigations
 will not be added.
 .Bl -tag -width security.bsd.unprivileged_proc_debug
 .It Dv security.bsd.see_other_uids
 Controls visibility of processes owned by different uid.
 The knob directly affects the
 .Dv kern.proc
 sysctls filtering of data, which results in restricted output from
 utilities like
 .Xr ps 1 .
 .It Dv security.bsd.see_other_gids
 Same, for processes owned by different gid.
 .It Dv security.bsd.see_jail_proc
 Same, for processes belonging to a jail.
 .It Dv security.bsd.conservative_signals
 When enabled, unprivileged users are only allowed to send job control
 and usual termination signals like
 .Dv SIGKILL ,
 .Dv SIGINT ,
 and
 .Dv SIGTERM ,
 to the processes executing programs with changed uids.
 .It Dv security.bsd.unprivileged_proc_debug
 Controls availability of the process debugging facilities to non-root users.
 See also
 .Xr proccontrol 1
 mode
 .Dv trace .
 .It Dv vm.pmap.pti
 Tunable, amd64-only.
 Enables mode of operation of virtual memory system where usermode page
 tables are sanitized to prevent so-called Meltdown information leak on
 some Intel CPUs.
 By default, the system detects whether the CPU needs the workaround,
 and enables it automatically.
 See also
 .Xr proccontrol 1
 mode
 .Dv kpti .
 .It Dv machdep.mitigations.flush_rsb_ctxsw
 amd64.
 Controls Return Stack Buffer flush on context switch, to prevent
 cross-process ret2spec attacks.
 Only needed, and only enabled by default, if the machine
 supports SMEP, otherwise IBRS would do necessary flushing on kernel
 entry anyway.
 .It Dv hw.mds_disable
 amd64 and i386.
 Controls Microarchitectural Data Sampling hardware information leak
 mitigation.
 .It Dv hw.spec_store_bypass_disable
 amd64 and i386.
 Controls Speculative Store Bypass hardware information leak mitigation.
 .It Dv hw.ibrs_disable
 amd64 and i386.
 Controls Indirect Branch Restricted Speculation hardware information leak
 mitigation.
 .It Dv machdep.syscall_ret_l1d_flush
 amd64.
 Controls force-flush of L1D cache on return from syscalls which report
 errors other than
 .Er EEXIST ,
 .Er EAGAIN ,
 .Er EXDEV ,
 .Er ENOENT ,
 .Er ENOTCONN ,
 and
 .Er EINPROGRESS .
 This is mostly a paranoid setting added to prevent hypothetical exploitation
 of unknown gadgets for unknown hardware issues.
 The error codes exclusion list is composed of the most common errors which
 typically occur during normal system operation.
 .It Dv machdep.nmi_flush_l1d_sw
 amd64.
 Controls force-flush of L1D cache on NMI;
 this provides software assist for bhyve mitigation of L1 terminal fault
 hardware information leak.
 .It Dv hw.vmm.vmx.l1d_flush
 amd64.
 Controls the mitigation of L1 Terminal Fault in bhyve hypervisor.
 .It Dv vm.pmap.allow_2m_x_ept
 amd64.
 Allows the use of superpages for executable mappings under the EPT
 page table format used by hypervisors on Intel CPUs to map the guest
 physical address space to machine physical memory.
 May be disabled to work around a CPU Erratum called
 Machine Check Error Avoidance on Page Size Change.
+.It Dv machdep.mitigations.rngds.enable
+amd64 and i386.
+Controls mitigation of Special Register Buffer Data Sampling versus
+optimization of the MCU access.
+When set to zero, the mitigation is disabled, and the RDSEED and RDRAND
+instructions do not incur serialization overhead for shared buffer accesses,
+and do not serialize off-core memory accesses.
 .It Dv kern.elf32.aslr.enable
 Controls system-global Address Space Layout Randomization (ASLR) for
 normal non-PIE (Position Independent Executable) 32bit binaries.
 See also
 .Xr proccontrol 1
 mode
 .Dv aslr ,
 also affected by the per-image control note flag.
 .It Dv kern.elf32.aslr.pie_enable
 Controls system-global Address Space Layout Randomization for
 position-independent (PIE) 32bit binaries.
 .It Dv kern.elf32.aslr.honor_sbrk
 Makes ASLR less aggressive and more compatible with old binaries
 relying on the sbrk area.
 .It Dv kern.elf32.aslr.aslr_stack_gap
 If ASLR is enabled for a binary, a non-zero value creates a randomized
 stack gap between strings and the end of the aux vector.
 The value is the maximum percentage of main stack to waste on the gap.
 Cannot be greater than 50, i.e., at most half of the stack.
 .It Dv kern.elf64.aslr.enable
 64bit binaries ASLR control.
 .It Dv kern.elf64.aslr.pie_enable
 64bit PIE binaries ASLR control.
 .It Dv kern.elf64.aslr.honor_sbrk
 64bit binaries ASLR sbrk compatibility control.
 .It Dv kern.elf64.aslr.aslr_stack_gap
 Controls stack gap for 64bit binaries.
 .It Dv kern.elf32.nxstack
 Enables non-executable stack for 32bit processes.
 Enabled by default if supported by hardware and corresponding binary.
 .It Dv kern.elf64.nxstack
 Enables non-executable stack for 64bit processes.
 .El
 .Sh SEE ALSO
 .Xr chflags 1 ,
 .Xr find 1 ,
 .Xr md5 1 ,
 .Xr netstat 1 ,
 .Xr openssl 1 ,
 .Xr proccontrol 1 ,
 .Xr ps 1 ,
 .Xr ssh 1 ,
 .Xr xdm 1 Pq Pa ports/x11/xorg-clients ,
 .Xr group 5 ,
 .Xr ttys 5 ,
 .Xr accton 8 ,
 .Xr init 8 ,
 .Xr sshd 8 ,
 .Xr sysctl 8 ,
 .Xr syslogd 8 ,
 .Xr vipw 8
 .Sh HISTORY
 The
 .Nm
 manual page was originally written by
 .An Matthew Dillon
 and first appeared
 in
 .Fx 3.1 ,
 December 1998.
diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c
index 33db37eda8d6..b584273aff6c 100644
--- a/sys/amd64/amd64/initcpu.c
+++ b/sys/amd64/amd64/initcpu.c
@@ -1,319 +1,320 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) KATO Takenori, 1997, 1998.
  * 
  * All rights reserved.  Unpublished rights reserved under the copyright
  * laws of Japan.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_cpu.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/pcpu.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 /* Set to 1 by initializecpu() when the CPU reports both FXSR and XMM. */
 static int	hw_instruction_sse;
 SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD,
     &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU");
 /* Latch: init_amd() picks the sharedpage default only while this is 0. */
 static int	lower_sharedpage_init;
 /* Set to 1 by init_amd() on Family 17h/18h; exported as a RDTUN sysctl. */
 int		hw_lower_amd64_sharedpage;
 SYSCTL_INT(_hw, OID_AUTO, lower_amd64_sharedpage, CTLFLAG_RDTUN,
     &hw_lower_amd64_sharedpage, 0,
    "Lower sharedpage to work around Ryzen issue with executing code near the top of user memory");
 /*
  * Value of the hw.clflush_disable tunable, fetched in initializecpucache():
  * -1: automatic (default)
  *  0: keep enable CLFLUSH
  *  1: force disable CLFLUSH
  */
 static int	hw_clflush_disable = -1;
 
 /*
  * Apply AMD-specific errata workarounds and defaults on the current CPU.
  * initializecpu() calls this for CPU_VENDOR_AMD and CPU_VENDOR_HYGON.
  */
 static void
 init_amd(void)
 {
 	uint64_t msr;
 	unsigned int family, model;
 	int native;
 
 	family = CPUID_TO_FAMILY(cpu_id);
 	model = CPUID_TO_MODEL(cpu_id);
 
 	/*
 	 * Hypervisors do not provide access to the errata MSR and raise a
 	 * #GP exception on an attempt to apply the errata.  The MSR write
 	 * shall be done on the host and persists globally anyway, so none
 	 * of the MSR workarounds below are attempted under virtualization.
 	 */
 	native = (cpu_feature2 & CPUID2_HV) == 0;
 
 	/*
 	 * Erratum 721, Family 10h and 12h: the stack pointer may be
 	 * updated incorrectly after a long series of push and/or near-call
 	 * instructions, or of pop and/or near-return instructions.
 	 *
 	 * http://support.amd.com/us/Processor_TechDocs/41322_10h_Rev_Gd.pdf
 	 * http://support.amd.com/us/Processor_TechDocs/44739_12h_Rev_Gd.pdf
 	 */
 	if (native && (family == 0x10 || family == 0x12))
 		wrmsr(0xc0011029, rdmsr(0xc0011029) | 1);
 
 	if (native && family == 0x10) {
 		/*
 		 * BIOS may fail to set InitApicIdCpuIdLo to 1 as it should
 		 * per BKDG.  Set it here, otherwise some tools could be
 		 * confused by the Initial Local APIC ID reported with
 		 * CPUID Function 1 in EBX.
 		 */
 		msr = rdmsr(MSR_NB_CFG1);
 		msr |= (uint64_t)1 << 54;
 		wrmsr(MSR_NB_CFG1, msr);
 
 		/*
 		 * BIOS may configure Family 10h processors to convert the
 		 * WC+ cache type to CD, which can hurt performance of
 		 * guest VMs using nested paging.  The relevant MSR bit is
 		 * not documented in the BKDG; the fix is borrowed from
 		 * Linux.
 		 */
 		msr = rdmsr(0xc001102a);
 		msr &= ~((uint64_t)1 << 24);
 		wrmsr(0xc001102a, msr);
 	}
 
 	/*
 	 * Erratum 793: Specific Combination of Writes to Write Combined
 	 * Memory Types and Locked Instructions May Cause Core Hang.
 	 * See Revision Guide for AMD Family 16h Models 00h-0Fh Processors,
 	 * revision 3.04 or later, publication 51810.
 	 */
 	if (native && family == 0x16 && model <= 0xf) {
 		msr = rdmsr(MSR_LS_CFG);
 		msr |= (uint64_t)1 << 15;
 		wrmsr(MSR_LS_CFG, msr);
 	}
 
 	/*
 	 * Ryzen (Family 17h Model 01h) errata.  Each erratum keeps its own
 	 * read-modify-write cycle, in the order listed.
 	 */
 	if (native && family == 0x17 && model == 0x1) {
 		/* Erratum 1021 */
 		msr = rdmsr(0xc0011029);
 		msr |= 0x2000;
 		wrmsr(0xc0011029, msr);
 
 		/* Erratum 1033 */
 		msr = rdmsr(MSR_LS_CFG);
 		msr |= 0x10;
 		wrmsr(MSR_LS_CFG, msr);
 
 		/* Erratum 1049 */
 		msr = rdmsr(0xc0011028);
 		msr |= 0x10;
 		wrmsr(0xc0011028, msr);
 
 		/* Erratum 1095 */
 		msr = rdmsr(MSR_LS_CFG);
 		msr |= 0x200000000000000;
 		wrmsr(MSR_LS_CFG, msr);
 	}
 
 	/*
 	 * Work around a problem on Ryzen that is triggered by executing
 	 * code near the top of user memory, in our case the signal
 	 * trampoline code in the shared page on amd64.
 	 *
 	 * This function runs once for the BSP before tunables take effect,
 	 * so the value chosen here can still be overridden by the tunable.
 	 * It then runs again for each AP and also on resume; the latch
 	 * below makes sure those later calls do not overwrite a value set
 	 * by the tunable.
 	 *
 	 * The stepping and/or microcode versions should be checked after
 	 * this issue is fixed by AMD so that we don't use this mode if not
 	 * needed.
 	 */
 	if (lower_sharedpage_init != 0)
 		return;
 	lower_sharedpage_init = 1;
 	if (family == 0x17 || family == 0x18)
 		hw_lower_amd64_sharedpage = 1;
 }
 
 /*
  * Detect the VIA RNG and PadLock units through the 0xc000000x CPUID
  * leaves and switch on the ones the CPU reports.
  */
 static void
 init_via(void)
 {
 	u_int regs[4], caps;
 
 	/*
 	 * The PadLock feature bits live in leaf 0xc0000001; there is
 	 * nothing to do when the CPU does not implement that leaf.
 	 *
 	 * http://www.via.com.tw/en/downloads/whitepapers/initiatives/padlock/programming_guide.pdf
 	 */
 	do_cpuid(0xc0000000, regs);
 	if (regs[0] < 0xc0000001)
 		return;
 	do_cpuid(0xc0000001, regs);
 	caps = regs[3];
 
 	/* Turn the RNG on when it is present. */
 	if ((caps & VIA_CPUID_HAS_RNG) != 0) {
 		via_feature_rng = VIA_HAS_RNG;
 		wrmsr(0x110B, rdmsr(0x110B) | VIA_CPUID_DO_RNG);
 	}
 
 	/* Collect the PadLock units that are present. */
 	if ((caps & VIA_CPUID_HAS_ACE) != 0)
 		via_feature_xcrypt |= VIA_HAS_AES;
 	if ((caps & VIA_CPUID_HAS_ACE2) != 0)
 		via_feature_xcrypt |= VIA_HAS_AESCTR;
 	if ((caps & VIA_CPUID_HAS_PHE) != 0)
 		via_feature_xcrypt |= VIA_HAS_SHA;
 	if ((caps & VIA_CPUID_HAS_PMM) != 0)
 		via_feature_xcrypt |= VIA_HAS_MM;
 
 	/* Set bit 28 of MSR 0x1107 once any PadLock unit is in use. */
 	if (via_feature_xcrypt != 0)
 		wrmsr(0x1107, rdmsr(0x1107) | (1 << 28));
 }
 
 /*
  * Initialize CPU control registers
  *
  * Builds the %cr4 value from the detected CPU features and loads it,
  * enables NX through EFER on the BSP when available, refreshes the
  * mitigation state, runs the vendor-specific init routine, and programs
  * MSR_TSC_AUX.  The BSP and the other CPUs take different paths; see the
  * IS_BSP() tests below.
  */
 void
 initializecpu(void)
 {
 	uint64_t msr;
 	uint32_t cr4;
 
 	/* Start from the current %cr4 and add feature enable bits to it. */
 	cr4 = rcr4();
 	if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) {
 		cr4 |= CR4_FXSR | CR4_XMM;
 		cpu_fxsr = hw_instruction_sse = 1;
 	}
 	if (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE)
 		cr4 |= CR4_FSGSBASE;
 
 	if (cpu_stdext_feature2 & CPUID_STDEXT2_PKU)
 		cr4 |= CR4_PKE;
 
 	/*
 	 * If SMEP is present, we only need to flush RSB (by default)
 	 * on context switches, to prevent cross-process ret2spec
 	 * attacks.  Do it automatically if ibrs_disable is set, to
 	 * complete the mitigation.
 	 *
 	 * Postpone enabling the SMEP on the boot CPU until the page
 	 * tables are switched from the boot loader identity mapping
 	 * to the kernel tables.  The boot loader enables the U bit in
 	 * its tables.
 	 *
 	 * NOTE(review): the tunable fetched here is spelled
 	 * "machdep.mitigations.cpu_flush_rsb_ctxsw", while security(7)
 	 * documents the knob as machdep.mitigations.flush_rsb_ctxsw --
 	 * confirm which spelling is intended.
 	 */
 	if (IS_BSP()) {
 		/*
 		 * The default of 1 is applied only when the tunable fetch
 		 * reports nothing was set, so an explicit setting wins.
 		 */
 		if (cpu_stdext_feature & CPUID_STDEXT_SMEP &&
 		    !TUNABLE_INT_FETCH(
 		    "machdep.mitigations.cpu_flush_rsb_ctxsw",
 		    &cpu_flush_rsb_ctxsw) &&
 		    hw_ibrs_disable)
 			cpu_flush_rsb_ctxsw = 1;
 	} else {
 		/* Non-boot CPUs enable SMEP/SMAP right away. */
 		if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
 			cr4 |= CR4_SMEP;
 		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
 			cr4 |= CR4_SMAP;
 	}
 	load_cr4(cr4);
 	/* On the BSP, turn on EFER.NXE and record PG_NX in pg_nx. */
 	if (IS_BSP() && (amd_feature & AMDID_NX) != 0) {
 		msr = rdmsr(MSR_EFER) | EFER_NXE;
 		wrmsr(MSR_EFER, msr);
 		pg_nx = PG_NX;
 	}
 	/*
 	 * Refresh mitigation state; going by the helper names these cover
 	 * IBRS, SSB, the syscall-return L1D flush and RNGDS -- TODO confirm
 	 * against their definitions.
 	 */
 	hw_ibrs_recalculate(false);
 	hw_ssb_recalculate(false);
 	amd64_syscall_ret_flush_l1d_recalc();
+	x86_rngds_mitg_recalculate(false);
 	/* Vendor-specific setup; other vendors need nothing here. */
 	switch (cpu_vendor_id) {
 	case CPU_VENDOR_AMD:
 	case CPU_VENDOR_HYGON:
 		init_amd();
 		break;
 	case CPU_VENDOR_CENTAUR:
 		init_via();
 		break;
 	}
 
 	/*
 	 * Store this CPU's id in MSR_TSC_AUX when RDTSCP or RDPID is
 	 * available (presumably so those instructions can report it).
 	 */
 	if ((amd_feature & AMDID_RDTSCP) != 0 ||
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0)
 		wrmsr(MSR_TSC_AUX, PCPU_GET(cpuid));
 }
 
 /*
  * Record the CLFLUSH line size and decide, based on the
  * hw.clflush_disable tunable and the virtualization state, whether the
  * kernel keeps using CLFLUSH{,OPT}.
  */
 void
 initializecpucache(void)
 {
 	int no_clflush;
 
 	/*
 	 * CPUID with %eax = 1, %ebx returns
 	 * Bits 15-8: CLFLUSH line size
 	 * 	(Value * 8 = cache line size in bytes)
 	 */
 	if ((cpu_feature & CPUID_CLFSH) != 0)
 		cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8;
 
 	TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable);
 
 	/*
 	 * CLFLUSH{,OPT} use is dropped in two cases:
 	 *
 	 * - the tunable is 1, i.e. the administrator disabled it manually;
 	 *
 	 * - the tunable is left at -1 (automatic) and we run as a guest.
 	 *   XXXKIB: this is a (temporary) hack to work around traps
 	 *   generated when CLFLUSHing the APIC register window under
 	 *   virtualization environments.  These environments tend to
 	 *   disable the CPUID_SS feature even though the native CPU
 	 *   supports it.
 	 */
 	no_clflush = hw_clflush_disable == 1 ||
 	    (hw_clflush_disable == -1 && vm_guest != VM_GUEST_NO);
 	if (no_clflush) {
 		cpu_feature &= ~CPUID_CLFSH;
 		cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT;
 	}
 }
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 239b4baaca60..b2944089e6d9 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1,2804 +1,2807 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_atpic.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_pci.h"
 #include "opt_platform.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/csan.h>
 #include <sys/efi.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 #include <vm/vm_phys.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <net/netisr.h>
 
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/mp_watchdog.h>
 #include <machine/pc/bios.h>
 #include <machine/pcb.h>
 #include <machine/proc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #include <machine/trap.h>
 #include <machine/tss.h>
 #include <x86/ucode.h>
 #include <x86/ifunc.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #ifdef FDT
 #include <x86/fdt.h>
 #endif
 
 #ifdef DEV_ATPIC
 #include <x86/isa/icu.h>
 #else
 #include <x86/apicvar.h>
 #endif
 
 #include <isa/isareg.h>
 #include <isa/rtc.h>
 #include <x86/init.h>
 
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
 /*
  * The PTI trampoline stack needs enough space for a hardware trapframe and a
  * couple of scratch registers, as well as the trapframe left behind after an
  * iret fault.
  */
 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
     offsetof(struct pti_frame, pti_rip));
 
 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
 
 /*
  * Checks applied by sys_sigreturn() to a user-supplied context:
  * CS_SECURE is true when ISPL(cs) equals SEL_UPL; EFL_SECURE is true when
  * ef and oef differ only in bits covered by PSL_USERCHANGE.
  */
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 
 static void cpu_startup(void *);
 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpusave, size_t xfpusave_len);
 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpustate, size_t xfpustate_len);
 /* Register cpu_startup() to run at SI_SUB_CPU, SI_ORDER_FIRST. */
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 /* Preload data parse function */
 static caddr_t native_parse_preload_data(u_int64_t);
 
 /* Native function to fetch and parse the e820 map */
 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
 
 /*
  * Default init_ops implementation.
  * The SMP and PCI hooks are only filled in when the kernel is built with
  * the corresponding options.
  */
 struct init_ops init_ops = {
 	.parse_preload_data =	native_parse_preload_data,
 	.early_clock_source_init =	i8254_init,
 	.early_delay =			i8254_delay,
 	.parse_memmap =			native_parse_memmap,
 #ifdef SMP
 	.mp_bootaddress =		mp_bootaddress,
 	.start_all_aps =		native_start_all_aps,
 #endif
 #ifdef DEV_PCI
 	.msi_init =			msi_init,
 #endif
 };
 
 /*
  * Physical address of the EFI System Table. Stashed from the metadata hints
  * passed into the kernel and used by the EFI code to call runtime services.
  */
 vm_paddr_t efi_systbl_phys;
 
 /*
  * Intel ICH registers
  *
  * ICH_PMBASE is the I/O port base used for the ICH power-management
  * registers; ICH_SMI_EN (SMI Control and Enable) sits 0x30 above it.
  * The sum is parenthesized so the macro stays a single operand inside
  * any larger expression.
  */
 #define ICH_PMBASE	0x400
 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
 
 /* User segment selectors; sendsig() loads them into the signal frame. */
 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
 
 int cold = 1;
 
 /* Page counts: cpu_startup() converts Maxmem with ptoa() and sets realmem. */
 long Maxmem = 0;
 long realmem = 0;
 
 /* Filled in by vm_ksubmap_init() from cpu_startup(). */
 struct kva_md_info kmi;
 
 static struct trapframe proc0_tf;
 struct region_descriptor r_idt;
 
 struct pcpu *__pcpu;
 struct pcpu temp_bsp_pcpu;
 
 struct mtx icu_lock;
 
 struct mem_range_softc mem_range_softc;
 
 struct mtx dt_lock;	/* lock for GDT and LDT */
 
 void (*vmm_resume_p)(void);
 
 /*
  * SYSINIT hook run at SI_SUB_CPU: applies the MacBook legacy-USB SMI
  * quirk, starts the RTC clock, prints CPU and memory information, sets
  * up the kernel submaps and buffer cache, and finally calls
  * cpu_setregs().
  *
  * The argument is the SYSINIT cookie (NULL in the registration) and is
  * not used.
  */
 static void
 cpu_startup(void *dummy)
 {
 	uintmax_t memsize;
 	char *sysenv;
 
 	/*
 	 * On MacBooks, we need to disallow the legacy USB circuit to
 	 * generate an SMI# because this can cause several problems,
 	 * namely: incorrect CPU frequency detection and failure to
 	 * start the APs.
 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
 	 * Enable register) of the Intel ICH LPC Interface Bridge.
 	 */
 	sysenv = kern_getenv("smbios.system.product");
 	if (sysenv != NULL) {
 		/* Prefix match, so longer product strings also qualify. */
 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
 			if (bootverbose)
 				printf("Disabling LEGACY_USB_EN bit on "
 				    "Intel ICH.\n");
 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
 		}
 		freeenv(sysenv);
 	}
 
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
 	startrtclock();
 	printcpuinfo();
 
 	/*
 	 * Display physical memory if SMBIOS reports reasonable amount.
 	 * The SMBIOS value is shifted left by 10 (presumably it is in
 	 * kilobytes); when it is smaller than the free page count it is
 	 * discarded in favour of Maxmem.
 	 */
 	memsize = 0;
 	sysenv = kern_getenv("smbios.memory.enabled");
 	if (sysenv != NULL) {
 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
 		freeenv(sysenv);
 	}
 	if (memsize < ptoa((uintmax_t)vm_free_count()))
 		memsize = ptoa((uintmax_t)Maxmem);
 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
 	realmem = atop(memsize);
 
 	/*
 	 * Display any holes after the first chunk of extended memory.
 	 * phys_avail[] holds (start, end) pairs and is terminated by a
 	 * pair whose end is 0.
 	 */
 	if (bootverbose) {
 		int indx;
 
 		printf("Physical memory chunk(s):\n");
 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
 			vm_paddr_t size;
 
 			size = phys_avail[indx + 1] - phys_avail[indx];
 			printf(
 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
 			    (uintmax_t)phys_avail[indx],
 			    (uintmax_t)phys_avail[indx + 1] - 1,
 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ju (%ju MB)\n",
 	    ptoa((uintmax_t)vm_free_count()),
 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
 #ifdef DEV_PCI
 	if (bootverbose && intel_graphics_stolen_base != 0)
 		printf("intel stolen mem: base %#jx size %ju MB\n",
 		    (uintmax_t)intel_graphics_stolen_base,
 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
 #endif
 
 	/*
 	 * Set up buffers, so they can be used to read disk labels.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 
 	cpu_setregs();
 }
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by call
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  *
  * catcher is the handler to invoke, ksi describes the signal being
  * delivered and mask is the signal mask saved into the frame.
  *
  * Locking: entered with the process lock and psp->ps_mtx held (both are
  * asserted).  Both are dropped around the copyout to the user stack and
  * re-acquired before returning.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct pcb *pcb;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	char *xfpusave;
 	size_t xfpusave_len;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	pcb = td->td_pcb;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	/* Remember whether we were already on the alternate signal stack. */
 	oonstack = sigonstack(regs->tf_rsp);
 
 	/*
 	 * When XSAVE is in use and the extended state is larger than
 	 * struct savefpu, reserve a temporary buffer for the part that
 	 * does not fit in struct savefpu.
 	 */
 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 		xfpusave = __builtin_alloca(xfpusave_len);
 	} else {
 		xfpusave_len = 0;
 		xfpusave = NULL;
 	}
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	/* The trapframe is copied wholesale into the mcontext at mc_rdi. */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
 	fpstate_drop(td);
 	update_pcb_bases(pcb);
 	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
 	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
 	bzero(sf.sf_uc.uc_mcontext.mc_spare,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Start at the top of the alternate signal stack. */
 		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		/* Skip 128 bytes below %rsp (presumably the ABI red zone). */
 		sp = (char *)regs->tf_rsp - 128;
 	if (xfpusave != NULL) {
 		/* Extended FPU state goes first, aligned down to 64 bytes. */
 		sp -= xfpusave_len;
 		sp = (char *)((unsigned long)sp & ~0x3Ful);
 		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
 	}
 	sp -= sizeof(struct sigframe);
 	/* Align to 16 bytes. */
 	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
 
 	/* Build the argument list for the signal handler. */
 	regs->tf_rdi = sig;			/* arg 1 in %rdi */
 	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Drop both locks before touching user memory. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 * If either copy fails the process is sent through sigexit()
 	 * with SIGILL, with the process lock re-taken first.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
 	    (xfpusave != NULL && copyout(xfpusave,
 	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
 	    != 0)) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/*
 	 * Redirect the thread to the signal trampoline: the stack pointer
 	 * is set to the new frame, %rip to the sigcode base, the T and D
 	 * flags are cleared and the segment registers are reset to the
 	 * user selectors.
 	 */
 	regs->tf_rsp = (long)sfp;
 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_ss = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 	/* Restore the lock state the caller expects. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  *
  * MPSAFE
  */
 int
 sys_sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
 		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
 	struct pcb *pcb;
 	struct proc *p;
 	struct trapframe *regs;
 	ucontext_t *ucp;
 	char *xfpustate;
 	size_t xfpustate_len;
 	long rflags;
 	int cs, error, ret;
 	ksiginfo_t ksi;
 
 	pcb = td->td_pcb;
 	p = td->td_proc;
 
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0) {
 		uprintf("pid %d (%s): sigreturn copyin failed\n",
 		    p->p_pid, td->td_name);
 		return (error);
 	}
 	ucp = &uc;
 	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
 		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
 		    td->td_name, ucp->uc_mcontext.mc_flags);
 		return (EINVAL);
 	}
 	regs = td->td_frame;
 	rflags = ucp->uc_mcontext.mc_rflags;
 	/*
 	 * Don't allow users to change privileged or reserved flags.
 	 */
 	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
 		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
 		    td->td_name, rflags);
 		return (EINVAL);
 	}
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 	cs = ucp->uc_mcontext.mc_cs;
 	if (!CS_SECURE(cs)) {
 		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
 		    td->td_name, cs);
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 
 	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
 		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
 		if (xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu)) {
 			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
 			    p->p_pid, td->td_name, xfpustate_len);
 			return (EINVAL);
 		}
 		xfpustate = __builtin_alloca(xfpustate_len);
 		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
 		    xfpustate, xfpustate_len);
 		if (error != 0) {
 			uprintf(
 	"pid %d (%s): sigreturn copying xfpustate failed\n",
 			    p->p_pid, td->td_name);
 			return (error);
 		}
 	} else {
 		xfpustate = NULL;
 		xfpustate_len = 0;
 	}
 	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
 	if (ret != 0) {
 		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
 		    p->p_pid, td->td_name, ret);
 		return (ret);
 	}
 	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
 	update_pcb_bases(pcb);
 	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
 	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
 
 #if defined(COMPAT_43)
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatibility entry point: the argument structure is passed
  * straight through to sys_sigreturn().
  */
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
 	struct sigreturn_args *args;
 
 	args = (struct sigreturn_args *)uap;
 	return (sys_sigreturn(td, args));
 }
 #endif
 
 /*
  * Reset registers to default values on exec.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
 {
 	struct pcb *pcb = td->td_pcb;
 	struct trapframe *tf = td->td_frame;
 	register_t trace_bit;
 
 	/* Release the process LDT, if one is installed. */
 	if (td->td_proc->p_md.md_ldt != NULL)
 		user_ldt_free(td);
 
 	/* Zero both segment bases and leave 32-bit mode. */
 	update_pcb_bases(pcb);
 	pcb->pcb_fsbase = 0;
 	pcb->pcb_gsbase = 0;
 	clear_pcb_flags(pcb, PCB_32BIT);
 	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
 
 	/*
 	 * Rebuild the trapframe from scratch.  PSL_T is the only bit of
 	 * the old frame carried over into the new one.
 	 */
 	trace_bit = tf->tf_rflags & PSL_T;
 	bzero(tf, sizeof(*tf));
 	tf->tf_rflags = PSL_USER | trace_bit;
 	tf->tf_rip = imgp->entry_addr;
 	tf->tf_rsp = ((stack - 8) & ~0xFul) + 8;
 	tf->tf_rdi = stack;		/* argv */
 	tf->tf_cs = _ucodesel;
 	tf->tf_ss = _udatasel;
 	tf->tf_ds = _udatasel;
 	tf->tf_es = _udatasel;
 	tf->tf_fs = _ufssel;
 	tf->tf_gs = _ugssel;
 	tf->tf_flags = TF_HASSEGS;
 
 	/*
 	 * Hardware debug registers set up by the old image mean nothing
 	 * to the new one: clear the saved copies, and the live registers
 	 * too when this pcb is the one running on the CPU, so they cannot
 	 * leak into whatever is switched to next.
 	 */
 	if ((pcb->pcb_flags & PCB_DBREGS) != 0) {
 		pcb->pcb_dr0 = 0;
 		pcb->pcb_dr1 = 0;
 		pcb->pcb_dr2 = 0;
 		pcb->pcb_dr3 = 0;
 		pcb->pcb_dr6 = 0;
 		pcb->pcb_dr7 = 0;
 		if (pcb == curpcb)
 			reset_dbregs();
 		clear_pcb_flags(pcb, PCB_DBREGS);
 	}
 
 	/* Discard any held FP state so the next FPU use starts clean. */
 	fpstate_drop(td);
 }
 
 /* Turn on the CR0 bits the kernel depends on, keeping all others as is. */
 void
 cpu_setregs(void)
 {
 
 	/*
 	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
 	 * BSP.  See the comments there about why we set them.
 	 */
 	load_cr0(rcr0() | CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM);
 }
 
 /*
  * Initialize amd64 and configure to run kernel
  */
 
 /*
  * Initialize segments & interrupt table
  */
 /* Static backing store for the IDT: NIDT gate descriptors. */
 static struct gate_descriptor idt0[NIDT];
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 
 /*
  * Page-sized, 16-byte aligned stacks.  amd64_bsp_ist_init() installs them
  * as tss_ist1 (dblfault), tss_ist2 (nmi0), tss_ist3 (mce0) and tss_ist4
  * (dbg0), placing a struct nmi_pcpu at the very top of each one.
  */
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
 static char mce0_stack[PAGE_SIZE] __aligned(16);
 static char nmi0_stack[PAGE_SIZE] __aligned(16);
 static char dbg0_stack[PAGE_SIZE] __aligned(16);
 /* Compile-time check that struct nmi_pcpu is exactly 16 bytes. */
 CTASSERT(sizeof(struct nmi_pcpu) == 16);
 
 /*
  * Software prototypes -- in more palatable form.
  *
  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
  * slots as corresponding segments for i386 kernel.
  */
 /*
  * Template GDT, one soft descriptor per slot.  hammer_time() converts the
  * entries into hardware format with ssdtosd(), except for the GPROC0_SEL
  * and GUSERLDT_SEL slots and the slot following each of them; GPROC0_SEL
  * is converted with ssdtosyssd() after its base has been filled in.
  */
 struct soft_segment_descriptor gdt_segs[] = {
 /* GNULL_SEL	0 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GNULL2_SEL	1 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUFS32_SEL	2 32 bit %fs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUGS32_SEL	3 32 bit %gs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GCODE_SEL	4 Code Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GDATA_SEL	5 Data Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
 	.ssd_type = SDT_SYSTSS,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* Slot 10: second half of the TSS, a double-size system descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	11 LDT Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	12 LDT Descriptor, double size */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 };
 /* The table must supply exactly NGDT entries. */
 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
 
 /*
  * Fill in IDT slot 'idx': handler 'func', gate type 'typ', privilege
  * level 'dpl' and interrupt stack table index 'ist'.  The gate always
  * uses the kernel code selector and is marked present.
  */
 void
 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
 {
 	struct gate_descriptor *gate = &idt[idx];
 	uintptr_t handler = (uintptr_t)func;
 
 	/* The handler address is split across two descriptor fields. */
 	gate->gd_looffset = handler;
 	gate->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
 	gate->gd_ist = ist;
 	gate->gd_xx = 0;
 	gate->gd_type = typ;
 	gate->gd_dpl = dpl;
 	gate->gd_p = 1;
 	gate->gd_hioffset = handler >> 16;
 }
 
 /*
  * Interrupt/exception entry points referenced through IDTVEC() in this
  * file.  Most handlers also have a "_pti" twin; the setidt() calls in
  * hammer_time() and amd64_conf_fast_syscall() pick the twin when 'pti'
  * is set.
  */
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm), IDTVEC(dblfault),
 	IDTVEC(div_pti), IDTVEC(bpt_pti),
 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
 	IDTVEC(xmm_pti),
 #ifdef KDTRACE_HOOKS
 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
 #endif
 #ifdef XENHVM
 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
 #endif
 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
 	IDTVEC(fast_syscall_pti);
 
 #ifdef DDB
 /*
  * Display the index and function name of any IDT entries that don't use
  * the default 'rsvd' entry point.
  */
 DB_SHOW_COMMAND(idt, db_show_idt)
 {
 	struct gate_descriptor *gate;
 	uintptr_t handler;
 	int slot;
 
 	/* Walk every gate until done or the pager asks us to stop. */
 	for (slot = 0, gate = idt; slot < NIDT && !db_pager_quit;
 	    slot++, gate++) {
 		/* Reassemble the handler address from its two halves. */
 		handler = ((long)gate->gd_hioffset << 16 | gate->gd_looffset);
 		if (handler == (uintptr_t)&IDTVEC(rsvd))
 			continue;
 		db_printf("%3d\t", slot);
 		db_printsym(handler, DB_STGY_PROC);
 		db_printf("\n");
 	}
 }
 
 /* Show privileged registers. */
 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
 {
 	/* Memory image written by sidt/sgdt: 16-bit limit, 64-bit base. */
 	struct {
 		uint16_t limit;
 		uint64_t base;
 	} __packed idtr, gdtr;
 	uint16_t ldt, tr;
 
 	/* Descriptor-table registers and the LDT/task selectors. */
 	__asm __volatile("sidt %0" : "=m" (idtr));
 	db_printf("idtr\t0x%016lx/%04x\n",
 	    (u_long)idtr.base, (u_int)idtr.limit);
 	__asm __volatile("sgdt %0" : "=m" (gdtr));
 	db_printf("gdtr\t0x%016lx/%04x\n",
 	    (u_long)gdtr.base, (u_int)gdtr.limit);
 	__asm __volatile("sldt %0" : "=r" (ldt));
 	db_printf("ldtr\t0x%04x\n", ldt);
 	__asm __volatile("str %0" : "=r" (tr));
 	db_printf("tr\t0x%04x\n", tr);
 
 	/* Control registers; xcr0 is only read when CR4_XSAVE is set. */
 	db_printf("cr0\t0x%016lx\n", rcr0());
 	db_printf("cr2\t0x%016lx\n", rcr2());
 	db_printf("cr3\t0x%016lx\n", rcr3());
 	db_printf("cr4\t0x%016lx\n", rcr4());
 	if (rcr4() & CR4_XSAVE)
 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
 
 	/* Selected MSRs. */
 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
 	/* The feature-control MSR is only read if VMX or SMX is reported. */
 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
 		db_printf("FEATURES_CTL\t0x%016lx\n",
 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
 }
 
 /* Print the current values of debug registers dr0-dr3, dr6 and dr7. */
 DB_SHOW_COMMAND(dbregs, db_show_dbregs)
 {
 /* Expands to one "drN<TAB>0x<16 hex digits>" line for register N. */
 #define	SHOW_DR(n)	db_printf("dr" #n "\t0x%016lx\n", rdr##n())
 
 	SHOW_DR(0);
 	SHOW_DR(1);
 	SHOW_DR(2);
 	SHOW_DR(3);
 	SHOW_DR(6);
 	SHOW_DR(7);
 
 #undef SHOW_DR
 }
 #endif
 
 /*
  * Expand a hardware user segment descriptor 'sd' into the software
  * form 'ssd'; the inverse of ssdtosd().
  */
 void
 sdtossd(struct user_segment_descriptor *sd,
     struct soft_segment_descriptor *ssd)
 {
 
 	/*
 	 * Widen the high base byte before shifting.  ssdtosd() masks it
 	 * with 0xff, so it is a narrow field that would otherwise be
 	 * promoted to int and overflow in "<< 24" for bases at or above
 	 * 0x80000000.
 	 */
 	ssd->ssd_base  = ((unsigned long)sd->sd_hibase << 24) | sd->sd_lobase;
 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
 	ssd->ssd_type  = sd->sd_type;
 	ssd->ssd_dpl   = sd->sd_dpl;
 	ssd->ssd_p     = sd->sd_p;
 	ssd->ssd_long  = sd->sd_long;
 	ssd->ssd_def32 = sd->sd_def32;
 	ssd->ssd_gran  = sd->sd_gran;
 }
 
 /*
  * Pack the software descriptor 'ssd' into the hardware user segment
  * descriptor 'sd'.  Base and limit are split into their low/high
  * fields: 24 + 8 bits of base, 16 + 4 bits of limit.
  */
 void
 ssdtosd(struct soft_segment_descriptor *ssd,
     struct user_segment_descriptor *sd)
 {
 
 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 	sd->sd_type  = ssd->ssd_type;
 	sd->sd_dpl   = ssd->ssd_dpl;
 	sd->sd_p     = ssd->ssd_p;
 	sd->sd_long  = ssd->ssd_long;
 	sd->sd_def32 = ssd->ssd_def32;
 	sd->sd_gran  = ssd->ssd_gran;
 }
 
 /*
  * Pack the software descriptor 'ssd' into the hardware system segment
  * descriptor 'sd'.  Unlike ssdtosd(), the high base field takes 40 bits
  * and the long/def32 attributes are not copied.
  */
 void
 ssdtosyssd(struct soft_segment_descriptor *ssd,
     struct system_segment_descriptor *sd)
 {
 
 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 	sd->sd_type  = ssd->ssd_type;
 	sd->sd_dpl   = ssd->ssd_dpl;
 	sd->sd_p     = ssd->ssd_p;
 	sd->sd_gran  = ssd->ssd_gran;
 }
 
 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
 #include <isa/isavar.h>
 #include <isa/isareg.h>
 /*
  * Return a bitmap of the current interrupt requests.  This is 8259-specific
  * and is only suitable for use at probe time.
  * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
  * It shouldn't be here.  There should probably be an APIC centric
  * implementation in the apic driver code, if at all.
  */
 intrmask_t
 isa_irq_pending(void)
 {
 	u_char lo, hi;
 
 	/* IO_ICU1 supplies bits 0-7 of the result, IO_ICU2 bits 8-15. */
 	lo = inb(IO_ICU1);
 	hi = inb(IO_ICU2);
 	return ((hi << 8) | lo);
 }
 #endif
 
 /*
  * Base memory size in KB.  getmemsize() takes it from the end of the
  * first physmap segment that starts at or below 0xA0000 and falls back
  * to 640 when no such segment exists or the value exceeds 640.
  */
 u_int basemem;
 
 /*
  * Merge the region [base, base + length) into 'physmap', an array of
  * start/end pairs kept in ascending order.  *physmap_idxp is the index
  * of the next free slot and grows by two when a new pair is stored.
  *
  * Returns 0 only when the array has run out of room; every other case
  * (empty or overlapping region, merged with a neighbour, or inserted)
  * returns 1.
  */
 static int
 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
     int *physmap_idxp)
 {
 	uint64_t limit;
 	int nused, pos, slot;
 
 	nused = *physmap_idxp;
 
 	if (length == 0)
 		return (1);
 	limit = base + length;
 
 	/*
 	 * Locate the first pair that ends above 'base'.  The region must
 	 * lie entirely below that pair, otherwise the two overlap and the
 	 * new region is dropped.  With no such pair the region goes at
 	 * the end.
 	 *
 	 * NB: nused is the next free slot.
 	 */
 	pos = nused;
 	for (slot = 0; slot <= nused; slot += 2) {
 		if (base >= physmap[slot + 1])
 			continue;
 		if (limit > physmap[slot]) {
 			if (boothowto & RB_VERBOSE)
 				printf(
 		    "Overlapping memory regions, ignoring second region\n");
 			return (1);
 		}
 		pos = slot;
 		break;
 	}
 
 	/* Region ends where the following pair starts: grow it downwards. */
 	if (pos <= nused && limit == physmap[pos]) {
 		physmap[pos] = base;
 		return (1);
 	}
 
 	/* Region starts where the preceding pair ends: grow it upwards. */
 	if (pos > 0 && base == physmap[pos - 1]) {
 		physmap[pos - 1] += length;
 		return (1);
 	}
 
 	/* A new pair is needed; account for it before checking for room. */
 	nused += 2;
 	*physmap_idxp = nused;
 	if (nused == PHYS_AVAIL_ENTRIES) {
 		printf(
 		"Too many segments in the physical address map, giving up\n");
 		return (0);
 	}
 
 	/* Shift the pairs from 'pos' onwards up by one pair. */
 	for (slot = nused - 2; slot > pos; slot -= 2) {
 		physmap[slot] = physmap[slot - 2];
 		physmap[slot + 1] = physmap[slot - 1];
 	}
 
 	physmap[pos] = base;
 	physmap[pos + 1] = limit;
 	return (1);
 }
 
 /*
  * Walk 'smapsize' bytes of BIOS SMAP records starting at 'smapbase' and
  * add every SMAP_TYPE_MEMORY record to 'physmap'.  The walk stops early
  * when add_physmap_entry() reports failure.
  */
 void
 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
     vm_paddr_t *physmap, int *physmap_idx)
 {
 	struct bios_smap *cur, *limit;
 
 	limit = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 
 	for (cur = smapbase; cur < limit; cur++) {
 		if (boothowto & RB_VERBOSE)
 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
 			    cur->type, cur->base, cur->length);
 
 		/* Non-memory records are only logged above. */
 		if (cur->type == SMAP_TYPE_MEMORY &&
 		    !add_physmap_entry(cur->base, cur->length, physmap,
 		    physmap_idx))
 			break;
 	}
 }
 
 /*
  * Walk the UEFI memory map that follows 'efihdr' and add every usable
  * descriptor to 'physmap'.  In verbose boots each descriptor is also
  * printed.  The walk stops early if add_physmap_entry() fails.
  */
 static void
 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
     int *physmap_idx)
 {
 	/* Printable names indexed by md_type. */
 	static const char *types[] = {
 		"Reserved",
 		"LoaderCode",
 		"LoaderData",
 		"BootServicesCode",
 		"BootServicesData",
 		"RuntimeServicesCode",
 		"RuntimeServicesData",
 		"ConventionalMemory",
 		"UnusableMemory",
 		"ACPIReclaimMemory",
 		"ACPIMemoryNVS",
 		"MemoryMappedIO",
 		"MemoryMappedIOPortSpace",
 		"PalCode",
 		"PersistentMemory"
 	};
 	/* Attribute bits and their labels, in the order they are printed. */
 	static const struct {
 		uint64_t	 bit;
 		const char	*label;
 	} attrs[] = {
 		{ EFI_MD_ATTR_UC,		"UC " },
 		{ EFI_MD_ATTR_WC,		"WC " },
 		{ EFI_MD_ATTR_WT,		"WT " },
 		{ EFI_MD_ATTR_WB,		"WB " },
 		{ EFI_MD_ATTR_UCE,		"UCE " },
 		{ EFI_MD_ATTR_WP,		"WP " },
 		{ EFI_MD_ATTR_RP,		"RP " },
 		{ EFI_MD_ATTR_XP,		"XP " },
 		{ EFI_MD_ATTR_NV,		"NV " },
 		{ EFI_MD_ATTR_MORE_RELIABLE,	"MORE_RELIABLE " },
 		{ EFI_MD_ATTR_RO,		"RO " },
 		{ EFI_MD_ATTR_RT,		"RUNTIME" },
 	};
 	struct efi_md *map, *p;
 	const char *type;
 	size_t efisz;
 	u_int a;
 	int ndesc, i, usable;
 
 	/*
 	 * Memory map data provided by UEFI via the GetMemoryMap
 	 * Boot Services API.  The descriptors start after the header,
 	 * whose size is rounded up to a multiple of 16 bytes.
 	 */
 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
 
 	/* A zero descriptor size would divide by zero below. */
 	if (efihdr->descriptor_size == 0)
 		return;
 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
 
 	if (boothowto & RB_VERBOSE)
 		printf("%23s %12s %12s %8s %4s\n",
 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
 
 	for (i = 0, p = map; i < ndesc; i++,
 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
 		if (boothowto & RB_VERBOSE) {
 			type = p->md_type < nitems(types) ?
 			    types[p->md_type] : "<INVALID>";
 			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
 			    p->md_virt, p->md_pages);
 			for (a = 0; a < nitems(attrs); a++) {
 				if ((p->md_attr & attrs[a].bit) != 0)
 					printf("%s", attrs[a].label);
 			}
 			printf("\n");
 		}
 
 		/* We're allowed to use any entry with these types. */
 		switch (p->md_type) {
 		case EFI_MD_TYPE_CODE:
 		case EFI_MD_TYPE_DATA:
 		case EFI_MD_TYPE_BS_CODE:
 		case EFI_MD_TYPE_BS_DATA:
 		case EFI_MD_TYPE_FREE:
 			usable = 1;
 			break;
 		default:
 			usable = 0;
 			break;
 		}
 		if (!usable)
 			continue;
 
 		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
 		    physmap, physmap_idx))
 			break;
 	}
 }
 
 /*
  * Firmware boot method; native_parse_memmap() sets it to "UEFI" or
  * "BIOS".  Exported read-only under the _machdep sysctl node.
  */
 static char bootmethod[16] = "";
 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
     "System firmware boot method");
 
 /*
  * Fill 'physmap' from the memory map handed over by the loader.  An EFI
  * map is preferred when present; otherwise the BIOS SMAP is used.  Also
  * records which of the two was found in 'bootmethod'.  Panics when the
  * loader supplied neither.
  */
 static void
 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
 {
 	struct efi_map_header *efi;
 	struct bios_smap *bios;
 	u_int32_t bios_len;
 
 	efi = (struct efi_map_header *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
 	bios = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (efi == NULL && bios == NULL)
 		panic("No BIOS smap or EFI map info from loader!");
 
 	if (efi == NULL) {
 		/*
 		 * Memory map from INT 15:E820.
 		 *
 		 * subr_module.c says:
 		 * "Consumer may safely assume that size value precedes data."
 		 * ie: an int32_t immediately precedes smap.
 		 */
 		bios_len = *((u_int32_t *)bios - 1);
 		bios_add_smap_entries(bios, bios_len, physmap, physmap_idx);
 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
 		return;
 	}
 
 	add_efi_map_entries(efi, physmap, physmap_idx);
 	strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
 }
 
 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
 
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
  * build the phys_avail array describing the actually-available memory.
  *
  * Total memory size may be set by the kernel environment variable
  * hw.physmem or the compile-time define MAXMEM.
  *
  * XXX first should be vm_paddr_t.
  */
 static void
 getmemsize(caddr_t kmdp, u_int64_t first)
 {
 	int i, physmap_idx, pa_indx, da_indx;
 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
 	u_long physmem_start, physmem_tunable, memtest;
 	pt_entry_t *pte;
 	quad_t dcons_addr, dcons_size;
 	int page_counter;
 
 	/*
 	 * Tell the physical memory allocator about pages used to store
 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
 	 */
 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
 
 	bzero(physmap, sizeof(physmap));
 	physmap_idx = 0;
 
 	/*
 	 * parse_memmap leaves physmap_idx at the next free slot; from
 	 * here on it is the index of the last start/end pair instead.
 	 */
 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
 	physmap_idx -= 2;
 
 	/*
 	 * Find the 'base memory' segment for SMP
 	 */
 	basemem = 0;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (physmap[i] <= 0xA0000) {
 			basemem = physmap[i + 1] / 1024;
 			break;
 		}
 	}
 	if (basemem == 0 || basemem > 640) {
 		if (bootverbose)
 			printf(
 		"Memory map doesn't contain a basemem segment, faking it\n");
 		basemem = 640;
 	}
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".  We may adjust this
 	 * based on ``hw.physmem'' and the results of the memory test.
 	 */
 	Maxmem = atop(physmap[physmap_idx + 1]);
 
 #ifdef MAXMEM
 	Maxmem = MAXMEM / 4;
 #endif
 
 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
 	/*
 	 * The boot memory test is disabled by default, as it takes a
 	 * significant amount of time on large-memory systems, and is
 	 * unfriendly to virtual machines as it unnecessarily touches all
 	 * pages.
 	 *
 	 * A general name is used as the code may be extended to support
 	 * additional tests beyond the current "page present" test.
 	 */
 	memtest = 0;
 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 
 	/*
 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
 	 * in the system.
 	 */
 	if (Maxmem > atop(physmap[physmap_idx + 1]))
 		Maxmem = atop(physmap[physmap_idx + 1]);
 
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
 
 	/*
 	 * Make hole for "AP -> long mode" bootstrap code.  The
 	 * mp_bootaddress vector is only available when the kernel
 	 * is configured to support APs and APs for the system start
 	 * in real mode (e.g. SMP bare metal).
 	 */
 	if (init_ops.mp_bootaddress)
 		init_ops.mp_bootaddress(physmap, &physmap_idx);
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(&first);
 
 	/*
 	 * Size up each available chunk of physical memory.
 	 *
 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
 	 * By default, mask off the first 16 pages unless we appear to be
 	 * running in a VM.
 	 */
 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
 	if (physmap[0] < physmem_start) {
 		if (physmem_start < PAGE_SIZE)
 			physmap[0] = PAGE_SIZE;
 		else if (physmem_start >= physmap[1])
 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
 		else
 			physmap[0] = round_page(physmem_start);
 	}
 	/* Seed both arrays with an empty range that starts at physmap[0]. */
 	pa_indx = 0;
 	da_indx = 1;
 	phys_avail[pa_indx++] = physmap[0];
 	phys_avail[pa_indx] = physmap[0];
 	dump_avail[da_indx] = physmap[0];
 	pte = CMAP1;
 
 	/*
 	 * Get dcons buffer address
 	 */
 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 	    getenv_quad("dcons.size", &dcons_size) == 0)
 		dcons_addr = 0;
 
 	/*
 	 * physmap is in bytes, so when converting to page boundaries,
 	 * round up the start address and round down the end address.
 	 */
 	page_counter = 0;
 	if (memtest != 0)
 		printf("Testing system memory");
 	for (i = 0; i <= physmap_idx; i += 2) {
 		vm_paddr_t end;
 
 		end = ptoa((vm_paddr_t)Maxmem);
 		if (physmap[i + 1] < end)
 			end = trunc_page(physmap[i + 1]);
 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 			int tmp, page_bad, full;
 			int *ptr = (int *)CADDR1;
 
 			full = FALSE;
 			/*
 			 * block out kernel memory as not available.
 			 */
 			if (pa >= (vm_paddr_t)kernphys && pa < first)
 				goto do_dump_avail;
 
 			/*
 			 * block out dcons buffer
 			 */
 			if (dcons_addr > 0
 			    && pa >= trunc_page(dcons_addr)
 			    && pa < dcons_addr + dcons_size)
 				goto do_dump_avail;
 
 			page_bad = FALSE;
 			if (memtest == 0)
 				goto skip_memtest;
 
 			/*
 			 * Print a "." every GB to show we're making
 			 * progress.
 			 */
 			page_counter++;
 			if ((page_counter % PAGES_PER_GB) == 0)
 				printf(".");
 
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
 			invltlb();
 
 			tmp = *(int *)ptr;
 			/*
 			 * Test for alternating 1's and 0's
 			 */
 			*(volatile int *)ptr = 0xaaaaaaaa;
 			if (*(volatile int *)ptr != 0xaaaaaaaa)
 				page_bad = TRUE;
 			/*
 			 * Test for alternating 0's and 1's
 			 */
 			*(volatile int *)ptr = 0x55555555;
 			if (*(volatile int *)ptr != 0x55555555)
 				page_bad = TRUE;
 			/*
 			 * Test for all 1's
 			 */
 			*(volatile int *)ptr = 0xffffffff;
 			if (*(volatile int *)ptr != 0xffffffff)
 				page_bad = TRUE;
 			/*
 			 * Test for all 0's
 			 */
 			*(volatile int *)ptr = 0x0;
 			if (*(volatile int *)ptr != 0x0)
 				page_bad = TRUE;
 			/*
 			 * Restore original value.
 			 */
 			*(int *)ptr = tmp;
 
 skip_memtest:
 			/*
 			 * Adjust array of valid/good pages.
 			 */
 			if (page_bad == TRUE)
 				continue;
 			/*
 			 * If this good page is a continuation of the
 			 * previous set of good pages, then just increase
 			 * the end pointer. Otherwise start a new chunk.
 			 * Note that "end" points one higher than end,
 			 * making the range >= start and < end.
 			 * If we're also doing a speculative memory
 			 * test and we are at or past the end, bump up Maxmem
 			 * so that we keep going. The first bad page
 			 * will terminate the loop.
 			 */
 			if (phys_avail[pa_indx] == pa) {
 				phys_avail[pa_indx] += PAGE_SIZE;
 			} else {
 				pa_indx++;
 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
 					printf(
 		"Too many holes in the physical address space, giving up\n");
 					pa_indx--;
 					full = TRUE;
 					goto do_dump_avail;
 				}
 				phys_avail[pa_indx++] = pa;	/* start */
 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 			}
 			physmem++;
 do_dump_avail:
 			/*
 			 * dump_avail is extended for every page that reaches
 			 * this point, including the kernel and dcons pages
 			 * kept out of phys_avail above.
 			 */
 			if (dump_avail[da_indx] == pa) {
 				dump_avail[da_indx] += PAGE_SIZE;
 			} else {
 				da_indx++;
 				if (da_indx == PHYS_AVAIL_ENTRIES) {
 					da_indx--;
 					goto do_next;
 				}
 				dump_avail[da_indx++] = pa; /* start */
 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 			}
 do_next:
 			/* phys_avail is full: stop scanning this segment. */
 			if (full)
 				break;
 		}
 	}
 	/* Tear down the temporary test mapping. */
 	*pte = 0;
 	invltlb();
 	if (memtest != 0)
 		printf("\n");
 
 	/*
 	 * XXX
 	 * The last chunk must contain at least one page plus the message
 	 * buffer to avoid complicating other code (message buffer address
 	 * calculation, etc.).
 	 */
 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 		phys_avail[pa_indx--] = 0;
 		phys_avail[pa_indx--] = 0;
 	}
 
 	Maxmem = atop(phys_avail[pa_indx]);
 
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(msgbufsize);
 
 	/* Map the message buffer. */
 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
 }
 
 /*
  * Locate the loader metadata at 'modulep', relocate it by KERNBASE and
  * pull out the boot flags, the static kernel environment, the EFI system
  * table address and (with DDB) the kernel symbol table bounds.
  *
  * Returns the metadata pointer of the preloaded kernel module.
  */
 static caddr_t
 native_parse_preload_data(u_int64_t modulep)
 {
 	caddr_t kern_md;
 	char *kenv;
 #ifdef DDB
 	vm_offset_t sym_first;
 	vm_offset_t sym_last;
 #endif
 
 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
 	preload_bootstrap_relocate(KERNBASE);
 
 	/* The kernel is registered under one of two type names. */
 	kern_md = preload_search_by_type("elf kernel");
 	if (kern_md == NULL)
 		kern_md = preload_search_by_type("elf64 kernel");
 
 	boothowto = MD_FETCH(kern_md, MODINFOMD_HOWTO, int);
 
 	/* KERNBASE is added to the environment address only when it is set. */
 	kenv = MD_FETCH(kern_md, MODINFOMD_ENVP, char *);
 	if (kenv != NULL)
 		kenv += KERNBASE;
 	init_static_kenv(kenv, 0);
 
 #ifdef DDB
 	sym_first = MD_FETCH(kern_md, MODINFOMD_SSYM, uintptr_t);
 	sym_last = MD_FETCH(kern_md, MODINFOMD_ESYM, uintptr_t);
 	db_fetch_ksymtab(sym_first, sym_last);
 #endif
 	efi_systbl_phys = MD_FETCH(kern_md, MODINFOMD_FW_HANDLE, vm_paddr_t);
 
 	return (kern_md);
 }
 
 /*
  * Initialize the kernel debugger framework and, in KDB kernels, enter
  * the debugger right away when the boot flags include RB_KDB.
  */
 static void
 amd64_kdb_init(void)
 {
 
 	kdb_init();
 #ifdef KDB
 	if ((boothowto & RB_KDB) != 0) {
 		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
 	}
 #endif
 }
 
 /* Set up the fast syscall stuff */
 /*
  * Program the MSRs used by the fast system call path: set EFER_SCE,
  * install the 64-bit and 32-bit entry points, the selector bases and
  * the rflags mask.
  */
 void
 amd64_conf_fast_syscall(void)
 {
 	uint64_t star;
 
 	wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_SCE);
 
 	/* The 64-bit entry point depends on whether PTI is in use. */
 	if (pti)
 		wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall_pti));
 	else
 		wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 
 	/* Kernel code selector at bit 32, 32-bit user code at bit 48. */
 	star = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
 	wrmsr(MSR_STAR, star);
 
 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
 }
 
 /*
  * First stage of per-CPU data setup for 'pc': record the pcpu address
  * itself, the current thread (thread0), and pointers to the TSS and to
  * the GDT slots used for the TSS, the user LDT and the 32-bit %fs/%gs
  * descriptors.
  */
 void
 amd64_bsp_pcpu_init1(struct pcpu *pc)
 {
 	struct user_segment_descriptor *gdt;
 
 	/*
 	 * NOTE(review): prvspace is set before any other per-CPU access;
 	 * presumably later accesses depend on it -- confirm before
 	 * reordering.
 	 */
 	PCPU_SET(prvspace, pc);
 	gdt = *PCPU_PTR(gdt);
 	PCPU_SET(curthread, &thread0);
 	PCPU_SET(tssp, PCPU_PTR(common_tss));
 	/* TSS and LDT slots are viewed as system segment descriptors. */
 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
 }
 
 /*
  * Second stage of per-CPU data setup: record the kernel stack pointer
  * 'rsp0', the 16-byte aligned top of the per-CPU PTI stack, and
  * thread0's pcb as the current pcb.
  */
 void
 amd64_bsp_pcpu_init2(uint64_t rsp0)
 {
 	vm_offset_t pti_top;
 
 	/* One past the last of the PC_PTI_STACK_SZ 64-bit words. */
 	pti_top = (vm_offset_t)PCPU_PTR(pti_stack) +
 	    PC_PTI_STACK_SZ * sizeof(uint64_t);
 
 	PCPU_SET(rsp0, rsp0);
 	PCPU_SET(pti_rsp0, pti_top & ~0xful);
 	PCPU_SET(curpcb, thread0.td_pcb);
 }
 
 /*
  * Carve a struct nmi_pcpu out of the last bytes of the 'size'-byte
  * stack at 'stack', store 'pc' in it and return its address, which is
  * what the callers install as the IST stack pointer.
  */
 static long
 amd64_ist_stack_top(char *stack, size_t size, struct pcpu *pc)
 {
 	struct nmi_pcpu *np;
 
 	np = ((struct nmi_pcpu *)&stack[size]) - 1;
 	np->np_pcpu = (register_t)pc;
 	return ((long)np);
 }
 
 /*
  * Point the four interrupt stack table entries of pc's TSS at their
  * dedicated static stacks.
  */
 void
 amd64_bsp_ist_init(struct pcpu *pc)
 {
 	struct amd64tss *tssp;
 
 	tssp = &pc->pc_common_tss;
 
 	/* doublefault stack space, runs on ist1 */
 	tssp->tss_ist1 = amd64_ist_stack_top(dblfault_stack,
 	    sizeof(dblfault_stack), pc);
 
 	/* NMI stack, runs on ist2. */
 	tssp->tss_ist2 = amd64_ist_stack_top(nmi0_stack,
 	    sizeof(nmi0_stack), pc);
 
 	/* MC# stack, runs on ist3. */
 	tssp->tss_ist3 = amd64_ist_stack_top(mce0_stack,
 	    sizeof(mce0_stack), pc);
 
 	/* DB# stack, runs on ist4. */
 	tssp->tss_ist4 = amd64_ist_stack_top(dbg0_stack,
 	    sizeof(dbg0_stack), pc);
 }
 
 u_int64_t
 hammer_time(u_int64_t modulep, u_int64_t physfree)
 {
 	caddr_t kmdp;
 	int gsel_tss, x;
 	struct pcpu *pc;
 	struct xstate_hdr *xhdr;
 	u_int64_t rsp0;
 	char *env;
 	struct user_segment_descriptor *gdt;
 	struct region_descriptor r_gdt;
 	size_t kstack0_sz;
 	int late_console;
 
 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
 
 	kmdp = init_ops.parse_preload_data(modulep);
 
 	physfree += ucode_load_bsp(physfree + KERNBASE);
 	physfree = roundup2(physfree, PAGE_SIZE);
 
 	identify_cpu1();
 	identify_hypervisor();
 	identify_cpu_fixup_bsp();
 	identify_cpu2();
 	initializecpucache();
 
 	/*
 	 * Check for pti, pcid, and invpcid before ifuncs are
 	 * resolved, to correctly select the implementation for
 	 * pmap_activate_sw_mode().
 	 */
 	pti = pti_get_default();
 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
 		invpcid_works = (cpu_stdext_feature &
 		    CPUID_STDEXT_INVPCID) != 0;
 	} else {
 		pmap_pcid_enabled = 0;
 	}
 
 	link_elf_ireloc(kmdp);
 
 	/*
 	 * This may be done better later if it gets more high level
 	 * components in it. If so just link td->td_proc here.
 	 */
 	proc_linkup0(&proc0, &thread0);
 
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	thread0.td_kstack = physfree + KERNBASE;
 	thread0.td_kstack_pages = kstack_pages;
 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 	bzero((void *)thread0.td_kstack, kstack0_sz);
 	physfree += kstack0_sz;
 
 	/*
 	 * Initialize enough of thread0 for delayed invalidation to
 	 * work very early.  Rely on thread0.td_base_pri
 	 * zero-initialization, it is reset to PVM at proc0_init().
 	 */
 	pmap_thread_init_invl_gen(&thread0);
 
 	pc = &temp_bsp_pcpu;
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	gdt = &temp_bsp_pcpu.pc_gdt[0];
 
 	/*
 	 * make gdt memory segments
 	 */
 	for (x = 0; x < NGDT; x++) {
 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
 			ssdtosd(&gdt_segs[x], &gdt[x]);
 	}
 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base = (long)gdt;
 	lgdt(&r_gdt);
 
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
 
 	dpcpu_init((void *)(physfree + KERNBASE), 0);
 	physfree += DPCPU_SIZE;
 	amd64_bsp_pcpu_init1(pc);
 	/* Non-late cninit() and printf() can be moved up to here. */
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 * 	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
 
 	/* exceptions */
 	for (x = 0; x < NIDT; x++)
 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
 		    SEL_KPL, 0);
 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
 	    SEL_UPL, 0);
 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
 	    SEL_UPL, 0);
 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
 	    SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
 	    SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
 	    SEL_KPL, 0);
 #ifdef KDTRACE_HOOKS
 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
 #endif
 #ifdef XENHVM
 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
 #endif
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (long) idt;
 	lidt(&r_idt);
 
 	/*
 	 * Initialize the clock before the console so that console
 	 * initialization can use DELAY().
 	 */
 	clock_init();
 
 	/*
 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
 	 * transition).
 	 * Once bootblocks have updated, we can test directly for
 	 * efi_systbl != NULL here...
 	 */
 	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
 	    != NULL)
 		vty_set_preferred(VTY_VT);
 
 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
 
 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
 
 	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
 	    &syscall_ret_l1d_flush_mode);
 
 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
 
 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
 
+	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
+	    &x86_rngds_mitg_enable);
+
 	finishidentcpu();	/* Final stage of CPU initialization */
 	initializecpu();	/* Initialize CPU registers */
 
 	amd64_bsp_ist_init(pc);
 	
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
 	    IOPERM_BITMAP_SIZE;
 
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	ltr(gsel_tss);
 
 	amd64_conf_fast_syscall();
 
 	/*
 	 * We initialize the PCB pointer early so that exception
 	 * handlers will work.  Also set up td_critnest to short-cut
 	 * the page fault handler.
 	 */
 	cpu_max_ext_state_size = sizeof(struct savefpu);
 	set_top_of_stack_td(&thread0);
 	thread0.td_pcb = get_pcb_td(&thread0);
 	thread0.td_critnest = 1;
 
 	/*
 	 * The console and kdb should be initialized even earlier than here,
 	 * but some console drivers don't work until after getmemsize().
 	 * Default to late console initialization to support these drivers.
 	 * This loses mainly printf()s in getmemsize() and early debugging.
 	 */
 	late_console = 1;
 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
 	if (!late_console) {
 		cninit();
 		amd64_kdb_init();
 	}
 
 	getmemsize(kmdp, physfree);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
 #ifdef DEV_PCI
         /* This call might adjust phys_avail[]. */
         pci_early_quirks();
 #endif
 
 	if (late_console)
 		cninit();
 
 #ifdef DEV_ISA
 #ifdef DEV_ATPIC
 	elcr_probe();
 	atpic_startup();
 #else
 	/* Reset and mask the atpics and leave them shut down. */
 	atpic_reset();
 
 	/*
 	 * Point the ICU spurious interrupt vectors at the APIC spurious
 	 * interrupt handler.
 	 */
 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 #endif
 #else
 #error "have you forgotten the isa device?"
 #endif
 
 	if (late_console)
 		amd64_kdb_init();
 
 	msgbufinit(msgbufp, msgbufsize);
 	fpuinit();
 
 	/*
 	 * Reinitialize thread0's stack base now that the xsave area size is
 	 * known.  Set up thread0's pcb save area after fpuinit calculated fpu
 	 * save area size.  Zero out the extended state header in fpu save area.
 	 */
 	set_top_of_stack_td(&thread0);
 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
 	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
 	if (use_xsave) {
 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
 		    1);
 		xhdr->xstate_bv = xsave_mask;
 	}
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	rsp0 = thread0.td_md.md_stack_base;
 	/* Ensure the stack is aligned to 16 bytes */
 	rsp0 &= ~0xFul;
 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
 	amd64_bsp_pcpu_init2(rsp0);
 
 	/* transfer to user mode */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
 
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_ufssel);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_frame = &proc0_tf;
 
         env = kern_getenv("kernelname");
 	if (env != NULL)
 		strlcpy(kernelname, env, sizeof(kernelname));
 
 	cpu_probe_amdc1e();
 
 	kcsan_cpu_init(0);
 
 #ifdef FDT
 	x86_init_fdt();
 #endif
 	thread0.td_critnest = 0;
 
 	TSEXIT();
 
 	/* Location of kernel stack for locore */
 	return (thread0.td_md.md_stack_base);
 }
 
 /*
  * Set the pc_acpi_id field of the given per-CPU structure to 0xffffffff.
  * The cpuid and size arguments are not used here.
  * NOTE(review): 0xffffffff is presumably an "ACPI id not yet known"
  * placeholder -- confirm against the code that later assigns pc_acpi_id.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 
 	pcpu->pc_acpi_id = 0xffffffff;
 }
 
 /*
  * Sysctl handler for machdep.smap: export the BIOS system memory map
  * found in the preloaded kernel metadata as a sequence of
  * struct bios_smap_xattr records.
  *
  * Each record carries base, length and type from the SMAP entry.  The
  * xattr word comes from the MODINFOMD_SMAP_XATTR table when that table
  * is present and is 0 otherwise.
  *
  * Returns 0 when no SMAP metadata exists, otherwise the result of the
  * copy-out.  Copying stops at the first SYSCTL_OUT() failure so that the
  * error is reported instead of being overwritten by a later entry.
  */
 static int
 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct bios_smap *smapbase;
 	struct bios_smap_xattr smap;
 	caddr_t kmdp;
 	uint32_t *smapattr;
 	int count, error, i;
 
 	/* Retrieve the system memory map from the loader. */
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase == NULL)
 		return (0);
 	smapattr = (uint32_t *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 	/*
 	 * The 32-bit word immediately preceding the map is read as its
 	 * size in bytes; divide by the entry size to get the entry count.
 	 */
 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
 	error = 0;
 	for (i = 0; i < count; i++) {
 		smap.base = smapbase[i].base;
 		smap.length = smapbase[i].length;
 		smap.type = smapbase[i].type;
 		if (smapattr != NULL)
 			smap.xattr = smapattr[i];
 		else
 			smap.xattr = 0;
 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
 		/* Do not let a later entry mask an earlier failure. */
 		if (error != 0)
 			break;
 	}
 	return (error);
 }
 SYSCTL_PROC(_machdep, OID_AUTO, smap,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     smap_sysctl_handler, "S,bios_smap_xattr",
     "Raw BIOS SMAP data");
 
 /*
  * Sysctl handler for machdep.efi_map: copy out the raw EFI memory map
  * (header included) found in the preloaded kernel metadata.  Returns 0
  * without copying anything when no EFI map metadata is present.
  */
 static int
 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct efi_map_header *hdr;
 	caddr_t kernel_md;
 	uint32_t nbytes;
 
 	kernel_md = preload_search_by_type("elf kernel");
 	if (kernel_md == NULL)
 		kernel_md = preload_search_by_type("elf64 kernel");
 
 	hdr = (struct efi_map_header *)preload_search_info(kernel_md,
 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
 	if (hdr != NULL) {
 		/* The 32-bit word just before the data is used as its size. */
 		nbytes = ((uint32_t *)hdr)[-1];
 		return (SYSCTL_OUT(req, hdr, nbytes));
 	}
 	return (0);
 }
 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     efi_map_sysctl_handler, "S,efi_map_header",
     "Raw EFI Memory Map");
 
 /*
  * Enter a spinlock section for the current thread.  Calls nest: only the
  * outermost call (count 0 -> 1) disables interrupts, records the previous
  * interrupt flags in td_md.md_saved_flags and enters a critical section;
  * nested calls just bump the per-thread counter.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *td;
 	register_t flags;
 
 	td = curthread;
 	if (td->td_md.md_spinlock_count == 0) {
 		/* Disable interrupts before touching the per-thread state. */
 		flags = intr_disable();
 		td->td_md.md_spinlock_count = 1;
 		td->td_md.md_saved_flags = flags;
 		critical_enter();
 	} else
 		td->td_md.md_spinlock_count++;
 }
 
 /*
  * Leave a spinlock section for the current thread.  The nesting counter
  * is decremented; when it reaches zero the critical section is exited
  * and the interrupt flags saved by the outermost spinlock_enter() are
  * restored.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *td;
 	register_t flags;
 
 	td = curthread;
 	/* Read the saved flags before the counter is dropped. */
 	flags = td->td_md.md_saved_flags;
 	td->td_md.md_spinlock_count--;
 	if (td->td_md.md_spinlock_count == 0) {
 		critical_exit();
 		intr_restore(flags);
 	}
 }
 
 /*
  * Build a minimal PCB out of a trapframe.  kdb_trap() uses this so that
  * a backtrace, which works from a PCB, can start at the function that
  * entered the debugger.  Only the instruction pointer, the stack and
  * frame pointers and rbx/r12-r15 are filled in; that is all a
  * backtrace needs, so the PCB is intentionally incomplete.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	/* Where execution was and which stack/frame it was using. */
 	pcb->pcb_rip = tf->tf_rip;
 	pcb->pcb_rsp = tf->tf_rsp;
 	pcb->pcb_rbp = tf->tf_rbp;
 
 	/* Remaining registers tracked by the PCB. */
 	pcb->pcb_rbx = tf->tf_rbx;
 	pcb->pcb_r12 = tf->tf_r12;
 	pcb->pcb_r13 = tf->tf_r13;
 	pcb->pcb_r14 = tf->tf_r14;
 	pcb->pcb_r15 = tf->tf_r15;
 }
 
 /*
  * Set the saved instruction pointer of thread td to addr and mark the
  * thread's pcb with PCB_FULL_IRET.  Always returns 0.
  */
 int
 ptrace_set_pc(struct thread *td, unsigned long addr)
 {
 
 	td->td_frame->tf_rip = addr;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (0);
 }
 
 int
 ptrace_single_step(struct thread *td)
 {
 
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
 		td->td_frame->tf_rflags |= PSL_T;
 		td->td_dbgflags |= TDB_STEP;
 	}
 	return (0);
 }
 
 /*
  * Disarm single-stepping for thread td: clear PSL_T in the saved rflags
  * and TDB_STEP in td_dbgflags.  The process lock must be held.  Always
  * returns 0.
  */
 int
 ptrace_clear_single_step(struct thread *td)
 {
 
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	td->td_frame->tf_rflags &= ~PSL_T;
 	td->td_dbgflags &= ~TDB_STEP;
 	return (0);
 }
 
 /*
  * Export the registers saved in td's trapframe into regs.  Thin wrapper
  * around fill_frame_regs(); returns its result.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 
 	return (fill_frame_regs(td->td_frame, regs));
 }
 
 /*
  * Copy the register state held in trapframe tp into regs.
  *
  * The segment registers ds/es/fs/gs are taken from the trapframe only
  * when TF_HASSEGS is set in tf_flags; otherwise they are reported as 0.
  * r_err and r_trapno are always reported as 0.  Always returns 0.
  */
 int
 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 {
 	const int has_segs = (tp->tf_flags & TF_HASSEGS) != 0;
 
 	/* General-purpose registers. */
 	regs->r_rax = tp->tf_rax;
 	regs->r_rbx = tp->tf_rbx;
 	regs->r_rcx = tp->tf_rcx;
 	regs->r_rdx = tp->tf_rdx;
 	regs->r_rsi = tp->tf_rsi;
 	regs->r_rdi = tp->tf_rdi;
 	regs->r_rbp = tp->tf_rbp;
 	regs->r_r8  = tp->tf_r8;
 	regs->r_r9  = tp->tf_r9;
 	regs->r_r10 = tp->tf_r10;
 	regs->r_r11 = tp->tf_r11;
 	regs->r_r12 = tp->tf_r12;
 	regs->r_r13 = tp->tf_r13;
 	regs->r_r14 = tp->tf_r14;
 	regs->r_r15 = tp->tf_r15;
 
 	/* Instruction pointer, flags, stack pointer and their selectors. */
 	regs->r_rip = tp->tf_rip;
 	regs->r_cs = tp->tf_cs;
 	regs->r_rflags = tp->tf_rflags;
 	regs->r_rsp = tp->tf_rsp;
 	regs->r_ss = tp->tf_ss;
 
 	/* Data segment selectors are only meaningful with TF_HASSEGS. */
 	regs->r_ds = has_segs ? tp->tf_ds : 0;
 	regs->r_es = has_segs ? tp->tf_es : 0;
 	regs->r_fs = has_segs ? tp->tf_fs : 0;
 	regs->r_gs = has_segs ? tp->tf_gs : 0;
 
 	regs->r_trapno = 0;
 	regs->r_err = 0;
 	return (0);
 }
 
 /*
  * Load the registers in regs into td's trapframe.
  *
  * Only the low 32 bits of r_rflags are considered.  The request is
  * rejected with EINVAL, before anything is modified, when EFL_SECURE()
  * does not accept the new flags relative to the current ones or when
  * CS_SECURE() does not accept the new code selector.  On success the
  * pcb is marked PCB_FULL_IRET and 0 is returned.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *tp;
 	register_t rflags;
 
 	tp = td->td_frame;
 	rflags = regs->r_rflags & 0xffffffff;
 	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
 		return (EINVAL);
 	tp->tf_r15 = regs->r_r15;
 	tp->tf_r14 = regs->r_r14;
 	tp->tf_r13 = regs->r_r13;
 	tp->tf_r12 = regs->r_r12;
 	tp->tf_r11 = regs->r_r11;
 	tp->tf_r10 = regs->r_r10;
 	tp->tf_r9  = regs->r_r9;
 	tp->tf_r8  = regs->r_r8;
 	tp->tf_rdi = regs->r_rdi;
 	tp->tf_rsi = regs->r_rsi;
 	tp->tf_rbp = regs->r_rbp;
 	tp->tf_rbx = regs->r_rbx;
 	tp->tf_rdx = regs->r_rdx;
 	tp->tf_rcx = regs->r_rcx;
 	tp->tf_rax = regs->r_rax;
 	tp->tf_rip = regs->r_rip;
 	tp->tf_cs = regs->r_cs;
 	tp->tf_rflags = rflags;
 	tp->tf_rsp = regs->r_rsp;
 	tp->tf_ss = regs->r_ss;
 	/*
 	 * Deliberately disabled: the ds/es/fs/gs selectors supplied in regs
 	 * are currently never written to the trapframe.
 	 */
 	if (0) {	/* XXXKIB */
 		tp->tf_ds = regs->r_ds;
 		tp->tf_es = regs->r_es;
 		tp->tf_fs = regs->r_fs;
 		tp->tf_gs = regs->r_gs;
 		tp->tf_flags = TF_HASSEGS;
 	}
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (0);
 }
 
 /* XXX this conversion code still wants a careful review. */
 /*
  * Externalize: convert the in-kernel FPU save area sv_xmm into the
  * struct fpreg layout.  fpregs is zeroed first, then the environment
  * words, the eight 10-byte x87 registers and the sixteen 16-byte SSE
  * registers are copied across.
  */
 static void
 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
 {
 	struct envxmm *src_env, *dst_env;
 	int reg;
 
 	/* Start from a clean destination. */
 	bzero(fpregs, sizeof(*fpregs));
 
 	src_env = &sv_xmm->sv_env;
 	dst_env = (struct envxmm *)&fpregs->fpr_env;
 
 	/* Control, status and tag words plus last-instruction state. */
 	dst_env->en_cw = src_env->en_cw;
 	dst_env->en_sw = src_env->en_sw;
 	dst_env->en_tw = src_env->en_tw;
 	dst_env->en_opcode = src_env->en_opcode;
 	dst_env->en_rip = src_env->en_rip;
 	dst_env->en_rdp = src_env->en_rdp;
 	dst_env->en_mxcsr = src_env->en_mxcsr;
 	dst_env->en_mxcsr_mask = src_env->en_mxcsr_mask;
 
 	/* x87 registers: 10 bytes each. */
 	for (reg = 0; reg < 8; reg++) {
 		bcopy(sv_xmm->sv_fp[reg].fp_acc.fp_bytes,
 		    fpregs->fpr_acc[reg], 10);
 	}
 
 	/* SSE registers: 16 bytes each. */
 	for (reg = 0; reg < 16; reg++) {
 		bcopy(sv_xmm->sv_xmm[reg].xmm_bytes,
 		    fpregs->fpr_xacc[reg], 16);
 	}
 }
 
 /*
  * Internalize: load the struct fpreg image fpregs into the in-kernel
  * FPU save area sv_xmm.  The supplied mxcsr mask is ANDed with
  * cpu_mxcsr_mask before it is stored; every other field is copied
  * verbatim.
  */
 static void
 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
 {
 	struct envxmm *src_env, *dst_env;
 	int reg;
 
 	src_env = (struct envxmm *)&fpregs->fpr_env;
 	dst_env = &sv_xmm->sv_env;
 
 	/* Control, status and tag words plus last-instruction state. */
 	dst_env->en_cw = src_env->en_cw;
 	dst_env->en_sw = src_env->en_sw;
 	dst_env->en_tw = src_env->en_tw;
 	dst_env->en_opcode = src_env->en_opcode;
 	dst_env->en_rip = src_env->en_rip;
 	dst_env->en_rdp = src_env->en_rdp;
 	dst_env->en_mxcsr = src_env->en_mxcsr;
 	dst_env->en_mxcsr_mask = src_env->en_mxcsr_mask & cpu_mxcsr_mask;
 
 	/* x87 registers: 10 bytes each. */
 	for (reg = 0; reg < 8; reg++) {
 		bcopy(fpregs->fpr_acc[reg],
 		    sv_xmm->sv_fp[reg].fp_acc.fp_bytes, 10);
 	}
 
 	/* SSE registers: 16 bytes each. */
 	for (reg = 0; reg < 16; reg++) {
 		bcopy(fpregs->fpr_xacc[reg],
 		    sv_xmm->sv_xmm[reg].xmm_bytes, 16);
 	}
 }
 
 /*
  * Externalize the FPU state of td from its pcb into fpregs.
  *
  * The thread must be the current thread, suspended, or belong to a
  * process that should stop (asserted below).  fpugetregs() is called
  * before the user save area is converted.  Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 	    P_SHOULDSTOP(td->td_proc),
 	    ("not suspended thread %p", td));
 	fpugetregs(td);
 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
 	return (0);
 }
 
 /*
  * Internalize fpregs into the user FPU save area of td's pcb.  The copy
  * and the following fpuuserinited() call are performed inside a
  * critical section.  Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	critical_enter();
 	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
 	fpuuserinited(td);
 	critical_exit();
 	return (0);
 }
 
 /*
  * Get machine context.
  *
  * Fill *mcp from td's trapframe and pcb.  When GET_MC_CLEAR_RET is set
  * in flags, rax and rdx are reported as 0 and PSL_C is cleared in the
  * reported rflags; otherwise they are copied from the trapframe.  The
  * legacy FPU state is captured via get_fpcontext(); no extended FPU
  * state is returned (mc_xfpustate and mc_xfpustate_len are set to 0).
  * Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct pcb *pcb;
 	struct trapframe *tp;
 
 	pcb = td->td_pcb;
 	tp = td->td_frame;
 	/* sigonstack() is evaluated with curthread's process lock held. */
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
 	PROC_UNLOCK(curthread->td_proc);
 	mcp->mc_r15 = tp->tf_r15;
 	mcp->mc_r14 = tp->tf_r14;
 	mcp->mc_r13 = tp->tf_r13;
 	mcp->mc_r12 = tp->tf_r12;
 	mcp->mc_r11 = tp->tf_r11;
 	mcp->mc_r10 = tp->tf_r10;
 	mcp->mc_r9  = tp->tf_r9;
 	mcp->mc_r8  = tp->tf_r8;
 	mcp->mc_rdi = tp->tf_rdi;
 	mcp->mc_rsi = tp->tf_rsi;
 	mcp->mc_rbp = tp->tf_rbp;
 	mcp->mc_rbx = tp->tf_rbx;
 	mcp->mc_rcx = tp->tf_rcx;
 	mcp->mc_rflags = tp->tf_rflags;
 	if (flags & GET_MC_CLEAR_RET) {
 		/* Report a cleared return value: rax/rdx 0, carry clear. */
 		mcp->mc_rax = 0;
 		mcp->mc_rdx = 0;
 		mcp->mc_rflags &= ~PSL_C;
 	} else {
 		mcp->mc_rax = tp->tf_rax;
 		mcp->mc_rdx = tp->tf_rdx;
 	}
 	mcp->mc_rip = tp->tf_rip;
 	mcp->mc_cs = tp->tf_cs;
 	mcp->mc_rsp = tp->tf_rsp;
 	mcp->mc_ss = tp->tf_ss;
 	mcp->mc_ds = tp->tf_ds;
 	mcp->mc_es = tp->tf_es;
 	mcp->mc_fs = tp->tf_fs;
 	mcp->mc_gs = tp->tf_gs;
 	mcp->mc_flags = tp->tf_flags;
 	mcp->mc_len = sizeof(*mcp);
 	/* NULL/0: only the legacy FPU area is captured here. */
 	get_fpcontext(td, mcp, NULL, 0);
 	/* Refresh the pcb base values before reporting them. */
 	update_pcb_bases(pcb);
 	mcp->mc_fsbase = pcb->pcb_fsbase;
 	mcp->mc_gsbase = pcb->pcb_gsbase;
 	mcp->mc_xfpustate = 0;
 	mcp->mc_xfpustate_len = 0;
 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  *
  * Returns EINVAL when mc_len does not match sizeof(*mcp), when mc_flags
  * has bits outside _MC_FLAG_MASK, or when the extended FPU state is
  * larger than cpu_max_ext_state_size - sizeof(struct savefpu).  Errors
  * from copyin() and set_fpcontext() are returned as is.  The trapframe
  * is only modified after the FPU state has been accepted.
  */
 int
 set_mcontext(struct thread *td, mcontext_t *mcp)
 {
 	struct pcb *pcb;
 	struct trapframe *tp;
 	char *xfpustate;
 	long rflags;
 	int ret;
 
 	pcb = td->td_pcb;
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp) ||
 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
 		return (EINVAL);
 	/* Take only the PSL_USERCHANGE bits from the supplied rflags. */
 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
 	    (tp->tf_rflags & ~PSL_USERCHANGE);
 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
 		/* Bound the length before it is used as an alloca size. */
 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu))
 			return (EINVAL);
 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
 		    mcp->mc_xfpustate_len);
 		if (ret != 0)
 			return (ret);
 	} else
 		xfpustate = NULL;
 	/*
 	 * Note that mc_xfpustate_len is passed through even when
 	 * xfpustate is NULL.
 	 */
 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
 	if (ret != 0)
 		return (ret);
 	tp->tf_r15 = mcp->mc_r15;
 	tp->tf_r14 = mcp->mc_r14;
 	tp->tf_r13 = mcp->mc_r13;
 	tp->tf_r12 = mcp->mc_r12;
 	tp->tf_r11 = mcp->mc_r11;
 	tp->tf_r10 = mcp->mc_r10;
 	tp->tf_r9  = mcp->mc_r9;
 	tp->tf_r8  = mcp->mc_r8;
 	tp->tf_rdi = mcp->mc_rdi;
 	tp->tf_rsi = mcp->mc_rsi;
 	tp->tf_rbp = mcp->mc_rbp;
 	tp->tf_rbx = mcp->mc_rbx;
 	tp->tf_rdx = mcp->mc_rdx;
 	tp->tf_rcx = mcp->mc_rcx;
 	tp->tf_rax = mcp->mc_rax;
 	tp->tf_rip = mcp->mc_rip;
 	tp->tf_rflags = rflags;
 	tp->tf_rsp = mcp->mc_rsp;
 	tp->tf_ss = mcp->mc_ss;
 	tp->tf_flags = mcp->mc_flags;
 	/* Segment selectors are only taken when the context carries them. */
 	if (tp->tf_flags & TF_HASSEGS) {
 		tp->tf_ds = mcp->mc_ds;
 		tp->tf_es = mcp->mc_es;
 		tp->tf_fs = mcp->mc_fs;
 		tp->tf_gs = mcp->mc_gs;
 	}
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 	/* The fs/gs bases are only taken when _MC_HASBASES is set. */
 	if (mcp->mc_flags & _MC_HASBASES) {
 		pcb->pcb_fsbase = mcp->mc_fsbase;
 		pcb->pcb_gsbase = mcp->mc_gsbase;
 	}
 	return (0);
 }
 
 /*
  * Capture the FPU state of td into a machine context.
  *
  * The legacy save area is always copied into mcp->mc_fpstate, and
  * mc_ownedfp / mc_fpformat are filled in.  When xsave is in use and the
  * caller supplied a buffer (xfpusave_len != 0), the extended state that
  * follows the legacy area is copied into xfpusave, _MC_HASFPXSTATE is
  * set and mc_xfpustate_len records the number of bytes copied.
  *
  * At most cpu_max_ext_state_size - sizeof(struct savefpu) bytes are
  * copied.  If the caller's buffer is larger than that, the part beyond
  * the copied state is zeroed so that no stale bytes are handed back.
  */
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
     size_t xfpusave_len)
 {
 	size_t max_len, len;
 
 	mcp->mc_ownedfp = fpugetregs(td);
 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 	    sizeof(mcp->mc_fpstate));
 	mcp->mc_fpformat = fpuformat();
 	if (!use_xsave || xfpusave_len == 0)
 		return;
 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 	len = xfpusave_len;
 	if (len > max_len) {
 		/*
 		 * Zero the tail while len still holds the buffer size;
 		 * clamping first would make the length 0.
 		 */
 		bzero(xfpusave + max_len, len - max_len);
 		len = max_len;
 	}
 	mcp->mc_flags |= _MC_HASFPXSTATE;
 	mcp->mc_xfpustate_len = len;
 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 }
 
 /*
  * Install the FPU state described by a machine context into td.
  *
  * Returns 0 without doing anything for _MC_FPFMT_NODEV, and EINVAL for
  * any format other than _MC_FPFMT_XMM.  For _MC_FPOWNED_NONE the
  * thread's FPU state is dropped; for _MC_FPOWNED_FPU or _MC_FPOWNED_PCB
  * the supplied state is loaded via fpusetregs() and its result is
  * returned.  Any other mc_ownedfp value yields EINVAL.
  */
 static int
 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
     size_t xfpustate_len)
 {
 
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 
 	if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 		return (0);
 	}
 	if (mcp->mc_ownedfp != _MC_FPOWNED_FPU &&
 	    mcp->mc_ownedfp != _MC_FPOWNED_PCB)
 		return (EINVAL);
 
 	return (fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
 	    xfpustate, xfpustate_len));
 }
 
 /*
  * Discard the FPU state of td.  Inside a critical section, the FPU is
  * dropped if td currently owns it on this CPU, and the
  * PCB_FPUINITDONE / PCB_USERFPUINITDONE flags are cleared.  td's pcb
  * must be in user-FPU mode (asserted).
  *
  * NOTE(review): the flags are cleared on curthread's pcb, not td's;
  * the two only coincide when td == curthread -- confirm that all
  * callers pass curthread.
  */
 void
 fpstate_drop(struct thread *td)
 {
 
 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == td)
 		fpudrop();
 	/*
 	 * XXX force a full drop of the fpu.  The above only drops it if we
 	 * owned it.
 	 *
 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
 	 * have too many layers.
 	 */
 	clear_pcb_flags(curthread->td_pcb,
 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
 	critical_exit();
 }
 
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 
 	if (td == NULL) {
 		dbregs->dr[0] = rdr0();
 		dbregs->dr[1] = rdr1();
 		dbregs->dr[2] = rdr2();
 		dbregs->dr[3] = rdr3();
 		dbregs->dr[6] = rdr6();
 		dbregs->dr[7] = rdr7();
 	} else {
 		pcb = td->td_pcb;
 		dbregs->dr[0] = pcb->pcb_dr0;
 		dbregs->dr[1] = pcb->pcb_dr1;
 		dbregs->dr[2] = pcb->pcb_dr2;
 		dbregs->dr[3] = pcb->pcb_dr3;
 		dbregs->dr[6] = pcb->pcb_dr6;
 		dbregs->dr[7] = pcb->pcb_dr7;
 	}
 	dbregs->dr[4] = 0;
 	dbregs->dr[5] = 0;
 	dbregs->dr[8] = 0;
 	dbregs->dr[9] = 0;
 	dbregs->dr[10] = 0;
 	dbregs->dr[11] = 0;
 	dbregs->dr[12] = 0;
 	dbregs->dr[13] = 0;
 	dbregs->dr[14] = 0;
 	dbregs->dr[15] = 0;
 	return (0);
 }
 
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
 		load_dr1(dbregs->dr[1]);
 		load_dr2(dbregs->dr[2]);
 		load_dr3(dbregs->dr[3]);
 		load_dr6(dbregs->dr[6]);
 		load_dr7(dbregs->dr[7]);
 	} else {
 		/*
 		 * Don't let an illegal value for dr7 get set.  Specifically,
 		 * check for undefined settings.  Setting these bit patterns
 		 * result in undefined behaviour and can lead to an unexpected
 		 * TRCTRAP or a general protection fault right here.
 		 * Upper bits of dr6 and dr7 must not be set
 		 */
 		for (i = 0; i < 4; i++) {
 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 			if (td->td_frame->tf_cs == _ucode32sel &&
 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
 				return (EINVAL);
 		}
 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 			return (EINVAL);
 
 		pcb = td->td_pcb;
 
 		/*
 		 * Don't let a process set a breakpoint that is not within the
 		 * process's address space.  If a process could do this, it
 		 * could halt the system by setting a breakpoint in the kernel
 		 * (if ddb was enabled).  Thus, we need to check to make sure
 		 * that no breakpoints are being enabled for addresses outside
 		 * process's address space.
 		 *
 		 * XXX - what about when the watched area of the user's
 		 * address space is written into from within the kernel
 		 * ... wouldn't that still cause a breakpoint to be generated
 		 * from within kernel mode?
 		 */
 
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 			/* dr0 is enabled */
 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 			/* dr1 is enabled */
 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 			/* dr2 is enabled */
 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 			/* dr3 is enabled */
 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 
 		pcb->pcb_dr0 = dbregs->dr[0];
 		pcb->pcb_dr1 = dbregs->dr[1];
 		pcb->pcb_dr2 = dbregs->dr[2];
 		pcb->pcb_dr3 = dbregs->dr[3];
 		pcb->pcb_dr6 = dbregs->dr[6];
 		pcb->pcb_dr7 = dbregs->dr[7];
 
 		set_pcb_flags(pcb, PCB_DBREGS);
 	}
 
 	return (0);
 }
 
 /*
  * Zero the CPU's debug registers dr0-dr3, dr6 and dr7.  dr7 is written
  * first so that the control bits are off before the address registers
  * are changed.
  */
 void
 reset_dbregs(void)
 {
 
 	load_dr7(0);	/* Turn off the control bits first */
 	load_dr0(0);
 	load_dr1(0);
 	load_dr2(0);
 	load_dr3(0);
 	load_dr6(0);
 }
 
 /*
  * Decide whether a debug trap was caused by a hardware breakpoint on a
  * user-space address.
  *
  * Returns the number of breakpoints flagged in dr6 when at least one of
  * their addresses lies below VM_MAXUSER_ADDRESS, and 0 otherwise (no
  * breakpoint bit set in dr6, no enable bit set in dr7, or every hit
  * address is outside user space).
  */
 int
 user_dbreg_trap(register_t dr6)
 {
 	caddr_t hit_addr[4];	/* addresses of the triggered breakpoints */
 	u_int64_t ctl;		/* current dr7 contents */
 	u_int64_t hits;		/* breakpoint bits extracted from dr6 */
 	int nhits;		/* number of breakpoints that triggered */
 	int k;
 
 	/* No breakpoint bit set: the debug registers did not cause this. */
 	hits = dr6 & DBREG_DR6_BMASK;
 	if (hits == 0)
 		return (0);
 
 	/*
 	 * With every enable bit in the low byte of dr7 clear, the
 	 * hardware breakpoints cannot have fired either.
 	 */
 	ctl = rdr7();
 	if ((ctl & 0x000000ff) == 0)
 		return (0);
 
 	/* Collect the address of each breakpoint that was hit. */
 	nhits = 0;
 	if (hits & 0x01)
 		hit_addr[nhits++] = (caddr_t)rdr0();
 	if (hits & 0x02)
 		hit_addr[nhits++] = (caddr_t)rdr1();
 	if (hits & 0x04)
 		hit_addr[nhits++] = (caddr_t)rdr2();
 	if (hits & 0x08)
 		hit_addr[nhits++] = (caddr_t)rdr3();
 
 	/* One user-space address is enough to report the hit count. */
 	for (k = 0; k < nhits; k++) {
 		if (hit_addr[k] < (caddr_t)VM_MAXUSER_ADDRESS)
 			return (nhits);
 	}
 
 	/* None of the breakpoints are in user space. */
 	return (0);
 }
 
 /*
  * The pcb_flags is only modified by current thread, or by other threads
  * when current thread is stopped.  However, current thread may change it
  * from the interrupt context in cpu_switch(), or in the trap handler.
  * When we read-modify-write pcb_flags from C sources, compiler may generate
  * code that is not atomic regarding the interrupt handler.  If a trap or
  * interrupt happens and any flag is modified from the handler, it can be
  * clobbered with the cached value later.  Therefore, we implement setting
  * and clearing flags with single-instruction functions, which do not race
  * with possible modification of the flags from the trap or interrupt context,
  * because traps and interrupts are executed only on instruction boundary.
  *
  * set_pcb_flags_raw() ORs the given flags into pcb->pcb_flags with a
  * single orl instruction and does nothing else.
  */
 void
 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
 {
 
 	/* pcb_flags is both the output and an input of the one orl. */
 	__asm __volatile("orl %1,%0"
 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
 	    : "cc", "memory");
 
 }
 
 /*
  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
  * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
  * pcb if user space modified the bases.  We must save on the context
  * switch or if the return to usermode happens through the doreti.
  *
  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
  * which have a consequence that the base MSRs must be saved each time
  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
  * context switches.
  *
  * The bases are only saved when pcb is the current pcb, the request
  * includes PCB_FULL_IRET and the flag is not already set; in every
  * other case this is just set_pcb_flags_raw().
  */
 static void
 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
 {
 	register_t r;
 
 	if (curpcb == pcb &&
 	    (flags & PCB_FULL_IRET) != 0 &&
 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
 		r = intr_disable();
 		/* Re-check now that interrupts are off. */
 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
 			/*
 			 * Each base is saved only while the matching segment
 			 * register still holds the default user selector.
 			 */
 			if (rfs() == _ufssel)
 				pcb->pcb_fsbase = rdfsbase();
 			if (rgs() == _ugssel)
 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
 		}
 		set_pcb_flags_raw(pcb, flags);
 		intr_restore(r);
 	} else {
 		set_pcb_flags_raw(pcb, flags);
 	}
 }
 
 /*
  * Resolver for set_pcb_flags(): pick the base-saving variant when the
  * CPU reports CPUID_STDEXT_FSGSBASE, and the plain variant otherwise.
  */
 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) == 0)
 		return (set_pcb_flags_raw);
 	return (set_pcb_flags_fsgsbase);
 }
 
 /*
  * Clear the given flags in pcb->pcb_flags.  The update is a single andl
  * instruction with the complement of flags, so it cannot be split by a
  * trap or interrupt that also modifies pcb_flags.
  */
 void
 clear_pcb_flags(struct pcb *pcb, const u_int flags)
 {
 
 	__asm __volatile("andl %1,%0"
 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
 	    : "cc", "memory");
 }
 
 #ifdef KDB
 
 /*
  * Provide inb() and outb() as functions.  They are normally only available as
  * inline functions, thus cannot be called from the debugger.
  */
 
 /* silence compiler warnings */
 u_char inb_(u_short);
 void outb_(u_short, u_char);
 
 /* Callable wrapper around inb(): return its result for the given port. */
 u_char
 inb_(u_short port)
 {
 	return inb(port);
 }
 
 /* Callable wrapper around outb(): pass port and data through unchanged. */
 void
 outb_(u_short port, u_char data)
 {
 	outb(port, data);
 }
 
 #endif /* KDB */
 
 /*
  * Kernel memset/memmove/memcpy entry points.  Any macro definitions of
  * these names are removed first so the real symbols can be defined here.
  */
 #undef memset
 #undef memmove
 #undef memcpy
 
 /* The two candidate implementations of each routine, defined elsewhere. */
 void	*memset_std(void *buf, int c, size_t len);
 void	*memset_erms(void *buf, int c, size_t len);
 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 
 #ifdef KCSAN
 /*
  * These fail to build as ifuncs when used with KCSAN.
  * Plain functions that always forward to the _std variants are used
  * instead.
  */
 void *
 memset(void *buf, int c, size_t len)
 {
 
 	return (memset_std(buf, c, len));
 }
 
 void *
 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
 {
 
 	return (memmove_std(dst, src, len));
 }
 
 void *
 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
 {
 
 	return (memcpy_std(dst, src, len));
 }
 #else
 /*
  * Without KCSAN each routine is an ifunc: the resolver returns the
  * _erms variant when the CPU reports CPUID_STDEXT_ERMS and the _std
  * variant otherwise.
  */
 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 	    memset_erms : memset_std);
 }
 
 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
     size_t))
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 	    memmove_erms : memmove_std);
 }
 
 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
 {
 
 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
 	    memcpy_erms : memcpy_std);
 }
 #endif
 
 /* The two candidate implementations of pagezero(), defined elsewhere. */
 void	pagezero_std(void *addr);
 void	pagezero_erms(void *addr);
 
 /*
  * Resolver for pagezero(): use the _erms variant when the CPU reports
  * CPUID_STDEXT_ERMS, the _std variant otherwise.
  */
 DEFINE_IFUNC(, void, pagezero, (void *))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_ERMS) == 0)
 		return (pagezero_std);
 	return (pagezero_erms);
 }
diff --git a/sys/dev/cpuctl/cpuctl.c b/sys/dev/cpuctl/cpuctl.c
index 878e430b473e..67f710f52ec6 100644
--- a/sys/dev/cpuctl/cpuctl.c
+++ b/sys/dev/cpuctl/cpuctl.c
@@ -1,604 +1,605 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2006-2008 Stanislav Sedov <stas@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/ioccom.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sched.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/pcpu.h>
 #include <sys/smp.h>
 #include <sys/pmckern.h>
 #include <sys/cpuctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 
 #include <machine/cpufunc.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 #include <x86/ucode.h>
 
 static d_open_t cpuctl_open;
 static d_ioctl_t cpuctl_ioctl;
 
 #define	CPUCTL_VERSION 1
 
 /*
  * Debug tracing.  DPRINTF() expands to a printf() call when CPUCTL_DEBUG
  * is defined and to nothing otherwise.  The expansion intentionally has
  * no trailing semicolon: the caller supplies it, so that "DPRINTF(...);"
  * is exactly one statement and stays safe inside an unbraced if/else.
  * At least one argument must follow the format string.
  */
 #ifdef CPUCTL_DEBUG
 # define	DPRINTF(format,...) printf(format, __VA_ARGS__)
 #else
 # define	DPRINTF(...)
 #endif
 
 #define	UCODE_SIZE_MAX	(4 * 1024 * 1024)
 
 static int cpuctl_do_msr(int cpu, cpuctl_msr_args_t *data, u_long cmd,
     struct thread *td);
 static int cpuctl_do_cpuid(int cpu, cpuctl_cpuid_args_t *data,
     struct thread *td);
 static int cpuctl_do_cpuid_count(int cpu, cpuctl_cpuid_count_args_t *data,
     struct thread *td);
 static int cpuctl_do_eval_cpu_features(int cpu, struct thread *td);
 static int cpuctl_do_update(int cpu, cpuctl_update_args_t *data,
     struct thread *td);
 static int update_intel(int cpu, cpuctl_update_args_t *args,
     struct thread *td);
 static int update_amd(int cpu, cpuctl_update_args_t *args, struct thread *td);
 static int update_via(int cpu, cpuctl_update_args_t *args,
     struct thread *td);
 
 static struct cdev **cpuctl_devs;
 static MALLOC_DEFINE(M_CPUCTL, "cpuctl", "CPUCTL buffer");
 
 /*
  * Character device switch for the cpuctl devices: only open and ioctl
  * handlers are provided.
  */
 static struct cdevsw cpuctl_cdevsw = {
         .d_version =    D_VERSION,
         .d_open =       cpuctl_open,
         .d_ioctl =      cpuctl_ioctl,
         .d_name =       "cpuctl",
 };
 
 /*
  * Tell whether the given cpu may be used: returns non-zero unless
  * pmc_cpu_is_disabled() reports the cpu as disabled.
  */
 static int
 cpu_enabled(int cpu)
 {
 	int disabled;
 
 	disabled = pmc_cpu_is_disabled(cpu);
 	return (disabled == 0);
 }
 
 /*
  * Return what sched_is_bound() reports for the thread, sampled while
  * the thread lock is held.
  */
 static int
 cpu_sched_is_bound(struct thread *td)
 {
 	int bound;
 
 	thread_lock(td);
 	bound = sched_is_bound(td);
 	thread_unlock(td);
 
 	return (bound);
 }
 
 /*
  * Switch to target cpu to run.
  *
  * Binds td to 'cpu' with sched_bind() while holding the thread lock.
  * The cpu number is only checked with KASSERT() here, so callers must
  * pass an already validated, enabled cpu.
  */
 static void
 set_cpu(int cpu, struct thread *td)
 {
 
 	KASSERT(cpu >= 0 && cpu <= mp_maxid && cpu_enabled(cpu),
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 	thread_lock(td);
 	sched_bind(td, cpu);
 	thread_unlock(td);
 	/* Once bound, td is expected to be running on the target cpu. */
 	KASSERT(td->td_oncpu == cpu,
 	    ("[cpuctl,%d]: cannot bind to target cpu %d on cpu %d", __LINE__,
 	    cpu, td->td_oncpu));
 }
 
 /*
  * Counterpart of set_cpu(): put the thread's binding back the way the
  * caller recorded it.  When the thread was bound before (is_bound is
  * non-zero) it is re-bound to oldcpu, otherwise it is unbound.
  */
 static void
 restore_cpu(int oldcpu, int is_bound, struct thread *td)
 {
 
 	KASSERT(oldcpu >= 0 && oldcpu <= mp_maxid && cpu_enabled(oldcpu),
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, oldcpu));
 	thread_lock(td);
 	if (is_bound != 0)
 		sched_bind(td, oldcpu);
 	else
 		sched_unbind(td);
 	thread_unlock(td);
 }
 
 /*
  * ioctl handler for the cpuctl devices.
  *
  * The device unit number selects the cpu.  Returns ENXIO for an invalid
  * or disabled cpu, EPERM when a modifying command is issued on a
  * descriptor opened without FWRITE, EINVAL for an unknown command, and
  * otherwise the result of the privilege check or of the per-command
  * worker.
  */
 int
 cpuctl_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
     int flags, struct thread *td)
 {
 	int cpu, ret;
 
 	cpu = dev2unit(dev);
 	if (cpu > mp_maxid || !cpu_enabled(cpu)) {
 		DPRINTF("[cpuctl,%d]: bad cpu number %d\n", __LINE__, cpu);
 		return (ENXIO);
 	}
 
 	/* Require write flag for "write" requests. */
 	switch (cmd) {
 	case CPUCTL_MSRCBIT:
 	case CPUCTL_MSRSBIT:
 	case CPUCTL_UPDATE:
 	case CPUCTL_WRMSR:
 	case CPUCTL_EVAL_CPU_FEATURES:
 		if ((flags & FWRITE) == 0)
 			return (EPERM);
 		break;
 	default:
 		break;
 	}
 
 	switch (cmd) {
 	case CPUCTL_RDMSR:
 		ret = cpuctl_do_msr(cpu, (cpuctl_msr_args_t *)data, cmd, td);
 		break;
 	case CPUCTL_MSRSBIT:
 	case CPUCTL_MSRCBIT:
 	case CPUCTL_WRMSR:
 		/* MSR modification needs PRIV_CPUCTL_WRMSR. */
 		ret = priv_check(td, PRIV_CPUCTL_WRMSR);
 		if (ret == 0)
 			ret = cpuctl_do_msr(cpu, (cpuctl_msr_args_t *)data,
 			    cmd, td);
 		break;
 	case CPUCTL_CPUID:
 		ret = cpuctl_do_cpuid(cpu, (cpuctl_cpuid_args_t *)data, td);
 		break;
 	case CPUCTL_UPDATE:
 		/* Microcode update needs PRIV_CPUCTL_UPDATE. */
 		ret = priv_check(td, PRIV_CPUCTL_UPDATE);
 		if (ret == 0)
 			ret = cpuctl_do_update(cpu,
 			    (cpuctl_update_args_t *)data, td);
 		break;
 	case CPUCTL_CPUID_COUNT:
 		ret = cpuctl_do_cpuid_count(cpu,
 		    (cpuctl_cpuid_count_args_t *)data, td);
 		break;
 	case CPUCTL_EVAL_CPU_FEATURES:
 		ret = cpuctl_do_eval_cpu_features(cpu, td);
 		break;
 	default:
 		ret = EINVAL;
 		break;
 	}
 	return (ret);
 }
 
 /*
  * Run cpuid_count() on the requested cpu.
  *
  * The calling thread is temporarily bound to 'cpu', cpuid_count() is
  * invoked with data->level and data->level_type, and the result is
  * stored in data->data.  The previous binding is restored afterwards.
  * Returns 0, or ENODEV on i386 when cpu_id is 0.
  */
 static int
 cpuctl_do_cpuid_count(int cpu, cpuctl_cpuid_count_args_t *data,
     struct thread *td)
 {
 	int prev_cpu, was_bound;
 
 	KASSERT(cpu >= 0 && cpu <= mp_maxid,
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 
 	/* Explicitly clear cpuid data to avoid returning stale info. */
 	memset(data->data, 0, sizeof(data->data));
 	DPRINTF("[cpuctl,%d]: retrieving cpuid lev %#0x type %#0x for %d cpu\n",
 	    __LINE__, data->level, data->level_type, cpu);
 #ifdef __i386__
 	if (cpu_id == 0)
 		return (ENODEV);
 #endif
 	/* Remember where we were so the binding can be put back. */
 	prev_cpu = td->td_oncpu;
 	was_bound = cpu_sched_is_bound(td);
 
 	set_cpu(cpu, td);
 	cpuid_count(data->level, data->level_type, data->data);
 	restore_cpu(prev_cpu, was_bound, td);
 
 	return (0);
 }
 
 /*
  * CPUCTL_CPUID worker: the same as cpuctl_do_cpuid_count() with the
  * level type forced to 0.  data->data is filled from the helper's
  * result buffer whether or not the helper reported an error.
  */
 static int
 cpuctl_do_cpuid(int cpu, cpuctl_cpuid_args_t *data, struct thread *td)
 {
 	cpuctl_cpuid_count_args_t req;
 	int ret;
 
 	req.level = data->level;
 	/* Override the level type. */
 	req.level_type = 0;
 	ret = cpuctl_do_cpuid_count(cpu, &req, td);
 
 	/* Ignore error: the result buffer is copied out unconditionally. */
 	memcpy(data->data, req.data, sizeof(data->data));
 	return (ret);
 }
 
 /*
  * Actually perform MSR operations.
  *
  * Binds the calling thread to 'cpu', performs the operation selected by
  * 'cmd' on data->msr and restores the previous binding:
  *   CPUCTL_RDMSR   - read the MSR into data->data;
  *   CPUCTL_WRMSR   - write data->data to the MSR;
  *   CPUCTL_MSRSBIT - set the bits of data->data in the MSR;
  *   CPUCTL_MSRCBIT - clear the bits of data->data in the MSR.
  * Any other command panics.  Returns the status of the rdmsr_safe() /
  * wrmsr_safe() call, or ENODEV on i386 when CPUID_MSR is not set in
  * cpu_feature.
  */
 static int
 cpuctl_do_msr(int cpu, cpuctl_msr_args_t *data, u_long cmd, struct thread *td)
 {
 	uint64_t reg;
 	int is_bound = 0;
 	int oldcpu;
 	int ret;
 
 	KASSERT(cpu >= 0 && cpu <= mp_maxid,
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 
 	/*
 	 * Nothing is cleared at this point; for reads, data->data is
 	 * zeroed just before rdmsr_safe() below.
 	 */
 	DPRINTF("[cpuctl,%d]: operating on MSR %#0x for %d cpu\n", __LINE__,
 	    data->msr, cpu);
 #ifdef __i386__
 	if ((cpu_feature & CPUID_MSR) == 0)
 		return (ENODEV);
 #endif
 	oldcpu = td->td_oncpu;
 	is_bound = cpu_sched_is_bound(td);
 	set_cpu(cpu, td);
 	if (cmd == CPUCTL_RDMSR) {
 		/* Start from zero so no stale caller value is returned. */
 		data->data = 0;
 		ret = rdmsr_safe(data->msr, &data->data);
 	} else if (cmd == CPUCTL_WRMSR) {
 		ret = wrmsr_safe(data->msr, data->data);
 	} else if (cmd == CPUCTL_MSRSBIT) {
 		/* Read-modify-write inside a critical section. */
 		critical_enter();
 		ret = rdmsr_safe(data->msr, &reg);
 		if (ret == 0)
 			ret = wrmsr_safe(data->msr, reg | data->data);
 		critical_exit();
 	} else if (cmd == CPUCTL_MSRCBIT) {
 		/* Read-modify-write inside a critical section. */
 		critical_enter();
 		ret = rdmsr_safe(data->msr, &reg);
 		if (ret == 0)
 			ret = wrmsr_safe(data->msr, reg & ~data->data);
 		critical_exit();
 	} else
 		panic("[cpuctl,%d]: unknown operation requested: %lu",
 		    __LINE__, cmd);
 	restore_cpu(oldcpu, is_bound, td);
 	return (ret);
 }
 
 /*
  * Actually perform microcode update.
  *
  * Queries CPUID level 0 on the target cpu to obtain the vendor string
  * and dispatches to the matching vendor-specific update routine.
  * Returns the error from the CPUID query or from the update routine,
  * or ENXIO when the vendor is not one of the supported ones.
  */
 static int
 cpuctl_do_update(int cpu, cpuctl_update_args_t *data, struct thread *td)
 {
 	/* Order of the CPUID level 0 result words in the vendor string. */
 	static const int vendor_word[3] = { 1, 3, 2 };
 	cpuctl_cpuid_args_t args = {
 		.level = 0,
 	};
 	char vendor[13];
 	uint32_t word;
 	int i, ret;
 
 	KASSERT(cpu >= 0 && cpu <= mp_maxid,
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 	DPRINTF("[cpuctl,%d]: XXX %d", __LINE__, cpu);
 
 	ret = cpuctl_do_cpuid(cpu, &args, td);
 	if (ret != 0)
 		return (ret);
 
 	/*
 	 * Assemble the 12 character vendor string.  The words are copied
 	 * with memcpy() rather than stored through a (uint32_t *) cast of
 	 * the char array, which avoids depending on the alignment of
 	 * 'vendor' and on type-punned stores.
 	 */
 	for (i = 0; i < 3; i++) {
 		word = args.data[vendor_word[i]];
 		memcpy(&vendor[i * sizeof(word)], &word, sizeof(word));
 	}
 	vendor[12] = '\0';
 
 	/* sizeof() includes the terminating NUL, so these are exact matches. */
 	if (strncmp(vendor, INTEL_VENDOR_ID, sizeof(INTEL_VENDOR_ID)) == 0)
 		ret = update_intel(cpu, data, td);
 	else if (strncmp(vendor, AMD_VENDOR_ID, sizeof(AMD_VENDOR_ID)) == 0)
 		ret = update_amd(cpu, data, td);
 	else if (strncmp(vendor, CENTAUR_VENDOR_ID,
 	    sizeof(CENTAUR_VENDOR_ID)) == 0)
 		ret = update_via(cpu, data, td);
 	else
 		ret = ENXIO;
 	return (ret);
 }
 
 /*
  * Argument passed to ucode_intel_load_rv() through smp_rendezvous().
  */
 struct ucode_update_data {
 	void *ptr;	/* image handed to ucode_intel_load() */
 	int cpu;	/* id of the cpu that performs the load */
 	int ret;	/* ucode_intel_load() result, set by that cpu */
 };
 
 /*
  * Rendezvous action: only the cpu named in the argument calls
  * ucode_intel_load() and records its result; every other cpu returns
  * without doing anything.
  */
 static void
 ucode_intel_load_rv(void *arg)
 {
 	struct ucode_update_data *upd = arg;
 
 	if (PCPU_GET(cpuid) != upd->cpu)
 		return;
 	upd->ret = ucode_intel_load(upd->ptr, true, NULL, NULL);
 }
 
 /*
  * Intel microcode update.
  *
  * Copies the user supplied image (args->data, args->size bytes) into a
  * kernel buffer and loads it on 'cpu' from within an smp_rendezvous().
  * Returns EINVAL for an empty or oversized image, EFAULT when copyin()
  * fails, and otherwise the result of ucode_intel_load().
  */
 static int
 update_intel(int cpu, cpuctl_update_args_t *args, struct thread *td)
 {
 	struct ucode_update_data d;
 	void *ptr;
 	int is_bound, oldcpu, ret;
 
 	if (args->size == 0 || args->data == NULL) {
 		DPRINTF("[cpuctl,%d]: zero-sized firmware image", __LINE__);
 		return (EINVAL);
 	}
 	if (args->size > UCODE_SIZE_MAX) {
 		DPRINTF("[cpuctl,%d]: firmware image too large", __LINE__);
 		return (EINVAL);
 	}
 
 	/*
 	 * 16 byte alignment required.  Rely on the fact that
 	 * malloc(9) always returns the pointer aligned at least on
 	 * the size of the allocation.
 	 */
 	ptr = malloc(args->size + 16, M_CPUCTL, M_WAITOK);
 	if (copyin(args->data, ptr, args->size) != 0) {
 		DPRINTF("[cpuctl,%d]: copyin %p->%p of %zd bytes failed",
 		    __LINE__, args->data, ptr, args->size);
 		ret = EFAULT;
 		goto out;
 	}
 	oldcpu = td->td_oncpu;
 	is_bound = cpu_sched_is_bound(td);
 	set_cpu(cpu, td);
 	d.ptr = ptr;
 	d.cpu = cpu;
 	/* d.ret is filled in by ucode_intel_load_rv() on the target cpu. */
 	smp_rendezvous(NULL, ucode_intel_load_rv, NULL, &d);
 	restore_cpu(oldcpu, is_bound, td);
 	ret = d.ret;
 
 	/*
 	 * Replace any existing update.  This ensures that the new update
 	 * will be reloaded automatically during ACPI resume.
 	 *
 	 * NOTE(review): ucode_update() presumably keeps 'ptr' and hands
 	 * back the buffer that is to be freed below -- confirm against
 	 * its definition.
 	 */
 	if (ret == 0)
 		ptr = ucode_update(ptr);
 
 out:
 	free(ptr, M_CPUCTL);
 	return (ret);
 }
 
 /*
  * NB: MSR 0xc0010020, MSR_K8_UCODE_UPDATE, is not documented by AMD.
  * Coreboot, illumos and Linux source code was used to understand
  * its workings.
  *
  * Rendezvous action: writes the address of the microcode image to
  * MSR_K8_UCODE_UPDATE and then executes CPUID level 0.  The status of
  * wrmsr_safe() and the CPUID output are both discarded.
  */
 static void
 amd_ucode_wrmsr(void *ucode_ptr)
 {
 	uint32_t tmp[4];
 
 	wrmsr_safe(MSR_K8_UCODE_UPDATE, (uintptr_t)ucode_ptr);
 	do_cpuid(0, tmp);
 }
 
 /*
  * AMD microcode update.
  *
  * Copies the user supplied image into a zeroed kernel buffer and runs
  * amd_ucode_wrmsr() on it through smp_rendezvous().  The 'cpu' and 'td'
  * arguments are not used.  Returns EINVAL for an empty or oversized
  * image, EFAULT when copyin() fails and 0 otherwise; the outcome of the
  * MSR write itself is not reported.
  */
 static int
 update_amd(int cpu, cpuctl_update_args_t *args, struct thread *td)
 {
 	void *ptr;
 	int ret;
 
 	if (args->size == 0 || args->data == NULL) {
 		DPRINTF("[cpuctl,%d]: zero-sized firmware image", __LINE__);
 		return (EINVAL);
 	}
 	if (args->size > UCODE_SIZE_MAX) {
 		DPRINTF("[cpuctl,%d]: firmware image too large", __LINE__);
 		return (EINVAL);
 	}
 
 	/*
 	 * 16 byte alignment required.  Rely on the fact that
 	 * malloc(9) always returns the pointer aligned at least on
 	 * the size of the allocation.
 	 */
 	ptr = malloc(args->size + 16, M_CPUCTL, M_ZERO | M_WAITOK);
 	if (copyin(args->data, ptr, args->size) != 0) {
 		DPRINTF("[cpuctl,%d]: copyin %p->%p of %zd bytes failed",
 		    __LINE__, args->data, ptr, args->size);
 		ret = EFAULT;
 		goto fail;
 	}
 	smp_rendezvous(NULL, amd_ucode_wrmsr, NULL, ptr);
 	ret = 0;
 fail:
 	/* Reached on success as well: the buffer is always released. */
 	free(ptr, M_CPUCTL);
 	return (ret);
 }
 
 /*
  * VIA microcode update.
  *
  * Copies the user supplied image into a kernel buffer, binds to 'cpu',
  * triggers the update through MSR_BIOS_UPDT_TRIG and then reads the
  * update status from MSR 0x1205.  Returns EINVAL for an empty or
  * oversized image, EFAULT when copyin() fails, 0 when the status byte
  * reports success (0x01) and EINVAL otherwise.
  */
 static int
 update_via(int cpu, cpuctl_update_args_t *args, struct thread *td)
 {
 	void *ptr;
 	/*
 	 * The rdmsr_safe() results below are not checked, so start from
 	 * zero: if a read fails the values stay well defined, and a zero
 	 * status is treated as "update failed" rather than an
 	 * uninitialized value deciding the outcome.
 	 */
 	uint64_t rev0 = 0, rev1 = 0, res = 0;
 	uint32_t tmp[4];
 	int is_bound;
 	int oldcpu;
 	int ret;
 
 	if (args->size == 0 || args->data == NULL) {
 		DPRINTF("[cpuctl,%d]: zero-sized firmware image", __LINE__);
 		return (EINVAL);
 	}
 	if (args->size > UCODE_SIZE_MAX) {
 		DPRINTF("[cpuctl,%d]: firmware image too large", __LINE__);
 		return (EINVAL);
 	}
 
 	/*
 	 * 4 byte alignment required.
 	 */
 	ptr = malloc(args->size, M_CPUCTL, M_WAITOK);
 	if (copyin(args->data, ptr, args->size) != 0) {
 		DPRINTF("[cpuctl,%d]: copyin %p->%p of %zd bytes failed",
 		    __LINE__, args->data, ptr, args->size);
 		ret = EFAULT;
 		goto fail;
 	}
 	oldcpu = td->td_oncpu;
 	is_bound = cpu_sched_is_bound(td);
 	set_cpu(cpu, td);
 	critical_enter();
 	rdmsr_safe(MSR_BIOS_SIGN, &rev0); /* Get current microcode revision. */
 
 	/*
 	 * Perform update.
 	 */
 	wrmsr_safe(MSR_BIOS_UPDT_TRIG, (uintptr_t)(ptr));
 	do_cpuid(1, tmp);
 
 	/*
 	 * Result are in low byte of MSR FCR5:
 	 * 0x00: No update has been attempted since RESET.
 	 * 0x01: The last attempted update was successful.
 	 * 0x02: The last attempted update was unsuccessful due to a bad
 	 *       environment. No update was loaded and any preexisting
 	 *       patches are still active.
 	 * 0x03: The last attempted update was not applicable to this processor.
 	 *       No update was loaded and any preexisting patches are still
 	 *       active.
 	 * 0x04: The last attempted update was not successful due to an invalid
 	 *       update data block. No update was loaded and any preexisting
 	 *       patches are still active
 	 */
 	rdmsr_safe(0x1205, &res);
 	res &= 0xff;
 	critical_exit();
 	rdmsr_safe(MSR_BIOS_SIGN, &rev1); /* Get new microcode revision. */
 	restore_cpu(oldcpu, is_bound, td);
 
 	/* rev0 and rev1 are informational only; just 'res' decides. */
 	DPRINTF("[cpu,%d]: rev0=%x rev1=%x res=%x\n", __LINE__,
 	    (unsigned)(rev0 >> 32), (unsigned)(rev1 >> 32), (unsigned)res);
 
 	if (res != 0x01)
 		ret = EINVAL;
 	else
 		ret = 0;
 fail:
 	/* Reached on success as well: the buffer is always released. */
 	free(ptr, M_CPUCTL);
 	return (ret);
 }
 
 /*
  * CPUCTL_EVAL_CPU_FEATURES worker.
  *
  * Re-runs identify_cpu1() and identify_cpu2() while bound to 'cpu',
  * then, with the previous binding restored, calls the mitigation
  * *_recalculate() routines listed below and printcpuinfo().  Returns
  * 0, or ENODEV on i386 when cpu_id is 0.
  */
 static int
 cpuctl_do_eval_cpu_features(int cpu, struct thread *td)
 {
 	int is_bound = 0;
 	int oldcpu;
 
 	KASSERT(cpu >= 0 && cpu <= mp_maxid,
 	    ("[cpuctl,%d]: bad cpu number %d", __LINE__, cpu));
 
 #ifdef __i386__
 	if (cpu_id == 0)
 		return (ENODEV);
 #endif
 	oldcpu = td->td_oncpu;
 	is_bound = cpu_sched_is_bound(td);
 	set_cpu(cpu, td);
 	identify_cpu1();
 	identify_cpu2();
 	restore_cpu(oldcpu, is_bound, td);
 	/* The calls below run after the original binding is restored. */
 	hw_ibrs_recalculate(true);
 	hw_ssb_recalculate(true);
 #ifdef __amd64__
 	amd64_syscall_ret_flush_l1d_recalc();
 	pmap_allow_2m_x_ept_recalculate();
 #endif
 	hw_mds_recalculate();
 	x86_taa_recalculate();
+	x86_rngds_mitg_recalculate(true);
 	printcpuinfo();
 	return (0);
 }
 
 
 /*
  * open handler for the cpuctl devices.
  *
  * Fails with ENXIO when the unit number does not name a valid, enabled
  * cpu.  Opening for writing additionally returns the result of
  * securelevel_gt(td->td_ucred, 0); any other open succeeds with 0.
  */
 int
 cpuctl_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
 {
 	int cpu;
 
 	cpu = dev2unit(dev);
 	if (cpu > mp_maxid || !cpu_enabled(cpu)) {
 		DPRINTF("[cpuctl,%d]: incorrect cpu number %d\n", __LINE__,
 		    cpu);
 		return (ENXIO);
 	}
 
 	if ((flags & FWRITE) == 0)
 		return (0);
 	return (securelevel_gt(td->td_ucred, 0));
 }
 
 /*
  * Module event handler.
  *
  * MOD_LOAD allocates the zeroed cpuctl_devs array (mp_maxid + 1 slots)
  * and creates one "cpuctl%d" device per enabled cpu; MOD_UNLOAD destroys
  * the devices that were created and frees the array; MOD_SHUTDOWN does
  * nothing.  Any other event type yields EOPNOTSUPP.
  */
 static int
 cpuctl_modevent(module_t mod __unused, int type, void *data __unused)
 {
 	int cpu;
 
 	switch (type) {
 	case MOD_LOAD:
 		if (bootverbose)
 			printf("cpuctl: access to MSR registers/cpuid info.\n");
 		cpuctl_devs = malloc(sizeof(*cpuctl_devs) * (mp_maxid + 1),
 		    M_CPUCTL, M_WAITOK | M_ZERO);
 		CPU_FOREACH(cpu) {
 			if (!cpu_enabled(cpu))
 				continue;
 			cpuctl_devs[cpu] = make_dev(&cpuctl_cdevsw, cpu,
 			    UID_ROOT, GID_KMEM, 0640, "cpuctl%d", cpu);
 		}
 		break;
 	case MOD_UNLOAD:
 		CPU_FOREACH(cpu) {
 			/* Slots of cpus that got no device are still NULL. */
 			if (cpuctl_devs[cpu] == NULL)
 				continue;
 			destroy_dev(cpuctl_devs[cpu]);
 		}
 		free(cpuctl_devs, M_CPUCTL);
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 DEV_MODULE(cpuctl, cpuctl_modevent, NULL);
 MODULE_VERSION(cpuctl, CPUCTL_VERSION);
diff --git a/sys/x86/include/x86_var.h b/sys/x86/include/x86_var.h
index b064f2f77c2e..3b908068bff3 100644
--- a/sys/x86/include/x86_var.h
+++ b/sys/x86/include/x86_var.h
@@ -1,160 +1,162 @@
 /*-
  * Copyright (c) 1995 Bruce D. Evans.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _X86_X86_VAR_H_
 #define	_X86_X86_VAR_H_
 
 /*
  * Miscellaneous machine-dependent declarations.
  */
 
 extern	long	Maxmem;
 extern	u_int	basemem;
 extern	int	busdma_swi_pending;
 extern	u_int	cpu_exthigh;
 extern	u_int	cpu_feature;
 extern	u_int	cpu_feature2;
 extern	u_int	amd_feature;
 extern	u_int	amd_feature2;
 extern	u_int	amd_rascap;
 extern	u_int	amd_pminfo;
 extern	u_int	amd_extended_feature_extensions;
 extern	u_int	via_feature_rng;
 extern	u_int	via_feature_xcrypt;
 extern	u_int	cpu_clflush_line_size;
 extern	u_int	cpu_stdext_feature;
 extern	u_int	cpu_stdext_feature2;
 extern	u_int	cpu_stdext_feature3;
 extern	uint64_t cpu_ia32_arch_caps;
 extern	u_int	cpu_fxsr;
 extern	u_int	cpu_high;
 extern	u_int	cpu_id;
 extern	u_int	cpu_max_ext_state_size;
 extern	u_int	cpu_mxcsr_mask;
 extern	u_int	cpu_procinfo;
 extern	u_int	cpu_procinfo2;
 extern	char	cpu_vendor[];
 extern	u_int	cpu_vendor_id;
 extern	u_int	cpu_mon_mwait_flags;
 extern	u_int	cpu_mon_min_size;
 extern	u_int	cpu_mon_max_size;
 extern	u_int	cpu_maxphyaddr;
 extern	u_int	cpu_power_eax;
 extern	u_int	cpu_power_ebx;
 extern	u_int	cpu_power_ecx;
 extern	u_int	cpu_power_edx;
 extern	char	ctx_switch_xsave[];
 extern	u_int	hv_base;
 extern	u_int	hv_high;
 extern	char	hv_vendor[];
 extern	char	kstack[];
 extern	char	sigcode[];
 extern	int	szsigcode;
 extern	int	vm_page_dump_size;
 extern	int	workaround_erratum383;
 extern	int	_udatasel;
 extern	int	_ucodesel;
 extern	int	_ucode32sel;
 extern	int	_ufssel;
 extern	int	_ugssel;
 extern	int	use_xsave;
 extern	uint64_t xsave_mask;
 extern	u_int	max_apic_id;
 extern	int	i386_read_exec;
 extern	int	pti;
 extern	int	hw_ibrs_ibpb_active;
 extern	int	hw_mds_disable;
 extern	int	hw_ssb_active;
 extern	int	x86_taa_enable;
 extern	int	cpu_flush_rsb_ctxsw;
+extern	int	x86_rngds_mitg_enable;
 
 struct	pcb;
 struct	thread;
 struct	reg;
 struct	fpreg;
 struct  dbreg;
 struct	dumperinfo;
 struct	trapframe;
 
 /*
  * The interface type of the interrupt handler entry point cannot be
  * expressed in C.  Use simplest non-variadic function type as an
  * approximation.
  */
 typedef void alias_for_inthand_t(void);
 
 bool	acpi_get_fadt_bootflags(uint16_t *flagsp);
 void	*alloc_fpusave(int flags);
 void	busdma_swi(void);
 vm_paddr_t cpu_getmaxphyaddr(void);
 bool	cpu_mwait_usable(void);
 void	cpu_probe_amdc1e(void);
 void	cpu_setregs(void);
 bool	disable_wp(void);
 void	restore_wp(bool old_wp);
 void	dump_add_page(vm_paddr_t);
 void	dump_drop_page(vm_paddr_t);
 void	finishidentcpu(void);
 void	identify_cpu1(void);
 void	identify_cpu2(void);
 void	identify_cpu_fixup_bsp(void);
 void	identify_hypervisor(void);
 void	initializecpu(void);
 void	initializecpucache(void);
 bool	fix_cpuid(void);
 void	fillw(int /*u_short*/ pat, void *base, size_t cnt);
 int	is_physical_memory(vm_paddr_t addr);
 int	isa_nmi(int cd);
 void	handle_ibrs_entry(void);
 void	handle_ibrs_exit(void);
 void	hw_ibrs_recalculate(bool all_cpus);
 void	hw_mds_recalculate(void);
 void	hw_ssb_recalculate(bool all_cpus);
 void	x86_taa_recalculate(void);
+void	x86_rngds_mitg_recalculate(bool all_cpus);
 void	nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame);
 void	nmi_call_kdb_smp(u_int type, struct trapframe *frame);
 void	nmi_handle_intr(u_int type, struct trapframe *frame);
 void	pagecopy(void *from, void *to);
 void	printcpuinfo(void);
 int	pti_get_default(void);
 int	user_dbreg_trap(register_t dr6);
 int	minidumpsys(struct dumperinfo *);
 struct pcb *get_pcb_td(struct thread *td);
 
 /*
  * x86_msr_op() 'op' argument: one operation code or'ed with one
  * execution mode.
  */
 #define	MSR_OP_ANDNOT		0x00000001	/* msr &= ~arg1 */
 #define	MSR_OP_OR		0x00000002	/* msr |= arg1 */
 #define	MSR_OP_WRITE		0x00000003	/* msr = arg1 */
 #define	MSR_OP_LOCAL		0x10000000	/* on the current CPU only */
 #define	MSR_OP_SCHED		0x20000000	/* bind to each CPU in turn */
 #define	MSR_OP_RENDEZVOUS	0x30000000	/* via smp_rendezvous() */
 void x86_msr_op(u_int msr, u_int op, uint64_t arg1);
 
 #endif
diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c
index 33dd19d28fbd..74a5261f9112 100644
--- a/sys/x86/x86/cpu_machdep.c
+++ b/sys/x86/x86/cpu_machdep.c
@@ -1,1449 +1,1503 @@
 /*-
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include "opt_atpic.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_isa.h"
 #include "opt_kdb.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_platform.h"
 #ifdef __i386__
 #include "opt_apic.h"
 #endif
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cpu.h>
 #include <sys/domainset.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/specialreg.h>
 #include <machine/md_var.h>
 #include <machine/mp_watchdog.h>
 #include <machine/tss.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #ifdef CPU_ELAN
 #include <machine/elan_mmcr.h>
 #endif
 #include <x86/acpica_machdep.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #include <isa/isareg.h>
 
 #include <contrib/dev/acpica/include/acpi.h>
 
 #define	STATE_RUNNING	0x0
 #define	STATE_MWAIT	0x1
 #define	STATE_SLEEPING	0x2
 
 #ifdef SMP
 static u_int	cpu_reset_proxyid;
 static volatile u_int	cpu_reset_proxy_active;
 #endif
 
 /*
  * Request handed to x86_msr_op_one().
  */
 struct msr_op_arg {
 	u_int msr;	/* MSR number */
 	int op;		/* MSR_OP_ANDNOT, MSR_OP_OR or MSR_OP_WRITE */
 	uint64_t arg1;	/* mask or value, depending on op */
 };
 
 /*
  * Apply one MSR operation on the CPU this function runs on.  'argp'
  * points to a struct msr_op_arg:
  *   MSR_OP_ANDNOT - clear the bits of arg1 in the MSR;
  *   MSR_OP_OR     - set the bits of arg1 in the MSR;
  *   MSR_OP_WRITE  - store arg1 in the MSR.
  * Any other op value is silently ignored.
  */
 static void
 x86_msr_op_one(void *argp)
 {
 	struct msr_op_arg *req = argp;
 	uint64_t val;
 
 	if (req->op == MSR_OP_ANDNOT) {
 		val = rdmsr(req->msr);
 		val &= ~req->arg1;
 		wrmsr(req->msr, val);
 	} else if (req->op == MSR_OP_OR) {
 		val = rdmsr(req->msr);
 		val |= req->arg1;
 		wrmsr(req->msr, val);
 	} else if (req->op == MSR_OP_WRITE) {
 		wrmsr(req->msr, req->arg1);
 	}
 }
 
 #define	MSR_OP_EXMODE_MASK	0xf0000000
 #define	MSR_OP_OP_MASK		0x000000ff
 
 /*
  * Perform an MSR operation on one or all CPUs.
  *
  * 'op' combines an operation (MSR_OP_ANDNOT, MSR_OP_OR, MSR_OP_WRITE)
  * with an execution mode:
  *   MSR_OP_LOCAL      - run on the current CPU only;
  *   MSR_OP_SCHED      - bind the current thread to every CPU in turn and
  *                       run there, then restore the previous binding;
  *   MSR_OP_RENDEZVOUS - run through smp_rendezvous().
  * 'arg1' is the mask or value used by the operation.
  */
 void
 x86_msr_op(u_int msr, u_int op, uint64_t arg1)
 {
 	struct thread *td;
 	struct msr_op_arg a;
 	u_int exmode;
 	int bound_cpu, i, is_bound;
 
 	/* Split 'op' into operation and execution mode; validate both. */
 	a.op = op & MSR_OP_OP_MASK;
 	MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR ||
 	    a.op == MSR_OP_WRITE);
 	exmode = op & MSR_OP_EXMODE_MASK;
 	MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED ||
 	    exmode == MSR_OP_RENDEZVOUS);
 	a.msr = msr;
 	a.arg1 = arg1;
 	switch (exmode) {
 	case MSR_OP_LOCAL:
 		x86_msr_op_one(&a);
 		break;
 	case MSR_OP_SCHED:
 		td = curthread;
 		/* The thread lock stays held across the whole walk. */
 		thread_lock(td);
 		is_bound = sched_is_bound(td);
 		bound_cpu = td->td_oncpu;
 		CPU_FOREACH(i) {
 			sched_bind(td, i);
 			x86_msr_op_one(&a);
 		}
 		/* Put the binding back the way it was found. */
 		if (is_bound)
 			sched_bind(td, bound_cpu);
 		else
 			sched_unbind(td);
 		thread_unlock(td);
 		break;
 	case MSR_OP_RENDEZVOUS:
 		smp_rendezvous(NULL, x86_msr_op_one, NULL, &a);
 		break;
 	}
 }
 
 /*
  * Automatically initialized per CPU errata in cpu_idle_tun below.
  */
 bool mwait_cpustop_broken = false;
 SYSCTL_BOOL(_machdep, OID_AUTO, mwait_cpustop_broken, CTLFLAG_RDTUN,
     &mwait_cpustop_broken, 0,
     "Can not reliably wake MONITOR/MWAIT cpus without interrupts");
 
 /*
  * Machine dependent boot() routine
  *
  * I haven't seen anything to put here yet
  * Possibly some stuff might be grafted back here from boot()
  *
  * Currently an empty function; 'howto' is not used.
  */
 void
 cpu_boot(int howto)
 {
 }
 
 /*
  * Flush the D-cache for non-DMA I/O so that the I-cache can
  * be made coherent later.
  *
  * This implementation does nothing; 'ptr' and 'len' are not used.
  */
 void
 cpu_flush_dcache(void *ptr, size_t len)
 {
 	/* Not applicable */
 }
 
 /*
  * Executes the instruction pair "sti; hlt": interrupts are enabled and
  * the CPU is halted.
  */
 void
 acpi_cpu_c1(void)
 {
 
 	__asm __volatile("sti; hlt");
 }
 
 /*
  * Use mwait to pause execution while waiting for an interrupt or
  * another thread to signal that there is more work.
  *
  * NOTE: Interrupts will cause a wakeup; however, this function does
  * not enable interrupt handling. The caller is responsible to enable
  * interrupts.
  *
  * On entry the per-CPU monitorbuf idle_state must be STATE_SLEEPING;
  * it is set to STATE_MWAIT for the duration of the wait and to
  * STATE_RUNNING before returning.  'mwait_hint' is passed through to
  * cpu_mwait().
  */
 void
 acpi_cpu_idle_mwait(uint32_t mwait_hint)
 {
 	int *state;
 	uint64_t v;
 
 	/*
 	 * A comment in Linux patch claims that 'CPUs run faster with
 	 * speculation protection disabled. All CPU threads in a core
 	 * must disable speculation protection for it to be
 	 * disabled. Disable it while we are idle so the other
 	 * hyperthread can run fast.'
 	 *
 	 * XXXKIB.  Software coordination mode should be supported,
 	 * but all Intel CPUs provide hardware coordination.
 	 */
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	KASSERT(atomic_load_int(state) == STATE_SLEEPING,
 	    ("cpu_mwait_cx: wrong monitorbuf state"));
 	atomic_store_int(state, STATE_MWAIT);
 	/*
 	 * Save MSR_IA32_SPEC_CTRL in 'v' and clear IBRS, STIBP and SSBD
 	 * for the idle period; 'v' stays 0 when nothing was changed.
 	 */
 	if (PCPU_GET(ibpb_set) || hw_ssb_active) {
 		v = rdmsr(MSR_IA32_SPEC_CTRL);
 		wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
 		    IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
 	} else {
 		v = 0;
 	}
 	/*
 	 * Arm the monitor on the state word and enter mwait only if the
 	 * state was not changed in the meantime.
 	 */
 	cpu_monitor(state, 0, 0);
 	if (atomic_load_int(state) == STATE_MWAIT)
 		cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
 
 	/*
 	 * SSB cannot be disabled while we sleep, or rather, if it was
 	 * disabled, the sysctl thread will bind to our cpu to tweak
 	 * MSR.
 	 */
 	if (v != 0)
 		wrmsr(MSR_IA32_SPEC_CTRL, v);
 
 	/*
 	 * We should exit on any event that interrupts mwait, because
 	 * that event might be a wanted interrupt.
 	 */
 	atomic_store_int(state, STATE_RUNNING);
 }
 
 /*
  * Get current clock frequency for the given cpu id.
  *
  * The estimate is obtained by timing a DELAY(1000) with the TSC; when
  * the TSC is invariant the result is scaled by the APERF/MPERF ratio
  * measured over the same interval.  On success the value is stored in
  * *rate and 0 is returned.
  *
  * Returns EINVAL when cpu_id does not name a CPU or rate is NULL, and
  * EOPNOTSUPP when the estimate cannot be made: no TSC (i386), an
  * invariant TSC without APERF/MPERF support, or an MPERF counter that
  * did not advance during the measurement.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	uint64_t tsc1, tsc2;
 	uint64_t acnt, mcnt, perf;
 	register_t reg;
 	int error;
 
 	if (pcpu_find(cpu_id) == NULL || rate == NULL)
 		return (EINVAL);
 #ifdef __i386__
 	if ((cpu_feature & CPUID_TSC) == 0)
 		return (EOPNOTSUPP);
 #endif
 
 	/*
 	 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
 	 * DELAY(9) based logic fails.
 	 */
 	if (tsc_is_invariant && !tsc_perf_stat)
 		return (EOPNOTSUPP);
 
 #ifdef SMP
 	if (smp_cpus > 1) {
 		/* Schedule ourselves on the indicated cpu. */
 		thread_lock(curthread);
 		sched_bind(curthread, cpu_id);
 		thread_unlock(curthread);
 	}
 #endif
 
 	error = 0;
 
 	/* Calibrate by measuring a short delay. */
 	reg = intr_disable();
 	if (tsc_is_invariant) {
 		wrmsr(MSR_MPERF, 0);
 		wrmsr(MSR_APERF, 0);
 		tsc1 = rdtsc();
 		DELAY(1000);
 		mcnt = rdmsr(MSR_MPERF);
 		acnt = rdmsr(MSR_APERF);
 		tsc2 = rdtsc();
 		intr_restore(reg);
 		if (mcnt == 0) {
 			/*
 			 * MPERF did not count, so the ratio is undefined.
 			 * Report failure instead of dividing by zero; the
 			 * thread is still unbound below.
 			 */
 			error = EOPNOTSUPP;
 		} else {
 			perf = 1000 * acnt / mcnt;
 			*rate = (tsc2 - tsc1) * perf;
 		}
 	} else {
 		tsc1 = rdtsc();
 		DELAY(1000);
 		tsc2 = rdtsc();
 		intr_restore(reg);
 		*rate = (tsc2 - tsc1) * 1000;
 	}
 
 #ifdef SMP
 	if (smp_cpus > 1) {
 		thread_lock(curthread);
 		sched_unbind(curthread);
 		thread_unlock(curthread);
 	}
 #endif
 
 	return (error);
 }
 
 /*
  * Shutdown the CPU as much as possible: call halt() over and over and
  * never return.
  */
 void
 cpu_halt(void)
 {
 
 	while (1)
 		halt();
 }
 
 /*
  * Reset the machine.  With interrupts disabled, several reset methods
  * are tried one after another, each followed by a delay to see whether
  * it took effect: platform specific resets (Elan, Geode), the keyboard
  * controller, the Reset Control register at port 0xcf9 and the Fast
  * A20/Init register at port 0x92.  If all of them fail, the IDT is
  * wiped and a breakpoint is executed.  Does not return.
  */
 static void
 cpu_reset_real(void)
 {
 	struct region_descriptor null_idt;
 	int b;
 
 	disable_intr();
 #ifdef CPU_ELAN
 	if (elan_mmcr != NULL)
 		elan_mmcr->RESCFG = 1;
 #endif
 #ifdef __i386__
 	if (cpu == CPU_GEODE1100) {
 		/* Attempt Geode's own reset */
 		outl(0xcf8, 0x80009044ul);
 		outl(0xcfc, 0xf);
 	}
 #endif
 #if !defined(BROKEN_KEYBOARD_RESET)
 	/*
 	 * Attempt to do a CPU reset via the keyboard controller,
 	 * do not turn off GateA20, as any machine that fails
 	 * to do the reset here would then end up in no man's land.
 	 */
 	outb(IO_KBD + 4, 0xFE);
 	DELAY(500000);	/* wait 0.5 sec to see if that did it */
 #endif
 
 	/*
 	 * Attempt to force a reset via the Reset Control register at
 	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
 	 * transitions from 0 to 1.  Bit 1 selects the type of reset
 	 * to attempt: 0 selects a "soft" reset, and 1 selects a
 	 * "hard" reset.  We try a "hard" reset.  The first write sets
 	 * bit 1 to select a "hard" reset and clears bit 2.  The
 	 * second write forces a 0 -> 1 transition in bit 2 to trigger
 	 * a reset.
 	 */
 	outb(0xcf9, 0x2);
 	outb(0xcf9, 0x6);
 	DELAY(500000);  /* wait 0.5 sec to see if that did it */
 
 	/*
 	 * Attempt to force a reset via the Fast A20 and Init register
 	 * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
 	 * Bit 0 asserts INIT# when set to 1.  We are careful to only
 	 * preserve bit 1 while setting bit 0.  We also must clear bit
 	 * 0 before setting it if it isn't already clear.
 	 */
 	b = inb(0x92);
 	/* A read of 0xff skips this method entirely. */
 	if (b != 0xff) {
 		if ((b & 0x1) != 0)
 			outb(0x92, b & 0xfe);
 		outb(0x92, b | 0x1);
 		DELAY(500000);  /* wait 0.5 sec to see if that did it */
 	}
 
 	printf("No known reset method worked, attempting CPU shutdown\n");
 	DELAY(1000000); /* wait 1 sec for printf to complete */
 
 	/* Wipe the IDT. */
 	null_idt.rd_limit = 0;
 	null_idt.rd_base = 0;
 	lidt(&null_idt);
 
 	/* "good night, sweet prince .... <THUNK!>" */
 	breakpoint();
 
 	/* NOTREACHED */
 	while(1);
 }
 
 #ifdef SMP
 /*
  * Restart function installed by cpu_reset() (via cpustop_restartfunc)
  * when the reset was requested on a CPU other than CPU 0.  It performs
  * a small handshake through cpu_reset_proxy_active: announce that it is
  * running by storing 1, wait until the requesting CPU changes the value,
  * then report which CPU was stopped and carry out the actual reset.
  */
 static void
 cpu_reset_proxy(void)
 {
 
 	cpu_reset_proxy_active = 1;
 	while (cpu_reset_proxy_active == 1)
 		ia32_pause(); /* Wait for other cpu to see that we've started */
 
 	printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
 	DELAY(1000000);
 	cpu_reset_real();
 }
 #endif
 
 /*
  * Reset the machine.
  *
  * On SMP kernels with the APs started, all other CPUs that are not
  * already stopped are stopped first.  If the caller is not running on
  * CPU 0, the reset is handed over to CPU 0: cpu_reset_proxy() is
  * installed as the restart function, CPU 0 is restarted, and this CPU
  * waits (for a bounded number of spins) for the proxy to announce
  * itself.  On success this CPU releases the proxy and spins forever;
  * on failure, or when already on CPU 0, it falls through and performs
  * the reset itself via cpu_reset_real().
  */
 void
 cpu_reset(void)
 {
 #ifdef SMP
 	struct monitorbuf *mb;
 	cpuset_t map;
 	u_int cnt;
 
 	if (smp_started) {
 		/* Every CPU except ourselves and the ones already stopped. */
 		map = all_cpus;
 		CPU_CLR(PCPU_GET(cpuid), &map);
 		CPU_ANDNOT(&map, &stopped_cpus);
 		if (!CPU_EMPTY(&map)) {
 			printf("cpu_reset: Stopping other CPUs\n");
 			stop_cpus(map);
 		}
 
 		if (PCPU_GET(cpuid) != 0) {
 			/* Publish our id and arm the proxy handshake. */
 			cpu_reset_proxyid = PCPU_GET(cpuid);
 			cpustop_restartfunc = cpu_reset_proxy;
 			cpu_reset_proxy_active = 0;
 			printf("cpu_reset: Restarting BSP\n");
 
 			/* Restart CPU #0. */
 			CPU_SETOF(0, &started_cpus);
 			mb = &pcpu_find(0)->pc_monitorbuf;
 			atomic_store_int(&mb->stop_state,
 			    MONITOR_STOPSTATE_RUNNING);
 
 			/* Bounded wait so a dead BSP cannot hang the reset. */
 			cnt = 0;
 			while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
 				ia32_pause();
 				cnt++;	/* Wait for BSP to announce restart */
 			}
 			if (cpu_reset_proxy_active == 0) {
 				printf("cpu_reset: Failed to restart BSP\n");
 			} else {
 				/* Release the proxy; it does the reset. */
 				cpu_reset_proxy_active = 2;
 				while (1)
 					ia32_pause();
 				/* NOTREACHED */
 			}
 		}
 
 		DELAY(1000000);
 	}
 #endif
 	cpu_reset_real();
 	/* NOTREACHED */
 }
 
 bool
 cpu_mwait_usable(void)
 {
 
 	return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
 	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
 	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
 }
 
 /*
  * State shared by the idle routines below.  cpu_idle_hook is consulted
  * by cpu_idle_acpi(), cpu_ident_amdc1e is set by cpu_probe_amdc1e(),
  * and idle_mwait (exported as machdep.idle_mwait) gates the MWAIT fast
  * path taken by cpu_idle() when called with busy != 0.
  */
 void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
 static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
 static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
     0, "Use MONITOR/MWAIT for short idle");
 
 /*
  * "acpi" idle method.  Marks this CPU's monitor buffer as
  * STATE_SLEEPING, then, with interrupts disabled, either returns
  * immediately because a thread became runnable, or idles through
  * cpu_idle_hook(sbt) when a hook is registered, or through
  * acpi_cpu_c1() otherwise.  The state is set back to STATE_RUNNING
  * on the way out.
  *
  * NOTE(review): the hook and acpi_cpu_c1() are presumably responsible
  * for re-enabling interrupts; only the runnable path does so here.
  */
 static void
 cpu_idle_acpi(sbintime_t sbt)
 {
 	int *state;
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_SLEEPING);
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (sched_runnable())
 		enable_intr();
 	else if (cpu_idle_hook)
 		cpu_idle_hook(sbt);
 	else
 		acpi_cpu_c1();
 	atomic_store_int(state, STATE_RUNNING);
 }
 
 /*
  * "hlt" idle method.  Marks this CPU's monitor buffer as
  * STATE_SLEEPING, re-checks for runnable threads with interrupts
  * disabled and, if there are none, idles via acpi_cpu_c1().  The state
  * is set back to STATE_RUNNING afterwards.  The sbt argument is unused.
  */
 static void
 cpu_idle_hlt(sbintime_t sbt)
 {
 	int *state;
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_SLEEPING);
 
 	/*
 	 * Since we may be in a critical section from cpu_idle(), if
 	 * an interrupt fires during that critical section we may have
 	 * a pending preemption.  If the CPU halts, then that thread
 	 * may not execute until a later interrupt awakens the CPU.
 	 * To handle this race, check for a runnable thread after
 	 * disabling interrupts and immediately return if one is
 	 * found.  Also, we must absolutely guarantee that hlt is
 	 * the next instruction after sti.  This ensures that any
 	 * interrupt that fires after the call to disable_intr() will
 	 * immediately awaken the CPU from hlt.  Finally, please note
 	 * that on x86 this works fine because interrupts are enabled
 	 * only after the instruction following sti takes place, while
 	 * IF is set to 1 immediately, allowing the hlt instruction to
 	 * acknowledge the interrupt.
 	 */
 	disable_intr();
 	if (sched_runnable())
 		enable_intr();
 	else
 		acpi_cpu_c1();
 	atomic_store_int(state, STATE_RUNNING);
 }
 
 /*
  * "mwait" idle method.  The per-CPU idle_state word doubles as the
  * monitored location: it is set to STATE_MWAIT, armed with
  * cpu_monitor(), and the CPU then executes "sti; mwait" unless the
  * word has already been changed in the meantime.  cpu_idle_wakeup()
  * stores STATE_RUNNING into the same word for a CPU in STATE_MWAIT.
  * The sbt argument is unused.
  */
 static void
 cpu_idle_mwait(sbintime_t sbt)
 {
 	int *state;
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_MWAIT);
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (sched_runnable()) {
 		atomic_store_int(state, STATE_RUNNING);
 		enable_intr();
 		return;
 	}
 
 	/* Arm the monitor, then re-check the word before waiting on it. */
 	cpu_monitor(state, 0, 0);
 	if (atomic_load_int(state) == STATE_MWAIT)
 		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
 	else
 		enable_intr();
 	atomic_store_int(state, STATE_RUNNING);
 }
 
 /*
  * "spin" idle method: stay in STATE_RUNNING and poll the scheduler up
  * to 1000 times, executing cpu_spinwait() between polls.  Returns as
  * soon as a thread is runnable or when the polls are used up.  The sbt
  * argument is unused.
  */
 static void
 cpu_idle_spin(sbintime_t sbt)
 {
 	int *idle_state;
 	int polls_left;
 
 	idle_state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(idle_state, STATE_RUNNING);
 
 	/*
 	 * Polling sched_runnable() without synchronization is racy, but
 	 * because we poll in a loop a single miss costs very little, and
 	 * that is far better than not checking at all.
 	 */
 	polls_left = 1000;
 	while (polls_left-- > 0 && !sched_runnable())
 		cpu_spinwait();
 }
 
 /*
  * C1E renders the local APIC timer dead, so we disable it by
  * reading the Interrupt Pending Message register and clearing
  * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
  *
  * Reference:
  *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
  *   #32559 revision 3.00+
  */
 /* Interrupt Pending Message MSR and the two bits cleared by cpu_idle(). */
 #define	MSR_AMDK8_IPM		0xc0010055
 #define	AMDK8_SMIONCMPHALT	(1ULL << 27)
 #define	AMDK8_C1EONCMPHALT	(1ULL << 28)
 #define	AMDK8_CMPHALT		(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
 
 /*
  * Set cpu_ident_amdc1e when the CPU may implement C1E: an AMD part
  * whose cpu_id has all of bits 8-11 set and whose bits 16-27 are at
  * least 0x004 (mostly recent dual-core, or later, K8-family parts).
  */
 void
 cpu_probe_amdc1e(void)
 {
 
 	if (cpu_vendor_id != CPU_VENDOR_AMD)
 		return;
 	if ((cpu_id & 0x00000f00) != 0x00000f00)
 		return;
 	if ((cpu_id & 0x0fff0000) < 0x00040000)
 		return;
 	cpu_ident_amdc1e = 1;
 }
 
 /*
  * Idle method currently in use; called by cpu_idle() and changed by
  * cpu_idle_selector().  Defaults to the "acpi" method.
  */
 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
 
 /*
  * Idle the calling CPU.
  *
  * busy != 0 requests a short idle: when MONITOR/MWAIT is available and
  * idle_mwait is set, cpu_idle_mwait() is used directly.  Otherwise the
  * selected idle method (cpu_idle_fn) is called; for busy == 0 the call
  * is wrapped in a critical section with the timers switched to idle
  * mode and back.  The AMD C1E workaround is applied before calling the
  * idle method when cpu_ident_amdc1e and cpu_disable_c3_sleep are set.
  */
 void
 cpu_idle(int busy)
 {
 	uint64_t msr;
 	sbintime_t sbt = -1;
 
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
 	    busy, curcpu);
 #ifdef MP_WATCHDOG
 	ap_watchdog(PCPU_GET(cpuid));
 #endif
 
 	/* If we are busy - try to use fast methods. */
 	if (busy) {
 		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
 			/*
 			 * NOTE(review): busy is passed where an sbintime_t
 			 * is expected; cpu_idle_mwait() ignores its argument.
 			 */
 			cpu_idle_mwait(busy);
 			goto out;
 		}
 	}
 
 	/* If we have time - switch timers into idle mode. */
 	if (!busy) {
 		critical_enter();
 		sbt = cpu_idleclock();
 	}
 
 	/* Apply AMD APIC timer C1E workaround. */
 	if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
 		msr = rdmsr(MSR_AMDK8_IPM);
 		if (msr & AMDK8_CMPHALT)
 			wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
 	}
 
 	/* Call main idle method. */
 	cpu_idle_fn(sbt);
 
 	/* Switch timers back into active mode. */
 	if (!busy) {
 		cpu_activeclock();
 		critical_exit();
 	}
 out:
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
 	    busy, curcpu);
 }
 
 /*
  * Non-zero when the Apollo Lake APL31 workaround is active; set by
  * cpu_idle_tun() and consulted by cpu_idle_wakeup().  Exported as
  * machdep.idle_apl31.
  */
 static int cpu_idle_apl31_workaround;
 SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW,
     &cpu_idle_apl31_workaround, 0,
     "Apollo Lake APL31 MWAIT bug workaround");
 
 /*
  * Try to wake the given CPU out of its idle method by looking at its
  * monitor buffer.
  *
  * Returns 0 when the CPU is in STATE_SLEEPING, 1 when it is in
  * STATE_RUNNING.  A CPU in STATE_MWAIT has its state word rewritten to
  * STATE_RUNNING; the result is then 1, or 0 when the APL31 workaround
  * is active.  Any other state value panics.
  *
  * NOTE(review): presumably 0 tells the caller that the CPU still has
  * to be woken by other means -- confirm against the callers.
  */
 int
 cpu_idle_wakeup(int cpu)
 {
 	int *idle_state;
 	int cur;
 
 	idle_state = &pcpu_find(cpu)->pc_monitorbuf.idle_state;
 	cur = atomic_load_int(idle_state);
 
 	if (cur == STATE_MWAIT) {
 		/* Writing the monitored word is what ends the MWAIT. */
 		atomic_store_int(idle_state, STATE_RUNNING);
 		return (cpu_idle_apl31_workaround ? 0 : 1);
 	}
 	if (cur == STATE_SLEEPING)
 		return (0);
 	if (cur == STATE_RUNNING)
 		return (1);
 
 	panic("bad monitor state");
 	return (1);
 }
 
 /*
  * Table of the selectable idle methods.
  * Ordered by speed/power consumption.
  *
  * id_fn is the idle routine, id_name the name used by the
  * machdep.idle sysctl/tunable, and id_cpuid2_flag an optional
  * cpu_feature2 bit that must be present for the entry to be offered
  * (0 means no requirement).
  */
 static struct {
 	void	*id_fn;
 	char	*id_name;
 	int	id_cpuid2_flag;
 } idle_tbl[] = {
 	{ .id_fn = cpu_idle_spin, .id_name = "spin" },
 	{ .id_fn = cpu_idle_mwait, .id_name = "mwait",
 	    .id_cpuid2_flag = CPUID2_MON },
 	{ .id_fn = cpu_idle_hlt, .id_name = "hlt" },
 	{ .id_fn = cpu_idle_acpi, .id_name = "acpi" },
 };
 
 /*
  * Sysctl handler that reports the idle methods usable on this machine
  * as a ", "-separated string.  An idle_tbl[] entry is left out when the
  * cpu_feature2 bit it requires is missing, or when it is "acpi" and no
  * cpu_idle_hook is registered.  Returns the sysctl_handle_string()
  * result.
  */
 static int
 idle_sysctl_available(SYSCTL_HANDLER_ARGS)
 {
 	char *avail, *p;
 	int error;
 	int i;
 
 	/*
 	 * NOTE(review): sprintf() below is unbounded; the 256-byte buffer
 	 * is only safe because idle_tbl[] is small and its names short.
 	 */
 	avail = malloc(256, M_TEMP, M_WAITOK);
 	p = avail;
 	for (i = 0; i < nitems(idle_tbl); i++) {
 		if (idle_tbl[i].id_cpuid2_flag != 0 &&
 		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
 			continue;
 		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
 		    cpu_idle_hook == NULL)
 			continue;
 		/* Prefix every name but the first with a separator. */
 		p += sprintf(p, "%s%s", p != avail ? ", " : "",
 		    idle_tbl[i].id_name);
 	}
 	error = sysctl_handle_string(oidp, avail, 0, req);
 	free(avail, M_TEMP);
 	return (error);
 }
 
 /* machdep.idle_available: read-only list built by idle_sysctl_available(). */
 SYSCTL_PROC(_machdep, OID_AUTO, idle_available,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
     0, 0, idle_sysctl_available, "A",
     "list of available idle functions");
 
 /*
  * Make the idle method called new_idle_name the current one.
  *
  * The name must match an idle_tbl[] entry that is usable here: any
  * cpu_feature2 bit the entry requires has to be present, and "acpi" is
  * only accepted while a cpu_idle_hook is registered.  On success
  * cpu_idle_fn is updated (and the choice is printed under bootverbose)
  * and true is returned; otherwise nothing changes and false is
  * returned.
  */
 static bool
 cpu_idle_selector(const char *new_idle_name)
 {
 	int idx, needed;
 
 	for (idx = 0; idx < nitems(idle_tbl); idx++) {
 		if (strcmp(idle_tbl[idx].id_name, new_idle_name) != 0)
 			continue;
 
 		needed = idle_tbl[idx].id_cpuid2_flag;
 		if (needed != 0 && (cpu_feature2 & needed) == 0)
 			continue;
 		if (cpu_idle_hook == NULL &&
 		    strcmp(idle_tbl[idx].id_name, "acpi") == 0)
 			continue;
 
 		cpu_idle_fn = idle_tbl[idx].id_fn;
 		if (bootverbose)
 			printf("CPU idle set to %s\n", idle_tbl[idx].id_name);
 		return (true);
 	}
 	return (false);
 }
 
 /*
  * Sysctl handler for the current idle method.  Reads report the
  * idle_tbl[] name matching cpu_idle_fn ("unknown" if none matches).
  * Writes pass the new name to cpu_idle_selector() and fail with EINVAL
  * when it is rejected.
  */
 static int
 cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	char buf[16], *p;
 	int error, i;
 
 	p = "unknown";
 	for (i = 0; i < nitems(idle_tbl); i++) {
 		if (idle_tbl[i].id_fn == cpu_idle_fn) {
 			p = idle_tbl[i].id_name;
 			break;
 		}
 	}
 	/*
 	 * NOTE(review): strncpy() does not terminate on truncation; this
 	 * relies on every name being shorter than sizeof(buf).
 	 */
 	strncpy(buf, p, sizeof(buf));
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	/* Done on error, or when no new value was supplied. */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	return (cpu_idle_selector(buf) ? 0 : EINVAL);
 }
 
 /* machdep.idle: read or select the idle method via cpu_idle_sysctl(). */
 SYSCTL_PROC(_machdep, OID_AUTO, idle,
     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     0, 0, cpu_idle_sysctl, "A",
     "currently selected idle function");
 
 /*
  * Boot-time selection of the idle method (run as a SYSINIT).
  *
  * The "machdep.idle" tunable wins when set.  Otherwise AMD family 0x17
  * model 0x1 parts are switched to "hlt" with MWAIT disabled.
  * Independently of that, Intel CPUs with cpu_id 0x506c9 get the APL31
  * workaround enabled, which the "machdep.idle_apl31" tunable may then
  * override.
  */
 static void
 cpu_idle_tun(void *unused __unused)
 {
 	char tunvar[16];
 
 	if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
 		cpu_idle_selector(tunvar);
 	else if (cpu_vendor_id == CPU_VENDOR_AMD &&
 	    CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) {
 		/* Ryzen erratas 1057, 1109. */
 		cpu_idle_selector("hlt");
 		idle_mwait = 0;
 		mwait_cpustop_broken = true;
 	}
 
 	if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) {
 		/*
 		 * Apollo Lake errata APL31 (public errata APL30).
 		 * Stores to the armed address range may not trigger
 		 * MWAIT to resume execution.  OS needs to use
 		 * interrupts to wake processors from MWAIT-induced
 		 * sleep states.
 		 */
 		cpu_idle_apl31_workaround = 1;
 		mwait_cpustop_broken = true;
 	}
 	/* Let the operator force the workaround on or off. */
 	TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
 }
 SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);
 
 /*
  * NMI policy knobs.  panic_on_nmi is a bit mask tested by
  * nmi_call_kdb(): bit 0 covers NMIs claimed as hardware failures, bit
  * 1 covers unclaimed ones, and any non-zero value makes an NMI that
  * remains unclaimed panic.  nmi_is_broadcast selects the SMP path in
  * nmi_handle_intr().
  */
 static int panic_on_nmi = 0xff;
 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
     &panic_on_nmi, 0,
     "Panic on NMI: 1 = H/W failure; 2 = unknown; 0xff = all");
 int nmi_is_broadcast = 1;
 SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
     &nmi_is_broadcast, 0,
     "Chipset NMI is broadcast");
 
 /*
  * Process an NMI on behalf of the given CPU.
  *
  * With DEV_ISA, isa_nmi() is asked first; an NMI it recognizes is
  * claimed and panics when bit 0 of panic_on_nmi is set.  With KDB, an
  * unclaimed NMI enters the debugger when bit 1 of panic_on_nmi and
  * debugger_on_panic are both set, and is claimed if kdb_trap() says
  * so.  An NMI still unclaimed at the end panics unless panic_on_nmi
  * is 0.
  */
 void
 nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
 {
 	bool claimed = false;
 
 #ifdef DEV_ISA
 	/* machine/parity/power fail/"kitchen sink" faults */
 	if (isa_nmi(frame->tf_err)) {
 		claimed = true;
 		if ((panic_on_nmi & 1) != 0)
 			panic("NMI indicates hardware failure");
 	}
 #endif /* DEV_ISA */
 
 	/*
 	 * NMIs can be useful for debugging.  They can be hooked up to a
 	 * pushbutton, usually on an ISA, PCI, or PCIe card.  They can also be
 	 * generated by an IPMI BMC, either manually or in response to a
 	 * watchdog timeout.  For example, see the "power diag" command in
 	 * ports/sysutils/ipmitool.  They can also be generated by a
 	 * hypervisor; see "bhyvectl --inject-nmi".
 	 */
 
 #ifdef KDB
 	if (!claimed && (panic_on_nmi & 2) != 0) {
 		if (debugger_on_panic) {
 			printf("NMI/cpu%d ... going to debugger\n", cpu);
 			claimed = kdb_trap(type, 0, frame);
 		}
 	}
 #endif /* KDB */
 
 	if (!claimed && panic_on_nmi != 0)
 		panic("NMI");
 }
 
 /*
  * NMI entry point.  On SMP kernels with nmi_is_broadcast set the NMI
  * is handed to nmi_call_kdb_smp(); otherwise it is processed directly
  * for the current CPU by nmi_call_kdb().
  */
 void
 nmi_handle_intr(u_int type, struct trapframe *frame)
 {
 
 #ifdef SMP
 	if (nmi_is_broadcast) {
 		nmi_call_kdb_smp(type, frame);
 		return;
 	}
 #endif
 	nmi_call_kdb(PCPU_GET(cpuid), type, frame);
 }
 
 /*
  * IBRS state.  hw_ibrs_active and hw_ibrs_ibpb_active are computed by
  * hw_ibrs_recalculate() from the requested setting hw_ibrs_disable
  * (non-zero means "do not use IBRS").  The active flag is exported
  * read-only both under hw and under machdep.mitigations.ibrs.
  */
 static int hw_ibrs_active;
 int hw_ibrs_ibpb_active;
 int hw_ibrs_disable = 1;
 
 SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
     "Indirect Branch Restricted Speculation active");
 
 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ibrs,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Indirect Branch Restricted Speculation active");
 
 SYSCTL_INT(_machdep_mitigations_ibrs, OID_AUTO, active, CTLFLAG_RD,
     &hw_ibrs_active, 0, "Indirect Branch Restricted Speculation active");
 
 /*
  * Apply the current hw_ibrs_disable setting.
  *
  * When the CPU reports IA32_ARCH_CAP_IBRS_ALL, the IBRS bit in
  * MSR_IA32_SPEC_CTRL is set or cleared, on all CPUs via rendezvous
  * when for_all_cpus is true and on the local CPU otherwise, and
  * hw_ibrs_ibpb_active is cleared.  Without that capability no MSR is
  * written here; both active flags are set when the CPU has IBPB and
  * IBRS is not disabled.
  */
 void
 hw_ibrs_recalculate(bool for_all_cpus)
 {
 	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
 		x86_msr_op(MSR_IA32_SPEC_CTRL, (for_all_cpus ?
 		    MSR_OP_RENDEZVOUS : MSR_OP_LOCAL) |
 		    (hw_ibrs_disable != 0 ? MSR_OP_ANDNOT : MSR_OP_OR),
 		    IA32_SPEC_CTRL_IBRS);
 		hw_ibrs_active = hw_ibrs_disable == 0;
 		hw_ibrs_ibpb_active = 0;
 	} else {
 		hw_ibrs_active = hw_ibrs_ibpb_active = (cpu_stdext_feature3 &
 		    CPUID_STDEXT3_IBPB) != 0 && !hw_ibrs_disable;
 	}
 }
 
 /*
  * Sysctl handler for the IBRS disable knob.  Reads return
  * hw_ibrs_disable; writes normalize the new value to 0 or 1, store it
  * and re-apply the setting on all CPUs.
  */
 static int
 hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = hw_ibrs_disable;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	/* Done on error, or when no new value was supplied. */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	hw_ibrs_disable = val != 0;
 	hw_ibrs_recalculate(true);
 	return (0);
 }
 /* The same knob is published under hw and machdep.mitigations.ibrs. */
 SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
     CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
     "Disable Indirect Branch Restricted Speculation");
 
 SYSCTL_PROC(_machdep_mitigations_ibrs, OID_AUTO, disable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     hw_ibrs_disable_handler, "I",
     "Disable Indirect Branch Restricted Speculation");
 
 /*
  * Speculative Store Bypass Disable state.  hw_ssb_disable is the
  * requested mode (0 - off, 1 - on, 2 - auto, see
  * hw_ssb_recalculate()); hw_ssb_active is what hw_ssb_set() actually
  * applied.  The active flag is exported read-only both under hw and
  * under machdep.mitigations.ssb.
  */
 int hw_ssb_active;
 int hw_ssb_disable;
 
 SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
     &hw_ssb_active, 0,
     "Speculative Store Bypass Disable active");
 
 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ssb,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Speculative Store Bypass Disable active");
 
 SYSCTL_INT(_machdep_mitigations_ssb, OID_AUTO, active, CTLFLAG_RD,
     &hw_ssb_active, 0, "Speculative Store Bypass Disable active");
 
 /*
  * Turn the SSBD bit in MSR_IA32_SPEC_CTRL on or off and record the
  * result in hw_ssb_active.  CPUs that do not report
  * CPUID_STDEXT3_SSBD are left alone and hw_ssb_active is forced to 0.
  * With for_all_cpus the MSR operation uses MSR_OP_SCHED, otherwise it
  * only affects the local CPU.
  */
 static void
 hw_ssb_set(bool enable, bool for_all_cpus)
 {
 
 	if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) {
 		hw_ssb_active = 0;
 		return;
 	}
 	hw_ssb_active = enable;
 	x86_msr_op(MSR_IA32_SPEC_CTRL,
 	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
 	    (for_all_cpus ? MSR_OP_SCHED : MSR_OP_LOCAL), IA32_SPEC_CTRL_SSBD);
 }
 
 /*
  * Apply the SSBD mode requested in hw_ssb_disable:
  *   1     - SSBD on;
  *   2     - auto: on unless the CPU reports IA32_ARCH_CAP_SSB_NO;
  *   0     - SSBD off;
  *   other - treated as 0, and hw_ssb_disable is reset to 0.
  * all_cpus is passed through to hw_ssb_set().
  */
 void
 hw_ssb_recalculate(bool all_cpus)
 {
 	bool want_ssbd;
 
 	if (hw_ssb_disable == 1) {
 		want_ssbd = true;
 	} else if (hw_ssb_disable == 2) {
 		want_ssbd = (cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) == 0;
 	} else {
 		/* Normalize out-of-range requests to "off". */
 		if (hw_ssb_disable != 0)
 			hw_ssb_disable = 0;
 		want_ssbd = false;
 	}
 	hw_ssb_set(want_ssbd, all_cpus);
 }
 
 /*
  * Sysctl handler for the SSBD mode knob.  Reads return hw_ssb_disable;
  * writes store the new value as is and re-apply it on all CPUs.
  * Out-of-range values are normalized by hw_ssb_recalculate().
  */
 static int
 hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = hw_ssb_disable;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	/* Done on error, or when no new value was supplied. */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	hw_ssb_disable = val;
 	hw_ssb_recalculate(true);
 	return (0);
 }
 /*
  * SSBD mode knob, served by hw_ssb_disable_handler() and published
  * both under hw and under machdep.mitigations.ssb.
  */
 SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     hw_ssb_disable_handler, "I",
     "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
 
 SYSCTL_PROC(_machdep_mitigations_ssb, OID_AUTO, disable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     hw_ssb_disable_handler, "I",
     "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
 
 /*
  * Requested MDS mitigation mode (0 - off, 1 - VERW, 2 - software
  * sequence, 3 - auto); interpreted by hw_mds_recalculate().
  */
 int hw_mds_disable;
 
 /*
  * Handler for Microarchitectural Data Sampling issues.  Really not a
  * pointer to C function: on amd64 the code must not change any CPU
  * architectural state except possibly %rflags. Also, it is always
  * called with interrupts disabled.
  */
 void mds_handler_void(void);
 void mds_handler_verw(void);
 void mds_handler_ivb(void);
 void mds_handler_bdw(void);
 void mds_handler_skl_sse(void);
 void mds_handler_skl_avx(void);
 void mds_handler_skl_avx512(void);
 void mds_handler_silvermont(void);
 /* Handler currently in effect; chosen by hw_mds_recalculate(). */
 void (*mds_handler)(void) = mds_handler_void;
 
 /*
  * Sysctl handler that reports which MDS handler is installed, as a
  * human-readable string ("unknown" when mds_handler matches none of
  * the known handlers).
  */
 static int
 sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS)
 {
 	static const struct {
 		void		(*handler)(void);
 		const char	*label;
 	} known[] = {
 		{ mds_handler_void,		"inactive" },
 		{ mds_handler_verw,		"VERW" },
 		{ mds_handler_ivb,		"software IvyBridge" },
 		{ mds_handler_bdw,		"software Broadwell" },
 		{ mds_handler_skl_sse,		"software Skylake SSE" },
 		{ mds_handler_skl_avx,		"software Skylake AVX" },
 		{ mds_handler_skl_avx512,	"software Skylake AVX512" },
 		{ mds_handler_silvermont,	"software Silvermont" },
 	};
 	const char *state;
 	int idx;
 
 	state = "unknown";
 	for (idx = 0; idx < (int)(sizeof(known) / sizeof(known[0])); idx++) {
 		if (mds_handler == known[idx].handler) {
 			state = known[idx].label;
 			break;
 		}
 	}
 	return (SYSCTL_OUT(req, state, strlen(state)));
 }
 
 /*
  * Read-only MDS state string, published both under hw and under
  * machdep.mitigations.mds.
  */
 SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_hw_mds_disable_state_handler, "A",
     "Microarchitectural Data Sampling Mitigation state");
 
 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, mds,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Microarchitectural Data Sampling Mitigation state");
 
 SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, state,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_hw_mds_disable_state_handler, "A",
     "Microarchitectural Data Sampling Mitigation state");
 
 /* pc_mds_tmp must sit on a 64-byte boundary within struct pcpu. */
 _Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512");
 
 /*
  * Choose the MDS handler according to hw_mds_disable and the CPU.
  *
  * The branches are tried in order:
  *  - no mitigation: non-Intel CPU, mode 0, or mode 3 (auto) on a CPU
  *    reporting MDS_NO;
  *  - VERW: mode 1, or mode 3 on a CPU reporting MD_CLEAR;
  *  - a model-specific software sequence for the listed family 6
  *    models; the per-CPU scratch buffers those sequences use are
  *    allocated here on first use;
  *  - otherwise the request cannot be honoured: hw_mds_disable is reset
  *    to 0 and the void handler is installed.
  *
  * Per-CPU buffers allocated here are kept for reuse on later calls and
  * are not freed by this function.
  */
 void
 hw_mds_recalculate(void)
 {
 	struct pcpu *pc;
 	vm_offset_t b64;
 	u_long xcr0;
 	int i;
 
 	/*
 	 * Allow user to force VERW variant even if MD_CLEAR is not
 	 * reported.  For instance, hypervisor might unknowingly
 	 * filter the cap out.
 	 * For the similar reasons, and for testing, allow to enable
 	 * mitigation even when MDS_NO cap is set.
 	 */
 	if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 ||
 	    ((cpu_ia32_arch_caps & IA32_ARCH_CAP_MDS_NO) != 0 &&
 	    hw_mds_disable == 3)) {
 		mds_handler = mds_handler_void;
 	} else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 &&
 	    hw_mds_disable == 3) || hw_mds_disable == 1) {
 		mds_handler = mds_handler_verw;
 	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 	    (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e ||
 	    CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a ||
 	    CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d ||
 	    CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e ||
 	    CPUID_TO_MODEL(cpu_id) == 0x3a) &&
 	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
 		/*
 		 * Nehalem, SandyBridge, IvyBridge
 		 */
 		CPU_FOREACH(i) {
 			pc = pcpu_find(i);
 			if (pc->pc_mds_buf == NULL) {
 				pc->pc_mds_buf = malloc_domainset(672, M_TEMP,
 				    DOMAINSET_PREF(pc->pc_domain), M_WAITOK);
 				/* Only the first 16 bytes are cleared. */
 				bzero(pc->pc_mds_buf, 16);
 			}
 		}
 		mds_handler = mds_handler_ivb;
 	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 	    (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c ||
 	    CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f ||
 	    CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) &&
 	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
 		/*
 		 * Haswell, Broadwell
 		 */
 		CPU_FOREACH(i) {
 			pc = pcpu_find(i);
 			if (pc->pc_mds_buf == NULL) {
 				pc->pc_mds_buf = malloc_domainset(1536, M_TEMP,
 				    DOMAINSET_PREF(pc->pc_domain), M_WAITOK);
 				/* Only the first 16 bytes are cleared. */
 				bzero(pc->pc_mds_buf, 16);
 			}
 		}
 		mds_handler = mds_handler_bdw;
 	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 	    ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id &
 	    CPUID_STEPPING) <= 5) ||
 	    CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e ||
 	    (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id &
 	    CPUID_STEPPING) <= 0xb) ||
 	    (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id &
 	    CPUID_STEPPING) <= 0xc)) &&
 	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
 		/*
 		 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
 		 * CascadeLake
 		 */
 		CPU_FOREACH(i) {
 			pc = pcpu_find(i);
 			if (pc->pc_mds_buf == NULL) {
 				pc->pc_mds_buf = malloc_domainset(6 * 1024,
 				    M_TEMP, DOMAINSET_PREF(pc->pc_domain),
 				    M_WAITOK);
 				/*
 				 * Over-allocate by 63 bytes so that a
 				 * 64-byte aligned, 64-byte long region can
 				 * be carved out of the allocation.
 				 */
 				b64 = (vm_offset_t)malloc_domainset(64 + 63,
 				    M_TEMP, DOMAINSET_PREF(pc->pc_domain),
 				    M_WAITOK);
 				pc->pc_mds_buf64 = (void *)roundup2(b64, 64);
 				bzero(pc->pc_mds_buf64, 64);
 			}
 		}
 		/* Use the widest vector variant the OS has enabled. */
 		xcr0 = rxcr(0);
 		if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 &&
 		    (cpu_stdext_feature & CPUID_STDEXT_AVX512DQ) != 0)
 			mds_handler = mds_handler_skl_avx512;
 		else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 &&
 		    (cpu_feature2 & CPUID2_AVX) != 0)
 			mds_handler = mds_handler_skl_avx;
 		else
 			mds_handler = mds_handler_skl_sse;
 	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 	    ((CPUID_TO_MODEL(cpu_id) == 0x37 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x4a ||
 	    CPUID_TO_MODEL(cpu_id) == 0x4c ||
 	    CPUID_TO_MODEL(cpu_id) == 0x4d ||
 	    CPUID_TO_MODEL(cpu_id) == 0x5a ||
 	    CPUID_TO_MODEL(cpu_id) == 0x5d ||
 	    CPUID_TO_MODEL(cpu_id) == 0x6e ||
 	    CPUID_TO_MODEL(cpu_id) == 0x65 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x75 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x1c ||
 	    CPUID_TO_MODEL(cpu_id) == 0x26 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x27 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x35 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x36 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x7a))) {
 		/*
 		 * NOTE(review): unlike the branches above, this one does
 		 * not test hw_mds_disable for 2 or 3 explicitly.
 		 */
 		/* Silvermont, Airmont */
 		CPU_FOREACH(i) {
 			pc = pcpu_find(i);
 			if (pc->pc_mds_buf == NULL)
 				pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK);
 		}
 		mds_handler = mds_handler_silvermont;
 	} else {
 		/* Nothing applicable: report the mitigation as off. */
 		hw_mds_disable = 0;
 		mds_handler = mds_handler_void;
 	}
 }
 
 /* SYSINIT adapter: compute the initial MDS handler during boot. */
 static void
 hw_mds_recalculate_boot(void *arg __unused)
 {
 
 	hw_mds_recalculate();
 }
 SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL);
 
 /*
  * Sysctl handler for the MDS mode knob.  Reads return hw_mds_disable;
  * writes accept only 0..3 (EINVAL otherwise), store the value and
  * recompute the handler.
  */
 static int
 sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = hw_mds_disable;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	/* Done on error, or when no new value was supplied. */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (val < 0 || val > 3)
 		return (EINVAL);
 	hw_mds_disable = val;
 	hw_mds_recalculate();
 	return (0);
 }
 
 /*
  * MDS mode knob, served by sysctl_mds_disable_handler() and published
  * both under hw and under machdep.mitigations.mds.
  */
 SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_mds_disable_handler, "I",
     "Microarchitectural Data Sampling Mitigation "
     "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
 
 SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, disable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_mds_disable_handler, "I",
     "Microarchitectural Data Sampling Mitigation "
     "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
 
 /*
  * Intel Transactional Memory Asynchronous Abort Mitigation
  * CVE-2019-11135
  *
  * x86_taa_enable is the mode requested by the operator (TAA_NONE ..
  * TAA_AUTO); x86_taa_state is the mode actually in effect as computed
  * by x86_taa_recalculate().  Both use the values of the enum below.
  */
 int x86_taa_enable;
 int x86_taa_state;
 enum {
 	TAA_NONE	= 0,	/* No mitigation enabled */
 	TAA_TSX_DISABLE	= 1,	/* Disable TSX via MSR */
 	TAA_VERW	= 2,	/* Use VERW mitigation */
 	TAA_AUTO	= 3,	/* Automatically select the mitigation */
 
 	/* The states below are not selectable by the operator */
 
 	TAA_TAA_UC	= 4,	/* Mitigation present in microcode */
 	TAA_NOT_PRESENT	= 5	/* TSX is not present */
 };
 
 /*
  * Set (enable == true) or clear the RTM_DISABLE and TSX_CPUID_CLEAR
  * bits in MSR_IA32_TSX_CTRL, on all CPUs via rendezvous when "all" is
  * true and on the local CPU otherwise.  Here "enable" means enabling
  * the mitigation, i.e. disabling TSX.
  */
 static void
 taa_set(bool enable, bool all)
 {
 
 	x86_msr_op(MSR_IA32_TSX_CTRL,
 	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
 	    (all ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL),
 	    IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR);
 }
 
 /*
  * Bring the TAA mitigation in line with x86_taa_enable.
  *
  * The mitigation the CPU calls for (taa_need) is derived from its
  * capabilities; TAA_AUTO selects it, any other setting is taken as
  * requested.  Switching to or from TAA_TSX_DISABLE toggles the TSX
  * control MSR on all CPUs.  Switching to TAA_VERW forces the MDS
  * mitigation into VERW mode, remembering the previous hw_mds_disable
  * value so that it can be restored when TAA_VERW is left again.
  * x86_taa_state is only updated when all of this succeeded.
  */
 void
 x86_taa_recalculate(void)
 {
 	static int taa_saved_mds_disable = 0;
 	int taa_need = 0, taa_state = 0;
 	int mds_disable = 0, need_mds_recalc = 0;
 
 	/* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */
 	if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 ||
 	    (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) {
 		/* TSX is not present */
 		x86_taa_state = TAA_NOT_PRESENT;
 		return;
 	}
 
 	/* Check to see what mitigation options the CPU gives us */
 	if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) {
 		/* CPU is not susceptible to TAA */
 		taa_need = TAA_TAA_UC;
 	} else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) {
 		/*
 		 * CPU can turn off TSX.  This is the next best option
 		 * if TAA_NO hardware mitigation isn't present
 		 */
 		taa_need = TAA_TSX_DISABLE;
 	} else {
 		/* No TSX/TAA specific remedies are available. */
 		if (x86_taa_enable == TAA_TSX_DISABLE) {
 			/* The requested mode cannot be provided. */
 			if (bootverbose)
 				printf("TSX control not available\n");
 			return;
 		} else
 			taa_need = TAA_VERW;
 	}
 
 	/* Can we automatically take action, or are we being forced? */
 	if (x86_taa_enable == TAA_AUTO)
 		taa_state = taa_need;
 	else
 		taa_state = x86_taa_enable;
 
 	/* No state change, nothing to do */
 	if (taa_state == x86_taa_state) {
 		if (bootverbose)
 			printf("No TSX change made\n");
 		return;
 	}
 
 	/* Does the MSR need to be turned on or off? */
 	if (taa_state == TAA_TSX_DISABLE)
 		taa_set(true, true);
 	else if (x86_taa_state == TAA_TSX_DISABLE)
 		taa_set(false, true);
 
 	/* Does MDS need to be set to turn on VERW? */
 	if (taa_state == TAA_VERW) {
 		taa_saved_mds_disable = hw_mds_disable;
 		mds_disable = hw_mds_disable = 1;
 		need_mds_recalc = 1;
 	} else if (x86_taa_state == TAA_VERW) {
 		/* Leaving VERW: put back the operator's MDS setting. */
 		mds_disable = hw_mds_disable = taa_saved_mds_disable;
 		need_mds_recalc = 1;
 	}
 	if (need_mds_recalc) {
 		hw_mds_recalculate();
 		/* hw_mds_recalculate() may have reset hw_mds_disable. */
 		if (mds_disable != hw_mds_disable) {
 			if (bootverbose)
 				printf("Cannot change MDS state for TAA\n");
 			/* Don't update our state */
 			return;
 		}
 	}
 
 	x86_taa_state = taa_state;
 	return;
 }
 
 /* SYSINIT adapter: compute the initial TAA mitigation state during boot. */
 static void
 taa_recalculate_boot(void * arg __unused)
 {
 
 	x86_taa_recalculate();
 }
 SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL);
 
 /* Parent node for the machdep.mitigations.taa sysctls. */
 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TSX Asynchronous Abort Mitigation");
 
 /*
  * Sysctl handler for the TAA mode knob.  Reads return x86_taa_enable;
  * writes accept only the operator-selectable modes TAA_NONE ..
  * TAA_AUTO (EINVAL otherwise), store the value and re-evaluate the
  * mitigation.
  */
 static int
 sysctl_taa_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = x86_taa_enable;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	/* Done on error, or when no new value was supplied. */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (val < TAA_NONE || val > TAA_AUTO)
 		return (EINVAL);
 	x86_taa_enable = val;
 	x86_taa_recalculate();
 	return (0);
 }
 
 /* machdep.mitigations.taa.enable: served by sysctl_taa_handler(). */
 SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_taa_handler, "I",
     "TAA Mitigation enablement control "
     "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO)");
 
 /*
  * Sysctl handler that renders x86_taa_state as a human-readable
  * string; values without a dedicated name are reported as "unknown".
  */
 static int
 sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS)
 {
 	const char *label;
 
 	if (x86_taa_state == TAA_NONE)
 		label = "inactive";
 	else if (x86_taa_state == TAA_TSX_DISABLE)
 		label = "TSX disabled";
 	else if (x86_taa_state == TAA_VERW)
 		label = "VERW";
 	else if (x86_taa_state == TAA_TAA_UC)
 		label = "Mitigated in microcode";
 	else if (x86_taa_state == TAA_NOT_PRESENT)
 		label = "TSX not present";
 	else
 		label = "unknown";
 
 	return (SYSCTL_OUT(req, label, strlen(label)));
 }
 
 /* machdep.mitigations.taa.state: read-only string form of x86_taa_state. */
 SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_taa_state_handler, "A",
     "TAA Mitigation state");
 
 /* Exported as machdep.mitigations.flush_rsb_ctxsw. */
 int __read_frequently cpu_flush_rsb_ctxsw;
 SYSCTL_INT(_machdep_mitigations, OID_AUTO, flush_rsb_ctxsw,
     CTLFLAG_RW | CTLFLAG_NOFETCH, &cpu_flush_rsb_ctxsw, 0,
     "Flush Return Stack Buffer on context switch");
 
+/* Parent node for the machdep.mitigations.rngds sysctls. */
+SYSCTL_NODE(_machdep_mitigations, OID_AUTO, rngds,
+    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+    "MCU Optimization, disable RDSEED mitigation");
+
+/*
+ * Requested state of the RNGDS mitigation: non-zero (the default) asks
+ * for the mitigation to be enabled, zero for it to be disabled.
+ */
+int x86_rngds_mitg_enable = 1;
+/*
+ * Program the RNGDS mitigation control according to
+ * x86_rngds_mitg_enable, on all CPUs via rendezvous when all_cpus is
+ * true and on the local CPU otherwise.  Nothing is done on CPUs that
+ * do not report CPUID_STDEXT3_MCUOPT.
+ *
+ * IA32_RNGDS_MITG_DIS is a "mitigation disable" bit: setting it turns
+ * the mitigation off.  It is therefore cleared when the mitigation is
+ * requested and set only when the operator asked for the mitigation
+ * to be disabled.
+ */
+void
+x86_rngds_mitg_recalculate(bool all_cpus)
+{
+	if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0)
+		return;
+	x86_msr_op(MSR_IA32_MCU_OPT_CTRL,
+	    (x86_rngds_mitg_enable != 0 ? MSR_OP_ANDNOT : MSR_OP_OR) |
+	    (all_cpus ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL),
+	    IA32_RNGDS_MITG_DIS);
+}
+
+/*
+ * Sysctl handler for the RNGDS mitigation knob.  Reads return
+ * x86_rngds_mitg_enable; writes store the new value unmodified and
+ * re-apply the setting on all CPUs.
+ */
+static int
+sysctl_rngds_mitg_enable_handler(SYSCTL_HANDLER_ARGS)
+{
+	int error, val;
+
+	val = x86_rngds_mitg_enable;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	/* Done on error, or when no new value was supplied. */
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	x86_rngds_mitg_enable = val;
+	x86_rngds_mitg_recalculate(true);
+	return (0);
+}
+/*
+ * machdep.mitigations.rngds.enable: served by
+ * sysctl_rngds_mitg_enable_handler().
+ */
+SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, enable, CTLTYPE_INT |
+    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
+    sysctl_rngds_mitg_enable_handler, "I",
+    "MCU Optimization, disabling RDSEED mitigation control "
+    "(0 - mitigation disabled (RDSEED optimized), 1 - mitigation enabled)");
+
+/*
+ * Sysctl handler that reports the RNGDS mitigation state as a string.
+ * The answer is derived from the CPU feature bit and from the
+ * requested setting x86_rngds_mitg_enable; the MSR is not read back.
+ */
+static int
+sysctl_rngds_state_handler(SYSCTL_HANDLER_ARGS)
+{
+	const char *state;
+
+	if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0) {
+		state = "Not applicable";
+	} else if (x86_rngds_mitg_enable == 0) {
+		state = "RDSEED not serialized";
+	} else {
+		state = "Mitigated";
+	}
+	return (SYSCTL_OUT(req, state, strlen(state)));
+}
+/* machdep.mitigations.rngds.state: read-only state string. */
+SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, state,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
+    sysctl_rngds_state_handler, "A",
+    "MCU Optimization state");
+
 /*
  * Enable and restore kernel text write permissions.
  * Callers must ensure that disable_wp()/restore_wp() are executed
  * without rescheduling on the same core.
  */
 bool
 disable_wp(void)
 {
 	u_int cr0;
 
 	cr0 = rcr0();
 	if ((cr0 & CR0_WP) == 0)
 		return (false);
 	load_cr0(cr0 & ~CR0_WP);
 	return (true);
 }
 
 /*
  * Set CR0.WP again when old_wp (the value returned by disable_wp())
  * says it had been set; otherwise leave CR0 untouched.
  */
 void
 restore_wp(bool old_wp)
 {
 
 	if (!old_wp)
 		return;
 	load_cr0(rcr0() | CR0_WP);
 }
 
 /*
  * Fetch the BootFlags field of the ACPI FADT.
  *
  * With DEV_ACPI, the FADT is located and mapped, its BootFlags are
  * copied to *flagsp, the table is unmapped again and true is returned.
  * false is returned, with *flagsp untouched, when the table cannot be
  * found or mapped, and always when the kernel is built without
  * DEV_ACPI.
  */
 bool
 acpi_get_fadt_bootflags(uint16_t *flagsp)
 {
 #ifdef DEV_ACPI
 	ACPI_TABLE_FADT *fadt;
 	vm_paddr_t physaddr;
 
 	physaddr = acpi_find_table(ACPI_SIG_FADT);
 	if (physaddr == 0)
 		return (false);
 	fadt = acpi_map_table(physaddr, ACPI_SIG_FADT);
 	if (fadt == NULL)
 		return (false);
 	*flagsp = fadt->BootFlags;
 	acpi_unmap_table(fadt);
 	return (true);
 #else
 	return (false);
 #endif
 }