Index: head/share/man/man7/security.7
===================================================================
--- head/share/man/man7/security.7	(revision 361301)
+++ head/share/man/man7/security.7	(revision 361302)
@@ -1,1095 +1,1102 @@
 .\" Copyright (C) 1998 Matthew Dillon. All rights reserved.
 .\" Copyright (c) 2019 The FreeBSD Foundation, Inc.
 .\"
 .\" Parts of this documentation were written by
 .\" Konstantin Belousov <kib@FreeBSD.org> under sponsorship
 .\" from the FreeBSD Foundation.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd February 4, 2020
+.Dd May 16, 2020
 .Dt SECURITY 7
 .Os
 .Sh NAME
 .Nm security
 .Nd introduction to security under FreeBSD
 .Sh DESCRIPTION
 Security is a function that begins and ends with the system administrator.
 While all
 .Bx
 multi-user systems have some inherent security, the job of building and
 maintaining additional security mechanisms to keep users
 .Dq honest
 is probably
 one of the single largest undertakings of the sysadmin.
 Machines are
 only as secure as you make them, and security concerns are ever competing
 with the human necessity for convenience.
 .Ux
 systems,
 in general, are capable of running a huge number of simultaneous processes
 and many of these processes operate as servers \(em meaning that external
 entities can connect and talk to them.
 As yesterday's mini-computers and mainframes
 become today's desktops, and as computers become networked and internetworked,
 security becomes an ever bigger issue.
 .Pp
 Security is best implemented through a layered onion approach.
 In a nutshell,
 what you want to do is to create as many layers of security as are convenient
 and then carefully monitor the system for intrusions.
 .Pp
 System security also pertains to dealing with various forms of attacks,
 including attacks that attempt to crash or otherwise make a system unusable
 but do not attempt to break root.
 Security concerns can be split up into
 several categories:
 .Bl -enum -offset indent
 .It
 Denial of Service attacks (DoS)
 .It
 User account compromises
 .It
 Root compromise through accessible servers
 .It
 Root compromise via user accounts
 .It
 Backdoor creation
 .El
 .Pp
 A denial of service attack is an action that deprives the machine of needed
 resources.
 Typically, DoS attacks are brute-force mechanisms that attempt
 to crash or otherwise make a machine unusable by overwhelming its servers or
 network stack.
 Some DoS attacks try to take advantage of bugs in the
 networking stack to crash a machine with a single packet.
 The latter can
 only be fixed by applying a bug fix to the kernel.
 Attacks on servers can
 often be fixed by properly specifying options to limit the load the servers
 incur on the system under adverse conditions.
 Brute-force network attacks are harder to deal with.
 A spoofed-packet attack, for example, is
 nearly impossible to stop short of cutting your system off from the Internet.
 It may not be able to take your machine down, but it can fill up your Internet
 pipe.
 .Pp
 A user account compromise is even more common than a DoS attack.
 Many
 sysadmins still run standard
 .Xr telnetd 8
 and
 .Xr ftpd 8
 servers on their machines.
 These servers, by default, do not operate over encrypted
 connections.
 The result is that if you have any moderate-sized user base,
 one or more of your users logging into your system from a remote location
 (which is the most common and convenient way to log in to a system)
 will have his or her password sniffed.
 The attentive system administrator will analyze
 his remote access logs looking for suspicious source addresses
 even for successful logins.
 .Pp
 One must always assume that once an attacker has access to a user account,
 the attacker can break root.
 However, the reality is that in a well secured
 and maintained system, access to a user account does not necessarily give the
 attacker access to root.
 The distinction is important because without access
 to root the attacker cannot generally hide his tracks and may, at best, be
 able to do nothing more than mess with the user's files or crash the machine.
 User account compromises are very common because users tend not to take the
 precautions that sysadmins take.
 .Pp
 System administrators must keep in mind that there are potentially many ways
 to break root on a machine.
 The attacker may know the root password,
 the attacker
 may find a bug in a root-run server and be able to break root over a network
 connection to that server, or the attacker may know of a bug in an SUID-root
 program that allows the attacker to break root once he has broken into a
 user's account.
 If an attacker has found a way to break root on a machine,
 the attacker may not have a need to install a backdoor.
 Many of the root holes found and closed to date involve a considerable amount
 of work by the attacker to clean up after himself, so most attackers do install
 backdoors.
 This gives you a convenient way to detect the attacker.
 Making
 it impossible for an attacker to install a backdoor may actually be detrimental
 to your security because it will not close off the hole the attacker used to
 break in originally.
 .Pp
 Security remedies should always be implemented with a multi-layered
 .Dq onion peel
 approach and can be categorized as follows:
 .Bl -enum -offset indent
 .It
 Securing root and staff accounts
 .It
 Securing root \(em root-run servers and SUID/SGID binaries
 .It
 Securing user accounts
 .It
 Securing the password file
 .It
 Securing the kernel core, raw devices, and file systems
 .It
 Quick detection of inappropriate changes made to the system
 .It
 Paranoia
 .El
 .Sh SECURING THE ROOT ACCOUNT AND SECURING STAFF ACCOUNTS
 Do not bother securing staff accounts if you have not secured the root
 account.
 Most systems have a password assigned to the root account.
 The
 first thing you do is assume that the password is
 .Em always
 compromised.
 This does not mean that you should remove the password.
 The
 password is almost always necessary for console access to the machine.
 What it does mean is that you should not make it possible to use the password
 outside of the console or possibly even with a
 .Xr su 1
 utility.
 For example, make sure that your PTYs are specified as being
 .Dq Li insecure
 in the
 .Pa /etc/ttys
 file
 so that direct root logins via
 .Xr telnet 1
 are disallowed.
 If using
 other login services such as
 .Xr sshd 8 ,
 make sure that direct root logins are
 disabled there as well.
 Consider every access method \(em services such as
 .Xr ftp 1
 often fall through the cracks.
 Direct root logins should only be allowed
 via the system console.
 .Pp
 Of course, as a sysadmin you have to be able to get to root, so we open up
 a few holes.
 But we make sure these holes require additional password
 verification to operate.
 One way to make root accessible is to add appropriate
 staff accounts to the
 .Dq Li wheel
 group (in
 .Pa /etc/group ) .
 The staff members placed in the
 .Li wheel
 group are allowed to
 .Xr su 1
 to root.
 You should never give staff
 members native
 .Li wheel
 access by putting them in the
 .Li wheel
 group in their password entry.
 Staff accounts should be placed in a
 .Dq Li staff
 group, and then added to the
 .Li wheel
 group via the
 .Pa /etc/group
 file.
 Only those staff members who actually need to have root access
 should be placed in the
 .Li wheel
 group.
 It is also possible, when using an
 authentication method such as Kerberos, to use Kerberos's
 .Pa .k5login
 file in the root account to allow a
 .Xr ksu 1
 to root without having to place anyone at all in the
 .Li wheel
 group.
 This
 may be the better solution since the
 .Li wheel
 mechanism still allows an
 intruder to break root if the intruder has gotten hold of your password
 file and can break into a staff account.
 While having the
 .Li wheel
 mechanism
 is better than having nothing at all, it is not necessarily the safest
 option.
 .Pp
 An indirect way to secure the root account is to secure your staff accounts
 by using an alternative login access method and *'ing out the crypted password
 for the staff accounts.
 This way an intruder may be able to steal the password
 file but will not be able to break into any staff accounts or root, even if
 root has a crypted password associated with it (assuming, of course, that
 you have limited root access to the console).
 Staff members
 get into their staff accounts through a secure login mechanism such as
 .Xr kerberos 8
 or
 .Xr ssh 1
 using a private/public
 key pair.
 When you use something like Kerberos you generally must secure
 the machines which run the Kerberos servers and your desktop workstation.
 When you use a public/private key pair with SSH, you must generally secure
 the machine you are logging in
 .Em from
 (typically your workstation),
 but you can
 also add an additional layer of protection to the key pair by password
 protecting the keypair when you create it with
 .Xr ssh-keygen 1 .
 Being able
 to star-out the passwords for staff accounts also guarantees that staff
 members can only log in through secure access methods that you have set up.
 You can
 thus force all staff members to use secure, encrypted connections for
 all their sessions which closes an important hole used by many intruders: that
 of sniffing the network from an unrelated, less secure machine.
 .Pp
 The more indirect security mechanisms also assume that you are logging in
 from a more restrictive server to a less restrictive server.
 For example,
 if your main box is running all sorts of servers, your workstation should not
 be running any.
 In order for your workstation to be reasonably secure
 you should run as few servers as possible, up to and including no servers
 at all, and you should run a password-protected screen blanker.
 Of course, given physical access to
 a workstation, an attacker can break any sort of security you put on it.
 This is definitely a problem that you should consider but you should also
 consider the fact that the vast majority of break-ins occur remotely, over
 a network, from people who do not have physical access to your workstation or
 servers.
 .Pp
 Using something like Kerberos also gives you the ability to disable or
 change the password for a staff account in one place and have it immediately
 affect all the machines the staff member may have an account on.
 If a staff
 member's account gets compromised, the ability to instantly change his
 password on all machines should not be underrated.
 With discrete passwords, changing a password on N machines can be a mess.
 You can also impose
 re-passwording restrictions with Kerberos: not only can a Kerberos ticket
 be made to timeout after a while, but the Kerberos system can require that
 the user choose a new password after a certain period of time
 (say, once a month).
 .Sh SECURING ROOT \(em ROOT-RUN SERVERS AND SUID/SGID BINARIES
 The prudent sysadmin only runs the servers he needs to, no more, no less.
 Be aware that third party servers are often the most bug-prone.
 For example,
 running an old version of
 .Xr imapd 8
 or
 .Xr popper 8 Pq Pa ports/mail/popper
 is like giving a universal root
 ticket out to the entire world.
 Never run a server that you have not checked
 out carefully.
 Many servers do not need to be run as root.
 For example,
 the
 .Xr talkd 8 ,
 .Xr comsat 8 ,
 and
 .Xr fingerd 8
 daemons can be run in special user
 .Dq sandboxes .
 A sandbox is not perfect unless you go to a large amount of trouble, but the
 onion approach to security still stands: if someone is able to break in
 through a server running in a sandbox, they still have to break out of the
 sandbox.
 The more layers the attacker must break through, the lower the
 likelihood of his success.
 Root holes have historically been found in
 virtually every server ever run as root, including basic system servers.
 If you are running a machine through which people only log in via
 .Xr sshd 8
 and never log in via
 .Xr telnetd 8
 then turn off that service!
 .Pp
 .Fx
 now defaults to running
 .Xr talkd 8 ,
 .Xr comsat 8 ,
 and
 .Xr fingerd 8
 in a sandbox.
 Depending on whether you
 are installing a new system or upgrading an existing system, the special
 user accounts used by these sandboxes may not be installed.
 The prudent
 sysadmin would research and implement sandboxes for servers whenever possible.
 .Pp
 There are a number of other servers that typically do not run in sandboxes:
 .Xr sendmail 8 ,
 .Xr popper 8 ,
 .Xr imapd 8 ,
 .Xr ftpd 8 ,
 and others.
 There are alternatives to
 some of these, but installing them may require more work than you are willing
 to put in
 (the convenience factor strikes again).
 You may have to run these
 servers as root and rely on other mechanisms to detect break-ins that might
 occur through them.
 .Pp
 The other big potential root holes in a system are the SUID-root and SGID
 binaries installed on the system.
 Most of these binaries, such as
 .Xr su 1 ,
 reside in
 .Pa /bin , /sbin , /usr/bin ,
 or
 .Pa /usr/sbin .
 While nothing is 100% safe,
 the system-default SUID and SGID binaries can be considered reasonably safe.
 Still, root holes are occasionally found in these binaries.
 A root hole
 was found in Xlib in 1998 that made
 .Xr xterm 1 Pq Pa ports/x11/xterm
 (which is typically SUID)
 vulnerable.
 It is better to be safe than sorry and the prudent sysadmin will restrict SUID
 binaries that only staff should run to a special group that only staff can
 access, and get rid of
 .Pq Dq Li "chmod 000"
 any SUID binaries that nobody uses.
 A server with no display generally does not need an
 .Xr xterm 1
 binary.
 SGID binaries can be almost as dangerous.
 If an intruder can break an SGID-kmem binary the
 intruder might be able to read
 .Pa /dev/kmem
 and thus read the crypted password
 file, potentially compromising any passworded account.
 Alternatively an
 intruder who breaks group
 .Dq Li kmem
 can monitor keystrokes sent through PTYs,
 including PTYs used by users who log in through secure methods.
 An intruder
 that breaks the
 .Dq Li tty
 group can write to almost any user's TTY.
 If a user
 is running a terminal
 program or emulator with a keyboard-simulation feature, the intruder can
 potentially
 generate a data stream that causes the user's terminal to echo a command, which
 is then run as that user.
 .Sh SECURING USER ACCOUNTS
 User accounts are usually the most difficult to secure.
 While you can impose
 draconian access restrictions on your staff and *-out their passwords, you
 may not be able to do so with any general user accounts you might have.
 If
 you do have sufficient control then you may win out and be able to secure the
 user accounts properly.
 If not, you simply have to be more vigilant in your
 monitoring of those accounts.
 Use of SSH and Kerberos for user accounts is
 more problematic due to the extra administration and technical support
 required, but still a very good solution compared to a crypted password
 file.
 .Sh SECURING THE PASSWORD FILE
 The only sure-fire way is to *-out as many passwords as you can and
 use SSH or Kerberos for access to those accounts.
 Even though the
 crypted password file
 .Pq Pa /etc/spwd.db
 can only be read by root, it may
 be possible for an intruder to obtain read access to that file even if the
 attacker cannot obtain root-write access.
 .Pp
 Your security scripts should always check for and report changes to
 the password file
 (see
 .Sx CHECKING FILE INTEGRITY
 below).
 .Sh SECURING THE KERNEL CORE, RAW DEVICES, AND FILE SYSTEMS
 If an attacker breaks root he can do just about anything, but there
 are certain conveniences.
 For example, most modern kernels have a packet sniffing device driver built in.
 Under
 .Fx
 it is called
 the
 .Xr bpf 4
 device.
 An intruder will commonly attempt to run a packet sniffer
 on a compromised machine.
 You do not need to give the intruder the
 capability and most systems should not have the
 .Xr bpf 4
 device compiled in.
 .Pp
 But even if you turn off the
 .Xr bpf 4
 device, you still have
 .Pa /dev/mem
 and
 .Pa /dev/kmem
 to worry about.
 For that matter,
 the intruder can still write to raw disk devices.
 Also, there is another kernel feature called the module loader,
 .Xr kldload 8 .
 An enterprising intruder can use a KLD module to install
 his own
 .Xr bpf 4
 device or other sniffing device on a running kernel.
 To avoid these problems you have to run
 the kernel at a higher security level, at least level 1.
 The security level can be set with a
 .Xr sysctl 8
 on the
 .Va kern.securelevel
 variable.
 Once you have
 set the security level to 1, write access to raw devices will be denied and
 special
 .Xr chflags 1
 flags, such as
 .Cm schg ,
 will be enforced.
 You must also ensure
 that the
 .Cm schg
 flag is set on critical startup binaries, directories, and
 script files \(em everything that gets run
 up to the point where the security level is set.
 This might be overdoing it, and upgrading the system is much more
 difficult when you operate at a higher security level.
 You may compromise and
 run the system at a higher security level but not set the
 .Cm schg
 flag for every
 system file and directory under the sun.
 Another possibility is to simply
 mount
 .Pa /
 and
 .Pa /usr
 read-only.
 It should be noted that being too draconian in
 what you attempt to protect may prevent the all-important detection of an
 intrusion.
 .Pp
 The kernel runs with five different security levels.
 Any super-user process can raise the level, but no process
 can lower it.
 The security levels are:
 .Bl -tag -width flag
 .It Ic -1
 Permanently insecure mode \- always run the system in insecure mode.
 This is the default initial value.
 .It Ic 0
 Insecure mode \- immutable and append-only flags may be turned off.
 All devices may be read or written subject to their permissions.
 .It Ic 1
 Secure mode \- the system immutable and system append-only flags may not
 be turned off;
 disks for mounted file systems,
 .Pa /dev/mem
 and
 .Pa /dev/kmem
 may not be opened for writing;
 .Pa /dev/io
 (if your platform has it) may not be opened at all;
 kernel modules (see
 .Xr kld 4 )
 may not be loaded or unloaded.
 The kernel debugger may not be entered using the
 .Va debug.kdb.enter
 sysctl.
 A panic or trap cannot be forced using the
 .Va debug.kdb.panic
 and other sysctl's.
 .It Ic 2
 Highly secure mode \- same as secure mode, plus disks may not be
 opened for writing (except by
 .Xr mount 2 )
 whether mounted or not.
 This level precludes tampering with file systems by unmounting them,
 but also inhibits running
 .Xr newfs 8
 while the system is multi-user.
 .Pp
 In addition, kernel time changes are restricted to less than or equal to one
 second.
 Attempts to change the time by more than this will log the message
 .Dq Time adjustment clamped to +1 second .
 .It Ic 3
 Network secure mode \- same as highly secure mode, plus
 IP packet filter rules (see
 .Xr ipfw 8 ,
 .Xr ipfirewall 4
 and
 .Xr pfctl 8 )
 cannot be changed and
 .Xr dummynet 4
 or
 .Xr pf 4
 configuration cannot be adjusted.
 .El
 .Pp
 The security level can be configured with variables documented in
 .Xr rc.conf 5 .
 .Sh CHECKING FILE INTEGRITY: BINARIES, CONFIG FILES, ETC
 When it comes right down to it, you can only protect your core system
 configuration and control files so much before the convenience factor
 rears its ugly head.
 For example, using
 .Xr chflags 1
 to set the
 .Cm schg
 bit on most of the files in
 .Pa /
 and
 .Pa /usr
 is probably counterproductive because
 while it may protect the files, it also closes a detection window.
 The
 last layer of your security onion is perhaps the most important \(em detection.
 The rest of your security is pretty much useless (or, worse, presents you with
 a false sense of safety) if you cannot detect potential incursions.
 Half
 the job of the onion is to slow down the attacker rather than stop him
 in order to give the detection layer a chance to catch him in
 the act.
 .Pp
 The best way to detect an incursion is to look for modified, missing, or
 unexpected files.
 The best
 way to look for modified files is from another (often centralized)
 limited-access system.
 Writing your security scripts on the extra-secure limited-access system
 makes them mostly invisible to potential attackers, and this is important.
 In order to take maximum advantage you generally have to give the
 limited-access box significant access to the other machines in the business,
 usually either by doing a read-only NFS export of the other machines to the
 limited-access box, or by setting up SSH keypairs to allow the limited-access
 box to SSH to the other machines.
 Except for its network traffic, NFS is
 the least visible method \(em allowing you to monitor the file systems on each
 client box virtually undetected.
 If your
 limited-access server is connected to the client boxes through a switch,
 the NFS method is often the better choice.
 If your limited-access server
 is connected to the client boxes through a hub or through several layers
 of routing, the NFS method may be too insecure (network-wise) and using SSH
 may be the better choice even with the audit-trail tracks that SSH lays.
 .Pp
 Once you give a limited-access box at least read access to the client systems
 it is supposed to monitor, you must write scripts to do the actual
 monitoring.
 Given an NFS mount, you can write scripts out of simple system
 utilities such as
 .Xr find 1
 and
 .Xr md5 1 .
 It is best to physically
 .Xr md5 1
 the client-box files at least once a
 day, and to test control files such as those found in
 .Pa /etc
 and
 .Pa /usr/local/etc
 even more often.
 When mismatches are found relative to the base MD5
 information the limited-access machine knows is valid, it should scream at
 a sysadmin to go check it out.
 A good security script will also check for
 inappropriate SUID binaries and for new or deleted files on system partitions
 such as
 .Pa /
 and
 .Pa /usr .
 .Pp
 When using SSH rather than NFS, writing the security script is much more
 difficult.
 You essentially have to
 .Xr scp 1
 the scripts to the client box in order to run them, making them visible, and
 for safety you also need to
 .Xr scp 1
 the binaries (such as
 .Xr find 1 )
 that those scripts use.
 The
 .Xr sshd 8
 daemon on the client box may already be compromised.
 All in all,
 using SSH may be necessary when running over unsecure links, but it is also a
 lot harder to deal with.
 .Pp
 A good security script will also check for changes to user and staff members'
 access configuration files:
 .Pa .rhosts , .shosts , .ssh/authorized_keys
 and so forth, files that might fall outside the purview of the MD5 check.
 .Pp
 If you have a huge amount of user disk space it may take too long to run
 through every file on those partitions.
 In this case, setting mount
 flags to disallow SUID binaries on those partitions is a good
 idea.
 The
 .Cm nosuid
 option
 (see
 .Xr mount 8 )
 is what you want to look into.
 I would scan them anyway at least once a
 week, since the object of this layer is to detect a break-in whether or
 not the break-in is effective.
 .Pp
 Process accounting
 (see
 .Xr accton 8 )
 is a relatively low-overhead feature of
 the operating system which I recommend using as a post-break-in evaluation
 mechanism.
 It is especially useful in tracking down how an intruder has
 actually broken into a system, assuming the file is still intact after
 the break-in occurs.
 .Pp
 Finally, security scripts should process the log files and the logs themselves
 should be generated in as secure a manner as possible \(em remote syslog can be
 very useful.
 An intruder tries to cover his tracks, and log files are critical
 to the sysadmin trying to track down the time and method of the initial
 break-in.
 One way to keep a permanent record of the log files is to run
 the system console to a serial port and collect the information on a
 continuing basis through a secure machine monitoring the consoles.
 .Sh PARANOIA
 A little paranoia never hurts.
 As a rule, a sysadmin can add any number
 of security features as long as they do not affect convenience, and
 can add security features that do affect convenience with some added
 thought.
 Even more importantly, a security administrator should mix it up
 a bit \(em if you use recommendations such as those given by this manual
 page verbatim, you give away your methodologies to the prospective
 attacker who also has access to this manual page.
 .Sh SPECIAL SECTION ON DoS ATTACKS
 This section covers Denial of Service attacks.
 A DoS attack is typically a packet attack.
 While there is not much you can do about modern spoofed
 packet attacks that saturate your network, you can generally limit the damage
 by ensuring that the attacks cannot take down your servers.
 .Bl -enum -offset indent
 .It
 Limiting server forks
 .It
 Limiting springboard attacks (ICMP response attacks, ping broadcast, etc.)
 .It
 Kernel Route Cache
 .El
 .Pp
 A common DoS attack is against a forking server that attempts to cause the
 server to eat processes, file descriptors, and memory until the machine
 dies.
 The
 .Xr inetd 8
 server
 has several options to limit this sort of attack.
 It should be noted that while it is possible to prevent a machine from going
 down it is not generally possible to prevent a service from being disrupted
 by the attack.
 Read the
 .Xr inetd 8
 manual page carefully and pay specific attention
 to the
 .Fl c , C ,
 and
 .Fl R
 options.
 Note that spoofed-IP attacks will circumvent
 the
 .Fl C
 option to
 .Xr inetd 8 ,
 so typically a combination of options must be used.
 Some standalone servers have self-fork-limitation parameters.
 .Pp
 The
 .Xr sendmail 8
 daemon has its
 .Fl OMaxDaemonChildren
 option which tends to work much
 better than trying to use
 .Xr sendmail 8 Ns 's
 load limiting options due to the
 load lag.
 You should specify a
 .Va MaxDaemonChildren
 parameter when you start
 .Xr sendmail 8
 high enough to handle your expected load but not so high that the
 computer cannot handle that number of
 .Nm sendmail Ns 's
 without falling on its face.
 It is also prudent to run
 .Xr sendmail 8
 in
 .Dq queued
 mode
 .Pq Fl ODeliveryMode=queued
 and to run the daemon
 .Pq Dq Nm sendmail Fl bd
 separate from the queue-runs
 .Pq Dq Nm sendmail Fl q15m .
 If you still want real-time delivery you can run the queue
 at a much lower interval, such as
 .Fl q1m ,
 but be sure to specify a reasonable
 .Va MaxDaemonChildren
 option for that
 .Xr sendmail 8
 to prevent cascade failures.
 .Pp
 The
 .Xr syslogd 8
 daemon can be attacked directly and it is strongly recommended that you use
 the
 .Fl s
 option whenever possible, and the
 .Fl a
 option otherwise.
 .Pp
 You should also be fairly careful
 with connect-back services such as tcpwrapper's reverse-identd, which can
 be attacked directly.
 You generally do not want to use the reverse-ident
 feature of tcpwrappers for this reason.
 .Pp
 It is a very good idea to protect internal services from external access
 by firewalling them off at your border routers.
 The idea here is to prevent
 saturation attacks from outside your LAN, not so much to protect internal
 services from network-based root compromise.
 Always configure an exclusive
 firewall, i.e.,
 .So
 firewall everything
 .Em except
 ports A, B, C, D, and M-Z
 .Sc .
 This
 way you can firewall off all of your low ports except for certain specific
 services such as
 .Xr talkd 8 ,
 .Xr sendmail 8 ,
 and other internet-accessible services.
 If you try to configure the firewall the other
 way \(em as an inclusive or permissive firewall, there is a good chance that you
 will forget to
 .Dq close
 a couple of services or that you will add a new internal
 service and forget to update the firewall.
 You can still open up the
 high-numbered port range on the firewall to allow permissive-like operation
 without compromising your low ports.
 Also take note that
 .Fx
 allows you to
 control the range of port numbers used for dynamic binding via the various
 .Va net.inet.ip.portrange
 sysctl's
 .Pq Dq Li "sysctl net.inet.ip.portrange" ,
 which can also
 ease the complexity of your firewall's configuration.
 I usually use a normal
 first/last range of 4000 to 5000, and a hiport range of 49152 to 65535, then
 block everything under 4000 off in my firewall
 (except for certain specific
 internet-accessible ports, of course).
 .Pp
 Another common DoS attack is called a springboard attack \(em to attack a server
 in a manner that causes the server to generate responses which then overload
 the server, the local network, or some other machine.
 The most common attack
 of this nature is the ICMP PING BROADCAST attack.
 The attacker spoofs ping
 packets sent to your LAN's broadcast address with the source IP address set
 to the actual machine they wish to attack.
 If your border routers are not
 configured to stomp on ping's to broadcast addresses, your LAN winds up
 generating sufficient responses to the spoofed source address to saturate the
 victim, especially when the attacker uses the same trick on several dozen
 broadcast addresses over several dozen different networks at once.
 Broadcast attacks of over a hundred and twenty megabits have been measured.
 A second common springboard attack is against the ICMP error reporting system.
 By
 constructing packets that generate ICMP error responses, an attacker can
 saturate a server's incoming network and cause the server to saturate its
 outgoing network with ICMP responses.
 This type of attack can also crash the
 server by running it out of
 .Vt mbuf Ns 's ,
 especially if the server cannot drain the
 ICMP responses it generates fast enough.
 The
 .Fx
 kernel has a new kernel
 compile option called
 .Dv ICMP_BANDLIM
 which limits the effectiveness of these
 sorts of attacks.
 The last major class of springboard attacks is related to
 certain internal
 .Xr inetd 8
 services such as the UDP echo service.
 An attacker
 simply spoofs a UDP packet with the source address being server A's echo port,
 and the destination address being server B's echo port, where server A and B
 are both on your LAN.
 The two servers then bounce this one packet back and
 forth between each other.
 The attacker can overload both servers and their
 LANs simply by injecting a few packets in this manner.
 Similar problems
 exist with the internal chargen port.
 A competent sysadmin will turn off all
 of these
 .Xr inetd 8 Ns -internal
 test services.
 .Sh ACCESS ISSUES WITH KERBEROS AND SSH
 There are a few issues with both Kerberos and SSH that need to be addressed
 if you intend to use them.
 Kerberos5 is an excellent authentication
 protocol but the kerberized
 .Xr telnet 1
 sucks rocks.
 There are bugs that make it unsuitable for dealing with binary streams.
 Also, by default
 Kerberos does not encrypt a session unless you use the
 .Fl x
 option.
 SSH encrypts everything by default.
 .Pp
 SSH works quite well in every respect except when it is set up to
 forward encryption keys.
 What this means is that if you have a secure workstation holding
 keys that give you access to the rest of the system, and you
 .Xr ssh 1
 to an
 unsecure machine, your keys become exposed.
 The actual keys themselves are
 not exposed, but
 .Xr ssh 1
 installs a forwarding port for the duration of your
 login and if an attacker has broken root on the unsecure machine he can utilize
 that port to use your keys to gain access to any other machine that your
 keys unlock.
 .Pp
 We recommend that you use SSH in combination with Kerberos whenever possible
 for staff logins.
 SSH can be compiled with Kerberos support.
 This reduces
 your reliance on potentially exposable SSH keys while at the same time
 protecting passwords via Kerberos.
 SSH keys
 should only be used for automated tasks from secure machines (something
 that Kerberos is unsuited to).
 We also recommend that you either turn off
 key-forwarding in the SSH configuration, or that you make use of the
 .Va from Ns = Ns Ar IP/DOMAIN
 option that SSH allows in its
 .Pa authorized_keys
 file to make the key only usable to entities logging in from specific
 machines.
 .Sh KNOBS AND TWEAKS
 .Fx
 provides several knobs and tweak handles that make some introspection
 information access more restricted.
 Some people consider this as improving system security, so the knobs are
 briefly listed here, together with controls which enable some mitigations
 of the hardware state leaks.
 .Pp
 Hardware mitigation sysctl knobs described below have been moved under
 .Pa machdep.mitigations ,
 with backwards-compatibility shims to accept the existing names.
 A future change will rationalize the sense of the individual sysctls
 (so that enabled / true always indicates that the mitigation is active).
 For that reason the previous names remain the canonical way to set the
 mitigations, and are documented here.
 Backwards compatibility shims for the interim sysctls under
 .Pa machdep.mitigations
 will not be added.
 .Bl -tag -width security.bsd.unprivileged_proc_debug
 .It Dv security.bsd.see_other_uids
 Controls visibility of processes owned by different uid.
 The knob directly affects the
 .Dv kern.proc
 sysctls filtering of data, which results in restricted output from
 utilities like
 .Xr ps 1 .
 .It Dv security.bsd.see_other_gids
 Same, for processes owned by different gid.
 .It Dv security.bsd.see_jail_proc
 Same, for processes belonging to a jail.
 .It Dv security.bsd.conservative_signals
 When enabled, unprivileged users are only allowed to send job control
 and usual termination signals like
 .Dv SIGKILL ,
 .Dv SIGINT ,
 and
 .Dv SIGTERM ,
 to the processes executing programs with changed uids.
 .It Dv security.bsd.unprivileged_proc_debug
 Controls availability of the process debugging facilities to non-root users.
 See also
 .Xr proccontrol 1
 mode
 .Dv trace .
 .It Dv vm.pmap.pti
 Tunable, amd64-only.
 Enables mode of operation of virtual memory system where usermode page
 tables are sanitized to prevent so-called Meltdown information leak on
 some Intel CPUs.
 By default, the system detects whether the CPU needs the workaround,
 and enables it automatically.
 See also
 .Xr proccontrol 1
 mode
 .Dv kpti .
+.It Dv machdep.mitigations.flush_rsb_ctxsw
+amd64.
+Controls Return Stack Buffer flush on context switch, to prevent
+cross-process ret2spec attacks.
+Only needed, and only enabled by default, if the machine
+supports SMEP; otherwise IBRS would do the necessary flushing on kernel
+entry anyway.
 .It Dv hw.mds_disable
 amd64 and i386.
 Controls Microarchitectural Data Sampling hardware information leak
 mitigation.
 .It Dv hw.spec_store_bypass_disable
 amd64 and i386.
 Controls Speculative Store Bypass hardware information leak mitigation.
 .It Dv hw.ibrs_disable
 amd64 and i386.
 Controls Indirect Branch Restricted Speculation hardware information leak
 mitigation.
 .It Dv machdep.syscall_ret_l1d_flush
 amd64.
 Controls force-flush of L1D cache on return from syscalls which report
 errors other than
 .Ev EEXIST ,
 .Ev EAGAIN ,
 .Ev EXDEV ,
 .Ev ENOENT ,
 .Ev ENOTCONN ,
 and
 .Ev EINPROGRESS .
 This is mostly a paranoid setting added to prevent hypothetical exploitation
 of unknown gadgets for unknown hardware issues.
 The error codes exclusion list is composed of the most common errors which
 typically occur on normal system operation.
 .It Dv machdep.nmi_flush_l1d_sw
 amd64.
 Controls force-flush of L1D cache on NMI;
 this provides software assist for bhyve mitigation of L1 terminal fault
 hardware information leak.
 .It Dv hw.vmm.vmx.l1d_flush
 amd64.
 Controls the mitigation of L1 Terminal Fault in bhyve hypervisor.
 .It Dv vm.pmap.allow_2m_x_ept
 amd64.
 Allows the use of superpages for executable mappings under the EPT
 page table format used by hypervisors on Intel CPUs to map the guest
 physical address space to machine physical memory.
 May be disabled to work around a CPU Erratum called
 Machine Check Error Avoidance on Page Size Change.
 .It Dv kern.elf32.aslr.enable
 Controls system-global Address Space Layout Randomization (ASLR) for
 normal non-PIE (Position Independent Executable) 32bit binaries.
 See also
 .Xr proccontrol 1
 mode
 .Dv aslr ,
 also affected by the per-image control note flag.
 .It Dv kern.elf32.aslr.pie_enable
 Controls system-global Address Space Layout Randomization for
 position-independent (PIE) 32bit binaries.
 .It Dv kern.elf32.aslr.honor_sbrk
 Makes ASLR less aggressive and more compatible with old binaries
 relying on the sbrk area.
 .It Dv kern.elf32.aslr.aslr_stack_gap
 If ASLR is enabled for a binary, a non-zero value creates a randomized
 stack gap between strings and the end of the aux vector.
 The value is the maximum percentage of main stack to waste on the gap.
 Cannot be greater than 50, i.e., at most half of the stack.
 .It Dv kern.elf64.aslr.enable
 64bit binaries ASLR control.
 .It Dv kern.elf64.aslr.pie_enable
 64bit PIE binaries ASLR control.
 .It Dv kern.elf64.aslr.honor_sbrk
 64bit binaries ASLR sbrk compatibility control.
 .It Dv kern.elf64.aslr.aslr_stack_gap
 Controls stack gap for 64bit binaries.
 .It Dv kern.elf32.nxstack
 Enables non-executable stack for 32bit processes.
 Enabled by default if supported by hardware and corresponding binary.
 .It Dv kern.elf64.nxstack
 Enables non-executable stack for 64bit processes.
 .El
 .Sh SEE ALSO
 .Xr chflags 1 ,
 .Xr find 1 ,
 .Xr md5 1 ,
 .Xr netstat 1 ,
 .Xr openssl 1 ,
 .Xr proccontrol 1 ,
 .Xr ps 1 ,
 .Xr ssh 1 ,
 .Xr xdm 1 Pq Pa ports/x11/xorg-clients ,
 .Xr group 5 ,
 .Xr ttys 5 ,
 .Xr accton 8 ,
 .Xr init 8 ,
 .Xr sshd 8 ,
 .Xr sysctl 8 ,
 .Xr syslogd 8 ,
 .Xr vipw 8
 .Sh HISTORY
 The
 .Nm
 manual page was originally written by
 .An Matthew Dillon
 and first appeared
 in
 .Fx 3.1 ,
 December 1998.
Index: head/sys/amd64/amd64/cpu_switch.S
===================================================================
--- head/sys/amd64/amd64/cpu_switch.S	(revision 361301)
+++ head/sys/amd64/amd64/cpu_switch.S	(revision 361302)
@@ -1,498 +1,500 @@
 /*-
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 
 #include "assym.inc"
 #include "opt_sched.h"
 
 /*****************************************************************************/
 /* Scheduling                                                                */
 /*****************************************************************************/
 
 	.text
 
 /*
  * cpu_throw()
  *
  * This is the second half of cpu_switch(). It is used when the current
  * thread is either a dummy or slated to die, and we no longer care
  * about its state.  This is only a slight optimization and is probably
  * not worth it anymore.  Note that we need to clear the pm_active bits so
  * we do need the old proc if it still exists.
  * %rdi = oldtd
  * %rsi = newtd
  */
 ENTRY(cpu_throw)
 	/*
 	 * Nothing of oldtd (%rdi) is saved here; only newtd is set up the
 	 * way the sw1 entry point inside cpu_switch expects it.
 	 */
 	movq	%rsi,%r12			/* sw1 reads newtd from %r12 */
 	movq	%rsi,%rdi			/* pmap_activate_sw(newtd) */
 	call	pmap_activate_sw
 	jmp	sw1				/* continue in cpu_switch's load half */
 END(cpu_throw)
 
 /*
  * cpu_switch(old, new, mtx)
  *
  * Save the current thread state, then select the next thread to run
  * and load its state.
  * %rdi = oldtd
  * %rsi = newtd
  * %rdx = mtx
  */
 ENTRY(cpu_switch)
 	/* Switch to new thread.  First, save context. */
 	leaq	TD_MD_PCB(%rdi),%r8		/* %r8 = address of oldtd's PCB */
 
 	/* The word at (%rsp) on entry is the return address; it is kept as PCB_RIP. */
 	movq	(%rsp),%rax			/* Hardware registers */
 	movq	%r15,PCB_R15(%r8)
 	movq	%r14,PCB_R14(%r8)
 	movq	%r13,PCB_R13(%r8)
 	movq	%r12,PCB_R12(%r8)
 	movq	%rbp,PCB_RBP(%r8)
 	movq	%rsp,PCB_RSP(%r8)
 	movq	%rbx,PCB_RBX(%r8)
 	movq	%rax,PCB_RIP(%r8)
 
 	/*
 	 * If PCB_FULL_IRET is not yet set, set it.  Then, for a non-kthread
 	 * on a CPU with FSGSBASE, record the current user segment bases in
 	 * the PCB: fsbase only if %fs holds KUF32SEL, gsbase only if %gs
 	 * holds KUG32SEL.  Every other case skips straight to label 2.
 	 */
 	testl	$PCB_FULL_IRET,PCB_FLAGS(%r8)
 	jnz	2f
 	orl	$PCB_FULL_IRET,PCB_FLAGS(%r8)
 	testl	$TDP_KTHREAD,TD_PFLAGS(%rdi)
 	jnz	2f
 	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
 	jz	2f
 	movl	%fs,%eax
 	cmpl	$KUF32SEL,%eax
 	jne	1f
 	rdfsbase %rax
 	movq	%rax,PCB_FSBASE(%r8)
 1:	movl	%gs,%eax
 	cmpl	$KUG32SEL,%eax
 	jne	2f
 	/* rdmsr overwrites %edx; park mtx in %r12 (already saved in the PCB). */
 	movq	%rdx,%r12
 	movl	$MSR_KGSBASE,%ecx		/* Read user gs base */
 	rdmsr
 	shlq	$32,%rdx			/* combine %edx:%eax into %rax */
 	orq	%rdx,%rax
 	movq	%rax,PCB_GSBASE(%r8)
 	movq	%r12,%rdx			/* mtx back in %rdx */
 
 2:
 	testl	$PCB_DBREGS,PCB_FLAGS(%r8)
 	jnz	store_dr			/* static predict not taken */
 done_store_dr:
 
 	/* have we used fp, and need a save? */
 	cmpq	%rdi,PCPU(FPCURTHREAD)
 	jne	2f
 	/* %r8 now points at the FPU save area; sw1 reloads it with the new PCB. */
 	movq	PCB_SAVEFPU(%r8),%r8
 	clts
 	cmpl	$0,use_xsave(%rip)
 	jne	1f
 	fxsave	(%r8)
 	jmp	2f
 	/* xsave takes its mask in %edx:%eax, so mtx is kept in %rcx meanwhile. */
 1:	movq	%rdx,%rcx
 	movl	xsave_mask,%eax
 	movl	xsave_mask+4,%edx
 	.globl	ctx_switch_xsave
 ctx_switch_xsave:
 	/* This is patched to xsaveopt if supported, see fpuinit_bsp1() */
 	xsave	(%r8)
 	movq	%rcx,%rdx
 2:
 	/* Save is done.  Now fire up new thread. Leave old vmspace. */
 	movq	%rsi,%r12			/* %r12 = newtd */
 	movq	%rdi,%r13			/* %r13 = oldtd */
 	movq	%rdx,%r15			/* %r15 = mtx */
 	movq	%rsi,%rdi			/* pmap_activate_sw(newtd) */
 	callq	pmap_activate_sw
 	movq	%r15,TD_LOCK(%r13)		/* Release the old thread */
 sw1:
 	/* cpu_throw joins here with newtd in %r12. */
 	leaq	TD_MD_PCB(%r12),%r8		/* %r8 = address of newtd's PCB */
 #if defined(SCHED_ULE) && defined(SMP)
 	/* While newtd's td_lock equals &blocked_lock, spin in sw1wait. */
 	movq	$blocked_lock, %rdx
 	movq	TD_LOCK(%r12),%rcx
 	cmpq	%rcx, %rdx
 	je	sw1wait
 sw1cont:
 #endif
 	/*
 	 * At this point, we've switched address spaces and are ready
 	 * to load up the rest of the next context.
 	 */
 
 	/* Skip loading LDT and user fsbase/gsbase for kthreads */
 	testl	$TDP_KTHREAD,TD_PFLAGS(%r12)
 	jnz	do_kthread
 
 	/*
 	 * Load ldt register
 	 *
 	 * A process with a non-zero MD_LDT field goes through do_ldt, which
 	 * comes back to ld_ldt with LDTSEL in %eax; otherwise selector 0 is
 	 * loaded.
 	 */
 	movq	TD_PROC(%r12),%rcx
 	cmpq	$0, P_MD+MD_LDT(%rcx)
 	jne	do_ldt
 	xorl	%eax,%eax
 ld_ldt:	lldt	%ax
 
 	/*
 	 * Restore fs base in GDT
 	 *
 	 * Only the low 32 bits of PCB_FSBASE are used; they are scattered
 	 * into bytes 2-3, 4 and 7 of the descriptor that PCPU(FS32P) points to.
 	 */
 	movl	PCB_FSBASE(%r8),%eax
 	movq	PCPU(FS32P),%rdx
 	movw	%ax,2(%rdx)
 	shrl	$16,%eax
 	movb	%al,4(%rdx)
 	shrl	$8,%eax
 	movb	%al,7(%rdx)
 
 	/* Restore gs base in GDT (same layout, via PCPU(GS32P)) */
 	movl	PCB_GSBASE(%r8),%eax
 	movq	PCPU(GS32P),%rdx
 	movw	%ax,2(%rdx)
 	shrl	$16,%eax
 	movb	%al,4(%rdx)
 	shrl	$8,%eax
 	movb	%al,7(%rdx)
 
 do_kthread:
 	/*
 	 * Do we need to reload tss ?
 	 *
 	 * The wanted TSS is PCB_TSSP, or PCPU(PRVSPACE) + PC_COMMONTSS when
 	 * PCB_TSSP is zero; do_tss runs only if it differs from PCPU(TSSP).
 	 */
 	movq	PCPU(TSSP),%rax
 	movq	PCB_TSSP(%r8),%rdx
 	movq	PCPU(PRVSPACE),%r13
 	addq	$PC_COMMONTSS,%r13
 	testq	%rdx,%rdx
 	cmovzq	%r13,%rdx
 	cmpq	%rax,%rdx
 	jne	do_tss
 done_tss:
 	movq	TD_MD_STACK_BASE(%r12),%r9
 	movq	%r9,PCPU(RSP0)
 	movq	%r8,PCPU(CURPCB)
 	/*
 	 * TSS rsp0 gets PCPU(PTI_RSP0), except that the thread's stack base
 	 * (%r9) is used instead when PCPU(UCR3) is ~0.
 	 */
 	movq	PCPU(PTI_RSP0),%rax
 	cmpq	$~0,PCPU(UCR3)
 	cmove	%r9,%rax
 	movq	%rax,TSS_RSP0(%rdx)
 	movq	%r12,PCPU(CURTHREAD)		/* into next thread */
 
 	/* Test if debug registers should be restored. */
 	testl	$PCB_DBREGS,PCB_FLAGS(%r8)
 	jnz	load_dr				/* static predict not taken */
 done_load_dr:
 
 	/* Restore context. */
 	movq	PCB_R15(%r8),%r15
 	movq	PCB_R14(%r8),%r14
 	movq	PCB_R13(%r8),%r13
 	movq	PCB_R12(%r8),%r12
 	movq	PCB_RBP(%r8),%rbp
 	movq	PCB_RSP(%r8),%rsp
 	movq	PCB_RBX(%r8),%rbx
 	movq	PCB_RIP(%r8),%rax
 	movq	%rax,(%rsp)			/* saved PCB_RIP is where we return to */
 	movq	PCPU(CURTHREAD),%rdi		/* fpu_activate_sw(curthread) */
 	call	fpu_activate_sw
 	/*
 	 * When cpu_flush_rsb_ctxsw is non-zero, leave through rsb_flush
 	 * (not defined in this part of the file) rather than returning
 	 * directly.  It is entered with jmp, so it is expected to execute
 	 * the final ret itself -- NOTE(review): confirm against rsb_flush.
 	 */
+	cmpb	$0,cpu_flush_rsb_ctxsw(%rip)
+	jne	rsb_flush
 	ret
 
 	/*
 	 * We order these strangely for several reasons.
 	 * 1: I wanted to use static branch prediction hints
 	 * 2: Most athlon64/opteron cpus don't have them.  They define
 	 *    a forward branch as 'predict not taken'.  Intel cores have
 	 *    the 'rep' prefix to invert this.
 	 * So, to make it work on both forms of cpu we do the detour.
 	 * We use jumps rather than call in order to avoid the stack.
 	 */
 
 	/*
 	 * Save %dr0-%dr3, %dr6 and %dr7 into the old PCB (%r8), then write
 	 * %dr7 back with only the bits in mask 0xfc00 kept.
 	 */
 store_dr:
 	movq	%dr7,%rax			/* yes, do the save */
 	movq	%dr0,%r15
 	movq	%dr1,%r14
 	movq	%dr2,%r13
 	movq	%dr3,%r12
 	movq	%dr6,%r11
 	movq	%r15,PCB_DR0(%r8)
 	movq	%r14,PCB_DR1(%r8)
 	movq	%r13,PCB_DR2(%r8)
 	movq	%r12,PCB_DR3(%r8)
 	movq	%r11,PCB_DR6(%r8)
 	movq	%rax,PCB_DR7(%r8)
 	andq	$0x0000fc00, %rax		/* disable all watchpoints */
 	movq	%rax,%dr7
 	jmp	done_store_dr
 
 	/*
 	 * Load the debug registers from the new PCB (%r8).  %dr7 is built
 	 * from the live register's 0xfc00 bits and the saved value's
 	 * remaining bits.
 	 */
 load_dr:
 	movq	%dr7,%rax
 	movq	PCB_DR0(%r8),%r15
 	movq	PCB_DR1(%r8),%r14
 	movq	PCB_DR2(%r8),%r13
 	movq	PCB_DR3(%r8),%r12
 	movq	PCB_DR6(%r8),%r11
 	movq	PCB_DR7(%r8),%rcx
 	movq	%r15,%dr0
 	movq	%r14,%dr1
 	/* Preserve reserved bits in %dr7 */
 	andq	$0x0000fc00,%rax
 	andq	$~0x0000fc00,%rcx
 	movq	%r13,%dr2
 	movq	%r12,%dr3
 	orq	%rcx,%rax
 	movq	%r11,%dr6
 	movq	%rax,%dr7
 	jmp	done_load_dr
 
 	/*
 	 * Record the new TSS address (%rdx) in PCPU(TSSP), spread it over
 	 * bytes 2-3, 4, 7 and 8-11 of the descriptor at PCPU(TSS), rewrite
 	 * descriptor byte 5, and reload the task register with TSSSEL.
 	 */
 do_tss:	movq	%rdx,PCPU(TSSP)
 	movq	%rdx,%rcx
 	movq	PCPU(TSS),%rax
 	movw	%cx,2(%rax)
 	shrq	$16,%rcx
 	movb	%cl,4(%rax)
 	shrq	$8,%rcx
 	movb	%cl,7(%rax)
 	shrq	$8,%rcx
 	movl	%ecx,8(%rax)
 	movb	$0x89,5(%rax)	/* unset busy */
 	movl	$TSSSEL,%eax
 	ltr	%ax
 	jmp	done_tss
 
 	/*
 	 * Copy the 16 bytes at P_MD+MD_LDT_SD of the process (%rcx) to the
 	 * location PCPU(LDT) points to, then return to ld_ldt with LDTSEL.
 	 */
 do_ldt:	movq	PCPU(LDT),%rax
 	movq	P_MD+MD_LDT_SD(%rcx),%rdx
 	movq	%rdx,(%rax)
 	movq	P_MD+MD_LDT_SD+8(%rcx),%rdx
 	movq	%rdx,8(%rax)
 	movl	$LDTSEL,%eax
 	jmp	ld_ldt
 END(cpu_switch)
 
 /*
  * savectx(pcb)
  * Update pcb, saving current processor state.
  */
 ENTRY(savectx)
 	/*
 	 * %rdi = pcb.  Returns 1 in %eax.  resumectx() restores PCB_RSP,
 	 * stores PCB_RIP at the top of that stack and returns 0, so a
 	 * resumed context reappears at this function's return address with
 	 * a different return value.
 	 */
 	/* Save caller's return address. */
 	movq	(%rsp),%rax
 	movq	%rax,PCB_RIP(%rdi)
 
 	/* %rbx, %rsp, %rbp and %r12-%r15. */
 	movq	%rbx,PCB_RBX(%rdi)
 	movq	%rsp,PCB_RSP(%rdi)
 	movq	%rbp,PCB_RBP(%rdi)
 	movq	%r12,PCB_R12(%rdi)
 	movq	%r13,PCB_R13(%rdi)
 	movq	%r14,PCB_R14(%rdi)
 	movq	%r15,PCB_R15(%rdi)
 
 	/* Control registers. */
 	movq	%cr0,%rax
 	movq	%rax,PCB_CR0(%rdi)
 	movq	%cr2,%rax
 	movq	%rax,PCB_CR2(%rdi)
 	movq	%cr3,%rax
 	movq	%rax,PCB_CR3(%rdi)
 	movq	%cr4,%rax
 	movq	%rax,PCB_CR4(%rdi)
 
 	/* Debug registers. */
 	movq	%dr0,%rax
 	movq	%rax,PCB_DR0(%rdi)
 	movq	%dr1,%rax
 	movq	%rax,PCB_DR1(%rdi)
 	movq	%dr2,%rax
 	movq	%rax,PCB_DR2(%rdi)
 	movq	%dr3,%rax
 	movq	%rax,PCB_DR3(%rdi)
 	movq	%dr6,%rax
 	movq	%rax,PCB_DR6(%rdi)
 	movq	%dr7,%rax
 	movq	%rax,PCB_DR7(%rdi)
 
 	/*
 	 * MSRs.  rdmsr returns the value in %edx:%eax; the low half is
 	 * stored at the PCB field and the high half 4 bytes after it.
 	 */
 	movl	$MSR_FSBASE,%ecx
 	rdmsr
 	movl	%eax,PCB_FSBASE(%rdi)
 	movl	%edx,PCB_FSBASE+4(%rdi)
 	movl	$MSR_GSBASE,%ecx
 	rdmsr
 	movl	%eax,PCB_GSBASE(%rdi)
 	movl	%edx,PCB_GSBASE+4(%rdi)
 	movl	$MSR_KGSBASE,%ecx
 	rdmsr
 	movl	%eax,PCB_KGSBASE(%rdi)
 	movl	%edx,PCB_KGSBASE+4(%rdi)
 	movl	$MSR_EFER,%ecx
 	rdmsr
 	movl	%eax,PCB_EFER(%rdi)
 	movl	%edx,PCB_EFER+4(%rdi)
 	movl	$MSR_STAR,%ecx
 	rdmsr
 	movl	%eax,PCB_STAR(%rdi)
 	movl	%edx,PCB_STAR+4(%rdi)
 	movl	$MSR_LSTAR,%ecx
 	rdmsr
 	movl	%eax,PCB_LSTAR(%rdi)
 	movl	%edx,PCB_LSTAR+4(%rdi)
 	movl	$MSR_CSTAR,%ecx
 	rdmsr
 	movl	%eax,PCB_CSTAR(%rdi)
 	movl	%edx,PCB_CSTAR+4(%rdi)
 	movl	$MSR_SF_MASK,%ecx
 	rdmsr
 	movl	%eax,PCB_SFMASK(%rdi)
 	movl	%edx,PCB_SFMASK+4(%rdi)
 
 	/* GDTR, IDTR, LDTR and the task register. */
 	sgdt	PCB_GDT(%rdi)
 	sidt	PCB_IDT(%rdi)
 	sldt	PCB_LDT(%rdi)
 	str	PCB_TR(%rdi)
 
 	movl	$1,%eax				/* return 1 */
 	ret
 END(savectx)
 
 /*
  * resumectx(pcb)
  * Resuming processor state from pcb.
  */     
 ENTRY(resumectx)
 	/*
 	 * %rdi = pcb, in the layout written by savectx().  Control does not
 	 * go back to the caller: the stack pointer is replaced by PCB_RSP
 	 * and the final ret goes to PCB_RIP with %eax = 0.
 	 */
 	/* Switch to KPML4phys. */
 	movq	KPML4phys,%rax
 	movq	%rax,%cr3
 
 	/* Force kernel segment registers. */
 	movl	$KDSEL,%eax
 	movw	%ax,%ds
 	movw	%ax,%es
 	movw	%ax,%ss
 	movl	$KUF32SEL,%eax
 	movw	%ax,%fs
 	movl	$KUG32SEL,%eax
 	movw	%ax,%gs
 
 	/* Segment base MSRs; wrmsr writes %edx:%eax to the MSR in %ecx. */
 	movl	$MSR_FSBASE,%ecx
 	movl	PCB_FSBASE(%rdi),%eax
 	movl	4 + PCB_FSBASE(%rdi),%edx
 	wrmsr
 	movl	$MSR_GSBASE,%ecx
 	movl	PCB_GSBASE(%rdi),%eax
 	movl	4 + PCB_GSBASE(%rdi),%edx
 	wrmsr
 	movl	$MSR_KGSBASE,%ecx
 	movl	PCB_KGSBASE(%rdi),%eax
 	movl	4 + PCB_KGSBASE(%rdi),%edx
 	wrmsr
 
 	/*
 	 * Restore EFER one more time.
 	 *
 	 * NOTE(review): %edx is not reloaded here, so the upper half written
 	 * is whatever was loaded for MSR_KGSBASE just above rather than
 	 * 4 + PCB_EFER -- confirm this is intended.
 	 */
 	movl	$MSR_EFER,%ecx
 	movl	PCB_EFER(%rdi),%eax
 	wrmsr
 
 	/* Restore fast syscall stuff. */
 	movl	$MSR_STAR,%ecx
 	movl	PCB_STAR(%rdi),%eax
 	movl	4 + PCB_STAR(%rdi),%edx
 	wrmsr
 	movl	$MSR_LSTAR,%ecx
 	movl	PCB_LSTAR(%rdi),%eax
 	movl	4 + PCB_LSTAR(%rdi),%edx
 	wrmsr
 	movl	$MSR_CSTAR,%ecx
 	movl	PCB_CSTAR(%rdi),%eax
 	movl	4 + PCB_CSTAR(%rdi),%edx
 	wrmsr
 	/*
 	 * NOTE(review): as with EFER, %edx still holds the upper half
 	 * loaded for MSR_CSTAR when MSR_SF_MASK is written -- confirm.
 	 */
 	movl	$MSR_SF_MASK,%ecx
 	movl	PCB_SFMASK(%rdi),%eax
 	wrmsr
 
 	/* Restore CR0, CR2, CR4 and CR3. */
 	movq	PCB_CR0(%rdi),%rax
 	movq	%rax,%cr0
 	movq	PCB_CR2(%rdi),%rax
 	movq	%rax,%cr2
 	movq	PCB_CR4(%rdi),%rax
 	movq	%rax,%cr4
 	movq	PCB_CR3(%rdi),%rax
 	movq	%rax,%cr3
 
 	/* Restore descriptor tables. */
 	lidt	PCB_IDT(%rdi)
 	lldt	PCB_LDT(%rdi)
 
 #define	SDT_SYSTSS	9
 #define	SDT_SYSBSY	11
 
 	/*
 	 * Clear "task busy" bit and reload TR.
 	 *
 	 * The mask (~11 | 9) has every bit set except bit 1, so the andb
 	 * clears only that bit in byte 5 of the descriptor at PCPU(TSS).
 	 */
 	movq	PCPU(TSS),%rax
 	andb	$(~SDT_SYSBSY | SDT_SYSTSS),5(%rax)
 	movw	PCB_TR(%rdi),%ax
 	ltr	%ax
 
 #undef	SDT_SYSTSS
 #undef	SDT_SYSBSY
 
 	/* Restore debug registers. */
 	movq	PCB_DR0(%rdi),%rax
 	movq	%rax,%dr0
 	movq	PCB_DR1(%rdi),%rax
 	movq	%rax,%dr1
 	movq	PCB_DR2(%rdi),%rax
 	movq	%rax,%dr2
 	movq	PCB_DR3(%rdi),%rax
 	movq	%rax,%dr3
 	movq	PCB_DR6(%rdi),%rax
 	movq	%rax,%dr6
 	movq	PCB_DR7(%rdi),%rax
 	movq	%rax,%dr7
 
 	/* Restore other callee saved registers. */
 	movq	PCB_R15(%rdi),%r15
 	movq	PCB_R14(%rdi),%r14
 	movq	PCB_R13(%rdi),%r13
 	movq	PCB_R12(%rdi),%r12
 	movq	PCB_RBP(%rdi),%rbp
 	movq	PCB_RSP(%rdi),%rsp
 	movq	PCB_RBX(%rdi),%rbx
 
 	/* Restore return address. */
 	movq	PCB_RIP(%rdi),%rax
 	movq	%rax,(%rsp)
 
 	xorl	%eax,%eax			/* return 0 */
 	ret
 END(resumectx)
 
 /*
  * Wait for the new thread to become unblocked
  *
  * Reached from sw1 in cpu_switch with %r12 = newtd and %rdx = $blocked_lock.
  * Re-reads newtd's TD_LOCK, with a pause between reads, for as long as it
  * still equals blocked_lock, then jumps back to sw1cont.
  */
 #if defined(SCHED_ULE) && defined(SMP)
 sw1wait:
 1:
 	pause
 	movq	TD_LOCK(%r12),%rcx
 	cmpq	%rcx, %rdx
 	je	1b
 	jmp	sw1cont
 #endif
Index: head/sys/amd64/amd64/initcpu.c
===================================================================
--- head/sys/amd64/amd64/initcpu.c	(revision 361301)
+++ head/sys/amd64/amd64/initcpu.c	(revision 361302)
@@ -1,307 +1,319 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) KATO Takenori, 1997, 1998.
  * 
  * All rights reserved.  Unpublished rights reserved under the copyright
  * laws of Japan.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_cpu.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/pcpu.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 /* Set to 1 by initializecpu() when both CPUID_XMM and CPUID_FXSR are present. */
 static int	hw_instruction_sse;
 SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD,
     &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU");
 /* One-shot flag: init_amd() picks the sharedpage default only on its first run. */
 static int	lower_sharedpage_init;
 /* Set to 1 by init_amd() on family 17h/18h CPUs. */
 int		hw_lower_amd64_sharedpage;
 SYSCTL_INT(_hw, OID_AUTO, lower_amd64_sharedpage, CTLFLAG_RDTUN,
     &hw_lower_amd64_sharedpage, 0,
    "Lower sharedpage to work around Ryzen issue with executing code near the top of user memory");
 /*
  * Value of the hw.clflush_disable tunable, read in initializecpucache():
  * -1: automatic (default)
  *  0: keep CLFLUSH enabled
  *  1: force disable CLFLUSH
  */
 static int	hw_clflush_disable = -1;
 
 static void
 init_amd(void)
 {
 	uint64_t msr;
 
 	/*
 	 * Work around Erratum 721 for Family 10h and 12h processors.
 	 * These processors may incorrectly update the stack pointer
 	 * after a long series of push and/or near-call instructions,
 	 * or a long series of pop and/or near-return instructions.
 	 *
 	 * http://support.amd.com/us/Processor_TechDocs/41322_10h_Rev_Gd.pdf
 	 * http://support.amd.com/us/Processor_TechDocs/44739_12h_Rev_Gd.pdf
 	 *
 	 * Hypervisors do not provide access to the errata MSR,
 	 * causing #GP exception on attempt to apply the errata.  The
 	 * MSR write shall be done on host and persist globally
 	 * anyway, so do not try to do it when under virtualization.
 	 */
 	switch (CPUID_TO_FAMILY(cpu_id)) {
 	case 0x10:
 	case 0x12:
 		if ((cpu_feature2 & CPUID2_HV) == 0)
 			wrmsr(0xc0011029, rdmsr(0xc0011029) | 1);
 		break;
 	}
 
 	/*
 	 * BIOS may fail to set InitApicIdCpuIdLo to 1 as it should per BKDG.
 	 * So, do it here or otherwise some tools could be confused by
 	 * Initial Local APIC ID reported with CPUID Function 1 in EBX.
 	 */
 	if (CPUID_TO_FAMILY(cpu_id) == 0x10) {
 		if ((cpu_feature2 & CPUID2_HV) == 0) {
 			msr = rdmsr(MSR_NB_CFG1);
 			msr |= (uint64_t)1 << 54;
 			wrmsr(MSR_NB_CFG1, msr);
 		}
 	}
 
 	/*
 	 * BIOS may configure Family 10h processors to convert WC+ cache type
 	 * to CD.  That can hurt performance of guest VMs using nested paging.
 	 * The relevant MSR bit is not documented in the BKDG,
 	 * the fix is borrowed from Linux.
 	 */
 	if (CPUID_TO_FAMILY(cpu_id) == 0x10) {
 		if ((cpu_feature2 & CPUID2_HV) == 0) {
 			msr = rdmsr(0xc001102a);
 			msr &= ~((uint64_t)1 << 24);
 			wrmsr(0xc001102a, msr);
 		}
 	}
 
 	/*
 	 * Work around Erratum 793: Specific Combination of Writes to Write
 	 * Combined Memory Types and Locked Instructions May Cause Core Hang.
 	 * See Revision Guide for AMD Family 16h Models 00h-0Fh Processors,
 	 * revision 3.04 or later, publication 51810.
 	 */
 	if (CPUID_TO_FAMILY(cpu_id) == 0x16 && CPUID_TO_MODEL(cpu_id) <= 0xf) {
 		if ((cpu_feature2 & CPUID2_HV) == 0) {
 			msr = rdmsr(MSR_LS_CFG);
 			msr |= (uint64_t)1 << 15;
 			wrmsr(MSR_LS_CFG, msr);
 		}
 	}
 
 	/* Ryzen erratas. */
 	if (CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1 &&
 	    (cpu_feature2 & CPUID2_HV) == 0) {
 		/* 1021 */
 		msr = rdmsr(0xc0011029);
 		msr |= 0x2000;
 		wrmsr(0xc0011029, msr);
 
 		/* 1033 */
 		msr = rdmsr(MSR_LS_CFG);
 		msr |= 0x10;
 		wrmsr(MSR_LS_CFG, msr);
 
 		/* 1049 */
 		msr = rdmsr(0xc0011028);
 		msr |= 0x10;
 		wrmsr(0xc0011028, msr);
 
 		/* 1095 */
 		msr = rdmsr(MSR_LS_CFG);
 		msr |= 0x200000000000000;
 		wrmsr(MSR_LS_CFG, msr);
 	}
 
 	/*
 	 * Work around a problem on Ryzen that is triggered by executing
 	 * code near the top of user memory, in our case the signal
 	 * trampoline code in the shared page on amd64.
 	 *
 	 * This function is executed once for the BSP before tunables take
 	 * effect so the value determined here can be overridden by the
 	 * tunable.  This function is then executed again for each AP and
 	 * also on resume.  Set a flag the first time so that value set by
 	 * the tunable is not overwritten.
 	 *
 	 * The stepping and/or microcode versions should be checked after
 	 * this issue is fixed by AMD so that we don't use this mode if not
 	 * needed.
 	 */
 	if (lower_sharedpage_init == 0) {
 		lower_sharedpage_init = 1;
 		if (CPUID_TO_FAMILY(cpu_id) == 0x17 ||
 		    CPUID_TO_FAMILY(cpu_id) == 0x18) {
 			hw_lower_amd64_sharedpage = 1;
 		}
 	}
 }
 
 /*
  * Initialize special VIA features
  */
 static void
 init_via(void)
 {
 	u_int leaf[4], caps;
 
 	/*
 	 * PadLock capabilities are reported in %edx of extended CPUID leaf
 	 * 0xc0000001; do nothing if the CPU does not implement that leaf.
 	 *
 	 * http://www.via.com.tw/en/downloads/whitepapers/initiatives/padlock/programming_guide.pdf
 	 */
 	do_cpuid(0xc0000000, leaf);
 	if (leaf[0] < 0xc0000001)
 		return;
 	do_cpuid(0xc0000001, leaf);
 	caps = leaf[3];
 
 	/* Random number generator: record it and switch it on. */
 	if ((caps & VIA_CPUID_HAS_RNG) != 0) {
 		via_feature_rng = VIA_HAS_RNG;
 		wrmsr(0x110B, rdmsr(0x110B) | VIA_CPUID_DO_RNG);
 	}
 
 	/* Collect the PadLock units that are present ... */
 	if ((caps & VIA_CPUID_HAS_ACE) != 0)
 		via_feature_xcrypt |= VIA_HAS_AES;
 	if ((caps & VIA_CPUID_HAS_ACE2) != 0)
 		via_feature_xcrypt |= VIA_HAS_AESCTR;
 	if ((caps & VIA_CPUID_HAS_PHE) != 0)
 		via_feature_xcrypt |= VIA_HAS_SHA;
 	if ((caps & VIA_CPUID_HAS_PMM) != 0)
 		via_feature_xcrypt |= VIA_HAS_MM;
 
 	/* ... and enable PadLock if there is at least one. */
 	if (via_feature_xcrypt != 0)
 		wrmsr(0x1107, rdmsr(0x1107) | (1 << 28));
 }
 
 /*
  * Initialize CPU control registers
  *
  * Builds the %cr4 value from the detected CPU features, turns on NX
  * through EFER on the BSP when available, recalculates the IBRS, SSB
  * and syscall-return L1D flush settings, runs the vendor-specific
  * setup and programs MSR_TSC_AUX with the CPU id when RDTSCP or RDPID
  * is present.
  */
 void
 initializecpu(void)
 {
 	uint64_t msr;
 	uint32_t cr4;
 
 	cr4 = rcr4();
 	if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) {
 		cr4 |= CR4_FXSR | CR4_XMM;
 		cpu_fxsr = hw_instruction_sse = 1;
 	}
 	if (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE)
 		cr4 |= CR4_FSGSBASE;
 
 	if (cpu_stdext_feature2 & CPUID_STDEXT2_PKU)
 		cr4 |= CR4_PKE;
 
 	/*
 	 * NOTE(review): the tunable fetched below is spelled
 	 * "machdep.mitigations.cpu_flush_rsb_ctxsw", while security(7)
 	 * documents the knob as machdep.mitigations.flush_rsb_ctxsw.
 	 * Confirm that the two names are meant to differ.
 	 *
 	 * Because of && short-circuiting the tunable is looked up only
 	 * when SMEP is present, and the flush is turned on by default
 	 * only when that lookup reports no value (presumably meaning the
 	 * tunable is unset -- verify) and hw_ibrs_disable is non-zero.
 	 */
 	/*
+	 * If SMEP is present, we only need to flush RSB (by default)
+	 * on context switches, to prevent cross-process ret2spec
+	 * attacks.  Do it automatically if ibrs_disable is set, to
+	 * complete the mitigation.
+	 *
 	 * Postpone enabling the SMEP on the boot CPU until the page
 	 * tables are switched from the boot loader identity mapping
 	 * to the kernel tables.  The boot loader enables the U bit in
 	 * its tables.
 	 */
-	if (!IS_BSP()) {
+	if (IS_BSP()) {
+		if (cpu_stdext_feature & CPUID_STDEXT_SMEP &&
+		    !TUNABLE_INT_FETCH(
+		    "machdep.mitigations.cpu_flush_rsb_ctxsw",
+		    &cpu_flush_rsb_ctxsw) &&
+		    hw_ibrs_disable)
+			cpu_flush_rsb_ctxsw = 1;
+	} else {
 		if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
 			cr4 |= CR4_SMEP;
 		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
 			cr4 |= CR4_SMAP;
 	}
 	load_cr4(cr4);
 	/* NX: set EFER_NXE and publish PG_NX through pg_nx, on the BSP only. */
 	if (IS_BSP() && (amd_feature & AMDID_NX) != 0) {
 		msr = rdmsr(MSR_EFER) | EFER_NXE;
 		wrmsr(MSR_EFER, msr);
 		pg_nx = PG_NX;
 	}
 	hw_ibrs_recalculate(false);
 	hw_ssb_recalculate(false);
 	amd64_syscall_ret_flush_l1d_recalc();
 	/* Vendor-specific quirks; other vendors need none here. */
 	switch (cpu_vendor_id) {
 	case CPU_VENDOR_AMD:
 	case CPU_VENDOR_HYGON:
 		init_amd();
 		break;
 	case CPU_VENDOR_CENTAUR:
 		init_via();
 		break;
 	}
 
 	/* With RDTSCP or RDPID available, store this CPU's id in MSR_TSC_AUX. */
 	if ((amd_feature & AMDID_RDTSCP) != 0 ||
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0)
 		wrmsr(MSR_TSC_AUX, PCPU_GET(cpuid));
 }
 
 void
 initializecpucache(void)
 {
 
 	/*
 	 * CPUID leaf 1 reports the CLFLUSH line size in bits 15-8 of %ebx,
 	 * in units of 8 bytes.
 	 */
 	if ((cpu_feature & CPUID_CLFSH) != 0)
 		cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8;
 
 	TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable);
 
 	/*
 	 * Stop the kernel from using CLFLUSH{,OPT} in two cases:
 	 *
 	 * - hw.clflush_disable is 1, i.e. disabled manually;
 	 *
 	 * - hw.clflush_disable was left at -1 (automatic) and we run as a
 	 *   guest.  XXXKIB: (temporary) hack to work around traps generated
 	 *   when CLFLUSHing APIC register window under virtualization
 	 *   environments.  These environments tend to disable the CPUID_SS
 	 *   feature even though the native CPU supports it.
 	 */
 	if (hw_clflush_disable == 1 ||
 	    (hw_clflush_disable == -1 && vm_guest != VM_GUEST_NO)) {
 		cpu_feature &= ~CPUID_CLFSH;
 		cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT;
 	}
 }
Index: head/sys/amd64/amd64/support.S
===================================================================
--- head/sys/amd64/amd64/support.S	(revision 361301)
+++ head/sys/amd64/amd64/support.S	(revision 361302)
@@ -1,1956 +1,1959 @@
 /*-
  * Copyright (c) 2018-2019 The FreeBSD Foundation
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1993 The Regents of the University of California.
  * All rights reserved.
  *
  * Portions of this software were developed by
  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_ddb.h"
 
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 #include <machine/pmap.h>
 
 #include "assym.inc"
 
 	.text
 
 /*
  * Zero one page (PAGE_SIZE bytes) using 8-byte string stores.
  * Address: %rdi
  */
 ENTRY(pagezero_std)
 	PUSH_FRAME_POINTER
 	movl	$PAGE_SIZE/8,%ecx	/* number of 8-byte words in a page */
 	xorl	%eax,%eax		/* value to store: zero */
 	rep
 	stosq
 	POP_FRAME_POINTER
 	ret
 END(pagezero_std)
 
 /*
  * Zero one page (PAGE_SIZE bytes) using byte string stores.
  * Address: %rdi, same as pagezero_std.
  */
 ENTRY(pagezero_erms)
 	PUSH_FRAME_POINTER
 	movl	$PAGE_SIZE,%ecx		/* byte count */
 	xorl	%eax,%eax		/* value to store: zero */
 	rep
 	stosb
 	POP_FRAME_POINTER
 	ret
 END(pagezero_erms)
 
 /*
  * pagecopy(%rdi=from, %rsi=to)
  *
  * Copy one page (PAGE_SIZE bytes) in 8-byte words.
  */
 ENTRY(pagecopy)
 	PUSH_FRAME_POINTER
 	movl	$PAGE_SIZE/8,%ecx	/* number of 8-byte words in a page */
 	/* movsq reads from %rsi and writes to %rdi: swap the arguments. */
 	movq	%rdi,%r9
 	movq	%rsi,%rdi
 	movq	%r9,%rsi
 	rep
 	movsq
 	POP_FRAME_POINTER
 	ret
 END(pagecopy)
 
 /*
  * Zero one page (PAGE_SIZE bytes) with movnti stores, 32 bytes per
  * loop iteration, followed by an sfence.
  * Address: %rdi
  */
 ENTRY(sse2_pagezero)
 	PUSH_FRAME_POINTER
 	movq	$-PAGE_SIZE,%rdx	/* index runs from -PAGE_SIZE up to 0 */
 	subq	%rdx,%rdi		/* %rdi now points at the end of the page */
 	xorl	%eax,%eax
 	jmp	1f
 	/*
 	 * The loop takes 29 bytes.  Ensure that it doesn't cross a 32-byte
 	 * cache line.
 	 */
 	.p2align 5,0x90
 1:
 	movnti	%rax,(%rdi,%rdx)
 	movnti	%rax,8(%rdi,%rdx)
 	movnti	%rax,16(%rdi,%rdx)
 	movnti	%rax,24(%rdi,%rdx)
 	addq	$32,%rdx
 	jne	1b			/* done once the index reaches zero */
 	sfence
 	POP_FRAME_POINTER
 	ret
 END(sse2_pagezero)
 
 /*
  * memcmp(b1, b2, len)
  *	  rdi,rsi,rdx
  *
  * Returns 0 in %eax when the buffers are equal.  Otherwise returns the
  * difference between the first pair of differing bytes, each
  * zero-extended before the subtraction.
  *
  * Local labels are named after the length range they handle, e.g.
  * 100816 covers 8..16 bytes and 101632 covers 17..32 bytes.  The short
  * ranges are compared with two possibly overlapping loads: one from
  * the start and one from the end of the buffer.
  */
 ENTRY(memcmp)
 	PUSH_FRAME_POINTER
 
 	xorl	%eax,%eax		/* result for the "equal" exits */
 10:
 	cmpq	$16,%rdx
 	ja	101632f
 
 100816:
 	/* 8..16 bytes */
 	cmpb	$8,%dl
 	jl	100408f
 	movq	(%rdi),%r8
 	movq	(%rsi),%r9
 	cmpq	%r8,%r9
 	jne	80f
 	movq	-8(%rdi,%rdx),%r8
 	movq	-8(%rsi,%rdx),%r9
 	cmpq	%r8,%r9
 	jne	10081608f
 	POP_FRAME_POINTER
 	ret
 100408:
 	/* 4..7 bytes */
 	cmpb	$4,%dl
 	jl	100204f
 	movl	(%rdi),%r8d
 	movl	(%rsi),%r9d
 	cmpl	%r8d,%r9d
 	jne	80f
 	movl	-4(%rdi,%rdx),%r8d
 	movl	-4(%rsi,%rdx),%r9d
 	cmpl	%r8d,%r9d
 	jne	10040804f
 	POP_FRAME_POINTER
 	ret
 100204:
 	/* 2..3 bytes */
 	cmpb	$2,%dl
 	jl	100001f
 	movzwl	(%rdi),%r8d
 	movzwl	(%rsi),%r9d
 	cmpl	%r8d,%r9d
 	jne	1f
 	movzwl	-2(%rdi,%rdx),%r8d
 	movzwl	-2(%rsi,%rdx),%r9d
 	cmpl	%r8d,%r9d
 	jne	1f
 	POP_FRAME_POINTER
 	ret
 100001:
 	/* 1 byte, or 0 bytes (falls straight to the return with %eax = 0) */
 	cmpb	$1,%dl
 	jl	100000f
 	movzbl	(%rdi),%eax
 	movzbl	(%rsi),%r8d
 	subl	%r8d,%eax
 100000:
 	POP_FRAME_POINTER
 	ret
 ALIGN_TEXT
 101632:
 	/* 17..32 bytes: first 16 and last 16 bytes, 8 at a time */
 	cmpq	$32,%rdx
 	ja	103200f
 	movq	(%rdi),%r8
 	movq	(%rsi),%r9
 	cmpq	%r8,%r9
 	jne	80f
 	movq	8(%rdi),%r8
 	movq	8(%rsi),%r9
 	cmpq	%r8,%r9
 	jne	10163208f
 	movq	-16(%rdi,%rdx),%r8
 	movq	-16(%rsi,%rdx),%r9
 	cmpq	%r8,%r9
 	jne	10163216f
 	movq	-8(%rdi,%rdx),%r8
 	movq	-8(%rsi,%rdx),%r9
 	cmpq	%r8,%r9
 	jne	10163224f
 	POP_FRAME_POINTER
 	ret
 ALIGN_TEXT
 103200:
 	/*
 	 * More than 32 bytes: compare 32 bytes per iteration, as two
 	 * 16-byte halves.  A half differs iff the OR of its two 8-byte
 	 * differences is non-zero.
 	 */
 	movq	(%rdi),%r8
 	movq	8(%rdi),%r9
 	subq	(%rsi),%r8
 	subq	8(%rsi),%r9
 	orq	%r8,%r9
 	jnz	10320000f
 
 	movq    16(%rdi),%r8
 	movq    24(%rdi),%r9
 	subq    16(%rsi),%r8
 	subq    24(%rsi),%r9
 	orq	%r8,%r9
 	jnz     10320016f
 
 	leaq	32(%rdi),%rdi
 	leaq	32(%rsi),%rsi
 	subq	$32,%rdx
 	cmpq	$32,%rdx
 	jae	103200b
 	cmpb	$0,%dl
 	jne	10b			/* fewer than 32 bytes left: redo the size dispatch */
 	POP_FRAME_POINTER
 	ret
 
 /*
  * Mismatch was found.
  *
  * Before we compute it we narrow down the range (16 -> 8 -> 4 bytes).
  */
 ALIGN_TEXT
 10320016:
 	leaq	16(%rdi),%rdi
 	leaq	16(%rsi),%rsi
 10320000:
 	movq	(%rdi),%r8
 	movq	(%rsi),%r9
 	cmpq	%r8,%r9
 	jne	80f
 	leaq	8(%rdi),%rdi
 	leaq	8(%rsi),%rsi
 	jmp	80f
 ALIGN_TEXT
 10081608:
 10163224:
 	leaq	-8(%rdi,%rdx),%rdi
 	leaq	-8(%rsi,%rdx),%rsi
 	jmp	80f
 ALIGN_TEXT
 10163216:
 	leaq	-16(%rdi,%rdx),%rdi
 	leaq	-16(%rsi,%rdx),%rsi
 	jmp	80f
 ALIGN_TEXT
 10163208:
 	leaq	8(%rdi),%rdi
 	leaq	8(%rsi),%rsi
 	jmp	80f
 ALIGN_TEXT
 10040804:
 	leaq	-4(%rdi,%rdx),%rdi
 	leaq	-4(%rsi,%rdx),%rsi
 	jmp	1f
 
 ALIGN_TEXT
 80:
 	/* The mismatch is within 8 bytes of %rdi/%rsi: pick the 4-byte half. */
 	movl	(%rdi),%r8d
 	movl	(%rsi),%r9d
 	cmpl	%r8d,%r9d
 	jne	1f
 	leaq	4(%rdi),%rdi
 	leaq	4(%rsi),%rsi
 
 /*
  * We have up to 4 bytes to inspect.
  */
 1:
 	movzbl	(%rdi),%eax
 	movzbl	(%rsi),%r8d
 	cmpb	%r8b,%al
 	jne	2f
 
 	movzbl	1(%rdi),%eax
 	movzbl	1(%rsi),%r8d
 	cmpb	%r8b,%al
 	jne	2f
 
 	movzbl	2(%rdi),%eax
 	movzbl	2(%rsi),%r8d
 	cmpb	%r8b,%al
 	jne	2f
 
 	/* The fourth byte is loaded without a compare and subtracted below. */
 	movzbl	3(%rdi),%eax
 	movzbl	3(%rsi),%r8d
 2:
 	subl	%r8d,%eax		/* return value: byte from b1 minus byte from b2 */
 	POP_FRAME_POINTER
 	ret
 END(memcmp)
 
 /*
  * memmove(dst, src, cnt)
  *         rdi, rsi, rdx
  */
 
 /*
  * Register state at entry is supposed to be as follows:
  * rdi - destination
  * rsi - source
  * rdx - count
  *
  * The body tests and consumes the count in %rcx, so after the \begin
  * sequence has run %rcx must hold the count as well (MEMMOVE_BEGIN
  * copies it from %rdx; the COPYIN/COPYOUT macros do so before
  * expanding MEMMOVE).
  *
  * Macro arguments:
  * erms    - 1 to use "rep movsb" for the large copies, 0 to use
  *           "rep movsq" plus a tail copy
  * overlap - 1 to emit the backward-copy path for overlapping buffers
  * begin   - macro expanded at the top of the body
  * end     - macro expanded before every ret
  *
  * The macro possibly clobbers the above and: rcx, r8, r9, r10
  * It does not clobber rax nor r11.
  */
 .macro MEMMOVE erms overlap begin end
 	\begin
 
 	/*
 	 * For sizes 0..32 all data is read before it is written, so there
 	 * is no correctness issue with direction of copying.
 	 */
 	cmpq	$32,%rcx
 	jbe	101632f
 
 .if \overlap == 1
 	movq	%rdi,%r8
 	subq	%rsi,%r8
 	cmpq	%rcx,%r8	/* overlapping && src < dst? */
 	jb	2f
 .endif
 
 	cmpq	$256,%rcx
 	ja	1256f
 
 	/* 33..256 bytes: forward copy, 32 bytes per iteration. */
 103200:
 	movq	(%rsi),%rdx
 	movq	%rdx,(%rdi)
 	movq	8(%rsi),%rdx
 	movq	%rdx,8(%rdi)
 	movq	16(%rsi),%rdx
 	movq	%rdx,16(%rdi)
 	movq	24(%rsi),%rdx
 	movq	%rdx,24(%rdi)
 	leaq	32(%rsi),%rsi
 	leaq	32(%rdi),%rdi
 	subq	$32,%rcx
 	cmpq	$32,%rcx
 	jae	103200b
 	cmpb	$0,%cl
 	jne	101632f			/* copy the remaining 1..31 bytes */
 	\end
 	ret
 	ALIGN_TEXT
 	/* 16..32 bytes: first 16 and last 16 bytes, which may overlap. */
 101632:
 	cmpb	$16,%cl
 	jl	100816f
 	movq	(%rsi),%rdx
 	movq	8(%rsi),%r8
 	movq	-16(%rsi,%rcx),%r9
 	movq	-8(%rsi,%rcx),%r10
 	movq	%rdx,(%rdi)
 	movq	%r8,8(%rdi)
 	movq	%r9,-16(%rdi,%rcx)
 	movq	%r10,-8(%rdi,%rcx)
 	\end
 	ret
 	ALIGN_TEXT
 	/* 8..15 bytes */
 100816:
 	cmpb	$8,%cl
 	jl	100408f
 	movq	(%rsi),%rdx
 	movq	-8(%rsi,%rcx),%r8
 	movq	%rdx,(%rdi)
 	movq	%r8,-8(%rdi,%rcx,)
 	\end
 	ret
 	ALIGN_TEXT
 	/* 4..7 bytes */
 100408:
 	cmpb	$4,%cl
 	jl	100204f
 	movl	(%rsi),%edx
 	movl	-4(%rsi,%rcx),%r8d
 	movl	%edx,(%rdi)
 	movl	%r8d,-4(%rdi,%rcx)
 	\end
 	ret
 	ALIGN_TEXT
 	/* 2..3 bytes */
 100204:
 	cmpb	$2,%cl
 	jl	100001f
 	movzwl	(%rsi),%edx
 	movzwl	-2(%rsi,%rcx),%r8d
 	movw	%dx,(%rdi)
 	movw	%r8w,-2(%rdi,%rcx)
 	\end
 	ret
 	ALIGN_TEXT
 	/* 1 byte, or nothing at all */
 100001:
 	cmpb	$1,%cl
 	jl	100000f
 	movb	(%rsi),%dl
 	movb	%dl,(%rdi)
 100000:
 	\end
 	ret
 
 	ALIGN_TEXT
 	/*
 	 * More than 256 bytes, forward: use a rep string copy.  A
 	 * destination that is not 16-byte aligned is handled at 100.
 	 */
 1256:
 	testb	$15,%dil
 	jnz	100f
 .if \erms == 1
 	rep
 	movsb
 .else
 	shrq	$3,%rcx                         /* copy by 64-bit words */
 	rep
 	movsq
 	movq	%rdx,%rcx
 	andl	$7,%ecx                         /* any bytes left? */
 	jne	100408b
 .endif
 	\end
 	ret
 	/*
 	 * Unaligned destination: keep the first 16 source bytes in
 	 * %r8/%r9, advance both pointers so that the destination becomes
 	 * 16-byte aligned, shrink the count to match, run the string
 	 * copy, and only then store the saved 16 bytes at the original
 	 * destination (%r10).
 	 */
 100:
 	movq	(%rsi),%r8
 	movq	8(%rsi),%r9
 	movq	%rdi,%r10
 	movq	%rdi,%rcx
 	andq	$15,%rcx
 	leaq	-16(%rdx,%rcx),%rdx
 	neg	%rcx
 	leaq	16(%rdi,%rcx),%rdi
 	leaq	16(%rsi,%rcx),%rsi
 	movq	%rdx,%rcx
 .if \erms == 1
 	rep
 	movsb
 	movq	%r8,(%r10)
 	movq	%r9,8(%r10)
 .else
 	shrq	$3,%rcx                         /* copy by 64-bit words */
 	rep
 	movsq
 	movq	%r8,(%r10)
 	movq	%r9,8(%r10)
 	movq	%rdx,%rcx
 	andl	$7,%ecx                         /* any bytes left? */
 	jne	100408b
 .endif
 	\end
 	ret
 
 .if \overlap == 1
 	/*
 	 * Copy backwards.
 	 */
         ALIGN_TEXT
 2:
 	cmpq	$256,%rcx
 	ja	2256f
 
 	/* Point at the last 8 bytes of each buffer. */
 	leaq	-8(%rdi,%rcx),%rdi
 	leaq	-8(%rsi,%rcx),%rsi
 
 	cmpq	$32,%rcx
 	jb	2016f
 
 2032:
 	movq	(%rsi),%rdx
 	movq	%rdx,(%rdi)
 	movq	-8(%rsi),%rdx
 	movq	%rdx,-8(%rdi)
 	movq	-16(%rsi),%rdx
 	movq	%rdx,-16(%rdi)
 	movq	-24(%rsi),%rdx
 	movq	%rdx,-24(%rdi)
 	leaq	-32(%rsi),%rsi
 	leaq	-32(%rdi),%rdi
 	subq	$32,%rcx
 	cmpq	$32,%rcx
 	jae	2032b
 	cmpb	$0,%cl
 	jne	2016f
 	\end
 	ret
 	ALIGN_TEXT
 	/*
 	 * Backward tail: peel off 16, 8, 4, 2 and 1 bytes in turn.  The
 	 * pointers stay 8 bytes below the end of the data still to be
 	 * copied, hence the 4/6/7 offsets in the narrower steps.
 	 */
 2016:
 	cmpb	$16,%cl
 	jl	2008f
 	movq	(%rsi),%rdx
 	movq	%rdx,(%rdi)
 	movq	-8(%rsi),%rdx
 	movq	%rdx,-8(%rdi)
 	subb	$16,%cl
 	jz	2000f
 	leaq	-16(%rsi),%rsi
 	leaq	-16(%rdi),%rdi
 2008:
 	cmpb	$8,%cl
 	jl	2004f
 	movq	(%rsi),%rdx
 	movq	%rdx,(%rdi)
 	subb	$8,%cl
 	jz	2000f
 	leaq	-8(%rsi),%rsi
 	leaq	-8(%rdi),%rdi
 2004:
 	cmpb	$4,%cl
 	jl	2002f
 	movl	4(%rsi),%edx
 	movl	%edx,4(%rdi)
 	subb	$4,%cl
 	jz	2000f
 	leaq	-4(%rsi),%rsi
 	leaq	-4(%rdi),%rdi
 2002:
 	cmpb	$2,%cl
 	jl	2001f
 	movw	6(%rsi),%dx
 	movw	%dx,6(%rdi)
 	subb	$2,%cl
 	jz	2000f
 	leaq	-2(%rsi),%rsi
 	leaq	-2(%rdi),%rdi
 2001:
 	cmpb	$1,%cl
 	jl	2000f
 	movb	7(%rsi),%dl
 	movb	%dl,7(%rdi)
 2000:
 	\end
 	ret
 	ALIGN_TEXT
 	/*
 	 * More than 256 bytes, backward: string copy with the direction
 	 * flag set; the flag is cleared again right after the copy.
 	 */
 2256:
 	std
 .if \erms == 1
 	leaq	-1(%rdi,%rcx),%rdi
 	leaq	-1(%rsi,%rcx),%rsi
 	rep
 	movsb
 	cld
 .else
 	leaq	-8(%rdi,%rcx),%rdi
 	leaq	-8(%rsi,%rcx),%rsi
 	shrq	$3,%rcx
 	rep
 	movsq
 	cld
 	movq	%rdx,%rcx
 	andb	$7,%cl
 	jne	2004b
 .endif
 	\end
 	ret
 .endif
 .endm
 
 /*
  * Prologue used by the memmove/memcpy entry points: %rax receives the
  * destination pointer (it is still there when the function returns,
  * as MEMMOVE does not clobber %rax) and %rcx receives the count.
  */
 .macro MEMMOVE_BEGIN
 	PUSH_FRAME_POINTER
 	movq	%rdi,%rax
 	movq	%rdx,%rcx
 .endm
 
 /* Epilogue matching MEMMOVE_BEGIN; expanded before every ret. */
 .macro MEMMOVE_END
 	POP_FRAME_POINTER
 .endm
 
 /* memmove built on "rep movsq" for large copies; overlap is handled. */
 ENTRY(memmove_std)
 	MEMMOVE erms=0 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memmove_std)
 
 /* memmove built on "rep movsb" for large copies; overlap is handled. */
 ENTRY(memmove_erms)
 	MEMMOVE erms=1 overlap=1 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memmove_erms)
 
 /*
  * memcpy(dst, src, len)
  *        rdi, rsi, rdx
  *
  * Note: memcpy does not support overlapping copies
  * (MEMMOVE is expanded with overlap=0, so no backward path is emitted).
  */
 ENTRY(memcpy_std)
 	MEMMOVE erms=0 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memcpy_std)
 
 /* Same as memcpy_std, but large copies use "rep movsb". */
 ENTRY(memcpy_erms)
 	MEMMOVE erms=1 overlap=0 begin=MEMMOVE_BEGIN end=MEMMOVE_END
 END(memcpy_erms)
 
 /*
  * memset(dst, c,   len)
  *        rdi, rsi, rdx
  *
  * Returns dst in %rax.  The fill byte is replicated into all eight
  * bytes of %r10.  Short lengths are written with a pair of possibly
  * overlapping stores, one from the start and one from the end of the
  * buffer.
  *
  * erms - 1 to use "rep stosb" for lengths above 256, 0 to use
  *        "rep stosq" plus one trailing 8-byte store
  */
 .macro MEMSET erms
 	PUSH_FRAME_POINTER
 	movq	%rdi,%rax		/* return value */
 	movq	%rdx,%rcx
 	movzbq	%sil,%r8
 	movabs	$0x0101010101010101,%r10
 	imulq	%r8,%r10		/* %r10 = fill byte in every byte lane */
 
 	cmpq	$32,%rcx
 	jbe	101632f
 
 	cmpq	$256,%rcx
 	ja	1256f
 
 	/* 33..256 bytes: 32 bytes per iteration. */
 103200:
 	movq	%r10,(%rdi)
 	movq	%r10,8(%rdi)
 	movq	%r10,16(%rdi)
 	movq	%r10,24(%rdi)
 	leaq	32(%rdi),%rdi
 	subq	$32,%rcx
 	cmpq	$32,%rcx
 	ja	103200b
 	/* 1..32 bytes remain; finish with stores placed against the end. */
 	cmpb	$16,%cl
 	ja	201632f
 	movq	%r10,-16(%rdi,%rcx)
 	movq	%r10,-8(%rdi,%rcx)
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 	/* 16..32 bytes */
 101632:
 	cmpb	$16,%cl
 	jl	100816f
 201632:
 	movq	%r10,(%rdi)
 	movq	%r10,8(%rdi)
 	movq	%r10,-16(%rdi,%rcx)
 	movq	%r10,-8(%rdi,%rcx)
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 	/* 8..15 bytes */
 100816:
 	cmpb	$8,%cl
 	jl	100408f
 	movq	%r10,(%rdi)
 	movq	%r10,-8(%rdi,%rcx)
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 	/* 4..7 bytes */
 100408:
 	cmpb	$4,%cl
 	jl	100204f
 	movl	%r10d,(%rdi)
 	movl	%r10d,-4(%rdi,%rcx)
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 	/* 2..3 bytes */
 100204:
 	cmpb	$2,%cl
 	jl	100001f
 	movw	%r10w,(%rdi)
 	movw	%r10w,-2(%rdi,%rcx)
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 	/* 1 byte, or nothing at all */
 100001:
 	cmpb	$0,%cl
 	je	100000f
 	movb	%r10b,(%rdi)
 100000:
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 	/*
 	 * More than 256 bytes: string stores.  stos takes the value from
 	 * %rax, so the return value is parked in %r9 meanwhile.
 	 */
 1256:
 	movq	%rdi,%r9
 	movq	%r10,%rax
 	testl	$15,%edi
 	jnz	3f
 1:
 .if \erms == 1
 	rep
 	stosb
 	movq	%r9,%rax
 .else
 	movq	%rcx,%rdx
 	shrq	$3,%rcx
 	rep
 	stosq
 	movq	%r9,%rax
 	andl	$7,%edx
 	jnz	2f
 	POP_FRAME_POINTER
 	ret
 2:
 	/* 1..7 bytes left: one 8-byte store ending at the end of the buffer. */
 	movq	%r10,-8(%rdi,%rdx)
 .endif
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 	/*
 	 * Destination not 16-byte aligned: fill the first 16 bytes, then
 	 * move %rdi up to the next 16-byte boundary and shrink the count
 	 * by the number of bytes skipped.
 	 */
 3:
 	movq	%r10,(%rdi)
 	movq	%r10,8(%rdi)
 	movq	%rdi,%r8
 	andq	$15,%r8
 	leaq	-16(%rcx,%r8),%rcx
 	neg	%r8
 	leaq	16(%rdi,%r8),%rdi
 	jmp	1b
 .endm
 
 /* memset using "rep stosq" for lengths above 256 bytes. */
 ENTRY(memset_std)
 	MEMSET erms=0
 END(memset_std)
 
 /* memset using "rep stosb" for lengths above 256 bytes. */
 ENTRY(memset_erms)
 	MEMSET erms=1
 END(memset_erms)
 
 /*
  * fillw(pat, base, cnt)
  *       %rdi,%rsi, %rdx
  *
  * Store the low 16 bits of pat into cnt consecutive 16-bit words
  * starting at base.
  */
 ENTRY(fillw)
 	PUSH_FRAME_POINTER
 	movq	%rdi,%rax		/* stosw stores %ax */
 	movq	%rsi,%rdi		/* destination */
 	movq	%rdx,%rcx		/* repeat count, in words */
 	rep
 	stosw
 	POP_FRAME_POINTER
 	ret
 END(fillw)
 
 /*****************************************************************************/
 /* copyout and fubyte family                                                 */
 /*****************************************************************************/
 /*
  * Access user memory from inside the kernel. These routines should be
  * the only places that do this.
  *
  * These routines set curpcb->pcb_onfault for the time they execute. When a
  * protection violation occurs inside the functions, the trap handler
  * returns to *curpcb->pcb_onfault instead of the function.
  */
 
 /*
  * Emit stac (sets EFLAGS.AC) when \smap is non-zero, nothing otherwise.
  * Used right before the instructions that touch user memory.
  */
 .macro SMAP_DISABLE smap
 .if	\smap
 	stac
 .endif
 .endm
 
 
 /*
  * Emit clac (clears EFLAGS.AC) when \smap is non-zero, nothing otherwise.
  * Used right after the instructions that touch user memory.
  */
 .macro SMAP_ENABLE smap
 .if	\smap
 	clac
 .endif
 .endm
 
 /*
  * "begin" hook handed to MEMMOVE by COPYIN/COPYOUT.  Empty: those
  * macros set up the registers themselves before expanding MEMMOVE.
  */
 .macro COPYINOUT_BEGIN
 .endm
 
 /*
  * "end" hook handed to MEMMOVE by COPYIN/COPYOUT.  %rax was zeroed by
  * the caller macro and is not clobbered by MEMMOVE, so the store
  * clears pcb_onfault and %rax doubles as the 0 return value.
  * %r11 holds curpcb.
  */
 .macro COPYINOUT_END
 	movq	%rax,PCB_ONFAULT(%r11)
 	POP_FRAME_POINTER
 .endm
 
 /* Same as COPYINOUT_END, but executes clac first. */
 .macro COPYINOUT_SMAP_END
 	SMAP_ENABLE smap=1
 	COPYINOUT_END
 .endm
 
 /*
  * copyout(from_kernel, to_user, len)
  *         %rdi,        %rsi,    %rdx
  *
  * Returns 0 on success and EFAULT (through copy_fault) when the user
  * range is rejected or the copy faults.
  *
  * smap - 1 to bracket the copy with stac/clac
  * erms - passed through to MEMMOVE
  */
 .macro	COPYOUT smap erms
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r11
 	movq	$copy_fault,PCB_ONFAULT(%r11)
 
 	/*
 	 * Check explicitly for non-user addresses.
 	 * First, prevent address wrapping.
 	 */
 	movq	%rsi,%rax
 	addq	%rdx,%rax
 	jc	copy_fault
 /*
  * XXX STOP USING VM_MAXUSER_ADDRESS.
  * It is an end address, not a max, so every time it is used correctly it
  * looks like there is an off by one error, and of course it caused an off
  * by one error in several places.
  */
 	movq	$VM_MAXUSER_ADDRESS,%rcx
 	cmpq	%rcx,%rax
 	ja	copy_fault
 
 	/*
 	 * Set return value to zero. Remaining failure mode goes through
 	 * copy_fault.
 	 */
 	xorl	%eax,%eax
 
 	/*
 	 * Set up arguments for MEMMOVE.
 	 */
 	movq	%rdi,%r8		/* swap: %rdi = to_user, %rsi = from_kernel */
 	movq	%rsi,%rdi
 	movq	%r8,%rsi
 	movq	%rdx,%rcx		/* MEMMOVE tests the count in %rcx */
 
 
 	SMAP_DISABLE \smap
 .if	\smap == 1
 	MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_SMAP_END
 .else
 	MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_END
 .endif
 	/* NOTREACHED */
 .endm
 
 /* The four copyout variants: with or without stac/clac, movsq or movsb. */
 ENTRY(copyout_nosmap_std)
 	COPYOUT smap=0 erms=0
 END(copyout_nosmap_std)
 
 ENTRY(copyout_smap_std)
 	COPYOUT smap=1 erms=0
 END(copyout_smap_std)
 
 ENTRY(copyout_nosmap_erms)
 	COPYOUT smap=0 erms=1
 END(copyout_nosmap_erms)
 
 ENTRY(copyout_smap_erms)
 	COPYOUT smap=1 erms=1
 END(copyout_smap_erms)
 
 /*
  * copyin(from_user, to_kernel, len)
  *        %rdi,      %rsi,      %rdx
  *
  * Returns 0 on success and EFAULT (through copy_fault) when the user
  * range is rejected or the copy faults.
  *
  * smap - 1 to bracket the copy with stac/clac
  * erms - passed through to MEMMOVE
  */
 .macro	COPYIN smap erms
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r11
 	movq	$copy_fault,PCB_ONFAULT(%r11)
 
 	/*
 	 * make sure address is valid
 	 */
 	movq	%rdi,%rax
 	addq	%rdx,%rax
 	jc	copy_fault		/* from_user + len wrapped around */
 	movq	$VM_MAXUSER_ADDRESS,%rcx
 	cmpq	%rcx,%rax
 	ja	copy_fault		/* range ends above the user address limit */
 
 	xorl	%eax,%eax		/* return value for the success path */
 
 	movq	%rdi,%r8		/* swap: %rdi = to_kernel, %rsi = from_user */
 	movq	%rsi,%rdi
 	movq	%r8,%rsi
 	movq	%rdx,%rcx		/* MEMMOVE tests the count in %rcx */
 
 	SMAP_DISABLE \smap
 .if	\smap == 1
 	MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_SMAP_END
 .else
 	MEMMOVE erms=\erms overlap=0 begin=COPYINOUT_BEGIN end=COPYINOUT_END
 .endif
 	/* NOTREACHED */
 .endm
 
 /* The four copyin variants: with or without stac/clac, movsq or movsb. */
 ENTRY(copyin_nosmap_std)
 	COPYIN smap=0 erms=0
 END(copyin_nosmap_std)
 
 ENTRY(copyin_smap_std)
 	COPYIN smap=1 erms=0
 END(copyin_smap_std)
 
 ENTRY(copyin_nosmap_erms)
 	COPYIN smap=0 erms=1
 END(copyin_nosmap_erms)
 
 ENTRY(copyin_smap_erms)
 	COPYIN smap=1 erms=1
 END(copyin_smap_erms)
 
 	ALIGN_TEXT
 	/*
 	 * Common failure exit for the copyin/copyout variants: reached by
 	 * a direct jump from the range checks, and installed by them as
 	 * pcb_onfault.  Expects curpcb in %r11.  Returns EFAULT.
 	 */
 	/* Trap entry clears PSL.AC */
 copy_fault:
 	movq	$0,PCB_ONFAULT(%r11)
 	movl	$EFAULT,%eax
 	POP_FRAME_POINTER
 	ret
 
 /*
  * casueword32.  Compare and set user integer.  Returns -1 on fault,
  *        0 if the comparison matched and the new value was stored,
  *        1 if the comparison failed.  Old value is written to *oldp.
  *        dst = %rdi, old = %esi, oldp = %rdx, new = %ecx
  */
 ENTRY(casueword32_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
 	movq	$fusufault,PCB_ONFAULT(%r8)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	movl	%esi,%eax			/* old */
 #ifdef SMP
 	lock
 #endif
 	cmpxchgl %ecx,(%rdi)			/* new = %ecx */
 	setne	%cl				/* 1 if the comparison failed */
 
 	/*
 	 * The old value is in %eax.  If the store succeeded it will be the
 	 * value we expected (old) from before the store, otherwise it will
 	 * be the current value.  Save %eax into %esi to prepare the return
 	 * value.
 	 */
 	movl	%eax,%esi
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%r8)
 
 	/*
 	 * Access the oldp after the pcb_onfault is cleared, to correctly
 	 * catch corrupted pointer.
 	 */
 	movl	%esi,(%rdx)			/* oldp = %rdx */
 	POP_FRAME_POINTER
 	movzbl	%cl, %eax			/* return value: 0 or 1 */
 	ret
 END(casueword32_nosmap)
 
 /*
  * casueword32 with the user access bracketed by stac/clac.
  * dst = %rdi, old = %esi, oldp = %rdx, new = %ecx
  * Returns -1 on fault, 0 if the new value was stored, 1 if the
  * comparison failed.
  */
 ENTRY(casueword32_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
 	movq	$fusufault,PCB_ONFAULT(%r8)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	movl	%esi,%eax			/* old */
 	stac
 #ifdef SMP
 	lock
 #endif
 	cmpxchgl %ecx,(%rdi)			/* new = %ecx */
 	clac
 	setne	%cl				/* 1 if the comparison failed */
 
 	/*
 	 * The old value is in %eax.  If the store succeeded it will be the
 	 * value we expected (old) from before the store, otherwise it will
 	 * be the current value.  Save %eax into %esi to prepare the return
 	 * value.
 	 */
 	movl	%eax,%esi
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%r8)
 
 	/*
 	 * Access the oldp after the pcb_onfault is cleared, to correctly
 	 * catch corrupted pointer.
 	 */
 	movl	%esi,(%rdx)			/* oldp = %rdx */
 	POP_FRAME_POINTER
 	movzbl	%cl, %eax			/* return value: 0 or 1 */
 	ret
 END(casueword32_smap)
 
 /*
  * casueword.  Compare and set user long.  Returns -1 on fault,
  *        0 if the comparison matched and the new value was stored,
  *        1 if the comparison failed.  Old value is written to *oldp.
  *        dst = %rdi, old = %rsi, oldp = %rdx, new = %rcx
  */
 ENTRY(casueword_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
 	movq	$fusufault,PCB_ONFAULT(%r8)
 
 	/*
 	 * The access below is 8 bytes wide, so the highest acceptable
 	 * address is VM_MAXUSER_ADDRESS-8, as in fueword and suword.
 	 */
 	movq	$VM_MAXUSER_ADDRESS-8,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	movq	%rsi,%rax			/* old */
 #ifdef SMP
 	lock
 #endif
 	cmpxchgq %rcx,(%rdi)			/* new = %rcx */
 	setne	%cl				/* 1 if the comparison failed */
 
 	/*
 	 * The old value is in %rax.  If the store succeeded it will be the
 	 * value we expected (old) from before the store, otherwise it will
 	 * be the current value.
 	 */
 	movq	%rax,%rsi
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%r8)
 	/* *oldp is written only after pcb_onfault has been cleared. */
 	movq	%rsi,(%rdx)
 	POP_FRAME_POINTER
 	movzbl	%cl, %eax			/* return value: 0 or 1 */
 	ret
 END(casueword_nosmap)
 
 /*
  * casueword with the user access bracketed by stac/clac.
  * dst = %rdi, old = %rsi, oldp = %rdx, new = %rcx
  * Returns -1 on fault, 0 if the new value was stored, 1 if the
  * comparison failed.
  */
 ENTRY(casueword_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
 	movq	$fusufault,PCB_ONFAULT(%r8)
 
 	/*
 	 * The access below is 8 bytes wide, so the highest acceptable
 	 * address is VM_MAXUSER_ADDRESS-8, as in fueword and suword.
 	 */
 	movq	$VM_MAXUSER_ADDRESS-8,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	movq	%rsi,%rax			/* old */
 	stac
 #ifdef SMP
 	lock
 #endif
 	cmpxchgq %rcx,(%rdi)			/* new = %rcx */
 	clac
 	setne	%cl				/* 1 if the comparison failed */
 
 	/*
 	 * The old value is in %rax.  If the store succeeded it will be the
 	 * value we expected (old) from before the store, otherwise it will
 	 * be the current value.
 	 */
 	movq	%rax,%rsi
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%r8)
 	/* *oldp is written only after pcb_onfault has been cleared. */
 	movq	%rsi,(%rdx)
 	POP_FRAME_POINTER
 	movzbl	%cl, %eax			/* return value: 0 or 1 */
 	ret
 END(casueword_smap)
 
 /*
  * Fetch (load) a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit
  * byte from user memory.
  * addr = %rdi, valp = %rsi
  *
  * The fueword variants store the fetched value through valp and
  * return 0, or -1 on fault.  The fuword16 and fubyte variants take
  * only addr and return the zero-extended value, or -1 on fault.
  */
 
 ENTRY(fueword_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-8,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	xorl	%eax,%eax			/* return value, and zero for pcb_onfault */
 	movq	(%rdi),%r11			/* the user access */
 	movq	%rax,PCB_ONFAULT(%rcx)
 	movq	%r11,(%rsi)			/* *valp, written after pcb_onfault is cleared */
 	POP_FRAME_POINTER
 	ret
 END(fueword_nosmap)
 
 /*
  * Fetch a 64-bit word from user address %rdi into *%rsi, with the user
  * access bracketed by stac/clac.  Returns 0, or -1 on fault.
  */
 ENTRY(fueword_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-8,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	xorl	%eax,%eax			/* return value, and zero for pcb_onfault */
 	stac
 	movq	(%rdi),%r11			/* the user access */
 	clac
 	movq	%rax,PCB_ONFAULT(%rcx)
 	movq	%r11,(%rsi)			/* *valp, written after pcb_onfault is cleared */
 	POP_FRAME_POINTER
 	ret
 END(fueword_smap)
 
 /*
  * Fetch a 32-bit word from user address %rdi into *%rsi.
  * Returns 0, or -1 on fault.
  */
 ENTRY(fueword32_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	xorl	%eax,%eax			/* return value, and zero for pcb_onfault */
 	movl	(%rdi),%r11d			/* the user access */
 	movq	%rax,PCB_ONFAULT(%rcx)
 	movl	%r11d,(%rsi)			/* *valp, written after pcb_onfault is cleared */
 	POP_FRAME_POINTER
 	ret
 END(fueword32_nosmap)
 
 /*
  * Fetch a 32-bit word from user address %rdi into *%rsi, with the user
  * access bracketed by stac/clac.  Returns 0, or -1 on fault.
  */
 ENTRY(fueword32_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	xorl	%eax,%eax			/* return value, and zero for pcb_onfault */
 	stac
 	movl	(%rdi),%r11d			/* the user access */
 	clac
 	movq	%rax,PCB_ONFAULT(%rcx)
 	movl	%r11d,(%rsi)			/* *valp, written after pcb_onfault is cleared */
 	POP_FRAME_POINTER
 	ret
 END(fueword32_smap)
 
 /*
  * Fetch a 16-bit word from user address %rdi.
  * Returns the zero-extended value in %eax, or -1 on fault.
  */
 ENTRY(fuword16_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-2,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	movzwl	(%rdi),%eax			/* the user access; result is the return value */
 	movq	$0,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(fuword16_nosmap)
 
 /*
  * Fetch a 16-bit word from user address %rdi, with the user access
  * bracketed by stac/clac.
  * Returns the zero-extended value in %eax, or -1 on fault.
  */
 ENTRY(fuword16_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-2,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	stac
 	movzwl	(%rdi),%eax			/* the user access; result is the return value */
 	clac
 	movq	$0,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(fuword16_smap)
 
 /*
  * Fetch a byte from user address %rdi.
  * Returns the zero-extended value in %eax, or -1 on fault.
  */
 ENTRY(fubyte_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-1,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	movzbl	(%rdi),%eax			/* the user access; result is the return value */
 	movq	$0,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(fubyte_nosmap)
 
 /*
  * Fetch a byte from user address %rdi, with the user access bracketed
  * by stac/clac.
  * Returns the zero-extended value in %eax, or -1 on fault.
  */
 ENTRY(fubyte_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-1,%rax
 	cmpq	%rax,%rdi			/* verify address is valid */
 	ja	fusufault
 
 	stac
 	movzbl	(%rdi),%eax			/* the user access; result is the return value */
 	clac
 	movq	$0,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(fubyte_smap)
 
 /*
  * Store a 64-bit word, a 32-bit word, a 16-bit word, or an 8-bit byte to
  * user memory.
  * addr = %rdi, value = %rsi
  *
  * All variants return 0 on success and -1 on fault.
  */
 ENTRY(suword_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-8,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	movq	%rsi,(%rdi)			/* the user access */
 	xorl	%eax,%eax			/* return value, and zero for pcb_onfault */
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(suword_nosmap)
 
 /*
  * Store the 64-bit value %rsi at user address %rdi, with the user
  * access bracketed by stac/clac.  Returns 0, or -1 on fault.
  */
 ENTRY(suword_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-8,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	stac
 	movq	%rsi,(%rdi)			/* the user access */
 	clac
 	xorl	%eax,%eax			/* return value, and zero for pcb_onfault */
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(suword_smap)
 
 /*
  * Store the 32-bit value %esi at user address %rdi.
  * Returns 0, or -1 on fault.
  */
 ENTRY(suword32_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	movl	%esi,(%rdi)			/* the user access */
 	xorl	%eax,%eax			/* return value, and zero for pcb_onfault */
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(suword32_nosmap)
 
 /*
  * Store the 32-bit value %esi at user address %rdi, with the user
  * access bracketed by stac/clac.  Returns 0, or -1 on fault.
  */
 ENTRY(suword32_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-4,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	stac
 	movl	%esi,(%rdi)			/* the user access */
 	clac
 	xorl	%eax,%eax			/* return value, and zero for pcb_onfault */
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(suword32_smap)
 
 /*
  * Store the 16-bit value %si at user address %rdi.
  * Returns 0, or -1 on fault.
  */
 ENTRY(suword16_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-2,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	movw	%si,(%rdi)			/* the user access */
 	xorl	%eax,%eax			/* return value, and zero for pcb_onfault */
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(suword16_nosmap)
 
 /*
  * Store the 16-bit value %si at user address %rdi, with the user
  * access bracketed by stac/clac.  Returns 0, or -1 on fault.
  */
 ENTRY(suword16_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-2,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	stac
 	movw	%si,(%rdi)			/* the user access */
 	clac
 	xorl	%eax,%eax			/* return value, and zero for pcb_onfault */
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(suword16_smap)
 
 /*
  * Store the low byte of %esi at user address %rdi.
  * Returns 0, or -1 on fault.
  */
 ENTRY(subyte_nosmap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-1,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	movl	%esi,%eax			/* the byte is stored from %al */
 	movb	%al,(%rdi)			/* the user access */
 	xorl	%eax,%eax			/* return value, and zero for pcb_onfault */
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(subyte_nosmap)
 
 /*
  * Store the low byte of %esi at user address %rdi, with the user
  * access bracketed by stac/clac.  Returns 0, or -1 on fault.
  */
 ENTRY(subyte_smap)
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%rcx
 	movq	$fusufault,PCB_ONFAULT(%rcx)
 
 	movq	$VM_MAXUSER_ADDRESS-1,%rax
 	cmpq	%rax,%rdi			/* verify address validity */
 	ja	fusufault
 
 	movl	%esi,%eax			/* the byte is stored from %al */
 	stac
 	movb	%al,(%rdi)			/* the user access */
 	clac
 	xorl	%eax,%eax			/* return value, and zero for pcb_onfault */
 	movq	%rax,PCB_ONFAULT(%rcx)
 	POP_FRAME_POINTER
 	ret
 END(subyte_smap)
 
 	ALIGN_TEXT
 	/*
 	 * Common failure exit for the casueword, fu* and su* routines:
 	 * reached by a direct jump from their address checks, and installed
 	 * by them as pcb_onfault.  Reloads curpcb itself, clears
 	 * pcb_onfault and returns -1.
 	 */
 	/* Fault entry clears PSL.AC */
 fusufault:
 	movq	PCPU(CURPCB),%rcx
 	xorl	%eax,%eax
 	movq	%rax,PCB_ONFAULT(%rcx)
 	decq	%rax				/* 0 - 1 = -1 */
 	POP_FRAME_POINTER
 	ret
 
 /*
  * copyinstr(from, to, maxlen, size_t *lencopied)
  *           %rdi, %rsi, %rdx, %rcx
  *
  *	copy a string from 'from' to 'to', stop when a 0 character is reached.
  *	return ENAMETOOLONG if string is longer than maxlen, and
  *	EFAULT on protection violations. If lencopied is non-NULL,
  *	return the actual length in *lencopied.  The length is written
  *	with an 8-byte store and, on success, includes the terminating
  *	0 character.
  *
  *	smap - 1 to bracket the copy loop with stac/clac
  */
 .macro COPYINSTR smap
 	PUSH_FRAME_POINTER
 	movq	%rdx,%r8			/* %r8 = maxlen */
 	movq	PCPU(CURPCB),%r9
 	movq	$cpystrflt,PCB_ONFAULT(%r9)
 
 	movq	$VM_MAXUSER_ADDRESS,%rax
 
 	/* make sure 'from' is within bounds */
 	subq	%rdi,%rax			/* %rax = bytes between 'from' and the limit */
 	jbe	cpystrflt
 
 	SMAP_DISABLE \smap
 
 	/* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */
 	cmpq	%rdx,%rax
 	jb	8f
 1:
 	incq	%rdx				/* offset the decrement at the top of the loop */
 2:
 	decq	%rdx
 .if \smap == 0
 	jz	copyinstr_toolong
 .else
 	jz	copyinstr_toolong_smap
 .endif
 
 	movb	(%rdi),%al
 	movb	%al,(%rsi)
 	incq	%rsi
 	incq	%rdi
 	testb	%al,%al
 	jnz	2b
 
 	SMAP_ENABLE \smap
 
 	/* Success -- 0 byte reached */
 	decq	%rdx
 	xorl	%eax,%eax
 
 	/* set *lencopied and return %eax */
 	movq	%rax,PCB_ONFAULT(%r9)
 
 	testq	%rcx,%rcx
 	jz	3f
 	subq	%rdx,%r8			/* limit minus what is left = bytes copied */
 	movq	%r8,(%rcx)
 3:
 	POP_FRAME_POINTER
 	ret
 	ALIGN_TEXT
 8:
 	/* Clamp both the loop counter and the saved limit to %rax. */
 	movq	%rax,%rdx
 	movq	%rax,%r8
 	jmp 1b
 
 .endm
 
 /* copyinstr without stac/clac around the copy loop. */
 ENTRY(copyinstr_nosmap)
 	COPYINSTR smap=0
 END(copyinstr_nosmap)
 
 /* copyinstr with stac/clac around the copy loop. */
 ENTRY(copyinstr_smap)
 	COPYINSTR smap=1
 END(copyinstr_smap)
 
 /*
  * Failure exits shared by the copyinstr variants.  Expect curpcb in
  * %r9, lencopied in %rcx, the length limit in %r8 and the remaining
  * count in %rdx.  cpystrflt returns EFAULT; cpystrflt_x returns
  * whatever error the caller already placed in %eax.
  */
 cpystrflt:
 	/* Fault entry clears PSL.AC */
 	movl	$EFAULT,%eax
 cpystrflt_x:
 	/* set *lencopied and return %eax */
 	movq	$0,PCB_ONFAULT(%r9)
 
 	testq	%rcx,%rcx
 	jz	1f
 	subq	%rdx,%r8
 	movq	%r8,(%rcx)
 1:
 	POP_FRAME_POINTER
 	ret
 
 /*
  * Reached from the copyinstr loop when the count runs out before a
  * 0 character was seen.  The _smap entry executes clac first.
  */
 copyinstr_toolong_smap:
 	clac
 copyinstr_toolong:
 	/* rdx is zero - return ENAMETOOLONG or EFAULT */
 	movq	$VM_MAXUSER_ADDRESS,%rax
 	cmpq	%rax,%rdi
 	jae	cpystrflt		/* source pointer reached the user address limit */
 	movl	$ENAMETOOLONG,%eax
 	jmp	cpystrflt_x
 
 /*
  * copystr(from, to, maxlen, size_t *lencopied)
  *         %rdi, %rsi, %rdx, %rcx
  *
  * Copy a string, stopping after the 0 character or after maxlen
  * bytes.  Returns 0 on success and ENAMETOOLONG when no 0 character
  * was found within maxlen bytes.  If lencopied is non-NULL, the
  * number of bytes copied (including the 0 character on success) is
  * written there with an 8-byte store.  No fault handler is installed.
  */
 ENTRY(copystr)
 	PUSH_FRAME_POINTER
 	movq	%rdx,%r8			/* %r8 = maxlen */
 
 	incq    %rdx				/* offset the decrement at the top of the loop */
 1:
 	decq	%rdx
 	jz	4f
 	movb	(%rdi),%al
 	movb	%al,(%rsi)
 	incq	%rsi
 	incq	%rdi
 	testb	%al,%al
 	jnz	1b
 
 	/* Success -- 0 byte reached */
 	decq	%rdx
 	xorl	%eax,%eax
 2:
 	testq	%rcx,%rcx
 	jz      3f
 	/* set *lencopied and return %rax */
 	subq	%rdx,%r8			/* maxlen minus what is left = bytes copied */
 	movq	%r8,(%rcx)
 3:
 	POP_FRAME_POINTER
 	ret
 4:
 	/* rdx is zero -- return ENAMETOOLONG */
 	movl    $ENAMETOOLONG,%eax
 	jmp	2b
 END(copystr)
 
 /*
  * Handling of special amd64 registers and descriptor tables etc
  */
 /*
  * void lgdt(struct region_descriptor *rdp);
  *
  * Load the GDT described by *rdp, reload %ds, %es, %fs, %gs and %ss
  * with KDSEL, and reload %cs with KCSEL by returning through lretq.
  */
 ENTRY(lgdt)
 	/* reload the descriptor table */
 	lgdt	(%rdi)
 
 	/* flush the prefetch queue */
 	jmp	1f
 	nop
 1:
 	movl	$KDSEL,%eax
 	movl	%eax,%ds
 	movl	%eax,%es
 	movl	%eax,%fs	/* Beware, use wrmsr to set 64 bit base */
 	movl	%eax,%gs
 	movl	%eax,%ss
 
 	/* reload code selector by turning return into intersegmental return */
 	popq	%rax		/* return address */
 	pushq	$KCSEL		/* new %cs */
 	pushq	%rax		/* return address again, now above the selector */
 	MEXITCOUNT
 	lretq
 END(lgdt)
 
 /*****************************************************************************/
 /* setjmp, longjmp                                                           */
 /*****************************************************************************/
 
 /*
  * setjmp(buf): save %rbx, %rsp, %rbp, %r12-%r15 and the return address
  * into the eight 8-byte slots at %rdi.  Returns 0.
  */
 ENTRY(setjmp)
 	movq	%rbx,0(%rdi)			/* save rbx */
 	movq	%rsp,8(%rdi)			/* save rsp */
 	movq	%rbp,16(%rdi)			/* save rbp */
 	movq	%r12,24(%rdi)			/* save r12 */
 	movq	%r13,32(%rdi)			/* save r13 */
 	movq	%r14,40(%rdi)			/* save r14 */
 	movq	%r15,48(%rdi)			/* save r15 */
 	movq	0(%rsp),%rdx			/* get rta */
 	movq	%rdx,56(%rdi)			/* save rip */
 	xorl	%eax,%eax			/* return(0); */
 	ret
 END(setjmp)
 
 /*
  * longjmp(buf): restore the registers saved by setjmp from the slots
  * at %rdi, put the saved return address on the restored stack and
  * return there with %eax = 1.
  */
 ENTRY(longjmp)
 	movq	0(%rdi),%rbx			/* restore rbx */
 	movq	8(%rdi),%rsp			/* restore rsp */
 	movq	16(%rdi),%rbp			/* restore rbp */
 	movq	24(%rdi),%r12			/* restore r12 */
 	movq	32(%rdi),%r13			/* restore r13 */
 	movq	40(%rdi),%r14			/* restore r14 */
 	movq	48(%rdi),%r15			/* restore r15 */
 	movq	56(%rdi),%rdx			/* get rta */
 	movq	%rdx,0(%rsp)			/* put in return frame */
 	xorl	%eax,%eax			/* return(1); */
 	incl	%eax
 	ret
 END(longjmp)
 
 /*
  * Support for reading MSRs in the safe manner.  (Instead of panic on #gp,
  * return an error.)
  *
  * Returns 0 after storing the 64-bit MSR value in *data, or EFAULT
  * (through msr_onfault) when the rdmsr faults.
  *
  * NOTE(review): no END(rdmsr_safe) follows the final ret; confirm that
  * this is intentional.
  */
 ENTRY(rdmsr_safe)
 /* int rdmsr_safe(u_int msr, uint64_t *data) */
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
 	movq	$msr_onfault,PCB_ONFAULT(%r8)
 	movl	%edi,%ecx
 	rdmsr			/* Read MSR pointed by %ecx. Returns
 				   high 32 bits in %edx, low 32 bits
 				   in %eax */
 	salq	$32,%rdx	/* move the high half into bits 63:32 */
 	movl	%eax,%eax	/* zero-extend %eax -> %rax */
 	orq	%rdx,%rax
 	movq	%rax,(%rsi)
 	xorq	%rax,%rax	/* return value, and zero for pcb_onfault */
 	movq	%rax,PCB_ONFAULT(%r8)
 	POP_FRAME_POINTER
 	ret
 
 /*
  * Support for writing MSRs in the safe manner.  (Instead of panic on #gp,
  * return an error.)
  *
  * Returns 0 on success, or EFAULT (through msr_onfault) when the
  * wrmsr faults.
  *
  * NOTE(review): no END(wrmsr_safe) follows the final ret; confirm that
  * this is intentional.
  */
 ENTRY(wrmsr_safe)
 /* int wrmsr_safe(u_int msr, uint64_t data) */
 	PUSH_FRAME_POINTER
 	movq	PCPU(CURPCB),%r8
 	movq	$msr_onfault,PCB_ONFAULT(%r8)
 	movl	%edi,%ecx
 	movl	%esi,%eax	/* low 32 bits of data */
 	sarq	$32,%rsi
 	movl	%esi,%edx	/* high 32 bits of data */
 	wrmsr			/* Write MSR pointed by %ecx. Accepts
 				   high 32 bits in %edx, low 32 bits
 				   in %eax. */
 	xorq	%rax,%rax	/* return value, and zero for pcb_onfault */
 	movq	%rax,PCB_ONFAULT(%r8)
 	POP_FRAME_POINTER
 	ret
 
 /*
  * MSR operations fault handler
  *
  * Installed as pcb_onfault by rdmsr_safe and wrmsr_safe.  Expects
  * curpcb in %r8, clears pcb_onfault and returns EFAULT.
  */
 	ALIGN_TEXT
 msr_onfault:
 	movq	$0,PCB_ONFAULT(%r8)
 	movl	$EFAULT,%eax
 	POP_FRAME_POINTER
 	ret
 
 /*
  * void pmap_pti_pcid_invalidate(uint64_t ucr3, uint64_t kcr3);
  * Invalidates address space addressed by ucr3, then returns to kcr3.
  * Done in assembler to ensure no other memory accesses happen while
  * on ucr3.
  * Interrupts are disabled around the switch; the caller's %rflags
  * are restored before returning.
  */
 	ALIGN_TEXT
 ENTRY(pmap_pti_pcid_invalidate)
 	pushfq
 	cli
 	movq	%rdi,%cr3	/* to user page table */
 	movq	%rsi,%cr3	/* back to kernel */
 	popfq
 	retq
 
 /*
  * void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
  * Invalidates virtual address va in address space ucr3, then returns to kcr3.
  * Interrupts are disabled around the switch; the caller's %rflags
  * are restored before returning.
  */
 	ALIGN_TEXT
 ENTRY(pmap_pti_pcid_invlpg)
 	pushfq
 	cli
 	movq	%rdi,%cr3	/* to user page table */
 	invlpg	(%rdx)
 	movq	%rsi,%cr3	/* back to kernel */
 	popfq
 	retq
 
 /*
  * void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
  *     vm_offset_t eva);
  * Invalidates virtual addresses between sva and eva in address space ucr3,
  * then returns to kcr3.
  * The page at sva is always invalidated; the loop then steps by
  * PAGE_SIZE for as long as the address stays below eva.
  */
 	ALIGN_TEXT
 ENTRY(pmap_pti_pcid_invlrng)
 	pushfq
 	cli
 	movq	%rdi,%cr3	/* to user page table */
 1:	invlpg	(%rdx)
 	addq	$PAGE_SIZE,%rdx
 	cmpq	%rdx,%rcx
 	ja	1b		/* continue while eva > current address */
 	movq	%rsi,%cr3	/* back to kernel */
 	popfq
 	retq
 
 /*
  * rsb_seq count: emit count copies of the sequence
  *	call rsb_seq_N; nop; rsb_seq_N: addq $8,%rsp
  * with N = 1..count.  Each call lands on the label right behind the
  * nop, and the addq drops the return address the call pushed, so %rsp
  * is unchanged afterwards.  Presumably this fills the CPU return
  * stack buffer, going by the rsb_ naming -- confirm.
  * The two helper macros exist so that the %(ll) counter can be
  * expanded into the label name under .altmacro.
  */
 	.altmacro
-	.macro	ibrs_seq_label l
-handle_ibrs_\l:
+	.macro	rsb_seq_label l
+rsb_seq_\l:
 	.endm
-	.macro	ibrs_call_label l
-	call	handle_ibrs_\l
+	.macro	rsb_call_label l
+	call	rsb_seq_\l
 	.endm
-	.macro	ibrs_seq count
+	.macro	rsb_seq count
 	ll=1
 	.rept	\count
-	ibrs_call_label	%(ll)
+	rsb_call_label	%(ll)
 	nop
-	ibrs_seq_label %(ll)
+	rsb_seq_label %(ll)
 	addq	$8,%rsp
 	ll=ll+1
 	.endr
 	.endm
 
+ENTRY(rsb_flush)
+	rsb_seq	32	/* 32 call/discard pairs, see the rsb_seq macro */
+	ret
+
 /*
  * When hw_ibrs_ibpb_active is non-zero, set the IBRS and STIBP bits in
  * MSR_IA32_SPEC_CTRL and record that in the per-CPU IBPB_SET flag.
  * After that, if cpu_stdext_feature lacks CPUID_STDEXT_SMEP, continue
  * in rsb_flush, whose ret returns to our caller.
  */
 /* all callers already saved %rax, %rdx, and %rcx */
 ENTRY(handle_ibrs_entry)
 	cmpb	$0,hw_ibrs_ibpb_active(%rip)
 	je	1f
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	orl	$(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
 	orl	$(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32,%edx
 	wrmsr
 	movb	$1,PCPU(IBPB_SET)
 	testl	$CPUID_STDEXT_SMEP,cpu_stdext_feature(%rip)
-	jne	1f
-	ibrs_seq 32
+	je	rsb_flush
 1:	ret
 END(handle_ibrs_entry)
 
 /*
  * If the per-CPU IBPB_SET flag is set, clear the IBRS and STIBP bits in
  * MSR_IA32_SPEC_CTRL and reset the flag.  Clobbers %rax, %rdx and %rcx
  * on that path.
  */
 ENTRY(handle_ibrs_exit)
 	cmpb	$0,PCPU(IBPB_SET)
 	je	1f
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	andl	$~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
 	andl	$~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx
 	wrmsr
 	movb	$0,PCPU(IBPB_SET)
 1:	ret
 END(handle_ibrs_exit)
 
 /*
  * Same MSR update as handle_ibrs_exit, but %rax, %rdx and %rcx are
  * pushed before and popped after the rdmsr/wrmsr pair, so only %rflags
  * is modified.  Uses three stack slots.
  */
 /* registers-neutral version, but needs stack */
 ENTRY(handle_ibrs_exit_rs)
 	cmpb	$0,PCPU(IBPB_SET)
 	je	1f
 	pushq	%rax
 	pushq	%rdx
 	pushq	%rcx
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	andl	$~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
 	andl	$~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx
 	wrmsr
 	popq	%rcx
 	popq	%rdx
 	popq	%rax
 	movb	$0,PCPU(IBPB_SET)
 1:	ret
 END(handle_ibrs_exit_rs)
 
 	.noaltmacro
 
 /*
  * Flush L1D cache.  Load enough of the data from the kernel text
  * to flush existing L1D content.
  *
  * N.B. The function does not follow ABI calling conventions, it corrupts %rbx.
  * The vmm.ko caller expects that only %rax, %rdx, %rbx, %rcx, %r9, and %rflags
  * registers are clobbered.  The NMI handler caller only needs %r13 preserved.
  *
  * Both passes read the L1D_FLUSH_SIZE bytes starting at KERNBASE.  The
  * index in %rcx starts at -L1D_FLUSH_SIZE and counts up to zero, so the
  * add that advances it also produces the loop-exit condition.
  */
 ENTRY(flush_l1d_sw)
 #define	L1D_FLUSH_SIZE	(64 * 1024)
 	movq	$KERNBASE, %r9
 	movq	$-L1D_FLUSH_SIZE, %rcx
 	/*
 	 * pass 1: Preload TLB.
 	 * Kernel text is mapped using superpages.  TLB preload is
 	 * done for the benefit of older CPUs which split 2M page
 	 * into 4k TLB entries.
 	 */
 1:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
 	addq	$PAGE_SIZE, %rcx
 	jne	1b
 	/* cpuid leaf 0; presumably used as a serializing point between passes. */
 	xorl	%eax, %eax
 	cpuid
 	movq	$-L1D_FLUSH_SIZE, %rcx
 	/* pass 2: Read each cache line. */
 2:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
 	addq	$64, %rcx
 	jne	2b
 	lfence
 	ret
 #undef	L1D_FLUSH_SIZE
 END(flush_l1d_sw)
 
 /*
  * Wrapper around flush_l1d_sw that saves and restores %rbx around the
  * call, since flush_l1d_sw corrupts it.
  */
 ENTRY(flush_l1d_sw_abi)
 	pushq	%rbx
 	call	flush_l1d_sw
 	popq	%rbx
 	ret
 END(flush_l1d_sw_abi)
 
 /* MDS handler that does nothing: returns immediately. */
 ENTRY(mds_handler_void)
 	retq
 END(mds_handler_void)
 
 /*
  * MDS handler based on VERW: stores the current %ds selector into a
  * temporary stack slot and executes verw on that memory operand.
  * No general-purpose register is modified; %rflags is.
  */
 ENTRY(mds_handler_verw)
 	subq	$8, %rsp
 	movw	%ds, (%rsp)
 	verw	(%rsp)
 	addq	$8, %rsp
 	retq
 END(mds_handler_verw)
 
 /*
  * MDS handler operating on the per-CPU MDS_BUF buffer.
  * Preserves %rax, %rdx and %rcx (pushed/popped) and %xmm0 (parked in
  * the per-CPU MDS_TMP slot).  CR0.TS is cleared for the duration if it
  * was set, and the original %cr0, kept in %rax, is written back at the
  * end in that case.
  */
 ENTRY(mds_handler_ivb)
 	pushq	%rax
 	pushq	%rdx
 	pushq	%rcx
 
 	movq	%cr0, %rax
 	testb	$CR0_TS, %al
 	je	1f
 	clts
 1:	movq	PCPU(MDS_BUF), %rdx
 	movdqa	%xmm0, PCPU(MDS_TMP)
 	pxor	%xmm0, %xmm0
 
 	lfence
 	orpd	(%rdx), %xmm0
 	orpd	(%rdx), %xmm0
 	mfence
 	/* 40 non-temporal 16-byte stores, starting 16 bytes into the buffer. */
 	movl	$40, %ecx
 	addq	$16, %rdx
 2:	movntdq	%xmm0, (%rdx)
 	addq	$16, %rdx
 	decl	%ecx
 	jnz	2b
 	mfence
 
 	movdqa	PCPU(MDS_TMP),%xmm0
 	testb	$CR0_TS, %al
 	je	3f
 	movq	%rax, %cr0
 3:	popq	%rcx
 	popq	%rdx
 	popq	%rax
 	retq
 END(mds_handler_ivb)
 
 /*
  * MDS handler operating on the per-CPU MDS_BUF buffer.
  * Preserves %rax, %rbx, %rcx, %rdi and %rsi (pushed/popped) and %xmm0
  * (parked in the per-CPU MDS_TMP slot).  CR0.TS is cleared for the
  * duration if it was set, and the original %cr0, kept in %rax, is
  * written back at the end in that case.
  */
 ENTRY(mds_handler_bdw)
 	pushq	%rax
 	pushq	%rbx
 	pushq	%rcx
 	pushq	%rdi
 	pushq	%rsi
 
 	movq	%cr0, %rax
 	testb	$CR0_TS, %al
 	je	1f
 	clts
 1:	movq	PCPU(MDS_BUF), %rbx
 	movdqa	%xmm0, PCPU(MDS_TMP)
 	pxor	%xmm0, %xmm0
 
 	movq	%rbx, %rdi
 	movq	%rbx, %rsi
 	/* 40 non-temporal 16-byte stores of zero from the buffer start. */
 	movl	$40, %ecx
 2:	movntdq	%xmm0, (%rbx)
 	addq	$16, %rbx
 	decl	%ecx
 	jnz	2b
 	mfence
 	/* Source and destination are both the buffer start: 1536-byte self-copy. */
 	movl	$1536, %ecx
 	rep; movsb
 	lfence
 
 	movdqa	PCPU(MDS_TMP),%xmm0
 	testb	$CR0_TS, %al
 	je	3f
 	movq	%rax, %cr0
 3:	popq	%rsi
 	popq	%rdi
 	popq	%rcx
 	popq	%rbx
 	popq	%rax
 	retq
 END(mds_handler_bdw)
 
 /*
  * MDS handler (SSE variant) operating on the per-CPU MDS_BUF and
  * MDS_BUF64 buffers.
  * Preserves %rax, %rdx, %rcx and %rdi (pushed/popped) and %xmm0 (parked
  * in the per-CPU MDS_TMP slot).  CR0.TS is cleared for the duration if
  * it was set and the original %cr0 is written back at the end.
  *
  * The saved %cr0 is moved from %rax to %rdx once the MDS_BUF64 pointer
  * is no longer needed, because %rax is zeroed for the clflushopt index
  * and the stosb fill value.
  */
 ENTRY(mds_handler_skl_sse)
 	pushq	%rax
 	pushq	%rdx
 	pushq	%rcx
 	pushq	%rdi
 
 	movq	%cr0, %rax
 	testb	$CR0_TS, %al
 	je	1f
 	clts
 1:	movq	PCPU(MDS_BUF), %rdi
 	movq	PCPU(MDS_BUF64), %rdx
 	movdqa	%xmm0, PCPU(MDS_TMP)
 	pxor	%xmm0, %xmm0
 
 	lfence
 	orpd	(%rdx), %xmm0
 	orpd	(%rdx), %xmm0
 	movq	%rax, %rdx		/* keep saved %cr0 across the %rax reuse */
 	xorl	%eax, %eax
 	/* Flush 12 consecutive 64-byte lines starting at MDS_BUF + 5376. */
 2:	clflushopt	5376(%rdi, %rax, 8)
 	addl	$8, %eax
 	cmpl	$8 * 12, %eax
 	jb	2b
 	sfence
 	/* Zero-fill the first 6144 bytes of the buffer. */
 	movl	$6144, %ecx
 	xorl	%eax, %eax
 	rep; stosb
 	mfence
 
 	movdqa	PCPU(MDS_TMP), %xmm0
 	testb	$CR0_TS, %dl
 	je	3f
 	movq	%rdx, %cr0
 3:	popq	%rdi
 	popq	%rcx
 	popq	%rdx
 	popq	%rax
 	retq
 END(mds_handler_skl_sse)
 
 /*
  * MDS handler (AVX variant) operating on the per-CPU MDS_BUF and
  * MDS_BUF64 buffers.
  * Preserves %rax, %rdx, %rcx and %rdi (pushed/popped) and %ymm0 (parked
  * in the per-CPU MDS_TMP slot).  CR0.TS is cleared for the duration if
  * it was set and the original %cr0 is written back at the end.
  *
  * The saved %cr0 is moved from %rax to %rdx once the MDS_BUF64 pointer
  * is no longer needed, because %rax is zeroed for the clflushopt index
  * and the stosb fill value.
  */
 ENTRY(mds_handler_skl_avx)
 	pushq	%rax
 	pushq	%rdx
 	pushq	%rcx
 	pushq	%rdi
 
 	movq	%cr0, %rax
 	testb	$CR0_TS, %al
 	je	1f
 	clts
 1:	movq	PCPU(MDS_BUF), %rdi
 	movq	PCPU(MDS_BUF64), %rdx
 	vmovdqa	%ymm0, PCPU(MDS_TMP)
 	vpxor	%ymm0, %ymm0, %ymm0
 
 	lfence
 	vorpd	(%rdx), %ymm0, %ymm0
 	vorpd	(%rdx), %ymm0, %ymm0
 	movq	%rax, %rdx		/* keep saved %cr0 across the %rax reuse */
 	xorl	%eax, %eax
 	/* Flush 12 consecutive 64-byte lines starting at MDS_BUF + 5376. */
 2:	clflushopt	5376(%rdi, %rax, 8)
 	addl	$8, %eax
 	cmpl	$8 * 12, %eax
 	jb	2b
 	sfence
 	/* Zero-fill the first 6144 bytes of the buffer. */
 	movl	$6144, %ecx
 	xorl	%eax, %eax
 	rep; stosb
 	mfence
 
 	vmovdqa	PCPU(MDS_TMP), %ymm0
 	testb	$CR0_TS, %dl
 	je	3f
 	movq	%rdx, %cr0
 3:	popq	%rdi
 	popq	%rcx
 	popq	%rdx
 	popq	%rax
 	retq
 END(mds_handler_skl_avx)
 
 /*
  * MDS handler (AVX-512 variant) operating on the per-CPU MDS_BUF and
  * MDS_BUF64 buffers.
  * Preserves %rax, %rdx, %rcx and %rdi (pushed/popped) and %zmm0 (parked
  * in the per-CPU MDS_TMP slot).  CR0.TS is cleared for the duration if
  * it was set and the original %cr0 is written back at the end.
  *
  * The saved %cr0 is moved from %rax to %rdx once the MDS_BUF64 pointer
  * is no longer needed, because %rax is zeroed for the clflushopt index
  * and the stosb fill value.
  */
 ENTRY(mds_handler_skl_avx512)
 	pushq	%rax
 	pushq	%rdx
 	pushq	%rcx
 	pushq	%rdi
 
 	movq	%cr0, %rax
 	testb	$CR0_TS, %al
 	je	1f
 	clts
 1:	movq	PCPU(MDS_BUF), %rdi
 	movq	PCPU(MDS_BUF64), %rdx
 	vmovdqa64	%zmm0, PCPU(MDS_TMP)
 	vpxord	%zmm0, %zmm0, %zmm0
 
 	lfence
 	vorpd	(%rdx), %zmm0, %zmm0
 	vorpd	(%rdx), %zmm0, %zmm0
 	movq	%rax, %rdx		/* keep saved %cr0 across the %rax reuse */
 	xorl	%eax, %eax
 	/* Flush 12 consecutive 64-byte lines starting at MDS_BUF + 5376. */
 2:	clflushopt	5376(%rdi, %rax, 8)
 	addl	$8, %eax
 	cmpl	$8 * 12, %eax
 	jb	2b
 	sfence
 	/* Zero-fill the first 6144 bytes of the buffer. */
 	movl	$6144, %ecx
 	xorl	%eax, %eax
 	rep; stosb
 	mfence
 
 	vmovdqa64	PCPU(MDS_TMP), %zmm0
 	testb	$CR0_TS, %dl
 	je	3f
 	movq	%rdx, %cr0
 3:	popq	%rdi
 	popq	%rcx
 	popq	%rdx
 	popq	%rax
 	retq
 END(mds_handler_skl_avx512)
 
 /*
  * MDS handler operating on the per-CPU MDS_BUF buffer.
  * Preserves %rax, %rdx and %rcx (pushed/popped) and %xmm0 (parked in
  * the per-CPU MDS_TMP slot).  CR0.TS is cleared for the duration if it
  * was set, and the original %cr0, kept in %rax, is written back at the
  * end in that case.
  */
 ENTRY(mds_handler_silvermont)
 	pushq	%rax
 	pushq	%rdx
 	pushq	%rcx
 
 	movq	%cr0, %rax
 	testb	$CR0_TS, %al
 	je	1f
 	clts
 1:	movq	PCPU(MDS_BUF), %rdx
 	movdqa	%xmm0, PCPU(MDS_TMP)
 	pxor	%xmm0, %xmm0
 
 	/* 16 non-temporal 16-byte stores of zero from the buffer start. */
 	movl	$16, %ecx
 2:	movntdq	%xmm0, (%rdx)
 	addq	$16, %rdx
 	decl	%ecx
 	jnz	2b
 	mfence
 
 	movdqa	PCPU(MDS_TMP),%xmm0
 	testb	$CR0_TS, %al
 	je	3f
 	movq	%rax, %cr0
 3:	popq	%rcx
 	popq	%rdx
 	popq	%rax
 	retq
 END(mds_handler_silvermont)
Index: head/sys/i386/i386/support.s
===================================================================
--- head/sys/i386/i386/support.s	(revision 361301)
+++ head/sys/i386/i386/support.s	(revision 361302)
@@ -1,658 +1,679 @@
 /*-
  * Copyright (c) 1993 The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <machine/asmacros.h>
 #include <machine/cputypes.h>
 #include <machine/pmap.h>
 #include <machine/specialreg.h>
 
 #include "assym.inc"
 
 #define IDXSHIFT	10
 
 	.text
 
 /*
  * bcopy family
  * void bzero(void *buf, u_int len)
  *
  * Stores len / 4 zero dwords followed by len % 4 zero bytes at buf.
  * %edi is saved and restored; %eax and %ecx are clobbered.  The string
  * stores run upward only if the direction flag is clear, which this
  * routine assumes and does not establish itself.
  */
 ENTRY(bzero)
 	pushl	%edi
 	movl	8(%esp),%edi
 	movl	12(%esp),%ecx
 	xorl	%eax,%eax
 	shrl	$2,%ecx
 	rep
 	stosl
 	movl	12(%esp),%ecx
 	andl	$3,%ecx
 	rep
 	stosb
 	popl	%edi
 	ret
 END(bzero)
 
 /*
  * Zero the 4096 bytes starting at the address passed as the only stack
  * argument, using non-temporal (movnti) stores, 8 bytes per loop
  * iteration, followed by an sfence.  %ebx is saved and restored; %eax
  * and %ecx are clobbered.
  */
 ENTRY(sse2_pagezero)
 	pushl	%ebx
 	movl	8(%esp),%ecx
 	movl	%ecx,%eax
 	addl	$4096,%eax
 	xor	%ebx,%ebx
 	jmp	1f
 	/*
 	 * The loop takes 14 bytes.  Ensure that it doesn't cross a 16-byte
 	 * cache line.
 	 */
 	.p2align 4,0x90
 1:
 	movnti	%ebx,(%ecx)
 	movnti	%ebx,4(%ecx)
 	addl	$8,%ecx
 	cmpl	%ecx,%eax
 	jne	1b
 	sfence
 	popl	%ebx
 	ret
 END(sse2_pagezero)
 
 /*
  * Zero a region of 1024 dwords starting at the address passed as the
  * only stack argument.  The region is first scanned with "repe scasl";
  * dwords that already read as zero are skipped, and stores are issued
  * only from the first non-zero dword found.
  * %edi and %ebx are saved and restored; %eax, %ecx and %edx are
  * clobbered.
  */
 ENTRY(i686_pagezero)
 	pushl	%edi
 	pushl	%ebx
 
 	movl	12(%esp),%edi
 	movl	$1024,%ecx
 
 	ALIGN_TEXT
 1:
 	/* Scan forward while dwords compare equal to zero. */
 	xorl	%eax,%eax
 	repe
 	scasl
 	jnz	2f
 
 	/* Scan ran to the end without finding a non-zero dword. */
 	popl	%ebx
 	popl	%edi
 	ret
 
 	ALIGN_TEXT
 
 2:
 	/* scasl stepped past the mismatching dword; back up onto it. */
 	incl	%ecx
 	subl	$4,%edi
 
 	/* %edx tracks the dwords still to process. */
 	movl	%ecx,%edx
 	cmpl	$16,%ecx
 
 	jge	3f
 
 	/*
 	 * Fewer than 16 dwords left: store up to the next 64-byte
 	 * boundary (16 dwords minus the dword index within the line).
 	 */
 	movl	%edi,%ebx
 	andl	$0x3f,%ebx
 	shrl	%ebx
 	shrl	%ebx
 	movl	$16,%ecx
 	subl	%ebx,%ecx
 
 3:
 	subl	%ecx,%edx
 	rep
 	stosl
 
 	/* Resume scanning if anything is left. */
 	movl	%edx,%ecx
 	testl	%edx,%edx
 	jnz	1b
 
 	popl	%ebx
 	popl	%edi
 	ret
 END(i686_pagezero)
 
 /*
  * fillw(pat, base, cnt)
  * Stores cnt copies of the low 16 bits of pat at base ("rep stosw").
  * %edi is saved and restored; %eax and %ecx are clobbered.
  */
 ENTRY(fillw)
 	pushl	%edi
 	movl	8(%esp),%eax
 	movl	12(%esp),%edi
 	movl	16(%esp),%ecx
 	rep
 	stosw
 	popl	%edi
 	ret
 END(fillw)
 
 /*
  * memmove(dst, src, cnt) (return dst)
  * bcopy(src, dst, cnt)
  *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
  *
  * bcopy swaps its first two stack arguments in place so that they are in
  * memmove's (dst, src) order, then tail-jumps to memmove.
  */
 ENTRY(bcopy)
 	movl	4(%esp),%eax
 	movl	8(%esp),%edx
 	movl	%eax,8(%esp)
 	movl	%edx,4(%esp)
 	MEXITCOUNT
 	jmp	memmove
 END(bcopy)
 
 /*
  * memmove(dst, src, cnt): copy cnt bytes from src to dst and return dst
  * in %eax.  Copies forward unless (dst - src), taken as an unsigned
  * value, is below cnt, in which case it copies backward with the
  * direction flag set and clears the flag again before returning.
  * %esi, %edi and %ebp are saved and restored.
  */
 ENTRY(memmove)
 	pushl	%ebp
 	movl	%esp,%ebp
 	pushl	%esi
 	pushl	%edi
 	movl	8(%ebp),%edi
 	movl	12(%ebp),%esi
 1:
 	movl	16(%ebp),%ecx
 
 	movl	%edi,%eax
 	subl	%esi,%eax
 	cmpl	%ecx,%eax			/* overlapping && src < dst? */
 	jb	1f
 
 	shrl	$2,%ecx				/* copy by 32-bit words */
 	rep
 	movsl
 	movl	16(%ebp),%ecx
 	andl	$3,%ecx				/* any bytes left? */
 	rep
 	movsb
 	popl	%edi
 	popl	%esi
 	movl	8(%ebp),%eax			/* return dst for memmove */
 	popl	%ebp
 	ret
 
 	ALIGN_TEXT
 1:
 	addl	%ecx,%edi			/* copy backwards */
 	addl	%ecx,%esi
 	decl	%edi
 	decl	%esi
 	andl	$3,%ecx				/* any fractional bytes? */
 	std
 	rep
 	movsb
 	movl	16(%ebp),%ecx			/* copy remainder by 32-bit words */
 	shrl	$2,%ecx
 	/* Step back from the last byte position to the start of a dword. */
 	subl	$3,%esi
 	subl	$3,%edi
 	rep
 	movsl
 	popl	%edi
 	popl	%esi
 	cld
 	movl	8(%ebp),%eax			/* return dst for memmove */
 	popl	%ebp
 	ret
 END(memmove)
 
 /*
  * Note: memcpy does not support overlapping copies
  *
  * Arguments on the stack are (dst, src, cnt).  Copies cnt / 4 dwords and
  * then cnt % 4 bytes in ascending order and returns dst in %eax.
  * %edi and %esi are saved and restored; %ecx is clobbered.
  */
 ENTRY(memcpy)
 	pushl	%edi
 	pushl	%esi
 	movl	12(%esp),%edi
 	movl	16(%esp),%esi
 	movl	20(%esp),%ecx
 	movl	%edi,%eax
 	shrl	$2,%ecx				/* copy by 32-bit words */
 	rep
 	movsl
 	movl	20(%esp),%ecx
 	andl	$3,%ecx				/* any bytes left? */
 	rep
 	movsb
 	popl	%esi
 	popl	%edi
 	ret
 END(memcpy)
 
 /*
  * copystr(from, to, maxlen, int *lencopied) - MP SAFE
  *
  * Copies bytes from "from" to "to" up to and including the terminating
  * NUL, but at most maxlen bytes.  Returns 0 if the NUL was copied and
  * ENAMETOOLONG if maxlen bytes were copied without reaching it.  If
  * lencopied is not NULL, the number of bytes copied (counting the NUL)
  * is stored through it.
  */
 ENTRY(copystr)
 	pushl	%esi
 	pushl	%edi
 
 	movl	12(%esp),%esi			/* %esi = from */
 	movl	16(%esp),%edi			/* %edi = to */
 	movl	20(%esp),%edx			/* %edx = maxlen */
 	/* Pre-increment so the decrement at the loop head tests maxlen itself. */
 	incl	%edx
 1:
 	decl	%edx
 	jz	4f
 	lodsb
 	stosb
 	orb	%al,%al
 	jnz	1b
 
 	/* Success -- 0 byte reached */
 	decl	%edx
 	xorl	%eax,%eax
 	jmp	6f
 4:
 	/* edx is zero -- return ENAMETOOLONG */
 	movl	$ENAMETOOLONG,%eax
 
 6:
 	/* set *lencopied and return %eax */
 	movl	20(%esp),%ecx
 	subl	%edx,%ecx
 	movl	24(%esp),%edx
 	testl	%edx,%edx
 	jz	7f
 	movl	%ecx,(%edx)
 7:
 	popl	%edi
 	popl	%esi
 	ret
 END(copystr)
 
 /*
  * bcmp: compare two buffers of the given length (stack arguments: two
  * pointers and a byte count).  Compares count / 4 dwords and then
  * count % 4 bytes.  Returns 0 in %eax if every unit matched and 1
  * otherwise.  %edi and %esi are saved and restored.
  */
 ENTRY(bcmp)
 	pushl	%edi
 	pushl	%esi
 	movl	12(%esp),%edi
 	movl	16(%esp),%esi
 	movl	20(%esp),%edx
 
 	movl	%edx,%ecx
 	shrl	$2,%ecx
 	repe
 	cmpsl
 	jne	1f
 
 	movl	%edx,%ecx
 	andl	$3,%ecx
 	repe
 	cmpsb
 1:
 	/* ZF from the last comparison decides the result. */
 	setne	%al
 	movsbl	%al,%eax
 	popl	%esi
 	popl	%edi
 	ret
 END(bcmp)
 
 /*
  * Handling of special 386 registers and descriptor tables etc
  */
 /* void lgdt(struct region_descriptor *rdp); */
 /*
  * Loads the GDT register from *rdp, reloads %ds, %es, %gs and %ss with
  * KDSEL and %fs with KPSEL, and reloads %cs with KCSEL by returning to
  * the caller through a far return.
  */
 ENTRY(lgdt)
 	/* reload the descriptor table */
 	movl	4(%esp),%eax
 	lgdt	(%eax)
 
 	/* flush the prefetch q */
 	jmp	1f
 	nop
 1:
 	/* reload "stale" selectors */
 	movl	$KDSEL,%eax
 	movl	%eax,%ds
 	movl	%eax,%es
 	movl	%eax,%gs
 	movl	%eax,%ss
 	movl	$KPSEL,%eax
 	movl	%eax,%fs
 
 	/* reload code selector by turning return into intersegmental return */
 	/* Duplicate the return address, then overwrite the upper copy with KCSEL. */
 	movl	(%esp),%eax
 	pushl	%eax
 	movl	$KCSEL,4(%esp)
 	MEXITCOUNT
 	lret
 END(lgdt)
 
 /*
  * ssdtosd(*ssdp,*sdp)
  * Reads three dwords from *ssdp (offsets 0, 4 and 8), repacks their
  * bit fields, and writes the resulting two dwords to *sdp.
  * Presumably converts a software segment descriptor into the hardware
  * descriptor layout -- NOTE(review): confirm against the structure
  * definitions.  %ebx is saved and restored.
  */
 ENTRY(ssdtosd)
 	pushl	%ebx
 	movl	8(%esp),%ecx
 	movl	8(%ecx),%ebx
 	shll	$16,%ebx
 	movl	(%ecx),%edx
 	roll	$16,%edx
 	movb	%dh,%bl
 	movb	%dl,%bh
 	rorl	$8,%ebx
 	movl	4(%ecx),%eax
 	movw	%ax,%dx
 	andl	$0xf0000,%eax
 	orl	%eax,%ebx
 	movl	12(%esp),%ecx
 	movl	%edx,(%ecx)
 	movl	%ebx,4(%ecx)
 	popl	%ebx
 	ret
 END(ssdtosd)
 
 /*
  * void reset_dbregs()
  * Writes zero to %dr7 first and then to %dr0-%dr3 and %dr6.
  * Clobbers %eax.
  */
 ENTRY(reset_dbregs)
 	movl	$0,%eax
 	movl	%eax,%dr7	/* disable all breakpoints first */
 	movl	%eax,%dr0
 	movl	%eax,%dr1
 	movl	%eax,%dr2
 	movl	%eax,%dr3
 	movl	%eax,%dr6
 	ret
 END(reset_dbregs)
 
 /*****************************************************************************/
 /* setjump, longjump                                                         */
 /*****************************************************************************/
 
 /*
  * setjmp: save %ebx, %esp, %ebp, %esi, %edi and the return address into
  * the six-dword buffer passed as the only argument, then return 0.
  */
 ENTRY(setjmp)
 	movl	4(%esp),%eax
 	movl	%ebx,(%eax)			/* save ebx */
 	movl	%esp,4(%eax)			/* save esp */
 	movl	%ebp,8(%eax)			/* save ebp */
 	movl	%esi,12(%eax)			/* save esi */
 	movl	%edi,16(%eax)			/* save edi */
 	movl	(%esp),%edx			/* get rta */
 	movl	%edx,20(%eax)			/* save eip */
 	xorl	%eax,%eax			/* return(0); */
 	ret
 END(setjmp)
 
 /*
  * longjmp: restore the registers saved by setjmp from the buffer passed
  * as the only argument, place the saved return address on the restored
  * stack, and return 1 to the original setjmp call site.
  */
 ENTRY(longjmp)
 	movl	4(%esp),%eax
 	movl	(%eax),%ebx			/* restore ebx */
 	movl	4(%eax),%esp			/* restore esp */
 	movl	8(%eax),%ebp			/* restore ebp */
 	movl	12(%eax),%esi			/* restore esi */
 	movl	16(%eax),%edi			/* restore edi */
 	movl	20(%eax),%edx			/* get rta */
 	movl	%edx,(%esp)			/* put in return frame */
 	xorl	%eax,%eax			/* return(1); */
 	incl	%eax
 	ret
 END(longjmp)
 
 /*
  * Support for reading MSRs in the safe manner.  (Instead of panic on #gp,
  * return an error.)
  *
  * Arms pcb_onfault with msr_onfault for the duration of the rdmsr.  On
  * success the 64-bit value is stored through data (low dword first),
  * pcb_onfault is cleared and 0 is returned.
  */
 ENTRY(rdmsr_safe)
 /* int rdmsr_safe(u_int msr, uint64_t *data) */
 	movl	PCPU(CURPCB),%ecx
 	movl	$msr_onfault,PCB_ONFAULT(%ecx)
 
 	movl	4(%esp),%ecx
 	rdmsr
 	movl	8(%esp),%ecx
 	movl	%eax,(%ecx)
 	movl	%edx,4(%ecx)
 	xorl	%eax,%eax
 
 	/* %eax is zero here: it both clears pcb_onfault and is the return value. */
 	movl	PCPU(CURPCB),%ecx
 	movl	%eax,PCB_ONFAULT(%ecx)
 
 	ret
 
 /*
  * Support for writing MSRs in the safe manner.  (Instead of panic on #gp,
  * return an error.)
  *
  * Arms pcb_onfault with msr_onfault for the duration of the wrmsr.  The
  * 64-bit data argument is taken from the stack as two dwords (low in
  * %eax, high in %edx).  On success pcb_onfault is cleared and 0 is
  * returned.
  */
 ENTRY(wrmsr_safe)
 /* int wrmsr_safe(u_int msr, uint64_t data) */
 	movl	PCPU(CURPCB),%ecx
 	movl	$msr_onfault,PCB_ONFAULT(%ecx)
 
 	movl	4(%esp),%ecx
 	movl	8(%esp),%eax
 	movl	12(%esp),%edx
 	wrmsr
 	xorl	%eax,%eax
 
 	/* %eax is zero here: it both clears pcb_onfault and is the return value. */
 	movl	PCPU(CURPCB),%ecx
 	movl	%eax,PCB_ONFAULT(%ecx)
 
 	ret
 
 /*
  * MSR operations fault handler
  *
  * Installed in pcb_onfault by rdmsr_safe and wrmsr_safe.  Clears
  * pcb_onfault and returns EFAULT on behalf of the routine that armed it.
  */
 	ALIGN_TEXT
 msr_onfault:
 	movl	PCPU(CURPCB),%ecx
 	movl	$0,PCB_ONFAULT(%ecx)
 	movl	$EFAULT,%eax
 	ret
 
+	.altmacro
+/*
+ * rsb_seq count: emit count copies of
+ *	call rsb_seq_N; nop; rsb_seq_N: addl $4,%esp
+ * Each call targets the label right after its own nop, and the addl
+ * drops the return address the call pushed, so the sequence leaves the
+ * stack pointer and all registers (except %eflags) unchanged while
+ * executing count calls that are never matched by a ret.
+ *
+ * The labels are numbered from 1 on every expansion (ll=1), so rsb_seq
+ * can only be expanded once per assembly unit without duplicate symbols.
+ * The %(ll) argument syntax needs .altmacro mode.
+ */
+	.macro	rsb_seq_label l
+rsb_seq_\l:
+	.endm
+	.macro	rsb_call_label l
+	call	rsb_seq_\l
+	.endm
+	.macro	rsb_seq count
+	ll=1
+	.rept	\count
+	rsb_call_label	%(ll)
+	nop
+	rsb_seq_label %(ll)
+	addl	$4,%esp
+	ll=ll+1
+	.endr
+	.endm
+
+/*
+ * void rsb_flush(void);
+ * Runs the 32-entry call sequence above and returns.  Modifies only
+ * %eflags.
+ */
+ENTRY(rsb_flush)
+	rsb_seq	32
+	ret
+END(rsb_flush)
+
+	/* The only macro expansion that needs altmacro syntax is done. */
+	.noaltmacro
+
 /*
  * If hw_ibrs_ibpb_active is non-zero, set the IBRS and STIBP bits in
  * MSR_IA32_SPEC_CTRL and record that in the per-CPU IBPB_SET flag.
  * Both paths then tail-jump to rsb_flush, so the rsb_flush sequence runs
  * whether or not IBRS is active; rsb_flush's ret returns to our caller.
  * Clobbers %eax, %ecx and %edx when the MSR is updated.
  */
 ENTRY(handle_ibrs_entry)
 	cmpb	$0,hw_ibrs_ibpb_active
 	je	1f
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	orl	$(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
 	orl	$(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32,%edx
 	wrmsr
 	movb	$1,PCPU(IBPB_SET)
 	/*
-	 * i386 does not implement SMEP, but the 4/4 split makes this not
-	 * that important.
+	 * i386 does not implement SMEP.
 	 */
-1:	ret
+1:	jmp	rsb_flush
 END(handle_ibrs_entry)
 
 /*
  * If the per-CPU IBPB_SET flag is set, clear the IBRS and STIBP bits in
  * MSR_IA32_SPEC_CTRL and reset the flag.  Clobbers %eax, %ecx and %edx
  * on that path.
  */
 ENTRY(handle_ibrs_exit)
 	cmpb	$0,PCPU(IBPB_SET)
 	je	1f
 	movl	$MSR_IA32_SPEC_CTRL,%ecx
 	rdmsr
 	andl	$~(IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP),%eax
 	andl	$~((IA32_SPEC_CTRL_IBRS|IA32_SPEC_CTRL_STIBP)>>32),%edx
 	wrmsr
 	movb	$0,PCPU(IBPB_SET)
 1:	ret
 END(handle_ibrs_exit)
 
 /* MDS handler that does nothing: returns immediately. */
 ENTRY(mds_handler_void)
 	ret
 END(mds_handler_void)
 
 /*
  * MDS handler based on VERW: stores the current %ds selector into a
  * temporary stack slot and executes verw on that memory operand.
  * No general-purpose register is modified; %eflags is.
  */
 ENTRY(mds_handler_verw)
 	subl	$4, %esp
 	movw	%ds, (%esp)
 	verw	(%esp)
 	addl	$4, %esp
 	ret
 END(mds_handler_verw)
 
 /*
  * MDS handler operating on the per-CPU MDS_BUF buffer.
  * %xmm0 is parked in the per-CPU MDS_TMP slot and restored.  CR0.TS is
  * cleared for the duration if it was set, and the original %cr0, kept
  * in %eax, is written back at the end in that case.
  * Unlike the amd64 version no general-purpose register is saved:
  * %eax, %ecx and %edx are clobbered.
  */
 ENTRY(mds_handler_ivb)
 	movl	%cr0, %eax
 	testb	$CR0_TS, %al
 	je	1f
 	clts
 1:	movl	PCPU(MDS_BUF), %edx
 	movdqa	%xmm0, PCPU(MDS_TMP)
 	pxor	%xmm0, %xmm0
 
 	lfence
 	orpd	(%edx), %xmm0
 	orpd	(%edx), %xmm0
 	mfence
 	/* 40 non-temporal 16-byte stores, starting 16 bytes into the buffer. */
 	movl	$40, %ecx
 	addl	$16, %edx
 2:	movntdq	%xmm0, (%edx)
 	addl	$16, %edx
 	decl	%ecx
 	jnz	2b
 	mfence
 
 	movdqa	PCPU(MDS_TMP),%xmm0
 	testb	$CR0_TS, %al
 	je	3f
 	movl	%eax, %cr0
 3:	ret
 END(mds_handler_ivb)
 
 /*
  * MDS handler operating on the per-CPU MDS_BUF buffer.
  * %xmm0 is parked in the per-CPU MDS_TMP slot and restored.  CR0.TS is
  * cleared for the duration if it was set, and the original %cr0, kept
  * in %eax, is written back at the end in that case.
  * No general-purpose register is saved: %eax, %ebx, %ecx, %esi and %edi
  * are clobbered.
  * NOTE(review): %ebx, %esi and %edi are callee-saved in the i386 C ABI;
  * confirm that every caller tolerates their loss.
  */
 ENTRY(mds_handler_bdw)
 	movl	%cr0, %eax
 	testb	$CR0_TS, %al
 	je	1f
 	clts
 1:	movl	PCPU(MDS_BUF), %ebx
 	movdqa	%xmm0, PCPU(MDS_TMP)
 	pxor	%xmm0, %xmm0
 
 	movl	%ebx, %edi
 	movl	%ebx, %esi
 	/* 40 non-temporal 16-byte stores of zero from the buffer start. */
 	movl	$40, %ecx
 2:	movntdq	%xmm0, (%ebx)
 	addl	$16, %ebx
 	decl	%ecx
 	jnz	2b
 	mfence
 	/* Source and destination are both the buffer start: 1536-byte self-copy. */
 	movl	$1536, %ecx
 	rep; movsb
 	lfence
 
 	movdqa	PCPU(MDS_TMP),%xmm0
 	testb	$CR0_TS, %al
 	je	3f
 	movl	%eax, %cr0
 3:	ret
 END(mds_handler_bdw)
 
 /*
  * MDS handler (SSE variant) operating on the per-CPU MDS_BUF and
  * MDS_BUF64 buffers.
  * %xmm0 is parked in the per-CPU MDS_TMP slot and restored.  CR0.TS is
  * cleared for the duration if it was set and the original %cr0 is
  * written back at the end.
  *
  * The saved %cr0 is moved from %eax to %edx once the MDS_BUF64 pointer
  * is no longer needed, because %eax is zeroed for the clflushopt index
  * and the stosb fill value.
  *
  * No general-purpose register is saved: %eax, %ecx, %edx and %edi are
  * clobbered.
  * NOTE(review): %edi is callee-saved in the i386 C ABI; confirm that
  * every caller tolerates its loss.
  */
 ENTRY(mds_handler_skl_sse)
 	movl	%cr0, %eax
 	testb	$CR0_TS, %al
 	je	1f
 	clts
 1:	movl	PCPU(MDS_BUF), %edi
 	movl	PCPU(MDS_BUF64), %edx
 	movdqa	%xmm0, PCPU(MDS_TMP)
 	pxor	%xmm0, %xmm0
 
 	lfence
 	orpd	(%edx), %xmm0
 	orpd	(%edx), %xmm0
 	movl	%eax, %edx		/* keep saved %cr0 across the %eax reuse */
 	xorl	%eax, %eax
 	/* Flush 12 consecutive 64-byte lines starting at MDS_BUF + 5376. */
 2:	clflushopt	5376(%edi, %eax, 8)
 	addl	$8, %eax
 	cmpl	$8 * 12, %eax
 	jb	2b
 	sfence
 	/* Zero-fill the first 6144 bytes of the buffer. */
 	movl	$6144, %ecx
 	xorl	%eax, %eax
 	rep; stosb
 	mfence
 
 	movdqa	PCPU(MDS_TMP), %xmm0
 	testb	$CR0_TS, %dl
 	je	3f
 	movl	%edx, %cr0
 3:	ret
 END(mds_handler_skl_sse)
 
 /*
  * MDS handler (AVX variant) operating on the per-CPU MDS_BUF and
  * MDS_BUF64 buffers.
  * %ymm0 is parked in the per-CPU MDS_TMP slot and restored.  CR0.TS is
  * cleared for the duration if it was set and the original %cr0 is
  * written back at the end.
  *
  * The saved %cr0 is moved from %eax to %edx once the MDS_BUF64 pointer
  * is no longer needed, because %eax is zeroed for the clflushopt index
  * and the stosb fill value.
  *
  * No general-purpose register is saved: %eax, %ecx, %edx and %edi are
  * clobbered.
  * NOTE(review): %edi is callee-saved in the i386 C ABI; confirm that
  * every caller tolerates its loss.
  */
 ENTRY(mds_handler_skl_avx)
 	movl	%cr0, %eax
 	testb	$CR0_TS, %al
 	je	1f
 	clts
 1:	movl	PCPU(MDS_BUF), %edi
 	movl	PCPU(MDS_BUF64), %edx
 	vmovdqa	%ymm0, PCPU(MDS_TMP)
 	vpxor	%ymm0, %ymm0, %ymm0
 
 	lfence
 	vorpd	(%edx), %ymm0, %ymm0
 	vorpd	(%edx), %ymm0, %ymm0
 	movl	%eax, %edx		/* keep saved %cr0 across the %eax reuse */
 	xorl	%eax, %eax
 	/* Flush 12 consecutive 64-byte lines starting at MDS_BUF + 5376. */
 2:	clflushopt	5376(%edi, %eax, 8)
 	addl	$8, %eax
 	cmpl	$8 * 12, %eax
 	jb	2b
 	sfence
 	/* Zero-fill the first 6144 bytes of the buffer. */
 	movl	$6144, %ecx
 	xorl	%eax, %eax
 	rep; stosb
 	mfence
 
 	vmovdqa	PCPU(MDS_TMP), %ymm0
 	testb	$CR0_TS, %dl
 	je	3f
 	movl	%edx, %cr0
 3:	ret
 END(mds_handler_skl_avx)
 
 /*
  * MDS handler (AVX-512 variant) operating on the per-CPU MDS_BUF and
  * MDS_BUF64 buffers.
  * %zmm0 is parked in the per-CPU MDS_TMP slot and restored.  CR0.TS is
  * cleared for the duration if it was set and the original %cr0 is
  * written back at the end.
  *
  * The saved %cr0 is moved from %eax to %edx once the MDS_BUF64 pointer
  * is no longer needed, because %eax is zeroed for the clflushopt index
  * and the stosb fill value.
  *
  * No general-purpose register is saved: %eax, %ecx, %edx and %edi are
  * clobbered.
  * NOTE(review): %edi is callee-saved in the i386 C ABI; confirm that
  * every caller tolerates its loss.
  */
 ENTRY(mds_handler_skl_avx512)
 	movl	%cr0, %eax
 	testb	$CR0_TS, %al
 	je	1f
 	clts
 1:	movl	PCPU(MDS_BUF), %edi
 	movl	PCPU(MDS_BUF64), %edx
 	vmovdqa64	%zmm0, PCPU(MDS_TMP)
 	vpxord	%zmm0, %zmm0, %zmm0
 
 	lfence
 	vorpd	(%edx), %zmm0, %zmm0
 	vorpd	(%edx), %zmm0, %zmm0
 	movl	%eax, %edx		/* keep saved %cr0 across the %eax reuse */
 	xorl	%eax, %eax
 	/* Flush 12 consecutive 64-byte lines starting at MDS_BUF + 5376. */
 2:	clflushopt	5376(%edi, %eax, 8)
 	addl	$8, %eax
 	cmpl	$8 * 12, %eax
 	jb	2b
 	sfence
 	/* Zero-fill the first 6144 bytes of the buffer. */
 	movl	$6144, %ecx
 	xorl	%eax, %eax
 	rep; stosb
 	mfence
 
 	vmovdqa64	PCPU(MDS_TMP), %zmm0
 	testb	$CR0_TS, %dl
 	je	3f
 	movl	%edx, %cr0
 3:	ret
 END(mds_handler_skl_avx512)
 
 /*
  * MDS handler operating on the per-CPU MDS_BUF buffer.
  * %xmm0 is parked in the per-CPU MDS_TMP slot and restored.  CR0.TS is
  * cleared for the duration if it was set, and the original %cr0, kept
  * in %eax, is written back at the end in that case.
  * No general-purpose register is saved: %eax, %ecx and %edx are
  * clobbered.
  */
 ENTRY(mds_handler_silvermont)
 	movl	%cr0, %eax
 	testb	$CR0_TS, %al
 	je	1f
 	clts
 1:	movl	PCPU(MDS_BUF), %edx
 	movdqa	%xmm0, PCPU(MDS_TMP)
 	pxor	%xmm0, %xmm0
 
 	/* 16 non-temporal 16-byte stores of zero from the buffer start. */
 	movl	$16, %ecx
 2:	movntdq	%xmm0, (%edx)
 	addl	$16, %edx
 	decl	%ecx
 	jnz	2b
 	mfence
 
 	movdqa	PCPU(MDS_TMP),%xmm0
 	testb	$CR0_TS, %al
 	je	3f
 	movl	%eax, %cr0
 3:	ret
 END(mds_handler_silvermont)
Index: head/sys/x86/include/x86_var.h
===================================================================
--- head/sys/x86/include/x86_var.h	(revision 361301)
+++ head/sys/x86/include/x86_var.h	(revision 361302)
@@ -1,159 +1,160 @@
 /*-
  * Copyright (c) 1995 Bruce D. Evans.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _X86_X86_VAR_H_
 #define	_X86_X86_VAR_H_
 
 /*
  * Miscellaneous machine-dependent declarations.
  */
 
 extern	long	Maxmem;
 extern	u_int	basemem;
 extern	int	busdma_swi_pending;
 extern	u_int	cpu_exthigh;
 extern	u_int	cpu_feature;
 extern	u_int	cpu_feature2;
 extern	u_int	amd_feature;
 extern	u_int	amd_feature2;
 extern	u_int	amd_rascap;
 extern	u_int	amd_pminfo;
 extern	u_int	amd_extended_feature_extensions;
 extern	u_int	via_feature_rng;
 extern	u_int	via_feature_xcrypt;
 extern	u_int	cpu_clflush_line_size;
 extern	u_int	cpu_stdext_feature;
 extern	u_int	cpu_stdext_feature2;
 extern	u_int	cpu_stdext_feature3;
 extern	uint64_t cpu_ia32_arch_caps;
 extern	u_int	cpu_fxsr;
 extern	u_int	cpu_high;
 extern	u_int	cpu_id;
 extern	u_int	cpu_max_ext_state_size;
 extern	u_int	cpu_mxcsr_mask;
 extern	u_int	cpu_procinfo;
 extern	u_int	cpu_procinfo2;
 extern	char	cpu_vendor[];
 extern	u_int	cpu_vendor_id;
 extern	u_int	cpu_mon_mwait_flags;
 extern	u_int	cpu_mon_min_size;
 extern	u_int	cpu_mon_max_size;
 extern	u_int	cpu_maxphyaddr;
 extern	u_int	cpu_power_eax;
 extern	u_int	cpu_power_ebx;
 extern	u_int	cpu_power_ecx;
 extern	u_int	cpu_power_edx;
 extern	char	ctx_switch_xsave[];
 extern	u_int	hv_base;
 extern	u_int	hv_high;
 extern	char	hv_vendor[];
 extern	char	kstack[];
 extern	char	sigcode[];
 extern	int	szsigcode;
 extern	int	vm_page_dump_size;
 extern	int	workaround_erratum383;
 extern	int	_udatasel;
 extern	int	_ucodesel;
 extern	int	_ucode32sel;
 extern	int	_ufssel;
 extern	int	_ugssel;
 extern	int	use_xsave;
 extern	uint64_t xsave_mask;
 extern	u_int	max_apic_id;
 extern	int	i386_read_exec;
 extern	int	pti;
 extern	int	hw_ibrs_ibpb_active;
 extern	int	hw_mds_disable;
 extern	int	hw_ssb_active;
 extern	int	x86_taa_enable;
+extern	int	cpu_flush_rsb_ctxsw;
 
 struct	pcb;
 struct	thread;
 struct	reg;
 struct	fpreg;
 struct  dbreg;
 struct	dumperinfo;
 struct	trapframe;
 
 /*
  * The interface type of the interrupt handler entry point cannot be
  * expressed in C.  Use simplest non-variadic function type as an
  * approximation.
  */
 typedef void alias_for_inthand_t(void);
 
 bool	acpi_get_fadt_bootflags(uint16_t *flagsp);
 void	*alloc_fpusave(int flags);
 void	busdma_swi(void);
 vm_paddr_t cpu_getmaxphyaddr(void);
 bool	cpu_mwait_usable(void);
 void	cpu_probe_amdc1e(void);
 void	cpu_setregs(void);
 bool	disable_wp(void);
 void	restore_wp(bool old_wp);
 void	dump_add_page(vm_paddr_t);
 void	dump_drop_page(vm_paddr_t);
 void	finishidentcpu(void);
 void	identify_cpu1(void);
 void	identify_cpu2(void);
 void	identify_cpu_fixup_bsp(void);
 void	identify_hypervisor(void);
 void	initializecpu(void);
 void	initializecpucache(void);
 bool	fix_cpuid(void);
 void	fillw(int /*u_short*/ pat, void *base, size_t cnt);
 int	is_physical_memory(vm_paddr_t addr);
 int	isa_nmi(int cd);
 void	handle_ibrs_entry(void);
 void	handle_ibrs_exit(void);
 void	hw_ibrs_recalculate(bool all_cpus);
 void	hw_mds_recalculate(void);
 void	hw_ssb_recalculate(bool all_cpus);
 void	x86_taa_recalculate(void);
 void	nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame);
 void	nmi_call_kdb_smp(u_int type, struct trapframe *frame);
 void	nmi_handle_intr(u_int type, struct trapframe *frame);
 void	pagecopy(void *from, void *to);
 void	printcpuinfo(void);
 int	pti_get_default(void);
 int	user_dbreg_trap(register_t dr6);
 int	minidumpsys(struct dumperinfo *);
 struct pcb *get_pcb_td(struct thread *td);
 
 /*
  * The op argument of x86_msr_op() is one operation code or-ed with one
  * execution mode.
  *
  * Operation codes (low byte of op):
  *   MSR_OP_ANDNOT	msr &= ~arg1
  *   MSR_OP_OR		msr |= arg1
  *   MSR_OP_WRITE	msr = arg1
  */
 #define	MSR_OP_ANDNOT		0x00000001
 #define	MSR_OP_OR		0x00000002
 #define	MSR_OP_WRITE		0x00000003
 /*
  * Execution modes (top nibble of op):
  *   MSR_OP_LOCAL	perform the operation on the current CPU only
  *   MSR_OP_SCHED	bind the calling thread to each CPU in turn
  *   MSR_OP_RENDEZVOUS	perform the operation through smp_rendezvous()
  */
 #define	MSR_OP_LOCAL		0x10000000
 #define	MSR_OP_SCHED		0x20000000
 #define	MSR_OP_RENDEZVOUS	0x30000000
 void x86_msr_op(u_int msr, u_int op, uint64_t arg1);
 
 #endif
Index: head/sys/x86/x86/cpu_machdep.c
===================================================================
--- head/sys/x86/x86/cpu_machdep.c	(revision 361301)
+++ head/sys/x86/x86/cpu_machdep.c	(revision 361302)
@@ -1,1444 +1,1449 @@
 /*-
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include "opt_atpic.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_isa.h"
 #include "opt_kdb.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_platform.h"
 #ifdef __i386__
 #include "opt_apic.h"
 #endif
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cpu.h>
 #include <sys/domainset.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/specialreg.h>
 #include <machine/md_var.h>
 #include <machine/mp_watchdog.h>
 #include <machine/tss.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #ifdef CPU_ELAN
 #include <machine/elan_mmcr.h>
 #endif
 #include <x86/acpica_machdep.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #include <isa/isareg.h>
 
 #include <contrib/dev/acpica/include/acpi.h>
 
 #define	STATE_RUNNING	0x0
 #define	STATE_MWAIT	0x1
 #define	STATE_SLEEPING	0x2
 
 #ifdef SMP
 static u_int	cpu_reset_proxyid;
 static volatile u_int	cpu_reset_proxy_active;
 #endif
 
 /* Argument bundle handed from x86_msr_op() to x86_msr_op_one(). */
 struct msr_op_arg {
 	u_int msr;		/* MSR number to operate on */
 	int op;			/* operation code, execution mode masked off */
 	uint64_t arg1;		/* operand: mask or value, depending on op */
 };
 
 /*
  * Apply one MSR operation, described by the struct msr_op_arg that argp
  * points to, on the CPU this function runs on.  Operation codes other
  * than MSR_OP_ANDNOT, MSR_OP_OR and MSR_OP_WRITE are ignored.
  */
 static void
 x86_msr_op_one(void *argp)
 {
 	struct msr_op_arg *req;
 	uint64_t val;
 
 	req = argp;
 
 	/* A plain write does not need the current MSR contents. */
 	if (req->op == MSR_OP_WRITE) {
 		wrmsr(req->msr, req->arg1);
 		return;
 	}
 	if (req->op != MSR_OP_ANDNOT && req->op != MSR_OP_OR)
 		return;
 
 	/* Read-modify-write for the two bit-mask operations. */
 	val = rdmsr(req->msr);
 	if (req->op == MSR_OP_ANDNOT)
 		val &= ~req->arg1;
 	else
 		val |= req->arg1;
 	wrmsr(req->msr, val);
 }
 
 /* Split of the op argument: top nibble is the mode, low byte the operation. */
 #define	MSR_OP_EXMODE_MASK	0xf0000000
 #define	MSR_OP_OP_MASK		0x000000ff
 
 /*
  * Perform a read-modify-write or write of an MSR.
  *
  * msr:  MSR number.
  * op:   one of MSR_OP_ANDNOT, MSR_OP_OR or MSR_OP_WRITE or-ed with one of
  *       MSR_OP_LOCAL, MSR_OP_SCHED or MSR_OP_RENDEZVOUS; both parts are
  *       checked with MPASS().
  * arg1: mask or value consumed by the operation.
  */
 void
 x86_msr_op(u_int msr, u_int op, uint64_t arg1)
 {
 	struct thread *td;
 	struct msr_op_arg a;
 	u_int exmode;
 	int bound_cpu, i, is_bound;
 
 	a.op = op & MSR_OP_OP_MASK;
 	MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR ||
 	    a.op == MSR_OP_WRITE);
 	exmode = op & MSR_OP_EXMODE_MASK;
 	MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED ||
 	    exmode == MSR_OP_RENDEZVOUS);
 	a.msr = msr;
 	a.arg1 = arg1;
 	switch (exmode) {
 	case MSR_OP_LOCAL:
 		/* Current CPU only. */
 		x86_msr_op_one(&a);
 		break;
 	case MSR_OP_SCHED:
 		/*
 		 * Visit every CPU by binding the current thread to it.
 		 * Remember any pre-existing binding so it can be put back
 		 * afterwards instead of being dropped.
 		 */
 		td = curthread;
 		thread_lock(td);
 		is_bound = sched_is_bound(td);
 		bound_cpu = td->td_oncpu;
 		CPU_FOREACH(i) {
 			sched_bind(td, i);
 			x86_msr_op_one(&a);
 		}
 		if (is_bound)
 			sched_bind(td, bound_cpu);
 		else
 			sched_unbind(td);
 		thread_unlock(td);
 		break;
 	case MSR_OP_RENDEZVOUS:
 		/* Run x86_msr_op_one as the rendezvous action function. */
 		smp_rendezvous(NULL, x86_msr_op_one, NULL, &a);
 		break;
 	}
 }
 
 /*
  * Automatically initialized per CPU errata in cpu_idle_tun below.
  * Exported read-only (CTLFLAG_RDTUN) as the machdep.mwait_cpustop_broken
  * sysctl.
  */
 bool mwait_cpustop_broken = false;
 SYSCTL_BOOL(_machdep, OID_AUTO, mwait_cpustop_broken, CTLFLAG_RDTUN,
     &mwait_cpustop_broken, 0,
     "Can not reliably wake MONITOR/MWAIT cpus without interrupts");
 
 /*
  * Machine dependent boot() routine
  *
  * I haven't seen anything to put here yet
  * Possibly some stuff might be grafted back here from boot()
  *
  * Currently an intentional no-op; howto is ignored.
  */
 void
 cpu_boot(int howto)
 {
 }
 
 /*
  * Flush the D-cache for non-DMA I/O so that the I-cache can
  * be made coherent later.
  *
  * Intentional no-op in this implementation; ptr and len are ignored.
  */
 void
 cpu_flush_dcache(void *ptr, size_t len)
 {
 	/* Not applicable */
 }
 
 /*
  * Enable interrupts and halt the CPU ("sti; hlt" issued as one asm
  * statement) until it is woken up.
  */
 void
 acpi_cpu_c1(void)
 {
 
 	__asm __volatile("sti; hlt");
 }
 
 /*
  * Use mwait to pause execution while waiting for an interrupt or
  * another thread to signal that there is more work.
  *
  * NOTE: Interrupts will cause a wakeup; however, this function does
  * not enable interrupt handling. The caller is responsible to enable
  * interrupts.
  *
  * The per-CPU idle_state must be STATE_SLEEPING on entry (asserted) and
  * is STATE_RUNNING on return.  mwait_hint is passed through to
  * cpu_mwait().
  */
 void
 acpi_cpu_idle_mwait(uint32_t mwait_hint)
 {
 	int *state;
 	uint64_t v;
 
 	/*
 	 * A comment in Linux patch claims that 'CPUs run faster with
 	 * speculation protection disabled. All CPU threads in a core
 	 * must disable speculation protection for it to be
 	 * disabled. Disable it while we are idle so the other
 	 * hyperthread can run fast.'
 	 *
 	 * XXXKIB.  Software coordination mode should be supported,
 	 * but all Intel CPUs provide hardware coordination.
 	 */
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	KASSERT(atomic_load_int(state) == STATE_SLEEPING,
 	    ("cpu_mwait_cx: wrong monitorbuf state"));
 	atomic_store_int(state, STATE_MWAIT);
 	/*
 	 * Clear IBRS, STIBP and SSBD while idle; v keeps the previous
 	 * MSR value so it can be written back after wakeup.  v == 0
 	 * means there is nothing to restore.
 	 */
 	if (PCPU_GET(ibpb_set) || hw_ssb_active) {
 		v = rdmsr(MSR_IA32_SPEC_CTRL);
 		wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
 		    IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
 	} else {
 		v = 0;
 	}
 	/*
 	 * Arm the monitor on the state word, then re-check it so that a
 	 * state change made before the monitor was armed is not slept
 	 * through.
 	 */
 	cpu_monitor(state, 0, 0);
 	if (atomic_load_int(state) == STATE_MWAIT)
 		cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
 
 	/*
 	 * SSB cannot be disabled while we sleep, or rather, if it was
 	 * disabled, the sysctl thread will bind to our cpu to tweak
 	 * MSR.
 	 */
 	if (v != 0)
 		wrmsr(MSR_IA32_SPEC_CTRL, v);
 
 	/*
 	 * We should exit on any event that interrupts mwait, because
 	 * that event might be a wanted interrupt.
 	 */
 	atomic_store_int(state, STATE_RUNNING);
 }
 
 /*
  * Get current clock frequency for the given cpu id.
  *
  * The rate is derived from the TSC delta across a DELAY(1000), measured
  * with interrupts disabled.  With an invariant TSC the delta is scaled
  * by the APERF/MPERF ratio taken over the same interval.
  *
  * Returns 0 and stores the estimate in *rate on success.  Returns EINVAL
  * for an unknown cpu_id or a NULL rate, and EOPNOTSUPP when the needed
  * counters are unavailable, including the case where MPERF did not
  * advance during the measurement (which would otherwise divide by zero).
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	uint64_t tsc1, tsc2;
 	uint64_t acnt, mcnt, perf;
 	register_t reg;
 	int error;
 
 	if (pcpu_find(cpu_id) == NULL || rate == NULL)
 		return (EINVAL);
 #ifdef __i386__
 	if ((cpu_feature & CPUID_TSC) == 0)
 		return (EOPNOTSUPP);
 #endif
 
 	/*
 	 * If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
 	 * DELAY(9) based logic fails.
 	 */
 	if (tsc_is_invariant && !tsc_perf_stat)
 		return (EOPNOTSUPP);
 
 #ifdef SMP
 	if (smp_cpus > 1) {
 		/* Schedule ourselves on the indicated cpu. */
 		thread_lock(curthread);
 		sched_bind(curthread, cpu_id);
 		thread_unlock(curthread);
 	}
 #endif
 
 	error = 0;
 
 	/* Calibrate by measuring a short delay. */
 	reg = intr_disable();
 	if (tsc_is_invariant) {
 		wrmsr(MSR_MPERF, 0);
 		wrmsr(MSR_APERF, 0);
 		tsc1 = rdtsc();
 		DELAY(1000);
 		mcnt = rdmsr(MSR_MPERF);
 		acnt = rdmsr(MSR_APERF);
 		tsc2 = rdtsc();
 		intr_restore(reg);
 		if (mcnt != 0) {
 			perf = 1000 * acnt / mcnt;
 			*rate = (tsc2 - tsc1) * perf;
 		} else {
 			/*
 			 * MPERF did not count (e.g. the MSR is not really
 			 * implemented); report that instead of faulting
 			 * on the division.
 			 */
 			error = EOPNOTSUPP;
 		}
 	} else {
 		tsc1 = rdtsc();
 		DELAY(1000);
 		tsc2 = rdtsc();
 		intr_restore(reg);
 		*rate = (tsc2 - tsc1) * 1000;
 	}
 
 #ifdef SMP
 	/* Undo the binding on every path, including the error one. */
 	if (smp_cpus > 1) {
 		thread_lock(curthread);
 		sched_unbind(curthread);
 		thread_unlock(curthread);
 	}
 #endif
 
 	return (error);
 }
 
 /*
  * Park the calling CPU for good: keep issuing halt() and never
  * return to the caller.
  */
 void
 cpu_halt(void)
 {
 
 	while (1) {
 		halt();
 	}
 }
 
 /*
  * Reset the machine from the calling CPU.  With interrupts disabled,
  * each known reset method is tried in turn, pausing after most of
  * them to see whether it took effect.  If none works, the IDT is
  * replaced by an empty one and a breakpoint is raised, after which
  * the function spins forever.
  */
 static void
 cpu_reset_real(void)
 {
 	struct region_descriptor null_idt;
 	int b;
 
 	disable_intr();
 #ifdef CPU_ELAN
 	if (elan_mmcr != NULL)
 		elan_mmcr->RESCFG = 1;
 #endif
 #ifdef __i386__
 	if (cpu == CPU_GEODE1100) {
 		/* Attempt Geode's own reset */
 		outl(0xcf8, 0x80009044ul);
 		outl(0xcfc, 0xf);
 	}
 #endif
 #if !defined(BROKEN_KEYBOARD_RESET)
 	/*
 	 * Attempt to do a CPU reset via the keyboard controller,
 	 * do not turn off GateA20, as any machine that fails
 	 * to do the reset here would then end up in no man's land.
 	 */
 	outb(IO_KBD + 4, 0xFE);
 	DELAY(500000);	/* wait 0.5 sec to see if that did it */
 #endif
 
 	/*
 	 * Attempt to force a reset via the Reset Control register at
 	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
 	 * transitions from 0 to 1.  Bit 1 selects the type of reset
 	 * to attempt: 0 selects a "soft" reset, and 1 selects a
 	 * "hard" reset.  We try a "hard" reset.  The first write sets
 	 * bit 1 to select a "hard" reset and clears bit 2.  The
 	 * second write forces a 0 -> 1 transition in bit 2 to trigger
 	 * a reset.
 	 */
 	outb(0xcf9, 0x2);
 	outb(0xcf9, 0x6);
 	DELAY(500000);  /* wait 0.5 sec to see if that did it */
 
 	/*
 	 * Attempt to force a reset via the Fast A20 and Init register
 	 * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
 	 * Bit 0 asserts INIT# when set to 1.  We are careful to only
 	 * preserve bit 1 while setting bit 0.  We also must clear bit
 	 * 0 before setting it if it isn't already clear.
 	 */
 	b = inb(0x92);
 	/* A read of 0xff skips this method entirely. */
 	if (b != 0xff) {
 		if ((b & 0x1) != 0)
 			outb(0x92, b & 0xfe);
 		outb(0x92, b | 0x1);
 		DELAY(500000);  /* wait 0.5 sec to see if that did it */
 	}
 
 	printf("No known reset method worked, attempting CPU shutdown\n");
 	DELAY(1000000); /* wait 1 sec for printf to complete */
 
 	/* Wipe the IDT. */
 	null_idt.rd_limit = 0;
 	null_idt.rd_base = 0;
 	lidt(&null_idt);
 
 	/* "good night, sweet prince .... <THUNK!>" */
 	breakpoint();
 
 	/* NOTREACHED */
 	while(1);
 }
 
 #ifdef SMP
 /*
  * Restart function that cpu_reset() installs in cpustop_restartfunc
  * before restarting CPU 0.  It announces itself by setting
  * cpu_reset_proxy_active to 1, waits until that value is changed
  * (cpu_reset() stores 2), and then performs the real reset.
  */
 static void
 cpu_reset_proxy(void)
 {
 
 	cpu_reset_proxy_active = 1;
 	while (cpu_reset_proxy_active == 1)
 		ia32_pause(); /* Wait for other cpu to see that we've started */
 
 	printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
 	DELAY(1000000);
 	cpu_reset_real();
 }
 #endif
 
 /*
  * Reset the machine.  On SMP kernels with smp_started set, all other
  * CPUs that are not already stopped are stopped first.  When the
  * caller is not CPU 0, CPU 0 is restarted with cpu_reset_proxy() as
  * its restart function and, if it announces itself in time, this CPU
  * spins forever while CPU 0 does the reset.  Otherwise the reset is
  * performed here through cpu_reset_real().
  */
 void
 cpu_reset(void)
 {
 #ifdef SMP
 	struct monitorbuf *mb;
 	cpuset_t map;
 	u_int cnt;
 
 	if (smp_started) {
 		/* Every CPU except ourselves and those already stopped. */
 		map = all_cpus;
 		CPU_CLR(PCPU_GET(cpuid), &map);
 		CPU_ANDNOT(&map, &stopped_cpus);
 		if (!CPU_EMPTY(&map)) {
 			printf("cpu_reset: Stopping other CPUs\n");
 			stop_cpus(map);
 		}
 
 		if (PCPU_GET(cpuid) != 0) {
 			cpu_reset_proxyid = PCPU_GET(cpuid);
 			cpustop_restartfunc = cpu_reset_proxy;
 			cpu_reset_proxy_active = 0;
 			printf("cpu_reset: Restarting BSP\n");
 
 			/* Restart CPU #0. */
 			CPU_SETOF(0, &started_cpus);
 			mb = &pcpu_find(0)->pc_monitorbuf;
 			atomic_store_int(&mb->stop_state,
 			    MONITOR_STOPSTATE_RUNNING);
 
 			/* Bounded wait: give up after 10000000 polls. */
 			cnt = 0;
 			while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
 				ia32_pause();
 				cnt++;	/* Wait for BSP to announce restart */
 			}
 			if (cpu_reset_proxy_active == 0) {
 				printf("cpu_reset: Failed to restart BSP\n");
 			} else {
 				/* Release the proxy and park this CPU. */
 				cpu_reset_proxy_active = 2;
 				while (1)
 					ia32_pause();
 				/* NOTREACHED */
 			}
 		}
 
 		DELAY(1000000);
 	}
 #endif
 	cpu_reset_real();
 	/* NOTREACHED */
 }
 
 bool
 cpu_mwait_usable(void)
 {
 
 	return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
 	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
 	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
 }
 
 /*
  * State shared by the idle methods below.  cpu_idle_hook, when not
  * NULL, is what cpu_idle_acpi() calls to sleep; idle_mwait is exported
  * as the machdep.idle_mwait sysctl and consulted by cpu_idle().
  */
 void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
 static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
 static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
 SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
     0, "Use MONITOR/MWAIT for short idle");
 
 /*
  * Idle method "acpi".  Marks this CPU's idle_state STATE_SLEEPING and
  * then, with interrupts disabled, picks one of three actions: only
  * re-enable interrupts when the scheduler has runnable work, call the
  * registered cpu_idle_hook with sbt, or fall back to acpi_cpu_c1().
  * idle_state is STATE_RUNNING again on return.
  */
 static void
 cpu_idle_acpi(sbintime_t sbt)
 {
 	int *state;
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_SLEEPING);
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (sched_runnable())
 		enable_intr();
 	else if (cpu_idle_hook)
 		cpu_idle_hook(sbt);
 	else
 		acpi_cpu_c1();
 	atomic_store_int(state, STATE_RUNNING);
 }
 
 /*
  * Idle method "hlt".  Same shape as cpu_idle_acpi() without the hook:
  * idle_state goes to STATE_SLEEPING, then either interrupts are simply
  * re-enabled because work is runnable, or acpi_cpu_c1() is called.
  * The sbt argument is not used here.
  */
 static void
 cpu_idle_hlt(sbintime_t sbt)
 {
 	int *state;
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_SLEEPING);
 
 	/*
 	 * Since we may be in a critical section from cpu_idle(), if
 	 * an interrupt fires during that critical section we may have
 	 * a pending preemption.  If the CPU halts, then that thread
 	 * may not execute until a later interrupt awakens the CPU.
 	 * To handle this race, check for a runnable thread after
 	 * disabling interrupts and immediately return if one is
 	 * found.  Also, we must absolutely guarantee that hlt is
 	 * the next instruction after sti.  This ensures that any
 	 * interrupt that fires after the call to disable_intr() will
 	 * immediately awaken the CPU from hlt.  Finally, please note
 	 * that on x86 this works fine because of interrupts enabled only
 	 * after the instruction following sti takes place, while IF is set
 	 * to 1 immediately, allowing hlt instruction to acknowledge the
 	 * interrupt.
 	 */
 	disable_intr();
 	if (sched_runnable())
 		enable_intr();
 	else
 		acpi_cpu_c1();
 	atomic_store_int(state, STATE_RUNNING);
 }
 
 /*
  * Idle method "mwait".  Marks idle_state STATE_MWAIT, and unless the
  * scheduler already has runnable work, arms the monitor on the
  * idle_state word and executes "sti; mwait" with MWAIT_C1 as the hint.
  * idle_state is STATE_RUNNING and interrupts are enabled on return.
  * The sbt argument is not used here.
  */
 static void
 cpu_idle_mwait(sbintime_t sbt)
 {
 	int *state;
 
 	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_MWAIT);
 
 	/* See comments in cpu_idle_hlt(). */
 	disable_intr();
 	if (sched_runnable()) {
 		atomic_store_int(state, STATE_RUNNING);
 		enable_intr();
 		return;
 	}
 
 	/*
 	 * Re-check the word after arming the monitor: if it no longer
 	 * holds STATE_MWAIT, do not sleep and just re-enable interrupts.
 	 */
 	cpu_monitor(state, 0, 0);
 	if (atomic_load_int(state) == STATE_MWAIT)
 		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
 	else
 		enable_intr();
 	atomic_store_int(state, STATE_RUNNING);
 }
 
 /*
  * Idle method "spin".  Marks this CPU's idle_state STATE_RUNNING and
  * busy-waits with cpu_spinwait() for at most 1000 rounds, leaving as
  * soon as the scheduler reports runnable work.  The sbt argument is
  * not used.
  */
 static void
 cpu_idle_spin(sbintime_t sbt)
 {
 	int rounds_left;
 
 	atomic_store_int(&PCPU_PTR(monitorbuf)->idle_state, STATE_RUNNING);
 
 	/*
 	 * The sched_runnable() check races with the scheduler, but
 	 * since it is repeated every round, missing it once costs
 	 * very little (and beats not checking at all).
 	 */
 	rounds_left = 1000;
 	while (rounds_left-- > 0 && !sched_runnable())
 		cpu_spinwait();
 }
 
 /*
  * C1E renders the local APIC timer dead, so we disable it by
  * reading the Interrupt Pending Message register and clearing
  * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
  *
  * Reference:
  *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
  *   #32559 revision 3.00+
  *
  * AMDK8_CMPHALT is the mask of both bits; cpu_idle() clears it in
  * MSR_AMDK8_IPM when the workaround applies.
  */
 #define	MSR_AMDK8_IPM		0xc0010055
 #define	AMDK8_SMIONCMPHALT	(1ULL << 27)
 #define	AMDK8_C1EONCMPHALT	(1ULL << 28)
 #define	AMDK8_CMPHALT		(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
 
 /*
  * Set cpu_ident_amdc1e when the CPU looks C1E capable: an AMD part
  * whose cpu_id has all bits of 0x00000f00 set and whose bits under
  * 0x0fff0000 are at least 0x00040000 (mostly the latest dual-core, or
  * future, k8 family).  The flag is never cleared here.
  */
 void
 cpu_probe_amdc1e(void)
 {
 
 	if (cpu_vendor_id != CPU_VENDOR_AMD)
 		return;
 	if ((cpu_id & 0x00000f00) != 0x00000f00)
 		return;
 	if ((cpu_id & 0x0fff0000) >= 0x00040000)
 		cpu_ident_amdc1e = 1;
 }
 
 /*
  * Main idle method called by cpu_idle(); defaults to cpu_idle_acpi and
  * is replaced by cpu_idle_selector().
  */
 void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
 
 /*
  * Idle this CPU once.  A non-zero busy asks for a short idle: when
  * MONITOR/MWAIT is present and idle_mwait is set, cpu_idle_mwait() is
  * used and nothing else is done.  Otherwise the selected cpu_idle_fn
  * is called; for busy == 0 that call is wrapped in a critical section
  * with the timers switched to idle mode and back, and the value
  * returned by cpu_idleclock() is passed on (it stays -1 when busy).
  */
 void
 cpu_idle(int busy)
 {
 	uint64_t msr;
 	sbintime_t sbt = -1;
 
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
 	    busy, curcpu);
 #ifdef MP_WATCHDOG
 	ap_watchdog(PCPU_GET(cpuid));
 #endif
 
 	/* If we are busy - try to use fast methods. */
 	if (busy) {
 		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
 			/*
 			 * busy is passed where an sbintime_t is expected;
 			 * cpu_idle_mwait() does not read its argument.
 			 */
 			cpu_idle_mwait(busy);
 			goto out;
 		}
 	}
 
 	/* If we have time - switch timers into idle mode. */
 	if (!busy) {
 		critical_enter();
 		sbt = cpu_idleclock();
 	}
 
 	/* Apply AMD APIC timer C1E workaround. */
 	if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
 		msr = rdmsr(MSR_AMDK8_IPM);
 		if (msr & AMDK8_CMPHALT)
 			wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
 	}
 
 	/* Call main idle method. */
 	cpu_idle_fn(sbt);
 
 	/* Switch timers back into active mode. */
 	if (!busy) {
 		cpu_activeclock();
 		critical_exit();
 	}
 out:
 	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
 	    busy, curcpu);
 }
 
 /*
  * When non-zero, cpu_idle_wakeup() reports 0 for a CPU it found in
  * STATE_MWAIT even though it has updated that CPU's idle_state.  Set
  * by cpu_idle_tun() for the affected Apollo Lake CPU and adjustable
  * through machdep.idle_apl31.
  */
 static int cpu_idle_apl31_workaround;
 SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW,
     &cpu_idle_apl31_workaround, 0,
     "Apollo Lake APL31 MWAIT bug workaround");
 
 /*
  * Inspect the idle_state of the given cpu and, if it is in
  * STATE_MWAIT, store STATE_RUNNING into the monitored word.
  *
  * Returns 1 when the target was already STATE_RUNNING, or was in
  * STATE_MWAIT and the APL31 workaround is off; returns 0 when the
  * target is STATE_SLEEPING, or was in STATE_MWAIT with the workaround
  * on.  Any other state value panics.
  */
 int
 cpu_idle_wakeup(int cpu)
 {
 	int *idlep;
 	int cur;
 
 	idlep = &pcpu_find(cpu)->pc_monitorbuf.idle_state;
 	cur = atomic_load_int(idlep);
 
 	if (cur == STATE_RUNNING)
 		return (1);
 	if (cur == STATE_SLEEPING)
 		return (0);
 	if (cur == STATE_MWAIT) {
 		atomic_store_int(idlep, STATE_RUNNING);
 		return (cpu_idle_apl31_workaround == 0);
 	}
 
 	panic("bad monitor state");
 	return (1);
 }
 
 /*
  * Table of selectable idle methods.
  * Ordered by speed/power consumption.
  */
 static struct {
 	void	*id_fn;		/* idle function, compared with cpu_idle_fn */
 	char	*id_name;	/* name used by the machdep.idle sysctls */
 	int	id_cpuid2_flag;	/* cpu_feature2 bit required, 0 if none */
 } idle_tbl[] = {
 	{ .id_fn = cpu_idle_spin, .id_name = "spin" },
 	{ .id_fn = cpu_idle_mwait, .id_name = "mwait",
 	    .id_cpuid2_flag = CPUID2_MON },
 	{ .id_fn = cpu_idle_hlt, .id_name = "hlt" },
 	{ .id_fn = cpu_idle_acpi, .id_name = "acpi" },
 };
 
 /*
  * Sysctl handler for machdep.idle_available: builds a ", "-separated
  * list of the idle_tbl names that can currently be selected and hands
  * it to sysctl_handle_string().  Entries whose CPUID feature bit is
  * missing are left out, as is "acpi" while no idle hook is set.
  */
 static int
 idle_sysctl_available(SYSCTL_HANDLER_ARGS)
 {
 	char *list, *tail;
 	const char *sep;
 	int idx, rc;
 
 	list = malloc(256, M_TEMP, M_WAITOK);
 	tail = list;
 	for (idx = 0; idx < nitems(idle_tbl); idx++) {
 		if ((idle_tbl[idx].id_cpuid2_flag != 0 &&
 		    (cpu_feature2 & idle_tbl[idx].id_cpuid2_flag) == 0) ||
 		    (strcmp(idle_tbl[idx].id_name, "acpi") == 0 &&
 		    cpu_idle_hook == NULL))
 			continue;
 		/* No separator in front of the first name. */
 		sep = (tail == list) ? "" : ", ";
 		tail += sprintf(tail, "%s%s", sep, idle_tbl[idx].id_name);
 	}
 	rc = sysctl_handle_string(oidp, list, 0, req);
 	free(list, M_TEMP);
 	return (rc);
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, idle_available,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
     0, 0, idle_sysctl_available, "A",
     "list of available idle functions");
 
 /*
  * Make the idle method called new_idle_name the current cpu_idle_fn.
  * Returns true on success and false when no selectable entry of
  * idle_tbl carries that name.
  */
 static bool
 cpu_idle_selector(const char *new_idle_name)
 {
 	int idx;
 
 	for (idx = 0; idx < nitems(idle_tbl); idx++) {
 		/* Only an entry with the requested name is a candidate. */
 		if (strcmp(idle_tbl[idx].id_name, new_idle_name) != 0)
 			continue;
 		/* Methods whose CPUID feature bit is absent are skipped. */
 		if (idle_tbl[idx].id_cpuid2_flag != 0 &&
 		    (cpu_feature2 & idle_tbl[idx].id_cpuid2_flag) == 0)
 			continue;
 		/* "acpi" is selectable only while an idle hook is set. */
 		if (cpu_idle_hook == NULL &&
 		    strcmp(idle_tbl[idx].id_name, "acpi") == 0)
 			continue;
 
 		cpu_idle_fn = idle_tbl[idx].id_fn;
 		if (bootverbose)
 			printf("CPU idle set to %s\n", idle_tbl[idx].id_name);
 		return (true);
 	}
 	return (false);
 }
 
 /*
  * Sysctl handler for machdep.idle.  Reports the name of the idle_tbl
  * entry matching cpu_idle_fn ("unknown" if none matches) and, when a
  * new value is supplied, selects it through cpu_idle_selector().
  * Returns EINVAL for a name that cannot be selected.
  */
 static int
 cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	char namebuf[16];
 	const char *cur;
 	int idx, rc;
 
 	cur = "unknown";
 	for (idx = 0; idx < nitems(idle_tbl); idx++) {
 		if (idle_tbl[idx].id_fn != cpu_idle_fn)
 			continue;
 		cur = idle_tbl[idx].id_name;
 		break;
 	}
 	strncpy(namebuf, cur, sizeof(namebuf));
 
 	rc = sysctl_handle_string(oidp, namebuf, sizeof(namebuf), req);
 	if (rc == 0 && req->newptr != NULL)
 		rc = cpu_idle_selector(namebuf) ? 0 : EINVAL;
 	return (rc);
 }
 
 SYSCTL_PROC(_machdep, OID_AUTO, idle,
     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     0, 0, cpu_idle_sysctl, "A",
     "currently selected idle function");
 
 /*
  * Boot-time setup of the idle method, run as a SYSINIT.  An explicit
  * "machdep.idle" tunable wins; without one, AMD family 0x17 model 0x1
  * is switched to "hlt" with MWAIT-based short idle turned off.
  * Independently, the Intel CPU with cpu_id 0x506c9 gets the APL31
  * workaround enabled, which the "machdep.idle_apl31" tunable may then
  * override.
  */
 static void
 cpu_idle_tun(void *unused __unused)
 {
 	char tunvar[16];
 
 	if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
 		cpu_idle_selector(tunvar);
 	else if (cpu_vendor_id == CPU_VENDOR_AMD &&
 	    CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) {
 		/* Ryzen errata 1057, 1109. */
 		cpu_idle_selector("hlt");
 		idle_mwait = 0;
 		mwait_cpustop_broken = true;
 	}
 
 	if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) {
 		/*
 		 * Apollo Lake errata APL31 (public errata APL30).
 		 * Stores to the armed address range may not trigger
 		 * MWAIT to resume execution.  OS needs to use
 		 * interrupts to wake processors from MWAIT-induced
 		 * sleep states.
 		 */
 		cpu_idle_apl31_workaround = 1;
 		mwait_cpustop_broken = true;
 	}
 	TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
 }
 SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);
 
 /*
  * NMI policy knobs.  panic_on_nmi is a bit mask tested by
  * nmi_call_kdb(): bit 0 gates the panic on an ISA-reported hardware
  * failure, bit 1 gates entering the debugger on an unclaimed NMI, and
  * any non-zero value panics on an NMI nobody claimed.
  * nmi_is_broadcast makes nmi_handle_intr() use nmi_call_kdb_smp() on
  * SMP kernels.
  */
 static int panic_on_nmi = 0xff;
 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
     &panic_on_nmi, 0,
     "Panic on NMI: 1 = H/W failure; 2 = unknown; 0xff = all");
 int nmi_is_broadcast = 1;
 SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
     &nmi_is_broadcast, 0,
     "Chipset NMI is broadcast");
 
 /*
  * Handle an NMI on behalf of the given cpu.  The NMI is first offered
  * to isa_nmi() (DEV_ISA kernels), then to the kernel debugger (KDB
  * kernels, subject to panic_on_nmi bit 1 and debugger_on_panic).  An
  * NMI that is still unclaimed panics unless panic_on_nmi is 0.
  */
 void
 nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
 {
 	bool claimed = false;
 
 #ifdef DEV_ISA
 	/* machine/parity/power fail/"kitchen sink" faults */
 	if (isa_nmi(frame->tf_err)) {
 		claimed = true;
 		if ((panic_on_nmi & 1) != 0)
 			panic("NMI indicates hardware failure");
 	}
 #endif /* DEV_ISA */
 
 	/*
 	 * NMIs can be useful for debugging.  They can be hooked up to a
 	 * pushbutton, usually on an ISA, PCI, or PCIe card.  They can also be
 	 * generated by an IPMI BMC, either manually or in response to a
 	 * watchdog timeout.  For example, see the "power diag" command in
 	 * ports/sysutils/ipmitool.  They can also be generated by a
 	 * hypervisor; see "bhyvectl --inject-nmi".
 	 */
 
 #ifdef KDB
 	if (!claimed && (panic_on_nmi & 2) != 0) {
 		if (debugger_on_panic) {
 			printf("NMI/cpu%d ... going to debugger\n", cpu);
 			/* The debugger decides whether it claimed the NMI. */
 			claimed = kdb_trap(type, 0, frame);
 		}
 	}
 #endif /* KDB */
 
 	if (!claimed && panic_on_nmi != 0)
 		panic("NMI");
 }
 
 /*
  * Entry point for an NMI trap.  SMP kernels with nmi_is_broadcast set
  * go through nmi_call_kdb_smp(); everything else is handled on the
  * current CPU by nmi_call_kdb().
  */
 void
 nmi_handle_intr(u_int type, struct trapframe *frame)
 {
 
 #ifdef SMP
 	if (nmi_is_broadcast != 0)
 		nmi_call_kdb_smp(type, frame);
 	else
 #endif
 		nmi_call_kdb(PCPU_GET(cpuid), type, frame);
 }
 
 /*
  * IBRS mitigation state.  hw_ibrs_disable is the administrator's
  * request (non-zero means disabled, which is the default);
  * hw_ibrs_active and hw_ibrs_ibpb_active are computed from it by
  * hw_ibrs_recalculate().  The active flag is exported read-only under
  * both hw.ibrs_active and machdep.mitigations.ibrs.active.
  */
 static int hw_ibrs_active;
 int hw_ibrs_ibpb_active;
 int hw_ibrs_disable = 1;
 
 SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
     "Indirect Branch Restricted Speculation active");
 
 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ibrs,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Indirect Branch Restricted Speculation active");
 
 SYSCTL_INT(_machdep_mitigations_ibrs, OID_AUTO, active, CTLFLAG_RD,
     &hw_ibrs_active, 0, "Indirect Branch Restricted Speculation active");
 
 /*
  * Recompute the IBRS state from hw_ibrs_disable.
  *
  * With IA32_ARCH_CAP_IBRS_ALL the IBRS bit of IA32_SPEC_CTRL is set or
  * cleared, on every CPU when for_all_cpus is true and locally
  * otherwise, and hw_ibrs_ibpb_active is cleared.  Without that
  * capability no MSR is touched: both flags become true only if IBPB is
  * supported and IBRS has not been disabled.
  */
 void
 hw_ibrs_recalculate(bool for_all_cpus)
 {
 	unsigned int op;
 
 	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) == 0) {
 		hw_ibrs_active = hw_ibrs_ibpb_active = (cpu_stdext_feature3 &
 		    CPUID_STDEXT3_IBPB) != 0 && !hw_ibrs_disable;
 		return;
 	}
 
 	op = for_all_cpus ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL;
 	op |= (hw_ibrs_disable != 0) ? MSR_OP_ANDNOT : MSR_OP_OR;
 	x86_msr_op(MSR_IA32_SPEC_CTRL, op, IA32_SPEC_CTRL_IBRS);
 	hw_ibrs_active = hw_ibrs_disable == 0;
 	hw_ibrs_ibpb_active = 0;
 }
 
 /*
  * Sysctl handler shared by hw.ibrs_disable and
  * machdep.mitigations.ibrs.disable.  Reports hw_ibrs_disable; a write
  * stores the new value normalised to 0 or 1 and recomputes the IBRS
  * state for all CPUs.
  */
 static int
 hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
 {
 	int newval, rc;
 
 	newval = hw_ibrs_disable;
 	rc = sysctl_handle_int(oidp, &newval, 0, req);
 	if (rc == 0 && req->newptr != NULL) {
 		hw_ibrs_disable = (newval != 0) ? 1 : 0;
 		hw_ibrs_recalculate(true);
 	}
 	return (rc);
 }
 SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
     CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
     "Disable Indirect Branch Restricted Speculation");
 
 SYSCTL_PROC(_machdep_mitigations_ibrs, OID_AUTO, disable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     hw_ibrs_disable_handler, "I",
     "Disable Indirect Branch Restricted Speculation");
 
 /*
  * Speculative Store Bypass Disable state.  hw_ssb_disable is the
  * requested mode (0 off, 1 on, 2 auto, see hw_ssb_recalculate());
  * hw_ssb_active is set by hw_ssb_set() and exported read-only under
  * both hw.spec_store_bypass_disable_active and
  * machdep.mitigations.ssb.active.
  */
 int hw_ssb_active;
 int hw_ssb_disable;
 
 SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
     &hw_ssb_active, 0,
     "Speculative Store Bypass Disable active");
 
 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ssb,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Speculative Store Bypass Disable active");
 
 SYSCTL_INT(_machdep_mitigations_ssb, OID_AUTO, active, CTLFLAG_RD,
     &hw_ssb_active, 0, "Speculative Store Bypass Disable active");
 
 /*
  * Turn the SSBD bit of IA32_SPEC_CTRL on or off and record the result
  * in hw_ssb_active.  CPUs without CPUID_STDEXT3_SSBD only get
  * hw_ssb_active cleared.  for_all_cpus selects MSR_OP_SCHED over
  * MSR_OP_LOCAL for the MSR update.
  */
 static void
 hw_ssb_set(bool enable, bool for_all_cpus)
 {
 	unsigned int op;
 
 	if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) != 0) {
 		op = enable ? MSR_OP_OR : MSR_OP_ANDNOT;
 		op |= for_all_cpus ? MSR_OP_SCHED : MSR_OP_LOCAL;
 		hw_ssb_active = enable;
 		x86_msr_op(MSR_IA32_SPEC_CTRL, op, IA32_SPEC_CTRL_SSBD);
 	} else {
 		hw_ssb_active = 0;
 	}
 }
 
 /*
  * Apply the mode requested in hw_ssb_disable: 1 enables SSBD, 2
  * ("auto") enables it unless the CPU reports IA32_ARCH_CAP_SSB_NO, and
  * 0 disables it.  Any other value is reset to 0 and treated as off.
  */
 void
 hw_ssb_recalculate(bool all_cpus)
 {
 	bool enable;
 
 	if (hw_ssb_disable == 1) {
 		/* on */
 		enable = true;
 	} else if (hw_ssb_disable == 2) {
 		/* auto */
 		enable = (cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) == 0;
 	} else {
 		/* off, after normalising an out-of-range request */
 		if (hw_ssb_disable != 0)
 			hw_ssb_disable = 0;
 		enable = false;
 	}
 	hw_ssb_set(enable, all_cpus);
 }
 
 /*
  * Sysctl handler shared by hw.spec_store_bypass_disable and
  * machdep.mitigations.ssb.disable.  Reports hw_ssb_disable; a write
  * stores the new mode and re-applies it on all CPUs through
  * hw_ssb_recalculate(), which also resets unknown modes to 0.
  */
 static int
 hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = hw_ssb_disable;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	/* Plain read, or the copy in/out failed: nothing to apply. */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	hw_ssb_disable = val;
 	hw_ssb_recalculate(true);
 	return (0);
 }
 SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     hw_ssb_disable_handler, "I",
     "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
 
 SYSCTL_PROC(_machdep_mitigations_ssb, OID_AUTO, disable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     hw_ssb_disable_handler, "I",
     "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto)");
 
 /*
  * Requested MDS mitigation mode; the sysctl handler accepts 0..3
  * (0 - off, 1 - VERW, 2 - software sequence, 3 - auto).
  */
 int hw_mds_disable;
 
 /*
  * Handler for Microarchitectural Data Sampling issues.  Really not a
  * pointer to C function: on amd64 the code must not change any CPU
  * architectural state except possibly %rflags. Also, it is always
  * called with interrupts disabled.
  *
  * The mds_handler_*() routines are defined outside this section;
  * hw_mds_recalculate() points mds_handler at one of them.
  */
 void mds_handler_void(void);
 void mds_handler_verw(void);
 void mds_handler_ivb(void);
 void mds_handler_bdw(void);
 void mds_handler_skl_sse(void);
 void mds_handler_skl_avx(void);
 void mds_handler_skl_avx512(void);
 void mds_handler_silvermont(void);
 void (*mds_handler)(void) = mds_handler_void;
 
 /*
  * Read-only sysctl handler reporting, as a string, which MDS handler
  * is currently installed in mds_handler.  A handler missing from the
  * table below is reported as "unknown".
  */
 static int
 sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS)
 {
 	static const struct {
 		void		(*handler)(void);
 		const char	*label;
 	} mds_names[] = {
 		{ mds_handler_void,		"inactive" },
 		{ mds_handler_verw,		"VERW" },
 		{ mds_handler_ivb,		"software IvyBridge" },
 		{ mds_handler_bdw,		"software Broadwell" },
 		{ mds_handler_skl_sse,		"software Skylake SSE" },
 		{ mds_handler_skl_avx,		"software Skylake AVX" },
 		{ mds_handler_skl_avx512,	"software Skylake AVX512" },
 		{ mds_handler_silvermont,	"software Silvermont" },
 	};
 	const char *state;
 	unsigned int idx;
 
 	state = "unknown";
 	for (idx = 0; idx < sizeof(mds_names) / sizeof(mds_names[0]); idx++) {
 		if (mds_handler == mds_names[idx].handler) {
 			state = mds_names[idx].label;
 			break;
 		}
 	}
 	return (SYSCTL_OUT(req, state, strlen(state)));
 }
 
 /*
  * The MDS state string is exported twice through the same handler:
  * hw.mds_disable_state and machdep.mitigations.mds.state.
  */
 SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_hw_mds_disable_state_handler, "A",
     "Microarchitectural Data Sampling Mitigation state");
 
 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, mds,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Microarchitectural Data Sampling Mitigation state");
 
 SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, state,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_hw_mds_disable_state_handler, "A",
     "Microarchitectural Data Sampling Mitigation state");
 
 /* pc_mds_tmp must sit at a 64-byte aligned offset within struct pcpu. */
 _Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512");
 
 /*
  * Choose the MDS handler from hw_mds_disable and the CPU identity and
  * install it in mds_handler:
  *  - mds_handler_void for non-Intel CPUs, for mode 0, and for mode 3
  *    when the CPU reports IA32_ARCH_CAP_MDS_NO;
  *  - mds_handler_verw for mode 1, and for mode 3 when MD_CLEAR is
  *    reported;
  *  - a software sequence picked by family 6 model number for modes 2
  *    and 3, allocating the per-CPU buffers it needs the first time;
  *  - mds_handler_silvermont for the listed family 6 models, a branch
  *    that does not test hw_mds_disable itself;
  *  - otherwise hw_mds_disable is reset to 0 and mds_handler_void is
  *    installed.
  * Buffers are allocated with M_WAITOK and are never freed here.
  */
 void
 hw_mds_recalculate(void)
 {
 	struct pcpu *pc;
 	vm_offset_t b64;
 	u_long xcr0;
 	int i;
 
 	/*
 	 * Allow user to force VERW variant even if MD_CLEAR is not
 	 * reported.  For instance, hypervisor might unknowingly
 	 * filter the cap out.
 	 * For the similar reasons, and for testing, allow to enable
 	 * mitigation even when MDS_NO cap is set.
 	 */
 	if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 ||
 	    ((cpu_ia32_arch_caps & IA32_ARCH_CAP_MDS_NO) != 0 &&
 	    hw_mds_disable == 3)) {
 		mds_handler = mds_handler_void;
 	} else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 &&
 	    hw_mds_disable == 3) || hw_mds_disable == 1) {
 		mds_handler = mds_handler_verw;
 	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 	    (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e ||
 	    CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a ||
 	    CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d ||
 	    CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e ||
 	    CPUID_TO_MODEL(cpu_id) == 0x3a) &&
 	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
 		/*
 		 * Nehalem, SandyBridge, IvyBridge
 		 */
 		CPU_FOREACH(i) {
 			pc = pcpu_find(i);
 			/* Allocate once; only the first 16 bytes are zeroed. */
 			if (pc->pc_mds_buf == NULL) {
 				pc->pc_mds_buf = malloc_domainset(672, M_TEMP,
 				    DOMAINSET_PREF(pc->pc_domain), M_WAITOK);
 				bzero(pc->pc_mds_buf, 16);
 			}
 		}
 		mds_handler = mds_handler_ivb;
 	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 	    (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c ||
 	    CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f ||
 	    CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) &&
 	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
 		/*
 		 * Haswell, Broadwell
 		 */
 		CPU_FOREACH(i) {
 			pc = pcpu_find(i);
 			/* Allocate once; only the first 16 bytes are zeroed. */
 			if (pc->pc_mds_buf == NULL) {
 				pc->pc_mds_buf = malloc_domainset(1536, M_TEMP,
 				    DOMAINSET_PREF(pc->pc_domain), M_WAITOK);
 				bzero(pc->pc_mds_buf, 16);
 			}
 		}
 		mds_handler = mds_handler_bdw;
 	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 	    ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id &
 	    CPUID_STEPPING) <= 5) ||
 	    CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e ||
 	    (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id &
 	    CPUID_STEPPING) <= 0xb) ||
 	    (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id &
 	    CPUID_STEPPING) <= 0xc)) &&
 	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
 		/*
 		 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
 		 * CascadeLake
 		 */
 		CPU_FOREACH(i) {
 			pc = pcpu_find(i);
 			if (pc->pc_mds_buf == NULL) {
 				pc->pc_mds_buf = malloc_domainset(6 * 1024,
 				    M_TEMP, DOMAINSET_PREF(pc->pc_domain),
 				    M_WAITOK);
 				/*
 				 * Over-allocate by 63 bytes so that a
 				 * 64-byte aligned, zeroed 64-byte buffer
 				 * can be carved out for pc_mds_buf64.
 				 */
 				b64 = (vm_offset_t)malloc_domainset(64 + 63,
 				    M_TEMP, DOMAINSET_PREF(pc->pc_domain),
 				    M_WAITOK);
 				pc->pc_mds_buf64 = (void *)roundup2(b64, 64);
 				bzero(pc->pc_mds_buf64, 64);
 			}
 		}
 		/*
 		 * Pick the widest variant whose state is both enabled in
 		 * XCR0 and advertised by CPUID.
 		 */
 		xcr0 = rxcr(0);
 		if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 &&
 		    (cpu_stdext_feature & CPUID_STDEXT_AVX512DQ) != 0)
 			mds_handler = mds_handler_skl_avx512;
 		else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 &&
 		    (cpu_feature2 & CPUID2_AVX) != 0)
 			mds_handler = mds_handler_skl_avx;
 		else
 			mds_handler = mds_handler_skl_sse;
 	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 	    ((CPUID_TO_MODEL(cpu_id) == 0x37 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x4a ||
 	    CPUID_TO_MODEL(cpu_id) == 0x4c ||
 	    CPUID_TO_MODEL(cpu_id) == 0x4d ||
 	    CPUID_TO_MODEL(cpu_id) == 0x5a ||
 	    CPUID_TO_MODEL(cpu_id) == 0x5d ||
 	    CPUID_TO_MODEL(cpu_id) == 0x6e ||
 	    CPUID_TO_MODEL(cpu_id) == 0x65 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x75 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x1c ||
 	    CPUID_TO_MODEL(cpu_id) == 0x26 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x27 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x35 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x36 ||
 	    CPUID_TO_MODEL(cpu_id) == 0x7a))) {
 		/* Silvermont, Airmont */
 		CPU_FOREACH(i) {
 			pc = pcpu_find(i);
 			if (pc->pc_mds_buf == NULL)
 				pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK);
 		}
 		mds_handler = mds_handler_silvermont;
 	} else {
 		/* No usable mitigation: record that by resetting the mode. */
 		hw_mds_disable = 0;
 		mds_handler = mds_handler_void;
 	}
 }
 
 /*
  * SYSINIT adapter: runs hw_mds_recalculate() once during boot at
  * SI_SUB_SMP / SI_ORDER_ANY.  The argument is unused.
  */
 static void
 hw_mds_recalculate_boot(void *arg __unused)
 {
 
 	hw_mds_recalculate();
 }
 SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL);
 
 /*
  * Sysctl handler shared by hw.mds_disable and
  * machdep.mitigations.mds.disable.  Reports hw_mds_disable; a write
  * must be in the range 0..3 (EINVAL otherwise) and is applied through
  * hw_mds_recalculate(), which may reset the mode to 0 when it cannot
  * be honoured on this CPU.
  */
 static int
 sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = hw_mds_disable;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	/* Plain read, or the copy in/out failed: nothing to apply. */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (val < 0 || val > 3)
 		return (EINVAL);
 	hw_mds_disable = val;
 	hw_mds_recalculate();
 	return (0);
 }
 
 SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_mds_disable_handler, "I",
     "Microarchitectural Data Sampling Mitigation "
     "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
 
 SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, disable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_mds_disable_handler, "I",
     "Microarchitectural Data Sampling Mitigation "
     "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO)");
 
 /*
  * Intel Transactional Memory Asynchronous Abort Mitigation
  * CVE-2019-11135
  *
  * x86_taa_enable holds the operator's request (TAA_NONE..TAA_AUTO) and
  * x86_taa_state the state last recorded by x86_taa_recalculate().
  */
 int x86_taa_enable;
 int x86_taa_state;
 enum {
 	TAA_NONE	= 0,	/* No mitigation enabled */
 	TAA_TSX_DISABLE	= 1,	/* Disable TSX via MSR */
 	TAA_VERW	= 2,	/* Use VERW mitigation */
 	TAA_AUTO	= 3,	/* Automatically select the mitigation */
 
 	/* The states below are not selectable by the operator */
 
 	TAA_TAA_UC	= 4,	/* Mitigation present in microcode */
 	TAA_NOT_PRESENT	= 5	/* TSX is not present */
 };
 
 /*
  * Set (enable) or clear (!enable) the RTM_DISABLE and TSX_CPUID_CLEAR
  * bits of MSR_IA32_TSX_CTRL, on all CPUs via rendezvous when "all" is
  * true and on the local CPU only otherwise.
  */
 static void
 taa_set(bool enable, bool all)
 {
 	unsigned int op;
 
 	op = enable ? MSR_OP_OR : MSR_OP_ANDNOT;
 	op |= all ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL;
 	x86_msr_op(MSR_IA32_TSX_CTRL, op,
 	    IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR);
 }
 
 /*
  * Bring the TAA mitigation in line with x86_taa_enable.
  *
  * The mitigation the CPU calls for is derived from its capabilities
  * (none needed with TAA_NO, TSX disable with TSX_CTRL, VERW
  * otherwise); TAA_AUTO adopts that choice, any other setting is taken
  * as is.  Switching to or from TAA_TSX_DISABLE updates the TSX control
  * MSR on all CPUs; switching to or from TAA_VERW forces or restores
  * hw_mds_disable and reruns hw_mds_recalculate().  x86_taa_state is
  * updated only when the transition succeeded.
  */
 void
 x86_taa_recalculate(void)
 {
 	static int taa_saved_mds_disable = 0;
 	int taa_need = 0, taa_state = 0;
 	int mds_disable = 0, need_mds_recalc = 0;
 
 	/* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */
 	if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 ||
 	    (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) {
 		/* TSX is not present */
 		x86_taa_state = TAA_NOT_PRESENT;
 		return;
 	}
 
 	/* Check to see what mitigation options the CPU gives us */
 	if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) {
 		/* CPU is not susceptible to TAA */
 		taa_need = TAA_TAA_UC;
 	} else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) {
 		/*
 		 * CPU can turn off TSX.  This is the next best option
 		 * if TAA_NO hardware mitigation isn't present
 		 */
 		taa_need = TAA_TSX_DISABLE;
 	} else {
 		/* No TSX/TAA specific remedies are available. */
 		if (x86_taa_enable == TAA_TSX_DISABLE) {
 			if (bootverbose)
 				printf("TSX control not available\n");
 			return;
 		} else
 			taa_need = TAA_VERW;
 	}
 
 	/* Can we automatically take action, or are we being forced? */
 	if (x86_taa_enable == TAA_AUTO)
 		taa_state = taa_need;
 	else
 		taa_state = x86_taa_enable;
 
 	/* No state change, nothing to do */
 	if (taa_state == x86_taa_state) {
 		if (bootverbose)
 			printf("No TSX change made\n");
 		return;
 	}
 
 	/* Does the MSR need to be turned on or off? */
 	if (taa_state == TAA_TSX_DISABLE)
 		taa_set(true, true);
 	else if (x86_taa_state == TAA_TSX_DISABLE)
 		taa_set(false, true);
 
 	/* Does MDS need to be set to turn on VERW? */
 	if (taa_state == TAA_VERW) {
 		/* Remember the MDS mode so leaving VERW can restore it. */
 		taa_saved_mds_disable = hw_mds_disable;
 		mds_disable = hw_mds_disable = 1;
 		need_mds_recalc = 1;
 	} else if (x86_taa_state == TAA_VERW) {
 		mds_disable = hw_mds_disable = taa_saved_mds_disable;
 		need_mds_recalc = 1;
 	}
 	if (need_mds_recalc) {
 		hw_mds_recalculate();
 		/*
 		 * hw_mds_recalculate() may reset hw_mds_disable; a
 		 * mismatch means the requested MDS mode did not stick.
 		 */
 		if (mds_disable != hw_mds_disable) {
 			if (bootverbose)
 				printf("Cannot change MDS state for TAA\n");
 			/* Don't update our state */
 			return;
 		}
 	}
 
 	x86_taa_state = taa_state;
 	return;
 }
 
 /*
  * SYSINIT adapter: runs x86_taa_recalculate() once during boot at
  * SI_SUB_SMP / SI_ORDER_ANY.  The argument is unused.
  */
 static void
 taa_recalculate_boot(void * arg __unused)
 {
 
 	x86_taa_recalculate();
 }
 SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL);
 
 /* Parent node for the machdep.mitigations.taa.* sysctls. */
 SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TSX Asynchronous Abort Mitigation");
 
 /*
  * Sysctl handler for machdep.mitigations.taa.enable.  Reports
  * x86_taa_enable; a write must be one of the operator-selectable
  * values TAA_NONE..TAA_AUTO (EINVAL otherwise) and is applied through
  * x86_taa_recalculate().
  */
 static int
 sysctl_taa_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = x86_taa_enable;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	/* Plain read, or the copy in/out failed: nothing to apply. */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (val < TAA_NONE || val > TAA_AUTO)
 		return (EINVAL);
 	x86_taa_enable = val;
 	x86_taa_recalculate();
 	return (0);
 }
 
 SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT |
     CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_taa_handler, "I",
     "TAA Mitigation enablement control "
     "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO)");
 
 static int
 sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS)
 {
 	const char *state;
 
 	switch (x86_taa_state) {
 	case TAA_NONE:
 		state = "inactive";
 		break;
 	case TAA_TSX_DISABLE:
 		state = "TSX disabled";
 		break;
 	case TAA_VERW:
 		state = "VERW";
 		break;
 	case TAA_TAA_UC:
 		state = "Mitigated in microcode";
 		break;
 	case TAA_NOT_PRESENT:
 		state = "TSX not present";
 		break;
 	default:
 		state = "unknown";
 	}
 
 	return (SYSCTL_OUT(req, state, strlen(state)));
 }
 
 SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_taa_state_handler, "A",
     "TAA Mitigation state");
 
+/*
+ * Switch exported read-write as machdep.mitigations.flush_rsb_ctxsw
+ * ("Flush Return Stack Buffer on context switch").  Only the variable
+ * and its sysctl are defined here; the code that acts on the flag is
+ * outside this section.
+ */
+int __read_frequently cpu_flush_rsb_ctxsw;
+SYSCTL_INT(_machdep_mitigations, OID_AUTO, flush_rsb_ctxsw,
+    CTLFLAG_RW | CTLFLAG_NOFETCH, &cpu_flush_rsb_ctxsw, 0,
+    "Flush Return Stack Buffer on context switch");
+
 /*
  * Enable and restore kernel text write permissions.
  * Callers must ensure that disable_wp()/restore_wp() are executed
  * without rescheduling on the same core.
  */
 bool
 disable_wp(void)
 {
 	u_int cr0;
 
 	cr0 = rcr0();
 	if ((cr0 & CR0_WP) == 0)
 		return (false);
 	load_cr0(cr0 & ~CR0_WP);
 	return (true);
 }
 
 /*
  * Counterpart of disable_wp(): sets CR0_WP again when old_wp says it
  * was set before, and does nothing otherwise.
  */
 void
 restore_wp(bool old_wp)
 {
 
 	if (!old_wp)
 		return;
 	load_cr0(rcr0() | CR0_WP);
 }
 
 /*
  * Fetch the BootFlags field of the ACPI FADT into *flagsp.
  *
  * Returns true when the table was found and mapped; returns false,
  * leaving *flagsp untouched, when it was not or when the kernel is
  * built without DEV_ACPI.
  */
 bool
 acpi_get_fadt_bootflags(uint16_t *flagsp)
 {
 	bool found = false;
 #ifdef DEV_ACPI
 	ACPI_TABLE_FADT *tbl = NULL;
 	vm_paddr_t pa;
 
 	pa = acpi_find_table(ACPI_SIG_FADT);
 	if (pa != 0)
 		tbl = acpi_map_table(pa, ACPI_SIG_FADT);
 	if (tbl != NULL) {
 		*flagsp = tbl->BootFlags;
 		acpi_unmap_table(tbl);
 		found = true;
 	}
 #endif
 	return (found);
 }