diff --git a/sys/amd64/amd64/exec_machdep.c b/sys/amd64/amd64/exec_machdep.c
index 1297117638d6..48bda05f9685 100644
--- a/sys/amd64/amd64/exec_machdep.c
+++ b/sys/amd64/amd64/exec_machdep.c
@@ -1,983 +1,983 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/reg.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <machine/vmparam.h>
 #include <machine/frame.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/proc.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #include <machine/trap.h>
 
 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpusave, size_t xfpusave_len);
 static int set_fpcontext(struct thread *td, mcontext_t *mcp,
     char *xfpustate, size_t xfpustate_len);
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored at top to call routine,
  * followed by call to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the frame pointer, it
  * returns to the user specified pc, psl.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct pcb *pcb;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	char *xfpusave;
 	size_t xfpusave_len;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	pcb = td->td_pcb;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
-		xfpusave = __builtin_alloca(xfpusave_len);
+		xfpusave = (char *)td->td_md.md_fpu_scratch;
 	} else {
 		xfpusave_len = 0;
 		xfpusave = NULL;
 	}
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
 	fpstate_drop(td);
 	update_pcb_bases(pcb);
 	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
 	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
 	bzero(sf.sf_uc.uc_mcontext.mc_spare,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		sp = (char *)regs->tf_rsp - 128;
 	if (xfpusave != NULL) {
 		sp -= xfpusave_len;
 		sp = (char *)((unsigned long)sp & ~0x3Ful);
 		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
 	}
 	sp -= sizeof(struct sigframe);
 	/* Align to 16 bytes. */
 	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
 
 	/* Build the argument list for the signal handler. */
 	regs->tf_rdi = sig;			/* arg 1 in %rdi */
 	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
 	    (xfpusave != NULL && copyout(xfpusave,
 	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
 	    != 0)) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	regs->tf_rsp = (long)sfp;
 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_ss = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  */
 int
 sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
 {
 	ucontext_t uc;
 	struct pcb *pcb;
 	struct proc *p;
 	struct trapframe *regs;
 	ucontext_t *ucp;
 	char *xfpustate;
 	size_t xfpustate_len;
 	long rflags;
 	int cs, error, ret;
 	ksiginfo_t ksi;
 
 	pcb = td->td_pcb;
 	p = td->td_proc;
 
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0) {
 		uprintf("pid %d (%s): sigreturn copyin failed\n",
 		    p->p_pid, td->td_name);
 		return (error);
 	}
 	ucp = &uc;
 	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
 		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
 		    td->td_name, ucp->uc_mcontext.mc_flags);
 		return (EINVAL);
 	}
 	regs = td->td_frame;
 	rflags = ucp->uc_mcontext.mc_rflags;
 	/*
 	 * Don't allow users to change privileged or reserved flags.
 	 */
 	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
 		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
 		    td->td_name, rflags);
 		return (EINVAL);
 	}
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 	cs = ucp->uc_mcontext.mc_cs;
 	if (!CS_SECURE(cs)) {
 		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
 		    td->td_name, cs);
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 
 	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
 		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
 		if (xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu)) {
 			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
 			    p->p_pid, td->td_name, xfpustate_len);
 			return (EINVAL);
 		}
-		xfpustate = __builtin_alloca(xfpustate_len);
+		xfpustate = (char *)td->td_md.md_fpu_scratch;
 		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
 		    xfpustate, xfpustate_len);
 		if (error != 0) {
 			uprintf(
 	"pid %d (%s): sigreturn copying xfpustate failed\n",
 			    p->p_pid, td->td_name);
 			return (error);
 		}
 	} else {
 		xfpustate = NULL;
 		xfpustate_len = 0;
 	}
 	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
 	if (ret != 0) {
 		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
 		    p->p_pid, td->td_name, ret);
 		return (ret);
 	}
 	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
 	update_pcb_bases(pcb);
 	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
 	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
 
 #if defined(COMPAT_43)
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
 
 	return sys_sigreturn(td, (struct sigreturn_args *)uap);
 }
 #endif
 
 /*
  * Reset the hardware debug registers if they were in use.
  * They won't have any meaning for the newly exec'd process.
  */
 void
 x86_clear_dbregs(struct pcb *pcb)
 {
 	if ((pcb->pcb_flags & PCB_DBREGS) == 0)
 		return;
 
 	pcb->pcb_dr0 = 0;
 	pcb->pcb_dr1 = 0;
 	pcb->pcb_dr2 = 0;
 	pcb->pcb_dr3 = 0;
 	pcb->pcb_dr6 = 0;
 	pcb->pcb_dr7 = 0;
 
 	if (pcb == curpcb) {
 		/*
 		 * Clear the debug registers on the running CPU,
 		 * otherwise they will end up affecting the next
 		 * process we switch to.
 		 */
 		reset_dbregs();
 	}
 	clear_pcb_flags(pcb, PCB_DBREGS);
 }
 
 /*
  * Reset registers to default values on exec.
  */
 void
 exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
 {
 	struct trapframe *regs;
 	struct pcb *pcb;
 	register_t saved_rflags;
 
 	regs = td->td_frame;
 	pcb = td->td_pcb;
 
 	if (td->td_proc->p_md.md_ldt != NULL)
 		user_ldt_free(td);
 
 	update_pcb_bases(pcb);
 	pcb->pcb_fsbase = 0;
 	pcb->pcb_gsbase = 0;
 	clear_pcb_flags(pcb, PCB_32BIT);
 	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
 
 	saved_rflags = regs->tf_rflags & PSL_T;
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_rip = imgp->entry_addr;
 	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
 	regs->tf_rdi = stack;		/* argv */
 	regs->tf_rflags = PSL_USER | saved_rflags;
 	regs->tf_ss = _udatasel;
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 
 	x86_clear_dbregs(pcb);
 
 	/*
 	 * Drop the FP state if we hold it, so that the process gets a
 	 * clean FP state if it uses the FPU again.
 	 */
 	fpstate_drop(td);
 }
 
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *tp;
 
 	tp = td->td_frame;
 	return (fill_frame_regs(tp, regs));
 }
 
 int
 fill_frame_regs(struct trapframe *tp, struct reg *regs)
 {
 
 	regs->r_r15 = tp->tf_r15;
 	regs->r_r14 = tp->tf_r14;
 	regs->r_r13 = tp->tf_r13;
 	regs->r_r12 = tp->tf_r12;
 	regs->r_r11 = tp->tf_r11;
 	regs->r_r10 = tp->tf_r10;
 	regs->r_r9  = tp->tf_r9;
 	regs->r_r8  = tp->tf_r8;
 	regs->r_rdi = tp->tf_rdi;
 	regs->r_rsi = tp->tf_rsi;
 	regs->r_rbp = tp->tf_rbp;
 	regs->r_rbx = tp->tf_rbx;
 	regs->r_rdx = tp->tf_rdx;
 	regs->r_rcx = tp->tf_rcx;
 	regs->r_rax = tp->tf_rax;
 	regs->r_rip = tp->tf_rip;
 	regs->r_cs = tp->tf_cs;
 	regs->r_rflags = tp->tf_rflags;
 	regs->r_rsp = tp->tf_rsp;
 	regs->r_ss = tp->tf_ss;
 	if (tp->tf_flags & TF_HASSEGS) {
 		regs->r_ds = tp->tf_ds;
 		regs->r_es = tp->tf_es;
 		regs->r_fs = tp->tf_fs;
 		regs->r_gs = tp->tf_gs;
 	} else {
 		regs->r_ds = 0;
 		regs->r_es = 0;
 		regs->r_fs = 0;
 		regs->r_gs = 0;
 	}
 	regs->r_err = 0;
 	regs->r_trapno = 0;
 	return (0);
 }
 
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *tp;
 	register_t rflags;
 
 	tp = td->td_frame;
 	rflags = regs->r_rflags & 0xffffffff;
 	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
 		return (EINVAL);
 	tp->tf_r15 = regs->r_r15;
 	tp->tf_r14 = regs->r_r14;
 	tp->tf_r13 = regs->r_r13;
 	tp->tf_r12 = regs->r_r12;
 	tp->tf_r11 = regs->r_r11;
 	tp->tf_r10 = regs->r_r10;
 	tp->tf_r9  = regs->r_r9;
 	tp->tf_r8  = regs->r_r8;
 	tp->tf_rdi = regs->r_rdi;
 	tp->tf_rsi = regs->r_rsi;
 	tp->tf_rbp = regs->r_rbp;
 	tp->tf_rbx = regs->r_rbx;
 	tp->tf_rdx = regs->r_rdx;
 	tp->tf_rcx = regs->r_rcx;
 	tp->tf_rax = regs->r_rax;
 	tp->tf_rip = regs->r_rip;
 	tp->tf_cs = regs->r_cs;
 	tp->tf_rflags = rflags;
 	tp->tf_rsp = regs->r_rsp;
 	tp->tf_ss = regs->r_ss;
 	if (0) {	/* XXXKIB */
 		tp->tf_ds = regs->r_ds;
 		tp->tf_es = regs->r_es;
 		tp->tf_fs = regs->r_fs;
 		tp->tf_gs = regs->r_gs;
 		tp->tf_flags = TF_HASSEGS;
 	}
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (0);
 }
 
 /* XXX check all this stuff! */
 /* externalize from sv_xmm */
 static void
 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
 {
 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
 	int i;
 
 	/* pcb -> fpregs */
 	bzero(fpregs, sizeof(*fpregs));
 
 	/* FPU control/status */
 	penv_fpreg->en_cw = penv_xmm->en_cw;
 	penv_fpreg->en_sw = penv_xmm->en_sw;
 	penv_fpreg->en_tw = penv_xmm->en_tw;
 	penv_fpreg->en_opcode = penv_xmm->en_opcode;
 	penv_fpreg->en_rip = penv_xmm->en_rip;
 	penv_fpreg->en_rdp = penv_xmm->en_rdp;
 	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
 	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
 
 	/* FPU registers */
 	for (i = 0; i < 8; ++i)
 		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
 
 	/* SSE registers */
 	for (i = 0; i < 16; ++i)
 		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
 }
 
 /* internalize from fpregs into sv_xmm */
 static void
 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
 {
 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 	int i;
 
 	/* fpregs -> pcb */
 	/* FPU control/status */
 	penv_xmm->en_cw = penv_fpreg->en_cw;
 	penv_xmm->en_sw = penv_fpreg->en_sw;
 	penv_xmm->en_tw = penv_fpreg->en_tw;
 	penv_xmm->en_opcode = penv_fpreg->en_opcode;
 	penv_xmm->en_rip = penv_fpreg->en_rip;
 	penv_xmm->en_rdp = penv_fpreg->en_rdp;
 	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
 	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
 
 	/* FPU registers */
 	for (i = 0; i < 8; ++i)
 		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
 
 	/* SSE registers */
 	for (i = 0; i < 16; ++i)
 		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
 }
 
 /* externalize from td->pcb */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
 	    P_SHOULDSTOP(td->td_proc),
 	    ("not suspended thread %p", td));
 	fpugetregs(td);
 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
 	return (0);
 }
 
 /* internalize to td->pcb */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	critical_enter();
 	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
 	fpuuserinited(td);
 	critical_exit();
 	return (0);
 }
 
 /*
  * Get machine context.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct pcb *pcb;
 	struct trapframe *tp;
 
 	pcb = td->td_pcb;
 	tp = td->td_frame;
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
 	PROC_UNLOCK(curthread->td_proc);
 	mcp->mc_r15 = tp->tf_r15;
 	mcp->mc_r14 = tp->tf_r14;
 	mcp->mc_r13 = tp->tf_r13;
 	mcp->mc_r12 = tp->tf_r12;
 	mcp->mc_r11 = tp->tf_r11;
 	mcp->mc_r10 = tp->tf_r10;
 	mcp->mc_r9  = tp->tf_r9;
 	mcp->mc_r8  = tp->tf_r8;
 	mcp->mc_rdi = tp->tf_rdi;
 	mcp->mc_rsi = tp->tf_rsi;
 	mcp->mc_rbp = tp->tf_rbp;
 	mcp->mc_rbx = tp->tf_rbx;
 	mcp->mc_rcx = tp->tf_rcx;
 	mcp->mc_rflags = tp->tf_rflags;
 	if (flags & GET_MC_CLEAR_RET) {
 		mcp->mc_rax = 0;
 		mcp->mc_rdx = 0;
 		mcp->mc_rflags &= ~PSL_C;
 	} else {
 		mcp->mc_rax = tp->tf_rax;
 		mcp->mc_rdx = tp->tf_rdx;
 	}
 	mcp->mc_rip = tp->tf_rip;
 	mcp->mc_cs = tp->tf_cs;
 	mcp->mc_rsp = tp->tf_rsp;
 	mcp->mc_ss = tp->tf_ss;
 	mcp->mc_ds = tp->tf_ds;
 	mcp->mc_es = tp->tf_es;
 	mcp->mc_fs = tp->tf_fs;
 	mcp->mc_gs = tp->tf_gs;
 	mcp->mc_flags = tp->tf_flags;
 	mcp->mc_len = sizeof(*mcp);
 	get_fpcontext(td, mcp, NULL, 0);
 	update_pcb_bases(pcb);
 	mcp->mc_fsbase = pcb->pcb_fsbase;
 	mcp->mc_gsbase = pcb->pcb_gsbase;
 	mcp->mc_xfpustate = 0;
 	mcp->mc_xfpustate_len = 0;
 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  */
 int
 set_mcontext(struct thread *td, mcontext_t *mcp)
 {
 	struct pcb *pcb;
 	struct trapframe *tp;
 	char *xfpustate;
 	long rflags;
 	int ret;
 
 	pcb = td->td_pcb;
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp) ||
 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
 		return (EINVAL);
 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
 	    (tp->tf_rflags & ~PSL_USERCHANGE);
 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu))
 			return (EINVAL);
-		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
+		xfpustate = (char *)td->td_md.md_fpu_scratch;
 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
 		    mcp->mc_xfpustate_len);
 		if (ret != 0)
 			return (ret);
 	} else
 		xfpustate = NULL;
 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
 	if (ret != 0)
 		return (ret);
 	tp->tf_r15 = mcp->mc_r15;
 	tp->tf_r14 = mcp->mc_r14;
 	tp->tf_r13 = mcp->mc_r13;
 	tp->tf_r12 = mcp->mc_r12;
 	tp->tf_r11 = mcp->mc_r11;
 	tp->tf_r10 = mcp->mc_r10;
 	tp->tf_r9  = mcp->mc_r9;
 	tp->tf_r8  = mcp->mc_r8;
 	tp->tf_rdi = mcp->mc_rdi;
 	tp->tf_rsi = mcp->mc_rsi;
 	tp->tf_rbp = mcp->mc_rbp;
 	tp->tf_rbx = mcp->mc_rbx;
 	tp->tf_rdx = mcp->mc_rdx;
 	tp->tf_rcx = mcp->mc_rcx;
 	tp->tf_rax = mcp->mc_rax;
 	tp->tf_rip = mcp->mc_rip;
 	tp->tf_rflags = rflags;
 	tp->tf_rsp = mcp->mc_rsp;
 	tp->tf_ss = mcp->mc_ss;
 	tp->tf_flags = mcp->mc_flags;
 	if (tp->tf_flags & TF_HASSEGS) {
 		tp->tf_ds = mcp->mc_ds;
 		tp->tf_es = mcp->mc_es;
 		tp->tf_fs = mcp->mc_fs;
 		tp->tf_gs = mcp->mc_gs;
 	}
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 	if (mcp->mc_flags & _MC_HASBASES) {
 		pcb->pcb_fsbase = mcp->mc_fsbase;
 		pcb->pcb_gsbase = mcp->mc_gsbase;
 	}
 	return (0);
 }
 
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
     size_t xfpusave_len)
 {
 	size_t max_len, len;
 
 	mcp->mc_ownedfp = fpugetregs(td);
 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 	    sizeof(mcp->mc_fpstate));
 	mcp->mc_fpformat = fpuformat();
 	if (!use_xsave || xfpusave_len == 0)
 		return;
 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 	len = xfpusave_len;
 	if (len > max_len) {
 		len = max_len;
-		bzero(xfpusave + max_len, len - max_len);
+		bzero(xfpusave + max_len, xfpusave_len - max_len);
 	}
 	mcp->mc_flags |= _MC_HASFPXSTATE;
 	mcp->mc_xfpustate_len = len;
 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 }
 
 static int
 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
     size_t xfpustate_len)
 {
 	int error;
 
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 		error = 0;
 	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
 	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
 		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
 		    xfpustate, xfpustate_len);
 	} else
 		return (EINVAL);
 	return (error);
 }
 
 void
 fpstate_drop(struct thread *td)
 {
 
 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == td)
 		fpudrop();
 	/*
 	 * XXX force a full drop of the fpu.  The above only drops it if we
 	 * owned it.
 	 *
 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
 	 * have too many layers.
 	 */
 	clear_pcb_flags(curthread->td_pcb,
 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
 	critical_exit();
 }
 
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 
 	if (td == NULL) {
 		dbregs->dr[0] = rdr0();
 		dbregs->dr[1] = rdr1();
 		dbregs->dr[2] = rdr2();
 		dbregs->dr[3] = rdr3();
 		dbregs->dr[6] = rdr6();
 		dbregs->dr[7] = rdr7();
 	} else {
 		pcb = td->td_pcb;
 		dbregs->dr[0] = pcb->pcb_dr0;
 		dbregs->dr[1] = pcb->pcb_dr1;
 		dbregs->dr[2] = pcb->pcb_dr2;
 		dbregs->dr[3] = pcb->pcb_dr3;
 		dbregs->dr[6] = pcb->pcb_dr6;
 		dbregs->dr[7] = pcb->pcb_dr7;
 	}
 	dbregs->dr[4] = 0;
 	dbregs->dr[5] = 0;
 	dbregs->dr[8] = 0;
 	dbregs->dr[9] = 0;
 	dbregs->dr[10] = 0;
 	dbregs->dr[11] = 0;
 	dbregs->dr[12] = 0;
 	dbregs->dr[13] = 0;
 	dbregs->dr[14] = 0;
 	dbregs->dr[15] = 0;
 	return (0);
 }
 
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
 		load_dr1(dbregs->dr[1]);
 		load_dr2(dbregs->dr[2]);
 		load_dr3(dbregs->dr[3]);
 		load_dr6(dbregs->dr[6]);
 		load_dr7(dbregs->dr[7]);
 	} else {
 		/*
 		 * Don't let an illegal value for dr7 get set.  Specifically,
 		 * check for undefined settings.  Setting these bit patterns
 		 * result in undefined behaviour and can lead to an unexpected
 		 * TRCTRAP or a general protection fault right here.
 		 * Upper bits of dr6 and dr7 must not be set
 		 */
 		for (i = 0; i < 4; i++) {
 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 			if (td->td_frame->tf_cs == _ucode32sel &&
 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
 				return (EINVAL);
 		}
 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 			return (EINVAL);
 
 		pcb = td->td_pcb;
 
 		/*
 		 * Don't let a process set a breakpoint that is not within the
 		 * process's address space.  If a process could do this, it
 		 * could halt the system by setting a breakpoint in the kernel
 		 * (if ddb was enabled).  Thus, we need to check to make sure
 		 * that no breakpoints are being enabled for addresses outside
 		 * process's address space.
 		 *
 		 * XXX - what about when the watched area of the user's
 		 * address space is written into from within the kernel
 		 * ... wouldn't that still cause a breakpoint to be generated
 		 * from within kernel mode?
 		 */
 
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 			/* dr0 is enabled */
 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 			/* dr1 is enabled */
 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 			/* dr2 is enabled */
 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 			/* dr3 is enabled */
 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 
 		pcb->pcb_dr0 = dbregs->dr[0];
 		pcb->pcb_dr1 = dbregs->dr[1];
 		pcb->pcb_dr2 = dbregs->dr[2];
 		pcb->pcb_dr3 = dbregs->dr[3];
 		pcb->pcb_dr6 = dbregs->dr[6];
 		pcb->pcb_dr7 = dbregs->dr[7];
 
 		set_pcb_flags(pcb, PCB_DBREGS);
 	}
 
 	return (0);
 }
 
 void
 reset_dbregs(void)
 {
 
 	load_dr7(0);	/* Turn off the control bits first */
 	load_dr0(0);
 	load_dr1(0);
 	load_dr2(0);
 	load_dr3(0);
 	load_dr6(0);
 }
 
 /*
  * Return > 0 if a hardware breakpoint has been hit, and the
  * breakpoint was in user space.  Return 0, otherwise.
  */
 int
 user_dbreg_trap(register_t dr6)
 {
         u_int64_t dr7;
         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
         int nbp;            /* number of breakpoints that triggered */
         caddr_t addr[4];    /* breakpoint addresses */
         int i;
 
         bp = dr6 & DBREG_DR6_BMASK;
         if (bp == 0) {
                 /*
                  * None of the breakpoint bits are set meaning this
                  * trap was not caused by any of the debug registers
                  */
                 return (0);
         }
 
         dr7 = rdr7();
         if ((dr7 & 0x000000ff) == 0) {
                 /*
                  * all GE and LE bits in the dr7 register are zero,
                  * thus the trap couldn't have been caused by the
                  * hardware debug registers
                  */
 		return (0);
         }
 
         nbp = 0;
 
         /*
          * at least one of the breakpoints were hit, check to see
          * which ones and if any of them are user space addresses
          */
 
         if (bp & 0x01) {
                 addr[nbp++] = (caddr_t)rdr0();
         }
         if (bp & 0x02) {
                 addr[nbp++] = (caddr_t)rdr1();
         }
         if (bp & 0x04) {
                 addr[nbp++] = (caddr_t)rdr2();
         }
         if (bp & 0x08) {
                 addr[nbp++] = (caddr_t)rdr3();
         }
 
         for (i = 0; i < nbp; i++) {
                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
                         /*
                          * addr[i] is in user space
                          */
                         return (nbp);
                 }
         }
 
         /*
          * None of the breakpoints are in user space.
          */
         return (0);
 }
diff --git a/sys/amd64/amd64/fpu.c b/sys/amd64/amd64/fpu.c
index d7936b3b1922..24986958d4ca 100644
--- a/sys/amd64/amd64/fpu.c
+++ b/sys/amd64/amd64/fpu.c
@@ -1,1293 +1,1295 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1990 William Jolitz.
  * Copyright (c) 1991 The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)npx.c	7.2 (Berkeley) 5/12/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/domainset.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <machine/bus.h>
 #include <sys/rman.h>
 #include <sys/signalvar.h>
 #include <vm/uma.h>
 
 #include <machine/cputypes.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/psl.h>
 #include <machine/resource.h>
 #include <machine/specialreg.h>
 #include <machine/segments.h>
 #include <machine/ucontext.h>
 #include <x86/ifunc.h>
 
 /*
  * Floating point support.
  */
 
 #if defined(__GNUCLIKE_ASM) && !defined(lint)
 
 #define	fldcw(cw)		__asm __volatile("fldcw %0" : : "m" (cw))
 #define	fnclex()		__asm __volatile("fnclex")
 #define	fninit()		__asm __volatile("fninit")
 #define	fnstcw(addr)		__asm __volatile("fnstcw %0" : "=m" (*(addr)))
 #define	fnstsw(addr)		__asm __volatile("fnstsw %0" : "=am" (*(addr)))
 #define	fxrstor(addr)		__asm __volatile("fxrstor %0" : : "m" (*(addr)))
 #define	fxsave(addr)		__asm __volatile("fxsave %0" : "=m" (*(addr)))
 #define	ldmxcsr(csr)		__asm __volatile("ldmxcsr %0" : : "m" (csr))
 #define	stmxcsr(addr)		__asm __volatile("stmxcsr %0" : "=m" (*(addr)))
 
 /*
  * Execute XRSTOR on the save area at addr.  The 64-bit component mask
  * is handed to the instruction split into the "d" (high half) and
  * "a" (low half) register operands.
  */
 static __inline void
 xrstor32(char *addr, uint64_t mask)
 {
 	uint32_t mask_lo, mask_hi;
 
 	mask_hi = mask >> 32;
 	mask_lo = mask;
 	__asm __volatile("xrstor %0" : : "m" (*addr), "a" (mask_lo),
 	    "d" (mask_hi));
 }
 
 /*
  * Execute XRSTOR64 on the save area at addr.  The 64-bit component
  * mask is handed to the instruction split into the "d" (high half)
  * and "a" (low half) register operands.
  */
 static __inline void
 xrstor64(char *addr, uint64_t mask)
 {
 	uint32_t mask_lo, mask_hi;
 
 	mask_hi = mask >> 32;
 	mask_lo = mask;
 	__asm __volatile("xrstor64 %0" : : "m" (*addr), "a" (mask_lo),
 	    "d" (mask_hi));
 }
 
 /*
  * Execute XSAVE into the save area at addr for the components selected
  * by mask (high half in the "d" operand, low half in the "a" operand).
  * The asm is marked as clobbering memory.
  */
 static __inline void
 xsave32(char *addr, uint64_t mask)
 {
 	uint32_t mask_lo, mask_hi;
 
 	mask_hi = mask >> 32;
 	mask_lo = mask;
 	__asm __volatile("xsave %0" : "=m" (*addr) : "a" (mask_lo),
 	    "d" (mask_hi) : "memory");
 }
 
 /*
  * Execute XSAVE64 into the save area at addr for the components
  * selected by mask (high half in the "d" operand, low half in the "a"
  * operand).  The asm is marked as clobbering memory.
  */
 static __inline void
 xsave64(char *addr, uint64_t mask)
 {
 	uint32_t mask_lo, mask_hi;
 
 	mask_hi = mask >> 32;
 	mask_lo = mask;
 	__asm __volatile("xsave64 %0" : "=m" (*addr) : "a" (mask_lo),
 	    "d" (mask_hi) : "memory");
 }
 
 /*
  * Execute XSAVEOPT into the save area at addr for the components
  * selected by mask (high half in the "d" operand, low half in the "a"
  * operand).  The asm is marked as clobbering memory.
  */
 static __inline void
 xsaveopt32(char *addr, uint64_t mask)
 {
 	uint32_t mask_lo, mask_hi;
 
 	mask_hi = mask >> 32;
 	mask_lo = mask;
 	__asm __volatile("xsaveopt %0" : "=m" (*addr) : "a" (mask_lo),
 	    "d" (mask_hi) : "memory");
 }
 
 /*
  * Execute XSAVEOPT64 into the save area at addr for the components
  * selected by mask (high half in the "d" operand, low half in the "a"
  * operand).  The asm is marked as clobbering memory.
  */
 static __inline void
 xsaveopt64(char *addr, uint64_t mask)
 {
 	uint32_t mask_lo, mask_hi;
 
 	mask_hi = mask >> 32;
 	mask_lo = mask;
 	__asm __volatile("xsaveopt64 %0" : "=m" (*addr) : "a" (mask_lo),
 	    "d" (mask_hi) : "memory");
 }
 
 #else	/* !(__GNUCLIKE_ASM && !lint) */
 
 void	fldcw(u_short cw);
 void	fnclex(void);
 void	fninit(void);
 void	fnstcw(caddr_t addr);
 void	fnstsw(caddr_t addr);
 void	fxsave(caddr_t addr);
 void	fxrstor(caddr_t addr);
 void	ldmxcsr(u_int csr);
 void	stmxcsr(u_int *csr);
 void	xrstor32(char *addr, uint64_t mask);
 void	xrstor64(char *addr, uint64_t mask);
 void	xsave32(char *addr, uint64_t mask);
 void	xsave64(char *addr, uint64_t mask);
 void	xsaveopt32(char *addr, uint64_t mask);
 void	xsaveopt64(char *addr, uint64_t mask);
 
 #endif	/* __GNUCLIKE_ASM && !lint */
 
 #define	start_emulating()	load_cr0(rcr0() | CR0_TS)
 #define	stop_emulating()	clts()
 
 CTASSERT(sizeof(struct savefpu) == 512);
 CTASSERT(sizeof(struct xstate_hdr) == 64);
 CTASSERT(sizeof(struct savefpu_ymm) == 832);
 
 /*
  * This requirement is to make it easier for asm code to calculate
  * offset of the fpu save area from the pcb address. FPU save area
  * must be 64-byte aligned.
  */
 CTASSERT(sizeof(struct pcb) % XSAVE_AREA_ALIGN == 0);
 
 /*
  * Ensure the copy of XCR0 saved in a core is contained in the padding
  * area.
  */
 CTASSERT(X86_XSTATE_XCR0_OFFSET >= offsetof(struct savefpu, sv_pad) &&
     X86_XSTATE_XCR0_OFFSET + sizeof(uint64_t) <= sizeof(struct savefpu));
 
 static	void	fpu_clean_state(void);
 
 SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD,
     SYSCTL_NULL_INT_PTR, 1, "Floating point instructions executed in hardware");
 
 int use_xsave;			/* non-static for cpu_switch.S */
 uint64_t xsave_mask;		/* the same */
 static	uma_zone_t fpu_save_area_zone;
 static	struct savefpu *fpu_initialstate;
 
 /*
  * Layout table of the extended save area, indexed by component number.
  * The array is allocated and filled in by fpuinitstate() and consulted
  * by fpugetregs() when it copies initial-state data for a component.
  */
 static struct xsave_area_elm_descr {
 	u_int	offset;		/* byte offset of the component in the area */
 	u_int	size;		/* size of the component, in bytes */
 } *xsave_area_desc;
 
 /* fpusave() variant: always XSAVEOPT64 with the global xsave_mask. */
 static void
 fpusave_xsaveopt64(void *addr)
 {
 	char *area;
 
 	area = addr;
 	xsaveopt64(area, xsave_mask);
 }
 
 /*
  * fpusave() variant: XSAVEOPT with the global xsave_mask, using the
  * 32-bit form when the current process has SV_ILP32 set and the 64-bit
  * form otherwise.
  */
 static void
 fpusave_xsaveopt3264(void *addr)
 {
 	char *area;
 
 	area = addr;
 	if (!SV_CURPROC_FLAG(SV_ILP32)) {
 		xsaveopt64(area, xsave_mask);
 		return;
 	}
 	xsaveopt32(area, xsave_mask);
 }
 
 /* fpusave() variant: always XSAVE64 with the global xsave_mask. */
 static void
 fpusave_xsave64(void *addr)
 {
 	char *area;
 
 	area = addr;
 	xsave64(area, xsave_mask);
 }
 
 /*
  * fpusave() variant: XSAVE with the global xsave_mask, using the 32-bit
  * form when the current process has SV_ILP32 set and the 64-bit form
  * otherwise.
  */
 static void
 fpusave_xsave3264(void *addr)
 {
 	char *area;
 
 	area = addr;
 	if (!SV_CURPROC_FLAG(SV_ILP32)) {
 		xsave64(area, xsave_mask);
 		return;
 	}
 	xsave32(area, xsave_mask);
 }
 
 /* fpurestore() variant: always XRSTOR64 with the global xsave_mask. */
 static void
 fpurestore_xrstor64(void *addr)
 {
 	char *area;
 
 	area = addr;
 	xrstor64(area, xsave_mask);
 }
 
 /*
  * fpurestore() variant: XRSTOR with the global xsave_mask, using the
  * 32-bit form when the current process has SV_ILP32 set and the 64-bit
  * form otherwise.
  */
 static void
 fpurestore_xrstor3264(void *addr)
 {
 	char *area;
 
 	area = addr;
 	if (!SV_CURPROC_FLAG(SV_ILP32)) {
 		xrstor64(area, xsave_mask);
 		return;
 	}
 	xrstor32(area, xsave_mask);
 }
 
 /* fpusave() variant used without XSAVE: plain FXSAVE into addr. */
 static void
 fpusave_fxsave(void *addr)
 {
 	char *area;
 
 	/* fxsave() dereferences its argument, so it needs a typed pointer. */
 	area = addr;
 	fxsave(area);
 }
 
 /* fpurestore() variant used without XSAVE: plain FXRSTOR from addr. */
 static void
 fpurestore_fxrstor(void *addr)
 {
 	char *area;
 
 	/* fxrstor() dereferences its argument, so it needs a typed pointer. */
 	area = addr;
 	fxrstor(area);
 }
 
 /*
  * Decide once whether XSAVE is going to be used.  Nothing happens when
  * use_xsave is already set or when the CPU does not advertise
  * CPUID2_XSAVE; otherwise use_xsave defaults to 1 and may be overridden
  * through the "hw.use_xsave" tunable.
  */
 static void
 init_xsave(void)
 {
 
 	if (use_xsave || (cpu_feature2 & CPUID2_XSAVE) == 0)
 		return;
 	use_xsave = 1;
 	TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
 }
 
 /*
  * Resolver for fpusave(): pick the routine that stores the FPU state.
  *
  * Without XSAVE the FXSAVE variant is used.  With XSAVE, the XSAVEOPT
  * flavour is chosen when the CPU reports it, the plain XSAVE flavour
  * otherwise.  In both cases CPUID_STDEXT_NFPUSG selects the variant
  * that always uses the 64-bit instruction form over the one that
  * switches on SV_ILP32.
  */
 DEFINE_IFUNC(, void, fpusave, (void *))
 {
 	u_int cp[4];
 
 	init_xsave();
 	if (!use_xsave)
 		return (fpusave_fxsave);
 
 	/*
 	 * CPUID_EXTSTATE_XSAVEOPT is a bit of %eax from CPUID leaf 0xd,
 	 * sub-leaf 1, which is where fpuinit_bsp1() looks for it.  It
 	 * must not be tested against cpu_stdext_feature, which caches a
 	 * different feature word.
 	 */
 	cpuid_count(0xd, 0x1, cp);
 	if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) {
 		return ((cpu_stdext_feature & CPUID_STDEXT_NFPUSG) != 0 ?
 		    fpusave_xsaveopt64 : fpusave_xsaveopt3264);
 	}
 	return ((cpu_stdext_feature & CPUID_STDEXT_NFPUSG) != 0 ?
 	    fpusave_xsave64 : fpusave_xsave3264);
 }
 
 /*
  * Resolver for fpurestore(): FXRSTOR when XSAVE is not in use,
  * otherwise XRSTOR, always in 64-bit form if CPUID_STDEXT_NFPUSG is
  * set and switching on SV_ILP32 if it is not.
  */
 DEFINE_IFUNC(, void, fpurestore, (void *))
 {
 
 	init_xsave();
 	if (use_xsave == 0)
 		return (fpurestore_fxrstor);
 	if ((cpu_stdext_feature & CPUID_STDEXT_NFPUSG) == 0)
 		return (fpurestore_xrstor3264);
 	return (fpurestore_xrstor64);
 }
 
 /*
  * Save the FPU state into addr.  CR0.TS is cleared for the duration of
  * the save and the caller's %cr0 is put back afterwards.
  */
 void
 fpususpend(void *addr)
 {
 	u_long saved_cr0;
 
 	saved_cr0 = rcr0();
 	stop_emulating();
 	fpusave(addr);
 	load_cr0(saved_cr0);
 }
 
 /*
  * Reload the FPU state from addr.  The FPU is reinitialized first and,
  * when XSAVE is in use, XCR0 is reprogrammed with xsave_mask before the
  * restore.  The caller's %cr0 is put back afterwards.
  */
 void
 fpuresume(void *addr)
 {
 	u_long saved_cr0;
 
 	saved_cr0 = rcr0();
 	stop_emulating();
 	fninit();
 	if (use_xsave != 0)
 		load_xcr(XCR0, xsave_mask);
 	fpurestore(addr);
 	load_cr0(saved_cr0);
 }
 
 /*
  * Enable XSAVE if supported and allowed by user.
  * Calculate the xsave_mask.
  *
  * Does nothing unless use_xsave is set.  xsave_mask starts out as the
  * bitmap returned by CPUID leaf 0xd, sub-leaf 0 and is then restricted
  * by the optional "hw.xsave_mask" tunable; x87 and SSE are forced on in
  * the tunable value, so they cannot be masked out.  When the CPU
  * reports XSAVEOPT, the code at ctx_switch_xsave32 and ctx_switch_xsave
  * is patched as well.
  */
 static void
 fpuinit_bsp1(void)
 {
 	u_int cp[4];
 	uint64_t xsave_mask_user;
 	bool old_wp;
 
 	if (!use_xsave)
 		return;
 	cpuid_count(0xd, 0x0, cp);
 	/* x87 and SSE are mandatory; refuse to continue without them. */
 	xsave_mask = XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE;
 	if ((cp[0] & xsave_mask) != xsave_mask)
 		panic("CPU0 does not support X87 or SSE: %x", cp[0]);
 	/* cp[3] supplies the upper and cp[0] the lower 32 bits of the mask. */
 	xsave_mask = ((uint64_t)cp[3] << 32) | cp[0];
 	xsave_mask_user = xsave_mask;
 	TUNABLE_ULONG_FETCH("hw.xsave_mask", &xsave_mask_user);
 	xsave_mask_user |= XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE;
 	xsave_mask &= xsave_mask_user;
 	/*
 	 * The AVX512 and MPX bit groups are all-or-nothing: if any bit of
 	 * a group is missing from the mask, drop the whole group.
 	 */
 	if ((xsave_mask & XFEATURE_AVX512) != XFEATURE_AVX512)
 		xsave_mask &= ~XFEATURE_AVX512;
 	if ((xsave_mask & XFEATURE_MPX) != XFEATURE_MPX)
 		xsave_mask &= ~XFEATURE_MPX;
 
 	cpuid_count(0xd, 0x1, cp);
 	if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) {
 		/*
 		 * Patch the XSAVE instruction in the cpu_switch code
 		 * to XSAVEOPT.  We assume that XSAVE encoding used
 		 * REX byte, and set the bit 4 of the r/m byte.
 		 *
 		 * It seems that some BIOSes give control to the OS
 		 * with CR0.WP already set, making the kernel text
 		 * read-only before cpu_startup().
 		 */
 		old_wp = disable_wp();
 		ctx_switch_xsave32[3] |= 0x10;
 		ctx_switch_xsave[3] |= 0x10;
 		restore_wp(old_wp);
 	}
 }
 
 /*
  * Determine cpu_max_ext_state_size, the size of the FPU save area.
  * Without XSAVE it is the size of struct savefpu; with XSAVE it is
  * taken from CPUID leaf 0xd, sub-leaf 0.
  */
 static void
 fpuinit_bsp2(void)
 {
 	u_int regs[4];
 
 	if (!use_xsave) {
 		cpu_max_ext_state_size = sizeof(struct savefpu);
 		return;
 	}
 
 	cpuid_count(0xd, 0x0, regs);
 	cpu_max_ext_state_size = regs[1];
 
 	/*
 	 * OSXSAVE has been enabled by now, so cpu_feature2 has to be
 	 * read again.
 	 */
 	do_cpuid(1, regs);
 	cpu_feature2 = regs[2];
 }
 
 /*
  * Bring up the floating point unit of the calling CPU.  The boot
  * processor additionally computes xsave_mask before XSAVE is switched
  * on and the save area size right after.
  */
 void
 fpuinit(void)
 {
 	register_t intr_state;
 	u_int sse_csr;
 	u_short fpucw;
 
 	if (IS_BSP())
 		fpuinit_bsp1();
 
 	if (use_xsave != 0) {
 		load_cr4(rcr4() | CR4_XSAVE);
 		load_xcr(XCR0, xsave_mask);
 	}
 
 	/*
 	 * The CPU reports the save area size only once XCR0 has been
 	 * programmed, hence this step comes after the block above.
 	 */
 	if (IS_BSP())
 		fpuinit_bsp2();
 
 	/*
 	 * critical_enter() does not work on an AP this early, so block
 	 * interrupts instead.
 	 */
 	intr_state = intr_disable();
 	stop_emulating();
 	fninit();
 	fpucw = __INITIAL_FPUCW__;
 	fldcw(fpucw);
 	sse_csr = __INITIAL_MXCSR__;
 	ldmxcsr(sse_csr);
 	start_emulating();
 	intr_restore(intr_state);
 }
 
 /*
  * On the boot CPU we generate a clean state that is used to
  * initialize the floating point unit when it is first used by a
  * process.
  *
  * This also creates the UMA zone for FPU save areas, records
  * cpu_mxcsr_mask and, when XSAVE is in use, fills in xsave_area_desc
  * with the layout of the extended save area.
  */
 static void
 fpuinitstate(void *arg __unused)
 {
 	uint64_t *xstate_bv;
 	register_t saveintr;
 	u_int cp[4];		/* unsigned, matching the other cpuid users */
 	int i, max_ext_n;
 
 	/* Do potentially blocking operations before disabling interrupts. */
 	fpu_save_area_zone = uma_zcreate("FPU_save_area",
 	    cpu_max_ext_state_size, NULL, NULL, NULL, NULL,
 	    XSAVE_AREA_ALIGN - 1, 0);
 	fpu_initialstate = uma_zalloc(fpu_save_area_zone, M_WAITOK | M_ZERO);
 	if (use_xsave) {
 		/* One descriptor per component up to the highest mask bit. */
 		max_ext_n = flsl(xsave_mask);
 		xsave_area_desc = malloc(max_ext_n * sizeof(struct
 		    xsave_area_elm_descr), M_DEVBUF, M_WAITOK | M_ZERO);
 	}
 
+	cpu_thread_alloc(&thread0);
+
 	saveintr = intr_disable();
 	stop_emulating();
 
 	fpusave_fxsave(fpu_initialstate);
 	/* Use the mask the CPU stored, or 0xFFBF if it stored none. */
 	if (fpu_initialstate->sv_env.en_mxcsr_mask)
 		cpu_mxcsr_mask = fpu_initialstate->sv_env.en_mxcsr_mask;
 	else
 		cpu_mxcsr_mask = 0xFFBF;
 
 	/*
 	 * The fninit instruction does not modify XMM registers or x87
 	 * registers (MM/ST).  The fpusave call dumped the garbage
 	 * contained in the registers after reset to the initial state
 	 * saved.  Clear XMM and x87 registers file image to make the
 	 * startup program state and signal handler XMM/x87 register
 	 * content predictable.
 	 */
 	bzero(fpu_initialstate->sv_fp, sizeof(fpu_initialstate->sv_fp));
 	bzero(fpu_initialstate->sv_xmm, sizeof(fpu_initialstate->sv_xmm));
 
 	/*
 	 * Create a table describing the layout of the CPU Extended
 	 * Save Area.
 	 */
 	if (use_xsave) {
 		/* The xstate header directly follows struct savefpu. */
 		xstate_bv = (uint64_t *)((char *)(fpu_initialstate + 1) +
 		    offsetof(struct xstate_hdr, xstate_bv));
 		*xstate_bv = XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE;
 
 		/* x87 state */
 		xsave_area_desc[0].offset = 0;
 		xsave_area_desc[0].size = 160;
 		/* XMM */
 		xsave_area_desc[1].offset = 160;
 		xsave_area_desc[1].size = 288 - 160;
 
 		/* Remaining components: ask CPUID leaf 0xd, sub-leaf i. */
 		for (i = 2; i < max_ext_n; i++) {
 			cpuid_count(0xd, i, cp);
 			xsave_area_desc[i].offset = cp[1];
 			xsave_area_desc[i].size = cp[0];
 		}
 	}
 
 	start_emulating();
 	intr_restore(saveintr);
 }
 /* EFIRT needs this to be initialized before we can enter our EFI environment */
 SYSINIT(fpuinitstate, SI_SUB_CPU, SI_ORDER_ANY, fpuinitstate, NULL);
 
 /*
  * Give up the FPU if the current thread owns it: store the state into
  * curpcb->pcb_save, set CR0.TS again and clear the per-CPU owner.
  * The td argument is not consulted; curthread is used throughout.
  */
 void
 fpuexit(struct thread *td)
 {
 
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == curthread) {
 		stop_emulating();
 		fpusave(curpcb->pcb_save);
 		start_emulating();
 		PCPU_SET(fpcurthread, NULL);
 	}
 	critical_exit();
 }
 
 /* Report the FPU state format, which is always _MC_FPFMT_XMM here. */
 int
 fpuformat(void)
 {
 	int format;
 
 	format = _MC_FPFMT_XMM;
 	return (format);
 }
 
 /* 
  * The following mechanism is used to ensure that the FPE_... value
  * that is passed as a trapcode to the signal handler of the user
  * process does not have more than one bit set.
  * 
  * Multiple bits may be set if the user process modifies the control
  * word while a status word bit is already set.  While this is a sign
  * of bad coding, we have no choice but to narrow them down to one
  * bit, since we must not send a trapcode that is not exactly one of
  * the FPE_ macros.
  *
  * The mechanism has a static table with 128 entries.  Each combination
  * of the 7 FPU status word exception bits directly translates to a
  * position in this table, where a single FPE_... value is stored.
  * This FPE_... value stored there is considered the "most important"
  * of the exception bits and will be sent as the signal code.  The
  * precedence of the bits is based upon Intel Document "Numerical
  * Applications", Chapter "Special Computational Situations".
  *
  * The macro to choose one of these values does these steps: 1) Throw
  * away status word bits that cannot be masked.  2) Throw away the bits
  * currently masked in the control word, assuming the user isn't
  * interested in them anymore.  3) Reinsert status word bit 7 (stack
  * fault) if it is set, which cannot be masked but must be preserved.
  * 4) Use the remaining bits to point into the trapcode table.
  *
  * The 6 maskable bits in order of their preference, as stated in the
  * above referenced Intel manual:
  * 1  Invalid operation (FP_X_INV)
  * 1a   Stack underflow
  * 1b   Stack overflow
  * 1c   Operand of unsupported format
  * 1d   SNaN operand.
  * 2  QNaN operand (not an exception, irrelevant here)
  * 3  Any other invalid-operation not mentioned above or zero divide
  *      (FP_X_INV, FP_X_DZ)
  * 4  Denormal operand (FP_X_DNML)
  * 5  Numeric over/underflow (FP_X_OFL, FP_X_UFL)
  * 6  Inexact result (FP_X_IMP) 
  */
 static char fpetable[128] = {
 	0,
 	FPE_FLTINV,	/*  1 - INV */
 	FPE_FLTUND,	/*  2 - DNML */
 	FPE_FLTINV,	/*  3 - INV | DNML */
 	FPE_FLTDIV,	/*  4 - DZ */
 	FPE_FLTINV,	/*  5 - INV | DZ */
 	FPE_FLTDIV,	/*  6 - DNML | DZ */
 	FPE_FLTINV,	/*  7 - INV | DNML | DZ */
 	FPE_FLTOVF,	/*  8 - OFL */
 	FPE_FLTINV,	/*  9 - INV | OFL */
 	FPE_FLTUND,	/*  A - DNML | OFL */
 	FPE_FLTINV,	/*  B - INV | DNML | OFL */
 	FPE_FLTDIV,	/*  C - DZ | OFL */
 	FPE_FLTINV,	/*  D - INV | DZ | OFL */
 	FPE_FLTDIV,	/*  E - DNML | DZ | OFL */
 	FPE_FLTINV,	/*  F - INV | DNML | DZ | OFL */
 	FPE_FLTUND,	/* 10 - UFL */
 	FPE_FLTINV,	/* 11 - INV | UFL */
 	FPE_FLTUND,	/* 12 - DNML | UFL */
 	FPE_FLTINV,	/* 13 - INV | DNML | UFL */
 	FPE_FLTDIV,	/* 14 - DZ | UFL */
 	FPE_FLTINV,	/* 15 - INV | DZ | UFL */
 	FPE_FLTDIV,	/* 16 - DNML | DZ | UFL */
 	FPE_FLTINV,	/* 17 - INV | DNML | DZ | UFL */
 	FPE_FLTOVF,	/* 18 - OFL | UFL */
 	FPE_FLTINV,	/* 19 - INV | OFL | UFL */
 	FPE_FLTUND,	/* 1A - DNML | OFL | UFL */
 	FPE_FLTINV,	/* 1B - INV | DNML | OFL | UFL */
 	FPE_FLTDIV,	/* 1C - DZ | OFL | UFL */
 	FPE_FLTINV,	/* 1D - INV | DZ | OFL | UFL */
 	FPE_FLTDIV,	/* 1E - DNML | DZ | OFL | UFL */
 	FPE_FLTINV,	/* 1F - INV | DNML | DZ | OFL | UFL */
 	FPE_FLTRES,	/* 20 - IMP */
 	FPE_FLTINV,	/* 21 - INV | IMP */
 	FPE_FLTUND,	/* 22 - DNML | IMP */
 	FPE_FLTINV,	/* 23 - INV | DNML | IMP */
 	FPE_FLTDIV,	/* 24 - DZ | IMP */
 	FPE_FLTINV,	/* 25 - INV | DZ | IMP */
 	FPE_FLTDIV,	/* 26 - DNML | DZ | IMP */
 	FPE_FLTINV,	/* 27 - INV | DNML | DZ | IMP */
 	FPE_FLTOVF,	/* 28 - OFL | IMP */
 	FPE_FLTINV,	/* 29 - INV | OFL | IMP */
 	FPE_FLTUND,	/* 2A - DNML | OFL | IMP */
 	FPE_FLTINV,	/* 2B - INV | DNML | OFL | IMP */
 	FPE_FLTDIV,	/* 2C - DZ | OFL | IMP */
 	FPE_FLTINV,	/* 2D - INV | DZ | OFL | IMP */
 	FPE_FLTDIV,	/* 2E - DNML | DZ | OFL | IMP */
 	FPE_FLTINV,	/* 2F - INV | DNML | DZ | OFL | IMP */
 	FPE_FLTUND,	/* 30 - UFL | IMP */
 	FPE_FLTINV,	/* 31 - INV | UFL | IMP */
 	FPE_FLTUND,	/* 32 - DNML | UFL | IMP */
 	FPE_FLTINV,	/* 33 - INV | DNML | UFL | IMP */
 	FPE_FLTDIV,	/* 34 - DZ | UFL | IMP */
 	FPE_FLTINV,	/* 35 - INV | DZ | UFL | IMP */
 	FPE_FLTDIV,	/* 36 - DNML | DZ | UFL | IMP */
 	FPE_FLTINV,	/* 37 - INV | DNML | DZ | UFL | IMP */
 	FPE_FLTOVF,	/* 38 - OFL | UFL | IMP */
 	FPE_FLTINV,	/* 39 - INV | OFL | UFL | IMP */
 	FPE_FLTUND,	/* 3A - DNML | OFL | UFL | IMP */
 	FPE_FLTINV,	/* 3B - INV | DNML | OFL | UFL | IMP */
 	FPE_FLTDIV,	/* 3C - DZ | OFL | UFL | IMP */
 	FPE_FLTINV,	/* 3D - INV | DZ | OFL | UFL | IMP */
 	FPE_FLTDIV,	/* 3E - DNML | DZ | OFL | UFL | IMP */
 	FPE_FLTINV,	/* 3F - INV | DNML | DZ | OFL | UFL | IMP */
 	FPE_FLTSUB,	/* 40 - STK */
 	FPE_FLTSUB,	/* 41 - INV | STK */
 	FPE_FLTUND,	/* 42 - DNML | STK */
 	FPE_FLTSUB,	/* 43 - INV | DNML | STK */
 	FPE_FLTDIV,	/* 44 - DZ | STK */
 	FPE_FLTSUB,	/* 45 - INV | DZ | STK */
 	FPE_FLTDIV,	/* 46 - DNML | DZ | STK */
 	FPE_FLTSUB,	/* 47 - INV | DNML | DZ | STK */
 	FPE_FLTOVF,	/* 48 - OFL | STK */
 	FPE_FLTSUB,	/* 49 - INV | OFL | STK */
 	FPE_FLTUND,	/* 4A - DNML | OFL | STK */
 	FPE_FLTSUB,	/* 4B - INV | DNML | OFL | STK */
 	FPE_FLTDIV,	/* 4C - DZ | OFL | STK */
 	FPE_FLTSUB,	/* 4D - INV | DZ | OFL | STK */
 	FPE_FLTDIV,	/* 4E - DNML | DZ | OFL | STK */
 	FPE_FLTSUB,	/* 4F - INV | DNML | DZ | OFL | STK */
 	FPE_FLTUND,	/* 50 - UFL | STK */
 	FPE_FLTSUB,	/* 51 - INV | UFL | STK */
 	FPE_FLTUND,	/* 52 - DNML | UFL | STK */
 	FPE_FLTSUB,	/* 53 - INV | DNML | UFL | STK */
 	FPE_FLTDIV,	/* 54 - DZ | UFL | STK */
 	FPE_FLTSUB,	/* 55 - INV | DZ | UFL | STK */
 	FPE_FLTDIV,	/* 56 - DNML | DZ | UFL | STK */
 	FPE_FLTSUB,	/* 57 - INV | DNML | DZ | UFL | STK */
 	FPE_FLTOVF,	/* 58 - OFL | UFL | STK */
 	FPE_FLTSUB,	/* 59 - INV | OFL | UFL | STK */
 	FPE_FLTUND,	/* 5A - DNML | OFL | UFL | STK */
 	FPE_FLTSUB,	/* 5B - INV | DNML | OFL | UFL | STK */
 	FPE_FLTDIV,	/* 5C - DZ | OFL | UFL | STK */
 	FPE_FLTSUB,	/* 5D - INV | DZ | OFL | UFL | STK */
 	FPE_FLTDIV,	/* 5E - DNML | DZ | OFL | UFL | STK */
 	FPE_FLTSUB,	/* 5F - INV | DNML | DZ | OFL | UFL | STK */
 	FPE_FLTRES,	/* 60 - IMP | STK */
 	FPE_FLTSUB,	/* 61 - INV | IMP | STK */
 	FPE_FLTUND,	/* 62 - DNML | IMP | STK */
 	FPE_FLTSUB,	/* 63 - INV | DNML | IMP | STK */
 	FPE_FLTDIV,	/* 64 - DZ | IMP | STK */
 	FPE_FLTSUB,	/* 65 - INV | DZ | IMP | STK */
 	FPE_FLTDIV,	/* 66 - DNML | DZ | IMP | STK */
 	FPE_FLTSUB,	/* 67 - INV | DNML | DZ | IMP | STK */
 	FPE_FLTOVF,	/* 68 - OFL | IMP | STK */
 	FPE_FLTSUB,	/* 69 - INV | OFL | IMP | STK */
 	FPE_FLTUND,	/* 6A - DNML | OFL | IMP | STK */
 	FPE_FLTSUB,	/* 6B - INV | DNML | OFL | IMP | STK */
 	FPE_FLTDIV,	/* 6C - DZ | OFL | IMP | STK */
 	FPE_FLTSUB,	/* 6D - INV | DZ | OFL | IMP | STK */
 	FPE_FLTDIV,	/* 6E - DNML | DZ | OFL | IMP | STK */
 	FPE_FLTSUB,	/* 6F - INV | DNML | DZ | OFL | IMP | STK */
 	FPE_FLTUND,	/* 70 - UFL | IMP | STK */
 	FPE_FLTSUB,	/* 71 - INV | UFL | IMP | STK */
 	FPE_FLTUND,	/* 72 - DNML | UFL | IMP | STK */
 	FPE_FLTSUB,	/* 73 - INV | DNML | UFL | IMP | STK */
 	FPE_FLTDIV,	/* 74 - DZ | UFL | IMP | STK */
 	FPE_FLTSUB,	/* 75 - INV | DZ | UFL | IMP | STK */
 	FPE_FLTDIV,	/* 76 - DNML | DZ | UFL | IMP | STK */
 	FPE_FLTSUB,	/* 77 - INV | DNML | DZ | UFL | IMP | STK */
 	FPE_FLTOVF,	/* 78 - OFL | UFL | IMP | STK */
 	FPE_FLTSUB,	/* 79 - INV | OFL | UFL | IMP | STK */
 	FPE_FLTUND,	/* 7A - DNML | OFL | UFL | IMP | STK */
 	FPE_FLTSUB,	/* 7B - INV | DNML | OFL | UFL | IMP | STK */
 	FPE_FLTDIV,	/* 7C - DZ | OFL | UFL | IMP | STK */
 	FPE_FLTSUB,	/* 7D - INV | DZ | OFL | UFL | IMP | STK */
 	FPE_FLTDIV,	/* 7E - DNML | DZ | OFL | UFL | IMP | STK */
 	FPE_FLTSUB,	/* 7F - INV | DNML | DZ | OFL | UFL | IMP | STK */
 };
 
 /*
  * Read the FP status and control words, then generate si_code value
  * for SIGFPE.  The error code chosen will be one of the
  * FPE_... macros.  It will be sent as the second argument to old
  * BSD-style signal handlers and as "siginfo_t->si_code" (second
  * argument) to SA_SIGINFO signal handlers.
  *
  * Some time ago, we cleared the x87 exceptions with FNCLEX there.
  * Clearing exceptions was necessary mainly to avoid IRQ13 bugs.  The
  * usermode code which understands the FPU hardware enough to enable
  * the exceptions, can also handle clearing the exception state in the
  * handler.  The only consequence of not clearing the exception is the
  * rethrow of the SIGFPE on return from the signal handler and
  * reexecution of the corresponding instruction.
  *
  * For XMM traps, the exceptions were never cleared.
  */
 int
 fputrap_x87(void)
 {
 	struct savefpu *saved;
 	u_short control, status;
 	int selector;
 
 	critical_enter();
 
 	/*
 	 * When this thread still owns the FPU the words are read from
 	 * the hardware.  Otherwise the state has already been pushed to
 	 * memory (e.g. while handling another interrupt) and the copies
 	 * in the pcb save area are used.
 	 */
 	if (PCPU_GET(fpcurthread) == curthread) {
 		fnstcw(&control);
 		fnstsw(&status);
 	} else {
 		saved = curpcb->pcb_save;
 		control = saved->sv_env.en_cw;
 		status = saved->sv_env.en_sw;
 	}
 
 	critical_exit();
 
 	/*
 	 * Keep the exceptions not masked in the control word plus the
 	 * stack fault bit, then look up the resulting status bits.
 	 */
 	selector = (~control & 0x3f) | 0x40;
 	return (fpetable[status & selector]);
 }
 
 /*
  * Generate the si_code value for a SIMD floating point trap from
  * MXCSR, read from the hardware if the current thread owns the FPU and
  * from the pcb save area otherwise.
  */
 int
 fputrap_sse(void)
 {
 	u_int mxcsr, pending;
 
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == curthread)
 		stmxcsr(&mxcsr);
 	else
 		mxcsr = curpcb->pcb_save->sv_env.en_mxcsr;
 	critical_exit();
 
 	/*
 	 * The mask bits sit seven positions above the matching flag
 	 * bits; keep only the flags whose mask bit is clear.
 	 */
 	pending = mxcsr & (~mxcsr >> 7);
 	return (fpetable[pending & 0x3f]);
 }
 
 /*
  * Make td the FPU owner of this CPU and load its FPU state, falling
  * back to the initial state when the pcb does not hold a valid one.
  */
 static void
 restore_fpu_curthread(struct thread *td)
 {
 	struct pcb *pcb;
 
 	/*
 	 * The owner is recorded up front so that it is already in place
 	 * should the restore below trap.
 	 */
 	PCPU_SET(fpcurthread, td);
 
 	stop_emulating();
 	fpu_clean_state();
 	pcb = td->td_pcb;
 
 	if ((pcb->pcb_flags & PCB_FPUINITDONE) != 0) {
 		/* The pcb already holds a usable state. */
 		fpurestore(pcb->pcb_save);
 		return;
 	}
 
 	/*
 	 * Either the thread has not used the FPU before or its pcb does
 	 * not contain a clean state, so an initial state is loaded
 	 * explicitly.
 	 *
 	 * The state is copied into the save area of the pcb and
 	 * restored from there, rather than straight from
 	 * fpu_initialstate, in order to ignite the XSAVEOPT tracking
 	 * engine.
 	 */
 	bcopy(fpu_initialstate, pcb->pcb_save, cpu_max_ext_state_size);
 	fpurestore(pcb->pcb_save);
 	if (pcb->pcb_initial_fpucw != __INITIAL_FPUCW__)
 		fldcw(pcb->pcb_initial_fpucw);
 	set_pcb_flags(pcb, PCB_USER_FPU(pcb) ?
 	    (PCB_FPUINITDONE | PCB_USERFPUINITDONE) : PCB_FPUINITDONE);
 }
 
 /*
  * Device Not Available (DNA, #NM) exception handler.
  *
  * It would be better to switch FP context here (if curthread !=
  * fpcurthread) and not necessarily for every context switch, but it
  * is too hard to access foreign pcb's.
  */
 void
 fpudna(void)
 {
 	struct thread *self, *owner;
 
 	self = curthread;
 	/*
 	 * Interrupts are enabled on entry, so a context switch can
 	 * happen before critical_enter() takes effect.  That is
 	 * harmless: by the time we run again our state has been fully
 	 * restored.  We may be on a different CPU then, but the only
 	 * piece of our context living in the CPU is CR0.TS, which gets
 	 * "restored" by being set on the new CPU.
 	 */
 	critical_enter();
 
 	KASSERT((curpcb->pcb_flags & PCB_FPUNOSAVE) == 0,
 	    ("fpudna while in fpu_kern_enter(FPU_KERN_NOCTX)"));
 
 	owner = PCPU_GET(fpcurthread);
 	if (__predict_false(owner == self)) {
 		/*
 		 * Some virtual machines seem to set %cr0.TS at
 		 * arbitrary moments.  Quietly clear it again, whatever
 		 * the eager/lazy FPU context switch mode is.
 		 */
 		stop_emulating();
 		critical_exit();
 		return;
 	}
 
 	/* The FPU must not belong to some other thread at this point. */
 	if (__predict_false(owner != NULL)) {
 		panic(
 	    "fpudna: fpcurthread = %p (%d), curthread = %p (%d)\n",
 		    owner, owner->td_tid, self, self->td_tid);
 	}
 	restore_fpu_curthread(self);
 	critical_exit();
 }
 
 void fpu_activate_sw(struct thread *td); /* Called from the context switch */
 /*
  * Set up the FPU for td.  A kernel thread, or a thread whose pcb is not
  * in user FPU mode, gets no FPU: the per-CPU owner is cleared and
  * CR0.TS is set.  Any other thread has its state loaded unless it
  * already owns the FPU.
  */
 void
 fpu_activate_sw(struct thread *td)
 {
 
 	if ((td->td_pflags & TDP_KTHREAD) == 0 && PCB_USER_FPU(td->td_pcb)) {
 		if (PCPU_GET(fpcurthread) != td)
 			restore_fpu_curthread(td);
 		return;
 	}
 	PCPU_SET(fpcurthread, NULL);
 	start_emulating();
 }
 
 /*
  * Discard the FPU state of the current thread without saving it: clear
  * the per-CPU owner, mark the pcb as lacking an initialized state and
  * set CR0.TS.  The FPU owner must be curthread, and the asserts expect
  * a critical section.
  */
 void
 fpudrop(void)
 {
 	struct thread *owner;
 
 	owner = PCPU_GET(fpcurthread);
 	KASSERT(owner == curthread, ("fpudrop: fpcurthread != curthread"));
 	CRITICAL_ASSERT(owner);
 	PCPU_SET(fpcurthread, NULL);
 	clear_pcb_flags(owner->td_pcb, PCB_FPUINITDONE);
 	start_emulating();
 }
 
 /*
  * Get the user state of the FPU into pcb->pcb_user_save without
  * dropping ownership (if possible).  It returns the FPU ownership
  * status.
  */
 int
 fpugetregs(struct thread *td)
 {
 	struct pcb *pcb;
 	uint64_t *xstate_bv, bit;
 	char *sa;
 	int max_ext_n, i, owned;
 
 	pcb = td->td_pcb;
 	critical_enter();
 
 	/*
 	 * No user FPU state exists yet: hand out the initial state with
 	 * the control word of this pcb.
 	 */
 	if ((pcb->pcb_flags & PCB_USERFPUINITDONE) == 0) {
 		bcopy(fpu_initialstate, get_pcb_user_save_pcb(pcb),
 		    cpu_max_ext_state_size);
 		get_pcb_user_save_pcb(pcb)->sv_env.en_cw =
 		    pcb->pcb_initial_fpucw;
 		fpuuserinited(td);
 		critical_exit();
 		return (_MC_FPOWNED_PCB);
 	}
 
 	/* The registers are live only if td owns the FPU in user mode. */
 	owned = _MC_FPOWNED_PCB;
 	if (td == PCPU_GET(fpcurthread) && PCB_USER_FPU(pcb)) {
 		fpusave(get_pcb_user_save_pcb(pcb));
 		owned = _MC_FPOWNED_FPU;
 	}
 
 	if (use_xsave) {
 		/*
 		 * The save may be partial.  Every enabled component
 		 * missing from xstate_bv is filled in from the initial
 		 * state and then marked as present.
 		 */
 		sa = (char *)get_pcb_user_save_pcb(pcb);
 		xstate_bv = (uint64_t *)(sa + sizeof(struct savefpu) +
 		    offsetof(struct xstate_hdr, xstate_bv));
 		max_ext_n = flsl(xsave_mask);
 		for (i = 0; i < max_ext_n; i++) {
 			bit = 1ULL << i;
 			if ((xsave_mask & bit) != 0 &&
 			    (*xstate_bv & bit) == 0) {
 				bcopy((char *)fpu_initialstate +
 				    xsave_area_desc[i].offset,
 				    sa + xsave_area_desc[i].offset,
 				    xsave_area_desc[i].size);
 				*xstate_bv |= bit;
 			}
 		}
 	}
 	critical_exit();
 	return (owned);
 }
 
 /*
  * Record that the FPU state of td has been initialized.
  * PCB_FPUINITDONE is always set; PCB_USERFPUINITDONE is set in
  * addition when the pcb is in user FPU mode.  The assert expects a
  * critical section.
  */
 void
 fpuuserinited(struct thread *td)
 {
 	struct pcb *pcb;
 
 	CRITICAL_ASSERT(td);
 	pcb = td->td_pcb;
 	set_pcb_flags(pcb, PCB_USER_FPU(pcb) ?
 	    (PCB_FPUINITDONE | PCB_USERFPUINITDONE) : PCB_FPUINITDONE);
 }
 
 /*
  * Install the extended state in xfpustate (an xstate header followed
  * by component data, xfpustate_size bytes in total) into the user save
  * area of td.
  *
  * Returns 0 on success or when xfpustate is NULL, EOPNOTSUPP when XSAVE
  * is not in use, and EINVAL when the size is out of range or the
  * header selects components outside xsave_mask.
  */
 int
 fpusetxstate(struct thread *td, char *xfpustate, size_t xfpustate_size)
 {
 	struct xstate_hdr *src_hdr, *dst_hdr;
 	uint64_t bv;
 
 	/* XXXKIB should we clear all extended state in xstate_bv instead ? */
 	if (xfpustate == NULL)
 		return (0);
 	if (!use_xsave)
 		return (EOPNOTSUPP);
 
 	/*
 	 * The buffer must hold at least the header and must fit in the
 	 * part of the save area that follows struct savefpu.
 	 */
 	if (xfpustate_size < sizeof(struct xstate_hdr) ||
 	    xfpustate_size > cpu_max_ext_state_size - sizeof(struct savefpu))
 		return (EINVAL);
 
 	src_hdr = (struct xstate_hdr *)xfpustate;
 	bv = src_hdr->xstate_bv;
 
 	/* Rejecting components that are not enabled avoids a #gp. */
 	if ((bv & ~xsave_mask) != 0)
 		return (EINVAL);
 
 	dst_hdr = (struct xstate_hdr *)(get_pcb_user_save_td(td) + 1);
 	dst_hdr->xstate_bv = bv;
 	bcopy(xfpustate + sizeof(struct xstate_hdr), (char *)(dst_hdr + 1),
 	    xfpustate_size - sizeof(struct xstate_hdr));
 
 	return (0);
 }
 
 /*
  * Set the state of the FPU.
  */
 int
 fpusetregs(struct thread *td, struct savefpu *addr, char *xfpustate,
     size_t xfpustate_size)
 {
 	struct pcb *pcb;
 	int error;
 	bool live;
 
 	/* Drop MXCSR bits the CPU does not support. */
 	addr->sv_env.en_mxcsr &= cpu_mxcsr_mask;
 	pcb = td->td_pcb;
 	critical_enter();
 
 	/* Does td currently hold its user state in the registers? */
 	live = td == PCPU_GET(fpcurthread) && PCB_USER_FPU(pcb);
 
 	error = fpusetxstate(td, xfpustate, xfpustate_size);
 	if (error == 0) {
 		bcopy(addr, get_pcb_user_save_td(td), sizeof(*addr));
 		if (live) {
 			/* Load the new state into the registers as well. */
 			fpurestore(get_pcb_user_save_td(td));
 			set_pcb_flags(pcb, PCB_FPUINITDONE |
 			    PCB_USERFPUINITDONE);
 		} else {
 			fpuuserinited(td);
 		}
 	}
 	critical_exit();
 	return (error);
 }
 
 /*
  * On AuthenticAMD processors, the fxrstor instruction does not restore
  * the x87's stored last instruction pointer, last data pointer, and last
  * opcode values, except in the rare case in which the exception summary
  * (ES) bit in the x87 status word is set to 1.
  *
  * In order to avoid leaking this information across processes, we clean
  * these values by performing a dummy load before executing fxrstor().
  */
 static void
 fpu_clean_state(void)
 {
 	static float zero_operand = 0.0;
 	u_short sw;
 
 	/*
 	 * A pending exception summary (bit 7 of the status word) would
 	 * make the load below fault, so clear the exceptions first.
 	 */
 	fnstsw(&sw);
 	if ((sw & 0x80) != 0)
 		fnclex();
 
 	/*
 	 * Perform the dummy load.  The x87 stack is disturbed by it,
 	 * which does not matter because fxrstor() is about to be called
 	 * anyway.
 	 */
 	__asm __volatile("ffree %%st(7); flds %0" : : "m" (zero_operand));
 }
 
 /*
  * This really sucks.  We want the acpi version only, but it requires
  * the isa_if.h file in order to get the definitions.
  */
 #include "opt_isa.h"
 #ifdef DEV_ISA
 #include <isa/isavar.h>
 /*
  * This sucks up the legacy ISA support assignments from PNPBIOS/ACPI.
  */
 /* PnP IDs handed to ISA_PNP_PROBE() by fpupnp_probe(). */
 static struct isa_pnp_id fpupnp_ids[] = {
 	{ 0x040cd041, "Legacy ISA coprocessor support" }, /* PNP0C04 */
 	{ 0 }
 };
 
 /*
  * Probe method: match dev against fpupnp_ids via the parent bus.  The
  * device is made quiet when the probe result is zero or negative.
  */
 static int
 fpupnp_probe(device_t dev)
 {
 	int rv;
 
 	rv = ISA_PNP_PROBE(device_get_parent(dev), dev, fpupnp_ids);
 	if (rv <= 0)
 		device_quiet(dev);
 	return (rv);
 }
 
 /* Attach method: there is nothing to set up, so always succeed. */
 static int
 fpupnp_attach(device_t dev)
 {
 
 	return (0);
 }
 
 /*
  * Method table of the fpupnp driver.  Probe and attach are implemented
  * in this file; the remaining methods use the bus_generic_* routines.
  */
 static device_method_t fpupnp_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		fpupnp_probe),
 	DEVMETHOD(device_attach,	fpupnp_attach),
 	DEVMETHOD(device_detach,	bus_generic_detach),
 	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
 	DEVMETHOD(device_suspend,	bus_generic_suspend),
 	DEVMETHOD(device_resume,	bus_generic_resume),
 	{ 0, 0 }
 };
 
 /*
  * Driver description for "fpupnp", tying the name to fpupnp_methods.
  * NOTE(review): the third initializer is presumably the softc size;
  * confirm against the definition of driver_t.
  */
 static driver_t fpupnp_driver = {
 	"fpupnp",
 	fpupnp_methods,
 	1,			/* no softc */
 };
 
 static devclass_t fpupnp_devclass;
 
 DRIVER_MODULE(fpupnp, acpi, fpupnp_driver, fpupnp_devclass, 0, 0);
 ISA_PNP_INFO(fpupnp_ids);
 #endif	/* DEV_ISA */
 
 static MALLOC_DEFINE(M_FPUKERN_CTX, "fpukern_ctx",
     "Kernel contexts for FPU state");
 
 #define	FPU_KERN_CTX_FPUINITDONE 0x01
 #define	FPU_KERN_CTX_DUMMY	 0x02	/* avoided save for the kern thread */
 #define	FPU_KERN_CTX_INUSE	 0x04
 
 /*
  * Context used for kernel use of the FPU, see fpu_kern_enter().
  * Allocations are sized by fpu_kern_alloc_sz().
  */
 struct fpu_kern_ctx {
 	struct savefpu *prev;	/* pcb_save in effect before fpu_kern_enter() */
 	uint32_t flags;		/* FPU_KERN_CTX_* bits */
 	char hwstate1[];	/* raw storage; fpu_kern_ctx_savefpu() returns
 				   the XSAVE_AREA_ALIGN-aligned area in it */
 };
 
 /*
  * Number of bytes to allocate for a struct fpu_kern_ctx whose save area
  * holds max_est bytes: the header, the area itself and XSAVE_AREA_ALIGN
  * extra bytes of slack for aligning the area.
  */
 static inline size_t __pure2
 fpu_kern_alloc_sz(u_int max_est)
 {
 	size_t total;
 
 	total = sizeof(struct fpu_kern_ctx) + XSAVE_AREA_ALIGN + max_est;
 	return (total);
 }
 
 /*
  * Translate FPU_KERN_* flags into malloc(9) flags: M_ZERO always, plus
  * M_NOWAIT when FPU_KERN_NOWAIT is given and M_WAITOK otherwise.
  */
 static inline int __pure2
 fpu_kern_malloc_flags(u_int fpflags)
 {
 
 	if ((fpflags & FPU_KERN_NOWAIT) != 0)
 		return (M_NOWAIT | M_ZERO);
 	return (M_WAITOK | M_ZERO);
 }
 
 /*
  * Allocate a zeroed kernel FPU context, preferring memory from the
  * given domain.  flags are FPU_KERN_* flags.
  */
 struct fpu_kern_ctx *
 fpu_kern_alloc_ctx_domain(int domain, u_int flags)
 {
 	size_t sz;
 
 	sz = fpu_kern_alloc_sz(cpu_max_ext_state_size);
 	return (malloc_domainset(sz, M_FPUKERN_CTX, DOMAINSET_PREF(domain),
 	    fpu_kern_malloc_flags(flags)));
 }
 
 /* Allocate a zeroed kernel FPU context.  flags are FPU_KERN_* flags. */
 struct fpu_kern_ctx *
 fpu_kern_alloc_ctx(u_int flags)
 {
 	size_t sz;
 
 	sz = fpu_kern_alloc_sz(cpu_max_ext_state_size);
 	return (malloc(sz, M_FPUKERN_CTX, fpu_kern_malloc_flags(flags)));
 }
 
 /*
  * Release a context obtained from fpu_kern_alloc_ctx() or
  * fpu_kern_alloc_ctx_domain().  The context must not be in use, i.e. it
  * must not be between fpu_kern_enter() and fpu_kern_leave(); this is
  * asserted.  The memory is not scrubbed before being freed.
  */
 void
 fpu_kern_free_ctx(struct fpu_kern_ctx *ctx)
 {
 
 	KASSERT((ctx->flags & FPU_KERN_CTX_INUSE) == 0, ("free'ing inuse ctx"));
 	/* XXXKIB clear the memory ? */
 	free(ctx, M_FPUKERN_CTX);
 }
 
 /*
  * Return the save area embedded in ctx: the address of hwstate1 rounded
  * up to the next XSAVE_AREA_ALIGN boundary.
  */
 static struct savefpu *
 fpu_kern_ctx_savefpu(struct fpu_kern_ctx *ctx)
 {
 	vm_offset_t start;
 
 	start = roundup2((vm_offset_t)&ctx->hwstate1, XSAVE_AREA_ALIGN);
 	return ((struct savefpu *)start);
 }
 
 /*
  * Begin a region of kernel FPU use for thread td.
  *
  * Three paths exist, selected by flags:
  *  - FPU_KERN_NOCTX: no context is used.  The current FPU state is saved
  *    if this thread owns the FPU, the initial state is loaded, and the
  *    function returns while still inside the critical section it
  *    entered; fpu_kern_leave() performs the matching critical_exit().
  *  - FPU_KERN_KTHR on a thread for which is_fpu_kern_thread() is true:
  *    the context is only marked DUMMY | INUSE.
  *  - Otherwise: the pcb's save pointer is redirected to the save area
  *    inside ctx and the previous pointer is remembered in ctx->prev.
  *
  * ctx may be NULL only when FPU_KERN_NOCTX is given (asserted).
  */
 void
 fpu_kern_enter(struct thread *td, struct fpu_kern_ctx *ctx, u_int flags)
 {
 	struct pcb *pcb;
 
 	pcb = td->td_pcb;
 	KASSERT((flags & FPU_KERN_NOCTX) != 0 || ctx != NULL,
 	    ("ctx is required when !FPU_KERN_NOCTX"));
 	KASSERT(ctx == NULL || (ctx->flags & FPU_KERN_CTX_INUSE) == 0,
 	    ("using inuse ctx"));
 	KASSERT((pcb->pcb_flags & PCB_FPUNOSAVE) == 0,
 	    ("recursive fpu_kern_enter while in PCB_FPUNOSAVE state"));
 
 	if ((flags & FPU_KERN_NOCTX) != 0) {
 		/* Not paired with critical_exit() here; see fpu_kern_leave(). */
 		critical_enter();
 		stop_emulating();
 		if (curthread == PCPU_GET(fpcurthread)) {
 			/* This thread owns the FPU: save its state, drop ownership. */
 			fpusave(curpcb->pcb_save);
 			PCPU_SET(fpcurthread, NULL);
 		} else {
 			KASSERT(PCPU_GET(fpcurthread) == NULL,
 			    ("invalid fpcurthread"));
 		}
 
 		/*
 		 * This breaks XSAVEOPT tracker, but
 		 * PCB_FPUNOSAVE state is supposed to never need to
 		 * save FPU context at all.
 		 */
 		fpurestore(fpu_initialstate);
 		set_pcb_flags(pcb, PCB_KERNFPU | PCB_FPUNOSAVE |
 		    PCB_FPUINITDONE);
 		return;
 	}
 	if ((flags & FPU_KERN_KTHR) != 0 && is_fpu_kern_thread(0)) {
 		/* Nothing to save; only record that the ctx is a placeholder. */
 		ctx->flags = FPU_KERN_CTX_DUMMY | FPU_KERN_CTX_INUSE;
 		return;
 	}
 	critical_enter();
 	KASSERT(!PCB_USER_FPU(pcb) || pcb->pcb_save ==
 	    get_pcb_user_save_pcb(pcb), ("mangled pcb_save"));
 	ctx->flags = FPU_KERN_CTX_INUSE;
 	/* Remember PCB_FPUINITDONE so fpu_kern_leave() can restore it. */
 	if ((pcb->pcb_flags & PCB_FPUINITDONE) != 0)
 		ctx->flags |= FPU_KERN_CTX_FPUINITDONE;
 	fpuexit(td);
 	/* Switch the pcb to the save area inside ctx, keeping the old one. */
 	ctx->prev = pcb->pcb_save;
 	pcb->pcb_save = fpu_kern_ctx_savefpu(ctx);
 	set_pcb_flags(pcb, PCB_KERNFPU);
 	clear_pcb_flags(pcb, PCB_FPUINITDONE);
 	critical_exit();
 }
 
 /*
  * End a region of kernel FPU use started by fpu_kern_enter().
  *
  * If the pcb is in PCB_FPUNOSAVE state (entered with FPU_KERN_NOCTX), ctx
  * must be NULL and the caller must still be inside the critical section
  * that fpu_kern_enter() left open; it is closed here.  Otherwise ctx is
  * marked no longer in use, and unless it is a DUMMY context of an FPU
  * kernel thread, the pcb's save pointer is restored from ctx->prev.
  * The pcb flags are then brought back in line with the restored save
  * area.
  *
  * Always returns 0.
  */
 int
 fpu_kern_leave(struct thread *td, struct fpu_kern_ctx *ctx)
 {
 	struct pcb *pcb;
 
 	pcb = td->td_pcb;
 
 	if ((pcb->pcb_flags & PCB_FPUNOSAVE) != 0) {
 		KASSERT(ctx == NULL, ("non-null ctx after FPU_KERN_NOCTX"));
 		KASSERT(PCPU_GET(fpcurthread) == NULL,
 		    ("non-NULL fpcurthread for PCB_FPUNOSAVE"));
 		CRITICAL_ASSERT(td);
 
 		clear_pcb_flags(pcb,  PCB_FPUNOSAVE | PCB_FPUINITDONE);
 		start_emulating();
 	} else {
 		KASSERT((ctx->flags & FPU_KERN_CTX_INUSE) != 0,
 		    ("leaving not inuse ctx"));
 		ctx->flags &= ~FPU_KERN_CTX_INUSE;
 
 		/* DUMMY context: fpu_kern_enter() changed nothing else. */
 		if (is_fpu_kern_thread(0) &&
 		    (ctx->flags & FPU_KERN_CTX_DUMMY) != 0)
 			return (0);
 		KASSERT((ctx->flags & FPU_KERN_CTX_DUMMY) == 0,
 		    ("dummy ctx"));
 		critical_enter();
 		if (curthread == PCPU_GET(fpcurthread))
 			fpudrop();
 		pcb->pcb_save = ctx->prev;
 	}
 
 	if (pcb->pcb_save == get_pcb_user_save_pcb(pcb)) {
 		/* Back on the user save area. */
 		if ((pcb->pcb_flags & PCB_USERFPUINITDONE) != 0) {
 			set_pcb_flags(pcb, PCB_FPUINITDONE);
 			if ((pcb->pcb_flags & PCB_KERNFPU_THR) == 0)
 				clear_pcb_flags(pcb, PCB_KERNFPU);
 		} else if ((pcb->pcb_flags & PCB_KERNFPU_THR) == 0)
 			clear_pcb_flags(pcb, PCB_FPUINITDONE | PCB_KERNFPU);
 	} else {
 		/* Still on another save area: restore the recorded init bit. */
 		if ((ctx->flags & FPU_KERN_CTX_FPUINITDONE) != 0)
 			set_pcb_flags(pcb, PCB_FPUINITDONE);
 		else
 			clear_pcb_flags(pcb, PCB_FPUINITDONE);
 		KASSERT(!PCB_USER_FPU(pcb), ("unpaired fpu_kern_leave"));
 	}
 	/* Closes the section opened above or left open by fpu_kern_enter(). */
 	critical_exit();
 	return (0);
 }
 
 /*
  * Mark the current kernel thread as an FPU kernel thread by setting
  * PCB_KERNFPU | PCB_KERNFPU_THR in its pcb.  The caller must be a
  * kthread whose pcb still points at its user save area, and the call
  * must not be repeated (all asserted).  The flags argument is not used.
  *
  * Always returns 0.
  */
 int
 fpu_kern_thread(u_int flags)
 {
 
 	KASSERT((curthread->td_pflags & TDP_KTHREAD) != 0,
 	    ("Only kthread may use fpu_kern_thread"));
 	KASSERT(curpcb->pcb_save == get_pcb_user_save_pcb(curpcb),
 	    ("mangled pcb_save"));
 	KASSERT(PCB_USER_FPU(curpcb), ("recursive call"));
 
 	set_pcb_flags(curpcb, PCB_KERNFPU | PCB_KERNFPU_THR);
 	return (0);
 }
 
 /*
  * Report whether the current thread is a kernel thread that has
  * PCB_KERNFPU_THR set in its pcb.  Returns 1 if so and 0 otherwise.
  * The flags argument is not used.
  */
 int
 is_fpu_kern_thread(u_int flags)
 {
 
 	return ((curthread->td_pflags & TDP_KTHREAD) != 0 &&
 	    (curpcb->pcb_flags & PCB_KERNFPU_THR) != 0);
 }
 
 /*
  * FPU save area alloc/free/init utility routines
  */
 /*
  * Allocate one FPU save area from fpu_save_area_zone with M_WAITOK.
  */
 struct savefpu *
 fpu_save_area_alloc(void)
 {
 	struct savefpu *fsa;
 
 	fsa = uma_zalloc(fpu_save_area_zone, M_WAITOK);
 	return (fsa);
 }
 
 /*
  * Return a save area to fpu_save_area_zone.
  */
 void
 fpu_save_area_free(struct savefpu *fsa)
 {
 
 	uma_zfree(fpu_save_area_zone, fsa);
 }
 
 /*
  * Overwrite the save area with a copy of fpu_initialstate.  Exactly
  * cpu_max_ext_state_size bytes are copied, so fsa must be at least that
  * large.
  */
 void
 fpu_save_area_reset(struct savefpu *fsa)
 {
 
 	bcopy(fpu_initialstate, fsa, cpu_max_ext_state_size);
 }
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index d4e2356a9ae1..5c9b64526609 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1,1908 +1,1894 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_atpic.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_mp_watchdog.h"
 #include "opt_pci.h"
 #include "opt_platform.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/asan.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/csan.h>
 #include <sys/efi.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/msan.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/reg.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_dumpset.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <net/netisr.h>
 
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/mca.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/mp_watchdog.h>
 #include <machine/pc/bios.h>
 #include <machine/pcb.h>
 #include <machine/proc.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #include <machine/trap.h>
 #include <machine/tss.h>
 #include <x86/ucode.h>
 #include <x86/ifunc.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #ifdef FDT
 #include <x86/fdt.h>
 #endif
 
 #ifdef DEV_ATPIC
 #include <x86/isa/icu.h>
 #else
 #include <x86/apicvar.h>
 #endif
 
 #include <isa/isareg.h>
 #include <isa/rtc.h>
 #include <x86/init.h>
 
 /*
  * Sanity check for __curthread(): pc_curthread must be the first member
  * of struct pcpu.
  */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
 /*
  * The PTI trampoline stack needs enough space for a hardware trapframe and a
  * couple of scratch registers, as well as the trapframe left behind after an
  * iret fault.
  */
 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
     offsetof(struct pti_frame, pti_rip));
 
 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
 
 /* cpu_startup() is run via SYSINIT at SI_SUB_CPU, SI_ORDER_FIRST. */
 static void cpu_startup(void *);
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 /* Preload data parse function */
 static caddr_t native_parse_preload_data(u_int64_t);
 
 /* Native function to fetch and parse the e820 map */
 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
 
 /*
  * Default init_ops implementation: the native preload and memory-map
  * parsers plus the i8254 routines for the early clock source and delay.
  */
 struct init_ops init_ops = {
 	.parse_preload_data =	native_parse_preload_data,
 	.early_clock_source_init =	i8254_init,
 	.early_delay =			i8254_delay,
 	.parse_memmap =			native_parse_memmap,
 };
 
 /*
  * Physical address of the EFI System Table. Stashed from the metadata hints
  * passed into the kernel and used by the EFI code to call runtime services.
  */
 vm_paddr_t efi_systbl_phys;
 
 /*
  * Intel ICH registers.  ICH_SMI_EN is an I/O port address computed from
  * ICH_PMBASE; the expansion is parenthesized so that it stays a single
  * operand wherever the macro is used.
  */
 #define ICH_PMBASE	0x400
 #define ICH_SMI_EN	(ICH_PMBASE + 0x30)
 
 /* NOTE(review): presumably user-mode segment selectors, going by the names. */
 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
 
 /* Starts out as 1. */
 int cold = 1;
 
 /* Memory sizes in pages: cpu_startup() converts them with ptoa()/atop(). */
 long Maxmem = 0;
 long realmem = 0;
 
 /* Passed to vm_ksubmap_init() from cpu_startup(). */
 struct kva_md_info kmi;
 
 static struct trapframe proc0_tf;
 struct region_descriptor r_idt;
 
 struct pcpu *__pcpu;
 struct pcpu temp_bsp_pcpu;
 
 struct mtx icu_lock;
 
 struct mem_range_softc mem_range_softc;
 
 struct mtx dt_lock;	/* lock for GDT and LDT */
 
 void (*vmm_resume_p)(void);
 
 bool efi_boot;
 
 static void
 cpu_startup(dummy)
 	void *dummy;
 {
 	uintmax_t memsize;
 	char *sysenv;
 
 	/*
 	 * On MacBooks, we need to disallow the legacy USB circuit to
 	 * generate an SMI# because this can cause several problems,
 	 * namely: incorrect CPU frequency detection and failure to
 	 * start the APs.
 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
 	 * Enable register) of the Intel ICH LPC Interface Bridge. 
 	 */
 	sysenv = kern_getenv("smbios.system.product");
 	if (sysenv != NULL) {
 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
 			if (bootverbose)
 				printf("Disabling LEGACY_USB_EN bit on "
 				    "Intel ICH.\n");
 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
 		}
 		freeenv(sysenv);
 	}
 
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
 	startrtclock();
 	printcpuinfo();
 
 	/*
 	 * Display physical memory if SMBIOS reports reasonable amount.
 	 */
 	memsize = 0;
 	sysenv = kern_getenv("smbios.memory.enabled");
 	if (sysenv != NULL) {
 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
 		freeenv(sysenv);
 	}
 	if (memsize < ptoa((uintmax_t)vm_free_count()))
 		memsize = ptoa((uintmax_t)Maxmem);
 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
 	realmem = atop(memsize);
 
 	/*
 	 * Display any holes after the first chunk of extended memory.
 	 */
 	if (bootverbose) {
 		int indx;
 
 		printf("Physical memory chunk(s):\n");
 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
 			vm_paddr_t size;
 
 			size = phys_avail[indx + 1] - phys_avail[indx];
 			printf(
 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
 			    (uintmax_t)phys_avail[indx],
 			    (uintmax_t)phys_avail[indx + 1] - 1,
 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ju (%ju MB)\n",
 	    ptoa((uintmax_t)vm_free_count()),
 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
 #ifdef DEV_PCI
 	if (bootverbose && intel_graphics_stolen_base != 0)
 		printf("intel stolen mem: base %#jx size %ju MB\n",
 		    (uintmax_t)intel_graphics_stolen_base,
 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
 #endif
 
 	/*
 	 * Set up buffers, so they can be used to read disk labels.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 
 	cpu_setregs();
 }
 
 /*
  * SYSINIT hook that calls link_elf_late_ireloc().  It is registered at
  * SI_SUB_CPU with SI_ORDER_ANY; cpu_startup() uses the same subsystem
  * with SI_ORDER_FIRST.
  */
 static void
 late_ifunc_resolve(void *dummy __unused)
 {
 	link_elf_late_ireloc();
 }
 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
 
 
 /*
  * Turn on the MP, NE, TS, WP and AM bits in %cr0, leaving every other
  * bit as it was.
  */
 void
 cpu_setregs(void)
 {
 
 	/*
 	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
 	 * BSP; the reasons for setting them are described there.
 	 */
 	load_cr0(rcr0() | CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM);
 }
 
 /*
  * Initialize amd64 and configure to run kernel
  */
 
 /*
  * Initialize segments & interrupt table
  */
 /* Static IDT storage; idt initially points at its first entry. */
 static struct gate_descriptor idt0[NIDT];
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 
 /*
  * 16-byte-aligned stacks; the names suggest one each for the double
  * fault, machine check, NMI and debug handlers.
  */
 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
 CTASSERT(sizeof(struct nmi_pcpu) == 16);
 
 /*
  * Software prototypes -- in more palatable form.
  *
  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
  * slots as corresponding segments for i386 kernel.
  */
 struct soft_segment_descriptor gdt_segs[] = {
 /* GNULL_SEL	0 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GNULL2_SEL	1 Null Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUFS32_SEL	2 32 bit %fs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUGS32_SEL	3 32 bit %gs Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GCODE_SEL	4 Code Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GDATA_SEL	5 Data Descriptor for kernel */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMRWA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 1,
 	.ssd_gran = 1		},
 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0xfffff,
 	.ssd_type = SDT_MEMERA,
 	.ssd_dpl = SEL_UPL,
 	.ssd_p = 1,
 	.ssd_long = 1,
 	.ssd_def32 = 0,
 	.ssd_gran = 1		},
 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
 	.ssd_type = SDT_SYSTSS,
 	.ssd_dpl = SEL_KPL,
 	.ssd_p = 1,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* Slot 10: actually, the TSS is a system descriptor which is double size */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* GUSERLDT_SEL	11 LDT Descriptor */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 /* Slot 12: second half of the double-size LDT descriptor (GUSERLDT_SEL) */
 {	.ssd_base = 0x0,
 	.ssd_limit = 0x0,
 	.ssd_type = 0,
 	.ssd_dpl = 0,
 	.ssd_p = 0,
 	.ssd_long = 0,
 	.ssd_def32 = 0,
 	.ssd_gran = 0		},
 };
 /* The table above must have exactly NGDT entries. */
 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
 
 /*
  * Fill in IDT entry idx so that it points at handler func.
  *
  * The handler address is split across gd_looffset and gd_hioffset
  * (address >> 16).  The selector is always the kernel code segment,
  * the entry is marked present, and typ, dpl and ist are stored in the
  * corresponding gate fields.
  */
 void
 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
 {
 	struct gate_descriptor *gate;
 	uintptr_t handler;
 
 	gate = &idt[idx];
 	handler = (uintptr_t)func;
 
 	gate->gd_looffset = handler;
 	gate->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
 	gate->gd_ist = ist;
 	gate->gd_xx = 0;
 	gate->gd_type = typ;
 	gate->gd_dpl = dpl;
 	gate->gd_p = 1;
 	gate->gd_hioffset = handler >> 16;
 }
 
 /*
  * Entry points declared through IDTVEC() and defined outside this part
  * of the file.  NOTE(review): the *_pti names are presumably the
  * page-table-isolation variants of the same handlers -- confirm.
  */
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm), IDTVEC(dblfault),
 	IDTVEC(div_pti), IDTVEC(bpt_pti),
 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
 	IDTVEC(xmm_pti),
 #ifdef KDTRACE_HOOKS
 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
 #endif
 #ifdef XENHVM
 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
 #endif
 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
 	IDTVEC(fast_syscall_pti);
 
 #ifdef DDB
 /*
  * Display the index and function name of any IDT entries that don't use
  * the default 'rsvd' entry point.
  */
 DB_SHOW_COMMAND(idt, db_show_idt)
 {
 	struct gate_descriptor *ip;
 	int idx;
 	uintptr_t func;
 
 	ip = idt;
 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
 			db_printf("%3d\t", idx);
 			db_printsym(func, DB_STGY_PROC);
 			db_printf("\n");
 		}
 		ip++;
 	}
 }
 
 /* Show privileged registers. */
 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
 {
 	/* In-memory layout stored by sidt/sgdt: 16-bit limit, 64-bit base. */
 	struct {
 		uint16_t limit;
 		uint64_t base;
 	} __packed idtr, gdtr;
 	uint16_t ldt, tr;
 
 	__asm __volatile("sidt %0" : "=m" (idtr));
 	db_printf("idtr\t0x%016lx/%04x\n",
 	    (u_long)idtr.base, (u_int)idtr.limit);
 	__asm __volatile("sgdt %0" : "=m" (gdtr));
 	db_printf("gdtr\t0x%016lx/%04x\n",
 	    (u_long)gdtr.base, (u_int)gdtr.limit);
 	__asm __volatile("sldt %0" : "=r" (ldt));
 	db_printf("ldtr\t0x%04x\n", ldt);
 	__asm __volatile("str %0" : "=r" (tr));
 	db_printf("tr\t0x%04x\n", tr);
 	db_printf("cr0\t0x%016lx\n", rcr0());
 	db_printf("cr2\t0x%016lx\n", rcr2());
 	db_printf("cr3\t0x%016lx\n", rcr3());
 	db_printf("cr4\t0x%016lx\n", rcr4());
 	/* xcr0 is only read when CR4_XSAVE is set in %cr4. */
 	if (rcr4() & CR4_XSAVE)
 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
 	/* The feature-control MSR is only read on CPUs reporting VMX or SMX. */
 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
 		db_printf("FEATURES_CTL\t0x%016lx\n",
 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
 }
 
 /* Show debug registers dr0-dr3, dr6 and dr7. */
 DB_SHOW_COMMAND(dbregs, db_show_dbregs)
 {
 
 	db_printf("dr0\t0x%016lx\n", rdr0());
 	db_printf("dr1\t0x%016lx\n", rdr1());
 	db_printf("dr2\t0x%016lx\n", rdr2());
 	db_printf("dr3\t0x%016lx\n", rdr3());
 	db_printf("dr6\t0x%016lx\n", rdr6());
 	db_printf("dr7\t0x%016lx\n", rdr7());
 }
 #endif
 
 void
 sdtossd(sd, ssd)
 	struct user_segment_descriptor *sd;
 	struct soft_segment_descriptor *ssd;
 {
 
 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
 	ssd->ssd_type  = sd->sd_type;
 	ssd->ssd_dpl   = sd->sd_dpl;
 	ssd->ssd_p     = sd->sd_p;
 	ssd->ssd_long  = sd->sd_long;
 	ssd->ssd_def32 = sd->sd_def32;
 	ssd->ssd_gran  = sd->sd_gran;
 }
 
 /*
  * Convert a software segment descriptor (ssd) into a hardware user
  * segment descriptor (sd).  The base is split into its low 24 bits and
  * next 8 bits, the limit into its low 16 bits and next 4 bits; the
  * remaining attributes are copied one to one.
  */
 void
 ssdtosd(struct soft_segment_descriptor *ssd,
     struct user_segment_descriptor *sd)
 {
 
 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 	sd->sd_type  = ssd->ssd_type;
 	sd->sd_dpl   = ssd->ssd_dpl;
 	sd->sd_p     = ssd->ssd_p;
 	sd->sd_long  = ssd->ssd_long;
 	sd->sd_def32 = ssd->ssd_def32;
 	sd->sd_gran  = ssd->ssd_gran;
 }
 
 /*
  * Convert a software segment descriptor (ssd) into a hardware system
  * segment descriptor (sd).  Unlike ssdtosd(), the high part of the base
  * keeps 40 bits, and ssd_long/ssd_def32 are not copied.
  */
 void
 ssdtosyssd(struct soft_segment_descriptor *ssd,
     struct system_segment_descriptor *sd)
 {
 
 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 	sd->sd_type  = ssd->ssd_type;
 	sd->sd_dpl   = ssd->ssd_dpl;
 	sd->sd_p     = ssd->ssd_p;
 	sd->sd_gran  = ssd->ssd_gran;
 }
 
 /* Size of the base memory segment in KB; computed in getmemsize(). */
 u_int basemem;
 
 /*
  * Insert the region [base, base + length) into the sorted physmap array
  * of base/bound pairs.
  *
  * *physmap_idxp is the index of the next free slot; it is advanced by 2
  * only when a new pair is actually stored.  A region adjacent to an
  * existing pair is merged into it instead.  Empty regions and regions
  * overlapping an existing pair are ignored.
  *
  * Returns 1 when the caller may keep adding regions and 0 when the
  * array is full.  In the latter case neither the array nor
  * *physmap_idxp is modified, so the caller never sees a count that
  * includes an entry which was not stored.
  */
 static int
 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
     int *physmap_idxp)
 {
 	int i, insert_idx, physmap_idx;
 
 	physmap_idx = *physmap_idxp;
 
 	if (length == 0)
 		return (1);
 
 	/*
 	 * Find insertion point while checking for overlap.  Start off by
 	 * assuming the new entry will be added to the end.
 	 *
 	 * NB: physmap_idx points to the next free slot.
 	 */
 	insert_idx = physmap_idx;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (base < physmap[i + 1]) {
 			if (base + length <= physmap[i]) {
 				insert_idx = i;
 				break;
 			}
 			if (boothowto & RB_VERBOSE)
 				printf(
 		    "Overlapping memory regions, ignoring second region\n");
 			return (1);
 		}
 	}
 
 	/* See if we can prepend to the next entry. */
 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
 		physmap[insert_idx] = base;
 		return (1);
 	}
 
 	/* See if we can append to the previous entry. */
 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
 		physmap[insert_idx - 1] += length;
 		return (1);
 	}
 
 	/*
 	 * A new pair is needed.  Check for room before publishing the new
 	 * count; the last pair of the array is never filled in.
 	 */
 	if (physmap_idx + 2 >= PHYS_AVAIL_ENTRIES) {
 		printf(
 		"Too many segments in the physical address map, giving up\n");
 		return (0);
 	}
 	physmap_idx += 2;
 	*physmap_idxp = physmap_idx;
 
 	/*
 	 * Move the last 'N' entries down to make room for the new
 	 * entry if needed.
 	 */
 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
 		physmap[i] = physmap[i - 2];
 		physmap[i + 1] = physmap[i - 1];
 	}
 
 	/* Insert the new entry. */
 	physmap[insert_idx] = base;
 	physmap[insert_idx + 1] = base + length;
 	return (1);
 }
 
 /*
  * Walk a BIOS SMAP table of smapsize bytes starting at smapbase and add
  * every SMAP_TYPE_MEMORY entry to physmap.  With RB_VERBOSE set, each
  * entry is printed regardless of type.  The walk stops early once
  * add_physmap_entry() reports failure.
  */
 void
 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
     vm_paddr_t *physmap, int *physmap_idx)
 {
 	struct bios_smap *cur, *limit;
 
 	limit = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 	cur = smapbase;
 	while (cur < limit) {
 		if (boothowto & RB_VERBOSE)
 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
 			    cur->type, cur->base, cur->length);
 
 		if (cur->type == SMAP_TYPE_MEMORY &&
 		    !add_physmap_entry(cur->base, cur->length, physmap,
 		    physmap_idx))
 			break;
 		cur++;
 	}
 }
 
 /*
  * Walk the UEFI memory map that follows efihdr and add every usable
  * descriptor to physmap.  With RB_VERBOSE set, a table of all
  * descriptors (type, addresses, page count, attributes) is printed.
  * Nothing is done when the header reports a descriptor size of zero.
  * The walk stops early once add_physmap_entry() reports failure.
  */
 static void
 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
     int *physmap_idx)
 {
 	struct efi_md *map, *p;
 	const char *type;
 	size_t efisz;
 	int ndesc, i;
 
 	/* Printable names, indexed by md_type. */
 	static const char *types[] = {
 		"Reserved",
 		"LoaderCode",
 		"LoaderData",
 		"BootServicesCode",
 		"BootServicesData",
 		"RuntimeServicesCode",
 		"RuntimeServicesData",
 		"ConventionalMemory",
 		"UnusableMemory",
 		"ACPIReclaimMemory",
 		"ACPIMemoryNVS",
 		"MemoryMappedIO",
 		"MemoryMappedIOPortSpace",
 		"PalCode",
 		"PersistentMemory"
 	};
 
 	/*
 	 * Memory map data provided by UEFI via the GetMemoryMap
 	 * Boot Services API.
 	 *
 	 * The descriptors start after the header, whose size is rounded up
 	 * to a multiple of 16 bytes.
 	 */
 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
 
 	/* Avoid dividing by zero below. */
 	if (efihdr->descriptor_size == 0)
 		return;
 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
 
 	if (boothowto & RB_VERBOSE)
 		printf("%23s %12s %12s %8s %4s\n",
 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
 
 	/* Step by the descriptor size from the header, not sizeof(*p). */
 	for (i = 0, p = map; i < ndesc; i++,
 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
 		if (boothowto & RB_VERBOSE) {
 			if (p->md_type < nitems(types))
 				type = types[p->md_type];
 			else
 				type = "<INVALID>";
 			printf("%23s %012lx %012lx %08lx ", type, p->md_phys,
 			    p->md_virt, p->md_pages);
 			if (p->md_attr & EFI_MD_ATTR_UC)
 				printf("UC ");
 			if (p->md_attr & EFI_MD_ATTR_WC)
 				printf("WC ");
 			if (p->md_attr & EFI_MD_ATTR_WT)
 				printf("WT ");
 			if (p->md_attr & EFI_MD_ATTR_WB)
 				printf("WB ");
 			if (p->md_attr & EFI_MD_ATTR_UCE)
 				printf("UCE ");
 			if (p->md_attr & EFI_MD_ATTR_WP)
 				printf("WP ");
 			if (p->md_attr & EFI_MD_ATTR_RP)
 				printf("RP ");
 			if (p->md_attr & EFI_MD_ATTR_XP)
 				printf("XP ");
 			if (p->md_attr & EFI_MD_ATTR_NV)
 				printf("NV ");
 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
 				printf("MORE_RELIABLE ");
 			if (p->md_attr & EFI_MD_ATTR_RO)
 				printf("RO ");
 			if (p->md_attr & EFI_MD_ATTR_RT)
 				printf("RUNTIME");
 			printf("\n");
 		}
 
 		switch (p->md_type) {
 		case EFI_MD_TYPE_CODE:
 		case EFI_MD_TYPE_DATA:
 		case EFI_MD_TYPE_BS_CODE:
 		case EFI_MD_TYPE_BS_DATA:
 		case EFI_MD_TYPE_FREE:
 			/*
 			 * We're allowed to use any entry with these types.
 			 */
 			break;
 		default:
 			/* Every other type is skipped. */
 			continue;
 		}
 
 		/*
 		 * md_pages is converted to bytes using PAGE_SIZE.
 		 * NOTE(review): assumes the EFI page size equals PAGE_SIZE.
 		 */
 		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
 		    physmap, physmap_idx))
 			break;
 	}
 }
 
 /*
  * Build physmap from the memory map the loader passed in the preload
  * metadata.  An EFI map is preferred when present; otherwise the BIOS
  * SMAP is used.  bootmethod is set to "UEFI" or "BIOS" accordingly.
  * Panics when the loader supplied neither map.
  */
 static void
 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
 {
 	struct efi_map_header *efi_map;
 	struct bios_smap *bios_map;
 	u_int32_t bios_map_sz;
 
 	efi_map = (struct efi_map_header *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
 	bios_map = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 
 	if (efi_map != NULL) {
 		add_efi_map_entries(efi_map, physmap, physmap_idx);
 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
 		return;
 	}
 	if (bios_map == NULL)
 		panic("No BIOS smap or EFI map info from loader!");
 
 	/*
 	 * Memory map from INT 15:E820.
 	 *
 	 * subr_module.c says:
 	 * "Consumer may safely assume that size value precedes data."
 	 * ie: an int32_t immediately precedes smap.
 	 */
 	bios_map_sz = *((u_int32_t *)bios_map - 1);
 	bios_add_smap_entries(bios_map, bios_map_sz, physmap, physmap_idx);
 	strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
 }
 
 /* Number of PAGE_SIZE pages in one gigabyte (2^30 bytes). */
 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
 
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
  * build the phys_avail array describing the actually-available memory.
  *
  * Total memory size may be set by the kernel environment variable
  * hw.physmem or the compile-time define MAXMEM.
  *
  * XXX first should be vm_paddr_t.
  */
 static void
 getmemsize(caddr_t kmdp, u_int64_t first)
 {
 	int i, physmap_idx, pa_indx, da_indx;
 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
 	u_long physmem_start, physmem_tunable, memtest;
 	pt_entry_t *pte;
 	quad_t dcons_addr, dcons_size;
 	int page_counter;
 
 	/*
 	 * Tell the physical memory allocator about pages used to store
 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
 	 */
 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
 
 	bzero(physmap, sizeof(physmap));
 	physmap_idx = 0;
 
 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
 	physmap_idx -= 2;
 
 	/*
 	 * Find the 'base memory' segment for SMP
 	 */
 	basemem = 0;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (physmap[i] <= 0xA0000) {
 			basemem = physmap[i + 1] / 1024;
 			break;
 		}
 	}
 	if (basemem == 0 || basemem > 640) {
 		if (bootverbose)
 			printf(
 		"Memory map doesn't contain a basemem segment, faking it");
 		basemem = 640;
 	}
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".  We may adjust this
 	 * based on ``hw.physmem'' and the results of the memory test.
 	 */
 	Maxmem = atop(physmap[physmap_idx + 1]);
 
 #ifdef MAXMEM
 	Maxmem = MAXMEM / 4;
 #endif
 
 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
 	/*
 	 * The boot memory test is disabled by default, as it takes a
 	 * significant amount of time on large-memory systems, and is
 	 * unfriendly to virtual machines as it unnecessarily touches all
 	 * pages.
 	 *
 	 * A general name is used as the code may be extended to support
 	 * additional tests beyond the current "page present" test.
 	 */
 	memtest = 0;
 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
 
 	/*
 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
 	 * in the system.
 	 */
 	if (Maxmem > atop(physmap[physmap_idx + 1]))
 		Maxmem = atop(physmap[physmap_idx + 1]);
 
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(&first);
 
 	/*
 	 * Size up each available chunk of physical memory.
 	 *
 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
 	 * By default, mask off the first 16 pages unless we appear to be
 	 * running in a VM.
 	 */
 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
 	if (physmap[0] < physmem_start) {
 		if (physmem_start < PAGE_SIZE)
 			physmap[0] = PAGE_SIZE;
 		else if (physmem_start >= physmap[1])
 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
 		else
 			physmap[0] = round_page(physmem_start);
 	}
 	pa_indx = 0;
 	da_indx = 1;
 	phys_avail[pa_indx++] = physmap[0];
 	phys_avail[pa_indx] = physmap[0];
 	dump_avail[da_indx] = physmap[0];
 	pte = CMAP1;
 
 	/*
 	 * Get dcons buffer address
 	 */
 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 	    getenv_quad("dcons.size", &dcons_size) == 0)
 		dcons_addr = 0;
 
 	/*
 	 * physmap is in bytes, so when converting to page boundaries,
 	 * round up the start address and round down the end address.
 	 */
 	page_counter = 0;
 	if (memtest != 0)
 		printf("Testing system memory");
 	for (i = 0; i <= physmap_idx; i += 2) {
 		vm_paddr_t end;
 
 		end = ptoa((vm_paddr_t)Maxmem);
 		if (physmap[i + 1] < end)
 			end = trunc_page(physmap[i + 1]);
 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 			int tmp, page_bad, full;
 			int *ptr = (int *)CADDR1;
 
 			full = FALSE;
 			/*
 			 * block out kernel memory as not available.
 			 */
 			if (pa >= (vm_paddr_t)kernphys && pa < first)
 				goto do_dump_avail;
 
 			/*
 			 * block out dcons buffer
 			 */
 			if (dcons_addr > 0
 			    && pa >= trunc_page(dcons_addr)
 			    && pa < dcons_addr + dcons_size)
 				goto do_dump_avail;
 
 			page_bad = FALSE;
 			if (memtest == 0)
 				goto skip_memtest;
 
 			/*
 			 * Print a "." every GB to show we're making
 			 * progress.
 			 */
 			page_counter++;
 			if ((page_counter % PAGES_PER_GB) == 0)
 				printf(".");
 
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
 			invltlb();
 
 			tmp = *(int *)ptr;
 			/*
 			 * Test for alternating 1's and 0's
 			 */
 			*(volatile int *)ptr = 0xaaaaaaaa;
 			if (*(volatile int *)ptr != 0xaaaaaaaa)
 				page_bad = TRUE;
 			/*
 			 * Test for alternating 0's and 1's
 			 */
 			*(volatile int *)ptr = 0x55555555;
 			if (*(volatile int *)ptr != 0x55555555)
 				page_bad = TRUE;
 			/*
 			 * Test for all 1's
 			 */
 			*(volatile int *)ptr = 0xffffffff;
 			if (*(volatile int *)ptr != 0xffffffff)
 				page_bad = TRUE;
 			/*
 			 * Test for all 0's
 			 */
 			*(volatile int *)ptr = 0x0;
 			if (*(volatile int *)ptr != 0x0)
 				page_bad = TRUE;
 			/*
 			 * Restore original value.
 			 */
 			*(int *)ptr = tmp;
 
 skip_memtest:
 			/*
 			 * Adjust array of valid/good pages.
 			 */
 			if (page_bad == TRUE)
 				continue;
 			/*
 			 * If this good page is a continuation of the
 			 * previous set of good pages, then just increase
 			 * the end pointer. Otherwise start a new chunk.
 			 * Note that "end" points one higher than end,
 			 * making the range >= start and < end.
 			 * If we're also doing a speculative memory
 			 * test and we at or past the end, bump up Maxmem
 			 * so that we keep going. The first bad page
 			 * will terminate the loop.
 			 */
 			if (phys_avail[pa_indx] == pa) {
 				phys_avail[pa_indx] += PAGE_SIZE;
 			} else {
 				pa_indx++;
 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
 					printf(
 		"Too many holes in the physical address space, giving up\n");
 					pa_indx--;
 					full = TRUE;
 					goto do_dump_avail;
 				}
 				phys_avail[pa_indx++] = pa;	/* start */
 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 			}
 			physmem++;
 do_dump_avail:
 			if (dump_avail[da_indx] == pa) {
 				dump_avail[da_indx] += PAGE_SIZE;
 			} else {
 				da_indx++;
 				if (da_indx == PHYS_AVAIL_ENTRIES) {
 					da_indx--;
 					goto do_next;
 				}
 				dump_avail[da_indx++] = pa; /* start */
 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 			}
 do_next:
 			if (full)
 				break;
 		}
 	}
 	*pte = 0;
 	invltlb();
 	if (memtest != 0)
 		printf("\n");
 
 	/*
 	 * XXX
 	 * The last chunk must contain at least one page plus the message
 	 * buffer to avoid complicating other code (message buffer address
 	 * calculation, etc.).
 	 */
 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 		phys_avail[pa_indx--] = 0;
 		phys_avail[pa_indx--] = 0;
 	}
 
 	Maxmem = atop(phys_avail[pa_indx]);
 
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(msgbufsize);
 
 	/* Map the message buffer. */
 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
 }
 
 /*
  * Locate the loader's preload metadata at modulep + KERNBASE, relocate it,
  * and pull the early boot parameters out of the kernel's own metadata
  * record: boot flags, the static kernel environment, the debugger symbol
  * table bounds (DDB only) and the firmware handle stored in
  * efi_systbl_phys.
  *
  * Returns the kernel's metadata pointer as found by
  * preload_search_by_type().
  */
 static caddr_t
 native_parse_preload_data(u_int64_t modulep)
 {
 	caddr_t kern_md;
 	char *static_env;
 #ifdef DDB
 	vm_offset_t symtab_lo;
 	vm_offset_t symtab_hi;
 #endif
 
 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
 	preload_bootstrap_relocate(KERNBASE);
 
 	/* Try the generic module type name first, then the elf64 one. */
 	kern_md = preload_search_by_type("elf kernel");
 	if (kern_md == NULL)
 		kern_md = preload_search_by_type("elf64 kernel");
 
 	boothowto = MD_FETCH(kern_md, MODINFOMD_HOWTO, int);
 
 	/* A non-NULL environment pointer is rebased by KERNBASE. */
 	static_env = MD_FETCH(kern_md, MODINFOMD_ENVP, char *);
 	if (static_env != NULL)
 		static_env = static_env + KERNBASE;
 	init_static_kenv(static_env, 0);
 
 #ifdef DDB
 	symtab_lo = MD_FETCH(kern_md, MODINFOMD_SSYM, uintptr_t);
 	symtab_hi = MD_FETCH(kern_md, MODINFOMD_ESYM, uintptr_t);
 	db_fetch_ksymtab(symtab_lo, symtab_hi, 0);
 #endif
 
 	efi_systbl_phys = MD_FETCH(kern_md, MODINFOMD_FW_HANDLE, vm_paddr_t);
 
 	return (kern_md);
 }
 
 /*
  * Initialize the kernel debugger framework.  When KDB is compiled in and
  * the boot flags carry RB_KDB, enter the debugger immediately.
  */
 static void
 amd64_kdb_init(void)
 {
 
 	kdb_init();
 #ifdef KDB
 	if ((boothowto & RB_KDB) != 0) {
 		kdb_enter(KDB_WHY_BOOTFLAGS,
 		    "Boot flags requested debugger");
 	}
 #endif
 }
 
 /* Set up the fast syscall stuff */
 void
 amd64_conf_fast_syscall(void)
 {
 	uint64_t msr;
 
 	msr = rdmsr(MSR_EFER) | EFER_SCE;
 	wrmsr(MSR_EFER, msr);
 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
 	    (u_int64_t)IDTVEC(fast_syscall));
 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
 	wrmsr(MSR_STAR, msr);
 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
 }
 
 /*
  * First stage of per-CPU data setup for the boot processor.
  *
  * Records pc as the per-CPU private space, makes thread0 the current
  * thread, and caches pointers to the common TSS and to the TSS, LDT and
  * 32-bit %fs/%gs descriptors inside the per-CPU GDT.  Also seeds
  * ucr3_load_mask with PMAP_UCR3_NOMASK and smp_tlb_gen with 1.
  */
 void
 amd64_bsp_pcpu_init1(struct pcpu *pc)
 {
 	struct user_segment_descriptor *gdt;
 
 	/* prvspace is stored before any other per-CPU field is touched. */
 	PCPU_SET(prvspace, pc);
 	gdt = *PCPU_PTR(gdt);
 	PCPU_SET(curthread, &thread0);
 	PCPU_SET(tssp, PCPU_PTR(common_tss));
 	/* The TSS and LDT slots are viewed as system segment descriptors. */
 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
 	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
 	PCPU_SET(smp_tlb_gen, 1);
 }
 
 /*
  * Second stage of per-CPU data setup for the boot processor: record the
  * kernel stack pointer rsp0, compute the 16-byte aligned top of the
  * per-CPU PTI stack, and publish thread0's pcb as the current pcb.
  */
 void
 amd64_bsp_pcpu_init2(uint64_t rsp0)
 {
 	vm_offset_t pti_top;
 
 	PCPU_SET(rsp0, rsp0);
 
 	/* End of the pti_stack array, rounded down to 16 bytes. */
 	pti_top = (vm_offset_t)PCPU_PTR(pti_stack);
 	pti_top += PC_PTI_STACK_SZ * sizeof(uint64_t);
 	pti_top &= ~0xful;
 	PCPU_SET(pti_rsp0, pti_top);
 
 	PCPU_SET(curpcb, thread0.td_pcb);
 }
 
 /*
  * Reserve a struct nmi_pcpu immediately below stack_end, store the pcpu
  * pointer in it, and return its address for use as an IST entry.
  */
 static long
 amd64_ist_stack_prepare(void *stack_end, struct pcpu *pc)
 {
 	struct nmi_pcpu *np;
 
 	np = (struct nmi_pcpu *)stack_end - 1;
 	np->np_pcpu = (register_t)pc;
 	return ((long)np);
 }
 
 /*
  * Fill in the four interrupt stack table entries of the boot processor's
  * TSS.  Each dedicated stack gets a struct nmi_pcpu carrying the pcpu
  * pointer at its very top, and the IST entry points at that structure.
  */
 void
 amd64_bsp_ist_init(struct pcpu *pc)
 {
 	struct amd64tss *tssp;
 
 	tssp = &pc->pc_common_tss;
 
 	/* ist1: doublefault stack. */
 	tssp->tss_ist1 = amd64_ist_stack_prepare(
 	    &dblfault_stack[sizeof(dblfault_stack)], pc);
 
 	/* ist2: NMI stack. */
 	tssp->tss_ist2 = amd64_ist_stack_prepare(
 	    &nmi0_stack[sizeof(nmi0_stack)], pc);
 
 	/* ist3: MC# stack. */
 	tssp->tss_ist3 = amd64_ist_stack_prepare(
 	    &mce0_stack[sizeof(mce0_stack)], pc);
 
 	/* ist4: DB# stack. */
 	tssp->tss_ist4 = amd64_ist_stack_prepare(
 	    &dbg0_stack[sizeof(dbg0_stack)], pc);
 }
 
 /*
  * Early machine-dependent initialization of the boot processor.
  *
  * Derives kernphys from the loader's page tables, parses the preload
  * metadata, identifies the CPU, carves thread0's kernel stack and the
  * dynamic per-CPU area out of the memory at physfree, installs the GDT,
  * IDT and TSS, configures fast system calls, sizes memory through
  * getmemsize(), and initializes the console, message buffer and FPU.
  *
  * modulep is handed to init_ops.parse_preload_data().  physfree has
  * kernphys added to it before it is used, so it presumably arrives as an
  * offset relative to the kernel's load address (NOTE(review): confirm
  * against the caller).
  *
  * Returns thread0.td_md.md_stack_base, the location of the kernel stack
  * for locore.
  */
 u_int64_t
 hammer_time(u_int64_t modulep, u_int64_t physfree)
 {
 	caddr_t kmdp;
 	int gsel_tss, x;
 	struct pcpu *pc;
-	struct xstate_hdr *xhdr;
 	uint64_t cr3, rsp0;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	char *env;
 	struct user_segment_descriptor *gdt;
 	struct region_descriptor r_gdt;
 	size_t kstack0_sz;
 	int late_console;
 
 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
 
 	/*
 	 * Calculate kernphys by inspecting page table created by loader.
 	 * The assumptions:
 	 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
 	 *   aligned at 2M, below 4G (the latter is important for AP startup)
 	 * - there is a 2M hole at KERNBASE
 	 * - kernel is mapped with 2M superpages
 	 * - all participating memory, i.e. kernel, modules, metadata,
 	 *   page table is accessible by pre-created 1:1 mapping
 	 *   (right now loader creates 1:1 mapping for lower 4G, and all
 	 *   memory is from there)
 	 * - there is a usable memory block right after the end of the
 	 *   mapped kernel and all modules/metadata, pointed to by
 	 *   physfree, for early allocations
 	 */
 	cr3 = rcr3();
 	pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
 	    (vm_offset_t)hammer_time);
 	pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
 	    (vm_offset_t)hammer_time);
 	pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
 	    (vm_offset_t)hammer_time);
 	kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
 	    (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);
 
 	/* Fix-up for 2M hole */
 	physfree += kernphys;
 	kernphys += NBPDR;
 
 	kmdp = init_ops.parse_preload_data(modulep);
 
 	/* The presence of an EFI memory map in the metadata marks a UEFI boot. */
 	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
 	    MODINFOMD_EFI_MAP) != NULL;
 
 	if (!efi_boot) {
 		/* Tell the bios to warmboot next time */
 		atomic_store_short((u_short *)0x472, 0x1234);
 	}
 
 	/* Advance physfree by what ucode_load_bsp() returns, then page-align. */
 	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
 	physfree = roundup2(physfree, PAGE_SIZE);
 
 	identify_cpu1();
 	identify_hypervisor();
 	identify_cpu_fixup_bsp();
 	identify_cpu2();
 	initializecpucache();
 
 	/*
 	 * Check for pti, pcid, and invpcid before ifuncs are
 	 * resolved, to correctly select the implementation for
 	 * pmap_activate_sw_mode().
 	 */
 	pti = pti_get_default();
 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
 		invpcid_works = (cpu_stdext_feature &
 		    CPUID_STDEXT_INVPCID) != 0;
 	} else {
 		pmap_pcid_enabled = 0;
 	}
 
 	link_elf_ireloc(kmdp);
 
 	/*
 	 * This may be done better later if it gets more high level
 	 * components in it. If so just link td->td_proc here.
 	 */
 	proc_linkup0(&proc0, &thread0);
 
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	/* Carve thread0's kernel stack out of the memory at physfree. */
 	thread0.td_kstack = physfree - kernphys + KERNSTART;
 	thread0.td_kstack_pages = kstack_pages;
 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 	bzero((void *)thread0.td_kstack, kstack0_sz);
 	physfree += kstack0_sz;
 
 	/*
 	 * Initialize enough of thread0 for delayed invalidation to
 	 * work very early.  Rely on thread0.td_base_pri
 	 * zero-initialization, it is reset to PVM at proc0_init().
 	 */
 	pmap_thread_init_invl_gen(&thread0);
 
 	pc = &temp_bsp_pcpu;
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	gdt = &temp_bsp_pcpu.pc_gdt[0];
 
 	/*
 	 * make gdt memory segments
 	 */
 	for (x = 0; x < NGDT; x++) {
 		/* The TSS and LDT descriptors each occupy two GDT slots. */
 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
 			ssdtosd(&gdt_segs[x], &gdt[x]);
 	}
 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base = (long)gdt;
 	lgdt(&r_gdt);
 
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
 
 	/* The dynamic per-CPU area is the next consumer of physfree. */
 	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
 	physfree += DPCPU_SIZE;
 	amd64_bsp_pcpu_init1(pc);
 	/* Non-late cninit() and printf() can be moved up to here. */
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 * 	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
 
 	/*
 	 * exceptions: every vector first gets the reserved handler, then the
 	 * known exceptions are overridden.  Most pick a *_pti entry point
 	 * when pti is enabled.  DB#, NMI, MC# and the double fault handler
 	 * pass a nonzero last argument (4, 2, 3 and 1), matching the ist
 	 * slots filled in by amd64_bsp_ist_init().
 	 */
 	for (x = 0; x < NIDT; x++)
 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
 		    SEL_KPL, 0);
 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
 	    SEL_UPL, 0);
 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
 	    SEL_UPL, 0);
 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
 	    SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
 	    SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
 	    SEL_KPL, 0);
 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
 	    SEL_KPL, 0);
 #ifdef KDTRACE_HOOKS
 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
 #endif
 #ifdef XENHVM
 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
 #endif
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (long) idt;
 	lidt(&r_idt);
 
 	/*
 	 * Initialize the clock before the console so that console
 	 * initialization can use DELAY().
 	 */
 	clock_init();
 
 	/*
 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
 	 * transition).
 	 * Once bootblocks have updated, we can test directly for
 	 * efi_systbl != NULL here...
 	 */
 	if (efi_boot)
 		vty_set_preferred(VTY_VT);
 
 	/*
 	 * Mitigation tunables.  Where two names feed the same variable, the
 	 * second fetch is applied after the first.
 	 */
 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
 
 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
 
 	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
 	    &syscall_ret_l1d_flush_mode);
 
 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
 
 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
 
 	/*
 	 * NOTE(review): the tunable name below spells "rndgs" while the
 	 * variable is x86_rngds_mitg_enable; confirm which spelling is
 	 * intended before relying on this tunable.
 	 */
 	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
 	    &x86_rngds_mitg_enable);
 
 	finishidentcpu();	/* Final stage of CPU initialization */
 	initializecpu();	/* Initialize CPU registers */
 
 	amd64_bsp_ist_init(pc);
 
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
 	    IOPERM_BITMAP_SIZE;
 
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	ltr(gsel_tss);
 
 	amd64_conf_fast_syscall();
 
 	/*
 	 * We initialize the PCB pointer early so that exception
 	 * handlers will work.  Also set up td_critnest to short-cut
 	 * the page fault handler.
 	 */
 	cpu_max_ext_state_size = sizeof(struct savefpu);
 	set_top_of_stack_td(&thread0);
 	thread0.td_pcb = get_pcb_td(&thread0);
 	thread0.td_critnest = 1;
 
 	/*
 	 * The console and kdb should be initialized even earlier than here,
 	 * but some console drivers don't work until after getmemsize().
 	 * Default to late console initialization to support these drivers.
 	 * This loses mainly printf()s in getmemsize() and early debugging.
 	 */
 	late_console = 1;
 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
 	if (!late_console) {
 		cninit();
 		amd64_kdb_init();
 	}
 
 	getmemsize(kmdp, physfree);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
 #ifdef DEV_PCI
         /* This call might adjust phys_avail[]. */
         pci_early_quirks();
 #endif
 
 	if (late_console)
 		cninit();
 
 	/*
 	 * Dump the boot metadata. We have to wait for cninit() since console
 	 * output is required. If it's grossly incorrect the kernel will never
 	 * make it this far.
 	 */
 	if (getenv_is_true("debug.dump_modinfo_at_boot"))
 		preload_dump();
 
 #ifdef DEV_ISA
 #ifdef DEV_ATPIC
 	elcr_probe();
 	atpic_startup();
 #else
 	/* Reset and mask the atpics and leave them shut down. */
 	atpic_reset();
 
 	/*
 	 * Point the ICU spurious interrupt vectors at the APIC spurious
 	 * interrupt handler.
 	 */
 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 #endif
 #else
 #error "have you forgotten the isa device?"
 #endif
 
 	if (late_console)
 		amd64_kdb_init();
 
 	msgbufinit(msgbufp, msgbufsize);
 	fpuinit();
 
-	/*
-	 * Reinitialize thread0's stack base now that the xsave area size is
-	 * known.  Set up thread0's pcb save area after fpuinit calculated fpu
-	 * save area size.  Zero out the extended state header in fpu save area.
-	 */
-	set_top_of_stack_td(&thread0);
-	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
-	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
-	if (use_xsave) {
-		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
-		    1);
-		xhdr->xstate_bv = xsave_mask;
-	}
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	rsp0 = thread0.td_md.md_stack_base;
 	/* Ensure the stack is aligned to 16 bytes */
 	rsp0 &= ~0xFul;
 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
 	amd64_bsp_pcpu_init2(rsp0);
 
 	/* Record the user-mode segment selectors and load the data ones. */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
 
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_ufssel);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_frame = &proc0_tf;
 
         env = kern_getenv("kernelname");
 	if (env != NULL)
 		strlcpy(kernelname, env, sizeof(kernelname));
 
 	kcsan_cpu_init(0);
 
 #ifdef FDT
 	x86_init_fdt();
 #endif
 	/* Undo the td_critnest = 1 short-cut set earlier in this function. */
 	thread0.td_critnest = 0;
 
 	kasan_init();
 	kmsan_init();
 
 	TSEXIT();
 
 	/* Location of kernel stack for locore */
 	return (thread0.td_md.md_stack_base);
 }
 
 /*
  * Machine-dependent per-CPU initialization hook.  Only the ACPI id is
  * touched: it is set to all ones.  The cpuid and size arguments are not
  * used here.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 
 	pcpu->pc_acpi_id = 0xffffffffU;
 }
 
 /*
  * Sysctl handler exporting the raw BIOS system memory map (SMAP) that the
  * loader attached to the kernel's preload metadata.
  *
  * Each entry is copied out as a struct bios_smap_xattr.  The extended
  * attribute word comes from the MODINFOMD_SMAP_XATTR metadata when the
  * loader supplied it, and is reported as 0 otherwise.
  *
  * Returns 0 when no SMAP metadata exists or when every entry was copied
  * out, otherwise the first error reported by SYSCTL_OUT().
  */
 static int
 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct bios_smap *smapbase;
 	struct bios_smap_xattr smap;
 	caddr_t kmdp;
 	uint32_t *smapattr;
 	int count, error, i;
 
 	/* Retrieve the system memory map from the loader. */
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase == NULL)
 		return (0);
 	smapattr = (uint32_t *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
 
 	/* The 32-bit word just before the data holds its size in bytes. */
 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
 	error = 0;
 	for (i = 0; i < count; i++) {
 		smap.base = smapbase[i].base;
 		smap.length = smapbase[i].length;
 		smap.type = smapbase[i].type;
 		if (smapattr != NULL)
 			smap.xattr = smapattr[i];
 		else
 			smap.xattr = 0;
 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
 		/*
 		 * Stop at the first failure so that a later successful
 		 * copy cannot hide it from the caller.
 		 */
 		if (error != 0)
 			break;
 	}
 	return (error);
 }
 SYSCTL_PROC(_machdep, OID_AUTO, smap,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     smap_sysctl_handler, "S,bios_smap_xattr",
     "Raw BIOS SMAP data");
 
 /*
  * Sysctl handler exporting the raw EFI memory map that the loader attached
  * to the kernel's preload metadata.  Returns 0 without copying anything
  * when no such metadata exists, otherwise the result of SYSCTL_OUT().
  */
 static int
 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
 {
 	struct efi_map_header *map_hdr;
 	caddr_t kern_md;
 	uint32_t map_len;
 
 	kern_md = preload_search_by_type("elf kernel");
 	if (kern_md == NULL)
 		kern_md = preload_search_by_type("elf64 kernel");
 
 	map_hdr = (struct efi_map_header *)preload_search_info(kern_md,
 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
 	if (map_hdr != NULL) {
 		/* The 32-bit word just before the data holds its size. */
 		map_len = ((uint32_t *)map_hdr)[-1];
 		return (SYSCTL_OUT(req, map_hdr, map_len));
 	}
 	return (0);
 }
 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     efi_map_sysctl_handler, "S,efi_map_header",
     "Raw EFI Memory Map");
 
 /*
  * Enter a spinlock section for the current thread.  The outermost entry
  * disables interrupts, remembers the previous interrupt state in the
  * thread's machine-dependent data and enters a critical section; nested
  * entries only bump the nesting count.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *td;
 	register_t saved;
 
 	td = curthread;
 	if (td->td_md.md_spinlock_count != 0) {
 		/* Already inside a spinlock section: just nest. */
 		td->td_md.md_spinlock_count++;
 		return;
 	}
 
 	saved = intr_disable();
 	td->td_md.md_spinlock_count = 1;
 	td->td_md.md_saved_flags = saved;
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section for the current thread.  When the nesting
  * count drops to zero the critical section is exited and the interrupt
  * state saved by the outermost spinlock_enter() is restored.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *td;
 	register_t saved;
 
 	td = curthread;
 	/* Fetch the saved flags before the count is decremented. */
 	saved = td->td_md.md_saved_flags;
 	td->td_md.md_spinlock_count--;
 	if (td->td_md.md_spinlock_count != 0)
 		return;
 
 	critical_exit();
 	intr_restore(saved);
 }
 
 /*
  * Build a PCB from a trapframe.  kdb_trap() uses this to start a
  * backtrace at the function that caused entry into the debugger: the
  * context lives in the trapframe, but the trace is driven from the PCB.
  * Only the registers copied below are filled in, which is sufficient for
  * a backtrace.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	/* Instruction, stack and frame pointers. */
 	pcb->pcb_rip = tf->tf_rip;
 	pcb->pcb_rsp = tf->tf_rsp;
 	pcb->pcb_rbp = tf->tf_rbp;
 
 	/* Remaining registers tracked by the PCB. */
 	pcb->pcb_rbx = tf->tf_rbx;
 	pcb->pcb_r12 = tf->tf_r12;
 	pcb->pcb_r13 = tf->tf_r13;
 	pcb->pcb_r14 = tf->tf_r14;
 	pcb->pcb_r15 = tf->tf_r15;
 }
 
 /*
  * The pcb_flags is only modified by current thread, or by other threads
  * when current thread is stopped.  However, current thread may change it
  * from the interrupt context in cpu_switch(), or in the trap handler.
  * When we read-modify-write pcb_flags from C sources, compiler may generate
  * code that is not atomic regarding the interrupt handler.  If a trap or
  * interrupt happens and any flag is modified from the handler, it can be
  * clobbered with the cached value later.  Therefore, we implement setting
  * and clearing flags with single-instruction functions, which do not race
  * with possible modification of the flags from the trap or interrupt context,
  * because traps and interrupts are executed only on instruction boundary.
  *
  * set_pcb_flags_raw() ORs the bits in flags into pcb->pcb_flags using one
  * orl instruction.
  */
 void
 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
 {
 
 	/* pcb_flags is named as both output and input memory operand. */
 	__asm __volatile("orl %1,%0"
 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
 	    : "cc", "memory");
 
 }
 
 /*
  * With RDFSBASE, WRFSBASE and the matching %gs instructions available,
  * user space can change the segment bases directly, so the kernel has to
  * capture MSR_FSBASE and MSR_{K,}GSBASE into the pcb.  That must happen
  * on context switch or when the return to user mode goes through doreti.
  *
  * Both events are tracked by the PCB_FULL_IRET pcb flag, so the bases
  * are saved whenever that flag transitions to set.  Interrupts are
  * disabled around the save to synchronize with context switches.
  */
 static void
 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
 {
 	register_t saved_intr;
 
 	/*
 	 * Fast path: another thread's pcb, PCB_FULL_IRET not being
 	 * requested, or PCB_FULL_IRET already set.
 	 */
 	if (curpcb != pcb ||
 	    (flags & PCB_FULL_IRET) == 0 ||
 	    (pcb->pcb_flags & PCB_FULL_IRET) != 0) {
 		set_pcb_flags_raw(pcb, flags);
 		return;
 	}
 
 	saved_intr = intr_disable();
 	/* Re-check now that interrupts are off. */
 	if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
 		if (rfs() == _ufssel)
 			pcb->pcb_fsbase = rdfsbase();
 		if (rgs() == _ugssel)
 			pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
 	}
 	set_pcb_flags_raw(pcb, flags);
 	intr_restore(saved_intr);
 }
 
 /*
  * Resolver for set_pcb_flags(): the FSGSBASE-aware variant is chosen when
  * the CPU reports CPUID_STDEXT_FSGSBASE, the raw variant otherwise.
  */
 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) == 0)
 		return (set_pcb_flags_raw);
 	return (set_pcb_flags_fsgsbase);
 }
 
 /*
  * Clear the bits given in flags from pcb->pcb_flags.  The update is done
  * with one andl instruction applied to the complement of flags.
  */
 void
 clear_pcb_flags(struct pcb *pcb, const u_int flags)
 {
 
 	/* pcb_flags is named as both output and input memory operand. */
 	__asm __volatile("andl %1,%0"
 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
 	    : "cc", "memory");
 }
 
 #ifdef KDB
 
 /*
  * Out-of-line versions of inb() and outb().  The regular ones are normally
  * only available as inline functions and therefore cannot be called from
  * the debugger.
  */
 
 /* Prototypes, to silence compiler warnings. */
 u_char inb_(u_short);
 void outb_(u_short, u_char);
 
 /* Read one byte from the given I/O port. */
 u_char
 inb_(u_short port)
 {
 	u_char value;
 
 	value = inb(port);
 	return (value);
 }
 
 /* Write one byte to the given I/O port. */
 void
 outb_(u_short port, u_char data)
 {
 
 	outb(port, data);
 }
 
 #endif /* KDB */
 
 #undef memset
 #undef memmove
 #undef memcpy
 
 /*
  * Standard and ERMS variants of the memory primitives; they are only
  * declared here.
  */
 void	*memset_std(void *buf, int c, size_t len);
 void	*memset_erms(void *buf, int c, size_t len);
 void	*memmove_std(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 void	*memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 void	*memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 void	*memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
 	    size_t len);
 
 #ifdef KCSAN
 /*
  * Ifuncs fail to build when used with KCSAN, so plain wrappers around
  * the standard variants are provided instead.
  */
 void *
 memset(void *buf, int c, size_t len)
 {
 	void *ret;
 
 	ret = memset_std(buf, c, len);
 	return (ret);
 }
 
 void *
 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
 {
 	void *ret;
 
 	ret = memmove_std(dst, src, len);
 	return (ret);
 }
 
 void *
 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
 {
 	void *ret;
 
 	ret = memcpy_std(dst, src, len);
 	return (ret);
 }
 #else
 /*
  * Ifunc resolvers: the ERMS variant is chosen when the CPU reports
  * CPUID_STDEXT_ERMS, the standard variant otherwise.
  */
 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_ERMS) == 0)
 		return (memset_std);
 	return (memset_erms);
 }
 
 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
     size_t))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_ERMS) == 0)
 		return (memmove_std);
 	return (memmove_erms);
 }
 
 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_ERMS) == 0)
 		return (memcpy_std);
 	return (memcpy_erms);
 }
 #endif
 
 /*
  * Ifunc resolver for pagezero(): the ERMS variant is chosen when the CPU
  * reports CPUID_STDEXT_ERMS, the standard variant otherwise.
  */
 void	pagezero_std(void *addr);
 void	pagezero_erms(void *addr);
 DEFINE_IFUNC(, void , pagezero, (void *))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_ERMS) == 0)
 		return (pagezero_std);
 	return (pagezero_erms);
 }
diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c
index 4567e6e0eb5d..e42d16d61b3a 100644
--- a/sys/amd64/amd64/vm_machdep.c
+++ b/sys/amd64/amd64/vm_machdep.c
@@ -1,738 +1,742 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1982, 1986 The Regents of the University of California.
  * Copyright (c) 1989, 1990 William Jolitz
  * Copyright (c) 1994 John Dyson
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
  *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_isa.h"
 #include "opt_cpu.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/procctl.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/wait.h>
 
 #include <machine/cpu.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/tss.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_param.h>
 
 _Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
     "OFFSETOF_MONITORBUF does not correspond with offset of pc_monitorbuf.");
 
 /*
  * Compute td's md_stack_base from the end of its kernel stack, i.e. from
  * td_kstack + td_kstack_pages * PAGE_SIZE.
  */
 void
 set_top_of_stack_td(struct thread *td)
 {
 	td->td_md.md_stack_base = td->td_kstack +
-	    td->td_kstack_pages * PAGE_SIZE -
-	    roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN);
+	    td->td_kstack_pages * PAGE_SIZE;
 }
 
 /*
  * Return the struct savefpu pointer for td's pcb_user_save area.
  * A KASSERT checks that the pointer is a multiple of XSAVE_AREA_ALIGN.
  */
 struct savefpu *
 get_pcb_user_save_td(struct thread *td)
 {
-	vm_offset_t p;
-
-	p = td->td_md.md_stack_base;
-	KASSERT((p % XSAVE_AREA_ALIGN) == 0,
-	    ("Unaligned pcb_user_save area ptr %#lx td %p", p, td));
-	return ((struct savefpu *)p);
+	KASSERT(((vm_offset_t)td->td_md.md_usr_fpu_save %
+	    XSAVE_AREA_ALIGN) == 0,
+	    ("Unaligned pcb_user_save area ptr %p td %p",
+	    td->td_md.md_usr_fpu_save, td));
+	return (td->td_md.md_usr_fpu_save);
 }
 
 /* Return the address of the pcb embedded in the thread as td_md.md_pcb. */
 struct pcb *
 get_pcb_td(struct thread *td)
 {
 
 	return (&td->td_md.md_pcb);
 }
 
 /*
  * Map a pcb back to the thread that embeds it (as td_md.md_pcb) and
  * return what get_pcb_user_save_td() yields for that thread.
  */
 struct savefpu *
 get_pcb_user_save_pcb(struct pcb *pcb)
 {
 
 	return (get_pcb_user_save_td(__containerof(pcb, struct thread,
 	    td_md.md_pcb)));
 }
 
 void *
 alloc_fpusave(int flags)
 {
 	void *res;
 	struct savefpu_ymm *sf;
 
 	res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags);
 	if (use_xsave) {
 		sf = (struct savefpu_ymm *)res;
 		bzero(&sf->sv_xstate.sx_hd, sizeof(sf->sv_xstate.sx_hd));
 		sf->sv_xstate.sx_hd.xstate_bv = xsave_mask;
 	}
 	return (res);
 }
 
 /*
  * Common code shared between cpu_fork() and cpu_copy_thread() for
  * initializing a thread.
  *
  * td1 is the source thread and td2 the thread being initialized.  td1's
  * pcb is copied into td2's pcb, pcb_save is pointed at td2's own user
  * save area, and the saved kernel registers are set so that pcb_rip is
  * fork_trampoline with fork_return and td2 as its arguments.
  */
 static void
 copy_thread(struct thread *td1, struct thread *td2)
 {
 	struct pcb *pcb2;
 
 	pcb2 = td2->td_pcb;
 
 	/* Ensure that td1's pcb is up to date for user threads. */
 	if ((td2->td_pflags & TDP_KTHREAD) == 0) {
 		MPASS(td1 == curthread);
 		fpuexit(td1);
 		update_pcb_bases(td1->td_pcb);
 	}
 
 	/* Copy td1's pcb */
 	bcopy(td1->td_pcb, pcb2, sizeof(*pcb2));
 
 	/*
 	 * Properly initialize pcb_save.  The bcopy() above left it holding
 	 * td1's value, so it must be recomputed for td2.
 	 */
 	pcb2->pcb_save = get_pcb_user_save_pcb(pcb2);
 
 	/* Kernel threads start with clean FPU and segment bases. */
 	if ((td2->td_pflags & TDP_KTHREAD) != 0) {
 		pcb2->pcb_fsbase = 0;
 		pcb2->pcb_gsbase = 0;
 		clear_pcb_flags(pcb2, PCB_FPUINITDONE | PCB_USERFPUINITDONE |
 		    PCB_KERNFPU | PCB_KERNFPU_THR);
 	} else {
 		/* User threads inherit td1's user FPU save area contents. */
 		MPASS((pcb2->pcb_flags & (PCB_KERNFPU | PCB_KERNFPU_THR)) == 0);
 		bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2),
 		    cpu_max_ext_state_size);
 	}
 
 	/*
 	 * Set registers for trampoline to user mode.  Leave space for the
 	 * return address on stack.  These are the kernel mode register values.
 	 */
 	pcb2->pcb_r12 = (register_t)fork_return;	/* fork_trampoline argument */
 	pcb2->pcb_rbp = 0;
 	pcb2->pcb_rsp = (register_t)td2->td_frame - sizeof(void *);
 	pcb2->pcb_rbx = (register_t)td2;		/* fork_trampoline argument */
 	pcb2->pcb_rip = (register_t)fork_trampoline;
 	/*-
 	 * pcb2->pcb_dr*:	cloned above.
 	 * pcb2->pcb_savefpu:	cloned above.
 	 * pcb2->pcb_flags:	cloned above.
 	 * pcb2->pcb_onfault:	cloned above (always NULL here?).
 	 * pcb2->pcb_[fg]sbase:	cloned above
 	 */
 
 	/* The copied pcb_tssp value is discarded for td2. */
 	pcb2->pcb_tssp = NULL;
 
 	/* Setup to release spin count in fork_exit(). */
 	td2->td_md.md_spinlock_count = 1;
 	td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
 	pmap_thread_init_invl_gen(td2);
 }
 
 /*
  * Finish a fork operation, with process p2 nearly set up.
  * Copy and update the pcb, set up the stack so that the child is
  * ready to run and return to user mode.
  *
  * When RFPROC is not set in flags no new thread state is built: the only
  * work done is unsharing p1's LDT (and only when RFMEM is also clear),
  * after which the function returns.
  *
  * Panics with "could not copy LDT" if user_ldt_alloc() returns NULL.
  */
 void
 cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
 {
 	struct proc *p1;
 	struct pcb *pcb2;
 	struct mdproc *mdp1, *mdp2;
 	struct proc_ldt *pldt;
 
 	p1 = td1->td_proc;
 	if ((flags & RFPROC) == 0) {
 		if ((flags & RFMEM) == 0) {
 			/* unshare user LDT */
 			mdp1 = &p1->p_md;
 			mtx_lock(&dt_lock);
 			if ((pldt = mdp1->md_ldt) != NULL &&
 			    pldt->ldt_refcnt > 1 &&
 			    user_ldt_alloc(p1, 1) == NULL)
 				panic("could not copy LDT");
 			mtx_unlock(&dt_lock);
 		}
 		return;
 	}
 
 	/* Point the stack and pcb to the actual location */
 	set_top_of_stack_td(td2);
 	td2->td_pcb = pcb2 = get_pcb_td(td2);
 
 	copy_thread(td1, td2);
 
 	/* Reset debug registers in the new process */
 	x86_clear_dbregs(pcb2);
 
 	/* Point mdproc and then copy over p1's contents */
 	mdp2 = &p2->p_md;
 	bcopy(&p1->p_md, mdp2, sizeof(*mdp2));
 
 	/*
 	 * Copy the trap frame for the return to user mode as if from a
 	 * syscall.  This copies most of the user mode register values.
 	 */
 	td2->td_frame = (struct trapframe *)td2->td_md.md_stack_base - 1;
 	bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe));
 
 	/* Set child return values. */
 	p2->p_sysent->sv_set_fork_retval(td2);
 
 	/*
 	 * If the parent process has the trap bit set (i.e. a debugger
 	 * had single stepped the process to the system call), we need
 	 * to clear the trap flag from the new frame.
 	 */
 	td2->td_frame->tf_rflags &= ~PSL_T;
 
 	/* As on i386, do not copy io permission bitmap. */
 	pcb2->pcb_tssp = NULL;
 
 	/* New segment registers. */
 	set_pcb_flags_raw(pcb2, PCB_FULL_IRET);
 
 	/* Copy the LDT, if necessary. */
 	mdp1 = &td1->td_proc->p_md;
 	mdp2 = &p2->p_md;
 	/* Unlocked fast path: a parent without an LDT needs no dt_lock. */
 	if (mdp1->md_ldt == NULL) {
 		mdp2->md_ldt = NULL;
 		return;
 	}
 	mtx_lock(&dt_lock);
 	/* Re-check under dt_lock; the test above was made without it. */
 	if (mdp1->md_ldt != NULL) {
 		if (flags & RFMEM) {
 			/* Shared address space: share the LDT by reference. */
 			mdp1->md_ldt->ldt_refcnt++;
 			mdp2->md_ldt = mdp1->md_ldt;
 			bcopy(&mdp1->md_ldt_sd, &mdp2->md_ldt_sd, sizeof(struct
 			    system_segment_descriptor));
 		} else {
 			/*
 			 * NOTE(review): md_ldt is cleared right before being
 			 * assigned from user_ldt_alloc(p2, 0); presumably the
 			 * allocator inspects p2's current md_ldt -- confirm.
 			 */
 			mdp2->md_ldt = NULL;
 			mdp2->md_ldt = user_ldt_alloc(p2, 0);
 			if (mdp2->md_ldt == NULL)
 				panic("could not copy LDT");
 			amd64_set_ldt_data(td2, 0, max_ldt_segment,
 			    (struct user_segment_descriptor *)
 			    mdp1->md_ldt->ldt_base);
 		}
 	} else
 		mdp2->md_ldt = NULL;
 	mtx_unlock(&dt_lock);
 
 	/*
 	 * Now, cpu_switch() can schedule the new process.
 	 * pcb_rsp is loaded pointing to the cpu_switch() stack frame
 	 * containing the return address when exiting cpu_switch.
 	 * This will normally be to fork_trampoline(), which will have
 	 * %rbx loaded with the new proc's pointer.  fork_trampoline()
 	 * will set up a stack to call fork_return(p, frame); to complete
 	 * the return to user-mode.
 	 */
 }
 
 /*
  * Store the child's fork return values in td's trap frame: %rax is 0,
  * %rdx is 1 and the carry flag is cleared.
  */
 void
 x86_set_fork_retval(struct thread *td)
 {
 	struct trapframe *tf;
 
 	tf = td->td_frame;
 	tf->tf_rflags &= ~PSL_C;	/* success */
 	tf->tf_rax = 0;			/* Child returns zero */
 	tf->tf_rdx = 1;			/* System V emulation */
 }
 
 /*
  * Intercept the return address from a freshly forked process that has NOT
  * been scheduled yet.
  *
  * This is needed to make kernel threads stay in kernel mode.
  */
 void
 cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg)
 {
 	/*
 	 * Note that the trap frame follows the args, so the function
 	 * is really called like this:  func(arg, frame);
 	 */
 	td->td_pcb->pcb_r12 = (long) func;	/* function */
 	td->td_pcb->pcb_rbx = (long) arg;	/* first arg */
 }
 
 /* On exit, release the process's custom LDT if it has one. */
 void
 cpu_exit(struct thread *td)
 {
 
 	/* Nothing to do for a process without a custom LDT. */
 	if (td->td_proc->p_md.md_ldt == NULL)
 		return;
 	user_ldt_free(td);
 }
 
 /*
  * Machine-dependent work for an exiting thread: drop the FPU state if td
  * is this CPU's fpcurthread, then turn off hardware breakpoints if the
  * pcb has PCB_DBREGS set.
  */
 void
 cpu_thread_exit(struct thread *td)
 {
 	struct pcb *pcb;
 
 	/* The fpcurthread test and the drop happen in one critical section. */
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == td)
 		fpudrop();
 	critical_exit();
 
 	pcb = td->td_pcb;
 	if ((pcb->pcb_flags & PCB_DBREGS) == 0)
 		return;
 
 	/* Disable any hardware breakpoints. */
 	reset_dbregs();
 	clear_pcb_flags(pcb, PCB_DBREGS);
 }
 
 void
 cpu_thread_clean(struct thread *td)
 {
 	struct pcb *pcb;
 
 	pcb = td->td_pcb;
 
 	/*
 	 * Clean TSS/iomap
 	 */
 	if (pcb->pcb_tssp != NULL) {
 		pmap_pti_remove_kva((vm_offset_t)pcb->pcb_tssp,
 		    (vm_offset_t)pcb->pcb_tssp + ctob(IOPAGES + 1));
 		kmem_free((vm_offset_t)pcb->pcb_tssp, ctob(IOPAGES + 1));
 		pcb->pcb_tssp = NULL;
 	}
 }
 
 /* Swap-in hook; the body is empty, so nothing is done here. */
 void
 cpu_thread_swapin(struct thread *td)
 {
 }
 
 /* Swap-out hook; the body is empty, so nothing is done here. */
 void
 cpu_thread_swapout(struct thread *td)
 {
 }
 
 /*
  * Machine-dependent setup for a thread structure: sets md_stack_base,
  * td_pcb, td_frame (one trapframe below md_stack_base) and pcb_save.
  * When use_xsave is set, the xstate header that directly follows the
  * struct savefpu is zeroed and its xstate_bv is set to xsave_mask.
  */
 void
 cpu_thread_alloc(struct thread *td)
 {
 	struct pcb *pcb;
 	struct xstate_hdr *xhdr;
 
 	set_top_of_stack_td(td);
 	td->td_pcb = pcb = get_pcb_td(td);
 	td->td_frame = (struct trapframe *)td->td_md.md_stack_base - 1;
+	td->td_md.md_usr_fpu_save = fpu_save_area_alloc();
+	td->td_md.md_fpu_scratch = fpu_save_area_alloc();
 	pcb->pcb_save = get_pcb_user_save_pcb(pcb);
 	if (use_xsave) {
 		/* pcb_save + 1 is the first byte past the struct savefpu. */
 		xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1);
 		bzero(xhdr, sizeof(*xhdr));
 		xhdr->xstate_bv = xsave_mask;
 	}
 }
 
 /*
  * Release machine-dependent resources held by a thread structure.
  * cpu_thread_clean() is always run on td.
  */
 void
 cpu_thread_free(struct thread *td)
 {
-
 	cpu_thread_clean(td);
+
+	fpu_save_area_free(td->td_md.md_usr_fpu_save);
+	td->td_md.md_usr_fpu_save = NULL;
+	fpu_save_area_free(td->td_md.md_fpu_scratch);
+	td->td_md.md_fpu_scratch = NULL;
 }
 
 /*
  * Return true when curproc's P_MD_KPTI flag agrees with whether the pmap
  * behind map has a user CR3 (pm_ucr3 != PMAP_NO_CR3).
  *
  * NOTE(review): the p argument is not used; the flag is read from curproc.
  */
 bool
 cpu_exec_vmspace_reuse(struct proc *p, vm_map_t map)
 {
 	bool has_ucr3, wants_kpti;
 
 	wants_kpti = (curproc->p_md.md_flags & P_MD_KPTI) != 0;
 	has_ucr3 = vm_map_pmap(map)->pm_ucr3 != PMAP_NO_CR3;
 	return (wants_kpti == has_ucr3);
 }
 
 /*
  * Apply a PROC_KPTI_CTL request to p's md_flags.  An enable request sets
  * P_MD_KPTI only when the global pti is non-zero; a disable request
  * always clears it.
  */
 static void
 cpu_procctl_kpti_ctl(struct proc *p, int val)
 {
 
 	if (val == PROC_KPTI_CTL_ENABLE_ON_EXEC) {
 		if (pti)
 			p->p_md.md_flags |= P_MD_KPTI;
 	}
 	if (val == PROC_KPTI_CTL_DISABLE_ON_EXEC)
 		p->p_md.md_flags &= ~P_MD_KPTI;
 }
 
 /*
  * Report p's KPTI setting through *val: the on-exec control value taken
  * from P_MD_KPTI, with PROC_KPTI_STATUS_ACTIVE or'ed in when the
  * process's pmap has a user CR3.
  */
 static void
 cpu_procctl_kpti_status(struct proc *p, int *val)
 {
 	int status;
 
 	if ((p->p_md.md_flags & P_MD_KPTI) != 0)
 		status = PROC_KPTI_CTL_ENABLE_ON_EXEC;
 	else
 		status = PROC_KPTI_CTL_DISABLE_ON_EXEC;
 	if (vmspace_pmap(p->p_vmspace)->pm_ucr3 != PMAP_NO_CR3)
 		status |= PROC_KPTI_STATUS_ACTIVE;
 	*val = status;
 }
 
 /*
  * Apply a PROC_LA_CTL request to p's md_flags by selecting P_MD_LA48,
  * P_MD_LA57 or neither.
  *
  * Returns 0 on success, or ENOTSUP when LA57 is requested while the
  * global la57 is zero.  Values other than the three handled ones leave
  * the flags untouched and return 0.
  */
 static int
 cpu_procctl_la_ctl(struct proc *p, int val)
 {
 
 	switch (val) {
 	case PROC_LA_CTL_LA48_ON_EXEC:
 		p->p_md.md_flags |= P_MD_LA48;
 		p->p_md.md_flags &= ~P_MD_LA57;
 		return (0);
 	case PROC_LA_CTL_LA57_ON_EXEC:
 		if (!la57)
 			return (ENOTSUP);
 		p->p_md.md_flags &= ~P_MD_LA48;
 		p->p_md.md_flags |= P_MD_LA57;
 		return (0);
 	case PROC_LA_CTL_DEFAULT_ON_EXEC:
 		p->p_md.md_flags &= ~(P_MD_LA48 | P_MD_LA57);
 		return (0);
 	}
 	return (0);
 }
 
 /*
  * Report p's linear-address setting through *val: the on-exec control
  * value derived from P_MD_LA48/P_MD_LA57, or'ed with a status bit chosen
  * by comparing p_sysent->sv_maxuser against VM_MAXUSER_ADDRESS_LA48.
  */
 static void
 cpu_procctl_la_status(struct proc *p, int *val)
 {
 	int ctl, status;
 
 	/* P_MD_LA48 is tested first, as in the control path's precedence. */
 	ctl = PROC_LA_CTL_DEFAULT_ON_EXEC;
 	if ((p->p_md.md_flags & P_MD_LA48) != 0)
 		ctl = PROC_LA_CTL_LA48_ON_EXEC;
 	else if ((p->p_md.md_flags & P_MD_LA57) != 0)
 		ctl = PROC_LA_CTL_LA57_ON_EXEC;
 
 	if (p->p_sysent->sv_maxuser == VM_MAXUSER_ADDRESS_LA48)
 		status = PROC_LA_STATUS_LA48;
 	else
 		status = PROC_LA_STATUS_LA57;
 	*val = ctl | status;
 }
 
 /*
  * Machine-dependent procctl handler for PROC_KPTI_CTL, PROC_KPTI_STATUS,
  * PROC_LA_CTL and PROC_LA_STATUS.
  *
  * Only idtype P_PID is accepted.  The *_CTL commands read an int from
  * data with copyin() and validate it before the target is looked up;
  * the *_STATUS commands write an int back to data with copyout().
  * PROC_KPTI_CTL additionally requires priv_check(td, PRIV_IO) to pass.
  *
  * Returns 0 on success, EINVAL for an unknown command, a non-P_PID
  * idtype or an invalid control value, or the error from priv_check(),
  * copyin(), pget(), cpu_procctl_la_ctl() or copyout().
  */
 int
 cpu_procctl(struct thread *td, int idtype, id_t id, int com, void *data)
 {
 	struct proc *p;
 	int error, val;
 
 	switch (com) {
 	case PROC_KPTI_CTL:
 	case PROC_KPTI_STATUS:
 	case PROC_LA_CTL:
 	case PROC_LA_STATUS:
 		if (idtype != P_PID) {
 			error = EINVAL;
 			break;
 		}
 		if (com == PROC_KPTI_CTL) {
 			/* sad but true and not a joke */
 			error = priv_check(td, PRIV_IO);
 			if (error != 0)
 				break;
 		}
 		/* Fetch and validate the argument before taking any lock. */
 		if (com == PROC_KPTI_CTL || com == PROC_LA_CTL) {
 			error = copyin(data, &val, sizeof(val));
 			if (error != 0)
 				break;
 		}
 		if (com == PROC_KPTI_CTL &&
 		    val != PROC_KPTI_CTL_ENABLE_ON_EXEC &&
 		    val != PROC_KPTI_CTL_DISABLE_ON_EXEC) {
 			error = EINVAL;
 			break;
 		}
 		if (com == PROC_LA_CTL &&
 		    val != PROC_LA_CTL_LA48_ON_EXEC &&
 		    val != PROC_LA_CTL_LA57_ON_EXEC &&
 		    val != PROC_LA_CTL_DEFAULT_ON_EXEC) {
 			error = EINVAL;
 			break;
 		}
 		/* On success p is released with PROC_UNLOCK() below. */
 		error = pget(id, PGET_CANSEE | PGET_NOTWEXIT | PGET_NOTID, &p);
 		if (error != 0)
 			break;
 		switch (com) {
 		case PROC_KPTI_CTL:
 			cpu_procctl_kpti_ctl(p, val);
 			break;
 		case PROC_KPTI_STATUS:
 			cpu_procctl_kpti_status(p, &val);
 			break;
 		case PROC_LA_CTL:
 			error = cpu_procctl_la_ctl(p, val);
 			break;
 		case PROC_LA_STATUS:
 			cpu_procctl_la_status(p, &val);
 			break;
 		}
 		PROC_UNLOCK(p);
 		/* The status value is copied out after p has been unlocked. */
 		if (com == PROC_KPTI_STATUS || com == PROC_LA_STATUS)
 			error = copyout(&val, data, sizeof(val));
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 void
 cpu_set_syscall_retval(struct thread *td, int error)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	if (__predict_true(error == 0)) {
 		frame->tf_rax = td->td_retval[0];
 		frame->tf_rdx = td->td_retval[1];
 		frame->tf_rflags &= ~PSL_C;
 		return;
 	}
 
 	switch (error) {
 	case ERESTART:
 		/*
 		 * Reconstruct pc, we know that 'syscall' is 2 bytes,
 		 * lcall $X,y is 7 bytes, int 0x80 is 2 bytes.
 		 * We saved this in tf_err.
 		 * %r10 (which was holding the value of %rcx) is restored
 		 * for the next iteration.
 		 * %r10 restore is only required for freebsd/amd64 processes,
 		 * but shall be innocent for any ia32 ABI.
 		 *
 		 * Require full context restore to get the arguments
 		 * in the registers reloaded at return to usermode.
 		 */
 		frame->tf_rip -= frame->tf_err;
 		frame->tf_r10 = frame->tf_rcx;
 		set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 		break;
 
 	case EJUSTRETURN:
 		break;
 
 	default:
 		frame->tf_rax = error;
 		frame->tf_rflags |= PSL_C;
 		break;
 	}
 }
 
 /*
  * Initialize machine state, mostly pcb and trap frame for a new
  * thread, about to return to userspace.  Put enough state in the new
  * thread's PCB to get it to go back to the fork_return(), which
  * finalizes the thread state and handles peculiarities of the first
  * return to userspace for the new thread.
  *
  * td is the new thread and td0 the thread it is copied from; note that
  * copy_thread() takes its arguments in the opposite (source, new) order.
  */
 void
 cpu_copy_thread(struct thread *td, struct thread *td0)
 {
 	copy_thread(td0, td);
 
 	/*
 	 * Copy user general-purpose registers.
 	 *
 	 * Some of these registers are rewritten by cpu_set_upcall()
 	 * and linux_set_upcall().
 	 */
 	bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe));
 
 	/* If the current thread has the trap bit set (i.e. a debugger had
 	 * single stepped the process to the system call), we need to clear
 	 * the trap flag from the new frame. Otherwise, the new thread will
 	 * receive a (likely unexpected) SIGTRAP when it executes the first
 	 * instruction after returning to userland.
 	 */
 	td->td_frame->tf_rflags &= ~PSL_T;
 
 	set_pcb_flags_raw(td->td_pcb, PCB_FULL_IRET);
 }
 
 /*
  * Set the machine state for performing an upcall that starts
  * the entry function with the given argument.
  *
  * The user stack pointer is derived from the top of *stack
  * (ss_sp + ss_size).  A zero return address is written to the user
  * stack as an unwinding sentinel.  For SV_ILP32 processes (under
  * COMPAT_FREEBSD32) the argument is also written to the user stack;
  * otherwise it is passed in %rdi.
  *
  * NOTE(review): the results of suword()/suword32() are not checked here.
  */
 void
 cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg,
     stack_t *stack)
 {
 
 	/*
 	 * Do any extra cleaning that needs to be done.
 	 * The thread may have optional components
 	 * that are not present in a fresh thread.
 	 * This may be a recycled thread so make it look
 	 * as though it's newly allocated.
 	 */
 	cpu_thread_clean(td);
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		/*
 		 * Set the trap frame to point at the beginning of the entry
 		 * function.
 		 */
 		td->td_frame->tf_rbp = 0;
 		/*
 		 * Mask to a 16-byte boundary, then step down 4 more bytes
 		 * for the sentinel slot; the argument goes in the slot above.
 		 */
 		td->td_frame->tf_rsp =
 		   (((uintptr_t)stack->ss_sp + stack->ss_size - 4) & ~0x0f) - 4;
 		td->td_frame->tf_rip = (uintptr_t)entry;
 
 		/* Return address sentinel value to stop stack unwinding. */
 		suword32((void *)td->td_frame->tf_rsp, 0);
 
 		/* Pass the argument to the entry point. */
 		suword32((void *)(td->td_frame->tf_rsp + sizeof(int32_t)),
 		    (uint32_t)(uintptr_t)arg);
 
 		return;
 	}
 #endif
 
 	/*
 	 * Set the trap frame to point at the beginning of the uts
 	 * function.
 	 */
 	td->td_frame->tf_rbp = 0;
 	/* Mask to a 16-byte boundary, then reserve 8 bytes for the sentinel. */
 	td->td_frame->tf_rsp =
 	    ((register_t)stack->ss_sp + stack->ss_size) & ~0x0f;
 	td->td_frame->tf_rsp -= 8;
 	td->td_frame->tf_rip = (register_t)entry;
 	td->td_frame->tf_ds = _udatasel;
 	td->td_frame->tf_es = _udatasel;
 	td->td_frame->tf_fs = _ufssel;
 	td->td_frame->tf_gs = _ugssel;
 	td->td_frame->tf_flags = TF_HASSEGS;
 
 	/* Return address sentinel value to stop stack unwinding. */
 	suword((void *)td->td_frame->tf_rsp, 0);
 
 	/* Pass the argument to the entry point. */
 	td->td_frame->tf_rdi = (register_t)arg;
 }
 
 /*
  * Record tls_base as the thread's TLS base in its pcb and request a full
  * iret.  With COMPAT_FREEBSD32, SV_ILP32 processes get it in pcb_gsbase;
  * everything else gets it in pcb_fsbase.
  *
  * Returns 0 on success, or EINVAL when tls_base is at or above
  * VM_MAXUSER_ADDRESS (in which case the pcb is left untouched).
  */
 int
 cpu_set_user_tls(struct thread *td, void *tls_base)
 {
 	struct pcb *pcb;
 
 	if ((u_int64_t)tls_base >= VM_MAXUSER_ADDRESS)
 		return (EINVAL);
 
 	pcb = td->td_pcb;
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32))
 		pcb->pcb_gsbase = (register_t)tls_base;
 	else
 #endif
 		pcb->pcb_fsbase = (register_t)tls_base;
 	return (0);
 }
 
 /*
  * Software interrupt handler for queued VM system processing.
  * Runs busdma_swi() when busdma_swi_pending is non-zero; the dummy
  * argument is ignored.
  */
 void
 swi_vm(void *dummy)
 {
 
 	if (busdma_swi_pending == 0)
 		return;
 	busdma_swi();
 }
 
 /*
  * Tell whether this address is in some physical memory region.
  * Currently used by the kernel coredump code in order to avoid
  * dumping the ``ISA memory hole'' which could cause indefinite hangs,
  * or other unpredictable behaviour.
  *
  * Returns 0 for addresses in [0xa0000, 0x100000) when DEV_ISA is
  * defined, and 1 for every other address.
  */
 int
 is_physical_memory(vm_paddr_t addr)
 {
 
 #ifdef DEV_ISA
 	/* The ISA ``memory hole''. */
 	if (addr >= 0xa0000 && addr < 0x100000)
 		return (0);
 #endif
 
 	/*
 	 * stuff other tests for known memory-mapped devices (PCI?)
 	 * here
 	 */
 
 	return (1);
 }
diff --git a/sys/amd64/ia32/ia32_signal.c b/sys/amd64/ia32/ia32_signal.c
index 49b5797d68fd..9b67c7001a87 100644
--- a/sys/amd64/ia32/ia32_signal.c
+++ b/sys/amd64/ia32/ia32_signal.c
@@ -1,974 +1,974 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2003 Peter Wemm
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/mman.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/procfs.h>
 #include <sys/resourcevar.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_util.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/ia32/ia32_signal.h>
 #include <machine/psl.h>
 #include <machine/segments.h>
 #include <machine/specialreg.h>
 #include <machine/frame.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/cpufunc.h>
 #include <machine/trap.h>
 
 #ifdef COMPAT_FREEBSD4
 static void freebsd4_ia32_sendsig(sig_t, ksiginfo_t *, sigset_t *);
 #endif
 
 /*
  * Fill in the FPU part of an ia32 machine context for td.
  *
  * mc_ownedfp, mc_fpstate and mc_fpformat are always set.  When use_xsave
  * is set and the caller supplies a buffer (xfpusave_len != 0), the
  * extended state that follows the struct savefpu is copied into
  * xfpusave, _MC_IA32_HASFPXSTATE is set in mc_flags and
  * mc_xfpustate_len records how many bytes were copied.
  *
  * At most cpu_max_ext_state_size - sizeof(struct savefpu) bytes are
  * copied.  If the caller's buffer is longer than that, the excess tail
  * of the buffer is zeroed so that no stale bytes are left in it.
  */
 static void
 ia32_get_fpcontext(struct thread *td, struct ia32_mcontext *mcp,
     char *xfpusave, size_t xfpusave_len)
 {
 	size_t max_len, len;
 
 	/*
 	 * XXX Format of 64bit and 32bit FXSAVE areas differs. FXSAVE
 	 * in 32bit mode saves %cs and %ds, while on 64bit it saves
 	 * 64bit instruction and data pointers. Ignore the difference
 	 * for now, it should be irrelevant for most applications.
 	 */
 	mcp->mc_ownedfp = fpugetregs(td);
 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
 	    sizeof(mcp->mc_fpstate));
 	mcp->mc_fpformat = fpuformat();
 	if (!use_xsave || xfpusave_len == 0)
 		return;
 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
 	len = xfpusave_len;
 	if (len > max_len) {
 		/*
 		 * Zero the tail first: once len is clamped the size of the
 		 * excess region is no longer available.
 		 */
 		bzero(xfpusave + max_len, len - max_len);
 		len = max_len;
 	}
 	mcp->mc_flags |= _MC_IA32_HASFPXSTATE;
 	mcp->mc_xfpustate_len = len;
 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
 }
 
 /*
  * Load the FPU part of an ia32 machine context into td.
  *
  * Returns 0 without touching the FPU for _MC_FPFMT_NODEV, EINVAL for any
  * format other than _MC_FPFMT_XMM or for an unrecognized mc_ownedfp
  * value, and otherwise either 0 (state dropped for _MC_FPOWNED_NONE) or
  * the result of fpusetregs().
  */
 static int
 ia32_set_fpcontext(struct thread *td, struct ia32_mcontext *mcp,
     char *xfpustate, size_t xfpustate_len)
 {
 
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 
 	if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 		return (0);
 	}
 	if (mcp->mc_ownedfp != _MC_FPOWNED_FPU &&
 	    mcp->mc_ownedfp != _MC_FPOWNED_PCB)
 		return (EINVAL);
 
 	return (fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
 	    xfpustate, xfpustate_len));
 }
 
 /*
  * Get machine context.
  *
  * Fills *mcp from td's trap frame and pcb.  With GET_MC_CLEAR_RET set in
  * flags, eax and edx are reported as 0 and the carry flag is cleared in
  * the reported eflags.  No extended FPU state is returned: the FPU
  * helper is called without a buffer and mc_xfpustate/mc_xfpustate_len
  * are set to 0.  Always returns 0.
  */
 static int
 ia32_get_mcontext(struct thread *td, struct ia32_mcontext *mcp, int flags)
 {
 	struct pcb *pcb;
 	struct trapframe *tp;
 
 	pcb = td->td_pcb;
 	tp = td->td_frame;
 
 	/* sigonstack() is evaluated with the process lock held. */
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
 	PROC_UNLOCK(curthread->td_proc);
 	/* Entry into kernel always sets TF_HASSEGS */
 	mcp->mc_gs = tp->tf_gs;
 	mcp->mc_fs = tp->tf_fs;
 	mcp->mc_es = tp->tf_es;
 	mcp->mc_ds = tp->tf_ds;
 	mcp->mc_edi = tp->tf_rdi;
 	mcp->mc_esi = tp->tf_rsi;
 	mcp->mc_ebp = tp->tf_rbp;
 	mcp->mc_isp = tp->tf_rsp;
 	mcp->mc_eflags = tp->tf_rflags;
 	if (flags & GET_MC_CLEAR_RET) {
 		mcp->mc_eax = 0;
 		mcp->mc_edx = 0;
 		mcp->mc_eflags &= ~PSL_C;
 	} else {
 		mcp->mc_eax = tp->tf_rax;
 		mcp->mc_edx = tp->tf_rdx;
 	}
 	mcp->mc_ebx = tp->tf_rbx;
 	mcp->mc_ecx = tp->tf_rcx;
 	mcp->mc_eip = tp->tf_rip;
 	mcp->mc_cs = tp->tf_cs;
 	mcp->mc_esp = tp->tf_rsp;
 	mcp->mc_ss = tp->tf_ss;
 	mcp->mc_len = sizeof(*mcp);
 	mcp->mc_flags = tp->tf_flags;
 	/* NULL/0: no buffer is supplied for extended FPU state. */
 	ia32_get_fpcontext(td, mcp, NULL, 0);
 	mcp->mc_fsbase = pcb->pcb_fsbase;
 	mcp->mc_gsbase = pcb->pcb_gsbase;
 	mcp->mc_xfpustate = 0;
 	mcp->mc_xfpustate_len = 0;
 	bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  *
  * Returns EINVAL when mc_len does not match sizeof(*mcp) or when the
  * supplied extended FPU state is larger than
  * cpu_max_ext_state_size - sizeof(struct savefpu); otherwise the error
  * from copyin() or ia32_set_fpcontext(), or 0 on success.  The trap
  * frame is only modified after the FPU state has been accepted.
  */
 static int
 ia32_set_mcontext(struct thread *td, struct ia32_mcontext *mcp)
 {
 	struct trapframe *tp;
 	char *xfpustate;
 	long rflags;
 	int ret;
 
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp))
 		return (EINVAL);
 	/* Take only the PSL_USERCHANGE bits from the user-supplied eflags. */
 	rflags = (mcp->mc_eflags & PSL_USERCHANGE) |
 	    (tp->tf_rflags & ~PSL_USERCHANGE);
 	if (mcp->mc_flags & _MC_IA32_HASFPXSTATE) {
 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu))
 			return (EINVAL);
 		/* Stage the user's extended state in a kernel buffer. */
-		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
+		xfpustate = (char *)td->td_md.md_fpu_scratch;
 		ret = copyin(PTRIN(mcp->mc_xfpustate), xfpustate,
 		    mcp->mc_xfpustate_len);
 		if (ret != 0)
 			return (ret);
 	} else
 		xfpustate = NULL;
 	ret = ia32_set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
 	if (ret != 0)
 		return (ret);
 	tp->tf_gs = mcp->mc_gs;
 	tp->tf_fs = mcp->mc_fs;
 	tp->tf_es = mcp->mc_es;
 	tp->tf_ds = mcp->mc_ds;
 	tp->tf_flags = TF_HASSEGS;
 	tp->tf_rdi = mcp->mc_edi;
 	tp->tf_rsi = mcp->mc_esi;
 	tp->tf_rbp = mcp->mc_ebp;
 	tp->tf_rbx = mcp->mc_ebx;
 	tp->tf_rdx = mcp->mc_edx;
 	tp->tf_rcx = mcp->mc_ecx;
 	tp->tf_rax = mcp->mc_eax;
 	/* trapno, err */
 	tp->tf_rip = mcp->mc_eip;
 	tp->tf_rflags = rflags;
 	tp->tf_rsp = mcp->mc_esp;
 	tp->tf_ss = mcp->mc_ss;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (0);
 }
 
 /*
  * The first two fields of a ucontext_t are the signal mask and
  * the machine context.  The next field is uc_link; we want to
  * avoid destroying the link when copying out contexts.
  */
 #define	UC_COPY_SIZE	offsetof(struct ia32_ucontext, uc_link)
 
 /*
  * getcontext(2) for 32-bit processes: capture the calling thread's
  * machine context and signal mask and copy the first UC_COPY_SIZE bytes
  * of the resulting ucontext out to uap->ucp.
  *
  * Returns EINVAL for a NULL ucp, otherwise the result of copyout().
  */
 int
 freebsd32_getcontext(struct thread *td, struct freebsd32_getcontext_args *uap)
 {
 	struct ia32_ucontext uc;
 
 	if (uap->ucp == NULL)
 		return (EINVAL);
 
 	bzero(&uc, sizeof(uc));
 	ia32_get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
 	PROC_LOCK(td->td_proc);
 	uc.uc_sigmask = td->td_sigmask;
 	PROC_UNLOCK(td->td_proc);
 	return (copyout(&uc, uap->ucp, UC_COPY_SIZE));
 }
 
 int
 freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap)
 {
 	struct ia32_ucontext uc;
 	int ret;
 
 	if (uap->ucp == NULL)
 		ret = EINVAL;
 	else {
 		ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
 		if (ret == 0) {
 			ret = ia32_set_mcontext(td, &uc.uc_mcontext);
 			if (ret == 0) {
 				kern_sigprocmask(td, SIG_SETMASK,
 				    &uc.uc_sigmask, NULL, 0);
 			}
 		}
 	}
 	return (ret == 0 ? EJUSTRETURN : ret);
 }
 
 int
 freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap)
 {
 	struct ia32_ucontext uc;
 	int ret;
 
 	if (uap->oucp == NULL || uap->ucp == NULL)
 		ret = EINVAL;
 	else {
 		bzero(&uc, sizeof(uc));
 		ia32_get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
 		PROC_LOCK(td->td_proc);
 		uc.uc_sigmask = td->td_sigmask;
 		PROC_UNLOCK(td->td_proc);
 		ret = copyout(&uc, uap->oucp, UC_COPY_SIZE);
 		if (ret == 0) {
 			ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
 			if (ret == 0) {
 				ret = ia32_set_mcontext(td, &uc.uc_mcontext);
 				if (ret == 0) {
 					kern_sigprocmask(td, SIG_SETMASK,
 					    &uc.uc_sigmask, NULL, 0);
 				}
 			}
 		}
 	}
 	return (ret == 0 ? EJUSTRETURN : ret);
 }
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by kcall
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  */
 
 #ifdef COMPAT_43
 /*
  * Deliver a signal to the current thread using the struct ia32_sigframe3
  * layout.
  *
  * Called with the process lock and ps_mtx held (both are asserted).
  * Both locks are dropped while the frame is filled in and copied out to
  * the user stack, and are re-taken before returning.  If the copyout
  * fails, sigexit(td, SIGILL) is called.  On success the trap frame is
  * redirected so that execution resumes at
  * sv_psstrings - sz_ia32_osigcode with %rsp pointing at the new frame.
  */
 static void
 ia32_osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct ia32_sigframe3 sf, *fp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Place the frame at the very top of the alternate stack. */
 		fp = (struct ia32_sigframe3 *)((uintptr_t)td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(sf));
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	} else
 		fp = (struct ia32_sigframe3 *)regs->tf_rsp - 1;
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
 	bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_arg2 = (register_t)&fp->sf_siginfo;
 		sf.sf_siginfo.si_signo = sig;
 		sf.sf_siginfo.si_code = ksi->ksi_code;
 		sf.sf_ah = (uintptr_t)catcher;
 		sf.sf_addr = 0;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_arg2 = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ah = (uintptr_t)catcher;
 	}
 	/* Locks are released here and re-acquired at the end. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/* Save most if not all of trap frame. */
 	sf.sf_siginfo.si_sc.sc_eax = regs->tf_rax;
 	sf.sf_siginfo.si_sc.sc_ebx = regs->tf_rbx;
 	sf.sf_siginfo.si_sc.sc_ecx = regs->tf_rcx;
 	sf.sf_siginfo.si_sc.sc_edx = regs->tf_rdx;
 	sf.sf_siginfo.si_sc.sc_esi = regs->tf_rsi;
 	sf.sf_siginfo.si_sc.sc_edi = regs->tf_rdi;
 	sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
 	sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
 	sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
 	sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
 	sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
 	sf.sf_siginfo.si_sc.sc_gs = regs->tf_gs;
 	sf.sf_siginfo.si_sc.sc_isp = regs->tf_rsp;
 
 	/* Build the signal context to be used by osigreturn(). */
 	sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
 	SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
 	sf.sf_siginfo.si_sc.sc_esp = regs->tf_rsp;
 	sf.sf_siginfo.si_sc.sc_ebp = regs->tf_rbp;
 	sf.sf_siginfo.si_sc.sc_eip = regs->tf_rip;
 	sf.sf_siginfo.si_sc.sc_eflags = regs->tf_rflags;
 	sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
 	sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, fp, sizeof(*fp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the signal trampoline with 32-bit selectors. */
 	regs->tf_rsp = (uintptr_t)fp;
 	regs->tf_rip = p->p_sysent->sv_psstrings - sz_ia32_osigcode;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucode32sel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_ss = _udatasel;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Deliver a signal to a 32-bit process using the FreeBSD 4 signal frame
  * layout (struct ia32_sigframe4).  ia32_sendsig() dispatches here when the
  * signal is a member of ps_freebsd4.
  *
  * The saved register context and the handler arguments are assembled in a
  * kernel-side frame, copied out to the user stack (or to the alternate
  * signal stack), and the trapframe is then redirected so that the thread
  * resumes at the sigcode address computed below with %rsp pointing at the
  * new frame.
  *
  * Must be entered with the proc lock and ps_mtx held.  Both are dropped
  * before the frame is copied out and are re-acquired before returning.  If
  * the copyout fails, sigexit(td, SIGILL) is called with the proc lock held.
  */
 static void
 freebsd4_ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct ia32_sigframe4 sf, *sfp;
 	struct siginfo32 siginfo;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int oonstack;
 	int sig;
 
 	td = curthread;
 	p = td->td_proc;
 	siginfo_to_siginfo32(&ksi->ksi_info, &siginfo);
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = siginfo.si_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	/* Remember whether the thread was already on the signal stack. */
 	oonstack = sigonstack(regs->tf_rsp);
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp;
 	sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi;
 	sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi;
 	sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp;
 	sf.sf_uc.uc_mcontext.mc_isp = regs->tf_rsp; /* XXX */
 	sf.sf_uc.uc_mcontext.mc_ebx = regs->tf_rbx;
 	sf.sf_uc.uc_mcontext.mc_edx = regs->tf_rdx;
 	sf.sf_uc.uc_mcontext.mc_ecx = regs->tf_rcx;
 	sf.sf_uc.uc_mcontext.mc_eax = regs->tf_rax;
 	sf.sf_uc.uc_mcontext.mc_trapno = regs->tf_trapno;
 	sf.sf_uc.uc_mcontext.mc_err = regs->tf_err;
 	sf.sf_uc.uc_mcontext.mc_eip = regs->tf_rip;
 	sf.sf_uc.uc_mcontext.mc_cs = regs->tf_cs;
 	sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags;
 	sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp;
 	sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss;
 	sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds;
 	sf.sf_uc.uc_mcontext.mc_es = regs->tf_es;
 	sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs;
 	sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs;
 	/* The whole frame was zeroed above; these clears are redundant. */
 	bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
 	    sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
 	bzero(sf.sf_uc.uc_mcontext.__spare__,
 	    sizeof(sf.sf_uc.uc_mcontext.__spare__));
 	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Place the frame at the top of the alternate stack. */
 		sfp = (struct ia32_sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(sf));
 	} else
 		/* Otherwise carve the frame just below the current %rsp. */
 		sfp = (struct ia32_sigframe4 *)regs->tf_rsp - 1;
 	/* ps_mtx stays held: psp->ps_siginfo is still examined below. */
 	PROC_UNLOCK(p);
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (u_int32_t)(uintptr_t)&sfp->sf_si;
 		sf.sf_ah = (u_int32_t)(uintptr_t)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = siginfo;
 		sf.sf_si.si_signo = sig;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = siginfo.si_code;
 		sf.sf_addr = (u_int32_t)siginfo.si_addr;
 		sf.sf_ah = (u_int32_t)(uintptr_t)catcher;
 	}
 	mtx_unlock(&psp->ps_mtx);
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the signal code with the new frame as stack. */
 	regs->tf_rsp = (uintptr_t)sfp;
 	regs->tf_rip = p->p_sysent->sv_sigcode_base + sz_ia32_sigcode -
 	    sz_freebsd4_ia32_sigcode;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucode32sel;
 	regs->tf_ss = _udatasel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	/* leave user %fs and %gs untouched */
 	/* Re-take the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * Deliver a signal to a 32-bit process.
  *
  * Signals registered through the FreeBSD 4 or 4.3BSD interfaces (members of
  * ps_freebsd4 / ps_osigset) are handed to the matching compat routine.  For
  * everything else a struct ia32_sigframe holding the saved register and FPU
  * context plus the handler arguments is built, copied out to the user stack
  * (or the alternate signal stack), and the trapframe is redirected to
  * sv_sigcode_base with %rsp pointing at the new frame.
  *
  * Must be entered with the proc lock and ps_mtx held.  Both are dropped
  * before the frame is copied out and are re-acquired before returning.  If
  * a copyout fails, sigexit(td, SIGILL) is called with the proc lock held.
  */
 void
 ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct ia32_sigframe sf, *sfp;
 	struct siginfo32 siginfo;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	char *xfpusave;
 	size_t xfpusave_len;
 	int oonstack;
 	int sig;
 
 	siginfo_to_siginfo32(&ksi->ksi_info, &siginfo);
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = siginfo.si_signo;
 	psp = p->p_sigacts;
 #ifdef COMPAT_FREEBSD4
 	if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
 		freebsd4_ia32_sendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 #ifdef COMPAT_43
 	if (SIGISMEMBER(psp->ps_osigset, sig)) {
 		ia32_osendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 	/*
 	 * When XSAVE is in use and the extended state is larger than
 	 * struct savefpu, the excess is staged in a separate kernel buffer
 	 * and copied out above the signal frame.
 	 * NOTE(review): the staging buffer is the per-thread md_fpu_scratch
 	 * instead of an alloca; confirm it is always allocated and at least
 	 * xfpusave_len bytes long.  A NULL pointer here skips the
 	 * extended-state copyout below while xfpusave_len stays non-zero.
 	 */
 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
-		xfpusave = __builtin_alloca(xfpusave_len);
+		xfpusave = (char *)td->td_md.md_fpu_scratch;
 	} else {
 		xfpusave_len = 0;
 		xfpusave = NULL;
 	}
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp;
 	sf.sf_uc.uc_stack.ss_size = td->td_sigstk.ss_size;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_edi = regs->tf_rdi;
 	sf.sf_uc.uc_mcontext.mc_esi = regs->tf_rsi;
 	sf.sf_uc.uc_mcontext.mc_ebp = regs->tf_rbp;
 	sf.sf_uc.uc_mcontext.mc_isp = regs->tf_rsp; /* XXX */
 	sf.sf_uc.uc_mcontext.mc_ebx = regs->tf_rbx;
 	sf.sf_uc.uc_mcontext.mc_edx = regs->tf_rdx;
 	sf.sf_uc.uc_mcontext.mc_ecx = regs->tf_rcx;
 	sf.sf_uc.uc_mcontext.mc_eax = regs->tf_rax;
 	sf.sf_uc.uc_mcontext.mc_trapno = regs->tf_trapno;
 	sf.sf_uc.uc_mcontext.mc_err = regs->tf_err;
 	sf.sf_uc.uc_mcontext.mc_eip = regs->tf_rip;
 	sf.sf_uc.uc_mcontext.mc_cs = regs->tf_cs;
 	sf.sf_uc.uc_mcontext.mc_eflags = regs->tf_rflags;
 	sf.sf_uc.uc_mcontext.mc_esp = regs->tf_rsp;
 	sf.sf_uc.uc_mcontext.mc_ss = regs->tf_ss;
 	sf.sf_uc.uc_mcontext.mc_ds = regs->tf_ds;
 	sf.sf_uc.uc_mcontext.mc_es = regs->tf_es;
 	sf.sf_uc.uc_mcontext.mc_fs = regs->tf_fs;
 	sf.sf_uc.uc_mcontext.mc_gs = regs->tf_gs;
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	ia32_get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
 	fpstate_drop(td);
 	sf.sf_uc.uc_mcontext.mc_fsbase = td->td_pcb->pcb_fsbase;
 	sf.sf_uc.uc_mcontext.mc_gsbase = td->td_pcb->pcb_gsbase;
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig))
 		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
 	else
 		sp = (char *)regs->tf_rsp;
 	if (xfpusave != NULL) {
 		/* Reserve a 64-byte aligned area for the extended state. */
 		sp -= xfpusave_len;
 		sp = (char *)((unsigned long)sp & ~0x3Ful);
 		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
 	}
 	sp -= sizeof(sf);
 	/* Align to 16 bytes. */
 	sfp = (struct ia32_sigframe *)((uintptr_t)sp & ~0xF);
 	/* ps_mtx stays held: psp->ps_siginfo is still examined below. */
 	PROC_UNLOCK(p);
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	bzero(&sf.sf_si, sizeof(sf.sf_si));
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (u_int32_t)(uintptr_t)&sfp->sf_si;
 		sf.sf_ah = (u_int32_t)(uintptr_t)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = siginfo;
 		sf.sf_si.si_signo = sig;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = siginfo.si_code;
 		sf.sf_addr = (u_int32_t)siginfo.si_addr;
 		sf.sf_ah = (u_int32_t)(uintptr_t)catcher;
 	}
 	mtx_unlock(&psp->ps_mtx);
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
 	    (xfpusave != NULL && copyout(xfpusave,
 	    PTRIN(sf.sf_uc.uc_mcontext.mc_xfpustate), xfpusave_len)
 	    != 0)) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the signal code with the new frame as stack. */
 	regs->tf_rsp = (uintptr_t)sfp;
 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
 	regs->tf_rflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucode32sel;
 	regs->tf_ss = _udatasel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	/* XXXKIB leave user %fs and %gs untouched */
 	/* Re-take the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  */
 
 #ifdef COMPAT_43
 /*
  * COMPAT_43 sigreturn for 32-bit processes: restore the register state
  * held in a struct ia32_sigcontext3 (the layout ia32_osendsig() fills in).
  *
  * The context is copied in from uap->sigcntxp; a copyin failure is returned
  * as is.  A context whose %eflags fails EFL_SECURE() is rejected with
  * EINVAL.  A %cs that fails CS_SECURE() is rejected with EINVAL as well and
  * additionally posts SIGBUS/BUS_OBJERR through trapsignal().  Otherwise the
  * trapframe, the SS_ONSTACK flag and the signal mask are replaced from the
  * context and EJUSTRETURN is returned.
  */
 int
 ofreebsd32_sigreturn(struct thread *td, struct ofreebsd32_sigreturn_args *uap)
 {
 	struct ia32_sigcontext3 sc, *scp;
 	struct trapframe *regs;
 	int eflags, error;
 	ksiginfo_t ksi;
 
 	regs = td->td_frame;
 	/* Work on a kernel copy so the checks and the loads see the same data. */
 	error = copyin(uap->sigcntxp, &sc, sizeof(sc));
 	if (error != 0)
 		return (error);
 	scp = &sc;
 	eflags = scp->sc_eflags;
 	if (!EFL_SECURE(eflags, regs->tf_rflags)) {
 		return (EINVAL);
 	}
 	if (!CS_SECURE(scp->sc_cs)) {
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 	/* Both checks passed: load the saved registers into the trapframe. */
 	regs->tf_ds = scp->sc_ds;
 	regs->tf_es = scp->sc_es;
 	regs->tf_fs = scp->sc_fs;
 	regs->tf_gs = scp->sc_gs;
 
 	regs->tf_rax = scp->sc_eax;
 	regs->tf_rbx = scp->sc_ebx;
 	regs->tf_rcx = scp->sc_ecx;
 	regs->tf_rdx = scp->sc_edx;
 	regs->tf_rsi = scp->sc_esi;
 	regs->tf_rdi = scp->sc_edi;
 	regs->tf_cs = scp->sc_cs;
 	regs->tf_ss = scp->sc_ss;
 	regs->tf_rbp = scp->sc_ebp;
 	regs->tf_rsp = scp->sc_esp;
 	regs->tf_rip = scp->sc_eip;
 	regs->tf_rflags = eflags;
 
 	/* Bit 0 of sc_onstack records whether the signal stack was in use. */
 	if (scp->sc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 
 	kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
 	    SIGPROCMASK_OLD);
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (EJUSTRETURN);
 }
 #endif
 
 #ifdef COMPAT_FREEBSD4
 /*
  * MPSAFE
  */
 int
 freebsd4_freebsd32_sigreturn(td, uap)
 	struct thread *td;
 	struct freebsd4_freebsd32_sigreturn_args /* {
 		const struct freebsd4_freebsd32_ucontext *sigcntxp;
 	} */ *uap;
 {
 	struct ia32_ucontext4 uc;
 	struct trapframe *regs;
 	struct ia32_ucontext4 *ucp;
 	int cs, eflags, error;
 	ksiginfo_t ksi;
 
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	/*
 	 * Don't allow users to change privileged or reserved flags.
 	 */
 	if (!EFL_SECURE(eflags, regs->tf_rflags)) {
 		uprintf("pid %d (%s): freebsd4_freebsd32_sigreturn eflags = 0x%x\n",
 		    td->td_proc->p_pid, td->td_name, eflags);
 		return (EINVAL);
 	}
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 	cs = ucp->uc_mcontext.mc_cs;
 	if (!CS_SECURE(cs)) {
 		uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
 		    td->td_proc->p_pid, td->td_name, cs);
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 
 	regs->tf_rdi = ucp->uc_mcontext.mc_edi;
 	regs->tf_rsi = ucp->uc_mcontext.mc_esi;
 	regs->tf_rbp = ucp->uc_mcontext.mc_ebp;
 	regs->tf_rbx = ucp->uc_mcontext.mc_ebx;
 	regs->tf_rdx = ucp->uc_mcontext.mc_edx;
 	regs->tf_rcx = ucp->uc_mcontext.mc_ecx;
 	regs->tf_rax = ucp->uc_mcontext.mc_eax;
 	regs->tf_trapno = ucp->uc_mcontext.mc_trapno;
 	regs->tf_err = ucp->uc_mcontext.mc_err;
 	regs->tf_rip = ucp->uc_mcontext.mc_eip;
 	regs->tf_cs = cs;
 	regs->tf_rflags = ucp->uc_mcontext.mc_eflags;
 	regs->tf_rsp = ucp->uc_mcontext.mc_esp;
 	regs->tf_ss = ucp->uc_mcontext.mc_ss;
 	regs->tf_ds = ucp->uc_mcontext.mc_ds;
 	regs->tf_es = ucp->uc_mcontext.mc_es;
 	regs->tf_fs = ucp->uc_mcontext.mc_fs;
 	regs->tf_gs = ucp->uc_mcontext.mc_gs;
 
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (EJUSTRETURN);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * MPSAFE
  */
 int
 freebsd32_sigreturn(td, uap)
 	struct thread *td;
 	struct freebsd32_sigreturn_args /* {
 		const struct freebsd32_ucontext *sigcntxp;
 	} */ *uap;
 {
 	struct ia32_ucontext uc;
 	struct trapframe *regs;
 	struct ia32_ucontext *ucp;
 	char *xfpustate;
 	size_t xfpustate_len;
 	int cs, eflags, error, ret;
 	ksiginfo_t ksi;
 
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	/*
 	 * Don't allow users to change privileged or reserved flags.
 	 */
 	if (!EFL_SECURE(eflags, regs->tf_rflags)) {
 		uprintf("pid %d (%s): freebsd32_sigreturn eflags = 0x%x\n",
 		    td->td_proc->p_pid, td->td_name, eflags);
 		return (EINVAL);
 	}
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 	cs = ucp->uc_mcontext.mc_cs;
 	if (!CS_SECURE(cs)) {
 		uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
 		    td->td_proc->p_pid, td->td_name, cs);
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 
 	if ((ucp->uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
 		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
 		if (xfpustate_len > cpu_max_ext_state_size -
 		    sizeof(struct savefpu)) {
 			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
 			    td->td_proc->p_pid, td->td_name, xfpustate_len);
 			return (EINVAL);
 		}
-		xfpustate = __builtin_alloca(xfpustate_len);
+		xfpustate = (char *)td->td_md.md_fpu_scratch;
 		error = copyin(PTRIN(ucp->uc_mcontext.mc_xfpustate),
 		    xfpustate, xfpustate_len);
 		if (error != 0) {
 			uprintf(
 	"pid %d (%s): sigreturn copying xfpustate failed\n",
 			    td->td_proc->p_pid, td->td_name);
 			return (error);
 		}
 	} else {
 		xfpustate = NULL;
 		xfpustate_len = 0;
 	}
 	ret = ia32_set_fpcontext(td, &ucp->uc_mcontext, xfpustate,
 	    xfpustate_len);
 	if (ret != 0) {
 		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
 		    td->td_proc->p_pid, td->td_name, ret);
 		return (ret);
 	}
 
 	regs->tf_rdi = ucp->uc_mcontext.mc_edi;
 	regs->tf_rsi = ucp->uc_mcontext.mc_esi;
 	regs->tf_rbp = ucp->uc_mcontext.mc_ebp;
 	regs->tf_rbx = ucp->uc_mcontext.mc_ebx;
 	regs->tf_rdx = ucp->uc_mcontext.mc_edx;
 	regs->tf_rcx = ucp->uc_mcontext.mc_ecx;
 	regs->tf_rax = ucp->uc_mcontext.mc_eax;
 	regs->tf_trapno = ucp->uc_mcontext.mc_trapno;
 	regs->tf_err = ucp->uc_mcontext.mc_err;
 	regs->tf_rip = ucp->uc_mcontext.mc_eip;
 	regs->tf_cs = cs;
 	regs->tf_rflags = ucp->uc_mcontext.mc_eflags;
 	regs->tf_rsp = ucp->uc_mcontext.mc_esp;
 	regs->tf_ss = ucp->uc_mcontext.mc_ss;
 	regs->tf_ds = ucp->uc_mcontext.mc_ds;
 	regs->tf_es = ucp->uc_mcontext.mc_es;
 	regs->tf_fs = ucp->uc_mcontext.mc_fs;
 	regs->tf_gs = ucp->uc_mcontext.mc_gs;
 	regs->tf_flags = TF_HASSEGS;
 
 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 	return (EJUSTRETURN);
 }
 
 /*
  * Clear registers on exec
  *
  * Resets the machine state of td for a newly exec'ed 32-bit image:
  *  - frees the process LDT, if one is installed;
  *  - zeroes the %fs/%gs bases in the pcb and selects the i386 initial
  *    FPU control word;
  *  - zeroes the trapframe, keeping only PSL_T from the old %rflags, and
  *    then loads the entry point (imgp->entry_addr), the initial stack
  *    pointer (stack), the 32-bit user code selector and the user data
  *    selectors;
  *  - passes imgp->ps_strings to the new image in %rbx;
  *  - clears the debug registers and drops the FPU state;
  *  - flags the pcb as 32-bit and requests a full iret.
  */
 void
 ia32_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
 {
 	struct trapframe *regs;
 	struct pcb *pcb;
 	register_t saved_rflags;
 
 	regs = td->td_frame;
 	pcb = td->td_pcb;
 
 	if (td->td_proc->p_md.md_ldt != NULL)
 		user_ldt_free(td);
 #ifdef COMPAT_43
 	setup_lcall_gate();
 #endif
 
 	pcb->pcb_fsbase = 0;
 	pcb->pcb_gsbase = 0;
 	pcb->pcb_initial_fpucw = __INITIAL_FPUCW_I386__;
 
 	/* Only the trace flag survives the trapframe reset below. */
 	saved_rflags = regs->tf_rflags & PSL_T;
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_rip = imgp->entry_addr;
 	regs->tf_rsp = stack;
 	regs->tf_rflags = PSL_USER | saved_rflags;
 	regs->tf_ss = _udatasel;
 	regs->tf_cs = _ucode32sel;
 	regs->tf_rbx = (register_t)imgp->ps_strings;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _ufssel;
 	regs->tf_gs = _ugssel;
 	regs->tf_flags = TF_HASSEGS;
 
 	x86_clear_dbregs(pcb);
 
 	fpstate_drop(td);
 
 	/* Return via doreti so that we can change to a different %cs */
 	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
 }
diff --git a/sys/amd64/include/proc.h b/sys/amd64/include/proc.h
index 0f8cf50e326d..bd07f70f8d44 100644
--- a/sys/amd64/include/proc.h
+++ b/sys/amd64/include/proc.h
@@ -1,125 +1,127 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)proc.h	7.1 (Berkeley) 5/15/91
  * $FreeBSD$
  */
 
 #ifndef _MACHINE_PROC_H_
 #define	_MACHINE_PROC_H_
 
 #include <sys/queue.h>
 #include <machine/pcb.h>
 #include <machine/segments.h>
 
 /*
  * List of locks
  *	c  - proc lock
  *	k  - only accessed by curthread
  *	pp - pmap.c:invl_gen_mtx
  */
 
 /*
  * Per-process LDT record; struct mdproc points to it through md_ldt.
  */
 struct proc_ldt {
 	caddr_t ldt_base;	/* NOTE(review): presumably the LDT memory */
 	int     ldt_refcnt;	/* NOTE(review): presumably a share count */
 };
 
 #define PMAP_INVL_GEN_NEXT_INVALID	0x1ULL
 /*
  * Invalidation generation record, embedded in struct mdthread as
  * md_invl_gen.  The union overlays a list linkage with a next
  * pointer/saved priority pair; the structure is 16-byte aligned.
  */
 struct pmap_invl_gen {
 	u_long gen;			/* (k) */
 	union {
 		LIST_ENTRY(pmap_invl_gen) link;	/* (pp) */
 		struct {
 			struct pmap_invl_gen *next;
 			u_char saved_pri;
 		};
 	};
 } __aligned(16);
 
 /*
  * Machine-dependent part of the proc structure for AMD64.
  */
 struct mdthread {
 	int	md_spinlock_count;	/* (k) */
 	register_t md_saved_flags;	/* (k) */
 	register_t md_spurflt_addr;	/* (k) Spurious page fault address. */
 	struct pmap_invl_gen md_invl_gen;
 	register_t md_efirt_tmp;	/* (k) */
 	int	md_efirt_dis_pf;	/* (k) */
 	struct pcb md_pcb;
 	vm_offset_t md_stack_base;
+	struct savefpu *md_usr_fpu_save; /* NOTE(review): users not visible here */
+	struct savefpu *md_fpu_scratch;	/* Staging buffer for extended FPU state in the ia32 signal code; NOTE(review): allocation and locking not visible here */
 };
 
 /*
  * Machine-dependent per-process state.
  */
 struct mdproc {
 	struct proc_ldt *md_ldt;	/* (t) per-process ldt */
 	struct system_segment_descriptor md_ldt_sd;
 	u_int md_flags;			/* (c) md process flags P_MD */
 };
 
 #define	P_MD_KPTI		0x00000001	/* Enable KPTI on exec */
 #define	P_MD_LA48		0x00000002	/* Request LA48 after exec */
 #define	P_MD_LA57		0x00000004	/* Request LA57 after exec */
 
 #define	KINFO_PROC_SIZE 1088
 #define	KINFO_PROC32_SIZE 768
 
 /*
  * Description of one system call request.
  * NOTE(review): the code that fills and consumes this structure is not
  * visible here; the field notes below are inferred from names only.
  */
 struct syscall_args {
 	u_int code;		/* presumably the syscall number in effect */
 	u_int original_code;	/* presumably the number as first requested */
 	struct sysent *callp;	/* presumably the matching sysent entry */
 	register_t args[8];	/* room for up to eight argument words */
 };
 
 #ifdef	_KERNEL
 
 /*
  * Get the current kernel thread stack usage.
  *
  * (total) receives the stack size in bytes (td_kstack_pages * PAGE_SIZE).
  * (used) receives the distance from the address of a local variable in
  * the expansion up to the end of the stack (td_kstack + total).
  * The expansion declares its own local named td inside the do/while
  * block, which hides any caller variable of that name there.
  */
 #define GET_STACK_USAGE(total, used) do {				\
 	struct thread	*td = curthread;				\
 	(total) = td->td_kstack_pages * PAGE_SIZE;			\
 	(used) = (char *)td->td_kstack +				\
 	    td->td_kstack_pages * PAGE_SIZE -				\
 	    (char *)&td;						\
 } while (0)
 
 struct proc_ldt *user_ldt_alloc(struct proc *, int);
 void user_ldt_free(struct thread *);
 struct sysarch_args;
 int sysarch_ldt(struct thread *td, struct sysarch_args *uap, int uap_space);
 int amd64_set_ldt_data(struct thread *td, int start, int num,
     struct user_segment_descriptor *descs);
 
 extern struct mtx dt_lock;
 extern int max_ldt_segment;
 
 #define	NARGREGS	6
 
 #endif  /* _KERNEL */
 
 #endif /* !_MACHINE_PROC_H_ */
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
index 65c5cc65c87e..62f939406374 100644
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -1,1768 +1,1768 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
  *  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice(s), this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified other than the possible
  *    addition of one or more copyright notices.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice(s), this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  */
 
 #include "opt_witness.h"
 #include "opt_hwpmc_hooks.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/msan.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/bitstring.h>
 #include <sys/epoch.h>
 #include <sys/rangelock.h>
 #include <sys/resourcevar.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/selinfo.h>
 #include <sys/syscallsubr.h>
 #include <sys/dtrace_bsd.h>
 #include <sys/sysent.h>
 #include <sys/turnstile.h>
 #include <sys/taskqueue.h>
 #include <sys/ktr.h>
 #include <sys/rwlock.h>
 #include <sys/umtxvar.h>
 #include <sys/vmmeter.h>
 #include <sys/cpuset.h>
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 #include <sys/priv.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/pmap.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 #include <vm/vm_phys.h>
 #include <sys/eventhandler.h>
 
 /*
  * Asserts below verify the stability of struct thread and struct proc
  * layout, as exposed by KBI to modules.  On head, the KBI is allowed
  * to drift, but any change to the structures must be accompanied by
  * an update of the asserts.
  *
  * On the stable branches after KBI freeze, conditions must not be
  * violated.  Typically new fields are moved to the end of the
  * structures.
  */
 #ifdef __amd64__
 _Static_assert(offsetof(struct thread, td_flags) == 0x108,
     "struct thread KBI td_flags");
 _Static_assert(offsetof(struct thread, td_pflags) == 0x110,
     "struct thread KBI td_pflags");
 _Static_assert(offsetof(struct thread, td_frame) == 0x4a8,
     "struct thread KBI td_frame");
 /*
  * NOTE(review): td_emuldata is the only offset that moves, and it moves by
  * 0x10, which matches the two pointers added to the amd64 struct mdthread.
  * Confirm that no other struct thread member changed size.
  */
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x6c0,
     "struct thread KBI td_emuldata");
 _Static_assert(offsetof(struct proc, p_flag) == 0xb8,
     "struct proc KBI p_flag");
 _Static_assert(offsetof(struct proc, p_pid) == 0xc4,
     "struct proc KBI p_pid");
 _Static_assert(offsetof(struct proc, p_filemon) == 0x3b8,
     "struct proc KBI p_filemon");
 _Static_assert(offsetof(struct proc, p_comm) == 0x3d0,
     "struct proc KBI p_comm");
 _Static_assert(offsetof(struct proc, p_emuldata) == 0x4b8,
     "struct proc KBI p_emuldata");
 #endif
 #ifdef __i386__
 _Static_assert(offsetof(struct thread, td_flags) == 0x9c,
     "struct thread KBI td_flags");
 _Static_assert(offsetof(struct thread, td_pflags) == 0xa4,
     "struct thread KBI td_pflags");
 _Static_assert(offsetof(struct thread, td_frame) == 0x308,
     "struct thread KBI td_frame");
 _Static_assert(offsetof(struct thread, td_emuldata) == 0x34c,
     "struct thread KBI td_emuldata");
 _Static_assert(offsetof(struct proc, p_flag) == 0x6c,
     "struct proc KBI p_flag");
 _Static_assert(offsetof(struct proc, p_pid) == 0x78,
     "struct proc KBI p_pid");
 _Static_assert(offsetof(struct proc, p_filemon) == 0x268,
     "struct proc KBI p_filemon");
 _Static_assert(offsetof(struct proc, p_comm) == 0x27c,
     "struct proc KBI p_comm");
 _Static_assert(offsetof(struct proc, p_emuldata) == 0x308,
     "struct proc KBI p_emuldata");
 #endif
 
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE(proc, , , lwp__exit);
 
 /*
  * thread related storage.
  */
 static uma_zone_t thread_zone;
 
 /*
  * Per-memory-domain thread bookkeeping: one cache-line aligned entry per
  * domain, MAXMEMDOM entries in total.
  * NOTE(review): the code that uses these fields is not visible here; the
  * names suggest a list of zombie threads and the tick of the last reap.
  */
 struct thread_domain_data {
 	struct thread	*tdd_zombies;
 	int		tdd_reapticks;
 } __aligned(CACHE_LINE_SIZE);
 
 static struct thread_domain_data thread_domain_data[MAXMEMDOM];
 
 static struct task	thread_reap_task;
 static struct callout  	thread_reap_callout;
 
 static void thread_zombie(struct thread *);
 static void thread_reap(void);
 static void thread_reap_all(void);
 static void thread_reap_task_cb(void *, int);
 static void thread_reap_callout_cb(void *);
 static int thread_unsuspend_one(struct thread *td, struct proc *p,
     bool boundary);
 static void thread_free_batched(struct thread *td);
 
 /*
  * Thread ID allocator state.  tid_lock is held around every tid_bitmap
  * access in tid_alloc() and tid_free*(); a set bit marks an ID in use.
  */
 static __exclusive_cache_line struct mtx tid_lock;
 static bitstr_t *tid_bitmap;
 
 static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash");
 
 /* Upper bound enforced by thread_count_inc_try(); read-only tunable. */
 static int maxthread;
 SYSCTL_INT(_kern, OID_AUTO, maxthread, CTLFLAG_RDTUN,
     &maxthread, 0, "Maximum number of threads");
 
 /* Current thread count; only modified with atomic operations. */
 static __exclusive_cache_line int nthreads;
 
 /*
  * Thread hash table and its array of locks; the bucket and the lock for
  * a thread are both selected by masking its tid.
  */
 static LIST_HEAD(tidhashhead, thread) *tidhashtbl;
 static u_long	tidhash;
 static u_long	tidhashlock;
 static struct	rwlock *tidhashtbl_lock;
 #define	TIDHASH(tid)		(&tidhashtbl[(tid) & tidhash])
 #define	TIDHASHLOCK(tid)	(&tidhashtbl_lock[(tid) & tidhashlock])
 
 EVENTHANDLER_LIST_DEFINE(thread_ctor);
 EVENTHANDLER_LIST_DEFINE(thread_dtor);
 EVENTHANDLER_LIST_DEFINE(thread_init);
 EVENTHANDLER_LIST_DEFINE(thread_fini);
 
 static bool
 thread_count_inc_try(void)
 {
 	int nthreads_new;
 
 	nthreads_new = atomic_fetchadd_int(&nthreads, 1) + 1;
 	if (nthreads_new >= maxthread - 100) {
 		if (priv_check_cred(curthread->td_ucred, PRIV_MAXPROC) != 0 ||
 		    nthreads_new >= maxthread) {
 			atomic_subtract_int(&nthreads, 1);
 			return (false);
 		}
 	}
 	return (true);
 }
 
 static bool
 thread_count_inc(void)
 {
 	static struct timeval lastfail;
 	static int curfail;
 
 	thread_reap();
 	if (thread_count_inc_try()) {
 		return (true);
 	}
 
 	thread_reap_all();
 	if (thread_count_inc_try()) {
 		return (true);
 	}
 
 	if (ppsratecheck(&lastfail, &curfail, 1)) {
 		printf("maxthread limit exceeded by uid %u "
 		    "(pid %d); consider increasing kern.maxthread\n",
 		    curthread->td_ucred->cr_ruid, curproc->p_pid);
 	}
 	return (false);
 }
 
 /*
  * Atomically subtract n from the global thread count.
  */
 static void
 thread_count_sub(int n)
 {
 
 	atomic_subtract_int(&nthreads, n);
 }
 
 /*
  * Atomically drop the global thread count by one.
  */
 static void
 thread_count_dec(void)
 {
 
 	thread_count_sub(1);
 }
 
 /*
  * Allocate a thread ID.  The search for a clear bit in tid_bitmap starts
  * just past the previously allocated slot and wraps to the beginning once
  * if nothing is free above it.  The returned ID is the bitmap index
  * offset by NO_PID.
  */
 static lwpid_t
 tid_alloc(void)
 {
 	static lwpid_t next_hint;
 	lwpid_t slot;
 
 	mtx_lock(&tid_lock);
 	/*
 	 * The bitmap is always large enough for maxthread IDs, so a caller
 	 * that got this far is guaranteed to find a free one.
 	 */
 	next_hint = (next_hint < maxthread) ? next_hint : 0;
 	bit_ffc_at(tid_bitmap, next_hint, maxthread, &slot);
 	if (slot == -1) {
 		/* Nothing free above the hint: rescan from the start. */
 		KASSERT(next_hint != 0, ("unexpectedly ran out of IDs"));
 		next_hint = 0;
 		bit_ffc_at(tid_bitmap, next_hint, maxthread, &slot);
 		KASSERT(slot != -1, ("unexpectedly ran out of IDs"));
 	}
 	bit_set(tid_bitmap, slot);
 	next_hint = slot + 1;
 	mtx_unlock(&tid_lock);
 	return (slot + NO_PID);
 }
 
 /*
  * Release thread ID rtid back to the bitmap.  The caller must hold
  * tid_lock.  The ID must be at least NO_PID and currently allocated.
  */
 static void
 tid_free_locked(lwpid_t rtid)
 {
 	lwpid_t slot;
 
 	mtx_assert(&tid_lock, MA_OWNED);
 	KASSERT(rtid >= NO_PID,
 	    ("%s: invalid tid %d\n", __func__, rtid));
 
 	/* Thread IDs are bitmap indices offset by NO_PID. */
 	slot = rtid - NO_PID;
 	KASSERT(bit_test(tid_bitmap, slot) != 0,
 	    ("thread ID %d not allocated\n", rtid));
 	bit_clear(tid_bitmap, slot);
 }
 
 /*
  * Release a single thread ID, taking tid_lock around the bitmap update.
  */
 static void
 tid_free(lwpid_t rtid)
 {
 
 	mtx_lock(&tid_lock);
 	tid_free_locked(rtid);
 	mtx_unlock(&tid_lock);
 }
 
 /*
  * Release the n thread IDs stored in batch[] under a single acquisition
  * of tid_lock.
  */
 static void
 tid_free_batch(lwpid_t *batch, int n)
 {
 	int idx;
 
 	idx = 0;
 	mtx_lock(&tid_lock);
 	while (idx < n) {
 		tid_free_locked(batch[idx]);
 		idx++;
 	}
 	mtx_unlock(&tid_lock);
 }
 
 /*
  * Batching for thread reaping.
  */
 /* Fixed-size buffer of thread IDs waiting to be freed together. */
 struct tidbatch {
 	lwpid_t tab[16];	/* collected thread IDs */
 	int n;			/* number of valid entries in tab[] */
 };
 
 /*
  * Start an empty batch.
  */
 static void
 tidbatch_prep(struct tidbatch *tb)
 {
 
 	tb->n = 0;
 }
 
 /*
  * Append td's thread ID to the batch.  The batch must have room left;
  * tidbatch_process() is what empties a full batch.
  */
 static void
 tidbatch_add(struct tidbatch *tb, struct thread *td)
 {
 
 	KASSERT(tb->n < nitems(tb->tab),
 	    ("%s: count too high %d", __func__, tb->n));
 	tb->tab[tb->n++] = td->td_tid;
 }
 
 /*
  * Flush the batch if, and only if, it is completely full; a partially
  * filled batch is left untouched.
  */
 static void
 tidbatch_process(struct tidbatch *tb)
 {
 
 	KASSERT(tb->n <= nitems(tb->tab),
 	    ("%s: count too high %d", __func__, tb->n));
 	if (tb->n != nitems(tb->tab))
 		return;
 	tid_free_batch(tb->tab, tb->n);
 	tb->n = 0;
 }
 
 /*
  * Free whatever IDs are still pending in the batch.  The count is not
  * reset afterwards.
  */
 static void
 tidbatch_final(struct tidbatch *tb)
 {
 
 	KASSERT(tb->n <= nitems(tb->tab),
 	    ("%s: count too high %d", __func__, tb->n));
 	if (tb->n == 0)
 		return;
 	tid_free_batch(tb->tab, tb->n);
 }
 
 /*
  * Prepare a thread for use.
  *
  * mem points to the struct thread being set up; size, arg and flags are
  * not used.  The thread is marked inactive and not on any CPU, its
  * critical nesting and lent priority are reset, and the audit, DTrace
  * (when configured) and umtx per-thread state is set up.  Always returns 0.
  * NOTE(review): the signature matches an allocator constructor callback;
  * the registration is not visible here.
  */
 static int
 thread_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct thread	*td;
 
 	td = (struct thread *)mem;
 	TD_SET_STATE(td, TDS_INACTIVE);
 	td->td_lastcpu = td->td_oncpu = NOCPU;
 
 	/*
 	 * Note that td_critnest begins life as 1 because the thread is not
 	 * running and is thereby implicitly waiting to be on the receiving
 	 * end of a context switch.
 	 */
 	td->td_critnest = 1;
 	td->td_lend_user_pri = PRI_MAX;
 #ifdef AUDIT
 	audit_thread_alloc(td);
 #endif
 #ifdef KDTRACE_HOOKS
 	kdtrace_thread_ctor(td);
 #endif
 	umtx_thread_alloc(td);
 	/* A thread handed out for use must not carry select state. */
 	MPASS(td->td_sel == NULL);
 	return (0);
 }
 
 /*
  * Reclaim a thread after use.
  *
  * mem points to the struct thread being released; size and arg are not
  * used.  With INVARIANTS the thread must be in the TDS_INACTIVE state,
  * any other state panics.  The audit, DTrace, OSD, softdep and select
  * per-thread state is then torn down.
  */
 static void
 thread_dtor(void *mem, int size, void *arg)
 {
 	struct thread *td;
 
 	td = (struct thread *)mem;
 
 #ifdef INVARIANTS
 	/* Verify that this thread is in a safe state to free. */
 	switch (TD_GET_STATE(td)) {
 	case TDS_INHIBITED:
 	case TDS_RUNNING:
 	case TDS_CAN_RUN:
 	case TDS_RUNQ:
 		/*
 		 * We must never unlink a thread that is in one of
 		 * these states, because it is currently active.
 		 */
 		panic("bad state for thread unlinking");
 		/* NOTREACHED */
 	case TDS_INACTIVE:
 		break;
 	default:
 		panic("bad thread state");
 		/* NOTREACHED */
 	}
 #endif
 #ifdef AUDIT
 	audit_thread_free(td);
 #endif
 #ifdef KDTRACE_HOOKS
 	kdtrace_thread_dtor(td);
 #endif
 	/* Free all OSD associated to this thread. */
 	osd_thread_exit(td);
 	td_softdep_cleanup(td);
 	MPASS(td->td_su == NULL);
 	seltdfini(td);
 }
 
 /*
  * Initialize type-stable parts of a thread (when newly created).
  *
  * Records the allocation domain, allocates the sleep queue and
  * turnstile, runs the thread_init event handlers and umtx_thread_init(),
  * and clears td_rlqe, td_kstack and td_sel.  Always returns 0.
  */
 static int
 thread_init(void *mem, int size, int flags)
 {
 	struct thread *newtd = mem;
 
 	newtd->td_allocdomain = vm_phys_domain(vtophys(newtd));
 	newtd->td_sleepqueue = sleepq_alloc();
 	newtd->td_turnstile = turnstile_alloc();
 	newtd->td_rlqe = NULL;
 	EVENTHANDLER_DIRECT_INVOKE(thread_init, newtd);
 	umtx_thread_init(newtd);
 	newtd->td_kstack = 0;
 	newtd->td_sel = NULL;
 	return (0);
 }
 
 /*
  * Tear down type-stable parts of a thread (just before being discarded).
  *
  * Runs the thread_fini event handlers, then frees the rangelock entry,
  * turnstile and sleep queue and calls umtx_thread_fini().
  */
 static void
 thread_fini(void *mem, int size)
 {
 	struct thread *oldtd = mem;
 
 	EVENTHANDLER_DIRECT_INVOKE(thread_fini, oldtd);
 	rlqentry_free(oldtd->td_rlqe);
 	turnstile_free(oldtd->td_turnstile);
 	sleepq_free(oldtd->td_sleepqueue);
 	umtx_thread_fini(oldtd);
 	MPASS(oldtd->td_sel == NULL);
 }
 
 /*
  * Link up all the structures of a newly created process together with
  * its initial thread.  Unlike proc_linkup(), this variant first
  * initializes the process thread list.
  *
  * Called from:
  * {arch}/{arch}/machdep.c   {arch}_init(), init386() etc.
  * proc_dtor() (should go away)
  * proc_init()
  */
 void
 proc_linkup0(struct proc *p, struct thread *td)
 {
 	/* The list of all threads in the process starts out empty. */
 	TAILQ_INIT(&p->p_threads);
 	proc_linkup(p, td);
 }
 
 /*
  * Set up the per-process signal queue, ksiginfo and mqueue notifier
  * list, reset the thread count and link td as a thread of p.
  */
 void
 proc_linkup(struct proc *p, struct thread *td)
 {
 	sigqueue_init(&p->p_sigqueue, p);
 
 	/* XXX p_ksi may be null if ksiginfo zone is not ready */
 	p->p_ksi = ksiginfo_alloc(1);
 	if (p->p_ksi != NULL)
 		p->p_ksi->ksi_flags = KSI_EXT | KSI_INS;
 
 	LIST_INIT(&p->p_mqnotifier);
 	p->p_numthreads = 0;
 	thread_link(td, p);
 }
 
 /* Defined elsewhere; used below to derive the default maxthread. */
 extern int max_threads_per_proc;
 
 /*
  * Initialize global thread allocation resources.
  *
  * Picks a default for maxthread if it is still 0, creates the TID lock
  * and bitmap, reserves the TID of thread0, creates the THREAD UMA zone,
  * sets up the TID hash table together with its array of rwlocks, and
  * arms the periodic reap callout.
  */
 void
 threadinit(void)
 {
 	u_long i;
 	lwpid_t tid0;
 	uint32_t flags;
 
 	/*
 	 * Place an upper limit on threads which can be allocated.
 	 *
 	 * Note that other factors may make the de facto limit much lower.
 	 *
 	 * Platform limits are somewhat arbitrary but deemed "more than good
 	 * enough" for the foreseeable future.
 	 */
 	if (maxthread == 0) {
 #ifdef _LP64
 		maxthread = MIN(maxproc * max_threads_per_proc, 1000000);
 #else
 		maxthread = MIN(maxproc * max_threads_per_proc, 100000);
 #endif
 	}
 
 	mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF);
 	tid_bitmap = bit_alloc(maxthread, M_TIDHASH, M_WAITOK);
 	/*
 	 * Handle thread0.
 	 *
 	 * Account for it in the thread count and allocate its TID; the
 	 * first TID handed out must be THREAD0_TID.
 	 */
 	thread_count_inc();
 	tid0 = tid_alloc();
 	if (tid0 != THREAD0_TID)
 		panic("tid0 %d != %d\n", tid0, THREAD0_TID);
 
 	flags = UMA_ZONE_NOFREE;
 #ifdef __aarch64__
 	/*
 	 * Force thread structures to be allocated from the direct map.
 	 * Otherwise, superpage promotions and demotions may temporarily
 	 * invalidate thread structure mappings.  For most dynamically allocated
 	 * structures this is not a problem, but translation faults cannot be
 	 * handled without accessing curthread.
 	 */
 	flags |= UMA_ZONE_CONTIG;
 #endif
 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
 	    thread_ctor, thread_dtor, thread_init, thread_fini,
 	    32 - 1, flags);
 	tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash);
 	/*
 	 * tidhashlock holds the number of hash locks minus one:
 	 * (tidhash + 1) / 64 locks are created, but never fewer than one.
 	 */
 	tidhashlock = (tidhash + 1) / 64;
 	if (tidhashlock > 0)
 		tidhashlock--;
 	tidhashtbl_lock = malloc(sizeof(*tidhashtbl_lock) * (tidhashlock + 1),
 	    M_TIDHASH, M_WAITOK | M_ZERO);
 	for (i = 0; i < tidhashlock + 1; i++)
 		rw_init(&tidhashtbl_lock[i], "tidhash");
 
 	/*
 	 * Set up the reap task and schedule the first run of the reap
 	 * callout 5 * hz ticks from now; the callout re-arms itself.
 	 */
 	TASK_INIT(&thread_reap_task, 0, thread_reap_task_cb, NULL);
 	callout_init(&thread_reap_callout, 1);
 	callout_reset(&thread_reap_callout, 5 * hz,
 	    thread_reap_callout_cb, NULL);
 }
 
 /*
  * Place an unused thread on the zombie list.
  *
  * The thread is pushed onto the head of the lock-free, singly linked
  * list of the domain it was allocated from.
  */
 void
 thread_zombie(struct thread *td)
 {
 	struct thread_domain_data *tdd;
 	struct thread *head;
 
 	tdd = &thread_domain_data[td->td_allocdomain];
 	head = atomic_load_ptr(&tdd->tdd_zombies);
 	/*
 	 * Link td in front of the head we last observed and try to
 	 * publish it.  The loop does not reload 'head' itself; it relies
 	 * on a failed fcmpset having refreshed it.
 	 */
 	do {
 		td->td_zombie = head;
 	} while (!atomic_fcmpset_rel_ptr((uintptr_t *)&tdd->tdd_zombies,
 	    (uintptr_t *)&head, (uintptr_t)td));
 }
 
 /*
  * Release a thread that has exited after cpu_throw().
  */
 void
 thread_stash(struct thread *td)
 {
 	atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1);
 	thread_zombie(td);
 }
 
 /*
  * Reap zombies from passed domain.
  */
 static void
 thread_reap_domain(struct thread_domain_data *tdd)
 {
 	struct thread *itd, *ntd;
 	struct tidbatch tidbatch;
 	struct credbatch credbatch;
 	int tdcount;
 	struct plimit *lim;
 	int limcount;
 
 	/*
 	 * Reading upfront is pessimal if followed by concurrent atomic_swap,
 	 * but most of the time the list is empty.
 	 */
 	if (tdd->tdd_zombies == NULL)
 		return;
 
 	itd = (struct thread *)atomic_swap_ptr((uintptr_t *)&tdd->tdd_zombies,
 	    (uintptr_t)NULL);
 	if (itd == NULL)
 		return;
 
 	/*
 	 * Multiple CPUs can get here, the race is fine as ticks is only
 	 * advisory.
 	 */
 	tdd->tdd_reapticks = ticks;
 
 	tidbatch_prep(&tidbatch);
 	credbatch_prep(&credbatch);
 	tdcount = 0;
 	lim = NULL;
 	limcount = 0;
 
 	while (itd != NULL) {
 		ntd = itd->td_zombie;
 		EVENTHANDLER_DIRECT_INVOKE(thread_dtor, itd);
 		tidbatch_add(&tidbatch, itd);
 		credbatch_add(&credbatch, itd);
 		MPASS(itd->td_limit != NULL);
 		if (lim != itd->td_limit) {
 			if (limcount != 0) {
 				lim_freen(lim, limcount);
 				limcount = 0;
 			}
 		}
 		lim = itd->td_limit;
 		limcount++;
 		thread_free_batched(itd);
 		tidbatch_process(&tidbatch);
 		credbatch_process(&credbatch);
 		tdcount++;
 		if (tdcount == 32) {
 			thread_count_sub(tdcount);
 			tdcount = 0;
 		}
 		itd = ntd;
 	}
 
 	tidbatch_final(&tidbatch);
 	credbatch_final(&credbatch);
 	if (tdcount != 0) {
 		thread_count_sub(tdcount);
 	}
 	MPASS(limcount != 0);
 	lim_freen(lim, limcount);
 }
 
 /*
  * Reap zombies from all domains.
  */
 static void
 thread_reap_all(void)
 {
 	struct thread_domain_data *tdd;
 	int i, domain;
 
 	domain = PCPU_GET(domain);
 	for (i = 0; i < vm_ndomains; i++) {
 		tdd = &thread_domain_data[(i + domain) % vm_ndomains];
 		thread_reap_domain(tdd);
 	}
 }
 
 /*
  * Reap zombies from local domain.
  */
 static void
 thread_reap(void)
 {
 	struct thread_domain_data *tdd;
 	int domain;
 
 	domain = PCPU_GET(domain);
 	tdd = &thread_domain_data[domain];
 
 	thread_reap_domain(tdd);
 }
 
 /*
  * Taskqueue handler: reap the zombies of every domain.  Both arguments
  * are unused.
  */
 static void
 thread_reap_task_cb(void *arg __unused, int pending __unused)
 {
 	thread_reap_all();
 }
 
 /*
  * Periodic callout handler.
  *
  * If any domain has zombies pending and was last reaped more than
  * 5 * hz ticks ago, the reap task is queued on taskqueue_thread.  The
  * callout then re-arms itself for another 5 * hz ticks.
  */
 static void
 thread_reap_callout_cb(void *arg __unused)
 {
 	struct thread_domain_data *tdd;
 	int dom, last, now;
 	bool stale;
 
 	stale = false;
 	now = atomic_load_int(&ticks);
 	for (dom = 0; dom < vm_ndomains && !stale; dom++) {
 		tdd = &thread_domain_data[dom];
 		last = tdd->tdd_reapticks;
 		stale = tdd->tdd_zombies != NULL &&
 		    (u_int)(now - last) > 5 * hz;
 	}
 
 	if (stale)
 		taskqueue_enqueue(taskqueue_thread, &thread_reap_task);
 	callout_reset(&thread_reap_callout, 5 * hz,
 	    thread_reap_callout_cb, NULL);
 }
 
 /*
  * Calling this function guarantees that any thread that exited before
  * the call is reaped when the function returns.  By 'exited' we mean
  * a thread removed from the process linkage with thread_unlink().
  * Practically this means that caller must lock/unlock corresponding
  * process lock before the call, to synchronize with thread_exit().
  */
 void
 thread_reap_barrier(void)
 {
 	struct task *reap;
 
 	/*
 	 * Step one: context switch on each CPU so that all PCPU
 	 * pc_deadthreads are moved to the zombie list.
 	 */
 	quiesce_all_cpus("", PDROP);
 
 	/*
 	 * Step two: run a one-off reap task on the same taskqueue thread
 	 * that does the normal reaping, which serializes with it, and wait
 	 * for that task to finish.
 	 */
 	reap = malloc(sizeof(*reap), M_TEMP, M_WAITOK);
 	TASK_INIT(reap, 0, thread_reap_task_cb, reap);
 	taskqueue_enqueue(taskqueue_thread, reap);
 	taskqueue_drain(taskqueue_thread, reap);
 	free(reap, M_TEMP);
 }
 
 /*
  * Allocate a thread.
  *
  * Returns a thread with a TID and a kernel stack of 'pages' pages, or
  * NULL if the thread count could not be raised or the stack could not
  * be allocated.
  */
 struct thread *
 thread_alloc(int pages)
 {
 	struct thread *newtd;
 	lwpid_t newtid;
 
 	if (!thread_count_inc())
 		return (NULL);
 
 	newtid = tid_alloc();
 	newtd = uma_zalloc(thread_zone, M_WAITOK);
 	KASSERT(newtd->td_kstack == 0, ("thread_alloc got thread with kstack"));
 	if (vm_thread_new(newtd, pages)) {
 		newtd->td_tid = newtid;
 		bzero(&newtd->td_sa.args, sizeof(newtd->td_sa.args));
 		kmsan_thread_alloc(newtd);
 		cpu_thread_alloc(newtd);
 		EVENTHANDLER_DIRECT_INVOKE(thread_ctor, newtd);
 		return (newtd);
 	}
 
 	/* No kernel stack: undo everything acquired so far. */
 	uma_zfree(thread_zone, newtd);
 	tid_free(newtid);
 	thread_count_dec();
 	return (NULL);
 }
 
 /*
  * Give a thread that currently has no kernel stack a new one of 'pages'
  * pages.  Returns 1 on success and 0 if the stack could not be
  * allocated.
  */
 int
 thread_alloc_stack(struct thread *td, int pages)
 {
 	KASSERT(td->td_kstack == 0,
 	    ("thread_alloc_stack called on a thread with kstack"));
 	if (vm_thread_new(td, pages)) {
 		cpu_thread_alloc(td);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Deallocate a thread.
  *
  * Releases the cpuset reference, machine-dependent state and kernel
  * stack, drains the sleep callout and returns the structure to the
  * THREAD zone.  The thread ID is only invalidated here; releasing it is
  * left to the caller.
  */
 static void
 thread_free_batched(struct thread *td)
 {
 	lock_profile_thread_exit(td);
 	if (td->td_cpuset != NULL) {
 		cpuset_rel(td->td_cpuset);
 		td->td_cpuset = NULL;
 	}
 	cpu_thread_free(td);
 	if (td->td_kstack != 0)
 		vm_thread_dispose(td);
 	callout_drain(&td->td_slpcallout);
 
 	/* Freeing of the TID itself is handled by the caller. */
 	td->td_tid = -1;
 	kmsan_thread_free(td);
 	uma_zfree(thread_zone, td);
 }
 
 /*
  * Free a single thread: run the thread_dtor event handlers, deallocate
  * the thread, release its thread ID and drop the global thread count.
  */
 void
 thread_free(struct thread *td)
 {
 	lwpid_t savedtid;
 
 	EVENTHANDLER_DIRECT_INVOKE(thread_dtor, td);
 	/* thread_free_batched() overwrites td_tid, so remember it first. */
 	savedtid = td->td_tid;
 	thread_free_batched(td);
 	tid_free(savedtid);
 	thread_count_dec();
 }
 
 /*
  * Initialize newtd's credentials, resource limits and cowgen value from
  * process p.  The proc lock must be held.
  */
 void
 thread_cow_get_proc(struct thread *newtd, struct proc *p)
 {
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	newtd->td_ucred = newtd->td_realucred = crcowget(p->p_ucred);
 	newtd->td_limit = lim_hold(p->p_limit);
 	newtd->td_cowgen = p->p_cowgen;
 }
 
 /*
  * Initialize newtd's credentials, resource limits and cowgen value from
  * the existing thread td.  td must not be running with borrowed
  * credentials (td_realucred == td_ucred is asserted).
  */
 void
 thread_cow_get(struct thread *newtd, struct thread *td)
 {
 	MPASS(td->td_realucred == td->td_ucred);
 
 	newtd->td_ucred = newtd->td_realucred = crcowget(td->td_realucred);
 	newtd->td_limit = lim_hold(td->td_limit);
 	newtd->td_cowgen = td->td_cowgen;
 }
 
 /*
  * Drop the credential and resource limit references held by td, if any.
  */
 void
 thread_cow_free(struct thread *td)
 {
 	if (td->td_realucred != NULL) {
 		crcowfree(td);
 	}
 	if (td->td_limit != NULL) {
 		lim_free(td->td_limit);
 	}
 }
 
 /*
  * Bring td's credentials, resource limits and cowgen value in line with
  * its process.  The replaced references are dropped only after the proc
  * lock has been released.
  */
 void
 thread_cow_update(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *stalecred;
 	struct plimit *stalelim = NULL;
 
 	PROC_LOCK(p);
 	stalecred = crcowsync();
 	if (td->td_limit != p->p_limit) {
 		stalelim = td->td_limit;
 		td->td_limit = lim_hold(p->p_limit);
 	}
 	td->td_cowgen = p->p_cowgen;
 	PROC_UNLOCK(p);
 
 	if (stalecred != NULL) {
 		crfree(stalecred);
 	}
 	if (stalelim != NULL) {
 		lim_free(stalelim);
 	}
 }
 
 /*
  * Discard the current thread and exit from its context.
  *
  * Must be called with the proc lock and the proc spin lock held and
  * without Giant; all three conditions are asserted.  Both proc locks are
  * released along the way.  The function does not return: it ends in
  * sched_throw().
  *
  * Because we can't free a thread while we're operating under its context,
  * push the current thread into our CPU's deadthread holder. This means
  * we needn't worry about someone else grabbing our context before we
  * do a cpu_throw().
  */
 void
 thread_exit(void)
 {
 	uint64_t runtime, new_switchtime;
 	struct thread *td;
 	struct thread *td2;
 	struct proc *p;
 	int wakeup_swapper;
 
 	td = curthread;
 	p = td->td_proc;
 
 	/*
 	 * Validate p before anything dereferences it; the lock assertions
 	 * below already do.
 	 */
 	KASSERT(p != NULL, ("thread exiting without a process"));
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
 	    (long)p->p_pid, td->td_name);
 	SDT_PROBE0(proc, , , lwp__exit);
 	KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending"));
 	MPASS(td->td_realucred == td->td_ucred);
 
 	/*
 	 * drop FPU & debug register state storage, or any other
 	 * architecture specific resources that
 	 * would not be on a new untouched process.
 	 */
 	cpu_thread_exit(td);
 
 	/*
 	 * The last thread is left attached to the process
 	 * So that the whole bundle gets recycled. Skip
 	 * all this stuff if we never had threads.
 	 * EXIT clears all sign of other threads when
 	 * it goes to single threading, so the last thread always
 	 * takes the short path.
 	 */
 	if (p->p_flag & P_HADTHREADS) {
 		if (p->p_numthreads > 1) {
 			/*
 			 * Count ourselves as exiting before unlinking;
 			 * thread_stash() undoes this once the thread is
 			 * handed to the zombie list.
 			 */
 			atomic_add_int(&td->td_proc->p_exitthreads, 1);
 			thread_unlink(td);
 			td2 = FIRST_THREAD_IN_PROC(p);
 			sched_exit_thread(td2, td);
 
 			/*
 			 * The test below is NOT true if we are the
 			 * sole exiting thread. P_STOPPED_SINGLE is unset
 			 * in exit1() after it is the only survivor.
 			 */
 			if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 				if (p->p_numthreads == p->p_suspcount) {
 					thread_lock(p->p_singlethread);
 					wakeup_swapper = thread_unsuspend_one(
 						p->p_singlethread, p, false);
 					if (wakeup_swapper)
 						kick_proc0();
 				}
 			}
 
 			PCPU_SET(deadthread, td);
 		} else {
 			/*
 			 * The last thread is exiting.. but not through exit()
 			 */
 			panic("thread_exit: Last thread exiting on its own");
 		}
 	}
 #ifdef	HWPMC_HOOKS
 	/*
 	 * If this thread is part of a process that is being tracked by hwpmc(4),
 	 * inform the module of the thread's impending exit.
 	 */
 	if (PMC_PROC_IS_USING_PMCS(td->td_proc)) {
 		PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 		PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT, NULL);
 	} else if (PMC_SYSTEM_SAMPLING_ACTIVE())
 		PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_THR_EXIT_LOG, NULL);
 #endif
 	/*
 	 * Trade the proc lock and proc spin lock for the stat lock and
 	 * the thread lock.
 	 */
 	PROC_UNLOCK(p);
 	PROC_STATLOCK(p);
 	thread_lock(td);
 	PROC_SUNLOCK(p);
 
 	/* Do the same timestamp bookkeeping that mi_switch() would do. */
 	new_switchtime = cpu_ticks();
 	runtime = new_switchtime - PCPU_GET(switchtime);
 	td->td_runtime += runtime;
 	td->td_incruntime += runtime;
 	PCPU_SET(switchtime, new_switchtime);
 	PCPU_SET(switchticks, ticks);
 	VM_CNT_INC(v_swtch);
 
 	/* Save our resource usage in our process. */
 	td->td_ru.ru_nvcsw++;
 	ruxagg_locked(p, td);
 	rucollect(&p->p_ru, &td->td_ru);
 	PROC_STATUNLOCK(p);
 
 	TD_SET_STATE(td, TDS_INACTIVE);
 #ifdef WITNESS
 	witness_thread_exit(td);
 #endif
 	CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td);
 	sched_throw(td);
 	panic("I'm a teapot!");
 	/* NOTREACHED */
 }
 
 /*
  * Do any thread specific cleanups that may be needed in wait().
  * Called with Giant, proc and schedlock not held.
  *
  * The process must be down to a single thread with no threads still
  * exiting; both conditions are asserted.
  */
 void
 thread_wait(struct proc *p)
 {
 	struct thread *last;
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 	KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()"));
 	KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking"));
 
 	last = FIRST_THREAD_IN_PROC(p);
 
 	/*
 	 * Taking and dropping the thread lock makes us spin until the
 	 * last thread has exited cpu_throw().
 	 */
 	thread_lock(last);
 	thread_unlock(last);
 
 	lock_profile_thread_exit(last);
 	cpuset_rel(last->td_cpuset);
 	last->td_cpuset = NULL;
 	cpu_thread_clean(last);
 	thread_cow_free(last);
 	callout_drain(&last->td_slpcallout);
 
 	/* Check for zombie threads etc. */
 	thread_reap();
 }
 
 /*
  * Link a thread to a process.
  *
  * Initializes everything the thread needs in order to be used by the
  * process, appends it to the process thread list and bumps the thread
  * count.
  */
 void
 thread_link(struct thread *td, struct proc *p)
 {
 	/*
 	 * XXX This can't be enabled because it's called for proc0 before
 	 * its lock has been created.
 	 * PROC_LOCK_ASSERT(p, MA_OWNED);
 	 */
 	TD_SET_STATE(td, TDS_INACTIVE);
 	td->td_flags = TDF_INMEM;
 	td->td_proc = p;
 
 	LIST_INIT(&td->td_contested);
 	LIST_INIT(&td->td_lprof[0]);
 	LIST_INIT(&td->td_lprof[1]);
 #ifdef EPOCH_TRACE
 	SLIST_INIT(&td->td_epochs);
 #endif
 	sigqueue_init(&td->td_sigqueue, p);
 	callout_init(&td->td_slpcallout, 1);
 
 	TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist);
 	p->p_numthreads++;
 }
 
 /*
  * Remove a thread from its process's thread list and decrement the
  * thread count.  The proc lock must be held.
  *
  * Called from:
  *  thread_exit()
  */
 void
 thread_unlink(struct thread *td)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 #ifdef EPOCH_TRACE
 	MPASS(SLIST_EMPTY(&td->td_epochs));
 #endif
 
 	TAILQ_REMOVE(&p->p_threads, td, td_plist);
 	p->p_numthreads--;
 	/*
 	 * A few other things could be cleared here, but the links to the
 	 * proc must NOT be cleared.
 	 */
 }
 
 /*
  * Compute how many threads of p still count against a single-threading
  * request of the given mode:
  *   SINGLE_EXIT                     all threads
  *   SINGLE_BOUNDARY                 threads not yet stopped at the boundary
  *   SINGLE_NO_EXIT, SINGLE_ALLPROC  threads not yet suspended
  * Any other mode panics.  The proc lock and proc spin lock must be held.
  */
 static int
 calc_remaining(struct proc *p, int mode)
 {
 	int remaining;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	switch (mode) {
 	case SINGLE_EXIT:
 		remaining = p->p_numthreads;
 		break;
 	case SINGLE_BOUNDARY:
 		remaining = p->p_numthreads - p->p_boundary_count;
 		break;
 	case SINGLE_NO_EXIT:
 	case SINGLE_ALLPROC:
 		remaining = p->p_numthreads - p->p_suspcount;
 		break;
 	default:
 		panic("calc_remaining: wrong mode %d", mode);
 	}
 	return (remaining);
 }
 
 /*
  * Return the value calc_remaining() has to reach for single-threading in
  * the given mode to be complete: 0 for SINGLE_ALLPROC, 1 otherwise.
  */
 static int
 remain_for_mode(int mode)
 {
 	if (mode == SINGLE_ALLPROC)
 		return (0);
 	return (1);
 }
 
 /*
  * Push an inhibited thread td2 of process p toward the state required by
  * a single-threading request of the given mode: resume it if it is
  * suspended in a way that does not satisfy the mode, or abort its
  * interruptible sleep (or, for SINGLE_ALLPROC, suspend it in place).
  *
  * Called with the proc lock, the proc spin lock and td2's thread lock
  * held (all asserted).  The thread lock is not held on return: it is
  * either dropped explicitly at the end or, on the sleepq_abort() return
  * paths, presumably released by sleepq_abort() -- NOTE(review): confirm.
  *
  * Returns the OR of the results of thread_unsuspend_one() and
  * sleepq_abort(); thread_single() calls kick_proc0() when it is non-zero.
  */
 static int
 weed_inhib(int mode, struct thread *td2, struct proc *p)
 {
 	int wakeup_swapper;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	THREAD_LOCK_ASSERT(td2, MA_OWNED);
 
 	wakeup_swapper = 0;
 
 	/*
 	 * Since the thread lock is dropped by the scheduler we have
 	 * to retry to check for races.
 	 */
 restart:
 	switch (mode) {
 	case SINGLE_EXIT:
 		/* Every suspended thread is resumed, boundary state included. */
 		if (TD_IS_SUSPENDED(td2)) {
 			wakeup_swapper |= thread_unsuspend_one(td2, p, true);
 			thread_lock(td2);
 			goto restart;
 		}
 		if (TD_CAN_ABORT(td2)) {
 			wakeup_swapper |= sleepq_abort(td2, EINTR);
 			return (wakeup_swapper);
 		}
 		break;
 	case SINGLE_BOUNDARY:
 	case SINGLE_NO_EXIT:
 		/* Threads already suspended at the boundary are left alone. */
 		if (TD_IS_SUSPENDED(td2) &&
 		    (td2->td_flags & TDF_BOUNDARY) == 0) {
 			wakeup_swapper |= thread_unsuspend_one(td2, p, false);
 			thread_lock(td2);
 			goto restart;
 		}
 		if (TD_CAN_ABORT(td2)) {
 			wakeup_swapper |= sleepq_abort(td2, ERESTART);
 			return (wakeup_swapper);
 		}
 		break;
 	case SINGLE_ALLPROC:
 		/*
 		 * ALLPROC suspend tries to avoid spurious EINTR for
 		 * threads sleeping interruptable, by suspending the
 		 * thread directly, similarly to sig_suspend_threads().
 		 * Since such sleep is not performed at the user
 		 * boundary, TDF_BOUNDARY flag is not set, and TDF_ALLPROCSUSP
 		 * is used to avoid immediate un-suspend.
 		 */
 		if (TD_IS_SUSPENDED(td2) && (td2->td_flags & (TDF_BOUNDARY |
 		    TDF_ALLPROCSUSP)) == 0) {
 			wakeup_swapper |= thread_unsuspend_one(td2, p, false);
 			thread_lock(td2);
 			goto restart;
 		}
 		if (TD_CAN_ABORT(td2)) {
 			if ((td2->td_flags & TDF_SBDRY) == 0) {
 				thread_suspend_one(td2);
 				td2->td_flags |= TDF_ALLPROCSUSP;
 			} else {
 				wakeup_swapper |= sleepq_abort(td2, ERESTART);
 				return (wakeup_swapper);
 			}
 		}
 		break;
 	default:
 		break;
 	}
 	thread_unlock(td2);
 	return (wakeup_swapper);
 }
 
 /*
  * Enforce single-threading.
  *
  * Returns 1 if the caller must abort (another thread is waiting to
  * exit the process or similar). Process is locked!
  * Returns 0 when you are successfully the only thread running.
  * A process has successfully single threaded in the suspend mode when
  * There are no threads in user mode. Threads in the kernel must be
  * allowed to continue until they get to the user boundary. They may even
  * copy out their return values and data before suspending. They may however be
  * accelerated in reaching the user boundary as we will wake up
  * any sleeping threads that are interruptable. (PCATCH).
  *
  * The proc lock must be held on entry (asserted) and is held again on
  * return; it may be dropped and re-taken while waiting.  The proc spin
  * lock is acquired here and released before a successful return.
  * SINGLE_ALLPROC must target a process other than curproc, every other
  * mode must target curproc (asserted).
  */
 int
 thread_single(struct proc *p, int mode)
 {
 	struct thread *td;
 	struct thread *td2;
 	int remaining, wakeup_swapper;
 
 	td = curthread;
 	KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY ||
 	    mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT,
 	    ("invalid mode %d", mode));
 	/*
 	 * If allowing non-ALLPROC singlethreading for non-curproc
 	 * callers, calc_remaining() and remain_for_mode() should be
 	 * adjusted to also account for td->td_proc != p.  For now
 	 * this is not implemented because it is not used.
 	 */
 	KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) ||
 	    (mode != SINGLE_ALLPROC && td->td_proc == p),
 	    ("mode %d proc %p curproc %p", mode, p, td->td_proc));
 	mtx_assert(&Giant, MA_NOTOWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* A process that never had threads is trivially single-threaded. */
 	if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC)
 		return (0);
 
 	/* Is someone already single threading? */
 	if (p->p_singlethread != NULL && p->p_singlethread != td)
 		return (1);
 
 	/* Record the requested mode in the process flags. */
 	if (mode == SINGLE_EXIT) {
 		p->p_flag |= P_SINGLE_EXIT;
 		p->p_flag &= ~P_SINGLE_BOUNDARY;
 	} else {
 		p->p_flag &= ~P_SINGLE_EXIT;
 		if (mode == SINGLE_BOUNDARY)
 			p->p_flag |= P_SINGLE_BOUNDARY;
 		else
 			p->p_flag &= ~P_SINGLE_BOUNDARY;
 	}
 	if (mode == SINGLE_ALLPROC)
 		p->p_flag |= P_TOTAL_STOP;
 	p->p_flag |= P_STOPPED_SINGLE;
 	PROC_SLOCK(p);
 	p->p_singlethread = td;
 	remaining = calc_remaining(p, mode);
 	while (remaining != remain_for_mode(mode)) {
 		/*
 		 * If another kind of stop is in effect as well, skip the
 		 * sweep over the other threads and suspend ourselves.
 		 */
 		if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE)
 			goto stopme;
 		wakeup_swapper = 0;
 		FOREACH_THREAD_IN_PROC(p, td2) {
 			if (td2 == td)
 				continue;
 			thread_lock(td2);
 			td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
 			if (TD_IS_INHIBITED(td2)) {
 				/* weed_inhib() disposes of the thread lock. */
 				wakeup_swapper |= weed_inhib(mode, td2, p);
 #ifdef SMP
 			} else if (TD_IS_RUNNING(td2) && td != td2) {
 				forward_signal(td2);
 				thread_unlock(td2);
 #endif
 			} else
 				thread_unlock(td2);
 		}
 		if (wakeup_swapper)
 			kick_proc0();
 		remaining = calc_remaining(p, mode);
 
 		/*
 		 * Maybe we suspended some threads.. was it enough?
 		 */
 		if (remaining == remain_for_mode(mode))
 			break;
 
 stopme:
 		/*
 		 * Wake us up when everyone else has suspended.
 		 * In the mean time we suspend as well.
 		 */
 		thread_suspend_switch(td, p);
 		remaining = calc_remaining(p, mode);
 	}
 	if (mode == SINGLE_EXIT) {
 		/*
 		 * Convert the process to an unthreaded process.  The
 		 * SINGLE_EXIT is called by exit1() or execve(), in
 		 * both cases other threads must be retired.
 		 */
 		KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads"));
 		p->p_singlethread = NULL;
 		p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS);
 
 		/*
 		 * Wait for any remaining threads to exit cpu_throw().
 		 */
 		while (p->p_exitthreads != 0) {
 			PROC_SUNLOCK(p);
 			PROC_UNLOCK(p);
 			sched_relinquish(td);
 			PROC_LOCK(p);
 			PROC_SLOCK(p);
 		}
 	} else if (mode == SINGLE_BOUNDARY) {
 		/*
 		 * Wait until all suspended threads are removed from
 		 * the processors.  The thread_suspend_check()
 		 * increments p_boundary_count while it is still
 		 * running, which makes it possible for the execve()
 		 * to destroy vmspace while our other threads are
 		 * still using the address space.
 		 *
 		 * We lock the thread, which is only allowed to
 		 * succeed after context switch code finished using
 		 * the address space.
 		 */
 		FOREACH_THREAD_IN_PROC(p, td2) {
 			if (td2 == td)
 				continue;
 			thread_lock(td2);
 			KASSERT((td2->td_flags & TDF_BOUNDARY) != 0,
 			    ("td %p not on boundary", td2));
 			KASSERT(TD_IS_SUSPENDED(td2),
 			    ("td %p is not suspended", td2));
 			thread_unlock(td2);
 		}
 	}
 	PROC_SUNLOCK(p);
 	return (0);
 }
 
 bool
 thread_suspend_check_needed(void)
 {
 	struct proc *p;
 	struct thread *td;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 &&
 	    (td->td_dbgflags & TDB_SUSPEND) != 0));
 }
 
 /*
  * Called in from locations that can safely check to see
  * whether we have to suspend or at least throttle for a
  * single-thread event (e.g. fork).
  *
  * Such locations include userret().
  * If the "return_instead" argument is non zero, the thread must be able to
  * accept 0 (caller may continue), or 1 (caller must abort) as a result.
  *
  * The 'return_instead' argument tells the function if it may do a
  * thread_exit() or suspend, or whether the caller must abort and back
  * out instead.
  *
  * If the thread that set the single_threading request has set the
  * P_SINGLE_EXIT bit in the process flags then this call will never return
  * if 'return_instead' is false, but will exit.
  *
  * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
  *---------------+--------------------+---------------------
  *       0       | returns 0          |   returns 0 or 1
  *               | when ST ends       |   immediately
  *---------------+--------------------+---------------------
  *       1       | thread exits       |   returns 1
  *               |                    |  immediately
  * 0 = thread_exit() or suspension ok,
  * other = return error instead of stopping the thread.
  *
  * Note that the "1" in the table above stands for any non-zero result:
  * the code below returns EINTR, ERESTART or TD_SBDRY_ERRNO(td) in the
  * abort cases.
  *
  * While a full suspension is under effect, even a single threading
  * thread would be suspended if it made this call (but it shouldn't).
  * This call should only be made from places where
  * thread_exit() would be safe as that may be the outcome unless
  * return_instead is set.
  *
  * The proc lock must be held on entry (asserted) and is held on every
  * return; it is dropped while the thread is suspended.
  */
 int
 thread_suspend_check(int return_instead)
 {
 	struct thread *td;
 	struct proc *p;
 	int wakeup_swapper;
 
 	td = curthread;
 	p = td->td_proc;
 	mtx_assert(&Giant, MA_NOTOWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/* Re-evaluated after every wakeup, as the reason may persist. */
 	while (thread_suspend_check_needed()) {
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 			KASSERT(p->p_singlethread != NULL,
 			    ("singlethread not set"));
 			/*
 			 * The only suspension in action is a
 			 * single-threading. Single threader need not stop.
 			 * It is safe to access p->p_singlethread unlocked
 			 * because it can only be set to our address by us.
 			 */
 			if (p->p_singlethread == td)
 				return (0);	/* Exempt from stopping. */
 		}
 		if ((p->p_flag & P_SINGLE_EXIT) && return_instead)
 			return (EINTR);
 
 		/* Should we goto user boundary if we didn't come from there? */
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
 		    (p->p_flag & P_SINGLE_BOUNDARY) && return_instead)
 			return (ERESTART);
 
 		/*
 		 * Ignore suspend requests if they are deferred.
 		 */
 		if ((td->td_flags & TDF_SBDRY) != 0) {
 			KASSERT(return_instead,
 			    ("TDF_SBDRY set for unsafe thread_suspend_check"));
 			KASSERT((td->td_flags & (TDF_SEINTR | TDF_SERESTART)) !=
 			    (TDF_SEINTR | TDF_SERESTART),
 			    ("both TDF_SEINTR and TDF_SERESTART"));
 			return (TD_SBDRY_INTR(td) ? TD_SBDRY_ERRNO(td) : 0);
 		}
 
 		/*
 		 * If the process is waiting for us to exit,
 		 * this thread should just suicide.
 		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
 		 */
 		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
 			PROC_UNLOCK(p);
 
 			/*
 			 * Allow Linux emulation layer to do some work
 			 * before thread suicide.
 			 */
 			if (__predict_false(p->p_sysent->sv_thread_detach != NULL))
 				(p->p_sysent->sv_thread_detach)(td);
 			umtx_thread_exit(td);
 			kern_thr_exit(td);
 			panic("stopped thread did not exit");
 		}
 
 		PROC_SLOCK(p);
 		thread_stopped(p);
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 			/*
 			 * We are about to become the last thread to
 			 * suspend (the "+ 1" is us), so resume the
 			 * single-threading thread.
 			 */
 			if (p->p_numthreads == p->p_suspcount + 1) {
 				thread_lock(p->p_singlethread);
 				wakeup_swapper = thread_unsuspend_one(
 				    p->p_singlethread, p, false);
 				if (wakeup_swapper)
 					kick_proc0();
 			}
 		}
 		PROC_UNLOCK(p);
 		thread_lock(td);
 		/*
 		 * When a thread suspends, it just
 		 * gets taken off all queues.
 		 */
 		thread_suspend_one(td);
 		/* Only a suspension that may really stop here counts as one at the boundary. */
 		if (return_instead == 0) {
 			p->p_boundary_count++;
 			td->td_flags |= TDF_BOUNDARY;
 		}
 		PROC_SUNLOCK(p);
 		mi_switch(SW_INVOL | SWT_SUSPEND);
 		PROC_LOCK(p);
 	}
 	return (0);
 }
 
 /*
  * Check for possible stops and suspensions while executing a
  * casueword or similar transiently failing operation.
  *
  * The sleep argument controls whether the function may handle a stop
  * request itself, or whether it should return ERESTART so that the
  * request is processed at the kernel/user boundary in ast.
  *
  * Typically, when retrying due to casueword(9) failure (rv == 1), we
  * should handle the stop requests there, with exception of cases when
  * the thread owns a kernel resource, for instance busied the umtx
  * key, or when functions return immediately if thread_check_susp()
  * returned non-zero.  On the other hand, retrying the whole lock
  * operation, we better not stop there but delegate the handling to
  * ast.
  *
  * If the request is for thread termination P_SINGLE_EXIT, we cannot
  * handle it at all, and simply return EINTR.
  */
 int
 thread_check_susp(struct thread *td, bool sleep)
 {
 	struct proc *p;
 	int error;
 
 	/*
 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
 	 * eventually break the lockstep loop.
 	 */
 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
 		return (0);
 
 	p = td->td_proc;
 	error = 0;
 	PROC_LOCK(p);
 	if (p->p_flag & P_SINGLE_EXIT) {
 		error = EINTR;
 	} else if (P_SHOULDSTOP(p) ||
 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
 		if (sleep)
 			error = thread_suspend_check(0);
 		else
 			error = ERESTART;
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Suspend thread td and switch away from it.
  *
  * Called with the proc lock and the proc spin lock of p held (both
  * asserted); td must not already be suspended.  Both locks are released
  * before the context switch and re-acquired, in the order proc lock then
  * proc spin lock, before returning.  The suspend accounting on p is done
  * only when td belongs to p.
  */
 void
 thread_suspend_switch(struct thread *td, struct proc *p)
 {
 
 	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * We implement thread_suspend_one in stages here to avoid
 	 * dropping the proc lock while the thread lock is owned.
 	 */
 	if (p == td->td_proc) {
 		thread_stopped(p);
 		p->p_suspcount++;
 	}
 	PROC_UNLOCK(p);
 	thread_lock(td);
 	td->td_flags &= ~TDF_NEEDSUSPCHK;
 	TD_SET_SUSPENDED(td);
 	sched_sleep(td, 0);
 	PROC_SUNLOCK(p);
 	/* Giant is dropped across the switch and picked up afterwards. */
 	DROP_GIANT();
 	mi_switch(SW_VOL | SWT_SUSPEND);
 	PICKUP_GIANT();
 	PROC_LOCK(p);
 	PROC_SLOCK(p);
 }
 
 /*
  * Mark td as suspended and account for it in its process's suspend
  * count.  The proc spin lock and the thread lock must be held, and td
  * must not already be suspended (all asserted).
  */
 void
 thread_suspend_one(struct thread *td)
 {
 	struct proc *owner = td->td_proc;
 
 	PROC_SLOCK_ASSERT(owner, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
 
 	owner->p_suspcount++;
 	td->td_flags &= ~TDF_NEEDSUSPCHK;
 	TD_SET_SUSPENDED(td);
 	sched_sleep(td, 0);
 }
 
 /*
  * Clear the suspended state of td and make it runnable.
  *
  * The thread lock must be held and td must be suspended (asserted).
  * The suspend accounting of p is adjusted only when td belongs to p;
  * the proc spin lock must then be held.  If 'boundary' is true, the
  * thread's TDF_BOUNDARY state and the matching p_boundary_count are
  * cleared as well.  Returns the result of setrunnable().
  */
 static int
 thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary)
 {
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
 
 	TD_CLR_SUSPENDED(td);
 	td->td_flags &= ~TDF_ALLPROCSUSP;
 	if (td->td_proc != p)
 		return (setrunnable(td, 0));
 
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	p->p_suspcount--;
 	if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) {
 		td->td_flags &= ~TDF_BOUNDARY;
 		p->p_boundary_count--;
 	}
 	return (setrunnable(td, 0));
 }
 
 /*
  * Resume a suspended thread, first taking it off its sleep queue if it
  * is on one.  The proc lock must be held and td must be suspended (both
  * asserted).  Kicks proc0 if setrunnable() asks for it.
  */
 void
 thread_run_flash(struct thread *td)
 {
 	struct proc *owner;
 
 	owner = td->td_proc;
 	PROC_LOCK_ASSERT(owner, MA_OWNED);
 
 	/* Either branch must leave the thread lock held (asserted next). */
 	if (!TD_ON_SLEEPQ(td))
 		thread_lock(td);
 	else
 		sleepq_remove_nested(td);
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
 
 	TD_CLR_SUSPENDED(td);
 	PROC_SLOCK(owner);
 	MPASS(owner->p_suspcount > 0);
 	owner->p_suspcount--;
 	PROC_SUNLOCK(owner);
 	if (setrunnable(td, 0)) {
 		kick_proc0();
 	}
 }
 
 /*
  * Allow all threads blocked by single threading to continue running.
  *
  * If the process no longer has to stop, every suspended thread is
  * resumed.  If only a single-threading request remains and every thread
  * is suspended, just the single-threading thread is resumed.  The proc
  * lock and proc spin lock must be held (asserted).
  */
 void
 thread_unsuspend(struct proc *p)
 {
 	struct thread *cur;
 	int kick;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	kick = 0;
 	if (!P_SHOULDSTOP(p)) {
 		FOREACH_THREAD_IN_PROC(p, cur) {
 			thread_lock(cur);
 			if (!TD_IS_SUSPENDED(cur)) {
 				thread_unlock(cur);
 				continue;
 			}
 			kick |= thread_unsuspend_one(cur, p, true);
 		}
 	} else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
 	    p->p_numthreads == p->p_suspcount &&
 	    p->p_singlethread->td_proc == p) {
 		/*
 		 * Stopping everything also did the job for the single
 		 * threading request. Now we've downgraded to single-threaded,
 		 * let it continue.
 		 */
 		thread_lock(p->p_singlethread);
 		kick = thread_unsuspend_one(p->p_singlethread, p, false);
 	}
 	if (kick)
 		kick_proc0();
 }
 
 /*
  * End the single threading mode..
  *
  * 'mode' must be the mode that was passed to thread_single(); its
  * consistency with the process flags is asserted.  The proc lock must be
  * held.  Except for SINGLE_ALLPROC, the caller must be the thread that
  * established single threading.  Clears the single-threading flags and
  * p_singlethread and resumes the suspended threads unless some other
  * stop request is still pending.
  */
 void
 thread_single_end(struct proc *p, int mode)
 {
 	struct thread *td;
 	int wakeup_swapper;
 
 	KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY ||
 	    mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT,
 	    ("invalid mode %d", mode));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) ||
 	    (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0),
 	    ("mode %d does not match P_TOTAL_STOP", mode));
 	KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread,
 	    ("thread_single_end from other thread %p %p",
 	    curthread, p->p_singlethread));
 	KASSERT(mode != SINGLE_BOUNDARY ||
 	    (p->p_flag & P_SINGLE_BOUNDARY) != 0,
 	    ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag));
 	p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY |
 	    P_TOTAL_STOP);
 	PROC_SLOCK(p);
 	p->p_singlethread = NULL;
 	wakeup_swapper = 0;
 	/*
 	 * If there are other threads they may now run,
 	 * unless of course there is a blanket 'stop order'
 	 * on the process. The single threader must be allowed
 	 * to continue however as this is a bad place to stop.
 	 */
 	if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) {
                 FOREACH_THREAD_IN_PROC(p, td) {
 			thread_lock(td);
 			if (TD_IS_SUSPENDED(td)) {
 				/* Boundary state is cleared only for SINGLE_BOUNDARY. */
 				wakeup_swapper |= thread_unsuspend_one(td, p,
 				    mode == SINGLE_BOUNDARY);
 			} else
 				thread_unlock(td);
 		}
 	}
 	KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0,
 	    ("inconsistent boundary count %d", p->p_boundary_count));
 	PROC_SUNLOCK(p);
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Locate a thread by number and return with proc lock held.
  *
  * thread exit establishes proc -> tidhash lock ordering, but lookup
  * takes tidhash first and needs to return locked proc.
  *
  * The problem is worked around by relying on type-safety of both
  * structures and doing the work in 2 steps:
  * - tidhash-locked lookup which saves both thread and proc pointers
  * - proc-locked verification that the found thread still matches
  *
  * tdfind_hash() performs the first step.  It scans the hash chain for tid
  * under the chain's read lock and, on a match whose process also matches
  * pid (-1 accepts any process), stores the process and thread pointers
  * in *pp and *tdp and returns true.  Otherwise it returns false and leaves
  * *pp and *tdp untouched.  No lock is held on return.
  *
  * When more than RUN_THRESH non-matching entries were walked before the
  * hit and the read lock can be upgraded, the thread is moved to the head
  * of its chain.
  */
 static bool
 tdfind_hash(lwpid_t tid, pid_t pid, struct proc **pp, struct thread **tdp)
 {
 #define RUN_THRESH	16
 	struct proc *p;
 	struct thread *td;
 	int skipped;
 
 	skipped = 0;
 	rw_rlock(TIDHASHLOCK(tid));
 
 	/* Walk the chain, counting the entries passed over. */
 	LIST_FOREACH(td, TIDHASH(tid), td_hash) {
 		if (td->td_tid == tid)
 			break;
 		skipped++;
 	}
 	if (td == NULL) {
 		rw_runlock(TIDHASHLOCK(tid));
 		return (false);
 	}
 
 	p = td->td_proc;
 	if (pid != -1 && p->p_pid != pid) {
 		rw_runlock(TIDHASHLOCK(tid));
 		return (false);
 	}
 
 	/*
 	 * A failed upgrade attempt leaves the read lock in place, so it
 	 * is dropped as a read lock in that case.
 	 */
 	if (skipped > RUN_THRESH && rw_try_upgrade(TIDHASHLOCK(tid))) {
 		LIST_REMOVE(td, td_hash);
 		LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
 		rw_wunlock(TIDHASHLOCK(tid));
 	} else
 		rw_runlock(TIDHASHLOCK(tid));
 
 	*pp = p;
 	*tdp = td;
 	return (true);
 }
 
 /*
  * Look up the thread with the given tid, optionally restricted to the
  * process pid (-1 matches any process).
  *
  * On success the thread is returned with its process locked.  NULL is
  * returned, with no lock held, when no such thread is found, when the
  * pid does not match, or when the owning process is in the PRS_NEW state.
  */
 struct thread *
 tdfind(lwpid_t tid, pid_t pid)
 {
 	struct proc *p;
 	struct thread *td;
 
 	/* Fast path: the caller is asking about the current thread. */
 	td = curthread;
 	if (td->td_tid == tid) {
 		if (pid == -1 || td->td_proc->p_pid == pid) {
 			PROC_LOCK(td->td_proc);
 			return (td);
 		}
 		return (NULL);
 	}
 
 	/*
 	 * tdfind_hash() returns its result without the proc lock held, so
 	 * the thread's tid and process are re-checked once the proc lock
 	 * is taken; on a mismatch the lookup is simply retried.
 	 */
 	while (tdfind_hash(tid, pid, &p, &td)) {
 		PROC_LOCK(p);
 		if (td->td_tid == tid && td->td_proc == p) {
 			if (p->p_state != PRS_NEW)
 				return (td);
 			PROC_UNLOCK(p);
 			return (NULL);
 		}
 		PROC_UNLOCK(p);
 	}
 	return (NULL);
 }
 
 void
 tidhash_add(struct thread *td)
 {
 	rw_wlock(TIDHASHLOCK(td->td_tid));
 	LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
 	rw_wunlock(TIDHASHLOCK(td->td_tid));
 }
 
 void
 tidhash_remove(struct thread *td)
 {
 
 	rw_wlock(TIDHASHLOCK(td->td_tid));
 	LIST_REMOVE(td, td_hash);
 	rw_wunlock(TIDHASHLOCK(td->td_tid));
 }