diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c
index f74091438648..1c6b1549422b 100644
--- a/sys/amd64/amd64/vm_machdep.c
+++ b/sys/amd64/amd64/vm_machdep.c
@@ -1,685 +1,675 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1982, 1986 The Regents of the University of California.
  * Copyright (c) 1989, 1990 William Jolitz
  * Copyright (c) 1994 John Dyson
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
  */
 
 #include <sys/cdefs.h>
 #include "opt_isa.h"
 #include "opt_cpu.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/procctl.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/wait.h>
 
 #include <machine/cpu.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/tss.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_param.h>
 
 _Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
     "OFFSETOF_MONITORBUF does not correspond with offset of pc_monitorbuf.");
 
 void
 set_top_of_stack_td(struct thread *td)
 {
 	td->td_md.md_stack_base = td->td_kstack +
 	    td->td_kstack_pages * PAGE_SIZE;
 }
 
 struct savefpu *
 get_pcb_user_save_td(struct thread *td)
 {
 	KASSERT(((vm_offset_t)td->td_md.md_usr_fpu_save %
 	    XSAVE_AREA_ALIGN) == 0,
 	    ("Unaligned pcb_user_save area ptr %p td %p",
 	    td->td_md.md_usr_fpu_save, td));
 	return (td->td_md.md_usr_fpu_save);
 }
 
 struct pcb *
 get_pcb_td(struct thread *td)
 {
 
 	return (&td->td_md.md_pcb);
 }
 
 struct savefpu *
 get_pcb_user_save_pcb(struct pcb *pcb)
 {
 	struct thread *td;
 
 	td = __containerof(pcb, struct thread, td_md.md_pcb);
 	return (get_pcb_user_save_td(td));
 }
 
 void *
 alloc_fpusave(int flags)
 {
 	void *res;
 	struct savefpu_ymm *sf;
 
 	res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags);
 	if (use_xsave) {
 		sf = (struct savefpu_ymm *)res;
 		bzero(&sf->sv_xstate.sx_hd, sizeof(sf->sv_xstate.sx_hd));
 		sf->sv_xstate.sx_hd.xstate_bv = xsave_mask;
 	}
 	return (res);
 }
 
 /*
  * Common code shared between cpu_fork() and cpu_copy_thread() for
  * initializing a thread.
  */
 static void
 copy_thread(struct thread *td1, struct thread *td2)
 {
 	struct pcb *pcb2;
 
 	pcb2 = td2->td_pcb;
 
 	/* Ensure that td1's pcb is up to date for user threads. */
 	if ((td2->td_pflags & TDP_KTHREAD) == 0) {
 		MPASS(td1 == curthread);
 		fpuexit(td1);
 		update_pcb_bases(td1->td_pcb);
 	}
 
 	/* Copy td1's pcb */
 	bcopy(td1->td_pcb, pcb2, sizeof(*pcb2));
 
 	/* Properly initialize pcb_save */
 	pcb2->pcb_save = get_pcb_user_save_pcb(pcb2);
 
 	/* Kernel threads start with clean FPU and segment bases. */
 	if ((td2->td_pflags & TDP_KTHREAD) != 0) {
 		pcb2->pcb_fsbase = 0;
 		pcb2->pcb_gsbase = 0;
 		clear_pcb_flags(pcb2, PCB_FPUINITDONE | PCB_USERFPUINITDONE |
 		    PCB_KERNFPU | PCB_KERNFPU_THR);
 	} else {
 		MPASS((pcb2->pcb_flags & (PCB_KERNFPU | PCB_KERNFPU_THR)) == 0);
 		bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2),
 		    cpu_max_ext_state_size);
 	}
 
 	td2->td_frame = (struct trapframe *)td2->td_md.md_stack_base - 1;
 
 	/*
 	 * Set registers for trampoline to user mode.  Leave space for the
 	 * return address on stack.  These are the kernel mode register values.
 	 */
 	pcb2->pcb_r12 = (register_t)fork_return;	/* fork_trampoline argument */
 	pcb2->pcb_rbp = 0;
 	pcb2->pcb_rsp = (register_t)td2->td_frame - sizeof(void *);
 	pcb2->pcb_rbx = (register_t)td2;		/* fork_trampoline argument */
 	pcb2->pcb_rip = (register_t)fork_trampoline;
 	/*-
 	 * pcb2->pcb_dr*:	cloned above.
 	 * pcb2->pcb_savefpu:	cloned above.
 	 * pcb2->pcb_flags:	cloned above.
 	 * pcb2->pcb_onfault:	cloned above (always NULL here?).
 	 * pcb2->pcb_[fg]sbase:	cloned above
 	 */
 
 	pcb2->pcb_tssp = NULL;
 
 	/* Setup to release spin count in fork_exit(). */
 	td2->td_md.md_spinlock_count = 1;
 	td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
 	pmap_thread_init_invl_gen(td2);
 
 	/*
 	 * Copy the trap frame for the return to user mode as if from a syscall.
 	 * This copies most of the user mode register values.  Some of these
 	 * registers are rewritten by cpu_set_upcall() and linux_set_upcall().
 	 */
 	if ((td1->td_proc->p_flag & P_KPROC) == 0) {
 		bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe));
 
 		/*
 		 * If the current thread has the trap bit set (i.e. a debugger
 		 * had single stepped the process to the system call), we need
 		 * to clear the trap flag from the new frame. Otherwise, the new
 		 * thread will receive a (likely unexpected) SIGTRAP when it
 		 * executes the first instruction after returning to userland.
 		 */
 		td2->td_frame->tf_rflags &= ~PSL_T;
 	}
 }
 
 /*
  * Finish a fork operation, with process p2 nearly set up.
  * Copy and update the pcb, set up the stack so that the child
  * ready to run and return to user mode.
  */
 void
 cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
 {
 	struct proc *p1;
 	struct pcb *pcb2;
 	struct mdproc *mdp1, *mdp2;
 	struct proc_ldt *pldt;
 
 	p1 = td1->td_proc;
 	if ((flags & RFPROC) == 0) {
 		if ((flags & RFMEM) == 0) {
 			/* unshare user LDT */
 			mdp1 = &p1->p_md;
 			mtx_lock(&dt_lock);
 			if ((pldt = mdp1->md_ldt) != NULL &&
 			    pldt->ldt_refcnt > 1 &&
 			    user_ldt_alloc(p1, 1) == NULL)
 				panic("could not copy LDT");
 			mtx_unlock(&dt_lock);
 		}
 		return;
 	}
 
 	/* Point the stack and pcb to the actual location */
 	set_top_of_stack_td(td2);
 	td2->td_pcb = pcb2 = get_pcb_td(td2);
 
 	copy_thread(td1, td2);
 
 	/* Reset debug registers in the new process */
 	x86_clear_dbregs(pcb2);
 
 	/* Point mdproc and then copy over p1's contents */
 	mdp2 = &p2->p_md;
 	bcopy(&p1->p_md, mdp2, sizeof(*mdp2));
 
 	/* Set child return values. */
 	p2->p_sysent->sv_set_fork_retval(td2);
 
 	/* As on i386, do not copy io permission bitmap. */
 	pcb2->pcb_tssp = NULL;
 
 	/* New segment registers. */
 	set_pcb_flags_raw(pcb2, PCB_FULL_IRET);
 
 	/* Copy the LDT, if necessary. */
 	mdp1 = &td1->td_proc->p_md;
 	mdp2 = &p2->p_md;
 	if (mdp1->md_ldt == NULL) {
 		mdp2->md_ldt = NULL;
 		return;
 	}
 	mtx_lock(&dt_lock);
 	if (mdp1->md_ldt != NULL) {
 		if (flags & RFMEM) {
 			mdp1->md_ldt->ldt_refcnt++;
 			mdp2->md_ldt = mdp1->md_ldt;
 			bcopy(&mdp1->md_ldt_sd, &mdp2->md_ldt_sd, sizeof(struct
 			    system_segment_descriptor));
 		} else {
 			mdp2->md_ldt = NULL;
 			mdp2->md_ldt = user_ldt_alloc(p2, 0);
 			if (mdp2->md_ldt == NULL)
 				panic("could not copy LDT");
 			amd64_set_ldt_data(td2, 0, max_ldt_segment,
 			    (struct user_segment_descriptor *)
 			    mdp1->md_ldt->ldt_base);
 		}
 	} else
 		mdp2->md_ldt = NULL;
 	mtx_unlock(&dt_lock);
 
 	/*
 	 * Now, cpu_switch() can schedule the new process.
 	 * pcb_rsp is loaded pointing to the cpu_switch() stack frame
 	 * containing the return address when exiting cpu_switch.
 	 * This will normally be to fork_trampoline(), which will have
 	 * %rbx loaded with the new proc's pointer.  fork_trampoline()
 	 * will set up a stack to call fork_return(p, frame); to complete
 	 * the return to user-mode.
 	 */
 }
 
 void
 x86_set_fork_retval(struct thread *td)
 {
 	struct trapframe *frame = td->td_frame;
 
 	frame->tf_rax = 0;		/* Child returns zero */
 	frame->tf_rflags &= ~PSL_C;	/* success */
 	frame->tf_rdx = 1;		/* System V emulation */
 }
 
 /*
  * Intercept the return address from a freshly forked process that has NOT
  * been scheduled yet.
  *
  * This is needed to make kernel threads stay in kernel mode.
  */
 void
 cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg)
 {
 	/*
 	 * Note that the trap frame follows the args, so the function
 	 * is really called like this:  func(arg, frame);
 	 */
 	td->td_pcb->pcb_r12 = (long) func;	/* function */
 	td->td_pcb->pcb_rbx = (long) arg;	/* first arg */
 }
 
 void
 cpu_exit(struct thread *td)
 {
 
 	/*
 	 * If this process has a custom LDT, release it.
 	 */
 	if (td->td_proc->p_md.md_ldt != NULL)
 		user_ldt_free(td);
 }
 
 void
 cpu_thread_exit(struct thread *td)
 {
 	struct pcb *pcb;
 
 	critical_enter();
 	if (td == PCPU_GET(fpcurthread))
 		fpudrop();
 	critical_exit();
 
 	pcb = td->td_pcb;
 
 	/* Disable any hardware breakpoints. */
 	if (pcb->pcb_flags & PCB_DBREGS) {
 		reset_dbregs();
 		clear_pcb_flags(pcb, PCB_DBREGS);
 	}
 }
 
 void
 cpu_thread_clean(struct thread *td)
 {
 	struct pcb *pcb;
 
 	pcb = td->td_pcb;
 
 	/*
 	 * Clean TSS/iomap
 	 */
 	if (pcb->pcb_tssp != NULL) {
 		pmap_pti_remove_kva((vm_offset_t)pcb->pcb_tssp,
 		    (vm_offset_t)pcb->pcb_tssp + ctob(IOPAGES + 1));
 		kmem_free(pcb->pcb_tssp, ctob(IOPAGES + 1));
 		pcb->pcb_tssp = NULL;
 	}
 }
 
 void
 cpu_thread_alloc(struct thread *td)
 {
 	struct pcb *pcb;
 	struct xstate_hdr *xhdr;
 
 	set_top_of_stack_td(td);
 	td->td_pcb = pcb = get_pcb_td(td);
 	td->td_frame = (struct trapframe *)td->td_md.md_stack_base - 1;
 	td->td_md.md_usr_fpu_save = fpu_save_area_alloc();
 	pcb->pcb_save = get_pcb_user_save_pcb(pcb);
 	if (use_xsave) {
 		xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1);
 		bzero(xhdr, sizeof(*xhdr));
 		xhdr->xstate_bv = xsave_mask;
 	}
 }
 
 void
 cpu_thread_free(struct thread *td)
 {
 	cpu_thread_clean(td);
 
 	fpu_save_area_free(td->td_md.md_usr_fpu_save);
 	td->td_md.md_usr_fpu_save = NULL;
 }
 
 bool
 cpu_exec_vmspace_reuse(struct proc *p, vm_map_t map)
 {
 
 	return (((curproc->p_md.md_flags & P_MD_KPTI) != 0) ==
 	    (vm_map_pmap(map)->pm_ucr3 != PMAP_NO_CR3));
 }
 
 static void
 cpu_procctl_kpti_ctl(struct proc *p, int val)
 {
 
 	if (pti && val == PROC_KPTI_CTL_ENABLE_ON_EXEC)
 		p->p_md.md_flags |= P_MD_KPTI;
 	if (val == PROC_KPTI_CTL_DISABLE_ON_EXEC)
 		p->p_md.md_flags &= ~P_MD_KPTI;
 }
 
 static void
 cpu_procctl_kpti_status(struct proc *p, int *val)
 {
 	*val = (p->p_md.md_flags & P_MD_KPTI) != 0 ?
 	    PROC_KPTI_CTL_ENABLE_ON_EXEC:
 	    PROC_KPTI_CTL_DISABLE_ON_EXEC;
 	if (vmspace_pmap(p->p_vmspace)->pm_ucr3 != PMAP_NO_CR3)
 		*val |= PROC_KPTI_STATUS_ACTIVE;
 }
 
 static int
 cpu_procctl_la_ctl(struct proc *p, int val)
 {
 	int error;
 
 	error = 0;
 	switch (val) {
 	case PROC_LA_CTL_LA48_ON_EXEC:
 		p->p_md.md_flags |= P_MD_LA48;
 		p->p_md.md_flags &= ~P_MD_LA57;
 		break;
 	case PROC_LA_CTL_LA57_ON_EXEC:
 		if (la57) {
 			p->p_md.md_flags &= ~P_MD_LA48;
 			p->p_md.md_flags |= P_MD_LA57;
 		} else {
 			error = ENOTSUP;
 		}
 		break;
 	case PROC_LA_CTL_DEFAULT_ON_EXEC:
 		p->p_md.md_flags &= ~(P_MD_LA48 | P_MD_LA57);
 		break;
 	}
 	return (error);
 }
 
 static void
 cpu_procctl_la_status(struct proc *p, int *val)
 {
 	int res;
 
 	if ((p->p_md.md_flags & P_MD_LA48) != 0)
 		res = PROC_LA_CTL_LA48_ON_EXEC;
 	else if ((p->p_md.md_flags & P_MD_LA57) != 0)
 		res = PROC_LA_CTL_LA57_ON_EXEC;
 	else
 		res = PROC_LA_CTL_DEFAULT_ON_EXEC;
 	if (p->p_sysent->sv_maxuser == VM_MAXUSER_ADDRESS_LA48)
 		res |= PROC_LA_STATUS_LA48;
 	else
 		res |= PROC_LA_STATUS_LA57;
 	*val = res;
 }
 
 int
 cpu_procctl(struct thread *td, int idtype, id_t id, int com, void *data)
 {
 	struct proc *p;
 	int error, val;
 
 	switch (com) {
 	case PROC_KPTI_CTL:
 	case PROC_KPTI_STATUS:
 	case PROC_LA_CTL:
 	case PROC_LA_STATUS:
 		if (idtype != P_PID) {
 			error = EINVAL;
 			break;
 		}
 		if (com == PROC_KPTI_CTL) {
 			/* sad but true and not a joke */
 			error = priv_check(td, PRIV_IO);
 			if (error != 0)
 				break;
 		}
 		if (com == PROC_KPTI_CTL || com == PROC_LA_CTL) {
 			error = copyin(data, &val, sizeof(val));
 			if (error != 0)
 				break;
 		}
 		if (com == PROC_KPTI_CTL &&
 		    val != PROC_KPTI_CTL_ENABLE_ON_EXEC &&
 		    val != PROC_KPTI_CTL_DISABLE_ON_EXEC) {
 			error = EINVAL;
 			break;
 		}
 		if (com == PROC_LA_CTL &&
 		    val != PROC_LA_CTL_LA48_ON_EXEC &&
 		    val != PROC_LA_CTL_LA57_ON_EXEC &&
 		    val != PROC_LA_CTL_DEFAULT_ON_EXEC) {
 			error = EINVAL;
 			break;
 		}
 		error = pget(id, PGET_CANSEE | PGET_NOTWEXIT | PGET_NOTID, &p);
 		if (error != 0)
 			break;
 		switch (com) {
 		case PROC_KPTI_CTL:
 			cpu_procctl_kpti_ctl(p, val);
 			break;
 		case PROC_KPTI_STATUS:
 			cpu_procctl_kpti_status(p, &val);
 			break;
 		case PROC_LA_CTL:
 			error = cpu_procctl_la_ctl(p, val);
 			break;
 		case PROC_LA_STATUS:
 			cpu_procctl_la_status(p, &val);
 			break;
 		}
 		PROC_UNLOCK(p);
 		if (com == PROC_KPTI_STATUS || com == PROC_LA_STATUS)
 			error = copyout(&val, data, sizeof(val));
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 void
 cpu_set_syscall_retval(struct thread *td, int error)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	if (__predict_true(error == 0)) {
 		frame->tf_rax = td->td_retval[0];
 		frame->tf_rdx = td->td_retval[1];
 		frame->tf_rflags &= ~PSL_C;
 		return;
 	}
 
 	switch (error) {
 	case ERESTART:
 		/*
 		 * Reconstruct pc, we know that 'syscall' is 2 bytes,
 		 * lcall $X,y is 7 bytes, int 0x80 is 2 bytes.
 		 * We saved this in tf_err.
 		 * %r10 (which was holding the value of %rcx) is restored
 		 * for the next iteration.
 		 * %r10 restore is only required for freebsd/amd64 processes,
 		 * but shall be innocent for any ia32 ABI.
 		 *
 		 * Require full context restore to get the arguments
 		 * in the registers reloaded at return to usermode.
 		 */
 		frame->tf_rip -= frame->tf_err;
 		frame->tf_r10 = frame->tf_rcx;
 		set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 		break;
 
 	case EJUSTRETURN:
 		break;
 
 	default:
 		frame->tf_rax = error;
 		frame->tf_rflags |= PSL_C;
 		break;
 	}
 }
 
 /*
  * Initialize machine state, mostly pcb and trap frame for a new
  * thread, about to return to userspace.  Put enough state in the new
  * thread's PCB to get it to go back to the fork_return(), which
  * finalizes the thread state and handles peculiarities of the first
  * return to userspace for the new thread.
  */
 void
 cpu_copy_thread(struct thread *td, struct thread *td0)
 {
 	copy_thread(td0, td);
 
 	set_pcb_flags_raw(td->td_pcb, PCB_FULL_IRET);
 }
 
 /*
  * Set that machine state for performing an upcall that starts
  * the entry function with the given argument.
  */
 int
 cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg,
     stack_t *stack)
 {
-
-	/*
-	 * Do any extra cleaning that needs to be done.
-	 * The thread may have optional components
-	 * that are not present in a fresh thread.
-	 * This may be a recycled thread so make it look
-	 * as though it's newly allocated.
-	 */
-	cpu_thread_clean(td);
-
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		/*
 		 * Set the trap frame to point at the beginning of the entry
 		 * function.
 		 */
 		td->td_frame->tf_rbp = 0;
 		td->td_frame->tf_rsp =
 		   (((uintptr_t)stack->ss_sp + stack->ss_size - 4) & ~0x0f) - 4;
 		td->td_frame->tf_rip = (uintptr_t)entry;
 
 		/* Return address sentinel value to stop stack unwinding. */
 		if (suword32((void *)td->td_frame->tf_rsp, 0) != 0)
 			return (EFAULT);
 
 		/* Pass the argument to the entry point. */
 		if (suword32(
 		    (void *)(td->td_frame->tf_rsp + sizeof(int32_t)),
 		    (uint32_t)(uintptr_t)arg) != 0)
 			return (EFAULT);
 		return (0);
 	}
 #endif
 
 	/*
 	 * Set the trap frame to point at the beginning of the uts
 	 * function.
 	 */
 	td->td_frame->tf_rbp = 0;
 	td->td_frame->tf_rsp =
 	    ((register_t)stack->ss_sp + stack->ss_size) & ~0x0f;
 	td->td_frame->tf_rsp -= 8;
 	td->td_frame->tf_rip = (register_t)entry;
 	td->td_frame->tf_ds = _udatasel;
 	td->td_frame->tf_es = _udatasel;
 	td->td_frame->tf_fs = _ufssel;
 	td->td_frame->tf_gs = _ugssel;
 	td->td_frame->tf_flags = TF_HASSEGS;
 
 	/* Return address sentinel value to stop stack unwinding. */
 	if (suword((void *)td->td_frame->tf_rsp, 0) != 0)
 		return (EFAULT);
 
 	/* Pass the argument to the entry point. */
 	td->td_frame->tf_rdi = (register_t)arg;
 
 	return (0);
 }
 
 int
 cpu_set_user_tls(struct thread *td, void *tls_base)
 {
 	struct pcb *pcb;
 
 	if ((u_int64_t)tls_base >= VM_MAXUSER_ADDRESS)
 		return (EINVAL);
 
 	pcb = td->td_pcb;
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		pcb->pcb_gsbase = (register_t)tls_base;
 		return (0);
 	}
 #endif
 	pcb->pcb_fsbase = (register_t)tls_base;
 	return (0);
 }
diff --git a/sys/compat/linux/linux_fork.c b/sys/compat/linux/linux_fork.c
index 4ce3bc192b4e..1c9189162a09 100644
--- a/sys/compat/linux/linux_fork.c
+++ b/sys/compat/linux/linux_fork.c
@@ -1,551 +1,549 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2004 Tim J. Robbins
  * Copyright (c) 2002 Doug Rabson
  * Copyright (c) 2000 Marcel Moolenaar
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/param.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/racct.h>
 #include <sys/sched.h>
 #include <sys/syscallsubr.h>
 #include <sys/sx.h>
 #include <sys/umtxvar.h>
 #include <sys/unistd.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux.h>
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_fork.h>
 #include <compat/linux/linux_futex.h>
 #include <compat/linux/linux_mib.h>
 #include <compat/linux/linux_misc.h>
 #include <compat/linux/linux_util.h>
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_fork(struct thread *td, struct linux_fork_args *args)
 {
 	struct fork_req fr;
 	int error;
 	struct proc *p2;
 	struct thread *td2;
 
 	bzero(&fr, sizeof(fr));
 	fr.fr_flags = RFFDG | RFPROC | RFSTOPPED;
 	fr.fr_procp = &p2;
 	if ((error = fork1(td, &fr)) != 0)
 		return (error);
 
 	td2 = FIRST_THREAD_IN_PROC(p2);
 
 	linux_proc_init(td, td2, false);
 
 	td->td_retval[0] = p2->p_pid;
 
 	/*
 	 * Make this runnable after we are finished with it.
 	 */
 	thread_lock(td2);
 	TD_SET_CAN_RUN(td2);
 	sched_add(td2, SRQ_BORING);
 
 	return (0);
 }
 
 int
 linux_vfork(struct thread *td, struct linux_vfork_args *args)
 {
 	struct fork_req fr;
 	int error;
 	struct proc *p2;
 	struct thread *td2;
 
 	bzero(&fr, sizeof(fr));
 	fr.fr_flags = RFFDG | RFPROC | RFMEM | RFPPWAIT | RFSTOPPED;
 	fr.fr_procp = &p2;
 	if ((error = fork1(td, &fr)) != 0)
 		return (error);
 
 	td2 = FIRST_THREAD_IN_PROC(p2);
 
 	linux_proc_init(td, td2, false);
 
 	td->td_retval[0] = p2->p_pid;
 
 	/*
 	 * Make this runnable after we are finished with it.
 	 */
 	thread_lock(td2);
 	TD_SET_CAN_RUN(td2);
 	sched_add(td2, SRQ_BORING);
 
 	return (0);
 }
 #endif
 
 static int
 linux_clone_proc(struct thread *td, struct l_clone_args *args)
 {
 	struct fork_req fr;
 	int error, ff, f2;
 	struct proc *p2;
 	struct thread *td2;
 	int exit_signal;
 	struct linux_emuldata *em;
 
 	f2 = 0;
 	ff = RFPROC | RFSTOPPED;
 	if (LINUX_SIG_VALID(args->exit_signal)) {
 		exit_signal = linux_to_bsd_signal(args->exit_signal);
 	} else if (args->exit_signal != 0)
 		return (EINVAL);
 	else
 		exit_signal = 0;
 
 	if (args->flags & LINUX_CLONE_VM)
 		ff |= RFMEM;
 	if (args->flags & LINUX_CLONE_SIGHAND)
 		ff |= RFSIGSHARE;
 	if ((args->flags & LINUX_CLONE_CLEAR_SIGHAND) != 0)
 		f2 |= FR2_DROPSIG_CAUGHT;
 	if (args->flags & LINUX_CLONE_FILES) {
 		if (!(args->flags & LINUX_CLONE_FS))
 			f2 |= FR2_SHARE_PATHS;
 	} else {
 		ff |= RFFDG;
 		if (args->flags & LINUX_CLONE_FS)
 			f2 |= FR2_SHARE_PATHS;
 	}
 
 	if (args->flags & LINUX_CLONE_PARENT_SETTID)
 		if (args->parent_tid == NULL)
 			return (EINVAL);
 
 	if (args->flags & LINUX_CLONE_VFORK)
 		ff |= RFPPWAIT;
 
 	bzero(&fr, sizeof(fr));
 	fr.fr_flags = ff;
 	fr.fr_flags2 = f2;
 	fr.fr_procp = &p2;
 	error = fork1(td, &fr);
 	if (error)
 		return (error);
 
 	td2 = FIRST_THREAD_IN_PROC(p2);
 
 	/* create the emuldata */
 	linux_proc_init(td, td2, false);
 
 	em = em_find(td2);
 	KASSERT(em != NULL, ("clone_proc: emuldata not found.\n"));
 
 	if (args->flags & LINUX_CLONE_CHILD_SETTID)
 		em->child_set_tid = args->child_tid;
 	else
 		em->child_set_tid = NULL;
 
 	if (args->flags & LINUX_CLONE_CHILD_CLEARTID)
 		em->child_clear_tid = args->child_tid;
 	else
 		em->child_clear_tid = NULL;
 
 	if (args->flags & LINUX_CLONE_PARENT_SETTID) {
 		error = copyout(&p2->p_pid, args->parent_tid,
 		    sizeof(p2->p_pid));
 		if (error)
 			linux_msg(td, "copyout p_pid failed!");
 	}
 
 	PROC_LOCK(p2);
 	p2->p_sigparent = exit_signal;
 	PROC_UNLOCK(p2);
 	/*
 	 * In a case of stack = NULL, we are supposed to COW calling process
 	 * stack. This is what normal fork() does, so we just keep tf_rsp arg
 	 * intact.
 	 */
 	linux_set_upcall(td2, args->stack);
 
 	if (args->flags & LINUX_CLONE_SETTLS)
 		linux_set_cloned_tls(td2, PTRIN(args->tls));
 
 	/*
 	 * If CLONE_PARENT is set, then the parent of the new process will be
 	 * the same as that of the calling process.
 	 */
 	if (args->flags & LINUX_CLONE_PARENT) {
 		sx_xlock(&proctree_lock);
 		PROC_LOCK(p2);
 		proc_reparent(p2, td->td_proc->p_pptr, true);
 		PROC_UNLOCK(p2);
 		sx_xunlock(&proctree_lock);
 	}
 
 	/*
 	 * Make this runnable after we are finished with it.
 	 */
 	thread_lock(td2);
 	TD_SET_CAN_RUN(td2);
 	sched_add(td2, SRQ_BORING);
 
 	td->td_retval[0] = p2->p_pid;
 
 	return (0);
 }
 
 static int
 linux_clone_thread(struct thread *td, struct l_clone_args *args)
 {
 	struct linux_emuldata *em;
 	struct thread *newtd;
 	struct proc *p;
 	int error;
 
 	LINUX_CTR4(clone_thread, "thread(%d) flags %x ptid %p ctid %p",
 	    td->td_tid, (unsigned)args->flags,
 	    args->parent_tid, args->child_tid);
 
 	if ((args->flags & LINUX_CLONE_PARENT) != 0)
 		return (EINVAL);
 	if (args->flags & LINUX_CLONE_PARENT_SETTID)
 		if (args->parent_tid == NULL)
 			return (EINVAL);
 
 	/* Threads should be created with own stack */
 	if (PTRIN(args->stack) == NULL)
 		return (EINVAL);
 
 	p = td->td_proc;
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		error = racct_add(p, RACCT_NTHR, 1);
 		PROC_UNLOCK(p);
 		if (error != 0)
 			return (EPROCLIM);
 	}
 #endif
 
 	/* Initialize our td */
 	error = kern_thr_alloc(p, 0, &newtd);
 	if (error)
 		goto fail;
 
 	bzero(&newtd->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
 	bcopy(&td->td_startcopy, &newtd->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
 
 	newtd->td_proc = p;
 	thread_cow_get(newtd, td);
 
 	cpu_copy_thread(newtd, td);
 
 	/* create the emuldata */
 	linux_proc_init(td, newtd, true);
 
 	em = em_find(newtd);
 	KASSERT(em != NULL, ("clone_thread: emuldata not found.\n"));
 
 	if (args->flags & LINUX_CLONE_SETTLS)
 		linux_set_cloned_tls(newtd, PTRIN(args->tls));
 
 	if (args->flags & LINUX_CLONE_CHILD_SETTID)
 		em->child_set_tid = args->child_tid;
 	else
 		em->child_set_tid = NULL;
 
 	if (args->flags & LINUX_CLONE_CHILD_CLEARTID)
 		em->child_clear_tid = args->child_tid;
 	else
 		em->child_clear_tid = NULL;
 
-	cpu_thread_clean(newtd);
-
 	linux_set_upcall(newtd, args->stack);
 
 	PROC_LOCK(p);
 	p->p_flag |= P_HADTHREADS;
 	thread_link(newtd, p);
 	bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name));
 
 	thread_lock(td);
 	/* let the scheduler know about these things. */
 	sched_fork_thread(td, newtd);
 	thread_unlock(td);
 	if (P_SHOULDSTOP(p))
 		ast_sched(newtd, TDA_SUSPEND);
 
 	if (p->p_ptevents & PTRACE_LWP)
 		newtd->td_dbgflags |= TDB_BORN;
 	PROC_UNLOCK(p);
 
 	tidhash_add(newtd);
 
 	LINUX_CTR2(clone_thread, "thread(%d) successful clone to %d",
 	    td->td_tid, newtd->td_tid);
 
 	if (args->flags & LINUX_CLONE_PARENT_SETTID) {
 		error = copyout(&newtd->td_tid, args->parent_tid,
 		    sizeof(newtd->td_tid));
 		if (error)
 			linux_msg(td, "clone_thread: copyout td_tid failed!");
 	}
 
 	/*
 	 * Make this runnable after we are finished with it.
 	 */
 	thread_lock(newtd);
 	TD_SET_CAN_RUN(newtd);
 	sched_add(newtd, SRQ_BORING);
 
 	td->td_retval[0] = newtd->td_tid;
 
 	return (0);
 
 fail:
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		racct_sub(p, RACCT_NTHR, 1);
 		PROC_UNLOCK(p);
 	}
 #endif
 	return (error);
 }
 
 int
 linux_clone(struct thread *td, struct linux_clone_args *args)
 {
 	struct l_clone_args ca = {
 		.flags = (lower_32_bits(args->flags) & ~LINUX_CSIGNAL),
 		.child_tid = args->child_tidptr,
 		.parent_tid = args->parent_tidptr,
 		.exit_signal = (lower_32_bits(args->flags) & LINUX_CSIGNAL),
 		.stack = args->stack,
 		.tls = args->tls,
 	};
 
 	if (args->flags & LINUX_CLONE_THREAD)
 		return (linux_clone_thread(td, &ca));
 	else
 		return (linux_clone_proc(td, &ca));
 }
 
 
 static int
 linux_clone3_args_valid(struct l_user_clone_args *uca)
 {
 
 	/* Verify that no unknown flags are passed along. */
 	if ((uca->flags & ~(LINUX_CLONE_LEGACY_FLAGS |
 	    LINUX_CLONE_CLEAR_SIGHAND | LINUX_CLONE_INTO_CGROUP)) != 0)
 		return (EINVAL);
 	if ((uca->flags & (LINUX_CLONE_DETACHED | LINUX_CSIGNAL)) != 0)
 		return (EINVAL);
 
 	if ((uca->flags & (LINUX_CLONE_SIGHAND | LINUX_CLONE_CLEAR_SIGHAND)) ==
 	    (LINUX_CLONE_SIGHAND | LINUX_CLONE_CLEAR_SIGHAND))
 		return (EINVAL);
 	if ((uca->flags & (LINUX_CLONE_THREAD | LINUX_CLONE_PARENT)) != 0 &&
 	    uca->exit_signal != 0)
 		return (EINVAL);
 
 	/* We don't support set_tid, only validate input. */
 	if (uca->set_tid_size > LINUX_MAX_PID_NS_LEVEL)
 		return (EINVAL);
 	if (uca->set_tid == 0 && uca->set_tid_size > 0)
 		return (EINVAL);
 	if (uca->set_tid != 0 && uca->set_tid_size == 0)
 		return (EINVAL);
 
 	if (uca->stack == 0 && uca->stack_size > 0)
 		return (EINVAL);
 	if (uca->stack != 0 && uca->stack_size == 0)
 		return (EINVAL);
 
 	/* Verify that higher 32bits of exit_signal are unset. */
 	if ((uca->exit_signal & ~(uint64_t)LINUX_CSIGNAL) != 0)
 		return (EINVAL);
 
 	/* Verify that no unsupported flags are passed along. */
 	if ((uca->flags & LINUX_CLONE_NEWTIME) != 0) {
 		LINUX_RATELIMIT_MSG("unsupported clone3 option CLONE_NEWTIME");
 		return (ENOSYS);
 	}
 	if ((uca->flags & LINUX_CLONE_INTO_CGROUP) != 0) {
 		LINUX_RATELIMIT_MSG("unsupported clone3 option CLONE_INTO_CGROUP");
 		return (ENOSYS);
 	}
 	if (uca->set_tid != 0 || uca->set_tid_size != 0) {
 		LINUX_RATELIMIT_MSG("unsupported clone3 set_tid");
 		return (ENOSYS);
 	}
 
 	return (0);
 }
 
 int
 linux_clone3(struct thread *td, struct linux_clone3_args *args)
 {
 	struct l_user_clone_args *uca;
 	struct l_clone_args *ca;
 	size_t size;
 	int error;
 
 	if (args->usize > PAGE_SIZE)
 		return (E2BIG);
 	if (args->usize < LINUX_CLONE_ARGS_SIZE_VER0)
 		return (EINVAL);
 
 	/*
 	 * usize can be less than size of struct clone_args, to avoid using
 	 * of uninitialized data of struct clone_args, allocate at least
 	 * sizeof(struct clone_args) storage and zero it.
 	 */
 	size = max(args->usize, sizeof(*uca));
 	uca = malloc(size, M_LINUX, M_WAITOK | M_ZERO);
 	error = copyin(args->uargs, uca, args->usize);
 	if (error != 0)
 		goto out;
 	error = linux_clone3_args_valid(uca);
 	if (error != 0)
 		goto out;
 	ca = malloc(sizeof(*ca), M_LINUX, M_WAITOK | M_ZERO);
 	ca->flags = uca->flags;
 	ca->child_tid = PTRIN(uca->child_tid);
 	ca->parent_tid = PTRIN(uca->parent_tid);
 	ca->exit_signal = uca->exit_signal;
 	ca->stack = uca->stack + uca->stack_size;
 	ca->stack_size = uca->stack_size;
 	ca->tls = uca->tls;
 
 	if ((ca->flags & LINUX_CLONE_THREAD) != 0)
 		error = linux_clone_thread(td, ca);
 	else
 		error = linux_clone_proc(td, ca);
 	free(ca, M_LINUX);
 out:
 	free(uca, M_LINUX);
 	return (error);
 }
 
 int
 linux_exit(struct thread *td, struct linux_exit_args *args)
 {
 	struct linux_emuldata *em __diagused;
 
 	em = em_find(td);
 	KASSERT(em != NULL, ("exit: emuldata not found.\n"));
 
 	LINUX_CTR2(exit, "thread(%d) (%d)", em->em_tid, args->rval);
 
 	linux_thread_detach(td);
 
 	/*
 	 * XXX. When the last two threads of a process
 	 * exit via pthread_exit() try thr_exit() first.
 	 */
 	kern_thr_exit(td);
 	exit1(td, args->rval, 0);
 		/* NOTREACHED */
 }
 
 int
 linux_set_tid_address(struct thread *td, struct linux_set_tid_address_args *args)
 {
 	struct linux_emuldata *em;
 
 	em = em_find(td);
 	KASSERT(em != NULL, ("set_tid_address: emuldata not found.\n"));
 
 	em->child_clear_tid = args->tidptr;
 
 	td->td_retval[0] = em->em_tid;
 
 	LINUX_CTR3(set_tid_address, "tidptr(%d) %p, returns %d",
 	    em->em_tid, args->tidptr, td->td_retval[0]);
 
 	return (0);
 }
 
 void
 linux_thread_detach(struct thread *td)
 {
 	struct linux_emuldata *em;
 	int *child_clear_tid;
 	int error;
 
 	em = em_find(td);
 	KASSERT(em != NULL, ("thread_detach: emuldata not found.\n"));
 
 	LINUX_CTR1(thread_detach, "thread(%d)", em->em_tid);
 
 	release_futexes(td, em);
 
 	child_clear_tid = em->child_clear_tid;
 
 	if (child_clear_tid != NULL) {
 		LINUX_CTR2(thread_detach, "thread(%d) %p",
 		    em->em_tid, child_clear_tid);
 
 		error = suword32(child_clear_tid, 0);
 		if (error != 0)
 			return;
 
 		error = futex_wake(td, child_clear_tid, 1, false);
 		/*
 		 * this cannot happen at the moment and if this happens it
 		 * probably means there is a user space bug
 		 */
 		if (error != 0)
 			linux_msg(td, "futex stuff in thread_detach failed.");
 	}
 
 	/*
 	 * Do not rely on the robust list which is maintained by userspace,
 	 * cleanup remaining pi (if any) after release_futexes anyway.
 	 */
 	umtx_thread_exit(td);
 }
diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c
index 0e0cebe98e5f..b0dd7534633b 100644
--- a/sys/i386/i386/vm_machdep.c
+++ b/sys/i386/i386/vm_machdep.c
@@ -1,641 +1,631 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1982, 1986 The Regents of the University of California.
  * Copyright (c) 1989, 1990 William Jolitz
  * Copyright (c) 1994 John Dyson
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
  */
 
 #include <sys/cdefs.h>
 #include "opt_isa.h"
 #include "opt_npx.h"
 #include "opt_reset.h"
 #include "opt_cpu.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sysent.h>
 #include <sys/sf_buf.h>
 #include <sys/smp.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/pcb_ext.h>
 #include <machine/smp.h>
 #include <machine/vm86.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_param.h>
 
 _Static_assert(__OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
     "__OFFSETOF_MONITORBUF does not correspond with offset of pc_monitorbuf.");
 
 union savefpu *
 get_pcb_user_save_td(struct thread *td)
 {
 	vm_offset_t p;
 
 	p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
 	    roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN);
 	KASSERT((p % XSAVE_AREA_ALIGN) == 0, ("Unaligned pcb_user_save area"));
 	return ((union savefpu *)p);
 }
 
 union savefpu *
 get_pcb_user_save_pcb(struct pcb *pcb)
 {
 	vm_offset_t p;
 
 	p = (vm_offset_t)(pcb + 1);
 	return ((union savefpu *)p);
 }
 
 struct pcb *
 get_pcb_td(struct thread *td)
 {
 	vm_offset_t p;
 
 	p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
 	    roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN) -
 	    sizeof(struct pcb);
 	return ((struct pcb *)p);
 }
 
 void *
 alloc_fpusave(int flags)
 {
 	void *res;
 	struct savefpu_ymm *sf;
 
 	res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags);
 	if (use_xsave) {
 		sf = (struct savefpu_ymm *)res;
 		bzero(&sf->sv_xstate.sx_hd, sizeof(sf->sv_xstate.sx_hd));
 		sf->sv_xstate.sx_hd.xstate_bv = xsave_mask;
 	}
 	return (res);
 }
 
 /*
  * Common code shared between cpu_fork() and cpu_copy_thread() for
  * initializing a thread.
  */
 static void
 copy_thread(struct thread *td1, struct thread *td2)
 {
 	struct pcb *pcb2;
 
 	pcb2 = td2->td_pcb;
 
 	/* Ensure that td1's pcb is up to date for user threads. */
 	if ((td2->td_pflags & TDP_KTHREAD) == 0) {
 		MPASS(td1 == curthread);
 		td1->td_pcb->pcb_gs = rgs();
 		critical_enter();
 		if (PCPU_GET(fpcurthread) == td1)
 			npxsave(td1->td_pcb->pcb_save);
 		critical_exit();
 	}
 
 	/* Copy td1's pcb */
 	bcopy(td1->td_pcb, pcb2, sizeof(*pcb2));
 
 	/* Properly initialize pcb_save */
 	pcb2->pcb_save = get_pcb_user_save_pcb(pcb2);
 
 	/* Kernel threads start with clean NPX and segment bases. */
 	if ((td2->td_pflags & TDP_KTHREAD) != 0) {
 		pcb2->pcb_gs = _udatasel;
 		set_fsbase(td2, 0);
 		set_gsbase(td2, 0);
 		pcb2->pcb_flags &= ~(PCB_NPXINITDONE | PCB_NPXUSERINITDONE |
 		    PCB_KERNNPX | PCB_KERNNPX_THR);
 	} else {
 		MPASS((pcb2->pcb_flags & (PCB_KERNNPX | PCB_KERNNPX_THR)) == 0);
 		bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2),
 		    cpu_max_ext_state_size);
 	}
 
 	/*
 	 * Set registers for trampoline to user mode.  Leave space for the
 	 * return address on stack.  These are the kernel mode register values.
 	 */
 	pcb2->pcb_edi = 0;
 	pcb2->pcb_esi = (int)fork_return;		    /* trampoline arg */
 	pcb2->pcb_ebp = 0;
 	pcb2->pcb_esp = (int)td2->td_frame - sizeof(void *); /* trampoline arg */
 	pcb2->pcb_ebx = (int)td2;			    /* trampoline arg */
 	pcb2->pcb_eip = (int)fork_trampoline + setidt_disp;
 	/*
 	 * If we didn't copy the pcb, we'd need to do the following registers:
 	 * pcb2->pcb_cr3:	cloned above.
 	 * pcb2->pcb_dr*:	cloned above.
 	 * pcb2->pcb_savefpu:	cloned above.
 	 * pcb2->pcb_flags:	cloned above.
 	 * pcb2->pcb_onfault:	cloned above (always NULL here?).
 	 * pcb2->pcb_gs:	cloned above.
 	 * pcb2->pcb_ext:	cleared below.
 	 */
 	pcb2->pcb_ext = NULL;
 
 	/* Setup to release spin count in fork_exit(). */
 	td2->td_md.md_spinlock_count = 1;
 	td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
 }
 
 /*
  * Finish a fork operation, with process p2 nearly set up.
  * Copy and update the pcb, set up the stack so that the child
  * ready to run and return to user mode.
  */
 void
 cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
 {
 	struct proc *p1;
 	struct pcb *pcb2;
 	struct mdproc *mdp2;
 
 	p1 = td1->td_proc;
 	if ((flags & RFPROC) == 0) {
 		if ((flags & RFMEM) == 0) {
 			/* unshare user LDT */
 			struct mdproc *mdp1 = &p1->p_md;
 			struct proc_ldt *pldt, *pldt1;
 
 			mtx_lock_spin(&dt_lock);
 			if ((pldt1 = mdp1->md_ldt) != NULL &&
 			    pldt1->ldt_refcnt > 1) {
 				pldt = user_ldt_alloc(mdp1, pldt1->ldt_len);
 				if (pldt == NULL)
 					panic("could not copy LDT");
 				mdp1->md_ldt = pldt;
 				set_user_ldt(mdp1);
 				user_ldt_deref(pldt1);
 			} else
 				mtx_unlock_spin(&dt_lock);
 		}
 		return;
 	}
 
 	/* Point the pcb to the top of the stack */
 	pcb2 = get_pcb_td(td2);
 	td2->td_pcb = pcb2;
 
 	copy_thread(td1, td2);
 
 	/* Reset debug registers in the new process */
 	x86_clear_dbregs(pcb2);
 
 	/* Point mdproc and then copy over td1's contents */
 	mdp2 = &p2->p_md;
 	bcopy(&p1->p_md, mdp2, sizeof(*mdp2));
 
 	/*
 	 * Copy the trap frame for the return to user mode as if from a
 	 * syscall.  This copies most of the user mode register values.
 	 * The -VM86_STACK_SPACE (-16) is so we can expand the trapframe
 	 * if we go to vm86.
 	 */
 	td2->td_frame = (struct trapframe *)((caddr_t)td2->td_pcb -
 	    VM86_STACK_SPACE) - 1;
 	bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe));
 
 	/* Set child return values. */
 	p2->p_sysent->sv_set_fork_retval(td2);
 
 	/*
 	 * If the parent process has the trap bit set (i.e. a debugger
 	 * had single stepped the process to the system call), we need
 	 * to clear the trap flag from the new frame.
 	 */
 	td2->td_frame->tf_eflags &= ~PSL_T;
 
 	/* Set cr3 for the new process. */
 	pcb2->pcb_cr3 = pmap_get_cr3(vmspace_pmap(p2->p_vmspace));
 
 	/*
 	 * XXX don't copy the i/o pages.  this should probably be fixed.
 	 */
 	pcb2->pcb_ext = NULL;
 
 	/* Copy the LDT, if necessary. */
 	mtx_lock_spin(&dt_lock);
 	if (mdp2->md_ldt != NULL) {
 		if (flags & RFMEM) {
 			mdp2->md_ldt->ldt_refcnt++;
 		} else {
 			mdp2->md_ldt = user_ldt_alloc(mdp2,
 			    mdp2->md_ldt->ldt_len);
 			if (mdp2->md_ldt == NULL)
 				panic("could not copy LDT");
 		}
 	}
 	mtx_unlock_spin(&dt_lock);
 
 	/*
 	 * Now, cpu_switch() can schedule the new process.
 	 * pcb_esp is loaded pointing to the cpu_switch() stack frame
 	 * containing the return address when exiting cpu_switch.
 	 * This will normally be to fork_trampoline(), which will have
 	 * %ebx loaded with the new proc's pointer.  fork_trampoline()
 	 * will set up a stack to call fork_return(p, frame); to complete
 	 * the return to user-mode.
 	 */
 }
 
 void
 x86_set_fork_retval(struct thread *td)
 {
 	struct trapframe * frame = td->td_frame;
 
 	frame->tf_eax = 0;		/* Child returns zero */
 	frame->tf_eflags &= ~PSL_C;	/* success */
 	frame->tf_edx = 1;		/* System V emulation */
 }
 
 /*
  * Intercept the return address from a freshly forked process that has NOT
  * been scheduled yet.
  *
  * This is needed to make kernel threads stay in kernel mode.
  */
 void
 cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg)
 {
 	/*
 	 * Note that the trap frame follows the args, so the function
 	 * is really called like this:  func(arg, frame);
 	 */
 	td->td_pcb->pcb_esi = (int) func;	/* function */
 	td->td_pcb->pcb_ebx = (int) arg;	/* first arg */
 }
 
 void
 cpu_exit(struct thread *td)
 {
 
 	/*
 	 * If this process has a custom LDT, release it.  Reset pc->pcb_gs
 	 * and %gs before we free it in case they refer to an LDT entry.
 	 */
 	mtx_lock_spin(&dt_lock);
 	if (td->td_proc->p_md.md_ldt) {
 		td->td_pcb->pcb_gs = _udatasel;
 		load_gs(_udatasel);
 		user_ldt_free(td);
 	} else
 		mtx_unlock_spin(&dt_lock);
 }
 
 void
 cpu_thread_exit(struct thread *td)
 {
 
 	critical_enter();
 	if (td == PCPU_GET(fpcurthread))
 		npxdrop();
 	critical_exit();
 
 	/* Disable any hardware breakpoints. */
 	if (td->td_pcb->pcb_flags & PCB_DBREGS) {
 		reset_dbregs();
 		td->td_pcb->pcb_flags &= ~PCB_DBREGS;
 	}
 }
 
 void
 cpu_thread_clean(struct thread *td)
 {
 	struct pcb *pcb;
 
 	pcb = td->td_pcb; 
 	if (pcb->pcb_ext != NULL) {
 		/* if (pcb->pcb_ext->ext_refcount-- == 1) ?? */
 		/*
 		 * XXX do we need to move the TSS off the allocated pages
 		 * before freeing them?  (not done here)
 		 */
 		pmap_trm_free(pcb->pcb_ext, ctob(IOPAGES + 1));
 		pcb->pcb_ext = NULL;
 	}
 }
 
 void
 cpu_thread_alloc(struct thread *td)
 {
 	struct pcb *pcb;
 	struct xstate_hdr *xhdr;
 
 	td->td_pcb = pcb = get_pcb_td(td);
 	td->td_frame = (struct trapframe *)((caddr_t)pcb -
 	    VM86_STACK_SPACE) - 1;
 	pcb->pcb_ext = NULL; 
 	pcb->pcb_save = get_pcb_user_save_pcb(pcb);
 	if (use_xsave) {
 		xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1);
 		bzero(xhdr, sizeof(*xhdr));
 		xhdr->xstate_bv = xsave_mask;
 	}
 }
 
 void
 cpu_thread_free(struct thread *td)
 {
 
 	cpu_thread_clean(td);
 }
 
 bool
 cpu_exec_vmspace_reuse(struct proc *p __unused, vm_map_t map __unused)
 {
 
 	return (true);
 }
 
 int
 cpu_procctl(struct thread *td __unused, int idtype __unused, id_t id __unused,
     int com __unused, void *data __unused)
 {
 
 	return (EINVAL);
 }
 
 void
 cpu_set_syscall_retval(struct thread *td, int error)
 {
 
 	switch (error) {
 	case 0:
 		td->td_frame->tf_eax = td->td_retval[0];
 		td->td_frame->tf_edx = td->td_retval[1];
 		td->td_frame->tf_eflags &= ~PSL_C;
 		break;
 
 	case ERESTART:
 		/*
 		 * Reconstruct pc, assuming lcall $X,y is 7 bytes, int
 		 * 0x80 is 2 bytes. We saved this in tf_err.
 		 */
 		td->td_frame->tf_eip -= td->td_frame->tf_err;
 		break;
 
 	case EJUSTRETURN:
 		break;
 
 	default:
 		td->td_frame->tf_eax = error;
 		td->td_frame->tf_eflags |= PSL_C;
 		break;
 	}
 }
 
 /*
  * Initialize machine state, mostly pcb and trap frame for a new
  * thread, about to return to userspace.  Put enough state in the new
  * thread's PCB to get it to go back to the fork_return(), which
  * finalizes the thread state and handles peculiarities of the first
  * return to userspace for the new thread.
  */
 void
 cpu_copy_thread(struct thread *td, struct thread *td0)
 {
 	copy_thread(td0, td);
 
 	/*
 	 * Copy user general-purpose registers.
 	 *
 	 * Some of these registers are rewritten by cpu_set_upcall()
 	 * and linux_set_upcall().
 	 */
 	bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe));
 
 	/* If the current thread has the trap bit set (i.e. a debugger had
 	 * single stepped the process to the system call), we need to clear
 	 * the trap flag from the new frame. Otherwise, the new thread will
 	 * receive a (likely unexpected) SIGTRAP when it executes the first
 	 * instruction after returning to userland.
 	 */
 	td->td_frame->tf_eflags &= ~PSL_T;
 }
 
 /*
  * Set that machine state for performing an upcall that starts
  * the entry function with the given argument.
  */
 int
 cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg,
     stack_t *stack)
 {
-
-	/* 
-	 * Do any extra cleaning that needs to be done.
-	 * The thread may have optional components
-	 * that are not present in a fresh thread.
-	 * This may be a recycled thread so make it look
-	 * as though it's newly allocated.
-	 */
-	cpu_thread_clean(td);
-
 	/*
 	 * Set the trap frame to point at the beginning of the entry
 	 * function.
 	 */
 	td->td_frame->tf_ebp = 0; 
 	td->td_frame->tf_esp =
 	    (((int)stack->ss_sp + stack->ss_size - 4) & ~0x0f) - 4;
 	td->td_frame->tf_eip = (int)entry;
 
 	/* Return address sentinel value to stop stack unwinding. */
 	if (suword((void *)td->td_frame->tf_esp, 0) != 0)
 		return (EFAULT);
 
 	/* Pass the argument to the entry point. */
 	if (suword((void *)(td->td_frame->tf_esp + sizeof(void *)),
 	    (int)arg) != 0)
 		return (EFAULT);
 	return (0);
 }
 
 int
 cpu_set_user_tls(struct thread *td, void *tls_base)
 {
 	struct segment_descriptor sd;
 	uint32_t base;
 
 	/*
 	 * Construct a descriptor and store it in the pcb for
 	 * the next context switch.  Also store it in the gdt
 	 * so that the load of tf_fs into %fs will activate it
 	 * at return to userland.
 	 */
 	base = (uint32_t)tls_base;
 	sd.sd_lobase = base & 0xffffff;
 	sd.sd_hibase = (base >> 24) & 0xff;
 	sd.sd_lolimit = 0xffff;	/* 4GB limit, wraps around */
 	sd.sd_hilimit = 0xf;
 	sd.sd_type  = SDT_MEMRWA;
 	sd.sd_dpl   = SEL_UPL;
 	sd.sd_p     = 1;
 	sd.sd_xx    = 0;
 	sd.sd_def32 = 1;
 	sd.sd_gran  = 1;
 	critical_enter();
 	/* set %gs */
 	td->td_pcb->pcb_gsd = sd;
 	if (td == curthread) {
 		PCPU_GET(fsgs_gdt)[1] = sd;
 		load_gs(GSEL(GUGS_SEL, SEL_UPL));
 	}
 	critical_exit();
 	return (0);
 }
 
 /*
  * Convert kernel VA to physical address
  */
 vm_paddr_t
 kvtop(void *addr)
 {
 	vm_paddr_t pa;
 
 	pa = pmap_kextract((vm_offset_t)addr);
 	if (pa == 0)
 		panic("kvtop: zero page frame");
 	return (pa);
 }
 
 /*
  * Get an sf_buf from the freelist.  May block if none are available.
  */
 void
 sf_buf_map(struct sf_buf *sf, int flags)
 {
 
 	pmap_sf_buf_map(sf);
 #ifdef SMP
 	sf_buf_shootdown(sf, flags);
 #endif
 }
 
 #ifdef SMP
 static void
 sf_buf_shootdown_curcpu_cb(pmap_t pmap __unused,
     vm_offset_t addr1 __unused, vm_offset_t addr2 __unused)
 {
 }
 
 void
 sf_buf_shootdown(struct sf_buf *sf, int flags)
 {
 	cpuset_t other_cpus;
 	u_int cpuid;
 
 	sched_pin();
 	cpuid = PCPU_GET(cpuid);
 	if (!CPU_ISSET(cpuid, &sf->cpumask)) {
 		CPU_SET(cpuid, &sf->cpumask);
 		invlpg(sf->kva);
 	}
 	if ((flags & SFB_CPUPRIVATE) == 0) {
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
 		CPU_ANDNOT(&other_cpus, &other_cpus, &sf->cpumask);
 		if (!CPU_EMPTY(&other_cpus)) {
 			CPU_OR(&sf->cpumask, &sf->cpumask, &other_cpus);
 			smp_masked_invlpg(other_cpus, sf->kva, kernel_pmap,
 			    sf_buf_shootdown_curcpu_cb);
 		}
 	}
 	sched_unpin();
 }
 #endif
 
 /*
  * MD part of sf_buf_free().
  */
 int
 sf_buf_unmap(struct sf_buf *sf)
 {
 
 	return (0);
 }
 
 static void
 sf_buf_invalidate(struct sf_buf *sf)
 {
 	vm_page_t m = sf->m;
 
 	/*
 	 * Use pmap_qenter to update the pte for
 	 * existing mapping, in particular, the PAT
 	 * settings are recalculated.
 	 */
 	pmap_qenter(sf->kva, &m, 1);
 	pmap_invalidate_cache_range(sf->kva, sf->kva + PAGE_SIZE);
 }
 
 /*
  * Invalidate the cache lines that may belong to the page, if
  * (possibly old) mapping of the page by sf buffer exists.  Returns
  * TRUE when mapping was found and cache invalidated.
  */
 boolean_t
 sf_buf_invalidate_cache(vm_page_t m)
 {
 
 	return (sf_buf_process_page(m, sf_buf_invalidate));
 }