diff --git a/sys/amd64/amd64/vm_machdep.c b/sys/amd64/amd64/vm_machdep.c
index f10d0339a65a..6e60f2b3faff 100644
--- a/sys/amd64/amd64/vm_machdep.c
+++ b/sys/amd64/amd64/vm_machdep.c
@@ -1,735 +1,759 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1982, 1986 The Regents of the University of California.
  * Copyright (c) 1989, 1990 William Jolitz
  * Copyright (c) 1994 John Dyson
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
  *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_isa.h"
 #include "opt_cpu.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/procctl.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/wait.h>
 
 #include <machine/cpu.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
 #include <machine/specialreg.h>
 #include <machine/tss.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_param.h>
 
 _Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
     "OFFSETOF_MONITORBUF does not correspond with offset of pc_monitorbuf.");
 
 /*
  * Set the thread's md_stack_base: the end of the kernel stack minus
  * cpu_max_ext_state_size rounded up to XSAVE_AREA_ALIGN.
  */
 void
 set_top_of_stack_td(struct thread *td)
 {
 
 	/* Start at the end of the kernel stack... */
 	td->td_md.md_stack_base = td->td_kstack +
 	    td->td_kstack_pages * PAGE_SIZE;
 	/* ...and step back over the aligned extended-state area. */
 	td->td_md.md_stack_base -=
 	    roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN);
 }
 
 /*
  * Return the thread's user FPU save area, which is located at
  * md_stack_base.  The address is asserted to be XSAVE_AREA_ALIGN-aligned.
  */
 struct savefpu *
 get_pcb_user_save_td(struct thread *td)
 {
 	vm_offset_t save_addr = td->td_md.md_stack_base;
 
 	KASSERT((save_addr % XSAVE_AREA_ALIGN) == 0,
 	    ("Unaligned pcb_user_save area ptr %#lx td %p", save_addr, td));
 	return ((struct savefpu *)save_addr);
 }
 
 /* Return the PCB embedded in the thread's machine-dependent data. */
 struct pcb *
 get_pcb_td(struct thread *td)
 {
 	struct pcb *pcb;
 
 	pcb = &td->td_md.md_pcb;
 	return (pcb);
 }
 
 /*
  * Return the user FPU save area of the thread whose td_md.md_pcb is the
  * given PCB.
  */
 struct savefpu *
 get_pcb_user_save_pcb(struct pcb *pcb)
 {
 
 	return (get_pcb_user_save_td(__containerof(pcb, struct thread,
 	    td_md.md_pcb)));
 }
 
 void *
 alloc_fpusave(int flags)
 {
 	void *res;
 	struct savefpu_ymm *sf;
 
 	res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags);
 	if (use_xsave) {
 		sf = (struct savefpu_ymm *)res;
 		bzero(&sf->sv_xstate.sx_hd, sizeof(sf->sv_xstate.sx_hd));
 		sf->sv_xstate.sx_hd.xstate_bv = xsave_mask;
 	}
 	return (res);
 }
 
 /*
  * Finish a fork operation, with process p2 nearly set up.
  * Copy and update the pcb, set up the stack so that the child is
  * ready to run and return to user mode.
  */
 void
 cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
 {
 	struct proc *p1;
 	struct pcb *pcb2;
 	struct mdproc *mdp1, *mdp2;
 	struct proc_ldt *pldt;
 
 	p1 = td1->td_proc;
 	/* Without RFPROC no new process is set up; at most unshare the LDT. */
 	if ((flags & RFPROC) == 0) {
 		if ((flags & RFMEM) == 0) {
 			/* unshare user LDT */
 			mdp1 = &p1->p_md;
 			mtx_lock(&dt_lock);
 			if ((pldt = mdp1->md_ldt) != NULL &&
 			    pldt->ldt_refcnt > 1 &&
 			    user_ldt_alloc(p1, 1) == NULL)
 				panic("could not copy LDT");
 			mtx_unlock(&dt_lock);
 		}
 		return;
 	}
 
-	/* Ensure that td1's pcb is up to date. */
-	fpuexit(td1);
-	if (td1 == curthread)
+	/* Ensure that td1's pcb is up to date for user processes. */
+	if ((td2->td_pflags & TDP_KTHREAD) == 0) {
+		MPASS(td1 == curthread);
+		fpuexit(td1);
 		update_pcb_bases(td1->td_pcb);
+	}
 
 	/* Point the stack and pcb to the actual location */
 	set_top_of_stack_td(td2);
 	td2->td_pcb = pcb2 = get_pcb_td(td2);
 
 	/* Copy td1's pcb */
 	bcopy(td1->td_pcb, pcb2, sizeof(*pcb2));
 
 	/* Properly initialize pcb_save */
 	pcb2->pcb_save = get_pcb_user_save_pcb(pcb2);
-	bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2),
-	    cpu_max_ext_state_size);
+
+	/* Kernel processes start with clean FPU and segment bases. */
+	if ((td2->td_pflags & TDP_KTHREAD) != 0) {
+		pcb2->pcb_fsbase = 0;
+		pcb2->pcb_gsbase = 0;
+		clear_pcb_flags(pcb2, PCB_FPUINITDONE | PCB_USERFPUINITDONE |
+		    PCB_KERNFPU | PCB_KERNFPU_THR);
+	} else {
+		MPASS((pcb2->pcb_flags & (PCB_KERNFPU | PCB_KERNFPU_THR)) == 0);
+		bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2),
+		    cpu_max_ext_state_size);
+	}
 
 	/* Point mdproc and then copy over td1's contents */
 	mdp2 = &p2->p_md;
 	bcopy(&p1->p_md, mdp2, sizeof(*mdp2));
 
 	/*
 	 * Create a new fresh stack for the new process.
 	 * Copy the trap frame for the return to user mode as if from a
 	 * syscall.  This copies most of the user mode register values.
 	 */
 	td2->td_frame = (struct trapframe *)td2->td_md.md_stack_base - 1;
 	bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe));
 
 	td2->td_frame->tf_rax = 0;		/* Child returns zero */
 	td2->td_frame->tf_rflags &= ~PSL_C;	/* success */
 	td2->td_frame->tf_rdx = 1;
 
 	/*
 	 * If the parent process has the trap bit set (i.e. a debugger
 	 * had single stepped the process to the system call), we need
 	 * to clear the trap flag from the new frame.
 	 */
 	td2->td_frame->tf_rflags &= ~PSL_T;
 
 	/*
 	 * Set registers for trampoline to user mode.  Leave space for the
 	 * return address on stack.  These are the kernel mode register values.
 	 */
 	pcb2->pcb_r12 = (register_t)fork_return;	/* fork_trampoline argument */
 	pcb2->pcb_rbp = 0;
 	pcb2->pcb_rsp = (register_t)td2->td_frame - sizeof(void *);
 	pcb2->pcb_rbx = (register_t)td2;		/* fork_trampoline argument */
 	pcb2->pcb_rip = (register_t)fork_trampoline;
 	/*-
 	 * pcb2->pcb_dr*:	cloned above.
 	 * pcb2->pcb_savefpu:	cloned above.
 	 * pcb2->pcb_flags:	cloned above.
 	 * pcb2->pcb_onfault:	cloned above (always NULL here?).
 	 * pcb2->pcb_[fg]sbase:	cloned above
 	 */
 
 	/* Setup to release spin count in fork_exit(). */
 	td2->td_md.md_spinlock_count = 1;
 	td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
 	pmap_thread_init_invl_gen(td2);
 
 	/* As an i386, do not copy io permission bitmap. */
 	pcb2->pcb_tssp = NULL;
 
 	/* New segment registers. */
 	set_pcb_flags_raw(pcb2, PCB_FULL_IRET);
 
 	/* Copy the LDT, if necessary. */
 	mdp1 = &td1->td_proc->p_md;
 	mdp2 = &p2->p_md;
 	/* Unlocked fast path: the parent has no LDT, so the child gets none. */
 	if (mdp1->md_ldt == NULL) {
 		mdp2->md_ldt = NULL;
 		return;
 	}
 	mtx_lock(&dt_lock);
 	/* Re-check under dt_lock. */
 	if (mdp1->md_ldt != NULL) {
 		if (flags & RFMEM) {
 			/* Shared address space: share the LDT by reference. */
 			mdp1->md_ldt->ldt_refcnt++;
 			mdp2->md_ldt = mdp1->md_ldt;
 			bcopy(&mdp1->md_ldt_sd, &mdp2->md_ldt_sd, sizeof(struct
 			    system_segment_descriptor));
 		} else {
 			/* Separate address space: give the child its own copy. */
 			mdp2->md_ldt = NULL;
 			mdp2->md_ldt = user_ldt_alloc(p2, 0);
 			if (mdp2->md_ldt == NULL)
 				panic("could not copy LDT");
 			amd64_set_ldt_data(td2, 0, max_ldt_segment,
 			    (struct user_segment_descriptor *)
 			    mdp1->md_ldt->ldt_base);
 		}
 	} else
 		mdp2->md_ldt = NULL;
 	mtx_unlock(&dt_lock);
 
 	/*
 	 * Now, cpu_switch() can schedule the new process.
 	 * pcb_rsp is loaded pointing to the cpu_switch() stack frame
 	 * containing the return address when exiting cpu_switch.
 	 * This will normally be to fork_trampoline(), which will have
 	 * %rbx loaded with the new thread's pointer.  fork_trampoline()
 	 * will set up a stack to call fork_return(p, frame); to complete
 	 * the return to user-mode.
 	 */
 }
 
 /*
  * Intercept the return address from a freshly forked process that has NOT
  * been scheduled yet.
  *
  * This is needed to make kernel threads stay in kernel mode.
  */
 void
 cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg)
 {
 	/*
 	 * Note that the trap frame follows the args, so the function
 	 * is really called like this:  func(arg, frame);
 	 */
 	td->td_pcb->pcb_r12 = (long) func;	/* function */
 	td->td_pcb->pcb_rbx = (long) arg;	/* first arg */
 }
 
 void
 cpu_exit(struct thread *td)
 {
 
 	/*
 	 * If this process has a custom LDT, release it.
 	 */
 	if (td->td_proc->p_md.md_ldt != NULL)
 		user_ldt_free(td);
 }
 
 /*
  * Machine-dependent thread exit work: drop the FPU if this thread is
  * the CPU's fpcurthread, and disable its hardware breakpoints.
  */
 void
 cpu_thread_exit(struct thread *td)
 {
 	struct pcb *pcb = td->td_pcb;
 
 	critical_enter();
 	if (PCPU_GET(fpcurthread) == td)
 		fpudrop();
 	critical_exit();
 
 	/* Nothing more to do unless debug registers are in use. */
 	if ((pcb->pcb_flags & PCB_DBREGS) == 0)
 		return;
 
 	/* Disable any hardware breakpoints. */
 	reset_dbregs();
 	clear_pcb_flags(pcb, PCB_DBREGS);
 }
 
 void
 cpu_thread_clean(struct thread *td)
 {
 	struct pcb *pcb;
 
 	pcb = td->td_pcb;
 
 	/*
 	 * Clean TSS/iomap
 	 */
 	if (pcb->pcb_tssp != NULL) {
 		pmap_pti_remove_kva((vm_offset_t)pcb->pcb_tssp,
 		    (vm_offset_t)pcb->pcb_tssp + ctob(IOPAGES + 1));
 		kmem_free((vm_offset_t)pcb->pcb_tssp, ctob(IOPAGES + 1));
 		pcb->pcb_tssp = NULL;
 	}
 }
 
 /* Swap-in hook; performs no machine-dependent work here. */
 void
 cpu_thread_swapin(struct thread *td)
 {
 }
 
 /* Swap-out hook; performs no machine-dependent work here. */
 void
 cpu_thread_swapout(struct thread *td)
 {
 }
 
 /*
  * Initialize the machine-dependent parts of a thread: the stack base,
  * the PCB pointer, the trap frame location just below the stack base,
  * and the user FPU save area pointer.  With XSAVE in use, the xstate
  * header following the legacy save area is zeroed and its xstate_bv
  * set to xsave_mask.
  */
 void
 cpu_thread_alloc(struct thread *td)
 {
 	struct xstate_hdr *xhdr;
 	struct pcb *pcb;
 
 	set_top_of_stack_td(td);
 	pcb = get_pcb_td(td);
 	td->td_pcb = pcb;
 	td->td_frame = (struct trapframe *)td->td_md.md_stack_base - 1;
 	pcb->pcb_save = get_pcb_user_save_pcb(pcb);
 	if (!use_xsave)
 		return;
 
 	xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1);
 	bzero(xhdr, sizeof(*xhdr));
 	xhdr->xstate_bv = xsave_mask;
 }
 
 /*
  * Machine-dependent thread free hook; delegates entirely to
  * cpu_thread_clean().
  */
 void
 cpu_thread_free(struct thread *td)
 {
 
 	cpu_thread_clean(td);
 }
 
 /*
  * Report whether the map's pmap agrees with the KPTI setting requested
  * for the current process: true when P_MD_KPTI being set matches the
  * pmap having a user %cr3 (pm_ucr3 != PMAP_NO_CR3).
  *
  * Note that the flag is read from curproc, not from the 'p' argument.
  */
 bool
 cpu_exec_vmspace_reuse(struct proc *p, vm_map_t map)
 {
 	bool want_kpti, have_ucr3;
 
 	want_kpti = (curproc->p_md.md_flags & P_MD_KPTI) != 0;
 	have_ucr3 = vm_map_pmap(map)->pm_ucr3 != PMAP_NO_CR3;
 	return (want_kpti == have_ucr3);
 }
 
 /*
  * Apply a PROC_KPTI_CTL request to p's md_flags.  Enabling only takes
  * effect when the global 'pti' is set; disabling always clears the
  * flag.
  */
 static void
 cpu_procctl_kpti_ctl(struct proc *p, int val)
 {
 
 	if (val == PROC_KPTI_CTL_ENABLE_ON_EXEC) {
 		if (pti)
 			p->p_md.md_flags |= P_MD_KPTI;
 	}
 	if (val == PROC_KPTI_CTL_DISABLE_ON_EXEC)
 		p->p_md.md_flags &= ~P_MD_KPTI;
 }
 
 /*
  * Report p's KPTI state in *val: the on-exec setting taken from
  * P_MD_KPTI, or'ed with PROC_KPTI_STATUS_ACTIVE when the process's
  * pmap currently has a user %cr3.
  */
 static void
 cpu_procctl_kpti_status(struct proc *p, int *val)
 {
 	int res;
 
 	if ((p->p_md.md_flags & P_MD_KPTI) != 0)
 		res = PROC_KPTI_CTL_ENABLE_ON_EXEC;
 	else
 		res = PROC_KPTI_CTL_DISABLE_ON_EXEC;
 	if (vmspace_pmap(p->p_vmspace)->pm_ucr3 != PMAP_NO_CR3)
 		res |= PROC_KPTI_STATUS_ACTIVE;
 	*val = res;
 }
 
 /*
  * Apply a PROC_LA_CTL request to p's md_flags.
  *
  * Returns 0 on success, or ENOTSUP when LA57 is requested but the
  * global 'la57' is not set.  Values other than the three known
  * requests leave the flags untouched and return 0.
  */
 static int
 cpu_procctl_la_ctl(struct proc *p, int val)
 {
 
 	if (val == PROC_LA_CTL_LA48_ON_EXEC) {
 		p->p_md.md_flags |= P_MD_LA48;
 		p->p_md.md_flags &= ~P_MD_LA57;
 	} else if (val == PROC_LA_CTL_LA57_ON_EXEC) {
 		if (!la57)
 			return (ENOTSUP);
 		p->p_md.md_flags &= ~P_MD_LA48;
 		p->p_md.md_flags |= P_MD_LA57;
 	} else if (val == PROC_LA_CTL_DEFAULT_ON_EXEC) {
 		p->p_md.md_flags &= ~(P_MD_LA48 | P_MD_LA57);
 	}
 	return (0);
 }
 
 /*
  * Report p's linear-address state in *val: the on-exec setting derived
  * from P_MD_LA48/P_MD_LA57, or'ed with a status bit chosen by comparing
  * the sysentvec's sv_maxuser against VM_MAXUSER_ADDRESS_LA48.
  */
 static void
 cpu_procctl_la_status(struct proc *p, int *val)
 {
 	int ctl, status;
 
 	/* P_MD_LA48 takes precedence if both flags happen to be set. */
 	if ((p->p_md.md_flags & P_MD_LA48) != 0)
 		ctl = PROC_LA_CTL_LA48_ON_EXEC;
 	else if ((p->p_md.md_flags & P_MD_LA57) != 0)
 		ctl = PROC_LA_CTL_LA57_ON_EXEC;
 	else
 		ctl = PROC_LA_CTL_DEFAULT_ON_EXEC;
 
 	status = p->p_sysent->sv_maxuser == VM_MAXUSER_ADDRESS_LA48 ?
 	    PROC_LA_STATUS_LA48 : PROC_LA_STATUS_LA57;
 	*val = ctl | status;
 }
 
 /*
  * Machine-dependent procctl handler for PROC_KPTI_CTL, PROC_KPTI_STATUS,
  * PROC_LA_CTL and PROC_LA_STATUS.
  *
  * Only P_PID targets are accepted.  The *_CTL commands copy an int in
  * from 'data' and validate it before the target process is looked up;
  * the *_STATUS commands copy an int out to 'data' after the process has
  * been unlocked.  PROC_KPTI_CTL additionally requires PRIV_IO.
  *
  * Returns 0 on success, EINVAL for an unknown command, a non-P_PID
  * idtype or an invalid control value, or the error returned by
  * priv_check(), copyin(), pget(), cpu_procctl_la_ctl() or copyout().
  */
 int
 cpu_procctl(struct thread *td, int idtype, id_t id, int com, void *data)
 {
 	struct proc *p;
 	int error, val;
 	bool is_ctl;
 
 	switch (com) {
 	case PROC_KPTI_CTL:
 	case PROC_LA_CTL:
 		is_ctl = true;
 		break;
 	case PROC_KPTI_STATUS:
 	case PROC_LA_STATUS:
 		is_ctl = false;
 		break;
 	default:
 		return (EINVAL);
 	}
 	if (idtype != P_PID)
 		return (EINVAL);
 
 	if (com == PROC_KPTI_CTL) {
 		/* sad but true and not a joke */
 		error = priv_check(td, PRIV_IO);
 		if (error != 0)
 			return (error);
 	}
 	if (is_ctl) {
 		error = copyin(data, &val, sizeof(val));
 		if (error != 0)
 			return (error);
 		if (com == PROC_KPTI_CTL) {
 			if (val != PROC_KPTI_CTL_ENABLE_ON_EXEC &&
 			    val != PROC_KPTI_CTL_DISABLE_ON_EXEC)
 				return (EINVAL);
 		} else {
 			if (val != PROC_LA_CTL_LA48_ON_EXEC &&
 			    val != PROC_LA_CTL_LA57_ON_EXEC &&
 			    val != PROC_LA_CTL_DEFAULT_ON_EXEC)
 				return (EINVAL);
 		}
 	}
 
 	error = pget(id, PGET_CANSEE | PGET_NOTWEXIT | PGET_NOTID, &p);
 	if (error != 0)
 		return (error);
 	switch (com) {
 	case PROC_KPTI_CTL:
 		cpu_procctl_kpti_ctl(p, val);
 		break;
 	case PROC_KPTI_STATUS:
 		cpu_procctl_kpti_status(p, &val);
 		break;
 	case PROC_LA_CTL:
 		error = cpu_procctl_la_ctl(p, val);
 		break;
 	case PROC_LA_STATUS:
 		cpu_procctl_la_status(p, &val);
 		break;
 	}
 	PROC_UNLOCK(p);
 
 	/* Status queries hand their result back to the caller. */
 	if (!is_ctl)
 		error = copyout(&val, data, sizeof(val));
 	return (error);
 }
 
 void
 cpu_set_syscall_retval(struct thread *td, int error)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	if (__predict_true(error == 0)) {
 		frame->tf_rax = td->td_retval[0];
 		frame->tf_rdx = td->td_retval[1];
 		frame->tf_rflags &= ~PSL_C;
 		return;
 	}
 
 	switch (error) {
 	case ERESTART:
 		/*
 		 * Reconstruct pc, we know that 'syscall' is 2 bytes,
 		 * lcall $X,y is 7 bytes, int 0x80 is 2 bytes.
 		 * We saved this in tf_err.
 		 * %r10 (which was holding the value of %rcx) is restored
 		 * for the next iteration.
 		 * %r10 restore is only required for freebsd/amd64 processes,
 		 * but shall be innocent for any ia32 ABI.
 		 *
 		 * Require full context restore to get the arguments
 		 * in the registers reloaded at return to usermode.
 		 */
 		frame->tf_rip -= frame->tf_err;
 		frame->tf_r10 = frame->tf_rcx;
 		set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
 		break;
 
 	case EJUSTRETURN:
 		break;
 
 	default:
 		frame->tf_rax = error;
 		frame->tf_rflags |= PSL_C;
 		break;
 	}
 }
 
 /*
  * Initialize machine state, mostly pcb and trap frame for a new
  * thread, about to return to userspace.  Put enough state in the new
  * thread's PCB to get it to go back to the fork_return(), which
  * finalizes the thread state and handles peculiarities of the first
  * return to userspace for the new thread.
  *
  * td is the new thread being set up; td0 is the thread whose pcb and
  * trap frame are copied.
  */
 void
 cpu_copy_thread(struct thread *td, struct thread *td0)
 {
 	struct pcb *pcb2;
 
 	pcb2 = td->td_pcb;
 
-	/* Ensure that td0's pcb is up to date. */
-	fpuexit(td0);
-	if (td0 == curthread)
+	/* Ensure that td0's pcb is up to date for user threads. */
+	if ((td->td_pflags & TDP_KTHREAD) == 0) {
+		MPASS(td0 == curthread);
+		fpuexit(td0);
 		update_pcb_bases(td0->td_pcb);
+	}
 
 	/*
 	 * Copy the upcall pcb.  This loads kernel regs.
 	 * Those not loaded individually below get their default
 	 * values here.
 	 */
 	bcopy(td0->td_pcb, pcb2, sizeof(*pcb2));
-	clear_pcb_flags(pcb2, PCB_KERNFPU);
 	pcb2->pcb_save = get_pcb_user_save_pcb(pcb2);
-	bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save,
-	    cpu_max_ext_state_size);
+
+	/* Kernel threads start with clean FPU and segment bases. */
+	if ((td->td_pflags & TDP_KTHREAD) != 0) {
+		pcb2->pcb_fsbase = 0;
+		pcb2->pcb_gsbase = 0;
+		clear_pcb_flags(pcb2, PCB_FPUINITDONE | PCB_USERFPUINITDONE |
+		    PCB_KERNFPU | PCB_KERNFPU_THR);
+	} else {
+		MPASS((pcb2->pcb_flags & (PCB_KERNFPU | PCB_KERNFPU_THR)) == 0);
+		bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save,
+		    cpu_max_ext_state_size);
+	}
 	set_pcb_flags_raw(pcb2, PCB_FULL_IRET);
 
+
 	/*
 	 * Create a new fresh stack for the new thread.
 	 */
 	bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe));
 
 	/* If the current thread has the trap bit set (i.e. a debugger had
 	 * single stepped the process to the system call), we need to clear
 	 * the trap flag from the new frame. Otherwise, the new thread will
 	 * receive a (likely unexpected) SIGTRAP when it executes the first
 	 * instruction after returning to userland.
 	 */
 	td->td_frame->tf_rflags &= ~PSL_T;
 
 	/*
 	 * Set registers for trampoline to user mode.  Leave space for the
 	 * return address on stack.  These are the kernel mode register values.
 	 */
 	pcb2->pcb_r12 = (register_t)fork_return;	    /* trampoline arg */
 	pcb2->pcb_rbp = 0;
 	pcb2->pcb_rsp = (register_t)td->td_frame - sizeof(void *);	/* trampoline arg */
 	pcb2->pcb_rbx = (register_t)td;			    /* trampoline arg */
 	pcb2->pcb_rip = (register_t)fork_trampoline;
 	/*
 	 * If we didn't copy the pcb, we'd need to do the following registers:
 	 * pcb2->pcb_dr*:	cloned above.
 	 * pcb2->pcb_savefpu:	cloned above.
 	 * pcb2->pcb_onfault:	cloned above (always NULL here?).
 	 * pcb2->pcb_[fg]sbase: cloned above
 	 */
 
 	/* Setup to release spin count in fork_exit(). */
 	td->td_md.md_spinlock_count = 1;
 	td->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
 	pmap_thread_init_invl_gen(td);
 }
 
 /*
  * Set up the thread's trap frame so that, on return to user mode, it
  * starts executing 'entry' with 'arg' as its argument on the given
  * stack.
  */
 void
 cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg,
     stack_t *stack)
 {
 	struct trapframe *tf;
 
 	/*
 	 * This may be a recycled thread carrying optional components
 	 * that a fresh thread would not have; clean those up first so
 	 * it looks newly allocated.
 	 */
 	cpu_thread_clean(td);
 
 	tf = td->td_frame;
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		/* Point the trap frame at the start of the entry function. */
 		tf->tf_rbp = 0;
 		tf->tf_rsp =
 		   (((uintptr_t)stack->ss_sp + stack->ss_size - 4) & ~0x0f) - 4;
 		tf->tf_rip = (uintptr_t)entry;
 
 		/* Return address sentinel value to stop stack unwinding. */
 		suword32((void *)tf->tf_rsp, 0);
 
 		/* The argument goes on the stack, above the sentinel. */
 		suword32((void *)(tf->tf_rsp + sizeof(int32_t)),
 		    (uint32_t)(uintptr_t)arg);
 
 		return;
 	}
 #endif
 
 	/* Point the trap frame at the start of the entry function. */
 	tf->tf_rbp = 0;
 	tf->tf_rsp = ((register_t)stack->ss_sp + stack->ss_size) & ~0x0f;
 	tf->tf_rsp -= 8;
 	tf->tf_rip = (register_t)entry;
 
 	/* Load the user segment selectors and mark them valid. */
 	tf->tf_ds = _udatasel;
 	tf->tf_es = _udatasel;
 	tf->tf_fs = _ufssel;
 	tf->tf_gs = _ugssel;
 	tf->tf_flags = TF_HASSEGS;
 
 	/* Return address sentinel value to stop stack unwinding. */
 	suword((void *)tf->tf_rsp, 0);
 
 	/* The argument is passed in %rdi. */
 	tf->tf_rdi = (register_t)arg;
 }
 
 /*
  * Set the thread's user TLS base.  32-bit (SV_ILP32) processes get it
  * in pcb_gsbase, all others in pcb_fsbase.  PCB_FULL_IRET is set so the
  * new base is picked up.
  *
  * Returns EINVAL if tls_base is not below VM_MAXUSER_ADDRESS, else 0.
  */
 int
 cpu_set_user_tls(struct thread *td, void *tls_base)
 {
 	struct pcb *pcb;
 
 	if ((u_int64_t)tls_base >= VM_MAXUSER_ADDRESS)
 		return (EINVAL);
 
 	pcb = td->td_pcb;
 	set_pcb_flags(pcb, PCB_FULL_IRET);
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32))
 		pcb->pcb_gsbase = (register_t)tls_base;
 	else
 #endif
 		pcb->pcb_fsbase = (register_t)tls_base;
 	return (0);
 }
 
 /*
  * Software interrupt handler for queued VM system processing: runs
  * busdma_swi() when busdma work is pending.
  */
 void
 swi_vm(void *dummy)
 {
 
 	if (busdma_swi_pending == 0)
 		return;
 	busdma_swi();
 }
 
 /*
  * Tell whether this address is in some physical memory region.
  * Currently used by the kernel coredump code in order to avoid
  * dumping the ``ISA memory hole'' which could cause indefinite hangs,
  * or other unpredictable behaviour.
  *
  * Returns 0 for addresses in the ISA hole (only when DEV_ISA is
  * configured) and 1 for everything else.
  */
 int
 is_physical_memory(vm_paddr_t addr)
 {
 
 #ifdef DEV_ISA
 	/* The ISA ``memory hole'': 0xa0000 up to, not including, 0x100000. */
 	if (addr >= 0xa0000 && addr < 0x100000)
 		return (0);
 #endif
 
 	/*
 	 * stuff other tests for known memory-mapped devices (PCI?)
 	 * here
 	 */
 
 	return (1);
 }
diff --git a/sys/i386/i386/sys_machdep.c b/sys/i386/i386/sys_machdep.c
index 3f650b65e160..a0a1c273f467 100644
--- a/sys/i386/i386/sys_machdep.c
+++ b/sys/i386/i386/sys_machdep.c
@@ -1,809 +1,811 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)sys_machdep.c	5.5 (Berkeley) 1/19/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_kstack_pages.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
 #include <sys/sysproto.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 
 #include <machine/atomic.h>
 #include <machine/cpu.h>
 #include <machine/pcb.h>
 #include <machine/pcb_ext.h>
 #include <machine/proc.h>
 #include <machine/sysarch.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/vm_kern.h>		/* for kernel_map */
 
 /* Upper bound on the descriptor count accepted by I386_SET_LDT. */
 #define MAX_LD 8192
 /* Granule used when rounding LDT sizes. */
 #define LD_PER_PAGE 512
 /*
  * Number of LDT slots to allocate for 'num' descriptors: the next
  * multiple of LD_PER_PAGE strictly greater than 'num'.  The argument is
  * parenthesized so that low-precedence expressions expand correctly.
  */
 #define	NEW_MAX_LD(num)  rounddown2((num) + LD_PER_PAGE, LD_PER_PAGE)
 /* The slot count from NEW_MAX_LD() shifted left by 3. */
 #define SIZE_FROM_LARGEST_LD(num) (NEW_MAX_LD(num) << 3)
 #define	NULL_LDT_BASE	((caddr_t)NULL)
 
 /* Forward declarations of file-local helpers. */
 #ifdef SMP
 static void set_user_ldt_rv(void *arg);
 #endif
 static int i386_set_ldt_data(struct thread *, int start, int num,
     union descriptor *descs);
 static int i386_ldt_grow(struct thread *td, int len);
 
 /*
  * Fill in *sdp as a present, user-privilege, read/write data segment
  * descriptor with the given 32-bit base and a 4GB limit.
  */
 void
 fill_based_sd(struct segment_descriptor *sdp, uint32_t base)
 {
 
 	/* Base: low 24 bits and high 8 bits are stored separately. */
 	sdp->sd_lobase = base & 0xffffff;
 	sdp->sd_hibase = (base >> 24) & 0xff;
 
 	/* Limit: 4GB, wraps around. */
 	sdp->sd_lolimit = 0xffff;
 	sdp->sd_hilimit = 0xf;
 	sdp->sd_gran = 1;
 
 	/* Type and attributes. */
 	sdp->sd_type = SDT_MEMRWA;
 	sdp->sd_dpl = SEL_UPL;
 	sdp->sd_p = 1;
 	sdp->sd_xx = 0;
 	sdp->sd_def32 = 1;
 }
 
 /*
  * Construct special descriptors for "base" selectors.  Store them in
  * the PCB for later use by cpu_switch().  Store them in the GDT for
  * more immediate use.  The GDT entries are part of the current
  * context.  Callers must load related segment registers to complete
  * setting up the current context.
  */
 void
 set_fsbase(struct thread *td, uint32_t base)
 {
 	struct segment_descriptor sd;
 
 	fill_based_sd(&sd, base);
 	critical_enter();
 	/* Record the descriptor in the thread's PCB. */
 	td->td_pcb->pcb_fsd = sd;
-	PCPU_GET(fsgs_gdt)[0] = sd;
+	if (td == curthread)
+		PCPU_GET(fsgs_gdt)[0] = sd;
 	critical_exit();
 }
 
 /*
  * Build a "base" descriptor for the given base and store it in the
  * thread's PCB as pcb_gsd; slot 1 of the per-CPU fsgs_gdt is written
  * as well.
  */
 void
 set_gsbase(struct thread *td, uint32_t base)
 {
 	struct segment_descriptor sd;
 
 	fill_based_sd(&sd, base);
 	critical_enter();
 	/* Record the descriptor in the thread's PCB. */
 	td->td_pcb->pcb_gsd = sd;
-	PCPU_GET(fsgs_gdt)[1] = sd;
+	if (td == curthread)
+		PCPU_GET(fsgs_gdt)[1] = sd;
 	critical_exit();
 }
 
 /* Fallback definition of the sysarch() argument structure. */
 #ifndef _SYS_SYSPROTO_H_
 struct sysarch_args {
 	int op;		/* operation selector, see the switch in sysarch() */
 	char *parms;	/* user pointer to the operation's arguments */
 };
 #endif
 
 /*
  * Machine-dependent sysarch() system call.
  *
  * In capability mode, operations that are not explicitly listed as
  * safe fail with ECAPMODE.  The argument structure is then copied in
  * for the ioperm, LDT and xfpustate operations, and the request is
  * dispatched on uap->op.  Unknown operations return EINVAL.
  */
 int
 sysarch(struct thread *td, struct sysarch_args *uap)
 {
 	int error;
 	union descriptor *lp;
 	union {
 		struct i386_ldt_args largs;
 		struct i386_ioperm_args iargs;
 		struct i386_get_xfpustate xfpu;
 	} kargs;
 	uint32_t base;
 	struct segment_descriptor *sdp;
 
 	AUDIT_ARG_CMD(uap->op);
 
 #ifdef CAPABILITY_MODE
 	/*
 	 * When adding new operations, add a new case statement here to
 	 * explicitly indicate whether or not the operation is safe to
 	 * perform in capability mode.
 	 */
 	if (IN_CAPABILITY_MODE(td)) {
 		switch (uap->op) {
 		case I386_GET_LDT:
 		case I386_SET_LDT:
 		case I386_GET_IOPERM:
 		case I386_GET_FSBASE:
 		case I386_SET_FSBASE:
 		case I386_GET_GSBASE:
 		case I386_SET_GSBASE:
 		case I386_GET_XFPUSTATE:
 			break;
 
 		case I386_SET_IOPERM:
 		default:
 #ifdef KTRACE
 			if (KTRPOINT(td, KTR_CAPFAIL))
 				ktrcapfail(CAPFAIL_SYSCALL, NULL, NULL);
 #endif
 			return (ECAPMODE);
 		}
 	}
 #endif
 
 	/* First pass: copy in the argument structure where one is used. */
 	switch (uap->op) {
 	case I386_GET_IOPERM:
 	case I386_SET_IOPERM:
 		if ((error = copyin(uap->parms, &kargs.iargs,
 		    sizeof(struct i386_ioperm_args))) != 0)
 			return (error);
 		break;
 	case I386_GET_LDT:
 	case I386_SET_LDT:
 		if ((error = copyin(uap->parms, &kargs.largs,
 		    sizeof(struct i386_ldt_args))) != 0)
 			return (error);
 		break;
 	case I386_GET_XFPUSTATE:
 		if ((error = copyin(uap->parms, &kargs.xfpu,
 		    sizeof(struct i386_get_xfpustate))) != 0)
 			return (error);
 		break;
 	default:
 		break;
 	}
 
 	/* Second pass: perform the operation. */
 	switch (uap->op) {
 	case I386_GET_LDT:
 		error = i386_get_ldt(td, &kargs.largs);
 		break;
 	case I386_SET_LDT:
 		if (kargs.largs.descs != NULL) {
 			/* Bound the count before sizing the temporary copy. */
 			if (kargs.largs.num > MAX_LD)
 				return (EINVAL);
 			lp = malloc(kargs.largs.num * sizeof(union descriptor),
 			    M_TEMP, M_WAITOK);
 			error = copyin(kargs.largs.descs, lp,
 			    kargs.largs.num * sizeof(union descriptor));
 			if (error == 0)
 				error = i386_set_ldt(td, &kargs.largs, lp);
 			free(lp, M_TEMP);
 		} else {
 			error = i386_set_ldt(td, &kargs.largs, NULL);
 		}
 		break;
 	case I386_GET_IOPERM:
 		error = i386_get_ioperm(td, &kargs.iargs);
 		if (error == 0)
 			error = copyout(&kargs.iargs, uap->parms,
 			    sizeof(struct i386_ioperm_args));
 		break;
 	case I386_SET_IOPERM:
 		error = i386_set_ioperm(td, &kargs.iargs);
 		break;
 	case I386_VM86:
 		error = vm86_sysarch(td, uap->parms);
 		break;
 	case I386_GET_FSBASE:
 		/* Reassemble the 32-bit base from the PCB's descriptor. */
 		sdp = &td->td_pcb->pcb_fsd;
 		base = sdp->sd_hibase << 24 | sdp->sd_lobase;
 		error = copyout(&base, uap->parms, sizeof(base));
 		break;
 	case I386_SET_FSBASE:
 		error = copyin(uap->parms, &base, sizeof(base));
 		if (error == 0) {
 			/*
 			 * Construct the special descriptor for fsbase
 			 * and arrange for doreti to load its selector
 			 * soon enough.
 			 */
 			set_fsbase(td, base);
 			td->td_frame->tf_fs = GSEL(GUFS_SEL, SEL_UPL);
 		}
 		break;
 	case I386_GET_GSBASE:
 		/* Reassemble the 32-bit base from the PCB's descriptor. */
 		sdp = &td->td_pcb->pcb_gsd;
 		base = sdp->sd_hibase << 24 | sdp->sd_lobase;
 		error = copyout(&base, uap->parms, sizeof(base));
 		break;
 	case I386_SET_GSBASE:
 		error = copyin(uap->parms, &base, sizeof(base));
 		if (error == 0) {
 			/*
 			 * Construct the special descriptor for gsbase.
 			 * The selector is loaded immediately, since we
 			 * normally only reload %gs on context switches.
 			 */
 			set_gsbase(td, base);
 			load_gs(GSEL(GUGS_SEL, SEL_UPL));
 		}
 		break;
 	case I386_GET_XFPUSTATE:
 		/* The request may not exceed the state beyond union savefpu. */
 		if (kargs.xfpu.len > cpu_max_ext_state_size -
 		    sizeof(union savefpu))
 			return (EINVAL);
 		npxgetregs(td);
 		error = copyout((char *)(get_pcb_user_save_td(td) + 1),
 		    kargs.xfpu.addr, kargs.xfpu.len);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Allocate a PCB extension for td, holding a private TSS, an I/O
  * permission map and a vm86 interrupt map, and switch the current CPU
  * to the new TSS.
  *
  * td must be curthread and must not already have an extension; both
  * are asserted.  Always returns 0.
  */
 int
 i386_extend_pcb(struct thread *td)
 {
 	int i, offset;
 	u_long *addr;
 	struct pcb_ext *ext;
 	struct soft_segment_descriptor ssd = {
 		0,			/* segment base address (overwritten) */
 		ctob(IOPAGES + 1) - 1,	/* length */
 		SDT_SYS386TSS,		/* segment type */
 		0,			/* priority level */
 		1,			/* descriptor present */
 		0, 0,
 		0,			/* default 32 size */
 		0			/* granularity */
 	};
 
 	ext = pmap_trm_alloc(ctob(IOPAGES + 1), M_WAITOK | M_ZERO);
 	/* -16 is so we can convert a trapframe into vm86trapframe inplace */
 	ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
 	/*
 	 * The last byte of the i/o map must be followed by an 0xff byte.
 	 * We arbitrarily allocate 16 bytes here, to keep the starting
 	 * address on a doubleword boundary.
 	 */
 	offset = PAGE_SIZE - 16;
 	/* The I/O map offset is stored relative to the TSS, in the high word. */
 	ext->ext_tss.tss_ioopt = 
 	    (offset - ((unsigned)&ext->ext_tss - (unsigned)ext)) << 16;
 	ext->ext_iomap = (caddr_t)ext + offset;
 	ext->ext_vm86.vm86_intmap = (caddr_t)ext + offset - 32;
 
 	/* Set every bit from the interrupt map through the end of the area. */
 	addr = (u_long *)ext->ext_vm86.vm86_intmap;
 	for (i = 0; i < (ctob(IOPAGES) + 32 + 16) / sizeof(u_long); i++)
 		*addr++ = ~0;
 
 	/* Build the TSS descriptor; the limit shrinks by the TSS's offset. */
 	ssd.ssd_base = (unsigned)&ext->ext_tss;
 	ssd.ssd_limit -= ((unsigned)&ext->ext_tss - (unsigned)ext);
 	ssdtosd(&ssd, &ext->ext_tssd);
 
 	KASSERT(td == curthread, ("giving TSS to !curthread"));
 	KASSERT(td->td_pcb->pcb_ext == 0, ("already have a TSS!"));
 
 	/* Switch to the new TSS. */
 	critical_enter();
 	ext->ext_tss.tss_esp0 = PCPU_GET(trampstk);
 	td->td_pcb->pcb_ext = ext;
 	PCPU_SET(private_tss, 1);
 	*PCPU_GET(tss_gdt) = ext->ext_tssd;
 	ltr(GSEL(GPROC0_SEL, SEL_KPL));
 	critical_exit();
 
 	return 0;
 }
 
 /*
  * Enable or disable access to the I/O ports in the range
  * [uap->start, uap->start + uap->length) for the thread, according to
  * uap->enable.  A cleared bit in the I/O map grants access.
  *
  * The PCB extension holding the I/O map is allocated on first use.
  *
  * Returns 0 on success, the error from priv_check(), securelevel_gt()
  * or i386_extend_pcb(), or EINVAL when the range wraps around or
  * extends past the end of the I/O map.
  */
 int
 i386_set_ioperm(struct thread *td, struct i386_ioperm_args *uap)
 {
 	char *iomap;
 	u_int i;
 	int error;
 
 	if ((error = priv_check(td, PRIV_IO)) != 0)
 		return (error);
 	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
 		return (error);
 	/*
 	 * XXX 
 	 * While this is restricted to root, we should probably figure out
 	 * whether any other driver is using this i/o address, as so not to
 	 * cause confusion.  This probably requires a global 'usage registry'.
 	 */
 
 	if (td->td_pcb->pcb_ext == 0)
 		if ((error = i386_extend_pcb(td)) != 0)
 			return (error);
 	iomap = (char *)td->td_pcb->pcb_ext->ext_iomap;
 
 	/* Reject ranges that wrap or run past the end of the map. */
 	if (uap->start > uap->start + uap->length ||
 	    uap->start + uap->length > IOPAGES * PAGE_SIZE * NBBY)
 		return (EINVAL);
 
 	for (i = uap->start; i < uap->start + uap->length; i++) {
 		if (uap->enable)
 			iomap[i >> 3] &= ~(1 << (i & 7));
 		else
 			iomap[i >> 3] |= (1 << (i & 7));
 	}
 	return (0);
 }
 
 /*
  * Report the I/O permission state starting at port uap->start.
  *
  * On return, uap->enable tells whether access to that port is allowed
  * and uap->length is the number of consecutive ports, starting at
  * uap->start, that share the same state.  If the thread has no PCB
  * extension, uap->length is set to 0 and uap->enable is left untouched.
  *
  * Returns EINVAL if uap->start lies beyond the I/O map, else 0.
  */
 int
 i386_get_ioperm(struct thread *td, struct i386_ioperm_args *uap)
 {
 	int i, state;
 	char *iomap;
 
 	if (uap->start >= IOPAGES * PAGE_SIZE * NBBY)
 		return (EINVAL);
 
 	if (td->td_pcb->pcb_ext == 0) {
 		uap->length = 0;
 		return (0);
 	}
 
 	iomap = (char *)td->td_pcb->pcb_ext->ext_iomap;
 
 	/* A set bit in the map means access is denied. */
 	i = uap->start;
 	state = (iomap[i >> 3] >> (i & 7)) & 1;
 	uap->enable = !state;
 	uap->length = 1;
 
 	/* Extend the run while the following ports share that state. */
 	for (i = uap->start + 1; i < IOPAGES * PAGE_SIZE * NBBY; i++) {
 		if (state != ((iomap[i >> 3] >> (i & 7)) & 1))
 			break;
 		uap->length++;
 	}
 
 	return (0);
 }
 
 /*
  * Copy the LDT descriptor of the given process into the GUSERLDT_SEL
  * entry of the GDT used by the current CPU, load it with lldt and record
  * the selector in the per-CPU currentldt field.  The caller must hold
  * dt_lock.
  */
 static void
 set_user_ldt_locked(struct mdproc *mdp)
 {
 	int slot;
 
 	mtx_assert(&dt_lock, MA_OWNED);
 
 	/* The cpuid * NGDT offset is always 0 on UP. */
 	slot = GUSERLDT_SEL + PCPU_GET(cpuid) * NGDT;
 	gdt[slot].sd = mdp->md_ldt->ldt_sd;
 	lldt(GSEL(GUSERLDT_SEL, SEL_KPL));
 	PCPU_SET(currentldt, GSEL(GUSERLDT_SEL, SEL_KPL));
 }
 
 /*
  * Wrapper around set_user_ldt_locked() for callers that do not manage
  * dt_lock themselves: takes the spin lock, installs the LDT of the given
  * process on the current CPU, and releases the lock again.
  */
 void
 set_user_ldt(struct mdproc *mdp)
 {
 
 	mtx_lock_spin(&dt_lock);
 	set_user_ldt_locked(mdp);
 	mtx_unlock_spin(&dt_lock);
 }
 
 #ifdef SMP
 /*
  * Rendezvous action: reload the user LDT on this CPU, but only when the
  * process currently running here uses the vmspace passed as the argument.
  */
 static void
 set_user_ldt_rv(void *arg)
 {
 	struct proc *self = curproc;
 
 	if (self->p_vmspace != arg)
 		return;
 	set_user_ldt(&self->p_md);
 }
 #endif
 
 /*
  * dt_lock must be held. Returns with dt_lock held.
  */
 struct proc_ldt *
 user_ldt_alloc(struct mdproc *mdp, int len)
 {
 	struct proc_ldt *pldt, *new_ldt;
 
 	mtx_assert(&dt_lock, MA_OWNED);
 	mtx_unlock_spin(&dt_lock);
 	new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK);
 
 	new_ldt->ldt_len = len = NEW_MAX_LD(len);
 	new_ldt->ldt_base = pmap_trm_alloc(len * sizeof(union descriptor),
 	    M_WAITOK | M_ZERO);
 	new_ldt->ldt_refcnt = 1;
 	new_ldt->ldt_active = 0;
 
 	mtx_lock_spin(&dt_lock);
 	gdt_segs[GUSERLDT_SEL].ssd_base = (unsigned)new_ldt->ldt_base;
 	gdt_segs[GUSERLDT_SEL].ssd_limit = len * sizeof(union descriptor) - 1;
 	ssdtosd(&gdt_segs[GUSERLDT_SEL], &new_ldt->ldt_sd);
 
 	if ((pldt = mdp->md_ldt) != NULL) {
 		if (len > pldt->ldt_len)
 			len = pldt->ldt_len;
 		bcopy(pldt->ldt_base, new_ldt->ldt_base,
 		    len * sizeof(union descriptor));
 	} else
 		bcopy(ldt, new_ldt->ldt_base, sizeof(union descriptor) * NLDT);
 
 	return (new_ldt);
 }
 
 /*
  * Detach the LDT from the process of td and drop the process's reference
  * to it.  If td is the current thread, the CPU is switched back to
  * _default_ldt first.
  *
  * Must be called with dt_lock held.  Returns with dt_lock released.
  */
 void
 user_ldt_free(struct thread *td)
 {
 	struct mdproc *mdp;
 	struct proc_ldt *victim;
 
 	mtx_assert(&dt_lock, MA_OWNED);
 
 	mdp = &td->td_proc->p_md;
 	victim = mdp->md_ldt;
 	if (victim == NULL) {
 		/* Nothing to free; still honour the unlock contract. */
 		mtx_unlock_spin(&dt_lock);
 		return;
 	}
 
 	if (td == curthread) {
 		lldt(_default_ldt);
 		PCPU_SET(currentldt, _default_ldt);
 	}
 
 	mdp->md_ldt = NULL;
 	/* user_ldt_deref() releases dt_lock on every path. */
 	user_ldt_deref(victim);
 }
 
 /*
  * Drop one reference to pldt.  Called with dt_lock held; the lock is
  * always released before returning.  When the last reference goes away,
  * the descriptor table and the proc_ldt itself are freed after the lock
  * has been dropped.
  */
 void
 user_ldt_deref(struct proc_ldt *pldt)
 {
 	int last;
 
 	mtx_assert(&dt_lock, MA_OWNED);
 	last = (--pldt->ldt_refcnt == 0);
 	mtx_unlock_spin(&dt_lock);
 	if (!last)
 		return;
 
 	pmap_trm_free(pldt->ldt_base, pldt->ldt_len *
 	    sizeof(union descriptor));
 	free(pldt, M_SUBPROC);
 }
 
 /*
  * Note for the authors of compat layers (linux, etc): copyout() in
  * the function below is not a problem since it presents data in
  * arch-specific format (i.e. i386-specific in this case), not in
  * the OS-specific one.
  */
 int
 i386_get_ldt(struct thread *td, struct i386_ldt_args *uap)
 {
 	struct proc_ldt *pldt;
 	char *data;
 	u_int nldt, num;
 	int error;
 
 #ifdef DEBUG
 	printf("i386_get_ldt: start=%u num=%u descs=%p\n",
 	    uap->start, uap->num, (void *)uap->descs);
 #endif
 
 	num = min(uap->num, MAX_LD);
 	data = malloc(num * sizeof(union descriptor), M_TEMP, M_WAITOK);
 	mtx_lock_spin(&dt_lock);
 	pldt = td->td_proc->p_md.md_ldt;
 	nldt = pldt != NULL ? pldt->ldt_len : NLDT;
 	if (uap->start >= nldt) {
 		num = 0;
 	} else {
 		num = min(num, nldt - uap->start);
 		bcopy(pldt != NULL ?
 		    &((union descriptor *)(pldt->ldt_base))[uap->start] :
 		    &ldt[uap->start], data, num * sizeof(union descriptor));
 	}
 	mtx_unlock_spin(&dt_lock);
 	error = copyout(data, uap->descs, num * sizeof(union descriptor));
 	if (error == 0)
 		td->td_retval[0] = num;
 	free(data, M_TEMP);
 	return (error);
 }
 
 /*
  * Free, install or auto-allocate LDT descriptors for the process of td.
  *
  * descs == NULL:  zero the entries [uap->start, uap->start + uap->num) of
  *	the process's LDT, clamped to its length.  start == 0 && num == 0 is
  *	shorthand for "everything from NLDT upwards".  Always returns 0.
  * descs != NULL:  every descriptor is checked first.  System and gate types
  *	are rejected with EACCES, conforming code segments must be present,
  *	and any present descriptor must have DPL SEL_UPL.  Then either
  *	- start == LDT_AUTO_ALLOC && num == 1: the first SDT_SYSNULL slot at
  *	  or above NLDT is used, growing the LDT when none is free, or
  *	- the descriptors are stored at [start, start + num), which must lie
  *	  within MAX_LD (EINVAL otherwise), growing the LDT as needed.
  *
  * On success td_retval[0] is set to the first index written.
  */
 int
 i386_set_ldt(struct thread *td, struct i386_ldt_args *uap,
     union descriptor *descs)
 {
 	struct mdproc *mdp;
 	struct proc_ldt *pldt;
 	union descriptor *dp;
 	u_int largest_ld, i;
 	int error;
 
 #ifdef DEBUG
 	printf("i386_set_ldt: start=%u num=%u descs=%p\n",
 	    uap->start, uap->num, (void *)uap->descs);
 #endif
 	error = 0;
 	mdp = &td->td_proc->p_md;
 
 	if (descs == NULL) {
 		/* Free descriptors */
 		if (uap->start == 0 && uap->num == 0) {
 			/*
 			 * Treat this as a special case, so userland needn't
 			 * know magic number NLDT.
 			 */
 			uap->start = NLDT;
 			uap->num = MAX_LD - NLDT;
 		}
 		mtx_lock_spin(&dt_lock);
 		if ((pldt = mdp->md_ldt) == NULL ||
 		    uap->start >= pldt->ldt_len) {
 			mtx_unlock_spin(&dt_lock);
 			return (0);
 		}
 		/* Clamp the end of the range to the current table length. */
 		largest_ld = uap->start + uap->num;
 		if (largest_ld > pldt->ldt_len)
 			largest_ld = pldt->ldt_len;
 		for (i = uap->start; i < largest_ld; i++)
 			atomic_store_rel_64(&((uint64_t *)(pldt->ldt_base))[i],
 			    0);
 		mtx_unlock_spin(&dt_lock);
 		return (0);
 	}
 
 	if (uap->start != LDT_AUTO_ALLOC || uap->num != 1) {
 		/* verify range of descriptors to modify */
 		largest_ld = uap->start + uap->num;
 		if (uap->start >= MAX_LD || largest_ld > MAX_LD)
 			return (EINVAL);
 	}
 
 	/* Check descriptors for access violations */
 	for (i = 0; i < uap->num; i++) {
 		dp = &descs[i];
 
 		switch (dp->sd.sd_type) {
 		case SDT_SYSNULL:	/* system null */
 			dp->sd.sd_p = 0;
 			break;
 		case SDT_SYS286TSS: /* system 286 TSS available */
 		case SDT_SYSLDT:    /* system local descriptor table */
 		case SDT_SYS286BSY: /* system 286 TSS busy */
 		case SDT_SYSTASKGT: /* system task gate */
 		case SDT_SYS286IGT: /* system 286 interrupt gate */
 		case SDT_SYS286TGT: /* system 286 trap gate */
 		case SDT_SYSNULL2:  /* undefined by Intel */
 		case SDT_SYS386TSS: /* system 386 TSS available */
 		case SDT_SYSNULL3:  /* undefined by Intel */
 		case SDT_SYS386BSY: /* system 386 TSS busy */
 		case SDT_SYSNULL4:  /* undefined by Intel */
 		case SDT_SYS386IGT: /* system 386 interrupt gate */
 		case SDT_SYS386TGT: /* system 386 trap gate */
 		case SDT_SYS286CGT: /* system 286 call gate */
 		case SDT_SYS386CGT: /* system 386 call gate */
 			return (EACCES);
 
 		/* memory segment types */
 		case SDT_MEMEC:   /* memory execute only conforming */
 		case SDT_MEMEAC:  /* memory execute only accessed conforming */
 		case SDT_MEMERC:  /* memory execute read conforming */
 		case SDT_MEMERAC: /* memory execute read accessed conforming */
 			 /* Must be "present" if executable and conforming. */
 			if (dp->sd.sd_p == 0)
 				return (EACCES);
 			break;
 		case SDT_MEMRO:   /* memory read only */
 		case SDT_MEMROA:  /* memory read only accessed */
 		case SDT_MEMRW:   /* memory read write */
 		case SDT_MEMRWA:  /* memory read write accessed */
 		case SDT_MEMROD:  /* memory read only expand dwn limit */
 		case SDT_MEMRODA: /* memory read only expand dwn lim accessed */
 		case SDT_MEMRWD:  /* memory read write expand dwn limit */
 		case SDT_MEMRWDA: /* memory read write expand dwn lim accessed */
 		case SDT_MEME:    /* memory execute only */
 		case SDT_MEMEA:   /* memory execute only accessed */
 		case SDT_MEMER:   /* memory execute read */
 		case SDT_MEMERA:  /* memory execute read accessed */
 			break;
 		default:
 			return (EINVAL);
 		}
 
 		/* Only user (ring-3) descriptors may be present. */
 		if (dp->sd.sd_p != 0 && dp->sd.sd_dpl != SEL_UPL)
 			return (EACCES);
 	}
 
 	if (uap->start == LDT_AUTO_ALLOC && uap->num == 1) {
 		/* Allocate a free slot */
 		mtx_lock_spin(&dt_lock);
 		if ((pldt = mdp->md_ldt) == NULL) {
 			if ((error = i386_ldt_grow(td, NLDT + 1))) {
 				mtx_unlock_spin(&dt_lock);
 				return (error);
 			}
 			pldt = mdp->md_ldt;
 		}
 again:
 		/*
 		 * Start scanning a bit up to leave room for NVidia and
 		 * Wine, which still use the "Blat" method of allocation.
 		 */
 		dp = &((union descriptor *)(pldt->ldt_base))[NLDT];
 		for (i = NLDT; i < pldt->ldt_len; ++i) {
 			if (dp->sd.sd_type == SDT_SYSNULL)
 				break;
 			dp++;
 		}
 		if (i >= pldt->ldt_len) {
 			/* No free slot: grow by one entry and rescan. */
 			if ((error = i386_ldt_grow(td, pldt->ldt_len+1))) {
 				mtx_unlock_spin(&dt_lock);
 				return (error);
 			}
 			goto again;
 		}
 		uap->start = i;
 		error = i386_set_ldt_data(td, i, 1, descs);
 		mtx_unlock_spin(&dt_lock);
 	} else {
 		largest_ld = uap->start + uap->num;
 		mtx_lock_spin(&dt_lock);
 		if (!(error = i386_ldt_grow(td, largest_ld))) {
 			error = i386_set_ldt_data(td, uap->start, uap->num,
 			    descs);
 		}
 		mtx_unlock_spin(&dt_lock);
 	}
 	if (error == 0)
 		td->td_retval[0] = uap->start;
 	return (error);
 }
 
 /*
  * Store num descriptors from descs into the process LDT of td, beginning
  * at index start.  The caller must hold dt_lock.  Always returns 0.
  */
 static int
 i386_set_ldt_data(struct thread *td, int start, int num,
     union descriptor *descs)
 {
 	uint64_t *from, *to;
 	int n;
 
 	mtx_assert(&dt_lock, MA_OWNED);
 
 	from = (uint64_t *)descs;
 	to = (uint64_t *)(td->td_proc->p_md.md_ldt->ldt_base) + start;
 
 	/*
 	 * Atomic(9) is used only to get 64bit atomic store with
 	 * cmpxchg8b when available.  There is no op without release
 	 * semantic.
 	 */
 	for (n = 0; n < num; n++)
 		atomic_store_rel_64(&to[n], from[n]);
 	return (0);
 }
 
 /*
  * Make sure the process of td has a private LDT with at least len entries
  * (a request below NLDT + 1 is raised to NLDT + 1).  Returns ENOMEM when
  * len exceeds MAX_LD, or if user_ldt_alloc() returns NULL, and 0 otherwise.
  *
  * dt_lock must be held on entry and is held on return, but it is dropped
  * while the new table is allocated, while other CPUs are told to reload
  * their LDT, and while the old table is freed.
  */
 static int
 i386_ldt_grow(struct thread *td, int len) 
 {
 	struct mdproc *mdp;
 	struct proc_ldt *new_ldt, *pldt;
 	caddr_t old_ldt_base;
 	int old_ldt_len;
 
 	mtx_assert(&dt_lock, MA_OWNED);
 
 	if (len > MAX_LD)
 		return (ENOMEM);
 	if (len < NLDT + 1)
 		len = NLDT + 1;
 
 	mdp = &td->td_proc->p_md;
 	old_ldt_base = NULL_LDT_BASE;
 	old_ldt_len = 0;
 
 	/* Allocate a user ldt. */
 	if ((pldt = mdp->md_ldt) == NULL || len > pldt->ldt_len) {
 		new_ldt = user_ldt_alloc(mdp, len);
 		if (new_ldt == NULL)
 			return (ENOMEM);
 		/* Re-read: user_ldt_alloc() dropped dt_lock while allocating. */
 		pldt = mdp->md_ldt;
 
 		if (pldt != NULL) {
 			if (new_ldt->ldt_len <= pldt->ldt_len) {
 				/*
 				 * We just lost the race for allocation, so
 				 * free the new object and return.
 				 */
 				mtx_unlock_spin(&dt_lock);
 				pmap_trm_free(new_ldt->ldt_base,
 				   new_ldt->ldt_len * sizeof(union descriptor));
 				free(new_ldt, M_SUBPROC);
 				mtx_lock_spin(&dt_lock);
 				return (0);
 			}
 
 			/*
 			 * We have to substitute the current LDT entry for
 			 * curproc with the new one since its size grew.
 			 */
 			old_ldt_base = pldt->ldt_base;
 			old_ldt_len = pldt->ldt_len;
 			pldt->ldt_sd = new_ldt->ldt_sd;
 			pldt->ldt_base = new_ldt->ldt_base;
 			pldt->ldt_len = new_ldt->ldt_len;
 		} else
 			mdp->md_ldt = pldt = new_ldt;
 #ifdef SMP
 		/*
 		 * Signal other cpus to reload ldt.  We need to unlock dt_lock
 		 * here because other CPU will contest on it since their
 		 * curthreads won't hold the lock and will block when trying
 		 * to acquire it.
 		 */
 		mtx_unlock_spin(&dt_lock);
 		smp_rendezvous(NULL, set_user_ldt_rv, NULL,
 		    td->td_proc->p_vmspace);
 #else
 		set_user_ldt_locked(&td->td_proc->p_md);
 		mtx_unlock_spin(&dt_lock);
 #endif
 		/*
 		 * Only when an existing proc_ldt was updated in place: its old
 		 * table is released, and so is the new_ldt shell, whose
 		 * contents were moved into pldt above.
 		 */
 		if (old_ldt_base != NULL_LDT_BASE) {
 			pmap_trm_free(old_ldt_base, old_ldt_len *
 			    sizeof(union descriptor));
 			free(new_ldt, M_SUBPROC);
 		}
 		mtx_lock_spin(&dt_lock);
 	}
 	return (0);
 }
diff --git a/sys/i386/i386/vm_machdep.c b/sys/i386/i386/vm_machdep.c
index 502de6e7f38f..471128e1713d 100644
--- a/sys/i386/i386/vm_machdep.c
+++ b/sys/i386/i386/vm_machdep.c
@@ -1,688 +1,713 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1982, 1986 The Regents of the University of California.
  * Copyright (c) 1989, 1990 William Jolitz
  * Copyright (c) 1994 John Dyson
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
  *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_isa.h"
 #include "opt_npx.h"
 #include "opt_reset.h"
 #include "opt_cpu.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sysent.h>
 #include <sys/sf_buf.h>
 #include <sys/smp.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/pcb_ext.h>
 #include <machine/smp.h>
 #include <machine/vm86.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_param.h>
 
 _Static_assert(__OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
     "__OFFSETOF_MONITORBUF does not correspond with offset of pc_monitorbuf.");
 
 /*
  * Return the address of the user FPU save area of td.  It sits at the top
  * of the thread's kernel stack and spans cpu_max_ext_state_size bytes
  * rounded up to XSAVE_AREA_ALIGN.
  */
 union savefpu *
 get_pcb_user_save_td(struct thread *td)
 {
 	vm_offset_t save, stack_top;
 
 	stack_top = td->td_kstack + td->td_kstack_pages * PAGE_SIZE;
 	save = stack_top - roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN);
 	KASSERT((save % XSAVE_AREA_ALIGN) == 0,
 	    ("Unaligned pcb_user_save area"));
 	return ((union savefpu *)save);
 }
 
 /*
  * Return the address of the user FPU save area that belongs to pcb: the
  * first byte after the pcb structure itself.
  */
 union savefpu *
 get_pcb_user_save_pcb(struct pcb *pcb)
 {
 	vm_offset_t after_pcb = (vm_offset_t)(pcb + 1);
 
 	return ((union savefpu *)after_pcb);
 }
 
 /*
  * Return the address of the pcb of td.  It is placed on the kernel stack
  * directly below the aligned FPU save area at the top of the stack.
  */
 struct pcb *
 get_pcb_td(struct thread *td)
 {
 	vm_offset_t addr;
 
 	addr = td->td_kstack + td->td_kstack_pages * PAGE_SIZE;
 	addr -= roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN);
 	addr -= sizeof(struct pcb);
 	return ((struct pcb *)addr);
 }
 
 void *
 alloc_fpusave(int flags)
 {
 	void *res;
 	struct savefpu_ymm *sf;
 
 	res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags);
 	if (use_xsave) {
 		sf = (struct savefpu_ymm *)res;
 		bzero(&sf->sv_xstate.sx_hd, sizeof(sf->sv_xstate.sx_hd));
 		sf->sv_xstate.sx_hd.xstate_bv = xsave_mask;
 	}
 	return (res);
 }
 /*
  * Finish a fork operation, with process p2 nearly set up.
  * Copy and update the pcb, set up the stack so that the child is
  * ready to run and return to user mode.
  *
  * td1 is the forking thread, p2/td2 the new process and its first thread,
  * and flags carries the RF* fork flags (RFPROC and RFMEM are examined).
  */
 void
 cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags)
 {
 	struct proc *p1;
 	struct pcb *pcb2;
 	struct mdproc *mdp2;
 
 	p1 = td1->td_proc;
 	/*
 	 * Without RFPROC no new process is set up here: at most give the
 	 * calling process a private copy of a shared LDT, then return.
 	 */
 	if ((flags & RFPROC) == 0) {
 		if ((flags & RFMEM) == 0) {
 			/* unshare user LDT */
 			struct mdproc *mdp1 = &p1->p_md;
 			struct proc_ldt *pldt, *pldt1;
 
 			mtx_lock_spin(&dt_lock);
 			if ((pldt1 = mdp1->md_ldt) != NULL &&
 			    pldt1->ldt_refcnt > 1) {
 				pldt = user_ldt_alloc(mdp1, pldt1->ldt_len);
 				if (pldt == NULL)
 					panic("could not copy LDT");
 				mdp1->md_ldt = pldt;
 				set_user_ldt(mdp1);
 				user_ldt_deref(pldt1);
 			} else
 				mtx_unlock_spin(&dt_lock);
 		}
 		return;
 	}
 
-	/* Ensure that td1's pcb is up to date. */
-	if (td1 == curthread)
+	/* Ensure that td1's pcb is up to date for user processes. */
+	if ((td2->td_pflags & TDP_KTHREAD) == 0) {
+		MPASS(td1 == curthread);
 		td1->td_pcb->pcb_gs = rgs();
-	critical_enter();
-	if (PCPU_GET(fpcurthread) == td1)
-		npxsave(td1->td_pcb->pcb_save);
-	critical_exit();
+		critical_enter();
+		if (PCPU_GET(fpcurthread) == td1)
+			npxsave(td1->td_pcb->pcb_save);
+		critical_exit();
+	}
 
 	/* Point the pcb to the top of the stack */
 	pcb2 = get_pcb_td(td2);
 	td2->td_pcb = pcb2;
 
 	/* Copy td1's pcb */
 	bcopy(td1->td_pcb, pcb2, sizeof(*pcb2));
 
 	/* Properly initialize pcb_save */
 	pcb2->pcb_save = get_pcb_user_save_pcb(pcb2);
-	bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2),
-	    cpu_max_ext_state_size);
+
+	/* Kernel processes start with clean NPX and segment bases. */
+	if ((td2->td_pflags & TDP_KTHREAD) != 0) {
+		pcb2->pcb_gs = _udatasel;
+		set_fsbase(td2, 0);
+		set_gsbase(td2, 0);
+		pcb2->pcb_flags &= ~(PCB_NPXINITDONE | PCB_NPXUSERINITDONE |
+		    PCB_KERNNPX | PCB_KERNNPX_THR);
+	} else {
+		MPASS((pcb2->pcb_flags & (PCB_KERNNPX | PCB_KERNNPX_THR)) == 0);
+		bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2),
+		    cpu_max_ext_state_size);
+	}
 
 	/* Point mdproc and then copy over td1's contents */
 	mdp2 = &p2->p_md;
 	bcopy(&p1->p_md, mdp2, sizeof(*mdp2));
 
 	/*
 	 * Create a new fresh stack for the new process.
 	 * Copy the trap frame for the return to user mode as if from a
 	 * syscall.  This copies most of the user mode register values.
 	 * The -VM86_STACK_SPACE (-16) is so we can expand the trapframe
 	 * if we go to vm86.
 	 */
 	td2->td_frame = (struct trapframe *)((caddr_t)td2->td_pcb -
 	    VM86_STACK_SPACE) - 1;
 	bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe));
 
 	td2->td_frame->tf_eax = 0;		/* Child returns zero */
 	td2->td_frame->tf_eflags &= ~PSL_C;	/* success */
 	td2->td_frame->tf_edx = 1;
 
 	/*
 	 * If the parent process has the trap bit set (i.e. a debugger
 	 * had single stepped the process to the system call), we need
 	 * to clear the trap flag from the new frame.
 	 */
 	td2->td_frame->tf_eflags &= ~PSL_T;
 
 	/*
 	 * Set registers for trampoline to user mode.  Leave space for the
 	 * return address on stack.  These are the kernel mode register values.
 	 */
 	pcb2->pcb_cr3 = pmap_get_cr3(vmspace_pmap(p2->p_vmspace));
 	pcb2->pcb_edi = 0;
 	pcb2->pcb_esi = (int)fork_return;	/* fork_trampoline argument */
 	pcb2->pcb_ebp = 0;
 	pcb2->pcb_esp = (int)td2->td_frame - sizeof(void *);
 	pcb2->pcb_ebx = (int)td2;		/* fork_trampoline argument */
 	pcb2->pcb_eip = (int)fork_trampoline + setidt_disp;
 	/*-
 	 * pcb2->pcb_dr*:	cloned above.
 	 * pcb2->pcb_savefpu:	cloned above.
 	 * pcb2->pcb_flags:	cloned above.
 	 * pcb2->pcb_onfault:	cloned above (always NULL here?).
 	 * pcb2->pcb_gs:	cloned above.
 	 * pcb2->pcb_ext:	cleared below.
 	 */
 
 	/*
 	 * XXX don't copy the i/o pages.  this should probably be fixed.
 	 */
 	pcb2->pcb_ext = 0;
 
 	/*
 	 * Copy the LDT, if necessary.  With RFMEM the LDT inherited through
 	 * the mdproc copy above is shared and only gains a reference;
 	 * otherwise the child receives its own copy.
 	 */
 	mtx_lock_spin(&dt_lock);
 	if (mdp2->md_ldt != NULL) {
 		if (flags & RFMEM) {
 			mdp2->md_ldt->ldt_refcnt++;
 		} else {
 			mdp2->md_ldt = user_ldt_alloc(mdp2,
 			    mdp2->md_ldt->ldt_len);
 			if (mdp2->md_ldt == NULL)
 				panic("could not copy LDT");
 		}
 	}
 	mtx_unlock_spin(&dt_lock);
 
 	/* Setup to release spin count in fork_exit(). */
 	td2->td_md.md_spinlock_count = 1;
 	td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
 
 	/*
 	 * Now, cpu_switch() can schedule the new process.
 	 * pcb_esp is loaded pointing to the cpu_switch() stack frame
 	 * containing the return address when exiting cpu_switch.
 	 * This will normally be to fork_trampoline(), which will have
 	 * %ebx loaded with the new proc's pointer.  fork_trampoline()
 	 * will set up a stack to call fork_return(p, frame); to complete
 	 * the return to user-mode.
 	 */
 }
 
 /*
  * Intercept the return address from a freshly forked process that has NOT
  * been scheduled yet.
  *
  * This is needed to make kernel threads stay in kernel mode.
  */
 void
 cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg)
 {
 	/*
 	 * Note that the trap frame follows the args, so the function
 	 * is really called like this:  func(arg, frame);
 	 */
 	td->td_pcb->pcb_esi = (int) func;	/* function */
 	td->td_pcb->pcb_ebx = (int) arg;	/* first arg */
 }
 
 void
 cpu_exit(struct thread *td)
 {
 
 	/*
 	 * If this process has a custom LDT, release it.  Reset pc->pcb_gs
 	 * and %gs before we free it in case they refer to an LDT entry.
 	 */
 	mtx_lock_spin(&dt_lock);
 	if (td->td_proc->p_md.md_ldt) {
 		td->td_pcb->pcb_gs = _udatasel;
 		load_gs(_udatasel);
 		user_ldt_free(td);
 	} else
 		mtx_unlock_spin(&dt_lock);
 }
 
 void
 cpu_thread_exit(struct thread *td)
 {
 
 	critical_enter();
 	if (td == PCPU_GET(fpcurthread))
 		npxdrop();
 	critical_exit();
 
 	/* Disable any hardware breakpoints. */
 	if (td->td_pcb->pcb_flags & PCB_DBREGS) {
 		reset_dbregs();
 		td->td_pcb->pcb_flags &= ~PCB_DBREGS;
 	}
 }
 
 /*
  * Free the pcb extension (IOPAGES + 1 pages) of td, if one was allocated,
  * and clear the pointer to it.
  */
 void
 cpu_thread_clean(struct thread *td)
 {
 	struct pcb *pcb = td->td_pcb;
 
 	if (pcb->pcb_ext == NULL)
 		return;
 
 	/* if (pcb->pcb_ext->ext_refcount-- == 1) ?? */
 	/*
 	 * XXX do we need to move the TSS off the allocated pages
 	 * before freeing them?  (not done here)
 	 */
 	pmap_trm_free(pcb->pcb_ext, ctob(IOPAGES + 1));
 	pcb->pcb_ext = NULL;
 }
 
 /* Machine-dependent swap-in hook; intentionally empty here. */
 void
 cpu_thread_swapin(struct thread *td)
 {
 }
 
 /* Machine-dependent swap-out hook; intentionally empty here. */
 void
 cpu_thread_swapout(struct thread *td)
 {
 }
 
 /*
  * Lay out the machine-dependent parts of a new thread on its kernel
  * stack: the pcb, the trap frame below it (leaving VM86_STACK_SPACE in
  * between) and the FPU save area.  The thread starts without a pcb
  * extension.
  */
 void
 cpu_thread_alloc(struct thread *td)
 {
 	struct xstate_hdr *xhdr;
 	struct pcb *pcb;
 
 	pcb = get_pcb_td(td);
 	td->td_pcb = pcb;
 	td->td_frame = (struct trapframe *)((caddr_t)pcb -
 	    VM86_STACK_SPACE) - 1;
 	pcb->pcb_ext = NULL;
 	pcb->pcb_save = get_pcb_user_save_pcb(pcb);
 
 	if (!use_xsave)
 		return;
 
 	/* With XSAVE, reset the header that follows the first savefpu. */
 	xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1);
 	bzero(xhdr, sizeof(*xhdr));
 	xhdr->xstate_bv = xsave_mask;
 }
 
 /*
  * Release the machine-dependent resources of td.  All of the work is done
  * by cpu_thread_clean().
  */
 void
 cpu_thread_free(struct thread *td)
 {
 
 	cpu_thread_clean(td);
 }
 
 /*
  * Machine-dependent vmspace-reuse check.  Both arguments are unused
  * and the answer is always true.
  */
 bool
 cpu_exec_vmspace_reuse(struct proc *p __unused, vm_map_t map __unused)
 {
 
 	return (true);
 }
 
 /*
  * Machine-dependent procctl hook.  No commands are implemented here:
  * every argument is ignored and EINVAL is returned.
  */
 int
 cpu_procctl(struct thread *td __unused, int idtype __unused, id_t id __unused,
     int com __unused, void *data __unused)
 {
 
 	return (EINVAL);
 }
 
 void
 cpu_set_syscall_retval(struct thread *td, int error)
 {
 
 	switch (error) {
 	case 0:
 		td->td_frame->tf_eax = td->td_retval[0];
 		td->td_frame->tf_edx = td->td_retval[1];
 		td->td_frame->tf_eflags &= ~PSL_C;
 		break;
 
 	case ERESTART:
 		/*
 		 * Reconstruct pc, assuming lcall $X,y is 7 bytes, int
 		 * 0x80 is 2 bytes. We saved this in tf_err.
 		 */
 		td->td_frame->tf_eip -= td->td_frame->tf_err;
 		break;
 
 	case EJUSTRETURN:
 		break;
 
 	default:
 		td->td_frame->tf_eax = error;
 		td->td_frame->tf_eflags |= PSL_C;
 		break;
 	}
 }
 
 /*
  * Initialize machine state, mostly pcb and trap frame for a new
  * thread, about to return to userspace.  Put enough state in the new
  * thread's PCB to get it to go back to the fork_return(), which
  * finalizes the thread state and handles peculiarities of the first
  * return to userspace for the new thread.
  *
  * td is the new thread; td0 is the thread whose pcb and trap frame are
  * used as the template.
  */
 void
 cpu_copy_thread(struct thread *td, struct thread *td0)
 {
 	struct pcb *pcb2;
 
 	/* Point the pcb to the top of the stack. */
 	pcb2 = td->td_pcb;
 
-	/* Ensure that td0's pcb is up to date. */
-	if (td0 == curthread)
+	/* Ensure that td0's pcb is up to date for user threads. */
+	if ((td->td_pflags & TDP_KTHREAD) == 0) {
+		MPASS(td0 == curthread);
 		td0->td_pcb->pcb_gs = rgs();
-	critical_enter();
-	if (PCPU_GET(fpcurthread) == td0)
-		npxsave(td0->td_pcb->pcb_save);
-	critical_exit();
+		critical_enter();
+		if (PCPU_GET(fpcurthread) == td0)
+			npxsave(td0->td_pcb->pcb_save);
+		critical_exit();
+	}
 
 	/*
 	 * Copy the upcall pcb.  This loads kernel regs.
 	 * Those not loaded individually below get their default
 	 * values here.
 	 */
 	bcopy(td0->td_pcb, pcb2, sizeof(*pcb2));
-	pcb2->pcb_flags &= ~PCB_KERNNPX;
 	pcb2->pcb_save = get_pcb_user_save_pcb(pcb2);
-	bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save,
-	    cpu_max_ext_state_size);
+
+	/* Kernel threads start with clean NPX and segment bases. */
+	if ((td->td_pflags & TDP_KTHREAD) != 0) {
+		pcb2->pcb_gs = _udatasel;
+		set_fsbase(td, 0);
+		set_gsbase(td, 0);
+		pcb2->pcb_flags &= ~(PCB_NPXINITDONE | PCB_NPXUSERINITDONE |
+		    PCB_KERNNPX | PCB_KERNNPX_THR);
+	} else {
+		MPASS((pcb2->pcb_flags & (PCB_KERNNPX | PCB_KERNNPX_THR)) == 0);
+		bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save,
+		    cpu_max_ext_state_size);
+	}
 
 	/*
 	 * Create a new fresh stack for the new thread.
 	 */
 	bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe));
 
 	/* If the current thread has the trap bit set (i.e. a debugger had
 	 * single stepped the process to the system call), we need to clear
 	 * the trap flag from the new frame. Otherwise, the new thread will
 	 * receive a (likely unexpected) SIGTRAP when it executes the first
 	 * instruction after returning to userland.
 	 */
 	td->td_frame->tf_eflags &= ~PSL_T;
 
 	/*
 	 * Set registers for trampoline to user mode.  Leave space for the
 	 * return address on stack.  These are the kernel mode register values.
 	 */
 	pcb2->pcb_edi = 0;
 	pcb2->pcb_esi = (int)fork_return;		    /* trampoline arg */
 	pcb2->pcb_ebp = 0;
 	pcb2->pcb_esp = (int)td->td_frame - sizeof(void *); /* trampoline arg */
 	pcb2->pcb_ebx = (int)td;			    /* trampoline arg */
 	pcb2->pcb_eip = (int)fork_trampoline + setidt_disp;
 	/*
 	 * If we didn't copy the pcb, we'd need to do the following registers:
 	 * pcb2->pcb_cr3:	cloned above.
 	 * pcb2->pcb_dr*:	cloned above.
 	 * pcb2->pcb_savefpu:	cloned above.
 	 * pcb2->pcb_flags:	cloned above.
 	 * pcb2->pcb_onfault:	cloned above (always NULL here?).
 	 * pcb2->pcb_gs:	cloned above.
 	 * pcb2->pcb_ext:	cleared below.
 	 */
 	pcb2->pcb_ext = NULL;
 
 	/* Setup to release spin count in fork_exit(). */
 	td->td_md.md_spinlock_count = 1;
 	td->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
 }
 
 /*
  * Set up the trap frame of td so that, on return to user mode, it starts
  * executing entry(arg) on the given stack.  A zero return address and the
  * argument are written to the user stack with suword().
  */
 void
 cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg,
     stack_t *stack)
 {
 	struct trapframe *tf;
 
 	/*
 	 * Do any extra cleaning that needs to be done.
 	 * The thread may have optional components
 	 * that are not present in a fresh thread.
 	 * This may be a recycled thread so make it look
 	 * as though it's newly allocated.
 	 */
 	cpu_thread_clean(td);
 
 	/*
 	 * Set the trap frame to point at the beginning of the entry
 	 * function.
 	 */
 	tf = td->td_frame;
 	tf->tf_ebp = 0;
 	tf->tf_esp =
 	    (((int)stack->ss_sp + stack->ss_size - 4) & ~0x0f) - 4;
 	tf->tf_eip = (int)entry;
 
 	/* Return address sentinel value to stop stack unwinding. */
 	suword((void *)tf->tf_esp, 0);
 
 	/* Pass the argument to the entry point. */
 	suword((void *)(tf->tf_esp + sizeof(void *)),
 	    (int)arg);
 }
 
 /*
  * Point the %gs segment of td at tls_base.  A present, ring-3,
  * read/write data descriptor with that base is built and saved in
  * pcb_gsd.  If td is the current thread the descriptor is also written
  * to slot 1 of the per-CPU fsgs_gdt and %gs is reloaded.  Always
  * returns 0.
  */
 int
 cpu_set_user_tls(struct thread *td, void *tls_base)
 {
 	struct segment_descriptor sd;
 	uint32_t base = (uint32_t)tls_base;
 
 	/* Base address, split over the low 24 and high 8 bits. */
 	sd.sd_lobase = base & 0xffffff;
 	sd.sd_hibase = (base >> 24) & 0xff;
 	/* 4GB limit, wraps around */
 	sd.sd_lolimit = 0xffff;
 	sd.sd_hilimit = 0xf;
 	sd.sd_gran  = 1;
 	/* Access attributes. */
 	sd.sd_type  = SDT_MEMRWA;
 	sd.sd_dpl   = SEL_UPL;
 	sd.sd_p     = 1;
 	sd.sd_xx    = 0;
 	sd.sd_def32 = 1;
 
 	critical_enter();
 	/* set %gs */
 	td->td_pcb->pcb_gsd = sd;
 	if (td == curthread) {
 		PCPU_GET(fsgs_gdt)[1] = sd;
 		load_gs(GSEL(GUGS_SEL, SEL_UPL));
 	}
 	critical_exit();
 
 	return (0);
 }
 
 /*
  * Translate a kernel virtual address to its physical address using
  * pmap_kextract().  Panics if the result is 0.
  */
 vm_paddr_t
 kvtop(void *addr)
 {
 	vm_paddr_t phys = pmap_kextract((vm_offset_t)addr);
 
 	if (phys == 0)
 		panic("kvtop: zero page frame");
 	return (phys);
 }
 
 /*
  * Machine-dependent part of mapping an sf_buf: hand the buffer to
  * pmap_sf_buf_map() and, on SMP kernels, run sf_buf_shootdown() with the
  * caller's flags.
  */
 void
 sf_buf_map(struct sf_buf *sf, int flags)
 {
 
 	pmap_sf_buf_map(sf);
 #ifdef SMP
 	sf_buf_shootdown(sf, flags);
 #endif
 }
 
 #ifdef SMP
 /*
  * Callback handed to smp_masked_invlpg() by sf_buf_shootdown().  It is
  * deliberately empty and ignores all of its arguments.
  */
 static void
 sf_buf_shootdown_curcpu_cb(pmap_t pmap __unused,
     vm_offset_t addr1 __unused, vm_offset_t addr2 __unused)
 {
 }
 
 void
 sf_buf_shootdown(struct sf_buf *sf, int flags)
 {
 	cpuset_t other_cpus;
 	u_int cpuid;
 
 	sched_pin();
 	cpuid = PCPU_GET(cpuid);
 	if (!CPU_ISSET(cpuid, &sf->cpumask)) {
 		CPU_SET(cpuid, &sf->cpumask);
 		invlpg(sf->kva);
 	}
 	if ((flags & SFB_CPUPRIVATE) == 0) {
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
 		CPU_ANDNOT(&other_cpus, &sf->cpumask);
 		if (!CPU_EMPTY(&other_cpus)) {
 			CPU_OR(&sf->cpumask, &other_cpus);
 			smp_masked_invlpg(other_cpus, sf->kva, kernel_pmap,
 			    sf_buf_shootdown_curcpu_cb);
 		}
 	}
 	sched_unpin();
 }
 #endif
 
 /*
  * MD part of sf_buf_free().  Nothing is done here; the function always
  * returns 0.
  */
 int
 sf_buf_unmap(struct sf_buf *sf)
 {
 
 	return (0);
 }
 
 /*
  * Re-enter the mapping of an sf_buf and invalidate the cache lines of
  * the page it covers.
  */
 static void
 sf_buf_invalidate(struct sf_buf *sf)
 {
 	vm_page_t page;
 
 	page = sf->m;
 
 	/*
 	 * Use pmap_qenter to update the pte for
 	 * existing mapping, in particular, the PAT
 	 * settings are recalculated.
 	 */
 	pmap_qenter(sf->kva, &page, 1);
 	pmap_invalidate_cache_range(sf->kva, sf->kva + PAGE_SIZE);
 }
 
 /*
  * Invalidate the cache lines that may belong to the page, if
  * (possibly old) mapping of the page by sf buffer exists.  Returns
  * TRUE when mapping was found and cache invalidated.
  *
  * The work is delegated to sf_buf_process_page(), with
  * sf_buf_invalidate() as the callback; its result is returned unchanged.
  */
 boolean_t
 sf_buf_invalidate_cache(vm_page_t m)
 {
 
 	return (sf_buf_process_page(m, sf_buf_invalidate));
 }
 
 /*
  * Software interrupt handler for queued VM system processing.
  * Runs busdma_swi() when busdma_swi_pending is non-zero; the argument
  * is not used.
  */
 void
 swi_vm(void *dummy)
 {
 
 	if (busdma_swi_pending == 0)
 		return;
 	busdma_swi();
 }
 
 /*
  * Tell whether this address is in some physical memory region.
  * Currently used by the kernel coredump code in order to avoid
  * dumping the ``ISA memory hole'' which could cause indefinite hangs,
  * or other unpredictable behaviour.
  *
  * Returns 0 for addresses inside the ISA hole (only when DEV_ISA is
  * configured) and 1 for everything else.
  */
 
 int
 is_physical_memory(vm_paddr_t addr)
 {
 	int is_ram = 1;
 
 #ifdef DEV_ISA
 	/* The ISA ``memory hole''. */
 	if (addr >= 0xa0000 && addr < 0x100000)
 		is_ram = 0;
 #endif
 
 	/*
 	 * stuff other tests for known memory-mapped devices (PCI?)
 	 * here
 	 */
 
 	return (is_ram);
 }