Index: sys/amd64/amd64/cpu_switch.S
===================================================================
--- sys/amd64/amd64/cpu_switch.S
+++ sys/amd64/amd64/cpu_switch.S
@@ -74,7 +74,7 @@
  */
 ENTRY(cpu_switch)
 	/* Switch to new thread.  First, save context. */
-	movq	TD_PCB(%rdi),%r8
+	leaq	TD_MD_PCB(%rdi),%r8
 
 	movq	(%rsp),%rax			/* Hardware registers */
 	movq	%r15,PCB_R15(%r8)
@@ -140,7 +140,7 @@
 	callq	pmap_activate_sw
 	movq	%r15,TD_LOCK(%r13)		/* Release the old thread */
 sw1:
-	movq	TD_PCB(%r12),%r8
+	leaq	TD_MD_PCB(%r12),%r8
 #if defined(SCHED_ULE) && defined(SMP)
 	movq	$blocked_lock, %rdx
 	movq	TD_LOCK(%r12),%rcx
@@ -193,11 +193,12 @@
 	cmpq	%rax,%rdx
 	jne	do_tss
 done_tss:
-	movq	%r8,PCPU(RSP0)
+	movq	TD_MD_STACK_BASE(%r12),%r9
+	movq	%r9,PCPU(RSP0)
 	movq	%r8,PCPU(CURPCB)
 	movq	PCPU(PTI_RSP0),%rax
 	cmpq	$~0,PCPU(UCR3)
-	cmove	%r8,%rax
+	cmove	%r9,%rax
 	movq	%rax,TSS_RSP0(%rdx)
 	movq	%r12,PCPU(CURTHREAD)		/* into next thread */
 
Index: sys/amd64/amd64/genassym.c
===================================================================
--- sys/amd64/amd64/genassym.c
+++ sys/amd64/amd64/genassym.c
@@ -87,6 +87,8 @@
 ASSYM(TD_PROC, offsetof(struct thread, td_proc));
 ASSYM(TD_FRAME, offsetof(struct thread, td_frame));
 ASSYM(TD_MD, offsetof(struct thread, td_md));
+ASSYM(TD_MD_PCB, offsetof(struct thread, td_md.md_pcb));
+ASSYM(TD_MD_STACK_BASE, offsetof(struct thread, td_md.md_stack_base));
 
 ASSYM(TDF_ASTPENDING, TDF_ASTPENDING);
 ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED);
Index: sys/amd64/amd64/machdep.c
===================================================================
--- sys/amd64/amd64/machdep.c
+++ sys/amd64/amd64/machdep.c
@@ -1789,12 +1789,12 @@
 	amd64_conf_fast_syscall();
 
 	/*
-	 * Temporary forge some valid pointer to PCB, for exception
-	 * handlers.  It is reinitialized properly below after FPU is
-	 * set up.  Also set up td_critnest to short-cut the page
-	 * fault handler.
+	 * We initialize the PCB pointer early so that exception
+	 * handlers will work.  Also set up td_critnest to short-cut
+	 * the page fault handler.
 	 */
 	cpu_max_ext_state_size = sizeof(struct savefpu);
+	set_top_of_stack_td(&thread0);
 	thread0.td_pcb = get_pcb_td(&thread0);
 	thread0.td_critnest = 1;
 
@@ -1850,11 +1850,10 @@
 	fpuinit();
 
 	/*
-	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
+	 * Set up thread0 pcb save area after fpuinit calculated fpu save
 	 * area size.  Zero out the extended state header in fpu save
 	 * area.
 	 */
-	thread0.td_pcb = get_pcb_td(&thread0);
 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
 	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
 	if (use_xsave) {
@@ -1863,7 +1862,7 @@
 		xhdr->xstate_bv = xsave_mask;
 	}
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
-	rsp0 = (vm_offset_t)thread0.td_pcb;
+	rsp0 = thread0.td_md.md_stack_base;
 	/* Ensure the stack is aligned to 16 bytes */
 	rsp0 &= ~0xFul;
 	common_tss[0].tss_rsp0 = rsp0;
@@ -1899,7 +1898,7 @@
 	TSEXIT();
 
 	/* Location of kernel stack for locore */
-	return ((u_int64_t)thread0.td_pcb);
+	return (thread0.td_md.md_stack_base);
 }
 
 void
Index: sys/amd64/amd64/vm_machdep.c
===================================================================
--- sys/amd64/amd64/vm_machdep.c
+++ sys/amd64/amd64/vm_machdep.c
@@ -87,35 +87,39 @@
 _Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf),
     "OFFSETOF_MONITORBUF does not correspond with offset of pc_monitorbuf.");
 
-struct savefpu *
-get_pcb_user_save_td(struct thread *td)
+void
+set_top_of_stack_td(struct thread *td)
 {
-	vm_offset_t p;
-
-	p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
+	td->td_md.md_stack_base = td->td_kstack +
+	    td->td_kstack_pages * PAGE_SIZE -
 	    roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN);
-	KASSERT((p % XSAVE_AREA_ALIGN) == 0, ("Unaligned pcb_user_save area"));
-	return ((struct savefpu *)p);
 }
 
 struct savefpu *
-get_pcb_user_save_pcb(struct pcb *pcb)
+get_pcb_user_save_td(struct thread *td)
 {
 	vm_offset_t p;
 
-	p = (vm_offset_t)(pcb + 1);
+	p = td->td_md.md_stack_base;
+	KASSERT((p % XSAVE_AREA_ALIGN) == 0,
+	    ("Unaligned pcb_user_save area ptr %#lx td %p", p, td));
 	return ((struct savefpu *)p);
 }
 
 struct pcb *
 get_pcb_td(struct thread *td)
 {
-	vm_offset_t p;
 
-	p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
-	    roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN) -
-	    sizeof(struct pcb);
-	return ((struct pcb *)p);
+	return (&td->td_md.md_pcb);
+}
+
+struct savefpu *
+get_pcb_user_save_pcb(struct pcb *pcb)
+{
+	struct thread *td;
+
+	td = __containerof(pcb, struct thread, td_md.md_pcb);
+	return (get_pcb_user_save_td(td));
 }
 
 void *
@@ -165,9 +169,9 @@
 	fpuexit(td1);
 	update_pcb_bases(td1->td_pcb);
 
-	/* Point the pcb to the top of the stack */
-	pcb2 = get_pcb_td(td2);
-	td2->td_pcb = pcb2;
+	/* Point the stack and pcb to the actual location */
+	set_top_of_stack_td(td2);
+	td2->td_pcb = pcb2 = get_pcb_td(td2);
 
 	/* Copy td1's pcb */
 	bcopy(td1->td_pcb, pcb2, sizeof(*pcb2));
@@ -186,7 +190,7 @@
 	 * Copy the trap frame for the return to user mode as if from a
 	 * syscall.  This copies most of the user mode register values.
 	 */
-	td2->td_frame = (struct trapframe *)td2->td_pcb - 1;
+	td2->td_frame = (struct trapframe *)td2->td_md.md_stack_base - 1;
 	bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe));
 
 	td2->td_frame->tf_rax = 0;		/* Child returns zero */
@@ -351,8 +355,9 @@
 	struct pcb *pcb;
 	struct xstate_hdr *xhdr;
 
+	set_top_of_stack_td(td);
 	td->td_pcb = pcb = get_pcb_td(td);
-	td->td_frame = (struct trapframe *)pcb - 1;
+	td->td_frame = (struct trapframe *)td->td_md.md_stack_base - 1;
 	pcb->pcb_save = get_pcb_user_save_pcb(pcb);
 	if (use_xsave) {
 		xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1);
@@ -490,7 +495,6 @@
 {
 	struct pcb *pcb2;
 
-	/* Point the pcb to the top of the stack. */
 	pcb2 = td->td_pcb;
 
 	/*
Index: sys/amd64/include/md_var.h
===================================================================
--- sys/amd64/include/md_var.h
+++ sys/amd64/include/md_var.h
@@ -83,6 +83,7 @@
 void	pagezero(void *addr);
 void	setidt(int idx, alias_for_inthand_t *func, int typ, int dpl, int ist);
 void	sse2_pagezero(void *addr);
+void	set_top_of_stack_td(struct thread *td);
 struct savefpu *get_pcb_user_save_td(struct thread *td);
 struct savefpu *get_pcb_user_save_pcb(struct pcb *pcb);
 void	pci_early_quirks(void);
Index: sys/amd64/include/pcpu_aux.h
===================================================================
--- sys/amd64/include/pcpu_aux.h
+++ sys/amd64/include/pcpu_aux.h
@@ -57,16 +57,6 @@
 	return (td);
 }
 #define	curthread		(__curthread())
-
-static __inline __pure2 struct pcb *
-__curpcb(void)
-{
-	struct pcb *pcb;
-
-	__asm("movq %%gs:%P1,%0" : "=r" (pcb) : "n" (offsetof(struct pcpu,
-	    pc_curpcb)));
-	return (pcb);
-}
-#define	curpcb		(__curpcb())
+#define	curpcb			(&curthread->td_md.md_pcb)
 
 #endif	/* _MACHINE_PCPU_AUX_H_ */
Index: sys/amd64/include/proc.h
===================================================================
--- sys/amd64/include/proc.h
+++ sys/amd64/include/proc.h
@@ -36,6 +36,7 @@
 #define	_MACHINE_PROC_H_
 
 #include <sys/queue.h>
+#include <machine/pcb.h>
 #include <machine/segments.h>
 
 /*
@@ -72,6 +73,8 @@
 	struct pmap_invl_gen md_invl_gen;
 	register_t md_efirt_tmp;	/* (k) */
 	int	md_efirt_dis_pf;	/* (k) */
+	struct pcb md_pcb;
+	vm_offset_t md_stack_base;
 };
 
 struct mdproc {
Index: sys/kern/kern_thread.c
===================================================================
--- sys/kern/kern_thread.c
+++ sys/kern/kern_thread.c
@@ -84,7 +84,7 @@
     "struct thread KBI td_pflags");
 _Static_assert(offsetof(struct thread, td_frame) == 0x478,
     "struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x540,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x690,
     "struct thread KBI td_emuldata");
 _Static_assert(offsetof(struct proc, p_flag) == 0xb0,
     "struct proc KBI p_flag");