Index: sys/amd64/amd64/apic_vector.S
===================================================================
--- sys/amd64/amd64/apic_vector.S
+++ sys/amd64/amd64/apic_vector.S
@@ -2,6 +2,12 @@
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2014-2018 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -38,12 +44,12 @@
 
 #include "opt_smp.h"
 
+#include "assym.s"
+
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 #include <x86/apicreg.h>
 
-#include "assym.s"
-
 #ifdef SMP
 #define LK	lock ;
 #else
@@ -73,30 +79,28 @@
  * translates that into a vector, and passes the vector to the
  * lapic_handle_intr() function.
  */
-#define	ISR_VEC(index, vec_name)					\
-	.text ;								\
-	SUPERALIGN_TEXT ;						\
-IDTVEC(vec_name) ;							\
-	PUSH_FRAME ;							\
-	FAKE_MCOUNT(TF_RIP(%rsp)) ;					\
-	cmpl	$0,x2apic_mode ;					\
-	je	1f ;							\
-	movl	$(MSR_APIC_ISR0 + index),%ecx ;				\
-	rdmsr ;								\
-	jmp	2f ;							\
-1: ;									\
-	movq	lapic_map, %rdx ;	/* pointer to local APIC */	\
-	movl	LA_ISR + 16 * (index)(%rdx), %eax ;	/* load ISR */	\
-2: ;									\
-	bsrl	%eax, %eax ;	/* index of highest set bit in ISR */	\
-	jz	3f ;							\
-	addl	$(32 * index),%eax ;					\
-	movq	%rsp, %rsi	;                                       \
-	movl	%eax, %edi ;	/* pass the IRQ */			\
-	call	lapic_handle_intr ;					\
-3: ;									\
-	MEXITCOUNT ;							\
+	.macro	ISR_VEC	index, vec_name
+	INTR_HANDLER	\vec_name
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	cmpl	$0,x2apic_mode
+	je	1f
+	movl	$(MSR_APIC_ISR0 + \index),%ecx
+	rdmsr
+	jmp	2f
+1:
+	movq	lapic_map, %rdx		/* pointer to local APIC */
+	movl	LA_ISR + 16 * (\index)(%rdx), %eax	/* load ISR */
+2:
+	bsrl	%eax, %eax	/* index of highest set bit in ISR */
+	jz	3f
+	addl	$(32 * \index),%eax
+	movq	%rsp, %rsi
+	movl	%eax, %edi	/* pass the IRQ */
+	call	lapic_handle_intr
+3:
+	MEXITCOUNT
 	jmp	doreti
+	.endm
 
 /*
  * Handle "spurious INTerrupts".
@@ -108,26 +112,21 @@
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(spuriousint)
-
 	/* No EOI cycle used here */
-
 	jmp	doreti_iret
 
-	ISR_VEC(1, apic_isr1)
-	ISR_VEC(2, apic_isr2)
-	ISR_VEC(3, apic_isr3)
-	ISR_VEC(4, apic_isr4)
-	ISR_VEC(5, apic_isr5)
-	ISR_VEC(6, apic_isr6)
-	ISR_VEC(7, apic_isr7)
+	ISR_VEC	1, apic_isr1
+	ISR_VEC	2, apic_isr2
+	ISR_VEC	3, apic_isr3
+	ISR_VEC	4, apic_isr4
+	ISR_VEC	5, apic_isr5
+	ISR_VEC	6, apic_isr6
+	ISR_VEC	7, apic_isr7
 
 /*
  * Local APIC periodic timer handler.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(timerint)
-	PUSH_FRAME
+	INTR_HANDLER	timerint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	lapic_handle_timer
@@ -137,10 +136,7 @@
 /*
  * Local APIC CMCI handler.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(cmcint)
-	PUSH_FRAME
+	INTR_HANDLER cmcint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_cmc
 	MEXITCOUNT
@@ -149,10 +145,7 @@
 /*
  * Local APIC error interrupt handler.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(errorint)
-	PUSH_FRAME
+	INTR_HANDLER errorint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_error
 	MEXITCOUNT
@@ -163,10 +156,7 @@
  * Xen event channel upcall interrupt handler.
  * Only used when the hypervisor supports direct vector callbacks.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(xen_intr_upcall)
-	PUSH_FRAME
+	INTR_HANDLER xen_intr_upcall
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	xen_intr_handle_upcall
@@ -183,74 +173,48 @@
 	SUPERALIGN_TEXT
 invltlb_ret:
 	call	as_lapic_eoi
-	POP_FRAME
-	jmp	doreti_iret
+	jmp	ld_regs
 
 	SUPERALIGN_TEXT
-IDTVEC(invltlb)
-	PUSH_FRAME
-
+	INTR_HANDLER invltlb
 	call	invltlb_handler
 	jmp	invltlb_ret
 
-IDTVEC(invltlb_pcid)
-	PUSH_FRAME
-
+	INTR_HANDLER invltlb_pcid
 	call	invltlb_pcid_handler
 	jmp	invltlb_ret
 
-IDTVEC(invltlb_invpcid)
-	PUSH_FRAME
-
+	INTR_HANDLER invltlb_invpcid
 	call	invltlb_invpcid_handler
 	jmp	invltlb_ret
 
 /*
  * Single page TLB shootdown
  */
-	.text
-
-	SUPERALIGN_TEXT
-IDTVEC(invlpg)
-	PUSH_FRAME
-
+	INTR_HANDLER invlpg
 	call	invlpg_handler
 	jmp	invltlb_ret
 
 /*
  * Page range TLB shootdown.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(invlrng)
-	PUSH_FRAME
-
+	INTR_HANDLER invlrng
 	call	invlrng_handler
 	jmp	invltlb_ret
 
 /*
  * Invalidate cache.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(invlcache)
-	PUSH_FRAME
-
+	INTR_HANDLER invlcache
 	call	invlcache_handler
 	jmp	invltlb_ret
 
 /*
  * Handler for IPIs sent via the per-cpu IPI bitmap.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(ipi_intr_bitmap_handler)		
-	PUSH_FRAME
-
+	INTR_HANDLER ipi_intr_bitmap_handler
 	call	as_lapic_eoi
-	
 	FAKE_MCOUNT(TF_RIP(%rsp))
-
 	call	ipi_bitmap_handler
 	MEXITCOUNT
 	jmp	doreti
@@ -258,24 +222,15 @@
 /*
  * Executed by a CPU when it receives an IPI_STOP from another CPU.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(cpustop)
-	PUSH_FRAME
-
+	INTR_HANDLER cpustop
 	call	as_lapic_eoi
-
 	call	cpustop_handler
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(cpususpend)
-	PUSH_FRAME
-
+	INTR_HANDLER cpususpend
 	call	cpususpend_handler
 	call	as_lapic_eoi
 	jmp	doreti
@@ -285,10 +240,7 @@
  *
  * - Calls the generic rendezvous action function.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(rendezvous)
-	PUSH_FRAME
+	INTR_HANDLER rendezvous
 #ifdef COUNT_IPIS
 	movl	PCPU(CPUID), %eax
 	movq	ipi_rendezvous_counts(,%rax,8), %rax
@@ -328,4 +280,8 @@
 	popq	%rax
 	jmp	doreti_iret
 
+	INTR_HANDLER	justreturn1
+	call	as_lapic_eoi
+	jmp	doreti
+
 #endif /* SMP */
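
The ISR_VEC conversion from a cpp macro to a GNU as .macro keeps the same
logic: read one 32-bit window of the local APIC in-service register (via
rdmsr in x2APIC mode, via lapic_map otherwise) and turn its highest set bit
into the vector passed to lapic_handle_intr().  A minimal C sketch of that
last step, assuming the window value has already been read (the function
name is illustrative, not part of the patch):

	#include <stdint.h>

	static int
	isr_vec_sketch(int index, uint32_t isr)
	{
		if (isr == 0)
			return (-1);	/* spurious; the asm jumps to 3f */
		/* bsrl: index of the highest set bit in the window */
		return (32 * index + (31 - __builtin_clz(isr)));
	}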
Index: sys/amd64/amd64/atpic_vector.S
===================================================================
--- sys/amd64/amd64/atpic_vector.S
+++ sys/amd64/amd64/atpic_vector.S
@@ -36,38 +36,35 @@
  * master and slave interrupt controllers.
  */
 
-#include <machine/asmacros.h>
-
 #include "assym.s"
+#include <machine/asmacros.h>
 
 /*
  * Macros for interrupt entry, call to handler, and exit.
  */
-#define	INTR(irq_num, vec_name) \
-	.text ;								\
-	SUPERALIGN_TEXT ;						\
-IDTVEC(vec_name) ;							\
-	PUSH_FRAME ;							\
-	FAKE_MCOUNT(TF_RIP(%rsp)) ;					\
-	movq	%rsp, %rsi	;                                       \
-	movl	$irq_num, %edi; 	/* pass the IRQ */		\
-	call	atpic_handle_intr ;					\
-	MEXITCOUNT ;							\
+	.macro	INTR	irq_num, vec_name
+	INTR_HANDLER	\vec_name
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	movq	%rsp, %rsi
+	movl	$\irq_num, %edi		/* pass the IRQ */
+	call	atpic_handle_intr
+	MEXITCOUNT
 	jmp	doreti
+	.endm
 
-	INTR(0, atpic_intr0)
-	INTR(1, atpic_intr1)
-	INTR(2, atpic_intr2)
-	INTR(3, atpic_intr3)
-	INTR(4, atpic_intr4)
-	INTR(5, atpic_intr5)
-	INTR(6, atpic_intr6)
-	INTR(7, atpic_intr7)
-	INTR(8, atpic_intr8)
-	INTR(9, atpic_intr9)
-	INTR(10, atpic_intr10)
-	INTR(11, atpic_intr11)
-	INTR(12, atpic_intr12)
-	INTR(13, atpic_intr13)
-	INTR(14, atpic_intr14)
-	INTR(15, atpic_intr15)
+	INTR	0, atpic_intr0
+	INTR	1, atpic_intr1
+	INTR	2, atpic_intr2
+	INTR	3, atpic_intr3
+	INTR	4, atpic_intr4
+	INTR	5, atpic_intr5
+	INTR	6, atpic_intr6
+	INTR	7, atpic_intr7
+	INTR	8, atpic_intr8
+	INTR	9, atpic_intr9
+	INTR	10, atpic_intr10
+	INTR	11, atpic_intr11
+	INTR	12, atpic_intr12
+	INTR	13, atpic_intr13
+	INTR	14, atpic_intr14
+	INTR	15, atpic_intr15
Index: sys/amd64/amd64/cpu_switch.S
===================================================================
--- sys/amd64/amd64/cpu_switch.S
+++ sys/amd64/amd64/cpu_switch.S
@@ -215,8 +215,10 @@
 	movq	%r8,PCPU(RSP0)
 	movq	%r8,PCPU(CURPCB)
 	/* Update the TSS_RSP0 pointer for the next interrupt */
+	cmpb	$0,pti(%rip)
+	jne	1f
 	movq	%r8,TSS_RSP0(%rdx)
-	movq	%r12,PCPU(CURTHREAD)		/* into next thread */
+1:	movq	%r12,PCPU(CURTHREAD)		/* into next thread */
 
 	/* Test if debug registers should be restored. */
 	testl	$PCB_DBREGS,PCB_FLAGS(%r8)
@@ -293,7 +295,12 @@
 	shrq	$8,%rcx
 	movl	%ecx,8(%rax)
 	movb	$0x89,5(%rax)	/* unset busy */
-	movl	$TSSSEL,%eax
+	cmpb	$0,pti(%rip)
+	je	1f
+	movq	PCPU(PRVSPACE),%rax
+	addq	$PC_PTI_STACK+PC_PTI_STACK_SZ*8,%rax
+	movq	%rax,TSS_RSP0(%rdx)
+1:	movl	$TSSSEL,%eax
 	ltr	%ax
 	jmp	done_tss
 
Index: sys/amd64/amd64/exception.S
===================================================================
--- sys/amd64/amd64/exception.S
+++ sys/amd64/amd64/exception.S
@@ -1,12 +1,16 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
- * Copyright (c) 2007 The FreeBSD Foundation
+ * Copyright (c) 2007-2018 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by A. Joseph Koshy under
  * sponsorship from the FreeBSD Foundation and Google, Inc.
  *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -38,13 +42,13 @@
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
 
+#include "assym.s"
+
 #include <machine/asmacros.h>
 #include <machine/psl.h>
 #include <machine/trap.h>
 #include <machine/specialreg.h>
 
-#include "assym.s"
-
 #ifdef KDTRACE_HOOKS
 	.bss
 	.globl	dtrace_invop_jump_addr
@@ -100,68 +104,62 @@
 MCOUNT_LABEL(user)
 MCOUNT_LABEL(btrap)
 
-/* Traps that we leave interrupts disabled for.. */
-#define	TRAP_NOEN(a)	\
-	subq $TF_RIP,%rsp; \
-	movl $(a),TF_TRAPNO(%rsp) ; \
-	movq $0,TF_ADDR(%rsp) ; \
-	movq $0,TF_ERR(%rsp) ; \
+/* Traps that we leave interrupts disabled for. */
+	.macro	TRAP_NOEN	l, trapno
+	PTI_ENTRY	\l,X\l
+	.globl	X\l
+	.type	X\l,@function
+X\l:	subq $TF_RIP,%rsp
+	movl $\trapno,TF_TRAPNO(%rsp)
+	movq $0,TF_ADDR(%rsp)
+	movq $0,TF_ERR(%rsp)
 	jmp alltraps_noen
-IDTVEC(dbg)
-	TRAP_NOEN(T_TRCTRAP)
-IDTVEC(bpt)
-	TRAP_NOEN(T_BPTFLT)
+	.endm
+
+	TRAP_NOEN	dbg, T_TRCTRAP
+	TRAP_NOEN	bpt, T_BPTFLT
 #ifdef KDTRACE_HOOKS
-IDTVEC(dtrace_ret)
-	TRAP_NOEN(T_DTRACE_RET)
+	TRAP_NOEN	dtrace_ret, T_DTRACE_RET
 #endif
 
 /* Regular traps; The cpu does not supply tf_err for these. */
-#define	TRAP(a)	 \
-	subq $TF_RIP,%rsp; \
-	movl $(a),TF_TRAPNO(%rsp) ; \
-	movq $0,TF_ADDR(%rsp) ; \
-	movq $0,TF_ERR(%rsp) ; \
+	.macro	TRAP	l, trapno
+	PTI_ENTRY	\l,X\l
+	.globl	X\l
+	.type	X\l,@function
+X\l:
+	subq $TF_RIP,%rsp
+	movl $\trapno,TF_TRAPNO(%rsp)
+	movq $0,TF_ADDR(%rsp)
+	movq $0,TF_ERR(%rsp)
 	jmp alltraps
-IDTVEC(div)
-	TRAP(T_DIVIDE)
-IDTVEC(ofl)
-	TRAP(T_OFLOW)
-IDTVEC(bnd)
-	TRAP(T_BOUND)
-IDTVEC(ill)
-	TRAP(T_PRIVINFLT)
-IDTVEC(dna)
-	TRAP(T_DNA)
-IDTVEC(fpusegm)
-	TRAP(T_FPOPFLT)
-IDTVEC(mchk)
-	TRAP(T_MCHK)
-IDTVEC(rsvd)
-	TRAP(T_RESERVED)
-IDTVEC(fpu)
-	TRAP(T_ARITHTRAP)
-IDTVEC(xmm)
-	TRAP(T_XMMFLT)
-
-/* This group of traps have tf_err already pushed by the cpu */
-#define	TRAP_ERR(a)	\
-	subq $TF_ERR,%rsp; \
-	movl $(a),TF_TRAPNO(%rsp) ; \
-	movq $0,TF_ADDR(%rsp) ; \
+	.endm
+
+	TRAP	div, T_DIVIDE
+	TRAP	ofl, T_OFLOW
+	TRAP	bnd, T_BOUND
+	TRAP	ill, T_PRIVINFLT
+	TRAP	dna, T_DNA
+	TRAP	fpusegm, T_FPOPFLT
+	TRAP	mchk, T_MCHK
+	TRAP	rsvd, T_RESERVED
+	TRAP	fpu, T_ARITHTRAP
+	TRAP	xmm, T_XMMFLT
+
+/* This group of traps has tf_err already pushed by the cpu. */
+	.macro	TRAP_ERR	l, trapno
+	PTI_ENTRY	\l,X\l,has_err=1
+	.globl	X\l
+	.type	X\l,@function
+X\l:
+	subq $TF_ERR,%rsp
+	movl $\trapno,TF_TRAPNO(%rsp)
+	movq $0,TF_ADDR(%rsp)
 	jmp alltraps
-IDTVEC(tss)
-	TRAP_ERR(T_TSSFLT)
-IDTVEC(missing)
-	subq	$TF_ERR,%rsp
-	movl	$T_SEGNPFLT,TF_TRAPNO(%rsp)
-	jmp	prot_addrf
-IDTVEC(stk)
-	subq	$TF_ERR,%rsp
-	movl	$T_STKFLT,TF_TRAPNO(%rsp)
-	jmp	prot_addrf
-IDTVEC(align)
-	TRAP_ERR(T_ALIGNFLT)
+	.endm
+
+	TRAP_ERR	tss, T_TSSFLT
+	TRAP_ERR	align, T_ALIGNFLT
 
 	/*
 	 * alltraps entry point.  Use swapgs if this is the first time in the
@@ -174,15 +172,12 @@
 alltraps:
 	movq	%rdi,TF_RDI(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	alltraps_testi		/* already running with kernel GS.base */
+	jz	alltraps_segs		/* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
-alltraps_testi:
+alltraps_segs:
+	SAVE_SEGS
 	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	alltraps_pushregs_no_rdi
 	sti
@@ -249,14 +244,12 @@
 alltraps_noen:
 	movq	%rdi,TF_RDI(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	1f	/* already running with kernel GS.base */
+	jz	alltraps_noen_segs /* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-1:	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+alltraps_noen_segs:
+	SAVE_SEGS
 	jmp	alltraps_pushregs_no_rdi
 
 IDTVEC(dblfault)
@@ -279,37 +272,36 @@
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	1f			/* already running with kernel GS.base */
 	swapgs
 1:
-	movq	%rsp,%rdi
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	2f
+	movq	%rax,%cr3
+2:	movq	%rsp,%rdi
 	call	dblfault_handler
-2:
-	hlt
-	jmp	2b
+3:	hlt
+	jmp	3b
 
+	PTI_ENTRY	page, Xpage, has_err=1
 IDTVEC(page)
 	subq	$TF_ERR,%rsp
-	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	1f			/* already running with kernel GS.base */
+	jz	page_cr2		/* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-1:	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
+page_cr2:
+	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
 	movq	%rdi,TF_ADDR(%rsp)	/* enabling interrupts. */
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
+	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)
 	jz	alltraps_pushregs_no_rdi
 	sti
@@ -320,10 +312,43 @@
 	 * the iretq stage, we'll reenter with the wrong gs state.  We'll have
	 * to do a special swapgs in this case even when coming from the kernel.
 	 * XXX linux has a trap handler for their equivalent of load_gs().
+	 *
+	 * On the stack, we have the hardware interrupt frame to return
+	 * to usermode (which faulted) and another frame with the error
+	 * code for the fault.  For PTI, copy both frames to the thread stack.
 	 */
-IDTVEC(prot)
+	.macro PROTF_ENTRY name,trapno
+\name\()_pti_doreti:
+	pushq	%rax
+	pushq	%rdx
+	swapgs
+	movq	PCPU(KCR3),%rax
+	movq	%rax,%cr3
+	movq	PCPU(RSP0),%rax
+	subq	$2*PTI_SIZE-3*8,%rax
+	MOVE_STACKS	(PTI_SIZE / 4 - 3)
+	movq	%rax,%rsp
+	popq	%rdx
+	popq	%rax
+	swapgs
+	jmp	X\name
+IDTVEC(\name\()_pti)
+	cmpq	$doreti_iret,PTI_RIP-2*8(%rsp)
+	je	\name\()_pti_doreti
+	testb	$SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */
+	jz	X\name
+	PTI_UENTRY has_err=1
+	swapgs
+IDTVEC(\name)
 	subq	$TF_ERR,%rsp
-	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
+	movl	$\trapno,TF_TRAPNO(%rsp)
+	jmp	prot_addrf
+	.endm
+
+	PROTF_ENTRY	missing, T_SEGNPFLT
+	PROTF_ENTRY	stk, T_STKFLT
+	PROTF_ENTRY	prot, T_PROTFLT
+
 prot_addrf:
 	movq	$0,TF_ADDR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
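
The copy sizes in the \name\()_pti_doreti path can be cross-checked against
the PTI_* constants generated in genassym.c below.  Assuming PTI_SIZE is 8
qwords (the eight pti_frame members), the outer fault frame fills a whole
pti_frame (the pushed %rax/%rdx, the error code, and the five hardware
qwords), while the inner faulted iretq frame supplies only the five
hardware qwords, three short of a full frame — hence the 13-qword copy.
An editor's check of the arithmetic:

	#include <assert.h>

	#define	PTI_SIZE	(8 * 8)	/* assumed: 8 qwords, per genassym.c */

	/* subq $2*PTI_SIZE-3*8,%rax reserves 13 qwords on the kstack... */
	static_assert(2 * PTI_SIZE - 3 * 8 == 13 * 8, "bytes reserved");
	/* ...and MOVE_STACKS (PTI_SIZE / 4 - 3) copies the same 13 */
	static_assert(PTI_SIZE / 4 - 3 == 13, "qwords copied");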
@@ -375,8 +400,18 @@
  * We do not support invoking this from a custom segment registers,
  * esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT.
  */
+	SUPERALIGN_TEXT
+IDTVEC(fast_syscall_pti)
+	swapgs
+	movq	%rax,PCPU(SCRATCH_RAX)
+	movq	PCPU(KCR3),%rax
+	movq	%rax,%cr3
+	jmp	fast_syscall_common
+	SUPERALIGN_TEXT
 IDTVEC(fast_syscall)
 	swapgs
+	movq	%rax,PCPU(SCRATCH_RAX)
+fast_syscall_common:
 	movq	%rsp,PCPU(SCRATCH_RSP)
 	movq	PCPU(RSP0),%rsp
 	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
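
fast_syscall_pti differs from the plain entry only in reloading %cr3 before
any kernel data is touched; both variants now park %rax in a per-CPU
scratch slot because the common path clobbers it while building the
trapframe.  In rough C terms (a sketch, not compilable as-is; syscall_rax
stands for the register value at entry):

	PCPU_SET(scratch_rax, syscall_rax);	/* %rax parked per-CPU */
	if (pti)
		load_cr3(PCPU_GET(kcr3));	/* kernel page table */
	/* ... continue at fast_syscall_common ... */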
@@ -386,10 +421,9 @@
 	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
 	movq	PCPU(SCRATCH_RSP),%r11	/* %r11 already saved */
 	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	movq	PCPU(SCRATCH_RAX),%rax
+	movq	%rax,TF_RAX(%rsp)	/* syscall number */
+	SAVE_SEGS
 	movq	PCPU(CURPCB),%r11
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r11)
 	sti
@@ -402,7 +436,6 @@
 	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
 	movq	%r8,TF_R8(%rsp)		/* arg 5 */
 	movq	%r9,TF_R9(%rsp)		/* arg 6 */
-	movq	%rax,TF_RAX(%rsp)	/* syscall number */
 	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
 	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
 	movq	%r12,TF_R12(%rsp)	/* C preserved */
@@ -420,11 +453,11 @@
 	/* Disable interrupts before testing PCB_FULL_IRET. */
 	cli
 	testl	$PCB_FULL_IRET,PCB_FLAGS(%rax)
-	jnz	3f
+	jnz	4f
 	/* Check for and handle AST's on return to userland. */
 	movq	PCPU(CURTHREAD),%rax
 	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
-	jne	2f
+	jne	3f
 	/* Restore preserved registers. */
 	MEXITCOUNT
 	movq	TF_RDI(%rsp),%rdi	/* bonus; preserve arg 1 */
@@ -434,16 +467,21 @@
 	movq	TF_RFLAGS(%rsp),%r11	/* original %rflags */
 	movq	TF_RIP(%rsp),%rcx	/* original %rip */
 	movq	TF_RSP(%rsp),%rsp	/* user stack pointer */
-	swapgs
+	cmpb	$0,pti
+	je	2f
+	movq	PCPU(UCR3),%r9
+	movq	%r9,%cr3
+	xorl	%r9d,%r9d
+2:	swapgs
 	sysretq
 
-2:	/* AST scheduled. */
+3:	/* AST scheduled. */
 	sti
 	movq	%rsp,%rdi
 	call	ast
 	jmp	1b
 
-3:	/* Requested full context restore, use doreti for that. */
+4:	/* Requested full context restore, use doreti for that. */
 	MEXITCOUNT
 	jmp	doreti
 
@@ -499,17 +537,15 @@
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
 	xorl	%ebx,%ebx
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jnz	nmi_fromuserspace
 	/*
-	 * We've interrupted the kernel.  Preserve GS.base in %r12.
+	 * We've interrupted the kernel.  Preserve GS.base in %r12
+	 * and %cr3 in %r13.
 	 */
 	movl	$MSR_GSBASE,%ecx
 	rdmsr
@@ -521,27 +557,38 @@
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	nmi_calltrap
+	movq	%rax,%cr3
 	jmp	nmi_calltrap
 nmi_fromuserspace:
 	incl	%ebx
 	swapgs
-	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
-	jz	2f
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	1f
+	movq	%rax,%cr3
 	movq	PCPU(CURPCB),%rdi
 	testq	%rdi,%rdi
-	jz	2f
+	jz	3f
+	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
+1:	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
+	jz	3f
 	cmpw	$KUF32SEL,TF_FS(%rsp)
-	jne	1f
+	jne	2f
 	rdfsbase %rax
 	movq	%rax,PCB_FSBASE(%rdi)
-1:	cmpw	$KUG32SEL,TF_GS(%rsp)
-	jne	2f
+2:	cmpw	$KUG32SEL,TF_GS(%rsp)
+	jne	3f
 	movl	$MSR_KGSBASE,%ecx
 	rdmsr
 	shlq	$32,%rdx
 	orq	%rdx,%rax
 	movq	%rax,PCB_GSBASE(%rdi)
-2:
+3:
 /* Note: this label is also used by ddb and gdb: */
 nmi_calltrap:
 	FAKE_MCOUNT(TF_RIP(%rsp))
@@ -564,26 +611,29 @@
 	movq	PCPU(CURTHREAD),%rax
 	orq	%rax,%rax	/* curthread present? */
 	jz	nocallchain
-	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
-	jz	nocallchain
 	/*
-	 * A user callchain is to be captured, so:
-	 * - Move execution to the regular kernel stack, to allow for
-	 *   nested NMI interrupts.
-	 * - Take the processor out of "NMI" mode by faking an "iret".
-	 * - Enable interrupts, so that copyin() can work.
+	 * Move execution to the regular kernel stack, because we
+	 * committed to return through doreti.
 	 */
 	movq	%rsp,%rsi	/* source stack pointer */
 	movq	$TF_SIZE,%rcx
 	movq	PCPU(RSP0),%rdx
 	subq	%rcx,%rdx
 	movq	%rdx,%rdi	/* destination stack pointer */
-
 	shrq	$3,%rcx		/* trap frame size in long words */
 	cld
 	rep
 	movsq			/* copy trapframe */
+	movq	%rdx,%rsp	/* we are on the regular kstack */
 
+	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
+	jz	nocallchain
+	/*
+	 * A user callchain is to be captured, so:
+	 * - Take the processor out of "NMI" mode by faking an "iret",
+	 *   to allow for nested NMI interrupts.
+	 * - Enable interrupts, so that copyin() can work.
+	 */
 	movl	%ss,%eax
 	pushq	%rax		/* tf_ss */
 	pushq	%rdx		/* tf_rsp (on kernel stack) */
@@ -624,22 +674,9 @@
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
+	movq	%r13,%cr3
 nmi_restoreregs:
-	movq	TF_RDI(%rsp),%rdi
-	movq	TF_RSI(%rsp),%rsi
-	movq	TF_RDX(%rsp),%rdx
-	movq	TF_RCX(%rsp),%rcx
-	movq	TF_R8(%rsp),%r8
-	movq	TF_R9(%rsp),%r9
-	movq	TF_RAX(%rsp),%rax
-	movq	TF_RBX(%rsp),%rbx
-	movq	TF_RBP(%rsp),%rbp
-	movq	TF_R10(%rsp),%r10
-	movq	TF_R11(%rsp),%r11
-	movq	TF_R12(%rsp),%r12
-	movq	TF_R13(%rsp),%r13
-	movq	TF_R14(%rsp),%r14
-	movq	TF_R15(%rsp),%r15
+	RESTORE_REGS
 	addq	$TF_RIP,%rsp
 	jmp	doreti_iret
 
@@ -807,27 +844,38 @@
 ld_ds:
 	movw	TF_DS(%rsp),%ds
 ld_regs:
-	movq	TF_RDI(%rsp),%rdi
-	movq	TF_RSI(%rsp),%rsi
-	movq	TF_RDX(%rsp),%rdx
-	movq	TF_RCX(%rsp),%rcx
-	movq	TF_R8(%rsp),%r8
-	movq	TF_R9(%rsp),%r9
-	movq	TF_RAX(%rsp),%rax
-	movq	TF_RBX(%rsp),%rbx
-	movq	TF_RBP(%rsp),%rbp
-	movq	TF_R10(%rsp),%r10
-	movq	TF_R11(%rsp),%r11
-	movq	TF_R12(%rsp),%r12
-	movq	TF_R13(%rsp),%r13
-	movq	TF_R14(%rsp),%r14
-	movq	TF_R15(%rsp),%r15
+	RESTORE_REGS
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	1f			/* keep running with kernel GS.base */
+	jz	2f			/* keep running with kernel GS.base */
 	cli
+	cmpb	$0,pti
+	je	1f
+	pushq	%rdx
+	movq	PCPU(PRVSPACE),%rdx
+	addq	$PC_PTI_STACK+PC_PTI_STACK_SZ*8-PTI_SIZE,%rdx
+	movq	%rax,PTI_RAX(%rdx)
+	popq	%rax
+	movq	%rax,PTI_RDX(%rdx)
+	movq	TF_RIP(%rsp),%rax
+	movq	%rax,PTI_RIP(%rdx)
+	movq	TF_CS(%rsp),%rax
+	movq	%rax,PTI_CS(%rdx)
+	movq	TF_RFLAGS(%rsp),%rax
+	movq	%rax,PTI_RFLAGS(%rdx)
+	movq	TF_RSP(%rsp),%rax
+	movq	%rax,PTI_RSP(%rdx)
+	movq	TF_SS(%rsp),%rax
+	movq	%rax,PTI_SS(%rdx)
+	movq	PCPU(UCR3),%rax
 	swapgs
-1:
-	addq	$TF_RIP,%rsp		/* skip over tf_err, tf_trapno */
+	movq	%rdx,%rsp
+	movq	%rax,%cr3
+	popq	%rdx
+	popq	%rax
+	addq	$8,%rsp
+	jmp	doreti_iret
+1:	swapgs
+2:	addq	$TF_RIP,%rsp
 	.globl	doreti_iret
 doreti_iret:
 	iretq
@@ -851,14 +899,11 @@
 	.globl	doreti_iret_fault
 doreti_iret_fault:
 	subq	$TF_RIP,%rsp		/* space including tf_err, tf_trapno */
-	testl	$PSL_I,TF_RFLAGS(%rsp)
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jz	1f
 	sti
 1:
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	movq	%rdi,TF_RDI(%rsp)
 	movq	%rsi,TF_RSI(%rsp)
@@ -885,7 +930,7 @@
 	.globl	ds_load_fault
 ds_load_fault:
 	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
-	testl	$PSL_I,TF_RFLAGS(%rsp)
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jz	1f
 	sti
 1:
Index: sys/amd64/amd64/genassym.c
===================================================================
--- sys/amd64/amd64/genassym.c
+++ sys/amd64/amd64/genassym.c
@@ -186,6 +186,16 @@
 ASSYM(TF_SIZE, sizeof(struct trapframe));
 ASSYM(TF_HASSEGS, TF_HASSEGS);
 
+ASSYM(PTI_RDX, offsetof(struct pti_frame, pti_rdx));
+ASSYM(PTI_RAX, offsetof(struct pti_frame, pti_rax));
+ASSYM(PTI_ERR, offsetof(struct pti_frame, pti_err));
+ASSYM(PTI_RIP, offsetof(struct pti_frame, pti_rip));
+ASSYM(PTI_CS, offsetof(struct pti_frame, pti_cs));
+ASSYM(PTI_RFLAGS, offsetof(struct pti_frame, pti_rflags));
+ASSYM(PTI_RSP, offsetof(struct pti_frame, pti_rsp));
+ASSYM(PTI_SS, offsetof(struct pti_frame, pti_ss));
+ASSYM(PTI_SIZE, sizeof(struct pti_frame));
+
 ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler));
 ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc));
 ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags));
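
The PTI_* offsets above fix the layout of the trampoline stack frame; the
structure itself (struct pti_frame, declared in frame.h elsewhere in the
patch, not shown in this excerpt) must look essentially like this
reconstruction from the ASSYM() order:

	#include <sys/types.h>

	/* Illustrative reconstruction only. */
	struct pti_frame {
		register_t	pti_rdx;	/* scratch pushed by entry code */
		register_t	pti_rax;	/* scratch pushed by entry code */
		register_t	pti_err;	/* hardware error code, if any */
		register_t	pti_rip;	/* hardware iretq frame from here */
		register_t	pti_cs;
		register_t	pti_rflags;
		register_t	pti_rsp;
		register_t	pti_ss;
	};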
@@ -202,6 +212,7 @@
 ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb));
 ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
 ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp));
+ASSYM(PC_SCRATCH_RAX, offsetof(struct pcpu, pc_scratch_rax));
 ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
 ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp));
 ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0));
@@ -211,6 +222,10 @@
 ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
 ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
 ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
+ASSYM(PC_KCR3, offsetof(struct pcpu, pc_kcr3));
+ASSYM(PC_UCR3, offsetof(struct pcpu, pc_ucr3));
+ASSYM(PC_PTI_STACK, offsetof(struct pcpu, pc_pti_stack));
+ASSYM(PC_PTI_STACK_SZ, PC_PTI_STACK_SZ);
  
 ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL);
 ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL);
Index: sys/amd64/amd64/machdep.c
===================================================================
--- sys/amd64/amd64/machdep.c
+++ sys/amd64/amd64/machdep.c
@@ -813,13 +813,20 @@
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm), IDTVEC(dblfault),
+	IDTVEC(div_pti), IDTVEC(dbg_pti), IDTVEC(bpt_pti),
+	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
+	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
+	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(mchk_pti),
+	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
+	IDTVEC(xmm_pti),
 #ifdef KDTRACE_HOOKS
-	IDTVEC(dtrace_ret),
+	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
 #endif
 #ifdef XENHVM
-	IDTVEC(xen_intr_upcall),
+	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
 #endif
-	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
+	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
+	IDTVEC(fast_syscall_pti);
 
 #ifdef DDB
 /*
@@ -1520,7 +1527,8 @@
 
 	msr = rdmsr(MSR_EFER) | EFER_SCE;
 	wrmsr(MSR_EFER, msr);
-	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
+	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
+	    (u_int64_t)IDTVEC(fast_syscall));
 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
@@ -1536,6 +1544,7 @@
 	struct pcpu *pc;
 	struct nmi_pcpu *np;
 	struct xstate_hdr *xhdr;
+	u_int64_t rsp0;
 	char *env;
 	size_t kstack0_sz;
 	int late_console;
@@ -1609,34 +1618,54 @@
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
 
 	/* exceptions */
+	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
+
 	for (x = 0; x < NIDT; x++)
-		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 0);
+		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
+		    SEL_KPL, 0);
+	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_DB, pti ? &IDTVEC(dbg_pti) : &IDTVEC(dbg), SDT_SYSIGT,
+	    SEL_KPL, 0);
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
- 	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
-	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
+	    SEL_UPL, 0);
+	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
+	    SEL_KPL, 0);
 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
-	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
+	    SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
+	    SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_MC, pti ? &IDTVEC(mchk_pti) : &IDTVEC(mchk), SDT_SYSIGT,
+	    SEL_KPL, 0);
+	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
+	    SEL_KPL, 0);
 #ifdef KDTRACE_HOOKS
-	setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
+	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
+	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
 #endif
 #ifdef XENHVM
 	setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0);
 #endif
-
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (long) idt;
 	lidt(&r_idt);
@@ -1750,10 +1779,12 @@
 		xhdr->xstate_bv = xsave_mask;
 	}
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
-	common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb;
+	rsp0 = (vm_offset_t)thread0.td_pcb;
 	/* Ensure the stack is aligned to 16 bytes */
-	common_tss[0].tss_rsp0 &= ~0xFul;
-	PCPU_SET(rsp0, common_tss[0].tss_rsp0);
+	rsp0 &= ~0xFul;
+	common_tss[0].tss_rsp0 = pti ? ((vm_offset_t)PCPU_PTR(pti_stack) +
+	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : rsp0;
+	PCPU_SET(rsp0, rsp0);
 	PCPU_SET(curpcb, thread0.td_pcb);
 
 	/* transfer to user mode */
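
Every gate is now installed in one of two flavors keyed on the vm.pmap.pti
tunable.  The repeated ternaries all reduce to one pattern; a condensed
sketch with a hypothetical helper (not part of the patch; IST 0 assumed,
though a few gates above use IST 1 or 2):

	static void
	setidt_pti(int idx, inthand_t *isr, inthand_t *isr_pti, int dpl)
	{
		setidt(idx, pti ? isr_pti : isr, SDT_SYSIGT, dpl, 0);
	}

	/* e.g.: setidt_pti(IDT_DE, &IDTVEC(div), &IDTVEC(div_pti), SEL_KPL); */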
Index: sys/amd64/amd64/mp_machdep.c
===================================================================
--- sys/amd64/amd64/mp_machdep.c
+++ sys/amd64/amd64/mp_machdep.c
@@ -132,33 +132,40 @@
 	/* Install an inter-CPU IPI for TLB invalidation */
 	if (pmap_pcid_enabled) {
 		if (invpcid_works) {
-			setidt(IPI_INVLTLB, IDTVEC(invltlb_invpcid),
-			    SDT_SYSIGT, SEL_KPL, 0);
+			setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_invpcid_pti) :
+			    IDTVEC(invltlb_invpcid), SDT_SYSIGT, SEL_KPL, 0);
 		} else {
-			setidt(IPI_INVLTLB, IDTVEC(invltlb_pcid), SDT_SYSIGT,
-			    SEL_KPL, 0);
+			setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pcid_pti) :
+			    IDTVEC(invltlb_pcid), SDT_SYSIGT, SEL_KPL, 0);
 		}
 	} else {
-		setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
+		setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pti) : IDTVEC(invltlb),
+		    SDT_SYSIGT, SEL_KPL, 0);
 	}
-	setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
-	setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pti) : IDTVEC(invlpg),
+	    SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pti) : IDTVEC(invlrng),
+	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for cache invalidation. */
-	setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IPI_INVLCACHE, pti ? IDTVEC(invlcache_pti) : IDTVEC(invlcache),
+	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for all-CPU rendezvous */
-	setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IPI_RENDEZVOUS, pti ? IDTVEC(rendezvous_pti) :
+	    IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install generic inter-CPU IPI handler */
-	setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
-	       SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IPI_BITMAP_VECTOR, pti ? IDTVEC(ipi_intr_bitmap_handler_pti) :
+	    IDTVEC(ipi_intr_bitmap_handler), SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for CPU stop/restart */
-	setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop),
+	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Install an inter-CPU IPI for CPU suspend/resume */
-	setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0);
+	setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend),
+	    SDT_SYSIGT, SEL_KPL, 0);
 
 	/* Set boot_cpu_id if needed. */
 	if (boot_cpu_id == -1) {
@@ -197,7 +204,6 @@
 
 	/* Init tss */
 	common_tss[cpu] = common_tss[0];
-	common_tss[cpu].tss_rsp0 = 0;   /* not used until after switch */
 	common_tss[cpu].tss_iobase = sizeof(struct amd64tss) +
 	    IOPERM_BITMAP_SIZE;
 	common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];
@@ -240,6 +246,8 @@
 	pc->pc_curpmap = kernel_pmap;
 	pc->pc_pcid_gen = 1;
 	pc->pc_pcid_next = PMAP_PCID_KERN + 1;
+	common_tss[cpu].tss_rsp0 = pti ? ((vm_offset_t)&pc->pc_pti_stack +
+	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : 0;
 
 	/* Save the per-cpu pointer for use by the NMI handler. */
 	np->np_pcpu = (register_t) pc;
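
Paired with the cpu_switch.S hunk earlier, this establishes the rsp0
policy: under PTI the TSS permanently points at the top of the small
per-CPU trampoline stack and is never rewritten per-thread, while the true
kernel stack top is still published via PCPU(RSP0).  A sketch of the
invariant (pc is the CPU's pcpu; thread_kstack_top is a stand-in):

	common_tss[cpu].tss_rsp0 = pti ?
	    (vm_offset_t)&pc->pc_pti_stack[PC_PTI_STACK_SZ] : /* trampoline */
	    thread_kstack_top;				/* hypothetical */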
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -11,11 +11,17 @@
  * All rights reserved.
  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
+ * Copyright (c) 2014-2018 The FreeBSD Foundation
+ * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -149,6 +155,7 @@
 #ifdef SMP
 #include <machine/smp.h>
 #endif
+#include <machine/tss.h>
 
 static __inline boolean_t
 pmap_type_guest(pmap_t pmap)
@@ -217,7 +224,7 @@
 
 	switch (pmap->pm_type) {
 	case PT_X86:
-		mask = X86_PG_G;
+		mask = pg_g;
 		break;
 	case PT_RVI:
 	case PT_EPT:
@@ -340,7 +347,7 @@
 static int ndmpdp;
 vm_paddr_t dmaplimit;
 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
-pt_entry_t pg_nx;
+pt_entry_t pg_nx, pg_g;
 
 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
 
@@ -405,6 +412,15 @@
 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
     "Is the invpcid instruction available ?");
 
+int pti = 1;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+    &pti, 0,
+    "Page Table Isolation enabled");
+static vm_object_t pti_obj;
+static pml4_entry_t *pti_pml4;
+static vm_pindex_t pti_pg_idx;
+static bool pti_finalized;
+
 static int
 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
 {
@@ -639,6 +655,11 @@
 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
     vm_prot_t prot);
 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
+static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
+    bool exec);
+static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
+static pd_entry_t *pmap_pti_pde(vm_offset_t va);
+static void pmap_pti_wire_pte(void *pte);
 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
     struct spglist *free, struct rwlock **lockp);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
@@ -921,7 +942,7 @@
 	/* XXX not fully used, underneath 2M pages */
 	pt_p = (pt_entry_t *)KPTphys;
 	for (i = 0; ptoa(i) < *firstaddr; i++)
-		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
+		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | pg_g;
 
 	/* Now map the page tables at their location within PTmap */
 	pd_p = (pd_entry_t *)KPDphys;
@@ -932,7 +953,7 @@
 	/* This replaces some of the KPTphys entries above */
 	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
 		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
-		    X86_PG_G;
+		    pg_g;
 
 	/* And connect up the PD to the PDP (leaving room for L4 pages) */
 	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
@@ -952,14 +973,14 @@
 	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
 		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
-		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
 		    X86_PG_M | X86_PG_A | pg_nx;
 	}
 	pdp_p = (pdp_entry_t *)DMPDPphys;
 	for (i = 0; i < ndm1g; i++) {
 		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
-		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
 		    X86_PG_M | X86_PG_A | pg_nx;
 	}
 	for (j = 0; i < ndmpdp; i++, j++) {
@@ -1002,6 +1023,9 @@
 	pt_entry_t *pte;
 	int i;
 
+	if (!pti)
+		pg_g = X86_PG_G;
+
 	/*
 	 * Create an initial set of page tables to run the kernel in.
 	 */
@@ -1071,6 +1095,8 @@
 	pmap_init_pat();
 
 	/* Initialize TLB Context Id. */
+	if (pti)
+		pmap_pcid_enabled = 0;
 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
 		/* Check for INVPCID support */
@@ -2114,7 +2140,7 @@
 	pt_entry_t *pte;
 
 	pte = vtopte(va);
-	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
+	pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g);
 }
 
 static __inline void
@@ -2125,7 +2151,7 @@
 
 	pte = vtopte(va);
 	cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
-	pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
+	pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits);
 }
 
 /*
@@ -2185,7 +2211,7 @@
 		pa = VM_PAGE_TO_PHYS(m) | cache_bits;
 		if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
 			oldpte |= *pte;
-			pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
+			pte_store(pte, pa | pg_g | X86_PG_RW | X86_PG_V);
 		}
 		pte++;
 	}
@@ -2306,6 +2332,10 @@
 		pml4_entry_t *pml4;
 		pml4 = pmap_pml4e(pmap, va);
 		*pml4 = 0;
+		if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
+			pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
+			*pml4 = 0;
+		}
 	} else if (m->pindex >= NUPDE) {
 		/* PD page */
 		pdp_entry_t *pdp;
@@ -2364,7 +2394,9 @@
 
 	PMAP_LOCK_INIT(pmap);
 	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
+	pmap->pm_pml4u = NULL;
 	pmap->pm_cr3 = KPML4phys;
+	pmap->pm_ucr3 = ~0UL;
 	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
@@ -2373,6 +2405,8 @@
 	CPU_FOREACH(i) {
 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
 		pmap->pm_pcids[i].pm_gen = 0;
+		if (!pti)
+			__pcpu[i].pc_kcr3 = ~0ul;
 	}
 	PCPU_SET(curpmap, kernel_pmap);
 	pmap_activate(curthread);
@@ -2402,6 +2436,17 @@
 	    X86_PG_A | X86_PG_M;
 }
 
+static void
+pmap_pinit_pml4_pti(vm_page_t pml4pg)
+{
+	pml4_entry_t *pm_pml4;
+	int i;
+
+	pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
+	for (i = 0; i < NPML4EPG; i++)
+		pm_pml4[i] = pti_pml4[i];
+}
+
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
@@ -2409,7 +2454,7 @@
 int
 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
 {
-	vm_page_t pml4pg;
+	vm_page_t pml4pg, pml4pgu;
 	vm_paddr_t pml4phys;
 	int i;
 
@@ -2425,8 +2470,10 @@
 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
 		pmap->pm_pcids[i].pm_gen = 0;
 	}
-	pmap->pm_cr3 = ~0;	/* initialize to an invalid value */
+	pmap->pm_cr3 = ~0l;	/* initialize to an invalid value */
+	pmap->pm_pml4u = NULL;
 
+	pmap->pm_type = pm_type;
 	if ((pml4pg->flags & PG_ZERO) == 0)
 		pagezero(pmap->pm_pml4);
 
@@ -2434,10 +2481,19 @@
 	 * Do not install the host kernel mappings in the nested page
 	 * tables. These mappings are meaningless in the guest physical
 	 * address space.
+	 * Install minimal kernel mappings in the PTI case.
 	 */
-	if ((pmap->pm_type = pm_type) == PT_X86) {
+	if (pm_type == PT_X86) {
 		pmap->pm_cr3 = pml4phys;
 		pmap_pinit_pml4(pml4pg);
+		if (pti) {
+			pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
+			    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
+			pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
+			    VM_PAGE_TO_PHYS(pml4pgu));
+			pmap_pinit_pml4_pti(pml4pgu);
+			pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
+		}
 	}
 
 	pmap->pm_root.rt_root = 0;
@@ -2509,13 +2565,18 @@
 	 */
 
 	if (ptepindex >= (NUPDE + NUPDPE)) {
-		pml4_entry_t *pml4;
+		pml4_entry_t *pml4, *pml4u;
 		vm_pindex_t pml4index;
 
 		/* Wire up a new PDPE page */
 		pml4index = ptepindex - (NUPDE + NUPDPE);
 		pml4 = &pmap->pm_pml4[pml4index];
 		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
+		if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
+			pml4u = &pmap->pm_pml4u[pml4index];
+			*pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
+			    PG_A | PG_M;
+		}
 
 	} else if (ptepindex >= NUPDE) {
 		vm_pindex_t pml4index;
@@ -2716,6 +2777,13 @@
 	m->wire_count--;
 	atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 	vm_page_free_zero(m);
+
+	if (pmap->pm_pml4u != NULL) {
+		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
+		m->wire_count--;
+		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
+		vm_page_free(m);
+	}
 }
 
 static int
@@ -7141,6 +7209,10 @@
 	} else if (cr3 != pmap->pm_cr3) {
 		load_cr3(pmap->pm_cr3);
 		PCPU_SET(curpmap, pmap);
+		if (pti) {
+			PCPU_SET(kcr3, pmap->pm_cr3);
+			PCPU_SET(ucr3, pmap->pm_ucr3);
+		}
 	}
 #ifdef SMP
 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
@@ -7464,6 +7536,291 @@
 	mtx_unlock_spin(&qframe_mtx);
 }
 
+static vm_page_t
+pmap_pti_alloc_page(void)
+{
+	vm_page_t m;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+	m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY |
+	    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
+	return (m);
+}
+
+static bool
+pmap_pti_free_page(vm_page_t m, bool last)
+{
+
+	m->wire_count--;
+	if (m->wire_count == 0 || last) {
+		KASSERT(m->wire_count == 0, ("page %p wired", m));
+		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
+		vm_page_free_zero(m);
+		return (true);
+	}
+	return (false);
+}
+
+extern char kernphys[], etext[];
+
+static void
+pmap_pti_init(void)
+{
+	vm_page_t pml4_pg;
+	pdp_entry_t *pdpe;
+	vm_offset_t va;
+	int i;
+
+	if (!pti)
+		return;
+	pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
+	VM_OBJECT_WLOCK(pti_obj);
+	pml4_pg = pmap_pti_alloc_page();
+	pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
+	for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
+	    va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
+		pdpe = pmap_pti_pdpe(va);
+		pmap_pti_wire_pte(pdpe);
+	}
+	pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
+	    (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
+	pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt +
+	    sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false);
+	pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
+	    sizeof(struct gate_descriptor) * NIDT, false);
+	pmap_pti_add_kva_locked((vm_offset_t)common_tss,
+	    (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false);
+	CPU_FOREACH(i) {
+		/* Doublefault stack IST 1 */
+		va = common_tss[i].tss_ist1;
+		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
+		/* NMI stack IST 2 */
+		va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
+		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
+	}
+	pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
+	    (vm_offset_t)etext, true);
+	pti_finalized = true;
+	VM_OBJECT_WUNLOCK(pti_obj);
+}
+SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL);
+
+static pdp_entry_t *
+pmap_pti_pdpe(vm_offset_t va)
+{
+	pml4_entry_t *pml4e;
+	pdp_entry_t *pdpe;
+	vm_page_t m;
+	vm_pindex_t pml4_idx;
+	vm_paddr_t mphys;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+
+	pml4_idx = pmap_pml4e_index(va);
+	pml4e = &pti_pml4[pml4_idx];
+	m = NULL;
+	if (*pml4e == 0) {
+		if (pti_finalized)
+			panic("pml4 alloc after finalization\n");
+		m = pmap_pti_alloc_page();
+		if (*pml4e != 0) {
+			pmap_pti_free_page(m, true);
+			mphys = *pml4e & ~PAGE_MASK;
+		} else {
+			mphys = VM_PAGE_TO_PHYS(m);
+			*pml4e = mphys | X86_PG_RW | X86_PG_V;
+		}
+	} else {
+		mphys = *pml4e & ~PAGE_MASK;
+	}
+	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
+	return (pdpe);
+}
+
+static void
+pmap_pti_wire_pte(void *pte)
+{
+	vm_page_t m;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
+	m->wire_count++;
+}
+
+static void
+pmap_pti_unwire_pde(void *pde, bool only_ref)
+{
+	vm_page_t m;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
+	MPASS(m->wire_count > 0);
+	MPASS(only_ref || m->wire_count > 1);
+	pmap_pti_free_page(m, false);
+}
+
+static void
+pmap_pti_unwire_pte(void *pte, vm_offset_t va)
+{
+	vm_page_t m;
+	pd_entry_t *pde;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
+	MPASS(m->wire_count > 0);
+	if (pmap_pti_free_page(m, false)) {
+		pde = pmap_pti_pde(va);
+		MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
+		*pde = 0;
+		pmap_pti_unwire_pde(pde, false);
+	}
+}
+
+static pd_entry_t *
+pmap_pti_pde(vm_offset_t va)
+{
+	pdp_entry_t *pdpe;
+	pd_entry_t *pde;
+	vm_page_t m;
+	vm_pindex_t pd_idx;
+	vm_paddr_t mphys;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+
+	pdpe = pmap_pti_pdpe(va);
+	if (*pdpe == 0) {
+		m = pmap_pti_alloc_page();
+		if (*pdpe != 0) {
+			pmap_pti_free_page(m, true);
+			MPASS((*pdpe & X86_PG_PS) == 0);
+			mphys = *pdpe & ~PAGE_MASK;
+		} else {
+			mphys = VM_PAGE_TO_PHYS(m);
+			*pdpe = mphys | X86_PG_RW | X86_PG_V;
+		}
+	} else {
+		MPASS((*pdpe & X86_PG_PS) == 0);
+		mphys = *pdpe & ~PAGE_MASK;
+	}
+
+	pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
+	pd_idx = pmap_pde_index(va);
+	pde += pd_idx;
+	return (pde);
+}
+
+static pt_entry_t *
+pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
+{
+	pd_entry_t *pde;
+	pt_entry_t *pte;
+	vm_page_t m;
+	vm_paddr_t mphys;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+
+	pde = pmap_pti_pde(va);
+	if (unwire_pde != NULL) {
+		*unwire_pde = true;
+		pmap_pti_wire_pte(pde);
+	}
+	if (*pde == 0) {
+		m = pmap_pti_alloc_page();
+		if (*pde != 0) {
+			pmap_pti_free_page(m, true);
+			MPASS((*pde & X86_PG_PS) == 0);
+			mphys = *pde & ~(PAGE_MASK | pg_nx);
+		} else {
+			mphys = VM_PAGE_TO_PHYS(m);
+			*pde = mphys | X86_PG_RW | X86_PG_V;
+			if (unwire_pde != NULL)
+				*unwire_pde = false;
+		}
+	} else {
+		MPASS((*pde & X86_PG_PS) == 0);
+		mphys = *pde & ~(PAGE_MASK | pg_nx);
+	}
+
+	pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
+	pte += pmap_pte_index(va);
+
+	return (pte);
+}
+
+static void
+pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
+{
+	vm_paddr_t pa;
+	pd_entry_t *pde;
+	pt_entry_t *pte, ptev;
+	bool unwire_pde;
+
+	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+
+	sva = trunc_page(sva);
+	MPASS(sva > VM_MAXUSER_ADDRESS);
+	eva = round_page(eva);
+	MPASS(sva < eva);
+	for (; sva < eva; sva += PAGE_SIZE) {
+		pte = pmap_pti_pte(sva, &unwire_pde);
+		pa = pmap_kextract(sva);
+		ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A |
+		    (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
+		    VM_MEMATTR_DEFAULT, FALSE);
+		if (*pte == 0) {
+			pte_store(pte, ptev);
+			pmap_pti_wire_pte(pte);
+		} else {
+			KASSERT(!pti_finalized,
+			    ("pti overlap after fin %#lx %#lx %#lx",
+			    sva, *pte, ptev));
+			KASSERT(*pte == ptev,
+			    ("pti non-identical pte after fin %#lx %#lx %#lx",
+			    sva, *pte, ptev));
+		}
+		if (unwire_pde) {
+			pde = pmap_pti_pde(sva);
+			pmap_pti_unwire_pde(pde, true);
+		}
+	}
+}
+
+void
+pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
+{
+
+	if (!pti)
+		return;
+	VM_OBJECT_WLOCK(pti_obj);
+	pmap_pti_add_kva_locked(sva, eva, exec);
+	VM_OBJECT_WUNLOCK(pti_obj);
+}
+
+void
+pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
+{
+	pt_entry_t *pte;
+	vm_offset_t va;
+
+	if (!pti)
+		return;
+	sva = rounddown2(sva, PAGE_SIZE);
+	MPASS(sva > VM_MAXUSER_ADDRESS);
+	eva = roundup2(eva, PAGE_SIZE);
+	MPASS(sva < eva);
+	VM_OBJECT_WLOCK(pti_obj);
+	for (va = sva; va < eva; va += PAGE_SIZE) {
+		pte = pmap_pti_pte(va, NULL);
+		KASSERT((*pte & X86_PG_V) != 0,
+		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
+		    (u_long)pte, *pte));
+		pte_clear(pte);
+		pmap_pti_unwire_pte(pte, va);
+	}
+	pmap_invalidate_range(kernel_pmap, sva, eva);
+	VM_OBJECT_WUNLOCK(pti_obj);
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kdb.h>
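
pmap_pti_add_kva() and pmap_pti_remove_kva() are the exported contract:
any kernel memory that the user (trampoline) page table must map, such as
a per-process TSS with I/O bitmap or an LDT, is registered for the PTI
page table around its kmem lifetime, as the sys_machdep.c and
vm_machdep.c hunks below do.  The pairing, in outline:

	sz = ctob(IOPAGES + 1);
	tssp = (struct amd64tss *)kmem_malloc(kernel_arena, sz, M_WAITOK);
	pmap_pti_add_kva((vm_offset_t)tssp, (vm_offset_t)tssp + sz, false);
	/* ... descriptor in use ... */
	pmap_pti_remove_kva((vm_offset_t)tssp, (vm_offset_t)tssp + sz);
	kmem_free(kernel_arena, (vm_offset_t)tssp, sz);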
Index: sys/amd64/amd64/sys_machdep.c
===================================================================
--- sys/amd64/amd64/sys_machdep.c
+++ sys/amd64/amd64/sys_machdep.c
@@ -65,6 +65,9 @@
 
 #include <security/audit/audit.h>
 
+static void user_ldt_deref(struct proc_ldt *pldt);
+static void user_ldt_derefl(struct proc_ldt *pldt);
+
 #define	MAX_LD		8192
 
 int max_ldt_segment = 512;
@@ -83,8 +86,6 @@
 }
 SYSINIT(maxldt, SI_SUB_VM_CONF, SI_ORDER_ANY, max_ldt_segment_init, NULL);
 
-static void user_ldt_derefl(struct proc_ldt *pldt);
-
 #ifndef _SYS_SYSPROTO_H_
 struct sysarch_args {
 	int op;
@@ -361,7 +362,9 @@
 	pcb = td->td_pcb;
 	if (pcb->pcb_tssp == NULL) {
 		tssp = (struct amd64tss *)kmem_malloc(kernel_arena,
-		    ctob(IOPAGES+1), M_WAITOK);
+		    ctob(IOPAGES + 1), M_WAITOK);
+		pmap_pti_add_kva((vm_offset_t)tssp, (vm_offset_t)tssp +
+		    ctob(IOPAGES + 1), false);
 		iomap = (char *)&tssp[1];
 		memset(iomap, 0xff, IOPERM_BITMAP_SIZE);
 		critical_enter();
@@ -450,6 +453,8 @@
 	struct proc_ldt *pldt, *new_ldt;
 	struct mdproc *mdp;
 	struct soft_segment_descriptor sldt;
+	vm_offset_t sva;
+	vm_size_t sz;
 
 	mtx_assert(&dt_lock, MA_OWNED);
 	mdp = &p->p_md;
@@ -457,13 +462,13 @@
 		return (mdp->md_ldt);
 	mtx_unlock(&dt_lock);
 	new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK);
-	new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena,
-	     max_ldt_segment * sizeof(struct user_segment_descriptor),
-	     M_WAITOK | M_ZERO);
+	sz = max_ldt_segment * sizeof(struct user_segment_descriptor);
+	sva = kmem_malloc(kernel_arena, sz, M_WAITOK | M_ZERO);
+	new_ldt->ldt_base = (caddr_t)sva;
+	pmap_pti_add_kva(sva, sva + sz, false);
 	new_ldt->ldt_refcnt = 1;
-	sldt.ssd_base = (uint64_t)new_ldt->ldt_base;
-	sldt.ssd_limit = max_ldt_segment *
-	    sizeof(struct user_segment_descriptor) - 1;
+	sldt.ssd_base = sva;
+	sldt.ssd_limit = sz - 1;
 	sldt.ssd_type = SDT_SYSLDT;
 	sldt.ssd_dpl = SEL_KPL;
 	sldt.ssd_p = 1;
@@ -473,8 +478,8 @@
 	mtx_lock(&dt_lock);
 	pldt = mdp->md_ldt;
 	if (pldt != NULL && !force) {
-		kmem_free(kernel_arena, (vm_offset_t)new_ldt->ldt_base,
-		    max_ldt_segment * sizeof(struct user_segment_descriptor));
+		pmap_pti_remove_kva(sva, sva + sz);
+		kmem_free(kernel_arena, sva, sz);
 		free(new_ldt, M_SUBPROC);
 		return (pldt);
 	}
@@ -521,15 +526,19 @@
 static void
 user_ldt_derefl(struct proc_ldt *pldt)
 {
+	vm_offset_t sva;
+	vm_size_t sz;
 
 	if (--pldt->ldt_refcnt == 0) {
-		kmem_free(kernel_arena, (vm_offset_t)pldt->ldt_base,
-		    max_ldt_segment * sizeof(struct user_segment_descriptor));
+		sva = (vm_offset_t)pldt->ldt_base;
+		sz = max_ldt_segment * sizeof(struct user_segment_descriptor);
+		pmap_pti_remove_kva(sva, sva + sz);
+		kmem_free(kernel_arena, sva, sz);
 		free(pldt, M_SUBPROC);
 	}
 }
 
-void
+static void
 user_ldt_deref(struct proc_ldt *pldt)
 {
 
Index: sys/amd64/amd64/trap.c
===================================================================
--- sys/amd64/amd64/trap.c
+++ sys/amd64/amd64/trap.c
@@ -448,9 +448,28 @@
 			 * problem here and not have to check all the
 			 * selectors and pointers when the user changes
 			 * them.
+			 *
+			 * In the PTI case, the IRETQ faulted while the
+			 * kernel was using the PTI stack, and the
+			 * exception frame records an %rsp value that
+			 * points into that stack.  If we returned
+			 * normally to doreti_iret_fault, the trapframe
+			 * would be reconstructed on the PTI stack, and
+			 * calltrap() would run on it as well.  Given
+			 * the very limited PTI stack size, the kernel
+			 * would not survive for long.  Switch to the
+			 * normal thread stack for the trap handling.
+			 *
+			 * The magic '5' is the number of qwords
+			 * occupied by the hardware trap frame.
 			 */
 			if (frame->tf_rip == (long)doreti_iret) {
 				frame->tf_rip = (long)doreti_iret_fault;
+				if (pti && frame->tf_rsp == (uintptr_t)PCPU_PTR(
+				    pti_stack) + (PC_PTI_STACK_SZ - 5) *
+				    sizeof(register_t))
+					frame->tf_rsp = PCPU_GET(rsp0) - 5 *
+					    sizeof(register_t);
 				return;
 			}
 			if (frame->tf_rip == (long)ld_ds) {
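
The comparison and the fix-up above are plain arithmetic: the faulted
iretq frame occupied the top five qwords of the per-CPU PTI stack, and the
replacement %rsp points at an equivalent five-qword slot at the top of the
thread's kernel stack (variable names illustrative):

	old_rsp = (uintptr_t)PCPU_PTR(pti_stack) +
	    (PC_PTI_STACK_SZ - 5) * sizeof(register_t); /* where iretq faulted */
	new_rsp = PCPU_GET(rsp0) - 5 * sizeof(register_t); /* rebuilt frame */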
Index: sys/amd64/amd64/vm_machdep.c
===================================================================
--- sys/amd64/amd64/vm_machdep.c
+++ sys/amd64/amd64/vm_machdep.c
@@ -339,6 +339,8 @@
 	 * Clean TSS/iomap
 	 */
 	if (pcb->pcb_tssp != NULL) {
+		pmap_pti_remove_kva((vm_offset_t)pcb->pcb_tssp,
+		    (vm_offset_t)pcb->pcb_tssp + ctob(IOPAGES + 1));
 		kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_tssp,
 		    ctob(IOPAGES + 1));
 		pcb->pcb_tssp = NULL;
Index: sys/amd64/ia32/ia32_exception.S
===================================================================
--- sys/amd64/ia32/ia32_exception.S
+++ sys/amd64/ia32/ia32_exception.S
@@ -40,17 +40,19 @@
  * that it originated in supervisor mode and skip the swapgs.
  */
 	SUPERALIGN_TEXT
+IDTVEC(int0x80_syscall_pti)
+	PTI_UENTRY has_err=0
+	jmp	int0x80_syscall_common
+	SUPERALIGN_TEXT
 IDTVEC(int0x80_syscall)
 	swapgs
+int0x80_syscall_common:
 	pushq	$2			/* sizeof "int 0x80" */
 	subq	$TF_ERR,%rsp		/* skip over tf_trapno */
 	movq	%rdi,TF_RDI(%rsp)
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
 	sti
 	movq	%rsi,TF_RSI(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
Index: sys/amd64/ia32/ia32_syscall.c
===================================================================
--- sys/amd64/ia32/ia32_syscall.c
+++ sys/amd64/ia32/ia32_syscall.c
@@ -95,7 +95,8 @@
 
 #define	IDTVEC(name)	__CONCAT(X,name)
 
-extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(rsvd);
+extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(int0x80_syscall_pti),
+    IDTVEC(rsvd), IDTVEC(rsvd_pti);
 
 void ia32_syscall(struct trapframe *frame);	/* Called from asm code */
 
@@ -208,14 +209,16 @@
 ia32_syscall_enable(void *dummy)
 {
 
- 	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
+ 	setidt(IDT_SYSCALL, pti ? &IDTVEC(int0x80_syscall_pti) :
+	    &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
 }
 
 static void
 ia32_syscall_disable(void *dummy)
 {
 
- 	setidt(IDT_SYSCALL, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+ 	setidt(IDT_SYSCALL, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd),
+	    SDT_SYSIGT, SEL_KPL, 0);
 }
 
 SYSINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_enable, NULL);
Index: sys/amd64/include/asmacros.h
===================================================================
--- sys/amd64/include/asmacros.h
+++ sys/amd64/include/asmacros.h
@@ -1,9 +1,17 @@
+/* -*- mode: asm -*- */
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1993 The Regents of the University of California.
  * All rights reserved.
  *
+ * Copyright (c) 2018 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -145,69 +153,6 @@
 	popq	%rbp
 
 #ifdef LOCORE
-/*
- * Convenience macro for declaring interrupt entry points.
- */
-#define	IDTVEC(name)	ALIGN_TEXT; .globl __CONCAT(X,name); \
-			.type __CONCAT(X,name),@function; __CONCAT(X,name):
-
-/*
- * Macros to create and destroy a trap frame.
- */
-#define PUSH_FRAME							\
-	subq	$TF_RIP,%rsp ;	/* skip dummy tf_err and tf_trapno */	\
-	testb	$SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */	\
-	jz	1f ;		/* Yes, dont swapgs again */		\
-	swapgs ;							\
-1:	movq	%rdi,TF_RDI(%rsp) ;					\
-	movq	%rsi,TF_RSI(%rsp) ;					\
-	movq	%rdx,TF_RDX(%rsp) ;					\
-	movq	%rcx,TF_RCX(%rsp) ;					\
-	movq	%r8,TF_R8(%rsp) ;					\
-	movq	%r9,TF_R9(%rsp) ;					\
-	movq	%rax,TF_RAX(%rsp) ;					\
-	movq	%rbx,TF_RBX(%rsp) ;					\
-	movq	%rbp,TF_RBP(%rsp) ;					\
-	movq	%r10,TF_R10(%rsp) ;					\
-	movq	%r11,TF_R11(%rsp) ;					\
-	movq	%r12,TF_R12(%rsp) ;					\
-	movq	%r13,TF_R13(%rsp) ;					\
-	movq	%r14,TF_R14(%rsp) ;					\
-	movq	%r15,TF_R15(%rsp) ;					\
-	movw	%fs,TF_FS(%rsp) ;					\
-	movw	%gs,TF_GS(%rsp) ;					\
-	movw	%es,TF_ES(%rsp) ;					\
-	movw	%ds,TF_DS(%rsp) ;					\
-	movl	$TF_HASSEGS,TF_FLAGS(%rsp) ;				\
-	cld ;								\
-	testb	$SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel ? */	\
-	jz	2f ;		/* yes, leave PCB_FULL_IRET alone */	\
-	movq	PCPU(CURPCB),%r8 ;					\
-	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r8) ;			\
-2:
-
-#define POP_FRAME							\
-	movq	TF_RDI(%rsp),%rdi ;					\
-	movq	TF_RSI(%rsp),%rsi ;					\
-	movq	TF_RDX(%rsp),%rdx ;					\
-	movq	TF_RCX(%rsp),%rcx ;					\
-	movq	TF_R8(%rsp),%r8 ;					\
-	movq	TF_R9(%rsp),%r9 ;					\
-	movq	TF_RAX(%rsp),%rax ;					\
-	movq	TF_RBX(%rsp),%rbx ;					\
-	movq	TF_RBP(%rsp),%rbp ;					\
-	movq	TF_R10(%rsp),%r10 ;					\
-	movq	TF_R11(%rsp),%r11 ;					\
-	movq	TF_R12(%rsp),%r12 ;					\
-	movq	TF_R13(%rsp),%r13 ;					\
-	movq	TF_R14(%rsp),%r14 ;					\
-	movq	TF_R15(%rsp),%r15 ;					\
-	testb	$SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */	\
-	jz	1f ;		/* keep kernel GS.base */		\
-	cli ;								\
-	swapgs ;							\
-1:	addq	$TF_RIP,%rsp	/* skip over tf_err, tf_trapno */
-
 /*
  * Access per-CPU data.
  */
@@ -216,6 +161,125 @@
 	movq %gs:PC_PRVSPACE, reg ;				\
 	addq $PC_ ## member, reg
 
+/*
+ * Convenience macro for declaring interrupt entry points.
+ */
+#define	IDTVEC(name)	ALIGN_TEXT; .globl __CONCAT(X,name); \
+			.type __CONCAT(X,name),@function; __CONCAT(X,name):
+
+	.macro	SAVE_SEGS
+	movw	%fs,TF_FS(%rsp)
+	movw	%gs,TF_GS(%rsp)
+	movw	%es,TF_ES(%rsp)
+	movw	%ds,TF_DS(%rsp)
+	.endm
+
+	.macro	MOVE_STACKS qw
+	offset=0
+	.rept	\qw
+	movq	offset(%rsp),%rdx
+	movq	%rdx,offset(%rax)
+	offset=offset+8
+	.endr
+	.endm
+
+	.macro	PTI_UENTRY has_err
+	swapgs
+	pushq	%rax
+	pushq	%rdx
+	movq	PCPU(KCR3),%rax
+	movq	%rax,%cr3
+	movq	PCPU(RSP0),%rax
+	subq	$PTI_SIZE,%rax
+	MOVE_STACKS	(PTI_SIZE / 8) - 1 + \has_err
+	movq	%rax,%rsp
+	popq	%rdx
+	popq	%rax
+	.endm
+
+	.macro	PTI_ENTRY name, cont, has_err=0
+	ALIGN_TEXT
+	.globl	X\name\()_pti
+	.type	X\name\()_pti,@function
+X\name\()_pti:
+	/* %rax, %rdx and possibly err not yet pushed */
+	testb	$SEL_RPL_MASK,PTI_CS-(2+1-\has_err)*8(%rsp)
+	jz	\cont
+	PTI_UENTRY \has_err
+	swapgs
+	jmp	\cont
+	.endm
+
+	.macro	PTI_INTRENTRY vec_name
+	SUPERALIGN_TEXT
+	.globl	X\vec_name\()_pti
+	.type	X\vec_name\()_pti,@function
+X\vec_name\()_pti:
+	testb	$SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* err, %rax, %rdx not pushed */
+	jz	\vec_name\()_u
+	PTI_UENTRY has_err=0
+	jmp	\vec_name\()_u
+	.endm
+
+	.macro	INTR_PUSH_FRAME vec_name
+	SUPERALIGN_TEXT
+	.globl	X\vec_name
+	.type	X\vec_name,@function
+X\vec_name:
+	testb	$SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* come from kernel? */
+	jz	\vec_name\()_u		/* Yes, don't swapgs again */
+	swapgs
+\vec_name\()_u:
+	subq	$TF_RIP,%rsp	/* skip dummy tf_err and tf_trapno */
+	movq	%rdi,TF_RDI(%rsp)
+	movq	%rsi,TF_RSI(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	movq	%r8,TF_R8(%rsp)
+	movq	%r9,TF_R9(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rbx,TF_RBX(%rsp)
+	movq	%rbp,TF_RBP(%rsp)
+	movq	%r10,TF_R10(%rsp)
+	movq	%r11,TF_R11(%rsp)
+	movq	%r12,TF_R12(%rsp)
+	movq	%r13,TF_R13(%rsp)
+	movq	%r14,TF_R14(%rsp)
+	movq	%r15,TF_R15(%rsp)
+	SAVE_SEGS
+	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
+	cld
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)  /* come from kernel? */
+	jz	1f		/* yes, leave PCB_FULL_IRET alone */
+	movq	PCPU(CURPCB),%r8
+	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r8)
+1:
+	.endm
+
+	.macro	INTR_HANDLER vec_name
+	.text
+	PTI_INTRENTRY	\vec_name
+	INTR_PUSH_FRAME	\vec_name
+	.endm
+
+	.macro	RESTORE_REGS
+	movq	TF_RDI(%rsp),%rdi
+	movq	TF_RSI(%rsp),%rsi
+	movq	TF_RDX(%rsp),%rdx
+	movq	TF_RCX(%rsp),%rcx
+	movq	TF_R8(%rsp),%r8
+	movq	TF_R9(%rsp),%r9
+	movq	TF_RAX(%rsp),%rax
+	movq	TF_RBX(%rsp),%rbx
+	movq	TF_RBP(%rsp),%rbp
+	movq	TF_R10(%rsp),%r10
+	movq	TF_R11(%rsp),%r11
+	movq	TF_R12(%rsp),%r12
+	movq	TF_R13(%rsp),%r13
+	movq	TF_R14(%rsp),%r14
+	movq	TF_R15(%rsp),%r15
+	.endm
+
 #endif /* LOCORE */
 
 #ifdef __STDC__
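PTI_UENTRY above is the core of the user-to-kernel transition: the CPU
delivers the event on the small per-CPU trampoline stack, which is mapped in
both page tables, the macro installs the kernel %cr3 from PCPU(KCR3), and
MOVE_STACKS then relocates the hardware frame plus the %rax/%rdx scratch
pushes onto the real kernel stack before %rsp is switched. A userland C model
of that copy, assuming PTI_SIZE is sizeof(struct pti_frame) and using
hypothetical names throughout:

	#include <stdint.h>

	#define	PTI_SIZE	(8 * 8)		/* eight 64-bit slots */

	/*
	 * Model of PTI_UENTRY/MOVE_STACKS: the trampoline stack holds
	 * %rdx, %rax, an optional error code and the five-word iret
	 * frame, i.e. (PTI_SIZE / 8) - 1 + has_err quadwords.
	 */
	static void
	pti_uentry_model(const uint64_t *pti_rsp, uint64_t *kstack_rsp0,
	    int has_err)
	{
		uint64_t *new_rsp;
		int qwords, i;

		new_rsp = kstack_rsp0 - PTI_SIZE / 8;	/* subq $PTI_SIZE,%rax */
		qwords = PTI_SIZE / 8 - 1 + has_err;
		for (i = 0; i < qwords; i++)		/* MOVE_STACKS */
			new_rsp[i] = pti_rsp[i];
		/* The macro then moves new_rsp into %rsp and pops %rdx, %rax. */
	}

When has_err is 0 the copy is seven quadwords, the top destination slot is
simply left unused, and after the two pops %rsp points at a normal iret-style
frame on the kernel stack.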
Index: sys/amd64/include/frame.h
===================================================================
--- sys/amd64/include/frame.h
+++ sys/amd64/include/frame.h
@@ -1,6 +1,49 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
- * This file is in the public domain.
+ * $FreeBSD$
  */
-/* $FreeBSD$ */
+
+#ifndef	_AMD64_FRAME_H
+#define	_AMD64_FRAME_H
 
 #include <x86/frame.h>
+
+struct pti_frame {
+	register_t	pti_rdx;
+	register_t	pti_rax;
+	register_t	pti_err;
+	register_t	pti_rip;
+	register_t	pti_cs;
+	register_t	pti_rflags;
+	register_t	pti_rsp;
+	register_t	pti_ss;
+};
+
+#endif
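The PTI_* offsets the entry macros use are generated from this structure via
assym.s. In particular PTI_CS is 4 * 8 = 32, which is what makes the
PTI_CS-3*8(%rsp) test in PTI_INTRENTRY correct before %rdx, %rax or an error
code have been pushed: only %rip then sits between %rsp and the saved %cs. A
standalone compile-time sketch of those assumptions (register_t taken as
64 bits here):

	#include <assert.h>
	#include <stddef.h>
	#include <stdint.h>

	typedef uint64_t register_t;	/* amd64 assumption for this sketch */

	struct pti_frame {
		register_t	pti_rdx;
		register_t	pti_rax;
		register_t	pti_err;
		register_t	pti_rip;
		register_t	pti_cs;
		register_t	pti_rflags;
		register_t	pti_rsp;
		register_t	pti_ss;
	};

	static_assert(sizeof(struct pti_frame) == 8 * 8, "PTI_SIZE");
	static_assert(offsetof(struct pti_frame, pti_cs) == 4 * 8, "PTI_CS");
	/* No %rdx, %rax or err pushed yet: %cs is 8 bytes above %rsp. */
	static_assert(offsetof(struct pti_frame, pti_cs) - 3 * 8 == 8,
	    "PTI_INTRENTRY %cs offset");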
Index: sys/amd64/include/pcpu.h
===================================================================
--- sys/amd64/include/pcpu.h
+++ sys/amd64/include/pcpu.h
@@ -35,6 +35,7 @@
 #error "sys/cdefs.h is a prerequisite for this file"
 #endif
 
+#define	PC_PTI_STACK_SZ	16
 /*
  * The SMP parts are setup in pmap.c and locore.s for the BSP, and
  * mp_machdep.c sets up the data for the AP's to "see" when they awake.
@@ -48,8 +49,11 @@
 	struct	pmap *pc_curpmap;					\
 	struct	amd64tss *pc_tssp;	/* TSS segment active on CPU */	\
 	struct	amd64tss *pc_commontssp;/* Common TSS for the CPU */	\
+	uint64_t pc_kcr3;						\
+	uint64_t pc_ucr3;						\
 	register_t pc_rsp0;						\
 	register_t pc_scratch_rsp;	/* User %rsp in syscall */	\
+	register_t pc_scratch_rax;					\
 	u_int	pc_apic_id;						\
 	u_int   pc_acpi_id;		/* ACPI CPU id */		\
 	/* Pointer to the CPU %fs descriptor */				\
@@ -63,12 +67,13 @@
 	uint64_t	pc_pm_save_cnt;					\
 	u_int	pc_cmci_mask;		/* MCx banks for CMCI */	\
 	uint64_t pc_dbreg[16];		/* ddb debugging regs */	\
+	uint64_t pc_pti_stack[PC_PTI_STACK_SZ];				\
 	int pc_dbreg_cmd;		/* ddb debugging reg cmd */	\
 	u_int	pc_vcpu_id;		/* Xen vCPU ID */		\
 	uint32_t pc_pcid_next;						\
 	uint32_t pc_pcid_gen;						\
 	uint32_t pc_smp_tlb_done;	/* TLB op acknowledgement */	\
-	char	__pad[384]		/* be divisor of PAGE_SIZE	\
+	char	__pad[232]		/* be divisor of PAGE_SIZE	\
 					   after cache alignment */
 
 #define	PC_DBREG_CMD_NONE	0
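The __pad shrink matches the new members exactly: pc_kcr3, pc_ucr3 and
pc_scratch_rax add 8 bytes each and pc_pti_stack adds PC_PTI_STACK_SZ * 8 =
128 bytes, 152 bytes in total, and 384 - 152 = 232, so the MD pcpu block
keeps its size and remains a divisor of PAGE_SIZE after cache alignment. As a
throwaway compile-time check:

	#include <assert.h>

	#define	PC_PTI_STACK_SZ	16

	/* Two %cr3 words plus pc_scratch_rax plus the PTI stack. */
	static_assert(8 + 8 + 8 + PC_PTI_STACK_SZ * 8 == 384 - 232,
	    "pcpu pad adjustment");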
Index: sys/amd64/include/pmap.h
===================================================================
--- sys/amd64/include/pmap.h
+++ sys/amd64/include/pmap.h
@@ -277,7 +277,7 @@
 
 #define	pde_store(pdep, pde)		pte_store(pdep, pde)
 
-extern pt_entry_t pg_nx;
+extern pt_entry_t pg_nx, pg_g;
 
 #endif /* _KERNEL */
 
@@ -315,7 +315,9 @@
 struct pmap {
 	struct mtx		pm_mtx;
 	pml4_entry_t		*pm_pml4;	/* KVA of level 4 page table */
+	pml4_entry_t		*pm_pml4u;	/* KVA of user l4 page table */
 	uint64_t		pm_cr3;
+	uint64_t		pm_ucr3;
 	TAILQ_HEAD(,pv_chunk)	pm_pvchunk;	/* list of mappings in pmap */
 	cpuset_t		pm_active;	/* active on cpus */
 	enum pmap_type		pm_type;	/* regular or nested tables */
@@ -429,6 +431,8 @@
 void	pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num);
 boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
 void	pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
+void	pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec);
+void	pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva);
 #endif /* _KERNEL */
 
 /* Return various clipped indexes for a given VA */
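A PTI-enabled pmap thus carries two page-table roots: pm_pml4/pm_cr3 map
everything and are active while in the kernel, while pm_pml4u/pm_ucr3 are
loaded on return to user mode and map user space plus only what the entry
trampolines need. pmap_pti_add_kva() is the hook that publishes a kernel
range into the user-visible table. A hedged in-kernel usage sketch, with
hypothetical linker symbols delimiting a trampoline:

	/* tramp_start/tramp_end are hypothetical symbols for this sketch. */
	extern char tramp_start[], tramp_end[];

	static void
	expose_trampoline(void)
	{

		pmap_pti_add_kva((vm_offset_t)tramp_start,
		    (vm_offset_t)tramp_end, true);	/* exec mapping */
	}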
Index: sys/amd64/include/proc.h
===================================================================
--- sys/amd64/include/proc.h
+++ sys/amd64/include/proc.h
@@ -92,7 +92,6 @@
 
 struct proc_ldt *user_ldt_alloc(struct proc *, int);
 void user_ldt_free(struct thread *);
-void user_ldt_deref(struct proc_ldt *);
 struct sysarch_args;
 int sysarch_ldt(struct thread *td, struct sysarch_args *uap, int uap_space);
 int amd64_set_ldt_data(struct thread *td, int start, int num,
Index: sys/amd64/include/smp.h
===================================================================
--- sys/amd64/include/smp.h
+++ sys/amd64/include/smp.h
@@ -30,7 +30,18 @@
 inthand_t
 	IDTVEC(invltlb_pcid),	/* TLB shootdowns - global, pcid */
 	IDTVEC(invltlb_invpcid),/* TLB shootdowns - global, invpcid */
-	IDTVEC(justreturn);	/* interrupt CPU with minimum overhead */
+	IDTVEC(justreturn),	/* interrupt CPU with minimum overhead */
+	IDTVEC(invltlb_pcid_pti),
+	IDTVEC(invltlb_invpcid_pti),
+	IDTVEC(justreturn1_pti),
+	IDTVEC(invltlb_pti),
+	IDTVEC(invlpg_pti),
+	IDTVEC(invlrng_pti),
+	IDTVEC(invlcache_pti),
+	IDTVEC(ipi_intr_bitmap_handler_pti),
+	IDTVEC(cpustop_pti),
+	IDTVEC(cpususpend_pti),
+	IDTVEC(rendezvous_pti);
 
 void	invltlb_pcid_handler(void);
 void	invltlb_invpcid_handler(void);
Index: sys/amd64/vmm/intel/vmx.c
===================================================================
--- sys/amd64/vmm/intel/vmx.c
+++ sys/amd64/vmm/intel/vmx.c
@@ -695,7 +695,8 @@
 		    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
 		    &tmp);
 		if (error == 0) {
-			pirvec = lapic_ipi_alloc(&IDTVEC(justreturn));
+			pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
+			    &IDTVEC(justreturn));
 			if (pirvec < 0) {
 				if (bootverbose) {
 					printf("vmx_init: unable to allocate "
Index: sys/amd64/vmm/vmm.c
===================================================================
--- sys/amd64/vmm/vmm.c
+++ sys/amd64/vmm/vmm.c
@@ -57,6 +57,7 @@
 #include <machine/cpu.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
+#include <machine/md_var.h>
 #include <x86/psl.h>
 #include <x86/apicreg.h>
 
@@ -327,7 +328,8 @@
 
 	vmm_host_state_init();
 
-	vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn));
+	vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
+	    &IDTVEC(justreturn));
 	if (vmm_ipinum < 0)
 		vmm_ipinum = IPI_AST;
 
Index: sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
===================================================================
--- sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
+++ sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
@@ -26,11 +26,11 @@
  * $FreeBSD$
  */
 
+#include "assym.s"
+
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 
-#include "assym.s"
-
 /*
  * This is the Hyper-V vmbus channel direct callback interrupt.
  * Only used when it is running on Hyper-V.
@@ -38,7 +38,4 @@
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(vmbus_isr)
-	PUSH_FRAME
+	INTR_HANDLER	vmbus_isr
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	vmbus_handle_intr
Index: sys/i386/i386/apic_vector.s
===================================================================
--- sys/i386/i386/apic_vector.s
+++ sys/i386/i386/apic_vector.s
@@ -70,6 +70,7 @@
 #define	ISR_VEC(index, vec_name)					\
 	.text ;								\
 	SUPERALIGN_TEXT ;						\
+IDTVEC(vec_name ## _pti) ;						\
 IDTVEC(vec_name) ;							\
 	PUSH_FRAME ;							\
 	SET_KERNEL_SREGS ;						\
@@ -123,6 +124,7 @@
  */
 	.text
 	SUPERALIGN_TEXT
+IDTVEC(timerint_pti)
 IDTVEC(timerint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
@@ -139,6 +141,7 @@
  */
 	.text
 	SUPERALIGN_TEXT
+IDTVEC(cmcint_pti)
 IDTVEC(cmcint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
@@ -153,6 +156,7 @@
  */
 	.text
 	SUPERALIGN_TEXT
+IDTVEC(errorint_pti)
 IDTVEC(errorint)
 	PUSH_FRAME
 	SET_KERNEL_SREGS
Index: sys/i386/i386/atpic_vector.s
===================================================================
--- sys/i386/i386/atpic_vector.s
+++ sys/i386/i386/atpic_vector.s
@@ -46,6 +46,7 @@
 #define	INTR(irq_num, vec_name) \
 	.text ;								\
 	SUPERALIGN_TEXT ;						\
+IDTVEC(vec_name ## _pti) ;						\
 IDTVEC(vec_name) ;							\
 	PUSH_FRAME ;							\
 	SET_KERNEL_SREGS ;						\
Index: sys/i386/i386/exception.s
===================================================================
--- sys/i386/i386/exception.s
+++ sys/i386/i386/exception.s
@@ -133,6 +133,7 @@
 	TRAP(T_PAGEFLT)
 IDTVEC(mchk)
 	pushl $0; TRAP(T_MCHK)
+IDTVEC(rsvd_pti)
 IDTVEC(rsvd)
 	pushl $0; TRAP(T_RESERVED)
 IDTVEC(fpu)
Index: sys/i386/i386/pmap.c
===================================================================
--- sys/i386/i386/pmap.c
+++ sys/i386/i386/pmap.c
@@ -280,6 +280,8 @@
 	   "Number of times pmap_pte_quick didn't change PMAP1");
 static struct mtx PMAP2mutex;
 
+int pti;
+
 static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
Index: sys/x86/include/apicvar.h
===================================================================
--- sys/x86/include/apicvar.h
+++ sys/x86/include/apicvar.h
@@ -185,7 +185,11 @@
 	IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
 	IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
 	IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
-	IDTVEC(spuriousint), IDTVEC(timerint);
+	IDTVEC(spuriousint), IDTVEC(timerint),
+	IDTVEC(apic_isr1_pti), IDTVEC(apic_isr2_pti), IDTVEC(apic_isr3_pti),
+	IDTVEC(apic_isr4_pti), IDTVEC(apic_isr5_pti), IDTVEC(apic_isr6_pti),
+	IDTVEC(apic_isr7_pti), IDTVEC(cmcint_pti), IDTVEC(errorint_pti),
+	IDTVEC(spuriousint_pti), IDTVEC(timerint_pti);
 
 extern vm_paddr_t lapic_paddr;
 extern int *apic_cpuids;
Index: sys/x86/include/x86_var.h
===================================================================
--- sys/x86/include/x86_var.h
+++ sys/x86/include/x86_var.h
@@ -83,6 +83,7 @@
 extern	int	use_xsave;
 extern	uint64_t xsave_mask;
 extern	u_int	max_apic_id;
+extern int	pti;
 
 struct	pcb;
 struct	thread;
Index: sys/x86/isa/atpic.c
===================================================================
--- sys/x86/isa/atpic.c
+++ sys/x86/isa/atpic.c
@@ -80,6 +80,16 @@
 	IDTVEC(atpic_intr9), IDTVEC(atpic_intr10), IDTVEC(atpic_intr11),
 	IDTVEC(atpic_intr12), IDTVEC(atpic_intr13), IDTVEC(atpic_intr14),
 	IDTVEC(atpic_intr15);
+/* XXXKIB i386 uses stubs until PTI support is implemented for it */
+inthand_t
+	IDTVEC(atpic_intr0_pti), IDTVEC(atpic_intr1_pti),
+	IDTVEC(atpic_intr2_pti), IDTVEC(atpic_intr3_pti),
+	IDTVEC(atpic_intr4_pti), IDTVEC(atpic_intr5_pti),
+	IDTVEC(atpic_intr6_pti), IDTVEC(atpic_intr7_pti),
+	IDTVEC(atpic_intr8_pti), IDTVEC(atpic_intr9_pti),
+	IDTVEC(atpic_intr10_pti), IDTVEC(atpic_intr11_pti),
+	IDTVEC(atpic_intr12_pti), IDTVEC(atpic_intr13_pti),
+	IDTVEC(atpic_intr14_pti), IDTVEC(atpic_intr15_pti);
 
 #define	IRQ(ap, ai)	((ap)->at_irqbase + (ai)->at_irq)
 
@@ -92,7 +102,7 @@
 
 #define	INTSRC(irq)							\
 	{ { &atpics[(irq) / 8].at_pic }, IDTVEC(atpic_intr ## irq ),	\
-	    (irq) % 8 }
+	    IDTVEC(atpic_intr ## irq ## _pti), (irq) % 8 }
 
 struct atpic {
 	struct pic at_pic;
@@ -104,7 +114,7 @@
 
 struct atpic_intsrc {
 	struct intsrc at_intsrc;
-	inthand_t *at_intr;
+	inthand_t *at_intr, *at_intr_pti;
 	int	at_irq;			/* Relative to PIC base. */
 	enum intr_trigger at_trigger;
 	u_long	at_count;
@@ -408,7 +418,8 @@
 		ai->at_intsrc.is_count = &ai->at_count;
 		ai->at_intsrc.is_straycount = &ai->at_straycount;
 		setidt(((struct atpic *)ai->at_intsrc.is_pic)->at_intbase +
-		    ai->at_irq, ai->at_intr, SDT_ATPIC, SEL_KPL, GSEL_ATPIC);
+		    ai->at_irq, pti ? ai->at_intr_pti : ai->at_intr, SDT_ATPIC,
+		    SEL_KPL, GSEL_ATPIC);
 	}
 
 	/*
Index: sys/x86/x86/local_apic.c
===================================================================
--- sys/x86/x86/local_apic.c
+++ sys/x86/x86/local_apic.c
@@ -171,13 +171,23 @@
 	IDTVEC(apic_isr7),	/* 224 - 255 */
 };
 
+static inthand_t *ioint_pti_handlers[] = {
+	NULL,			/* 0 - 31 */
+	IDTVEC(apic_isr1_pti),	/* 32 - 63 */
+	IDTVEC(apic_isr2_pti),	/* 64 - 95 */
+	IDTVEC(apic_isr3_pti),	/* 96 - 127 */
+	IDTVEC(apic_isr4_pti),	/* 128 - 159 */
+	IDTVEC(apic_isr5_pti),	/* 160 - 191 */
+	IDTVEC(apic_isr6_pti),	/* 192 - 223 */
+	IDTVEC(apic_isr7_pti),	/* 224 - 255 */
+};
 
 static u_int32_t lapic_timer_divisors[] = {
 	APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
 	APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
 };
 
-extern inthand_t IDTVEC(rsvd);
+extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd);
 
 volatile char *lapic_map;
 vm_paddr_t lapic_paddr;
@@ -496,15 +506,18 @@
 	PCPU_SET(apic_id, lapic_id());
 
 	/* Local APIC timer interrupt. */
-	setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC);
+	setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint),
+	    SDT_APIC, SEL_KPL, GSEL_APIC);
 
 	/* Local APIC error interrupt. */
-	setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);
+	setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint),
+	    SDT_APIC, SEL_KPL, GSEL_APIC);
 
 	/* XXX: Thermal interrupt */
 
 	/* Local APIC CMCI. */
-	setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);
+	setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint),
+	    SDT_APICT, SEL_KPL, GSEL_APIC);
 
 	if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
 		arat = 0;
@@ -1569,8 +1582,8 @@
 	KASSERT(vector != IDT_DTRACE_RET,
 	    ("Attempt to overwrite DTrace entry"));
 #endif
-	setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL,
-	    GSEL_APIC);
+	setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32],
+	    SDT_APIC, SEL_KPL, GSEL_APIC);
 }
 
 static void
@@ -1589,7 +1602,8 @@
 	 * We can not currently clear the idt entry because other cpus
 	 * may have a valid vector at this offset.
 	 */
-	setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
+	setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+	    SEL_KPL, GSEL_APIC);
 #endif
 }
 
@@ -2087,6 +2101,7 @@
  * APIC page is mapped as an uncached region. In x2APIC mode there is an
  * explicit 'mfence' before the ICR MSR is written. Therefore in both cases
  * the IDT slot update is globally visible before the IPI is delivered.
+ * XXX pti
  */
 static int
 native_lapic_ipi_alloc(inthand_t *ipifunc)
@@ -2095,7 +2110,8 @@
 	long func;
 	int idx, vector;
 
-	KASSERT(ipifunc != &IDTVEC(rsvd), ("invalid ipifunc %p", ipifunc));
+	KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti),
+	    ("invalid ipifunc %p", ipifunc));
 
 	vector = -1;
 	mtx_lock_spin(&icu_lock);
@@ -2124,8 +2140,10 @@
 	mtx_lock_spin(&icu_lock);
 	ip = &idt[vector];
 	func = (ip->gd_hioffset << 16) | ip->gd_looffset;
-	KASSERT(func != (uintptr_t)&IDTVEC(rsvd),
+	KASSERT(func != (uintptr_t)&IDTVEC(rsvd) &&
+	    func != (uintptr_t)&IDTVEC(rsvd_pti),
 	    ("invalid idtfunc %#lx", func));
-	setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
+	setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+	    SEL_KPL, GSEL_APIC);
 	mtx_unlock_spin(&icu_lock);
 }
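With PTI a vacant IDT slot may hold either Xrsvd or Xrsvd_pti, so both the
allocation-side and the release-side KASSERTs must compare against both
stubs. A hypothetical predicate, in the context of local_apic.c above,
capturing that check:

	/* Hypothetical helper mirroring the KASSERTs in ipi_alloc/ipi_free. */
	static bool
	idt_vector_is_free(uintptr_t func)
	{

		return (func == (uintptr_t)&IDTVEC(rsvd) ||
		    func == (uintptr_t)&IDTVEC(rsvd_pti));
	}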