Page MenuHomeFreeBSD

D13797.id37885.diff
No OneTemporary

D13797.id37885.diff

Index: sys/amd64/amd64/apic_vector.S
===================================================================
--- sys/amd64/amd64/apic_vector.S
+++ sys/amd64/amd64/apic_vector.S
@@ -38,12 +38,12 @@
#include "opt_smp.h"
+#include "assym.s"
+
#include <machine/asmacros.h>
#include <machine/specialreg.h>
#include <x86/apicreg.h>
-#include "assym.s"
-
#ifdef SMP
#define LK lock ;
#else
@@ -73,30 +73,28 @@
* translates that into a vector, and passes the vector to the
* lapic_handle_intr() function.
*/
-#define ISR_VEC(index, vec_name) \
- .text ; \
- SUPERALIGN_TEXT ; \
-IDTVEC(vec_name) ; \
- PUSH_FRAME ; \
- FAKE_MCOUNT(TF_RIP(%rsp)) ; \
- cmpl $0,x2apic_mode ; \
- je 1f ; \
- movl $(MSR_APIC_ISR0 + index),%ecx ; \
- rdmsr ; \
- jmp 2f ; \
-1: ; \
- movq lapic_map, %rdx ; /* pointer to local APIC */ \
- movl LA_ISR + 16 * (index)(%rdx), %eax ; /* load ISR */ \
-2: ; \
- bsrl %eax, %eax ; /* index of highest set bit in ISR */ \
- jz 3f ; \
- addl $(32 * index),%eax ; \
- movq %rsp, %rsi ; \
- movl %eax, %edi ; /* pass the IRQ */ \
- call lapic_handle_intr ; \
-3: ; \
- MEXITCOUNT ; \
+ .macro ISR_VEC index, vec_name
+ INTR_HANDLER \vec_name
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ cmpl $0,x2apic_mode
+ je 1f
+ movl $(MSR_APIC_ISR0 + \index),%ecx
+ rdmsr
+ jmp 2f
+1:
+ movq lapic_map, %rdx /* pointer to local APIC */
+ movl LA_ISR + 16 * (\index)(%rdx), %eax /* load ISR */
+2:
+ bsrl %eax, %eax /* index of highest set bit in ISR */
+ jz 3f
+ addl $(32 * \index),%eax
+ movq %rsp, %rsi
+ movl %eax, %edi /* pass the IRQ */
+ call lapic_handle_intr
+3:
+ MEXITCOUNT
jmp doreti
+ .endm
/*
* Handle "spurious INTerrupts".
@@ -108,26 +106,21 @@
.text
SUPERALIGN_TEXT
IDTVEC(spuriousint)
-
/* No EOI cycle used here */
-
jmp doreti_iret
- ISR_VEC(1, apic_isr1)
- ISR_VEC(2, apic_isr2)
- ISR_VEC(3, apic_isr3)
- ISR_VEC(4, apic_isr4)
- ISR_VEC(5, apic_isr5)
- ISR_VEC(6, apic_isr6)
- ISR_VEC(7, apic_isr7)
+ ISR_VEC 1, apic_isr1
+ ISR_VEC 2, apic_isr2
+ ISR_VEC 3, apic_isr3
+ ISR_VEC 4, apic_isr4
+ ISR_VEC 5, apic_isr5
+ ISR_VEC 6, apic_isr6
+ ISR_VEC 7, apic_isr7
/*
* Local APIC periodic timer handler.
*/
- .text
- SUPERALIGN_TEXT
-IDTVEC(timerint)
- PUSH_FRAME
+ INTR_HANDLER timerint
FAKE_MCOUNT(TF_RIP(%rsp))
movq %rsp, %rdi
call lapic_handle_timer
@@ -137,10 +130,7 @@
/*
* Local APIC CMCI handler.
*/
- .text
- SUPERALIGN_TEXT
-IDTVEC(cmcint)
- PUSH_FRAME
+ INTR_HANDLER cmcint
FAKE_MCOUNT(TF_RIP(%rsp))
call lapic_handle_cmc
MEXITCOUNT
@@ -149,10 +139,7 @@
/*
* Local APIC error interrupt handler.
*/
- .text
- SUPERALIGN_TEXT
-IDTVEC(errorint)
- PUSH_FRAME
+ INTR_HANDLER errorint
FAKE_MCOUNT(TF_RIP(%rsp))
call lapic_handle_error
MEXITCOUNT
@@ -163,10 +150,7 @@
* Xen event channel upcall interrupt handler.
* Only used when the hypervisor supports direct vector callbacks.
*/
- .text
- SUPERALIGN_TEXT
-IDTVEC(xen_intr_upcall)
- PUSH_FRAME
+ INTR_HANDLER xen_intr_upcall
FAKE_MCOUNT(TF_RIP(%rsp))
movq %rsp, %rdi
call xen_intr_handle_upcall
@@ -183,74 +167,48 @@
SUPERALIGN_TEXT
invltlb_ret:
call as_lapic_eoi
- POP_FRAME
- jmp doreti_iret
+ jmp ld_regs
SUPERALIGN_TEXT
-IDTVEC(invltlb)
- PUSH_FRAME
-
+ INTR_HANDLER invltlb
call invltlb_handler
jmp invltlb_ret
-IDTVEC(invltlb_pcid)
- PUSH_FRAME
-
+ INTR_HANDLER invltlb_pcid
call invltlb_pcid_handler
jmp invltlb_ret
-IDTVEC(invltlb_invpcid)
- PUSH_FRAME
-
+ INTR_HANDLER invltlb_invpcid
call invltlb_invpcid_handler
jmp invltlb_ret
/*
* Single page TLB shootdown
*/
- .text
-
- SUPERALIGN_TEXT
-IDTVEC(invlpg)
- PUSH_FRAME
-
+ INTR_HANDLER invlpg
call invlpg_handler
jmp invltlb_ret
/*
* Page range TLB shootdown.
*/
- .text
- SUPERALIGN_TEXT
-IDTVEC(invlrng)
- PUSH_FRAME
-
+ INTR_HANDLER invlrng
call invlrng_handler
jmp invltlb_ret
/*
* Invalidate cache.
*/
- .text
- SUPERALIGN_TEXT
-IDTVEC(invlcache)
- PUSH_FRAME
-
+ INTR_HANDLER invlcache
call invlcache_handler
jmp invltlb_ret
/*
* Handler for IPIs sent via the per-cpu IPI bitmap.
*/
- .text
- SUPERALIGN_TEXT
-IDTVEC(ipi_intr_bitmap_handler)
- PUSH_FRAME
-
+ INTR_HANDLER ipi_intr_bitmap_handler
call as_lapic_eoi
-
FAKE_MCOUNT(TF_RIP(%rsp))
-
call ipi_bitmap_handler
MEXITCOUNT
jmp doreti
@@ -258,24 +216,15 @@
/*
* Executed by a CPU when it receives an IPI_STOP from another CPU.
*/
- .text
- SUPERALIGN_TEXT
-IDTVEC(cpustop)
- PUSH_FRAME
-
+ INTR_HANDLER cpustop
call as_lapic_eoi
-
call cpustop_handler
jmp doreti
/*
* Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
*/
- .text
- SUPERALIGN_TEXT
-IDTVEC(cpususpend)
- PUSH_FRAME
-
+ INTR_HANDLER cpususpend
call cpususpend_handler
call as_lapic_eoi
jmp doreti
@@ -285,10 +234,7 @@
*
* - Calls the generic rendezvous action function.
*/
- .text
- SUPERALIGN_TEXT
-IDTVEC(rendezvous)
- PUSH_FRAME
+ INTR_HANDLER rendezvous
#ifdef COUNT_IPIS
movl PCPU(CPUID), %eax
movq ipi_rendezvous_counts(,%rax,8), %rax
@@ -328,4 +274,8 @@
popq %rax
jmp doreti_iret
+ INTR_HANDLER justreturn1
+ call as_lapic_eoi
+ jmp doreti
+
#endif /* SMP */
Index: sys/amd64/amd64/atpic_vector.S
===================================================================
--- sys/amd64/amd64/atpic_vector.S
+++ sys/amd64/amd64/atpic_vector.S
@@ -36,38 +36,36 @@
* master and slave interrupt controllers.
*/
-#include <machine/asmacros.h>
-
#include "assym.s"
+#include <machine/asmacros.h>
/*
* Macros for interrupt entry, call to handler, and exit.
*/
-#define INTR(irq_num, vec_name) \
- .text ; \
- SUPERALIGN_TEXT ; \
-IDTVEC(vec_name) ; \
- PUSH_FRAME ; \
- FAKE_MCOUNT(TF_RIP(%rsp)) ; \
- movq %rsp, %rsi ; \
- movl $irq_num, %edi; /* pass the IRQ */ \
- call atpic_handle_intr ; \
- MEXITCOUNT ; \
+ .macro INTR irq_num, vec_name
+ INTR_HANDLER \vec_name
+ INTR_PUSH_FRAME
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ movq %rsp, %rsi
+ movl $\irq_num, %edi /* pass the IRQ */
+ call atpic_handle_intr
+ MEXITCOUNT
jmp doreti
+ .endm
- INTR(0, atpic_intr0)
- INTR(1, atpic_intr1)
- INTR(2, atpic_intr2)
- INTR(3, atpic_intr3)
- INTR(4, atpic_intr4)
- INTR(5, atpic_intr5)
- INTR(6, atpic_intr6)
- INTR(7, atpic_intr7)
- INTR(8, atpic_intr8)
- INTR(9, atpic_intr9)
- INTR(10, atpic_intr10)
- INTR(11, atpic_intr11)
- INTR(12, atpic_intr12)
- INTR(13, atpic_intr13)
- INTR(14, atpic_intr14)
- INTR(15, atpic_intr15)
+ INTR 0, atpic_intr0
+ INTR 1, atpic_intr1
+ INTR 2, atpic_intr2
+ INTR 3, atpic_intr3
+ INTR 4, atpic_intr4
+ INTR 5, atpic_intr5
+ INTR 6, atpic_intr6
+ INTR 7, atpic_intr7
+ INTR 8, atpic_intr8
+ INTR 9, atpic_intr9
+ INTR 10, atpic_intr10
+ INTR 11, atpic_intr11
+ INTR 12, atpic_intr12
+ INTR 13, atpic_intr13
+ INTR 14, atpic_intr14
+ INTR 15, atpic_intr15
Index: sys/amd64/amd64/cpu_switch.S
===================================================================
--- sys/amd64/amd64/cpu_switch.S
+++ sys/amd64/amd64/cpu_switch.S
@@ -215,8 +215,10 @@
movq %r8,PCPU(RSP0)
movq %r8,PCPU(CURPCB)
/* Update the TSS_RSP0 pointer for the next interrupt */
+ cmpb $0,pti(%rip)
+ jne 1f
movq %r8,TSS_RSP0(%rdx)
- movq %r12,PCPU(CURTHREAD) /* into next thread */
+1: movq %r12,PCPU(CURTHREAD) /* into next thread */
/* Test if debug registers should be restored. */
testl $PCB_DBREGS,PCB_FLAGS(%r8)
@@ -293,7 +295,12 @@
shrq $8,%rcx
movl %ecx,8(%rax)
movb $0x89,5(%rax) /* unset busy */
- movl $TSSSEL,%eax
+ cmpb $0,pti(%rip)
+ je 1f
+ movq PCPU(PRVSPACE),%rax
+ addq $PC_PTI_STACK+PC_PTI_STACK_SZ*8,%rax
+ movq %rax,TSS_RSP0(%rdx)
+1: movl $TSSSEL,%eax
ltr %ax
jmp done_tss
Index: sys/amd64/amd64/exception.S
===================================================================
--- sys/amd64/amd64/exception.S
+++ sys/amd64/amd64/exception.S
@@ -38,13 +38,13 @@
#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"
+#include "assym.s"
+
#include <machine/asmacros.h>
#include <machine/psl.h>
#include <machine/trap.h>
#include <machine/specialreg.h>
-#include "assym.s"
-
#ifdef KDTRACE_HOOKS
.bss
.globl dtrace_invop_jump_addr
@@ -100,68 +100,62 @@
MCOUNT_LABEL(user)
MCOUNT_LABEL(btrap)
-/* Traps that we leave interrupts disabled for.. */
-#define TRAP_NOEN(a) \
- subq $TF_RIP,%rsp; \
- movl $(a),TF_TRAPNO(%rsp) ; \
- movq $0,TF_ADDR(%rsp) ; \
- movq $0,TF_ERR(%rsp) ; \
+/* Traps that we leave interrupts disabled for. */
+ .macro TRAP_NOEN l, trapno
+ PTI_ENTRY \l,X\l
+ .globl X\l
+ .type X\l,@function
+X\l: subq $TF_RIP,%rsp
+ movl $\trapno,TF_TRAPNO(%rsp)
+ movq $0,TF_ADDR(%rsp)
+ movq $0,TF_ERR(%rsp)
jmp alltraps_noen
-IDTVEC(dbg)
- TRAP_NOEN(T_TRCTRAP)
-IDTVEC(bpt)
- TRAP_NOEN(T_BPTFLT)
+ .endm
+
+ TRAP_NOEN dbg, T_TRCTRAP
+ TRAP_NOEN bpt, T_BPTFLT
#ifdef KDTRACE_HOOKS
-IDTVEC(dtrace_ret)
- TRAP_NOEN(T_DTRACE_RET)
+ TRAP_NOEN dtrace_ret, T_DTRACE_RET
#endif
/* Regular traps; The cpu does not supply tf_err for these. */
-#define TRAP(a) \
- subq $TF_RIP,%rsp; \
- movl $(a),TF_TRAPNO(%rsp) ; \
- movq $0,TF_ADDR(%rsp) ; \
- movq $0,TF_ERR(%rsp) ; \
+ .macro TRAP l, trapno
+ PTI_ENTRY \l,X\l
+ .globl X\l
+ .type X\l,@function
+X\l:
+ subq $TF_RIP,%rsp
+ movl $\trapno,TF_TRAPNO(%rsp)
+ movq $0,TF_ADDR(%rsp)
+ movq $0,TF_ERR(%rsp)
jmp alltraps
-IDTVEC(div)
- TRAP(T_DIVIDE)
-IDTVEC(ofl)
- TRAP(T_OFLOW)
-IDTVEC(bnd)
- TRAP(T_BOUND)
-IDTVEC(ill)
- TRAP(T_PRIVINFLT)
-IDTVEC(dna)
- TRAP(T_DNA)
-IDTVEC(fpusegm)
- TRAP(T_FPOPFLT)
-IDTVEC(mchk)
- TRAP(T_MCHK)
-IDTVEC(rsvd)
- TRAP(T_RESERVED)
-IDTVEC(fpu)
- TRAP(T_ARITHTRAP)
-IDTVEC(xmm)
- TRAP(T_XMMFLT)
-
-/* This group of traps have tf_err already pushed by the cpu */
-#define TRAP_ERR(a) \
- subq $TF_ERR,%rsp; \
- movl $(a),TF_TRAPNO(%rsp) ; \
- movq $0,TF_ADDR(%rsp) ; \
+ .endm
+
+ TRAP div, T_DIVIDE
+ TRAP ofl, T_OFLOW
+ TRAP bnd, T_BOUND
+ TRAP ill, T_PRIVINFLT
+ TRAP dna, T_DNA
+ TRAP fpusegm, T_FPOPFLT
+ TRAP mchk, T_MCHK
+ TRAP rsvd, T_RESERVED
+ TRAP fpu, T_ARITHTRAP
+ TRAP xmm, T_XMMFLT
+
+/* This group of traps have tf_err already pushed by the cpu. */
+ .macro TRAP_ERR l, trapno
+ PTI_ENTRY \l,X\l,has_err=1
+ .globl X\l
+ .type X\l,@function
+X\l:
+ subq $TF_ERR,%rsp
+ movl $\trapno,TF_TRAPNO(%rsp)
+ movq $0,TF_ADDR(%rsp)
jmp alltraps
-IDTVEC(tss)
- TRAP_ERR(T_TSSFLT)
-IDTVEC(missing)
- subq $TF_ERR,%rsp
- movl $T_SEGNPFLT,TF_TRAPNO(%rsp)
- jmp prot_addrf
-IDTVEC(stk)
- subq $TF_ERR,%rsp
- movl $T_STKFLT,TF_TRAPNO(%rsp)
- jmp prot_addrf
-IDTVEC(align)
- TRAP_ERR(T_ALIGNFLT)
+ .endm
+
+ TRAP_ERR tss, T_TSSFLT
+ TRAP_ERR align, T_ALIGNFLT
/*
* alltraps entry point. Use swapgs if this is the first time in the
@@ -174,15 +168,12 @@
alltraps:
movq %rdi,TF_RDI(%rsp)
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
- jz alltraps_testi /* already running with kernel GS.base */
+ jz alltraps_segs /* already running with kernel GS.base */
swapgs
movq PCPU(CURPCB),%rdi
andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi)
- movw %fs,TF_FS(%rsp)
- movw %gs,TF_GS(%rsp)
- movw %es,TF_ES(%rsp)
- movw %ds,TF_DS(%rsp)
-alltraps_testi:
+alltraps_segs:
+ SAVE_SEGS
testl $PSL_I,TF_RFLAGS(%rsp)
jz alltraps_pushregs_no_rdi
sti
@@ -249,14 +240,12 @@
alltraps_noen:
movq %rdi,TF_RDI(%rsp)
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
- jz 1f /* already running with kernel GS.base */
+ jz alltraps_noen_segs /* already running with kernel GS.base */
swapgs
movq PCPU(CURPCB),%rdi
andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-1: movw %fs,TF_FS(%rsp)
- movw %gs,TF_GS(%rsp)
- movw %es,TF_ES(%rsp)
- movw %ds,TF_DS(%rsp)
+alltraps_noen_segs:
+ SAVE_SEGS
jmp alltraps_pushregs_no_rdi
IDTVEC(dblfault)
@@ -279,37 +268,36 @@
movq %r13,TF_R13(%rsp)
movq %r14,TF_R14(%rsp)
movq %r15,TF_R15(%rsp)
- movw %fs,TF_FS(%rsp)
- movw %gs,TF_GS(%rsp)
- movw %es,TF_ES(%rsp)
- movw %ds,TF_DS(%rsp)
+ SAVE_SEGS
movl $TF_HASSEGS,TF_FLAGS(%rsp)
cld
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
jz 1f /* already running with kernel GS.base */
swapgs
1:
- movq %rsp,%rdi
+ movq PCPU(KCR3),%rax
+ cmpq $~0,%rax
+ jne 2f
+ movq %rax,%cr3
+2: movq %rsp,%rdi
call dblfault_handler
-2:
- hlt
- jmp 2b
+3: hlt
+ jmp 3b
+ PTI_ENTRY page, Xpage, has_err=1
IDTVEC(page)
subq $TF_ERR,%rsp
- movl $T_PAGEFLT,TF_TRAPNO(%rsp)
movq %rdi,TF_RDI(%rsp) /* free up a GP register */
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
- jz 1f /* already running with kernel GS.base */
+ jz page_cr2 /* already running with kernel GS.base */
swapgs
movq PCPU(CURPCB),%rdi
andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-1: movq %cr2,%rdi /* preserve %cr2 before .. */
+page_cr2:
+ movq %cr2,%rdi /* preserve %cr2 before .. */
movq %rdi,TF_ADDR(%rsp) /* enabling interrupts. */
- movw %fs,TF_FS(%rsp)
- movw %gs,TF_GS(%rsp)
- movw %es,TF_ES(%rsp)
- movw %ds,TF_DS(%rsp)
+ SAVE_SEGS
+ movl $T_PAGEFLT,TF_TRAPNO(%rsp)
testl $PSL_I,TF_RFLAGS(%rsp)
jz alltraps_pushregs_no_rdi
sti
@@ -320,10 +308,43 @@
* the iretq stage, we'll reenter with the wrong gs state. We'll have
* to do a special the swapgs in this case even coming from the kernel.
* XXX linux has a trap handler for their equivalent of load_gs().
+ *
+ * On the stack, we have the hardware interrupt frame to return
+ * to usermode (faulted) and another frame with error code, for
+ * fault. For PTI, copy both frames to the main thread stack.
*/
-IDTVEC(prot)
+ .macro PROTF_ENTRY name,trapno
+\name\()_pti_doreti:
+ pushq %rax
+ pushq %rdx
+ swapgs
+ movq PCPU(KCR3),%rax
+ movq %rax,%cr3
+ movq PCPU(RSP0),%rax
+ subq $2*PTI_SIZE-3*8,%rax
+ MOVE_STACKS (PTI_SIZE / 4 - 3)
+ movq %rax,%rsp
+ popq %rdx
+ popq %rax
+ swapgs
+ jmp X\name
+IDTVEC(\name\()_pti)
+ cmpq $doreti_iret,PTI_RIP-2*8(%rsp)
+ je \name\()_pti_doreti
+ testb $SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */
+ jz X\name
+ PTI_UENTRY has_err=1
+ swapgs
+IDTVEC(\name)
subq $TF_ERR,%rsp
- movl $T_PROTFLT,TF_TRAPNO(%rsp)
+ movl $\trapno,TF_TRAPNO(%rsp)
+ jmp prot_addrf
+ .endm
+
+ PROTF_ENTRY missing, T_SEGNPFLT
+ PROTF_ENTRY stk, T_STKFLT
+ PROTF_ENTRY prot, T_PROTFLT
+
prot_addrf:
movq $0,TF_ADDR(%rsp)
movq %rdi,TF_RDI(%rsp) /* free up a GP register */
@@ -375,8 +396,18 @@
* We do not support invoking this from a custom segment registers,
* esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT.
*/
+ SUPERALIGN_TEXT
+IDTVEC(fast_syscall_pti)
+ swapgs
+ movq %rax,PCPU(SCRATCH_RAX)
+ movq PCPU(KCR3),%rax
+ movq %rax,%cr3
+ jmp fast_syscall_common
+ SUPERALIGN_TEXT
IDTVEC(fast_syscall)
swapgs
+ movq %rax,PCPU(SCRATCH_RAX)
+fast_syscall_common:
movq %rsp,PCPU(SCRATCH_RSP)
movq PCPU(RSP0),%rsp
/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
@@ -386,10 +417,9 @@
movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */
movq PCPU(SCRATCH_RSP),%r11 /* %r11 already saved */
movq %r11,TF_RSP(%rsp) /* user stack pointer */
- movw %fs,TF_FS(%rsp)
- movw %gs,TF_GS(%rsp)
- movw %es,TF_ES(%rsp)
- movw %ds,TF_DS(%rsp)
+ movq PCPU(SCRATCH_RAX),%rax
+ movq %rax,TF_RAX(%rsp) /* syscall number */
+ SAVE_SEGS
movq PCPU(CURPCB),%r11
andl $~PCB_FULL_IRET,PCB_FLAGS(%r11)
sti
@@ -402,7 +432,6 @@
movq %r10,TF_RCX(%rsp) /* arg 4 */
movq %r8,TF_R8(%rsp) /* arg 5 */
movq %r9,TF_R9(%rsp) /* arg 6 */
- movq %rax,TF_RAX(%rsp) /* syscall number */
movq %rbx,TF_RBX(%rsp) /* C preserved */
movq %rbp,TF_RBP(%rsp) /* C preserved */
movq %r12,TF_R12(%rsp) /* C preserved */
@@ -420,11 +449,11 @@
/* Disable interrupts before testing PCB_FULL_IRET. */
cli
testl $PCB_FULL_IRET,PCB_FLAGS(%rax)
- jnz 3f
+ jnz 4f
/* Check for and handle AST's on return to userland. */
movq PCPU(CURTHREAD),%rax
testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
- jne 2f
+ jne 3f
/* Restore preserved registers. */
MEXITCOUNT
movq TF_RDI(%rsp),%rdi /* bonus; preserve arg 1 */
@@ -434,16 +463,21 @@
movq TF_RFLAGS(%rsp),%r11 /* original %rflags */
movq TF_RIP(%rsp),%rcx /* original %rip */
movq TF_RSP(%rsp),%rsp /* user stack pointer */
- swapgs
+ cmpb $0,pti
+ je 2f
+ movq PCPU(UCR3),%r9
+ movq %r9,%cr3
+ xorl %r9d,%r9d
+2: swapgs
sysretq
-2: /* AST scheduled. */
+3: /* AST scheduled. */
sti
movq %rsp,%rdi
call ast
jmp 1b
-3: /* Requested full context restore, use doreti for that. */
+4: /* Requested full context restore, use doreti for that. */
MEXITCOUNT
jmp doreti
@@ -499,17 +533,15 @@
movq %r13,TF_R13(%rsp)
movq %r14,TF_R14(%rsp)
movq %r15,TF_R15(%rsp)
- movw %fs,TF_FS(%rsp)
- movw %gs,TF_GS(%rsp)
- movw %es,TF_ES(%rsp)
- movw %ds,TF_DS(%rsp)
+ SAVE_SEGS
movl $TF_HASSEGS,TF_FLAGS(%rsp)
cld
xorl %ebx,%ebx
testb $SEL_RPL_MASK,TF_CS(%rsp)
jnz nmi_fromuserspace
/*
- * We've interrupted the kernel. Preserve GS.base in %r12.
+ * We've interrupted the kernel. Preserve GS.base in %r12
+ * and %cr3 in %r13.
*/
movl $MSR_GSBASE,%ecx
rdmsr
@@ -521,27 +553,38 @@
movl %edx,%eax
shrq $32,%rdx
wrmsr
+ movq %cr3,%r13
+ movq PCPU(KCR3),%rax
+ cmpq $~0,%rax
+ je nmi_calltrap
+ movq %rax,%cr3
jmp nmi_calltrap
nmi_fromuserspace:
incl %ebx
swapgs
- testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
- jz 2f
+ movq %cr3,%r13
+ movq PCPU(KCR3),%rax
+ cmpq $~0,%rax
+ je 1f
+ movq %rax,%cr3
movq PCPU(CURPCB),%rdi
testq %rdi,%rdi
- jz 2f
+ jz 3f
+ orl $PCB_FULL_IRET,PCB_FLAGS(%rdi)
+1: testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
+ jz 3f
cmpw $KUF32SEL,TF_FS(%rsp)
- jne 1f
+ jne 2f
rdfsbase %rax
movq %rax,PCB_FSBASE(%rdi)
-1: cmpw $KUG32SEL,TF_GS(%rsp)
- jne 2f
+2: cmpw $KUG32SEL,TF_GS(%rsp)
+ jne 3f
movl $MSR_KGSBASE,%ecx
rdmsr
shlq $32,%rdx
orq %rdx,%rax
movq %rax,PCB_GSBASE(%rdi)
-2:
+3:
/* Note: this label is also used by ddb and gdb: */
nmi_calltrap:
FAKE_MCOUNT(TF_RIP(%rsp))
@@ -564,26 +607,29 @@
movq PCPU(CURTHREAD),%rax
orq %rax,%rax /* curthread present? */
jz nocallchain
- testl $TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
- jz nocallchain
/*
- * A user callchain is to be captured, so:
- * - Move execution to the regular kernel stack, to allow for
- * nested NMI interrupts.
- * - Take the processor out of "NMI" mode by faking an "iret".
- * - Enable interrupts, so that copyin() can work.
+ * Move execution to the regular kernel stack, because we
+ * committed to return through doreti.
*/
movq %rsp,%rsi /* source stack pointer */
movq $TF_SIZE,%rcx
movq PCPU(RSP0),%rdx
subq %rcx,%rdx
movq %rdx,%rdi /* destination stack pointer */
-
shrq $3,%rcx /* trap frame size in long words */
cld
rep
movsq /* copy trapframe */
+ movq %rdx,%rsp /* we are on the regular kstack */
+ testl $TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
+ jz nocallchain
+ /*
+ * A user callchain is to be captured, so:
+ * - Take the processor out of "NMI" mode by faking an "iret",
+ * to allow for nested NMI interrupts.
+ * - Enable interrupts, so that copyin() can work.
+ */
movl %ss,%eax
pushq %rax /* tf_ss */
pushq %rdx /* tf_rsp (on kernel stack) */
@@ -624,22 +670,9 @@
movl %edx,%eax
shrq $32,%rdx
wrmsr
+ movq %r13,%cr3
nmi_restoreregs:
- movq TF_RDI(%rsp),%rdi
- movq TF_RSI(%rsp),%rsi
- movq TF_RDX(%rsp),%rdx
- movq TF_RCX(%rsp),%rcx
- movq TF_R8(%rsp),%r8
- movq TF_R9(%rsp),%r9
- movq TF_RAX(%rsp),%rax
- movq TF_RBX(%rsp),%rbx
- movq TF_RBP(%rsp),%rbp
- movq TF_R10(%rsp),%r10
- movq TF_R11(%rsp),%r11
- movq TF_R12(%rsp),%r12
- movq TF_R13(%rsp),%r13
- movq TF_R14(%rsp),%r14
- movq TF_R15(%rsp),%r15
+ RESTORE_REGS
addq $TF_RIP,%rsp
jmp doreti_iret
@@ -807,27 +840,38 @@
ld_ds:
movw TF_DS(%rsp),%ds
ld_regs:
- movq TF_RDI(%rsp),%rdi
- movq TF_RSI(%rsp),%rsi
- movq TF_RDX(%rsp),%rdx
- movq TF_RCX(%rsp),%rcx
- movq TF_R8(%rsp),%r8
- movq TF_R9(%rsp),%r9
- movq TF_RAX(%rsp),%rax
- movq TF_RBX(%rsp),%rbx
- movq TF_RBP(%rsp),%rbp
- movq TF_R10(%rsp),%r10
- movq TF_R11(%rsp),%r11
- movq TF_R12(%rsp),%r12
- movq TF_R13(%rsp),%r13
- movq TF_R14(%rsp),%r14
- movq TF_R15(%rsp),%r15
+ RESTORE_REGS
testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
- jz 1f /* keep running with kernel GS.base */
+ jz 2f /* keep running with kernel GS.base */
cli
+ cmpb $0,pti
+ je 1f
+ pushq %rdx
+ movq PCPU(PRVSPACE),%rdx
+ addq $PC_PTI_STACK+PC_PTI_STACK_SZ*8-PTI_SIZE,%rdx
+ movq %rax,PTI_RAX(%rdx)
+ popq %rax
+ movq %rax,PTI_RDX(%rdx)
+ movq TF_RIP(%rsp),%rax
+ movq %rax,PTI_RIP(%rdx)
+ movq TF_CS(%rsp),%rax
+ movq %rax,PTI_CS(%rdx)
+ movq TF_RFLAGS(%rsp),%rax
+ movq %rax,PTI_RFLAGS(%rdx)
+ movq TF_RSP(%rsp),%rax
+ movq %rax,PTI_RSP(%rdx)
+ movq TF_SS(%rsp),%rax
+ movq %rax,PTI_SS(%rdx)
+ movq PCPU(UCR3),%rax
swapgs
-1:
- addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */
+ movq %rdx,%rsp
+ movq %rax,%cr3
+ popq %rdx
+ popq %rax
+ addq $8,%rsp
+ jmp doreti_iret
+1: swapgs
+2: addq $TF_RIP,%rsp
.globl doreti_iret
doreti_iret:
iretq
@@ -851,14 +895,11 @@
.globl doreti_iret_fault
doreti_iret_fault:
subq $TF_RIP,%rsp /* space including tf_err, tf_trapno */
- testl $PSL_I,TF_RFLAGS(%rsp)
+ testb $SEL_RPL_MASK,TF_CS(%rsp)
jz 1f
sti
1:
- movw %fs,TF_FS(%rsp)
- movw %gs,TF_GS(%rsp)
- movw %es,TF_ES(%rsp)
- movw %ds,TF_DS(%rsp)
+ SAVE_SEGS
movl $TF_HASSEGS,TF_FLAGS(%rsp)
movq %rdi,TF_RDI(%rsp)
movq %rsi,TF_RSI(%rsp)
@@ -885,7 +926,7 @@
.globl ds_load_fault
ds_load_fault:
movl $T_PROTFLT,TF_TRAPNO(%rsp)
- testl $PSL_I,TF_RFLAGS(%rsp)
+ testb $SEL_RPL_MASK,TF_CS(%rsp)
jz 1f
sti
1:
Index: sys/amd64/amd64/genassym.c
===================================================================
--- sys/amd64/amd64/genassym.c
+++ sys/amd64/amd64/genassym.c
@@ -186,6 +186,16 @@
ASSYM(TF_SIZE, sizeof(struct trapframe));
ASSYM(TF_HASSEGS, TF_HASSEGS);
+ASSYM(PTI_RDX, offsetof(struct pti_frame, pti_rdx));
+ASSYM(PTI_RAX, offsetof(struct pti_frame, pti_rax));
+ASSYM(PTI_ERR, offsetof(struct pti_frame, pti_err));
+ASSYM(PTI_RIP, offsetof(struct pti_frame, pti_rip));
+ASSYM(PTI_CS, offsetof(struct pti_frame, pti_cs));
+ASSYM(PTI_RFLAGS, offsetof(struct pti_frame, pti_rflags));
+ASSYM(PTI_RSP, offsetof(struct pti_frame, pti_rsp));
+ASSYM(PTI_SS, offsetof(struct pti_frame, pti_ss));
+ASSYM(PTI_SIZE, sizeof(struct pti_frame));
+
ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler));
ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc));
ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags));
@@ -202,6 +212,7 @@
ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb));
ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid));
ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp));
+ASSYM(PC_SCRATCH_RAX, offsetof(struct pcpu, pc_scratch_rax));
ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap));
ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp));
ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0));
@@ -211,6 +222,10 @@
ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp));
ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
+ASSYM(PC_KCR3, offsetof(struct pcpu, pc_kcr3));
+ASSYM(PC_UCR3, offsetof(struct pcpu, pc_ucr3));
+ASSYM(PC_PTI_STACK, offsetof(struct pcpu, pc_pti_stack));
+ASSYM(PC_PTI_STACK_SZ, PC_PTI_STACK_SZ);
ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL);
ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL);
Index: sys/amd64/amd64/machdep.c
===================================================================
--- sys/amd64/amd64/machdep.c
+++ sys/amd64/amd64/machdep.c
@@ -658,9 +658,9 @@
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
-static char dblfault_stack[PAGE_SIZE] __aligned(16);
+char dblfault_stack[PAGE_SIZE] __aligned(16);
-static char nmi0_stack[PAGE_SIZE] __aligned(16);
+char nmi0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);
struct amd64tss common_tss[MAXCPU];
@@ -813,13 +813,20 @@
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
IDTVEC(xmm), IDTVEC(dblfault),
+ IDTVEC(div_pti), IDTVEC(dbg_pti), IDTVEC(bpt_pti),
+ IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
+ IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
+ IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(mchk_pti),
+ IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
+ IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
- IDTVEC(dtrace_ret),
+ IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
- IDTVEC(xen_intr_upcall),
+ IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
- IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
+ IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
+ IDTVEC(fast_syscall_pti), IDTVEC(fast_syscall32_pti);
#ifdef DDB
/*
@@ -1520,7 +1527,8 @@
msr = rdmsr(MSR_EFER) | EFER_SCE;
wrmsr(MSR_EFER, msr);
- wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
+ wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
+ (u_int64_t)IDTVEC(fast_syscall));
wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
@@ -1536,6 +1544,7 @@
struct pcpu *pc;
struct nmi_pcpu *np;
struct xstate_hdr *xhdr;
+ u_int64_t rsp0;
char *env;
size_t kstack0_sz;
int late_console;
@@ -1609,34 +1618,54 @@
mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
/* exceptions */
+ TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
+
for (x = 0; x < NIDT; x++)
- setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IDT_DB, pti ? &IDTVEC(dbg_pti) : &IDTVEC(dbg), SDT_SYSIGT,
+ SEL_KPL, 0);
setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
- setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
- setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
+ SEL_UPL, 0);
+ setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
+ SEL_KPL, 0);
setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
- setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
+ SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
+ SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IDT_MC, pti ? &IDTVEC(mchk_pti) : &IDTVEC(mchk), SDT_SYSIGT,
+ SEL_KPL, 0);
+ setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
+ SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
- setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
+ setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
+ &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0);
#endif
-
r_idt.rd_limit = sizeof(idt0) - 1;
r_idt.rd_base = (long) idt;
lidt(&r_idt);
@@ -1750,10 +1779,12 @@
xhdr->xstate_bv = xsave_mask;
}
/* make an initial tss so cpu can get interrupt stack on syscall! */
- common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb;
+ rsp0 = (vm_offset_t)thread0.td_pcb;
/* Ensure the stack is aligned to 16 bytes */
- common_tss[0].tss_rsp0 &= ~0xFul;
- PCPU_SET(rsp0, common_tss[0].tss_rsp0);
+ rsp0 &= ~0xFul;
+ common_tss[0].tss_rsp0 = pti ? ((vm_offset_t)PCPU_PTR(pti_stack) +
+ PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : rsp0;
+ PCPU_SET(rsp0, rsp0);
PCPU_SET(curpcb, thread0.td_pcb);
/* transfer to user mode */
Index: sys/amd64/amd64/mp_machdep.c
===================================================================
--- sys/amd64/amd64/mp_machdep.c
+++ sys/amd64/amd64/mp_machdep.c
@@ -132,33 +132,40 @@
/* Install an inter-CPU IPI for TLB invalidation */
if (pmap_pcid_enabled) {
if (invpcid_works) {
- setidt(IPI_INVLTLB, IDTVEC(invltlb_invpcid),
- SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_invpcid_pti) :
+ IDTVEC(invltlb_invpcid), SDT_SYSIGT, SEL_KPL, 0);
} else {
- setidt(IPI_INVLTLB, IDTVEC(invltlb_pcid), SDT_SYSIGT,
- SEL_KPL, 0);
+ setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pcid_pti) :
+ IDTVEC(invltlb_pcid), SDT_SYSIGT, SEL_KPL, 0);
}
} else {
- setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pti) : IDTVEC(invltlb),
+ SDT_SYSIGT, SEL_KPL, 0);
}
- setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0);
- setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pti) : IDTVEC(invlpg),
+ SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pti) : IDTVEC(invlrng),
+ SDT_SYSIGT, SEL_KPL, 0);
/* Install an inter-CPU IPI for cache invalidation. */
- setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_INVLCACHE, pti ? IDTVEC(invlcache_pti) : IDTVEC(invlcache),
+ SDT_SYSIGT, SEL_KPL, 0);
/* Install an inter-CPU IPI for all-CPU rendezvous */
- setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_RENDEZVOUS, pti ? IDTVEC(rendezvous_pti) :
+ IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0);
/* Install generic inter-CPU IPI handler */
- setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler),
- SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_BITMAP_VECTOR, pti ? IDTVEC(ipi_intr_bitmap_handler_pti) :
+ IDTVEC(ipi_intr_bitmap_handler), SDT_SYSIGT, SEL_KPL, 0);
/* Install an inter-CPU IPI for CPU stop/restart */
- setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop),
+ SDT_SYSIGT, SEL_KPL, 0);
/* Install an inter-CPU IPI for CPU suspend/resume */
- setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend),
+ SDT_SYSIGT, SEL_KPL, 0);
/* Set boot_cpu_id if needed. */
if (boot_cpu_id == -1) {
@@ -197,7 +204,6 @@
/* Init tss */
common_tss[cpu] = common_tss[0];
- common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */
common_tss[cpu].tss_iobase = sizeof(struct amd64tss) +
IOPERM_BITMAP_SIZE;
common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE];
@@ -240,6 +246,8 @@
pc->pc_curpmap = kernel_pmap;
pc->pc_pcid_gen = 1;
pc->pc_pcid_next = PMAP_PCID_KERN + 1;
+ common_tss[cpu].tss_rsp0 = pti ? ((vm_offset_t)&pc->pc_pti_stack +
+ PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : 0;
/* Save the per-cpu pointer for use by the NMI handler. */
np->np_pcpu = (register_t) pc;
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -149,6 +149,7 @@
#ifdef SMP
#include <machine/smp.h>
#endif
+#include <machine/tss.h>
static __inline boolean_t
pmap_type_guest(pmap_t pmap)
@@ -217,7 +218,7 @@
switch (pmap->pm_type) {
case PT_X86:
- mask = X86_PG_G;
+ mask = pg_g;
break;
case PT_RVI:
case PT_EPT:
@@ -340,7 +341,7 @@
static int ndmpdp;
vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
-pt_entry_t pg_nx;
+pt_entry_t pg_nx, pg_g;
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
@@ -405,6 +406,15 @@
SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
"Is the invpcid instruction available ?");
+int pti = 1;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+ &pti, 0,
+ "Page Table Isolation enabled");
+static vm_object_t pti_obj;
+static pml4_entry_t *pti_pml4;
+static vm_pindex_t pti_pg_idx;
+static bool pti_finalized;
+
static int
pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
{
@@ -639,6 +649,12 @@
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
+static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
+ bool exec);
+static void pmap_pti_init(void);
+static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
+static pd_entry_t *pmap_pti_pde(vm_offset_t va);
+static void pmap_pti_wire_pte(void *pte);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
@@ -921,7 +937,7 @@
/* XXX not fully used, underneath 2M pages */
pt_p = (pt_entry_t *)KPTphys;
for (i = 0; ptoa(i) < *firstaddr; i++)
- pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G;
+ pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | pg_g;
/* Now map the page tables at their location within PTmap */
pd_p = (pd_entry_t *)KPDphys;
@@ -932,7 +948,7 @@
/* This replaces some of the KPTphys entries above */
for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
- X86_PG_G;
+ pg_g;
/* And connect up the PD to the PDP (leaving room for L4 pages) */
pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
@@ -952,14 +968,14 @@
for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
/* Preset PG_M and PG_A because demotion expects it. */
- pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
X86_PG_M | X86_PG_A | pg_nx;
}
pdp_p = (pdp_entry_t *)DMPDPphys;
for (i = 0; i < ndm1g; i++) {
pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
/* Preset PG_M and PG_A because demotion expects it. */
- pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G |
+ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
X86_PG_M | X86_PG_A | pg_nx;
}
for (j = 0; i < ndmpdp; i++, j++) {
@@ -1027,6 +1043,8 @@
load_cr3(KPML4phys);
if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
load_cr4(rcr4() | CR4_SMEP);
+ if (!pti)
+ pg_g = X86_PG_G;
/*
* Initialize the kernel pmap (which is statically allocated).
@@ -1071,6 +1089,8 @@
pmap_init_pat();
/* Initialize TLB Context Id. */
+ if (pti)
+ pmap_pcid_enabled = 0;
TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
/* Check for INVPCID support */
@@ -1088,8 +1108,6 @@
* during pcpu setup.
*/
load_cr4(rcr4() | CR4_PCIDE);
- } else {
- pmap_pcid_enabled = 0;
}
}
@@ -1285,6 +1303,8 @@
(vmem_addr_t *)&qframe);
if (error != 0)
panic("qframe allocation failed");
+
+ pmap_pti_init();
}
static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
@@ -2114,7 +2134,7 @@
pt_entry_t *pte;
pte = vtopte(va);
- pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G);
+ pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g);
}
static __inline void
@@ -2125,7 +2145,7 @@
pte = vtopte(va);
cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
- pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits);
+ pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits);
}
/*
@@ -2185,7 +2205,7 @@
pa = VM_PAGE_TO_PHYS(m) | cache_bits;
if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
oldpte |= *pte;
- pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V);
+ pte_store(pte, pa | pg_g | X86_PG_RW | X86_PG_V);
}
pte++;
}
@@ -2306,6 +2326,10 @@
pml4_entry_t *pml4;
pml4 = pmap_pml4e(pmap, va);
*pml4 = 0;
+ if (pti && va <= VM_MAXUSER_ADDRESS) {
+ pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
+ *pml4 = 0;
+ }
} else if (m->pindex >= NUPDE) {
/* PD page */
pdp_entry_t *pdp;
@@ -2364,7 +2388,9 @@
PMAP_LOCK_INIT(pmap);
pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
+ pmap->pm_pml4u = NULL;
pmap->pm_cr3 = KPML4phys;
+ pmap->pm_ucr3 = ~0UL;
pmap->pm_root.rt_root = 0;
CPU_ZERO(&pmap->pm_active);
TAILQ_INIT(&pmap->pm_pvchunk);
@@ -2373,6 +2399,8 @@
CPU_FOREACH(i) {
pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
pmap->pm_pcids[i].pm_gen = 0;
+ if (!pti)
+ __pcpu[i].pc_kcr3 = ~0ul;
}
PCPU_SET(curpmap, kernel_pmap);
pmap_activate(curthread);
@@ -2402,6 +2430,17 @@
X86_PG_A | X86_PG_M;
}
+static void
+pmap_pinit_pml4_pti(vm_page_t pml4pg)
+{
+ pml4_entry_t *pm_pml4;
+ int i;
+
+ pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
+ for (i = 0; i < NPML4EPG; i++)
+ pm_pml4[i] = pti_pml4[i];
+}
+
/*
* Initialize a preallocated and zeroed pmap structure,
* such as one in a vmspace structure.
@@ -2409,7 +2448,7 @@
int
pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
{
- vm_page_t pml4pg;
+ vm_page_t pml4pg, pml4pgu;
vm_paddr_t pml4phys;
int i;
@@ -2425,8 +2464,9 @@
pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
pmap->pm_pcids[i].pm_gen = 0;
}
- pmap->pm_cr3 = ~0; /* initialize to an invalid value */
+ pmap->pm_cr3 = ~0l; /* initialize to an invalid value */
+ pmap->pm_type = pm_type;
if ((pml4pg->flags & PG_ZERO) == 0)
pagezero(pmap->pm_pml4);
@@ -2434,10 +2474,19 @@
* Do not install the host kernel mappings in the nested page
* tables. These mappings are meaningless in the guest physical
* address space.
+ * Install minimal kernel mappings in PTI case.
*/
- if ((pmap->pm_type = pm_type) == PT_X86) {
+ if (pm_type == PT_X86) {
pmap->pm_cr3 = pml4phys;
pmap_pinit_pml4(pml4pg);
+ if (pti) {
+ pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
+ VM_ALLOC_NOOBJ | VM_ALLOC_WAITOK);
+ pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
+ VM_PAGE_TO_PHYS(pml4pgu));
+ pmap_pinit_pml4_pti(pml4pgu);
+ pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
+ }
}
pmap->pm_root.rt_root = 0;
@@ -2509,13 +2558,18 @@
*/
if (ptepindex >= (NUPDE + NUPDPE)) {
- pml4_entry_t *pml4;
+ pml4_entry_t *pml4, *pml4u;
vm_pindex_t pml4index;
/* Wire up a new PDPE page */
pml4index = ptepindex - (NUPDE + NUPDPE);
pml4 = &pmap->pm_pml4[pml4index];
*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
+ if (pti && pml4index < NUPML4E) {
+ pml4u = &pmap->pm_pml4u[pml4index];
+ *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
+ PG_A | PG_M;
+ }
} else if (ptepindex >= NUPDE) {
vm_pindex_t pml4index;
@@ -2716,6 +2770,11 @@
m->wire_count--;
atomic_subtract_int(&vm_cnt.v_wire_count, 1);
vm_page_free_zero(m);
+
+ if (pti && pmap->pm_pml4u != NULL) {
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
+ vm_page_free(m);
+ }
}
static int
@@ -7141,6 +7200,10 @@
} else if (cr3 != pmap->pm_cr3) {
load_cr3(pmap->pm_cr3);
PCPU_SET(curpmap, pmap);
+ if (pti) {
+ PCPU_SET(kcr3, pmap->pm_cr3);
+ PCPU_SET(ucr3, pmap->pm_ucr3);
+ }
}
#ifdef SMP
CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
@@ -7464,6 +7527,285 @@
mtx_unlock_spin(&qframe_mtx);
}
+static vm_page_t
+pmap_pti_alloc_page(void)
+{
+ vm_page_t m;
+
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+ m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY |
+ VM_ALLOC_WIRED | VM_ALLOC_ZERO);
+ return (m);
+}
+
+extern char kernphys[], etext[], dblfault_stack[], nmi0_stack[];
+
+static void
+pmap_pti_init(void)
+{
+ vm_page_t pml4_pg;
+ pdp_entry_t *pdpe;
+ vm_offset_t va;
+ int i;
+
+ if (!pti)
+ return;
+ pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
+ VM_OBJECT_WLOCK(pti_obj);
+ pml4_pg = pmap_pti_alloc_page();
+ pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
+ for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
+ va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
+ pdpe = pmap_pti_pdpe(va);
+ pmap_pti_wire_pte(pdpe);
+ }
+ pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
+ (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
+ pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt + NGDT *
+ MAXCPU, false);
+ pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
+ sizeof(struct gate_descriptor) * NIDT, false);
+ pmap_pti_add_kva_locked((vm_offset_t)common_tss,
+ (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false);
+ pmap_pti_add_kva_locked((vm_offset_t)dblfault_stack,
+ (vm_offset_t)dblfault_stack + PAGE_SIZE, false);
+ pmap_pti_add_kva_locked((vm_offset_t)nmi0_stack,
+ (vm_offset_t)nmi0_stack + PAGE_SIZE, false);
+ CPU_FOREACH(i) {
+ if (i == 0)
+ continue;
+ va = common_tss[i].tss_ist1;
+ pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
+ va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu) -
+ PAGE_SIZE;
+ pmap_pti_add_kva_locked(va, va + PAGE_SIZE, false);
+ }
+ pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
+ (vm_offset_t)etext, true);
+ pti_finalized = true;
+ VM_OBJECT_WUNLOCK(pti_obj);
+}
+
+static pdp_entry_t *
+pmap_pti_pdpe(vm_offset_t va)
+{
+ pml4_entry_t *pml4e;
+ pdp_entry_t *pdpe;
+ vm_page_t m;
+ vm_pindex_t pml4_idx;
+ vm_paddr_t mphys;
+
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+
+ pml4_idx = pmap_pml4e_index(va);
+ pml4e = &pti_pml4[pml4_idx];
+ m = NULL;
+ if (*pml4e == 0) {
+ if (pti_finalized)
+ panic("pml4 alloc after finalization\n");
+ m = pmap_pti_alloc_page();
+ if (*pml4e != 0) {
+ vm_page_free_zero(m);
+ mphys = *pml4e & ~PAGE_MASK;
+ } else {
+ mphys = VM_PAGE_TO_PHYS(m);
+ *pml4e = mphys | X86_PG_RW | X86_PG_V;
+ }
+ } else {
+ mphys = *pml4e & ~PAGE_MASK;
+ }
+ pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
+ return (pdpe);
+}
+
+static void
+pmap_pti_wire_pte(void *pte)
+{
+ vm_page_t m;
+
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
+ m->wire_count++;
+}
+
+static void
+pmap_pti_unwire_pde(void *pde, bool only_ref)
+{
+ vm_page_t m;
+
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
+ MPASS(m->wire_count > 0);
+ MPASS(only_ref || m->wire_count > 1);
+ m->wire_count--;
+ if (m->wire_count == 0)
+ vm_page_free_zero(m);
+}
+
+static void
+pmap_pti_unwire_pte(void *pte, vm_offset_t va)
+{
+ vm_page_t m;
+ pd_entry_t *pde;
+
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
+ MPASS(m->wire_count > 0);
+ m->wire_count--;
+ if (m->wire_count == 0) {
+ vm_page_free_zero(m);
+ pde = pmap_pti_pde(va);
+ MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
+ *pde = 0;
+ pmap_pti_unwire_pde(pde, false);
+ }
+}
+
+static pd_entry_t *
+pmap_pti_pde(vm_offset_t va)
+{
+ pdp_entry_t *pdpe;
+ pd_entry_t *pde;
+ vm_page_t m;
+ vm_pindex_t pd_idx;
+ vm_paddr_t mphys;
+
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+
+ pdpe = pmap_pti_pdpe(va);
+ if (*pdpe == 0) {
+ m = pmap_pti_alloc_page();
+ if (*pdpe != 0) {
+ vm_page_free_zero(m);
+ MPASS((*pdpe & X86_PG_PS) == 0);
+ mphys = *pdpe & ~PAGE_MASK;
+ } else {
+ mphys = VM_PAGE_TO_PHYS(m);
+ *pdpe = mphys | X86_PG_RW | X86_PG_V;
+ }
+ } else {
+ MPASS((*pdpe & X86_PG_PS) == 0);
+ mphys = *pdpe & ~PAGE_MASK;
+ }
+
+ pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
+ pd_idx = pmap_pde_index(va);
+ pde += pd_idx;
+ return (pde);
+}
+
+static pt_entry_t *
+pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
+{
+ pd_entry_t *pde;
+ pt_entry_t *pte;
+ vm_page_t m;
+ vm_paddr_t mphys;
+
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+
+ pde = pmap_pti_pde(va);
+ if (unwire_pde != NULL) {
+ *unwire_pde = true;
+ pmap_pti_wire_pte(pde);
+ }
+ if (*pde == 0) {
+ m = pmap_pti_alloc_page();
+ if (*pde != 0) {
+ vm_page_free_zero(m);
+ MPASS((*pde & X86_PG_PS) == 0);
+ mphys = *pde & ~(PAGE_MASK | pg_nx);
+ } else {
+ mphys = VM_PAGE_TO_PHYS(m);
+ *pde = mphys | X86_PG_RW | X86_PG_V;
+ if (unwire_pde != NULL)
+ *unwire_pde = false;
+ }
+ } else {
+ MPASS((*pde & X86_PG_PS) == 0);
+ mphys = *pde & ~(PAGE_MASK | pg_nx);
+ }
+
+ pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
+ pte += pmap_pte_index(va);
+
+ return (pte);
+}
+
+static void
+pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
+{
+ vm_paddr_t pa;
+ pd_entry_t *pde;
+ pt_entry_t *pte, ptev;
+ bool unwire_pde;
+
+ VM_OBJECT_ASSERT_WLOCKED(pti_obj);
+
+ sva = trunc_page(sva);
+ MPASS(sva > VM_MAXUSER_ADDRESS);
+ eva = round_page(eva);
+ MPASS(sva < eva);
+ for (; sva < eva; sva += PAGE_SIZE) {
+ pte = pmap_pti_pte(sva, &unwire_pde);
+ pa = pmap_kextract(sva);
+ ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A |
+ (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
+ VM_MEMATTR_DEFAULT, FALSE);
+ if (*pte == 0) {
+ pte_store(pte, ptev);
+ pmap_pti_wire_pte(pte);
+ } else {
+ KASSERT(!pti_finalized,
+ ("pti overlap after fin %#lx %#lx %#lx",
+ sva, *pte, ptev));
+ KASSERT(*pte == ptev,
+ ("pti non-identical pte after fin %#lx %#lx %#lx",
+ sva, *pte, ptev));
+ }
+ if (unwire_pde) {
+ pde = pmap_pti_pde(sva);
+ pmap_pti_unwire_pde(pde, true);
+ }
+ }
+}
+
+void
+pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
+{
+
+ if (!pti)
+ return;
+ VM_OBJECT_WLOCK(pti_obj);
+ pmap_pti_add_kva_locked(sva, eva, exec);
+ VM_OBJECT_WUNLOCK(pti_obj);
+}
+
+void
+pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
+{
+ pt_entry_t *pte;
+ vm_offset_t va;
+
+ if (!pti)
+ return;
+ sva = rounddown2(sva, PAGE_SIZE);
+ MPASS(sva > VM_MAXUSER_ADDRESS);
+ eva = roundup2(eva, PAGE_SIZE);
+ MPASS(sva < eva);
+ VM_OBJECT_WLOCK(pti_obj);
+ for (va = sva; va < eva; va += PAGE_SIZE) {
+ pte = pmap_pti_pte(va, NULL);
+ KASSERT((*pte & X86_PG_V) != 0,
+ ("invalid pte va %#lx pte %#lx pt %#lx", va,
+ (u_long)pte, *pte));
+ pte_clear(pte);
+ pmap_pti_unwire_pte(pte, va);
+ }
+ pmap_invalidate_range(kernel_pmap, sva, eva);
+ VM_OBJECT_WUNLOCK(pti_obj);
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <sys/kdb.h>
Index: sys/amd64/amd64/sys_machdep.c
===================================================================
--- sys/amd64/amd64/sys_machdep.c
+++ sys/amd64/amd64/sys_machdep.c
@@ -65,6 +65,9 @@
#include <security/audit/audit.h>
+static void user_ldt_deref(struct proc_ldt *pldt);
+static void user_ldt_derefl(struct proc_ldt *pldt);
+
#define MAX_LD 8192
int max_ldt_segment = 512;
@@ -83,8 +86,6 @@
}
SYSINIT(maxldt, SI_SUB_VM_CONF, SI_ORDER_ANY, max_ldt_segment_init, NULL);
-static void user_ldt_derefl(struct proc_ldt *pldt);
-
#ifndef _SYS_SYSPROTO_H_
struct sysarch_args {
int op;
@@ -361,7 +362,9 @@
pcb = td->td_pcb;
if (pcb->pcb_tssp == NULL) {
tssp = (struct amd64tss *)kmem_malloc(kernel_arena,
- ctob(IOPAGES+1), M_WAITOK);
+ ctob(IOPAGES + 1), M_WAITOK);
+ pmap_pti_add_kva((vm_offset_t)tssp, (vm_offset_t)tssp +
+ ctob(IOPAGES + 1), false);
iomap = (char *)&tssp[1];
memset(iomap, 0xff, IOPERM_BITMAP_SIZE);
critical_enter();
@@ -450,6 +453,8 @@
struct proc_ldt *pldt, *new_ldt;
struct mdproc *mdp;
struct soft_segment_descriptor sldt;
+ vm_offset_t sva;
+ vm_size_t sz;
mtx_assert(&dt_lock, MA_OWNED);
mdp = &p->p_md;
@@ -457,13 +462,13 @@
return (mdp->md_ldt);
mtx_unlock(&dt_lock);
new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK);
- new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena,
- max_ldt_segment * sizeof(struct user_segment_descriptor),
- M_WAITOK | M_ZERO);
+ sz = max_ldt_segment * sizeof(struct user_segment_descriptor);
+ sva = kmem_malloc(kernel_arena, sz, M_WAITOK | M_ZERO);
+ new_ldt->ldt_base = (caddr_t)sva;
+ pmap_pti_add_kva(sva, sva + sz, false);
new_ldt->ldt_refcnt = 1;
- sldt.ssd_base = (uint64_t)new_ldt->ldt_base;
- sldt.ssd_limit = max_ldt_segment *
- sizeof(struct user_segment_descriptor) - 1;
+ sldt.ssd_base = sva;
+ sldt.ssd_limit = sz - 1;
sldt.ssd_type = SDT_SYSLDT;
sldt.ssd_dpl = SEL_KPL;
sldt.ssd_p = 1;
@@ -473,8 +478,8 @@
mtx_lock(&dt_lock);
pldt = mdp->md_ldt;
if (pldt != NULL && !force) {
- kmem_free(kernel_arena, (vm_offset_t)new_ldt->ldt_base,
- max_ldt_segment * sizeof(struct user_segment_descriptor));
+ pmap_pti_remove_kva(sva, sva + sz);
+ kmem_free(kernel_arena, sva, sz);
free(new_ldt, M_SUBPROC);
return (pldt);
}
@@ -521,15 +526,19 @@
static void
user_ldt_derefl(struct proc_ldt *pldt)
{
+ vm_offset_t sva;
+ vm_size_t sz;
if (--pldt->ldt_refcnt == 0) {
- kmem_free(kernel_arena, (vm_offset_t)pldt->ldt_base,
- max_ldt_segment * sizeof(struct user_segment_descriptor));
+ sva = (vm_offset_t)pldt->ldt_base;
+ sz = max_ldt_segment * sizeof(struct user_segment_descriptor);
+ pmap_pti_remove_kva(sva, sva + sz);
+ kmem_free(kernel_arena, sva, sz);
free(pldt, M_SUBPROC);
}
}
-void
+static void
user_ldt_deref(struct proc_ldt *pldt)
{
Index: sys/amd64/amd64/trap.c
===================================================================
--- sys/amd64/amd64/trap.c
+++ sys/amd64/amd64/trap.c
@@ -448,9 +448,28 @@
* problem here and not have to check all the
* selectors and pointers when the user changes
* them.
+ *
+ * In case of PTI, the IRETQ faulted while the
+ * kernel used the pti stack, and exception
+ * frame records %rsp value pointing to that
+ * stack. If we return normally to
+ * doreti_iret_fault, the trapframe is
+ * reconstructed on pti stack, and calltrap()
+ * called on it as well. Due to the very
+ * limited pti stack size, kernel does not
+ * survive for too long. Switch to the normal
+ * thread stack for the trap handling.
+ *
+ * Magic '5' is the number of qwords occupied by
+ * the hardware trap frame.
*/
if (frame->tf_rip == (long)doreti_iret) {
frame->tf_rip = (long)doreti_iret_fault;
+ if (pti && frame->tf_rsp == (uintptr_t)PCPU_PTR(
+ pti_stack) + (PC_PTI_STACK_SZ - 5) *
+ sizeof(register_t))
+ frame->tf_rsp = PCPU_GET(rsp0) - 5 *
+ sizeof(register_t);
return;
}
if (frame->tf_rip == (long)ld_ds) {
Index: sys/amd64/amd64/vm_machdep.c
===================================================================
--- sys/amd64/amd64/vm_machdep.c
+++ sys/amd64/amd64/vm_machdep.c
@@ -339,6 +339,8 @@
* Clean TSS/iomap
*/
if (pcb->pcb_tssp != NULL) {
+ pmap_pti_remove_kva((vm_offset_t)pcb->pcb_tssp,
+ (vm_offset_t)pcb->pcb_tssp + ctob(IOPAGES + 1));
kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_tssp,
ctob(IOPAGES + 1));
pcb->pcb_tssp = NULL;
Index: sys/amd64/ia32/ia32_exception.S
===================================================================
--- sys/amd64/ia32/ia32_exception.S
+++ sys/amd64/ia32/ia32_exception.S
@@ -40,8 +40,13 @@
* that it originated in supervisor mode and skip the swapgs.
*/
SUPERALIGN_TEXT
+IDTVEC(int0x80_syscall_pti)
+ PTI_UENTRY has_err=0
+ jmp int0x80_syscall_common
+ SUPERALIGN_TEXT
IDTVEC(int0x80_syscall)
swapgs
+int0x80_syscall_common:
pushq $2 /* sizeof "int 0x80" */
subq $TF_ERR,%rsp /* skip over tf_trapno */
movq %rdi,TF_RDI(%rsp)
Index: sys/amd64/ia32/ia32_syscall.c
===================================================================
--- sys/amd64/ia32/ia32_syscall.c
+++ sys/amd64/ia32/ia32_syscall.c
@@ -95,7 +95,8 @@
#define IDTVEC(name) __CONCAT(X,name)
-extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(rsvd);
+extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(int0x80_syscall_pti),
+ IDTVEC(rsvd), IDTVEC(rsvd_pti);
void ia32_syscall(struct trapframe *frame); /* Called from asm code */
@@ -208,14 +209,16 @@
ia32_syscall_enable(void *dummy)
{
- setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
+ setidt(IDT_SYSCALL, pti ? &IDTVEC(int0x80_syscall_pti) :
+ &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0);
}
static void
ia32_syscall_disable(void *dummy)
{
- setidt(IDT_SYSCALL, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
+ setidt(IDT_SYSCALL, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd),
+ SDT_SYSIGT, SEL_KPL, 0);
}
SYSINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_enable, NULL);
Index: sys/amd64/include/asmacros.h
===================================================================
--- sys/amd64/include/asmacros.h
+++ sys/amd64/include/asmacros.h
@@ -1,3 +1,4 @@
+/* -*- mode: asm -*- */
/*-
* SPDX-License-Identifier: BSD-3-Clause
*
@@ -145,69 +146,6 @@
popq %rbp
#ifdef LOCORE
-/*
- * Convenience macro for declaring interrupt entry points.
- */
-#define IDTVEC(name) ALIGN_TEXT; .globl __CONCAT(X,name); \
- .type __CONCAT(X,name),@function; __CONCAT(X,name):
-
-/*
- * Macros to create and destroy a trap frame.
- */
-#define PUSH_FRAME \
- subq $TF_RIP,%rsp ; /* skip dummy tf_err and tf_trapno */ \
- testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \
- jz 1f ; /* Yes, dont swapgs again */ \
- swapgs ; \
-1: movq %rdi,TF_RDI(%rsp) ; \
- movq %rsi,TF_RSI(%rsp) ; \
- movq %rdx,TF_RDX(%rsp) ; \
- movq %rcx,TF_RCX(%rsp) ; \
- movq %r8,TF_R8(%rsp) ; \
- movq %r9,TF_R9(%rsp) ; \
- movq %rax,TF_RAX(%rsp) ; \
- movq %rbx,TF_RBX(%rsp) ; \
- movq %rbp,TF_RBP(%rsp) ; \
- movq %r10,TF_R10(%rsp) ; \
- movq %r11,TF_R11(%rsp) ; \
- movq %r12,TF_R12(%rsp) ; \
- movq %r13,TF_R13(%rsp) ; \
- movq %r14,TF_R14(%rsp) ; \
- movq %r15,TF_R15(%rsp) ; \
- movw %fs,TF_FS(%rsp) ; \
- movw %gs,TF_GS(%rsp) ; \
- movw %es,TF_ES(%rsp) ; \
- movw %ds,TF_DS(%rsp) ; \
- movl $TF_HASSEGS,TF_FLAGS(%rsp) ; \
- cld ; \
- testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel ? */ \
- jz 2f ; /* yes, leave PCB_FULL_IRET alone */ \
- movq PCPU(CURPCB),%r8 ; \
- andl $~PCB_FULL_IRET,PCB_FLAGS(%r8) ; \
-2:
-
-#define POP_FRAME \
- movq TF_RDI(%rsp),%rdi ; \
- movq TF_RSI(%rsp),%rsi ; \
- movq TF_RDX(%rsp),%rdx ; \
- movq TF_RCX(%rsp),%rcx ; \
- movq TF_R8(%rsp),%r8 ; \
- movq TF_R9(%rsp),%r9 ; \
- movq TF_RAX(%rsp),%rax ; \
- movq TF_RBX(%rsp),%rbx ; \
- movq TF_RBP(%rsp),%rbp ; \
- movq TF_R10(%rsp),%r10 ; \
- movq TF_R11(%rsp),%r11 ; \
- movq TF_R12(%rsp),%r12 ; \
- movq TF_R13(%rsp),%r13 ; \
- movq TF_R14(%rsp),%r14 ; \
- movq TF_R15(%rsp),%r15 ; \
- testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \
- jz 1f ; /* keep kernel GS.base */ \
- cli ; \
- swapgs ; \
-1: addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */
-
/*
* Access per-CPU data.
*/
@@ -216,6 +154,123 @@
movq %gs:PC_PRVSPACE, reg ; \
addq $PC_ ## member, reg
+/*
+ * Convenience macro for declaring interrupt entry points.
+ */
+#define IDTVEC(name) ALIGN_TEXT; .globl __CONCAT(X,name); \
+ .type __CONCAT(X,name),@function; __CONCAT(X,name):
+
+ .macro SAVE_SEGS
+ movw %fs,TF_FS(%rsp)
+ movw %gs,TF_GS(%rsp)
+ movw %es,TF_ES(%rsp)
+ movw %ds,TF_DS(%rsp)
+ .endm
+
+ .macro MOVE_STACKS qw
+ offset=0
+ .rept \qw
+ movq offset(%rsp),%rdx
+ movq %rdx,offset(%rax)
+ offset=offset+8
+ .endr
+ .endm
+
+ .macro PTI_UENTRY has_err
+ swapgs
+ pushq %rax
+ pushq %rdx
+ movq PCPU(KCR3),%rax
+ movq %rax,%cr3
+ movq PCPU(RSP0),%rax
+ subq $PTI_SIZE,%rax
+ MOVE_STACKS (PTI_SIZE / 8) - 1 + \has_err
+ movq %rax,%rsp
+ popq %rdx
+ popq %rax
+ .endm
+
+ .macro PTI_ENTRY name, cont, has_err=0
+ ALIGN_TEXT
+ .globl X\name\()_pti
+ .type X\name\()_pti,@function
+X\name\()_pti:
+ /* %rax, %rdx and possibly err not yet pushed */
+ testb $SEL_RPL_MASK,PTI_CS-(2+1-\has_err)*8(%rsp)
+ jz \cont
+ PTI_UENTRY \has_err
+ swapgs
+ jmp \cont
+ .endm
+
+ .macro PTI_INTRENTRY vec_name
+ SUPERALIGN_TEXT
+ .globl X\vec_name\()_pti
+ .type X\vec_name\()_pti,@function
+X\vec_name\()_pti:
+ testb $SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* err, %rax, %rdx not pushed */
+ jz \vec_name\()_u
+ PTI_UENTRY has_err=0
+ jmp \vec_name\()_u
+ .endm
+
+ .macro INTR_PUSH_FRAME vec_name
+ SUPERALIGN_TEXT
+IDTVEC(\vec_name)
+ testb $SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* come from kernel? */
+ jz \vec_name\()_u /* Yes, dont swapgs again */
+ swapgs
+\vec_name\()_u:
+ subq $TF_RIP,%rsp /* skip dummy tf_err and tf_trapno */
+ movq %rdi,TF_RDI(%rsp)
+ movq %rsi,TF_RSI(%rsp)
+ movq %rdx,TF_RDX(%rsp)
+ movq %rcx,TF_RCX(%rsp)
+ movq %r8,TF_R8(%rsp)
+ movq %r9,TF_R9(%rsp)
+ movq %rax,TF_RAX(%rsp)
+ movq %rbx,TF_RBX(%rsp)
+ movq %rbp,TF_RBP(%rsp)
+ movq %r10,TF_R10(%rsp)
+ movq %r11,TF_R11(%rsp)
+ movq %r12,TF_R12(%rsp)
+ movq %r13,TF_R13(%rsp)
+ movq %r14,TF_R14(%rsp)
+ movq %r15,TF_R15(%rsp)
+ SAVE_SEGS
+ movl $TF_HASSEGS,TF_FLAGS(%rsp)
+ cld
+ testb $SEL_RPL_MASK,TF_CS(%rsp) /* come from kernel ? */
+ jz 1f /* yes, leave PCB_FULL_IRET alone */
+ movq PCPU(CURPCB),%r8
+ andl $~PCB_FULL_IRET,PCB_FLAGS(%r8)
+1:
+ .endm
+
+ .macro INTR_HANDLER vec_name
+ .text
+ PTI_INTRENTRY \vec_name
+ INTR_PUSH_FRAME \vec_name
+ .endm
+
+ .macro RESTORE_REGS
+ movq TF_RDI(%rsp),%rdi
+ movq TF_RSI(%rsp),%rsi
+ movq TF_RDX(%rsp),%rdx
+ movq TF_RCX(%rsp),%rcx
+ movq TF_R8(%rsp),%r8
+ movq TF_R9(%rsp),%r9
+ movq TF_RAX(%rsp),%rax
+ movq TF_RBX(%rsp),%rbx
+ movq TF_RBP(%rsp),%rbp
+ movq TF_R10(%rsp),%r10
+ movq TF_R11(%rsp),%r11
+ movq TF_R12(%rsp),%r12
+ movq TF_R13(%rsp),%r13
+ movq TF_R14(%rsp),%r14
+ movq TF_R15(%rsp),%r15
+ .endm
+
#endif /* LOCORE */
#ifdef __STDC__
Index: sys/amd64/include/frame.h
===================================================================
--- sys/amd64/include/frame.h
+++ sys/amd64/include/frame.h
@@ -1,6 +1,50 @@
/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2018 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
* This file is in the public domain.
+ * $FreeBSD$
*/
-/* $FreeBSD$ */
+
+#ifndef _AMD64_FRAME_H
+#define _AMD64_FRAME_H
#include <x86/frame.h>
+
+struct pti_frame {
+ register_t pti_rdx;
+ register_t pti_rax;
+ register_t pti_err;
+ register_t pti_rip;
+ register_t pti_cs;
+ register_t pti_rflags;
+ register_t pti_rsp;
+ register_t pti_ss;
+};
+
+#endif
Index: sys/amd64/include/md_var.h
===================================================================
--- sys/amd64/include/md_var.h
+++ sys/amd64/include/md_var.h
@@ -38,6 +38,7 @@
extern uint64_t *vm_page_dump;
extern int hw_lower_amd64_sharedpage;
+extern int pti;
struct savefpu;
struct sysentvec;
Index: sys/amd64/include/pcpu.h
===================================================================
--- sys/amd64/include/pcpu.h
+++ sys/amd64/include/pcpu.h
@@ -35,6 +35,7 @@
#error "sys/cdefs.h is a prerequisite for this file"
#endif
+#define PC_PTI_STACK_SZ 16
/*
* The SMP parts are setup in pmap.c and locore.s for the BSP, and
* mp_machdep.c sets up the data for the AP's to "see" when they awake.
@@ -48,8 +49,11 @@
struct pmap *pc_curpmap; \
struct amd64tss *pc_tssp; /* TSS segment active on CPU */ \
struct amd64tss *pc_commontssp;/* Common TSS for the CPU */ \
+ uint64_t pc_kcr3; \
+ uint64_t pc_ucr3; \
register_t pc_rsp0; \
register_t pc_scratch_rsp; /* User %rsp in syscall */ \
+ register_t pc_scratch_rax; \
u_int pc_apic_id; \
u_int pc_acpi_id; /* ACPI CPU id */ \
/* Pointer to the CPU %fs descriptor */ \
@@ -63,12 +67,13 @@
uint64_t pc_pm_save_cnt; \
u_int pc_cmci_mask; /* MCx banks for CMCI */ \
uint64_t pc_dbreg[16]; /* ddb debugging regs */ \
+ uint64_t pc_pti_stack[PC_PTI_STACK_SZ]; \
int pc_dbreg_cmd; /* ddb debugging reg cmd */ \
u_int pc_vcpu_id; /* Xen vCPU ID */ \
uint32_t pc_pcid_next; \
uint32_t pc_pcid_gen; \
uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \
- char __pad[384] /* be divisor of PAGE_SIZE \
+ char __pad[232] /* be divisor of PAGE_SIZE \
after cache alignment */
#define PC_DBREG_CMD_NONE 0
Index: sys/amd64/include/pmap.h
===================================================================
--- sys/amd64/include/pmap.h
+++ sys/amd64/include/pmap.h
@@ -277,7 +277,7 @@
#define pde_store(pdep, pde) pte_store(pdep, pde)
-extern pt_entry_t pg_nx;
+extern pt_entry_t pg_nx, pg_g;
#endif /* _KERNEL */
@@ -315,7 +315,9 @@
struct pmap {
struct mtx pm_mtx;
pml4_entry_t *pm_pml4; /* KVA of level 4 page table */
+ pml4_entry_t *pm_pml4u; /* KVA of user l4 page table */
uint64_t pm_cr3;
+ uint64_t pm_ucr3;
TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */
cpuset_t pm_active; /* active on cpus */
enum pmap_type pm_type; /* regular or nested tables */
@@ -429,6 +431,8 @@
void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num);
boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t);
+void pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec);
+void pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva);
#endif /* _KERNEL */
/* Return various clipped indexes for a given VA */
Index: sys/amd64/include/proc.h
===================================================================
--- sys/amd64/include/proc.h
+++ sys/amd64/include/proc.h
@@ -92,7 +92,6 @@
struct proc_ldt *user_ldt_alloc(struct proc *, int);
void user_ldt_free(struct thread *);
-void user_ldt_deref(struct proc_ldt *);
struct sysarch_args;
int sysarch_ldt(struct thread *td, struct sysarch_args *uap, int uap_space);
int amd64_set_ldt_data(struct thread *td, int start, int num,
Index: sys/amd64/include/smp.h
===================================================================
--- sys/amd64/include/smp.h
+++ sys/amd64/include/smp.h
@@ -30,7 +30,18 @@
inthand_t
IDTVEC(invltlb_pcid), /* TLB shootdowns - global, pcid */
IDTVEC(invltlb_invpcid),/* TLB shootdowns - global, invpcid */
- IDTVEC(justreturn); /* interrupt CPU with minimum overhead */
+ IDTVEC(justreturn), /* interrupt CPU with minimum overhead */
+ IDTVEC(invltlb_pcid_pti),
+ IDTVEC(invltlb_invpcid_pti),
+ IDTVEC(justreturn1_pti),
+ IDTVEC(invltlb_pti),
+ IDTVEC(invlpg_pti),
+ IDTVEC(invlrng_pti),
+ IDTVEC(invlcache_pti),
+ IDTVEC(ipi_intr_bitmap_handler_pti),
+ IDTVEC(cpustop_pti),
+ IDTVEC(cpususpend_pti),
+ IDTVEC(rendezvous_pti);
void invltlb_pcid_handler(void);
void invltlb_invpcid_handler(void);
Index: sys/amd64/vmm/intel/vmx.c
===================================================================
--- sys/amd64/vmm/intel/vmx.c
+++ sys/amd64/vmm/intel/vmx.c
@@ -695,7 +695,8 @@
MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
&tmp);
if (error == 0) {
- pirvec = lapic_ipi_alloc(&IDTVEC(justreturn));
+ pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
+ &IDTVEC(justreturn));
if (pirvec < 0) {
if (bootverbose) {
printf("vmx_init: unable to allocate "
Index: sys/amd64/vmm/vmm.c
===================================================================
--- sys/amd64/vmm/vmm.c
+++ sys/amd64/vmm/vmm.c
@@ -57,6 +57,7 @@
#include <machine/cpu.h>
#include <machine/pcb.h>
#include <machine/smp.h>
+#include <machine/md_var.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
@@ -327,7 +328,8 @@
vmm_host_state_init();
- vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn));
+ vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
+ &IDTVEC(justreturn));
if (vmm_ipinum < 0)
vmm_ipinum = IPI_AST;
Index: sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
===================================================================
--- sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
+++ sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
@@ -26,11 +26,11 @@
* $FreeBSD$
*/
+#include "assym.s"
+
#include <machine/asmacros.h>
#include <machine/specialreg.h>
-#include "assym.s"
-
/*
* This is the Hyper-V vmbus channel direct callback interrupt.
* Only used when it is running on Hyper-V.
@@ -38,7 +38,7 @@
.text
SUPERALIGN_TEXT
IDTVEC(vmbus_isr)
- PUSH_FRAME
+ INTR_PUSH_FRAME
FAKE_MCOUNT(TF_RIP(%rsp))
movq %rsp, %rdi
call vmbus_handle_intr
Index: sys/x86/include/apicvar.h
===================================================================
--- sys/x86/include/apicvar.h
+++ sys/x86/include/apicvar.h
@@ -185,7 +185,11 @@
IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
- IDTVEC(spuriousint), IDTVEC(timerint);
+ IDTVEC(spuriousint), IDTVEC(timerint),
+ IDTVEC(apic_isr1_pti), IDTVEC(apic_isr2_pti), IDTVEC(apic_isr3_pti),
+ IDTVEC(apic_isr4_pti), IDTVEC(apic_isr5_pti), IDTVEC(apic_isr6_pti),
+ IDTVEC(apic_isr7_pti), IDTVEC(cmcint_pti), IDTVEC(errorint_pti),
+ IDTVEC(spuriousint_pti), IDTVEC(timerint_pti);
extern vm_paddr_t lapic_paddr;
extern int *apic_cpuids;
Index: sys/x86/isa/atpic.c
===================================================================
--- sys/x86/isa/atpic.c
+++ sys/x86/isa/atpic.c
@@ -80,6 +80,16 @@
IDTVEC(atpic_intr9), IDTVEC(atpic_intr10), IDTVEC(atpic_intr11),
IDTVEC(atpic_intr12), IDTVEC(atpic_intr13), IDTVEC(atpic_intr14),
IDTVEC(atpic_intr15);
+/* XXXKIB i386 uses stubs until pti comes */
+inthand_t
+ IDTVEC(atpic_intr0_pti), IDTVEC(atpic_intr1_pti),
+ IDTVEC(atpic_intr2_pti), IDTVEC(atpic_intr3_pti),
+ IDTVEC(atpic_intr4_pti), IDTVEC(atpic_intr5_pti),
+ IDTVEC(atpic_intr6_pti), IDTVEC(atpic_intr7_pti),
+ IDTVEC(atpic_intr8_pti), IDTVEC(atpic_intr9_pti),
+ IDTVEC(atpic_intr10_pti), IDTVEC(atpic_intr11_pti),
+ IDTVEC(atpic_intr12_pti), IDTVEC(atpic_intr13_pti),
+ IDTVEC(atpic_intr14_pti), IDTVEC(atpic_intr15_pti);
#define IRQ(ap, ai) ((ap)->at_irqbase + (ai)->at_irq)
@@ -92,7 +102,7 @@
#define INTSRC(irq) \
{ { &atpics[(irq) / 8].at_pic }, IDTVEC(atpic_intr ## irq ), \
- (irq) % 8 }
+ IDTVEC(atpic_intr ## irq ## _pti, (irq) % 8 }
struct atpic {
struct pic at_pic;
@@ -104,7 +114,7 @@
struct atpic_intsrc {
struct intsrc at_intsrc;
- inthand_t *at_intr;
+ inthand_t *at_intr, *at_intr_pti;
int at_irq; /* Relative to PIC base. */
enum intr_trigger at_trigger;
u_long at_count;
@@ -408,7 +418,8 @@
ai->at_intsrc.is_count = &ai->at_count;
ai->at_intsrc.is_straycount = &ai->at_straycount;
setidt(((struct atpic *)ai->at_intsrc.is_pic)->at_intbase +
- ai->at_irq, ai->at_intr, SDT_ATPIC, SEL_KPL, GSEL_ATPIC);
+ ai->at_irq, pti ? ai->at_intr_pti : ai->at_intr, SDT_ATPIC,
+ SEL_KPL, GSEL_ATPIC);
}
/*
Index: sys/x86/x86/local_apic.c
===================================================================
--- sys/x86/x86/local_apic.c
+++ sys/x86/x86/local_apic.c
@@ -171,13 +171,23 @@
IDTVEC(apic_isr7), /* 224 - 255 */
};
+static inthand_t *ioint_pti_handlers[] = {
+ NULL, /* 0 - 31 */
+ IDTVEC(apic_isr1_pti), /* 32 - 63 */
+ IDTVEC(apic_isr2_pti), /* 64 - 95 */
+ IDTVEC(apic_isr3_pti), /* 96 - 127 */
+ IDTVEC(apic_isr4_pti), /* 128 - 159 */
+ IDTVEC(apic_isr5_pti), /* 160 - 191 */
+ IDTVEC(apic_isr6_pti), /* 192 - 223 */
+ IDTVEC(apic_isr7_pti), /* 224 - 255 */
+};
static u_int32_t lapic_timer_divisors[] = {
APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16,
APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128
};
-extern inthand_t IDTVEC(rsvd);
+extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd);
volatile char *lapic_map;
vm_paddr_t lapic_paddr;
@@ -496,15 +506,18 @@
PCPU_SET(apic_id, lapic_id());
/* Local APIC timer interrupt. */
- setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC);
+ setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint),
+ SDT_APIC, SEL_KPL, GSEL_APIC);
/* Local APIC error interrupt. */
- setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);
+ setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint),
+ SDT_APIC, SEL_KPL, GSEL_APIC);
/* XXX: Thermal interrupt */
/* Local APIC CMCI. */
- setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);
+ setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint),
+ SDT_APICT, SEL_KPL, GSEL_APIC);
if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) {
arat = 0;
@@ -1569,8 +1582,8 @@
KASSERT(vector != IDT_DTRACE_RET,
("Attempt to overwrite DTrace entry"));
#endif
- setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL,
- GSEL_APIC);
+ setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32],
+ SDT_APIC, SEL_KPL, GSEL_APIC);
}
static void
@@ -1589,7 +1602,8 @@
* We can not currently clear the idt entry because other cpus
* may have a valid vector at this offset.
*/
- setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
+ setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+ SEL_KPL, GSEL_APIC);
#endif
}
@@ -2087,6 +2101,7 @@
* APIC page is mapped as an uncached region. In x2APIC mode there is an
* explicit 'mfence' before the ICR MSR is written. Therefore in both cases
* the IDT slot update is globally visible before the IPI is delivered.
+ * XXX pti
*/
static int
native_lapic_ipi_alloc(inthand_t *ipifunc)
@@ -2095,7 +2110,8 @@
long func;
int idx, vector;
- KASSERT(ipifunc != &IDTVEC(rsvd), ("invalid ipifunc %p", ipifunc));
+ KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti),
+ ("invalid ipifunc %p", ipifunc));
vector = -1;
mtx_lock_spin(&icu_lock);
@@ -2124,8 +2140,10 @@
mtx_lock_spin(&icu_lock);
ip = &idt[vector];
func = (ip->gd_hioffset << 16) | ip->gd_looffset;
- KASSERT(func != (uintptr_t)&IDTVEC(rsvd),
+ KASSERT(func != (uintptr_t)&IDTVEC(rsvd) &&
+ func != (uintptr_t)&IDTVEC(rsvd_pti),
("invalid idtfunc %#lx", func));
- setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC);
+ setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT,
+ SEL_KPL, GSEL_APIC);
mtx_unlock_spin(&icu_lock);
}

File Metadata

Mime Type
text/plain
Expires
Wed, Nov 26, 9:22 AM (13 h, 43 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
26208726
Default Alt Text
D13797.id37885.diff (70 KB)

Event Timeline