Index: sys/amd64/amd64/apic_vector.S =================================================================== --- sys/amd64/amd64/apic_vector.S +++ sys/amd64/amd64/apic_vector.S @@ -2,6 +2,12 @@ * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2014-2018 The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed by + * Konstantin Belousov under sponsorship from + * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -38,12 +44,12 @@ #include "opt_smp.h" +#include "assym.s" + #include #include #include -#include "assym.s" - #ifdef SMP #define LK lock ; #else @@ -73,30 +79,28 @@ * translates that into a vector, and passes the vector to the * lapic_handle_intr() function. */ -#define ISR_VEC(index, vec_name) \ - .text ; \ - SUPERALIGN_TEXT ; \ -IDTVEC(vec_name) ; \ - PUSH_FRAME ; \ - FAKE_MCOUNT(TF_RIP(%rsp)) ; \ - cmpl $0,x2apic_mode ; \ - je 1f ; \ - movl $(MSR_APIC_ISR0 + index),%ecx ; \ - rdmsr ; \ - jmp 2f ; \ -1: ; \ - movq lapic_map, %rdx ; /* pointer to local APIC */ \ - movl LA_ISR + 16 * (index)(%rdx), %eax ; /* load ISR */ \ -2: ; \ - bsrl %eax, %eax ; /* index of highest set bit in ISR */ \ - jz 3f ; \ - addl $(32 * index),%eax ; \ - movq %rsp, %rsi ; \ - movl %eax, %edi ; /* pass the IRQ */ \ - call lapic_handle_intr ; \ -3: ; \ - MEXITCOUNT ; \ + .macro ISR_VEC index, vec_name + INTR_HANDLER \vec_name + FAKE_MCOUNT(TF_RIP(%rsp)) + cmpl $0,x2apic_mode + je 1f + movl $(MSR_APIC_ISR0 + \index),%ecx + rdmsr + jmp 2f +1: + movq lapic_map, %rdx /* pointer to local APIC */ + movl LA_ISR + 16 * (\index)(%rdx), %eax /* load ISR */ +2: + bsrl %eax, %eax /* index of highest set bit in ISR */ + jz 3f + addl $(32 * \index),%eax + movq %rsp, %rsi + movl %eax, %edi /* pass the IRQ */ + call lapic_handle_intr +3: + MEXITCOUNT jmp doreti + .endm /* * Handle "spurious INTerrupts". @@ -108,26 +112,21 @@ .text SUPERALIGN_TEXT IDTVEC(spuriousint) - /* No EOI cycle used here */ - jmp doreti_iret - ISR_VEC(1, apic_isr1) - ISR_VEC(2, apic_isr2) - ISR_VEC(3, apic_isr3) - ISR_VEC(4, apic_isr4) - ISR_VEC(5, apic_isr5) - ISR_VEC(6, apic_isr6) - ISR_VEC(7, apic_isr7) + ISR_VEC 1, apic_isr1 + ISR_VEC 2, apic_isr2 + ISR_VEC 3, apic_isr3 + ISR_VEC 4, apic_isr4 + ISR_VEC 5, apic_isr5 + ISR_VEC 6, apic_isr6 + ISR_VEC 7, apic_isr7 /* * Local APIC periodic timer handler. */ - .text - SUPERALIGN_TEXT -IDTVEC(timerint) - PUSH_FRAME + INTR_HANDLER timerint FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call lapic_handle_timer @@ -137,10 +136,7 @@ /* * Local APIC CMCI handler. */ - .text - SUPERALIGN_TEXT -IDTVEC(cmcint) - PUSH_FRAME + INTR_HANDLER cmcint FAKE_MCOUNT(TF_RIP(%rsp)) call lapic_handle_cmc MEXITCOUNT @@ -149,10 +145,7 @@ /* * Local APIC error interrupt handler. */ - .text - SUPERALIGN_TEXT -IDTVEC(errorint) - PUSH_FRAME + INTR_HANDLER errorint FAKE_MCOUNT(TF_RIP(%rsp)) call lapic_handle_error MEXITCOUNT @@ -163,10 +156,7 @@ * Xen event channel upcall interrupt handler. * Only used when the hypervisor supports direct vector callbacks. */ - .text - SUPERALIGN_TEXT -IDTVEC(xen_intr_upcall) - PUSH_FRAME + INTR_HANDLER xen_intr_upcall FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call xen_intr_handle_upcall @@ -183,74 +173,48 @@ SUPERALIGN_TEXT invltlb_ret: call as_lapic_eoi - POP_FRAME - jmp doreti_iret + jmp ld_regs SUPERALIGN_TEXT -IDTVEC(invltlb) - PUSH_FRAME - + INTR_HANDLER invltlb call invltlb_handler jmp invltlb_ret -IDTVEC(invltlb_pcid) - PUSH_FRAME - + INTR_HANDLER invltlb_pcid call invltlb_pcid_handler jmp invltlb_ret -IDTVEC(invltlb_invpcid) - PUSH_FRAME - + INTR_HANDLER invltlb_invpcid call invltlb_invpcid_handler jmp invltlb_ret /* * Single page TLB shootdown */ - .text - - SUPERALIGN_TEXT -IDTVEC(invlpg) - PUSH_FRAME - + INTR_HANDLER invlpg call invlpg_handler jmp invltlb_ret /* * Page range TLB shootdown. */ - .text - SUPERALIGN_TEXT -IDTVEC(invlrng) - PUSH_FRAME - + INTR_HANDLER invlrng call invlrng_handler jmp invltlb_ret /* * Invalidate cache. */ - .text - SUPERALIGN_TEXT -IDTVEC(invlcache) - PUSH_FRAME - + INTR_HANDLER invlcache call invlcache_handler jmp invltlb_ret /* * Handler for IPIs sent via the per-cpu IPI bitmap. */ - .text - SUPERALIGN_TEXT -IDTVEC(ipi_intr_bitmap_handler) - PUSH_FRAME - + INTR_HANDLER ipi_intr_bitmap_handler call as_lapic_eoi - FAKE_MCOUNT(TF_RIP(%rsp)) - call ipi_bitmap_handler MEXITCOUNT jmp doreti @@ -258,24 +222,15 @@ /* * Executed by a CPU when it receives an IPI_STOP from another CPU. */ - .text - SUPERALIGN_TEXT -IDTVEC(cpustop) - PUSH_FRAME - + INTR_HANDLER cpustop call as_lapic_eoi - call cpustop_handler jmp doreti /* * Executed by a CPU when it receives an IPI_SUSPEND from another CPU. */ - .text - SUPERALIGN_TEXT -IDTVEC(cpususpend) - PUSH_FRAME - + INTR_HANDLER cpususpend call cpususpend_handler call as_lapic_eoi jmp doreti @@ -285,10 +240,7 @@ * * - Calls the generic rendezvous action function. */ - .text - SUPERALIGN_TEXT -IDTVEC(rendezvous) - PUSH_FRAME + INTR_HANDLER rendezvous #ifdef COUNT_IPIS movl PCPU(CPUID), %eax movq ipi_rendezvous_counts(,%rax,8), %rax @@ -328,4 +280,8 @@ popq %rax jmp doreti_iret + INTR_HANDLER justreturn1 + call as_lapic_eoi + jmp doreti + #endif /* SMP */ Index: sys/amd64/amd64/atpic_vector.S =================================================================== --- sys/amd64/amd64/atpic_vector.S +++ sys/amd64/amd64/atpic_vector.S @@ -36,38 +36,35 @@ * master and slave interrupt controllers. */ -#include - #include "assym.s" +#include /* * Macros for interrupt entry, call to handler, and exit. */ -#define INTR(irq_num, vec_name) \ - .text ; \ - SUPERALIGN_TEXT ; \ -IDTVEC(vec_name) ; \ - PUSH_FRAME ; \ - FAKE_MCOUNT(TF_RIP(%rsp)) ; \ - movq %rsp, %rsi ; \ - movl $irq_num, %edi; /* pass the IRQ */ \ - call atpic_handle_intr ; \ - MEXITCOUNT ; \ + .macro INTR irq_num, vec_name + INTR_HANDLER \vec_name + FAKE_MCOUNT(TF_RIP(%rsp)) + movq %rsp, %rsi + movl $\irq_num, %edi /* pass the IRQ */ + call atpic_handle_intr + MEXITCOUNT jmp doreti + .endm - INTR(0, atpic_intr0) - INTR(1, atpic_intr1) - INTR(2, atpic_intr2) - INTR(3, atpic_intr3) - INTR(4, atpic_intr4) - INTR(5, atpic_intr5) - INTR(6, atpic_intr6) - INTR(7, atpic_intr7) - INTR(8, atpic_intr8) - INTR(9, atpic_intr9) - INTR(10, atpic_intr10) - INTR(11, atpic_intr11) - INTR(12, atpic_intr12) - INTR(13, atpic_intr13) - INTR(14, atpic_intr14) - INTR(15, atpic_intr15) + INTR 0, atpic_intr0 + INTR 1, atpic_intr1 + INTR 2, atpic_intr2 + INTR 3, atpic_intr3 + INTR 4, atpic_intr4 + INTR 5, atpic_intr5 + INTR 6, atpic_intr6 + INTR 7, atpic_intr7 + INTR 8, atpic_intr8 + INTR 9, atpic_intr9 + INTR 10, atpic_intr10 + INTR 11, atpic_intr11 + INTR 12, atpic_intr12 + INTR 13, atpic_intr13 + INTR 14, atpic_intr14 + INTR 15, atpic_intr15 Index: sys/amd64/amd64/cpu_switch.S =================================================================== --- sys/amd64/amd64/cpu_switch.S +++ sys/amd64/amd64/cpu_switch.S @@ -215,8 +215,10 @@ movq %r8,PCPU(RSP0) movq %r8,PCPU(CURPCB) /* Update the TSS_RSP0 pointer for the next interrupt */ + cmpb $0,pti(%rip) + jne 1f movq %r8,TSS_RSP0(%rdx) - movq %r12,PCPU(CURTHREAD) /* into next thread */ +1: movq %r12,PCPU(CURTHREAD) /* into next thread */ /* Test if debug registers should be restored. */ testl $PCB_DBREGS,PCB_FLAGS(%r8) @@ -293,7 +295,12 @@ shrq $8,%rcx movl %ecx,8(%rax) movb $0x89,5(%rax) /* unset busy */ - movl $TSSSEL,%eax + cmpb $0,pti(%rip) + je 1f + movq PCPU(PRVSPACE),%rax + addq $PC_PTI_STACK+PC_PTI_STACK_SZ*8,%rax + movq %rax,TSS_RSP0(%rdx) +1: movl $TSSSEL,%eax ltr %ax jmp done_tss Index: sys/amd64/amd64/exception.S =================================================================== --- sys/amd64/amd64/exception.S +++ sys/amd64/amd64/exception.S @@ -1,12 +1,16 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. - * Copyright (c) 2007 The FreeBSD Foundation + * Copyright (c) 2007-2018 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * + * Portions of this software were developed by + * Konstantin Belousov under sponsorship from + * the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -38,13 +42,13 @@ #include "opt_compat.h" #include "opt_hwpmc_hooks.h" +#include "assym.s" + #include #include #include #include -#include "assym.s" - #ifdef KDTRACE_HOOKS .bss .globl dtrace_invop_jump_addr @@ -100,68 +104,62 @@ MCOUNT_LABEL(user) MCOUNT_LABEL(btrap) -/* Traps that we leave interrupts disabled for.. */ -#define TRAP_NOEN(a) \ - subq $TF_RIP,%rsp; \ - movl $(a),TF_TRAPNO(%rsp) ; \ - movq $0,TF_ADDR(%rsp) ; \ - movq $0,TF_ERR(%rsp) ; \ +/* Traps that we leave interrupts disabled for. */ + .macro TRAP_NOEN l, trapno + PTI_ENTRY \l,X\l + .globl X\l + .type X\l,@function +X\l: subq $TF_RIP,%rsp + movl $\trapno,TF_TRAPNO(%rsp) + movq $0,TF_ADDR(%rsp) + movq $0,TF_ERR(%rsp) jmp alltraps_noen -IDTVEC(dbg) - TRAP_NOEN(T_TRCTRAP) -IDTVEC(bpt) - TRAP_NOEN(T_BPTFLT) + .endm + + TRAP_NOEN dbg, T_TRCTRAP + TRAP_NOEN bpt, T_BPTFLT #ifdef KDTRACE_HOOKS -IDTVEC(dtrace_ret) - TRAP_NOEN(T_DTRACE_RET) + TRAP_NOEN dtrace_ret, T_DTRACE_RET #endif /* Regular traps; The cpu does not supply tf_err for these. */ -#define TRAP(a) \ - subq $TF_RIP,%rsp; \ - movl $(a),TF_TRAPNO(%rsp) ; \ - movq $0,TF_ADDR(%rsp) ; \ - movq $0,TF_ERR(%rsp) ; \ + .macro TRAP l, trapno + PTI_ENTRY \l,X\l + .globl X\l + .type X\l,@function +X\l: + subq $TF_RIP,%rsp + movl $\trapno,TF_TRAPNO(%rsp) + movq $0,TF_ADDR(%rsp) + movq $0,TF_ERR(%rsp) jmp alltraps -IDTVEC(div) - TRAP(T_DIVIDE) -IDTVEC(ofl) - TRAP(T_OFLOW) -IDTVEC(bnd) - TRAP(T_BOUND) -IDTVEC(ill) - TRAP(T_PRIVINFLT) -IDTVEC(dna) - TRAP(T_DNA) -IDTVEC(fpusegm) - TRAP(T_FPOPFLT) -IDTVEC(mchk) - TRAP(T_MCHK) -IDTVEC(rsvd) - TRAP(T_RESERVED) -IDTVEC(fpu) - TRAP(T_ARITHTRAP) -IDTVEC(xmm) - TRAP(T_XMMFLT) - -/* This group of traps have tf_err already pushed by the cpu */ -#define TRAP_ERR(a) \ - subq $TF_ERR,%rsp; \ - movl $(a),TF_TRAPNO(%rsp) ; \ - movq $0,TF_ADDR(%rsp) ; \ + .endm + + TRAP div, T_DIVIDE + TRAP ofl, T_OFLOW + TRAP bnd, T_BOUND + TRAP ill, T_PRIVINFLT + TRAP dna, T_DNA + TRAP fpusegm, T_FPOPFLT + TRAP mchk, T_MCHK + TRAP rsvd, T_RESERVED + TRAP fpu, T_ARITHTRAP + TRAP xmm, T_XMMFLT + +/* This group of traps have tf_err already pushed by the cpu. */ + .macro TRAP_ERR l, trapno + PTI_ENTRY \l,X\l,has_err=1 + .globl X\l + .type X\l,@function +X\l: + subq $TF_ERR,%rsp + movl $\trapno,TF_TRAPNO(%rsp) + movq $0,TF_ADDR(%rsp) jmp alltraps -IDTVEC(tss) - TRAP_ERR(T_TSSFLT) -IDTVEC(missing) - subq $TF_ERR,%rsp - movl $T_SEGNPFLT,TF_TRAPNO(%rsp) - jmp prot_addrf -IDTVEC(stk) - subq $TF_ERR,%rsp - movl $T_STKFLT,TF_TRAPNO(%rsp) - jmp prot_addrf -IDTVEC(align) - TRAP_ERR(T_ALIGNFLT) + .endm + + TRAP_ERR tss, T_TSSFLT + TRAP_ERR align, T_ALIGNFLT /* * alltraps entry point. Use swapgs if this is the first time in the @@ -174,15 +172,12 @@ alltraps: movq %rdi,TF_RDI(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ - jz alltraps_testi /* already running with kernel GS.base */ + jz alltraps_segs /* already running with kernel GS.base */ swapgs movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) -alltraps_testi: +alltraps_segs: + SAVE_SEGS testl $PSL_I,TF_RFLAGS(%rsp) jz alltraps_pushregs_no_rdi sti @@ -249,14 +244,12 @@ alltraps_noen: movq %rdi,TF_RDI(%rsp) testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ - jz 1f /* already running with kernel GS.base */ + jz alltraps_noen_segs /* already running with kernel GS.base */ swapgs movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) -1: movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) +alltraps_noen_segs: + SAVE_SEGS jmp alltraps_pushregs_no_rdi IDTVEC(dblfault) @@ -279,37 +272,36 @@ movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) + SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) cld testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ jz 1f /* already running with kernel GS.base */ swapgs 1: - movq %rsp,%rdi + movq PCPU(KCR3),%rax + cmpq $~0,%rax + je 2f + movq %rax,%cr3 +2: movq %rsp,%rdi call dblfault_handler -2: - hlt - jmp 2b +3: hlt + jmp 3b + PTI_ENTRY page, Xpage, has_err=1 IDTVEC(page) subq $TF_ERR,%rsp - movl $T_PAGEFLT,TF_TRAPNO(%rsp) movq %rdi,TF_RDI(%rsp) /* free up a GP register */ testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ - jz 1f /* already running with kernel GS.base */ + jz page_cr2 /* already running with kernel GS.base */ swapgs movq PCPU(CURPCB),%rdi andl $~PCB_FULL_IRET,PCB_FLAGS(%rdi) -1: movq %cr2,%rdi /* preserve %cr2 before .. */ +page_cr2: + movq %cr2,%rdi /* preserve %cr2 before .. */ movq %rdi,TF_ADDR(%rsp) /* enabling interrupts. */ - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) + SAVE_SEGS + movl $T_PAGEFLT,TF_TRAPNO(%rsp) testl $PSL_I,TF_RFLAGS(%rsp) jz alltraps_pushregs_no_rdi sti @@ -320,10 +312,43 @@ * the iretq stage, we'll reenter with the wrong gs state. We'll have * to do a special the swapgs in this case even coming from the kernel. * XXX linux has a trap handler for their equivalent of load_gs(). + * + * On the stack, we have the hardware interrupt frame to return + * to usermode (faulted) and another frame with error code, for + * fault. For PTI, copy both frames to the main thread stack. */ -IDTVEC(prot) + .macro PROTF_ENTRY name,trapno +\name\()_pti_doreti: + pushq %rax + pushq %rdx + swapgs + movq PCPU(KCR3),%rax + movq %rax,%cr3 + movq PCPU(RSP0),%rax + subq $2*PTI_SIZE-3*8,%rax + MOVE_STACKS (PTI_SIZE / 4 - 3) + movq %rax,%rsp + popq %rdx + popq %rax + swapgs + jmp X\name +IDTVEC(\name\()_pti) + cmpq $doreti_iret,PTI_RIP-2*8(%rsp) + je \name\()_pti_doreti + testb $SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */ + jz X\name + PTI_UENTRY has_err=1 + swapgs +IDTVEC(\name) subq $TF_ERR,%rsp - movl $T_PROTFLT,TF_TRAPNO(%rsp) + movl $\trapno,TF_TRAPNO(%rsp) + jmp prot_addrf + .endm + + PROTF_ENTRY missing, T_SEGNPFLT + PROTF_ENTRY stk, T_STKFLT + PROTF_ENTRY prot, T_PROTFLT + prot_addrf: movq $0,TF_ADDR(%rsp) movq %rdi,TF_RDI(%rsp) /* free up a GP register */ @@ -375,8 +400,18 @@ * We do not support invoking this from a custom segment registers, * esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT. */ + SUPERALIGN_TEXT +IDTVEC(fast_syscall_pti) + swapgs + movq %rax,PCPU(SCRATCH_RAX) + movq PCPU(KCR3),%rax + movq %rax,%cr3 + jmp fast_syscall_common + SUPERALIGN_TEXT IDTVEC(fast_syscall) swapgs + movq %rax,PCPU(SCRATCH_RAX) +fast_syscall_common: movq %rsp,PCPU(SCRATCH_RSP) movq PCPU(RSP0),%rsp /* Now emulate a trapframe. Make the 8 byte alignment odd for call. */ @@ -386,10 +421,9 @@ movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */ movq PCPU(SCRATCH_RSP),%r11 /* %r11 already saved */ movq %r11,TF_RSP(%rsp) /* user stack pointer */ - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) + movq PCPU(SCRATCH_RAX),%rax + movq %rax,TF_RAX(%rsp) /* syscall number */ + SAVE_SEGS movq PCPU(CURPCB),%r11 andl $~PCB_FULL_IRET,PCB_FLAGS(%r11) sti @@ -402,7 +436,6 @@ movq %r10,TF_RCX(%rsp) /* arg 4 */ movq %r8,TF_R8(%rsp) /* arg 5 */ movq %r9,TF_R9(%rsp) /* arg 6 */ - movq %rax,TF_RAX(%rsp) /* syscall number */ movq %rbx,TF_RBX(%rsp) /* C preserved */ movq %rbp,TF_RBP(%rsp) /* C preserved */ movq %r12,TF_R12(%rsp) /* C preserved */ @@ -420,11 +453,11 @@ /* Disable interrupts before testing PCB_FULL_IRET. */ cli testl $PCB_FULL_IRET,PCB_FLAGS(%rax) - jnz 3f + jnz 4f /* Check for and handle AST's on return to userland. */ movq PCPU(CURTHREAD),%rax testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax) - jne 2f + jne 3f /* Restore preserved registers. */ MEXITCOUNT movq TF_RDI(%rsp),%rdi /* bonus; preserve arg 1 */ @@ -434,16 +467,21 @@ movq TF_RFLAGS(%rsp),%r11 /* original %rflags */ movq TF_RIP(%rsp),%rcx /* original %rip */ movq TF_RSP(%rsp),%rsp /* user stack pointer */ - swapgs + cmpb $0,pti + je 2f + movq PCPU(UCR3),%r9 + movq %r9,%cr3 + xorl %r9d,%r9d +2: swapgs sysretq -2: /* AST scheduled. */ +3: /* AST scheduled. */ sti movq %rsp,%rdi call ast jmp 1b -3: /* Requested full context restore, use doreti for that. */ +4: /* Requested full context restore, use doreti for that. */ MEXITCOUNT jmp doreti @@ -499,17 +537,15 @@ movq %r13,TF_R13(%rsp) movq %r14,TF_R14(%rsp) movq %r15,TF_R15(%rsp) - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) + SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) cld xorl %ebx,%ebx testb $SEL_RPL_MASK,TF_CS(%rsp) jnz nmi_fromuserspace /* - * We've interrupted the kernel. Preserve GS.base in %r12. + * We've interrupted the kernel. Preserve GS.base in %r12 + * and %cr3 in %r13. */ movl $MSR_GSBASE,%ecx rdmsr @@ -521,27 +557,38 @@ movl %edx,%eax shrq $32,%rdx wrmsr + movq %cr3,%r13 + movq PCPU(KCR3),%rax + cmpq $~0,%rax + je nmi_calltrap + movq %rax,%cr3 jmp nmi_calltrap nmi_fromuserspace: incl %ebx swapgs - testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) - jz 2f + movq %cr3,%r13 + movq PCPU(KCR3),%rax + cmpq $~0,%rax + je 1f + movq %rax,%cr3 movq PCPU(CURPCB),%rdi testq %rdi,%rdi - jz 2f + jz 3f + orl $PCB_FULL_IRET,PCB_FLAGS(%rdi) +1: testb $CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip) + jz 3f cmpw $KUF32SEL,TF_FS(%rsp) - jne 1f + jne 2f rdfsbase %rax movq %rax,PCB_FSBASE(%rdi) -1: cmpw $KUG32SEL,TF_GS(%rsp) - jne 2f +2: cmpw $KUG32SEL,TF_GS(%rsp) + jne 3f movl $MSR_KGSBASE,%ecx rdmsr shlq $32,%rdx orq %rdx,%rax movq %rax,PCB_GSBASE(%rdi) -2: +3: /* Note: this label is also used by ddb and gdb: */ nmi_calltrap: FAKE_MCOUNT(TF_RIP(%rsp)) @@ -564,26 +611,29 @@ movq PCPU(CURTHREAD),%rax orq %rax,%rax /* curthread present? */ jz nocallchain - testl $TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */ - jz nocallchain /* - * A user callchain is to be captured, so: - * - Move execution to the regular kernel stack, to allow for - * nested NMI interrupts. - * - Take the processor out of "NMI" mode by faking an "iret". - * - Enable interrupts, so that copyin() can work. + * Move execution to the regular kernel stack, because we + * committed to return through doreti. */ movq %rsp,%rsi /* source stack pointer */ movq $TF_SIZE,%rcx movq PCPU(RSP0),%rdx subq %rcx,%rdx movq %rdx,%rdi /* destination stack pointer */ - shrq $3,%rcx /* trap frame size in long words */ cld rep movsq /* copy trapframe */ + movq %rdx,%rsp /* we are on the regular kstack */ + testl $TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */ + jz nocallchain + /* + * A user callchain is to be captured, so: + * - Take the processor out of "NMI" mode by faking an "iret", + * to allow for nested NMI interrupts. + * - Enable interrupts, so that copyin() can work. + */ movl %ss,%eax pushq %rax /* tf_ss */ pushq %rdx /* tf_rsp (on kernel stack) */ @@ -624,22 +674,9 @@ movl %edx,%eax shrq $32,%rdx wrmsr + movq %r13,%cr3 nmi_restoreregs: - movq TF_RDI(%rsp),%rdi - movq TF_RSI(%rsp),%rsi - movq TF_RDX(%rsp),%rdx - movq TF_RCX(%rsp),%rcx - movq TF_R8(%rsp),%r8 - movq TF_R9(%rsp),%r9 - movq TF_RAX(%rsp),%rax - movq TF_RBX(%rsp),%rbx - movq TF_RBP(%rsp),%rbp - movq TF_R10(%rsp),%r10 - movq TF_R11(%rsp),%r11 - movq TF_R12(%rsp),%r12 - movq TF_R13(%rsp),%r13 - movq TF_R14(%rsp),%r14 - movq TF_R15(%rsp),%r15 + RESTORE_REGS addq $TF_RIP,%rsp jmp doreti_iret @@ -807,27 +844,38 @@ ld_ds: movw TF_DS(%rsp),%ds ld_regs: - movq TF_RDI(%rsp),%rdi - movq TF_RSI(%rsp),%rsi - movq TF_RDX(%rsp),%rdx - movq TF_RCX(%rsp),%rcx - movq TF_R8(%rsp),%r8 - movq TF_R9(%rsp),%r9 - movq TF_RAX(%rsp),%rax - movq TF_RBX(%rsp),%rbx - movq TF_RBP(%rsp),%rbp - movq TF_R10(%rsp),%r10 - movq TF_R11(%rsp),%r11 - movq TF_R12(%rsp),%r12 - movq TF_R13(%rsp),%r13 - movq TF_R14(%rsp),%r14 - movq TF_R15(%rsp),%r15 + RESTORE_REGS testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ - jz 1f /* keep running with kernel GS.base */ + jz 2f /* keep running with kernel GS.base */ cli + cmpb $0,pti + je 1f + pushq %rdx + movq PCPU(PRVSPACE),%rdx + addq $PC_PTI_STACK+PC_PTI_STACK_SZ*8-PTI_SIZE,%rdx + movq %rax,PTI_RAX(%rdx) + popq %rax + movq %rax,PTI_RDX(%rdx) + movq TF_RIP(%rsp),%rax + movq %rax,PTI_RIP(%rdx) + movq TF_CS(%rsp),%rax + movq %rax,PTI_CS(%rdx) + movq TF_RFLAGS(%rsp),%rax + movq %rax,PTI_RFLAGS(%rdx) + movq TF_RSP(%rsp),%rax + movq %rax,PTI_RSP(%rdx) + movq TF_SS(%rsp),%rax + movq %rax,PTI_SS(%rdx) + movq PCPU(UCR3),%rax swapgs -1: - addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */ + movq %rdx,%rsp + movq %rax,%cr3 + popq %rdx + popq %rax + addq $8,%rsp + jmp doreti_iret +1: swapgs +2: addq $TF_RIP,%rsp .globl doreti_iret doreti_iret: iretq @@ -851,14 +899,11 @@ .globl doreti_iret_fault doreti_iret_fault: subq $TF_RIP,%rsp /* space including tf_err, tf_trapno */ - testl $PSL_I,TF_RFLAGS(%rsp) + testb $SEL_RPL_MASK,TF_CS(%rsp) jz 1f sti 1: - movw %fs,TF_FS(%rsp) - movw %gs,TF_GS(%rsp) - movw %es,TF_ES(%rsp) - movw %ds,TF_DS(%rsp) + SAVE_SEGS movl $TF_HASSEGS,TF_FLAGS(%rsp) movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) @@ -885,7 +930,7 @@ .globl ds_load_fault ds_load_fault: movl $T_PROTFLT,TF_TRAPNO(%rsp) - testl $PSL_I,TF_RFLAGS(%rsp) + testb $SEL_RPL_MASK,TF_CS(%rsp) jz 1f sti 1: Index: sys/amd64/amd64/genassym.c =================================================================== --- sys/amd64/amd64/genassym.c +++ sys/amd64/amd64/genassym.c @@ -186,6 +186,16 @@ ASSYM(TF_SIZE, sizeof(struct trapframe)); ASSYM(TF_HASSEGS, TF_HASSEGS); +ASSYM(PTI_RDX, offsetof(struct pti_frame, pti_rdx)); +ASSYM(PTI_RAX, offsetof(struct pti_frame, pti_rax)); +ASSYM(PTI_ERR, offsetof(struct pti_frame, pti_err)); +ASSYM(PTI_RIP, offsetof(struct pti_frame, pti_rip)); +ASSYM(PTI_CS, offsetof(struct pti_frame, pti_cs)); +ASSYM(PTI_RFLAGS, offsetof(struct pti_frame, pti_rflags)); +ASSYM(PTI_RSP, offsetof(struct pti_frame, pti_rsp)); +ASSYM(PTI_SS, offsetof(struct pti_frame, pti_ss)); +ASSYM(PTI_SIZE, sizeof(struct pti_frame)); + ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags)); @@ -202,6 +212,7 @@ ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp)); +ASSYM(PC_SCRATCH_RAX, offsetof(struct pcpu, pc_scratch_rax)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp)); ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0)); @@ -211,6 +222,10 @@ ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp)); ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss)); ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt)); +ASSYM(PC_KCR3, offsetof(struct pcpu, pc_kcr3)); +ASSYM(PC_UCR3, offsetof(struct pcpu, pc_ucr3)); +ASSYM(PC_PTI_STACK, offsetof(struct pcpu, pc_pti_stack)); +ASSYM(PC_PTI_STACK_SZ, PC_PTI_STACK_SZ); ASSYM(LA_EOI, LAPIC_EOI * LAPIC_MEM_MUL); ASSYM(LA_ISR, LAPIC_ISR0 * LAPIC_MEM_MUL); Index: sys/amd64/amd64/machdep.c =================================================================== --- sys/amd64/amd64/machdep.c +++ sys/amd64/amd64/machdep.c @@ -658,9 +658,9 @@ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ -static char dblfault_stack[PAGE_SIZE] __aligned(16); +char dblfault_stack[PAGE_SIZE] __aligned(16); -static char nmi0_stack[PAGE_SIZE] __aligned(16); +char nmi0_stack[PAGE_SIZE] __aligned(16); CTASSERT(sizeof(struct nmi_pcpu) == 16); struct amd64tss common_tss[MAXCPU]; @@ -813,13 +813,20 @@ IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault), + IDTVEC(div_pti), IDTVEC(dbg_pti), IDTVEC(bpt_pti), + IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti), + IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti), + IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(mchk_pti), + IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti), + IDTVEC(xmm_pti), #ifdef KDTRACE_HOOKS - IDTVEC(dtrace_ret), + IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti), #endif #ifdef XENHVM - IDTVEC(xen_intr_upcall), + IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti), #endif - IDTVEC(fast_syscall), IDTVEC(fast_syscall32); + IDTVEC(fast_syscall), IDTVEC(fast_syscall32), + IDTVEC(fast_syscall_pti); #ifdef DDB /* @@ -1520,7 +1527,8 @@ msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); - wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); + wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) : + (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); @@ -1536,6 +1544,7 @@ struct pcpu *pc; struct nmi_pcpu *np; struct xstate_hdr *xhdr; + u_int64_t rsp0; char *env; size_t kstack0_sz; int late_console; @@ -1609,34 +1618,54 @@ mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF); /* exceptions */ + TUNABLE_INT_FETCH("vm.pmap.pti", &pti); + for (x = 0; x < NIDT; x++) - setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); + setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_DB, pti ? &IDTVEC(dbg_pti) : &IDTVEC(dbg), SDT_SYSIGT, + SEL_KPL, 0); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2); - setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); - setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); + setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT, + SEL_UPL, 0); + setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT, + SEL_KPL, 0); setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); - setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); - setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); + setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm), + SDT_SYSIGT, SEL_KPL, 0); + setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing), + SDT_SYSIGT, SEL_KPL, 0); + setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_MC, pti ? &IDTVEC(mchk_pti) : &IDTVEC(mchk), SDT_SYSIGT, + SEL_KPL, 0); + setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT, + SEL_KPL, 0); #ifdef KDTRACE_HOOKS - setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); + setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) : + &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0); #endif #ifdef XENHVM setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_UPL, 0); #endif - r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; lidt(&r_idt); @@ -1750,10 +1779,12 @@ xhdr->xstate_bv = xsave_mask; } /* make an initial tss so cpu can get interrupt stack on syscall! */ - common_tss[0].tss_rsp0 = (vm_offset_t)thread0.td_pcb; + rsp0 = (vm_offset_t)thread0.td_pcb; /* Ensure the stack is aligned to 16 bytes */ - common_tss[0].tss_rsp0 &= ~0xFul; - PCPU_SET(rsp0, common_tss[0].tss_rsp0); + rsp0 &= ~0xFul; + common_tss[0].tss_rsp0 = pti ? ((vm_offset_t)PCPU_PTR(pti_stack) + + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : rsp0; + PCPU_SET(rsp0, rsp0); PCPU_SET(curpcb, thread0.td_pcb); /* transfer to user mode */ Index: sys/amd64/amd64/mp_machdep.c =================================================================== --- sys/amd64/amd64/mp_machdep.c +++ sys/amd64/amd64/mp_machdep.c @@ -132,33 +132,40 @@ /* Install an inter-CPU IPI for TLB invalidation */ if (pmap_pcid_enabled) { if (invpcid_works) { - setidt(IPI_INVLTLB, IDTVEC(invltlb_invpcid), - SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_invpcid_pti) : + IDTVEC(invltlb_invpcid), SDT_SYSIGT, SEL_KPL, 0); } else { - setidt(IPI_INVLTLB, IDTVEC(invltlb_pcid), SDT_SYSIGT, - SEL_KPL, 0); + setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pcid_pti) : + IDTVEC(invltlb_pcid), SDT_SYSIGT, SEL_KPL, 0); } } else { - setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLTLB, pti ? IDTVEC(invltlb_pti) : IDTVEC(invltlb), + SDT_SYSIGT, SEL_KPL, 0); } - setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0); - setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLPG, pti ? IDTVEC(invlpg_pti) : IDTVEC(invlpg), + SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLRNG, pti ? IDTVEC(invlrng_pti) : IDTVEC(invlrng), + SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for cache invalidation. */ - setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_INVLCACHE, pti ? IDTVEC(invlcache_pti) : IDTVEC(invlcache), + SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for all-CPU rendezvous */ - setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_RENDEZVOUS, pti ? IDTVEC(rendezvous_pti) : + IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0); /* Install generic inter-CPU IPI handler */ - setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), - SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_BITMAP_VECTOR, pti ? IDTVEC(ipi_intr_bitmap_handler_pti) : + IDTVEC(ipi_intr_bitmap_handler), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU stop/restart */ - setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_STOP, pti ? IDTVEC(cpustop_pti) : IDTVEC(cpustop), + SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU suspend/resume */ - setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0); + setidt(IPI_SUSPEND, pti ? IDTVEC(cpususpend_pti) : IDTVEC(cpususpend), + SDT_SYSIGT, SEL_KPL, 0); /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { @@ -197,7 +204,6 @@ /* Init tss */ common_tss[cpu] = common_tss[0]; - common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */ common_tss[cpu].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE; common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE]; @@ -240,6 +246,8 @@ pc->pc_curpmap = kernel_pmap; pc->pc_pcid_gen = 1; pc->pc_pcid_next = PMAP_PCID_KERN + 1; + common_tss[cpu].tss_rsp0 = pti ? ((vm_offset_t)&pc->pc_pti_stack + + PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : 0; /* Save the per-cpu pointer for use by the NMI handler. */ np->np_pcpu = (register_t) pc; Index: sys/amd64/amd64/pmap.c =================================================================== --- sys/amd64/amd64/pmap.c +++ sys/amd64/amd64/pmap.c @@ -11,11 +11,17 @@ * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. + * Copyright (c) 2014-2018 The FreeBSD Foundation + * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * + * Portions of this software were developed by + * Konstantin Belousov under sponsorship from + * the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -149,6 +155,7 @@ #ifdef SMP #include #endif +#include static __inline boolean_t pmap_type_guest(pmap_t pmap) @@ -217,7 +224,7 @@ switch (pmap->pm_type) { case PT_X86: - mask = X86_PG_G; + mask = pg_g; break; case PT_RVI: case PT_EPT: @@ -340,7 +347,7 @@ static int ndmpdp; vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; -pt_entry_t pg_nx; +pt_entry_t pg_nx, pg_g; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); @@ -405,6 +412,15 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); +int pti = 1; +SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &pti, 0, + "Page Table Isolation enabled"); +static vm_object_t pti_obj; +static pml4_entry_t *pti_pml4; +static vm_pindex_t pti_pg_idx; +static bool pti_finalized; + static int pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) { @@ -639,6 +655,11 @@ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); +static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, + bool exec); +static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); +static pd_entry_t *pmap_pti_pde(vm_offset_t va); +static void pmap_pti_wire_pte(void *pte); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, @@ -921,7 +942,7 @@ /* XXX not fully used, underneath 2M pages */ pt_p = (pt_entry_t *)KPTphys; for (i = 0; ptoa(i) < *firstaddr; i++) - pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G; + pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | pg_g; /* Now map the page tables at their location within PTmap */ pd_p = (pd_entry_t *)KPDphys; @@ -932,7 +953,7 @@ /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS | - X86_PG_G; + pg_g; /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); @@ -952,14 +973,14 @@ for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ - pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | + pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ - pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | + pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } for (j = 0; i < ndmpdp; i++, j++) { @@ -1027,6 +1048,8 @@ load_cr3(KPML4phys); if (cpu_stdext_feature & CPUID_STDEXT_SMEP) load_cr4(rcr4() | CR4_SMEP); + if (!pti) + pg_g = X86_PG_G; /* * Initialize the kernel pmap (which is statically allocated). @@ -1071,6 +1094,8 @@ pmap_init_pat(); /* Initialize TLB Context Id. */ + if (pti) + pmap_pcid_enabled = 0; TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { /* Check for INVPCID support */ @@ -1088,8 +1113,6 @@ * during pcpu setup. */ load_cr4(rcr4() | CR4_PCIDE); - } else { - pmap_pcid_enabled = 0; } } @@ -2114,7 +2137,7 @@ pt_entry_t *pte; pte = vtopte(va); - pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G); + pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g); } static __inline void @@ -2125,7 +2148,7 @@ pte = vtopte(va); cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); - pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits); + pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits); } /* @@ -2185,7 +2208,7 @@ pa = VM_PAGE_TO_PHYS(m) | cache_bits; if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; - pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V); + pte_store(pte, pa | pg_g | X86_PG_RW | X86_PG_V); } pte++; } @@ -2306,6 +2329,10 @@ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; + if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { + pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; + *pml4 = 0; + } } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; @@ -2364,7 +2391,9 @@ PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); + pmap->pm_pml4u = NULL; pmap->pm_cr3 = KPML4phys; + pmap->pm_ucr3 = ~0UL; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); @@ -2373,6 +2402,8 @@ CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; + if (!pti) + __pcpu[i].pc_kcr3 = ~0ul; } PCPU_SET(curpmap, kernel_pmap); pmap_activate(curthread); @@ -2402,6 +2433,17 @@ X86_PG_A | X86_PG_M; } +static void +pmap_pinit_pml4_pti(vm_page_t pml4pg) +{ + pml4_entry_t *pm_pml4; + int i; + + pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); + for (i = 0; i < NPML4EPG; i++) + pm_pml4[i] = pti_pml4[i]; +} + /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. @@ -2409,7 +2451,7 @@ int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { - vm_page_t pml4pg; + vm_page_t pml4pg, pml4pgu; vm_paddr_t pml4phys; int i; @@ -2425,8 +2467,10 @@ pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } - pmap->pm_cr3 = ~0; /* initialize to an invalid value */ + pmap->pm_cr3 = ~0l; /* initialize to an invalid value */ + pmap->pm_pml4u = NULL; + pmap->pm_type = pm_type; if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); @@ -2434,10 +2478,19 @@ * Do not install the host kernel mappings in the nested page * tables. These mappings are meaningless in the guest physical * address space. + * Install minimal kernel mappings in PTI case. */ - if ((pmap->pm_type = pm_type) == PT_X86) { + if (pm_type == PT_X86) { pmap->pm_cr3 = pml4phys; pmap_pinit_pml4(pml4pg); + if (pti) { + pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | + VM_ALLOC_NOOBJ | VM_ALLOC_WAITOK); + pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP( + VM_PAGE_TO_PHYS(pml4pgu)); + pmap_pinit_pml4_pti(pml4pgu); + pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); + } } pmap->pm_root.rt_root = 0; @@ -2509,13 +2562,18 @@ */ if (ptepindex >= (NUPDE + NUPDPE)) { - pml4_entry_t *pml4; + pml4_entry_t *pml4, *pml4u; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; + if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) { + pml4u = &pmap->pm_pml4u[pml4index]; + *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | + PG_A | PG_M; + } } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; @@ -2716,6 +2774,11 @@ m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); + + if (pmap->pm_pml4u != NULL) { + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u)); + vm_page_free(m); + } } static int @@ -7141,6 +7204,10 @@ } else if (cr3 != pmap->pm_cr3) { load_cr3(pmap->pm_cr3); PCPU_SET(curpmap, pmap); + if (pti) { + PCPU_SET(kcr3, pmap->pm_cr3); + PCPU_SET(ucr3, pmap->pm_ucr3); + } } #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); @@ -7464,6 +7531,281 @@ mtx_unlock_spin(&qframe_mtx); } +static vm_page_t +pmap_pti_alloc_page(void) +{ + vm_page_t m; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY | + VM_ALLOC_WIRED | VM_ALLOC_ZERO); + return (m); +} + +extern char kernphys[], etext[], dblfault_stack[], nmi0_stack[]; + +static void +pmap_pti_init(void) +{ + vm_page_t pml4_pg; + pdp_entry_t *pdpe; + vm_offset_t va; + int i; + + if (!pti) + return; + pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); + VM_OBJECT_WLOCK(pti_obj); + pml4_pg = pmap_pti_alloc_page(); + pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); + for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && + va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { + pdpe = pmap_pti_pdpe(va); + pmap_pti_wire_pte(pdpe); + } + pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], + (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); + pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt + + sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false); + pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + + sizeof(struct gate_descriptor) * NIDT, false); + pmap_pti_add_kva_locked((vm_offset_t)common_tss, + (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false); + CPU_FOREACH(i) { + /* Doublefault stack IST 1 */ + va = common_tss[i].tss_ist1; + pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); + /* NMI stack IST 2 */ + va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu); + pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); + } + pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, + (vm_offset_t)etext, true); + pti_finalized = true; + VM_OBJECT_WUNLOCK(pti_obj); +} +SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL); + +static pdp_entry_t * +pmap_pti_pdpe(vm_offset_t va) +{ + pml4_entry_t *pml4e; + pdp_entry_t *pdpe; + vm_page_t m; + vm_pindex_t pml4_idx; + vm_paddr_t mphys; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + + pml4_idx = pmap_pml4e_index(va); + pml4e = &pti_pml4[pml4_idx]; + m = NULL; + if (*pml4e == 0) { + if (pti_finalized) + panic("pml4 alloc after finalization\n"); + m = pmap_pti_alloc_page(); + if (*pml4e != 0) { + vm_page_free_zero(m); + mphys = *pml4e & ~PAGE_MASK; + } else { + mphys = VM_PAGE_TO_PHYS(m); + *pml4e = mphys | X86_PG_RW | X86_PG_V; + } + } else { + mphys = *pml4e & ~PAGE_MASK; + } + pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); + return (pdpe); +} + +static void +pmap_pti_wire_pte(void *pte) +{ + vm_page_t m; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); + m->wire_count++; +} + +static void +pmap_pti_unwire_pde(void *pde, bool only_ref) +{ + vm_page_t m; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); + MPASS(m->wire_count > 0); + MPASS(only_ref || m->wire_count > 1); + m->wire_count--; + if (m->wire_count == 0) + vm_page_free_zero(m); +} + +static void +pmap_pti_unwire_pte(void *pte, vm_offset_t va) +{ + vm_page_t m; + pd_entry_t *pde; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); + MPASS(m->wire_count > 0); + m->wire_count--; + if (m->wire_count == 0) { + vm_page_free_zero(m); + pde = pmap_pti_pde(va); + MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); + *pde = 0; + pmap_pti_unwire_pde(pde, false); + } +} + +static pd_entry_t * +pmap_pti_pde(vm_offset_t va) +{ + pdp_entry_t *pdpe; + pd_entry_t *pde; + vm_page_t m; + vm_pindex_t pd_idx; + vm_paddr_t mphys; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + + pdpe = pmap_pti_pdpe(va); + if (*pdpe == 0) { + m = pmap_pti_alloc_page(); + if (*pdpe != 0) { + vm_page_free_zero(m); + MPASS((*pdpe & X86_PG_PS) == 0); + mphys = *pdpe & ~PAGE_MASK; + } else { + mphys = VM_PAGE_TO_PHYS(m); + *pdpe = mphys | X86_PG_RW | X86_PG_V; + } + } else { + MPASS((*pdpe & X86_PG_PS) == 0); + mphys = *pdpe & ~PAGE_MASK; + } + + pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); + pd_idx = pmap_pde_index(va); + pde += pd_idx; + return (pde); +} + +static pt_entry_t * +pmap_pti_pte(vm_offset_t va, bool *unwire_pde) +{ + pd_entry_t *pde; + pt_entry_t *pte; + vm_page_t m; + vm_paddr_t mphys; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + + pde = pmap_pti_pde(va); + if (unwire_pde != NULL) { + *unwire_pde = true; + pmap_pti_wire_pte(pde); + } + if (*pde == 0) { + m = pmap_pti_alloc_page(); + if (*pde != 0) { + vm_page_free_zero(m); + MPASS((*pde & X86_PG_PS) == 0); + mphys = *pde & ~(PAGE_MASK | pg_nx); + } else { + mphys = VM_PAGE_TO_PHYS(m); + *pde = mphys | X86_PG_RW | X86_PG_V; + if (unwire_pde != NULL) + *unwire_pde = false; + } + } else { + MPASS((*pde & X86_PG_PS) == 0); + mphys = *pde & ~(PAGE_MASK | pg_nx); + } + + pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); + pte += pmap_pte_index(va); + + return (pte); +} + +static void +pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) +{ + vm_paddr_t pa; + pd_entry_t *pde; + pt_entry_t *pte, ptev; + bool unwire_pde; + + VM_OBJECT_ASSERT_WLOCKED(pti_obj); + + sva = trunc_page(sva); + MPASS(sva > VM_MAXUSER_ADDRESS); + eva = round_page(eva); + MPASS(sva < eva); + for (; sva < eva; sva += PAGE_SIZE) { + pte = pmap_pti_pte(sva, &unwire_pde); + pa = pmap_kextract(sva); + ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | + (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap, + VM_MEMATTR_DEFAULT, FALSE); + if (*pte == 0) { + pte_store(pte, ptev); + pmap_pti_wire_pte(pte); + } else { + KASSERT(!pti_finalized, + ("pti overlap after fin %#lx %#lx %#lx", + sva, *pte, ptev)); + KASSERT(*pte == ptev, + ("pti non-identical pte after fin %#lx %#lx %#lx", + sva, *pte, ptev)); + } + if (unwire_pde) { + pde = pmap_pti_pde(sva); + pmap_pti_unwire_pde(pde, true); + } + } +} + +void +pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) +{ + + if (!pti) + return; + VM_OBJECT_WLOCK(pti_obj); + pmap_pti_add_kva_locked(sva, eva, exec); + VM_OBJECT_WUNLOCK(pti_obj); +} + +void +pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) +{ + pt_entry_t *pte; + vm_offset_t va; + + if (!pti) + return; + sva = rounddown2(sva, PAGE_SIZE); + MPASS(sva > VM_MAXUSER_ADDRESS); + eva = roundup2(eva, PAGE_SIZE); + MPASS(sva < eva); + VM_OBJECT_WLOCK(pti_obj); + for (va = sva; va < eva; va += PAGE_SIZE) { + pte = pmap_pti_pte(va, NULL); + KASSERT((*pte & X86_PG_V) != 0, + ("invalid pte va %#lx pte %#lx pt %#lx", va, + (u_long)pte, *pte)); + pte_clear(pte); + pmap_pti_unwire_pte(pte, va); + } + pmap_invalidate_range(kernel_pmap, sva, eva); + VM_OBJECT_WUNLOCK(pti_obj); +} + #include "opt_ddb.h" #ifdef DDB #include Index: sys/amd64/amd64/sys_machdep.c =================================================================== --- sys/amd64/amd64/sys_machdep.c +++ sys/amd64/amd64/sys_machdep.c @@ -65,6 +65,9 @@ #include +static void user_ldt_deref(struct proc_ldt *pldt); +static void user_ldt_derefl(struct proc_ldt *pldt); + #define MAX_LD 8192 int max_ldt_segment = 512; @@ -83,8 +86,6 @@ } SYSINIT(maxldt, SI_SUB_VM_CONF, SI_ORDER_ANY, max_ldt_segment_init, NULL); -static void user_ldt_derefl(struct proc_ldt *pldt); - #ifndef _SYS_SYSPROTO_H_ struct sysarch_args { int op; @@ -361,7 +362,9 @@ pcb = td->td_pcb; if (pcb->pcb_tssp == NULL) { tssp = (struct amd64tss *)kmem_malloc(kernel_arena, - ctob(IOPAGES+1), M_WAITOK); + ctob(IOPAGES + 1), M_WAITOK); + pmap_pti_add_kva((vm_offset_t)tssp, (vm_offset_t)tssp + + ctob(IOPAGES + 1), false); iomap = (char *)&tssp[1]; memset(iomap, 0xff, IOPERM_BITMAP_SIZE); critical_enter(); @@ -450,6 +453,8 @@ struct proc_ldt *pldt, *new_ldt; struct mdproc *mdp; struct soft_segment_descriptor sldt; + vm_offset_t sva; + vm_size_t sz; mtx_assert(&dt_lock, MA_OWNED); mdp = &p->p_md; @@ -457,13 +462,13 @@ return (mdp->md_ldt); mtx_unlock(&dt_lock); new_ldt = malloc(sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK); - new_ldt->ldt_base = (caddr_t)kmem_malloc(kernel_arena, - max_ldt_segment * sizeof(struct user_segment_descriptor), - M_WAITOK | M_ZERO); + sz = max_ldt_segment * sizeof(struct user_segment_descriptor); + sva = kmem_malloc(kernel_arena, sz, M_WAITOK | M_ZERO); + new_ldt->ldt_base = (caddr_t)sva; + pmap_pti_add_kva(sva, sva + sz, false); new_ldt->ldt_refcnt = 1; - sldt.ssd_base = (uint64_t)new_ldt->ldt_base; - sldt.ssd_limit = max_ldt_segment * - sizeof(struct user_segment_descriptor) - 1; + sldt.ssd_base = sva; + sldt.ssd_limit = sz - 1; sldt.ssd_type = SDT_SYSLDT; sldt.ssd_dpl = SEL_KPL; sldt.ssd_p = 1; @@ -473,8 +478,8 @@ mtx_lock(&dt_lock); pldt = mdp->md_ldt; if (pldt != NULL && !force) { - kmem_free(kernel_arena, (vm_offset_t)new_ldt->ldt_base, - max_ldt_segment * sizeof(struct user_segment_descriptor)); + pmap_pti_remove_kva(sva, sva + sz); + kmem_free(kernel_arena, sva, sz); free(new_ldt, M_SUBPROC); return (pldt); } @@ -521,15 +526,19 @@ static void user_ldt_derefl(struct proc_ldt *pldt) { + vm_offset_t sva; + vm_size_t sz; if (--pldt->ldt_refcnt == 0) { - kmem_free(kernel_arena, (vm_offset_t)pldt->ldt_base, - max_ldt_segment * sizeof(struct user_segment_descriptor)); + sva = (vm_offset_t)pldt->ldt_base; + sz = max_ldt_segment * sizeof(struct user_segment_descriptor); + pmap_pti_remove_kva(sva, sva + sz); + kmem_free(kernel_arena, sva, sz); free(pldt, M_SUBPROC); } } -void +static void user_ldt_deref(struct proc_ldt *pldt) { Index: sys/amd64/amd64/trap.c =================================================================== --- sys/amd64/amd64/trap.c +++ sys/amd64/amd64/trap.c @@ -448,9 +448,28 @@ * problem here and not have to check all the * selectors and pointers when the user changes * them. + * + * In case of PTI, the IRETQ faulted while the + * kernel used the pti stack, and exception + * frame records %rsp value pointing to that + * stack. If we return normally to + * doreti_iret_fault, the trapframe is + * reconstructed on pti stack, and calltrap() + * called on it as well. Due to the very + * limited pti stack size, kernel does not + * survive for too long. Switch to the normal + * thread stack for the trap handling. + * + * Magic '5' is the number of qwords occupied by + * the hardware trap frame. */ if (frame->tf_rip == (long)doreti_iret) { frame->tf_rip = (long)doreti_iret_fault; + if (pti && frame->tf_rsp == (uintptr_t)PCPU_PTR( + pti_stack) + (PC_PTI_STACK_SZ - 5) * + sizeof(register_t)) + frame->tf_rsp = PCPU_GET(rsp0) - 5 * + sizeof(register_t); return; } if (frame->tf_rip == (long)ld_ds) { Index: sys/amd64/amd64/vm_machdep.c =================================================================== --- sys/amd64/amd64/vm_machdep.c +++ sys/amd64/amd64/vm_machdep.c @@ -339,6 +339,8 @@ * Clean TSS/iomap */ if (pcb->pcb_tssp != NULL) { + pmap_pti_remove_kva((vm_offset_t)pcb->pcb_tssp, + (vm_offset_t)pcb->pcb_tssp + ctob(IOPAGES + 1)); kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_tssp, ctob(IOPAGES + 1)); pcb->pcb_tssp = NULL; Index: sys/amd64/ia32/ia32_exception.S =================================================================== --- sys/amd64/ia32/ia32_exception.S +++ sys/amd64/ia32/ia32_exception.S @@ -40,8 +40,13 @@ * that it originated in supervisor mode and skip the swapgs. */ SUPERALIGN_TEXT +IDTVEC(int0x80_syscall_pti) + PTI_UENTRY has_err=0 + jmp int0x80_syscall_common + SUPERALIGN_TEXT IDTVEC(int0x80_syscall) swapgs +int0x80_syscall_common: pushq $2 /* sizeof "int 0x80" */ subq $TF_ERR,%rsp /* skip over tf_trapno */ movq %rdi,TF_RDI(%rsp) Index: sys/amd64/ia32/ia32_syscall.c =================================================================== --- sys/amd64/ia32/ia32_syscall.c +++ sys/amd64/ia32/ia32_syscall.c @@ -95,7 +95,8 @@ #define IDTVEC(name) __CONCAT(X,name) -extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(rsvd); +extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(int0x80_syscall_pti), + IDTVEC(rsvd), IDTVEC(rsvd_pti); void ia32_syscall(struct trapframe *frame); /* Called from asm code */ @@ -208,14 +209,16 @@ ia32_syscall_enable(void *dummy) { - setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0); + setidt(IDT_SYSCALL, pti ? &IDTVEC(int0x80_syscall_pti) : + &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0); } static void ia32_syscall_disable(void *dummy) { - setidt(IDT_SYSCALL, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); + setidt(IDT_SYSCALL, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), + SDT_SYSIGT, SEL_KPL, 0); } SYSINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_enable, NULL); Index: sys/amd64/include/asmacros.h =================================================================== --- sys/amd64/include/asmacros.h +++ sys/amd64/include/asmacros.h @@ -1,9 +1,17 @@ +/* -*- mode: asm -*- */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * + * Copyright (c) 2018 The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed by + * Konstantin Belousov under sponsorship from + * the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -145,69 +153,6 @@ popq %rbp #ifdef LOCORE -/* - * Convenience macro for declaring interrupt entry points. - */ -#define IDTVEC(name) ALIGN_TEXT; .globl __CONCAT(X,name); \ - .type __CONCAT(X,name),@function; __CONCAT(X,name): - -/* - * Macros to create and destroy a trap frame. - */ -#define PUSH_FRAME \ - subq $TF_RIP,%rsp ; /* skip dummy tf_err and tf_trapno */ \ - testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \ - jz 1f ; /* Yes, dont swapgs again */ \ - swapgs ; \ -1: movq %rdi,TF_RDI(%rsp) ; \ - movq %rsi,TF_RSI(%rsp) ; \ - movq %rdx,TF_RDX(%rsp) ; \ - movq %rcx,TF_RCX(%rsp) ; \ - movq %r8,TF_R8(%rsp) ; \ - movq %r9,TF_R9(%rsp) ; \ - movq %rax,TF_RAX(%rsp) ; \ - movq %rbx,TF_RBX(%rsp) ; \ - movq %rbp,TF_RBP(%rsp) ; \ - movq %r10,TF_R10(%rsp) ; \ - movq %r11,TF_R11(%rsp) ; \ - movq %r12,TF_R12(%rsp) ; \ - movq %r13,TF_R13(%rsp) ; \ - movq %r14,TF_R14(%rsp) ; \ - movq %r15,TF_R15(%rsp) ; \ - movw %fs,TF_FS(%rsp) ; \ - movw %gs,TF_GS(%rsp) ; \ - movw %es,TF_ES(%rsp) ; \ - movw %ds,TF_DS(%rsp) ; \ - movl $TF_HASSEGS,TF_FLAGS(%rsp) ; \ - cld ; \ - testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel ? */ \ - jz 2f ; /* yes, leave PCB_FULL_IRET alone */ \ - movq PCPU(CURPCB),%r8 ; \ - andl $~PCB_FULL_IRET,PCB_FLAGS(%r8) ; \ -2: - -#define POP_FRAME \ - movq TF_RDI(%rsp),%rdi ; \ - movq TF_RSI(%rsp),%rsi ; \ - movq TF_RDX(%rsp),%rdx ; \ - movq TF_RCX(%rsp),%rcx ; \ - movq TF_R8(%rsp),%r8 ; \ - movq TF_R9(%rsp),%r9 ; \ - movq TF_RAX(%rsp),%rax ; \ - movq TF_RBX(%rsp),%rbx ; \ - movq TF_RBP(%rsp),%rbp ; \ - movq TF_R10(%rsp),%r10 ; \ - movq TF_R11(%rsp),%r11 ; \ - movq TF_R12(%rsp),%r12 ; \ - movq TF_R13(%rsp),%r13 ; \ - movq TF_R14(%rsp),%r14 ; \ - movq TF_R15(%rsp),%r15 ; \ - testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \ - jz 1f ; /* keep kernel GS.base */ \ - cli ; \ - swapgs ; \ -1: addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */ - /* * Access per-CPU data. */ @@ -216,6 +161,125 @@ movq %gs:PC_PRVSPACE, reg ; \ addq $PC_ ## member, reg +/* + * Convenience macro for declaring interrupt entry points. + */ +#define IDTVEC(name) ALIGN_TEXT; .globl __CONCAT(X,name); \ + .type __CONCAT(X,name),@function; __CONCAT(X,name): + + .macro SAVE_SEGS + movw %fs,TF_FS(%rsp) + movw %gs,TF_GS(%rsp) + movw %es,TF_ES(%rsp) + movw %ds,TF_DS(%rsp) + .endm + + .macro MOVE_STACKS qw + offset=0 + .rept \qw + movq offset(%rsp),%rdx + movq %rdx,offset(%rax) + offset=offset+8 + .endr + .endm + + .macro PTI_UENTRY has_err + swapgs + pushq %rax + pushq %rdx + movq PCPU(KCR3),%rax + movq %rax,%cr3 + movq PCPU(RSP0),%rax + subq $PTI_SIZE,%rax + MOVE_STACKS (PTI_SIZE / 8) - 1 + \has_err + movq %rax,%rsp + popq %rdx + popq %rax + .endm + + .macro PTI_ENTRY name, cont, has_err=0 + ALIGN_TEXT + .globl X\name\()_pti + .type X\name\()_pti,@function +X\name\()_pti: + /* %rax, %rdx and possibly err not yet pushed */ + testb $SEL_RPL_MASK,PTI_CS-(2+1-\has_err)*8(%rsp) + jz \cont + PTI_UENTRY \has_err + swapgs + jmp \cont + .endm + + .macro PTI_INTRENTRY vec_name + SUPERALIGN_TEXT + .globl X\vec_name\()_pti + .type X\vec_name\()_pti,@function +X\vec_name\()_pti: + testb $SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* err, %rax, %rdx not pushed */ + jz \vec_name\()_u + PTI_UENTRY has_err=0 + jmp \vec_name\()_u + .endm + + .macro INTR_PUSH_FRAME vec_name + SUPERALIGN_TEXT + .globl X\vec_name + .type X\vec_name,@function +X\vec_name: + testb $SEL_RPL_MASK,PTI_CS-3*8(%rsp) /* come from kernel? */ + jz \vec_name\()_u /* Yes, dont swapgs again */ + swapgs +\vec_name\()_u: + subq $TF_RIP,%rsp /* skip dummy tf_err and tf_trapno */ + movq %rdi,TF_RDI(%rsp) + movq %rsi,TF_RSI(%rsp) + movq %rdx,TF_RDX(%rsp) + movq %rcx,TF_RCX(%rsp) + movq %r8,TF_R8(%rsp) + movq %r9,TF_R9(%rsp) + movq %rax,TF_RAX(%rsp) + movq %rbx,TF_RBX(%rsp) + movq %rbp,TF_RBP(%rsp) + movq %r10,TF_R10(%rsp) + movq %r11,TF_R11(%rsp) + movq %r12,TF_R12(%rsp) + movq %r13,TF_R13(%rsp) + movq %r14,TF_R14(%rsp) + movq %r15,TF_R15(%rsp) + SAVE_SEGS + movl $TF_HASSEGS,TF_FLAGS(%rsp) + cld + testb $SEL_RPL_MASK,TF_CS(%rsp) /* come from kernel ? */ + jz 1f /* yes, leave PCB_FULL_IRET alone */ + movq PCPU(CURPCB),%r8 + andl $~PCB_FULL_IRET,PCB_FLAGS(%r8) +1: + .endm + + .macro INTR_HANDLER vec_name + .text + PTI_INTRENTRY \vec_name + INTR_PUSH_FRAME \vec_name + .endm + + .macro RESTORE_REGS + movq TF_RDI(%rsp),%rdi + movq TF_RSI(%rsp),%rsi + movq TF_RDX(%rsp),%rdx + movq TF_RCX(%rsp),%rcx + movq TF_R8(%rsp),%r8 + movq TF_R9(%rsp),%r9 + movq TF_RAX(%rsp),%rax + movq TF_RBX(%rsp),%rbx + movq TF_RBP(%rsp),%rbp + movq TF_R10(%rsp),%r10 + movq TF_R11(%rsp),%r11 + movq TF_R12(%rsp),%r12 + movq TF_R13(%rsp),%r13 + movq TF_R14(%rsp),%r14 + movq TF_R15(%rsp),%r15 + .endm + #endif /* LOCORE */ #ifdef __STDC__ Index: sys/amd64/include/frame.h =================================================================== --- sys/amd64/include/frame.h +++ sys/amd64/include/frame.h @@ -1,6 +1,50 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * This file is in the public domain. + * $FreeBSD$ */ -/* $FreeBSD$ */ + +#ifndef _AMD64_FRAME_H +#define _AMD64_FRAME_H #include + +struct pti_frame { + register_t pti_rdx; + register_t pti_rax; + register_t pti_err; + register_t pti_rip; + register_t pti_cs; + register_t pti_rflags; + register_t pti_rsp; + register_t pti_ss; +}; + +#endif Index: sys/amd64/include/pcpu.h =================================================================== --- sys/amd64/include/pcpu.h +++ sys/amd64/include/pcpu.h @@ -35,6 +35,7 @@ #error "sys/cdefs.h is a prerequisite for this file" #endif +#define PC_PTI_STACK_SZ 16 /* * The SMP parts are setup in pmap.c and locore.s for the BSP, and * mp_machdep.c sets up the data for the AP's to "see" when they awake. @@ -48,8 +49,11 @@ struct pmap *pc_curpmap; \ struct amd64tss *pc_tssp; /* TSS segment active on CPU */ \ struct amd64tss *pc_commontssp;/* Common TSS for the CPU */ \ + uint64_t pc_kcr3; \ + uint64_t pc_ucr3; \ register_t pc_rsp0; \ register_t pc_scratch_rsp; /* User %rsp in syscall */ \ + register_t pc_scratch_rax; \ u_int pc_apic_id; \ u_int pc_acpi_id; /* ACPI CPU id */ \ /* Pointer to the CPU %fs descriptor */ \ @@ -63,12 +67,13 @@ uint64_t pc_pm_save_cnt; \ u_int pc_cmci_mask; /* MCx banks for CMCI */ \ uint64_t pc_dbreg[16]; /* ddb debugging regs */ \ + uint64_t pc_pti_stack[PC_PTI_STACK_SZ]; \ int pc_dbreg_cmd; /* ddb debugging reg cmd */ \ u_int pc_vcpu_id; /* Xen vCPU ID */ \ uint32_t pc_pcid_next; \ uint32_t pc_pcid_gen; \ uint32_t pc_smp_tlb_done; /* TLB op acknowledgement */ \ - char __pad[384] /* be divisor of PAGE_SIZE \ + char __pad[232] /* be divisor of PAGE_SIZE \ after cache alignment */ #define PC_DBREG_CMD_NONE 0 Index: sys/amd64/include/pmap.h =================================================================== --- sys/amd64/include/pmap.h +++ sys/amd64/include/pmap.h @@ -277,7 +277,7 @@ #define pde_store(pdep, pde) pte_store(pdep, pde) -extern pt_entry_t pg_nx; +extern pt_entry_t pg_nx, pg_g; #endif /* _KERNEL */ @@ -315,7 +315,9 @@ struct pmap { struct mtx pm_mtx; pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ + pml4_entry_t *pm_pml4u; /* KVA of user l4 page table */ uint64_t pm_cr3; + uint64_t pm_ucr3; TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ cpuset_t pm_active; /* active on cpus */ enum pmap_type pm_type; /* regular or nested tables */ @@ -429,6 +431,8 @@ void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +void pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec); +void pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva); #endif /* _KERNEL */ /* Return various clipped indexes for a given VA */ Index: sys/amd64/include/proc.h =================================================================== --- sys/amd64/include/proc.h +++ sys/amd64/include/proc.h @@ -92,7 +92,6 @@ struct proc_ldt *user_ldt_alloc(struct proc *, int); void user_ldt_free(struct thread *); -void user_ldt_deref(struct proc_ldt *); struct sysarch_args; int sysarch_ldt(struct thread *td, struct sysarch_args *uap, int uap_space); int amd64_set_ldt_data(struct thread *td, int start, int num, Index: sys/amd64/include/smp.h =================================================================== --- sys/amd64/include/smp.h +++ sys/amd64/include/smp.h @@ -30,7 +30,18 @@ inthand_t IDTVEC(invltlb_pcid), /* TLB shootdowns - global, pcid */ IDTVEC(invltlb_invpcid),/* TLB shootdowns - global, invpcid */ - IDTVEC(justreturn); /* interrupt CPU with minimum overhead */ + IDTVEC(justreturn), /* interrupt CPU with minimum overhead */ + IDTVEC(invltlb_pcid_pti), + IDTVEC(invltlb_invpcid_pti), + IDTVEC(justreturn1_pti), + IDTVEC(invltlb_pti), + IDTVEC(invlpg_pti), + IDTVEC(invlrng_pti), + IDTVEC(invlcache_pti), + IDTVEC(ipi_intr_bitmap_handler_pti), + IDTVEC(cpustop_pti), + IDTVEC(cpususpend_pti), + IDTVEC(rendezvous_pti); void invltlb_pcid_handler(void); void invltlb_invpcid_handler(void); Index: sys/amd64/vmm/intel/vmx.c =================================================================== --- sys/amd64/vmm/intel/vmx.c +++ sys/amd64/vmm/intel/vmx.c @@ -695,7 +695,8 @@ MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, &tmp); if (error == 0) { - pirvec = lapic_ipi_alloc(&IDTVEC(justreturn)); + pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); if (pirvec < 0) { if (bootverbose) { printf("vmx_init: unable to allocate " Index: sys/amd64/vmm/vmm.c =================================================================== --- sys/amd64/vmm/vmm.c +++ sys/amd64/vmm/vmm.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -327,7 +328,8 @@ vmm_host_state_init(); - vmm_ipinum = lapic_ipi_alloc(&IDTVEC(justreturn)); + vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); if (vmm_ipinum < 0) vmm_ipinum = IPI_AST; Index: sys/dev/hyperv/vmbus/amd64/vmbus_vector.S =================================================================== --- sys/dev/hyperv/vmbus/amd64/vmbus_vector.S +++ sys/dev/hyperv/vmbus/amd64/vmbus_vector.S @@ -26,11 +26,11 @@ * $FreeBSD$ */ +#include "assym.s" + #include #include -#include "assym.s" - /* * This is the Hyper-V vmbus channel direct callback interrupt. * Only used when it is running on Hyper-V. @@ -38,7 +38,7 @@ .text SUPERALIGN_TEXT IDTVEC(vmbus_isr) - PUSH_FRAME + INTR_PUSH_FRAME FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call vmbus_handle_intr Index: sys/i386/i386/apic_vector.s =================================================================== --- sys/i386/i386/apic_vector.s +++ sys/i386/i386/apic_vector.s @@ -70,6 +70,7 @@ #define ISR_VEC(index, vec_name) \ .text ; \ SUPERALIGN_TEXT ; \ +IDTVEC(vec_name ## _pti) ; \ IDTVEC(vec_name) ; \ PUSH_FRAME ; \ SET_KERNEL_SREGS ; \ @@ -123,6 +124,7 @@ */ .text SUPERALIGN_TEXT +IDTVEC(timerint_pti) IDTVEC(timerint) PUSH_FRAME SET_KERNEL_SREGS @@ -139,6 +141,7 @@ */ .text SUPERALIGN_TEXT +IDTVEC(cmcint_pti) IDTVEC(cmcint) PUSH_FRAME SET_KERNEL_SREGS @@ -153,6 +156,7 @@ */ .text SUPERALIGN_TEXT +IDTVEC(errorint_pti) IDTVEC(errorint) PUSH_FRAME SET_KERNEL_SREGS Index: sys/i386/i386/atpic_vector.s =================================================================== --- sys/i386/i386/atpic_vector.s +++ sys/i386/i386/atpic_vector.s @@ -46,6 +46,7 @@ #define INTR(irq_num, vec_name) \ .text ; \ SUPERALIGN_TEXT ; \ +IDTVEC(vec_name ##_pti) ; \ IDTVEC(vec_name) ; \ PUSH_FRAME ; \ SET_KERNEL_SREGS ; \ Index: sys/i386/i386/exception.s =================================================================== --- sys/i386/i386/exception.s +++ sys/i386/i386/exception.s @@ -133,6 +133,7 @@ TRAP(T_PAGEFLT) IDTVEC(mchk) pushl $0; TRAP(T_MCHK) +IDTVEC(rsvd_pti) IDTVEC(rsvd) pushl $0; TRAP(T_RESERVED) IDTVEC(fpu) Index: sys/i386/i386/pmap.c =================================================================== --- sys/i386/i386/pmap.c +++ sys/i386/i386/pmap.c @@ -280,6 +280,8 @@ "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; +int pti; + static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); Index: sys/x86/include/apicvar.h =================================================================== --- sys/x86/include/apicvar.h +++ sys/x86/include/apicvar.h @@ -185,7 +185,11 @@ IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3), IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6), IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint), - IDTVEC(spuriousint), IDTVEC(timerint); + IDTVEC(spuriousint), IDTVEC(timerint), + IDTVEC(apic_isr1_pti), IDTVEC(apic_isr2_pti), IDTVEC(apic_isr3_pti), + IDTVEC(apic_isr4_pti), IDTVEC(apic_isr5_pti), IDTVEC(apic_isr6_pti), + IDTVEC(apic_isr7_pti), IDTVEC(cmcint_pti), IDTVEC(errorint_pti), + IDTVEC(spuriousint_pti), IDTVEC(timerint_pti); extern vm_paddr_t lapic_paddr; extern int *apic_cpuids; Index: sys/x86/include/x86_var.h =================================================================== --- sys/x86/include/x86_var.h +++ sys/x86/include/x86_var.h @@ -83,6 +83,7 @@ extern int use_xsave; extern uint64_t xsave_mask; extern u_int max_apic_id; +extern int pti; struct pcb; struct thread; Index: sys/x86/isa/atpic.c =================================================================== --- sys/x86/isa/atpic.c +++ sys/x86/isa/atpic.c @@ -80,6 +80,16 @@ IDTVEC(atpic_intr9), IDTVEC(atpic_intr10), IDTVEC(atpic_intr11), IDTVEC(atpic_intr12), IDTVEC(atpic_intr13), IDTVEC(atpic_intr14), IDTVEC(atpic_intr15); +/* XXXKIB i386 uses stubs until pti comes */ +inthand_t + IDTVEC(atpic_intr0_pti), IDTVEC(atpic_intr1_pti), + IDTVEC(atpic_intr2_pti), IDTVEC(atpic_intr3_pti), + IDTVEC(atpic_intr4_pti), IDTVEC(atpic_intr5_pti), + IDTVEC(atpic_intr6_pti), IDTVEC(atpic_intr7_pti), + IDTVEC(atpic_intr8_pti), IDTVEC(atpic_intr9_pti), + IDTVEC(atpic_intr10_pti), IDTVEC(atpic_intr11_pti), + IDTVEC(atpic_intr12_pti), IDTVEC(atpic_intr13_pti), + IDTVEC(atpic_intr14_pti), IDTVEC(atpic_intr15_pti); #define IRQ(ap, ai) ((ap)->at_irqbase + (ai)->at_irq) @@ -92,7 +102,7 @@ #define INTSRC(irq) \ { { &atpics[(irq) / 8].at_pic }, IDTVEC(atpic_intr ## irq ), \ - (irq) % 8 } + IDTVEC(atpic_intr ## irq ## _pti), (irq) % 8 } struct atpic { struct pic at_pic; @@ -104,7 +114,7 @@ struct atpic_intsrc { struct intsrc at_intsrc; - inthand_t *at_intr; + inthand_t *at_intr, *at_intr_pti; int at_irq; /* Relative to PIC base. */ enum intr_trigger at_trigger; u_long at_count; @@ -408,7 +418,8 @@ ai->at_intsrc.is_count = &ai->at_count; ai->at_intsrc.is_straycount = &ai->at_straycount; setidt(((struct atpic *)ai->at_intsrc.is_pic)->at_intbase + - ai->at_irq, ai->at_intr, SDT_ATPIC, SEL_KPL, GSEL_ATPIC); + ai->at_irq, pti ? ai->at_intr_pti : ai->at_intr, SDT_ATPIC, + SEL_KPL, GSEL_ATPIC); } /* Index: sys/x86/x86/local_apic.c =================================================================== --- sys/x86/x86/local_apic.c +++ sys/x86/x86/local_apic.c @@ -171,13 +171,23 @@ IDTVEC(apic_isr7), /* 224 - 255 */ }; +static inthand_t *ioint_pti_handlers[] = { + NULL, /* 0 - 31 */ + IDTVEC(apic_isr1_pti), /* 32 - 63 */ + IDTVEC(apic_isr2_pti), /* 64 - 95 */ + IDTVEC(apic_isr3_pti), /* 96 - 127 */ + IDTVEC(apic_isr4_pti), /* 128 - 159 */ + IDTVEC(apic_isr5_pti), /* 160 - 191 */ + IDTVEC(apic_isr6_pti), /* 192 - 223 */ + IDTVEC(apic_isr7_pti), /* 224 - 255 */ +}; static u_int32_t lapic_timer_divisors[] = { APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16, APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128 }; -extern inthand_t IDTVEC(rsvd); +extern inthand_t IDTVEC(rsvd_pti), IDTVEC(rsvd); volatile char *lapic_map; vm_paddr_t lapic_paddr; @@ -496,15 +506,18 @@ PCPU_SET(apic_id, lapic_id()); /* Local APIC timer interrupt. */ - setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC); + setidt(APIC_TIMER_INT, pti ? IDTVEC(timerint_pti) : IDTVEC(timerint), + SDT_APIC, SEL_KPL, GSEL_APIC); /* Local APIC error interrupt. */ - setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC); + setidt(APIC_ERROR_INT, pti ? IDTVEC(errorint_pti) : IDTVEC(errorint), + SDT_APIC, SEL_KPL, GSEL_APIC); /* XXX: Thermal interrupt */ /* Local APIC CMCI. */ - setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC); + setidt(APIC_CMC_INT, pti ? IDTVEC(cmcint_pti) : IDTVEC(cmcint), + SDT_APICT, SEL_KPL, GSEL_APIC); if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) { arat = 0; @@ -1569,8 +1582,8 @@ KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif - setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL, - GSEL_APIC); + setidt(vector, (pti ? ioint_pti_handlers : ioint_handlers)[vector / 32], + SDT_APIC, SEL_KPL, GSEL_APIC); } static void @@ -1589,7 +1602,8 @@ * We can not currently clear the idt entry because other cpus * may have a valid vector at this offset. */ - setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC); + setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT, + SEL_KPL, GSEL_APIC); #endif } @@ -2087,6 +2101,7 @@ * APIC page is mapped as an uncached region. In x2APIC mode there is an * explicit 'mfence' before the ICR MSR is written. Therefore in both cases * the IDT slot update is globally visible before the IPI is delivered. + * XXX pti */ static int native_lapic_ipi_alloc(inthand_t *ipifunc) @@ -2095,7 +2110,8 @@ long func; int idx, vector; - KASSERT(ipifunc != &IDTVEC(rsvd), ("invalid ipifunc %p", ipifunc)); + KASSERT(ipifunc != &IDTVEC(rsvd) && ipifunc != &IDTVEC(rsvd_pti), + ("invalid ipifunc %p", ipifunc)); vector = -1; mtx_lock_spin(&icu_lock); @@ -2124,8 +2140,10 @@ mtx_lock_spin(&icu_lock); ip = &idt[vector]; func = (ip->gd_hioffset << 16) | ip->gd_looffset; - KASSERT(func != (uintptr_t)&IDTVEC(rsvd), + KASSERT(func != (uintptr_t)&IDTVEC(rsvd) && + func != (uintptr_t)&IDTVEC(rsvd_pti), ("invalid idtfunc %#lx", func)); - setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC); + setidt(vector, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_APICT, + SEL_KPL, GSEL_APIC); mtx_unlock_spin(&icu_lock); }