diff --git a/sys/amd64/amd64/trap.c b/sys/amd64/amd64/trap.c index 7f1175da41df..692491ecfc5a 100644 --- a/sys/amd64/amd64/trap.c +++ b/sys/amd64/amd64/trap.c @@ -1,1267 +1,1276 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include /* * AMD64 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef KDTRACE_HOOKS #include #endif extern inthand_t IDTVEC(bpt), IDTVEC(bpt_pti), IDTVEC(dbg), IDTVEC(fast_syscall), IDTVEC(fast_syscall_pti), IDTVEC(fast_syscall32), IDTVEC(int0x80_syscall_pti), IDTVEC(int0x80_syscall); void __noinline trap(struct trapframe *frame); void trap_check(struct trapframe *frame); void dblfault_handler(struct trapframe *frame); static int trap_pfault(struct trapframe *, bool, int *, int *); static void trap_diag(struct trapframe *, vm_offset_t); static void trap_fatal(struct trapframe *, vm_offset_t); #ifdef KDTRACE_HOOKS static bool trap_user_dtrace(struct trapframe *, int (**hook)(struct trapframe *)); #endif static const char UNKNOWN[] = "unknown"; static const char *const trap_msg[] = { [0] = UNKNOWN, /* unused */ [T_PRIVINFLT] = "privileged instruction fault", [2] = UNKNOWN, /* unused */ [T_BPTFLT] = "breakpoint instruction fault", [4] = UNKNOWN, /* unused */ [5] = UNKNOWN, /* unused */ [T_ARITHTRAP] = "arithmetic trap", [7] = UNKNOWN, /* unused */ [8] = UNKNOWN, /* unused */ [T_PROTFLT] = "general protection fault", [T_TRCTRAP] = "debug exception", [11] = UNKNOWN, /* unused */ [T_PAGEFLT] = "page fault", [13] = UNKNOWN, /* unused */ [T_ALIGNFLT] = "alignment fault", [15] = UNKNOWN, /* unused */ [16] = UNKNOWN, /* unused */ [17] = UNKNOWN, /* unused */ [T_DIVIDE] = "integer divide fault", [T_NMI] = "non-maskable interrupt trap", [T_OFLOW] = "overflow trap", [T_BOUND] = "FPU bounds check fault", [T_DNA] = "FPU device not available", [T_DOUBLEFLT] = "double fault", [T_FPOPFLT] = "FPU operand fetch fault", [T_TSSFLT] = "invalid TSS fault", [T_SEGNPFLT] = "segment not present fault", [T_STKFLT] = "stack fault", [T_MCHK] = "machine check trap", [T_XMMFLT] = "SIMD floating-point exception", [T_RESERVED] = "reserved (unknown) fault", [31] = UNKNOWN, /* reserved */ [T_DTRACE_RET] = "DTrace pid return trap", }; static const char * traptype_to_msg(u_int type) { return (type < nitems(trap_msg) ? trap_msg[type] : "unknown/reserved trap"); } static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); +u_long cnt_efirt_faults; +int print_efirt_faults = 1; + /* * Control L1D flush on return from NMI. * * Tunable can be set to the following values: * 0 - only enable flush on return from NMI if required by vmm.ko (default) * >1 - always flush on return from NMI. * * Post-boot, the sysctl indicates if flushing is currently enabled. */ int nmi_flush_l1d_sw; SYSCTL_INT(_machdep, OID_AUTO, nmi_flush_l1d_sw, CTLFLAG_RWTUN, &nmi_flush_l1d_sw, 0, "Flush L1 Data Cache on NMI exit, software bhyve L1TF mitigation assist"); /* * Table of handlers for various segment load faults. 
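* Each entry pairs the kernel address of a segment-load instruction that may fault (faddr) with a recovery handler (fhandler); the T_PROTFLT/T_SEGNPFLT code in trap() below compares the trap frame's %rip against faddr and, on a match, resumes execution at fhandler.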
*/ static const struct { uintptr_t faddr; uintptr_t fhandler; } sfhandlers[] = { { .faddr = (uintptr_t)ld_ds, .fhandler = (uintptr_t)ds_load_fault, }, { .faddr = (uintptr_t)ld_es, .fhandler = (uintptr_t)es_load_fault, }, { .faddr = (uintptr_t)ld_fs, .fhandler = (uintptr_t)fs_load_fault, }, { .faddr = (uintptr_t)ld_gs, .fhandler = (uintptr_t)gs_load_fault, }, { .faddr = (uintptr_t)ld_gsbase, .fhandler = (uintptr_t)gsbase_load_fault }, { .faddr = (uintptr_t)ld_fsbase, .fhandler = (uintptr_t)fsbase_load_fault, }, }; /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { ksiginfo_t ksi; struct thread *td; struct proc *p; register_t addr, dr6; size_t i; int pf, signo, ucode; u_int type; td = curthread; p = td->td_proc; dr6 = 0; kasan_mark(frame, sizeof(*frame), sizeof(*frame), 0); kmsan_mark(frame, sizeof(*frame), KMSAN_STATE_INITED); VM_CNT_INC(v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI && ipi_nmi_handler() == 0) return; #endif #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); return; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI. If the PMC module is * active, pass the 'rip' value to the PMC module's interrupt * handler. A non-zero return value from the handler means that * the NMI was consumed by it and we can return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(frame) != 0) return; #endif } if ((frame->tf_rflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (TRAPF_USERMODE(frame)) { uprintf( "pid %ld (%s): trap %d (%s) " "with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type, trap_msg[type]); } else { switch (type) { case T_NMI: case T_BPTFLT: case T_TRCTRAP: case T_PROTFLT: case T_SEGNPFLT: case T_STKFLT: break; default: printf( "kernel trap %d with interrupts disabled\n", type); /* * We shouldn't enable interrupts while holding a * spin lock. 
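* (md_spinlock_count is maintained by spinlock_enter()/spinlock_exit(), so a non-zero value means this thread still holds at least one spin mutex.)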
*/ if (td->td_md.md_spinlock_count == 0) enable_intr(); } } } if (TRAPF_USERMODE(frame)) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_rip; if (td->td_cowgen != atomic_load_int(&p->p_cowgen)) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ signo = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ #ifdef KDTRACE_HOOKS if (trap_user_dtrace(frame, &dtrace_pid_probe_ptr)) return; #else enable_intr(); #endif signo = SIGTRAP; ucode = TRAP_BRKPT; break; case T_TRCTRAP: /* debug exception */ enable_intr(); signo = SIGTRAP; ucode = TRAP_TRACE; dr6 = rdr6(); if ((dr6 & DBREG_DR6_BS) != 0) { PROC_LOCK(td->td_proc); if ((td->td_dbgflags & TDB_STEP) != 0) { td->td_frame->tf_rflags &= ~PSL_T; td->td_dbgflags &= ~TDB_STEP; } PROC_UNLOCK(td->td_proc); } break; case T_ARITHTRAP: /* arithmetic trap */ ucode = fputrap_x87(); if (ucode == -1) return; signo = SIGFPE; break; case T_PROTFLT: /* general protection fault */ signo = SIGBUS; ucode = BUS_OBJERR; break; case T_STKFLT: /* stack fault */ case T_SEGNPFLT: /* segment not present fault */ signo = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ signo = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: signo = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: signo = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ /* * Can emulator handle this trap? */ if (*p->p_sysent->sv_trap != NULL && (*p->p_sysent->sv_trap)(td) == 0) return; pf = trap_pfault(frame, true, &signo, &ucode); if (pf == -1) return; if (pf == 0) goto userret; addr = frame->tf_addr; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; signo = SIGFPE; break; case T_NMI: nmi_handle_intr(type, frame); return; case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; signo = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; signo = SIGFPE; break; case T_DNA: /* transparent fault (due to context switch "late") */ KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); fpudna(); return; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; signo = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = fputrap_sse(); if (ucode == -1) return; signo = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: (void)trap_user_dtrace(frame, &dtrace_return_probe_ptr); return; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); /* * Most likely, EFI RT faulted. This check prevents * kdb from handling breakpoints set on the BIOS text, * if such option is ever needed. 
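* Recovery redirects %rip to pcb_onfault, which the fault-handling EFI RT call path (see efi_rt_handle_faults in efirt.c) installs around runtime-service calls; the fault is counted in cnt_efirt_faults and diagnostic printing is governed by print_efirt_faults, the hw.efi.print_faults sysctl.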
*/ if ((td->td_pflags2 & TDP2_EFIRT) != 0 && curpcb->pcb_onfault != NULL && type != T_PAGEFLT) { - trap_diag(frame, 0); - printf("EFI RT fault %s\n", traptype_to_msg(type)); + u_long cnt = atomic_fetchadd_long(&cnt_efirt_faults, 1); + + if ((print_efirt_faults == 1 && cnt == 0) || + print_efirt_faults == 2) { + trap_diag(frame, 0); + printf("EFI RT fault %s\n", + traptype_to_msg(type)); + } frame->tf_rip = (long)curpcb->pcb_onfault; return; } switch (type) { case T_PAGEFLT: /* page fault */ (void)trap_pfault(frame, false, NULL, NULL); return; case T_DNA: if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); fpudna(); return; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * For now, supporting kernel handler * registration for FPU traps is overkill. */ trap_fatal(frame, 0); return; case T_STKFLT: /* stack fault */ case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %rip's and %rsp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. * * In case of PTI, the IRETQ faulted while the * kernel used the pti stack, and exception * frame records %rsp value pointing to that * stack. If we return normally to * doreti_iret_fault, the trapframe is * reconstructed on pti stack, and calltrap() * called on it as well. Due to the very * limited pti stack size, kernel does not * survive for too long. Switch to the normal * thread stack for the trap handling. * * Magic '5' is the number of qwords occupied by * the hardware trap frame. */ if (frame->tf_rip == (long)doreti_iret) { KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); frame->tf_rip = (long)doreti_iret_fault; if ((PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3) && (frame->tf_rsp == (uintptr_t)PCPU_GET( pti_rsp0) - 5 * sizeof(register_t))) { frame->tf_rsp = PCPU_GET(rsp0) - 5 * sizeof(register_t); } return; } for (i = 0; i < nitems(sfhandlers); i++) { if (frame->tf_rip == sfhandlers[i].faddr) { KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); frame->tf_rip = sfhandlers[i].fhandler; return; } } if (curpcb->pcb_onfault != NULL) { frame->tf_rip = (long)curpcb->pcb_onfault; return; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_rflags & PSL_NT) { frame->tf_rflags &= ~PSL_NT; return; } break; case T_TRCTRAP: /* debug exception */ /* Clear any pending debug events. */ dr6 = rdr6(); load_dr6(0); /* * Ignore debug register exceptions due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel.
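* (user_dbreg_trap() inspects the %dr6 status bits together with the %dr7 watchpoint configuration and reports whether the trigger was a watchpoint on a user-mode address.)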
*/ if (user_dbreg_trap(dr6)) return; /* * Malicious user code can configure a debug * register watchpoint to trap on data access * to the top of stack and then execute 'pop * %ss; int 3'. Due to exception deferral for * 'pop %ss', the CPU will not interrupt 'int * 3' to raise the DB# exception for the debug * register but will postpone the DB# until * execution of the first instruction of the * BP# handler (in kernel mode). Normally the * previous check would ignore DB# exceptions * for watchpoints on user addresses raised in * kernel mode. However, some CPU errata * include cases where DB# exceptions do not * properly set bits in %dr6, e.g. Haswell * HSD23 and Skylake-X SKZ24. * * A deferred DB# can also be raised on the * first instructions of system call entry * points or single-step traps via similar use * of 'pop %ss' or 'mov xxx, %ss'. */ if (pti) { if (frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall_pti) || #ifdef COMPAT_FREEBSD32 frame->tf_rip == (uintptr_t)IDTVEC(int0x80_syscall_pti) || #endif frame->tf_rip == (uintptr_t)IDTVEC(bpt_pti)) return; } else { if (frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall) || #ifdef COMPAT_FREEBSD32 frame->tf_rip == (uintptr_t)IDTVEC(int0x80_syscall) || #endif frame->tf_rip == (uintptr_t)IDTVEC(bpt)) return; } if (frame->tf_rip == (uintptr_t)IDTVEC(dbg) || /* Needed for AMD. */ frame->tf_rip == (uintptr_t)IDTVEC(fast_syscall32)) return; /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB if (kdb_trap(type, dr6, frame)) return; #endif break; case T_NMI: nmi_handle_intr(type, frame); return; } trap_fatal(frame, 0); return; } ksiginfo_init_trap(&ksi); ksi.ksi_signo = signo; ksi.ksi_code = ucode; ksi.ksi_trapno = type; ksi.ksi_addr = (void *)addr; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %#lx code %d type %d " "addr %#lx rsp %#lx rip %#lx rax %#lx " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type, addr, frame->tf_rsp, frame->tf_rip, frame->tf_rax, fubyte((void *)(frame->tf_rip + 0)), fubyte((void *)(frame->tf_rip + 1)), fubyte((void *)(frame->tf_rip + 2)), fubyte((void *)(frame->tf_rip + 3)), fubyte((void *)(frame->tf_rip + 4)), fubyte((void *)(frame->tf_rip + 5)), fubyte((void *)(frame->tf_rip + 6)), fubyte((void *)(frame->tf_rip + 7))); } KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); userret: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); } /* * Ensure that we ignore any DTrace-induced faults. This function cannot * be instrumented, so it cannot generate such faults itself. */ void trap_check(struct trapframe *frame) { #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, frame->tf_trapno) != 0) return; #endif trap(frame); } static bool trap_is_smap(struct trapframe *frame) { /* * A page fault on a userspace address is classified as * SMAP-induced if: * - SMAP is supported; * - kernel mode accessed present data page; * - rflags.AC was cleared. * Kernel must never access user space with rflags.AC cleared * if SMAP is enabled. 
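* (Legitimate kernel accesses to user pages are bracketed by the stac and clac instructions, which set and clear rflags.AC respectively.)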
*/ return ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 && (frame->tf_err & (PGEX_P | PGEX_U | PGEX_I | PGEX_RSV)) == PGEX_P && (frame->tf_rflags & PSL_AC) == 0); } static bool trap_is_pti(struct trapframe *frame) { return (PCPU_GET(curpmap)->pm_ucr3 != PMAP_NO_CR3 && pg_nx != 0 && (frame->tf_err & (PGEX_P | PGEX_W | PGEX_U | PGEX_I)) == (PGEX_P | PGEX_U | PGEX_I) && (curpcb->pcb_saved_ucr3 & ~CR3_PCID_MASK) == (PCPU_GET(curpmap)->pm_cr3 & ~CR3_PCID_MASK)); } /* * Handle all details of a page fault. * Returns: * -1 if this fault was fatal, typically from kernel mode * (cannot happen, but we need to return something). * 0 if this fault was handled by updating either the user or kernel * page table, execution can continue. * 1 if this fault was from usermode and it was not handled, a synchronous * signal should be delivered to the thread. *signo returns the signal * number, *ucode gives si_code. */ static int trap_pfault(struct trapframe *frame, bool usermode, int *signo, int *ucode) { struct thread *td; struct proc *p; vm_map_t map; vm_offset_t eva; int rv; vm_prot_t ftype; MPASS(!usermode || (signo != NULL && ucode != NULL)); td = curthread; p = td->td_proc; eva = frame->tf_addr; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } if (eva >= VM_MIN_KERNEL_ADDRESS) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) { *signo = SIGSEGV; *ucode = SEGV_MAPERR; return (1); } map = kernel_map; } else { map = &p->p_vmspace->vm_map; /* * When accessing a usermode address, kernel must be * ready to accept the page fault, and provide a * handling routine. Since accessing the address * without the handler is a bug, do not try to handle * it normally, and panic immediately. * * If SMAP is enabled, filter SMAP faults also, * because illegal access might occur to the mapped * user address, causing infinite loop. 
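* (copyin(), copyout() and the other user-access primitives install pcb_onfault around their user-space accesses for exactly this purpose.)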
*/ if (!usermode && (td->td_intr_nesting_level != 0 || trap_is_smap(frame) || curpcb->pcb_onfault == NULL)) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * User-mode protection key violation (PKU). May happen * either from usermode or from kernel if copyin accessed * key-protected mapping. */ if ((frame->tf_err & PGEX_PK) != 0) { if (eva > VM_MAXUSER_ADDRESS) { trap_fatal(frame, eva); return (-1); } if (usermode) { *signo = SIGSEGV; *ucode = SEGV_PKUERR; return (1); } goto after_vmfault; } /* * If nx protection of the usermode portion of kernel page * tables caused trap, panic. */ if (usermode && trap_is_pti(frame)) panic("PTI: pid %d comm %s tf_err %#lx", p->p_pid, p->p_comm, frame->tf_err); /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault_trap(map, eva, ftype, VM_FAULT_NORMAL, signo, ucode); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } if (usermode) return (1); after_vmfault: if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { if ((td->td_pflags2 & TDP2_EFIRT) != 0) { trap_diag(frame, eva); printf("EFI RT page fault\n"); } frame->tf_rip = (long)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } static void trap_diag(struct trapframe *frame, vm_offset_t eva) { int code, ss; u_int type; struct soft_segment_descriptor softseg; struct user_segment_descriptor *gdt; code = frame->tf_err; type = frame->tf_trapno; gdt = *PCPU_PTR(gdt); sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)], &softseg); printf("\n\nFatal trap %d: %s while in %s mode\n", type, type < nitems(trap_msg) ? trap_msg[type] : UNKNOWN, TRAPF_USERMODE(frame) ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s %s%s%s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_I ? "instruction" : "data", code & PGEX_PK ? " prot key" : "", code & PGEX_SGX ? " SGX" : "", code & PGEX_RSV ? "reserved bits in PTE" : code & PGEX_P ? 
"protection violation" : "page not present"); } printf("instruction pointer = 0x%lx:0x%lx\n", frame->tf_cs & 0xffff, frame->tf_rip); ss = frame->tf_ss & 0xffff; printf("stack pointer = 0x%x:0x%lx\n", ss, frame->tf_rsp); printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_rflags & PSL_T) printf("trace trap, "); if (frame->tf_rflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_rflags & PSL_NT) printf("nested task, "); if (frame->tf_rflags & PSL_RF) printf("resume, "); printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); printf("rdi: %016lx rsi: %016lx rdx: %016lx\n", frame->tf_rdi, frame->tf_rsi, frame->tf_rdx); printf("rcx: %016lx r8: %016lx r9: %016lx\n", frame->tf_rcx, frame->tf_r8, frame->tf_r9); printf("rax: %016lx rbx: %016lx rbp: %016lx\n", frame->tf_rax, frame->tf_rbx, frame->tf_rbp); printf("r10: %016lx r11: %016lx r12: %016lx\n", frame->tf_r10, frame->tf_r11, frame->tf_r12); printf("r13: %016lx r14: %016lx r15: %016lx\n", frame->tf_r13, frame->tf_r14, frame->tf_r15); printf("trap number = %d\n", type); } static void trap_fatal(struct trapframe *frame, vm_offset_t eva) { u_int type; type = frame->tf_trapno; trap_diag(frame, eva); #ifdef KDB if (debugger_on_trap) { bool handled; kdb_why = KDB_WHY_TRAP; handled = kdb_trap(type, 0, frame); kdb_why = KDB_WHY_UNSET; if (handled) return; } #endif panic("%s", traptype_to_msg(type)); } #ifdef KDTRACE_HOOKS /* * Invoke a userspace DTrace hook. The hook pointer is cleared when no * userspace probes are enabled, so we must synchronize with DTrace to ensure * that a trapping thread is able to call the hook before it is cleared. */ static bool trap_user_dtrace(struct trapframe *frame, int (**hookp)(struct trapframe *)) { int (*hook)(struct trapframe *); hook = atomic_load_ptr(hookp); enable_intr(); if (hook != NULL) return ((hook)(frame) == 0); return (false); } #endif /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). 
*/ void dblfault_handler(struct trapframe *frame) { kmsan_mark(frame, sizeof(*frame), KMSAN_STATE_INITED); #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault\n" "rip %#lx rsp %#lx rbp %#lx\n" "rax %#lx rdx %#lx rbx %#lx\n" "rcx %#lx rsi %#lx rdi %#lx\n" "r8 %#lx r9 %#lx r10 %#lx\n" "r11 %#lx r12 %#lx r13 %#lx\n" "r14 %#lx r15 %#lx rflags %#lx\n" "cs %#lx ss %#lx ds %#hx es %#hx fs %#hx gs %#hx\n" "fsbase %#lx gsbase %#lx kgsbase %#lx\n", frame->tf_rip, frame->tf_rsp, frame->tf_rbp, frame->tf_rax, frame->tf_rdx, frame->tf_rbx, frame->tf_rcx, frame->tf_rsi, frame->tf_rdi, frame->tf_r8, frame->tf_r9, frame->tf_r10, frame->tf_r11, frame->tf_r12, frame->tf_r13, frame->tf_r14, frame->tf_r15, frame->tf_rflags, frame->tf_cs, frame->tf_ss, frame->tf_ds, frame->tf_es, frame->tf_fs, frame->tf_gs, rdmsr(MSR_FSBASE), rdmsr(MSR_GSBASE), rdmsr(MSR_KGSBASE)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } static int __noinline cpu_fetch_syscall_args_fallback(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; syscallarg_t *argp; caddr_t params; int reg, regcnt, error; p = td->td_proc; frame = td->td_frame; reg = 0; regcnt = NARGREGS; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = frame->tf_rdi; reg++; regcnt--; } if (sa->code >= p->p_sysent->sv_size) sa->callp = &nosys_sysent; else sa->callp = &p->p_sysent->sv_table[sa->code]; KASSERT(sa->callp->sy_narg <= nitems(sa->args), ("Too many syscall arguments!")); argp = &frame->tf_rdi; argp += reg; memcpy(sa->args, argp, sizeof(sa->args[0]) * NARGREGS); if (sa->callp->sy_narg > regcnt) { params = (caddr_t)frame->tf_rsp + sizeof(register_t); error = copyin(params, &sa->args[regcnt], (sa->callp->sy_narg - regcnt) * sizeof(sa->args[0])); if (__predict_false(error != 0)) return (error); } td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } int cpu_fetch_syscall_args(struct thread *td) { struct proc *p; struct trapframe *frame; struct syscall_args *sa; p = td->td_proc; frame = td->td_frame; sa = &td->td_sa; sa->code = frame->tf_rax; sa->original_code = sa->code; if (__predict_false(sa->code == SYS_syscall || sa->code == SYS___syscall || sa->code >= p->p_sysent->sv_size)) return (cpu_fetch_syscall_args_fallback(td, sa)); sa->callp = &p->p_sysent->sv_table[sa->code]; KASSERT(sa->callp->sy_narg <= nitems(sa->args), ("Too many syscall arguments!")); if (__predict_false(sa->callp->sy_narg > NARGREGS)) return (cpu_fetch_syscall_args_fallback(td, sa)); memcpy(sa->args, &frame->tf_rdi, sizeof(sa->args[0]) * NARGREGS); td->td_retval[0] = 0; td->td_retval[1] = frame->tf_rdx; return (0); } #include "../../kern/subr_syscall.c" static void (*syscall_ret_l1d_flush)(void); int syscall_ret_l1d_flush_mode; static void flush_l1d_hw(void) { wrmsr(MSR_IA32_FLUSH_CMD, IA32_FLUSH_CMD_L1D); } static void __noinline amd64_syscall_ret_flush_l1d_check(int error) { void (*p)(void); if (error != EEXIST && error != EAGAIN && error != EXDEV && error != ENOENT && error != ENOTCONN && error != EINPROGRESS) { p = atomic_load_ptr(&syscall_ret_l1d_flush); if (p != NULL) p(); } } static void __inline amd64_syscall_ret_flush_l1d_check_inline(int error) { if (__predict_false(error != 0)) amd64_syscall_ret_flush_l1d_check(error); } void amd64_syscall_ret_flush_l1d(int error) {
amd64_syscall_ret_flush_l1d_check_inline(error); } void amd64_syscall_ret_flush_l1d_recalc(void) { bool l1d_hw; l1d_hw = (cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) != 0; again: switch (syscall_ret_l1d_flush_mode) { case 0: syscall_ret_l1d_flush = NULL; break; case 1: syscall_ret_l1d_flush = l1d_hw ? flush_l1d_hw : flush_l1d_sw_abi; break; case 2: syscall_ret_l1d_flush = l1d_hw ? flush_l1d_hw : NULL; break; case 3: syscall_ret_l1d_flush = flush_l1d_sw_abi; break; default: syscall_ret_l1d_flush_mode = 1; goto again; } } static int machdep_syscall_ret_flush_l1d(SYSCTL_HANDLER_ARGS) { int error, val; val = syscall_ret_l1d_flush_mode; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); syscall_ret_l1d_flush_mode = val; amd64_syscall_ret_flush_l1d_recalc(); return (0); } SYSCTL_PROC(_machdep, OID_AUTO, syscall_ret_flush_l1d, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, machdep_syscall_ret_flush_l1d, "I", "Flush L1D on syscall return with error (0 - off, 1 - on, " "2 - use hw only, 3 - use sw only)"); /* * System call handler for native binaries. The trap frame is already * set up by the assembler trampoline and a pointer to it is saved in * td_frame. */ void amd64_syscall(struct thread *td, int traced) { ksiginfo_t ksi; kmsan_mark(td->td_frame, sizeof(*td->td_frame), KMSAN_STATE_INITED); KASSERT(TRAPF_USERMODE(td->td_frame), ("%s: not from user mode", __func__)); syscallenter(td); /* * Traced syscall. */ if (__predict_false(traced)) { td->td_frame->tf_rflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)td->td_frame->tf_rip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", syscallname(td->td_proc, td->td_sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, td->td_sa.code))); KASSERT(pmap_not_in_di(), ("System call %s returning with leaked invl_gen %lu", syscallname(td->td_proc, td->td_sa.code), td->td_md.md_invl_gen.gen)); syscallret(td); /* * If the user-supplied value of %rip is not a canonical * address, then some CPUs will trigger a ring 0 #GP during * the sysret instruction. However, the fault handler would * execute in ring 0 with the user's %gs and %rsp which would * not be safe. Instead, use the full return path which * catches the problem safely. */ if (__predict_false(td->td_frame->tf_rip >= (la57 ? VM_MAXUSER_ADDRESS_LA57 : VM_MAXUSER_ADDRESS_LA48))) set_pcb_flags(td->td_pcb, PCB_FULL_IRET); amd64_syscall_ret_flush_l1d_check_inline(td->td_errno); } diff --git a/sys/arm64/arm64/trap.c b/sys/arm64/arm64/trap.c index fdcb7e708daf..9f792332a931 100644 --- a/sys/arm64/arm64/trap.c +++ b/sys/arm64/arm64/trap.c @@ -1,773 +1,776 @@ /*- * Copyright (c) 2014 Andrew Turner * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include #endif #ifdef VFP #include #endif #ifdef KDB #include #endif #ifdef DDB #include #include #endif /* Called from exception.S */ void do_el1h_sync(struct thread *, struct trapframe *); void do_el0_sync(struct thread *, struct trapframe *); void do_el0_error(struct trapframe *); void do_serror(struct trapframe *); void unhandled_exception(struct trapframe *); static void print_gp_register(const char *name, uint64_t value); static void print_registers(struct trapframe *frame); int (*dtrace_invop_jump_addr)(struct trapframe *); +u_long cnt_efirt_faults; +int print_efirt_faults = 1; + typedef void (abort_handler)(struct thread *, struct trapframe *, uint64_t, uint64_t, int); static abort_handler align_abort; static abort_handler data_abort; static abort_handler external_abort; static abort_handler *abort_handlers[] = { [ISS_DATA_DFSC_TF_L0] = data_abort, [ISS_DATA_DFSC_TF_L1] = data_abort, [ISS_DATA_DFSC_TF_L2] = data_abort, [ISS_DATA_DFSC_TF_L3] = data_abort, [ISS_DATA_DFSC_AFF_L1] = data_abort, [ISS_DATA_DFSC_AFF_L2] = data_abort, [ISS_DATA_DFSC_AFF_L3] = data_abort, [ISS_DATA_DFSC_PF_L1] = data_abort, [ISS_DATA_DFSC_PF_L2] = data_abort, [ISS_DATA_DFSC_PF_L3] = data_abort, [ISS_DATA_DFSC_ALIGN] = align_abort, [ISS_DATA_DFSC_EXT] = external_abort, [ISS_DATA_DFSC_EXT_L0] = external_abort, [ISS_DATA_DFSC_EXT_L1] = external_abort, [ISS_DATA_DFSC_EXT_L2] = external_abort, [ISS_DATA_DFSC_EXT_L3] = external_abort, [ISS_DATA_DFSC_ECC] = external_abort, [ISS_DATA_DFSC_ECC_L0] = external_abort, [ISS_DATA_DFSC_ECC_L1] = external_abort, [ISS_DATA_DFSC_ECC_L2] = external_abort, [ISS_DATA_DFSC_ECC_L3] = external_abort, }; static __inline void call_trapsignal(struct thread *td, int sig, int code, void *addr, int trapno) { ksiginfo_t ksi; ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = code; ksi.ksi_addr = addr; ksi.ksi_trapno = trapno; trapsignal(td, &ksi); } int cpu_fetch_syscall_args(struct thread *td) { struct proc *p; syscallarg_t *ap, *dst_ap; struct syscall_args *sa; p = td->td_proc; sa = &td->td_sa; ap = td->td_frame->tf_x; dst_ap = &sa->args[0]; sa->code = td->td_frame->tf_x[8]; sa->original_code = sa->code; if (__predict_false(sa->code == SYS_syscall || sa->code == SYS___syscall)) { sa->code = *ap++; } else { *dst_ap++ = *ap++; } if (__predict_false(sa->code >= p->p_sysent->sv_size)) sa->callp = &nosys_sysent; else sa->callp = &p->p_sysent->sv_table[sa->code]; KASSERT(sa->callp->sy_narg <= nitems(sa->args), ("Syscall %d takes too many arguments", sa->code));
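/* Copy the remaining register arguments; ap and dst_ap were advanced above so that an indirect syscall takes its arguments starting at x1, while a direct syscall already has x0 stored in args[0]. */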
memcpy(dst_ap, ap, (nitems(sa->args) - 1) * sizeof(*dst_ap)); td->td_retval[0] = 0; td->td_retval[1] = 0; return (0); } #include "../../kern/subr_syscall.c" /* * Test for fault generated by given access instruction in * bus_peek_ or bus_poke_ bus function. */ extern uint32_t generic_bs_peek_1f, generic_bs_peek_2f; extern uint32_t generic_bs_peek_4f, generic_bs_peek_8f; extern uint32_t generic_bs_poke_1f, generic_bs_poke_2f; extern uint32_t generic_bs_poke_4f, generic_bs_poke_8f; static bool test_bs_fault(void *addr) { return (addr == &generic_bs_peek_1f || addr == &generic_bs_peek_2f || addr == &generic_bs_peek_4f || addr == &generic_bs_peek_8f || addr == &generic_bs_poke_1f || addr == &generic_bs_poke_2f || addr == &generic_bs_poke_4f || addr == &generic_bs_poke_8f); } static void svc_handler(struct thread *td, struct trapframe *frame) { if ((frame->tf_esr & ESR_ELx_ISS_MASK) == 0) { syscallenter(td); syscallret(td); } else { call_trapsignal(td, SIGILL, ILL_ILLOPN, (void *)frame->tf_elr, ESR_ELx_EXCEPTION(frame->tf_esr)); userret(td, frame); } } static void align_abort(struct thread *td, struct trapframe *frame, uint64_t esr, uint64_t far, int lower) { if (!lower) { print_registers(frame); print_gp_register("far", far); printf(" esr: 0x%.16lx\n", esr); panic("Misaligned access from kernel space!"); } call_trapsignal(td, SIGBUS, BUS_ADRALN, (void *)frame->tf_elr, ESR_ELx_EXCEPTION(frame->tf_esr)); userret(td, frame); } static void external_abort(struct thread *td, struct trapframe *frame, uint64_t esr, uint64_t far, int lower) { if (lower) { call_trapsignal(td, SIGBUS, BUS_OBJERR, (void *)far, ESR_ELx_EXCEPTION(frame->tf_esr)); userret(td, frame); return; } /* * Try to handle synchronous external aborts caused by * bus_space_peek() and/or bus_space_poke() functions. */ if (test_bs_fault((void *)frame->tf_elr)) { frame->tf_elr = (uint64_t)generic_bs_fault; return; } print_registers(frame); print_gp_register("far", far); panic("Unhandled external data abort"); } /* * It is unsafe to access the stack canary value stored in "td" until * kernel map translation faults are handled, see the pmap_klookup() call below. * Thus, stack-smashing detection with per-thread canaries must be disabled in * this function. */ static void NO_PERTHREAD_SSP data_abort(struct thread *td, struct trapframe *frame, uint64_t esr, uint64_t far, int lower) { struct vm_map *map; struct pcb *pcb; vm_prot_t ftype; int error, sig, ucode; #ifdef KDB bool handled; #endif /* * According to the ARMv8-A rev. A.g, B2.10.5 "Load-Exclusive * and Store-Exclusive instruction usage restrictions", state * of the exclusive monitors after data abort exception is unknown. */ clrex(); #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif if (lower) { map = &td->td_proc->p_vmspace->vm_map; } else if (!ADDR_IS_CANONICAL(far)) { /* We received a TBI/PAC/etc. fault from the kernel */ error = KERN_INVALID_ADDRESS; pcb = td->td_pcb; goto bad_far; } else if (ADDR_IS_KERNEL(far)) { /* * Handle a special case: the data abort was caused by accessing * a thread structure while its mapping was being promoted or * demoted, as a consequence of the break-before-make rule. It * is not safe to enable interrupts or dereference "td" before * this case is handled. * * In principle, if pmap_klookup() fails, there is no need to * call pmap_fault() below, but avoiding that call is not worth * the effort. 
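* (If pmap_klookup() finds a valid translation, the fault was transient and returning from the exception simply retries the faulting access.)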
*/ if (ESR_ELx_EXCEPTION(esr) == EXCP_DATA_ABORT) { switch (esr & ISS_DATA_DFSC_MASK) { case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: if (pmap_klookup(far, NULL)) return; break; } } intr_enable(); map = kernel_map; } else { intr_enable(); map = &td->td_proc->p_vmspace->vm_map; if (map == NULL) map = kernel_map; } pcb = td->td_pcb; /* * Try to handle translation, access flag, and permission faults. * Translation faults may occur as a result of the required * break-before-make sequence used when promoting or demoting * superpages. Such faults must not occur while holding the pmap lock, * or pmap_fault() will recurse on that lock. */ if ((lower || map == kernel_map || pcb->pcb_onfault != 0) && pmap_fault(map->pmap, esr, far) == KERN_SUCCESS) return; #ifdef INVARIANTS if (td->td_md.md_spinlock_count != 0) { print_registers(frame); print_gp_register("far", far); printf(" esr: 0x%.16lx\n", esr); panic("data abort with spinlock held (spinlock count %d != 0)", td->td_md.md_spinlock_count); } #endif if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { print_registers(frame); print_gp_register("far", far); printf(" esr: 0x%.16lx\n", esr); panic("data abort in critical section or under mutex"); } switch (ESR_ELx_EXCEPTION(esr)) { case EXCP_INSN_ABORT: case EXCP_INSN_ABORT_L: ftype = VM_PROT_EXECUTE; break; default: /* * If the exception was because of a read or cache operation * pass a read fault type into the vm code. Cache operations * need read permission but will set the WnR flag when the * memory is unmapped. */ if ((esr & ISS_DATA_WnR) == 0 || (esr & ISS_DATA_CM) != 0) ftype = VM_PROT_READ; else ftype = VM_PROT_WRITE; break; } /* Fault in the page. */ error = vm_fault_trap(map, far, ftype, VM_FAULT_NORMAL, &sig, &ucode); if (error != KERN_SUCCESS) { if (lower) { call_trapsignal(td, sig, ucode, (void *)far, ESR_ELx_EXCEPTION(esr)); } else { bad_far: if (td->td_intr_nesting_level == 0 && pcb->pcb_onfault != 0) { frame->tf_x[0] = error; frame->tf_elr = pcb->pcb_onfault; return; } printf("Fatal data abort:\n"); print_registers(frame); print_gp_register("far", far); printf(" esr: 0x%.16lx\n", esr); #ifdef KDB if (debugger_on_trap) { kdb_why = KDB_WHY_TRAP; handled = kdb_trap(ESR_ELx_EXCEPTION(esr), 0, frame); kdb_why = KDB_WHY_UNSET; if (handled) return; } #endif panic("vm_fault failed: 0x%lx error %d", frame->tf_elr, error); } } if (lower) userret(td, frame); } static void print_gp_register(const char *name, uint64_t value) { #if defined(DDB) c_db_sym_t sym; const char *sym_name; db_expr_t sym_value; db_expr_t offset; #endif printf(" %s: 0x%.16lx", name, value); #if defined(DDB) /* If this looks like a kernel address try to find the symbol */ if (value >= VM_MIN_KERNEL_ADDRESS) { sym = db_search_symbol(value, DB_STGY_ANY, &offset); if (sym != C_DB_SYM_NULL) { db_symbol_values(sym, &sym_name, &sym_value); printf(" (%s + 0x%lx)", sym_name, offset); } } #endif printf("\n"); } static void print_registers(struct trapframe *frame) { char name[4]; u_int reg; for (reg = 0; reg < nitems(frame->tf_x); reg++) { snprintf(name, sizeof(name), "%sx%d", (reg < 10) ? 
" " : "", reg); print_gp_register(name, frame->tf_x[reg]); } printf(" sp: 0x%.16lx\n", frame->tf_sp); print_gp_register(" lr", frame->tf_lr); print_gp_register("elr", frame->tf_elr); printf("spsr: 0x%.16lx\n", frame->tf_spsr); } #ifdef VFP static void fpe_trap(struct thread *td, void *addr, uint32_t exception) { int code; code = FPE_FLTIDO; if ((exception & ISS_FP_TFV) != 0) { if ((exception & ISS_FP_IOF) != 0) code = FPE_FLTINV; else if ((exception & ISS_FP_DZF) != 0) code = FPE_FLTDIV; else if ((exception & ISS_FP_OFF) != 0) code = FPE_FLTOVF; else if ((exception & ISS_FP_UFF) != 0) code = FPE_FLTUND; else if ((exception & ISS_FP_IXF) != 0) code = FPE_FLTRES; } call_trapsignal(td, SIGFPE, code, addr, exception); } #endif /* * See the comment above data_abort(). */ void NO_PERTHREAD_SSP do_el1h_sync(struct thread *td, struct trapframe *frame) { uint32_t exception; uint64_t esr, far; int dfsc; kasan_mark(frame, sizeof(*frame), sizeof(*frame), 0); far = frame->tf_far; /* Read the esr register to get the exception details */ esr = frame->tf_esr; exception = ESR_ELx_EXCEPTION(esr); #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, exception)) return; #endif CTR4(KTR_TRAP, "%s: exception=%lu, elr=0x%lx, esr=0x%lx", __func__, exception, frame->tf_elr, esr); /* * Enable debug exceptions if we aren't already handling one. They will * be masked again in the exception handler's epilogue. */ switch (exception) { case EXCP_BRK: case EXCP_BRKPT_EL1: case EXCP_WATCHPT_EL1: case EXCP_SOFTSTP_EL1: break; default: dbg_enable(); break; } switch (exception) { case EXCP_FP_SIMD: case EXCP_TRAP_FP: #ifdef VFP if ((td->td_pcb->pcb_fpflags & PCB_FP_KERN) != 0) { vfp_restore_state(); } else #endif { print_registers(frame); printf(" esr: 0x%.16lx\n", esr); panic("VFP exception in the kernel"); } break; case EXCP_INSN_ABORT: case EXCP_DATA_ABORT: dfsc = esr & ISS_DATA_DFSC_MASK; if (dfsc < nitems(abort_handlers) && abort_handlers[dfsc] != NULL) { abort_handlers[dfsc](td, frame, esr, far, 0); } else { print_registers(frame); print_gp_register("far", far); printf(" esr: 0x%.16lx\n", esr); panic("Unhandled EL1 %s abort: 0x%x", exception == EXCP_INSN_ABORT ? 
"instruction" : "data", dfsc); } break; case EXCP_BRK: #ifdef KDTRACE_HOOKS if ((esr & ESR_ELx_ISS_MASK) == 0x40d /* BRK_IMM16_VAL */ && dtrace_invop_jump_addr != NULL && dtrace_invop_jump_addr(frame) == 0) break; #endif #ifdef KDB kdb_trap(exception, 0, frame); #else panic("No debugger in kernel."); #endif break; case EXCP_BRKPT_EL1: case EXCP_WATCHPT_EL1: case EXCP_SOFTSTP_EL1: #ifdef KDB kdb_trap(exception, 0, frame); #else panic("No debugger in kernel."); #endif break; case EXCP_FPAC: /* We can see this if the authentication on PAC fails */ print_registers(frame); print_gp_register("far", far); panic("FPAC kernel exception"); break; case EXCP_UNKNOWN: if (undef_insn(1, frame)) break; print_registers(frame); print_gp_register("far", far); panic("Undefined instruction: %08x", *(uint32_t *)frame->tf_elr); break; case EXCP_BTI: print_registers(frame); print_gp_register("far", far); panic("Branch Target exception"); break; default: print_registers(frame); print_gp_register("far", far); panic("Unknown kernel exception 0x%x esr_el1 0x%lx", exception, esr); } } void do_el0_sync(struct thread *td, struct trapframe *frame) { pcpu_bp_harden bp_harden; uint32_t exception; uint64_t esr, far; int dfsc; /* Check we have a sane environment when entering from userland */ KASSERT((uintptr_t)get_pcpu() >= VM_MIN_KERNEL_ADDRESS, ("Invalid pcpu address from userland: %p (tpidr 0x%lx)", get_pcpu(), READ_SPECIALREG(tpidr_el1))); kasan_mark(frame, sizeof(*frame), sizeof(*frame), 0); far = frame->tf_far; esr = frame->tf_esr; exception = ESR_ELx_EXCEPTION(esr); if (exception == EXCP_INSN_ABORT_L && far > VM_MAXUSER_ADDRESS) { /* * Userspace may be trying to train the branch predictor to * attack the kernel. If we are on a CPU affected by this * call the handler to clear the branch predictor state. */ bp_harden = PCPU_GET(bp_harden); if (bp_harden != NULL) bp_harden(); } intr_enable(); CTR4(KTR_TRAP, "%s: exception=%lu, elr=0x%lx, esr=0x%lx", __func__, exception, frame->tf_elr, esr); switch (exception) { case EXCP_FP_SIMD: #ifdef VFP vfp_restore_state(); #else panic("VFP exception in userland"); #endif break; case EXCP_TRAP_FP: #ifdef VFP fpe_trap(td, (void *)frame->tf_elr, esr); userret(td, frame); #else panic("VFP exception in userland"); #endif break; case EXCP_SVE: /* Returns true if this thread can use SVE */ if (!sve_restore_state(td)) call_trapsignal(td, SIGILL, ILL_ILLTRP, (void *)frame->tf_elr, exception); userret(td, frame); break; case EXCP_SVC32: case EXCP_SVC64: svc_handler(td, frame); break; case EXCP_INSN_ABORT_L: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: dfsc = esr & ISS_DATA_DFSC_MASK; if (dfsc < nitems(abort_handlers) && abort_handlers[dfsc] != NULL) abort_handlers[dfsc](td, frame, esr, far, 1); else { print_registers(frame); print_gp_register("far", far); printf(" esr: 0x%.16lx\n", esr); panic("Unhandled EL0 %s abort: 0x%x", exception == EXCP_INSN_ABORT_L ? 
"instruction" : "data", dfsc); } break; case EXCP_UNKNOWN: if (!undef_insn(0, frame)) call_trapsignal(td, SIGILL, ILL_ILLTRP, (void *)far, exception); userret(td, frame); break; case EXCP_FPAC: call_trapsignal(td, SIGILL, ILL_ILLOPN, (void *)frame->tf_elr, exception); userret(td, frame); break; case EXCP_SP_ALIGN: call_trapsignal(td, SIGBUS, BUS_ADRALN, (void *)frame->tf_sp, exception); userret(td, frame); break; case EXCP_PC_ALIGN: call_trapsignal(td, SIGBUS, BUS_ADRALN, (void *)frame->tf_elr, exception); userret(td, frame); break; case EXCP_BRKPT_EL0: case EXCP_BRK: #ifdef COMPAT_FREEBSD32 case EXCP_BRKPT_32: #endif /* COMPAT_FREEBSD32 */ call_trapsignal(td, SIGTRAP, TRAP_BRKPT, (void *)frame->tf_elr, exception); userret(td, frame); break; case EXCP_WATCHPT_EL0: call_trapsignal(td, SIGTRAP, TRAP_TRACE, (void *)far, exception); userret(td, frame); break; case EXCP_MSR: /* * The CPU can raise EXCP_MSR when userspace executes an mrs * instruction to access a special register userspace doesn't * have access to. */ if (!undef_insn(0, frame)) call_trapsignal(td, SIGILL, ILL_PRVOPC, (void *)frame->tf_elr, exception); userret(td, frame); break; case EXCP_SOFTSTP_EL0: PROC_LOCK(td->td_proc); if ((td->td_dbgflags & TDB_STEP) != 0) { td->td_frame->tf_spsr &= ~PSR_SS; td->td_pcb->pcb_flags &= ~PCB_SINGLE_STEP; WRITE_SPECIALREG(mdscr_el1, READ_SPECIALREG(mdscr_el1) & ~MDSCR_SS); } PROC_UNLOCK(td->td_proc); call_trapsignal(td, SIGTRAP, TRAP_TRACE, (void *)frame->tf_elr, exception); userret(td, frame); break; case EXCP_BTI: call_trapsignal(td, SIGILL, ILL_ILLOPC, (void *)frame->tf_elr, exception); userret(td, frame); break; default: call_trapsignal(td, SIGBUS, BUS_OBJERR, (void *)frame->tf_elr, exception); userret(td, frame); break; } KASSERT( (td->td_pcb->pcb_fpflags & ~(PCB_FP_USERMASK|PCB_FP_SVEVALID)) == 0, ("Kernel VFP flags set while entering userspace")); KASSERT( td->td_pcb->pcb_fpusaved == &td->td_pcb->pcb_fpustate, ("Kernel VFP state in use when entering userspace")); } /* * TODO: We will need to handle these later when we support ARMv8.2 RAS. */ void do_serror(struct trapframe *frame) { uint64_t esr, far; kasan_mark(frame, sizeof(*frame), sizeof(*frame), 0); far = frame->tf_far; esr = frame->tf_esr; print_registers(frame); print_gp_register("far", far); printf(" esr: 0x%.16lx\n", esr); panic("Unhandled System Error"); } void unhandled_exception(struct trapframe *frame) { uint64_t esr, far; kasan_mark(frame, sizeof(*frame), sizeof(*frame), 0); far = frame->tf_far; esr = frame->tf_esr; print_registers(frame); print_gp_register("far", far); printf(" esr: 0x%.16lx\n", esr); panic("Unhandled exception"); } diff --git a/sys/dev/efidev/efirt.c b/sys/dev/efidev/efirt.c index 1fb9b3bb01d1..4e630a5dde37 100644 --- a/sys/dev/efidev/efirt.c +++ b/sys/dev/efidev/efirt.c @@ -1,835 +1,844 @@ /*- * Copyright (c) 2004 Marcel Moolenaar * Copyright (c) 2001 Doug Rabson * Copyright (c) 2016, 2018 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_ACPI #include #endif #define EFI_TABLE_ALLOC_MAX 0x800000 static struct efi_systbl *efi_systbl; static eventhandler_tag efi_shutdown_tag; /* * The following pointers point to tables in the EFI runtime service data pages. * Care should be taken to make sure that we've properly entered the EFI runtime * environment (efi_enter()) before dereferencing them. */ static struct efi_cfgtbl *efi_cfgtbl; static struct efi_rt *efi_runtime; static int efi_status2err[25] = { 0, /* EFI_SUCCESS */ ENOEXEC, /* EFI_LOAD_ERROR */ EINVAL, /* EFI_INVALID_PARAMETER */ ENOSYS, /* EFI_UNSUPPORTED */ EMSGSIZE, /* EFI_BAD_BUFFER_SIZE */ EOVERFLOW, /* EFI_BUFFER_TOO_SMALL */ EBUSY, /* EFI_NOT_READY */ EIO, /* EFI_DEVICE_ERROR */ EROFS, /* EFI_WRITE_PROTECTED */ EAGAIN, /* EFI_OUT_OF_RESOURCES */ EIO, /* EFI_VOLUME_CORRUPTED */ ENOSPC, /* EFI_VOLUME_FULL */ ENXIO, /* EFI_NO_MEDIA */ ESTALE, /* EFI_MEDIA_CHANGED */ ENOENT, /* EFI_NOT_FOUND */ EACCES, /* EFI_ACCESS_DENIED */ ETIMEDOUT, /* EFI_NO_RESPONSE */ EADDRNOTAVAIL, /* EFI_NO_MAPPING */ ETIMEDOUT, /* EFI_TIMEOUT */ EDOOFUS, /* EFI_NOT_STARTED */ EALREADY, /* EFI_ALREADY_STARTED */ ECANCELED, /* EFI_ABORTED */ EPROTO, /* EFI_ICMP_ERROR */ EPROTO, /* EFI_TFTP_ERROR */ EPROTO /* EFI_PROTOCOL_ERROR */ }; enum efi_table_type { TYPE_ESRT = 0, TYPE_PROP }; static int efi_enter(void); static void efi_leave(void); int efi_status_to_errno(efi_status status) { u_long code; code = status & 0x3ffffffffffffffful; return (code < nitems(efi_status2err) ? 
efi_status2err[code] : EDOOFUS); } static struct mtx efi_lock; -static SYSCTL_NODE(_hw, OID_AUTO, efi, CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, +SYSCTL_NODE(_hw, OID_AUTO, efi, CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, "EFI"); static bool efi_poweroff = true; SYSCTL_BOOL(_hw_efi, OID_AUTO, poweroff, CTLFLAG_RWTUN, &efi_poweroff, 0, "If true, use EFI runtime services to power off in preference to ACPI"); +extern int print_efirt_faults; +SYSCTL_INT(_hw_efi, OID_AUTO, print_faults, CTLFLAG_RWTUN, + &print_efirt_faults, 0, + "Print fault information upon trap from EFIRT calls: " + "0 - never, 1 - once, 2 - always"); +extern u_long cnt_efirt_faults; +SYSCTL_ULONG(_hw_efi, OID_AUTO, total_faults, CTLFLAG_RD, + &cnt_efirt_faults, 0, + "Total number of faults that occurred during EFIRT calls"); static bool efi_is_in_map(struct efi_md *map, int ndesc, int descsz, vm_offset_t addr) { struct efi_md *p; int i; for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p, descsz)) { if ((p->md_attr & EFI_MD_ATTR_RT) == 0) continue; if (addr >= p->md_virt && addr < p->md_virt + p->md_pages * EFI_PAGE_SIZE) return (true); } return (false); } static void efi_shutdown_final(void *dummy __unused, int howto) { /* * On some systems, ACPI S5 is missing or does not function properly. * When present, shutdown via EFI Runtime Services instead, unless * disabled. */ if ((howto & RB_POWEROFF) != 0 && efi_poweroff) (void)efi_reset_system(EFI_RESET_SHUTDOWN); } static int efi_init(void) { struct efi_map_header *efihdr; struct efi_md *map; struct efi_rt *rtdm; caddr_t kmdp; size_t efisz; int ndesc, rt_disabled; rt_disabled = 0; TUNABLE_INT_FETCH("efi.rt.disabled", &rt_disabled); if (rt_disabled == 1) return (0); mtx_init(&efi_lock, "efi", NULL, MTX_DEF); if (efi_systbl_phys == 0) { if (bootverbose) printf("EFI systbl not available\n"); return (0); } efi_systbl = (struct efi_systbl *)efi_phys_to_kva(efi_systbl_phys); if (efi_systbl == NULL || efi_systbl->st_hdr.th_sig != EFI_SYSTBL_SIG) { efi_systbl = NULL; if (bootverbose) printf("EFI systbl signature invalid\n"); return (0); } efi_cfgtbl = (efi_systbl->st_cfgtbl == 0) ? NULL : (struct efi_cfgtbl *)efi_systbl->st_cfgtbl; if (efi_cfgtbl == NULL) { if (bootverbose) printf("EFI config table is not present\n"); } kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); efihdr = (struct efi_map_header *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP); if (efihdr == NULL) { if (bootverbose) printf("EFI map is not present\n"); return (0); } efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf; map = (struct efi_md *)((uint8_t *)efihdr + efisz); if (efihdr->descriptor_size == 0) return (ENOMEM); ndesc = efihdr->memory_size / efihdr->descriptor_size; if (!efi_create_1t1_map(map, ndesc, efihdr->descriptor_size)) { if (bootverbose) printf("EFI cannot create runtime map\n"); return (ENOMEM); } efi_runtime = (efi_systbl->st_rt == 0) ? NULL : (struct efi_rt *)efi_systbl->st_rt; if (efi_runtime == NULL) { if (bootverbose) printf("EFI runtime services table is not present\n"); efi_destroy_1t1_map(); return (ENXIO); } #if defined(__aarch64__) || defined(__amd64__) /* * Some UEFI implementations have multiple implementations of the * RS->GetTime function. They switch from one we can only use early * in the boot process to one valid as a RunTime service only when we * call RS->SetVirtualAddressMap. As this is not always the case, e.g. 
static bool
efi_is_in_map(struct efi_md *map, int ndesc, int descsz, vm_offset_t addr)
{
	struct efi_md *p;
	int i;

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, descsz)) {
		if ((p->md_attr & EFI_MD_ATTR_RT) == 0)
			continue;

		if (addr >= p->md_virt &&
		    addr < p->md_virt + p->md_pages * EFI_PAGE_SIZE)
			return (true);
	}

	return (false);
}

static void
efi_shutdown_final(void *dummy __unused, int howto)
{
	/*
	 * On some systems, ACPI S5 is missing or does not function properly.
	 * When present, shutdown via EFI Runtime Services instead, unless
	 * disabled.
	 */
	if ((howto & RB_POWEROFF) != 0 && efi_poweroff)
		(void)efi_reset_system(EFI_RESET_SHUTDOWN);
}

static int
efi_init(void)
{
	struct efi_map_header *efihdr;
	struct efi_md *map;
	struct efi_rt *rtdm;
	caddr_t kmdp;
	size_t efisz;
	int ndesc, rt_disabled;

	rt_disabled = 0;
	TUNABLE_INT_FETCH("efi.rt.disabled", &rt_disabled);
	if (rt_disabled == 1)
		return (0);
	mtx_init(&efi_lock, "efi", NULL, MTX_DEF);

	if (efi_systbl_phys == 0) {
		if (bootverbose)
			printf("EFI systbl not available\n");
		return (0);
	}

	efi_systbl = (struct efi_systbl *)efi_phys_to_kva(efi_systbl_phys);
	if (efi_systbl == NULL || efi_systbl->st_hdr.th_sig != EFI_SYSTBL_SIG) {
		efi_systbl = NULL;
		if (bootverbose)
			printf("EFI systbl signature invalid\n");
		return (0);
	}
	efi_cfgtbl = (efi_systbl->st_cfgtbl == 0) ? NULL :
	    (struct efi_cfgtbl *)efi_systbl->st_cfgtbl;
	if (efi_cfgtbl == NULL) {
		if (bootverbose)
			printf("EFI config table is not present\n");
	}

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL) {
		if (bootverbose)
			printf("EFI map is not present\n");
		return (0);
	}
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
	if (efihdr->descriptor_size == 0)
		return (ENOMEM);
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (!efi_create_1t1_map(map, ndesc, efihdr->descriptor_size)) {
		if (bootverbose)
			printf("EFI cannot create runtime map\n");
		return (ENOMEM);
	}

	efi_runtime = (efi_systbl->st_rt == 0) ? NULL :
	    (struct efi_rt *)efi_systbl->st_rt;
	if (efi_runtime == NULL) {
		if (bootverbose)
			printf("EFI runtime services table is not present\n");
		efi_destroy_1t1_map();
		return (ENXIO);
	}

#if defined(__aarch64__) || defined(__amd64__)
	/*
	 * Some UEFI implementations have multiple implementations of the
	 * RS->GetTime function.  They switch from one we can only use early
	 * in the boot process to one valid as a RunTime service only when
	 * we call RS->SetVirtualAddressMap.  As this is not always the case,
	 * e.g. with an old loader.efi, check if the RS->GetTime function is
	 * within the EFI map, and fail to attach if not.
	 */
	rtdm = (struct efi_rt *)efi_phys_to_kva((uintptr_t)efi_runtime);
	if (rtdm == NULL || !efi_is_in_map(map, ndesc,
	    efihdr->descriptor_size, (vm_offset_t)rtdm->rt_gettime)) {
		if (bootverbose)
			printf(
			    "EFI runtime services table has an invalid pointer\n");
		efi_runtime = NULL;
		efi_destroy_1t1_map();
		return (ENXIO);
	}
#endif

	/*
	 * We use SHUTDOWN_PRI_LAST - 1 to trigger after IPMI, but before
	 * ACPI.
	 */
	efi_shutdown_tag = EVENTHANDLER_REGISTER(shutdown_final,
	    efi_shutdown_final, NULL, SHUTDOWN_PRI_LAST - 1);

	return (0);
}

static void
efi_uninit(void)
{

	/* Most likely disabled by tunable */
	if (efi_runtime == NULL)
		return;
	if (efi_shutdown_tag != NULL)
		EVENTHANDLER_DEREGISTER(shutdown_final, efi_shutdown_tag);
	efi_destroy_1t1_map();

	efi_systbl = NULL;
	efi_cfgtbl = NULL;
	efi_runtime = NULL;

	mtx_destroy(&efi_lock);
}

static int
rt_ok(void)
{

	if (efi_runtime == NULL)
		return (ENXIO);
	return (0);
}

/*
 * The fpu_kern_enter() call in efi_enter() allows firmware to use the FPU,
 * as mandated by the specification.  It also enters a critical section,
 * giving us the necessary protection against context switches.
 */
static int
efi_enter(void)
{
	struct thread *td;
	pmap_t curpmap;
	int error;

	if (efi_runtime == NULL)
		return (ENXIO);
	td = curthread;
	curpmap = &td->td_proc->p_vmspace->vm_pmap;
	PMAP_LOCK(curpmap);
	mtx_lock(&efi_lock);
	fpu_kern_enter(td, NULL, FPU_KERN_NOCTX);
	error = efi_arch_enter();
	if (error != 0) {
		fpu_kern_leave(td, NULL);
		mtx_unlock(&efi_lock);
		PMAP_UNLOCK(curpmap);
	} else {
		MPASS((td->td_pflags2 & TDP2_EFIRT) == 0);
		td->td_pflags2 |= TDP2_EFIRT;
	}
	return (error);
}

static void
efi_leave(void)
{
	struct thread *td;
	pmap_t curpmap;

	td = curthread;
	MPASS((td->td_pflags2 & TDP2_EFIRT) != 0);
	td->td_pflags2 &= ~TDP2_EFIRT;

	efi_arch_leave();

	curpmap = &curproc->p_vmspace->vm_pmap;
	fpu_kern_leave(td, NULL);
	mtx_unlock(&efi_lock);
	PMAP_UNLOCK(curpmap);
}

static int
get_table(struct uuid *uuid, void **ptr)
{
	struct efi_cfgtbl *ct;
	u_long count;
	int error;

	if (efi_cfgtbl == NULL || efi_systbl == NULL)
		return (ENXIO);
	error = efi_enter();
	if (error != 0)
		return (error);
	count = efi_systbl->st_entries;
	ct = efi_cfgtbl;
	while (count--) {
		if (!bcmp(&ct->ct_uuid, uuid, sizeof(*uuid))) {
			*ptr = ct->ct_data;
			efi_leave();
			return (0);
		}
		ct++;
	}

	efi_leave();
	return (ENOENT);
}
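/*
 * A minimal sketch of the convention every wrapper in this file follows
 * (efi_foo_example and the elided service call are hypothetical): enter
 * the EFI environment, do the work, and call efi_leave() on every path so
 * the pmap lock, efi_lock, FPU state, and TDP2_EFIRT flag are restored.
 */
static int
efi_foo_example(void)
{
	int error;

	error = efi_enter();
	if (error != 0)
		return (error);
	/* ... dereference EFI tables or invoke a runtime service ... */
	efi_leave();
	return (0);
}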
static int
get_table_length(enum efi_table_type type, size_t *table_len, void **taddr)
{
	switch (type) {
	case TYPE_ESRT:
	{
		struct efi_esrt_table *esrt = NULL;
		struct uuid uuid = EFI_TABLE_ESRT;
		uint32_t fw_resource_count = 0;
		size_t len = sizeof(*esrt);
		int error;
		void *buf;

		error = efi_get_table(&uuid, (void **)&esrt);
		if (error != 0)
			return (error);

		buf = malloc(len, M_TEMP, M_WAITOK);
		error = physcopyout((vm_paddr_t)esrt, buf, len);
		if (error != 0) {
			free(buf, M_TEMP);
			return (error);
		}

		/* Check ESRT version */
		if (((struct efi_esrt_table *)buf)->fw_resource_version !=
		    ESRT_FIRMWARE_RESOURCE_VERSION) {
			free(buf, M_TEMP);
			return (ENODEV);
		}

		fw_resource_count = ((struct efi_esrt_table *)buf)->
		    fw_resource_count;
		if (fw_resource_count > EFI_TABLE_ALLOC_MAX /
		    sizeof(struct efi_esrt_entry_v1)) {
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		len += fw_resource_count * sizeof(struct efi_esrt_entry_v1);
		*table_len = len;

		if (taddr != NULL)
			*taddr = esrt;
		free(buf, M_TEMP);
		return (0);
	}
	case TYPE_PROP:
	{
		struct uuid uuid = EFI_PROPERTIES_TABLE;
		struct efi_prop_table *prop;
		size_t len = sizeof(*prop);
		uint32_t prop_len;
		int error;
		void *buf;

		error = efi_get_table(&uuid, (void **)&prop);
		if (error != 0)
			return (error);

		buf = malloc(len, M_TEMP, M_WAITOK);
		error = physcopyout((vm_paddr_t)prop, buf, len);
		if (error != 0) {
			free(buf, M_TEMP);
			return (error);
		}

		prop_len = ((struct efi_prop_table *)buf)->length;
		if (prop_len > EFI_TABLE_ALLOC_MAX) {
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*table_len = prop_len;

		if (taddr != NULL)
			*taddr = prop;
		free(buf, M_TEMP);
		return (0);
	}
	}
	return (ENOENT);
}

static int
copy_table(struct uuid *uuid, void **buf, size_t buf_len, size_t *table_len)
{
	static const struct known_table {
		struct uuid uuid;
		enum efi_table_type type;
	} tables[] = {
		{ EFI_TABLE_ESRT, TYPE_ESRT },
		{ EFI_PROPERTIES_TABLE, TYPE_PROP }
	};
	size_t table_idx;
	void *taddr;
	int rc;

	for (table_idx = 0; table_idx < nitems(tables); table_idx++) {
		if (!bcmp(&tables[table_idx].uuid, uuid, sizeof(*uuid)))
			break;
	}

	if (table_idx == nitems(tables))
		return (EINVAL);

	rc = get_table_length(tables[table_idx].type, table_len, &taddr);
	if (rc != 0)
		return (rc);

	/* return table length to userspace */
	if (buf == NULL)
		return (0);

	*buf = malloc(*table_len, M_TEMP, M_WAITOK);
	rc = physcopyout((vm_paddr_t)taddr, *buf, *table_len);
	return (rc);
}

static int efi_rt_handle_faults = EFI_RT_HANDLE_FAULTS_DEFAULT;
SYSCTL_INT(_machdep, OID_AUTO, efi_rt_handle_faults, CTLFLAG_RWTUN,
    &efi_rt_handle_faults, 0,
    "Call EFI RT methods with fault handler wrapper around");

static int
efi_rt_arch_call_nofault(struct efirt_callinfo *ec)
{

	switch (ec->ec_argcnt) {
	case 0:
		ec->ec_efi_status = ((register_t EFIABI_ATTR (*)(void))
		    ec->ec_fptr)();
		break;
	case 1:
		ec->ec_efi_status = ((register_t EFIABI_ATTR (*)(register_t))
		    ec->ec_fptr)(ec->ec_arg1);
		break;
	case 2:
		ec->ec_efi_status = ((register_t EFIABI_ATTR (*)(register_t,
		    register_t))ec->ec_fptr)(ec->ec_arg1, ec->ec_arg2);
		break;
	case 3:
		ec->ec_efi_status = ((register_t EFIABI_ATTR (*)(register_t,
		    register_t, register_t))ec->ec_fptr)(ec->ec_arg1,
		    ec->ec_arg2, ec->ec_arg3);
		break;
	case 4:
		ec->ec_efi_status = ((register_t EFIABI_ATTR (*)(register_t,
		    register_t, register_t, register_t))ec->ec_fptr)(
		    ec->ec_arg1, ec->ec_arg2, ec->ec_arg3, ec->ec_arg4);
		break;
	case 5:
		ec->ec_efi_status = ((register_t EFIABI_ATTR (*)(register_t,
		    register_t, register_t, register_t, register_t))
		    ec->ec_fptr)(ec->ec_arg1, ec->ec_arg2, ec->ec_arg3,
		    ec->ec_arg4, ec->ec_arg5);
		break;
	default:
		panic("efi_rt_arch_call: %d args", (int)ec->ec_argcnt);
	}

	return (0);
}
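/*
 * Hedged note, not part of the source: efi_rt_arch_call() (MD code, not
 * shown in this file) is assumed to install a fault handler around the
 * same dispatch, so with the default policy a misbehaving firmware call
 * surfaces as an errno from efi_call() below rather than a panic.  The
 * selection in efi_call() reduces to:
 *
 *	error = efi_rt_handle_faults ?
 *	    efi_rt_arch_call(ecp) :		(protected, may fail)
 *	    efi_rt_arch_call_nofault(ecp);	(raw, always returns 0)
 */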
static int
efi_call(struct efirt_callinfo *ecp)
{
	int error;

	error = efi_enter();
	if (error != 0)
		return (error);
	error = efi_rt_handle_faults ? efi_rt_arch_call(ecp) :
	    efi_rt_arch_call_nofault(ecp);
	efi_leave();
	if (error == 0)
		error = efi_status_to_errno(ecp->ec_efi_status);
	else if (bootverbose)
		printf("EFI %s call faulted, error %d\n", ecp->ec_name, error);
	return (error);
}

#define	EFI_RT_METHOD_PA(method)				\
    ((uintptr_t)((struct efi_rt *)efi_phys_to_kva((uintptr_t)	\
	efi_runtime))->method)

static int
efi_get_time_locked(struct efi_tm *tm, struct efi_tmcap *tmcap)
{
	struct efirt_callinfo ec;
	int error;

	EFI_TIME_OWNED();
	if (efi_runtime == NULL)
		return (ENXIO);
	bzero(&ec, sizeof(ec));
	ec.ec_name = "rt_gettime";
	ec.ec_argcnt = 2;
	ec.ec_arg1 = (uintptr_t)tm;
	ec.ec_arg2 = (uintptr_t)tmcap;
	ec.ec_fptr = EFI_RT_METHOD_PA(rt_gettime);
	error = efi_call(&ec);
	if (error == 0)
		kmsan_mark(tm, sizeof(*tm), KMSAN_STATE_INITED);
	return (error);
}

static int
get_time(struct efi_tm *tm)
{
	struct efi_tmcap dummy;
	int error;

	if (efi_runtime == NULL)
		return (ENXIO);
	EFI_TIME_LOCK();
	/*
	 * UEFI spec states that the Capabilities argument to GetTime is
	 * optional, but some UEFI implementations choke when passed a NULL
	 * pointer.  Pass a dummy efi_tmcap, even though we won't use it,
	 * to work around such implementations.
	 */
	error = efi_get_time_locked(tm, &dummy);
	EFI_TIME_UNLOCK();
	return (error);
}
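/*
 * Sketch of how a new wrapper would be built from the pieces above;
 * example_call and the rt_example slot are hypothetical and do not exist
 * in the EFI runtime services table:
 */
static int
example_call(void *arg)
{
	struct efirt_callinfo ec;

	if (efi_runtime == NULL)
		return (ENXIO);
	bzero(&ec, sizeof(ec));
	ec.ec_name = "rt_example";
	ec.ec_argcnt = 1;
	ec.ec_arg1 = (uintptr_t)arg;
	ec.ec_fptr = EFI_RT_METHOD_PA(rt_example);	/* hypothetical slot */
	return (efi_call(&ec));
}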
static int
get_waketime(uint8_t *enabled, uint8_t *pending, struct efi_tm *tm)
{
	struct efirt_callinfo ec;
	int error;
#ifdef DEV_ACPI
	UINT32 acpiRtcEnabled;
#endif

	if (efi_runtime == NULL)
		return (ENXIO);

	EFI_TIME_LOCK();
	bzero(&ec, sizeof(ec));
	ec.ec_name = "rt_getwaketime";
	ec.ec_argcnt = 3;
	ec.ec_arg1 = (uintptr_t)enabled;
	ec.ec_arg2 = (uintptr_t)pending;
	ec.ec_arg3 = (uintptr_t)tm;
	ec.ec_fptr = EFI_RT_METHOD_PA(rt_getwaketime);
	error = efi_call(&ec);
	EFI_TIME_UNLOCK();

#ifdef DEV_ACPI
	if (error == 0) {
		error = AcpiReadBitRegister(ACPI_BITREG_RT_CLOCK_ENABLE,
		    &acpiRtcEnabled);
		if (ACPI_SUCCESS(error)) {
			*enabled = *enabled && acpiRtcEnabled;
		} else
			error = EIO;
	}
#endif
	return (error);
}

static int
set_waketime(uint8_t enable, struct efi_tm *tm)
{
	struct efirt_callinfo ec;
	int error;

	if (efi_runtime == NULL)
		return (ENXIO);

	EFI_TIME_LOCK();
	bzero(&ec, sizeof(ec));
	ec.ec_name = "rt_setwaketime";
	ec.ec_argcnt = 2;
	ec.ec_arg1 = (uintptr_t)enable;
	ec.ec_arg2 = (uintptr_t)tm;
	ec.ec_fptr = EFI_RT_METHOD_PA(rt_setwaketime);
	error = efi_call(&ec);
	EFI_TIME_UNLOCK();

#ifdef DEV_ACPI
	if (error == 0) {
		error = AcpiWriteBitRegister(ACPI_BITREG_RT_CLOCK_ENABLE,
		    (enable != 0) ? 1 : 0);
		if (ACPI_FAILURE(error))
			error = EIO;
	}
#endif
	return (error);
}

static int
get_time_capabilities(struct efi_tmcap *tmcap)
{
	struct efi_tm dummy;
	int error;

	if (efi_runtime == NULL)
		return (ENXIO);
	EFI_TIME_LOCK();
	error = efi_get_time_locked(&dummy, tmcap);
	EFI_TIME_UNLOCK();
	return (error);
}

static int
reset_system(enum efi_reset type)
{
	struct efirt_callinfo ec;

	switch (type) {
	case EFI_RESET_COLD:
	case EFI_RESET_WARM:
	case EFI_RESET_SHUTDOWN:
		break;
	default:
		return (EINVAL);
	}
	if (efi_runtime == NULL)
		return (ENXIO);
	bzero(&ec, sizeof(ec));
	ec.ec_name = "rt_reset";
	ec.ec_argcnt = 4;
	ec.ec_arg1 = (uintptr_t)type;
	ec.ec_arg2 = (uintptr_t)0;
	ec.ec_arg3 = (uintptr_t)0;
	ec.ec_arg4 = (uintptr_t)NULL;
	ec.ec_fptr = EFI_RT_METHOD_PA(rt_reset);
	return (efi_call(&ec));
}

static int
efi_set_time_locked(struct efi_tm *tm)
{
	struct efirt_callinfo ec;

	EFI_TIME_OWNED();
	if (efi_runtime == NULL)
		return (ENXIO);
	bzero(&ec, sizeof(ec));
	ec.ec_name = "rt_settime";
	ec.ec_argcnt = 1;
	ec.ec_arg1 = (uintptr_t)tm;
	ec.ec_fptr = EFI_RT_METHOD_PA(rt_settime);
	return (efi_call(&ec));
}

static int
set_time(struct efi_tm *tm)
{
	int error;

	if (efi_runtime == NULL)
		return (ENXIO);
	EFI_TIME_LOCK();
	error = efi_set_time_locked(tm);
	EFI_TIME_UNLOCK();
	return (error);
}

static int
var_get(efi_char *name, struct uuid *vendor, uint32_t *attrib,
    size_t *datasize, void *data)
{
	struct efirt_callinfo ec;
	int error;

	if (efi_runtime == NULL)
		return (ENXIO);

	bzero(&ec, sizeof(ec));
	ec.ec_argcnt = 5;
	ec.ec_name = "rt_getvar";
	ec.ec_arg1 = (uintptr_t)name;
	ec.ec_arg2 = (uintptr_t)vendor;
	ec.ec_arg3 = (uintptr_t)attrib;
	ec.ec_arg4 = (uintptr_t)datasize;
	ec.ec_arg5 = (uintptr_t)data;
	ec.ec_fptr = EFI_RT_METHOD_PA(rt_getvar);
	error = efi_call(&ec);
	if (error == 0)
		kmsan_mark(data, *datasize, KMSAN_STATE_INITED);
	return (error);
}

static int
var_nextname(size_t *namesize, efi_char *name, struct uuid *vendor)
{
	struct efirt_callinfo ec;
	int error;

	if (efi_runtime == NULL)
		return (ENXIO);

	bzero(&ec, sizeof(ec));
	ec.ec_argcnt = 3;
	ec.ec_name = "rt_scanvar";
	ec.ec_arg1 = (uintptr_t)namesize;
	ec.ec_arg2 = (uintptr_t)name;
	ec.ec_arg3 = (uintptr_t)vendor;
	ec.ec_fptr = EFI_RT_METHOD_PA(rt_scanvar);
	error = efi_call(&ec);
	if (error == 0)
		kmsan_mark(name, *namesize, KMSAN_STATE_INITED);
	return (error);
}

static int
var_set(efi_char *name, struct uuid *vendor, uint32_t attrib,
    size_t datasize, void *data)
{
	struct efirt_callinfo ec;

	if (efi_runtime == NULL)
		return (ENXIO);

	bzero(&ec, sizeof(ec));
	ec.ec_argcnt = 5;
	ec.ec_name = "rt_setvar";
	ec.ec_arg1 = (uintptr_t)name;
	ec.ec_arg2 = (uintptr_t)vendor;
	ec.ec_arg3 = (uintptr_t)attrib;
	ec.ec_arg4 = (uintptr_t)datasize;
	ec.ec_arg5 = (uintptr_t)data;
	ec.ec_fptr = EFI_RT_METHOD_PA(rt_setvar);
	return (efi_call(&ec));
}

const static struct efi_ops efi_ops = {
	.rt_ok = rt_ok,
	.get_table = get_table,
	.copy_table = copy_table,
	.get_time = get_time,
	.get_time_capabilities = get_time_capabilities,
	.reset_system = reset_system,
	.set_time = set_time,
	.get_waketime = get_waketime,
	.set_waketime = set_waketime,
	.var_get = var_get,
	.var_nextname = var_nextname,
	.var_set = var_set,
};
const struct efi_ops *active_efi_ops = &efi_ops;

static int
efirt_modevents(module_t m, int event, void *arg __unused)
{

	switch (event) {
	case MOD_LOAD:
		return (efi_init());

	case MOD_UNLOAD:
		efi_uninit();
		return (0);

	case MOD_SHUTDOWN:
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t efirt_moddata = {
	.name = "efirt",
	.evhand = efirt_modevents,
	.priv = NULL,
};
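/*
 * Illustrative consumer pattern for var_get(), with hypothetical locals:
 * UEFI GetVariable reports the required size via EFI_BUFFER_TOO_SMALL,
 * which efi_status_to_errno() maps to EOVERFLOW, so a probe call with a
 * zero-sized buffer learns the length before allocating:
 *
 *	datasize = 0;
 *	error = var_get(name, &vendor, &attrib, &datasize, NULL);
 *	if (error == EOVERFLOW) {
 *		data = malloc(datasize, M_TEMP, M_WAITOK);
 *		error = var_get(name, &vendor, &attrib, &datasize, data);
 *	}
 */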
/* After fpuinitstate, before efidev */
DECLARE_MODULE(efirt, efirt_moddata, SI_SUB_DRIVERS, SI_ORDER_SECOND);
MODULE_VERSION(efirt, 1);
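/*
 * Sketch of a kernel-side consumer (example_read_clock is hypothetical;
 * real users are expected to go through the efi_get_time() wrappers in
 * <sys/efi.h>): the active_efi_ops indirection lets callers fail cleanly
 * with ENXIO when runtime services are unavailable.
 */
static int
example_read_clock(struct efi_tm *tm)
{
	return (active_efi_ops->get_time != NULL ?
	    active_efi_ops->get_time(tm) : ENXIO);
}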