Index: head/sys/kern/kern_shutdown.c =================================================================== --- head/sys/kern/kern_shutdown.c (revision 366502) +++ head/sys/kern/kern_shutdown.c (revision 366503) @@ -1,1752 +1,1751 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ekcd.h" #include "opt_kdb.h" #include "opt_panic.h" #include "opt_printf.h" #include "opt_sched.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_DUMPER, "dumper", "dumper block buffer"); #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif static int panic_reboot_wait_time = PANIC_REBOOT_WAIT_TIME; SYSCTL_INT(_kern, OID_AUTO, panic_reboot_wait_time, CTLFLAG_RWTUN, &panic_reboot_wait_time, 0, "Seconds to wait before rebooting after a panic"); /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. 
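The kern.panic_reboot_wait_time knob declared above is an ordinary read/write tunable, so it can be queried (or set, with sufficient privilege) from userland. A minimal sketch using sysctlbyname(3); the program is illustrative and not part of this change:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    int wait;
    size_t len = sizeof(wait);

    /* Seconds the kernel waits before rebooting after a panic. */
    if (sysctlbyname("kern.panic_reboot_wait_time", &wait, &len,
        NULL, 0) == -1) {
        perror("sysctlbyname");
        return (1);
    }
    printf("kern.panic_reboot_wait_time = %d\n", wait);
    return (0);
}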
*/ #include #ifdef KDB #ifdef KDB_UNATTENDED int debugger_on_panic = 0; #else int debugger_on_panic = 1; #endif SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_panic, 0, "Run debugger on kernel panic"); int debugger_on_trap = 0; SYSCTL_INT(_debug, OID_AUTO, debugger_on_trap, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_trap, 0, "Run debugger on kernel trap before panic"); #ifdef KDB_TRACE static int trace_on_panic = 1; static bool trace_all_panics = true; #else static int trace_on_panic = 0; static bool trace_all_panics = false; #endif SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &trace_on_panic, 0, "Print stack trace on kernel panic"); SYSCTL_BOOL(_debug, OID_AUTO, trace_all_panics, CTLFLAG_RWTUN, &trace_all_panics, 0, "Print stack traces on secondary kernel panics"); #endif /* KDB */ static int sync_on_panic = 0; SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RWTUN, &sync_on_panic, 0, "Do a sync before rebooting from a panic"); static bool poweroff_on_panic = 0; SYSCTL_BOOL(_kern, OID_AUTO, poweroff_on_panic, CTLFLAG_RWTUN, &poweroff_on_panic, 0, "Do a power off instead of a reboot on a panic"); static bool powercycle_on_panic = 0; SYSCTL_BOOL(_kern, OID_AUTO, powercycle_on_panic, CTLFLAG_RWTUN, &powercycle_on_panic, 0, "Do a power cycle instead of a reboot on a panic"); static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Shutdown environment"); #ifndef DIAGNOSTIC static int show_busybufs; #else static int show_busybufs = 1; #endif SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW, &show_busybufs, 0, "Show busy buffers during shutdown"); int suspend_blocked = 0; SYSCTL_INT(_kern, OID_AUTO, suspend_blocked, CTLFLAG_RW, &suspend_blocked, 0, "Block suspend due to a pending shutdown"); #ifdef EKCD FEATURE(ekcd, "Encrypted kernel crash dumps support"); MALLOC_DEFINE(M_EKCD, "ekcd", "Encrypted kernel crash dumps data"); struct kerneldumpcrypto { uint8_t kdc_encryption; uint8_t kdc_iv[KERNELDUMP_IV_MAX_SIZE]; union { struct { keyInstance aes_ki; cipherInstance aes_ci; } u_aes; struct chacha_ctx u_chacha; } u; #define kdc_ki u.u_aes.aes_ki #define kdc_ci u.u_aes.aes_ci #define kdc_chacha u.u_chacha uint32_t kdc_dumpkeysize; struct kerneldumpkey kdc_dumpkey[]; }; #endif struct kerneldumpcomp { uint8_t kdc_format; struct compressor *kdc_stream; uint8_t *kdc_buf; size_t kdc_resid; }; static struct kerneldumpcomp *kerneldumpcomp_create(struct dumperinfo *di, uint8_t compression); static void kerneldumpcomp_destroy(struct dumperinfo *di); static int kerneldumpcomp_write_cb(void *base, size_t len, off_t off, void *arg); static int kerneldump_gzlevel = 6; SYSCTL_INT(_kern, OID_AUTO, kerneldump_gzlevel, CTLFLAG_RWTUN, &kerneldump_gzlevel, 0, "Kernel crash dump compression level"); /* * Variable panicstr contains argument to first call to panic; used as flag * to indicate that the kernel has already called panic. */ const char *panicstr; bool __read_frequently panicked; int __read_mostly dumping; /* system is dumping */ int rebooting; /* system is rebooting */ /* * Used to serialize between sysctl kern.shutdown.dumpdevname and list * modifications via ioctl. */ static struct mtx dumpconf_list_lk; MTX_SYSINIT(dumper_configs, &dumpconf_list_lk, "dumper config list", MTX_DEF); /* Our selected dumper(s). */ static TAILQ_HEAD(dumpconflist, dumperinfo) dumper_configs = TAILQ_HEAD_INITIALIZER(dumper_configs); /* Context information for dump-debuggers. 
*/ static struct pcb dumppcb; /* Registers. */ lwpid_t dumptid; /* Thread ID. */ static struct cdevsw reroot_cdevsw = { .d_version = D_VERSION, .d_name = "reroot", }; static void poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); static int kern_reroot(void); /* register various local shutdown events */ static void shutdown_conf(void *unused) { EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200); } SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL); /* * The only reason this exists is to create the /dev/reroot/ directory, * used by reroot code in init(8) as a mountpoint for tmpfs. */ static void reroot_conf(void *unused) { int error; struct cdev *cdev; error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &cdev, &reroot_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "reroot/reroot"); if (error != 0) { printf("%s: failed to create device node, error %d", __func__, error); } } SYSINIT(reroot_conf, SI_SUB_DEVFS, SI_ORDER_ANY, reroot_conf, NULL); /* * The system call that results in a reboot. */ /* ARGSUSED */ int sys_reboot(struct thread *td, struct reboot_args *uap) { int error; error = 0; #ifdef MAC error = mac_system_check_reboot(td->td_ucred, uap->opt); #endif if (error == 0) error = priv_check(td, PRIV_REBOOT); if (error == 0) { if (uap->opt & RB_REROOT) error = kern_reroot(); else kern_reboot(uap->opt); } return (error); } static void shutdown_nice_task_fn(void *arg, int pending __unused) { int howto; howto = (uintptr_t)arg; /* Send a signal to init(8) and have it shutdown the world. */ PROC_LOCK(initproc); if (howto & RB_POWEROFF) kern_psignal(initproc, SIGUSR2); else if (howto & RB_POWERCYCLE) kern_psignal(initproc, SIGWINCH); else if (howto & RB_HALT) kern_psignal(initproc, SIGUSR1); else kern_psignal(initproc, SIGINT); PROC_UNLOCK(initproc); } static struct task shutdown_nice_task = TASK_INITIALIZER(0, &shutdown_nice_task_fn, NULL); /* * Called by events that want to shut down.. e.g on a PC */ void shutdown_nice(int howto) { if (initproc != NULL && !SCHEDULER_STOPPED()) { shutdown_nice_task.ta_context = (void *)(uintptr_t)howto; taskqueue_enqueue(taskqueue_fast, &shutdown_nice_task); } else { /* * No init(8) running, or scheduler would not allow it * to run, so simply reboot. 
*/ kern_reboot(howto | RB_NOSYNC); } } static void print_uptime(void) { int f; struct timespec ts; getnanouptime(&ts); printf("Uptime: "); f = 0; if (ts.tv_sec >= 86400) { printf("%ldd", (long)ts.tv_sec / 86400); ts.tv_sec %= 86400; f = 1; } if (f || ts.tv_sec >= 3600) { printf("%ldh", (long)ts.tv_sec / 3600); ts.tv_sec %= 3600; f = 1; } if (f || ts.tv_sec >= 60) { printf("%ldm", (long)ts.tv_sec / 60); ts.tv_sec %= 60; f = 1; } printf("%lds\n", (long)ts.tv_sec); } int doadump(boolean_t textdump) { boolean_t coredump; int error; error = 0; if (dumping) return (EBUSY); if (TAILQ_EMPTY(&dumper_configs)) return (ENXIO); savectx(&dumppcb); dumptid = curthread->td_tid; dumping++; coredump = TRUE; #ifdef DDB if (textdump && textdump_pending) { coredump = FALSE; textdump_dumpsys(TAILQ_FIRST(&dumper_configs)); } #endif if (coredump) { struct dumperinfo *di; TAILQ_FOREACH(di, &dumper_configs, di_next) { error = dumpsys(di); if (error == 0) break; } } dumping--; return (error); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void kern_reboot(int howto) { static int once = 0; /* * Normal paths here don't hold Giant, but we can wind up here * unexpectedly with it held. Drop it now so we don't have to * drop and pick it up elsewhere. The paths it is locking will * never be returned to, and it is preferable to preclude * deadlock than to lock against code that won't ever * continue. */ while (mtx_owned(&Giant)) mtx_unlock(&Giant); #if defined(SMP) /* * Bind us to the first CPU so that all shutdown code runs there. Some * systems don't shutdown properly (i.e., ACPI power off) if we * run on another processor. */ if (!SCHEDULER_STOPPED()) { thread_lock(curthread); sched_bind(curthread, CPU_FIRST()); thread_unlock(curthread); KASSERT(PCPU_GET(cpuid) == CPU_FIRST(), ("boot: not running on cpu 0")); } #endif /* We're in the process of rebooting. */ rebooting = 1; /* We are out of the debugger now. */ kdb_active = 0; /* * Do any callouts that should be done BEFORE syncing the filesystems. */ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto); /* * Now sync filesystems */ if (!cold && (howto & RB_NOSYNC) == 0 && once == 0) { once = 1; bufshutdown(show_busybufs); } print_uptime(); cngrab(); /* * Ok, now do things that assume all filesystem activity has * been completed. */ EVENTHANDLER_INVOKE(shutdown_post_sync, howto); if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping) doadump(TRUE); /* Now that we're going to really halt the system... */ EVENTHANDLER_INVOKE(shutdown_final, howto); for(;;) ; /* safety against shutdown_reset not working */ /* NOTREACHED */ } /* * The system call that results in changing the rootfs. */ static int kern_reroot(void) { struct vnode *oldrootvnode, *vp; struct mount *mp, *devmp; int error; if (curproc != initproc) return (EPERM); /* * Mark the filesystem containing currently-running executable * (the temporary copy of init(8)) busy. */ vp = curproc->p_textvp; error = vn_lock(vp, LK_SHARED); if (error != 0) return (error); mp = vp->v_mount; error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp); error = vfs_busy(mp, 0); vn_lock(vp, LK_SHARED | LK_RETRY); vfs_rel(mp); if (error != 0) { VOP_UNLOCK(vp); return (ENOENT); } if (VN_IS_DOOMED(vp)) { VOP_UNLOCK(vp); vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp); /* * Remove the filesystem containing currently-running executable * from the mount list, to prevent it from being unmounted * by vfs_unmountall(), and to avoid confusing vfs_mountroot(). 
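print_uptime() above folds the elapsed seconds into days, hours, minutes and seconds by successive division and remainder, printing a smaller field once any larger one has been printed. A minimal userland sketch of the same arithmetic (the sample value is made up):

#include <stdio.h>
#include <time.h>

static void
print_uptime_like(time_t secs)
{
    int printed = 0;

    printf("Uptime: ");
    if (secs >= 86400) {
        printf("%ldd", (long)(secs / 86400));
        secs %= 86400;
        printed = 1;
    }
    if (printed || secs >= 3600) {
        printf("%ldh", (long)(secs / 3600));
        secs %= 3600;
        printed = 1;
    }
    if (printed || secs >= 60) {
        printf("%ldm", (long)(secs / 60));
        secs %= 60;
    }
    printf("%lds\n", (long)secs);
}

int
main(void)
{
    print_uptime_like(93784);   /* prints: Uptime: 1d2h3m4s */
    return (0);
}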
* * Also preserve /dev - forcibly unmounting it could cause driver * reinitialization. */ vfs_ref(rootdevmp); devmp = rootdevmp; rootdevmp = NULL; mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); TAILQ_REMOVE(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); oldrootvnode = rootvnode; /* * Unmount everything except for the two filesystems preserved above. */ vfs_unmountall(); /* * Add /dev back; vfs_mountroot() will move it into its new place. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); rootdevmp = devmp; vfs_rel(rootdevmp); /* * Mount the new rootfs. */ vfs_mountroot(); /* * Update all references to the old rootvnode. */ mountcheckdirs(oldrootvnode, rootvnode); /* * Add the temporary filesystem back and unbusy it. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_unbusy(mp); return (0); } /* * If the shutdown was a clean halt, behave accordingly. */ static void shutdown_halt(void *junk, int howto) { if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); wdog_kern_pat(WD_TO_NEVER); switch (cngetc()) { case -1: /* No console, just die */ cpu_halt(); /* NOTREACHED */ default: break; } } } /* * Check to see if the system paniced, pause and then reboot * according to the specified delay. */ static void shutdown_panic(void *junk, int howto) { int loop; if (howto & RB_DUMP) { if (panic_reboot_wait_time != 0) { if (panic_reboot_wait_time != -1) { printf("Automatic reboot in %d seconds - " "press a key on the console to abort\n", panic_reboot_wait_time); for (loop = panic_reboot_wait_time * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ /* Did user type a key? */ if (cncheckc() != -1) break; } if (!loop) return; } } else { /* zero time specified - reboot NOW */ return; } printf("--> Press a key on the console to reboot,\n"); printf("--> or switch off the system now.\n"); cngetc(); } } /* * Everything done, now reset */ static void shutdown_reset(void *junk, int howto) { printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ /* * Acquiring smp_ipi_mtx here has a double effect: * - it disables interrupts avoiding CPU0 preemption * by fast handlers (thus deadlocking against other CPUs) * - it avoids deadlocks against smp_rendezvous() or, more * generally, threads busy-waiting, with this spinlock held, * and waiting for responses by threads on other CPUs * (ie. smp_tlb_shootdown()). * * For the !SMP case it just needs to handle the former problem. 
*/ #ifdef SMP mtx_lock_spin(&smp_ipi_mtx); #else spinlock_enter(); #endif - /* cpu_boot(howto); */ /* doesn't do anything at the moment */ cpu_reset(); /* NOTREACHED */ /* assuming reset worked */ } #if defined(WITNESS) || defined(INVARIANT_SUPPORT) static int kassert_warn_only = 0; #ifdef KDB static int kassert_do_kdb = 0; #endif #ifdef KTR static int kassert_do_ktr = 0; #endif static int kassert_do_log = 1; static int kassert_log_pps_limit = 4; static int kassert_log_mute_at = 0; static int kassert_log_panic_at = 0; static int kassert_suppress_in_panic = 0; static int kassert_warnings = 0; SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "kassert options"); #ifdef KASSERT_PANIC_OPTIONAL #define KASSERT_RWTUN CTLFLAG_RWTUN #else #define KASSERT_RWTUN CTLFLAG_RDTUN #endif SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, KASSERT_RWTUN, &kassert_warn_only, 0, "KASSERT triggers a panic (0) or just a warning (1)"); #ifdef KDB SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, KASSERT_RWTUN, &kassert_do_kdb, 0, "KASSERT will enter the debugger"); #endif #ifdef KTR SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, KASSERT_RWTUN, &kassert_do_ktr, 0, "KASSERT does a KTR, set this to the KTRMASK you want"); #endif SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, KASSERT_RWTUN, &kassert_do_log, 0, "If warn_only is enabled, log (1) or do not log (0) assertion violations"); SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RD | CTLFLAG_STATS, &kassert_warnings, 0, "number of KASSERTs that have been triggered"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, KASSERT_RWTUN, &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, KASSERT_RWTUN, &kassert_log_pps_limit, 0, "limit number of log messages per second"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, KASSERT_RWTUN, &kassert_log_mute_at, 0, "max number of KASSERTS to log"); SYSCTL_INT(_debug_kassert, OID_AUTO, suppress_in_panic, KASSERT_RWTUN, &kassert_suppress_in_panic, 0, "KASSERTs will be suppressed while handling a panic"); #undef KASSERT_RWTUN static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_NEEDGIANT, NULL, 0, kassert_sysctl_kassert, "I", "set to trigger a test kassert"); static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i)); return (0); } #ifdef KASSERT_PANIC_OPTIONAL /* * Called by KASSERT, this decides if we will panic * or if we will log via printf and/or ktr. */ void kassert_panic(const char *fmt, ...) { static char buf[256]; va_list ap; va_start(ap, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); /* * If we are suppressing secondary panics, log the warning but do not * re-enter panic/kdb. */ if (panicstr != NULL && kassert_suppress_in_panic) { if (kassert_do_log) { printf("KASSERT failed: %s\n", buf); #ifdef KDB if (trace_all_panics && trace_on_panic) kdb_backtrace(); #endif } return; } /* * panic if we're not just warning, or if we've exceeded * kassert_log_panic_at warnings. 
*/ if (!kassert_warn_only || (kassert_log_panic_at > 0 && kassert_warnings >= kassert_log_panic_at)) { va_start(ap, fmt); vpanic(fmt, ap); /* NORETURN */ } #ifdef KTR if (kassert_do_ktr) CTR0(ktr_mask, buf); #endif /* KTR */ /* * log if we've not yet met the mute limit. */ if (kassert_do_log && (kassert_log_mute_at == 0 || kassert_warnings < kassert_log_mute_at)) { static struct timeval lasterr; static int curerr; if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) { printf("KASSERT failed: %s\n", buf); kdb_backtrace(); } } #ifdef KDB if (kassert_do_kdb) { kdb_enter(KDB_WHY_KASSERT, buf); } #endif atomic_add_int(&kassert_warnings, 1); } #endif /* KASSERT_PANIC_OPTIONAL */ #endif /* * Panic is called on unresolvable fatal errors. It prints "panic: mesg", * and then reboots. If we are called twice, then we avoid trying to sync * the disks as this often leads to recursive panics. */ void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vpanic(fmt, ap); } void vpanic(const char *fmt, va_list ap) { #ifdef SMP cpuset_t other_cpus; #endif struct thread *td = curthread; int bootopt, newpanic; static char buf[256]; spinlock_enter(); #ifdef SMP /* * stop_cpus_hard(other_cpus) should prevent multiple CPUs from * concurrently entering panic. Only the winner will proceed * further. */ if (panicstr == NULL && !kdb_active) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); stop_cpus_hard(other_cpus); } #endif /* * Ensure that the scheduler is stopped while panicking, even if panic * has been entered from kdb. */ td->td_stopsched = 1; bootopt = RB_AUTOBOOT; newpanic = 0; if (panicstr) bootopt |= RB_NOSYNC; else { bootopt |= RB_DUMP; panicstr = fmt; panicked = true; newpanic = 1; } if (newpanic) { (void)vsnprintf(buf, sizeof(buf), fmt, ap); panicstr = buf; cngrab(); printf("panic: %s\n", buf); } else { printf("panic: "); vprintf(fmt, ap); printf("\n"); } #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif printf("time = %jd\n", (intmax_t )time_second); #ifdef KDB if ((newpanic || trace_all_panics) && trace_on_panic) kdb_backtrace(); if (debugger_on_panic) kdb_enter(KDB_WHY_PANIC, "panic"); #endif /*thread_lock(td); */ td->td_flags |= TDF_INPANIC; /* thread_unlock(td); */ if (!sync_on_panic) bootopt |= RB_NOSYNC; if (poweroff_on_panic) bootopt |= RB_POWEROFF; if (powercycle_on_panic) bootopt |= RB_POWERCYCLE; kern_reboot(bootopt); } /* * Support for poweroff delay. * * Please note that setting this delay too short might power off your machine * before the write cache on your hard disk has been flushed, leading to * soft-updates inconsistencies. */ #ifndef POWEROFF_DELAY # define POWEROFF_DELAY 5000 #endif static int poweroff_delay = POWEROFF_DELAY; SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW, &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)"); static void poweroff_wait(void *junk, int howto) { if ((howto & (RB_POWEROFF | RB_POWERCYCLE)) == 0 || poweroff_delay <= 0) return; DELAY(poweroff_delay * 1000); } /* * Some system processes (e.g. syncer) need to be stopped at appropriate * points in their main loops prior to a system shutdown, so that they * won't interfere with the shutdown process (e.g. by holding a disk buf * to cause sync to fail). For each of these system processes, register * shutdown_kproc() as a handler for one of shutdown events. 
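The comment above describes only half of the mechanism: each such system process hooks kproc_shutdown() (or kthread_shutdown() for a kernel thread) into one of the shutdown_* event lists, using the same EVENTHANDLER_REGISTER() pattern as shutdown_conf() earlier in this file. A hedged sketch of such a registration; the event, argument ('myproc') and priority are illustrative, not a specific in-tree consumer:

    /* Park 'myproc' before the filesystems are synced at shutdown. */
    EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, myproc,
        SHUTDOWN_PRI_FIRST);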
*/ static int kproc_shutdown_wait = 60; SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW, &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process"); void kproc_shutdown(void *arg, int howto) { struct proc *p; int error; if (panicstr) return; p = (struct proc *)arg; printf("Waiting (max %d seconds) for system process `%s' to stop... ", kproc_shutdown_wait, p->p_comm); error = kproc_suspend(p, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } void kthread_shutdown(void *arg, int howto) { struct thread *td; int error; if (panicstr) return; td = (struct thread *)arg; printf("Waiting (max %d seconds) for system thread `%s' to stop... ", kproc_shutdown_wait, td->td_name); error = kthread_suspend(td, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } static int dumpdevname_sysctl_handler(SYSCTL_HANDLER_ARGS) { char buf[256]; struct dumperinfo *di; struct sbuf sb; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req); mtx_lock(&dumpconf_list_lk); TAILQ_FOREACH(di, &dumper_configs, di_next) { if (di != TAILQ_FIRST(&dumper_configs)) sbuf_putc(&sb, ','); sbuf_cat(&sb, di->di_devname); } mtx_unlock(&dumpconf_list_lk); error = sbuf_finish(&sb); sbuf_delete(&sb); return (error); } SYSCTL_PROC(_kern_shutdown, OID_AUTO, dumpdevname, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, &dumper_configs, 0, dumpdevname_sysctl_handler, "A", "Device(s) for kernel dumps"); static int _dump_append(struct dumperinfo *di, void *virtual, vm_offset_t physical, size_t length); #ifdef EKCD static struct kerneldumpcrypto * kerneldumpcrypto_create(size_t blocksize, uint8_t encryption, const uint8_t *key, uint32_t encryptedkeysize, const uint8_t *encryptedkey) { struct kerneldumpcrypto *kdc; struct kerneldumpkey *kdk; uint32_t dumpkeysize; dumpkeysize = roundup2(sizeof(*kdk) + encryptedkeysize, blocksize); kdc = malloc(sizeof(*kdc) + dumpkeysize, M_EKCD, M_WAITOK | M_ZERO); arc4rand(kdc->kdc_iv, sizeof(kdc->kdc_iv), 0); kdc->kdc_encryption = encryption; switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_makeKey(&kdc->kdc_ki, DIR_ENCRYPT, 256, key) <= 0) goto failed; break; case KERNELDUMP_ENC_CHACHA20: chacha_keysetup(&kdc->kdc_chacha, key, 256); break; default: goto failed; } kdc->kdc_dumpkeysize = dumpkeysize; kdk = kdc->kdc_dumpkey; kdk->kdk_encryption = kdc->kdc_encryption; memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv)); kdk->kdk_encryptedkeysize = htod32(encryptedkeysize); memcpy(kdk->kdk_encryptedkey, encryptedkey, encryptedkeysize); return (kdc); failed: zfree(kdc, M_EKCD); return (NULL); } static int kerneldumpcrypto_init(struct kerneldumpcrypto *kdc) { uint8_t hash[SHA256_DIGEST_LENGTH]; SHA256_CTX ctx; struct kerneldumpkey *kdk; int error; error = 0; if (kdc == NULL) return (0); /* * When a user enters ddb it can write a crash dump multiple times. * Each time it should be encrypted using a different IV. 
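The next hunk implements that by deriving each new IV as the SHA-256 digest of the previous one. A standalone sketch of the same chaining using libmd's sha256(3) interface (link with -lmd); the 16-byte IV size below is illustrative, the kernel uses KERNELDUMP_IV_MAX_SIZE:

#include <sha256.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

#define IVSIZE  16      /* illustrative; not the kernel constant */

/* Replace the IV with the (truncated) SHA-256 of its previous value. */
static void
step_iv(uint8_t iv[IVSIZE])
{
    SHA256_CTX ctx;
    uint8_t hash[SHA256_DIGEST_LENGTH];

    SHA256_Init(&ctx);
    SHA256_Update(&ctx, iv, IVSIZE);
    SHA256_Final(hash, &ctx);
    memcpy(iv, hash, IVSIZE);
    explicit_bzero(hash, sizeof(hash));
}

int
main(void)
{
    uint8_t iv[IVSIZE] = { 0 };
    int i;

    for (i = 0; i < 3; i++) {
        step_iv(iv);
        printf("dump %d IV starts with %02x%02x%02x%02x\n",
            i + 1, iv[0], iv[1], iv[2], iv[3]);
    }
    return (0);
}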
*/ SHA256_Init(&ctx); SHA256_Update(&ctx, kdc->kdc_iv, sizeof(kdc->kdc_iv)); SHA256_Final(hash, &ctx); bcopy(hash, kdc->kdc_iv, sizeof(kdc->kdc_iv)); switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC, kdc->kdc_iv) <= 0) { error = EINVAL; goto out; } break; case KERNELDUMP_ENC_CHACHA20: chacha_ivsetup(&kdc->kdc_chacha, kdc->kdc_iv, NULL); break; default: error = EINVAL; goto out; } kdk = kdc->kdc_dumpkey; memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv)); out: explicit_bzero(hash, sizeof(hash)); return (error); } static uint32_t kerneldumpcrypto_dumpkeysize(const struct kerneldumpcrypto *kdc) { if (kdc == NULL) return (0); return (kdc->kdc_dumpkeysize); } #endif /* EKCD */ static struct kerneldumpcomp * kerneldumpcomp_create(struct dumperinfo *di, uint8_t compression) { struct kerneldumpcomp *kdcomp; int format; switch (compression) { case KERNELDUMP_COMP_GZIP: format = COMPRESS_GZIP; break; case KERNELDUMP_COMP_ZSTD: format = COMPRESS_ZSTD; break; default: return (NULL); } kdcomp = malloc(sizeof(*kdcomp), M_DUMPER, M_WAITOK | M_ZERO); kdcomp->kdc_format = compression; kdcomp->kdc_stream = compressor_init(kerneldumpcomp_write_cb, format, di->maxiosize, kerneldump_gzlevel, di); if (kdcomp->kdc_stream == NULL) { free(kdcomp, M_DUMPER); return (NULL); } kdcomp->kdc_buf = malloc(di->maxiosize, M_DUMPER, M_WAITOK | M_NODUMP); return (kdcomp); } static void kerneldumpcomp_destroy(struct dumperinfo *di) { struct kerneldumpcomp *kdcomp; kdcomp = di->kdcomp; if (kdcomp == NULL) return; compressor_fini(kdcomp->kdc_stream); zfree(kdcomp->kdc_buf, M_DUMPER); free(kdcomp, M_DUMPER); } /* * Must not be present on global list. */ static void free_single_dumper(struct dumperinfo *di) { if (di == NULL) return; zfree(di->blockbuf, M_DUMPER); kerneldumpcomp_destroy(di); #ifdef EKCD zfree(di->kdcrypto, M_EKCD); #endif zfree(di, M_DUMPER); } /* Registration of dumpers */ int dumper_insert(const struct dumperinfo *di_template, const char *devname, const struct diocskerneldump_arg *kda) { struct dumperinfo *newdi, *listdi; bool inserted; uint8_t index; int error; index = kda->kda_index; MPASS(index != KDA_REMOVE && index != KDA_REMOVE_DEV && index != KDA_REMOVE_ALL); error = priv_check(curthread, PRIV_SETDUMPER); if (error != 0) return (error); newdi = malloc(sizeof(*newdi) + strlen(devname) + 1, M_DUMPER, M_WAITOK | M_ZERO); memcpy(newdi, di_template, sizeof(*newdi)); newdi->blockbuf = NULL; newdi->kdcrypto = NULL; newdi->kdcomp = NULL; strcpy(newdi->di_devname, devname); if (kda->kda_encryption != KERNELDUMP_ENC_NONE) { #ifdef EKCD newdi->kdcrypto = kerneldumpcrypto_create(di_template->blocksize, kda->kda_encryption, kda->kda_key, kda->kda_encryptedkeysize, kda->kda_encryptedkey); if (newdi->kdcrypto == NULL) { error = EINVAL; goto cleanup; } #else error = EOPNOTSUPP; goto cleanup; #endif } if (kda->kda_compression != KERNELDUMP_COMP_NONE) { #ifdef EKCD /* * We can't support simultaneous unpadded block cipher * encryption and compression because there is no guarantee the * length of the compressed result is exactly a multiple of the * cipher block size. 
*/ if (kda->kda_encryption == KERNELDUMP_ENC_AES_256_CBC) { error = EOPNOTSUPP; goto cleanup; } #endif newdi->kdcomp = kerneldumpcomp_create(newdi, kda->kda_compression); if (newdi->kdcomp == NULL) { error = EINVAL; goto cleanup; } } newdi->blockbuf = malloc(newdi->blocksize, M_DUMPER, M_WAITOK | M_ZERO); /* Add the new configuration to the queue */ mtx_lock(&dumpconf_list_lk); inserted = false; TAILQ_FOREACH(listdi, &dumper_configs, di_next) { if (index == 0) { TAILQ_INSERT_BEFORE(listdi, newdi, di_next); inserted = true; break; } index--; } if (!inserted) TAILQ_INSERT_TAIL(&dumper_configs, newdi, di_next); mtx_unlock(&dumpconf_list_lk); return (0); cleanup: free_single_dumper(newdi); return (error); } #ifdef DDB void dumper_ddb_insert(struct dumperinfo *newdi) { TAILQ_INSERT_HEAD(&dumper_configs, newdi, di_next); } void dumper_ddb_remove(struct dumperinfo *di) { TAILQ_REMOVE(&dumper_configs, di, di_next); } #endif static bool dumper_config_match(const struct dumperinfo *di, const char *devname, const struct diocskerneldump_arg *kda) { if (kda->kda_index == KDA_REMOVE_ALL) return (true); if (strcmp(di->di_devname, devname) != 0) return (false); /* * Allow wildcard removal of configs matching a device on g_dev_orphan. */ if (kda->kda_index == KDA_REMOVE_DEV) return (true); if (di->kdcomp != NULL) { if (di->kdcomp->kdc_format != kda->kda_compression) return (false); } else if (kda->kda_compression != KERNELDUMP_COMP_NONE) return (false); #ifdef EKCD if (di->kdcrypto != NULL) { if (di->kdcrypto->kdc_encryption != kda->kda_encryption) return (false); /* * Do we care to verify keys match to delete? It seems weird * to expect multiple fallback dump configurations on the same * device that only differ in crypto key. */ } else #endif if (kda->kda_encryption != KERNELDUMP_ENC_NONE) return (false); return (true); } int dumper_remove(const char *devname, const struct diocskerneldump_arg *kda) { struct dumperinfo *di, *sdi; bool found; int error; error = priv_check(curthread, PRIV_SETDUMPER); if (error != 0) return (error); /* * Try to find a matching configuration, and kill it. * * NULL 'kda' indicates remove any configuration matching 'devname', * which may remove multiple configurations in atypical configurations. */ found = false; mtx_lock(&dumpconf_list_lk); TAILQ_FOREACH_SAFE(di, &dumper_configs, di_next, sdi) { if (dumper_config_match(di, devname, kda)) { found = true; TAILQ_REMOVE(&dumper_configs, di, di_next); free_single_dumper(di); } } mtx_unlock(&dumpconf_list_lk); /* Only produce ENOENT if a more targeted match didn't match. 
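The insertion loop above places the new dumper configuration at the caller-requested index and falls back to the tail when the list is shorter than that. The same walk-and-count pattern works with <sys/queue.h> in userland; a small self-contained sketch:

#include <sys/queue.h>
#include <stdbool.h>
#include <stdio.h>

struct entry {
    int                 val;
    TAILQ_ENTRY(entry)  link;
};

TAILQ_HEAD(entryhead, entry);

/*
 * Insert 'new' at position 'index' (0 == head), or at the tail if the
 * list has fewer than 'index' elements.
 */
static void
insert_at_index(struct entryhead *head, struct entry *new, unsigned index)
{
    struct entry *it;
    bool inserted = false;

    TAILQ_FOREACH(it, head, link) {
        if (index == 0) {
            TAILQ_INSERT_BEFORE(it, new, link);
            inserted = true;
            break;
        }
        index--;
    }
    if (!inserted)
        TAILQ_INSERT_TAIL(head, new, link);
}

int
main(void)
{
    struct entryhead head = TAILQ_HEAD_INITIALIZER(head);
    struct entry a = { .val = 1 }, b = { .val = 2 }, c = { .val = 3 };
    struct entry *it;

    insert_at_index(&head, &a, 0);
    insert_at_index(&head, &b, 0);  /* becomes the new head */
    insert_at_index(&head, &c, 9);  /* past the end -> tail */
    TAILQ_FOREACH(it, &head, link)
        printf("%d ", it->val);
    printf("\n");                   /* prints: 2 1 3 */
    return (0);
}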
*/ if (!found && kda->kda_index == KDA_REMOVE) return (ENOENT); return (0); } static int dump_check_bounds(struct dumperinfo *di, off_t offset, size_t length) { if (di->mediasize > 0 && length != 0 && (offset < di->mediaoffset || offset - di->mediaoffset + length > di->mediasize)) { if (di->kdcomp != NULL && offset >= di->mediaoffset) { printf( "Compressed dump failed to fit in device boundaries.\n"); return (E2BIG); } printf("Attempt to write outside dump device boundaries.\n" "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n", (intmax_t)offset, (intmax_t)di->mediaoffset, (uintmax_t)length, (intmax_t)di->mediasize); return (ENOSPC); } if (length % di->blocksize != 0) { printf("Attempt to write partial block of length %ju.\n", (uintmax_t)length); return (EINVAL); } if (offset % di->blocksize != 0) { printf("Attempt to write at unaligned offset %jd.\n", (intmax_t)offset); return (EINVAL); } return (0); } #ifdef EKCD static int dump_encrypt(struct kerneldumpcrypto *kdc, uint8_t *buf, size_t size) { switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_blockEncrypt(&kdc->kdc_ci, &kdc->kdc_ki, buf, 8 * size, buf) <= 0) { return (EIO); } if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC, buf + size - 16 /* IV size for AES-256-CBC */) <= 0) { return (EIO); } break; case KERNELDUMP_ENC_CHACHA20: chacha_encrypt_bytes(&kdc->kdc_chacha, buf, buf, size); break; default: return (EINVAL); } return (0); } /* Encrypt data and call dumper. */ static int dump_encrypted_write(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length) { static uint8_t buf[KERNELDUMP_BUFFER_SIZE]; struct kerneldumpcrypto *kdc; int error; size_t nbytes; kdc = di->kdcrypto; while (length > 0) { nbytes = MIN(length, sizeof(buf)); bcopy(virtual, buf, nbytes); if (dump_encrypt(kdc, buf, nbytes) != 0) return (EIO); error = dump_write(di, buf, physical, offset, nbytes); if (error != 0) return (error); offset += nbytes; virtual = (void *)((uint8_t *)virtual + nbytes); length -= nbytes; } return (0); } #endif /* EKCD */ static int kerneldumpcomp_write_cb(void *base, size_t length, off_t offset, void *arg) { struct dumperinfo *di; size_t resid, rlength; int error; di = arg; if (length % di->blocksize != 0) { /* * This must be the final write after flushing the compression * stream. Write as many full blocks as possible and stash the * residual data in the dumper's block buffer. It will be * padded and written in dump_finish(). */ rlength = rounddown(length, di->blocksize); if (rlength != 0) { error = _dump_append(di, base, 0, rlength); if (error != 0) return (error); } resid = length - rlength; memmove(di->blockbuf, (uint8_t *)base + rlength, resid); bzero((uint8_t *)di->blockbuf + resid, di->blocksize - resid); di->kdcomp->kdc_resid = resid; return (EAGAIN); } return (_dump_append(di, base, 0, length)); } /* * Write kernel dump headers at the beginning and end of the dump extent. * Write the kernel dump encryption key after the leading header if we were * configured to do so. 
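kerneldumpcomp_write_cb() above splits the compressor's final, unaligned output into the largest block-aligned prefix plus a residual that is stashed and padded out to a full block in dump_finish(). A minimal sketch of that arithmetic, with rounddown() (normally from sys/param.h) spelled out:

#include <stdio.h>

#define ROUNDDOWN(x, y) (((x) / (y)) * (y))

static void
split_tail(size_t length, size_t blocksize)
{
    size_t rlength, resid;

    rlength = ROUNDDOWN(length, blocksize); /* full blocks written now */
    resid = length - rlength;               /* bytes stashed for padding */
    printf("length %zu: write %zu now, hold %zu (pad %zu)\n",
        length, rlength, resid, resid == 0 ? 0 : blocksize - resid);
}

int
main(void)
{
    split_tail(4096 * 3 + 100, 512);    /* 12388 -> 12288 now + 100 held */
    split_tail(4096, 512);              /* already block-aligned */
    return (0);
}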
*/ static int dump_write_headers(struct dumperinfo *di, struct kerneldumpheader *kdh) { #ifdef EKCD struct kerneldumpcrypto *kdc; #endif void *buf, *key; size_t hdrsz; uint64_t extent; uint32_t keysize; int error; hdrsz = sizeof(*kdh); if (hdrsz > di->blocksize) return (ENOMEM); #ifdef EKCD kdc = di->kdcrypto; key = kdc->kdc_dumpkey; keysize = kerneldumpcrypto_dumpkeysize(kdc); #else key = NULL; keysize = 0; #endif /* * If the dump device has special handling for headers, let it take care * of writing them out. */ if (di->dumper_hdr != NULL) return (di->dumper_hdr(di, kdh, key, keysize)); if (hdrsz == di->blocksize) buf = kdh; else { buf = di->blockbuf; memset(buf, 0, di->blocksize); memcpy(buf, kdh, hdrsz); } extent = dtoh64(kdh->dumpextent); #ifdef EKCD if (kdc != NULL) { error = dump_write(di, kdc->kdc_dumpkey, 0, di->mediaoffset + di->mediasize - di->blocksize - extent - keysize, keysize); if (error != 0) return (error); } #endif error = dump_write(di, buf, 0, di->mediaoffset + di->mediasize - 2 * di->blocksize - extent - keysize, di->blocksize); if (error == 0) error = dump_write(di, buf, 0, di->mediaoffset + di->mediasize - di->blocksize, di->blocksize); return (error); } /* * Don't touch the first SIZEOF_METADATA bytes on the dump device. This is to * protect us from metadata and metadata from us. */ #define SIZEOF_METADATA (64 * 1024) /* * Do some preliminary setup for a kernel dump: initialize state for encryption, * if requested, and make sure that we have enough space on the dump device. * * We set things up so that the dump ends before the last sector of the dump * device, at which the trailing header is written. * * +-----------+------+-----+----------------------------+------+ * | | lhdr | key | ... kernel dump ... | thdr | * +-----------+------+-----+----------------------------+------+ * 1 blk opt <------- dump extent --------> 1 blk * * Dumps written using dump_append() start at the beginning of the extent. * Uncompressed dumps will use the entire extent, but compressed dumps typically * will not. The true length of the dump is recorded in the leading and trailing * headers once the dump has been completed. * * The dump device may provide a callback, in which case it will initialize * dumpoff and take care of laying out the headers. */ int dump_start(struct dumperinfo *di, struct kerneldumpheader *kdh) { uint64_t dumpextent, span; uint32_t keysize; int error; #ifdef EKCD error = kerneldumpcrypto_init(di->kdcrypto); if (error != 0) return (error); keysize = kerneldumpcrypto_dumpkeysize(di->kdcrypto); #else error = 0; keysize = 0; #endif if (di->dumper_start != NULL) { error = di->dumper_start(di); } else { dumpextent = dtoh64(kdh->dumpextent); span = SIZEOF_METADATA + dumpextent + 2 * di->blocksize + keysize; if (di->mediasize < span) { if (di->kdcomp == NULL) return (E2BIG); /* * We don't yet know how much space the compressed dump * will occupy, so try to use the whole swap partition * (minus the first 64KB) in the hope that the * compressed dump will fit. If that doesn't turn out to * be enough, the bounds checking in dump_write() * will catch us and cause the dump to fail. */ dumpextent = di->mediasize - span + dumpextent; kdh->dumpextent = htod64(dumpextent); } /* * The offset at which to begin writing the dump. 
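Everything in the diagram above is anchored to the end of the dump device: the trailing header occupies the last block, the dump extent sits directly in front of it, and the optional key plus the leading header come before that. A small sketch of the same offset arithmetic; all sizes below are invented for illustration:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    uint64_t mediaoffset = 0;
    uint64_t mediasize = 1ULL << 30;    /* 1 GiB dump device */
    uint64_t blocksize = 512;
    uint64_t keysize = 512;             /* 0 when not encrypting */
    uint64_t extent = 256ULL << 20;     /* requested dump extent */
    uint64_t end = mediaoffset + mediasize;

    printf("leading header at  %ju\n",
        (uintmax_t)(end - 2 * blocksize - extent - keysize));
    printf("key at             %ju\n",
        (uintmax_t)(end - blocksize - extent - keysize));
    printf("dump starts at     %ju (dumpoff)\n",
        (uintmax_t)(end - blocksize - extent));
    printf("trailing header at %ju\n", (uintmax_t)(end - blocksize));
    return (0);
}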
*/ di->dumpoff = di->mediaoffset + di->mediasize - di->blocksize - dumpextent; } di->origdumpoff = di->dumpoff; return (error); } static int _dump_append(struct dumperinfo *di, void *virtual, vm_offset_t physical, size_t length) { int error; #ifdef EKCD if (di->kdcrypto != NULL) error = dump_encrypted_write(di, virtual, physical, di->dumpoff, length); else #endif error = dump_write(di, virtual, physical, di->dumpoff, length); if (error == 0) di->dumpoff += length; return (error); } /* * Write to the dump device starting at dumpoff. When compression is enabled, * writes to the device will be performed using a callback that gets invoked * when the compression stream's output buffer is full. */ int dump_append(struct dumperinfo *di, void *virtual, vm_offset_t physical, size_t length) { void *buf; if (di->kdcomp != NULL) { /* Bounce through a buffer to avoid CRC errors. */ if (length > di->maxiosize) return (EINVAL); buf = di->kdcomp->kdc_buf; memmove(buf, virtual, length); return (compressor_write(di->kdcomp->kdc_stream, buf, length)); } return (_dump_append(di, virtual, physical, length)); } /* * Write to the dump device at the specified offset. */ int dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length) { int error; error = dump_check_bounds(di, offset, length); if (error != 0) return (error); return (di->dumper(di->priv, virtual, physical, offset, length)); } /* * Perform kernel dump finalization: flush the compression stream, if necessary, * write the leading and trailing kernel dump headers now that we know the true * length of the dump, and optionally write the encryption key following the * leading header. */ int dump_finish(struct dumperinfo *di, struct kerneldumpheader *kdh) { int error; if (di->kdcomp != NULL) { error = compressor_flush(di->kdcomp->kdc_stream); if (error == EAGAIN) { /* We have residual data in di->blockbuf. */ error = _dump_append(di, di->blockbuf, 0, di->blocksize); if (error == 0) /* Compensate for _dump_append()'s adjustment. */ di->dumpoff -= di->blocksize - di->kdcomp->kdc_resid; di->kdcomp->kdc_resid = 0; } if (error != 0) return (error); /* * We now know the size of the compressed dump, so update the * header accordingly and recompute parity. 
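The compensation above keeps the recorded dump length honest: the residual is physically written as one full padded block, so dumpoff is pulled back by the amount of padding before dumplength is computed from dumpoff - origdumpoff. A toy calculation of that bookkeeping (all numbers invented):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    uint64_t blocksize = 512;
    uint64_t origdumpoff = 1000000;
    uint64_t dumpoff = origdumpoff + 8 * blocksize; /* 8 full blocks so far */
    uint64_t resid = 100;       /* compressed bytes left in blockbuf */

    dumpoff += blocksize;           /* _dump_append() writes a full block */
    dumpoff -= blocksize - resid;   /* ...but only 'resid' bytes are payload */
    printf("dumplength = %ju\n", (uintmax_t)(dumpoff - origdumpoff));
    /* 8 * 512 + 100 = 4196 */
    return (0);
}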
*/ kdh->dumplength = htod64(di->dumpoff - di->origdumpoff); kdh->parity = 0; kdh->parity = kerneldump_parity(kdh); compressor_reset(di->kdcomp->kdc_stream); } error = dump_write_headers(di, kdh); if (error != 0) return (error); (void)dump_write(di, NULL, 0, 0, 0); return (0); } void dump_init_header(const struct dumperinfo *di, struct kerneldumpheader *kdh, const char *magic, uint32_t archver, uint64_t dumplen) { size_t dstsize; bzero(kdh, sizeof(*kdh)); strlcpy(kdh->magic, magic, sizeof(kdh->magic)); strlcpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture)); kdh->version = htod32(KERNELDUMPVERSION); kdh->architectureversion = htod32(archver); kdh->dumplength = htod64(dumplen); kdh->dumpextent = kdh->dumplength; kdh->dumptime = htod64(time_second); #ifdef EKCD kdh->dumpkeysize = htod32(kerneldumpcrypto_dumpkeysize(di->kdcrypto)); #else kdh->dumpkeysize = 0; #endif kdh->blocksize = htod32(di->blocksize); strlcpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname)); dstsize = sizeof(kdh->versionstring); if (strlcpy(kdh->versionstring, version, dstsize) >= dstsize) kdh->versionstring[dstsize - 2] = '\n'; if (panicstr != NULL) strlcpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); if (di->kdcomp != NULL) kdh->compression = di->kdcomp->kdc_format; kdh->parity = kerneldump_parity(kdh); } #ifdef DDB DB_SHOW_COMMAND(panic, db_show_panic) { if (panicstr == NULL) db_printf("panicstr not set\n"); else db_printf("panic: %s\n", panicstr); } #endif Index: head/sys/sys/systm.h =================================================================== --- head/sys/sys/systm.h (revision 366502) +++ head/sys/sys/systm.h (revision 366503) @@ -1,650 +1,649 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)systm.h 8.7 (Berkeley) 3/29/95 * $FreeBSD$ */ #ifndef _SYS_SYSTM_H_ #define _SYS_SYSTM_H_ #include #include #include #include #include #include /* for people using printf mainly */ __NULLABILITY_PRAGMA_PUSH #ifdef _KERNEL extern int cold; /* nonzero if we are doing a cold boot */ extern int suspend_blocked; /* block suspend due to pending shutdown */ extern int rebooting; /* kern_reboot() has been called. */ extern const char *panicstr; /* panic message */ extern bool panicked; #define KERNEL_PANICKED() __predict_false(panicked) extern char version[]; /* system version */ extern char compiler_version[]; /* compiler version */ extern char copyright[]; /* system copyright */ extern int kstack_pages; /* number of kernel stack pages */ extern u_long pagesizes[]; /* supported page sizes */ extern long physmem; /* physical memory */ extern long realmem; /* 'real' memory */ extern char *rootdevnames[2]; /* names of possible root devices */ extern int boothowto; /* reboot flags, from console subsystem */ extern int bootverbose; /* nonzero to print verbose messages */ extern int maxusers; /* system tune hint */ extern int ngroups_max; /* max # of supplemental groups */ extern int vm_guest; /* Running as virtual machine guest? */ /* * Detected virtual machine guest types. The intention is to expand * and/or add to the VM_GUEST_VM type if specific VM functionality is * ever implemented (e.g. vendor-specific paravirtualization features). * Keep in sync with vm_guest_sysctl_names[]. */ enum VM_GUEST { VM_GUEST_NO = 0, VM_GUEST_VM, VM_GUEST_XEN, VM_GUEST_HV, VM_GUEST_VMWARE, VM_GUEST_KVM, VM_GUEST_BHYVE, VM_GUEST_VBOX, VM_GUEST_PARALLELS, VM_LAST }; #ifdef INVARIANTS /* The option is always available */ #define VNASSERT(exp, vp, msg) do { \ if (__predict_false(!(exp))) { \ vn_printf(vp, "VNASSERT failed: %s not true at %s:%d (%s)\n",\ #exp, __FILE__, __LINE__, __func__); \ kassert_panic msg; \ } \ } while (0) #define VNPASS(exp, vp) do { \ const char *_exp = #exp; \ VNASSERT(exp, vp, ("condition %s not met at %s:%d (%s)", \ _exp, __FILE__, __LINE__, __func__)); \ } while (0) #define __assert_unreachable() \ panic("executing segment marked as unreachable at %s:%d (%s)\n", \ __FILE__, __LINE__, __func__) #else #define VNASSERT(exp, vp, msg) do { \ } while (0) #define VNPASS(exp, vp) do { \ } while (0) #define __assert_unreachable() __unreachable() #endif #ifndef CTASSERT /* Allow lint to override */ #define CTASSERT(x) _Static_assert(x, "compile-time assertion failed") #endif #endif /* KERNEL */ /* * These functions need to be declared before the KASSERT macro is invoked in * !KASSERT_PANIC_OPTIONAL builds, so their declarations are sort of out of * place compared to other function definitions in this header. On the other * hand, this header is a bit disorganized anyway. */ void panic(const char *, ...) 
__dead2 __printflike(1, 2); void vpanic(const char *, __va_list) __dead2 __printflike(1, 0); #if defined(_STANDALONE) /* * Until we have more experience with KASSERTS that are called * from the boot loader, they are off. The bootloader does this * a little differently than the kernel (we just call printf atm). * we avoid most of the common functions in the boot loader, so * declare printf() here too. */ int printf(const char *, ...) __printflike(1, 2); # define kassert_panic printf #else /* !_STANDALONE */ # if defined(WITNESS) || defined(INVARIANT_SUPPORT) # ifdef KASSERT_PANIC_OPTIONAL void kassert_panic(const char *fmt, ...) __printflike(1, 2); # else # define kassert_panic panic # endif /* KASSERT_PANIC_OPTIONAL */ # endif /* defined(WITNESS) || defined(INVARIANT_SUPPORT) */ #endif /* _STANDALONE */ #if defined(INVARIANTS) || defined(_STANDALONE) #define KASSERT(exp,msg) do { \ if (__predict_false(!(exp))) \ kassert_panic msg; \ } while (0) #else /* !INVARIANTS && !_STANDALONE */ #define KASSERT(exp,msg) do { \ } while (0) #endif /* INVARIANTS || _STANDALONE */ /* * Helpful macros for quickly coming up with assertions with informative * panic messages. */ #define MPASS(ex) MPASS4(ex, #ex, __FILE__, __LINE__) #define MPASS2(ex, what) MPASS4(ex, what, __FILE__, __LINE__) #define MPASS3(ex, file, line) MPASS4(ex, #ex, file, line) #define MPASS4(ex, what, file, line) \ KASSERT((ex), ("Assertion %s failed at %s:%d", what, file, line)) /* * Align variables. */ #define __read_mostly __section(".data.read_mostly") #define __read_frequently __section(".data.read_frequently") #define __exclusive_cache_line __aligned(CACHE_LINE_SIZE) \ __section(".data.exclusive_cache_line") #ifdef _KERNEL #include /* MAXCPU */ #include /* curthread */ #include /* * Assert that a pointer can be loaded from memory atomically. * * This assertion enforces stronger alignment than necessary. For example, * on some architectures, atomicity for unaligned loads will depend on * whether or not the load spans multiple cache lines. */ #define ASSERT_ATOMIC_LOAD_PTR(var, msg) \ KASSERT(sizeof(var) == sizeof(void *) && \ ((uintptr_t)&(var) & (sizeof(void *) - 1)) == 0, msg) /* * Assert that a thread is in critical(9) section. */ #define CRITICAL_ASSERT(td) \ KASSERT((td)->td_critnest >= 1, ("Not in critical section")); /* * If we have already panic'd and this is the thread that called * panic(), then don't block on any mutexes but silently succeed. * Otherwise, the kernel will deadlock since the scheduler isn't * going to run the thread that holds any lock we need. */ #define SCHEDULER_STOPPED_TD(td) ({ \ MPASS((td) == curthread); \ __predict_false((td)->td_stopsched); \ }) #define SCHEDULER_STOPPED() SCHEDULER_STOPPED_TD(curthread) /* * XXX the hints declarations are even more misplaced than most declarations * in this file, since they are needed in one file (per arch) and only used * in two files. * XXX most of these variables should be const. */ extern int osreldate; extern bool dynamic_kenv; extern struct mtx kenv_lock; extern char *kern_envp; extern char *md_envp; extern char static_env[]; extern char static_hints[]; /* by config for now */ extern char **kenvp; extern const void *zero_region; /* address space maps to a zeroed page */ extern int unmapped_buf_allowed; #ifdef __LP64__ #define IOSIZE_MAX iosize_max() #define DEVFS_IOSIZE_MAX devfs_iosize_max() #else #define IOSIZE_MAX SSIZE_MAX #define DEVFS_IOSIZE_MAX SSIZE_MAX #endif /* * General function declarations. 
*/ struct inpcb; struct lock_object; struct malloc_type; struct mtx; struct proc; struct socket; struct thread; struct tty; struct ucred; struct uio; struct _jmp_buf; struct trapframe; struct eventtimer; int setjmp(struct _jmp_buf *) __returns_twice; void longjmp(struct _jmp_buf *, int) __dead2; int dumpstatus(vm_offset_t addr, off_t count); int nullop(void); int eopnotsupp(void); int ureadc(int, struct uio *); void hashdestroy(void *, struct malloc_type *, u_long); void *hashinit(int count, struct malloc_type *type, u_long *hashmask); void *hashinit_flags(int count, struct malloc_type *type, u_long *hashmask, int flags); #define HASH_NOWAIT 0x00000001 #define HASH_WAITOK 0x00000002 void *phashinit(int count, struct malloc_type *type, u_long *nentries); void *phashinit_flags(int count, struct malloc_type *type, u_long *nentries, int flags); void g_waitidle(void); -void cpu_boot(int); void cpu_flush_dcache(void *, size_t); void cpu_rootconf(void); void critical_enter_KBI(void); void critical_exit_KBI(void); void critical_exit_preempt(void); void init_param1(void); void init_param2(long physpages); void init_static_kenv(char *, size_t); void tablefull(const char *); /* * Allocate per-thread "current" state in the linuxkpi */ extern int (*lkpi_alloc_current)(struct thread *, int); int linux_alloc_current_noop(struct thread *, int); #if defined(KLD_MODULE) || defined(KTR_CRITICAL) || !defined(_KERNEL) || defined(GENOFFSET) #define critical_enter() critical_enter_KBI() #define critical_exit() critical_exit_KBI() #else static __inline void critical_enter(void) { struct thread_lite *td; td = (struct thread_lite *)curthread; td->td_critnest++; __compiler_membar(); } static __inline void critical_exit(void) { struct thread_lite *td; td = (struct thread_lite *)curthread; KASSERT(td->td_critnest != 0, ("critical_exit: td_critnest == 0")); __compiler_membar(); td->td_critnest--; __compiler_membar(); if (__predict_false(td->td_owepreempt)) critical_exit_preempt(); } #endif #ifdef EARLY_PRINTF typedef void early_putc_t(int ch); extern early_putc_t *early_putc; #endif int kvprintf(char const *, void (*)(int, void*), void *, int, __va_list) __printflike(1, 0); void log(int, const char *, ...) __printflike(2, 3); void log_console(struct uio *); void vlog(int, const char *, __va_list) __printflike(2, 0); int asprintf(char **ret, struct malloc_type *mtp, const char *format, ...) __printflike(3, 4); int printf(const char *, ...) __printflike(1, 2); int snprintf(char *, size_t, const char *, ...) __printflike(3, 4); int sprintf(char *buf, const char *, ...) __printflike(2, 3); int uprintf(const char *, ...) __printflike(1, 2); int vprintf(const char *, __va_list) __printflike(1, 0); int vasprintf(char **ret, struct malloc_type *mtp, const char *format, __va_list ap) __printflike(3, 0); int vsnprintf(char *, size_t, const char *, __va_list) __printflike(3, 0); int vsnrprintf(char *, size_t, int, const char *, __va_list) __printflike(4, 0); int vsprintf(char *buf, const char *, __va_list) __printflike(2, 0); int sscanf(const char *, char const * _Nonnull, ...) __scanflike(2, 3); int vsscanf(const char * _Nonnull, char const * _Nonnull, __va_list) __scanflike(2, 0); long strtol(const char *, char **, int); u_long strtoul(const char *, char **, int); quad_t strtoq(const char *, char **, int); u_quad_t strtouq(const char *, char **, int); void tprintf(struct proc *p, int pri, const char *, ...) 
__printflike(3, 4); void vtprintf(struct proc *, int, const char *, __va_list) __printflike(3, 0); void hexdump(const void *ptr, int length, const char *hdr, int flags); #define HD_COLUMN_MASK 0xff #define HD_DELIM_MASK 0xff00 #define HD_OMIT_COUNT (1 << 16) #define HD_OMIT_HEX (1 << 17) #define HD_OMIT_CHARS (1 << 18) #define ovbcopy(f, t, l) bcopy((f), (t), (l)) void bcopy(const void * _Nonnull from, void * _Nonnull to, size_t len); void bzero(void * _Nonnull buf, size_t len); void explicit_bzero(void * _Nonnull, size_t); int bcmp(const void *b1, const void *b2, size_t len); void *memset(void * _Nonnull buf, int c, size_t len); void *memcpy(void * _Nonnull to, const void * _Nonnull from, size_t len); void *memmove(void * _Nonnull dest, const void * _Nonnull src, size_t n); int memcmp(const void *b1, const void *b2, size_t len); #ifdef KCSAN void *kcsan_memset(void *, int, size_t); void *kcsan_memcpy(void *, const void *, size_t); void *kcsan_memmove(void *, const void *, size_t); int kcsan_memcmp(const void *, const void *, size_t); #define bcopy(from, to, len) kcsan_memmove((to), (from), (len)) #define bzero(buf, len) kcsan_memset((buf), 0, (len)) #define bcmp(b1, b2, len) kcsan_memcmp((b1), (b2), (len)) #define memset(buf, c, len) kcsan_memset((buf), (c), (len)) #define memcpy(to, from, len) kcsan_memcpy((to), (from), (len)) #define memmove(dest, src, n) kcsan_memmove((dest), (src), (n)) #define memcmp(b1, b2, len) kcsan_memcmp((b1), (b2), (len)) #else #define bcopy(from, to, len) __builtin_memmove((to), (from), (len)) #define bzero(buf, len) __builtin_memset((buf), 0, (len)) #define bcmp(b1, b2, len) __builtin_memcmp((b1), (b2), (len)) #define memset(buf, c, len) __builtin_memset((buf), (c), (len)) #define memcpy(to, from, len) __builtin_memcpy((to), (from), (len)) #define memmove(dest, src, n) __builtin_memmove((dest), (src), (n)) #define memcmp(b1, b2, len) __builtin_memcmp((b1), (b2), (len)) #endif void *memset_early(void * _Nonnull buf, int c, size_t len); #define bzero_early(buf, len) memset_early((buf), 0, (len)) void *memcpy_early(void * _Nonnull to, const void * _Nonnull from, size_t len); void *memmove_early(void * _Nonnull dest, const void * _Nonnull src, size_t n); #define bcopy_early(from, to, len) memmove_early((to), (from), (len)) #define copystr(src, dst, len, outlen) ({ \ size_t __r, __len, *__outlen; \ \ __len = (len); \ __outlen = (outlen); \ __r = strlcpy((dst), (src), __len); \ if (__outlen != NULL) \ *__outlen = ((__r >= __len) ? __len : __r + 1); \ ((__r >= __len) ? 
ENAMETOOLONG : 0); \ }) int copyinstr(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len, size_t * __restrict lencopied); int copyin(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len); int copyin_nofault(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len); int copyout(const void * _Nonnull __restrict kaddr, void * __restrict udaddr, size_t len); int copyout_nofault(const void * _Nonnull __restrict kaddr, void * __restrict udaddr, size_t len); #ifdef KCSAN int kcsan_copyin(const void *, void *, size_t); int kcsan_copyinstr(const void *, void *, size_t, size_t *); int kcsan_copyout(const void *, void *, size_t); #define copyin(u, k, l) kcsan_copyin((u), (k), (l)) #define copyinstr(u, k, l, lc) kcsan_copyinstr((u), (k), (l), (lc)) #define copyout(k, u, l) kcsan_copyout((k), (u), (l)) #endif int fubyte(volatile const void *base); long fuword(volatile const void *base); int fuword16(volatile const void *base); int32_t fuword32(volatile const void *base); int64_t fuword64(volatile const void *base); int fueword(volatile const void *base, long *val); int fueword32(volatile const void *base, int32_t *val); int fueword64(volatile const void *base, int64_t *val); int subyte(volatile void *base, int byte); int suword(volatile void *base, long word); int suword16(volatile void *base, int word); int suword32(volatile void *base, int32_t word); int suword64(volatile void *base, int64_t word); uint32_t casuword32(volatile uint32_t *base, uint32_t oldval, uint32_t newval); u_long casuword(volatile u_long *p, u_long oldval, u_long newval); int casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp, uint32_t newval); int casueword(volatile u_long *p, u_long oldval, u_long *oldvalp, u_long newval); void realitexpire(void *); int sysbeep(int hertz, int period); void hardclock(int cnt, int usermode); void hardclock_sync(int cpu); void softclock(void *); void statclock(int cnt, int usermode); void profclock(int cnt, int usermode, uintfptr_t pc); int hardclockintr(void); void startprofclock(struct proc *); void stopprofclock(struct proc *); void cpu_startprofclock(void); void cpu_stopprofclock(void); void suspendclock(void); void resumeclock(void); sbintime_t cpu_idleclock(void); void cpu_activeclock(void); void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt); void cpu_et_frequency(struct eventtimer *et, uint64_t newfreq); extern int cpu_disable_c2_sleep; extern int cpu_disable_c3_sleep; char *kern_getenv(const char *name); void freeenv(char *env); int getenv_int(const char *name, int *data); int getenv_uint(const char *name, unsigned int *data); int getenv_long(const char *name, long *data); int getenv_ulong(const char *name, unsigned long *data); int getenv_string(const char *name, char *data, int size); int getenv_int64(const char *name, int64_t *data); int getenv_uint64(const char *name, uint64_t *data); int getenv_quad(const char *name, quad_t *data); int getenv_bool(const char *name, bool *data); bool getenv_is_true(const char *name); bool getenv_is_false(const char *name); int kern_setenv(const char *name, const char *value); int kern_unsetenv(const char *name); int testenv(const char *name); int getenv_array(const char *name, void *data, int size, int *psize, int type_size, bool allow_signed); #define GETENV_UNSIGNED false /* negative numbers not allowed */ #define GETENV_SIGNED true /* negative numbers allowed */ typedef uint64_t (cpu_tick_f)(void); void set_cputicker(cpu_tick_f *func, uint64_t 
freq, unsigned var); extern cpu_tick_f *cpu_ticks; uint64_t cpu_tickrate(void); uint64_t cputick2usec(uint64_t tick); #ifdef APM_FIXUP_CALLTODO struct timeval; void adjust_timeout_calltodo(struct timeval *time_change); #endif /* APM_FIXUP_CALLTODO */ #include /* Initialize the world */ void consinit(void); void cpu_initclocks(void); void cpu_initclocks_bsp(void); void cpu_initclocks_ap(void); void usrinfoinit(void); /* Finalize the world */ void kern_reboot(int) __dead2; void shutdown_nice(int); /* Stubs for obsolete functions that used to be for interrupt management */ static __inline intrmask_t splhigh(void) { return 0; } static __inline intrmask_t splimp(void) { return 0; } static __inline intrmask_t splnet(void) { return 0; } static __inline intrmask_t spltty(void) { return 0; } static __inline void splx(intrmask_t ipl __unused) { return; } /* * Common `proc' functions are declared here so that proc.h can be included * less often. */ int _sleep(const void * _Nonnull chan, struct lock_object *lock, int pri, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define msleep(chan, mtx, pri, wmesg, timo) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), \ tick_sbt * (timo), 0, C_HARDCLOCK) #define msleep_sbt(chan, mtx, pri, wmesg, bt, pr, flags) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (bt), (pr), \ (flags)) int msleep_spin_sbt(const void * _Nonnull chan, struct mtx *mtx, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define msleep_spin(chan, mtx, wmesg, timo) \ msleep_spin_sbt((chan), (mtx), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define pause(wmesg, timo) \ pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK) #define pause_sig(wmesg, timo) \ pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK | C_CATCH) #define tsleep(chan, pri, wmesg, timo) \ _sleep((chan), NULL, (pri), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) #define tsleep_sbt(chan, pri, wmesg, bt, pr, flags) \ _sleep((chan), NULL, (pri), (wmesg), (bt), (pr), (flags)) void wakeup(const void *chan); void wakeup_one(const void *chan); void wakeup_any(const void *chan); /* * Common `struct cdev *' stuff are declared here to avoid #include poisoning */ struct cdev; dev_t dev2udev(struct cdev *x); const char *devtoname(struct cdev *cdev); #ifdef __LP64__ size_t devfs_iosize_max(void); size_t iosize_max(void); #endif int poll_no_poll(int events); /* XXX: Should be void nanodelay(u_int nsec); */ void DELAY(int usec); /* Root mount holdback API */ struct root_hold_token { int flags; const char *who; TAILQ_ENTRY(root_hold_token) list; }; struct root_hold_token *root_mount_hold(const char *identifier); void root_mount_hold_token(const char *identifier, struct root_hold_token *h); void root_mount_rel(struct root_hold_token *h); int root_mounted(void); /* * Unit number allocation API. 
(kern/subr_unit.c) */ struct unrhdr; struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex); void init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex); void delete_unrhdr(struct unrhdr *uh); void clear_unrhdr(struct unrhdr *uh); void clean_unrhdr(struct unrhdr *uh); void clean_unrhdrl(struct unrhdr *uh); int alloc_unr(struct unrhdr *uh); int alloc_unr_specific(struct unrhdr *uh, u_int item); int alloc_unrl(struct unrhdr *uh); void free_unr(struct unrhdr *uh, u_int item); #ifndef __LP64__ #define UNR64_LOCKED #endif struct unrhdr64 { uint64_t counter; }; static __inline void new_unrhdr64(struct unrhdr64 *unr64, uint64_t low) { unr64->counter = low; } #ifdef UNR64_LOCKED uint64_t alloc_unr64(struct unrhdr64 *); #else static __inline uint64_t alloc_unr64(struct unrhdr64 *unr64) { return (atomic_fetchadd_64(&unr64->counter, 1)); } #endif void intr_prof_stack_use(struct thread *td, struct trapframe *frame); void counted_warning(unsigned *counter, const char *msg); /* * APIs to manage deprecation and obsolescence. */ struct device; void _gone_in(int major, const char *msg); void _gone_in_dev(struct device *dev, int major, const char *msg); #ifdef NO_OBSOLETE_CODE #define __gone_ok(m, msg) \ _Static_assert(m < P_OSREL_MAJOR(__FreeBSD_version)), \ "Obsolete code: " msg); #else #define __gone_ok(m, msg) #endif #define gone_in(major, msg) __gone_ok(major, msg) _gone_in(major, msg) #define gone_in_dev(dev, major, msg) __gone_ok(major, msg) _gone_in_dev(dev, major, msg) #endif /* _KERNEL */ __NULLABILITY_PRAGMA_POP #endif /* !_SYS_SYSTM_H_ */ Index: head/sys/x86/x86/cpu_machdep.c =================================================================== --- head/sys/x86/x86/cpu_machdep.c (revision 366502) +++ head/sys/x86/x86/cpu_machdep.c (revision 366503) @@ -1,1508 +1,1497 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include "opt_atpic.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_platform.h" #ifdef __i386__ #include "opt_apic.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef CPU_ELAN #include #endif #include #include #include #include #include #include #include #include #include #include #include #define STATE_RUNNING 0x0 #define STATE_MWAIT 0x1 #define STATE_SLEEPING 0x2 #ifdef SMP static u_int cpu_reset_proxyid; static volatile u_int cpu_reset_proxy_active; #endif struct msr_op_arg { u_int msr; int op; uint64_t arg1; }; static void x86_msr_op_one(void *argp) { struct msr_op_arg *a; uint64_t v; a = argp; switch (a->op) { case MSR_OP_ANDNOT: v = rdmsr(a->msr); v &= ~a->arg1; wrmsr(a->msr, v); break; case MSR_OP_OR: v = rdmsr(a->msr); v |= a->arg1; wrmsr(a->msr, v); break; case MSR_OP_WRITE: wrmsr(a->msr, a->arg1); break; } } #define MSR_OP_EXMODE_MASK 0xf0000000 #define MSR_OP_OP_MASK 0x000000ff void x86_msr_op(u_int msr, u_int op, uint64_t arg1) { struct thread *td; struct msr_op_arg a; u_int exmode; int bound_cpu, i, is_bound; a.op = op & MSR_OP_OP_MASK; MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR || a.op == MSR_OP_WRITE); exmode = op & MSR_OP_EXMODE_MASK; MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED || exmode == MSR_OP_RENDEZVOUS); a.msr = msr; a.arg1 = arg1; switch (exmode) { case MSR_OP_LOCAL: x86_msr_op_one(&a); break; case MSR_OP_SCHED: td = curthread; thread_lock(td); is_bound = sched_is_bound(td); bound_cpu = td->td_oncpu; CPU_FOREACH(i) { sched_bind(td, i); x86_msr_op_one(&a); } if (is_bound) sched_bind(td, bound_cpu); else sched_unbind(td); thread_unlock(td); break; case MSR_OP_RENDEZVOUS: smp_rendezvous(NULL, x86_msr_op_one, NULL, &a); break; } } /* * Automatically initialized per CPU errata in cpu_idle_tun below. */ bool mwait_cpustop_broken = false; SYSCTL_BOOL(_machdep, OID_AUTO, mwait_cpustop_broken, CTLFLAG_RDTUN, &mwait_cpustop_broken, 0, "Can not reliably wake MONITOR/MWAIT cpus without interrupts"); /* - * Machine dependent boot() routine - * - * I haven't seen anything to put here yet - * Possibly some stuff might be grafted back here from boot() - */ -void -cpu_boot(int howto) -{ -} - -/* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* Not applicable */ } void acpi_cpu_c1(void) { __asm __volatile("sti; hlt"); } /* * Use mwait to pause execution while waiting for an interrupt or * another thread to signal that there is more work. 
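 * Roughly, the pattern used below is: publish the per-CPU idle state,
 * arm the monitor with cpu_monitor() on that state word, re-check the
 * state, and only then execute cpu_mwait().  A waker such as
 * cpu_idle_wakeup() then only needs to store a new value to the
 * monitored word to break the CPU out of mwait.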
* * NOTE: Interrupts will cause a wakeup; however, this function does * not enable interrupt handling. The caller is responsible to enable * interrupts. */ void acpi_cpu_idle_mwait(uint32_t mwait_hint) { int *state; uint64_t v; /* * A comment in Linux patch claims that 'CPUs run faster with * speculation protection disabled. All CPU threads in a core * must disable speculation protection for it to be * disabled. Disable it while we are idle so the other * hyperthread can run fast.' * * XXXKIB. Software coordination mode should be supported, * but all Intel CPUs provide hardware coordination. */ state = &PCPU_PTR(monitorbuf)->idle_state; KASSERT(atomic_load_int(state) == STATE_SLEEPING, ("cpu_mwait_cx: wrong monitorbuf state")); atomic_store_int(state, STATE_MWAIT); if (PCPU_GET(ibpb_set) || hw_ssb_active) { v = rdmsr(MSR_IA32_SPEC_CTRL); wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS | IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD)); } else { v = 0; } cpu_monitor(state, 0, 0); if (atomic_load_int(state) == STATE_MWAIT) cpu_mwait(MWAIT_INTRBREAK, mwait_hint); /* * SSB cannot be disabled while we sleep, or rather, if it was * disabled, the sysctl thread will bind to our cpu to tweak * MSR. */ if (v != 0) wrmsr(MSR_IA32_SPEC_CTRL, v); /* * We should exit on any event that interrupts mwait, because * that event might be a wanted interrupt. */ atomic_store_int(state, STATE_RUNNING); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { uint64_t tsc1, tsc2; uint64_t acnt, mcnt, perf; register_t reg; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); #ifdef __i386__ if ((cpu_feature & CPUID_TSC) == 0) return (EOPNOTSUPP); #endif /* * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, * DELAY(9) based logic fails. */ if (tsc_is_invariant && !tsc_perf_stat) return (EOPNOTSUPP); #ifdef SMP if (smp_cpus > 1) { /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); } #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); if (tsc_is_invariant) { wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); tsc1 = rdtsc(); DELAY(1000); mcnt = rdmsr(MSR_MPERF); acnt = rdmsr(MSR_APERF); tsc2 = rdtsc(); intr_restore(reg); perf = 1000 * acnt / mcnt; *rate = (tsc2 - tsc1) * perf; } else { tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); *rate = (tsc2 - tsc1) * 1000; } #ifdef SMP if (smp_cpus > 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) halt(); } static void cpu_reset_real(void) { struct region_descriptor null_idt; int b; disable_intr(); #ifdef CPU_ELAN if (elan_mmcr != NULL) elan_mmcr->RESCFG = 1; #endif #ifdef __i386__ if (cpu == CPU_GEODE1100) { /* Attempt Geode's own reset */ outl(0xcf8, 0x80009044ul); outl(0xcfc, 0xf); } #endif #if !defined(BROKEN_KEYBOARD_RESET) /* * Attempt to do a CPU reset via the keyboard controller, * do not turn off GateA20, as any machine that fails * to do the reset here would then end up in no man's land. */ outb(IO_KBD + 4, 0xFE); DELAY(500000); /* wait 0.5 sec to see if that did it */ #endif /* * Attempt to force a reset via the Reset Control register at * I/O port 0xcf9. Bit 2 forces a system reset when it * transitions from 0 to 1. Bit 1 selects the type of reset * to attempt: 0 selects a "soft" reset, and 1 selects a * "hard" reset. We try a "hard" reset. 
The first write sets * bit 1 to select a "hard" reset and clears bit 2. The * second write forces a 0 -> 1 transition in bit 2 to trigger * a reset. */ outb(0xcf9, 0x2); outb(0xcf9, 0x6); DELAY(500000); /* wait 0.5 sec to see if that did it */ /* * Attempt to force a reset via the Fast A20 and Init register * at I/O port 0x92. Bit 1 serves as an alternate A20 gate. * Bit 0 asserts INIT# when set to 1. We are careful to only * preserve bit 1 while setting bit 0. We also must clear bit * 0 before setting it if it isn't already clear. */ b = inb(0x92); if (b != 0xff) { if ((b & 0x1) != 0) outb(0x92, b & 0xfe); outb(0x92, b | 0x1); DELAY(500000); /* wait 0.5 sec to see if that did it */ } printf("No known reset method worked, attempting CPU shutdown\n"); DELAY(1000000); /* wait 1 sec for printf to complete */ /* Wipe the IDT. */ null_idt.rd_limit = 0; null_idt.rd_base = 0; lidt(&null_idt); /* "good night, sweet prince .... " */ breakpoint(); /* NOTREACHED */ while(1); } #ifdef SMP static void cpu_reset_proxy(void) { cpu_reset_proxy_active = 1; while (cpu_reset_proxy_active == 1) ia32_pause(); /* Wait for other cpu to see that we've started */ printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid); DELAY(1000000); cpu_reset_real(); } #endif void cpu_reset(void) { #ifdef SMP struct monitorbuf *mb; cpuset_t map; u_int cnt; if (smp_started) { map = all_cpus; CPU_CLR(PCPU_GET(cpuid), &map); CPU_ANDNOT(&map, &stopped_cpus); if (!CPU_EMPTY(&map)) { printf("cpu_reset: Stopping other CPUs\n"); stop_cpus(map); } if (PCPU_GET(cpuid) != 0) { cpu_reset_proxyid = PCPU_GET(cpuid); cpustop_restartfunc = cpu_reset_proxy; cpu_reset_proxy_active = 0; printf("cpu_reset: Restarting BSP\n"); /* Restart CPU #0. */ CPU_SETOF(0, &started_cpus); mb = &pcpu_find(0)->pc_monitorbuf; atomic_store_int(&mb->stop_state, MONITOR_STOPSTATE_RUNNING); cnt = 0; while (cpu_reset_proxy_active == 0 && cnt < 10000000) { ia32_pause(); cnt++; /* Wait for BSP to announce restart */ } if (cpu_reset_proxy_active == 0) { printf("cpu_reset: Failed to restart BSP\n"); } else { cpu_reset_proxy_active = 2; while (1) ia32_pause(); /* NOTREACHED */ } } DELAY(1000000); } #endif cpu_reset_real(); /* NOTREACHED */ } bool cpu_mwait_usable(void) { return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags & (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) == (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK))); } void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait, 0, "Use MONITOR/MWAIT for short idle"); static void cpu_idle_acpi(sbintime_t sbt) { int *state; state = &PCPU_PTR(monitorbuf)->idle_state; atomic_store_int(state, STATE_SLEEPING); /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) cpu_idle_hook(sbt); else acpi_cpu_c1(); atomic_store_int(state, STATE_RUNNING); } static void cpu_idle_hlt(sbintime_t sbt) { int *state; state = &PCPU_PTR(monitorbuf)->idle_state; atomic_store_int(state, STATE_SLEEPING); /* * Since we may be in a critical section from cpu_idle(), if * an interrupt fires during that critical section we may have * a pending preemption. If the CPU halts, then that thread * may not execute until a later interrupt awakens the CPU. * To handle this race, check for a runnable thread after * disabling interrupts and immediately return if one is * found. 
Also, we must absolutely guarantee that hlt is * the next instruction after sti. This ensures that any * interrupt that fires after the call to disable_intr() will * immediately awaken the CPU from hlt. Finally, please note * that on x86 this works because, although sti sets IF immediately, * interrupts are only delivered after the instruction following sti * has executed, which allows the hlt instruction to acknowledge the * pending interrupt. */
disable_intr(); if (sched_runnable()) enable_intr(); else acpi_cpu_c1(); atomic_store_int(state, STATE_RUNNING); } static void cpu_idle_mwait(sbintime_t sbt) { int *state; state = &PCPU_PTR(monitorbuf)->idle_state; atomic_store_int(state, STATE_MWAIT); /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) { atomic_store_int(state, STATE_RUNNING); enable_intr(); return; } cpu_monitor(state, 0, 0); if (atomic_load_int(state) == STATE_MWAIT) __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); else enable_intr(); atomic_store_int(state, STATE_RUNNING); }
static void cpu_idle_spin(sbintime_t sbt) { int *state; int i; state = &PCPU_PTR(monitorbuf)->idle_state; atomic_store_int(state, STATE_RUNNING); /* * The sched_runnable() call is racy, but since this runs in * a loop, missing it once has little impact, if any * (and it is much better than not checking at all). */ for (i = 0; i < 1000; i++) { if (sched_runnable()) return; cpu_spinwait(); } }
/* * C1E renders the local APIC timer dead, so we disable it by * reading the Interrupt Pending Message register and clearing * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). * * Reference: * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" * #32559 revision 3.00+ */ #define MSR_AMDK8_IPM 0xc0010055 #define AMDK8_SMIONCMPHALT (1ULL << 27) #define AMDK8_C1EONCMPHALT (1ULL << 28) #define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) void cpu_probe_amdc1e(void) { /* * Detect the presence of the C1E capability, mostly on the latest * dual-core (or future) k8 family CPUs. */ if (cpu_vendor_id == CPU_VENDOR_AMD && (cpu_id & 0x00000f00) == 0x00000f00 && (cpu_id & 0x0fff0000) >= 0x00040000) { cpu_ident_amdc1e = 1; } }
void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; void cpu_idle(int busy) { uint64_t msr; sbintime_t sbt = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); #ifdef MP_WATCHDOG ap_watchdog(PCPU_GET(cpuid)); #endif /* If we are busy - try to use fast methods. */ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { cpu_idle_mwait(busy); goto out; } } /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); sbt = cpu_idleclock(); } /* Apply AMD APIC timer C1E workaround. */ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) { msr = rdmsr(MSR_AMDK8_IPM); if (msr & AMDK8_CMPHALT) wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); } /* Call main idle method. */ cpu_idle_fn(sbt); /* Switch timers back into active mode. */ if (!busy) { cpu_activeclock(); critical_exit(); } out: CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); }
static int cpu_idle_apl31_workaround; SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW, &cpu_idle_apl31_workaround, 0, "Apollo Lake APL31 MWAIT bug workaround"); int cpu_idle_wakeup(int cpu) { struct monitorbuf *mb; int *state; mb = &pcpu_find(cpu)->pc_monitorbuf; state = &mb->idle_state; switch (atomic_load_int(state)) { case STATE_SLEEPING: return (0); case STATE_MWAIT: atomic_store_int(state, STATE_RUNNING); return (cpu_idle_apl31_workaround ?
0 : 1); case STATE_RUNNING: return (1); default: panic("bad monitor state"); return (1); } } /* * Ordered by speed/power consumption. */ static struct { void *id_fn; char *id_name; int id_cpuid2_flag; } idle_tbl[] = { { .id_fn = cpu_idle_spin, .id_name = "spin" }, { .id_fn = cpu_idle_mwait, .id_name = "mwait", .id_cpuid2_flag = CPUID2_MON }, { .id_fn = cpu_idle_hlt, .id_name = "hlt" }, { .id_fn = cpu_idle_acpi, .id_name = "acpi" }, }; static int idle_sysctl_available(SYSCTL_HANDLER_ARGS) { char *avail, *p; int error; int i; avail = malloc(256, M_TEMP, M_WAITOK); p = avail; for (i = 0; i < nitems(idle_tbl); i++) { if (idle_tbl[i].id_cpuid2_flag != 0 && (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0) continue; if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; p += sprintf(p, "%s%s", p != avail ? ", " : "", idle_tbl[i].id_name); } error = sysctl_handle_string(oidp, avail, 0, req); free(avail, M_TEMP); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, 0, 0, idle_sysctl_available, "A", "list of available idle functions"); static bool cpu_idle_selector(const char *new_idle_name) { int i; for (i = 0; i < nitems(idle_tbl); i++) { if (idle_tbl[i].id_cpuid2_flag != 0 && (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0) continue; if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; if (strcmp(idle_tbl[i].id_name, new_idle_name)) continue; cpu_idle_fn = idle_tbl[i].id_fn; if (bootverbose) printf("CPU idle set to %s\n", idle_tbl[i].id_name); return (true); } return (false); } static int cpu_idle_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16], *p; int error, i; p = "unknown"; for (i = 0; i < nitems(idle_tbl); i++) { if (idle_tbl[i].id_fn == cpu_idle_fn) { p = idle_tbl[i].id_name; break; } } strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); return (cpu_idle_selector(buf) ? 0 : EINVAL); } SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 0, 0, cpu_idle_sysctl, "A", "currently selected idle function"); static void cpu_idle_tun(void *unused __unused) { char tunvar[16]; if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar))) cpu_idle_selector(tunvar); else if (cpu_vendor_id == CPU_VENDOR_AMD && CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) { /* Ryzen erratas 1057, 1109. */ cpu_idle_selector("hlt"); idle_mwait = 0; mwait_cpustop_broken = true; } if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) { /* * Apollo Lake errata APL31 (public errata APL30). * Stores to the armed address range may not trigger * MWAIT to resume execution. OS needs to use * interrupts to wake processors from MWAIT-induced * sleep states. 
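 * With the workaround enabled, cpu_idle_wakeup() reports failure for a
 * CPU parked in MWAIT, so callers fall back to waking it with an IPI.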
*/ cpu_idle_apl31_workaround = 1; mwait_cpustop_broken = true; } TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround); } SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL); static int panic_on_nmi = 0xff; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN, &panic_on_nmi, 0, "Panic on NMI: 1 = H/W failure; 2 = unknown; 0xff = all"); int nmi_is_broadcast = 1; SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN, &nmi_is_broadcast, 0, "Chipset NMI is broadcast"); int (*apei_nmi)(void); void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame) { bool claimed = false; #ifdef DEV_ISA /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(frame->tf_err)) { claimed = true; if ((panic_on_nmi & 1) != 0) panic("NMI indicates hardware failure"); } #endif /* DEV_ISA */ /* ACPI Platform Error Interfaces callback. */ if (apei_nmi != NULL && (*apei_nmi)()) claimed = true; /* * NMIs can be useful for debugging. They can be hooked up to a * pushbutton, usually on an ISA, PCI, or PCIe card. They can also be * generated by an IPMI BMC, either manually or in response to a * watchdog timeout. For example, see the "power diag" command in * ports/sysutils/ipmitool. They can also be generated by a * hypervisor; see "bhyvectl --inject-nmi". */ #ifdef KDB if (!claimed && (panic_on_nmi & 2) != 0) { if (debugger_on_panic) { printf("NMI/cpu%d ... going to debugger\n", cpu); claimed = kdb_trap(type, 0, frame); } } #endif /* KDB */ if (!claimed && panic_on_nmi != 0) panic("NMI"); } void nmi_handle_intr(u_int type, struct trapframe *frame) { #ifdef SMP if (nmi_is_broadcast) { nmi_call_kdb_smp(type, frame); return; } #endif nmi_call_kdb(PCPU_GET(cpuid), type, frame); } static int hw_ibrs_active; int hw_ibrs_ibpb_active; int hw_ibrs_disable = 1; SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0, "Indirect Branch Restricted Speculation active"); SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ibrs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Indirect Branch Restricted Speculation active"); SYSCTL_INT(_machdep_mitigations_ibrs, OID_AUTO, active, CTLFLAG_RD, &hw_ibrs_active, 0, "Indirect Branch Restricted Speculation active"); void hw_ibrs_recalculate(bool for_all_cpus) { if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) { x86_msr_op(MSR_IA32_SPEC_CTRL, (for_all_cpus ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL) | (hw_ibrs_disable != 0 ? 
MSR_OP_ANDNOT : MSR_OP_OR), IA32_SPEC_CTRL_IBRS); hw_ibrs_active = hw_ibrs_disable == 0; hw_ibrs_ibpb_active = 0; } else { hw_ibrs_active = hw_ibrs_ibpb_active = (cpu_stdext_feature3 & CPUID_STDEXT3_IBPB) != 0 && !hw_ibrs_disable; } } static int hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = hw_ibrs_disable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); hw_ibrs_disable = val != 0; hw_ibrs_recalculate(true); return (0); } SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I", "Disable Indirect Branch Restricted Speculation"); SYSCTL_PROC(_machdep_mitigations_ibrs, OID_AUTO, disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I", "Disable Indirect Branch Restricted Speculation"); int hw_ssb_active; int hw_ssb_disable; SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD, &hw_ssb_active, 0, "Speculative Store Bypass Disable active"); SYSCTL_NODE(_machdep_mitigations, OID_AUTO, ssb, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Speculative Store Bypass Disable active"); SYSCTL_INT(_machdep_mitigations_ssb, OID_AUTO, active, CTLFLAG_RD, &hw_ssb_active, 0, "Speculative Store Bypass Disable active"); static void hw_ssb_set(bool enable, bool for_all_cpus) { if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) { hw_ssb_active = 0; return; } hw_ssb_active = enable; x86_msr_op(MSR_IA32_SPEC_CTRL, (enable ? MSR_OP_OR : MSR_OP_ANDNOT) | (for_all_cpus ? MSR_OP_SCHED : MSR_OP_LOCAL), IA32_SPEC_CTRL_SSBD); } void hw_ssb_recalculate(bool all_cpus) { switch (hw_ssb_disable) { default: hw_ssb_disable = 0; /* FALLTHROUGH */ case 0: /* off */ hw_ssb_set(false, all_cpus); break; case 1: /* on */ hw_ssb_set(true, all_cpus); break; case 2: /* auto */ hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ? false : true, all_cpus); break; } } static int hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = hw_ssb_disable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); hw_ssb_disable = val; hw_ssb_recalculate(true); return (0); } SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ssb_disable_handler, "I", "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto"); SYSCTL_PROC(_machdep_mitigations_ssb, OID_AUTO, disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ssb_disable_handler, "I", "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto"); int hw_mds_disable; /* * Handler for Microarchitectural Data Sampling issues. Really not a * pointer to C function: on amd64 the code must not change any CPU * architectural state except possibly %rflags. Also, it is always * called with interrupts disabled. 
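 * The handlers themselves live in machine-dependent assembly; mds_handler
 * is invoked on the return-to-user paths to scrub the affected CPU buffers.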
*/ void mds_handler_void(void); void mds_handler_verw(void); void mds_handler_ivb(void); void mds_handler_bdw(void); void mds_handler_skl_sse(void); void mds_handler_skl_avx(void); void mds_handler_skl_avx512(void); void mds_handler_silvermont(void); void (*mds_handler)(void) = mds_handler_void; static int sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS) { const char *state; if (mds_handler == mds_handler_void) state = "inactive"; else if (mds_handler == mds_handler_verw) state = "VERW"; else if (mds_handler == mds_handler_ivb) state = "software IvyBridge"; else if (mds_handler == mds_handler_bdw) state = "software Broadwell"; else if (mds_handler == mds_handler_skl_sse) state = "software Skylake SSE"; else if (mds_handler == mds_handler_skl_avx) state = "software Skylake AVX"; else if (mds_handler == mds_handler_skl_avx512) state = "software Skylake AVX512"; else if (mds_handler == mds_handler_silvermont) state = "software Silvermont"; else state = "unknown"; return (SYSCTL_OUT(req, state, strlen(state))); } SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_mds_disable_state_handler, "A", "Microarchitectural Data Sampling Mitigation state"); SYSCTL_NODE(_machdep_mitigations, OID_AUTO, mds, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Microarchitectural Data Sampling Mitigation state"); SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, state, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_mds_disable_state_handler, "A", "Microarchitectural Data Sampling Mitigation state"); _Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512"); void hw_mds_recalculate(void) { struct pcpu *pc; vm_offset_t b64; u_long xcr0; int i; /* * Allow user to force VERW variant even if MD_CLEAR is not * reported. For instance, hypervisor might unknowingly * filter the cap out. * For the similar reasons, and for testing, allow to enable * mitigation even when MDS_NO cap is set. 
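 * hw_mds_disable selects the variant: 0 - off, 1 - force VERW, 2 - force
 * the software sequences, 3 - automatic selection.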
*/ if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 || ((cpu_ia32_arch_caps & IA32_ARCH_CAP_MDS_NO) != 0 && hw_mds_disable == 3)) { mds_handler = mds_handler_void; } else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 && hw_mds_disable == 3) || hw_mds_disable == 1) { mds_handler = mds_handler_verw; } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e || CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a || CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 || CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d || CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e || CPUID_TO_MODEL(cpu_id) == 0x3a) && (hw_mds_disable == 2 || hw_mds_disable == 3)) { /* * Nehalem, SandyBridge, IvyBridge */ CPU_FOREACH(i) { pc = pcpu_find(i); if (pc->pc_mds_buf == NULL) { pc->pc_mds_buf = malloc_domainset(672, M_TEMP, DOMAINSET_PREF(pc->pc_domain), M_WAITOK); bzero(pc->pc_mds_buf, 16); } } mds_handler = mds_handler_ivb; } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c || CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 || CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f || CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) && (hw_mds_disable == 2 || hw_mds_disable == 3)) { /* * Haswell, Broadwell */ CPU_FOREACH(i) { pc = pcpu_find(i); if (pc->pc_mds_buf == NULL) { pc->pc_mds_buf = malloc_domainset(1536, M_TEMP, DOMAINSET_PREF(pc->pc_domain), M_WAITOK); bzero(pc->pc_mds_buf, 16); } } mds_handler = mds_handler_bdw; } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id & CPUID_STEPPING) <= 5) || CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e || (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id & CPUID_STEPPING) <= 0xb) || (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id & CPUID_STEPPING) <= 0xc)) && (hw_mds_disable == 2 || hw_mds_disable == 3)) { /* * Skylake, KabyLake, CoffeeLake, WhiskeyLake, * CascadeLake */ CPU_FOREACH(i) { pc = pcpu_find(i); if (pc->pc_mds_buf == NULL) { pc->pc_mds_buf = malloc_domainset(6 * 1024, M_TEMP, DOMAINSET_PREF(pc->pc_domain), M_WAITOK); b64 = (vm_offset_t)malloc_domainset(64 + 63, M_TEMP, DOMAINSET_PREF(pc->pc_domain), M_WAITOK); pc->pc_mds_buf64 = (void *)roundup2(b64, 64); bzero(pc->pc_mds_buf64, 64); } } xcr0 = rxcr(0); if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 && (cpu_stdext_feature & CPUID_STDEXT_AVX512DQ) != 0) mds_handler = mds_handler_skl_avx512; else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 && (cpu_feature2 & CPUID2_AVX) != 0) mds_handler = mds_handler_skl_avx; else mds_handler = mds_handler_skl_sse; } else if (CPUID_TO_FAMILY(cpu_id) == 0x6 && ((CPUID_TO_MODEL(cpu_id) == 0x37 || CPUID_TO_MODEL(cpu_id) == 0x4a || CPUID_TO_MODEL(cpu_id) == 0x4c || CPUID_TO_MODEL(cpu_id) == 0x4d || CPUID_TO_MODEL(cpu_id) == 0x5a || CPUID_TO_MODEL(cpu_id) == 0x5d || CPUID_TO_MODEL(cpu_id) == 0x6e || CPUID_TO_MODEL(cpu_id) == 0x65 || CPUID_TO_MODEL(cpu_id) == 0x75 || CPUID_TO_MODEL(cpu_id) == 0x1c || CPUID_TO_MODEL(cpu_id) == 0x26 || CPUID_TO_MODEL(cpu_id) == 0x27 || CPUID_TO_MODEL(cpu_id) == 0x35 || CPUID_TO_MODEL(cpu_id) == 0x36 || CPUID_TO_MODEL(cpu_id) == 0x7a))) { /* Silvermont, Airmont */ CPU_FOREACH(i) { pc = pcpu_find(i); if (pc->pc_mds_buf == NULL) pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK); } mds_handler = mds_handler_silvermont; } else { hw_mds_disable = 0; mds_handler = mds_handler_void; } } static 
void hw_mds_recalculate_boot(void *arg __unused) { hw_mds_recalculate(); } SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL); static int sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = hw_mds_disable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val < 0 || val > 3) return (EINVAL); hw_mds_disable = val; hw_mds_recalculate(); return (0); } SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, sysctl_mds_disable_handler, "I", "Microarchitectural Data Sampling Mitigation " "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO"); SYSCTL_PROC(_machdep_mitigations_mds, OID_AUTO, disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, sysctl_mds_disable_handler, "I", "Microarchitectural Data Sampling Mitigation " "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO"); /* * Intel Transactional Memory Asynchronous Abort Mitigation * CVE-2019-11135 */ int x86_taa_enable; int x86_taa_state; enum { TAA_NONE = 0, /* No mitigation enabled */ TAA_TSX_DISABLE = 1, /* Disable TSX via MSR */ TAA_VERW = 2, /* Use VERW mitigation */ TAA_AUTO = 3, /* Automatically select the mitigation */ /* The states below are not selectable by the operator */ TAA_TAA_UC = 4, /* Mitigation present in microcode */ TAA_NOT_PRESENT = 5 /* TSX is not present */ }; static void taa_set(bool enable, bool all) { x86_msr_op(MSR_IA32_TSX_CTRL, (enable ? MSR_OP_OR : MSR_OP_ANDNOT) | (all ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL), IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR); } void x86_taa_recalculate(void) { static int taa_saved_mds_disable = 0; int taa_need = 0, taa_state = 0; int mds_disable = 0, need_mds_recalc = 0; /* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */ if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 || (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) { /* TSX is not present */ x86_taa_state = TAA_NOT_PRESENT; return; } /* Check to see what mitigation options the CPU gives us */ if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) { /* CPU is not suseptible to TAA */ taa_need = TAA_TAA_UC; } else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) { /* * CPU can turn off TSX. This is the next best option * if TAA_NO hardware mitigation isn't present */ taa_need = TAA_TSX_DISABLE; } else { /* No TSX/TAA specific remedies are available. */ if (x86_taa_enable == TAA_TSX_DISABLE) { if (bootverbose) printf("TSX control not available\n"); return; } else taa_need = TAA_VERW; } /* Can we automatically take action, or are we being forced? */ if (x86_taa_enable == TAA_AUTO) taa_state = taa_need; else taa_state = x86_taa_enable; /* No state change, nothing to do */ if (taa_state == x86_taa_state) { if (bootverbose) printf("No TSX change made\n"); return; } /* Does the MSR need to be turned on or off? */ if (taa_state == TAA_TSX_DISABLE) taa_set(true, true); else if (x86_taa_state == TAA_TSX_DISABLE) taa_set(false, true); /* Does MDS need to be set to turn on VERW? 
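 * The TAA VERW mode reuses the MDS VERW machinery: hw_mds_disable is
 * saved, forced to 1 and hw_mds_recalculate() is re-run; the saved value
 * is restored when leaving VERW mode.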
*/ if (taa_state == TAA_VERW) { taa_saved_mds_disable = hw_mds_disable; mds_disable = hw_mds_disable = 1; need_mds_recalc = 1; } else if (x86_taa_state == TAA_VERW) { mds_disable = hw_mds_disable = taa_saved_mds_disable; need_mds_recalc = 1; } if (need_mds_recalc) { hw_mds_recalculate(); if (mds_disable != hw_mds_disable) { if (bootverbose) printf("Cannot change MDS state for TAA\n"); /* Don't update our state */ return; } } x86_taa_state = taa_state; return; } static void taa_recalculate_boot(void * arg __unused) { x86_taa_recalculate(); } SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL); SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TSX Asynchronous Abort Mitigation"); static int sysctl_taa_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = x86_taa_enable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val < TAA_NONE || val > TAA_AUTO) return (EINVAL); x86_taa_enable = val; x86_taa_recalculate(); return (0); } SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, sysctl_taa_handler, "I", "TAA Mitigation enablement control " "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO"); static int sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS) { const char *state; switch (x86_taa_state) { case TAA_NONE: state = "inactive"; break; case TAA_TSX_DISABLE: state = "TSX disabled"; break; case TAA_VERW: state = "VERW"; break; case TAA_TAA_UC: state = "Mitigated in microcode"; break; case TAA_NOT_PRESENT: state = "TSX not present"; break; default: state = "unknown"; } return (SYSCTL_OUT(req, state, strlen(state))); } SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_taa_state_handler, "A", "TAA Mitigation state"); int __read_frequently cpu_flush_rsb_ctxsw; SYSCTL_INT(_machdep_mitigations, OID_AUTO, flush_rsb_ctxsw, CTLFLAG_RW | CTLFLAG_NOFETCH, &cpu_flush_rsb_ctxsw, 0, "Flush Return Stack Buffer on context switch"); SYSCTL_NODE(_machdep_mitigations, OID_AUTO, rngds, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "MCU Optimization, disable RDSEED mitigation"); int x86_rngds_mitg_enable = 1; void x86_rngds_mitg_recalculate(bool all_cpus) { if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0) return; x86_msr_op(MSR_IA32_MCU_OPT_CTRL, (x86_rngds_mitg_enable ? MSR_OP_OR : MSR_OP_ANDNOT) | (all_cpus ? 
MSR_OP_RENDEZVOUS : MSR_OP_LOCAL), IA32_RNGDS_MITG_DIS); } static int sysctl_rngds_mitg_enable_handler(SYSCTL_HANDLER_ARGS) { int error, val; val = x86_rngds_mitg_enable; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); x86_rngds_mitg_enable = val; x86_rngds_mitg_recalculate(true); return (0); } SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, sysctl_rngds_mitg_enable_handler, "I", "MCU Optimization, disabling RDSEED mitigation control " "(0 - mitigation disabled (RDSEED optimized), 1 - mitigation enabled"); static int sysctl_rngds_state_handler(SYSCTL_HANDLER_ARGS) { const char *state; if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0) { state = "Not applicable"; } else if (x86_rngds_mitg_enable == 0) { state = "RDSEED not serialized"; } else { state = "Mitigated"; } return (SYSCTL_OUT(req, state, strlen(state))); } SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, state, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rngds_state_handler, "A", "MCU Optimization state"); /* * Enable and restore kernel text write permissions. * Callers must ensure that disable_wp()/restore_wp() are executed * without rescheduling on the same core. */ bool disable_wp(void) { u_int cr0; cr0 = rcr0(); if ((cr0 & CR0_WP) == 0) return (false); load_cr0(cr0 & ~CR0_WP); return (true); } void restore_wp(bool old_wp) { if (old_wp) load_cr0(rcr0() | CR0_WP); } bool acpi_get_fadt_bootflags(uint16_t *flagsp) { #ifdef DEV_ACPI ACPI_TABLE_FADT *fadt; vm_paddr_t physaddr; physaddr = acpi_find_table(ACPI_SIG_FADT); if (physaddr == 0) return (false); fadt = acpi_map_table(physaddr, ACPI_SIG_FADT); if (fadt == NULL) return (false); *flagsp = fadt->BootFlags; acpi_unmap_table(fadt); return (true); #else return (false); #endif }
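/*
 * A minimal usage sketch for the disable_wp()/restore_wp() pair above,
 * assuming the caller prevents rescheduling (for example with a critical
 * section) for as long as kernel text is left writable:
 *
 *	bool wp;
 *
 *	critical_enter();
 *	wp = disable_wp();
 *	(patch kernel text here)
 *	restore_wp(wp);
 *	critical_exit();
 */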