Index: projects/powernv/kern/kern_clock.c =================================================================== --- projects/powernv/kern/kern_clock.c (revision 304041) +++ projects/powernv/kern/kern_clock.c (revision 304042) @@ -1,931 +1,933 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_kdb.h" #include "opt_device_polling.h" #include "opt_hwpmc_hooks.h" #include "opt_ntp.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef GPROF #include #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , clock, hard); PMC_SOFT_DEFINE( , , clock, stat); PMC_SOFT_DEFINE_EX( , , clock, prof, \ cpu_startprofclock, cpu_stopprofclock); #endif #ifdef DEVICE_POLLING extern void hardclock_device_poll(void); #endif /* DEVICE_POLLING */ static void initclocks(void *dummy); SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL); /* Spin-lock protecting profiling statistics. */ static struct mtx time_lock; SDT_PROVIDER_DECLARE(sched); SDT_PROBE_DEFINE2(sched, , , tick, "struct thread *", "struct proc *"); static int sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS) { int error; long cp_time[CPUSTATES]; #ifdef SCTL_MASK32 int i; unsigned int cp_time32[CPUSTATES]; #endif read_cpu_time(cp_time); #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { if (!req->oldptr) return SYSCTL_OUT(req, 0, sizeof(cp_time32)); for (i = 0; i < CPUSTATES; i++) cp_time32[i] = (unsigned int)cp_time[i]; error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32)); } else #endif { if (!req->oldptr) return SYSCTL_OUT(req, 0, sizeof(cp_time)); error = SYSCTL_OUT(req, cp_time, sizeof(cp_time)); } return error; } SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE, 0,0, sysctl_kern_cp_time, "LU", "CPU time statistics"); static long empty[CPUSTATES]; static int sysctl_kern_cp_times(SYSCTL_HANDLER_ARGS) { struct pcpu *pcpu; int error; int c; long *cp_time; #ifdef SCTL_MASK32 unsigned int cp_time32[CPUSTATES]; int i; #endif if (!req->oldptr) { #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) return SYSCTL_OUT(req, 0, sizeof(cp_time32) * (mp_maxid + 1)); else #endif return SYSCTL_OUT(req, 0, sizeof(long) * CPUSTATES * (mp_maxid + 1)); } for (error = 0, c = 0; error == 0 && c <= mp_maxid; c++) { if (!CPU_ABSENT(c)) { pcpu = pcpu_find(c); cp_time = pcpu->pc_cp_time; } else { cp_time = empty; } #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { for (i = 0; i < CPUSTATES; i++) cp_time32[i] = (unsigned int)cp_time[i]; error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32)); } else #endif error = SYSCTL_OUT(req, cp_time, sizeof(long) * CPUSTATES); } return error; } SYSCTL_PROC(_kern, OID_AUTO, cp_times, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE, 0,0, sysctl_kern_cp_times, "LU", "per-CPU time statistics"); #ifdef DEADLKRES static const char *blessed[] = { "getblk", "so_snd_sx", "so_rcv_sx", NULL }; static int slptime_threshold = 1800; static int blktime_threshold = 900; static int sleepfreq = 3; static void deadlkres(void) { struct proc *p; struct thread *td; void *wchan; int blkticks, i, slpticks, slptype, tryl, tticks; tryl = 0; for (;;) { blkticks = blktime_threshold * hz; slpticks = slptime_threshold * hz; /* * Avoid to sleep on the sx_lock in order to avoid a possible * priority inversion problem leading to starvation. * If the lock can't be held after 100 tries, panic. */ if (!sx_try_slock(&allproc_lock)) { if (tryl > 100) panic("%s: possible deadlock detected on allproc_lock\n", __func__); tryl++; pause("allproc", sleepfreq * hz); continue; } tryl = 0; FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_ON_LOCK(td)) { /* * The thread should be blocked on a * turnstile, simply check if the * turnstile channel is in good state. */ MPASS(td->td_blocked != NULL); tticks = ticks - td->td_blktick; thread_unlock(td); if (tticks > blkticks) { /* * Accordingly with provided * thresholds, this thread is * stuck for too long on a * turnstile. */ PROC_UNLOCK(p); sx_sunlock(&allproc_lock); panic("%s: possible deadlock detected for %p, blocked for %d ticks\n", __func__, td, tticks); } } else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { /* * Check if the thread is sleeping on a * lock, otherwise skip the check. * Drop the thread lock in order to * avoid a LOR with the sleepqueue * spinlock. */ wchan = td->td_wchan; tticks = ticks - td->td_slptick; thread_unlock(td); slptype = sleepq_type(wchan); if ((slptype == SLEEPQ_SX || slptype == SLEEPQ_LK) && tticks > slpticks) { /* * Accordingly with provided * thresholds, this thread is * stuck for too long on a * sleepqueue. * However, being on a * sleepqueue, we might still * check for the blessed * list. */ tryl = 0; for (i = 0; blessed[i] != NULL; i++) { if (!strcmp(blessed[i], td->td_wmesg)) { tryl = 1; break; } } if (tryl != 0) { tryl = 0; continue; } PROC_UNLOCK(p); sx_sunlock(&allproc_lock); panic("%s: possible deadlock detected for %p, blocked for %d ticks\n", __func__, td, tticks); } } else thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* Sleep for sleepfreq seconds. */ pause("-", sleepfreq * hz); } } static struct kthread_desc deadlkres_kd = { "deadlkres", deadlkres, (struct thread **)NULL }; SYSINIT(deadlkres, SI_SUB_CLOCKS, SI_ORDER_ANY, kthread_start, &deadlkres_kd); static SYSCTL_NODE(_debug, OID_AUTO, deadlkres, CTLFLAG_RW, 0, "Deadlock resolver"); SYSCTL_INT(_debug_deadlkres, OID_AUTO, slptime_threshold, CTLFLAG_RW, &slptime_threshold, 0, "Number of seconds within is valid to sleep on a sleepqueue"); SYSCTL_INT(_debug_deadlkres, OID_AUTO, blktime_threshold, CTLFLAG_RW, &blktime_threshold, 0, "Number of seconds within is valid to block on a turnstile"); SYSCTL_INT(_debug_deadlkres, OID_AUTO, sleepfreq, CTLFLAG_RW, &sleepfreq, 0, "Number of seconds between any deadlock resolver thread run"); #endif /* DEADLKRES */ void read_cpu_time(long *cp_time) { struct pcpu *pc; int i, j; /* Sum up global cp_time[]. */ bzero(cp_time, sizeof(long) * CPUSTATES); CPU_FOREACH(i) { pc = pcpu_find(i); for (j = 0; j < CPUSTATES; j++) cp_time[j] += pc->pc_cp_time[j]; } } #ifdef SW_WATCHDOG #include static int watchdog_ticks; static int watchdog_enabled; static void watchdog_fire(void); static void watchdog_config(void *, u_int, int *); #endif /* SW_WATCHDOG */ /* * Clock handling routines. * * This code is written to operate with two timers that run independently of * each other. * * The main timer, running hz times per second, is used to trigger interval * timers, timeouts and rescheduling as needed. * * The second timer handles kernel and user profiling, * and does resource use estimation. If the second timer is programmable, * it is randomized to avoid aliasing between the two clocks. For example, * the randomization prevents an adversary from always giving up the cpu * just before its quantum expires. Otherwise, it would never accumulate * cpu ticks. The mean frequency of the second timer is stathz. * * If no second timer exists, stathz will be zero; in this case we drive * profiling and statistics off the main clock. This WILL NOT be accurate; * do not do it unless absolutely necessary. * * The statistics clock may (or may not) be run at a higher rate while * profiling. This profile clock runs at profhz. We require that profhz * be an integral multiple of stathz. * * If the statistics clock is running fast, it must be divided by the ratio * profhz/stathz for statistics. (For profiling, every tick counts.) * * Time-of-day is maintained using a "timecounter", which may or may * not be related to the hardware generating the above mentioned * interrupts. */ int stathz; int profhz; int profprocs; volatile int ticks; int psratio; static DPCPU_DEFINE(int, pcputicks); /* Per-CPU version of ticks. */ static int global_hardclock_run = 0; /* * Initialize clock frequencies and start both clocks running. */ /* ARGSUSED*/ static void initclocks(dummy) void *dummy; { #ifdef EARLY_AP_STARTUP struct proc *p; struct thread *td; #endif register int i; /* * Set divisors to 1 (normal case) and let the machine-specific * code do its bit. */ mtx_init(&time_lock, "time lock", NULL, MTX_DEF); cpu_initclocks(); /* * Compute profhz/stathz, and fix profhz if needed. */ i = stathz ? stathz : hz; if (profhz == 0) profhz = i; psratio = profhz / i; #ifdef SW_WATCHDOG EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0); #endif /* * Arrange for ticks to wrap 10 minutes after boot to help catch * sign problems sooner. */ ticks = INT_MAX - (hz * 10 * 60); #ifdef EARLY_AP_STARTUP /* * Fixup the tick counts in any blocked or sleeping threads to * account for the jump above. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (TD_ON_LOCK(td)) { MPASS(td->td_blktick == 0); td->td_blktick = ticks; } if (TD_ON_SLEEPQ(td)) { MPASS(td->td_slptick == 0); td->td_slptick = ticks; } thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); #endif } /* * Each time the real-time timer fires, this function is called on all CPUs. * Note that hardclock() calls hardclock_cpu() for the boot CPU, so only * the other CPUs in the system need to call this function. */ void hardclock_cpu(int usermode) { struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; int flags; /* * Run current process's virtual and profile time, as needed. */ pstats = p->p_stats; flags = 0; if (usermode && timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) { PROC_ITIMLOCK(p); if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) flags |= TDF_ALRMPEND | TDF_ASTPENDING; PROC_ITIMUNLOCK(p); } if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) { PROC_ITIMLOCK(p); if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) flags |= TDF_PROFPEND | TDF_ASTPENDING; PROC_ITIMUNLOCK(p); } thread_lock(td); td->td_flags |= flags; thread_unlock(td); #ifdef HWPMC_HOOKS if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid))) PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL); if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif callout_process(sbinuptime()); } /* * The real-time timer, interrupting hz times per second. */ void hardclock(int usermode, uintfptr_t pc) { atomic_add_int(&ticks, 1); hardclock_cpu(usermode); tc_ticktock(1); cpu_tick_calibration(); /* * If no separate statistics clock is available, run it from here. * * XXX: this only works for UP */ if (stathz == 0) { profclock(usermode, pc); statclock(usermode); } #ifdef DEVICE_POLLING hardclock_device_poll(); /* this is very short and quick */ #endif /* DEVICE_POLLING */ #ifdef SW_WATCHDOG if (watchdog_enabled > 0 && --watchdog_ticks <= 0) watchdog_fire(); #endif /* SW_WATCHDOG */ } void hardclock_cnt(int cnt, int usermode) { struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; int *t = DPCPU_PTR(pcputicks); int flags, global, newticks; #ifdef SW_WATCHDOG int i; #endif /* SW_WATCHDOG */ /* * Update per-CPU and possibly global ticks values. */ *t += cnt; do { global = ticks; newticks = *t - global; if (newticks <= 0) { if (newticks < -1) *t = global - 1; newticks = 0; break; } } while (!atomic_cmpset_int(&ticks, global, *t)); /* * Run current process's virtual and profile time, as needed. */ pstats = p->p_stats; flags = 0; if (usermode && timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) { PROC_ITIMLOCK(p); if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick * cnt) == 0) flags |= TDF_ALRMPEND | TDF_ASTPENDING; PROC_ITIMUNLOCK(p); } if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) { PROC_ITIMLOCK(p); if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick * cnt) == 0) flags |= TDF_PROFPEND | TDF_ASTPENDING; PROC_ITIMUNLOCK(p); } if (flags != 0) { thread_lock(td); td->td_flags |= flags; thread_unlock(td); } #ifdef HWPMC_HOOKS if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid))) PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL); if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif /* We are in charge to handle this tick duty. */ if (newticks > 0) { /* Dangerous and no need to call these things concurrently. */ if (atomic_cmpset_acq_int(&global_hardclock_run, 0, 1)) { tc_ticktock(newticks); #ifdef DEVICE_POLLING /* This is very short and quick. */ hardclock_device_poll(); #endif /* DEVICE_POLLING */ atomic_store_rel_int(&global_hardclock_run, 0); } #ifdef SW_WATCHDOG if (watchdog_enabled > 0) { i = atomic_fetchadd_int(&watchdog_ticks, -newticks); if (i > 0 && i <= newticks) watchdog_fire(); } #endif /* SW_WATCHDOG */ } if (curcpu == CPU_FIRST()) cpu_tick_calibration(); } void hardclock_sync(int cpu) { - int *t = DPCPU_ID_PTR(cpu, pcputicks); + int *t; + KASSERT(!CPU_ABSENT(cpu), ("Absent CPU %d", cpu)); + t = DPCPU_ID_PTR(cpu, pcputicks); *t = ticks; } /* * Compute number of ticks in the specified amount of time. */ int tvtohz(tv) struct timeval *tv; { register unsigned long ticks; register long sec, usec; /* * If the number of usecs in the whole seconds part of the time * difference fits in a long, then the total number of usecs will * fit in an unsigned long. Compute the total and convert it to * ticks, rounding up and adding 1 to allow for the current tick * to expire. Rounding also depends on unsigned long arithmetic * to avoid overflow. * * Otherwise, if the number of ticks in the whole seconds part of * the time difference fits in a long, then convert the parts to * ticks separately and add, using similar rounding methods and * overflow avoidance. This method would work in the previous * case but it is slightly slower and assumes that hz is integral. * * Otherwise, round the time difference down to the maximum * representable value. * * If ints have 32 bits, then the maximum value for any timeout in * 10ms ticks is 248 days. */ sec = tv->tv_sec; usec = tv->tv_usec; if (usec < 0) { sec--; usec += 1000000; } if (sec < 0) { #ifdef DIAGNOSTIC if (usec > 0) { sec++; usec -= 1000000; } printf("tvotohz: negative time difference %ld sec %ld usec\n", sec, usec); #endif ticks = 1; } else if (sec <= LONG_MAX / 1000000) ticks = howmany(sec * 1000000 + (unsigned long)usec, tick) + 1; else if (sec <= LONG_MAX / hz) ticks = sec * hz + howmany((unsigned long)usec, tick) + 1; else ticks = LONG_MAX; if (ticks > INT_MAX) ticks = INT_MAX; return ((int)ticks); } /* * Start profiling on a process. * * Kernel profiling passes proc0 which never exits and hence * keeps the profile clock running constantly. */ void startprofclock(p) register struct proc *p; { PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_flag & P_STOPPROF) return; if ((p->p_flag & P_PROFIL) == 0) { p->p_flag |= P_PROFIL; mtx_lock(&time_lock); if (++profprocs == 1) cpu_startprofclock(); mtx_unlock(&time_lock); } } /* * Stop profiling on a process. */ void stopprofclock(p) register struct proc *p; { PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_flag & P_PROFIL) { if (p->p_profthreads != 0) { while (p->p_profthreads != 0) { p->p_flag |= P_STOPPROF; msleep(&p->p_profthreads, &p->p_mtx, PPAUSE, "stopprof", 0); } } if ((p->p_flag & P_PROFIL) == 0) return; p->p_flag &= ~P_PROFIL; mtx_lock(&time_lock); if (--profprocs == 0) cpu_stopprofclock(); mtx_unlock(&time_lock); } } /* * Statistics clock. Updates rusage information and calls the scheduler * to adjust priorities of the active thread. * * This should be called by all active processors. */ void statclock(int usermode) { statclock_cnt(1, usermode); } void statclock_cnt(int cnt, int usermode) { struct rusage *ru; struct vmspace *vm; struct thread *td; struct proc *p; long rss; long *cp_time; td = curthread; p = td->td_proc; cp_time = (long *)PCPU_PTR(cp_time); if (usermode) { /* * Charge the time as appropriate. */ td->td_uticks += cnt; if (p->p_nice > NZERO) cp_time[CP_NICE] += cnt; else cp_time[CP_USER] += cnt; } else { /* * Came from kernel mode, so we were: * - handling an interrupt, * - doing syscall or trap work on behalf of the current * user process, or * - spinning in the idle loop. * Whichever it is, charge the time as appropriate. * Note that we charge interrupts to the current process, * regardless of whether they are ``for'' that process, * so that we know how much of its real time was spent * in ``non-process'' (i.e., interrupt) work. */ if ((td->td_pflags & TDP_ITHREAD) || td->td_intr_nesting_level >= 2) { td->td_iticks += cnt; cp_time[CP_INTR] += cnt; } else { td->td_pticks += cnt; td->td_sticks += cnt; if (!TD_IS_IDLETHREAD(td)) cp_time[CP_SYS] += cnt; else cp_time[CP_IDLE] += cnt; } } /* Update resource usage integrals and maximums. */ MPASS(p->p_vmspace != NULL); vm = p->p_vmspace; ru = &td->td_ru; ru->ru_ixrss += pgtok(vm->vm_tsize) * cnt; ru->ru_idrss += pgtok(vm->vm_dsize) * cnt; ru->ru_isrss += pgtok(vm->vm_ssize) * cnt; rss = pgtok(vmspace_resident_count(vm)); if (ru->ru_maxrss < rss) ru->ru_maxrss = rss; KTR_POINT2(KTR_SCHED, "thread", sched_tdname(td), "statclock", "prio:%d", td->td_priority, "stathz:%d", (stathz)?stathz:hz); SDT_PROBE2(sched, , , tick, td, td->td_proc); thread_lock_flags(td, MTX_QUIET); for ( ; cnt > 0; cnt--) sched_clock(td); thread_unlock(td); #ifdef HWPMC_HOOKS if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, stat, td->td_intr_frame); #endif } void profclock(int usermode, uintfptr_t pc) { profclock_cnt(1, usermode, pc); } void profclock_cnt(int cnt, int usermode, uintfptr_t pc) { struct thread *td; #ifdef GPROF struct gmonparam *g; uintfptr_t i; #endif td = curthread; if (usermode) { /* * Came from user mode; CPU was in user state. * If this process is being profiled, record the tick. * if there is no related user location yet, don't * bother trying to count it. */ if (td->td_proc->p_flag & P_PROFIL) addupc_intr(td, pc, cnt); } #ifdef GPROF else { /* * Kernel statistics are just like addupc_intr, only easier. */ g = &_gmonparam; if (g->state == GMON_PROF_ON && pc >= g->lowpc) { i = PC_TO_I(g, pc); if (i < g->textsize) { KCOUNT(g, i) += cnt; } } } #endif #ifdef HWPMC_HOOKS if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, prof, td->td_intr_frame); #endif } /* * Return information about system clocks. */ static int sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS) { struct clockinfo clkinfo; /* * Construct clockinfo structure. */ bzero(&clkinfo, sizeof(clkinfo)); clkinfo.hz = hz; clkinfo.tick = tick; clkinfo.profhz = profhz; clkinfo.stathz = stathz ? stathz : hz; return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); } SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_clockrate, "S,clockinfo", "Rate and period of various kernel clocks"); #ifdef SW_WATCHDOG static void watchdog_config(void *unused __unused, u_int cmd, int *error) { u_int u; u = cmd & WD_INTERVAL; if (u >= WD_TO_1SEC) { watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz; watchdog_enabled = 1; *error = 0; } else { watchdog_enabled = 0; } } /* * Handle a watchdog timeout by dumping interrupt information and * then either dropping to DDB or panicking. */ static void watchdog_fire(void) { int nintr; uint64_t inttotal; u_long *curintr; char *curname; curintr = intrcnt; curname = intrnames; inttotal = 0; nintr = sintrcnt / sizeof(u_long); printf("interrupt total\n"); while (--nintr >= 0) { if (*curintr) printf("%-12s %20lu\n", curname, *curintr); curname += strlen(curname) + 1; inttotal += *curintr++; } printf("Total %20ju\n", (uintmax_t)inttotal); #if defined(KDB) && !defined(KDB_UNATTENDED) kdb_backtrace(); kdb_enter(KDB_WHY_WATCHDOG, "watchdog timeout"); #else panic("watchdog timeout"); #endif } #endif /* SW_WATCHDOG */ Index: projects/powernv/kern/kern_clocksource.c =================================================================== --- projects/powernv/kern/kern_clocksource.c (revision 304041) +++ projects/powernv/kern/kern_clocksource.c (revision 304042) @@ -1,966 +1,968 @@ /*- * Copyright (c) 2010-2013 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Common routines to manage event timers hardware. */ #include "opt_device_polling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int cpu_deepest_sleep = 0; /* Deepest Cx state available. */ int cpu_disable_c2_sleep = 0; /* Timer dies in C2. */ int cpu_disable_c3_sleep = 0; /* Timer dies in C3. */ static void setuptimer(void); static void loadtimer(sbintime_t now, int first); static int doconfigtimer(void); static void configtimer(int start); static int round_freq(struct eventtimer *et, int freq); static sbintime_t getnextcpuevent(int idle); static sbintime_t getnextevent(void); static int handleevents(sbintime_t now, int fake); static struct mtx et_hw_mtx; #define ET_HW_LOCK(state) \ { \ if (timer->et_flags & ET_FLAGS_PERCPU) \ mtx_lock_spin(&(state)->et_hw_mtx); \ else \ mtx_lock_spin(&et_hw_mtx); \ } #define ET_HW_UNLOCK(state) \ { \ if (timer->et_flags & ET_FLAGS_PERCPU) \ mtx_unlock_spin(&(state)->et_hw_mtx); \ else \ mtx_unlock_spin(&et_hw_mtx); \ } static struct eventtimer *timer = NULL; static sbintime_t timerperiod; /* Timer period for periodic mode. */ static sbintime_t statperiod; /* statclock() events period. */ static sbintime_t profperiod; /* profclock() events period. */ static sbintime_t nexttick; /* Next global timer tick time. */ static u_int busy = 1; /* Reconfiguration is in progress. */ static int profiling; /* Profiling events enabled. */ static char timername[32]; /* Wanted timer. */ TUNABLE_STR("kern.eventtimer.timer", timername, sizeof(timername)); static int singlemul; /* Multiplier for periodic mode. */ SYSCTL_INT(_kern_eventtimer, OID_AUTO, singlemul, CTLFLAG_RWTUN, &singlemul, 0, "Multiplier for periodic mode"); static u_int idletick; /* Run periodic events when idle. */ SYSCTL_UINT(_kern_eventtimer, OID_AUTO, idletick, CTLFLAG_RWTUN, &idletick, 0, "Run periodic events when idle"); static int periodic; /* Periodic or one-shot mode. */ static int want_periodic; /* What mode to prefer. */ TUNABLE_INT("kern.eventtimer.periodic", &want_periodic); struct pcpu_state { struct mtx et_hw_mtx; /* Per-CPU timer mutex. */ u_int action; /* Reconfiguration requests. */ u_int handle; /* Immediate handle resuests. */ sbintime_t now; /* Last tick time. */ sbintime_t nextevent; /* Next scheduled event on this CPU. */ sbintime_t nexttick; /* Next timer tick time. */ sbintime_t nexthard; /* Next hardclock() event. */ sbintime_t nextstat; /* Next statclock() event. */ sbintime_t nextprof; /* Next profclock() event. */ sbintime_t nextcall; /* Next callout event. */ sbintime_t nextcallopt; /* Next optional callout event. */ int ipi; /* This CPU needs IPI. */ int idle; /* This CPU is in idle mode. */ }; static DPCPU_DEFINE(struct pcpu_state, timerstate); DPCPU_DEFINE(sbintime_t, hardclocktime); /* * Timer broadcast IPI handler. */ int hardclockintr(void) { sbintime_t now; struct pcpu_state *state; int done; if (doconfigtimer() || busy) return (FILTER_HANDLED); state = DPCPU_PTR(timerstate); now = state->now; CTR3(KTR_SPARE2, "ipi at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); done = handleevents(now, 0); return (done ? FILTER_HANDLED : FILTER_STRAY); } /* * Handle all events for specified time on this CPU */ static int handleevents(sbintime_t now, int fake) { sbintime_t t, *hct; struct trapframe *frame; struct pcpu_state *state; int usermode; int done, runs; CTR3(KTR_SPARE2, "handle at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); done = 0; if (fake) { frame = NULL; usermode = 0; } else { frame = curthread->td_intr_frame; usermode = TRAPF_USERMODE(frame); } state = DPCPU_PTR(timerstate); runs = 0; while (now >= state->nexthard) { state->nexthard += tick_sbt; runs++; } if (runs) { hct = DPCPU_PTR(hardclocktime); *hct = state->nexthard - tick_sbt; if (fake < 2) { hardclock_cnt(runs, usermode); done = 1; } } runs = 0; while (now >= state->nextstat) { state->nextstat += statperiod; runs++; } if (runs && fake < 2) { statclock_cnt(runs, usermode); done = 1; } if (profiling) { runs = 0; while (now >= state->nextprof) { state->nextprof += profperiod; runs++; } if (runs && !fake) { profclock_cnt(runs, usermode, TRAPF_PC(frame)); done = 1; } } else state->nextprof = state->nextstat; if (now >= state->nextcallopt) { state->nextcall = state->nextcallopt = SBT_MAX; callout_process(now); } t = getnextcpuevent(0); ET_HW_LOCK(state); if (!busy) { state->idle = 0; state->nextevent = t; loadtimer(now, (fake == 2) && (timer->et_flags & ET_FLAGS_PERCPU)); } ET_HW_UNLOCK(state); return (done); } /* * Schedule binuptime of the next event on current CPU. */ static sbintime_t getnextcpuevent(int idle) { sbintime_t event; struct pcpu_state *state; u_int hardfreq; state = DPCPU_PTR(timerstate); /* Handle hardclock() events, skipping some if CPU is idle. */ event = state->nexthard; if (idle) { hardfreq = (u_int)hz / 2; if (tc_min_ticktock_freq > 2 #ifdef SMP && curcpu == CPU_FIRST() #endif ) hardfreq = hz / tc_min_ticktock_freq; if (hardfreq > 1) event += tick_sbt * (hardfreq - 1); } /* Handle callout events. */ if (event > state->nextcall) event = state->nextcall; if (!idle) { /* If CPU is active - handle other types of events. */ if (event > state->nextstat) event = state->nextstat; if (profiling && event > state->nextprof) event = state->nextprof; } return (event); } /* * Schedule binuptime of the next event on all CPUs. */ static sbintime_t getnextevent(void) { struct pcpu_state *state; sbintime_t event; #ifdef SMP int cpu; #endif int c; state = DPCPU_PTR(timerstate); event = state->nextevent; c = -1; #ifdef SMP if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); if (event > state->nextevent) { event = state->nextevent; c = cpu; } } } #endif CTR4(KTR_SPARE2, "next at %d: next %d.%08x by %d", curcpu, (int)(event >> 32), (u_int)(event & 0xffffffff), c); return (event); } /* Hardware timer callback function. */ static void timercb(struct eventtimer *et, void *arg) { sbintime_t now; sbintime_t *next; struct pcpu_state *state; #ifdef SMP int cpu, bcast; #endif /* Do not touch anything if somebody reconfiguring timers. */ if (busy) return; /* Update present and next tick times. */ state = DPCPU_PTR(timerstate); if (et->et_flags & ET_FLAGS_PERCPU) { next = &state->nexttick; } else next = &nexttick; now = sbinuptime(); if (periodic) *next = now + timerperiod; else *next = -1; /* Next tick is not scheduled yet. */ state->now = now; CTR3(KTR_SPARE2, "intr at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); #ifdef SMP #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); #endif /* Prepare broadcasting to other CPUs for non-per-CPU timers. */ bcast = 0; #ifdef EARLY_AP_STARTUP if ((et->et_flags & ET_FLAGS_PERCPU) == 0) { #else if ((et->et_flags & ET_FLAGS_PERCPU) == 0 && smp_started) { #endif CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); state->now = now; if (now >= state->nextevent) { state->nextevent += SBT_1S; if (curcpu != cpu) { state->ipi = 1; bcast = 1; } } ET_HW_UNLOCK(state); } } #endif /* Handle events for this time on this CPU. */ handleevents(now, 0); #ifdef SMP /* Broadcast interrupt to other CPUs for non-per-CPU timers. */ if (bcast) { CPU_FOREACH(cpu) { if (curcpu == cpu) continue; state = DPCPU_ID_PTR(cpu, timerstate); if (state->ipi) { state->ipi = 0; ipi_cpu(cpu, IPI_HARDCLOCK); } } } #endif } /* * Load new value into hardware timer. */ static void loadtimer(sbintime_t now, int start) { struct pcpu_state *state; sbintime_t new; sbintime_t *next; uint64_t tmp; int eq; if (timer->et_flags & ET_FLAGS_PERCPU) { state = DPCPU_PTR(timerstate); next = &state->nexttick; } else next = &nexttick; if (periodic) { if (start) { /* * Try to start all periodic timers aligned * to period to make events synchronous. */ tmp = now % timerperiod; new = timerperiod - tmp; if (new < tmp) /* Left less then passed. */ new += timerperiod; CTR5(KTR_SPARE2, "load p at %d: now %d.%08x first in %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff), (int)(new >> 32), (u_int)(new & 0xffffffff)); *next = new + now; et_start(timer, new, timerperiod); } } else { new = getnextevent(); eq = (new == *next); CTR4(KTR_SPARE2, "load at %d: next %d.%08x eq %d", curcpu, (int)(new >> 32), (u_int)(new & 0xffffffff), eq); if (!eq) { *next = new; et_start(timer, new - now, 0); } } } /* * Prepare event timer parameters after configuration changes. */ static void setuptimer(void) { int freq; if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0) periodic = 0; else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0) periodic = 1; singlemul = MIN(MAX(singlemul, 1), 20); freq = hz * singlemul; while (freq < (profiling ? profhz : stathz)) freq += hz; freq = round_freq(timer, freq); timerperiod = SBT_1S / freq; } /* * Reconfigure specified per-CPU timer on other CPU. Called from IPI handler. */ static int doconfigtimer(void) { sbintime_t now; struct pcpu_state *state; state = DPCPU_PTR(timerstate); switch (atomic_load_acq_int(&state->action)) { case 1: now = sbinuptime(); ET_HW_LOCK(state); loadtimer(now, 1); ET_HW_UNLOCK(state); state->handle = 0; atomic_store_rel_int(&state->action, 0); return (1); case 2: ET_HW_LOCK(state); et_stop(timer); ET_HW_UNLOCK(state); state->handle = 0; atomic_store_rel_int(&state->action, 0); return (1); } if (atomic_readandclear_int(&state->handle) && !busy) { now = sbinuptime(); handleevents(now, 0); return (1); } return (0); } /* * Reconfigure specified timer. * For per-CPU timers use IPI to make other CPUs to reconfigure. */ static void configtimer(int start) { sbintime_t now, next; struct pcpu_state *state; int cpu; if (start) { setuptimer(); now = sbinuptime(); } else now = 0; critical_enter(); ET_HW_LOCK(DPCPU_PTR(timerstate)); if (start) { /* Initialize time machine parameters. */ next = now + timerperiod; if (periodic) nexttick = next; else nexttick = -1; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); #endif CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); state->now = now; #ifndef EARLY_AP_STARTUP if (!smp_started && cpu != CPU_FIRST()) state->nextevent = SBT_MAX; else #endif state->nextevent = next; if (periodic) state->nexttick = next; else state->nexttick = -1; state->nexthard = next; state->nextstat = next; state->nextprof = next; state->nextcall = next; state->nextcallopt = next; hardclock_sync(cpu); } busy = 0; /* Start global timer or per-CPU timer of this CPU. */ loadtimer(now, 1); } else { busy = 1; /* Stop global timer or per-CPU timer of this CPU. */ et_stop(timer); } ET_HW_UNLOCK(DPCPU_PTR(timerstate)); #ifdef SMP #ifdef EARLY_AP_STARTUP /* If timer is global we are done. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) { #else /* If timer is global or there is no other CPUs yet - we are done. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || !smp_started) { #endif critical_exit(); return; } /* Set reconfigure flags for other CPUs. */ CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); atomic_store_rel_int(&state->action, (cpu == curcpu) ? 0 : ( start ? 1 : 2)); } /* Broadcast reconfigure IPI. */ ipi_all_but_self(IPI_HARDCLOCK); /* Wait for reconfiguration completed. */ restart: cpu_spinwait(); CPU_FOREACH(cpu) { if (cpu == curcpu) continue; state = DPCPU_ID_PTR(cpu, timerstate); if (atomic_load_acq_int(&state->action)) goto restart; } #endif critical_exit(); } /* * Calculate nearest frequency supported by hardware timer. */ static int round_freq(struct eventtimer *et, int freq) { uint64_t div; if (et->et_frequency != 0) { div = lmax((et->et_frequency + freq / 2) / freq, 1); if (et->et_flags & ET_FLAGS_POW2DIV) div = 1 << (flsl(div + div / 2) - 1); freq = (et->et_frequency + div / 2) / div; } if (et->et_min_period > SBT_1S) panic("Event timer \"%s\" doesn't support sub-second periods!", et->et_name); else if (et->et_min_period != 0) freq = min(freq, SBT2FREQ(et->et_min_period)); if (et->et_max_period < SBT_1S && et->et_max_period != 0) freq = max(freq, SBT2FREQ(et->et_max_period)); return (freq); } /* * Configure and start event timers (BSP part). */ void cpu_initclocks_bsp(void) { struct pcpu_state *state; int base, div, cpu; mtx_init(&et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN); CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); mtx_init(&state->et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN); state->nextcall = SBT_MAX; state->nextcallopt = SBT_MAX; } periodic = want_periodic; /* Grab requested timer or the best of present. */ if (timername[0]) timer = et_find(timername, 0, 0); if (timer == NULL && periodic) { timer = et_find(NULL, ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC); } if (timer == NULL) { timer = et_find(NULL, ET_FLAGS_ONESHOT, ET_FLAGS_ONESHOT); } if (timer == NULL && !periodic) { timer = et_find(NULL, ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC); } if (timer == NULL) panic("No usable event timer found!"); et_init(timer, timercb, NULL, NULL); /* Adapt to timer capabilities. */ if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0) periodic = 0; else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0) periodic = 1; if (timer->et_flags & ET_FLAGS_C3STOP) cpu_disable_c3_sleep++; /* * We honor the requested 'hz' value. * We want to run stathz in the neighborhood of 128hz. * We would like profhz to run as often as possible. */ if (singlemul <= 0 || singlemul > 20) { if (hz >= 1500 || (hz % 128) == 0) singlemul = 1; else if (hz >= 750) singlemul = 2; else singlemul = 4; } if (periodic) { base = round_freq(timer, hz * singlemul); singlemul = max((base + hz / 2) / hz, 1); hz = (base + singlemul / 2) / singlemul; if (base <= 128) stathz = base; else { div = base / 128; if (div >= singlemul && (div % singlemul) == 0) div++; stathz = base / div; } profhz = stathz; while ((profhz + stathz) <= 128 * 64) profhz += stathz; profhz = round_freq(timer, profhz); } else { hz = round_freq(timer, hz); stathz = round_freq(timer, 127); profhz = round_freq(timer, stathz * 64); } tick = 1000000 / hz; tick_sbt = SBT_1S / hz; tick_bt = sbttobt(tick_sbt); statperiod = SBT_1S / stathz; profperiod = SBT_1S / profhz; ET_LOCK(); configtimer(1); ET_UNLOCK(); } /* * Start per-CPU event timers on APs. */ void cpu_initclocks_ap(void) { sbintime_t now; struct pcpu_state *state; struct thread *td; state = DPCPU_PTR(timerstate); now = sbinuptime(); ET_HW_LOCK(state); state->now = now; hardclock_sync(curcpu); spinlock_enter(); ET_HW_UNLOCK(state); td = curthread; td->td_intr_nesting_level++; handleevents(state->now, 2); td->td_intr_nesting_level--; spinlock_exit(); } /* * Switch to profiling clock rates. */ void cpu_startprofclock(void) { ET_LOCK(); if (profiling == 0) { if (periodic) { configtimer(0); profiling = 1; configtimer(1); } else profiling = 1; } else profiling++; ET_UNLOCK(); } /* * Switch to regular clock rates. */ void cpu_stopprofclock(void) { ET_LOCK(); if (profiling == 1) { if (periodic) { configtimer(0); profiling = 0; configtimer(1); } else profiling = 0; } else profiling--; ET_UNLOCK(); } /* * Switch to idle mode (all ticks handled). */ sbintime_t cpu_idleclock(void) { sbintime_t now, t; struct pcpu_state *state; if (idletick || busy || (periodic && (timer->et_flags & ET_FLAGS_PERCPU)) #ifdef DEVICE_POLLING || curcpu == CPU_FIRST() #endif ) return (-1); state = DPCPU_PTR(timerstate); if (periodic) now = state->now; else now = sbinuptime(); CTR3(KTR_SPARE2, "idle at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); t = getnextcpuevent(1); ET_HW_LOCK(state); state->idle = 1; state->nextevent = t; if (!periodic) loadtimer(now, 0); ET_HW_UNLOCK(state); return (MAX(t - now, 0)); } /* * Switch to active mode (skip empty ticks). */ void cpu_activeclock(void) { sbintime_t now; struct pcpu_state *state; struct thread *td; state = DPCPU_PTR(timerstate); if (state->idle == 0 || busy) return; if (periodic) now = state->now; else now = sbinuptime(); CTR3(KTR_SPARE2, "active at %d: now %d.%08x", curcpu, (int)(now >> 32), (u_int)(now & 0xffffffff)); spinlock_enter(); td = curthread; td->td_intr_nesting_level++; handleevents(now, 1); td->td_intr_nesting_level--; spinlock_exit(); } /* * Change the frequency of the given timer. This changes et->et_frequency and * if et is the active timer it reconfigures the timer on all CPUs. This is * intended to be a private interface for the use of et_change_frequency() only. */ void cpu_et_frequency(struct eventtimer *et, uint64_t newfreq) { ET_LOCK(); if (et == timer) { configtimer(0); et->et_frequency = newfreq; configtimer(1); } else et->et_frequency = newfreq; ET_UNLOCK(); } void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt) { struct pcpu_state *state; /* Do not touch anything if somebody reconfiguring timers. */ if (busy) return; CTR6(KTR_SPARE2, "new co at %d: on %d at %d.%08x - %d.%08x", curcpu, cpu, (int)(bt_opt >> 32), (u_int)(bt_opt & 0xffffffff), (int)(bt >> 32), (u_int)(bt & 0xffffffff)); + + KASSERT(!CPU_ABSENT(cpu), ("Absent CPU %d", cpu)); state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); /* * If there is callout time already set earlier -- do nothing. * This check may appear redundant because we check already in * callout_process() but this double check guarantees we're safe * with respect to race conditions between interrupts execution * and scheduling. */ state->nextcallopt = bt_opt; if (bt >= state->nextcall) goto done; state->nextcall = bt; /* If there is some other event set earlier -- do nothing. */ if (bt >= state->nextevent) goto done; state->nextevent = bt; /* If timer is periodic -- there is nothing to reprogram. */ if (periodic) goto done; /* If timer is global or of the current CPU -- reprogram it. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) { loadtimer(sbinuptime(), 0); done: ET_HW_UNLOCK(state); return; } /* Otherwise make other CPU to reprogram it. */ state->handle = 1; ET_HW_UNLOCK(state); #ifdef SMP ipi_cpu(cpu, IPI_HARDCLOCK); #endif } /* * Report or change the active event timers hardware. */ static int sysctl_kern_eventtimer_timer(SYSCTL_HANDLER_ARGS) { char buf[32]; struct eventtimer *et; int error; ET_LOCK(); et = timer; snprintf(buf, sizeof(buf), "%s", et->et_name); ET_UNLOCK(); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); ET_LOCK(); et = timer; if (error != 0 || req->newptr == NULL || strcasecmp(buf, et->et_name) == 0) { ET_UNLOCK(); return (error); } et = et_find(buf, 0, 0); if (et == NULL) { ET_UNLOCK(); return (ENOENT); } configtimer(0); et_free(timer); if (et->et_flags & ET_FLAGS_C3STOP) cpu_disable_c3_sleep++; if (timer->et_flags & ET_FLAGS_C3STOP) cpu_disable_c3_sleep--; periodic = want_periodic; timer = et; et_init(timer, timercb, NULL, NULL); configtimer(1); ET_UNLOCK(); return (error); } SYSCTL_PROC(_kern_eventtimer, OID_AUTO, timer, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_eventtimer_timer, "A", "Chosen event timer"); /* * Report or change the active event timer periodicity. */ static int sysctl_kern_eventtimer_periodic(SYSCTL_HANDLER_ARGS) { int error, val; val = periodic; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); ET_LOCK(); configtimer(0); periodic = want_periodic = val; configtimer(1); ET_UNLOCK(); return (error); } SYSCTL_PROC(_kern_eventtimer, OID_AUTO, periodic, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_eventtimer_periodic, "I", "Enable event timer periodic mode"); #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(clocksource, db_show_clocksource) { struct pcpu_state *st; int c; CPU_FOREACH(c) { st = DPCPU_ID_PTR(c, timerstate); db_printf( "CPU %2d: action %d handle %d ipi %d idle %d\n" " now %#jx nevent %#jx (%jd)\n" " ntick %#jx (%jd) nhard %#jx (%jd)\n" " nstat %#jx (%jd) nprof %#jx (%jd)\n" " ncall %#jx (%jd) ncallopt %#jx (%jd)\n", c, st->action, st->handle, st->ipi, st->idle, (uintmax_t)st->now, (uintmax_t)st->nextevent, (uintmax_t)(st->nextevent - st->now) / tick_sbt, (uintmax_t)st->nexttick, (uintmax_t)(st->nexttick - st->now) / tick_sbt, (uintmax_t)st->nexthard, (uintmax_t)(st->nexthard - st->now) / tick_sbt, (uintmax_t)st->nextstat, (uintmax_t)(st->nextstat - st->now) / tick_sbt, (uintmax_t)st->nextprof, (uintmax_t)(st->nextprof - st->now) / tick_sbt, (uintmax_t)st->nextcall, (uintmax_t)(st->nextcall - st->now) / tick_sbt, (uintmax_t)st->nextcallopt, (uintmax_t)(st->nextcallopt - st->now) / tick_sbt); } } #endif Index: projects/powernv/kern/kern_shutdown.c =================================================================== --- projects/powernv/kern/kern_shutdown.c (revision 304041) +++ projects/powernv/kern/kern_shutdown.c (revision 304042) @@ -1,942 +1,943 @@ /*- * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kdb.h" #include "opt_panic.h" #include "opt_sched.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_DUMPER, "dumper", "dumper block buffer"); #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif static int panic_reboot_wait_time = PANIC_REBOOT_WAIT_TIME; SYSCTL_INT(_kern, OID_AUTO, panic_reboot_wait_time, CTLFLAG_RWTUN, &panic_reboot_wait_time, 0, "Seconds to wait before rebooting after a panic"); /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #include #ifdef KDB #ifdef KDB_UNATTENDED int debugger_on_panic = 0; #else int debugger_on_panic = 1; #endif SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &debugger_on_panic, 0, "Run debugger on kernel panic"); #ifdef KDB_TRACE static int trace_on_panic = 1; #else static int trace_on_panic = 0; #endif SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &trace_on_panic, 0, "Print stack trace on kernel panic"); #endif /* KDB */ static int sync_on_panic = 0; SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RWTUN, &sync_on_panic, 0, "Do a sync before rebooting from a panic"); static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment"); #ifndef DIAGNOSTIC static int show_busybufs; #else static int show_busybufs = 1; #endif SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW, &show_busybufs, 0, ""); int suspend_blocked = 0; SYSCTL_INT(_kern, OID_AUTO, suspend_blocked, CTLFLAG_RW, &suspend_blocked, 0, "Block suspend due to a pending shutdown"); /* * Variable panicstr contains argument to first call to panic; used as flag * to indicate that the kernel has already called panic. */ const char *panicstr; int dumping; /* system is dumping */ int rebooting; /* system is rebooting */ static struct dumperinfo dumper; /* our selected dumper */ /* Context information for dump-debuggers. */ static struct pcb dumppcb; /* Registers. */ lwpid_t dumptid; /* Thread ID. */ static struct cdevsw reroot_cdevsw = { .d_version = D_VERSION, .d_name = "reroot", }; static void poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); static int kern_reroot(void); /* register various local shutdown events */ static void shutdown_conf(void *unused) { EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200); } SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL); /* * The only reason this exists is to create the /dev/reroot/ directory, * used by reroot code in init(8) as a mountpoint for tmpfs. */ static void reroot_conf(void *unused) { int error; struct cdev *cdev; error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &cdev, &reroot_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "reroot/reroot"); if (error != 0) { printf("%s: failed to create device node, error %d", __func__, error); } } SYSINIT(reroot_conf, SI_SUB_DEVFS, SI_ORDER_ANY, reroot_conf, NULL); /* * The system call that results in a reboot. */ /* ARGSUSED */ int sys_reboot(struct thread *td, struct reboot_args *uap) { int error; error = 0; #ifdef MAC error = mac_system_check_reboot(td->td_ucred, uap->opt); #endif if (error == 0) error = priv_check(td, PRIV_REBOOT); if (error == 0) { if (uap->opt & RB_REROOT) { error = kern_reroot(); } else { mtx_lock(&Giant); kern_reboot(uap->opt); mtx_unlock(&Giant); } } return (error); } /* * Called by events that want to shut down.. e.g on a PC */ void shutdown_nice(int howto) { if (initproc != NULL) { /* Send a signal to init(8) and have it shutdown the world. */ PROC_LOCK(initproc); if (howto & RB_POWEROFF) kern_psignal(initproc, SIGUSR2); else if (howto & RB_HALT) kern_psignal(initproc, SIGUSR1); else kern_psignal(initproc, SIGINT); PROC_UNLOCK(initproc); } else { /* No init(8) running, so simply reboot. */ kern_reboot(howto | RB_NOSYNC); } } static void print_uptime(void) { int f; struct timespec ts; getnanouptime(&ts); printf("Uptime: "); f = 0; if (ts.tv_sec >= 86400) { printf("%ldd", (long)ts.tv_sec / 86400); ts.tv_sec %= 86400; f = 1; } if (f || ts.tv_sec >= 3600) { printf("%ldh", (long)ts.tv_sec / 3600); ts.tv_sec %= 3600; f = 1; } if (f || ts.tv_sec >= 60) { printf("%ldm", (long)ts.tv_sec / 60); ts.tv_sec %= 60; f = 1; } printf("%lds\n", (long)ts.tv_sec); } int doadump(boolean_t textdump) { boolean_t coredump; int error; error = 0; if (dumping) return (EBUSY); if (dumper.dumper == NULL) return (ENXIO); savectx(&dumppcb); dumptid = curthread->td_tid; dumping++; coredump = TRUE; #ifdef DDB if (textdump && textdump_pending) { coredump = FALSE; textdump_dumpsys(&dumper); } #endif if (coredump) error = dumpsys(&dumper); dumping--; return (error); } /* * Shutdown the system cleanly to prepare for reboot, halt, or power off. */ void kern_reboot(int howto) { static int once = 0; #if defined(SMP) /* - * Bind us to CPU 0 so that all shutdown code runs there. Some + * Bind us to the first CPU so that all shutdown code runs there. Some * systems don't shutdown properly (i.e., ACPI power off) if we * run on another processor. */ if (!SCHEDULER_STOPPED()) { thread_lock(curthread); - sched_bind(curthread, 0); + sched_bind(curthread, CPU_FIRST()); thread_unlock(curthread); - KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0")); + KASSERT(PCPU_GET(cpuid) == CPU_FIRST(), + ("boot: not running on cpu 0")); } #endif /* We're in the process of rebooting. */ rebooting = 1; /* We are out of the debugger now. */ kdb_active = 0; /* * Do any callouts that should be done BEFORE syncing the filesystems. */ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto); /* * Now sync filesystems */ if (!cold && (howto & RB_NOSYNC) == 0 && once == 0) { once = 1; bufshutdown(show_busybufs); } print_uptime(); cngrab(); /* * Ok, now do things that assume all filesystem activity has * been completed. */ EVENTHANDLER_INVOKE(shutdown_post_sync, howto); if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping) doadump(TRUE); /* Now that we're going to really halt the system... */ EVENTHANDLER_INVOKE(shutdown_final, howto); for(;;) ; /* safety against shutdown_reset not working */ /* NOTREACHED */ } /* * The system call that results in changing the rootfs. */ static int kern_reroot(void) { struct vnode *oldrootvnode, *vp; struct mount *mp, *devmp; int error; if (curproc != initproc) return (EPERM); /* * Mark the filesystem containing currently-running executable * (the temporary copy of init(8)) busy. */ vp = curproc->p_textvp; error = vn_lock(vp, LK_SHARED); if (error != 0) return (error); mp = vp->v_mount; error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp, 0); error = vfs_busy(mp, 0); vn_lock(vp, LK_SHARED | LK_RETRY); vfs_rel(mp); if (error != 0) { VOP_UNLOCK(vp, 0); return (ENOENT); } if (vp->v_iflag & VI_DOOMED) { VOP_UNLOCK(vp, 0); vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp, 0); /* * Remove the filesystem containing currently-running executable * from the mount list, to prevent it from being unmounted * by vfs_unmountall(), and to avoid confusing vfs_mountroot(). * * Also preserve /dev - forcibly unmounting it could cause driver * reinitialization. */ vfs_ref(rootdevmp); devmp = rootdevmp; rootdevmp = NULL; mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); TAILQ_REMOVE(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); oldrootvnode = rootvnode; /* * Unmount everything except for the two filesystems preserved above. */ vfs_unmountall(); /* * Add /dev back; vfs_mountroot() will move it into its new place. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); rootdevmp = devmp; vfs_rel(rootdevmp); /* * Mount the new rootfs. */ vfs_mountroot(); /* * Update all references to the old rootvnode. */ mountcheckdirs(oldrootvnode, rootvnode); /* * Add the temporary filesystem back and unbusy it. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_unbusy(mp); return (0); } /* * If the shutdown was a clean halt, behave accordingly. */ static void shutdown_halt(void *junk, int howto) { if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); switch (cngetc()) { case -1: /* No console, just die */ cpu_halt(); /* NOTREACHED */ default: howto &= ~RB_HALT; break; } } } /* * Check to see if the system paniced, pause and then reboot * according to the specified delay. */ static void shutdown_panic(void *junk, int howto) { int loop; if (howto & RB_DUMP) { if (panic_reboot_wait_time != 0) { if (panic_reboot_wait_time != -1) { printf("Automatic reboot in %d seconds - " "press a key on the console to abort\n", panic_reboot_wait_time); for (loop = panic_reboot_wait_time * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ /* Did user type a key? */ if (cncheckc() != -1) break; } if (!loop) return; } } else { /* zero time specified - reboot NOW */ return; } printf("--> Press a key on the console to reboot,\n"); printf("--> or switch off the system now.\n"); cngetc(); } } /* * Everything done, now reset */ static void shutdown_reset(void *junk, int howto) { printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ /* * Acquiring smp_ipi_mtx here has a double effect: * - it disables interrupts avoiding CPU0 preemption * by fast handlers (thus deadlocking against other CPUs) * - it avoids deadlocks against smp_rendezvous() or, more * generally, threads busy-waiting, with this spinlock held, * and waiting for responses by threads on other CPUs * (ie. smp_tlb_shootdown()). * * For the !SMP case it just needs to handle the former problem. */ #ifdef SMP mtx_lock_spin(&smp_ipi_mtx); #else spinlock_enter(); #endif /* cpu_boot(howto); */ /* doesn't do anything at the moment */ cpu_reset(); /* NOTREACHED */ /* assuming reset worked */ } #if defined(WITNESS) || defined(INVARIANT_SUPPORT) static int kassert_warn_only = 0; #ifdef KDB static int kassert_do_kdb = 0; #endif #ifdef KTR static int kassert_do_ktr = 0; #endif static int kassert_do_log = 1; static int kassert_log_pps_limit = 4; static int kassert_log_mute_at = 0; static int kassert_log_panic_at = 0; static int kassert_warnings = 0; SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW, NULL, "kassert options"); SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, CTLFLAG_RWTUN, &kassert_warn_only, 0, "KASSERT triggers a panic (1) or just a warning (0)"); #ifdef KDB SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, CTLFLAG_RWTUN, &kassert_do_kdb, 0, "KASSERT will enter the debugger"); #endif #ifdef KTR SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, CTLFLAG_RWTUN, &kassert_do_ktr, 0, "KASSERT does a KTR, set this to the KTRMASK you want"); #endif SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, CTLFLAG_RWTUN, &kassert_do_log, 0, "KASSERT triggers a panic (1) or just a warning (0)"); SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RWTUN, &kassert_warnings, 0, "number of KASSERTs that have been triggered"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, CTLFLAG_RWTUN, &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, CTLFLAG_RWTUN, &kassert_log_pps_limit, 0, "limit number of log messages per second"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, CTLFLAG_RWTUN, &kassert_log_mute_at, 0, "max number of KASSERTS to log"); static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0, kassert_sysctl_kassert, "I", "set to trigger a test kassert"); static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i)); return (0); } /* * Called by KASSERT, this decides if we will panic * or if we will log via printf and/or ktr. */ void kassert_panic(const char *fmt, ...) { static char buf[256]; va_list ap; va_start(ap, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); /* * panic if we're not just warning, or if we've exceeded * kassert_log_panic_at warnings. */ if (!kassert_warn_only || (kassert_log_panic_at > 0 && kassert_warnings >= kassert_log_panic_at)) { va_start(ap, fmt); vpanic(fmt, ap); /* NORETURN */ } #ifdef KTR if (kassert_do_ktr) CTR0(ktr_mask, buf); #endif /* KTR */ /* * log if we've not yet met the mute limit. */ if (kassert_do_log && (kassert_log_mute_at == 0 || kassert_warnings < kassert_log_mute_at)) { static struct timeval lasterr; static int curerr; if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) { printf("KASSERT failed: %s\n", buf); kdb_backtrace(); } } #ifdef KDB if (kassert_do_kdb) { kdb_enter(KDB_WHY_KASSERT, buf); } #endif atomic_add_int(&kassert_warnings, 1); } #endif /* * Panic is called on unresolvable fatal errors. It prints "panic: mesg", * and then reboots. If we are called twice, then we avoid trying to sync * the disks as this often leads to recursive panics. */ void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vpanic(fmt, ap); } void vpanic(const char *fmt, va_list ap) { #ifdef SMP cpuset_t other_cpus; #endif struct thread *td = curthread; int bootopt, newpanic; static char buf[256]; spinlock_enter(); #ifdef SMP /* * stop_cpus_hard(other_cpus) should prevent multiple CPUs from * concurrently entering panic. Only the winner will proceed * further. */ if (panicstr == NULL && !kdb_active) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); stop_cpus_hard(other_cpus); } /* * Ensure that the scheduler is stopped while panicking, even if panic * has been entered from kdb. */ td->td_stopsched = 1; #endif bootopt = RB_AUTOBOOT; newpanic = 0; if (panicstr) bootopt |= RB_NOSYNC; else { bootopt |= RB_DUMP; panicstr = fmt; newpanic = 1; } if (newpanic) { (void)vsnprintf(buf, sizeof(buf), fmt, ap); panicstr = buf; cngrab(); printf("panic: %s\n", buf); } else { printf("panic: "); vprintf(fmt, ap); printf("\n"); } #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif #ifdef KDB if (newpanic && trace_on_panic) kdb_backtrace(); if (debugger_on_panic) kdb_enter(KDB_WHY_PANIC, "panic"); #endif /*thread_lock(td); */ td->td_flags |= TDF_INPANIC; /* thread_unlock(td); */ if (!sync_on_panic) bootopt |= RB_NOSYNC; kern_reboot(bootopt); } /* * Support for poweroff delay. * * Please note that setting this delay too short might power off your machine * before the write cache on your hard disk has been flushed, leading to * soft-updates inconsistencies. */ #ifndef POWEROFF_DELAY # define POWEROFF_DELAY 5000 #endif static int poweroff_delay = POWEROFF_DELAY; SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW, &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)"); static void poweroff_wait(void *junk, int howto) { if (!(howto & RB_POWEROFF) || poweroff_delay <= 0) return; DELAY(poweroff_delay * 1000); } /* * Some system processes (e.g. syncer) need to be stopped at appropriate * points in their main loops prior to a system shutdown, so that they * won't interfere with the shutdown process (e.g. by holding a disk buf * to cause sync to fail). For each of these system processes, register * shutdown_kproc() as a handler for one of shutdown events. */ static int kproc_shutdown_wait = 60; SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW, &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process"); void kproc_shutdown(void *arg, int howto) { struct proc *p; int error; if (panicstr) return; p = (struct proc *)arg; printf("Waiting (max %d seconds) for system process `%s' to stop... ", kproc_shutdown_wait, p->p_comm); error = kproc_suspend(p, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } void kthread_shutdown(void *arg, int howto) { struct thread *td; int error; if (panicstr) return; td = (struct thread *)arg; printf("Waiting (max %d seconds) for system thread `%s' to stop... ", kproc_shutdown_wait, td->td_name); error = kthread_suspend(td, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } static char dumpdevname[sizeof(((struct cdev*)NULL)->si_name)]; SYSCTL_STRING(_kern_shutdown, OID_AUTO, dumpdevname, CTLFLAG_RD, dumpdevname, 0, "Device for kernel dumps"); /* Registration of dumpers */ int set_dumper(struct dumperinfo *di, const char *devname, struct thread *td) { size_t wantcopy; int error; error = priv_check(td, PRIV_SETDUMPER); if (error != 0) return (error); if (di == NULL) { if (dumper.blockbuf != NULL) free(dumper.blockbuf, M_DUMPER); bzero(&dumper, sizeof(dumper)); dumpdevname[0] = '\0'; return (0); } if (dumper.dumper != NULL) return (EBUSY); dumper = *di; wantcopy = strlcpy(dumpdevname, devname, sizeof(dumpdevname)); if (wantcopy >= sizeof(dumpdevname)) { printf("set_dumper: device name truncated from '%s' -> '%s'\n", devname, dumpdevname); } dumper.blockbuf = malloc(di->blocksize, M_DUMPER, M_WAITOK | M_ZERO); return (0); } /* Call dumper with bounds checking. */ int dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length) { if (length != 0 && (offset < di->mediaoffset || offset - di->mediaoffset + length > di->mediasize)) { printf("Attempt to write outside dump device boundaries.\n" "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n", (intmax_t)offset, (intmax_t)di->mediaoffset, (uintmax_t)length, (intmax_t)di->mediasize); return (ENOSPC); } return (di->dumper(di->priv, virtual, physical, offset, length)); } /* Call dumper with bounds checking. */ int dump_write_pad(struct dumperinfo *di, void *virtual, vm_offset_t physical, off_t offset, size_t length, size_t *size) { char *temp; int ret; if (length > di->blocksize) return (ENOMEM); *size = di->blocksize; if (length == di->blocksize) temp = virtual; else { temp = di->blockbuf; memset(temp + length, 0, di->blocksize - length); memcpy(temp, virtual, length); } ret = dump_write(di, temp, physical, offset, *size); return (ret); } void mkdumpheader(struct kerneldumpheader *kdh, char *magic, uint32_t archver, uint64_t dumplen, uint32_t blksz) { bzero(kdh, sizeof(*kdh)); strlcpy(kdh->magic, magic, sizeof(kdh->magic)); strlcpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture)); kdh->version = htod32(KERNELDUMPVERSION); kdh->architectureversion = htod32(archver); kdh->dumplength = htod64(dumplen); kdh->dumptime = htod64(time_second); kdh->blocksize = htod32(blksz); strlcpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname)); strlcpy(kdh->versionstring, version, sizeof(kdh->versionstring)); if (panicstr != NULL) strlcpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); kdh->parity = kerneldump_parity(kdh); } #ifdef DDB DB_SHOW_COMMAND(panic, db_show_panic) { if (panicstr == NULL) db_printf("panicstr not set\n"); else db_printf("panic: %s\n", panicstr); } #endif Index: projects/powernv/kern/subr_pcpu.c =================================================================== --- projects/powernv/kern/subr_pcpu.c (revision 304041) +++ projects/powernv/kern/subr_pcpu.c (revision 304042) @@ -1,419 +1,421 @@ /*- * Copyright (c) 2001 Wind River Systems, Inc. * All rights reserved. * Written by: John Baldwin * * Copyright (c) 2009 Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module provides MI support for per-cpu data. * * Each architecture determines the mapping of logical CPU IDs to physical * CPUs. The requirements of this mapping are as follows: * - Logical CPU IDs must reside in the range 0 ... MAXCPU - 1. * - The mapping is not required to be dense. That is, there may be * gaps in the mappings. * - The platform sets the value of MAXCPU in . * - It is suggested, but not required, that in the non-SMP case, the * platform define MAXCPU to be 1 and define the logical ID of the * sole CPU as 0. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PCPU, "Per-cpu", "Per-cpu resource accouting."); struct dpcpu_free { uintptr_t df_start; int df_len; TAILQ_ENTRY(dpcpu_free) df_link; }; static DPCPU_DEFINE(char, modspace[DPCPU_MODMIN]); static TAILQ_HEAD(, dpcpu_free) dpcpu_head = TAILQ_HEAD_INITIALIZER(dpcpu_head); static struct sx dpcpu_lock; uintptr_t dpcpu_off[MAXCPU]; struct pcpu *cpuid_to_pcpu[MAXCPU]; struct cpuhead cpuhead = STAILQ_HEAD_INITIALIZER(cpuhead); /* * Initialize the MI portions of a struct pcpu. */ void pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { bzero(pcpu, size); KASSERT(cpuid >= 0 && cpuid < MAXCPU, ("pcpu_init: invalid cpuid %d", cpuid)); pcpu->pc_cpuid = cpuid; cpuid_to_pcpu[cpuid] = pcpu; STAILQ_INSERT_TAIL(&cpuhead, pcpu, pc_allcpu); cpu_pcpu_init(pcpu, cpuid, size); pcpu->pc_rm_queue.rmq_next = &pcpu->pc_rm_queue; pcpu->pc_rm_queue.rmq_prev = &pcpu->pc_rm_queue; } void dpcpu_init(void *dpcpu, int cpuid) { struct pcpu *pcpu; pcpu = pcpu_find(cpuid); pcpu->pc_dynamic = (uintptr_t)dpcpu - DPCPU_START; /* * Initialize defaults from our linker section. */ memcpy(dpcpu, (void *)DPCPU_START, DPCPU_BYTES); /* * Place it in the global pcpu offset array. */ dpcpu_off[cpuid] = pcpu->pc_dynamic; } static void dpcpu_startup(void *dummy __unused) { struct dpcpu_free *df; df = malloc(sizeof(*df), M_PCPU, M_WAITOK | M_ZERO); df->df_start = (uintptr_t)&DPCPU_NAME(modspace); df->df_len = DPCPU_MODMIN; TAILQ_INSERT_HEAD(&dpcpu_head, df, df_link); sx_init(&dpcpu_lock, "dpcpu alloc lock"); } SYSINIT(dpcpu, SI_SUB_KLD, SI_ORDER_FIRST, dpcpu_startup, 0); /* * UMA_PCPU_ZONE zones, that are available for all kernel * consumers. Right now 64 bit zone is used for counter(9) * and pointer zone is used by flowtable. */ uma_zone_t pcpu_zone_64; uma_zone_t pcpu_zone_ptr; static void pcpu_zones_startup(void) { pcpu_zone_64 = uma_zcreate("64 pcpu", sizeof(uint64_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); if (sizeof(uint64_t) == sizeof(void *)) pcpu_zone_ptr = pcpu_zone_64; else pcpu_zone_ptr = uma_zcreate("ptr pcpu", sizeof(void *), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); } SYSINIT(pcpu_zones, SI_SUB_KMEM, SI_ORDER_ANY, pcpu_zones_startup, NULL); /* * First-fit extent based allocator for allocating space in the per-cpu * region reserved for modules. This is only intended for use by the * kernel linkers to place module linker sets. */ void * dpcpu_alloc(int size) { struct dpcpu_free *df; void *s; s = NULL; size = roundup2(size, sizeof(void *)); sx_xlock(&dpcpu_lock); TAILQ_FOREACH(df, &dpcpu_head, df_link) { if (df->df_len < size) continue; if (df->df_len == size) { s = (void *)df->df_start; TAILQ_REMOVE(&dpcpu_head, df, df_link); free(df, M_PCPU); break; } s = (void *)df->df_start; df->df_len -= size; df->df_start = df->df_start + size; break; } sx_xunlock(&dpcpu_lock); return (s); } /* * Free dynamic per-cpu space at module unload time. */ void dpcpu_free(void *s, int size) { struct dpcpu_free *df; struct dpcpu_free *dn; uintptr_t start; uintptr_t end; size = roundup2(size, sizeof(void *)); start = (uintptr_t)s; end = start + size; /* * Free a region of space and merge it with as many neighbors as * possible. Keeping the list sorted simplifies this operation. */ sx_xlock(&dpcpu_lock); TAILQ_FOREACH(df, &dpcpu_head, df_link) { if (df->df_start > end) break; /* * If we expand at the end of an entry we may have to * merge it with the one following it as well. */ if (df->df_start + df->df_len == start) { df->df_len += size; dn = TAILQ_NEXT(df, df_link); if (df->df_start + df->df_len == dn->df_start) { df->df_len += dn->df_len; TAILQ_REMOVE(&dpcpu_head, dn, df_link); free(dn, M_PCPU); } sx_xunlock(&dpcpu_lock); return; } if (df->df_start == end) { df->df_start = start; df->df_len += size; sx_xunlock(&dpcpu_lock); return; } } dn = malloc(sizeof(*df), M_PCPU, M_WAITOK | M_ZERO); dn->df_start = start; dn->df_len = size; if (df) TAILQ_INSERT_BEFORE(df, dn, df_link); else TAILQ_INSERT_TAIL(&dpcpu_head, dn, df_link); sx_xunlock(&dpcpu_lock); } /* * Initialize the per-cpu storage from an updated linker-set region. */ void dpcpu_copy(void *s, int size) { #ifdef SMP uintptr_t dpcpu; int i; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; memcpy((void *)(dpcpu + (uintptr_t)s), s, size); } #else memcpy((void *)(dpcpu_off[0] + (uintptr_t)s), s, size); #endif } /* * Destroy a struct pcpu. */ void pcpu_destroy(struct pcpu *pcpu) { STAILQ_REMOVE(&cpuhead, pcpu, pcpu, pc_allcpu); cpuid_to_pcpu[pcpu->pc_cpuid] = NULL; dpcpu_off[pcpu->pc_cpuid] = 0; } /* * Locate a struct pcpu by cpu id. */ struct pcpu * pcpu_find(u_int cpuid) { + KASSERT(cpuid_to_pcpu[cpuid] != NULL, + ("Getting unitialized PCPU %d", cpuid)); return (cpuid_to_pcpu[cpuid]); } int sysctl_dpcpu_quad(SYSCTL_HANDLER_ARGS) { uintptr_t dpcpu; int64_t count; int i; count = 0; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; count += *(int64_t *)(dpcpu + (uintptr_t)arg1); } return (SYSCTL_OUT(req, &count, sizeof(count))); } int sysctl_dpcpu_long(SYSCTL_HANDLER_ARGS) { uintptr_t dpcpu; long count; int i; count = 0; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; count += *(long *)(dpcpu + (uintptr_t)arg1); } return (SYSCTL_OUT(req, &count, sizeof(count))); } int sysctl_dpcpu_int(SYSCTL_HANDLER_ARGS) { uintptr_t dpcpu; int count; int i; count = 0; CPU_FOREACH(i) { dpcpu = dpcpu_off[i]; if (dpcpu == 0) continue; count += *(int *)(dpcpu + (uintptr_t)arg1); } return (SYSCTL_OUT(req, &count, sizeof(count))); } #ifdef DDB DB_SHOW_COMMAND(dpcpu_off, db_show_dpcpu_off) { int id; CPU_FOREACH(id) { db_printf("dpcpu_off[%2d] = 0x%jx (+ DPCPU_START = %p)\n", id, (uintmax_t)dpcpu_off[id], (void *)(uintptr_t)(dpcpu_off[id] + DPCPU_START)); } } static void show_pcpu(struct pcpu *pc) { struct thread *td; db_printf("cpuid = %d\n", pc->pc_cpuid); db_printf("dynamic pcpu = %p\n", (void *)pc->pc_dynamic); db_printf("curthread = "); td = pc->pc_curthread; if (td != NULL) db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid, td->td_name); else db_printf("none\n"); db_printf("curpcb = %p\n", pc->pc_curpcb); db_printf("fpcurthread = "); td = pc->pc_fpcurthread; if (td != NULL) db_printf("%p: pid %d \"%s\"\n", td, td->td_proc->p_pid, td->td_name); else db_printf("none\n"); db_printf("idlethread = "); td = pc->pc_idlethread; if (td != NULL) db_printf("%p: tid %d \"%s\"\n", td, td->td_tid, td->td_name); else db_printf("none\n"); db_show_mdpcpu(pc); #ifdef VIMAGE db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet); #endif #ifdef WITNESS db_printf("spin locks held:\n"); witness_list_locks(&pc->pc_spinlocks, db_printf); #endif } DB_SHOW_COMMAND(pcpu, db_show_pcpu) { struct pcpu *pc; int id; if (have_addr) id = ((addr >> 4) % 16) * 10 + (addr % 16); else id = PCPU_GET(cpuid); pc = pcpu_find(id); if (pc == NULL) { db_printf("CPU %d not found\n", id); return; } show_pcpu(pc); } DB_SHOW_ALL_COMMAND(pcpu, db_show_cpu_all) { struct pcpu *pc; int id; db_printf("Current CPU: %d\n\n", PCPU_GET(cpuid)); - for (id = 0; id <= mp_maxid; id++) { + CPU_FOREACH(id) { pc = pcpu_find(id); if (pc != NULL) { show_pcpu(pc); db_printf("\n"); } } } DB_SHOW_ALIAS(allpcpu, db_show_cpu_all); #endif