Index: projects/calloutng/sys/kern/kern_clock.c =================================================================== --- projects/calloutng/sys/kern/kern_clock.c (revision 237575) +++ projects/calloutng/sys/kern/kern_clock.c (revision 237576) @@ -1,895 +1,895 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_kdb.h" #include "opt_device_polling.h" #include "opt_hwpmc_hooks.h" #include "opt_kdtrace.h" #include "opt_ntp.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef GPROF #include #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , clock, hard); PMC_SOFT_DEFINE( , , clock, stat); #endif #ifdef DEVICE_POLLING extern void hardclock_device_poll(void); #endif /* DEVICE_POLLING */ static void initclocks(void *dummy); SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL); /* Spin-lock protecting profiling statistics. 
*/ static struct mtx time_lock; SDT_PROVIDER_DECLARE(sched); SDT_PROBE_DEFINE2(sched, , , tick, tick, "struct thread *", "struct proc *"); static int sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS) { int error; long cp_time[CPUSTATES]; #ifdef SCTL_MASK32 int i; unsigned int cp_time32[CPUSTATES]; #endif read_cpu_time(cp_time); #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { if (!req->oldptr) return SYSCTL_OUT(req, 0, sizeof(cp_time32)); for (i = 0; i < CPUSTATES; i++) cp_time32[i] = (unsigned int)cp_time[i]; error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32)); } else #endif { if (!req->oldptr) return SYSCTL_OUT(req, 0, sizeof(cp_time)); error = SYSCTL_OUT(req, cp_time, sizeof(cp_time)); } return error; } SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE, 0,0, sysctl_kern_cp_time, "LU", "CPU time statistics"); static long empty[CPUSTATES]; static int sysctl_kern_cp_times(SYSCTL_HANDLER_ARGS) { struct pcpu *pcpu; int error; int c; long *cp_time; #ifdef SCTL_MASK32 unsigned int cp_time32[CPUSTATES]; int i; #endif if (!req->oldptr) { #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) return SYSCTL_OUT(req, 0, sizeof(cp_time32) * (mp_maxid + 1)); else #endif return SYSCTL_OUT(req, 0, sizeof(long) * CPUSTATES * (mp_maxid + 1)); } for (error = 0, c = 0; error == 0 && c <= mp_maxid; c++) { if (!CPU_ABSENT(c)) { pcpu = pcpu_find(c); cp_time = pcpu->pc_cp_time; } else { cp_time = empty; } #ifdef SCTL_MASK32 if (req->flags & SCTL_MASK32) { for (i = 0; i < CPUSTATES; i++) cp_time32[i] = (unsigned int)cp_time[i]; error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32)); } else #endif error = SYSCTL_OUT(req, cp_time, sizeof(long) * CPUSTATES); } return error; } SYSCTL_PROC(_kern, OID_AUTO, cp_times, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE, 0,0, sysctl_kern_cp_times, "LU", "per-CPU time statistics"); #ifdef DEADLKRES static const char *blessed[] = { "getblk", "so_snd_sx", "so_rcv_sx", NULL }; static int slptime_threshold = 1800; static int blktime_threshold = 
900; static int sleepfreq = 3; static void deadlkres(void) { struct proc *p; struct thread *td; void *wchan; int blkticks, i, slpticks, slptype, tryl, tticks; tryl = 0; for (;;) { blkticks = blktime_threshold * hz; slpticks = slptime_threshold * hz; /* * Avoid to sleep on the sx_lock in order to avoid a possible * priority inversion problem leading to starvation. * If the lock can't be held after 100 tries, panic. */ if (!sx_try_slock(&allproc_lock)) { if (tryl > 100) panic("%s: possible deadlock detected on allproc_lock\n", __func__); tryl++; pause("allproc", sleepfreq * hz); continue; } tryl = 0; FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { /* * Once a thread is found in "interesting" * state a possible ticks wrap-up needs to be * checked. */ thread_lock(td); if (TD_ON_LOCK(td) && ticks < td->td_blktick) { /* * The thread should be blocked on a * turnstile, simply check if the * turnstile channel is in good state. */ MPASS(td->td_blocked != NULL); tticks = ticks - td->td_blktick; thread_unlock(td); if (tticks > blkticks) { /* * Accordingly with provided * thresholds, this thread is * stuck for too long on a * turnstile. */ PROC_UNLOCK(p); sx_sunlock(&allproc_lock); panic("%s: possible deadlock detected for %p, blocked for %d ticks\n", __func__, td, tticks); } } else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td) && ticks < td->td_blktick) { /* * Check if the thread is sleeping on a * lock, otherwise skip the check. * Drop the thread lock in order to * avoid a LOR with the sleepqueue * spinlock. */ wchan = td->td_wchan; tticks = ticks - td->td_slptick; thread_unlock(td); slptype = sleepq_type(wchan); if ((slptype == SLEEPQ_SX || slptype == SLEEPQ_LK) && tticks > slpticks) { /* * Accordingly with provided * thresholds, this thread is * stuck for too long on a * sleepqueue. * However, being on a * sleepqueue, we might still * check for the blessed * list. 
*/ tryl = 0; for (i = 0; blessed[i] != NULL; i++) { if (!strcmp(blessed[i], td->td_wmesg)) { tryl = 1; break; } } if (tryl != 0) { tryl = 0; continue; } PROC_UNLOCK(p); sx_sunlock(&allproc_lock); panic("%s: possible deadlock detected for %p, blocked for %d ticks\n", __func__, td, tticks); } } else thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* Sleep for sleepfreq seconds. */ pause("-", sleepfreq * hz); } } static struct kthread_desc deadlkres_kd = { "deadlkres", deadlkres, (struct thread **)NULL }; SYSINIT(deadlkres, SI_SUB_CLOCKS, SI_ORDER_ANY, kthread_start, &deadlkres_kd); static SYSCTL_NODE(_debug, OID_AUTO, deadlkres, CTLFLAG_RW, 0, "Deadlock resolver"); SYSCTL_INT(_debug_deadlkres, OID_AUTO, slptime_threshold, CTLFLAG_RW, &slptime_threshold, 0, "Number of seconds within is valid to sleep on a sleepqueue"); SYSCTL_INT(_debug_deadlkres, OID_AUTO, blktime_threshold, CTLFLAG_RW, &blktime_threshold, 0, "Number of seconds within is valid to block on a turnstile"); SYSCTL_INT(_debug_deadlkres, OID_AUTO, sleepfreq, CTLFLAG_RW, &sleepfreq, 0, "Number of seconds between any deadlock resolver thread run"); #endif /* DEADLKRES */ void read_cpu_time(long *cp_time) { struct pcpu *pc; int i, j; /* Sum up global cp_time[]. */ bzero(cp_time, sizeof(long) * CPUSTATES); CPU_FOREACH(i) { pc = pcpu_find(i); for (j = 0; j < CPUSTATES; j++) cp_time[j] += pc->pc_cp_time[j]; } } #ifdef SW_WATCHDOG #include static int watchdog_ticks; static int watchdog_enabled; static void watchdog_fire(void); static void watchdog_config(void *, u_int, int *); #endif /* SW_WATCHDOG */ /* * Clock handling routines. * * This code is written to operate with two timers that run independently of * each other. * * The main timer, running hz times per second, is used to trigger interval * timers, timeouts and rescheduling as needed. * * The second timer handles kernel and user profiling, * and does resource use estimation. 
If the second timer is programmable, * it is randomized to avoid aliasing between the two clocks. For example, * the randomization prevents an adversary from always giving up the cpu * just before its quantum expires. Otherwise, it would never accumulate * cpu ticks. The mean frequency of the second timer is stathz. * * If no second timer exists, stathz will be zero; in this case we drive * profiling and statistics off the main clock. This WILL NOT be accurate; * do not do it unless absolutely necessary. * * The statistics clock may (or may not) be run at a higher rate while * profiling. This profile clock runs at profhz. We require that profhz * be an integral multiple of stathz. * * If the statistics clock is running fast, it must be divided by the ratio * profhz/stathz for statistics. (For profiling, every tick counts.) * * Time-of-day is maintained using a "timecounter", which may or may * not be related to the hardware generating the above mentioned * interrupts. */ int stathz; int profhz; int profprocs; int ticks; int psratio; static DPCPU_DEFINE(int, pcputicks); /* Per-CPU version of ticks. */ static int global_hardclock_run = 0; /* * Initialize clock frequencies and start both clocks running. */ /* ARGSUSED*/ static void initclocks(dummy) void *dummy; { register int i; /* * Set divisors to 1 (normal case) and let the machine-specific * code do its bit. */ mtx_init(&time_lock, "time lock", NULL, MTX_DEF); cpu_initclocks(); /* * Compute profhz/stathz, and fix profhz if needed. */ i = stathz ? stathz : hz; if (profhz == 0) profhz = i; psratio = profhz / i; #ifdef SW_WATCHDOG EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0); #endif } /* * Each time the real-time timer fires, this function is called on all CPUs. * Note that hardclock() calls hardclock_cpu() for the boot CPU, so only * the other CPUs in the system need to call this function. 
*/ void hardclock_cpu(int usermode) { struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; int flags; /* * Run current process's virtual and profile time, as needed. */ pstats = p->p_stats; flags = 0; if (usermode && timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) { PROC_SLOCK(p); if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) flags |= TDF_ALRMPEND | TDF_ASTPENDING; PROC_SUNLOCK(p); } if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) { PROC_SLOCK(p); if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) flags |= TDF_PROFPEND | TDF_ASTPENDING; PROC_SUNLOCK(p); } thread_lock(td); sched_tick(1); td->td_flags |= flags; thread_unlock(td); #ifdef HWPMC_HOOKS if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid))) PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL); if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif - callout_tick(); + callout_process(); } /* * The real-time timer, interrupting hz times per second. */ void hardclock(int usermode, uintfptr_t pc) { atomic_add_int((volatile int *)&ticks, 1); hardclock_cpu(usermode); tc_ticktock(1); cpu_tick_calibration(); /* * If no separate statistics clock is available, run it from here. * * XXX: this only works for UP */ if (stathz == 0) { profclock(usermode, pc); statclock(usermode); } #ifdef DEVICE_POLLING hardclock_device_poll(); /* this is very short and quick */ #endif /* DEVICE_POLLING */ #ifdef SW_WATCHDOG if (watchdog_enabled > 0 && --watchdog_ticks <= 0) watchdog_fire(); #endif /* SW_WATCHDOG */ } void hardclock_cnt(int cnt, int usermode) { struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; int *t = DPCPU_PTR(pcputicks); int flags, global, newticks; #ifdef SW_WATCHDOG int i; #endif /* SW_WATCHDOG */ /* * Update per-CPU and possibly global ticks values. 
*/ *t += cnt; do { global = ticks; newticks = *t - global; if (newticks <= 0) { if (newticks < -1) *t = global - 1; newticks = 0; break; } } while (!atomic_cmpset_int(&ticks, global, *t)); /* * Run current process's virtual and profile time, as needed. */ pstats = p->p_stats; flags = 0; if (usermode && timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) { PROC_SLOCK(p); if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick * cnt) == 0) flags |= TDF_ALRMPEND | TDF_ASTPENDING; PROC_SUNLOCK(p); } if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) { PROC_SLOCK(p); if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick * cnt) == 0) flags |= TDF_PROFPEND | TDF_ASTPENDING; PROC_SUNLOCK(p); } thread_lock(td); sched_tick(cnt); td->td_flags |= flags; thread_unlock(td); #ifdef HWPMC_HOOKS if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid))) PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL); if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame); #endif /* We are in charge to handle this tick duty. */ if (newticks > 0) { /* Dangerous and no need to call these things concurrently. */ if (atomic_cmpset_acq_int(&global_hardclock_run, 0, 1)) { tc_ticktock(newticks); #ifdef DEVICE_POLLING /* This is very short and quick. */ hardclock_device_poll(); #endif /* DEVICE_POLLING */ atomic_store_rel_int(&global_hardclock_run, 0); } #ifdef SW_WATCHDOG if (watchdog_enabled > 0) { i = atomic_fetchadd_int(&watchdog_ticks, -newticks); if (i > 0 && i <= newticks) watchdog_fire(); } #endif /* SW_WATCHDOG */ } if (curcpu == CPU_FIRST()) cpu_tick_calibration(); } void hardclock_sync(int cpu) { int *t = DPCPU_ID_PTR(cpu, pcputicks); *t = ticks; } /* * Compute number of ticks in the specified amount of time. */ int tvtohz(tv) struct timeval *tv; { register unsigned long ticks; register long sec, usec; /* * If the number of usecs in the whole seconds part of the time * difference fits in a long, then the total number of usecs will * fit in an unsigned long. 
Compute the total and convert it to * ticks, rounding up and adding 1 to allow for the current tick * to expire. Rounding also depends on unsigned long arithmetic * to avoid overflow. * * Otherwise, if the number of ticks in the whole seconds part of * the time difference fits in a long, then convert the parts to * ticks separately and add, using similar rounding methods and * overflow avoidance. This method would work in the previous * case but it is slightly slower and assumes that hz is integral. * * Otherwise, round the time difference down to the maximum * representable value. * * If ints have 32 bits, then the maximum value for any timeout in * 10ms ticks is 248 days. */ sec = tv->tv_sec; usec = tv->tv_usec; if (usec < 0) { sec--; usec += 1000000; } if (sec < 0) { #ifdef DIAGNOSTIC if (usec > 0) { sec++; usec -= 1000000; } printf("tvotohz: negative time difference %ld sec %ld usec\n", sec, usec); #endif ticks = 1; } else if (sec <= LONG_MAX / 1000000) ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) / tick + 1; else if (sec <= LONG_MAX / hz) ticks = sec * hz + ((unsigned long)usec + (tick - 1)) / tick + 1; else ticks = LONG_MAX; if (ticks > INT_MAX) ticks = INT_MAX; return ((int)ticks); } /* * Start profiling on a process. * * Kernel profiling passes proc0 which never exits and hence * keeps the profile clock running constantly. */ void startprofclock(p) register struct proc *p; { PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_flag & P_STOPPROF) return; if ((p->p_flag & P_PROFIL) == 0) { p->p_flag |= P_PROFIL; mtx_lock(&time_lock); if (++profprocs == 1) cpu_startprofclock(); mtx_unlock(&time_lock); } } /* * Stop profiling on a process. 
*/ void stopprofclock(p) register struct proc *p; { PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_flag & P_PROFIL) { if (p->p_profthreads != 0) { p->p_flag |= P_STOPPROF; while (p->p_profthreads != 0) msleep(&p->p_profthreads, &p->p_mtx, PPAUSE, "stopprof", 0); p->p_flag &= ~P_STOPPROF; } if ((p->p_flag & P_PROFIL) == 0) return; p->p_flag &= ~P_PROFIL; mtx_lock(&time_lock); if (--profprocs == 0) cpu_stopprofclock(); mtx_unlock(&time_lock); } } /* * Statistics clock. Updates rusage information and calls the scheduler * to adjust priorities of the active thread. * * This should be called by all active processors. */ void statclock(int usermode) { statclock_cnt(1, usermode); } void statclock_cnt(int cnt, int usermode) { struct rusage *ru; struct vmspace *vm; struct thread *td; struct proc *p; long rss; long *cp_time; td = curthread; p = td->td_proc; cp_time = (long *)PCPU_PTR(cp_time); if (usermode) { /* * Charge the time as appropriate. */ td->td_uticks += cnt; if (p->p_nice > NZERO) cp_time[CP_NICE] += cnt; else cp_time[CP_USER] += cnt; } else { /* * Came from kernel mode, so we were: * - handling an interrupt, * - doing syscall or trap work on behalf of the current * user process, or * - spinning in the idle loop. * Whichever it is, charge the time as appropriate. * Note that we charge interrupts to the current process, * regardless of whether they are ``for'' that process, * so that we know how much of its real time was spent * in ``non-process'' (i.e., interrupt) work. */ if ((td->td_pflags & TDP_ITHREAD) || td->td_intr_nesting_level >= 2) { td->td_iticks += cnt; cp_time[CP_INTR] += cnt; } else { td->td_pticks += cnt; td->td_sticks += cnt; if (!TD_IS_IDLETHREAD(td)) cp_time[CP_SYS] += cnt; else cp_time[CP_IDLE] += cnt; } } /* Update resource usage integrals and maximums. 
*/ MPASS(p->p_vmspace != NULL); vm = p->p_vmspace; ru = &td->td_ru; ru->ru_ixrss += pgtok(vm->vm_tsize) * cnt; ru->ru_idrss += pgtok(vm->vm_dsize) * cnt; ru->ru_isrss += pgtok(vm->vm_ssize) * cnt; rss = pgtok(vmspace_resident_count(vm)); if (ru->ru_maxrss < rss) ru->ru_maxrss = rss; KTR_POINT2(KTR_SCHED, "thread", sched_tdname(td), "statclock", "prio:%d", td->td_priority, "stathz:%d", (stathz)?stathz:hz); SDT_PROBE2(sched, , , tick, td, td->td_proc); thread_lock_flags(td, MTX_QUIET); for ( ; cnt > 0; cnt--) sched_clock(td); thread_unlock(td); #ifdef HWPMC_HOOKS if (td->td_intr_frame != NULL) PMC_SOFT_CALL_TF( , , clock, stat, td->td_intr_frame); #endif } void profclock(int usermode, uintfptr_t pc) { profclock_cnt(1, usermode, pc); } void profclock_cnt(int cnt, int usermode, uintfptr_t pc) { struct thread *td; #ifdef GPROF struct gmonparam *g; uintfptr_t i; #endif td = curthread; if (usermode) { /* * Came from user mode; CPU was in user state. * If this process is being profiled, record the tick. * if there is no related user location yet, don't * bother trying to count it. */ if (td->td_proc->p_flag & P_PROFIL) addupc_intr(td, pc, cnt); } #ifdef GPROF else { /* * Kernel statistics are just like addupc_intr, only easier. */ g = &_gmonparam; if (g->state == GMON_PROF_ON && pc >= g->lowpc) { i = PC_TO_I(g, pc); if (i < g->textsize) { KCOUNT(g, i) += cnt; } } } #endif } /* * Return information about system clocks. */ static int sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS) { struct clockinfo clkinfo; /* * Construct clockinfo structure. */ bzero(&clkinfo, sizeof(clkinfo)); clkinfo.hz = hz; clkinfo.tick = tick; clkinfo.profhz = profhz; clkinfo.stathz = stathz ? 
stathz : hz; return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); } SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD|CTLFLAG_MPSAFE, 0, 0, sysctl_kern_clockrate, "S,clockinfo", "Rate and period of various kernel clocks"); #ifdef SW_WATCHDOG static void watchdog_config(void *unused __unused, u_int cmd, int *error) { u_int u; u = cmd & WD_INTERVAL; if (u >= WD_TO_1SEC) { watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz; watchdog_enabled = 1; *error = 0; } else { watchdog_enabled = 0; } } /* * Handle a watchdog timeout by dumping interrupt information and * then either dropping to DDB or panicking. */ static void watchdog_fire(void) { int nintr; uint64_t inttotal; u_long *curintr; char *curname; curintr = intrcnt; curname = intrnames; inttotal = 0; nintr = sintrcnt / sizeof(u_long); printf("interrupt total\n"); while (--nintr >= 0) { if (*curintr) printf("%-12s %20lu\n", curname, *curintr); curname += strlen(curname) + 1; inttotal += *curintr++; } printf("Total %20ju\n", (uintmax_t)inttotal); #if defined(KDB) && !defined(KDB_UNATTENDED) kdb_backtrace(); kdb_enter(KDB_WHY_WATCHDOG, "watchdog timeout"); #else panic("watchdog timeout"); #endif } #endif /* SW_WATCHDOG */ Index: projects/calloutng/sys/kern/kern_clocksource.c =================================================================== --- projects/calloutng/sys/kern/kern_clocksource.c (revision 237575) +++ projects/calloutng/sys/kern/kern_clocksource.c (revision 237576) @@ -1,975 +1,975 @@ /*- * Copyright (c) 2010-2012 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Common routines to manage event timers hardware. */ #include "opt_device_polling.h" #include "opt_kdtrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include cyclic_clock_func_t cyclic_clock_func = NULL; #endif int cpu_can_deep_sleep = 0; /* C3 state is available. */ int cpu_disable_deep_sleep = 0; /* Timer dies in C3. 
*/ static void setuptimer(void); static void loadtimer(struct bintime *now, int first); static int doconfigtimer(void); static void configtimer(int start); static int round_freq(struct eventtimer *et, int freq); static void getnextcpuevent(struct bintime *event, int idle); static void getnextevent(struct bintime *event); static int handleevents(struct bintime *now, int fake); static void cpu_new_callout(int cpu, struct bintime bt); static struct mtx et_hw_mtx; #define ET_HW_LOCK(state) \ { \ if (timer->et_flags & ET_FLAGS_PERCPU) \ mtx_lock_spin(&(state)->et_hw_mtx); \ else \ mtx_lock_spin(&et_hw_mtx); \ } #define ET_HW_UNLOCK(state) \ { \ if (timer->et_flags & ET_FLAGS_PERCPU) \ mtx_unlock_spin(&(state)->et_hw_mtx); \ else \ mtx_unlock_spin(&et_hw_mtx); \ } static struct eventtimer *timer = NULL; static struct bintime timerperiod; /* Timer period for periodic mode. */ static struct bintime hardperiod; /* hardclock() events period. */ static struct bintime statperiod; /* statclock() events period. */ static struct bintime profperiod; /* profclock() events period. */ static struct bintime nexttick; /* Next global timer tick time. */ static struct bintime nexthard; /* Next global hardlock() event. */ static u_int busy = 0; /* Reconfiguration is in progress. */ static int profiling = 0; /* Profiling events enabled. */ static char timername[32]; /* Wanted timer. */ TUNABLE_STR("kern.eventtimer.timer", timername, sizeof(timername)); static int singlemul = 0; /* Multiplier for periodic mode. */ TUNABLE_INT("kern.eventtimer.singlemul", &singlemul); SYSCTL_INT(_kern_eventtimer, OID_AUTO, singlemul, CTLFLAG_RW, &singlemul, 0, "Multiplier for periodic mode"); static u_int idletick = 0; /* Run periodic events when idle. */ TUNABLE_INT("kern.eventtimer.idletick", &idletick); SYSCTL_UINT(_kern_eventtimer, OID_AUTO, idletick, CTLFLAG_RW, &idletick, 0, "Run periodic events when idle"); static u_int activetick = 1; /* Run all periodic events when active. 
*/
TUNABLE_INT("kern.eventtimer.activetick", &activetick);
SYSCTL_UINT(_kern_eventtimer, OID_AUTO, activetick, CTLFLAG_RW, &activetick,
    0, "Run all periodic events when active");

static int periodic = 0;		/* Periodic or one-shot mode. */
static int want_periodic = 0;		/* What mode to prefer. */
TUNABLE_INT("kern.eventtimer.periodic", &want_periodic);

/*
 * Per-CPU event timer bookkeeping.  In the next* fields that are tested
 * against -1 by this file (nextcall, nextcyc, nexttick), sec == -1 means
 * "nothing scheduled".
 */
struct pcpu_state {
	struct mtx et_hw_mtx;		/* Per-CPU timer mutex. */
	u_int action;			/* Reconfiguration requests. */
	u_int handle;			/* Immediate handle requests. */
	struct bintime now;		/* Last tick time. */
	struct bintime nextevent;	/* Next scheduled event on this CPU. */
	struct bintime nexttick;	/* Next timer tick time. */
	struct bintime nexthard;	/* Next hardclock() event. */
	struct bintime nextstat;	/* Next statclock() event. */
	struct bintime nextprof;	/* Next profclock() event. */
	struct bintime nextcall;	/* Next callout event. */
#ifdef KDTRACE_HOOKS
	struct bintime nextcyc;		/* Next OpenSolaris cyclics event. */
#endif
	int ipi;			/* This CPU needs IPI. */
	int idle;			/* This CPU is in idle mode. */
};

static DPCPU_DEFINE(struct pcpu_state, timerstate);

/*
 * Store the period of a frequency (in Hz) into *bt as a sub-second
 * bintime: sec = 0, frac = 2 * (2^63 / freq).
 */
#define FREQ2BT(freq, bt) \
{ \
	(bt)->sec = 0; \
	(bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \
}
/*
 * Inverse of FREQ2BT: frequency corresponding to the fractional part of
 * *bt, rounded to nearest.  Only (bt)->frac is consulted.
 */
#define BT2FREQ(bt) \
	(((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \
	    ((bt)->frac >> 1))

/*
 * Timer broadcast IPI handler.
 *
 * Returns FILTER_HANDLED if a pending reconfiguration request was served
 * by doconfigtimer(), if a reconfiguration is in progress (busy), or if
 * handleevents() ran at least one clock handler for the time stored in
 * this CPU's state->now; FILTER_STRAY otherwise.
 */
int
hardclockintr(void)
{
	struct bintime now;
	struct pcpu_state *state;
	int done;

	if (doconfigtimer() || busy)
		return (FILTER_HANDLED);
	state = DPCPU_PTR(timerstate);
	now = state->now;
	CTR4(KTR_SPARE2, "ipi at %d: now %d.%08x%08x",
	    curcpu, now.sec, (u_int)(now.frac >> 32),
	    (u_int)(now.frac & 0xffffffff));
	done = handleevents(&now, 0);
	return (done ? FILTER_HANDLED : FILTER_STRAY);
}

/*
 * Handle all events for specified time on this CPU
 *
 * "fake" selects how much real work is done:
 *   0 - normal interrupt context: the trap frame of curthread supplies
 *       usermode and pc, and every due handler runs;
 *   1 - no trap frame is used (usermode = 0, pc = 0); hardclock and
 *       statclock handlers still run, profclock and cyclic do not;
 *   2 - only the next-event times are advanced; no handler runs and the
 *       hardware timer is not reloaded.
 * Returns nonzero if hardclock_cnt(), statclock_cnt() or profclock_cnt()
 * was called.
 */
static int
handleevents(struct bintime *now, int fake)
{
	struct bintime t;
	struct trapframe *frame;
	struct pcpu_state *state;
	uintfptr_t pc;
	int usermode;
	int done, runs;

	CTR4(KTR_SPARE2, "handle at %d: now %d.%08x%08x",
	    curcpu, now->sec, (u_int)(now->frac >> 32),
	    (u_int)(now->frac & 0xffffffff));
	done = 0;
	if (fake) {
		frame = NULL;
		usermode = 0;
		pc = 0;
	} else {
		frame = curthread->td_intr_frame;
		usermode = TRAPF_USERMODE(frame);
		pc = TRAPF_PC(frame);
	}

	state = DPCPU_PTR(timerstate);

	/* Count how many hardclock periods have elapsed up to "now". */
	runs = 0;
	while (bintime_cmp(now, &state->nexthard, >=)) {
		bintime_add(&state->nexthard, &hardperiod);
		runs++;
	}
	/* For a global timer, keep the global nexthard at the latest value. */
	if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 &&
	    bintime_cmp(&state->nexthard, &nexthard, >))
		nexthard = state->nexthard;
	if (runs && fake < 2) {
		hardclock_cnt(runs, usermode);
		done = 1;
	}
	/* Same catch-up for the statistics clock. */
	runs = 0;
	while (bintime_cmp(now, &state->nextstat, >=)) {
		bintime_add(&state->nextstat, &statperiod);
		runs++;
	}
	if (runs && fake < 2) {
		statclock_cnt(runs, usermode);
		done = 1;
	}
	if (profiling) {
		runs = 0;
		while (bintime_cmp(now, &state->nextprof, >=)) {
			bintime_add(&state->nextprof, &profperiod);
			runs++;
		}
		if (runs && !fake) {
			profclock_cnt(runs, usermode, pc);
			done = 1;
		}
	} else
		/* Not profiling: keep nextprof aligned with nextstat. */
		state->nextprof = state->nextstat;
	/* A due callout event is cleared (sec = -1) before it is processed. */
	if (bintime_cmp(now, &state->nextcall, >=) &&
	    (state->nextcall.sec != -1)) {
		state->nextcall.sec = -1;
-		callout_tick();
+		callout_process();
	}

#ifdef KDTRACE_HOOKS
	if (fake == 0 && cyclic_clock_func != NULL &&
	    state->nextcyc.sec != -1 &&
	    bintime_cmp(now, &state->nextcyc, >=)) {
		state->nextcyc.sec = -1;
		(*cyclic_clock_func)(frame);
	}
#endif

	getnextcpuevent(&t, 0);
	if (fake == 2) {
		state->nextevent = t;
		return (done);
	}
	/* Reprogram the hardware unless a reconfiguration is in progress. */
	ET_HW_LOCK(state);
	if (!busy) {
		state->idle = 0;
		state->nextevent = t;
		loadtimer(now, 0);
	}
	ET_HW_UNLOCK(state);
	return (done);
}

/*
 * Schedule binuptime of the next event on current CPU.
*/
/*
 * Stores in *event the earliest of this CPU's pending event times.
 * When "idle" is nonzero, hardclock events are thinned out and the
 * statclock/profclock times are not considered.
 */
static void
getnextcpuevent(struct bintime *event, int idle)
{
	struct pcpu_state *state;
	struct bintime tmp;
	int hardfreq;

	state = DPCPU_PTR(timerstate);
	/* Handle hardclock() events, skipping some if CPU is idle. */
	*event = state->nexthard;
	if (idle || (!activetick && !profiling &&
	    (timer->et_flags & ET_FLAGS_PERCPU) == 0)) {
		/*
		 * Reduced hardclock rate: 4 Hz when idle, stathz / 2
		 * otherwise; the first CPU never goes below
		 * tc_min_ticktock_freq.
		 */
		hardfreq = idle ? 4 : (stathz / 2);
		if (curcpu == CPU_FIRST() && tc_min_ticktock_freq > hardfreq)
			hardfreq = tc_min_ticktock_freq;
		if (hz > hardfreq) {
			/* Push the event out by (hz / hardfreq - 1) periods. */
			tmp = hardperiod;
			bintime_mul(&tmp, hz / hardfreq - 1);
			bintime_add(event, &tmp);
		}
	}
	/* Handle callout events. */
	if (state->nextcall.sec != -1 &&
	    bintime_cmp(event, &state->nextcall, >))
		*event = state->nextcall;
	if (!idle) { /* If CPU is active - handle other types of events. */
		if (bintime_cmp(event, &state->nextstat, >))
			*event = state->nextstat;
		if (profiling && bintime_cmp(event, &state->nextprof, >))
			*event = state->nextprof;
	}
#ifdef KDTRACE_HOOKS
	if (state->nextcyc.sec != -1 && bintime_cmp(event, &state->nextcyc, >))
		*event = state->nextcyc;
#endif
}

/*
 * Schedule binuptime of the next event on all CPUs.
 *
 * Starts from this CPU's nextevent.  For a timer that is not per-CPU,
 * the nextevent of every other CPU is also considered (SMP kernels),
 * and if at least one CPU is not idle the result is capped at the
 * global nexthard.  "c" records which CPU supplied the time and is only
 * used for tracing.
 */
static void
getnextevent(struct bintime *event)
{
	struct pcpu_state *state;
#ifdef SMP
	int cpu;
#endif
	int c, nonidle;

	state = DPCPU_PTR(timerstate);
	*event = state->nextevent;
	c = curcpu;
	nonidle = !state->idle;
	if ((timer->et_flags & ET_FLAGS_PERCPU) == 0) {
#ifdef SMP
		CPU_FOREACH(cpu) {
			if (curcpu == cpu)
				continue;
			state = DPCPU_ID_PTR(cpu, timerstate);
			nonidle += !state->idle;
			if (bintime_cmp(event, &state->nextevent, >)) {
				*event = state->nextevent;
				c = cpu;
			}
		}
#endif
		if (nonidle != 0 && bintime_cmp(event, &nexthard, >))
			*event = nexthard;
	}
	CTR5(KTR_SPARE2, "next at %d: next %d.%08x%08x by %d",
	    curcpu, event->sec, (u_int)(event->frac >> 32),
	    (u_int)(event->frac & 0xffffffff), c);
}

/* Hardware timer callback function.
*/
/*
 * Establishes the current time (the previously scheduled tick time in
 * periodic mode, binuptime() otherwise), handles this CPU's events and,
 * on SMP kernels with a timer that is not per-CPU, sends IPI_HARDCLOCK
 * to every other CPU whose nextevent has been reached.  "arg" is unused.
 */
static void
timercb(struct eventtimer *et, void *arg)
{
	struct bintime now;
	struct bintime *next;
	struct pcpu_state *state;
#ifdef SMP
	int cpu, bcast;
#endif

	/* Do not touch anything if somebody reconfiguring timers. */
	if (busy)
		return;
	/* Update present and next tick times. */
	state = DPCPU_PTR(timerstate);
	if (et->et_flags & ET_FLAGS_PERCPU) {
		next = &state->nexttick;
	} else
		next = &nexttick;
	if (periodic) {
		now = *next;	/* Ex-next tick time becomes present time. */
		bintime_add(next, &timerperiod); /* Next tick in 1 period. */
	} else {
		binuptime(&now);	/* Get present time from hardware. */
		next->sec = -1;		/* Next tick is not scheduled yet. */
	}
	state->now = now;
	CTR4(KTR_SPARE2, "intr at %d: now %d.%08x%08x",
	    curcpu, (int)(now.sec), (u_int)(now.frac >> 32),
	    (u_int)(now.frac & 0xffffffff));

#ifdef SMP
	/* Prepare broadcasting to other CPUs for non-per-CPU timers. */
	bcast = 0;
	if ((et->et_flags & ET_FLAGS_PERCPU) == 0 && smp_started) {
		CPU_FOREACH(cpu) {
			state = DPCPU_ID_PTR(cpu, timerstate);
			ET_HW_LOCK(state);
			state->now = now;
			if (bintime_cmp(&now, &state->nextevent, >=)) {
				/*
				 * Move nextevent one second ahead so this
				 * event is not matched again here; the
				 * handling path stores the real next time.
				 */
				state->nextevent.sec++;
				if (curcpu != cpu) {
					state->ipi = 1;
					bcast = 1;
				}
			}
			ET_HW_UNLOCK(state);
		}
	}
#endif

	/* Handle events for this time on this CPU. */
	handleevents(&now, 0);

#ifdef SMP
	/* Broadcast interrupt to other CPUs for non-per-CPU timers. */
	if (bcast) {
		CPU_FOREACH(cpu) {
			if (curcpu == cpu)
				continue;
			state = DPCPU_ID_PTR(cpu, timerstate);
			if (state->ipi) {
				state->ipi = 0;
				ipi_cpu(cpu, IPI_HARDCLOCK);
			}
		}
	}
#endif
}

/*
 * Load new value into hardware timer.
 *
 * In periodic mode the timer is only programmed when "start" is nonzero.
 * In one-shot mode the timer is reprogrammed for the time returned by
 * getnextevent() unless that equals the tick already scheduled.
 * Callers in this file hold the ET_HW lock around the call.
 */
static void
loadtimer(struct bintime *now, int start)
{
	struct pcpu_state *state;
	struct bintime new;
	struct bintime *next;
	uint64_t tmp;
	int eq;

	if (timer->et_flags & ET_FLAGS_PERCPU) {
		state = DPCPU_PTR(timerstate);
		next = &state->nexttick;
	} else
		next = &nexttick;
	if (periodic) {
		if (start) {
			/*
			 * Try to start all periodic timers aligned
			 * to period to make events synchronous.
			 */
			/* tmp: time already elapsed in the current period. */
			tmp = ((uint64_t)now->sec << 36) + (now->frac >> 28);
			tmp = (tmp % (timerperiod.frac >> 28)) << 28;
			new.sec = 0;
			new.frac = timerperiod.frac - tmp;
			if (new.frac < tmp)	/* Left less then passed. */
				bintime_add(&new, &timerperiod);
			CTR5(KTR_SPARE2, "load p at %d: now %d.%08x first in %d.%08x",
			    curcpu, now->sec, (u_int)(now->frac >> 32),
			    new.sec, (u_int)(new.frac >> 32));
			*next = new;
			bintime_add(next, now);
			et_start(timer, &new, &timerperiod);
		}
	} else {
		getnextevent(&new);
		eq = bintime_cmp(&new, next, ==);
		CTR5(KTR_SPARE2, "load at %d: next %d.%08x%08x eq %d",
		    curcpu, new.sec, (u_int)(new.frac >> 32),
		    (u_int)(new.frac & 0xffffffff), eq);
		if (!eq) {
			*next = new;
			/* et_start() is given the delay relative to now. */
			bintime_sub(&new, now);
			et_start(timer, &new, NULL);
		}
	}
}

/*
 * Prepare event timer parameters after configuration changes.
 *
 * Switches "periodic" to a mode the selected timer's flags allow, clamps
 * singlemul to [1, 20] and derives timerperiod from hz * singlemul,
 * raised in steps of hz until it is at least stathz (profhz while
 * profiling) and then adjusted by round_freq().
 */
static void
setuptimer(void)
{
	int freq;

	if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0)
		periodic = 0;
	else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0)
		periodic = 1;
	singlemul = MIN(MAX(singlemul, 1), 20);
	freq = hz * singlemul;
	while (freq < (profiling ? profhz : stathz))
		freq += hz;
	freq = round_freq(timer, freq);
	FREQ2BT(freq, &timerperiod);
}

/*
 * Reconfigure specified per-CPU timer on other CPU.  Called from IPI handler.
 *
 * state->action 1 starts this CPU's timer, 2 stops it; in both cases the
 * request is acknowledged by storing 0 back.  With no action pending, a
 * set state->handle flag (and !busy) triggers event handling at the
 * current time.  Returns 1 if anything was done, 0 otherwise.
 */
static int
doconfigtimer(void)
{
	struct bintime now;
	struct pcpu_state *state;

	state = DPCPU_PTR(timerstate);
	switch (atomic_load_acq_int(&state->action)) {
	case 1:
		binuptime(&now);
		ET_HW_LOCK(state);
		loadtimer(&now, 1);
		ET_HW_UNLOCK(state);
		state->handle = 0;
		atomic_store_rel_int(&state->action, 0);
		return (1);
	case 2:
		ET_HW_LOCK(state);
		et_stop(timer);
		ET_HW_UNLOCK(state);
		state->handle = 0;
		atomic_store_rel_int(&state->action, 0);
		return (1);
	}
	if (atomic_readandclear_int(&state->handle) && !busy) {
		binuptime(&now);
		handleevents(&now, 0);
		return (1);
	}
	return (0);
}

/*
 * Reconfigure specified timer.
 * For per-CPU timers use IPI to make other CPUs to reconfigure.
*/ /* A non-zero start recomputes the parameters and (re)starts the timer; start == 0 stops it. */ static void configtimer(int start) { struct bintime now, next; struct pcpu_state *state; int cpu; if (start) { setuptimer(); binuptime(&now); } critical_enter(); ET_HW_LOCK(DPCPU_PTR(timerstate)); if (start) { /* Initialize time machine parameters. */ next = now; bintime_add(&next, &timerperiod); if (periodic) nexttick = next; else nexttick.sec = -1; CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); state->now = now; state->nextevent = next; if (periodic) state->nexttick = next; else state->nexttick.sec = -1; state->nexthard = next; state->nextstat = next; state->nextprof = next; hardclock_sync(cpu); } busy = 0; /* Start global timer or per-CPU timer of this CPU. */ loadtimer(&now, 1); } else { /* While busy is set, timercb() and the idle/active hooks return without touching timer state. */ busy = 1; /* Stop global timer or per-CPU timer of this CPU. */ et_stop(timer); } ET_HW_UNLOCK(DPCPU_PTR(timerstate)); #ifdef SMP /* If timer is global or there are no other CPUs yet - we are done. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || !smp_started) { critical_exit(); return; } /* Set reconfigure flags for other CPUs (1 = start, 2 = stop; this CPU gets 0). */ CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); atomic_store_rel_int(&state->action, (cpu == curcpu) ? 0 : ( start ? 1 : 2)); } /* Broadcast reconfigure IPI. */ ipi_all_but_self(IPI_HARDCLOCK); /* Wait for reconfiguration to complete: spin until every other CPU has cleared its action. */ restart: cpu_spinwait(); CPU_FOREACH(cpu) { if (cpu == curcpu) continue; state = DPCPU_ID_PTR(cpu, timerstate); if (atomic_load_acq_int(&state->action)) goto restart; } #endif critical_exit(); } /* * Calculate nearest frequency supported by hardware timer. 
*/ /* NOTE(review): freq is used as a divisor below, so it is assumed to be non-zero -- confirm for all callers. */ static int round_freq(struct eventtimer *et, int freq) { uint64_t div; if (et->et_frequency != 0) { /* Nearest integer divisor of the base frequency, at least 1. */ div = lmax((et->et_frequency + freq / 2) / freq, 1); /* With ET_FLAGS_POW2DIV set the divisor is replaced by a power of two. NOTE(review): the shift is done on an int constant before being stored in the 64-bit div; presumably div always fits in 31 bits -- confirm. */ if (et->et_flags & ET_FLAGS_POW2DIV) div = 1 << (flsl(div + div / 2) - 1); freq = (et->et_frequency + div / 2) / div; } /* Clamp to the period limits advertised by the timer. NOTE(review): a minimal period of one second or more yields 0, and callers divide by the result (e.g. tick = 1000000 / hz) -- confirm such timers cannot be selected. */ if (et->et_min_period.sec > 0) freq = 0; else if (et->et_min_period.frac != 0) freq = min(freq, BT2FREQ(&et->et_min_period)); if (et->et_max_period.sec == 0 && et->et_max_period.frac != 0) freq = max(freq, BT2FREQ(&et->et_max_period)); return (freq); } /* * Configure and start event timers (BSP part). */ void cpu_initclocks_bsp(void) { struct pcpu_state *state; int base, div, cpu; mtx_init(&et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN); /* Per-CPU state: create the hardware lock and mark the optional event times as "not set" (-1). */ CPU_FOREACH(cpu) { state = DPCPU_ID_PTR(cpu, timerstate); mtx_init(&state->et_hw_mtx, "et_hw_mtx", NULL, MTX_SPIN); #ifdef KDTRACE_HOOKS state->nextcyc.sec = -1; #endif state->nextcall.sec = -1; } /* Let the callout code report new wakeup times through cpu_new_callout(). */ callout_new_inserted = cpu_new_callout; periodic = want_periodic; /* Grab requested timer or the best of present. */ if (timername[0]) timer = et_find(timername, 0, 0); if (timer == NULL && periodic) { timer = et_find(NULL, ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC); } if (timer == NULL) { timer = et_find(NULL, ET_FLAGS_ONESHOT, ET_FLAGS_ONESHOT); } if (timer == NULL && !periodic) { timer = et_find(NULL, ET_FLAGS_PERIODIC, ET_FLAGS_PERIODIC); } if (timer == NULL) panic("No usable event timer found!"); et_init(timer, timercb, NULL, NULL); /* Adapt to timer capabilities. */ if (periodic && (timer->et_flags & ET_FLAGS_PERIODIC) == 0) periodic = 0; else if (!periodic && (timer->et_flags & ET_FLAGS_ONESHOT) == 0) periodic = 1; if (timer->et_flags & ET_FLAGS_C3STOP) cpu_disable_deep_sleep++; /* * We honor the requested 'hz' value. * We want to run stathz in the neighborhood of 128hz. * We would like profhz to run as often as possible. 
*/ /* Pick a default multiplier when singlemul is unset or outside 1..20. */ if (singlemul <= 0 || singlemul > 20) { if (hz >= 1500 || (hz % 128) == 0) singlemul = 1; else if (hz >= 750) singlemul = 2; else singlemul = 4; } if (periodic) { /* Periodic mode: hz, stathz and profhz are all derived from one base interrupt rate. */ base = round_freq(timer, hz * singlemul); singlemul = max((base + hz / 2) / hz, 1); hz = (base + singlemul / 2) / singlemul; if (base <= 128) stathz = base; else { div = base / 128; if (div >= singlemul && (div % singlemul) == 0) div++; stathz = base / div; } profhz = stathz; while ((profhz + stathz) <= 128 * 64) profhz += stathz; profhz = round_freq(timer, profhz); } else { /* One-shot mode: each clock gets its own independently rounded rate. */ hz = round_freq(timer, hz); stathz = round_freq(timer, 127); profhz = round_freq(timer, stathz * 64); } tick = 1000000 / hz; FREQ2BT(hz, &hardperiod); FREQ2BT(stathz, &statperiod); FREQ2BT(profhz, &profperiod); ET_LOCK(); configtimer(1); ET_UNLOCK(); } /* * Start per-CPU event timers on APs. */ void cpu_initclocks_ap(void) { struct bintime now; struct pcpu_state *state; state = DPCPU_PTR(timerstate); binuptime(&now); ET_HW_LOCK(state); /* With a global periodic timer, take the time of the previous tick (nexttick - timerperiod) as this CPU's present time. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 && periodic) { state->now = nexttick; bintime_sub(&state->now, &timerperiod); } else state->now = now; hardclock_sync(curcpu); handleevents(&state->now, 2); if (timer->et_flags & ET_FLAGS_PERCPU) loadtimer(&now, 1); ET_HW_UNLOCK(state); } /* * Switch to profiling clock rates. * In periodic mode the timer is stopped and restarted so that setuptimer() * can pick a rate that covers profhz. */ void cpu_startprofclock(void) { ET_LOCK(); if (periodic) { configtimer(0); profiling = 1; configtimer(1); } else profiling = 1; ET_UNLOCK(); } /* * Switch to regular clock rates. */ void cpu_stopprofclock(void) { ET_LOCK(); if (periodic) { configtimer(0); profiling = 0; configtimer(1); } else profiling = 0; ET_UNLOCK(); } /* * Switch to idle mode (all ticks handled). 
*/ void cpu_idleclock(void) { struct bintime now, t; struct pcpu_state *state; if (idletick || busy || (periodic && (timer->et_flags & ET_FLAGS_PERCPU)) #ifdef DEVICE_POLLING || curcpu == CPU_FIRST() #endif ) return; state = DPCPU_PTR(timerstate); if (periodic) now = state->now; else binuptime(&now); CTR4(KTR_SPARE2, "idle at %d: now %d.%08x%08x", curcpu, now.sec, (u_int)(now.frac >> 32), (u_int)(now.frac & 0xffffffff)); /* Record the relaxed next-event time computed for an idle CPU and, in one-shot mode, reprogram the timer for it. */ getnextcpuevent(&t, 1); ET_HW_LOCK(state); state->idle = 1; state->nextevent = t; if (!periodic) loadtimer(&now, 0); ET_HW_UNLOCK(state); } /* * Switch to active mode (skip empty ticks). */ void cpu_activeclock(void) { struct bintime now; struct pcpu_state *state; struct thread *td; state = DPCPU_PTR(timerstate); if (state->idle == 0 || busy) return; if (periodic) now = state->now; else binuptime(&now); CTR4(KTR_SPARE2, "active at %d: now %d.%08x%08x", curcpu, now.sec, (u_int)(now.frac >> 32), (u_int)(now.frac & 0xffffffff)); /* Run the event handler as if from interrupt context: inside a spinlock section with the interrupt nesting level raised. */ spinlock_enter(); td = curthread; td->td_intr_nesting_level++; handleevents(&now, 1); td->td_intr_nesting_level--; spinlock_exit(); } #ifdef KDTRACE_HOOKS /* Sets this CPU's next cyclic event time to *t; if that is earlier than the current nextevent, nextevent is pulled in and a one-shot timer is reprogrammed. */ void clocksource_cyc_set(const struct bintime *t) { struct bintime now; struct pcpu_state *state; state = DPCPU_PTR(timerstate); if (periodic) now = state->now; else binuptime(&now); CTR4(KTR_SPARE2, "set_cyc at %d: now %d.%08x%08x", curcpu, now.sec, (u_int)(now.frac >> 32), (u_int)(now.frac & 0xffffffff)); CTR4(KTR_SPARE2, "set_cyc at %d: t %d.%08x%08x", curcpu, t->sec, (u_int)(t->frac >> 32), (u_int)(t->frac & 0xffffffff)); ET_HW_LOCK(state); if (bintime_cmp(t, &state->nextcyc, ==)) { ET_HW_UNLOCK(state); return; } state->nextcyc = *t; if (bintime_cmp(&state->nextcyc, &state->nextevent, >=)) { ET_HW_UNLOCK(state); return; } state->nextevent = state->nextcyc; if (!periodic) loadtimer(&now, 0); ET_HW_UNLOCK(state); } #endif /* Callback installed as callout_new_inserted: records bt as the next callout time of the given CPU and makes sure the timer fires early enough for it. */ static void cpu_new_callout(int cpu, struct bintime bt) { struct bintime now; struct pcpu_state *state; CTR5(KTR_SPARE2, "new co at %d: on %d at 
%d.%08x%08x", curcpu, cpu, (int)(bt.sec), (u_int)(bt.frac >> 32), (u_int)(bt.frac & 0xffffffff)); state = DPCPU_ID_PTR(cpu, timerstate); ET_HW_LOCK(state); /* If there is callout time already set earlier -- do nothing. */ if (state->nextcall.sec != -1 && bintime_cmp(&bt, &state->nextcall, >=)) { ET_HW_UNLOCK(state); return; } state->nextcall = bt; /* If there is some other event set earlier -- do nothing. */ if (bintime_cmp(&state->nextcall, &state->nextevent, >=)) { ET_HW_UNLOCK(state); return; } state->nextevent = state->nextcall; /* If timer is periodic -- there is nothing to reprogram. */ if (periodic) { ET_HW_UNLOCK(state); return; } /* If timer is global or of the current CPU -- reprogram it. */ if ((timer->et_flags & ET_FLAGS_PERCPU) == 0 || cpu == curcpu) { binuptime(&now); loadtimer(&now, 0); ET_HW_UNLOCK(state); return; } /* Otherwise make the other CPU reprogram it (doconfigtimer() consumes state->handle). */ state->handle = 1; ET_HW_UNLOCK(state); ipi_cpu(cpu, IPI_HARDCLOCK); } /* * Report or change the active event timers hardware. */ static int sysctl_kern_eventtimer_timer(SYSCTL_HANDLER_ARGS) { char buf[32]; struct eventtimer *et; int error; /* Snapshot the current timer name under ET_LOCK; the lock is dropped across sysctl_handle_string(). */ ET_LOCK(); et = timer; snprintf(buf, sizeof(buf), "%s", et->et_name); ET_UNLOCK(); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); ET_LOCK(); et = timer; /* Nothing to do on error, on a read-only request, or when the same timer was named again. */ if (error != 0 || req->newptr == NULL || strcasecmp(buf, et->et_name) == 0) { ET_UNLOCK(); return (error); } et = et_find(buf, 0, 0); if (et == NULL) { ET_UNLOCK(); return (ENOENT); } /* Swap timers: stop and release the old one, move the C3STOP deep-sleep reference over to the new one, then start it. */ configtimer(0); et_free(timer); if (et->et_flags & ET_FLAGS_C3STOP) cpu_disable_deep_sleep++; if (timer->et_flags & ET_FLAGS_C3STOP) cpu_disable_deep_sleep--; periodic = want_periodic; timer = et; et_init(timer, timercb, NULL, NULL); configtimer(1); ET_UNLOCK(); return (error); } SYSCTL_PROC(_kern_eventtimer, OID_AUTO, timer, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_eventtimer_timer, "A", "Chosen event timer"); /* * Report or change the active event timer periodicity. 
*/ static int sysctl_kern_eventtimer_periodic(SYSCTL_HANDLER_ARGS) { int error, val; val = periodic; error = sysctl_handle_int(oidp, &val, 0, req); /* Done on error or when no new value was supplied. */ if (error != 0 || req->newptr == NULL) return (error); /* Stop the timer, record the requested mode, and restart; configtimer(1) may still override periodic if the hardware lacks that mode. */ ET_LOCK(); configtimer(0); periodic = want_periodic = val; configtimer(1); ET_UNLOCK(); return (error); } SYSCTL_PROC(_kern_eventtimer, OID_AUTO, periodic, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_eventtimer_periodic, "I", "Enable event timer periodic mode"); Index: projects/calloutng/sys/kern/kern_timeout.c =================================================================== --- projects/calloutng/sys/kern/kern_timeout.c (revision 237575) +++ projects/calloutng/sys/kern/kern_timeout.c (revision 237576) @@ -1,1267 +1,1251 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_kdtrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif SDT_PROVIDER_DEFINE(callout_execute); SDT_PROBE_DEFINE(callout_execute, kernel, , callout_start, callout-start); SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_start, 0, "struct callout *"); SDT_PROBE_DEFINE(callout_execute, kernel, , callout_end, callout-end); SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_end, 0, "struct callout *"); static int avg_gcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0, "Average number of Giant callouts made per softclock call. Units = 1/1000"); static int avg_lockcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0, "Average number of lock callouts made per softclock call. Units = 1/1000"); static int avg_mpcalls; SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0, "Average number of MP callouts made per softclock call. Units = 1/1000"); /* * TODO: * allocate more timeout table slots when table overflows. 
*/ /* callwheelsize is a power of two and callwheelmask is callwheelsize - 1; both are computed in kern_timeout_callwheel_alloc(). */ int callwheelsize, callwheelmask; /* * The callout cpu migration entity represents the information necessary for * describing the migrating callout to the new callout cpu. * The cached information is very important for deferring migration when * the migrating callout is already running. */ struct cc_mig_ent { #ifdef SMP void (*ce_migration_func)(void *); void *ce_migration_arg; int ce_migration_cpu; struct bintime ce_migration_time; #endif }; /* * There is one struct callout_cpu per cpu, holding all relevant * state for the callout processing thread on the individual CPU. - * In particular: - * cc_ticks is incremented once per tick in callout_cpu(). - * It tracks the global 'ticks' but in a way that the individual - * threads should not worry about races in the order in which - * hardclock() and hardclock_cpu() run on the various CPUs. - * cc_softclock is advanced in callout_cpu() to point to the - * first entry in cc_callwheel that may need handling. In turn, - * a softclock() is scheduled so it can serve the various entries i - * such that cc_softclock <= i <= cc_ticks . - * XXX maybe cc_softclock and cc_ticks should be volatile ? - * - * cc_ticks is also used in callout_reset_cpu() to determine - * when the callout should be served. 
*/ struct callout_cpu { struct cc_mig_ent cc_migrating_entity; struct mtx cc_lock; struct callout *cc_callout; struct callout_tailq *cc_callwheel; + struct callout_tailq *cc_localexp; struct callout_list cc_callfree; struct callout *cc_next; struct callout *cc_curr; + struct bintime cc_firstevent; + struct bintime cc_lastscan; void *cc_cookie; - struct bintime cc_ticks; - struct bintime cc_softticks; int cc_cancel; int cc_waiting; - struct bintime cc_firsttick; - struct callout_tailq *cc_localexp; }; #ifdef SMP #define cc_migration_func cc_migrating_entity.ce_migration_func #define cc_migration_arg cc_migrating_entity.ce_migration_arg #define cc_migration_cpu cc_migrating_entity.ce_migration_cpu #define cc_migration_time cc_migrating_entity.ce_migration_time struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU #define CC_CPU(cpu) (&cc_cpu[(cpu)]) #define CC_SELF() CC_CPU(PCPU_GET(cpuid)) #else struct callout_cpu cc_cpu; #define CC_CPU(cpu) &cc_cpu #define CC_SELF() &cc_cpu #endif /* cc_lock is taken as a spin mutex; these wrap its lock, unlock and ownership-assert operations. */ #define CC_LOCK(cc) mtx_lock_spin(&(cc)->cc_lock) #define CC_UNLOCK(cc) mtx_unlock_spin(&(cc)->cc_lock) #define CC_LOCK_ASSERT(cc) mtx_assert(&(cc)->cc_lock, MA_OWNED) /* Converts a frequency to a bintime period: frac = (2^63 / freq) << 1, i.e. about 2^64 / freq without needing a 65-bit constant. */ #define FREQ2BT(freq, bt) \ { \ (bt)->sec = 0; \ (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ } static int timeout_cpu; /* Hook through which the callout code reports, per CPU, the time it next wants to be woken at; NULL until a handler is installed. */ void (*callout_new_inserted)(int cpu, struct bintime bt) = NULL; static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); /** * Locked by cc_lock: - * cc_curr - If a callout is in progress, it is curr_callout. - * If curr_callout is non-NULL, threads waiting in + * cc_curr - If a callout is in progress, it is cc_curr. + * If cc_curr is non-NULL, threads waiting in * callout_drain() will be woken up as soon as the * relevant callout completes. - * cc_cancel - Changing to 1 with both callout_lock and c_lock held + * cc_cancel - Changing to 1 with both callout_lock and cc_lock held * guarantees that the current callout will not run. 
* The softclock() function sets this to 0 before it * drops callout_lock to acquire c_lock, and it calls * the handler only if curr_cancelled is still 0 after - * c_lock is successfully acquired. + * cc_lock is successfully acquired. * cc_waiting - If a thread is waiting in callout_drain(), then * callout_wait is nonzero. Set only when - * curr_callout is non-NULL. + * cc_curr is non-NULL. */ /* * Resets the migration entity tied to a specific callout cpu. * A cc_migration_cpu of CPUBLOCK marks "no migration pending". * Does nothing on non-SMP kernels. */ static void cc_cme_cleanup(struct callout_cpu *cc) { #ifdef SMP cc->cc_migration_cpu = CPUBLOCK; cc->cc_migration_time.sec = 0; cc->cc_migration_time.frac = 0; cc->cc_migration_func = NULL; cc->cc_migration_arg = NULL; #endif } /* * Checks if migration is requested by a specific callout cpu. * Always returns 0 on non-SMP kernels. */ static int cc_cme_migrating(struct callout_cpu *cc) { #ifdef SMP return (cc->cc_migration_cpu != CPUBLOCK); #else return (0); #endif } /* * kern_timeout_callwheel_alloc() - kernel low level callwheel initialization * * This code is called very early in the kernel initialization sequence, * and may be called more than once. 
*/ caddr_t kern_timeout_callwheel_alloc(caddr_t v) { struct callout_cpu *cc; timeout_cpu = PCPU_GET(cpuid); cc = CC_CPU(timeout_cpu); /* * Calculate callout wheel size: the smallest power of two that is * not below ncallout, so callwheelmask can wrap bucket indices. */ callwheelsize = 1; while (callwheelsize < ncallout) callwheelsize <<= 1; callwheelmask = callwheelsize - 1; /* Lay out, in order, the callout array, the wheel buckets and the single expired-callout queue in the region at v; return the first address past them. */ cc->cc_callout = (struct callout *)v; v = (caddr_t)(cc->cc_callout + ncallout); cc->cc_callwheel = (struct callout_tailq *)v; v = (caddr_t)(cc->cc_callwheel + callwheelsize); cc->cc_localexp = (struct callout_tailq *)v; v = (caddr_t)(cc->cc_localexp + 1); return(v); } /* Initializes one callout_cpu: its lock, empty wheel buckets and expired queue, no pending migration and, when cc_callout is set, the free list of preallocated callouts. */ static void callout_cpu_init(struct callout_cpu *cc) { struct callout *c; int i; mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE); SLIST_INIT(&cc->cc_callfree); for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&cc->cc_callwheel[i]); } TAILQ_INIT(cc->cc_localexp); cc_cme_cleanup(cc); if (cc->cc_callout == NULL) return; for (i = 0; i < ncallout; i++) { c = &cc->cc_callout[i]; callout_init(c, 0); c->c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } - cc->cc_softticks.sec = 0; - cc->cc_softticks.frac = 0; } #ifdef SMP /* * Switches the cpu tied to a specific callout. * The function expects a locked incoming callout cpu and returns with * locked outgoing callout cpu. * While neither lock is held, c->c_cpu is CPUBLOCK so that callout_lock() * spins instead of locking a stale callout cpu. */ static struct callout_cpu * callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu) { struct callout_cpu *new_cc; MPASS(c != NULL && cc != NULL); CC_LOCK_ASSERT(cc); /* * Avoid interrupts and preemption firing after the callout cpu * is blocked in order to avoid deadlocks as the new thread * may be willing to acquire the callout cpu lock. */ c->c_cpu = CPUBLOCK; spinlock_enter(); CC_UNLOCK(cc); new_cc = CC_CPU(new_cpu); CC_LOCK(new_cc); spinlock_exit(); c->c_cpu = new_cpu; return (new_cc); } #endif /* * kern_timeout_callwheel_init() - initialize previously reserved callwheel * space. * * This code is called just once, after the space reserved for the * callout wheel has been finalized. 
*/ void kern_timeout_callwheel_init(void) { callout_cpu_init(CC_CPU(timeout_cpu)); } /* * Start standard softclock thread. * On SMP one software interrupt handler is registered per CPU, and the * wheel of every CPU other than timeout_cpu is allocated here. */ static void start_softclock(void *dummy) { struct callout_cpu *cc; #ifdef SMP int cpu; #endif cc = CC_CPU(timeout_cpu); if (swi_add(&clk_intr_event, "clock", softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); #ifdef SMP CPU_FOREACH(cpu) { if (cpu == timeout_cpu) continue; cc = CC_CPU(cpu); if (swi_add(NULL, "clock", softclock, cc, SWI_CLOCK, INTR_MPSAFE, &cc->cc_cookie)) panic("died while creating standard software ithreads"); cc->cc_callout = NULL; /* Only cpu0 handles timeout(). */ cc->cc_callwheel = malloc( sizeof(struct callout_tailq) * callwheelsize, M_CALLOUT, M_WAITOK); cc->cc_localexp = malloc( sizeof(struct callout_tailq), M_CALLOUT, M_WAITOK); callout_cpu_init(cc); } #endif } SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL); /* Maps a bintime to a wheel position: whole seconds scaled by 1024 plus the top 10 bits of the fraction, truncated to int. */ static inline int callout_hash(struct bintime *bt) { return (int) ((bt->sec<<10)+(bt->frac>>54)); } /* Bucket index for a given time: the hash wrapped by callwheelmask. */ static inline int get_bucket(struct bintime *bt) { return callout_hash(bt) & callwheelmask; } /* Scans this CPU's wheel from the previous scan time up to now, running direct callouts in place and queueing the rest for softclock(), then reports the next wanted wakeup time through callout_new_inserted. */ void -callout_tick(void) +callout_process(void) { struct bintime limit, max, min, next, now, tmp_max, tmp_min; struct callout *tmp; struct callout_cpu *cc; struct callout_tailq *sc; int cpu, first, future, last, need_softclock; /* * Process callouts at a very low cpu priority, so we don't keep the * relatively high clock interrupt priority any longer than necessary. */ need_softclock = 0; cc = CC_SELF(); mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); binuptime(&now); cpu = curcpu; - first = callout_hash(&cc->cc_softticks); + first = callout_hash(&cc->cc_lastscan); last = callout_hash(&now); /* * Check if we wrapped around the entire wheel from the last scan. * In case, we need to scan entirely the wheel for pending callouts. */ last = (last - first >= callwheelsize) ? 
(first - 1) & callwheelmask : last & callwheelmask; first &= callwheelmask; for (;;) { sc = &cc->cc_callwheel[first]; /* NOTE(review): entries are unlinked from sc inside this TAILQ_FOREACH, and direct callouts run their handler with cc_lock held while still linked; TAILQ_FOREACH_SAFE may be needed here -- confirm. */ TAILQ_FOREACH(tmp, sc, c_links.tqe) { if (bintime_cmp(&tmp->c_time, &now, <=)) { /* * Consumer told us the callout may be run * directly from hardware interrupt context. */ if (tmp->c_flags & CALLOUT_DIRECT) { tmp->c_func(tmp->c_arg); TAILQ_REMOVE(sc, tmp, c_links.tqe); tmp->c_flags &= ~CALLOUT_PENDING; } else { /* Hand the callout over to softclock() through the expired queue. */ TAILQ_INSERT_TAIL(cc->cc_localexp, tmp,c_staiter); TAILQ_REMOVE(sc, tmp, c_links.tqe); tmp->c_flags |= CALLOUT_PROCESSED; need_softclock = 1; } } } if (first == last) break; first = (first + 1) & callwheelmask; } future = ((last + hz/4) & callwheelmask); /* INT_MAX in max/min serves as the "nothing found yet" marker tested after the loop. */ max.sec = max.frac = INT_MAX; min.sec = min.frac = INT_MAX; /* NOTE(review): limit is computed here but not read anywhere later in this function -- confirm whether it is still needed. */ limit.sec = 0; limit.frac = (uint64_t)1 << (64 - 2); bintime_add(&limit, &now); /* * Look for the first bucket in the future that contains some event, * up to some point, so that we can look for aggregation. */ for (;;) { sc = &cc->cc_callwheel[last]; TAILQ_FOREACH(tmp, sc, c_links.tqe) { tmp_max = tmp_min = tmp->c_time; bintime_add(&tmp_max, &tmp->c_precision); bintime_sub(&tmp_min, &tmp->c_precision); /* * This is the first event we're going to process or * event maximal time is less than present minimal. * In both cases, take it. */ if (bintime_cmp(&tmp_max, &min, <)) { max = tmp_max; min = tmp_min; continue; } /* * Event minimal time is bigger than present maximal * time, so it cannot be aggregated. */ if (bintime_cmp(&tmp_min, &max, >)) continue; /* * If neither of the two previous happened, just take * the intersection of events. * NOTE(review): the code keeps the larger of the two max * values, which widens the window instead of intersecting * it -- confirm the intent. */ min = (bintime_cmp(&tmp_min, &min, >)) ? tmp_min : min; max = (bintime_cmp(&tmp_max, &max, >)) ? 
tmp_max : max; } if (last == future || (max.sec != INT_MAX && min.sec != INT_MAX)) break; last = (last + 1) & callwheelmask; } /* Nothing found in the look-ahead window: ask to be woken a quarter of a second (frac = 2^62) from now. */ if (max.sec == INT_MAX && min.sec == INT_MAX) { next.sec = 0; next.frac = (uint64_t)1 << (64 - 2); bintime_add(&next, &now); } /* * Now that we found something to aggregate, schedule an interrupt in * the middle of the previously calculated range. * next = (max + min) / 2, done as a right shift across sec and frac. */ else { bintime_add(&max, &min); next = max; next.frac >>= 1; if (next.sec & 1) next.frac |= ((uint64_t)1 << 63); next.sec >>= 1; } - cc->cc_firsttick = next; + cc->cc_firstevent = next; if (callout_new_inserted != NULL) (*callout_new_inserted)(cpu, next); - cc->cc_softticks = now; + cc->cc_lastscan = now; mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. */ if (need_softclock) { swi_sched(cc->cc_cookie, 0); } } /* Locks and returns the callout_cpu that currently owns c, retrying when c changes CPU (or is mid-migration, c_cpu == CPUBLOCK) while the lock is being taken. */ static struct callout_cpu * callout_lock(struct callout *c) { struct callout_cpu *cc; int cpu; for (;;) { cpu = c->c_cpu; #ifdef SMP if (cpu == CPUBLOCK) { while (c->c_cpu == CPUBLOCK) cpu_spinwait(); continue; } #endif cc = CC_CPU(cpu); CC_LOCK(cc); if (cpu == c->c_cpu) break; CC_UNLOCK(cc); } return (cc); } /* Arms c on the wheel of cc (which must be locked) to call func(arg) at to_bintime, and notifies the event timer code when this becomes the earliest known event. */ static void callout_cc_add(struct callout *c, struct callout_cpu *cc, struct bintime to_bintime, void (*func)(void *), void *arg, int cpu, int flags) { struct timeval tv; int bucket; CC_LOCK_ASSERT(cc); /* Never schedule before the time of the last wheel scan. */ - if (bintime_cmp(&to_bintime, &cc->cc_softticks, <)) { - to_bintime = cc->cc_softticks; + if (bintime_cmp(&to_bintime, &cc->cc_lastscan, <)) { + to_bintime = cc->cc_lastscan; } c->c_arg = arg; c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); if (flags & C_DIRECT_EXEC) c->c_flags |= CALLOUT_DIRECT; c->c_flags &= ~CALLOUT_PROCESSED; c->c_func = func; c->c_time = to_bintime; /* Translate the precision flag into the tolerance stored in c_precision (10 us, 100 us, 1 ms, or none). */ tv.tv_sec = 0; if (flags & C_10US) { tv.tv_usec = 10; timeval2bintime(&tv, &c->c_precision); } else if (flags & C_100US) { tv.tv_usec = 100; timeval2bintime(&tv, &c->c_precision); } else if (flags & 
C_1MS) { tv.tv_usec = 1000; timeval2bintime(&tv, &c->c_precision); } else { c->c_precision.sec = 0; c->c_precision.frac = 0; } bucket = get_bucket(&c->c_time); TAILQ_INSERT_TAIL(&cc->cc_callwheel[bucket & callwheelmask], c, c_links.tqe); /* * Inform the eventtimers(4) subsystem there's a new callout * that has been inserted. */ if (callout_new_inserted != NULL && - (bintime_cmp(&to_bintime, &cc->cc_firsttick, <) || - (cc->cc_firsttick.sec == 0 && cc->cc_firsttick.frac == 0))) { - cc->cc_firsttick = to_bintime; + (bintime_cmp(&to_bintime, &cc->cc_firstevent, <) || + (cc->cc_firstevent.sec == 0 && cc->cc_firstevent.frac == 0))) { + cc->cc_firstevent = to_bintime; (*callout_new_inserted)(cpu, to_bintime); } } /* Steps the softclock cursor (cc_next) past c if it points at it and, for preallocated (CALLOUT_LOCAL_ALLOC) callouts, returns c to the free list. */ static void callout_cc_del(struct callout *c, struct callout_cpu *cc) { if (cc->cc_next == c) cc->cc_next = TAILQ_NEXT(c, c_staiter); if (c->c_flags & CALLOUT_LOCAL_ALLOC) { c->c_func = NULL; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } } /* Runs one expired callout: takes its lock (if any), calls the handler with cc_lock dropped, then wakes drainers or performs a deferred migration. Entered and left with cc_lock held; returns cc->cc_next. */ static struct callout * softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls, int *lockcalls, int *gcalls) { void (*c_func)(void *); void *c_arg; struct lock_class *class; struct lock_object *c_lock; int c_flags, sharedlock; #ifdef SMP struct callout_cpu *new_cc; void (*new_func)(void *); void *new_arg; int new_cpu; struct bintime new_time; #endif #ifdef DIAGNOSTIC struct bintime bt1, bt2; struct timespec ts2; static uint64_t maxdt = 36893488147419102LL; /* 2 msec */ static timeout_t *lastfunc; #endif cc->cc_next = TAILQ_NEXT(c, c_staiter); class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL; sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 
0 : 1; /* Cache the fields needed after cc_lock is dropped. */ c_lock = c->c_lock; c_func = c->c_func; c_arg = c->c_arg; c_flags = c->c_flags; if (c->c_flags & CALLOUT_LOCAL_ALLOC) c->c_flags = CALLOUT_LOCAL_ALLOC; else c->c_flags &= ~CALLOUT_PENDING; cc->cc_curr = c; cc->cc_cancel = 0; CC_UNLOCK(cc); if (c_lock != NULL) { class->lc_lock(c_lock, sharedlock); /* * The callout may have been cancelled * while we switched locks. */ if (cc->cc_cancel) { class->lc_unlock(c_lock); goto skip; } /* The callout cannot be stopped now. */ cc->cc_cancel = 1; if (c_lock == &Giant.lock_object) { (*gcalls)++; CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } else { (*lockcalls)++; CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p", c, c_func, c_arg); } } else { (*mpcalls)++; CTR3(KTR_CALLOUT, "callout mpsafe %p func %p arg %p", c, c_func, c_arg); } #ifdef DIAGNOSTIC binuptime(&bt1); #endif THREAD_NO_SLEEPING(); SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0); c_func(c_arg); SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0); THREAD_SLEEPING_OK(); #ifdef DIAGNOSTIC /* Report handlers that ran longer than the largest duration seen so far (initially 2 msec). */ binuptime(&bt2); bintime_sub(&bt2, &bt1); if (bt2.frac > maxdt) { if (lastfunc != c_func || bt2.frac > maxdt * 2) { bintime2timespec(&bt2, &ts2); printf( "Expensive timeout(9) function: %p(%p) %jd.%09ld s\n", c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec); } maxdt = bt2.frac; lastfunc = c_func; } #endif CTR1(KTR_CALLOUT, "callout %p finished", c); /* NOTE(review): class is NULL when c_lock is NULL, so this relies on lock-less callouts always carrying CALLOUT_RETURNUNLOCKED -- confirm. */ if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0) class->lc_unlock(c_lock); skip: CC_LOCK(cc); /* * If the current callout is locally allocated (from * timeout(9)) then put it on the freelist. * * Note: we need to check the cached copy of c_flags because * if it was not local, then it's not safe to deref the * callout pointer. 
*/ if (c_flags & CALLOUT_LOCAL_ALLOC) { KASSERT(c->c_flags == CALLOUT_LOCAL_ALLOC, ("corrupted callout")); c->c_func = NULL; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } cc->cc_curr = NULL; if (cc->cc_waiting) { /* * There is someone waiting for the * callout to complete. * If the callout was scheduled for * migration just cancel it. */ if (cc_cme_migrating(cc)) cc_cme_cleanup(cc); cc->cc_waiting = 0; CC_UNLOCK(cc); wakeup(&cc->cc_waiting); CC_LOCK(cc); } else if (cc_cme_migrating(cc)) { #ifdef SMP /* * If the callout was scheduled for * migration just perform it now. */ new_cpu = cc->cc_migration_cpu; new_time = cc->cc_migration_time; new_func = cc->cc_migration_func; new_arg = cc->cc_migration_arg; cc_cme_cleanup(cc); /* * Handle deferred callout stops */ if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) { CTR3(KTR_CALLOUT, "deferred cancelled %p func %p arg %p", c, new_func, new_arg); callout_cc_del(c, cc); goto nextc; } c->c_flags &= ~CALLOUT_DFRMIGRATION; /* * It should be asserted here that the * callout is not destroyed but that * is not easy. */ new_cc = callout_cpu_switch(c, cc, new_cpu); callout_cc_add(c, new_cc, new_time, new_func, new_arg, new_cpu, 0); CC_UNLOCK(new_cc); CC_LOCK(cc); #else panic("migration should not happen"); #endif } #ifdef SMP nextc: #endif return (cc->cc_next); } /* * The callout mechanism is based on the work of Adam M. Costello and * George Varghese, published in a technical report entitled "Redesigning * the BSD Callout and Timer Facilities" and modified slightly for inclusion * in FreeBSD by Justin T. Gibbs. The original work on the data structures * used in this implementation was published by G. Varghese and T. Lauck in * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for * the Efficient Implementation of a Timer Facility" in the Proceedings of * the 11th ACM Annual Symposium on Operating Systems Principles, * Austin, Texas Nov 1987. */ /* * Software (low priority) clock interrupt. 
* Run periodic events from timeout queue. */ /* softclock(): arg is the struct callout_cpu to service. With the cc lock held, each callout on cc->cc_localexp is unlinked and passed to softclock_call_cc(), whose return value becomes the next callout to examine. When the list is exhausted the per-run counters are folded into the avg_* statistics and cc_next is cleared. */ void softclock(void *arg) { struct callout_cpu *cc; struct callout *c; int steps; /* #steps since we last allowed interrupts */ int mpcalls; /* callouts run with no lock (c_lock == NULL) */ int lockcalls; /* callouts run under a lock other than Giant */ int gcalls; /* callouts run under Giant */ #ifndef MAX_SOFTCLOCK_STEPS #define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */ #endif /* MAX_SOFTCLOCK_STEPS */ mpcalls = 0; lockcalls = 0; gcalls = 0; steps = 0; cc = (struct callout_cpu *)arg; CC_LOCK(cc); c = TAILQ_FIRST(cc->cc_localexp); while (c != NULL) { ++steps; /* NOTE(review): steps is reset to 0 on both paths below, so it is always 1 here and this test only succeeds if MAX_SOFTCLOCK_STEPS <= 1. */ if (steps >= MAX_SOFTCLOCK_STEPS) { cc->cc_next = c; /* Give interrupts a chance. */ CC_UNLOCK(cc); ; /* nothing */ CC_LOCK(cc); c = cc->cc_next; steps = 0; } else { TAILQ_REMOVE(cc->cc_localexp, c, c_staiter); c = softclock_call_cc(c, cc, &mpcalls, &lockcalls, &gcalls); steps = 0; } } /* Running averages: each one moves 1/256 of the way toward this run's count scaled by 1000. */ avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8; avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8; avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8; cc->cc_next = NULL; CC_UNLOCK(cc); } /* * timeout -- * Execute a function after a specified length of time. * * untimeout -- * Cancel previous timeout function call. * * callout_handle_init -- * Initialize a handle so that using it with untimeout is benign. * * See AT&T BCI Driver Reference Manual for specification. This * implementation differs from that one in that although an * identification value is returned from timeout, the original * arguments to timeout as well as the identifier are used to * identify entries for untimeout. */ struct callout_handle timeout(ftn, arg, to_ticks) timeout_t *ftn; void *arg; int to_ticks; { struct callout_cpu *cc; struct callout *new; struct callout_handle handle; /* Preallocated callouts are taken from the cc_callfree list of the callout CPU selected by timeout_cpu. */ cc = CC_CPU(timeout_cpu); CC_LOCK(cc); /* Fill in the next free callout structure.
*/ new = SLIST_FIRST(&cc->cc_callfree); if (new == NULL) /* XXX Attempt to malloc first */ panic("timeout table full"); SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle); /* Arm the callout to call ftn(arg) after to_ticks; the returned handle simply wraps the callout pointer. */ callout_reset(new, to_ticks, ftn, arg); handle.callout = new; CC_UNLOCK(cc); return (handle); } /* untimeout(): cancel the callout referenced by handle, provided it still carries the same function and argument that the caller passes in. */ void untimeout(ftn, arg, handle) timeout_t *ftn; void *arg; struct callout_handle handle; { struct callout_cpu *cc; /* * Check for a handle that was initialized * by callout_handle_init, but never used * for a real timeout. */ if (handle.callout == NULL) return; cc = callout_lock(handle.callout); /* Only stop the callout if it still belongs to this (ftn, arg) pair. */ if (handle.callout->c_func == ftn && handle.callout->c_arg == arg) callout_stop(handle.callout); CC_UNLOCK(cc); } /* callout_handle_init(): mark the handle as unused, so that untimeout() returns without touching anything. */ void callout_handle_init(struct callout_handle *handle) { handle->callout = NULL; } /* * New interface; clients allocate their own callout structures. * * callout_reset() - establish or change a timeout * callout_stop() - disestablish a timeout * callout_init() - initialize a callout structure so that it can * safely be passed to callout_reset() and callout_stop() * * <sys/callout.h> defines three convenience macros: * * callout_active() - returns truth if callout has not been stopped, * drained, or deactivated since the last time the callout was * reset. * callout_pending() - returns truth if callout is still waiting for timeout * callout_deactivate() - marks the callout as having been serviced */ /* _callout_reset_on(): arm or re-arm callout c so that ftn(arg) is called on the given cpu. If bt is NULL the expiry time is relative (to_ticks from now); otherwise *bt is used as given and to_ticks is ignored. On the non-deferred path flags is handed to callout_cc_add(). Returns nonzero if a pending invocation, or a current one that had not started yet, was cancelled; 0 otherwise. */ int _callout_reset_on(struct callout *c, struct bintime *bt, int to_ticks, void (*ftn)(void *), void *arg, int cpu, int flags) { struct bintime now, to_bt; struct callout_cpu *cc; int cancelled = 0; int bucket; if (bt == NULL) { /* Relative timeout: to_ticks intervals of FREQ2BT(hz) added to the current getbinuptime() value. */ FREQ2BT(hz,&to_bt); getbinuptime(&now); bintime_mul(&to_bt,to_ticks); bintime_add(&to_bt,&now); } else to_bt = *bt; /* * Don't allow migration of pre-allocated callouts lest they * become unbalanced. */ if (c->c_flags & CALLOUT_LOCAL_ALLOC) cpu = c->c_cpu; cc = callout_lock(c); if (cc->cc_curr == c) { /* * We're being asked to reschedule a callout which is * currently in progress.
If there is a lock then we * can cancel the callout if it has not really started. */ if (c->c_lock != NULL && !cc->cc_cancel) cancelled = cc->cc_cancel = 1; if (cc->cc_waiting) { /* * Someone has called callout_drain to kill this * callout. Don't reschedule. */ CTR4(KTR_CALLOUT, "%s %p func %p arg %p", cancelled ? "cancelled" : "failed to cancel", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (cancelled); } } /* A pending callout is unlinked from wherever it currently sits: its callwheel bucket when CALLOUT_PROCESSED is clear, the cc_localexp list otherwise. cc_next is advanced first if it points at c. */ if (c->c_flags & CALLOUT_PENDING) { if ((c->c_flags & CALLOUT_PROCESSED) == 0) { if (cc->cc_next == c) cc->cc_next = TAILQ_NEXT(c, c_links.tqe); bucket = get_bucket(&c->c_time); TAILQ_REMOVE(&cc->cc_callwheel[bucket], c, c_links.tqe); } else { if (cc->cc_next == c) cc->cc_next = TAILQ_NEXT(c, c_staiter); TAILQ_REMOVE(cc->cc_localexp, c, c_staiter); } cancelled = 1; c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); } #ifdef SMP /* * If the callout must migrate try to perform it immediately. * If the callout is currently running, just defer the migration * to a more appropriate moment. */ if (c->c_cpu != cpu) { if (cc->cc_curr == c) { /* Record the request in the cc_migration_* fields; softclock_call_cc() re-adds the callout on the new CPU once the running invocation is done. */ cc->cc_migration_cpu = cpu; cc->cc_migration_time = to_bt; cc->cc_migration_func = ftn; cc->cc_migration_arg = arg; c->c_flags |= CALLOUT_DFRMIGRATION; CTR6(KTR_CALLOUT, "migration of %p func %p arg %p in %d.%08x to %u deferred", c, c->c_func, c->c_arg, (int)(to_bt.sec), (u_int)(to_bt.frac >> 32), cpu); CC_UNLOCK(cc); return (cancelled); } cc = callout_cpu_switch(c, cc, cpu); } #endif callout_cc_add(c, cc, to_bt, ftn, arg, cpu, flags); CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_bt.sec), (u_int)(to_bt.frac >> 32)); CC_UNLOCK(cc); return (cancelled); } /* * Common idioms that can be optimized in the future.
*/ /* callout_schedule_on(): re-arm c on the given cpu, to_ticks from now, reusing the function and argument already stored in the callout. */ int callout_schedule_on(struct callout *c, int to_ticks, int cpu) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, cpu); } /* callout_schedule(): same as callout_schedule_on(), keeping the callout on its current c_cpu. */ int callout_schedule(struct callout *c, int to_ticks) { return callout_reset_on(c, to_ticks, c->c_func, c->c_arg, c->c_cpu); } /* _callout_stop_safe(c, safe): stop callout c. With safe nonzero the function sleeps until a currently running invocation of c has finished. Returns 1 if a pending callout was unlinked, if an invocation still waiting for its lock was cancelled, or if a deferred migration was dropped; returns 0 otherwise, including after waiting for a running invocation. */ int _callout_stop_safe(c, safe) struct callout *c; int safe; { struct callout_cpu *cc, *old_cc; struct lock_class *class; int use_lock, sq_locked, bucket; /* * Some old subsystems don't hold Giant while running a callout_stop(), * so just discard this check for the moment. */ /* use_lock ends up nonzero only for a non-draining stop whose caller holds the callout's lock; for Giant that is checked with mtx_owned(), for any other lock it is asserted. */ if (!safe && c->c_lock != NULL) { if (c->c_lock == &Giant.lock_object) use_lock = mtx_owned(&Giant); else { use_lock = 1; class = LOCK_CLASS(c->c_lock); class->lc_assert(c->c_lock, LA_XLOCKED); } } else use_lock = 0; sq_locked = 0; old_cc = NULL; again: cc = callout_lock(c); /* * If the callout was migrating while the callout cpu lock was * dropped, just drop the sleepqueue lock and check the states * again. */ if (sq_locked != 0 && cc != old_cc) { #ifdef SMP CC_UNLOCK(cc); sleepq_release(&old_cc->cc_waiting); sq_locked = 0; old_cc = NULL; goto again; #else panic("migration should not happen"); #endif } /* * If the callout isn't pending, it's not on the queue, so * don't attempt to remove it from the queue. We can try to * stop it by other means however. */ if (!(c->c_flags & CALLOUT_PENDING)) { c->c_flags &= ~CALLOUT_ACTIVE; /* * If it wasn't on the queue and it isn't the current * callout, then we can't stop it, so just bail. */ if (cc->cc_curr != c) { CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); if (sq_locked) sleepq_release(&cc->cc_waiting); return (0); } if (safe) { /* * The current callout is running (or just * about to run) and blocking is allowed, so * just wait for the current invocation to * finish. */ while (cc->cc_curr == c) { /* * Use direct calls to sleepqueue interface * instead of cv/msleep in order to avoid * a LOR between cc_lock and sleepqueue * chain spinlocks.
This piece of code * emulates a msleep_spin() call actually. * * If we already have the sleepqueue chain * locked, then we can safely block. If we * don't already have it locked, however, * we have to drop the cc_lock to lock * it. This opens several races, so we * restart at the beginning once we have * both locks. If nothing has changed, then * we will end up back here with sq_locked * set. */ if (!sq_locked) { CC_UNLOCK(cc); sleepq_lock(&cc->cc_waiting); sq_locked = 1; old_cc = cc; goto again; } /* * Migration could be cancelled here, but * as long as it is still not sure when it * will be packed up, just let softclock() * take care of it. */ cc->cc_waiting = 1; /* cc_waiting makes softclock_call_cc() issue wakeup(&cc->cc_waiting) once the running callout has completed. */ DROP_GIANT(); CC_UNLOCK(cc); sleepq_add(&cc->cc_waiting, &cc->cc_lock.lock_object, "codrain", SLEEPQ_SLEEP, 0); sleepq_wait(&cc->cc_waiting, 0); sq_locked = 0; old_cc = NULL; /* Reacquire locks previously released. */ PICKUP_GIANT(); CC_LOCK(cc); } } else if (use_lock && !cc->cc_cancel) { /* * The current callout is waiting for its * lock which we hold. Cancel the callout * and return. After our caller drops the * lock, the callout will be skipped in * softclock().
*/ cc->cc_cancel = 1; CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); KASSERT(!cc_cme_migrating(cc), ("callout wrongly scheduled for migration")); CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain locked")); return (1); } else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) { /* With CALLOUT_DFRMIGRATION cleared, softclock_call_cc() drops the deferred migration instead of re-adding the callout on the new CPU. */ c->c_flags &= ~CALLOUT_DFRMIGRATION; CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); return (1); } CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", c, c->c_func, c->c_arg); CC_UNLOCK(cc); KASSERT(!sq_locked, ("sleepqueue chain still locked")); return (0); } if (sq_locked) sleepq_release(&cc->cc_waiting); /* The callout is still pending: unlink it from its callwheel bucket (CALLOUT_PROCESSED clear) or from the cc_localexp list (CALLOUT_PROCESSED set). */ c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); if ((c->c_flags & CALLOUT_PROCESSED) == 0) { bucket = get_bucket(&c->c_time); TAILQ_REMOVE(&cc->cc_callwheel[bucket], c, c_links.tqe); } else TAILQ_REMOVE(cc->cc_localexp, c, c_staiter); callout_cc_del(c, cc); CC_UNLOCK(cc); return (1); } /* callout_init(): zero the callout and bind it either to Giant (mpsafe == 0) or to no lock at all (mpsafe != 0, in which case CALLOUT_RETURNUNLOCKED is set so that no unlock is attempted after the handler returns). The callout starts out on timeout_cpu. */ void callout_init(c, mpsafe) struct callout *c; int mpsafe; { bzero(c, sizeof *c); if (mpsafe) { c->c_lock = NULL; c->c_flags = CALLOUT_RETURNUNLOCKED; } else { c->c_lock = &Giant.lock_object; c->c_flags = 0; } c->c_cpu = timeout_cpu; } /* _callout_init_lock(): zero the callout and associate it with lock. Only CALLOUT_RETURNUNLOCKED and CALLOUT_SHAREDLOCK are accepted in flags; the assertions also reject CALLOUT_RETURNUNLOCKED without a lock and any lock whose class has LC_SPINLOCK or LC_SLEEPABLE set. */ void _callout_init_lock(c, lock, flags) struct callout *c; struct lock_object *lock; int flags; { bzero(c, sizeof *c); c->c_lock = lock; KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, ("callout_init_lock: bad flags %d", flags)); KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock")); KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags & (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class", __func__)); c->c_flags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK); c->c_cpu = timeout_cpu; } #ifdef APM_FIXUP_CALLTODO /* * Adjust the kernel calltodo timeout list.
This routine is used after * an APM resume to recalculate the calltodo timer list values with the * number of hz's we have been sleeping. The next hardclock() will detect * that there are fired timers and run softclock() to execute them. * * Please note, I have not done an exhaustive analysis of what code this * might break. I am motivated to have my select()'s and alarm()'s that * have expired during suspend firing upon resume so that the applications * which set the timer can do the maintenance the timer was for as close * as possible to the originally intended time. Testing this code for a * week showed that resuming from a suspend resulted in 22 to 25 timers * firing, which seemed independent of whether the suspend was 2 hours or * 2 days. Your mileage may vary. - Ken Key */ void adjust_timeout_calltodo(time_change) struct timeval *time_change; { register struct callout *p; unsigned long delta_ticks; /* NOTE(review): neither cc nor calltodo is declared in this function, and c_time is treated as a plain tick count here while the rest of the file passes &c->c_time to get_bucket(); confirm that this APM_FIXUP_CALLTODO code still builds. */ /* * How many ticks were we asleep? * (stolen from tvtohz()). */ /* Don't do anything */ /* A negative tv_sec returns immediately; otherwise the interval is converted to ticks, rounded up plus one, and saturated first at LONG_MAX and then at INT_MAX. */ if (time_change->tv_sec < 0) return; else if (time_change->tv_sec <= LONG_MAX / 1000000) delta_ticks = (time_change->tv_sec * 1000000 + time_change->tv_usec + (tick - 1)) / tick + 1; else if (time_change->tv_sec <= LONG_MAX / hz) delta_ticks = time_change->tv_sec * hz + (time_change->tv_usec + (tick - 1)) / tick + 1; else delta_ticks = LONG_MAX; if (delta_ticks > INT_MAX) delta_ticks = INT_MAX; /* * Now rip through the timer calltodo list looking for timers * to expire.
*/ /* don't collide with softclock() */ CC_LOCK(cc); for (p = calltodo.c_next; p != NULL; p = p->c_next) { p->c_time -= delta_ticks; /* Break if the timer had more time on it than delta_ticks */ if (p->c_time > 0) break; /* take back the ticks the timer didn't use (p->c_time <= 0) */ delta_ticks = -p->c_time; } CC_UNLOCK(cc); return; } #endif /* APM_FIXUP_CALLTODO */ Index: projects/calloutng/sys/sys/callout.h =================================================================== --- projects/calloutng/sys/sys/callout.h (revision 237575) +++ projects/calloutng/sys/sys/callout.h (revision 237576) @@ -1,102 +1,102 @@ /*- * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)callout.h 8.2 (Berkeley) 1/21/94 * $FreeBSD$ */ #ifndef _SYS_CALLOUT_H_ #define _SYS_CALLOUT_H_ #include #define CALLOUT_LOCAL_ALLOC 0x0001 /* was allocated from callfree */ #define CALLOUT_ACTIVE 0x0002 /* callout is currently active */ #define CALLOUT_PENDING 0x0004 /* callout is waiting for timeout */ #define CALLOUT_MPSAFE 0x0008 /* callout handler is mp safe */ #define CALLOUT_RETURNUNLOCKED 0x0010 /* handler returns with mtx unlocked */ #define CALLOUT_SHAREDLOCK 0x0020 /* callout lock held in shared mode */ #define CALLOUT_DFRMIGRATION 0x0040 /* callout in deferred migration mode */ #define CALLOUT_PROCESSED 0x0080 /* set: callout is on the processing list; clear: it is in the callwheel */ #define CALLOUT_DIRECT 0x1000 /* allow exec from hw int context */ #define C_DIRECT_EXEC 0x0001 /* direct execution of callout */ #define C_10US 0x0002 /* precision field */ #define C_100US 0x0004 /* precision field */ #define C_1MS 0x0008 /* precision field */ /* Opaque handle returned by timeout(); it only wraps the callout pointer. */ struct callout_handle { struct callout *callout; }; #ifdef _KERNEL extern int ncallout; #define callout_active(c) ((c)->c_flags & CALLOUT_ACTIVE) #define callout_deactivate(c) ((c)->c_flags &= ~CALLOUT_ACTIVE) /* callout_drain() and callout_stop() both expand to _callout_stop_safe(), with the safe argument set to 1 and 0 respectively. */ #define callout_drain(c) _callout_stop_safe(c, 1) void callout_init(struct callout *, int); void _callout_init_lock(struct callout *, struct lock_object *, int); #define callout_init_mtx(c, mtx, flags) \ _callout_init_lock((c), ((mtx) != NULL) ?
&(mtx)->lock_object : \ NULL, (flags)) #define callout_init_rw(c, rw, flags) \ _callout_init_lock((c), ((rw) != NULL) ? &(rw)->lock_object : \ NULL, (flags)) #define callout_pending(c) ((c)->c_flags & CALLOUT_PENDING) /* _callout_reset_on(c, bt, to_ticks, fn, arg, cpu, flags): the callout_reset*() macros below supply the arguments they do not take themselves (a NULL bt for the tick-based forms, 0 ticks for the bintime form, 0 flags where none are given). */ int _callout_reset_on(struct callout *, struct bintime *, int, void (*)(void *), void *, int, int); #define callout_reset_on(c, to_ticks, fn, arg, cpu) \ _callout_reset_on((c), (NULL), (to_ticks), (fn), (arg), (cpu), \ (0)) #define callout_reset_flags_on(c, to_ticks, fn, arg, cpu, flags) \ _callout_reset_on((c), (NULL), (to_ticks), (fn), (arg), (cpu), \ (flags)) #define callout_reset_bt_on(c, bt, fn, arg, cpu, flags) \ _callout_reset_on((c), (bt), (0), (fn), (arg), (cpu), (flags)) #define callout_reset(c, on_tick, fn, arg) \ callout_reset_on((c), (on_tick), (fn), (arg), (c)->c_cpu) #define callout_reset_curcpu(c, on_tick, fn, arg) \ callout_reset_on((c), (on_tick), (fn), (arg), PCPU_GET(cpuid)) int callout_schedule(struct callout *, int); int callout_schedule_on(struct callout *, int, int); #define callout_schedule_curcpu(c, on_tick) \ callout_schedule_on((c), (on_tick), PCPU_GET(cpuid)) #define callout_stop(c) _callout_stop_safe(c, 0) int _callout_stop_safe(struct callout *, int); -void callout_tick(void); +void callout_process(void); extern void (*callout_new_inserted)(int cpu, struct bintime bt); #endif #endif /* _SYS_CALLOUT_H_ */