Index: share/man/man9/Makefile =================================================================== --- share/man/man9/Makefile +++ share/man/man9/Makefile @@ -1766,6 +1766,7 @@ timeout.9 callout_schedule_sbt_curcpu.9 \ timeout.9 callout_schedule_sbt_on.9 \ timeout.9 callout_stop.9 \ + timeout.9 callout_when.9 \ timeout.9 untimeout.9 MLINKS+=ucred.9 cred_update_thread.9 \ ucred.9 crcopy.9 \ Index: share/man/man9/timeout.9 =================================================================== --- share/man/man9/timeout.9 +++ share/man/man9/timeout.9 @@ -29,7 +29,7 @@ .\" .\" $FreeBSD$ .\" -.Dd July 4, 2016 +.Dd July 27, 2016 .Dt TIMEOUT 9 .Os .Sh NAME @@ -56,6 +56,7 @@ .Nm callout_schedule_sbt_curcpu , .Nm callout_schedule_sbt_on , .Nm callout_stop , +.Nm callout_when , .Nm timeout , .Nm untimeout .Nd execute a function after a specified length of time @@ -122,6 +123,9 @@ "sbintime_t pr" "int cpu" "int flags" .Ft int .Fn callout_stop "struct callout *c" +.Ft void +.Fn callout_when "sbintime_t sbt" "sbintime_t precision" "int flags" \ +"sbintime_t *sbt_res" "sbintime_t *precision_res" .Ft struct callout_handle .Fn timeout "timeout_t *func" "void *arg" "int ticks" .Ft void @@ -387,6 +391,25 @@ Smaller values .Pq which result in larger time intervals allow the callout subsystem to aggregate more events in one timer interrupt. +.It Dv C_PRECALC +The +.Fa sbt +argument specifies the absolute time at which the callout should be run, +and the +.Fa pr +argument specifies the requested precision, which will not be adjusted. +The +.Fa sbt +and +.Fa pr +values should be calculated by an earlier call to +.Fn callout_when +which uses the user-supplied +.Fa sbt , +.Fa pr , +and +.Fa flags +values. .It Dv C_HARDCLOCK Align the timeouts to .Fn hardclock @@ -503,6 +526,39 @@ .Em does not clear it when a callout expires normally via the execution of the callout function.
+.Pp +The +.Fn callout_when +function may be used to pre-calculate the absolute time at which the +timeout should be run and the precision of the scheduled run time +according to the required time +.Fa sbt , +precision +.Fa precision , +and additional adjustments requested by the +.Fa flags +argument. +Flags accepted by the +.Fn callout_when +function are the same as flags for the +.Fn callout_reset +function. +The resulting time is assigned to the variable pointed to by the +.Fa sbt_res +argument, and the resulting precision is assigned to +.Fa *precision_res . +When passing the results to +.Fn callout_reset , +add the +.Dv C_PRECALC +flag to +.Fa flags , +to avoid incorrect re-adjustment. +The function is intended for situations where the precise time of the callout +run should be known in advance, since +trying to read this time from the callout structure itself after a +.Fn callout_reset +call is racy. .Ss "Avoiding Race Conditions" The callout subsystem invokes callout functions from its own thread context. Index: sys/ddb/db_ps.c =================================================================== --- sys/ddb/db_ps.c +++ sys/ddb/db_ps.c @@ -375,8 +375,13 @@ db_printf(" lock: %s turnstile: %p\n", td->td_lockname, td->td_blocked); if (TD_ON_SLEEPQ(td)) - db_printf(" wmesg: %s wchan: %p\n", td->td_wmesg, - td->td_wchan); + db_printf( + " wmesg: %s wchan: %p sleeptimo %lx. %jx (curr %lx. 
%jx)\n", + td->td_wmesg, td->td_wchan, + (long)sbttobt(td->td_sleeptimo).sec, + (uintmax_t)sbttobt(td->td_sleeptimo).frac, + (long)sbttobt(sbinuptime()).sec, + (uintmax_t)sbttobt(sbinuptime()).frac); db_printf(" priority: %d\n", td->td_priority); db_printf(" container lock: %s (%p)\n", lock->lo_name, lock); if (td->td_swvoltick != 0) { Index: sys/kern/kern_thread.c =================================================================== --- sys/kern/kern_thread.c +++ sys/kern/kern_thread.c @@ -318,7 +318,7 @@ /* * Don't even bother to lock if none at this instant, - * we really don't care about the next instant.. + * we really don't care about the next instant. */ if (!TAILQ_EMPTY(&zombie_threads)) { mtx_lock_spin(&zombie_lock); @@ -383,6 +383,7 @@ if (td->td_kstack != 0) vm_thread_dispose(td); vm_domain_policy_cleanup(&td->td_vm_dom_policy); + callout_drain(&td->td_slpcallout); uma_zfree(thread_zone, td); } @@ -580,6 +581,7 @@ td->td_cpuset = NULL; cpu_thread_clean(td); thread_cow_free(td); + callout_drain(&td->td_slpcallout); thread_reap(); /* check for zombie threads etc. */ } Index: sys/kern/kern_timeout.c =================================================================== --- sys/kern/kern_timeout.c +++ sys/kern/kern_timeout.c @@ -945,6 +945,56 @@ handle->callout = NULL; } +void +callout_when(sbintime_t sbt, sbintime_t precision, int flags, + sbintime_t *res, sbintime_t *prec_res) +{ + sbintime_t to_sbt, to_pr; + + if ((flags & (C_ABSOLUTE | C_PRECALC)) != 0) { + *res = sbt; + *prec_res = precision; + return; + } + if ((flags & C_HARDCLOCK) != 0 && sbt < tick_sbt) + sbt = tick_sbt; + if ((flags & C_HARDCLOCK) != 0 || +#ifdef NO_EVENTTIMERS + sbt >= sbt_timethreshold) { + to_sbt = getsbinuptime(); + + /* Add safety belt for the case of hz > 1000. */ + to_sbt += tc_tick_sbt - tick_sbt; +#else + sbt >= sbt_tickthreshold) { + /* + * Obtain the time of the last hardclock() call on + * this CPU directly from the kern_clocksource.c. 
+ * This value is per-CPU, but it is equal for all + * active ones. + */ +#ifdef __LP64__ + to_sbt = DPCPU_GET(hardclocktime); +#else + spinlock_enter(); + to_sbt = DPCPU_GET(hardclocktime); + spinlock_exit(); +#endif +#endif + if ((flags & C_HARDCLOCK) == 0) + to_sbt += tick_sbt; + } else + to_sbt = sbinuptime(); + if (SBT_MAX - to_sbt < sbt) + to_sbt = SBT_MAX; + else + to_sbt += sbt; + *res = to_sbt; + to_pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : + sbt >> C_PRELGET(flags)); + *prec_res = to_pr > precision ? to_pr : precision; +} + /* * New interface; clients allocate their own callout structures. * @@ -962,10 +1012,10 @@ * callout_deactivate() - marks the callout as having been serviced */ int -callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, +callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t prec, void (*ftn)(void *), void *arg, int cpu, int flags) { - sbintime_t to_sbt, pr; + sbintime_t to_sbt, precision; struct callout_cpu *cc; int cancelled, direct; int ignore_cpu=0; @@ -978,47 +1028,8 @@ /* Invalid CPU spec */ panic("Invalid CPU in callout %d", cpu); } - if (flags & C_ABSOLUTE) { - to_sbt = sbt; - } else { - if ((flags & C_HARDCLOCK) && (sbt < tick_sbt)) - sbt = tick_sbt; - if ((flags & C_HARDCLOCK) || -#ifdef NO_EVENTTIMERS - sbt >= sbt_timethreshold) { - to_sbt = getsbinuptime(); + callout_when(sbt, prec, flags, &to_sbt, &precision); - /* Add safety belt for the case of hz > 1000. */ - to_sbt += tc_tick_sbt - tick_sbt; -#else - sbt >= sbt_tickthreshold) { - /* - * Obtain the time of the last hardclock() call on - * this CPU directly from the kern_clocksource.c. - * This value is per-CPU, but it is equal for all - * active ones. 
- */ -#ifdef __LP64__ - to_sbt = DPCPU_GET(hardclocktime); -#else - spinlock_enter(); - to_sbt = DPCPU_GET(hardclocktime); - spinlock_exit(); -#endif -#endif - if ((flags & C_HARDCLOCK) == 0) - to_sbt += tick_sbt; - } else - to_sbt = sbinuptime(); - if (SBT_MAX - to_sbt < sbt) - to_sbt = SBT_MAX; - else - to_sbt += sbt; - pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : - sbt >> C_PRELGET(flags)); - if (pr > precision) - precision = pr; - } /* * This flag used to be added by callout_cc_add, but the * first time you call this we could end up with the Index: sys/kern/subr_sleepqueue.c =================================================================== --- sys/kern/subr_sleepqueue.c +++ sys/kern/subr_sleepqueue.c @@ -378,6 +378,7 @@ { struct sleepqueue_chain *sc; struct thread *td; + sbintime_t pr1; td = curthread; sc = SC_LOOKUP(wchan); @@ -387,8 +388,14 @@ MPASS(wchan != NULL); if (cold) panic("timed sleep before timers are working"); - callout_reset_sbt_on(&td->td_slpcallout, sbt, pr, - sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC); + KASSERT(td->td_sleeptimo == 0, ("td %d %p td_sleeptimo %jx", + td->td_tid, td, (uintmax_t)td->td_sleeptimo)); + thread_lock(td); + callout_when(sbt, pr, flags, &td->td_sleeptimo, &pr1); + thread_unlock(td); + callout_reset_sbt_on(&td->td_slpcallout, td->td_sleeptimo, pr1, + sleepq_timeout, td, PCPU_GET(cpuid), flags | C_PRECALC | + C_DIRECT_EXEC); } /* @@ -576,37 +583,36 @@ sleepq_check_timeout(void) { struct thread *td; + int res; td = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); /* - * If TDF_TIMEOUT is set, we timed out. + * If TDF_TIMEOUT is set, we timed out. But recheck + * td_sleeptimo anyway. */ - if (td->td_flags & TDF_TIMEOUT) { - td->td_flags &= ~TDF_TIMEOUT; - return (EWOULDBLOCK); + res = 0; + if (td->td_sleeptimo != 0) { + if (td->td_sleeptimo <= sbinuptime()) + res = EWOULDBLOCK; + td->td_sleeptimo = 0; } - - /* - * If TDF_TIMOFAIL is set, the timeout ran after we had - * already been woken up. 
- */ - if (td->td_flags & TDF_TIMOFAIL) - td->td_flags &= ~TDF_TIMOFAIL; - - /* - * If callout_stop() fails, then the timeout is running on - * another CPU, so synchronize with it to avoid having it - * accidentally wake up a subsequent sleep. - */ - else if (_callout_stop_safe(&td->td_slpcallout, CS_EXECUTING, NULL) - == 0) { - td->td_flags |= TDF_TIMEOUT; - TD_SET_SLEEPING(td); - mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL); - } - return (0); + if (td->td_flags & TDF_TIMEOUT) + td->td_flags &= ~TDF_TIMEOUT; + else + /* + * We ignore the situation where timeout subsystem was + * unable to stop our callout. The struct thread is + * type-stable, the callout will use the correct + * memory when running. The checks of the + * td_sleeptimo value in this function and in + * sleepq_timeout() ensure that the thread does not + * get spurious wakeups, even if the callout was reset + * or thread reused. + */ + callout_stop(&td->td_slpcallout); + return (res); } /* @@ -914,12 +920,17 @@ CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); - /* - * First, see if the thread is asleep and get the wait channel if - * it is. - */ thread_lock(td); - if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { + + if (td->td_sleeptimo > sbinuptime() || td->td_sleeptimo == 0) { + /* + * The thread does not want a timeout (yet). + */ + } else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { + /* + * See if the thread is asleep and get the wait + * channel if it is. 
+ */ wchan = td->td_wchan; sc = SC_LOOKUP(wchan); THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock); @@ -927,40 +938,16 @@ MPASS(sq != NULL); td->td_flags |= TDF_TIMEOUT; wakeup_swapper = sleepq_resume_thread(sq, td, 0); - thread_unlock(td); - if (wakeup_swapper) - kick_proc0(); - return; - } - - /* - * If the thread is on the SLEEPQ but isn't sleeping yet, it - * can either be on another CPU in between sleepq_add() and - * one of the sleepq_*wait*() routines or it can be in - * sleepq_catch_signals(). - */ - if (TD_ON_SLEEPQ(td)) { + } else if (TD_ON_SLEEPQ(td)) { + /* + * If the thread is on the SLEEPQ but isn't sleeping + * yet, it can either be on another CPU in between + * sleepq_add() and one of the sleepq_*wait*() + * routines or it can be in sleepq_catch_signals(). + */ td->td_flags |= TDF_TIMEOUT; - thread_unlock(td); - return; } - /* - * Now check for the edge cases. First, if TDF_TIMEOUT is set, - * then the other thread has already yielded to us, so clear - * the flag and resume it. If TDF_TIMEOUT is not set, then the - * we know that the other thread is not on a sleep queue, but it - * hasn't resumed execution yet. In that case, set TDF_TIMOFAIL - * to let it know that the timeout has already run and doesn't - * need to be canceled. - */ - if (td->td_flags & TDF_TIMEOUT) { - MPASS(TD_IS_SLEEPING(td)); - td->td_flags &= ~TDF_TIMEOUT; - TD_CLR_SLEEPING(td); - wakeup_swapper = setrunnable(td); - } else - td->td_flags |= TDF_TIMOFAIL; thread_unlock(td); if (wakeup_swapper) kick_proc0(); Index: sys/sys/callout.h =================================================================== --- sys/sys/callout.h +++ sys/sys/callout.h @@ -57,6 +57,7 @@ #define C_PRELGET(x) (int)((((x) >> 1) & C_PRELRANGE) - 1) #define C_HARDCLOCK 0x0100 /* align to hardclock() calls */ #define C_ABSOLUTE 0x0200 /* event time is absolute. */ +#define C_PRECALC 0x0400 /* event time is pre-calculated. 
*/ struct callout_handle { struct callout *callout; @@ -129,6 +130,8 @@ void callout_process(sbintime_t now); #define callout_async_drain(c, d) \ _callout_stop_safe(c, 0, d) +void callout_when(sbintime_t sbt, sbintime_t precision, int flags, + sbintime_t *sbt_res, sbintime_t *prec_res); #endif #endif /* _SYS_CALLOUT_H_ */ Index: sys/sys/proc.h =================================================================== --- sys/sys/proc.h +++ sys/sys/proc.h @@ -282,6 +282,7 @@ int td_no_sleeping; /* (k) Sleeping disabled count. */ int td_dom_rr_idx; /* (k) RR Numa domain selection. */ void *td_su; /* (k) FFS SU private */ + sbintime_t td_sleeptimo; /* (t) Sleep timeout. */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ @@ -388,7 +389,7 @@ #define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ -#define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */ +#define TDF_UNUSED12 0x00001000 /* --available-- */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_NEEDSUSPCHK 0x00008000 /* Thread may need to suspend. */