Index: kern/init_main.c =================================================================== --- kern/init_main.c +++ kern/init_main.c @@ -504,7 +504,8 @@ callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); - callout_init(&td->td_slpcallout, CALLOUT_MPSAFE); + mtx_init(&td->td_slpmutex, "td_slpmutex", NULL, MTX_SPIN); + callout_init_mtx(&td->td_slpcallout, &td->td_slpmutex, 0); /* Create credentials. */ p->p_ucred = crget(); Index: kern/kern_condvar.c =================================================================== --- kern/kern_condvar.c +++ kern/kern_condvar.c @@ -313,15 +313,13 @@ DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); + sleepq_release(cvp); sleepq_set_timeout_sbt(cvp, sbt, pr, flags); if (lock != &Giant.lock_object) { - if (class->lc_flags & LC_SLEEPABLE) - sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); - if (class->lc_flags & LC_SLEEPABLE) - sleepq_lock(cvp); } + sleepq_lock(cvp); rval = sleepq_timedwait(cvp, 0); #ifdef KTRACE @@ -383,15 +381,13 @@ sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); + sleepq_release(cvp); sleepq_set_timeout_sbt(cvp, sbt, pr, flags); if (lock != &Giant.lock_object) { - if (class->lc_flags & LC_SLEEPABLE) - sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); - if (class->lc_flags & LC_SLEEPABLE) - sleepq_lock(cvp); } + sleepq_lock(cvp); rval = sleepq_timedwait_sig(cvp, 0); #ifdef KTRACE Index: kern/kern_lock.c =================================================================== --- kern/kern_lock.c +++ kern/kern_lock.c @@ -210,9 +210,11 @@ GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ? SLEEPQ_INTERRUPTIBLE : 0), queue); - if ((flags & LK_TIMELOCK) && timo) + if ((flags & LK_TIMELOCK) && timo) { + sleepq_release(&lk->lock_object); sleepq_set_timeout(&lk->lock_object, timo); - + sleepq_lock(&lk->lock_object); + } /* * Decisional switch for real sleeping. */ Index: kern/kern_switch.c =================================================================== --- kern/kern_switch.c +++ kern/kern_switch.c @@ -93,8 +93,6 @@ &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), ""); SCHED_STAT_DEFINE_VAR(sleepq, &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), ""); -SCHED_STAT_DEFINE_VAR(sleepqtimo, - &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQTIMO]), ""); SCHED_STAT_DEFINE_VAR(relinquish, &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), ""); SCHED_STAT_DEFINE_VAR(needresched, Index: kern/kern_synch.c =================================================================== --- kern/kern_synch.c +++ kern/kern_synch.c @@ -236,13 +236,17 @@ * return from cursig(). */ sleepq_add(ident, lock, wmesg, sleepq_flags, 0); - if (sbt != 0) - sleepq_set_timeout_sbt(ident, sbt, pr, flags); if (lock != NULL && class->lc_flags & LC_SLEEPABLE) { sleepq_release(ident); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); + if (sbt != 0) + sleepq_set_timeout_sbt(ident, sbt, pr, flags); sleepq_lock(ident); + } else if (sbt != 0) { + sleepq_release(ident); + sleepq_set_timeout_sbt(ident, sbt, pr, flags); + sleepq_lock(ident); } if (sbt != 0 && catch) rval = sleepq_timedwait_sig(ident, pri); @@ -306,8 +310,11 @@ * We put ourselves on the sleep queue and start our timeout. 
*/ sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0); - if (sbt != 0) + if (sbt != 0) { + sleepq_release(ident); sleepq_set_timeout_sbt(ident, sbt, pr, flags); + sleepq_lock(ident); + } /* * Can't call ktrace with any spin locks held so it can lock the Index: kern/kern_thread.c =================================================================== --- kern/kern_thread.c +++ kern/kern_thread.c @@ -149,6 +149,9 @@ audit_thread_alloc(td); #endif umtx_thread_alloc(td); + + mtx_init(&td->td_slpmutex, "td_slpmutex", NULL, MTX_SPIN); + callout_init_mtx(&td->td_slpcallout, &td->td_slpmutex, 0); return (0); } @@ -162,6 +165,10 @@ td = (struct thread *)mem; + /* make sure to drain any use of the "td->td_slpcallout" */ + callout_drain(&td->td_slpcallout); + mtx_destroy(&td->td_slpmutex); + #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { @@ -544,7 +551,6 @@ LIST_INIT(&td->td_lprof[0]); LIST_INIT(&td->td_lprof[1]); sigqueue_init(&td->td_sigqueue, p); - callout_init(&td->td_slpcallout, CALLOUT_MPSAFE); TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist); p->p_numthreads++; } Index: kern/kern_timeout.c =================================================================== --- kern/kern_timeout.c +++ kern/kern_timeout.c @@ -54,6 +54,8 @@ #include #include #include +#include +#include #include #include #include @@ -124,37 +126,223 @@ */ u_int callwheelsize, callwheelmask; +typedef void callout_mutex_op_t(struct lock_object *); +typedef int callout_owned_op_t(struct lock_object *); + +struct callout_mutex_ops { + callout_mutex_op_t *lock; + callout_mutex_op_t *unlock; + callout_owned_op_t *owned; +}; + +enum { + CALLOUT_LC_UNUSED_0, + CALLOUT_LC_UNUSED_1, + CALLOUT_LC_UNUSED_2, + CALLOUT_LC_UNUSED_3, + CALLOUT_LC_SPIN, + CALLOUT_LC_MUTEX, + CALLOUT_LC_RW, + CALLOUT_LC_RM, +}; + +static void +callout_mutex_op_none(struct lock_object *lock) +{ +} + +static int +callout_owned_op_none(struct lock_object *lock) +{ + /* + * The callout is so-called "mpsafe" and the application is + * responsible to ensure proper locking. We have no way to + * assert this because we don't know which lock the + * application is using. Proper locking is most critical when + * the same callout is executed on multiple different CPUs. 
+ */ + return (1); +} + +static void +callout_mutex_lock(struct lock_object *lock) +{ + mtx_lock((struct mtx *)lock); +} + +static void +callout_mutex_unlock(struct lock_object *lock) +{ + mtx_unlock((struct mtx *)lock); +} + +static void +callout_mutex_lock_spin(struct lock_object *lock) +{ + mtx_lock_spin((struct mtx *)lock); +} + +static void +callout_mutex_unlock_spin(struct lock_object *lock) +{ + mtx_unlock_spin((struct mtx *)lock); +} + +static int +callout_mutex_owned(struct lock_object *lock) +{ + return (mtx_owned((struct mtx *)lock)); +} + +static void +callout_rm_wlock(struct lock_object *lock) +{ + rm_wlock((struct rmlock *)lock); +} + +static void +callout_rm_wunlock(struct lock_object *lock) +{ + rm_wunlock((struct rmlock *)lock); +} + +static int +callout_rm_owned(struct lock_object *lock) +{ + return (rm_wowned((struct rmlock *)lock)); +} + +static void +callout_rw_wlock(struct lock_object *lock) +{ + rw_wlock((struct rwlock *)lock); +} + +static void +callout_rw_wunlock(struct lock_object *lock) +{ + rw_wunlock((struct rwlock *)lock); +} + +static int +callout_rw_owned(struct lock_object *lock) +{ + return (rw_wowned((struct rwlock *)lock)); +} + +static const struct callout_mutex_ops callout_mutex_ops[8] = { + [CALLOUT_LC_UNUSED_0] = { + .lock = callout_mutex_op_none, + .unlock = callout_mutex_op_none, + .owned = callout_owned_op_none, + }, + [CALLOUT_LC_UNUSED_1] = { + .lock = callout_mutex_op_none, + .unlock = callout_mutex_op_none, + .owned = callout_owned_op_none, + }, + [CALLOUT_LC_UNUSED_2] = { + .lock = callout_mutex_op_none, + .unlock = callout_mutex_op_none, + .owned = callout_owned_op_none, + }, + [CALLOUT_LC_UNUSED_3] = { + .lock = callout_mutex_op_none, + .unlock = callout_mutex_op_none, + .owned = callout_owned_op_none, + }, + [CALLOUT_LC_SPIN] = { + .lock = callout_mutex_lock_spin, + .unlock = callout_mutex_unlock_spin, + .owned = callout_mutex_owned, + }, + [CALLOUT_LC_MUTEX] = { + .lock = callout_mutex_lock, + .unlock = callout_mutex_unlock, + .owned = callout_mutex_owned, + }, + [CALLOUT_LC_RW] = { + .lock = callout_rw_wlock, + .unlock = callout_rw_wunlock, + .owned = callout_rw_owned, + }, + [CALLOUT_LC_RM] = { + .lock = callout_rm_wlock, + .unlock = callout_rm_wunlock, + .owned = callout_rm_owned, + }, +}; + +static void +callout_lock_client(int c_flags, struct lock_object *c_lock) +{ + callout_mutex_ops[CALLOUT_GET_LC(c_flags)].lock(c_lock); +} + +static void +callout_unlock_client(int c_flags, struct lock_object *c_lock) +{ + callout_mutex_ops[CALLOUT_GET_LC(c_flags)].unlock(c_lock); +} + +#ifdef SMP +static int +callout_lock_owned_client(int c_flags, struct lock_object *c_lock) +{ + return (callout_mutex_ops[CALLOUT_GET_LC(c_flags)].owned(c_lock)); +} +#endif + /* - * The callout cpu exec entities represent informations necessary for - * describing the state of callouts currently running on the CPU and the ones - * necessary for migrating callouts to the new callout cpu. In particular, - * the first entry of the array cc_exec_entity holds informations for callout - * running in SWI thread context, while the second one holds informations - * for callout running directly from hardware interrupt context. - * The cached informations are very important for deferring migration when - * the migrating callout is already running. + * The callout CPU exec structure represent information necessary for + * describing the state of callouts currently running on the CPU and + * for handling deferred callout restarts. 
+ * + * In particular, the first entry of the array cc_exec_entity holds + * information for callouts running from the SWI thread context, while + * the second one holds information for callouts running directly from + * the hardware interrupt context. */ struct cc_exec { - struct callout *cc_next; + /* + * The "cc_curr" points to the currently executing callout and + * is protected by the "cc_lock" spinlock. If no callback is + * currently executing it is equal to "NULL". + */ struct callout *cc_curr; -#ifdef SMP - void (*ce_migration_func)(void *); - void *ce_migration_arg; - int ce_migration_cpu; - sbintime_t ce_migration_time; - sbintime_t ce_migration_prec; -#endif - bool cc_cancel; - bool cc_waiting; + /* + * The "cc_restart_args" structure holds the argument for a + * deferred callback restart and is protected by the "cc_lock" + * spinlock. The structure is only valid if "cc_restart" is + * "true". If "cc_restart" is "false" the information in the + * "cc_restart_args" structure shall be ignored. + */ + struct callout_args cc_restart_args; + bool cc_restart; + /* + * The "cc_cancel" variable allows the currently pending + * callback to be atomically cancelled. This field is write + * protected by the "cc_lock" spinlock. + */ + bool cc_cancel; + /* + * The "cc_drain_fn" points to a function which shall be + * called with the argument stored in "cc_drain_arg" when an + * asynchronous drain is performed. This field is write + * protected by the "cc_lock" spinlock. + */ + callout_func_t *cc_drain_fn; + void *cc_drain_arg; }; /* - * There is one struct callout_cpu per cpu, holding all relevant + * There is one "struct callout_cpu" per CPU, holding all relevant * state for the callout processing thread on the individual CPU. */ struct callout_cpu { struct mtx_padalign cc_lock; struct cc_exec cc_exec_entity[2]; + struct callout *cc_exec_next_dir; struct callout *cc_callout; struct callout_list *cc_callwheel; struct callout_tailq cc_expireq; @@ -166,27 +354,7 @@ char cc_ktr_event_name[20]; }; -#define cc_exec_curr cc_exec_entity[0].cc_curr -#define cc_exec_next cc_exec_entity[0].cc_next -#define cc_exec_cancel cc_exec_entity[0].cc_cancel -#define cc_exec_waiting cc_exec_entity[0].cc_waiting -#define cc_exec_curr_dir cc_exec_entity[1].cc_curr -#define cc_exec_next_dir cc_exec_entity[1].cc_next -#define cc_exec_cancel_dir cc_exec_entity[1].cc_cancel -#define cc_exec_waiting_dir cc_exec_entity[1].cc_waiting - #ifdef SMP -#define cc_migration_func cc_exec_entity[0].ce_migration_func -#define cc_migration_arg cc_exec_entity[0].ce_migration_arg -#define cc_migration_cpu cc_exec_entity[0].ce_migration_cpu -#define cc_migration_time cc_exec_entity[0].ce_migration_time -#define cc_migration_prec cc_exec_entity[0].ce_migration_prec -#define cc_migration_func_dir cc_exec_entity[1].ce_migration_func -#define cc_migration_arg_dir cc_exec_entity[1].ce_migration_arg -#define cc_migration_cpu_dir cc_exec_entity[1].ce_migration_cpu -#define cc_migration_time_dir cc_exec_entity[1].ce_migration_time -#define cc_migration_prec_dir cc_exec_entity[1].ce_migration_prec - struct callout_cpu cc_cpu[MAXCPU]; #define CPUBLOCK MAXCPU #define CC_CPU(cpu) (&cc_cpu[(cpu)]) @@ -211,62 +379,11 @@ static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures"); -/** - * Locked by cc_lock: - * cc_curr - If a callout is in progress, it is cc_curr. - * If cc_curr is non-NULL, threads waiting in - * callout_drain() will be woken up as soon as the - * relevant callout completes. 
- * cc_cancel - Changing to 1 with both callout_lock and cc_lock held - * guarantees that the current callout will not run. - * The softclock() function sets this to 0 before it - * drops callout_lock to acquire c_lock, and it calls - * the handler only if curr_cancelled is still 0 after - * cc_lock is successfully acquired. - * cc_waiting - If a thread is waiting in callout_drain(), then - * callout_wait is nonzero. Set only when - * cc_curr is non-NULL. - */ - /* - * Resets the execution entity tied to a specific callout cpu. + * Kernel low level callwheel initialization called from cpu0 during + * kernel startup: */ static void -cc_cce_cleanup(struct callout_cpu *cc, int direct) -{ - - cc->cc_exec_entity[direct].cc_curr = NULL; - cc->cc_exec_entity[direct].cc_next = NULL; - cc->cc_exec_entity[direct].cc_cancel = false; - cc->cc_exec_entity[direct].cc_waiting = false; -#ifdef SMP - cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK; - cc->cc_exec_entity[direct].ce_migration_time = 0; - cc->cc_exec_entity[direct].ce_migration_prec = 0; - cc->cc_exec_entity[direct].ce_migration_func = NULL; - cc->cc_exec_entity[direct].ce_migration_arg = NULL; -#endif -} - -/* - * Checks if migration is requested by a specific callout cpu. - */ -static int -cc_cce_migrating(struct callout_cpu *cc, int direct) -{ - -#ifdef SMP - return (cc->cc_exec_entity[direct].ce_migration_cpu != CPUBLOCK); -#else - return (0); -#endif -} - -/* - * Kernel low level callwheel initialization - * called on cpu0 during kernel startup. - */ -static void callout_callwheel_init(void *dummy) { struct callout_cpu *cc; @@ -324,8 +441,6 @@ LIST_INIT(&cc->cc_callwheel[i]); TAILQ_INIT(&cc->cc_expireq); cc->cc_firstevent = SBT_MAX; - for (i = 0; i < 2; i++) - cc_cce_cleanup(cc, i); snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name), "callwheel cpu %d", cpu); if (cc->cc_callout == NULL) /* Only cpu0 handles timeout(9) */ @@ -333,42 +448,12 @@ for (i = 0; i < ncallout; i++) { c = &cc->cc_callout[i]; callout_init(c, 0); - c->c_flags = CALLOUT_LOCAL_ALLOC; + c->c_flags |= CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } } -#ifdef SMP /* - * Switches the cpu tied to a specific callout. - * The function expects a locked incoming callout cpu and returns with - * locked outcoming callout cpu. - */ -static struct callout_cpu * -callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu) -{ - struct callout_cpu *new_cc; - - MPASS(c != NULL && cc != NULL); - CC_LOCK_ASSERT(cc); - - /* - * Avoid interrupts and preemption firing after the callout cpu - * is blocked in order to avoid deadlocks as the new thread - * may be willing to acquire the callout cpu lock. - */ - c->c_cpu = CPUBLOCK; - spinlock_enter(); - CC_UNLOCK(cc); - new_cc = CC_CPU(new_cpu); - CC_LOCK(new_cc); - spinlock_exit(); - c->c_cpu = new_cpu; - return (new_cc); -} -#endif - -/* * Start standard softclock thread. */ static void @@ -444,9 +529,8 @@ #ifdef CALLOUT_PROFILING int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0; #endif - cc = CC_SELF(); - mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET); + CC_LOCK(cc); /* Compute the buckets of the last scan and present times. 
*/ firstb = callout_hash(cc->cc_lastscan); @@ -549,7 +633,7 @@ avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8; avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8; #endif - mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET); + CC_UNLOCK(cc); /* * swi_sched acquires the thread lock, so we don't want to call it * with cc_lock held; incorrect locking order. @@ -562,49 +646,55 @@ callout_lock(struct callout *c) { struct callout_cpu *cc; - int cpu; - - for (;;) { - cpu = c->c_cpu; -#ifdef SMP - if (cpu == CPUBLOCK) { - while (c->c_cpu == CPUBLOCK) - cpu_spinwait(); - continue; - } -#endif - cc = CC_CPU(cpu); - CC_LOCK(cc); - if (cpu == c->c_cpu) - break; - CC_UNLOCK(cc); - } + cc = CC_CPU(c->c_cpu); + CC_LOCK(cc); return (cc); } -static void -callout_cc_add(struct callout *c, struct callout_cpu *cc, - sbintime_t sbt, sbintime_t precision, void (*func)(void *), - void *arg, int cpu, int flags) +static struct callout_cpu * +callout_cc_add_locked(struct callout *c, struct callout_cpu *cc, + struct callout_args *coa, bool can_swap_cpu) { +#ifndef NO_EVENTTIMERS + sbintime_t sbt; +#endif int bucket; CC_LOCK_ASSERT(cc); - if (sbt < cc->cc_lastscan) - sbt = cc->cc_lastscan; - c->c_arg = arg; - c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); - if (flags & C_DIRECT_EXEC) - c->c_flags |= CALLOUT_DIRECT; - c->c_flags &= ~CALLOUT_PROCESSED; - c->c_func = func; - c->c_time = sbt; - c->c_precision = precision; + + /* update flags before swapping locks, if any */ + c->c_flags &= ~(CALLOUT_PROCESSED | CALLOUT_DIRECT | CALLOUT_DEFRESTART); + if (coa->flags & C_DIRECT_EXEC) + c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING | CALLOUT_DIRECT); + else + c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); + +#ifdef SMP + /* + * Check if we are changing the CPU on which the callback + * should be executed and if we have a lock protecting us: + */ + if (can_swap_cpu != false && coa->cpu != c->c_cpu && + callout_lock_owned_client(c->c_flags, c->c_lock) != 0) { + CC_UNLOCK(cc); + c->c_cpu = coa->cpu; + cc = callout_lock(c); + } +#endif + if (coa->time < cc->cc_lastscan) + coa->time = cc->cc_lastscan; + c->c_arg = coa->arg; + c->c_func = coa->func; + c->c_time = coa->time; + c->c_precision = coa->precision; + bucket = callout_get_bucket(c->c_time); CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x", c, (int)(c->c_precision >> 32), (u_int)(c->c_precision & 0xffffffff)); LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le); + + /* Ensure we are first to be scanned, if called via a callback */ if (cc->cc_bucket == bucket) cc->cc_exec_next_dir = c; #ifndef NO_EVENTTIMERS @@ -617,9 +707,10 @@ sbt = c->c_time + c->c_precision; if (sbt < cc->cc_firstevent) { cc->cc_firstevent = sbt; - cpu_new_callout(cpu, sbt, c->c_time); + cpu_new_callout(coa->cpu, sbt, c->c_time); } #endif + return (cc); } static void @@ -626,8 +717,6 @@ callout_cc_del(struct callout *c, struct callout_cpu *cc) { - if ((c->c_flags & CALLOUT_LOCAL_ALLOC) == 0) - return; c->c_func = NULL; SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle); } @@ -639,20 +728,10 @@ #endif int direct) { - struct rm_priotracker tracker; - void (*c_func)(void *); + callout_func_t *c_func; void *c_arg; - struct lock_class *class; struct lock_object *c_lock; - uintptr_t lock_status; int c_flags; -#ifdef SMP - struct callout_cpu *new_cc; - void (*new_func)(void *); - void *new_arg; - int flags, new_cpu; - sbintime_t new_prec, new_time; -#endif #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) sbintime_t sbt1, sbt2; struct timespec ts2; @@ -663,37 
+742,39 @@ KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) == (CALLOUT_PENDING | CALLOUT_ACTIVE), ("softclock_call_cc: pend|act %p %x", c, c->c_flags)); - class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL; - lock_status = 0; - if (c->c_flags & CALLOUT_SHAREDLOCK) { - if (class == &lock_class_rm) - lock_status = (uintptr_t)&tracker; - else - lock_status = 1; - } c_lock = c->c_lock; c_func = c->c_func; c_arg = c->c_arg; c_flags = c->c_flags; - if (c->c_flags & CALLOUT_LOCAL_ALLOC) - c->c_flags = CALLOUT_LOCAL_ALLOC; - else - c->c_flags &= ~CALLOUT_PENDING; + + /* remove pending bit */ + c->c_flags &= ~CALLOUT_PENDING; + + /* reset our local state */ cc->cc_exec_entity[direct].cc_curr = c; - cc->cc_exec_entity[direct].cc_cancel = false; - CC_UNLOCK(cc); + cc->cc_exec_entity[direct].cc_restart = false; + cc->cc_exec_entity[direct].cc_drain_fn = NULL; + cc->cc_exec_entity[direct].cc_drain_arg = NULL; + if (c_lock != NULL) { - class->lc_lock(c_lock, lock_status); + cc->cc_exec_entity[direct].cc_cancel = false; + CC_UNLOCK(cc); + + /* unlocked region for switching locks */ + + callout_lock_client(c_flags, c_lock); + /* - * The callout may have been cancelled - * while we switched locks. + * Check if the callout may have been cancelled while + * we were switching locks. Even though the callout is + * specifying a lock, it might not be certain this + * lock is locked when starting and stopping callouts. */ + CC_LOCK(cc); if (cc->cc_exec_entity[direct].cc_cancel) { - class->lc_unlock(c_lock); - goto skip; + callout_unlock_client(c_flags, c_lock); + goto skip_cc_locked; } - /* The callout cannot be stopped now. */ - cc->cc_exec_entity[direct].cc_cancel = true; if (c_lock == &Giant.lock_object) { #ifdef CALLOUT_PROFILING (*gcalls)++; @@ -714,6 +795,11 @@ CTR3(KTR_CALLOUT, "callout %p func %p arg %p", c, c_func, c_arg); } + /* The callout cannot be stopped now! */ + cc->cc_exec_entity[direct].cc_cancel = true; + CC_UNLOCK(cc); + + /* unlocked region */ KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running", "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct); #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) @@ -740,85 +826,40 @@ #endif KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle"); CTR1(KTR_CALLOUT, "callout %p finished", c); + + /* + * At this point the callback structure might have been freed, + * so we need to check the previously copied value of + * "c->c_flags": + */ if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0) - class->lc_unlock(c_lock); -skip: + callout_unlock_client(c_flags, c_lock); + CC_LOCK(cc); + +skip_cc_locked: KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr")); cc->cc_exec_entity[direct].cc_curr = NULL; - if (cc->cc_exec_entity[direct].cc_waiting) { + + /* Check if there is anything which needs draining */ + if (cc->cc_exec_entity[direct].cc_drain_fn != NULL) { /* - * There is someone waiting for the - * callout to complete. - * If the callout was scheduled for - * migration just cancel it. + * Unlock the CPU callout last, so that any use of + * structures belonging to the callout are complete: */ - if (cc_cce_migrating(cc, direct)) { - cc_cce_cleanup(cc, direct); - - /* - * It should be assert here that the callout is not - * destroyed but that is not easy. 
- */ - c->c_flags &= ~CALLOUT_DFRMIGRATION; - } - cc->cc_exec_entity[direct].cc_waiting = false; CC_UNLOCK(cc); - wakeup(&cc->cc_exec_entity[direct].cc_waiting); + /* call drain function unlocked */ + cc->cc_exec_entity[direct].cc_drain_fn( + cc->cc_exec_entity[direct].cc_drain_arg); CC_LOCK(cc); - } else if (cc_cce_migrating(cc, direct)) { - KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0, - ("Migrating legacy callout %p", c)); -#ifdef SMP - /* - * If the callout was scheduled for - * migration just perform it now. - */ - new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu; - new_time = cc->cc_exec_entity[direct].ce_migration_time; - new_prec = cc->cc_exec_entity[direct].ce_migration_prec; - new_func = cc->cc_exec_entity[direct].ce_migration_func; - new_arg = cc->cc_exec_entity[direct].ce_migration_arg; - cc_cce_cleanup(cc, direct); - - /* - * It should be assert here that the callout is not destroyed - * but that is not easy. - * - * As first thing, handle deferred callout stops. - */ - if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) { - CTR3(KTR_CALLOUT, - "deferred cancelled %p func %p arg %p", - c, new_func, new_arg); - callout_cc_del(c, cc); - return; - } - c->c_flags &= ~CALLOUT_DFRMIGRATION; - - new_cc = callout_cpu_switch(c, cc, new_cpu); - flags = (direct) ? C_DIRECT_EXEC : 0; - callout_cc_add(c, new_cc, new_time, new_prec, new_func, - new_arg, new_cpu, flags); - CC_UNLOCK(new_cc); - CC_LOCK(cc); -#else - panic("migration should not happen"); -#endif + } else if (c_flags & CALLOUT_LOCAL_ALLOC) { + /* return callout back to freelist */ + callout_cc_del(c, cc); + } else if (cc->cc_exec_entity[direct].cc_restart) { + /* [re-]schedule callout, if any */ + (void) callout_cc_add_locked(c, cc, + &cc->cc_exec_entity[direct].cc_restart_args, false); } - /* - * If the current callout is locally allocated (from - * timeout(9)) then put it on the freelist. - * - * Note: we need to check the cached copy of c_flags because - * if it was not local, then it's not safe to deref the - * callout pointer. - */ - KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0 || - c->c_flags == CALLOUT_LOCAL_ALLOC, - ("corrupted callout")); - if (c_flags & CALLOUT_LOCAL_ALLOC) - callout_cc_del(c, cc); } /* @@ -899,10 +940,11 @@ /* XXX Attempt to malloc first */ panic("timeout table full"); SLIST_REMOVE_HEAD(&cc->cc_callfree, c_links.sle); - callout_reset(new, to_ticks, ftn, arg); handle.callout = new; CC_UNLOCK(cc); + callout_reset(new, to_ticks, ftn, arg); + return (handle); } @@ -910,6 +952,7 @@ untimeout(timeout_t *ftn, void *arg, struct callout_handle handle) { struct callout_cpu *cc; + bool match; /* * Check for a handle that was initialized @@ -920,9 +963,11 @@ return; cc = callout_lock(handle.callout); - if (handle.callout->c_func == ftn && handle.callout->c_arg == arg) + match = (handle.callout->c_func == ftn && handle.callout->c_arg == arg); + CC_UNLOCK(cc); + + if (match) callout_stop(handle.callout); - CC_UNLOCK(cc); } void @@ -931,6 +976,119 @@ handle->callout = NULL; } +static int +callout_restart_async(struct callout *c, struct callout_args *coa, + callout_func_t *drain_fn, void *drain_arg) +{ + struct callout_cpu *cc; + int cancelled; + int direct; + + cc = callout_lock(c); + + /* Figure out if the callout is direct or not */ + direct = ((c->c_flags & CALLOUT_DIRECT) != 0); + + /* + * Check if the callback is currently scheduled for + * completion: + */ + if (cc->cc_exec_entity[direct].cc_curr == c) { + /* + * Try to prevent the callback from running by setting + * the "cc_cancel" variable to "true". 
Also check if + * the callout was previously subject to a deferred + * callout restart: + */ + if (cc->cc_exec_entity[direct].cc_cancel == false || + (c->c_flags & CALLOUT_DEFRESTART) != 0) { + cc->cc_exec_entity[direct].cc_cancel = true; + cancelled = 1; + } else { + cancelled = 0; + } + + /* + * Prevent callback restart if "callout_drain_xxx()" + * is being called or we are stopping the callout or + * the callback was preallocated by us: + */ + if (cc->cc_exec_entity[direct].cc_drain_fn != NULL || + coa == NULL || (c->c_flags & CALLOUT_LOCAL_ALLOC) != 0) { + CTR4(KTR_CALLOUT, "%s %p func %p arg %p", + cancelled ? "cancelled and draining" : "draining", + c, c->c_func, c->c_arg); + + /* clear old flags, if any */ + c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING | + CALLOUT_DEFRESTART | CALLOUT_PROCESSED); + + /* clear restart flag, if any */ + cc->cc_exec_entity[direct].cc_restart = false; + + /* set drain function, if any */ + if (drain_fn != NULL) { + cc->cc_exec_entity[direct].cc_drain_fn = drain_fn; + cc->cc_exec_entity[direct].cc_drain_arg = drain_arg; + cancelled |= 2; /* XXX define the value */ + } + } else { + CTR4(KTR_CALLOUT, "%s %p func %p arg %p", + cancelled ? "cancelled and restarting" : "restarting", + c, c->c_func, c->c_arg); + + /* get us back into the game */ + c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING | + CALLOUT_DEFRESTART); + c->c_flags &= ~CALLOUT_PROCESSED; + + /* enable deferred restart */ + cc->cc_exec_entity[direct].cc_restart = true; + + /* store arguments for the deferred restart, if any */ + cc->cc_exec_entity[direct].cc_restart_args = *coa; + } + } else { + /* stop callout */ + if (c->c_flags & CALLOUT_PENDING) { + /* + * The callback has not yet been executed, and + * we simply just need to unlink it: + */ + if ((c->c_flags & CALLOUT_PROCESSED) == 0) { + if (cc->cc_exec_next_dir == c) + cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); + LIST_REMOVE(c, c_links.le); + } else { + TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); + } + cancelled = 1; + } else { + cancelled = 0; + } + + CTR4(KTR_CALLOUT, "%s %p func %p arg %p", + cancelled ? "rescheduled" : "scheduled", + c, c->c_func, c->c_arg); + + /* [re-]schedule callout, if any */ + if (coa != NULL) { + cc = callout_cc_add_locked(c, cc, coa, true); + } else { + /* clear old flags, if any */ + c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING | + CALLOUT_DEFRESTART | CALLOUT_PROCESSED); + + /* return callback to pre-allocated list, if any */ + if ((c->c_flags & CALLOUT_LOCAL_ALLOC) && cancelled != 0) { + callout_cc_del(c, cc); + } + } + } + CC_UNLOCK(cc); + return (cancelled); +} + /* * New interface; clients allocate their own callout structures. 
* @@ -949,25 +1107,32 @@ */ int callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision, - void (*ftn)(void *), void *arg, int cpu, int flags) + callout_func_t *ftn, void *arg, int cpu, int flags) { - sbintime_t to_sbt, pr; - struct callout_cpu *cc; - int cancelled, direct; + struct callout_args coa; - cancelled = 0; - if (flags & C_ABSOLUTE) { - to_sbt = sbt; + /* store arguments for callout add function */ + coa.func = ftn; + coa.arg = arg; + coa.precision = precision; + coa.flags = flags; + coa.cpu = cpu; + + /* compute the rest of the arguments needed */ + if (coa.flags & C_ABSOLUTE) { + coa.time = sbt; } else { - if ((flags & C_HARDCLOCK) && (sbt < tick_sbt)) + sbintime_t pr; + + if ((coa.flags & C_HARDCLOCK) && (sbt < tick_sbt)) sbt = tick_sbt; - if ((flags & C_HARDCLOCK) || + if ((coa.flags & C_HARDCLOCK) || #ifdef NO_EVENTTIMERS sbt >= sbt_timethreshold) { - to_sbt = getsbinuptime(); + coa.time = getsbinuptime(); /* Add safety belt for the case of hz > 1000. */ - to_sbt += tc_tick_sbt - tick_sbt; + coa.time += tc_tick_sbt - tick_sbt; #else sbt >= sbt_tickthreshold) { /* @@ -977,101 +1142,29 @@ * active ones. */ #ifdef __LP64__ - to_sbt = DPCPU_GET(hardclocktime); + coa.time = DPCPU_GET(hardclocktime); #else spinlock_enter(); - to_sbt = DPCPU_GET(hardclocktime); + coa.time = DPCPU_GET(hardclocktime); spinlock_exit(); #endif #endif - if ((flags & C_HARDCLOCK) == 0) - to_sbt += tick_sbt; + if ((coa.flags & C_HARDCLOCK) == 0) + coa.time += tick_sbt; } else - to_sbt = sbinuptime(); - if (SBT_MAX - to_sbt < sbt) - to_sbt = SBT_MAX; + coa.time = sbinuptime(); + if (SBT_MAX - coa.time < sbt) + coa.time = SBT_MAX; else - to_sbt += sbt; - pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp : - sbt >> C_PRELGET(flags)); - if (pr > precision) - precision = pr; + coa.time += sbt; + pr = ((C_PRELGET(coa.flags) < 0) ? sbt >> tc_precexp : + sbt >> C_PRELGET(coa.flags)); + if (pr > coa.precision) + coa.precision = pr; } - /* - * Don't allow migration of pre-allocated callouts lest they - * become unbalanced. - */ - if (c->c_flags & CALLOUT_LOCAL_ALLOC) - cpu = c->c_cpu; - direct = (c->c_flags & CALLOUT_DIRECT) != 0; - KASSERT(!direct || c->c_lock == NULL, - ("%s: direct callout %p has lock", __func__, c)); - cc = callout_lock(c); - if (cc->cc_exec_entity[direct].cc_curr == c) { - /* - * We're being asked to reschedule a callout which is - * currently in progress. If there is a lock then we - * can cancel the callout if it has not really started. - */ - if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel) - cancelled = cc->cc_exec_entity[direct].cc_cancel = true; - if (cc->cc_exec_entity[direct].cc_waiting) { - /* - * Someone has called callout_drain to kill this - * callout. Don't reschedule. - */ - CTR4(KTR_CALLOUT, "%s %p func %p arg %p", - cancelled ? "cancelled" : "failed to cancel", - c, c->c_func, c->c_arg); - CC_UNLOCK(cc); - return (cancelled); - } - } - if (c->c_flags & CALLOUT_PENDING) { - if ((c->c_flags & CALLOUT_PROCESSED) == 0) { - if (cc->cc_exec_next_dir == c) - cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); - LIST_REMOVE(c, c_links.le); - } else - TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); - cancelled = 1; - c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); - } -#ifdef SMP - /* - * If the callout must migrate try to perform it immediately. - * If the callout is currently running, just defer the migration - * to a more appropriate moment. 
- */ - if (c->c_cpu != cpu) { - if (cc->cc_exec_entity[direct].cc_curr == c) { - cc->cc_exec_entity[direct].ce_migration_cpu = cpu; - cc->cc_exec_entity[direct].ce_migration_time - = to_sbt; - cc->cc_exec_entity[direct].ce_migration_prec - = precision; - cc->cc_exec_entity[direct].ce_migration_func = ftn; - cc->cc_exec_entity[direct].ce_migration_arg = arg; - c->c_flags |= CALLOUT_DFRMIGRATION; - CTR6(KTR_CALLOUT, - "migration of %p func %p arg %p in %d.%08x to %u deferred", - c, c->c_func, c->c_arg, (int)(to_sbt >> 32), - (u_int)(to_sbt & 0xffffffff), cpu); - CC_UNLOCK(cc); - return (cancelled); - } - cc = callout_cpu_switch(c, cc, cpu); - } -#endif - - callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags); - CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x", - cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32), - (u_int)(to_sbt & 0xffffffff)); - CC_UNLOCK(cc); - - return (cancelled); + /* get callback started, if any */ + return (callout_restart_async(c, &coa, NULL, NULL)); } /* @@ -1090,193 +1183,79 @@ } int -_callout_stop_safe(struct callout *c, int safe) +callout_stop(struct callout *c) { - struct callout_cpu *cc, *old_cc; - struct lock_class *class; - int direct, sq_locked, use_lock; + /* get callback stopped, if any */ + return (callout_restart_async(c, NULL, NULL, NULL)); +} - if (safe) - WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, c->c_lock, - "calling %s", __func__); +static void +callout_drain_function(void *arg) +{ + wakeup(arg); +} - /* - * Some old subsystems don't hold Giant while running a callout_stop(), - * so just discard this check for the moment. - */ - if (!safe && c->c_lock != NULL) { - if (c->c_lock == &Giant.lock_object) - use_lock = mtx_owned(&Giant); - else { - use_lock = 1; - class = LOCK_CLASS(c->c_lock); - class->lc_assert(c->c_lock, LA_XLOCKED); - } - } else - use_lock = 0; - direct = (c->c_flags & CALLOUT_DIRECT) != 0; - sq_locked = 0; - old_cc = NULL; -again: - cc = callout_lock(c); +int +callout_drain_async(struct callout *c, callout_func_t *fn, void *arg) +{ + /* get callback stopped, if any */ + return (callout_restart_async(c, NULL, fn, arg) & 2); +} - /* - * If the callout was migrating while the callout cpu lock was - * dropped, just drop the sleepqueue lock and check the states - * again. - */ - if (sq_locked != 0 && cc != old_cc) { -#ifdef SMP - CC_UNLOCK(cc); - sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting); - sq_locked = 0; - old_cc = NULL; - goto again; -#else - panic("migration should not happen"); -#endif - } +int +callout_drain(struct callout *c) +{ + int cancelled; - /* - * If the callout isn't pending, it's not on the queue, so - * don't attempt to remove it from the queue. We can try to - * stop it by other means however. - */ - if (!(c->c_flags & CALLOUT_PENDING)) { - c->c_flags &= ~CALLOUT_ACTIVE; + WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, + "Draining callout"); + callout_lock_client(c->c_flags, c->c_lock); + + /* at this point the "c->c_cpu" field is not changing */ + + cancelled = callout_drain_async(c, &callout_drain_function, c); + + if (cancelled != 0) { + struct callout_cpu *cc; + int direct; + + CTR3(KTR_CALLOUT, "need to drain %p func %p arg %p", + c, c->c_func, c->c_arg); + + cc = callout_lock(c); + direct = ((c->c_flags & CALLOUT_DIRECT) != 0); + /* - * If it wasn't on the queue and it isn't the current - * callout, then we can't stop it, so just bail. 
+ * We've gotten our callout CPU lock, it is safe to + * drop the initial lock: */ - if (cc->cc_exec_entity[direct].cc_curr != c) { - CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", - c, c->c_func, c->c_arg); - CC_UNLOCK(cc); - if (sq_locked) - sleepq_release( - &cc->cc_exec_entity[direct].cc_waiting); - return (0); - } + callout_unlock_client(c->c_flags, c->c_lock); - if (safe) { - /* - * The current callout is running (or just - * about to run) and blocking is allowed, so - * just wait for the current invocation to - * finish. - */ - while (cc->cc_exec_entity[direct].cc_curr == c) { - /* - * Use direct calls to sleepqueue interface - * instead of cv/msleep in order to avoid - * a LOR between cc_lock and sleepqueue - * chain spinlocks. This piece of code - * emulates a msleep_spin() call actually. - * - * If we already have the sleepqueue chain - * locked, then we can safely block. If we - * don't already have it locked, however, - * we have to drop the cc_lock to lock - * it. This opens several races, so we - * restart at the beginning once we have - * both locks. If nothing has changed, then - * we will end up back here with sq_locked - * set. - */ - if (!sq_locked) { - CC_UNLOCK(cc); - sleepq_lock( - &cc->cc_exec_entity[direct].cc_waiting); - sq_locked = 1; - old_cc = cc; - goto again; - } + /* Wait for drain to complete */ - /* - * Migration could be cancelled here, but - * as long as it is still not sure when it - * will be packed up, just let softclock() - * take care of it. - */ - cc->cc_exec_entity[direct].cc_waiting = true; - DROP_GIANT(); - CC_UNLOCK(cc); - sleepq_add( - &cc->cc_exec_entity[direct].cc_waiting, - &cc->cc_lock.lock_object, "codrain", - SLEEPQ_SLEEP, 0); - sleepq_wait( - &cc->cc_exec_entity[direct].cc_waiting, - 0); - sq_locked = 0; - old_cc = NULL; + while (cc->cc_exec_entity[direct].cc_curr == c) + msleep_spin(c, (struct mtx *)&cc->cc_lock, "codrain", 0); - /* Reacquire locks previously released. */ - PICKUP_GIANT(); - CC_LOCK(cc); - } - } else if (use_lock && - !cc->cc_exec_entity[direct].cc_cancel) { - /* - * The current callout is waiting for its - * lock which we hold. Cancel the callout - * and return. After our caller drops the - * lock, the callout will be skipped in - * softclock(). 
- */ - cc->cc_exec_entity[direct].cc_cancel = true; - CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", - c, c->c_func, c->c_arg); - KASSERT(!cc_cce_migrating(cc, direct), - ("callout wrongly scheduled for migration")); - CC_UNLOCK(cc); - KASSERT(!sq_locked, ("sleepqueue chain locked")); - return (1); - } else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) { - c->c_flags &= ~CALLOUT_DFRMIGRATION; - CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p", - c, c->c_func, c->c_arg); - CC_UNLOCK(cc); - return (1); - } - CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p", - c, c->c_func, c->c_arg); CC_UNLOCK(cc); - KASSERT(!sq_locked, ("sleepqueue chain still locked")); - return (0); + } else { + callout_unlock_client(c->c_flags, c->c_lock); } - if (sq_locked) - sleepq_release(&cc->cc_exec_entity[direct].cc_waiting); - c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING); - CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p", c, c->c_func, c->c_arg); - if ((c->c_flags & CALLOUT_PROCESSED) == 0) { - if (cc->cc_exec_next_dir == c) - cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le); - LIST_REMOVE(c, c_links.le); - } else - TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe); - callout_cc_del(c, cc); - CC_UNLOCK(cc); - return (1); + return (cancelled & 1); } void callout_init(struct callout *c, int mpsafe) { - bzero(c, sizeof *c); if (mpsafe) { - c->c_lock = NULL; - c->c_flags = CALLOUT_RETURNUNLOCKED; + _callout_init_lock(c, NULL, CALLOUT_RETURNUNLOCKED); } else { - c->c_lock = &Giant.lock_object; - c->c_flags = 0; + _callout_init_lock(c, &Giant.lock_object, 0); } - c->c_cpu = timeout_cpu; } void @@ -1283,15 +1262,26 @@ _callout_init_lock(struct callout *c, struct lock_object *lock, int flags) { bzero(c, sizeof *c); + KASSERT((flags & ~CALLOUT_RETURNUNLOCKED) == 0, + ("callout_init_lock: bad flags 0x%08x", flags)); + flags &= CALLOUT_RETURNUNLOCKED; + if (lock != NULL) { + struct lock_class *class = LOCK_CLASS(lock); + if (class == &lock_class_mtx_sleep) + flags |= CALLOUT_SET_LC(CALLOUT_LC_MUTEX); + else if (class == &lock_class_mtx_spin) + flags |= CALLOUT_SET_LC(CALLOUT_LC_SPIN); + else if (class == &lock_class_rm) + flags |= CALLOUT_SET_LC(CALLOUT_LC_RM); + else if (class == &lock_class_rw) + flags |= CALLOUT_SET_LC(CALLOUT_LC_RW); + else + panic("callout_init_lock: Unsupported lock class '%s'\n", class->lc_name); + } else { + flags |= CALLOUT_SET_LC(CALLOUT_LC_UNUSED_0); + } c->c_lock = lock; - KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK)) == 0, - ("callout_init_lock: bad flags %d", flags)); - KASSERT(lock != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0, - ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock")); - KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags & - (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class", - __func__)); - c->c_flags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK); + c->c_flags = flags; c->c_cpu = timeout_cpu; } Index: kern/subr_sleepqueue.c =================================================================== --- kern/subr_sleepqueue.c +++ kern/subr_sleepqueue.c @@ -152,7 +152,8 @@ */ static int sleepq_catch_signals(void *wchan, int pri); static int sleepq_check_signals(void); -static int sleepq_check_timeout(void); +static int sleepq_check_timeout(struct thread *); +static void sleepq_stop_timeout(struct thread *); #ifdef INVARIANTS static void sleepq_dtor(void *mem, int size, void *arg); #endif @@ -373,17 +374,14 @@ sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr, int flags) { - struct sleepqueue_chain *sc; struct thread 
*td; td = curthread; - sc = SC_LOOKUP(wchan); - mtx_assert(&sc->sc_lock, MA_OWNED); - MPASS(TD_ON_SLEEPQ(td)); - MPASS(td->td_sleepqueue == NULL); - MPASS(wchan != NULL); + + mtx_lock_spin(&td->td_slpmutex); callout_reset_sbt_on(&td->td_slpcallout, sbt, pr, sleepq_timeout, td, PCPU_GET(cpuid), flags | C_DIRECT_EXEC); + mtx_unlock_spin(&td->td_slpmutex); } /* @@ -559,11 +557,8 @@ * Check to see if we timed out. */ static int -sleepq_check_timeout(void) +sleepq_check_timeout(struct thread *td) { - struct thread *td; - - td = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); /* @@ -573,28 +568,21 @@ td->td_flags &= ~TDF_TIMEOUT; return (EWOULDBLOCK); } - - /* - * If TDF_TIMOFAIL is set, the timeout ran after we had - * already been woken up. - */ - if (td->td_flags & TDF_TIMOFAIL) - td->td_flags &= ~TDF_TIMOFAIL; - - /* - * If callout_stop() fails, then the timeout is running on - * another CPU, so synchronize with it to avoid having it - * accidentally wake up a subsequent sleep. - */ - else if (callout_stop(&td->td_slpcallout) == 0) { - td->td_flags |= TDF_TIMEOUT; - TD_SET_SLEEPING(td); - mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL); - } return (0); } /* + * Atomically stop the timeout by using a mutex. + */ +static void +sleepq_stop_timeout(struct thread *td) +{ + mtx_lock_spin(&td->td_slpmutex); + callout_stop(&td->td_slpcallout); + mtx_unlock_spin(&td->td_slpmutex); +} + +/* * Check to see if we were awoken by a signal. */ static int @@ -664,9 +652,11 @@ MPASS(!(td->td_flags & TDF_SINTR)); thread_lock(td); sleepq_switch(wchan, pri); - rval = sleepq_check_timeout(); + rval = sleepq_check_timeout(td); thread_unlock(td); + sleepq_stop_timeout(td); + return (rval); } @@ -677,12 +667,18 @@ int sleepq_timedwait_sig(void *wchan, int pri) { + struct thread *td; int rcatch, rvalt, rvals; + td = curthread; + rcatch = sleepq_catch_signals(wchan, pri); - rvalt = sleepq_check_timeout(); + rvalt = sleepq_check_timeout(td); rvals = sleepq_check_signals(); - thread_unlock(curthread); + thread_unlock(td); + + sleepq_stop_timeout(td); + if (rcatch) return (rcatch); if (rvals) @@ -889,64 +885,49 @@ static void sleepq_timeout(void *arg) { - struct sleepqueue_chain *sc; - struct sleepqueue *sq; - struct thread *td; - void *wchan; - int wakeup_swapper; + struct thread *td = arg; + int wakeup_swapper = 0; - td = arg; - wakeup_swapper = 0; CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)", (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name); - /* - * First, see if the thread is asleep and get the wait channel if - * it is. - */ + /* Handle the three cases which can happen */ + thread_lock(td); - if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) { - wchan = td->td_wchan; - sc = SC_LOOKUP(wchan); - THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock); - sq = sleepq_lookup(wchan); - MPASS(sq != NULL); - td->td_flags |= TDF_TIMEOUT; - wakeup_swapper = sleepq_resume_thread(sq, td, 0); - thread_unlock(td); - if (wakeup_swapper) - kick_proc0(); - return; - } + if (TD_ON_SLEEPQ(td)) { + if (TD_IS_SLEEPING(td)) { + struct sleepqueue_chain *sc; + struct sleepqueue *sq; + void *wchan; - /* - * If the thread is on the SLEEPQ but isn't sleeping yet, it - * can either be on another CPU in between sleepq_add() and - * one of the sleepq_*wait*() routines or it can be in - * sleepq_catch_signals(). 
- */ - if (TD_ON_SLEEPQ(td)) { - td->td_flags |= TDF_TIMEOUT; - thread_unlock(td); - return; + /* + * Case I - thread is asleep and needs to be + * awoken: + */ + wchan = td->td_wchan; + sc = SC_LOOKUP(wchan); + THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock); + sq = sleepq_lookup(wchan); + MPASS(sq != NULL); + td->td_flags |= TDF_TIMEOUT; + wakeup_swapper = sleepq_resume_thread(sq, td, 0); + } else { + /* + * Case II - cancel going to sleep by setting + * the timeout flag because the target thread + * is not asleep yet. It can be on another CPU + * in between sleepq_add() and one of the + * sleepq_*wait*() routines or it can be in + * sleepq_catch_signals(). + */ + td->td_flags |= TDF_TIMEOUT; + } + } else { + /* + * Case III - thread is already woken up by a wakeup + * call and should not timeout. Nothing to do! + */ } - - /* - * Now check for the edge cases. First, if TDF_TIMEOUT is set, - * then the other thread has already yielded to us, so clear - * the flag and resume it. If TDF_TIMEOUT is not set, then the - * we know that the other thread is not on a sleep queue, but it - * hasn't resumed execution yet. In that case, set TDF_TIMOFAIL - * to let it know that the timeout has already run and doesn't - * need to be canceled. - */ - if (td->td_flags & TDF_TIMEOUT) { - MPASS(TD_IS_SLEEPING(td)); - td->td_flags &= ~TDF_TIMEOUT; - TD_CLR_SLEEPING(td); - wakeup_swapper = setrunnable(td); - } else - td->td_flags |= TDF_TIMOFAIL; thread_unlock(td); if (wakeup_swapper) kick_proc0(); Index: ofed/include/linux/completion.h =================================================================== --- ofed/include/linux/completion.h +++ ofed/include/linux/completion.h @@ -105,7 +105,9 @@ if (c->done) break; sleepq_add(c, NULL, "completion", flags, 0); + sleepq_release(c); sleepq_set_timeout(c, end - ticks); + sleepq_lock(c); if (flags & SLEEPQ_INTERRUPTIBLE) { if (sleepq_timedwait_sig(c, 0) != 0) return (-ERESTARTSYS); Index: sys/_callout.h =================================================================== --- sys/_callout.h +++ sys/_callout.h @@ -46,6 +46,17 @@ SLIST_HEAD(callout_slist, callout); TAILQ_HEAD(callout_tailq, callout); +typedef void callout_func_t(void *); + +struct callout_args { + sbintime_t time; /* absolute time for the event */ + sbintime_t precision; /* delta allowed wrt opt */ + void *arg; /* function argument */ + callout_func_t *func; /* function to call */ + int flags; /* flags passed to callout_reset() */ + int cpu; /* CPU we're scheduled on */ +}; + struct callout { union { LIST_ENTRY(callout) le; @@ -52,13 +63,13 @@ SLIST_ENTRY(callout) sle; TAILQ_ENTRY(callout) tqe; } c_links; - sbintime_t c_time; /* ticks to the event */ + sbintime_t c_time; /* absolute time for the event */ sbintime_t c_precision; /* delta allowed wrt opt */ void *c_arg; /* function argument */ - void (*c_func)(void *); /* function to call */ - struct lock_object *c_lock; /* lock to handle */ + callout_func_t *c_func; /* function to call */ + struct lock_object *c_lock; /* callback lock */ int c_flags; /* state of this entry */ - volatile int c_cpu; /* CPU we're scheduled on */ + int c_cpu; /* CPU we're scheduled on */ }; #endif Index: sys/callout.h =================================================================== --- sys/callout.h +++ sys/callout.h @@ -45,10 +45,12 @@ #define CALLOUT_PENDING 0x0004 /* callout is waiting for timeout */ #define CALLOUT_MPSAFE 0x0008 /* callout handler is mp safe */ #define CALLOUT_RETURNUNLOCKED 0x0010 /* handler returns with mtx unlocked */ -#define CALLOUT_SHAREDLOCK 0x0020 /* 
callout lock held in shared mode */ -#define CALLOUT_DFRMIGRATION 0x0040 /* callout in deferred migration mode */ +#define CALLOUT_UNUSED_5 0x0020 /* --available-- */ +#define CALLOUT_DEFRESTART 0x0040 /* callout restart is deferred */ #define CALLOUT_PROCESSED 0x0080 /* callout in wheel or processing list? */ #define CALLOUT_DIRECT 0x0100 /* allow exec from hw int context */ +#define CALLOUT_SET_LC(x) (((x) & 7) << 16) /* set lock class */ +#define CALLOUT_GET_LC(x) (((x) >> 16) & 7) /* get lock class */ #define C_DIRECT_EXEC 0x0001 /* direct execution of callout */ #define C_PRELBITS 7 @@ -65,7 +67,8 @@ #ifdef _KERNEL #define callout_active(c) ((c)->c_flags & CALLOUT_ACTIVE) #define callout_deactivate(c) ((c)->c_flags &= ~CALLOUT_ACTIVE) -#define callout_drain(c) _callout_stop_safe(c, 1) +int callout_drain(struct callout *); +int callout_drain_async(struct callout *, callout_func_t *, void *); void callout_init(struct callout *, int); void _callout_init_lock(struct callout *, struct lock_object *, int); #define callout_init_mtx(c, mtx, flags) \ @@ -79,7 +82,7 @@ NULL, (flags)) #define callout_pending(c) ((c)->c_flags & CALLOUT_PENDING) int callout_reset_sbt_on(struct callout *, sbintime_t, sbintime_t, - void (*)(void *), void *, int, int); + callout_func_t *, void *, int, int); #define callout_reset_sbt(c, sbt, pr, fn, arg, flags) \ callout_reset_sbt_on((c), (sbt), (pr), (fn), (arg), (c)->c_cpu, (flags)) #define callout_reset_sbt_curcpu(c, sbt, pr, fn, arg, flags) \ @@ -103,8 +106,7 @@ int callout_schedule_on(struct callout *, int, int); #define callout_schedule_curcpu(c, on_tick) \ callout_schedule_on((c), (on_tick), PCPU_GET(cpuid)) -#define callout_stop(c) _callout_stop_safe(c, 0) -int _callout_stop_safe(struct callout *, int); +int callout_stop(struct callout *); void callout_process(sbintime_t now); #endif Index: sys/proc.h =================================================================== --- sys/proc.h +++ sys/proc.h @@ -308,6 +308,7 @@ } td_uretoff; /* (k) Syscall aux returns. */ #define td_retval td_uretoff.tdu_retval struct callout td_slpcallout; /* (h) Callout for sleep. */ + struct mtx td_slpmutex; /* (h) Mutex for sleep callout */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ @@ -364,7 +365,7 @@ #define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ -#define TDF_TIMOFAIL 0x00001000 /* Timeout from sleep after we were awake. */ +#define TDF_UNUSED12 0x00001000 /* --available-- */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_NEEDSUSPCHK 0x00008000 /* Thread may need to suspend. */ @@ -706,7 +707,7 @@ #define SWT_OWEPREEMPT 2 /* Switching due to opepreempt. */ #define SWT_TURNSTILE 3 /* Turnstile contention. */ #define SWT_SLEEPQ 4 /* Sleepq wait. */ -#define SWT_SLEEPQTIMO 5 /* Sleepq timeout wait. */ +#define SWT_UNUSED5 5 /* --available-- */ #define SWT_RELINQUISH 6 /* yield call. */ #define SWT_NEEDRESCHED 7 /* NEEDRESCHED was set. */ #define SWT_IDLE 8 /* Switching from the idle thread. */