Index: sys/kern/subr_epoch.c
===================================================================
--- sys/kern/subr_epoch.c
+++ sys/kern/subr_epoch.c
@@ -66,16 +66,18 @@
 #define EPOCH_ALIGN CACHE_LINE_SIZE
 #endif

-TAILQ_HEAD (epoch_tdlist, epoch_tracker);
+TAILQ_HEAD(epoch_tdlist, epoch_tracker);
 typedef struct epoch_record {
         ck_epoch_record_t er_record;
-        struct epoch_context er_drain_ctx;
         struct epoch *er_parent;
-        volatile struct epoch_tdlist er_tdlist;
-        volatile uint32_t er_gen;
+        struct epoch_tdlist er_tdlist;
+        struct thread *er_firsttd;
+        struct thread *er_blockedtd;
+        struct mtx er_lock;
+        struct lock_object er_lo;
         uint32_t er_cpuid;
         int er_drain_state;
-} __aligned(EPOCH_ALIGN)        *epoch_record_t;
+} __aligned(EPOCH_ALIGN) *epoch_record_t;

 #define EPOCH_DRAIN_START 2
 #define EPOCH_DRAIN_RUNNING 1
@@ -91,8 +93,6 @@
         const char *e_name;
 };

-/* arbitrary --- needs benchmarking */
-#define MAX_ADAPTIVE_SPIN 100
 #define MAX_EPOCHS 64

 CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context));
@@ -101,33 +101,22 @@
 SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "epoch stats");

-/* Stats. */
-static counter_u64_t block_count;
-
-SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW,
-    &block_count, "# of times a thread was in an epoch when epoch_wait was called");
-static counter_u64_t migrate_count;
-
-SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW,
-    &migrate_count, "# of times thread was migrated to another CPU in epoch_wait");
-static counter_u64_t turnstile_count;
-
-SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW,
-    &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait");
-static counter_u64_t switch_count;
-
-SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW,
-    &switch_count, "# of times a thread voluntarily context switched in epoch_wait");
-static counter_u64_t epoch_call_count;
-
-SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW,
-    &epoch_call_count, "# of times a callback was deferred");
-static counter_u64_t epoch_call_task_count;
-
-SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW,
-    &epoch_call_task_count, "# of times a callback task was run");
-
-TAILQ_HEAD (threadlist, thread);
+static COUNTER_U64_DEFINE_EARLY(block_count);
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, blocked, CTLFLAG_RW,
+    &block_count,
+    "Number of times a thread was in an epoch when epoch_wait was called");
+static COUNTER_U64_DEFINE_EARLY(turnstile_count);
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, contended, CTLFLAG_RW,
+    &turnstile_count,
+    "Number of times a thread was blocked on a lock in an epoch during an epoch_wait");
+static COUNTER_U64_DEFINE_EARLY(call_count);
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, calls, CTLFLAG_RW,
+    &call_count,
+    "Number of times a callback was deferred");
+static COUNTER_U64_DEFINE_EARLY(call_task_count);
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, call_tasks, CTLFLAG_RW,
+    &call_task_count,
+    "Number of times a callback task was run");

 CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
     ck_epoch_entry_container)
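For context on the counter conversion in the hunk above: COUNTER_U64_DEFINE_EARLY() gives a counter that is statically backed and safe to update from very early boot, which is what lets a later hunk drop the counter_u64_alloc() calls from epoch_init(). A minimal sketch of the pattern, not part of this diff (the example_count counter and the "example" sysctl name are hypothetical; it assumes the _kern_epoch_stats node declared above is in scope):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/sysctl.h>

/* Hypothetical counter following the same pattern as block_count above. */
static COUNTER_U64_DEFINE_EARLY(example_count);
SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, example, CTLFLAG_RW,
    &example_count, "Number of times the example event happened");

static void
example_event(void)
{
        /* Usable even before the SI_SUB_COUNTER SYSINIT has run. */
        counter_u64_add(example_count, 1);
}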
@@ -186,7 +175,6 @@
         va_list ap;
         struct stackentry se, *new;

-        stack_zero(&se.se_stack);        /* XXX: is it really needed? */
         stack_save(&se.se_stack);

         /* Tree is never reduced - go lockless. */
@@ -265,13 +253,6 @@
 {
         int cpu;

-        block_count = counter_u64_alloc(M_WAITOK);
-        migrate_count = counter_u64_alloc(M_WAITOK);
-        turnstile_count = counter_u64_alloc(M_WAITOK);
-        switch_count = counter_u64_alloc(M_WAITOK);
-        epoch_call_count = counter_u64_alloc(M_WAITOK);
-        epoch_call_task_count = counter_u64_alloc(M_WAITOK);
-
         pcpu_zone_record = uma_zcreate("epoch_record pcpu",
             sizeof(struct epoch_record), NULL, NULL, NULL, NULL,
             UMA_ALIGN_PTR, UMA_ZONE_PCPU);
@@ -306,24 +287,39 @@
         epoch_record_t er;
         int cpu;

-        epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK);
+        epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK |
+            M_ZERO);
         CPU_FOREACH(cpu) {
                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
-                bzero(er, sizeof(*er));
                 ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
-                TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist);
+                mtx_init(&er->er_lock, "epoch wait", NULL, MTX_DEF);
+                er->er_lo.lo_name = epoch->e_name;
+                TAILQ_INIT(&er->er_tdlist);
                 er->er_cpuid = cpu;
                 er->er_parent = epoch;
         }
 }

+/*
+ * Slow path for epoch_exit_preempt(): wake up blocked threads that have
+ * propagated their scheduling priority to us.
+ */
 static void
-epoch_adjust_prio(struct thread *td, u_char prio)
+epoch_unblock(epoch_record_t er)
 {
+        struct turnstile *ts;

-        thread_lock(td);
-        sched_prio(td, prio);
-        thread_unlock(td);
+        KASSERT(er->er_blockedtd == curthread,
+            ("%s: unblocking from wrong thread", __func__));
+
+        mtx_lock(&er->er_lock);
+        turnstile_chain_lock(&er->er_lo);
+        ts = turnstile_lookup(&er->er_lo);
+        turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
+        turnstile_unpend(ts);
+        turnstile_chain_unlock(&er->er_lo);
+        er->er_blockedtd = NULL;
+        mtx_unlock(&er->er_lock);
 }

 epoch_t
@@ -391,9 +387,9 @@
         THREAD_NO_SLEEPING();
         critical_enter();
         sched_pin();
-        td->td_pre_epoch_prio = td->td_priority;
         er = epoch_currecord(epoch);
         TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
+        er->er_firsttd = TAILQ_FIRST(&er->er_tdlist)->et_td;
         ck_epoch_begin(&er->er_record, &et->et_section);
         critical_exit();
 }
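The enter path above records the oldest tracker's thread in er_firsttd so that the wait side can later target it. For reference, a minimal reader-side sketch of how these functions are used, not part of this diff (foo_epoch and foo_read are hypothetical; the epoch would come from epoch_alloc("foo", EPOCH_PREEMPT)):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/epoch.h>

static epoch_t foo_epoch;        /* hypothetical, from epoch_alloc() */

static void
foo_read(void)
{
        struct epoch_tracker et;        /* queued on this CPU's er_tdlist */

        epoch_enter_preempt(foo_epoch, &et);
        /* Lockless reads of epoch-protected data; sleeping is not allowed. */
        epoch_exit_preempt(foo_epoch, &et);
}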
@@ -414,26 +410,27 @@
 _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
 {
         struct epoch_record *er;
+        struct epoch_tracker *fet;
         struct thread *td;

         INIT_CHECK(epoch);
         td = curthread;
+        THREAD_SLEEPING_OK();
+
         critical_enter();
         sched_unpin();
-        THREAD_SLEEPING_OK();
         er = epoch_currecord(epoch);
+
         MPASS(epoch->e_flags & EPOCH_PREEMPT);
-        MPASS(et != NULL);
         MPASS(et->et_td == td);
-#ifdef INVARIANTS
-        et->et_td = (void*)0xDEADBEEF;
-#endif
+
         ck_epoch_end(&er->er_record, &et->et_section);
         TAILQ_REMOVE(&er->er_tdlist, et, et_link);
-        er->er_gen++;
-        if (__predict_false(td->td_pre_epoch_prio != td->td_priority))
-                epoch_adjust_prio(td, td->td_pre_epoch_prio);
+        fet = TAILQ_FIRST(&er->er_tdlist);
+        er->er_firsttd = fet != NULL ? fet->et_td : NULL;
         critical_exit();
+        if (__predict_false(er->er_blockedtd == td))
+                epoch_unblock(er);
 #ifdef EPOCH_TRACE
         epoch_trace_exit(td, epoch, et, file, line);
 #endif
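The hunk below rewrites epoch_block_handler_preempt(), which ck_epoch_synchronize_wait() invokes from epoch_wait_preempt() while a grace period has not yet elapsed. For reference, a minimal writer-side sketch of the path that ends up there, not part of this diff (struct foo, foo_epoch and foo_destroy are hypothetical):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/queue.h>

struct foo {
        LIST_ENTRY(foo) f_link;
};

static epoch_t foo_epoch;        /* hypothetical, from epoch_alloc() */

static void
foo_destroy(struct foo *f)
{
        /* Unpublish under the writer's lock (not shown). */
        LIST_REMOVE(f, f_link);

        /*
         * Block until every reader that could still see 'f' has left its
         * epoch section; the block handler below runs when such a reader
         * has been preempted off-CPU.
         */
        epoch_wait_preempt(foo_epoch);
        free(f, M_TEMP);
}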
@@ -458,148 +455,61 @@
 epoch_block_handler_preempt(struct ck_epoch *global __unused,
     ck_epoch_record_t *cr, void *arg __unused)
 {
-        epoch_record_t record;
-        struct thread *td, *owner, *curwaittd;
-        struct epoch_tracker *tdwait;
+        struct epoch_record *er;
+        struct thread *td;
         struct turnstile *ts;
-        struct lock_object *lock;
-        int spincount, gen;
-        int locksheld __unused;

-        record = __containerof(cr, struct epoch_record, er_record);
-        td = curthread;
-        locksheld = td->td_locks;
-        spincount = 0;
         counter_u64_add(block_count, 1);
-        /*
-         * We lost a race and there's no longer any threads
-         * on the CPU in an epoch section.
-         */
-        if (TAILQ_EMPTY(&record->er_tdlist))
-                return;

-        if (record->er_cpuid != curcpu) {
+        er = __containerof(cr, struct epoch_record, er_record);
+
+        td = er->er_firsttd;
+        if (td == NULL)
+                return;
+        if (TD_IS_RUNNING(td)) {
                 /*
-                 * If the head of the list is running, we can wait for it
-                 * to remove itself from the list and thus save us the
-                 * overhead of a migration
+                 * There is nothing useful we can do until this thread exits the
+                 * epoch.
                 */
-                gen = record->er_gen;
-                thread_unlock(td);
+                cpu_spinwait();
+                return;
+        }
+
+        mtx_lock(&er->er_lock);
+        if (er->er_blockedtd == NULL) {
                 /*
-                 * We can't actually check if the waiting thread is running
-                 * so we simply poll for it to exit before giving up and
-                 * migrating.
+                 * A thread in the target epoch is off-CPU.  Prepare to make it
+                 * the owner of this CPU's turnstile so that we can lend
+                 * priority.  Ensure that it will wake us up upon exiting the
+                 * section, using the thread lock to ensure that it doesn't get
+                 * scheduled and exit the section before we're ready.
                 */
-                do {
-                        cpu_spinwait();
-                } while (!TAILQ_EMPTY(&record->er_tdlist) &&
-                    gen == record->er_gen &&
-                    spincount++ < MAX_ADAPTIVE_SPIN);
                 thread_lock(td);
-                /*
-                 * If the generation has changed we can poll again
-                 * otherwise we need to migrate.
-                 */
-                if (gen != record->er_gen)
+                if (TD_IS_RUNNING(td) || td != er->er_firsttd) {
+                        thread_unlock(td);
+                        mtx_unlock(&er->er_lock);
                         return;
+                }
+                er->er_blockedtd = td;
+                thread_unlock(td);
+        } else {
                 /*
-                 * Being on the same CPU as that of the record on which
-                 * we need to wait allows us access to the thread
-                 * list associated with that CPU. We can then examine the
-                 * oldest thread in the queue and wait on its turnstile
-                 * until it resumes and so on until a grace period
-                 * elapses.
-                 *
-                 */
-                counter_u64_add(migrate_count, 1);
-                sched_bind(td, record->er_cpuid);
-                /*
-                 * At this point we need to return to the ck code
-                 * to scan to see if a grace period has elapsed.
-                 * We can't move on to check the thread list, because
-                 * in the meantime new threads may have arrived that
-                 * in fact belong to a different epoch.
+                 * At least one other thread is blocked waiting for a thread to
+                 * exit the target epoch.  Join it.
                 */
-                return;
+                td = er->er_blockedtd;
         }
-        /*
-         * Try to find a thread in an epoch section on this CPU
-         * waiting on a turnstile. Otherwise find the lowest
-         * priority thread (highest prio value) and drop our priority
-         * to match to allow it to run.
-         */
-        TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) {
-                /*
-                 * Propagate our priority to any other waiters to prevent us
-                 * from starving them. They will have their original priority
-                 * restore on exit from epoch_wait().
-                 */
-                curwaittd = tdwait->et_td;
-                if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) {
-                        critical_enter();
-                        thread_unlock(td);
-                        thread_lock(curwaittd);
-                        sched_prio(curwaittd, td->td_priority);
-                        thread_unlock(curwaittd);
-                        thread_lock(td);
-                        critical_exit();
-                }
-                if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) &&
-                    ((ts = curwaittd->td_blocked) != NULL)) {
-                        /*
-                         * We unlock td to allow turnstile_wait to reacquire
-                         * the thread lock. Before unlocking it we enter a
-                         * critical section to prevent preemption after we
-                         * reenable interrupts by dropping the thread lock in
-                         * order to prevent curwaittd from getting to run.
-                         */
-                        critical_enter();
-                        thread_unlock(td);
+        ts = turnstile_trywait(&er->er_lo);
+        mtx_unlock(&er->er_lock);

-                        if (turnstile_lock(ts, &lock, &owner)) {
-                                if (ts == curwaittd->td_blocked) {
-                                        MPASS(TD_IS_INHIBITED(curwaittd) &&
-                                            TD_ON_LOCK(curwaittd));
-                                        critical_exit();
-                                        turnstile_wait(ts, owner,
-                                            curwaittd->td_tsqueue);
-                                        counter_u64_add(turnstile_count, 1);
-                                        thread_lock(td);
-                                        return;
-                                }
-                                turnstile_unlock(ts, lock);
-                        }
-                        thread_lock(td);
-                        critical_exit();
-                        KASSERT(td->td_locks == locksheld,
-                            ("%d extra locks held", td->td_locks - locksheld));
-                }
-        }
-        /*
-         * We didn't find any threads actually blocked on a lock
-         * so we have nothing to do except context switch away.
-         */
-        counter_u64_add(switch_count, 1);
-        mi_switch(SW_VOL | SWT_RELINQUISH);
-        /*
-         * It is important the thread lock is dropped while yielding
-         * to allow other threads to acquire the lock pointed to by
-         * TDQ_LOCKPTR(td). Currently mi_switch() will unlock the
-         * thread lock before returning. Else a deadlock like
-         * situation might happen.
-         */
-        thread_lock(td);
+        counter_u64_add(turnstile_count, 1);
+        turnstile_wait(ts, td, TS_EXCLUSIVE_QUEUE);
 }

 void
 epoch_wait_preempt(epoch_t epoch)
 {
         struct thread *td;
-        int was_bound;
-        int old_cpu;
-        int old_pinned;
-        u_char old_prio;
         int locks __unused;

         MPASS(cold || epoch != NULL);
@@ -615,34 +525,10 @@
             "of an epoch section of the same epoch"));
 #endif
         DROP_GIANT();
-        thread_lock(td);
-
-        old_cpu = PCPU_GET(cpuid);
-        old_pinned = td->td_pinned;
-        old_prio = td->td_priority;
-        was_bound = sched_is_bound(td);
-        sched_unbind(td);
-        td->td_pinned = 0;
-        sched_bind(td, old_cpu);

         ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt,
             NULL);

-        /* restore CPU binding, if any */
-        if (was_bound != 0) {
-                sched_bind(td, old_cpu);
-        } else {
-                /* get thread back to initial CPU, if any */
-                if (old_pinned != 0)
-                        sched_bind(td, old_cpu);
-                sched_unbind(td);
-        }
-        /* restore pinned after bind */
-        td->td_pinned = old_pinned;
-
-        /* restore thread priority */
-        sched_prio(td, old_prio);
-        thread_unlock(td);
         PICKUP_GIANT();
         KASSERT(td->td_locks == locks,
             ("%d residual locks held", td->td_locks - locks));
@@ -731,8 +617,8 @@
         *DPCPU_PTR(epoch_cb_count) -= total;
         critical_exit();

-        counter_u64_add(epoch_call_count, total);
-        counter_u64_add(epoch_call_task_count, 1);
+        counter_u64_add(call_count, total);
+        counter_u64_add(call_task_count, 1);

         head = ck_stack_batch_pop_npsc(&cb_stack);
         for (cursor = head; cursor != NULL; cursor = next) {
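The subr_turnstile.c and sys/sys/turnstile.h changes that follow remove turnstile_lock() and turnstile_unlock(), whose only consumer was the old block handler deleted above; the new scheme uses only the remaining turnstile interface. A condensed sketch of that sleep/wake pairing, not part of this diff (example_sleep and example_wakeup are hypothetical names; error handling such as turnstile_cancel() is omitted):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/turnstile.h>

/*
 * Waiter side, as in the new epoch_block_handler_preempt(): park on the
 * turnstile keyed by 'lo' and lend our priority to 'owner' until it wakes
 * us.  turnstile_trywait() locks the turnstile chain for 'lo';
 * turnstile_wait() blocks and releases it.
 */
static void
example_sleep(struct lock_object *lo, struct thread *owner)
{
        struct turnstile *ts;

        ts = turnstile_trywait(lo);
        turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE);
}

/*
 * Owner side, as in epoch_unblock(): wake every thread parked on 'lo'.
 */
static void
example_wakeup(struct lock_object *lo)
{
        struct turnstile *ts;

        turnstile_chain_lock(lo);
        ts = turnstile_lookup(lo);
        if (ts != NULL) {
                turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
                turnstile_unpend(ts);
        }
        turnstile_chain_unlock(lo);
}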
Index: sys/kern/subr_turnstile.c
===================================================================
--- sys/kern/subr_turnstile.c
+++ sys/kern/subr_turnstile.c
@@ -590,41 +590,6 @@
         return (ts);
 }

-bool
-turnstile_lock(struct turnstile *ts, struct lock_object **lockp,
-    struct thread **tdp)
-{
-        struct turnstile_chain *tc;
-        struct lock_object *lock;
-        if ((lock = ts->ts_lockobj) == NULL)
-                return (false);
-        tc = TC_LOOKUP(lock);
-        mtx_lock_spin(&tc->tc_lock);
-        mtx_lock_spin(&ts->ts_lock);
-        if (__predict_false(lock != ts->ts_lockobj)) {
-                mtx_unlock_spin(&tc->tc_lock);
-                mtx_unlock_spin(&ts->ts_lock);
-                return (false);
-        }
-        *lockp = lock;
-        *tdp = ts->ts_owner;
-        return (true);
-}
-
-void
-turnstile_unlock(struct turnstile *ts, struct lock_object *lock)
-{
-        struct turnstile_chain *tc;
-
-        mtx_assert(&ts->ts_lock, MA_OWNED);
-        mtx_unlock_spin(&ts->ts_lock);
-        if (ts == curthread->td_turnstile)
-                ts->ts_lockobj = NULL;
-        tc = TC_LOOKUP(lock);
-        mtx_unlock_spin(&tc->tc_lock);
-}
-
 void
 turnstile_assert(struct turnstile *ts)
 {
Index: sys/sys/epoch.h
===================================================================
--- sys/sys/epoch.h
+++ sys/sys/epoch.h
@@ -61,7 +61,7 @@
         const char *et_file;
         int et_line;
 #endif
-} __aligned(sizeof(void *));
+};
 typedef struct epoch_tracker *epoch_tracker_t;

 epoch_t epoch_alloc(const char *name, int flags);
Index: sys/sys/proc.h
===================================================================
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -316,7 +316,6 @@
         u_char          td_pri_class;   /* (t) Scheduling class. */
         u_char          td_user_pri;    /* (t) User pri from estcpu and nice. */
         u_char          td_base_user_pri; /* (t) Base user pri */
-        u_char          td_pre_epoch_prio; /* (k) User pri on entry to epoch */
         uintptr_t       td_rb_list;     /* (k) Robust list head. */
         uintptr_t       td_rbp_list;    /* (k) Robust priv list head. */
         uintptr_t       td_rb_inact;    /* (k) Current in-action mutex loc. */
Index: sys/sys/turnstile.h
===================================================================
--- sys/sys/turnstile.h
+++ sys/sys/turnstile.h
@@ -99,9 +99,7 @@
 struct turnstile *turnstile_trywait(struct lock_object *);
 void    turnstile_unpend(struct turnstile *);
 void    turnstile_wait(struct turnstile *, struct thread *, int);
-bool    turnstile_lock(struct turnstile *, struct lock_object **,
-            struct thread **);
-void    turnstile_unlock(struct turnstile *, struct lock_object *);
 void    turnstile_assert(struct turnstile *);
+
 #endif  /* _KERNEL */
 #endif  /* _SYS_TURNSTILE_H_ */