diff --git a/share/man/man9/Makefile b/share/man/man9/Makefile
--- a/share/man/man9/Makefile
+++ b/share/man/man9/Makefile
@@ -1027,6 +1027,12 @@
 	epoch.9 epoch_enter.9 \
 	epoch.9 epoch_exit.9 \
 	epoch.9 epoch_wait.9 \
+	epoch.9 epoch_enter_preempt.9 \
+	epoch.9 epoch_exit_preempt.9 \
+	epoch.9 epoch_wait_preempt.9 \
+	epoch.9 epoch_enter_sleepable.9 \
+	epoch.9 epoch_exit_sleepable.9 \
+	epoch.9 epoch_wait_sleepable.9 \
 	epoch.9 epoch_call.9 \
 	epoch.9 epoch_drain_callbacks.9 \
 	epoch.9 in_epoch
diff --git a/share/man/man9/epoch.9 b/share/man/man9/epoch.9
--- a/share/man/man9/epoch.9
+++ b/share/man/man9/epoch.9
@@ -26,7 +26,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd April 30, 2020
+.Dd May 21, 2021
 .Dt EPOCH 9
 .Os
 .Sh NAME
@@ -40,6 +40,9 @@
 .Nm epoch_enter_preempt ,
 .Nm epoch_exit_preempt ,
 .Nm epoch_wait_preempt ,
+.Nm epoch_enter_sleepable ,
+.Nm epoch_exit_sleepable ,
+.Nm epoch_wait_sleepable ,
 .Nm epoch_call ,
 .Nm epoch_drain_callbacks ,
 .Nm in_epoch ,
@@ -83,6 +86,12 @@
 .Ft void
 .Fn epoch_wait_preempt "epoch_t epoch"
 .Ft void
+.Fn epoch_enter_sleepable "epoch_t epoch" "epoch_tracker_t et"
+.Ft void
+.Fn epoch_exit_sleepable "epoch_t epoch" "epoch_tracker_t et"
+.Ft void
+.Fn epoch_wait_sleepable "epoch_t epoch"
+.Ft void
 .Fn epoch_call "epoch_t epoch" "epoch_callback_t callback" "epoch_context_t ctx"
 .Ft void
 .Fn epoch_drain_callbacks "epoch_t epoch"
@@ -105,7 +114,9 @@
 kernel option is configured.
 By default, epochs do not allow preemption during sections.
 By default mutexes cannot be held across
-.Fn epoch_wait_preempt .
+.Fn epoch_wait_preempt
+and
+.Fn epoch_wait_sleepable .
 The
 .Fa flags
 specified are formed by
@@ -114,11 +125,17 @@
 .Bl -tag -offset indent -width Ds
 .It Dv EPOCH_LOCKED
 Permit holding mutexes across
-.Fn epoch_wait_preempt
-(requires
-.Dv EPOCH_PREEMPT ) .
+.Fn epoch_wait_preempt .
+Requires
+.Dv EPOCH_PREEMPT
+or
+.Dv EPOCH_SLEEPABLE .
 When doing this one must be cautious of creating a situation where a deadlock
 is possible.
+.It Dv EPOCH_CRITICAL
+The default non-preemptible
+.Vt epoch
+type.
 .It Dv EPOCH_PREEMPT
 The
 .Vt epoch
@@ -135,6 +152,21 @@
 and
 .Fn epoch_wait ,
 respectively.
+.It Dv EPOCH_SLEEPABLE
+The
+.Vt epoch
+will allow preemption and sleeping during sections.
+The functions
+.Fn epoch_enter_sleepable ,
+.Fn epoch_exit_sleepable ,
+and
+.Fn epoch_wait_sleepable
+must be used in place of
+.Fn epoch_enter ,
+.Fn epoch_exit ,
+and
+.Fn epoch_wait ,
+respectively.
 .El
 .Pp
 .Vt epoch Ns s
@@ -142,23 +174,26 @@
 .Fn epoch_free .
 .Pp
 Threads indicate the start of an epoch critical section by calling
-.Fn epoch_enter
-(or
+.Fn epoch_enter ,
 .Fn epoch_enter_preempt
-for preemptible epochs).
+for preemptible epochs, or
+.Fn epoch_enter_sleepable
+for sleepable epochs.
 Threads call
-.Fn epoch_exit
-(or
+.Fn epoch_exit ,
 .Fn epoch_exit_preempt
-for preemptible epochs)
-to indicate the end of a critical section.
+for preemptible epochs, or
+.Fn epoch_exit_sleepable
+for sleepable epochs, to indicate the end of a critical section.
 .Vt struct epoch_tracker Ns s
 are stack objects whose pointers are passed to
-.Fn epoch_enter_preempt
+.Fn epoch_enter_preempt ,
+.Fn epoch_exit_preempt ,
+.Fn epoch_enter_sleepable ,
 and
-.Fn epoch_exit_preempt
-(much like
-.Vt struct rm_priotracker ) .
+.Fn epoch_exit_sleepable ,
+much like
+.Vt struct rm_priotracker .
 .Pp
 Threads can defer work until a grace period has expired since any thread has
 entered the epoch either synchronously or asynchronously.
@@ -166,19 +201,25 @@
 defers work asynchronously by invoking the provided
 .Fa callback
 at a later time.
-.Fn epoch_wait
-(or
-.Fn epoch_wait_preempt )
+.Fn epoch_wait ,
+.Fn epoch_wait_preempt ,
+or
+.Fn epoch_wait_sleepable
 blocks the current thread until the grace period has expired and the work
 can be done safely.
 .Pp
-Default, non-preemptible epoch wait
-.Fn ( epoch_wait )
+Default, non-preemptible epoch wait,
+.Fn epoch_wait ,
+is guaranteed to have much shorter completion times relative to
+preemptible epoch wait,
+.Fn epoch_wait_preempt .
+In turn, the preemptible epoch wait,
+.Fn epoch_wait_preempt ,
 is guaranteed to have much shorter completion times relative to
-preemptible epoch wait
-.Fn ( epoch_wait_preempt ) .
-(In the default type, none of the threads in an epoch section will be preempted
-before completing its section.)
+sleepable epoch wait,
+.Fn epoch_wait_sleepable .
+In the default type, none of the threads in an epoch section will be preempted
+before completing its section.
 .Pp
 INVARIANTS can assert that a thread is in an epoch by using
 .Fn in_epoch .
@@ -191,9 +232,11 @@
 .Fn in_epoch_verbose "epoch" "1"
 provides additional verbose debugging information.
 .Pp
-The epoch API currently does not support sleeping in epoch_preempt sections.
 A caller should never call
-.Fn epoch_wait
+.Fn epoch_wait ,
+.Fn epoch_wait_preempt ,
+or
+.Fn epoch_wait_sleepable
 in the middle of an epoch section for the same epoch as this will lead to a deadlock.
 .Pp
 The
@@ -282,7 +325,9 @@
 .Fx 11.0 .
 .Sh CAVEATS
 One must be cautious when using
-.Fn epoch_wait_preempt .
+.Fn epoch_wait_preempt
+and
+.Fn epoch_wait_sleepable .
 Threads are pinned during epoch sections, so if a thread in a section is then
 preempted by a higher priority compute bound thread on that CPU, it can be
 prevented from leaving the section indefinitely.
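
For reviewers, here is a minimal usage sketch of the interface documented above.
It is not part of the patch; the epoch, list, structure, and malloc-type names
(example_epoch, foo_head, struct foo, M_FOO) are hypothetical, and writers are
assumed to be serialized by a lock that is not shown.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/ck.h>
#include <sys/epoch.h>

MALLOC_DEFINE(M_FOO, "foo", "sleepable epoch example");

struct foo {
	CK_LIST_ENTRY(foo) f_entry;
	int f_key;
	int f_value;
};

static CK_LIST_HEAD(, foo) foo_head;
static epoch_t example_epoch;

static void
example_init(void *arg __unused)
{
	CK_LIST_INIT(&foo_head);
	/* Sections of this epoch may be preempted and may sleep. */
	example_epoch = epoch_alloc("example sleepable", EPOCH_SLEEPABLE);
}
SYSINIT(example_epoch_init, SI_SUB_EPOCH, SI_ORDER_ANY, example_init, NULL);

/* Writer: insert a new element (serialization assumed elsewhere). */
static void
foo_insert(int key, int value)
{
	struct foo *fp;

	fp = malloc(sizeof(*fp), M_FOO, M_WAITOK | M_ZERO);
	fp->f_key = key;
	fp->f_value = value;
	CK_LIST_INSERT_HEAD(&foo_head, fp, f_entry);
}

/* Reader: the tracker must live on the caller's stack. */
static int
foo_get_value(int key)
{
	struct epoch_tracker et;
	struct foo *fp;
	int value;

	value = -1;
	epoch_enter_sleepable(example_epoch, &et);
	CK_LIST_FOREACH(fp, &foo_head, f_entry) {
		if (fp->f_key == key) {
			/* "fp" cannot be reclaimed while inside the section. */
			value = fp->f_value;
			break;
		}
	}
	epoch_exit_sleepable(example_epoch, &et);
	return (value);
}

/* Writer: unlink, wait for a grace period, then reclaim. */
static void
foo_remove(struct foo *fp)
{
	CK_LIST_REMOVE(fp, f_entry);
	epoch_wait_sleepable(example_epoch);
	free(fp, M_FOO);
}

As the manual page text above notes, epoch_wait_sleepable() may sleep for a long
time and must never be called from within a section of the same epoch.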
diff --git a/sys/kern/subr_epoch.c b/sys/kern/subr_epoch.c
--- a/sys/kern/subr_epoch.c
+++ b/sys/kern/subr_epoch.c
@@ -2,6 +2,7 @@
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2018, Matthew Macy
+ * Copyright (c) 2017-2021, Hans Petter Selasky
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -138,6 +139,7 @@
 static __read_mostly int inited;
 __read_mostly epoch_t global_epoch;
 __read_mostly epoch_t global_epoch_preempt;
+__read_mostly epoch_t global_epoch_sleepable;

 static void epoch_call_task(void *context __unused);
 static uma_zone_t pcpu_zone_record;
@@ -291,8 +293,9 @@
 #endif
 	sx_init(&epoch_sx, "epoch-sx");
 	inited = 1;
-	global_epoch = epoch_alloc("Global", 0);
+	global_epoch = epoch_alloc("Global critical", EPOCH_CRITICAL);
 	global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT);
+	global_epoch_sleepable = epoch_alloc("Global sleepable", EPOCH_SLEEPABLE);
 }

 SYSINIT(epoch, SI_SUB_EPOCH, SI_ORDER_FIRST, epoch_init, NULL);
@@ -338,6 +341,7 @@
 	int i;

 	MPASS(name != NULL);
+	MPASS((flags & EPOCH_TYPE_MASK) != EPOCH_RESERVED);

 	if (__predict_false(!inited))
 		panic("%s called too early in boot", __func__);
@@ -446,9 +450,8 @@
 	MPASS((vm_offset_t)et >= td->td_kstack &&
 	    (vm_offset_t)et + sizeof(struct epoch_tracker) <=
 	    td->td_kstack + td->td_kstack_pages * PAGE_SIZE);

-	INIT_CHECK(epoch);
-	MPASS(epoch->e_flags & EPOCH_PREEMPT);
+	MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_PREEMPT);

 #ifdef EPOCH_TRACE
 	epoch_trace_enter(td, epoch, et, file, line);
@@ -466,6 +469,37 @@
 	critical_exit();
 }

+void
+_epoch_enter_sleepable(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
+{
+	struct epoch_record *er;
+	struct thread *td;
+
+	MPASS(cold || epoch != NULL);
+	td = curthread;
+	MPASS((vm_offset_t)et >= td->td_kstack &&
+	    (vm_offset_t)et + sizeof(struct epoch_tracker) <=
+	    td->td_kstack + td->td_kstack_pages * PAGE_SIZE);
+
+	INIT_CHECK(epoch);
+	MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_SLEEPABLE);
+
+#ifdef EPOCH_TRACE
+	epoch_trace_enter(td, epoch, et, file, line);
+#endif
+	et->et_td = td;
+	et->et_old_priority = 0;	/* not used */
+
+	critical_enter();
+	sched_pin();
+	er = epoch_currecord(epoch);
+	/* Record-level tracking is reserved for non-preemptible epochs. */
+	MPASS(er->er_td == NULL);
+	TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
+	ck_epoch_begin(&er->er_record, &et->et_section);
+	critical_exit();
+}
+
 void
 epoch_enter(epoch_t epoch)
 {
@@ -499,7 +533,7 @@
 	sched_unpin();
 	THREAD_SLEEPING_OK();
 	er = epoch_currecord(epoch);
-	MPASS(epoch->e_flags & EPOCH_PREEMPT);
+	MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_PREEMPT);
 	MPASS(et != NULL);
 	MPASS(et->et_td == td);
 #ifdef INVARIANTS
@@ -518,6 +552,35 @@
 #endif
 }

+void
+_epoch_exit_sleepable(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
+{
+	struct epoch_record *er;
+	struct thread *td;
+
+	INIT_CHECK(epoch);
+	td = curthread;
+	critical_enter();
+	sched_unpin();
+	er = epoch_currecord(epoch);
+	MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_SLEEPABLE);
+	MPASS(et != NULL);
+	MPASS(et->et_td == td);
+#ifdef INVARIANTS
+	et->et_td = (void*)0xDEADBEEF;
+	MPASS(et->et_old_priority == 0);
+	/* Record-level tracking is reserved for non-preemptible epochs. */
+	MPASS(er->er_td == NULL);
+#endif
+	ck_epoch_end(&er->er_record, &et->et_section);
+	TAILQ_REMOVE(&er->er_tdlist, et, et_link);
+	er->er_gen++;
+	critical_exit();
+#ifdef EPOCH_TRACE
+	epoch_trace_exit(td, epoch, et, file, line);
+#endif
+}
+
 void
 epoch_exit(epoch_t epoch)
 {
@@ -691,7 +754,7 @@
 	td = curthread;
 #ifdef INVARIANTS
 	locks = curthread->td_locks;
-	MPASS(epoch->e_flags & EPOCH_PREEMPT);
+	MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_PREEMPT);
 	if ((epoch->e_flags & EPOCH_LOCKED) == 0)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 		    "epoch_wait() can be long running");
@@ -732,6 +795,133 @@
 	    ("%d residual locks held", td->td_locks - locks));
 }

+/*
+ * epoch_block_handler_sleepable() is a callback from the CK code when another
+ * thread is currently in an epoch section.
+ */
+static void
+epoch_block_handler_sleepable(struct ck_epoch *global __unused,
+    ck_epoch_record_t *cr, void *arg __unused)
+{
+	epoch_record_t record;
+	struct thread *td;
+	struct epoch_tracker *tdwait;
+
+	record = __containerof(cr, struct epoch_record, er_record);
+	td = curthread;
+	counter_u64_add(block_count, 1);
+
+	/*
+	 * We lost a race and there are no longer any threads
+	 * on the CPU in an epoch section.
+	 */
+	if (TAILQ_EMPTY(&record->er_tdlist))
+		return;
+
+	if (record->er_cpuid == curcpu) {
+		bool is_sleeping = false;
+		uint8_t prio = 0;
+
+		/*
+		 * Find the lowest priority or sleeping thread which
+		 * is blocking synchronization on this CPU core. All
+		 * the threads in the queue are CPU-pinned and cannot
+		 * go anywhere while the current thread is locked.
+		 */
+		TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) {
+			if (tdwait->et_td->td_priority > prio)
+				prio = tdwait->et_td->td_priority;
+			is_sleeping |= (tdwait->et_td->td_inhibitors != 0);
+		}
+
+		if (is_sleeping) {
+			/*
+			 * Wait one tick. Performance is not critical
+			 * for sleepable EPOCHs.
+			 */
+			thread_unlock(td);
+			pause("W", 1);
+			thread_lock(td);
+		} else {
+			/* set new thread priority */
+			sched_prio(td, prio);
+			/* task switch */
+			mi_switch(SW_VOL | SWT_RELINQUISH);
+			/*
+			 * It is important that the thread lock is dropped
+			 * while yielding to allow other threads to
+			 * acquire the lock pointed to by
+			 * TDQ_LOCKPTR(td). Currently mi_switch() will
+			 * unlock the thread lock before
+			 * returning. Otherwise a deadlock-like situation
+			 * might happen.
+			 */
+			thread_lock(td);
+		}
+	} else {
+		/*
+		 * To avoid spinning, move execution to the other CPU
+		 * which is blocking synchronization. Set highest
+		 * thread priority so that code gets run. The thread
+		 * priority will be restored later.
+		 */
+		sched_prio(td, 0);
+		sched_bind(td, record->er_cpuid);
+	}
+}
+
+void
+epoch_wait_sleepable(epoch_t epoch)
+{
+	struct thread *td;
+	int was_bound;
+	int old_cpu;
+	int old_pinned;
+	u_char old_prio;
+
+	MPASS(cold || epoch != NULL);
+	INIT_CHECK(epoch);
+	td = curthread;
+#ifdef INVARIANTS
+	MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_SLEEPABLE);
+	if ((epoch->e_flags & EPOCH_LOCKED) == 0)
+		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+		    "epoch_wait() can be long running");
+	KASSERT(!in_epoch(epoch), ("epoch_wait_sleepable() called in the middle "
+	    "of an epoch section of the same epoch"));
+#endif
+	DROP_GIANT();
+	thread_lock(td);
+
+	old_cpu = PCPU_GET(cpuid);
+	old_pinned = td->td_pinned;
+	old_prio = td->td_priority;
+	was_bound = sched_is_bound(td);
+	sched_unbind(td);
+	td->td_pinned = 0;
+	sched_bind(td, old_cpu);
+
+	ck_epoch_synchronize_wait(&epoch->e_epoch,
+	    epoch_block_handler_sleepable, NULL);
+
+	/* restore CPU binding, if any */
+	if (was_bound != 0) {
+		sched_bind(td, old_cpu);
+	} else {
+		/* get thread back to initial CPU, if any */
+		if (old_pinned != 0)
+			sched_bind(td, old_cpu);
+		sched_unbind(td);
+	}
+	/* restore pinned after bind */
+	td->td_pinned = old_pinned;
+
+	/* restore thread priority */
+	sched_prio(td, old_prio);
+	thread_unlock(td);
+	PICKUP_GIANT();
+}
+
 static void
 epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused,
     void *arg __unused)
@@ -828,7 +1018,7 @@
 	struct thread *td;

 	MPASS(epoch != NULL);
-	MPASS((epoch->e_flags & EPOCH_PREEMPT) != 0);
+	MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_PREEMPT);
 	td = curthread;
 	if (THREAD_CAN_SLEEP())
 		return (0);
@@ -852,6 +1042,37 @@
 	return (0);
 }

+static int
+in_epoch_verbose_sleepable(epoch_t epoch, int dump_onfail)
+{
+	epoch_record_t er;
+	struct epoch_tracker *tdwait;
+	struct thread *td;
+
+	MPASS(epoch != NULL);
+	MPASS((epoch->e_flags & EPOCH_TYPE_MASK) == EPOCH_SLEEPABLE);
+	td = curthread;
+	critical_enter();
+	er = epoch_currecord(epoch);
+	TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link) {
+		if (tdwait->et_td != td)
+			continue;
+		critical_exit();
+		return (1);
+	}
+#ifdef INVARIANTS
+	if (dump_onfail) {
+		MPASS(td->td_pinned);
+		printf("cpu: %d id: %d\n", curcpu, td->td_tid);
+		TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
+			printf("td_tid: %d ", tdwait->et_td->td_tid);
+		printf("\n");
+	}
+#endif
+	critical_exit();
+	return (0);
+}
+
 #ifdef INVARIANTS
 static void
 epoch_assert_nocpu(epoch_t epoch, struct thread *td)
@@ -880,10 +1101,19 @@
 	epoch_record_t er;
 	struct thread *td;

-	if (__predict_false((epoch) == NULL))
+	if (__predict_false(epoch == NULL))
 		return (0);
-	if ((epoch->e_flags & EPOCH_PREEMPT) != 0)
+
+	switch (epoch->e_flags & EPOCH_TYPE_MASK) {
+	case EPOCH_CRITICAL:
+		break;
+	case EPOCH_PREEMPT:
 		return (in_epoch_verbose_preempt(epoch, dump_onfail));
+	case EPOCH_SLEEPABLE:
+		return (in_epoch_verbose_sleepable(epoch, dump_onfail));
+	default:
+		panic("in_epoch_verbose: Invalid EPOCH type.");
+	}

 	/*
 	 * The thread being in a critical section is a necessary
diff --git a/sys/sys/epoch.h b/sys/sys/epoch.h
--- a/sys/sys/epoch.h
+++ b/sys/sys/epoch.h
@@ -45,11 +45,16 @@
 struct epoch;
 typedef struct epoch *epoch_t;

-#define	EPOCH_PREEMPT	0x1
-#define	EPOCH_LOCKED	0x2
+#define	EPOCH_TYPE_MASK	(EPOCH_PREEMPT | EPOCH_SLEEPABLE)
+#define	EPOCH_CRITICAL	0x0
+#define	EPOCH_PREEMPT	0x1
+#define	EPOCH_LOCKED	0x2
+#define	EPOCH_SLEEPABLE	0x4
+#define	EPOCH_RESERVED	0x5

 extern epoch_t global_epoch;
 extern epoch_t global_epoch_preempt;
+extern epoch_t global_epoch_sleepable;

 struct epoch_tracker {
 	TAILQ_ENTRY(epoch_tracker) et_link;
@@ -69,10 +74,11 @@
 void	epoch_free(epoch_t epoch);
 void	epoch_wait(epoch_t epoch);
 void	epoch_wait_preempt(epoch_t epoch);
+void	epoch_wait_sleepable(epoch_t epoch);
 void	epoch_drain_callbacks(epoch_t epoch);
 void	epoch_call(epoch_t epoch, epoch_callback_t cb, epoch_context_t ctx);
 int	in_epoch(epoch_t epoch);
-int	in_epoch_verbose(epoch_t epoch, int dump_onfail);
+int in_epoch_verbose(epoch_t epoch, int dump_onfail);

 DPCPU_DECLARE(int, epoch_cb_count);
 DPCPU_DECLARE(struct grouptask, epoch_cb_task);
@@ -84,14 +90,25 @@

 void _epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE);
 void _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE);
+
+void _epoch_enter_sleepable(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE);
+void _epoch_exit_sleepable(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE);
+
 #ifdef EPOCH_TRACE
 void epoch_trace_list(struct thread *);
 #define	epoch_enter_preempt(epoch, et)	_epoch_enter_preempt(epoch, et, __FILE__, __LINE__)
 #define	epoch_exit_preempt(epoch, et)	_epoch_exit_preempt(epoch, et, __FILE__, __LINE__)
+
+#define	epoch_enter_sleepable(epoch, et)	_epoch_enter_sleepable(epoch, et, __FILE__, __LINE__)
+#define	epoch_exit_sleepable(epoch, et)	_epoch_exit_sleepable(epoch, et, __FILE__, __LINE__)
 #else
 #define	epoch_enter_preempt(epoch, et)	_epoch_enter_preempt(epoch, et)
 #define	epoch_exit_preempt(epoch, et)	_epoch_exit_preempt(epoch, et)
+
+#define	epoch_enter_sleepable(epoch, et)	_epoch_enter_sleepable(epoch, et)
+#define	epoch_exit_sleepable(epoch, et)	_epoch_exit_sleepable(epoch, et)
 #endif
+
 void	epoch_enter(epoch_t epoch);
 void	epoch_exit(epoch_t epoch);
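
To summarize the flag encoding introduced in sys/epoch.h (a sketch for
illustration only, not part of the patch): the epoch type occupies the bits
covered by EPOCH_TYPE_MASK, EPOCH_LOCKED may be OR'ed with either preemptible
type, and combining both type bits yields EPOCH_RESERVED, which the new MPASS()
in epoch_alloc() rejects. The function name below is hypothetical.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/epoch.h>

static void
epoch_flags_example(void)
{
	epoch_t ep_crit, ep_prmpt, ep_slp;

	/* 0x0: non-preemptible sections, used with epoch_enter()/epoch_exit(). */
	ep_crit = epoch_alloc("example critical", EPOCH_CRITICAL);

	/* 0x1 | 0x2: preemptible sections; mutexes may be held across the wait. */
	ep_prmpt = epoch_alloc("example preempt", EPOCH_PREEMPT | EPOCH_LOCKED);

	/* 0x4: sections may be preempted and may sleep. */
	ep_slp = epoch_alloc("example sleepable", EPOCH_SLEEPABLE);

	/*
	 * EPOCH_PREEMPT | EPOCH_SLEEPABLE equals EPOCH_RESERVED (0x5) and
	 * would trip the new MPASS() in epoch_alloc().
	 */

	epoch_free(ep_slp);
	epoch_free(ep_prmpt);
	epoch_free(ep_crit);
}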