Index: head/sys/compat/linuxkpi/common/src/linux_rcu.c =================================================================== --- head/sys/compat/linuxkpi/common/src/linux_rcu.c (revision 356681) +++ head/sys/compat/linuxkpi/common/src/linux_rcu.c (revision 356682) @@ -1,400 +1,399 @@ /*- * Copyright (c) 2016 Matthew Macy (mmacy@mattmacy.io) * Copyright (c) 2017 Hans Petter Selasky (hselasky@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * By defining CONFIG_NO_RCU_SKIP LinuxKPI RCU locks and asserts will * not be skipped during panic(). */ #ifdef CONFIG_NO_RCU_SKIP #define RCU_SKIP(void) 0 #else #define RCU_SKIP(void) unlikely(SCHEDULER_STOPPED() || kdb_active) #endif struct callback_head { STAILQ_ENTRY(callback_head) entry; rcu_callback_t func; }; struct linux_epoch_head { STAILQ_HEAD(, callback_head) cb_head; struct mtx lock; struct task task; } __aligned(CACHE_LINE_SIZE); struct linux_epoch_record { ck_epoch_record_t epoch_record; TAILQ_HEAD(, task_struct) ts_head; int cpuid; } __aligned(CACHE_LINE_SIZE); /* * Verify that "struct rcu_head" is big enough to hold "struct * callback_head". This has been done to avoid having to add special * compile flags for including ck_epoch.h to all clients of the * LinuxKPI. */ CTASSERT(sizeof(struct rcu_head) == sizeof(struct callback_head)); /* * Verify that "epoch_record" is at beginning of "struct * linux_epoch_record": */ CTASSERT(offsetof(struct linux_epoch_record, epoch_record) == 0); static ck_epoch_t linux_epoch; static struct linux_epoch_head linux_epoch_head; DPCPU_DEFINE_STATIC(struct linux_epoch_record, linux_epoch_record); static void linux_rcu_cleaner_func(void *, int); static void linux_rcu_runtime_init(void *arg __unused) { struct linux_epoch_head *head; int i; ck_epoch_init(&linux_epoch); head = &linux_epoch_head; mtx_init(&head->lock, "LRCU-HEAD", NULL, MTX_DEF); TASK_INIT(&head->task, 0, linux_rcu_cleaner_func, NULL); STAILQ_INIT(&head->cb_head); CPU_FOREACH(i) { struct linux_epoch_record *record; record = &DPCPU_ID_GET(i, linux_epoch_record); record->cpuid = i; ck_epoch_register(&linux_epoch, &record->epoch_record, NULL); TAILQ_INIT(&record->ts_head); } } SYSINIT(linux_rcu_runtime, SI_SUB_CPU, SI_ORDER_ANY, linux_rcu_runtime_init, NULL); static void linux_rcu_runtime_uninit(void *arg __unused) { struct linux_epoch_head *head; head = &linux_epoch_head; /* destroy head lock */ mtx_destroy(&head->lock); } SYSUNINIT(linux_rcu_runtime, SI_SUB_LOCK, SI_ORDER_SECOND, linux_rcu_runtime_uninit, NULL); static void linux_rcu_cleaner_func(void *context __unused, int pending __unused) { struct linux_epoch_head *head; struct callback_head *rcu; STAILQ_HEAD(, callback_head) tmp_head; linux_set_current(curthread); head = &linux_epoch_head; /* move current callbacks into own queue */ mtx_lock(&head->lock); STAILQ_INIT(&tmp_head); STAILQ_CONCAT(&tmp_head, &head->cb_head); mtx_unlock(&head->lock); /* synchronize */ linux_synchronize_rcu(); /* dispatch all callbacks, if any */ while ((rcu = STAILQ_FIRST(&tmp_head)) != NULL) { uintptr_t offset; STAILQ_REMOVE_HEAD(&tmp_head, entry); offset = (uintptr_t)rcu->func; if (offset < LINUX_KFREE_RCU_OFFSET_MAX) kfree((char *)rcu - offset); else rcu->func((struct rcu_head *)rcu); } } void linux_rcu_read_lock(void) { struct linux_epoch_record *record; struct task_struct *ts; if (RCU_SKIP()) return; /* * Pin thread to current CPU so that the unlock code gets the * same per-CPU epoch record: */ sched_pin(); record = &DPCPU_GET(linux_epoch_record); ts = current; /* * Use a critical section to prevent recursion inside * ck_epoch_begin(). Else this function supports recursion. */ critical_enter(); ck_epoch_begin(&record->epoch_record, NULL); ts->rcu_recurse++; if (ts->rcu_recurse == 1) TAILQ_INSERT_TAIL(&record->ts_head, ts, rcu_entry); critical_exit(); } void linux_rcu_read_unlock(void) { struct linux_epoch_record *record; struct task_struct *ts; if (RCU_SKIP()) return; record = &DPCPU_GET(linux_epoch_record); ts = current; /* * Use a critical section to prevent recursion inside * ck_epoch_end(). Else this function supports recursion. */ critical_enter(); ck_epoch_end(&record->epoch_record, NULL); ts->rcu_recurse--; if (ts->rcu_recurse == 0) TAILQ_REMOVE(&record->ts_head, ts, rcu_entry); critical_exit(); sched_unpin(); } static void linux_synchronize_rcu_cb(ck_epoch_t *epoch __unused, ck_epoch_record_t *epoch_record, void *arg __unused) { struct linux_epoch_record *record = container_of(epoch_record, struct linux_epoch_record, epoch_record); struct thread *td = curthread; struct task_struct *ts; /* check if blocked on the current CPU */ if (record->cpuid == PCPU_GET(cpuid)) { bool is_sleeping = 0; u_char prio = 0; /* * Find the lowest priority or sleeping thread which * is blocking synchronization on this CPU core. All * the threads in the queue are CPU-pinned and cannot * go anywhere while the current thread is locked. */ TAILQ_FOREACH(ts, &record->ts_head, rcu_entry) { if (ts->task_thread->td_priority > prio) prio = ts->task_thread->td_priority; is_sleeping |= (ts->task_thread->td_inhibitors != 0); } if (is_sleeping) { thread_unlock(td); pause("W", 1); thread_lock(td); } else { /* set new thread priority */ sched_prio(td, prio); /* task switch */ mi_switch(SW_VOL | SWT_RELINQUISH); /* * It is important the thread lock is dropped * while yielding to allow other threads to * acquire the lock pointed to by * TDQ_LOCKPTR(td). Currently mi_switch() will * unlock the thread lock before * returning. Else a deadlock like situation * might happen. */ thread_lock(td); } } else { /* * To avoid spinning move execution to the other CPU * which is blocking synchronization. Set highest * thread priority so that code gets run. The thread * priority will be restored later. */ sched_prio(td, 0); sched_bind(td, record->cpuid); } } void linux_synchronize_rcu(void) { struct thread *td; int was_bound; int old_cpu; int old_pinned; u_char old_prio; if (RCU_SKIP()) return; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "linux_synchronize_rcu() can sleep"); td = curthread; + DROP_GIANT(); /* * Synchronizing RCU might change the CPU core this function * is running on. Save current values: */ thread_lock(td); - - DROP_GIANT(); old_cpu = PCPU_GET(cpuid); old_pinned = td->td_pinned; old_prio = td->td_priority; was_bound = sched_is_bound(td); sched_unbind(td); td->td_pinned = 0; sched_bind(td, old_cpu); ck_epoch_synchronize_wait(&linux_epoch, &linux_synchronize_rcu_cb, NULL); /* restore CPU binding, if any */ if (was_bound != 0) { sched_bind(td, old_cpu); } else { /* get thread back to initial CPU, if any */ if (old_pinned != 0) sched_bind(td, old_cpu); sched_unbind(td); } /* restore pinned after bind */ td->td_pinned = old_pinned; /* restore thread priority */ sched_prio(td, old_prio); thread_unlock(td); PICKUP_GIANT(); } void linux_rcu_barrier(void) { struct linux_epoch_head *head; linux_synchronize_rcu(); head = &linux_epoch_head; /* wait for callbacks to complete */ taskqueue_drain(taskqueue_fast, &head->task); } void linux_call_rcu(struct rcu_head *context, rcu_callback_t func) { struct callback_head *rcu = (struct callback_head *)context; struct linux_epoch_head *head = &linux_epoch_head; mtx_lock(&head->lock); rcu->func = func; STAILQ_INSERT_TAIL(&head->cb_head, rcu, entry); taskqueue_enqueue(taskqueue_fast, &head->task); mtx_unlock(&head->lock); } int init_srcu_struct(struct srcu_struct *srcu) { return (0); } void cleanup_srcu_struct(struct srcu_struct *srcu) { } int srcu_read_lock(struct srcu_struct *srcu) { linux_rcu_read_lock(); return (0); } void srcu_read_unlock(struct srcu_struct *srcu, int key __unused) { linux_rcu_read_unlock(); } void synchronize_srcu(struct srcu_struct *srcu) { linux_synchronize_rcu(); } void srcu_barrier(struct srcu_struct *srcu) { linux_rcu_barrier(); } Index: head/sys/kern/subr_epoch.c =================================================================== --- head/sys/kern/subr_epoch.c (revision 356681) +++ head/sys/kern/subr_epoch.c (revision 356682) @@ -1,846 +1,846 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2018, Matthew Macy * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef EPOCH_TRACE #include #include #include #endif #include #include #include #include #include static MALLOC_DEFINE(M_EPOCH, "epoch", "epoch based reclamation"); #ifdef __amd64__ #define EPOCH_ALIGN CACHE_LINE_SIZE*2 #else #define EPOCH_ALIGN CACHE_LINE_SIZE #endif TAILQ_HEAD (epoch_tdlist, epoch_tracker); typedef struct epoch_record { ck_epoch_record_t er_record; struct epoch_context er_drain_ctx; struct epoch *er_parent; volatile struct epoch_tdlist er_tdlist; volatile uint32_t er_gen; uint32_t er_cpuid; } __aligned(EPOCH_ALIGN) *epoch_record_t; struct epoch { struct ck_epoch e_epoch __aligned(EPOCH_ALIGN); epoch_record_t e_pcpu_record; int e_idx; int e_flags; struct sx e_drain_sx; struct mtx e_drain_mtx; volatile int e_drain_count; const char *e_name; }; /* arbitrary --- needs benchmarking */ #define MAX_ADAPTIVE_SPIN 100 #define MAX_EPOCHS 64 CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context)); SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW, 0, "epoch information"); SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW, 0, "epoch stats"); /* Stats. */ static counter_u64_t block_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW, &block_count, "# of times a thread was in an epoch when epoch_wait was called"); static counter_u64_t migrate_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW, &migrate_count, "# of times thread was migrated to another CPU in epoch_wait"); static counter_u64_t turnstile_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW, &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait"); static counter_u64_t switch_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW, &switch_count, "# of times a thread voluntarily context switched in epoch_wait"); static counter_u64_t epoch_call_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW, &epoch_call_count, "# of times a callback was deferred"); static counter_u64_t epoch_call_task_count; SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW, &epoch_call_task_count, "# of times a callback task was run"); TAILQ_HEAD (threadlist, thread); CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry, ck_epoch_entry_container) epoch_t allepochs[MAX_EPOCHS]; DPCPU_DEFINE(struct grouptask, epoch_cb_task); DPCPU_DEFINE(int, epoch_cb_count); static __read_mostly int inited; static __read_mostly int epoch_count; __read_mostly epoch_t global_epoch; __read_mostly epoch_t global_epoch_preempt; static void epoch_call_task(void *context __unused); static uma_zone_t pcpu_zone_record; #ifdef EPOCH_TRACE struct stackentry { RB_ENTRY(stackentry) se_node; struct stack se_stack; }; static int stackentry_compare(struct stackentry *a, struct stackentry *b) { if (a->se_stack.depth > b->se_stack.depth) return (1); if (a->se_stack.depth < b->se_stack.depth) return (-1); for (int i = 0; i < a->se_stack.depth; i++) { if (a->se_stack.pcs[i] > b->se_stack.pcs[i]) return (1); if (a->se_stack.pcs[i] < b->se_stack.pcs[i]) return (-1); } return (0); } RB_HEAD(stacktree, stackentry) epoch_stacks = RB_INITIALIZER(&epoch_stacks); RB_GENERATE_STATIC(stacktree, stackentry, se_node, stackentry_compare); static struct mtx epoch_stacks_lock; MTX_SYSINIT(epochstacks, &epoch_stacks_lock, "epoch_stacks", MTX_DEF); static bool epoch_trace_stack_print = true; SYSCTL_BOOL(_kern_epoch, OID_AUTO, trace_stack_print, CTLFLAG_RWTUN, &epoch_trace_stack_print, 0, "Print stack traces on epoch reports"); static void epoch_trace_report(const char *fmt, ...) __printflike(1, 2); static inline void epoch_trace_report(const char *fmt, ...) { va_list ap; struct stackentry se, *new; stack_zero(&se.se_stack); /* XXX: is it really needed? */ stack_save(&se.se_stack); /* Tree is never reduced - go lockless. */ if (RB_FIND(stacktree, &epoch_stacks, &se) != NULL) return; new = malloc(sizeof(*new), M_STACK, M_NOWAIT); if (new != NULL) { bcopy(&se.se_stack, &new->se_stack, sizeof(struct stack)); mtx_lock(&epoch_stacks_lock); new = RB_INSERT(stacktree, &epoch_stacks, new); mtx_unlock(&epoch_stacks_lock); if (new != NULL) free(new, M_STACK); } va_start(ap, fmt); (void)vprintf(fmt, ap); va_end(ap); if (epoch_trace_stack_print) stack_print_ddb(&se.se_stack); } static inline void epoch_trace_enter(struct thread *td, epoch_t epoch, epoch_tracker_t et, const char *file, int line) { epoch_tracker_t iet; SLIST_FOREACH(iet, &td->td_epochs, et_tlink) if (iet->et_epoch == epoch) epoch_trace_report("Recursively entering epoch %s " "at %s:%d, previously entered at %s:%d\n", epoch->e_name, file, line, iet->et_file, iet->et_line); et->et_epoch = epoch; et->et_file = file; et->et_line = line; SLIST_INSERT_HEAD(&td->td_epochs, et, et_tlink); } static inline void epoch_trace_exit(struct thread *td, epoch_t epoch, epoch_tracker_t et, const char *file, int line) { if (SLIST_FIRST(&td->td_epochs) != et) { epoch_trace_report("Exiting epoch %s in a not nested order " "at %s:%d. Most recently entered %s at %s:%d\n", epoch->e_name, file, line, SLIST_FIRST(&td->td_epochs)->et_epoch->e_name, SLIST_FIRST(&td->td_epochs)->et_file, SLIST_FIRST(&td->td_epochs)->et_line); /* This will panic if et is not anywhere on td_epochs. */ SLIST_REMOVE(&td->td_epochs, et, epoch_tracker, et_tlink); } else SLIST_REMOVE_HEAD(&td->td_epochs, et_tlink); } /* Used by assertions that check thread state before going to sleep. */ void epoch_trace_list(struct thread *td) { epoch_tracker_t iet; SLIST_FOREACH(iet, &td->td_epochs, et_tlink) printf("Epoch %s entered at %s:%d\n", iet->et_epoch->e_name, iet->et_file, iet->et_line); } #endif /* EPOCH_TRACE */ static void epoch_init(void *arg __unused) { int cpu; block_count = counter_u64_alloc(M_WAITOK); migrate_count = counter_u64_alloc(M_WAITOK); turnstile_count = counter_u64_alloc(M_WAITOK); switch_count = counter_u64_alloc(M_WAITOK); epoch_call_count = counter_u64_alloc(M_WAITOK); epoch_call_task_count = counter_u64_alloc(M_WAITOK); pcpu_zone_record = uma_zcreate("epoch_record pcpu", sizeof(struct epoch_record), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU); CPU_FOREACH(cpu) { GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0, epoch_call_task, NULL); taskqgroup_attach_cpu(qgroup_softirq, DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, NULL, NULL, "epoch call task"); } #ifdef EPOCH_TRACE SLIST_INIT(&thread0.td_epochs); #endif inited = 1; global_epoch = epoch_alloc("Global", 0); global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT); } SYSINIT(epoch, SI_SUB_EPOCH, SI_ORDER_FIRST, epoch_init, NULL); #if !defined(EARLY_AP_STARTUP) static void epoch_init_smp(void *dummy __unused) { inited = 2; } SYSINIT(epoch_smp, SI_SUB_SMP + 1, SI_ORDER_FIRST, epoch_init_smp, NULL); #endif static void epoch_ctor(epoch_t epoch) { epoch_record_t er; int cpu; epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK); CPU_FOREACH(cpu) { er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu); bzero(er, sizeof(*er)); ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL); TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist); er->er_cpuid = cpu; er->er_parent = epoch; } } static void epoch_adjust_prio(struct thread *td, u_char prio) { thread_lock(td); sched_prio(td, prio); thread_unlock(td); } epoch_t epoch_alloc(const char *name, int flags) { epoch_t epoch; if (__predict_false(!inited)) panic("%s called too early in boot", __func__); epoch = malloc(sizeof(struct epoch), M_EPOCH, M_ZERO | M_WAITOK); ck_epoch_init(&epoch->e_epoch); epoch_ctor(epoch); MPASS(epoch_count < MAX_EPOCHS - 2); epoch->e_flags = flags; epoch->e_idx = epoch_count; epoch->e_name = name; sx_init(&epoch->e_drain_sx, "epoch-drain-sx"); mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF); allepochs[epoch_count++] = epoch; return (epoch); } void epoch_free(epoch_t epoch) { epoch_drain_callbacks(epoch); allepochs[epoch->e_idx] = NULL; epoch_wait(global_epoch); uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record); mtx_destroy(&epoch->e_drain_mtx); sx_destroy(&epoch->e_drain_sx); free(epoch, M_EPOCH); } static epoch_record_t epoch_currecord(epoch_t epoch) { return (zpcpu_get_cpu(epoch->e_pcpu_record, curcpu)); } #define INIT_CHECK(epoch) \ do { \ if (__predict_false((epoch) == NULL)) \ return; \ } while (0) void _epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE) { struct epoch_record *er; struct thread *td; MPASS(cold || epoch != NULL); MPASS(epoch->e_flags & EPOCH_PREEMPT); td = curthread; MPASS((vm_offset_t)et >= td->td_kstack && (vm_offset_t)et + sizeof(struct epoch_tracker) <= td->td_kstack + td->td_kstack_pages * PAGE_SIZE); INIT_CHECK(epoch); #ifdef EPOCH_TRACE epoch_trace_enter(td, epoch, et, file, line); #endif et->et_td = td; THREAD_NO_SLEEPING(); critical_enter(); sched_pin(); td->td_pre_epoch_prio = td->td_priority; er = epoch_currecord(epoch); TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link); ck_epoch_begin(&er->er_record, &et->et_section); critical_exit(); } void epoch_enter(epoch_t epoch) { epoch_record_t er; MPASS(cold || epoch != NULL); INIT_CHECK(epoch); critical_enter(); er = epoch_currecord(epoch); ck_epoch_begin(&er->er_record, NULL); } void _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE) { struct epoch_record *er; struct thread *td; INIT_CHECK(epoch); td = curthread; critical_enter(); sched_unpin(); THREAD_SLEEPING_OK(); er = epoch_currecord(epoch); MPASS(epoch->e_flags & EPOCH_PREEMPT); MPASS(et != NULL); MPASS(et->et_td == td); #ifdef INVARIANTS et->et_td = (void*)0xDEADBEEF; #endif ck_epoch_end(&er->er_record, &et->et_section); TAILQ_REMOVE(&er->er_tdlist, et, et_link); er->er_gen++; if (__predict_false(td->td_pre_epoch_prio != td->td_priority)) epoch_adjust_prio(td, td->td_pre_epoch_prio); critical_exit(); #ifdef EPOCH_TRACE epoch_trace_exit(td, epoch, et, file, line); #endif } void epoch_exit(epoch_t epoch) { epoch_record_t er; INIT_CHECK(epoch); er = epoch_currecord(epoch); ck_epoch_end(&er->er_record, NULL); critical_exit(); } /* * epoch_block_handler_preempt() is a callback from the CK code when another * thread is currently in an epoch section. */ static void epoch_block_handler_preempt(struct ck_epoch *global __unused, ck_epoch_record_t *cr, void *arg __unused) { epoch_record_t record; struct thread *td, *owner, *curwaittd; struct epoch_tracker *tdwait; struct turnstile *ts; struct lock_object *lock; int spincount, gen; int locksheld __unused; record = __containerof(cr, struct epoch_record, er_record); td = curthread; locksheld = td->td_locks; spincount = 0; counter_u64_add(block_count, 1); /* * We lost a race and there's no longer any threads * on the CPU in an epoch section. */ if (TAILQ_EMPTY(&record->er_tdlist)) return; if (record->er_cpuid != curcpu) { /* * If the head of the list is running, we can wait for it * to remove itself from the list and thus save us the * overhead of a migration */ gen = record->er_gen; thread_unlock(td); /* * We can't actually check if the waiting thread is running * so we simply poll for it to exit before giving up and * migrating. */ do { cpu_spinwait(); } while (!TAILQ_EMPTY(&record->er_tdlist) && gen == record->er_gen && spincount++ < MAX_ADAPTIVE_SPIN); thread_lock(td); /* * If the generation has changed we can poll again * otherwise we need to migrate. */ if (gen != record->er_gen) return; /* * Being on the same CPU as that of the record on which * we need to wait allows us access to the thread * list associated with that CPU. We can then examine the * oldest thread in the queue and wait on its turnstile * until it resumes and so on until a grace period * elapses. * */ counter_u64_add(migrate_count, 1); sched_bind(td, record->er_cpuid); /* * At this point we need to return to the ck code * to scan to see if a grace period has elapsed. * We can't move on to check the thread list, because * in the meantime new threads may have arrived that * in fact belong to a different epoch. */ return; } /* * Try to find a thread in an epoch section on this CPU * waiting on a turnstile. Otherwise find the lowest * priority thread (highest prio value) and drop our priority * to match to allow it to run. */ TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) { /* * Propagate our priority to any other waiters to prevent us * from starving them. They will have their original priority * restore on exit from epoch_wait(). */ curwaittd = tdwait->et_td; if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) { critical_enter(); thread_unlock(td); thread_lock(curwaittd); sched_prio(curwaittd, td->td_priority); thread_unlock(curwaittd); thread_lock(td); critical_exit(); } if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) && ((ts = curwaittd->td_blocked) != NULL)) { /* * We unlock td to allow turnstile_wait to reacquire * the thread lock. Before unlocking it we enter a * critical section to prevent preemption after we * reenable interrupts by dropping the thread lock in * order to prevent curwaittd from getting to run. */ critical_enter(); thread_unlock(td); if (turnstile_lock(ts, &lock, &owner)) { if (ts == curwaittd->td_blocked) { MPASS(TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd)); critical_exit(); turnstile_wait(ts, owner, curwaittd->td_tsqueue); counter_u64_add(turnstile_count, 1); thread_lock(td); return; } turnstile_unlock(ts, lock); } thread_lock(td); critical_exit(); KASSERT(td->td_locks == locksheld, ("%d extra locks held", td->td_locks - locksheld)); } } /* * We didn't find any threads actually blocked on a lock * so we have nothing to do except context switch away. */ counter_u64_add(switch_count, 1); mi_switch(SW_VOL | SWT_RELINQUISH); /* * It is important the thread lock is dropped while yielding * to allow other threads to acquire the lock pointed to by * TDQ_LOCKPTR(td). Currently mi_switch() will unlock the * thread lock before returning. Else a deadlock like * situation might happen. */ thread_lock(td); } void epoch_wait_preempt(epoch_t epoch) { struct thread *td; int was_bound; int old_cpu; int old_pinned; u_char old_prio; int locks __unused; MPASS(cold || epoch != NULL); INIT_CHECK(epoch); td = curthread; #ifdef INVARIANTS locks = curthread->td_locks; MPASS(epoch->e_flags & EPOCH_PREEMPT); if ((epoch->e_flags & EPOCH_LOCKED) == 0) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "epoch_wait() can be long running"); KASSERT(!in_epoch(epoch), ("epoch_wait_preempt() called in the middle " "of an epoch section of the same epoch")); #endif - thread_lock(td); DROP_GIANT(); + thread_lock(td); old_cpu = PCPU_GET(cpuid); old_pinned = td->td_pinned; old_prio = td->td_priority; was_bound = sched_is_bound(td); sched_unbind(td); td->td_pinned = 0; sched_bind(td, old_cpu); ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt, NULL); /* restore CPU binding, if any */ if (was_bound != 0) { sched_bind(td, old_cpu); } else { /* get thread back to initial CPU, if any */ if (old_pinned != 0) sched_bind(td, old_cpu); sched_unbind(td); } /* restore pinned after bind */ td->td_pinned = old_pinned; /* restore thread priority */ sched_prio(td, old_prio); thread_unlock(td); PICKUP_GIANT(); KASSERT(td->td_locks == locks, ("%d residual locks held", td->td_locks - locks)); } static void epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused, void *arg __unused) { cpu_spinwait(); } void epoch_wait(epoch_t epoch) { MPASS(cold || epoch != NULL); INIT_CHECK(epoch); MPASS(epoch->e_flags == 0); critical_enter(); ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL); critical_exit(); } void epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t)) { epoch_record_t er; ck_epoch_entry_t *cb; cb = (void *)ctx; MPASS(callback); /* too early in boot to have epoch set up */ if (__predict_false(epoch == NULL)) goto boottime; #if !defined(EARLY_AP_STARTUP) if (__predict_false(inited < 2)) goto boottime; #endif critical_enter(); *DPCPU_PTR(epoch_cb_count) += 1; er = epoch_currecord(epoch); ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback); critical_exit(); return; boottime: callback(ctx); } static void epoch_call_task(void *arg __unused) { ck_stack_entry_t *cursor, *head, *next; ck_epoch_record_t *record; epoch_record_t er; epoch_t epoch; ck_stack_t cb_stack; int i, npending, total; ck_stack_init(&cb_stack); critical_enter(); epoch_enter(global_epoch); for (total = i = 0; i < epoch_count; i++) { if (__predict_false((epoch = allepochs[i]) == NULL)) continue; er = epoch_currecord(epoch); record = &er->er_record; if ((npending = record->n_pending) == 0) continue; ck_epoch_poll_deferred(record, &cb_stack); total += npending - record->n_pending; } epoch_exit(global_epoch); *DPCPU_PTR(epoch_cb_count) -= total; critical_exit(); counter_u64_add(epoch_call_count, total); counter_u64_add(epoch_call_task_count, 1); head = ck_stack_batch_pop_npsc(&cb_stack); for (cursor = head; cursor != NULL; cursor = next) { struct ck_epoch_entry *entry = ck_epoch_entry_container(cursor); next = CK_STACK_NEXT(cursor); entry->function(entry); } } int in_epoch_verbose(epoch_t epoch, int dump_onfail) { struct epoch_tracker *tdwait; struct thread *td; epoch_record_t er; td = curthread; if (THREAD_CAN_SLEEP()) return (0); if (__predict_false((epoch) == NULL)) return (0); critical_enter(); er = epoch_currecord(epoch); TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link) if (tdwait->et_td == td) { critical_exit(); return (1); } #ifdef INVARIANTS if (dump_onfail) { MPASS(td->td_pinned); printf("cpu: %d id: %d\n", curcpu, td->td_tid); TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link) printf("td_tid: %d ", tdwait->et_td->td_tid); printf("\n"); } #endif critical_exit(); return (0); } int in_epoch(epoch_t epoch) { return (in_epoch_verbose(epoch, 0)); } static void epoch_drain_cb(struct epoch_context *ctx) { struct epoch *epoch = __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent; if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) { mtx_lock(&epoch->e_drain_mtx); wakeup(epoch); mtx_unlock(&epoch->e_drain_mtx); } } void epoch_drain_callbacks(epoch_t epoch) { epoch_record_t er; struct thread *td; int was_bound; int old_pinned; int old_cpu; int cpu; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "epoch_drain_callbacks() may sleep!"); /* too early in boot to have epoch set up */ if (__predict_false(epoch == NULL)) return; #if !defined(EARLY_AP_STARTUP) if (__predict_false(inited < 2)) return; #endif DROP_GIANT(); sx_xlock(&epoch->e_drain_sx); mtx_lock(&epoch->e_drain_mtx); td = curthread; thread_lock(td); old_cpu = PCPU_GET(cpuid); old_pinned = td->td_pinned; was_bound = sched_is_bound(td); sched_unbind(td); td->td_pinned = 0; CPU_FOREACH(cpu) epoch->e_drain_count++; CPU_FOREACH(cpu) { er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu); sched_bind(td, cpu); epoch_call(epoch, &er->er_drain_ctx, &epoch_drain_cb); } /* restore CPU binding, if any */ if (was_bound != 0) { sched_bind(td, old_cpu); } else { /* get thread back to initial CPU, if any */ if (old_pinned != 0) sched_bind(td, old_cpu); sched_unbind(td); } /* restore pinned after bind */ td->td_pinned = old_pinned; thread_unlock(td); while (epoch->e_drain_count != 0) msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0); mtx_unlock(&epoch->e_drain_mtx); sx_xunlock(&epoch->e_drain_sx); PICKUP_GIANT(); }