D15365.id42339.diff

Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -3891,6 +3891,7 @@
kern/subr_counter.c standard
kern/subr_devstat.c standard
kern/subr_disk.c standard
+kern/subr_epoch.c standard
kern/subr_eventhandler.c standard
kern/subr_fattime.c standard
kern/subr_firmware.c optional firmware
Index: sys/conf/kern.pre.mk
===================================================================
--- sys/conf/kern.pre.mk
+++ sys/conf/kern.pre.mk
@@ -77,7 +77,7 @@
.endif
NOSTDINC= -nostdinc
-INCLUDES= ${NOSTDINC} ${INCLMAGIC} -I. -I$S
+INCLUDES= ${NOSTDINC} ${INCLMAGIC} -I. -I$S -I$S/contrib/ck/include
CFLAGS= ${COPTFLAGS} ${DEBUG}
CFLAGS+= ${INCLUDES} -D_KERNEL -DHAVE_KERNEL_OPTION_HEADERS -include opt_global.h
Index: sys/kern/kern_malloc.c
===================================================================
--- sys/kern/kern_malloc.c
+++ sys/kern/kern_malloc.c
@@ -514,9 +514,12 @@
}
}
#endif
- if (flags & M_WAITOK)
+ if (flags & M_WAITOK) {
KASSERT(curthread->td_intr_nesting_level == 0,
- ("malloc(M_WAITOK) in interrupt context"));
+ ("malloc(M_WAITOK) in interrupt context"));
+ KASSERT(curthread->td_epochnest == 0,
+ ("malloc(M_WAITOK) in epoch context"));
+ }
KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
("malloc: called with spinlock or critical section held"));
Index: sys/kern/kern_synch.c
===================================================================
--- sys/kern/kern_synch.c
+++ sys/kern/kern_synch.c
@@ -147,6 +147,7 @@
("sleeping without a lock"));
KASSERT(ident != NULL, ("_sleep: NULL ident"));
KASSERT(TD_IS_RUNNING(td), ("_sleep: curthread not running"));
+ KASSERT(td->td_epochnest == 0, ("sleeping in an epoch section"));
if (priority & PDROP)
KASSERT(lock != NULL && lock != &Giant.lock_object,
("PDROP requires a non-Giant lock"));
Index: sys/kern/subr_epoch.c
===================================================================
--- /dev/null
+++ sys/kern/subr_epoch.c
@@ -0,0 +1,388 @@
+/*-
+ * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of Matthew Macy nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/counter.h>
+#include <sys/epoch.h>
+#include <sys/gtaskqueue.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/turnstile.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+
+#include <ck_epoch.h>
+
+MALLOC_DEFINE(M_EPOCH, "epoch", "epoch based reclamation");
+
+
+SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW, 0, "epoch information");
+SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW, 0, "epoch stats");
+
+/* Stats. */
+static counter_u64_t wait_count;
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, preemption_waits, CTLFLAG_RW,
+ &wait_count, "# of times waited due to preemption");
+static counter_u64_t yield_count;
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, yields, CTLFLAG_RW,
+ &yield_count, "# of times yielded to other cpu");
+
+typedef struct epoch_cb {
+ void (*ec_callback)(epoch_context_t);
+ STAILQ_ENTRY(epoch_cb) ec_link;
+} *epoch_cb_t;
+
+typedef struct epoch_record {
+ ck_epoch_record_t er_record;
+ TAILQ_HEAD(, thread) er_tdlist;
+ uint32_t er_cpuid;
+} *epoch_record_t;
+
+struct epoch_pcpu_state {
+ struct epoch_record eps_record;
+ volatile int eps_critnest;
+ volatile int eps_waiters;
+} __aligned(CACHE_LINE_SIZE);
+
+struct epoch {
+ struct ck_epoch e_epoch;
+ struct mtx e_lock;
+ struct grouptask e_gtask;
+ STAILQ_HEAD(, epoch_cb) e_cblist;
+ struct epoch_pcpu_state *e_pcpu_dom[MAXMEMDOM];
+ struct epoch_pcpu_state *e_pcpu[0];
+};
+
+static __read_mostly int domcount[MAXMEMDOM];
+static __read_mostly int domoffsets[MAXMEMDOM];
+static __read_mostly int inited;
+
+static void epoch_call_task(void *context);
+
+static void
+epoch_init(void *arg __unused)
+{
+ int domain, count;
+
+ count = domain = 0;
+ domoffsets[0] = 0;
+ for (domain = 0; domain < vm_ndomains; domain++) {
+ domcount[domain] = CPU_COUNT(&cpuset_domain[domain]);
+ if (bootverbose)
+ printf("domcount[%d] %d\n", domain, domcount[domain]);
+ }
+ for (domain = 1; domain < vm_ndomains; domain++)
+ domoffsets[domain] = domoffsets[domain-1] + domcount[domain-1];
+
+#ifdef INVARIANTS
+ for (domain = 0; domain < vm_ndomains; domain++) {
+ KASSERT(domcount[domain], ("domcount[%d] is zero", domain));
+ if (vm_ndomains > 1)
+ MPASS(domcount[domain] < mp_ncpus);
+ else
+ MPASS(domcount[domain] <= mp_ncpus);
+ }
+#endif
+ wait_count = counter_u64_alloc(M_WAITOK);
+ yield_count = counter_u64_alloc(M_WAITOK);
+ inited = 1;
+}
+SYSINIT(epoch, SI_SUB_CPU + 1, SI_ORDER_FIRST, epoch_init, NULL);
+
+epoch_t
+epoch_alloc(void)
+{
+ int domain, cpu_offset;
+ epoch_t epoch;
+ struct epoch_pcpu_state *eps;
+ epoch_record_t er;
+
+ if (__predict_false(!inited))
+ panic("%s called too early in boot", __func__);
+ epoch = malloc(sizeof(struct epoch) + mp_ncpus*sizeof(void*),
+ M_EPOCH, M_ZERO|M_WAITOK);
+ ck_epoch_init(&epoch->e_epoch);
+ mtx_init(&epoch->e_lock, "epoch cblist", NULL, MTX_DEF);
+ STAILQ_INIT(&epoch->e_cblist);
+ taskqgroup_config_gtask_init(epoch, &epoch->e_gtask, epoch_call_task, "epoch call task");
+ for (domain = 0; domain < vm_ndomains; domain++) {
+ eps = malloc_domain(sizeof(*eps)*domcount[domain], M_EPOCH,
+ domain, M_ZERO|M_WAITOK);
+ epoch->e_pcpu_dom[domain] = eps;
+ cpu_offset = domoffsets[domain];
+ for (int i = 0; i < domcount[domain]; i++, eps++) {
+ epoch->e_pcpu[cpu_offset + i] = eps;
+ er = &eps->eps_record;
+ ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
+ TAILQ_INIT(&er->er_tdlist);
+ er->er_cpuid = cpu_offset + i;
+ }
+ }
+ return (epoch);
+}
+
+void
+epoch_free(epoch_t epoch)
+{
+ int domain;
+#ifdef INVARIANTS
+ struct epoch_pcpu_state *eps;
+ int cpu;
+
+ CPU_FOREACH(cpu) {
+ eps = epoch->e_pcpu[cpu];
+ MPASS(TAILQ_EMPTY(&eps->eps_record.er_tdlist));
+ }
+#endif
+ mtx_destroy(&epoch->e_lock);
+ taskqgroup_config_gtask_deinit(&epoch->e_gtask);
+ for (domain = 0; domain < vm_ndomains; domain++)
+ free_domain(epoch->e_pcpu_dom[domain], M_EPOCH);
+ free(epoch, M_EPOCH);
+}
+
+#define INIT_CHECK(epoch) \
+ do { \
+ if (__predict_false((epoch) == NULL)) \
+ return; \
+ } while (0)
+
+void
+epoch_enter(epoch_t epoch)
+{
+ struct epoch_pcpu_state *eps;
+ struct thread *td;
+
+ INIT_CHECK(epoch);
+
+ td = curthread;
+ critical_enter();
+ eps = epoch->e_pcpu[curcpu];
+ td->td_epochnest++;
+ MPASS(td->td_epochnest < UCHAR_MAX - 2);
+ if (td->td_epochnest == 1)
+ TAILQ_INSERT_TAIL(&eps->eps_record.er_tdlist, td, td_epochq);
+#ifdef INVARIANTS
+ if (td->td_epochnest > 1) {
+ struct thread *curtd;
+ int found = 0;
+
+ TAILQ_FOREACH(curtd, &eps->eps_record.er_tdlist, td_epochq)
+ if (curtd == td)
+ found = 1;
+ KASSERT(found, ("recursing on a second epoch"));
+ }
+#endif
+ sched_pin();
+ ck_epoch_begin(&eps->eps_record.er_record, NULL);
+ critical_exit();
+}
+
+void
+epoch_enter_nopreempt(epoch_t epoch)
+{
+ struct epoch_pcpu_state *eps;
+
+ INIT_CHECK(epoch);
+ critical_enter();
+ eps = epoch->e_pcpu[curcpu];
+ curthread->td_epochnest++;
+ MPASS(curthread->td_epochnest < UCHAR_MAX - 2);
+ ck_epoch_begin(&eps->eps_record.er_record, NULL);
+}
+
+void
+epoch_exit(epoch_t epoch)
+{
+ struct epoch_pcpu_state *eps;
+ struct thread *td;
+
+ td = curthread;
+ INIT_CHECK(epoch);
+ critical_enter();
+ eps = epoch->e_pcpu[curcpu];
+ sched_unpin();
+ ck_epoch_end(&eps->eps_record.er_record, NULL);
+ td->td_epochnest--;
+ if (td->td_epochnest == 0)
+ TAILQ_REMOVE(&eps->eps_record.er_tdlist, td, td_epochq);
+ critical_exit();
+}
+
+void
+epoch_exit_nopreempt(epoch_t epoch)
+{
+ struct epoch_pcpu_state *eps;
+
+ INIT_CHECK(epoch);
+ MPASS(curthread->td_critnest);
+ eps = epoch->e_pcpu[curcpu];
+ ck_epoch_end(&eps->eps_record.er_record, NULL);
+ curthread->td_epochnest--;
+ critical_exit();
+}
+
+static void
+epoch_block_handler(struct ck_epoch *global __unused, ck_epoch_record_t *cr,
+ void *arg __unused)
+{
+ epoch_record_t record;
+ struct epoch_pcpu_state *eps;
+ struct thread *td, *tdwait;
+
+ eps = arg;
+ record = __containerof(cr, struct epoch_record, er_record);
+ td = curthread;
+
+ counter_u64_add(yield_count, 1);
+ if (record->er_cpuid == curcpu) {
+ tdwait = TAILQ_FIRST(&record->er_tdlist);
+ critical_enter();
+ thread_unlock(td);
+ /*
+ * The turnstile pointer is stable because we are running
+ * on the same CPU as the blocked thread. Up until this
+ * point we had the thread lock held and we're now in the
+ * middle of a critical section.
+ */
+ turnstile_lock(tdwait->td_blocked);
+ critical_exit();
+ turnstile_wait(tdwait->td_blocked, NULL, tdwait->td_tsqueue);
+ thread_lock(td);
+ } else {
+ /*
+ * To avoid spinning move execution to the other CPU
+ * which is blocking synchronization. Set highest
+ * thread priority so that code gets run. The thread
+ * priority will be restored later.
+ */
+ sched_prio(td, 0);
+ sched_bind(td, record->er_cpuid);
+ }
+}
+
+/*
+ * Taken verbatim from linux_synchronize_rcu
+ */
+void
+epoch_wait(epoch_t epoch)
+{
+ struct thread *td;
+ int was_bound;
+ int old_cpu;
+ int old_pinned;
+ u_char old_prio;
+
+ INIT_CHECK(epoch);
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "epoch_wait() can sleep");
+
+ td = curthread;
+ thread_lock(td);
+
+ DROP_GIANT();
+
+ old_cpu = PCPU_GET(cpuid);
+ old_pinned = td->td_pinned;
+ old_prio = td->td_priority;
+ was_bound = sched_is_bound(td);
+ sched_unbind(td);
+ td->td_pinned = 0;
+ sched_bind(td, old_cpu);
+
+ ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
+
+ /* restore CPU binding, if any */
+ if (was_bound != 0) {
+ sched_bind(td, old_cpu);
+ } else {
+ /* get thread back to initial CPU, if any */
+ if (old_pinned != 0)
+ sched_bind(td, old_cpu);
+ sched_unbind(td);
+ }
+ /* restore pinned after bind */
+ td->td_pinned = old_pinned;
+
+ /* restore thread priority */
+ sched_prio(td, old_prio);
+ thread_unlock(td);
+
+ PICKUP_GIANT();
+}
+
+void
+epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t))
+{
+ epoch_cb_t cb;
+
+ cb = (void *)ctx;
+ cb->ec_callback = callback;
+ mtx_lock(&epoch->e_lock);
+ STAILQ_INSERT_TAIL(&epoch->e_cblist, cb, ec_link);
+ GROUPTASK_ENQUEUE(&epoch->e_gtask);
+ mtx_unlock(&epoch->e_lock);
+}
+
+static void
+epoch_call_task(void *context)
+{
+ epoch_t epoch;
+ epoch_cb_t cb;
+ STAILQ_HEAD(, epoch_cb) tmp_head;
+
+ epoch = context;
+ STAILQ_INIT(&tmp_head);
+
+ mtx_lock(&epoch->e_lock);
+ STAILQ_CONCAT(&tmp_head, &epoch->e_cblist);
+ mtx_unlock(&epoch->e_lock);
+
+ epoch_wait(epoch);
+
+ while ((cb = STAILQ_FIRST(&tmp_head)) != NULL)
+ cb->ec_callback((void*)cb);
+}
+
+int
+in_epoch(void)
+{
+ return (curthread->td_epochnest != 0);
+}
Index: sys/kern/subr_gtaskqueue.c
===================================================================
--- sys/kern/subr_gtaskqueue.c
+++ sys/kern/subr_gtaskqueue.c
@@ -987,3 +987,9 @@
GROUPTASK_INIT(gtask, 0, fn, ctx);
taskqgroup_attach(qgroup_config, gtask, gtask, -1, name);
}
+
+void
+taskqgroup_config_gtask_deinit(struct grouptask *gtask)
+{
+ taskqgroup_detach(qgroup_config, gtask);
+}
Index: sys/kern/subr_trap.c
===================================================================
--- sys/kern/subr_trap.c
+++ sys/kern/subr_trap.c
@@ -161,6 +161,8 @@
WITNESS_WARN(WARN_PANIC, NULL, "userret: returning");
KASSERT(td->td_critnest == 0,
("userret: Returning in a critical section"));
+ KASSERT(td->td_epochnest == 0,
+ ("userret: Returning in an epoch section"));
KASSERT(td->td_locks == 0,
("userret: Returning with %d locks held", td->td_locks));
KASSERT(td->td_rw_rlocks == 0,
Index: sys/kern/subr_turnstile.c
===================================================================
--- sys/kern/subr_turnstile.c
+++ sys/kern/subr_turnstile.c
@@ -566,6 +566,22 @@
return (ts);
}
+void
+turnstile_lock(struct turnstile *ts)
+{
+ struct turnstile_chain *tc;
+ struct turnstile *curtdts;
+
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ mtx_lock_spin(&tc->tc_lock);
+ mtx_lock_spin(&ts->ts_lock);
+
+ curtdts = curthread->td_turnstile;
+ mtx_lock_spin(&curtdts->ts_lock);
+ KASSERT(curtdts->ts_lockobj == NULL, ("stale ts_lockobj pointer"));
+ curtdts->ts_lockobj = ts->ts_lockobj;
+}
+
void
turnstile_cancel(struct turnstile *ts)
{
Index: sys/sys/epoch.h
===================================================================
--- /dev/null
+++ sys/sys/epoch.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of Matthew Macy nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SYS_EPOCH_H_
+#define _SYS_EPOCH_H_
+
+struct epoch;
+typedef struct epoch *epoch_t;
+
+struct epoch_context {
+ void *data[2];
+} __aligned(sizeof(void *));
+
+typedef struct epoch_context *epoch_context_t;
+
+epoch_t epoch_alloc(void);
+void epoch_free(epoch_t epoch);
+void epoch_enter(epoch_t epoch);
+void epoch_exit(epoch_t epoch);
+void epoch_enter_nopreempt(epoch_t epoch);
+void epoch_exit_nopreempt(epoch_t epoch);
+void epoch_wait(epoch_t epoch);
+void epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t));
+int in_epoch(void);
+
+#endif
Index: sys/sys/gtaskqueue.h
===================================================================
--- sys/sys/gtaskqueue.h
+++ sys/sys/gtaskqueue.h
@@ -63,6 +63,7 @@
int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride);
void taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn,
const char *name);
+void taskqgroup_config_gtask_deinit(struct grouptask *gtask);
#define TASK_ENQUEUED 0x1
#define TASK_SKIP_WAKEUP 0x2
Index: sys/sys/proc.h
===================================================================
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -243,6 +243,7 @@
/* Cleared during fork1() */
#define td_startzero td_flags
+ u_char td_epochnest; /* (k) Private thread epoch nest counter */
int td_flags; /* (t) TDF_* flags. */
int td_inhibitors; /* (t) Why can not run. */
int td_pflags; /* (k) Private thread (TDP_*) flags. */
@@ -355,6 +356,7 @@
int td_lastcpu; /* (t) Last cpu we were on. */
int td_oncpu; /* (t) Which cpu we are on. */
void *td_lkpi_task; /* LinuxKPI task struct pointer */
+ TAILQ_ENTRY(thread) td_epochq; /* (t) Epoch queue. */
};
struct thread0_storage {
Index: sys/sys/turnstile.h
===================================================================
--- sys/sys/turnstile.h
+++ sys/sys/turnstile.h
@@ -104,6 +104,7 @@
struct turnstile *turnstile_trywait(struct lock_object *);
void turnstile_unpend(struct turnstile *, int);
void turnstile_wait(struct turnstile *, struct thread *, int);
+void turnstile_lock(struct turnstile *);
#endif /* _KERNEL */
#endif /* _SYS_TURNSTILE_H_ */
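
A minimal consumer sketch of the API this diff adds in sys/sys/epoch.h (not part of the patch; the names foo, foo_epoch, foo_lookup, foo_remove and foo_destroy_cb are hypothetical): readers bracket a lockless traversal with epoch_enter()/epoch_exit(), and a writer unlinks an entry under its own lock and then either blocks in epoch_wait() before freeing or defers the free with epoch_call(), using the epoch_context embedded in the object. Note the constraints the other hunks enforce: code inside an epoch section may not sleep (kern_synch.c), may not malloc(M_WAITOK) (kern_malloc.c), and may not return to user space (subr_trap.c).

/*
 * Hypothetical epoch consumer -- illustration only, not part of this diff.
 * For brevity it uses queue(3) lists; a real lockless reader would want a
 * traversal-safe list (e.g. the ck_queue macros from the contrib/ck tree
 * that kern.pre.mk now adds to the include path).
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/epoch.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>

MALLOC_DEFINE(M_FOO, "foo", "epoch example items");

struct foo {
	LIST_ENTRY(foo)		f_link;
	int			f_key;
	int			f_val;
	struct epoch_context	f_ctx;	/* storage for epoch_call() */
};

static LIST_HEAD(, foo) foo_list = LIST_HEAD_INITIALIZER(foo_list);
static struct mtx foo_lock;	/* serializes writers only */
static epoch_t foo_epoch;	/* assumed set up with epoch_alloc() at init */

/* Reader: lockless lookup; the epoch section must not sleep. */
static int
foo_lookup(int key, int *valp)
{
	struct foo *f;
	int found;

	found = 0;
	epoch_enter(foo_epoch);
	LIST_FOREACH(f, &foo_list, f_link) {
		if (f->f_key == key) {
			*valp = f->f_val;
			found = 1;
			break;
		}
	}
	epoch_exit(foo_epoch);
	return (found);
}

/* Deferred destructor: runs only after all current readers have exited. */
static void
foo_destroy_cb(epoch_context_t ctx)
{
	struct foo *f;

	f = __containerof(ctx, struct foo, f_ctx);
	free(f, M_FOO);
}

/* Writer: unlink under the writer lock, then defer the free. */
static void
foo_remove(struct foo *f)
{
	mtx_lock(&foo_lock);
	LIST_REMOVE(f, f_link);
	mtx_unlock(&foo_lock);
	/* Alternatively, drain readers synchronously:
	 * epoch_wait(foo_epoch); free(f, M_FOO); */
	epoch_call(foo_epoch, &f->f_ctx, foo_destroy_cb);
}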
