D15365: simple preempt safe epoch API
Diff: D15365.id42339.diff (17 KB)
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -3891,6 +3891,7 @@
kern/subr_counter.c standard
kern/subr_devstat.c standard
kern/subr_disk.c standard
+kern/subr_epoch.c standard
kern/subr_eventhandler.c standard
kern/subr_fattime.c standard
kern/subr_firmware.c optional firmware
Index: sys/conf/kern.pre.mk
===================================================================
--- sys/conf/kern.pre.mk
+++ sys/conf/kern.pre.mk
@@ -77,7 +77,7 @@
.endif
NOSTDINC= -nostdinc
-INCLUDES= ${NOSTDINC} ${INCLMAGIC} -I. -I$S
+INCLUDES= ${NOSTDINC} ${INCLMAGIC} -I. -I$S -I$S/contrib/ck/include
CFLAGS= ${COPTFLAGS} ${DEBUG}
CFLAGS+= ${INCLUDES} -D_KERNEL -DHAVE_KERNEL_OPTION_HEADERS -include opt_global.h
Index: sys/kern/kern_malloc.c
===================================================================
--- sys/kern/kern_malloc.c
+++ sys/kern/kern_malloc.c
@@ -514,9 +514,12 @@
}
}
#endif
- if (flags & M_WAITOK)
+ if (flags & M_WAITOK) {
KASSERT(curthread->td_intr_nesting_level == 0,
- ("malloc(M_WAITOK) in interrupt context"));
+ ("malloc(M_WAITOK) in interrupt context"));
+ KASSERT(curthread->td_epochnest == 0,
+ ("malloc(M_WAITOK) in epoch context"));
+ }
KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
("malloc: called with spinlock or critical section held"));
Index: sys/kern/kern_synch.c
===================================================================
--- sys/kern/kern_synch.c
+++ sys/kern/kern_synch.c
@@ -147,6 +147,7 @@
("sleeping without a lock"));
KASSERT(ident != NULL, ("_sleep: NULL ident"));
KASSERT(TD_IS_RUNNING(td), ("_sleep: curthread not running"));
+ KASSERT(td->td_epochnest == 0, ("sleeping in an epoch section"));
if (priority & PDROP)
KASSERT(lock != NULL && lock != &Giant.lock_object,
("PDROP requires a non-Giant lock"));
Index: sys/kern/subr_epoch.c
===================================================================
--- /dev/null
+++ sys/kern/subr_epoch.c
@@ -0,0 +1,388 @@
+/*-
+ * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of Matthew Macy nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/counter.h>
+#include <sys/epoch.h>
+#include <sys/gtaskqueue.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/turnstile.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+
+#include <ck_epoch.h>
+
+MALLOC_DEFINE(M_EPOCH, "epoch", "epoch based reclamation");
+
+
+SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW, 0, "epoch information");
+SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW, 0, "epoch stats");
+
+/* Stats. */
+static counter_u64_t wait_count;
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, preemption_waits, CTLFLAG_RW,
+ &wait_count, "# of times waited due to preemption");
+static counter_u64_t yield_count;
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, yields, CTLFLAG_RW,
+ &yield_count, "# of times yielded to other cpu");
+
+typedef struct epoch_cb {
+ void (*ec_callback)(epoch_context_t);
+ STAILQ_ENTRY(epoch_cb) ec_link;
+} *epoch_cb_t;
+
+typedef struct epoch_record {
+ ck_epoch_record_t er_record;
+ TAILQ_HEAD(, thread) er_tdlist;
+ uint32_t er_cpuid;
+} *epoch_record_t;
+
+struct epoch_pcpu_state {
+ struct epoch_record eps_record;
+ volatile int eps_critnest;
+ volatile int eps_waiters;
+} __aligned(CACHE_LINE_SIZE);
+
+struct epoch {
+ struct ck_epoch e_epoch;
+ struct mtx e_lock;
+ struct grouptask e_gtask;
+ STAILQ_HEAD(, epoch_cb) e_cblist;
+ struct epoch_pcpu_state *e_pcpu_dom[MAXMEMDOM];
+ struct epoch_pcpu_state *e_pcpu[0];
+};
+
+static __read_mostly int domcount[MAXMEMDOM];
+static __read_mostly int domoffsets[MAXMEMDOM];
+static __read_mostly int inited;
+
+static void epoch_call_task(void *context);
+
+static void
+epoch_init(void *arg __unused)
+{
+ int domain, count;
+
+ count = domain = 0;
+ domoffsets[0] = 0;
+ for (domain = 0; domain < vm_ndomains; domain++) {
+ domcount[domain] = CPU_COUNT(&cpuset_domain[domain]);
+ if (bootverbose)
+ printf("domcount[%d] %d\n", domain, domcount[domain]);
+ }
+ for (domain = 1; domain < vm_ndomains; domain++)
+ domoffsets[domain] = domoffsets[domain-1] + domcount[domain-1];
+
+#ifdef INVARIANTS
+ for (domain = 0; domain < vm_ndomains; domain++) {
+ KASSERT(domcount[domain], ("domcount[%d] is zero", domain));
+ if (vm_ndomains > 1)
+ MPASS(domcount[domain] < mp_ncpus);
+ else
+ MPASS(domcount[domain] <= mp_ncpus);
+ }
+#endif
+ wait_count = counter_u64_alloc(M_WAITOK);
+ yield_count = counter_u64_alloc(M_WAITOK);
+ inited = 1;
+}
+SYSINIT(epoch, SI_SUB_CPU + 1, SI_ORDER_FIRST, epoch_init, NULL);
+
+epoch_t
+epoch_alloc(void)
+{
+ int domain, cpu_offset;
+ epoch_t epoch;
+ struct epoch_pcpu_state *eps;
+ epoch_record_t er;
+
+ if (__predict_false(!inited))
+ panic("%s called too early in boot", __func__);
+ epoch = malloc(sizeof(struct epoch) + mp_ncpus*sizeof(void*),
+ M_EPOCH, M_ZERO|M_WAITOK);
+ ck_epoch_init(&epoch->e_epoch);
+ mtx_init(&epoch->e_lock, "epoch cblist", NULL, MTX_DEF);
+ STAILQ_INIT(&epoch->e_cblist);
+ taskqgroup_config_gtask_init(epoch, &epoch->e_gtask, epoch_call_task, "epoch call task");
+ for (domain = 0; domain < vm_ndomains; domain++) {
+ eps = malloc_domain(sizeof(*eps)*domcount[domain], M_EPOCH,
+ domain, M_ZERO|M_WAITOK);
+ epoch->e_pcpu_dom[domain] = eps;
+ cpu_offset = domoffsets[domain];
+ for (int i = 0; i < domcount[domain]; i++, eps++) {
+ epoch->e_pcpu[cpu_offset + i] = eps;
+ er = &eps->eps_record;
+ ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
+ TAILQ_INIT(&er->er_tdlist);
+ er->er_cpuid = cpu_offset + i;
+ }
+ }
+ return (epoch);
+}
+
+void
+epoch_free(epoch_t epoch)
+{
+ int domain;
+#ifdef INVARIANTS
+ struct epoch_pcpu_state *eps;
+ int cpu;
+
+ CPU_FOREACH(cpu) {
+ eps = epoch->e_pcpu[cpu];
+ MPASS(TAILQ_EMPTY(&eps->eps_record.er_tdlist));
+ }
+#endif
+ mtx_destroy(&epoch->e_lock);
+ taskqgroup_config_gtask_deinit(&epoch->e_gtask);
+ for (domain = 0; domain < vm_ndomains; domain++)
+ free_domain(epoch->e_pcpu_dom[domain], M_EPOCH);
+ free(epoch, M_EPOCH);
+}
+
+#define INIT_CHECK(epoch) \
+ do { \
+ if (__predict_false((epoch) == NULL)) \
+ return; \
+ } while (0)
+
+void
+epoch_enter(epoch_t epoch)
+{
+ struct epoch_pcpu_state *eps;
+ struct thread *td;
+
+ INIT_CHECK(epoch);
+
+ td = curthread;
+ critical_enter();
+ eps = epoch->e_pcpu[curcpu];
+ td->td_epochnest++;
+ MPASS(td->td_epochnest < UCHAR_MAX - 2);
+ if (td->td_epochnest == 1)
+ TAILQ_INSERT_TAIL(&eps->eps_record.er_tdlist, td, td_epochq);
+#ifdef INVARIANTS
+ if (td->td_epochnest > 1) {
+ struct thread *curtd;
+ int found = 0;
+
+ TAILQ_FOREACH(curtd, &eps->eps_record.er_tdlist, td_epochq)
+ if (curtd == td)
+ found = 1;
+ KASSERT(found, ("recursing on a second epoch"));
+ }
+#endif
+ sched_pin();
+ ck_epoch_begin(&eps->eps_record.er_record, NULL);
+ critical_exit();
+}
+
+void
+epoch_enter_nopreempt(epoch_t epoch)
+{
+ struct epoch_pcpu_state *eps;
+
+ INIT_CHECK(epoch);
+ critical_enter();
+ eps = epoch->e_pcpu[curcpu];
+ curthread->td_epochnest++;
+ MPASS(curthread->td_epochnest < UCHAR_MAX - 2);
+ ck_epoch_begin(&eps->eps_record.er_record, NULL);
+}
+
+void
+epoch_exit(epoch_t epoch)
+{
+ struct epoch_pcpu_state *eps;
+ struct thread *td;
+
+ td = curthread;
+ INIT_CHECK(epoch);
+ critical_enter();
+ eps = epoch->e_pcpu[curcpu];
+ sched_unpin();
+ ck_epoch_end(&eps->eps_record.er_record, NULL);
+ td->td_epochnest--;
+ if (td->td_epochnest == 0)
+ TAILQ_REMOVE(&eps->eps_record.er_tdlist, td, td_epochq);
+ critical_exit();
+}
+
+void
+epoch_exit_nopreempt(epoch_t epoch)
+{
+ struct epoch_pcpu_state *eps;
+
+ INIT_CHECK(epoch);
+ MPASS(curthread->td_critnest);
+ eps = epoch->e_pcpu[curcpu];
+ ck_epoch_end(&eps->eps_record.er_record, NULL);
+ curthread->td_epochnest--;
+ critical_exit();
+}
+
+static void
+epoch_block_handler(struct ck_epoch *global __unused, ck_epoch_record_t *cr,
+ void *arg __unused)
+{
+ epoch_record_t record;
+ struct epoch_pcpu_state *eps;
+ struct thread *td, *tdwait;
+
+ eps = arg;
+ record = __containerof(cr, struct epoch_record, er_record);
+ td = curthread;
+
+ counter_u64_add(yield_count, 1);
+ if (record->er_cpuid == curcpu) {
+ tdwait = TAILQ_FIRST(&record->er_tdlist);
+ critical_enter();
+ thread_unlock(td);
+ /*
+ * The turnstile pointer is stable because we are running
+ * on the same CPU as the blocked thread. Up until this
+ * point we had the thread lock held and we're now in the
+ * middle of a critical section.
+ */
+ turnstile_lock(tdwait->td_blocked);
+ critical_exit();
+ turnstile_wait(tdwait->td_blocked, NULL, tdwait->td_tsqueue);
+ thread_lock(td);
+ } else {
+ /*
+ * To avoid spinning move execution to the other CPU
+ * which is blocking synchronization. Set highest
+ * thread priority so that code gets run. The thread
+ * priority will be restored later.
+ */
+ sched_prio(td, 0);
+ sched_bind(td, record->er_cpuid);
+ }
+}
+
+/*
+ * Taken verbatim from linux_synchronize_rcu
+ */
+void
+epoch_wait(epoch_t epoch)
+{
+ struct thread *td;
+ int was_bound;
+ int old_cpu;
+ int old_pinned;
+ u_char old_prio;
+
+ INIT_CHECK(epoch);
+
+ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+ "epoch_wait() can sleep");
+
+ td = curthread;
+ thread_lock(td);
+
+ DROP_GIANT();
+
+ old_cpu = PCPU_GET(cpuid);
+ old_pinned = td->td_pinned;
+ old_prio = td->td_priority;
+ was_bound = sched_is_bound(td);
+ sched_unbind(td);
+ td->td_pinned = 0;
+ sched_bind(td, old_cpu);
+
+ ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
+
+ /* restore CPU binding, if any */
+ if (was_bound != 0) {
+ sched_bind(td, old_cpu);
+ } else {
+ /* get thread back to initial CPU, if any */
+ if (old_pinned != 0)
+ sched_bind(td, old_cpu);
+ sched_unbind(td);
+ }
+ /* restore pinned after bind */
+ td->td_pinned = old_pinned;
+
+ /* restore thread priority */
+ sched_prio(td, old_prio);
+ thread_unlock(td);
+
+ PICKUP_GIANT();
+}
+
+void
+epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t))
+{
+ epoch_cb_t cb;
+
+ cb = (void *)ctx;
+ cb->ec_callback = callback;
+ mtx_lock(&epoch->e_lock);
+ STAILQ_INSERT_TAIL(&epoch->e_cblist, cb, ec_link);
+ GROUPTASK_ENQUEUE(&epoch->e_gtask);
+ mtx_unlock(&epoch->e_lock);
+}
+
+static void
+epoch_call_task(void *context)
+{
+ epoch_t epoch;
+ epoch_cb_t cb;
+ STAILQ_HEAD(, epoch_cb) tmp_head;
+
+ epoch = context;
+ STAILQ_INIT(&tmp_head);
+
+ mtx_lock(&epoch->e_lock);
+ STAILQ_CONCAT(&tmp_head, &epoch->e_cblist);
+ mtx_unlock(&epoch->e_lock);
+
+ epoch_wait(epoch);
+
+ while ((cb = STAILQ_FIRST(&tmp_head)) != NULL)
+ cb->ec_callback((void*)cb);
+}
+
+int
+in_epoch(void)
+{
+ return (curthread->td_epochnest != 0);
+}
Index: sys/kern/subr_gtaskqueue.c
===================================================================
--- sys/kern/subr_gtaskqueue.c
+++ sys/kern/subr_gtaskqueue.c
@@ -987,3 +987,9 @@
GROUPTASK_INIT(gtask, 0, fn, ctx);
taskqgroup_attach(qgroup_config, gtask, gtask, -1, name);
}
+
+void
+taskqgroup_config_gtask_deinit(struct grouptask *gtask)
+{
+ taskqgroup_detach(qgroup_config, gtask);
+}
Index: sys/kern/subr_trap.c
===================================================================
--- sys/kern/subr_trap.c
+++ sys/kern/subr_trap.c
@@ -161,6 +161,8 @@
WITNESS_WARN(WARN_PANIC, NULL, "userret: returning");
KASSERT(td->td_critnest == 0,
("userret: Returning in a critical section"));
+ KASSERT(td->td_epochnest == 0,
+ ("userret: Returning in an epoch section"));
KASSERT(td->td_locks == 0,
("userret: Returning with %d locks held", td->td_locks));
KASSERT(td->td_rw_rlocks == 0,
Index: sys/kern/subr_turnstile.c
===================================================================
--- sys/kern/subr_turnstile.c
+++ sys/kern/subr_turnstile.c
@@ -566,6 +566,22 @@
return (ts);
}
+void
+turnstile_lock(struct turnstile *ts)
+{
+ struct turnstile_chain *tc;
+ struct turnstile *curtdts;
+
+ tc = TC_LOOKUP(ts->ts_lockobj);
+ mtx_lock_spin(&tc->tc_lock);
+ mtx_lock_spin(&ts->ts_lock);
+
+ curtdts = curthread->td_turnstile;
+ mtx_lock_spin(&curtdts->ts_lock);
+ KASSERT(curtdts->ts_lockobj == NULL, ("stale ts_lockobj pointer"));
+ curtdts->ts_lockobj = ts->ts_lockobj;
+}
+
void
turnstile_cancel(struct turnstile *ts)
{
Index: sys/sys/epoch.h
===================================================================
--- /dev/null
+++ sys/sys/epoch.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of Matthew Macy nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SYS_EPOCH_H_
+#define _SYS_EPOCH_H_
+
+struct epoch;
+typedef struct epoch *epoch_t;
+
+struct epoch_context {
+ void *data[2];
+} __aligned(sizeof(void *));
+
+typedef struct epoch_context *epoch_context_t;
+
+epoch_t epoch_alloc(void);
+void epoch_free(epoch_t epoch);
+void epoch_enter(epoch_t epoch);
+void epoch_exit(epoch_t epoch);
+void epoch_enter_nopreempt(epoch_t epoch);
+void epoch_exit_nopreempt(epoch_t epoch);
+void epoch_wait(epoch_t epoch);
+void epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t));
+int in_epoch(void);
+
+#endif
Index: sys/sys/gtaskqueue.h
===================================================================
--- sys/sys/gtaskqueue.h
+++ sys/sys/gtaskqueue.h
@@ -63,6 +63,7 @@
int taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride);
void taskqgroup_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn,
const char *name);
+void taskqgroup_config_gtask_deinit(struct grouptask *gtask);
#define TASK_ENQUEUED 0x1
#define TASK_SKIP_WAKEUP 0x2
Index: sys/sys/proc.h
===================================================================
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -243,6 +243,7 @@
/* Cleared during fork1() */
#define td_startzero td_flags
+ u_char td_epochnest; /* (k) Private thread epoch nest counter */
int td_flags; /* (t) TDF_* flags. */
int td_inhibitors; /* (t) Why can not run. */
int td_pflags; /* (k) Private thread (TDP_*) flags. */
@@ -355,6 +356,7 @@
int td_lastcpu; /* (t) Last cpu we were on. */
int td_oncpu; /* (t) Which cpu we are on. */
void *td_lkpi_task; /* LinuxKPI task struct pointer */
+ TAILQ_ENTRY(thread) td_epochq; /* (t) Epoch queue. */
};
struct thread0_storage {
Index: sys/sys/turnstile.h
===================================================================
--- sys/sys/turnstile.h
+++ sys/sys/turnstile.h
@@ -104,6 +104,7 @@
struct turnstile *turnstile_trywait(struct lock_object *);
void turnstile_unpend(struct turnstile *, int);
void turnstile_wait(struct turnstile *, struct thread *, int);
+void turnstile_lock(struct turnstile *);
#endif /* _KERNEL */
#endif /* _SYS_TURNSTILE_H_ */
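Illustrative usage (a minimal sketch, not part of the patch itself): epoch_enter()/epoch_exit() bound a preemptible read-side section, epoch_wait() blocks a writer until every section that was open when it was called has exited, and epoch_call() defers a callback until the same condition holds, via the gtaskqueue task added in subr_gtaskqueue.c. The consumer below (the foo structure, its list, lock, and malloc type) is hypothetical; only the epoch_*() calls come from the API declared in sys/sys/epoch.h.

/*
 * Hypothetical consumer sketch (not part of this diff): "foo", its list,
 * foo_lock, and the use of M_DEVBUF are placeholders; only the epoch_*()
 * calls are from the API added above.  Memory-ordering details that a
 * real lock-free list would need are glossed over for brevity.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/epoch.h>

static epoch_t foo_epoch;
static struct mtx foo_lock;		/* serializes writers only */

struct foo {
	SLIST_ENTRY(foo) f_link;
	int		 f_key;
	struct epoch_context f_ctx;	/* storage reused by epoch_call() */
};
static SLIST_HEAD(, foo) foo_list = SLIST_HEAD_INITIALIZER(foo_list);

static void
foo_modinit(void *arg __unused)
{
	mtx_init(&foo_lock, "foo list", NULL, MTX_DEF);
	/* Must run after the epoch SYSINIT at SI_SUB_CPU + 1. */
	foo_epoch = epoch_alloc();
}
SYSINIT(foo, SI_SUB_DRIVERS, SI_ORDER_ANY, foo_modinit, NULL);

/*
 * Reader: the section is preemptible but must not sleep or return to
 * user space (see the new KASSERTs in kern_synch.c and subr_trap.c).
 */
static int
foo_exists(int key)
{
	struct foo *f;
	int found = 0;

	epoch_enter(foo_epoch);
	SLIST_FOREACH(f, &foo_list, f_link) {
		if (f->f_key == key) {
			found = 1;
			break;
		}
	}
	epoch_exit(foo_epoch);
	return (found);
}

/* Deferred destructor, run once no epoch section can still see "f". */
static void
foo_destroy(epoch_context_t ctx)
{
	struct foo *f;

	f = __containerof(ctx, struct foo, f_ctx);
	/* Assumes "f" was allocated with malloc(..., M_DEVBUF, ...). */
	free(f, M_DEVBUF);
}

/* Writer: unlink under the mutex, then reclaim asynchronously ... */
static void
foo_remove_async(struct foo *f)
{
	mtx_lock(&foo_lock);
	SLIST_REMOVE(&foo_list, f, foo, f_link);
	mtx_unlock(&foo_lock);
	epoch_call(foo_epoch, &f->f_ctx, foo_destroy);
}

/* ... or synchronously, when the caller is allowed to sleep. */
static void
foo_remove_sync(struct foo *f)
{
	mtx_lock(&foo_lock);
	SLIST_REMOVE(&foo_list, f, foo, f_link);
	mtx_unlock(&foo_lock);
	epoch_wait(foo_epoch);
	free(f, M_DEVBUF);
}

Note that the read side may be preempted (sched_pin() in epoch_enter() keeps it on its CPU so epoch_block_handler() can find it on the per-CPU er_tdlist), but it must not sleep or call malloc(M_WAITOK), as enforced by the new KASSERTs in kern_synch.c and kern_malloc.c.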