Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -3891,6 +3891,7 @@
 kern/subr_counter.c		standard
 kern/subr_devstat.c		standard
 kern/subr_disk.c		standard
+kern/subr_epoch.c		standard
 kern/subr_eventhandler.c	standard
 kern/subr_fattime.c		standard
 kern/subr_firmware.c		optional firmware
Index: sys/conf/kern.pre.mk
===================================================================
--- sys/conf/kern.pre.mk
+++ sys/conf/kern.pre.mk
@@ -77,7 +77,7 @@
 .endif
 
 NOSTDINC= -nostdinc
-INCLUDES= ${NOSTDINC} ${INCLMAGIC} -I. -I$S
+INCLUDES= ${NOSTDINC} ${INCLMAGIC} -I. -I$S -I$S/contrib/ck/include
 
 CFLAGS=	${COPTFLAGS} ${DEBUG}
 CFLAGS+= ${INCLUDES} -D_KERNEL -DHAVE_KERNEL_OPTION_HEADERS -include opt_global.h
Index: sys/kern/subr_epoch.c
===================================================================
--- /dev/null
+++ sys/kern/subr_epoch.c
@@ -0,0 +1,444 @@
+/*-
+ * Copyright (c) 2018, Matthew Macy
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of Matthew Macy nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+MALLOC_DEFINE(M_EPOCH, "epoch", "epoch based reclamation");
+
+SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW, 0, "epoch information");
+SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW, 0, "epoch stats");
+
+/* Stats. */
+static counter_u64_t wait_count;
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, preemption_waits, CTLFLAG_RW,
+    &wait_count, "# of times waited due to preemption");
+static counter_u64_t yield_count;
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, yields, CTLFLAG_RW,
+    &yield_count, "# of times yielded to other cpu");
+
+/* Queue entry for a deferred callback; epoch_call() overlays it on ctx. */
+typedef struct epoch_cb {
+	void (*ec_callback)(epoch_context_t);
+	STAILQ_ENTRY(epoch_cb) ec_link;
+} *epoch_cb_t;
+
+/* CK epoch record plus the threads currently inside epoch_enter(). */
+typedef struct epoch_record {
+	ck_epoch_record_t er_record;
+	TAILQ_HEAD(, thread) er_tdlist;
+	uint32_t er_cpuid;
+} *epoch_record_t;
+
+struct epoch_pcpu_state {
+	struct epoch_record eps_record;
+	volatile int eps_critnest;
+	volatile int eps_waiters;
+} __aligned(CACHE_LINE_SIZE);
+
+struct epoch {
+	struct ck_epoch e_epoch;
+	struct mtx e_lock;			/* protects e_cblist */
+	struct grouptask e_gtask;		/* runs epoch_call_task() */
+	STAILQ_HEAD(, epoch_cb) e_cblist;	/* queued by epoch_call() */
+	struct epoch_pcpu_state *e_pcpu_dom[MAXMEMDOM];	/* per-domain arrays */
+	struct epoch_pcpu_state *e_pcpu[];	/* mp_ncpus entries */
+};
+
+static __read_mostly int domcount[MAXMEMDOM];
+static __read_mostly int domoffsets[MAXMEMDOM];
+static __read_mostly int inited;
+
+static void epoch_call_task(void *context);
+
+/*
+ * Record how many CPUs each memory domain has and where each domain's
+ * slots start in the per-CPU array, allocate the statistics counters and
+ * mark the subsystem ready for epoch_alloc().
+ */
+static void
+epoch_init(void *arg __unused)
+{
+	int domain;
+
+	domoffsets[0] = 0;
+	for (domain = 0; domain < vm_ndomains; domain++) {
+		domcount[domain] = CPU_COUNT(&cpuset_domain[domain]);
+		if (bootverbose)
+			printf("domcount[%d] %d\n", domain, domcount[domain]);
+	}
+	for (domain = 1; domain < vm_ndomains; domain++)
+		domoffsets[domain] = domoffsets[domain-1] + domcount[domain-1];
+
+#ifdef INVARIANTS
+	for (domain = 0; domain < vm_ndomains; domain++) {
+		KASSERT(domcount[domain], ("domcount[%d] is zero", domain));
+		if (vm_ndomains > 1)
+			MPASS(domcount[domain] < mp_ncpus);
+		else
+			MPASS(domcount[domain] <= mp_ncpus);
+	}
+#endif
+	wait_count = counter_u64_alloc(M_WAITOK);
+	yield_count = counter_u64_alloc(M_WAITOK);
+	inited = 1;
+}
+SYSINIT(epoch, SI_SUB_CPU + 1, SI_ORDER_FIRST, epoch_init, NULL);
+
+/*
+ * Allocate and initialize an epoch.  Per-CPU state is allocated one memory
+ * domain at a time and published through e_pcpu[].  Panics when called
+ * before epoch_init() has run.
+ */
+epoch_t
+epoch_alloc(void)
+{
+	int domain, cpu_offset;
+	epoch_t epoch;
+	struct epoch_pcpu_state *eps;
+	epoch_record_t er;
+
+	if (__predict_false(!inited))
+		panic("%s called too early in boot", __func__);
+	epoch = malloc(sizeof(struct epoch) + mp_ncpus*sizeof(void*),
+	    M_EPOCH, M_ZERO|M_WAITOK);
+	ck_epoch_init(&epoch->e_epoch);
+	mtx_init(&epoch->e_lock, "epoch cblist", NULL, MTX_DEF);
+	STAILQ_INIT(&epoch->e_cblist);
+	taskqgroup_config_gtask_init(epoch, &epoch->e_gtask, epoch_call_task,
+	    "epoch call task");
+	for (domain = 0; domain < vm_ndomains; domain++) {
+		eps = malloc_domain(sizeof(*eps)*domcount[domain], M_EPOCH,
+		    domain, M_ZERO|M_WAITOK);
+		epoch->e_pcpu_dom[domain] = eps;
+		cpu_offset = domoffsets[domain];
+		for (int i = 0; i < domcount[domain]; i++, eps++) {
+			epoch->e_pcpu[cpu_offset + i] = eps;
+			er = &eps->eps_record;
+			ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
+			TAILQ_INIT(&er->er_tdlist);
+			er->er_cpuid = cpu_offset + i;
+		}
+	}
+	return (epoch);
+}
+
+/*
+ * Release an epoch obtained from epoch_alloc().  The callback task is torn
+ * down before the mutex it takes is destroyed.
+ */
+void
+epoch_free(epoch_t epoch)
+{
+	int domain;
+#ifdef INVARIANTS
+	struct epoch_pcpu_state *eps;
+	int cpu;
+
+	CPU_FOREACH(cpu) {
+		if (CPU_ABSENT(cpu))
+			continue;
+		eps = epoch->e_pcpu[cpu];
+		MPASS(eps->eps_critnest == 0);
+	}
+#endif
+	taskqgroup_config_gtask_deinit(&epoch->e_gtask);
+	mtx_destroy(&epoch->e_lock);
+	for (domain = 0; domain < vm_ndomains; domain++)
+		free_domain(epoch->e_pcpu_dom[domain], M_EPOCH);
+	free(epoch, M_EPOCH);
+}
+
+/* A NULL epoch makes the calling function return immediately. */
+#define INIT_CHECK(epoch)						\
+	do {								\
+		if (__predict_false((epoch) == NULL))			\
+			return;						\
+	} while (0)
+
+/*
+ * Enter an epoch section.  On the outermost entry the thread is linked
+ * onto the current CPU's er_tdlist; the thread is pinned until the
+ * matching epoch_exit().
+ */
+void
+epoch_enter(epoch_t epoch)
+{
+	struct epoch_pcpu_state *eps;
+	struct thread *td;
+
+	INIT_CHECK(epoch);
+
+	td = curthread;
+	critical_enter();
+	eps = epoch->e_pcpu[curcpu];
+	td->td_epochnest++;
+	MPASS(td->td_epochnest < UCHAR_MAX - 2);
+	if (td->td_epochnest == 1)
+		TAILQ_INSERT_TAIL(&eps->eps_record.er_tdlist, td, td_epochq);
+	sched_pin();
+	ck_epoch_begin(&eps->eps_record.er_record, NULL);
+	critical_exit();
+}
+
+/*
+ * Enter an epoch section and remain in the critical section entered here
+ * until epoch_exit_nopreempt().
+ */
+void
+epoch_enter_nopreempt(epoch_t epoch)
+{
+	struct epoch_pcpu_state *eps;
+
+	INIT_CHECK(epoch);
+	critical_enter();
+	eps = epoch->e_pcpu[curcpu];
+	curthread->td_epochnest++;
+	MPASS(curthread->td_epochnest < UCHAR_MAX - 2);
+	ck_epoch_begin(&eps->eps_record.er_record, NULL);
+}
+
+/* Leave a section entered with epoch_enter(). */
+void
+epoch_exit(epoch_t epoch)
+{
+	struct epoch_pcpu_state *eps;
+	struct thread *td;
+
+	td = curthread;
+	INIT_CHECK(epoch);
+	critical_enter();
+	eps = epoch->e_pcpu[curcpu];
+	sched_unpin();
+	ck_epoch_end(&eps->eps_record.er_record, NULL);
+	td->td_epochnest--;
+	if (td->td_epochnest == 0)
+		TAILQ_REMOVE(&eps->eps_record.er_tdlist, td, td_epochq);
+	critical_exit();
+}
+
+/* Leave a section entered with epoch_enter_nopreempt(). */
+void
+epoch_exit_nopreempt(epoch_t epoch)
+{
+	struct epoch_pcpu_state *eps;
+
+	INIT_CHECK(epoch);
+	MPASS(curthread->td_critnest);
+	eps = epoch->e_pcpu[curcpu];
+	ck_epoch_end(&eps->eps_record.er_record, NULL);
+	curthread->td_epochnest--;
+	critical_exit();
+}
+
+/*
+ * Passed to ck_epoch_synchronize_wait() by epoch_wait(), which holds the
+ * current thread's lock across the call.  cr is the record the handler
+ * was invoked for.
+ */
+static void
+epoch_block_handler(struct ck_epoch *global __unused, ck_epoch_record_t *cr,
+    void *arg __unused)
+{
+	epoch_record_t record;
+	struct thread *td, *tdwait;
+	bool is_sleeping;
+	u_char prio;
+
+	record = __containerof(cr, struct epoch_record, er_record);
+	td = curthread;
+
+	counter_u64_add(yield_count, 1);
+	if (record->er_cpuid == curcpu) {
+		is_sleeping = false;
+		prio = 0;
+		/*
+		 * Iterate with tdwait so that td keeps naming curthread
+		 * for the lock and scheduler calls below.
+		 */
+		TAILQ_FOREACH(tdwait, &record->er_tdlist, td_epochq) {
+			if (tdwait->td_priority > prio)
+				prio = tdwait->td_priority;
+			is_sleeping |= (tdwait->td_inhibitors != 0);
+		}
+		if (is_sleeping) {
+			thread_unlock(td);
+			pause("W", 1);
+			thread_lock(td);
+		} else {
+			/* set new thread priority */
+			sched_prio(td, prio);
+			/* task switch */
+			mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
+
+			/*
+			 * Release the thread lock while yielding to
+			 * allow other threads to acquire the lock
+			 * pointed to by TDQ_LOCKPTR(td). Else a
+			 * deadlock like situation might happen.
+			 */
+			thread_unlock(td);
+			thread_lock(td);
+		}
+	} else {
+		/*
+		 * To avoid spinning move execution to the other CPU
+		 * which is blocking synchronization. Set highest
+		 * thread priority so that code gets run. The thread
+		 * priority will be restored later.
+		 */
+		sched_prio(td, 0);
+		sched_bind(td, record->er_cpuid);
+	}
+}
+
+/*
+ * Block until ck_epoch_synchronize_wait() returns for this epoch.  May
+ * sleep.  The caller's CPU binding, pin count and priority are saved
+ * here and restored before returning.  Derived from
+ * linux_synchronize_rcu.
+ */
+void
+epoch_wait(epoch_t epoch)
+{
+	struct thread *td;
+	int was_bound;
+	int old_cpu;
+	int old_pinned;
+	u_char old_prio;
+
+	INIT_CHECK(epoch);
+
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+	    "epoch_wait() can sleep");
+
+	td = curthread;
+
+	/* Give up Giant before taking the thread lock, not while holding it. */
+	DROP_GIANT();
+
+	thread_lock(td);
+	old_cpu = PCPU_GET(cpuid);
+	old_pinned = td->td_pinned;
+	old_prio = td->td_priority;
+	was_bound = sched_is_bound(td);
+	sched_unbind(td);
+	td->td_pinned = 0;
+	sched_bind(td, old_cpu);
+
+	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
+
+	/* restore CPU binding, if any */
+	if (was_bound != 0) {
+		sched_bind(td, old_cpu);
+	} else {
+		/* get thread back to initial CPU, if any */
+		if (old_pinned != 0)
+			sched_bind(td, old_cpu);
+		sched_unbind(td);
+	}
+	/* restore pinned after bind */
+	td->td_pinned = old_pinned;
+
+	/* restore thread priority */
+	sched_prio(td, old_prio);
+	thread_unlock(td);
+
+	PICKUP_GIANT();
+}
+
+/*
+ * Queue callback for epoch_call_task(), which invokes it with ctx after
+ * an epoch_wait().  The storage behind ctx holds the queue entry, so it
+ * must stay valid until the callback has run.
+ */
+void
+epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t))
+{
+	epoch_cb_t cb;
+
+	cb = (void *)ctx;
+	cb->ec_callback = callback;
+	mtx_lock(&epoch->e_lock);
+	STAILQ_INSERT_TAIL(&epoch->e_cblist, cb, ec_link);
+	GROUPTASK_ENQUEUE(&epoch->e_gtask);
+	mtx_unlock(&epoch->e_lock);
+}
+
+/*
+ * Task handler: detach every queued callback, wait with epoch_wait(), then
+ * run the detached callbacks in queue order.
+ */
+static void
+epoch_call_task(void *context)
+{
+	epoch_t epoch;
+	epoch_cb_t cb;
+	STAILQ_HEAD(, epoch_cb) tmp_head;
+
+	epoch = context;
+	STAILQ_INIT(&tmp_head);
+
+	mtx_lock(&epoch->e_lock);
+	STAILQ_CONCAT(&tmp_head, &epoch->e_cblist);
+	mtx_unlock(&epoch->e_lock);
+
+	epoch_wait(epoch);
+
+	while ((cb = STAILQ_FIRST(&tmp_head)) != NULL) {
+		/* Unlink before the call; the callback may release the entry. */
+		STAILQ_REMOVE_HEAD(&tmp_head, ec_link);
+		cb->ec_callback((void*)cb);
+	}
+}
+
+/* Return non-zero when the current thread is inside an epoch section. */
+int
+in_epoch(void)
+{
+	return (curthread->td_epochnest != 0);
+}
Index: sys/sys/epoch.h
===================================================================
--- /dev/null
+++ sys/sys/epoch.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2018, Matthew Macy
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of Matthew Macy nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SYS_EPOCH_H_
+#define _SYS_EPOCH_H_
+
+struct epoch;
+typedef struct epoch *epoch_t;
+
+/*
+ * Caller-provided storage that epoch_call() uses for its queue entry.
+ * The layout is private to the implementation.
+ */
+struct epoch_context {
+	void *data[2];
+} __aligned(sizeof(void *));
+
+typedef struct epoch_context *epoch_context_t;
+
+epoch_t epoch_alloc(void);
+void epoch_free(epoch_t epoch);
+void epoch_enter(epoch_t epoch);
+void epoch_exit(epoch_t epoch);
+void epoch_enter_nopreempt(epoch_t epoch);
+void epoch_exit_nopreempt(epoch_t epoch);
+void epoch_wait(epoch_t epoch);
+void epoch_call(epoch_t epoch, epoch_context_t ctx,
+    void (*callback) (epoch_context_t));
+int in_epoch(void);
+
+#endif /* _SYS_EPOCH_H_ */
Index: sys/sys/proc.h
===================================================================
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -243,6 +243,7 @@
 
 /* Cleared during fork1() */
 #define	td_startzero td_flags
 	int		td_flags;	/* (t) TDF_* flags. */
+	u_char		td_epochnest;	/* (k) Private thread epoch nest counter */
 	int		td_inhibitors;	/* (t) Why can not run. */
 	int		td_pflags;	/* (k) Private thread (TDP_*) flags. */
@@ -355,6 +356,7 @@
 	int		td_lastcpu;	/* (t) Last cpu we were on. */
 	int		td_oncpu;	/* (t) Which cpu we are on. */
 	void		*td_lkpi_task;	/* LinuxKPI task struct pointer */
+	TAILQ_ENTRY(thread) td_epochq;	/* (t) Epoch queue. */
 };
 
 struct thread0_storage {