Index: sys/conf/kern.post.mk =================================================================== --- sys/conf/kern.post.mk +++ sys/conf/kern.post.mk @@ -185,13 +185,19 @@ ${CC} ${HACK_EXTRA_FLAGS} -nostdlib hack.c -o hack.pico rm -f hack.c +offset.inc: $S/kern/genoffset.sh genoffset.o + NM='${NM}' NMFLAGS='${NMFLAGS}' sh $S/kern/genoffset.sh genoffset.o > ${.TARGET} + +genoffset.o: $S/kern/genoffset.c + ${CC} -c ${CFLAGS:N-flto:N-fno-common} $S/kern/genoffset.c + assym.inc: $S/kern/genassym.sh genassym.o NM='${NM}' NMFLAGS='${NMFLAGS}' sh $S/kern/genassym.sh genassym.o > ${.TARGET} -genassym.o: $S/$M/$M/genassym.c +genassym.o: $S/$M/$M/genassym.c offset.inc ${CC} -c ${CFLAGS:N-flto:N-fno-common} $S/$M/$M/genassym.c -${SYSTEM_OBJS} genassym.o vers.o: opt_global.h +${SYSTEM_OBJS} genoffset.o genassym.o vers.o: opt_global.h .if !empty(.MAKE.MODE:Unormal:Mmeta) && empty(.MAKE.MODE:Unormal:Mnofilemon) _meta_filemon= 1 @@ -213,10 +219,10 @@ .endif kernel-depend: .depend -SRCS= assym.inc vnode_if.h ${BEFORE_DEPEND} ${CFILES} \ +SRCS= assym.inc offset.inc vnode_if.h ${BEFORE_DEPEND} ${CFILES} \ ${SYSTEM_CFILES} ${GEN_CFILES} ${SFILES} \ ${MFILES:T:S/.m$/.h/} -DEPENDOBJS+= ${SYSTEM_OBJS} genassym.o +DEPENDOBJS+= ${SYSTEM_OBJS} genassym.o genoffset.o DEPENDFILES= ${DEPENDOBJS:O:u:C/^/.depend./} .if ${MAKE_VERSION} < 20160220 DEPEND_MP?= -MP Index: sys/conf/kern.pre.mk =================================================================== --- sys/conf/kern.pre.mk +++ sys/conf/kern.pre.mk @@ -195,7 +195,7 @@ OFED_C_NOIMP= ${CC} -c -o ${.TARGET} ${OFEDCFLAGS} ${WERROR} ${PROF} OFED_C= ${OFED_C_NOIMP} ${.IMPSRC} -GEN_CFILES= $S/$M/$M/genassym.c ${MFILES:T:S/.m$/.c/} +GEN_CFILES= $S/$M/$M/genassym.c $S/kern/genoffset.c ${MFILES:T:S/.m$/.c/} SYSTEM_CFILES= config.c env.c hints.c vnode_if.c SYSTEM_DEP= Makefile ${SYSTEM_OBJS} SYSTEM_OBJS= locore.o ${MDOBJS} ${OBJS} Index: sys/dev/cxgbe/tom/t4_connect.c =================================================================== --- 
sys/dev/cxgbe/tom/t4_connect.c +++ sys/dev/cxgbe/tom/t4_connect.c @@ -115,18 +115,19 @@ struct toepcb *toep = lookup_atid(sc, atid); struct inpcb *inp = toep->inp; struct toedev *tod = &toep->td->tod; + struct epoch_tracker et; free_atid(sc, atid); toep->tid = -1; CURVNET_SET(toep->vnet); if (status != EAGAIN) - INP_INFO_RLOCK(&V_tcbinfo); + NET_EPOCH_ENTER_ET(et); INP_WLOCK(inp); toe_connect_failed(tod, inp, status); final_cpl_received(toep); /* unlocks inp */ if (status != EAGAIN) - INP_INFO_RUNLOCK(&V_tcbinfo); + NET_EPOCH_EXIT_ET(et); CURVNET_RESTORE(); } Index: sys/dev/hwpmc/hwpmc_mod.c =================================================================== --- sys/dev/hwpmc/hwpmc_mod.c +++ sys/dev/hwpmc/hwpmc_mod.c @@ -85,6 +85,9 @@ #define free_domain(addr, type) free(addr, type) #endif +#define PMC_EPOCH_ENTER() struct epoch_tracker pmc_et; epoch_enter_preempt(global_epoch_preempt, &pmc_et) +#define PMC_EPOCH_EXIT() epoch_exit_preempt(global_epoch_preempt, &pmc_et) + /* * Types */ @@ -1752,12 +1755,12 @@ const struct pmc_process *pp; freepath = fullpath = NULL; - MPASS(!in_epoch()); + MPASS(!in_epoch(global_epoch_preempt)); pmc_getfilename((struct vnode *) pkm->pm_file, &fullpath, &freepath); pid = td->td_proc->p_pid; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); /* Inform owners of all system-wide sampling PMCs. 
*/ CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) @@ -1778,7 +1781,7 @@ done: if (freepath) free(freepath, M_TEMP); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); } @@ -1797,12 +1800,12 @@ pid = td->td_proc->p_pid; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_out(po, pid, pkm->pm_address, pkm->pm_address + pkm->pm_size); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL) return; @@ -1824,7 +1827,7 @@ struct pmc_owner *po; struct pmckern_map_in *km, *kmbase; - MPASS(in_epoch() || sx_xlocked(&pmc_sx)); + MPASS(in_epoch(global_epoch_preempt) || sx_xlocked(&pmc_sx)); KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)), ("[pmc,%d] non-sampling PMC (%p) desires mapping information", __LINE__, (void *) pm)); @@ -2106,13 +2109,13 @@ pk = (struct pmckern_procexec *) arg; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); /* Inform owners of SS mode PMCs of the exec event. 
*/ CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_procexec(po, PMC_ID_INVALID, p->p_pid, pk->pm_entryaddr, fullpath); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); PROC_LOCK(p); is_using_hwpmcs = p->p_flag & P_HWPMC; @@ -2242,7 +2245,7 @@ break; case PMC_FN_MUNMAP: - MPASS(in_epoch() || sx_xlocked(&pmc_sx)); + MPASS(in_epoch(global_epoch_preempt) || sx_xlocked(&pmc_sx)); pmc_process_munmap(td, (struct pmckern_map_out *) arg); break; @@ -2479,7 +2482,7 @@ if (mode & PMC_FLAG_ALLOCATE) { if ((ptnew = pmc_thread_descriptor_pool_alloc()) == NULL) { wait_flag = M_WAITOK; - if ((mode & PMC_FLAG_NOWAIT) || in_epoch()) + if ((mode & PMC_FLAG_NOWAIT) || in_epoch(global_epoch_preempt)) wait_flag = M_NOWAIT; ptnew = malloc(THREADENTRY_SIZE, M_PMC, @@ -5070,11 +5073,11 @@ /* * Log a sysexit event to all SS PMC owners. */ - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_sysexit(po, p->p_pid); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); if (!is_using_hwpmcs) return; @@ -5255,13 +5258,13 @@ * If there are system-wide sampling PMCs active, we need to * log all fork events to their owner's logs. 
*/ - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) { pmclog_process_procfork(po, p1->p_pid, newproc->p_pid); pmclog_process_proccreate(po, newproc, 1); } - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); if (!is_using_hwpmcs) return; @@ -5327,11 +5330,11 @@ { struct pmc_owner *po; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_threadcreate(po, td, 1); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); } static void @@ -5339,11 +5342,11 @@ { struct pmc_owner *po; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_threadexit(po, td); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); } static void @@ -5351,11 +5354,11 @@ { struct pmc_owner *po; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_proccreate(po, p, 1 /* sync */); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); } static void @@ -5388,12 +5391,12 @@ /* * Notify owners of system sampling PMCs about KLD operations. */ - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_in(po, (pid_t) -1, (uintfptr_t) lf->address, lf->filename); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); /* * TODO: Notify owners of (all) process-sampling PMCs too. 
@@ -5406,12 +5409,12 @@ { struct pmc_owner *po; - epoch_enter_preempt(global_epoch_preempt); + PMC_EPOCH_ENTER(); CK_LIST_FOREACH(po, &pmc_ss_owners, po_ssnext) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_out(po, (pid_t) -1, (uintfptr_t) address, (uintfptr_t) address + size); - epoch_exit_preempt(global_epoch_preempt); + PMC_EPOCH_EXIT(); /* * TODO: Notify owners of process-sampling PMCs. Index: sys/kern/genoffset.c =================================================================== --- /dev/null +++ sys/kern/genoffset.c @@ -0,0 +1,14 @@ + +#include +__FBSDID("$FreeBSD$"); +#include +#include +#include +#include + +ASSYM(TD_PRE_EPOCH_PRIO, offsetof(struct thread, td_pre_epoch_prio)); +ASSYM(TD_PRIORITY, offsetof(struct thread, td_priority)); +ASSYM(TD_EPOCHNEST, offsetof(struct thread, td_epochnest)); +ASSYM(TD_CRITNEST, offsetof(struct thread, td_critnest)); +ASSYM(TD_PINNED, offsetof(struct thread, td_pinned)); +ASSYM(TD_OWEPREEMPT, offsetof(struct thread, td_owepreempt)); Index: sys/kern/genoffset.sh =================================================================== --- /dev/null +++ sys/kern/genoffset.sh @@ -0,0 +1,81 @@ +#!/bin/sh +# $FreeBSD$ + +# Report the command-line syntax on stderr and exit with status 1. +usage() +{ + echo "usage: genoffset [-o outfile] objfile" >&2 + exit 1 +} + + +# work objfile: print an include-guarded list of "#define NAME value" lines +# decoded from the nm(1) listing of objfile. Each value is rebuilt from the +# symbols NAMEsign and NAMEw0..NAMEw3 that nm lists with type C: the last +# four characters of the first column of each wN line give 16 bits of the +# value (w3 most significant), and a non-zero NAMEsign makes it negative. +work() +{ + echo "#ifndef _OFFSET_INC_" + echo "#define _OFFSET_INC_" + ${NM:='nm'} ${NMFLAGS} "$1" | ${AWK:='awk'} ' + / C .*sign$/ { + sign = substr($1, length($1) - 3, 4) + sub("^0*", "", sign) + if (sign != "") + sign = "-" + } + / C .*w0$/ { + w0 = substr($1, length($1) - 3, 4) + } + / C .*w1$/ { + w1 = substr($1, length($1) - 3, 4) + } + / C .*w2$/ { + w2 = substr($1, length($1) - 3, 4) + } + / C .*w3$/ { + w3 = substr($1, length($1) - 3, 4) + w = w3 w2 w1 w0 + sub("^0*", "", w) + if (w == "") + w = "0" + hex = "" + if (w != "0") + hex = "0x" + sub("w3$", "", $3) + # This still has minor problems representing INT_MIN, etc. 
+ # E.g., + # with 32-bit 2''s complement ints, this prints -0x80000000, + # which has the wrong type (unsigned int). + printf("#define\t%s\t%s%s%s\n", $3, sign, hex, w) + } ' + echo "#endif" +} + + +# Main program: accept an optional "-o outfile", require exactly one +# objfile operand, and run work() on it, sending its output to outfile +# when -o was given and to stdout otherwise. +use_outfile="no" +while getopts "o:" option +do + case "$option" in + o) outfile="$OPTARG" + use_outfile="yes";; + *) usage;; + esac +done +shift $(($OPTIND - 1)) +case $# in +1) ;; +*) usage;; +esac + +if [ "$use_outfile" = "yes" ] +then + work "$1" 3>"$outfile" >&3 3>&- +else + work "$1" +fi + Index: sys/kern/subr_epoch.c =================================================================== --- sys/kern/subr_epoch.c +++ sys/kern/subr_epoch.c @@ -58,13 +58,6 @@ #define MAX_ADAPTIVE_SPIN 1000 #define MAX_EPOCHS 64 -#ifdef __amd64__ -#define EPOCH_ALIGN CACHE_LINE_SIZE*2 -#else -#define EPOCH_ALIGN CACHE_LINE_SIZE -#endif - -CTASSERT(sizeof(epoch_section_t) == sizeof(ck_epoch_section_t)); CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context)); SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW, 0, "epoch information"); SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW, 0, "epoch stats"); @@ -100,26 +93,8 @@ CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry, ck_epoch_entry_container) -typedef struct epoch_record { - ck_epoch_record_t er_record; - volatile struct threadlist er_tdlist; - volatile uint32_t er_gen; - uint32_t er_cpuid; -} *epoch_record_t; - -struct epoch_pcpu_state { - struct epoch_record eps_record; -} __aligned(EPOCH_ALIGN); - -struct epoch { - struct ck_epoch e_epoch __aligned(EPOCH_ALIGN); - struct epoch_pcpu_state *e_pcpu_dom[MAXMEMDOM] __aligned(EPOCH_ALIGN); - int e_idx; - int e_flags; - struct epoch_pcpu_state *e_pcpu[0]; -}; - -epoch_t allepochs[MAX_EPOCHS]; + + epoch_t allepochs[MAX_EPOCHS]; DPCPU_DEFINE(struct grouptask, epoch_cb_task); DPCPU_DEFINE(int, epoch_cb_count); @@ -192,17 +167,15 @@ epoch_init_numa(epoch_t epoch) { int domain, cpu_offset; - struct epoch_pcpu_state *eps; epoch_record_t er; for (domain = 0; 
domain < vm_ndomains; domain++) { - eps = malloc_domain(sizeof(*eps) * domcount[domain], M_EPOCH, + er = malloc_domain(sizeof(*er) * domcount[domain], M_EPOCH, domain, M_ZERO | M_WAITOK); - epoch->e_pcpu_dom[domain] = eps; + epoch->e_pcpu_dom[domain] = er; cpu_offset = domoffsets[domain]; - for (int i = 0; i < domcount[domain]; i++, eps++) { - epoch->e_pcpu[cpu_offset + i] = eps; - er = &eps->eps_record; + for (int i = 0; i < domcount[domain]; i++, er++) { + epoch->e_pcpu[cpu_offset + i] = er; ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL); TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist); er->er_cpuid = cpu_offset + i; @@ -213,14 +186,12 @@ static void epoch_init_legacy(epoch_t epoch) { - struct epoch_pcpu_state *eps; epoch_record_t er; - eps = malloc(sizeof(*eps) * mp_ncpus, M_EPOCH, M_ZERO | M_WAITOK); - epoch->e_pcpu_dom[0] = eps; - for (int i = 0; i < mp_ncpus; i++, eps++) { - epoch->e_pcpu[i] = eps; - er = &eps->eps_record; + er = malloc(sizeof(*er) * mp_ncpus, M_EPOCH, M_ZERO | M_WAITOK); + epoch->e_pcpu_dom[0] = er; + for (int i = 0; i < mp_ncpus; i++, er++) { + epoch->e_pcpu[i] = er; ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL); TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist); er->er_cpuid = i; @@ -253,12 +224,12 @@ { int domain; #ifdef INVARIANTS - struct epoch_pcpu_state *eps; + struct epoch_record *er; int cpu; CPU_FOREACH(cpu) { - eps = epoch->e_pcpu[cpu]; - MPASS(TAILQ_EMPTY(&eps->eps_record.er_tdlist)); + er = epoch->e_pcpu[cpu]; + MPASS(TAILQ_EMPTY(&er->er_tdlist)); } #endif allepochs[epoch->e_idx] = NULL; @@ -271,95 +242,32 @@ free(epoch, M_EPOCH); } -#define INIT_CHECK(epoch) \ - do { \ - if (__predict_false((epoch) == NULL)) \ - return; \ - } while (0) - void -epoch_enter_preempt_internal(epoch_t epoch, struct thread *td) +epoch_enter_preempt_KBI(epoch_t epoch, epoch_tracker_t et) { - struct epoch_pcpu_state *eps; - MPASS(cold || epoch != NULL); - INIT_CHECK(epoch); - MPASS(epoch->e_flags & 
EPOCH_PREEMPT); - critical_enter(); - td->td_pre_epoch_prio = td->td_priority; - eps = epoch->e_pcpu[curcpu]; -#ifdef INVARIANTS - MPASS(td->td_epochnest < UCHAR_MAX - 2); - if (td->td_epochnest > 1) { - struct thread *curtd; - int found = 0; - - TAILQ_FOREACH(curtd, &eps->eps_record.er_tdlist, td_epochq) - if (curtd == td) - found = 1; - KASSERT(found, ("recursing on a second epoch")); - critical_exit(); - return; - } -#endif - TAILQ_INSERT_TAIL(&eps->eps_record.er_tdlist, td, td_epochq); - sched_pin(); - ck_epoch_begin(&eps->eps_record.er_record, (ck_epoch_section_t *)&td->td_epoch_section); - critical_exit(); + epoch_enter_preempt(epoch, et); } - void -epoch_enter(epoch_t epoch) +epoch_exit_preempt_KBI(epoch_t epoch, epoch_tracker_t et) { - ck_epoch_record_t *record; - struct thread *td; - - MPASS(cold || epoch != NULL); - INIT_CHECK(epoch); - td = curthread; - critical_enter(); - td->td_epochnest++; - record = &epoch->e_pcpu[curcpu]->eps_record.er_record; - ck_epoch_begin(record, NULL); + epoch_exit_preempt(epoch, et); } void -epoch_exit_preempt_internal(epoch_t epoch, struct thread *td) +epoch_enter_KBI(epoch_t epoch) { - struct epoch_pcpu_state *eps; - - MPASS(td->td_epochnest == 0); - INIT_CHECK(epoch); - critical_enter(); - eps = epoch->e_pcpu[curcpu]; - MPASS(epoch->e_flags & EPOCH_PREEMPT); - ck_epoch_end(&eps->eps_record.er_record, (ck_epoch_section_t *)&td->td_epoch_section); - TAILQ_REMOVE(&eps->eps_record.er_tdlist, td, td_epochq); - eps->eps_record.er_gen++; - sched_unpin(); - if (__predict_false(td->td_pre_epoch_prio != td->td_priority)) { - thread_lock(td); - sched_prio(td, td->td_pre_epoch_prio); - thread_unlock(td); - } - critical_exit(); + epoch_enter(epoch); } void -epoch_exit(epoch_t epoch) +epoch_exit_KBI(epoch_t epoch) { - ck_epoch_record_t *record; - struct thread *td; - INIT_CHECK(epoch); - td = curthread; - td->td_epochnest--; - record = &epoch->e_pcpu[curcpu]->eps_record.er_record; - ck_epoch_end(record, NULL); - critical_exit(); + 
epoch_exit(epoch); } /* @@ -371,7 +279,8 @@ void *arg __unused) { epoch_record_t record; - struct thread *td, *tdwait, *owner; + struct thread *td, *owner, *curwaittd; + struct epoch_thread *tdwait; struct turnstile *ts; struct lock_object *lock; int spincount, gen; @@ -389,13 +298,13 @@ * overhead of a migration */ if ((tdwait = TAILQ_FIRST(&record->er_tdlist)) != NULL && - TD_IS_RUNNING(tdwait)) { + TD_IS_RUNNING(tdwait->et_td)) { gen = record->er_gen; thread_unlock(td); do { cpu_spinwait(); } while (tdwait == TAILQ_FIRST(&record->er_tdlist) && - gen == record->er_gen && TD_IS_RUNNING(tdwait) && + gen == record->er_gen && TD_IS_RUNNING(tdwait->et_td) && spincount++ < MAX_ADAPTIVE_SPIN); thread_lock(td); return; @@ -426,28 +335,29 @@ * priority thread (highest prio value) and drop our priority * to match to allow it to run. */ - TAILQ_FOREACH(tdwait, &record->er_tdlist, td_epochq) { + TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) { /* * Propagate our priority to any other waiters to prevent us * from starving them. They will have their original priority * restore on exit from epoch_wait(). */ - if (!TD_IS_INHIBITED(tdwait) && tdwait->td_priority > td->td_priority) { + curwaittd = tdwait->et_td; + if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) { critical_enter(); thread_unlock(td); - thread_lock(tdwait); - sched_prio(tdwait, td->td_priority); - thread_unlock(tdwait); + thread_lock(curwaittd); + sched_prio(curwaittd, td->td_priority); + thread_unlock(curwaittd); thread_lock(td); critical_exit(); } - if (TD_IS_INHIBITED(tdwait) && TD_ON_LOCK(tdwait) && - ((ts = tdwait->td_blocked) != NULL)) { + if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) && + ((ts = curwaittd->td_blocked) != NULL)) { /* * We unlock td to allow turnstile_wait to reacquire the * the thread lock. 
Before unlocking it we enter a critical * section to prevent preemption after we reenable interrupts - * by dropping the thread lock in order to prevent tdwait + * by dropping the thread lock in order to prevent curwaittd * from getting to run. */ critical_enter(); @@ -456,15 +366,15 @@ /* * The owner pointer indicates that the lock succeeded. Only * in case we hold the lock and the turnstile we locked is still - * the one that tdwait is blocked on can we continue. Otherwise + * the one that curwaittd is blocked on can we continue. Otherwise * The turnstile pointer has been changed out from underneath - * us, as in the case where the lock holder has signalled tdwait, + * us, as in the case where the lock holder has signalled curwaittd, * and we need to continue. */ - if (owner != NULL && ts == tdwait->td_blocked) { - MPASS(TD_IS_INHIBITED(tdwait) && TD_ON_LOCK(tdwait)); + if (owner != NULL && ts == curwaittd->td_blocked) { + MPASS(TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd)); critical_exit(); - turnstile_wait(ts, owner, tdwait->td_tsqueue); + turnstile_wait(ts, owner, curwaittd->td_tsqueue); counter_u64_add(turnstile_count, 1); thread_lock(td); return; @@ -569,7 +479,7 @@ void epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t)) { - struct epoch_pcpu_state *eps; + epoch_record_t er; ck_epoch_entry_t *cb; cb = (void *)ctx; @@ -585,8 +495,8 @@ critical_enter(); *DPCPU_PTR(epoch_cb_count) += 1; - eps = epoch->e_pcpu[curcpu]; - ck_epoch_call(&eps->eps_record.er_record, cb, (ck_epoch_cb_t *)callback); + er = epoch->e_pcpu[curcpu]; + ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback); critical_exit(); return; boottime: @@ -608,7 +518,7 @@ for (total = i = 0; i < epoch_count; i++) { if (__predict_false((epoch = allepochs[i]) == NULL)) continue; - record = &epoch->e_pcpu[curcpu]->eps_record.er_record; + record = &epoch->e_pcpu[curcpu]->er_record; if ((npending = record->n_pending) == 0) continue; 
ck_epoch_poll_deferred(record, &cb_stack); @@ -632,7 +542,24 @@ } int -in_epoch(void) +in_epoch(epoch_t epoch) { - return (curthread->td_epochnest != 0); + struct epoch_thread *tdwait; + struct thread *td; + epoch_record_t er; + + td = curthread; + if (td->td_epochnest == 0) + return (0); + if (__predict_false((epoch) == NULL)) + return (0); + critical_enter(); + er = epoch->e_pcpu[curcpu]; + TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link) + if (tdwait->et_td == td) { + critical_exit(); + return (1); + } + critical_exit(); + return (0); } Index: sys/net/if.c =================================================================== --- sys/net/if.c +++ sys/net/if.c @@ -1760,29 +1760,35 @@ void if_addr_rlock(struct ifnet *ifp) { - - IF_ADDR_RLOCK(ifp); + MPASS(*(uint64_t *)&ifp->if_addr_et == 0); + epoch_enter_preempt(net_epoch_preempt, &ifp->if_addr_et); } void if_addr_runlock(struct ifnet *ifp) { - - IF_ADDR_RUNLOCK(ifp); + epoch_exit_preempt(net_epoch_preempt, &ifp->if_addr_et); +#ifdef INVARIANTS + bzero(&ifp->if_addr_et, sizeof(struct epoch_tracker)); +#endif } void if_maddr_rlock(if_t ifp) { - IF_ADDR_RLOCK((struct ifnet *)ifp); + MPASS(*(uint64_t *)&ifp->if_maddr_et == 0); + epoch_enter_preempt(net_epoch_preempt, &ifp->if_maddr_et); } void if_maddr_runlock(if_t ifp) { - IF_ADDR_RUNLOCK((struct ifnet *)ifp); + epoch_exit_preempt(net_epoch_preempt, &ifp->if_maddr_et); +#ifdef INVARIANTS + bzero(&ifp->if_maddr_et, sizeof(struct epoch_tracker)); +#endif } /* @@ -1926,7 +1932,7 @@ struct ifnet *ifp; struct ifaddr *ifa; - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) @@ -1969,7 +1975,7 @@ struct ifnet *ifp; struct ifaddr *ifa; - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; @@ -1999,7 +2005,7 @@ 
struct ifnet *ifp; struct ifaddr *ifa; - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; @@ -2032,7 +2038,7 @@ u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. @@ -2069,7 +2075,6 @@ */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { - IF_ADDR_RUNLOCK(ifp); goto done; } } else { @@ -2128,7 +2133,8 @@ if (af >= AF_MAX) return (NULL); - MPASS(in_epoch()); + + MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; Index: sys/net/if_gif.h =================================================================== --- sys/net/if_gif.h +++ sys/net/if_gif.h @@ -96,8 +96,8 @@ /* mbuf adjust factor to force 32-bit alignment of IP header */ #define ETHERIP_ALIGN 2 -#define GIF_RLOCK() epoch_enter_preempt(net_epoch_preempt) -#define GIF_RUNLOCK() epoch_exit_preempt(net_epoch_preempt) +#define GIF_RLOCK() struct epoch_tracker gif_et; epoch_enter_preempt(net_epoch_preempt, &gif_et) +#define GIF_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &gif_et) #define GIF_WAIT() epoch_wait_preempt(net_epoch_preempt) /* Prototypes */ Index: sys/net/if_gre.h =================================================================== --- sys/net/if_gre.h +++ sys/net/if_gre.h @@ -91,8 +91,8 @@ #endif #define GRE2IFP(sc) ((sc)->gre_ifp) -#define GRE_RLOCK() epoch_enter_preempt(net_epoch_preempt) -#define GRE_RUNLOCK() epoch_exit_preempt(net_epoch_preempt) +#define GRE_RLOCK() struct epoch_tracker gre_et; epoch_enter_preempt(net_epoch_preempt, &gre_et) +#define GRE_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &gre_et) #define GRE_WAIT() epoch_wait_preempt(net_epoch_preempt) #define gre_hdr gre_uhdr.hdr Index: 
sys/net/if_lagg.c =================================================================== --- sys/net/if_lagg.c +++ sys/net/if_lagg.c @@ -73,10 +73,10 @@ #include #include -#define LAGG_RLOCK() epoch_enter_preempt(net_epoch_preempt) -#define LAGG_RUNLOCK() epoch_exit_preempt(net_epoch_preempt) -#define LAGG_RLOCK_ASSERT() MPASS(in_epoch()) -#define LAGG_UNLOCK_ASSERT() MPASS(!in_epoch()) +#define LAGG_RLOCK() struct epoch_tracker lagg_et; epoch_enter_preempt(net_epoch_preempt, &lagg_et) +#define LAGG_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &lagg_et) +#define LAGG_RLOCK_ASSERT() MPASS(in_epoch(net_epoch_preempt)) +#define LAGG_UNLOCK_ASSERT() MPASS(!in_epoch(net_epoch_preempt)) #define LAGG_SX_INIT(_sc) sx_init(&(_sc)->sc_sx, "if_lagg sx") #define LAGG_SX_DESTROY(_sc) sx_destroy(&(_sc)->sc_sx) @@ -1791,6 +1791,7 @@ lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_port *lp_next, *rval = NULL; + struct epoch_tracker net_et; /* * Search a port which reports an active link state. 
@@ -1809,15 +1810,14 @@ } search: - LAGG_RLOCK(); + epoch_enter_preempt(net_epoch_preempt, &net_et); CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp_next)) { - LAGG_RUNLOCK(); - rval = lp_next; - goto found; + epoch_exit_preempt(net_epoch_preempt, &net_et); + return (lp_next); } } - LAGG_RUNLOCK(); + epoch_exit_preempt(net_epoch_preempt, &net_et); found: return (rval); } Index: sys/net/if_me.c =================================================================== --- sys/net/if_me.c +++ sys/net/if_me.c @@ -87,8 +87,8 @@ CK_LIST_HEAD(me_list, me_softc); #define ME2IFP(sc) ((sc)->me_ifp) #define ME_READY(sc) ((sc)->me_src.s_addr != 0) -#define ME_RLOCK() epoch_enter_preempt(net_epoch_preempt) -#define ME_RUNLOCK() epoch_exit_preempt(net_epoch_preempt) +#define ME_RLOCK() struct epoch_tracker me_et; epoch_enter_preempt(net_epoch_preempt, &me_et) +#define ME_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &me_et) #define ME_WAIT() epoch_wait_preempt(net_epoch_preempt) #ifndef ME_HASH_SIZE @@ -315,7 +315,7 @@ if (V_me_hashtbl == NULL) return (0); - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); ip = mtod(m, const struct ip *); CK_LIST_FOREACH(sc, &ME_HASH(ip->ip_dst.s_addr, ip->ip_src.s_addr), chain) { Index: sys/net/if_var.h =================================================================== --- sys/net/if_var.h +++ sys/net/if_var.h @@ -381,6 +381,8 @@ */ struct netdump_methods *if_netdump_methods; struct epoch_context if_epoch_ctx; + struct epoch_tracker if_addr_et; + struct epoch_tracker if_maddr_et; /* * Spare fields to be added before branching a stable branch, so @@ -398,15 +400,17 @@ */ #define IF_ADDR_LOCK_INIT(if) mtx_init(&(if)->if_addr_lock, "if_addr_lock", NULL, MTX_DEF) #define IF_ADDR_LOCK_DESTROY(if) mtx_destroy(&(if)->if_addr_lock) -#define IF_ADDR_RLOCK(if) epoch_enter_preempt(net_epoch_preempt); -#define IF_ADDR_RUNLOCK(if) epoch_exit_preempt(net_epoch_preempt); +#define IF_ADDR_RLOCK(if) struct epoch_tracker 
if_addr_et; epoch_enter_preempt(net_epoch_preempt, &if_addr_et); +#define IF_ADDR_RUNLOCK(if) epoch_exit_preempt(net_epoch_preempt, &if_addr_et); #define IF_ADDR_WLOCK(if) mtx_lock(&(if)->if_addr_lock) #define IF_ADDR_WUNLOCK(if) mtx_unlock(&(if)->if_addr_lock) -#define IF_ADDR_LOCK_ASSERT(if) MPASS(in_epoch() || mtx_owned(&(if)->if_addr_lock)) +#define IF_ADDR_LOCK_ASSERT(if) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(if)->if_addr_lock)) #define IF_ADDR_WLOCK_ASSERT(if) mtx_assert(&(if)->if_addr_lock, MA_OWNED) -#define NET_EPOCH_ENTER() epoch_enter_preempt(net_epoch_preempt) -#define NET_EPOCH_EXIT() epoch_exit_preempt(net_epoch_preempt) +#define NET_EPOCH_ENTER() struct epoch_tracker nep_et; epoch_enter_preempt(net_epoch_preempt, &nep_et) +#define NET_EPOCH_ENTER_ET(et) epoch_enter_preempt(net_epoch_preempt, &(et)) +#define NET_EPOCH_EXIT() epoch_exit_preempt(net_epoch_preempt, &nep_et) +#define NET_EPOCH_EXIT_ET(et) epoch_exit_preempt(net_epoch_preempt, &(et)) /* @@ -482,16 +486,16 @@ mtx_init(&(ifp)->if_afdata_lock, "if_afdata", NULL, MTX_DEF) #define IF_AFDATA_WLOCK(ifp) mtx_lock(&(ifp)->if_afdata_lock) -#define IF_AFDATA_RLOCK(ifp) epoch_enter_preempt(net_epoch_preempt) +#define IF_AFDATA_RLOCK(ifp) struct epoch_tracker if_afdata_et; epoch_enter_preempt(net_epoch_preempt, &if_afdata_et) #define IF_AFDATA_WUNLOCK(ifp) mtx_unlock(&(ifp)->if_afdata_lock) -#define IF_AFDATA_RUNLOCK(ifp) epoch_exit_preempt(net_epoch_preempt) +#define IF_AFDATA_RUNLOCK(ifp) epoch_exit_preempt(net_epoch_preempt, &if_afdata_et) #define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp) #define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp) #define IF_AFDATA_TRYLOCK(ifp) mtx_trylock(&(ifp)->if_afdata_lock) #define IF_AFDATA_DESTROY(ifp) mtx_destroy(&(ifp)->if_afdata_lock) -#define IF_AFDATA_LOCK_ASSERT(ifp) MPASS(in_epoch() || mtx_owned(&(ifp)->if_afdata_lock)) -#define IF_AFDATA_RLOCK_ASSERT(ifp) MPASS(in_epoch()); +#define IF_AFDATA_LOCK_ASSERT(ifp) MPASS(in_epoch(net_epoch_preempt) || 
mtx_owned(&(ifp)->if_afdata_lock)) +#define IF_AFDATA_RLOCK_ASSERT(ifp) MPASS(in_epoch(net_epoch_preempt)); #define IF_AFDATA_WLOCK_ASSERT(ifp) mtx_assert(&(ifp)->if_afdata_lock, MA_OWNED) #define IF_AFDATA_UNLOCK_ASSERT(ifp) mtx_assert(&(ifp)->if_afdata_lock, MA_NOTOWNED) @@ -573,16 +577,16 @@ * write, but also whether it was acquired with sleep support or not. */ #define IFNET_RLOCK_ASSERT() sx_assert(&ifnet_sxlock, SA_SLOCKED) -#define IFNET_RLOCK_NOSLEEP_ASSERT() MPASS(in_epoch()) +#define IFNET_RLOCK_NOSLEEP_ASSERT() MPASS(in_epoch(net_epoch_preempt)) #define IFNET_WLOCK_ASSERT() do { \ sx_assert(&ifnet_sxlock, SA_XLOCKED); \ rw_assert(&ifnet_rwlock, RA_WLOCKED); \ } while (0) #define IFNET_RLOCK() sx_slock(&ifnet_sxlock) -#define IFNET_RLOCK_NOSLEEP() epoch_enter_preempt(net_epoch_preempt) +#define IFNET_RLOCK_NOSLEEP() struct epoch_tracker ifnet_rlock_et; epoch_enter_preempt(net_epoch_preempt, &ifnet_rlock_et) #define IFNET_RUNLOCK() sx_sunlock(&ifnet_sxlock) -#define IFNET_RUNLOCK_NOSLEEP() epoch_exit_preempt(net_epoch_preempt) +#define IFNET_RUNLOCK_NOSLEEP() epoch_exit_preempt(net_epoch_preempt, &ifnet_rlock_et) /* * Look up an ifnet given its index; the _ref variant also acquires a Index: sys/net/route.c =================================================================== --- sys/net/route.c +++ sys/net/route.c @@ -733,7 +733,7 @@ struct ifaddr *ifa; int not_found = 0; - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); if ((flags & RTF_GATEWAY) == 0) { /* * If we are adding a route to an interface, Index: sys/net/rtsock.c =================================================================== --- sys/net/rtsock.c +++ sys/net/rtsock.c @@ -1736,15 +1736,15 @@ struct rt_addrinfo info; int len, error = 0; struct sockaddr_storage ss; + struct epoch_tracker et; bzero((caddr_t)&info, sizeof(info)); bzero(&ifd, sizeof(ifd)); - IFNET_RLOCK_NOSLEEP(); + NET_EPOCH_ENTER_ET(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (w->w_arg && w->w_arg != 
ifp->if_index) continue; if_data_copy(ifp, &ifd); - IF_ADDR_RLOCK(ifp); ifa = ifp->if_addr; info.rti_info[RTAX_IFP] = ifa->ifa_addr; error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len); @@ -1785,15 +1785,12 @@ goto done; } } - IF_ADDR_RUNLOCK(ifp); info.rti_info[RTAX_IFA] = NULL; info.rti_info[RTAX_NETMASK] = NULL; info.rti_info[RTAX_BRD] = NULL; } done: - if (ifp != NULL) - IF_ADDR_RUNLOCK(ifp); - IFNET_RUNLOCK_NOSLEEP(); + NET_EPOCH_EXIT_ET(et); return (error); } Index: sys/netinet/in_gif.c =================================================================== --- sys/netinet/in_gif.c +++ sys/netinet/in_gif.c @@ -224,7 +224,7 @@ int len; /* prepend new IP header */ - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); len = sizeof(struct ip); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) @@ -263,7 +263,7 @@ struct ip *ip; uint8_t ecn; - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); if (sc == NULL) { m_freem(m); KMOD_IPSTAT_INC(ips_nogif); @@ -292,7 +292,7 @@ if (V_ipv4_hashtbl == NULL) return (0); - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); ip = mtod(m, const struct ip *); /* * NOTE: it is safe to iterate without any locking here, because softc Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -274,6 +274,7 @@ uint8_t inp_spare_byte; /* Compiler hole */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct socket *inp_socket; /* (i) back pointer to socket */ + epoch_tracker_t inp_et; /* pointer to on-stack epoch_tracker */ uint32_t inp_hptsslot; /* Hpts wheel slot this tcb is Lock(i&b) */ uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */ TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */ @@ -632,16 +633,19 @@ #define INP_INFO_LOCK_INIT(ipi, d) \ mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE) #define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock) 
-#define INP_INFO_RLOCK(ipi) NET_EPOCH_ENTER() +#define INP_INFO_RLOCK(ipi) struct epoch_tracker inp_info_et; NET_EPOCH_ENTER_ET(inp_info_et) +#define INP_INFO_RLOCK_ET(ipi, et) NET_EPOCH_ENTER_ET((et)) #define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock) #define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock) #define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock) -#define INP_INFO_RUNLOCK(ipi) NET_EPOCH_EXIT() +#define INP_INFO_RUNLOCK(ipi) NET_EPOCH_EXIT_ET(inp_info_et) +#define INP_INFO_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT_ET((et)) +#define INP_INFO_RUNLOCK_TP(ipi, tp) NET_EPOCH_EXIT_ET(*(tp)->t_inpcb->inp_et) #define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock) -#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_lock)) -#define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch()) +#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock)) +#define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt)) #define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED) -#define INP_INFO_UNLOCK_ASSERT(ipi) MPASS(!in_epoch() && !mtx_owned(&(ipi)->ipi_lock)) +#define INP_INFO_UNLOCK_ASSERT(ipi) MPASS(!in_epoch(net_epoch_preempt) && !mtx_owned(&(ipi)->ipi_lock)) #define INP_LIST_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_list_lock, (d), 0) @@ -664,11 +668,13 @@ #define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF) #define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock) -#define INP_HASH_RLOCK(ipi) NET_EPOCH_ENTER() +#define INP_HASH_RLOCK(ipi) struct epoch_tracker inp_hash_et; epoch_enter_preempt(net_epoch_preempt, &inp_hash_et) +#define INP_HASH_RLOCK_ET(ipi, et) epoch_enter_preempt(net_epoch_preempt, &(et)) #define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock) -#define INP_HASH_RUNLOCK(ipi) NET_EPOCH_EXIT() +#define INP_HASH_RUNLOCK(ipi) NET_EPOCH_EXIT_ET(inp_hash_et) +#define INP_HASH_RUNLOCK_ET(ipi, et) 
NET_EPOCH_EXIT_ET((et)) #define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_hash_lock)) +#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock)) #define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED); #define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ Index: sys/netinet/ip_divert.c =================================================================== --- sys/netinet/ip_divert.c +++ sys/netinet/ip_divert.c @@ -636,6 +636,7 @@ struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; + struct epoch_tracker net_et; /* * The process of preparing the TCB list is too time-consuming and @@ -654,10 +655,10 @@ /* * OK, now we're committed to doing something. */ - INP_INFO_RLOCK(&V_divcbinfo); + epoch_enter_preempt(net_epoch_preempt, &net_et); gencnt = V_divcbinfo.ipi_gencnt; n = V_divcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_divcbinfo); + epoch_exit_preempt(net_epoch_preempt, &net_et); error = sysctl_wire_old_buffer(req, 2 * sizeof(xig) + n*sizeof(struct xinpcb)); @@ -675,7 +676,7 @@ il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS); inp_list = il->il_inp_list; - INP_INFO_RLOCK(&V_divcbinfo); + epoch_enter_preempt(net_epoch_preempt, &net_et); for (inp = CK_LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; inp = CK_LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); @@ -686,7 +687,7 @@ } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_divcbinfo); + epoch_exit_preempt(net_epoch_preempt, &net_et); n = i; error = 0; Index: sys/netinet/ip_encap.c =================================================================== --- sys/netinet/ip_encap.c +++ sys/netinet/ip_encap.c @@ -112,8 +112,8 @@ MTX_SYSINIT(encapmtx, &encapmtx, "encapmtx", MTX_DEF); #define ENCAP_WLOCK() mtx_lock(&encapmtx) #define ENCAP_WUNLOCK() mtx_unlock(&encapmtx) -#define 
ENCAP_RLOCK() epoch_enter_preempt(net_epoch_preempt) -#define ENCAP_RUNLOCK() epoch_exit_preempt(net_epoch_preempt) +#define ENCAP_RLOCK() struct epoch_tracker encap_et; epoch_enter_preempt(net_epoch_preempt, &encap_et) +#define ENCAP_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &encap_et) #define ENCAP_WAIT() epoch_wait_preempt(net_epoch_preempt) static struct encaptab * Index: sys/netinet/ip_gre.c =================================================================== --- sys/netinet/ip_gre.c +++ sys/netinet/ip_gre.c @@ -118,7 +118,7 @@ if (V_ipv4_hashtbl == NULL) return (0); - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); ip = mtod(m, const struct ip *); CK_LIST_FOREACH(sc, &GRE_HASH(ip->ip_dst.s_addr, ip->ip_src.s_addr), chain) { Index: sys/netinet/raw_ip.c =================================================================== --- sys/netinet/raw_ip.c +++ sys/netinet/raw_ip.c @@ -1037,6 +1037,7 @@ struct inpcb *inp, **inp_list; inp_gen_t gencnt; struct xinpgen xig; + struct epoch_tracker net_et; /* * The process of preparing the TCB list is too time-consuming and @@ -1055,10 +1056,10 @@ /* * OK, now we're committed to doing something. 
*/ - INP_INFO_RLOCK(&V_ripcbinfo); + epoch_enter_preempt(net_epoch_preempt, &net_et); gencnt = V_ripcbinfo.ipi_gencnt; n = V_ripcbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_ripcbinfo); + epoch_exit_preempt(net_epoch_preempt, &net_et); xig.xig_len = sizeof xig; xig.xig_count = n; @@ -1071,7 +1072,7 @@ il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS); inp_list = il->il_inp_list; - INP_INFO_RLOCK(&V_ripcbinfo); + epoch_enter_preempt(net_epoch_preempt, &net_et); for (inp = CK_LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; inp = CK_LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); @@ -1082,7 +1083,7 @@ } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_ripcbinfo); + epoch_exit_preempt(net_epoch_preempt, &net_et); n = i; error = 0; Index: sys/netinet/tcp_hpts.c =================================================================== --- sys/netinet/tcp_hpts.c +++ sys/netinet/tcp_hpts.c @@ -1145,6 +1145,7 @@ int16_t set_cpu; uint32_t did_prefetch = 0; int32_t ti_locked = TI_UNLOCKED; + struct epoch_tracker et; HPTS_MTX_ASSERT(hpts); while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) { @@ -1161,7 +1162,7 @@ mtx_unlock(&hpts->p_mtx); CURVNET_SET(inp->inp_vnet); if (drop_reason) { - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); ti_locked = TI_RLOCKED; } else { ti_locked = TI_UNLOCKED; @@ -1172,7 +1173,7 @@ out: hpts->p_inp = NULL; if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } if (in_pcbrele_wlocked(inp) == 0) { INP_WUNLOCK(inp); @@ -1201,7 +1202,7 @@ n = m->m_nextpkt; } tp = tcp_drop(tp, drop_reason); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); if (tp == NULL) { INP_WLOCK(inp); } @@ -1234,7 +1235,7 @@ (m->m_pkthdr.pace_lock == TI_RLOCKED || tp->t_state != TCPS_ESTABLISHED)) { ti_locked = TI_RLOCKED; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); m = tp->t_in_pkt; } if (in_newts_every_tcb) { @@ -1289,7 +1290,7 
@@ n = m->m_nextpkt; if (m != NULL && m->m_pkthdr.pace_lock == TI_RLOCKED) { - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); ti_locked = TI_RLOCKED; } else ti_locked = TI_UNLOCKED; @@ -1316,14 +1317,14 @@ if (ti_locked == TI_UNLOCKED && (tp->t_state != TCPS_ESTABLISHED)) { ti_locked = TI_RLOCKED; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); } } /** end while(m) */ } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */ if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); ti_locked = TI_UNLOCKED; Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -583,6 +583,7 @@ int rstreason = 0; /* For badport_bandlim accounting purposes */ uint8_t iptos; struct m_tag *fwd_tag = NULL; + struct epoch_tracker et; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; @@ -773,10 +774,15 @@ * connection in TIMEWAIT and SYNs not targeting a listening socket. */ if ((thflags & (TH_FIN | TH_RST)) != 0) { - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); + if (inp) + inp->inp_et = &et; ti_locked = TI_RLOCKED; - } else + } else { ti_locked = TI_UNLOCKED; + if (inp) + inp->inp_et = NULL; + } /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 
@@ -962,7 +968,9 @@ */ if (inp->inp_flags & INP_TIMEWAIT) { if (ti_locked == TI_UNLOCKED) { - INP_INFO_RLOCK(); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); + if (inp) + inp->inp_et = &et; ti_locked = TI_RLOCKED; } INP_INFO_RLOCK_ASSERT(&V_tcbinfo); @@ -974,7 +982,7 @@ */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (IPPROTO_DONE); } /* @@ -1011,7 +1019,8 @@ (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) && !IS_FASTOPEN(tp->t_flags)))) { if (ti_locked == TI_UNLOCKED) { - INP_INFO_RLOCK(); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); + inp->inp_et = &et; ti_locked = TI_RLOCKED; } INP_INFO_RLOCK_ASSERT(&V_tcbinfo); @@ -1139,6 +1148,9 @@ tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); + inp->inp_et = NULL; + if (ti_locked == TI_RLOCKED) + inp->inp_et = &et; /* * Process the segment and the data it * contains. tcp_do_segment() consumes @@ -1350,8 +1362,9 @@ * Only the listen socket is unlocked by syncache_add(). */ if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); ti_locked = TI_UNLOCKED; + inp->inp_et = &et; } INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); @@ -1379,6 +1392,10 @@ #endif TCP_PROBE5(receive, NULL, tp, m, tp, th); + inp->inp_et = NULL; + if (ti_locked == TI_RLOCKED) + inp->inp_et = &et; + /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. 
tcp_do_segment() always consumes the mbuf chain, unlocks @@ -1392,7 +1409,9 @@ TCP_PROBE5(receive, NULL, tp, m, tp, th); if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + if (inp != NULL) + inp->inp_et = NULL; ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -1416,8 +1435,9 @@ TCP_PROBE5(receive, NULL, tp, m, tp, th); if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); ti_locked = TI_UNLOCKED; + inp->inp_et = NULL; } #ifdef INVARIANTS else { @@ -1514,6 +1534,8 @@ struct in_conninfo *inc; struct mbuf *mfree; struct tcpopt to; + struct inpcb *inp; + struct epoch_tracker *et; int tfo_syn; #ifdef TCPDEBUG @@ -1526,11 +1548,16 @@ short ostate = 0; #endif thflags = th->th_flags; - inc = &tp->t_inpcb->inp_inc; + inp = tp->t_inpcb; + inc = &inp->inp_inc; tp->sackhint.last_sack_ack = 0; sack_changed = 0; nsegs = max(1, m->m_pkthdr.lro_nsegs); - + if (ti_locked == TI_UNLOCKED) + MPASS(inp->inp_et == NULL); + else + MPASS(inp->inp_et != NULL); + et = inp->inp_et; /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we @@ -1761,8 +1788,9 @@ * This is a pure ack for outstanding data. */ if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); ti_locked = TI_UNLOCKED; + inp->inp_et = NULL; TCPSTAT_INC(tcps_predack); @@ -1868,8 +1896,9 @@ * buffer space to take it. 
*/ if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); ti_locked = TI_UNLOCKED; + inp->inp_et = NULL; /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) @@ -2899,7 +2928,7 @@ if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); m_freem(m); return; } @@ -3136,13 +3165,14 @@ ti_locked)); tcp_twstart(tp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); return; } } if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); ti_locked = TI_UNLOCKED; + inp->inp_et = NULL; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) @@ -3199,8 +3229,9 @@ #endif TCP_PROBE3(debug__input, tp, th, m); if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); ti_locked = TI_UNLOCKED; + inp->inp_et = NULL; tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); @@ -3210,20 +3241,24 @@ dropwithreset: if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); ti_locked = TI_UNLOCKED; + if (inp != NULL) + inp->inp_et = NULL; if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); - INP_WUNLOCK(tp->t_inpcb); + INP_WUNLOCK(inp); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); ti_locked = TI_UNLOCKED; + if (inp != NULL) + inp->inp_et = NULL; } #ifdef INVARIANTS else Index: sys/netinet/tcp_stacks/fastpath.c =================================================================== --- sys/netinet/tcp_stacks/fastpath.c +++ sys/netinet/tcp_stacks/fastpath.c @@ -159,6 +159,7 @@ int acked; uint16_t nsegs; int winup_only=0; + struct epoch_tracker *et; nsegs = max(1, m->m_pkthdr.lro_nsegs); #ifdef TCPDEBUG @@ -170,6 +171,11 @@ struct 
tcphdr tcp_savetcp; short ostate = 0; #endif + et = NULL; + if (__predict_false(ti_locked == TI_RLOCKED)) { + et = tp->t_inpcb->inp_et; + MPASS(et != NULL); + } /* * The following if statement will be true if * we are doing the win_up_in_fp @@ -208,7 +214,8 @@ * This is a pure ack for outstanding data. */ if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + tp->t_inpcb->inp_et = NULL; + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); } ti_locked = TI_UNLOCKED; @@ -360,7 +367,8 @@ * buffer space to take it. */ if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *tp->t_inpcb->inp_et); + tp->t_inpcb->inp_et = NULL; } ti_locked = TI_UNLOCKED; @@ -441,6 +449,7 @@ uint16_t nsegs; char *s; struct in_conninfo *inc; + struct epoch_tracker *et; struct mbuf *mfree = NULL; nsegs = max(1, m->m_pkthdr.lro_nsegs); @@ -464,7 +473,9 @@ if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); - + et = NULL; + if (ti_locked == TI_RLOCKED) + et = tp->t_inpcb->inp_et; switch (tp->t_state) { /* @@ -1333,7 +1344,7 @@ if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); m_freem(m); return; } @@ -1567,12 +1578,14 @@ ti_locked)); tcp_twstart(tp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); return; } } if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + if (tp) + tp->t_inpcb->inp_et = NULL; + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); } ti_locked = TI_UNLOCKED; @@ -1630,7 +1643,9 @@ #endif TCP_PROBE3(debug__drop, tp, th, m); if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + if (tp) + tp->t_inpcb->inp_et = NULL; + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); } ti_locked = TI_UNLOCKED; @@ -1642,7 +1657,9 @@ dropwithreset: if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + if (tp) + tp->t_inpcb->inp_et = NULL; + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); } ti_locked = TI_UNLOCKED; 
@@ -1655,7 +1672,9 @@ drop: if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + if (tp) + tp->t_inpcb->inp_et = NULL; + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -1697,10 +1716,16 @@ int can_enter; struct in_conninfo *inc; struct tcpopt to; + struct epoch_tracker *et; thflags = th->th_flags; inc = &tp->t_inpcb->inp_inc; nsegs = max(1, m->m_pkthdr.lro_nsegs); + et = NULL; + if (__predict_false(ti_locked == TI_RLOCKED)) { + et = tp->t_inpcb->inp_et; + MPASS(et != NULL); + } /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we @@ -1737,7 +1762,9 @@ free(s, M_TCPLOG); } if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + if (tp) + tp->t_inpcb->inp_et = NULL; + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); } INP_WUNLOCK(tp->t_inpcb); m_freem(m); @@ -1752,7 +1779,9 @@ (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED); if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + if (tp) + tp->t_inpcb->inp_et = NULL; + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); } INP_WUNLOCK(tp->t_inpcb); return; @@ -2040,7 +2069,8 @@ * This is a pure ack for outstanding data. 
*/ if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *tp->t_inpcb->inp_et); + tp->t_inpcb->inp_et = NULL; } ti_locked = TI_UNLOCKED; @@ -2214,7 +2244,8 @@ free(s, M_TCPLOG); } if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *tp->t_inpcb->inp_et); + tp->t_inpcb->inp_et = NULL; } INP_WUNLOCK(tp->t_inpcb); m_freem(m); @@ -2229,7 +2260,8 @@ (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_dropwithreset(m, th, tp, tlen, BANDLIM_UNLIMITED); if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *tp->t_inpcb->inp_et); + tp->t_inpcb->inp_et = NULL; } INP_WUNLOCK(tp->t_inpcb); return; Index: sys/netinet/tcp_stacks/rack.c =================================================================== --- sys/netinet/tcp_stacks/rack.c +++ sys/netinet/tcp_stacks/rack.c @@ -347,13 +347,16 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); -static void rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked); +static void +rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked, + epoch_tracker_t et); static void rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val); static void rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, - struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen); + struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen, + epoch_tracker_t et); static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, @@ -1492,10 +1495,10 @@ } static void -rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked) 
+rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked, epoch_tracker_t et) { if (*ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); *ti_locked = TI_UNLOCKED; } /* @@ -1508,10 +1511,11 @@ } static void -rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen) +rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, + int32_t rstreason, int32_t tlen, epoch_tracker_t et) { if (*ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); *ti_locked = TI_UNLOCKED; } if (tp != NULL) { @@ -1550,12 +1554,12 @@ (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max))) { *ret_val = 1; - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen, tp->t_inpcb->inp_et); return; } else *ret_val = 0; if (*ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_TP(&V_tcbinfo, tp); *ti_locked = TI_UNLOCKED; } rack = (struct tcp_rack *)tp->t_fb_ptr; @@ -1580,6 +1584,7 @@ * of closed window, not covered by the RFC. */ int dropped = 0; + epoch_tracker_t et; if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || @@ -1593,6 +1598,7 @@ ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); + et = tp->t_inpcb->inp_et; if (V_tcp_insecure_rst || (tp->last_ack_sent == th->th_seq) || (tp->rcv_nxt == th->th_seq) || @@ -1617,7 +1623,7 @@ tp = tcp_close(tp); } dropped = 1; - rack_do_drop(m, tp, ti_locked); + rack_do_drop(m, tp, ti_locked, et); } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. 
*/ @@ -1640,17 +1646,20 @@ static void rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val) { + epoch_tracker_t et; + KASSERT(*ti_locked == TI_RLOCKED, ("tcp_do_segment: TH_SYN ti_locked %d", *ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); + et = tp->t_inpcb->inp_et; if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); *ret_val = 1; - rack_do_drop(m, tp, ti_locked); + rack_do_drop(m, tp, ti_locked, et); } else { /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, @@ -1658,7 +1667,7 @@ tp->last_ack_sent = tp->rcv_nxt; m = NULL; *ret_val = 0; - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL, ti_locked, et); } } @@ -1693,7 +1702,7 @@ if (tlen) { rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL, ti_locked, tp->t_inpcb->inp_et); } return (1); } @@ -4491,6 +4500,7 @@ struct mbuf *mfree; struct tcp_rack *rack; int32_t recovery = 0; + epoch_tracker_t et; rack = (struct tcp_rack *)tp->t_fb_ptr; if (SEQ_GT(th->th_ack, tp->snd_max)) { @@ -4641,8 +4651,9 @@ * reset him. 
*/ *ret_val = 1; + et = tp->t_inpcb->inp_et; tp = tcp_close(tp); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, tlen); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, tlen, et); return (1); } } @@ -4669,6 +4680,7 @@ int32_t nsegs; int32_t tfo_syn; struct tcp_rack *rack; + epoch_tracker_t et; rack = (struct tcp_rack *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); @@ -4886,14 +4898,16 @@ KASSERT(*ti_locked == TI_RLOCKED, ("%s: dodata " "TCP_FIN_WAIT_2 ti_locked: %d", __func__, *ti_locked)); + et = tp->t_inpcb->inp_et; tcp_twstart(tp); *ti_locked = TI_UNLOCKED; - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); return (1); } } if (*ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_TP(&V_tcbinfo, tp); + tp->t_inpcb->inp_et = NULL; *ti_locked = TI_UNLOCKED; } /* @@ -4970,7 +4984,7 @@ * reassembly queue and we have enough buffer space to take it. */ if (*ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_TP(&V_tcbinfo, tp); *ti_locked = TI_UNLOCKED; } nsegs = max(1, m->m_pkthdr.lro_nsegs); @@ -5118,7 +5132,8 @@ * This is a pure ack for outstanding data. 
*/ if (*ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_TP(&V_tcbinfo, tp); + tp->t_inpcb->inp_et = NULL; *ti_locked = TI_UNLOCKED; } TCPSTAT_INC(tcps_predack); @@ -5204,6 +5219,7 @@ int32_t ret_val = 0; int32_t todrop; int32_t ourfinisacked = 0; + epoch_tracker_t et; rack_calc_rwin(so, tp); /* @@ -5217,25 +5233,26 @@ * SYN_RCVD state arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ + et = tp->t_inpcb->inp_et; if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen, et); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); - rack_do_drop(m, tp, ti_locked); + rack_do_drop(m, tp, ti_locked, et); return (1); } if (thflags & TH_RST) { - rack_do_drop(m, tp, ti_locked); + rack_do_drop(m, tp, ti_locked, et); return (1); } if (!(thflags & TH_SYN)) { - rack_do_drop(m, tp, ti_locked); + rack_do_drop(m, tp, ti_locked, et); return (1); } tp->irs = th->th_seq; @@ -5402,7 +5419,7 @@ if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen, tp->t_inpcb->inp_et); return (1); } if (IS_FASTOPEN(tp->t_flags)) { @@ -5414,7 +5431,7 @@ * FIN, or a RST. 
*/ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen, tp->t_inpcb->inp_et); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ @@ -5424,11 +5441,11 @@ if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL, ti_locked, tp->t_inpcb->inp_et); return (0); } } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL, ti_locked, tp->t_inpcb->inp_et); return (0); } } @@ -5459,7 +5476,7 @@ * "LAND" DoS attack. */ if (SEQ_LT(th->th_seq, tp->irs)) { - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen, tp->t_inpcb->inp_et); return (1); } if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { @@ -5686,7 +5703,7 @@ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); return (ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL, ti_locked, tp->t_inpcb->inp_et); return (0); } } @@ -5699,7 +5716,7 @@ if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen, tp->t_inpcb->inp_et); return (1); } } @@ -5778,7 +5795,7 @@ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); return (ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL, ti_locked, tp->t_inpcb->inp_et); return (0); } } @@ -5791,7 +5808,7 @@ if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, 
ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen, tp->t_inpcb->inp_et); return (1); } } @@ -5803,7 +5820,8 @@ rack_check_data_after_close(struct mbuf *m, struct tcpcb *tp, int32_t *ti_locked, int32_t *tlen, struct tcphdr *th, struct socket *so) { - struct tcp_rack *rack; + struct tcp_rack *rack; + epoch_tracker_t et; KASSERT(*ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " "CLOSE_WAIT && tlen ti_locked %d", __func__, *ti_locked)); @@ -5811,9 +5829,10 @@ rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->rc_allow_data_af_clo == 0) { close_now: + et = tp->t_inpcb->inp_et; tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, (*tlen)); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, (*tlen), et); return (1); } if (sbavail(&so->so_snd) == 0) @@ -5905,7 +5924,7 @@ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); return (ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL, ti_locked, tp->t_inpcb->inp_et); return (0); } } @@ -5937,7 +5956,7 @@ if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen, tp->t_inpcb->inp_et); return (1); } } @@ -6024,7 +6043,7 @@ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); return (ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL, ti_locked, tp->t_inpcb->inp_et); return (0); } } @@ -6035,9 +6054,11 @@ return (ret_val); } if (ourfinisacked) { + struct epoch_tracker *et; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + et = tp->t_inpcb->inp_et; tcp_twstart(tp); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, *et); *ti_locked = TI_UNLOCKED; m_freem(m); 
return (1); @@ -6045,7 +6066,7 @@ if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen, tp->t_inpcb->inp_et); return (1); } } @@ -6132,7 +6153,7 @@ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); return (ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL, ti_locked, tp->t_inpcb->inp_et); return (0); } } @@ -6143,15 +6164,18 @@ return (ret_val); } if (ourfinisacked) { + epoch_tracker_t et; + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + et = tp->t_inpcb->inp_et; tp = tcp_close(tp); - rack_do_drop(m, tp, ti_locked); + rack_do_drop(m, tp, ti_locked, et); return (1); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen, tp->t_inpcb->inp_et); return (1); } } @@ -6241,7 +6265,7 @@ rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); return (ret_val); } else { - rack_do_drop(m, NULL, ti_locked); + rack_do_drop(m, NULL, ti_locked, tp->t_inpcb->inp_et); return (0); } } @@ -6254,7 +6278,7 @@ if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); + rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen, tp->t_inpcb->inp_et); return (1); } } @@ -6750,7 +6774,8 @@ #endif if (ti_locked != TI_UNLOCKED) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_TP(&V_tcbinfo, tp); + tp->t_inpcb->inp_et = NULL; ti_locked = TI_UNLOCKED; } if (retval == 0) { Index: sys/netinet/tcp_timer.h 
=================================================================== --- sys/netinet/tcp_timer.h +++ sys/netinet/tcp_timer.h @@ -214,7 +214,6 @@ VNET_DECLARE(int, tcp_v6pmtud_blackhole_mss); #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) -int tcp_inpinfo_lock_add(struct inpcb *inp); void tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp); void tcp_timer_init(void); Index: sys/netinet/tcp_timer.c =================================================================== --- sys/netinet/tcp_timer.c +++ sys/netinet/tcp_timer.c @@ -274,43 +274,9 @@ CURVNET_RESTORE(); } -/* - * When a timer wants to remove a TCB it must - * hold the INP_INFO_RLOCK(). The timer function - * should only have grabbed the INP_WLOCK() when - * it entered. To safely switch to holding both the - * INP_INFO_RLOCK() and the INP_WLOCK() we must first - * grab a reference on the inp, which will hold the inp - * so that it can't be removed. We then unlock the INP_WLOCK(), - * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK() - * we proceed again to get the INP_WLOCK() (this preserves proper - * lock order). After acquiring the INP_WLOCK we must check if someone - * else deleted the pcb i.e. the inp_flags check. - * If so we return 1 otherwise we return 0. - * - * No matter what the tcp_inpinfo_lock_add() function - * returns the caller must afterwards call tcp_inpinfo_lock_del() - * to drop the locks and reference properly. 
- */ - -int -tcp_inpinfo_lock_add(struct inpcb *inp) -{ - in_pcbref(inp); - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - return(1); - } - return(0); - -} - void tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp) { - INP_INFO_RUNLOCK(&V_tcbinfo); if (inp && (tp == NULL)) { /* * If tcp_close/drop() gets called and tp @@ -377,11 +343,9 @@ tp->t_inpcb && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); - if (tcp_inpinfo_lock_add(inp)) { - tcp_inpinfo_lock_del(inp, tp); - goto out; - } + INP_INFO_RLOCK(&V_tcbinfo); tp = tcp_close(tp); + INP_INFO_RUNLOCK(&V_tcbinfo); tcp_inpinfo_lock_del(inp, tp); goto out; } else { @@ -389,15 +353,13 @@ callout_reset(&tp->t_timers->tt_2msl, TP_KEEPINTVL(tp), tcp_timer_2msl, tp); } else { - if (tcp_inpinfo_lock_add(inp)) { - tcp_inpinfo_lock_del(inp, tp); - goto out; - } + INP_INFO_RLOCK(&V_tcbinfo); tp = tcp_close(tp); + INP_INFO_RUNLOCK(&V_tcbinfo); tcp_inpinfo_lock_del(inp, tp); goto out; } - } + } #ifdef TCPDEBUG if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) @@ -511,11 +473,7 @@ dropit: TCPSTAT_INC(tcps_keepdrops); - - if (tcp_inpinfo_lock_add(inp)) { - tcp_inpinfo_lock_del(inp, tp); - goto out; - } + INP_INFO_RLOCK(&V_tcbinfo); tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG @@ -524,8 +482,8 @@ PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); + INP_INFO_RUNLOCK(&V_tcbinfo); tcp_inpinfo_lock_del(inp, tp); -out: CURVNET_RESTORE(); } @@ -573,11 +531,9 @@ (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); - if (tcp_inpinfo_lock_add(inp)) { - tcp_inpinfo_lock_del(inp, tp); - goto out; - } + INP_INFO_RLOCK(&V_tcbinfo); tp = tcp_drop(tp, ETIMEDOUT); + INP_INFO_RUNLOCK(&V_tcbinfo); tcp_inpinfo_lock_del(inp, tp); goto out; } @@ -588,11 +544,9 @@ if 
(tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); - if (tcp_inpinfo_lock_add(inp)) { - tcp_inpinfo_lock_del(inp, tp); - goto out; - } + INP_INFO_RLOCK(&V_tcbinfo); tp = tcp_drop(tp, ETIMEDOUT); + INP_INFO_RUNLOCK(&V_tcbinfo); tcp_inpinfo_lock_del(inp, tp); goto out; } @@ -654,11 +608,9 @@ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); - if (tcp_inpinfo_lock_add(inp)) { - tcp_inpinfo_lock_del(inp, tp); - goto out; - } + INP_INFO_RLOCK(&V_tcbinfo); tp = tcp_drop(tp, ETIMEDOUT); + INP_INFO_RUNLOCK(&V_tcbinfo); tcp_inpinfo_lock_del(inp, tp); goto out; } Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c +++ sys/netinet/tcp_usrreq.c @@ -276,11 +276,12 @@ { struct inpcb *inp; int rlock = 0; + struct epoch_tracker et; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); if (!INP_INFO_WLOCKED(&V_tcbinfo)) { - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); rlock = 1; } INP_WLOCK(inp); @@ -288,7 +289,7 @@ ("tcp_usr_detach: inp_socket == NULL")); tcp_detach(so, inp); if (rlock) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } #ifdef INET @@ -887,6 +888,7 @@ int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; + struct epoch_tracker net_et; #ifdef INET6 int isipv6; #endif @@ -897,7 +899,7 @@ * this call. */ if (flags & PRUS_EOF) - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_RLOCK_ET(&V_tcbinfo, net_et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); @@ -1040,7 +1042,7 @@ ((flags & PRUS_EOF) ? 
PRU_SEND_EOF : PRU_SEND)); INP_WUNLOCK(inp); if (flags & PRUS_EOF) - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, net_et); return (error); } Index: sys/netinet/udp_usrreq.c =================================================================== --- sys/netinet/udp_usrreq.c +++ sys/netinet/udp_usrreq.c @@ -840,6 +840,7 @@ struct in_pcblist *il; inp_gen_t gencnt; struct xinpgen xig; + struct epoch_tracker net_et; /* * The process of preparing the PCB list is too time-consuming and @@ -858,10 +859,10 @@ /* * OK, now we're committed to doing something. */ - INP_INFO_RLOCK(&V_udbinfo); + epoch_enter_preempt(net_epoch_preempt, &net_et); gencnt = V_udbinfo.ipi_gencnt; n = V_udbinfo.ipi_count; - INP_INFO_RUNLOCK(&V_udbinfo); + epoch_exit_preempt(net_epoch_preempt, &net_et); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xinpcb)); @@ -878,7 +879,7 @@ il = malloc(sizeof(struct in_pcblist) + n * sizeof(struct inpcb *), M_TEMP, M_WAITOK|M_ZERO_INVARIANTS); inp_list = il->il_inp_list; - INP_INFO_RLOCK(&V_udbinfo); + epoch_enter_preempt(net_epoch_preempt, &net_et); for (inp = CK_LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; inp = CK_LIST_NEXT(inp, inp_list)) { INP_WLOCK(inp); @@ -889,7 +890,7 @@ } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_udbinfo); + epoch_exit_preempt(net_epoch_preempt, &net_et); n = i; error = 0; @@ -1101,6 +1102,7 @@ struct cmsghdr *cm; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin, src; + struct epoch_tracker et; int cscov_partial = 0; int error = 0; int ipflags; @@ -1257,7 +1259,7 @@ (inp->inp_laddr.s_addr == INADDR_ANY) || (inp->inp_lport == 0))) || (src.sin_family == AF_INET)) { - INP_HASH_RLOCK(pcbinfo); + INP_HASH_RLOCK_ET(pcbinfo, et); unlock_udbinfo = UH_RLOCKED; } else unlock_udbinfo = UH_UNLOCKED; @@ -1513,7 +1515,7 @@ if (unlock_udbinfo == UH_WLOCKED) INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) - INP_HASH_RUNLOCK(pcbinfo); + INP_HASH_RUNLOCK_ET(pcbinfo, et); UDP_PROBE(send, NULL, inp, &ui->ui_i, inp,
&ui->ui_u); error = ip_output(m, inp->inp_options, (unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags, @@ -1533,7 +1535,7 @@ } else if (unlock_udbinfo == UH_RLOCKED) { KASSERT(unlock_inp == UH_RLOCKED, ("%s: shared udbinfo lock, excl inp lock", __func__)); - INP_HASH_RUNLOCK(pcbinfo); + INP_HASH_RUNLOCK_ET(pcbinfo, et); INP_RUNLOCK(inp); } else if (unlock_inp == UH_WLOCKED) INP_WUNLOCK(inp); Index: sys/netinet6/in6_gif.c =================================================================== --- sys/netinet6/in6_gif.c +++ sys/netinet6/in6_gif.c @@ -241,7 +241,7 @@ int len; /* prepend new IP header */ - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); len = sizeof(struct ip6_hdr); #ifndef __NO_STRICT_ALIGNMENT if (proto == IPPROTO_ETHERIP) @@ -283,7 +283,7 @@ struct ip6_hdr *ip6; uint8_t ecn; - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); if (sc == NULL) { m_freem(m); IP6STAT_INC(ip6s_nogif); @@ -312,7 +312,7 @@ if (V_ipv6_hashtbl == NULL) return (0); - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); /* * NOTE: it is safe to iterate without any locking here, because softc * can be reclaimed only when we are not within net_epoch_preempt Index: sys/netinet6/ip6_gre.c =================================================================== --- sys/netinet6/ip6_gre.c +++ sys/netinet6/ip6_gre.c @@ -110,7 +110,7 @@ if (V_ipv6_hashtbl == NULL) return (0); - MPASS(in_epoch()); + MPASS(in_epoch(net_epoch_preempt)); ip6 = mtod(m, const struct ip6_hdr *); CK_LIST_FOREACH(sc, &GRE_HASH(&ip6->ip6_dst, &ip6->ip6_src), chain) { /* Index: sys/netinet6/udp6_usrreq.c =================================================================== --- sys/netinet6/udp6_usrreq.c +++ sys/netinet6/udp6_usrreq.c @@ -214,6 +214,7 @@ int off = *offp; int cscov_partial; int plen, ulen; + struct epoch_tracker et; struct sockaddr_in6 fromsa[2]; struct m_tag *fwd_tag; uint16_t uh_sum; @@ -300,7 +301,7 @@ struct inpcbhead *pcblist; struct ip6_moptions *imo; - 
INP_INFO_RLOCK(pcbinfo); + INP_INFO_RLOCK_ET(pcbinfo, et); /* * In the event that laddr should be set to the link-local * address (this happens in RIPng), the multicast address @@ -414,7 +415,7 @@ goto badheadlocked; } INP_RLOCK(last); - INP_INFO_RUNLOCK(pcbinfo); + INP_INFO_RUNLOCK_ET(pcbinfo, et); UDP_PROBE(receive, NULL, last, ip6, last, uh); if (udp6_append(last, m, off, fromsa) == 0) INP_RUNLOCK(last); @@ -499,7 +500,7 @@ return (IPPROTO_DONE); badheadlocked: - INP_INFO_RUNLOCK(pcbinfo); + INP_INFO_RUNLOCK_ET(pcbinfo, et); badunlocked: if (m) m_freem(m); Index: sys/sys/epoch.h =================================================================== --- sys/sys/epoch.h +++ sys/sys/epoch.h @@ -46,48 +46,59 @@ struct epoch_context { void *data[2]; -} __aligned(sizeof(void *)); +} __aligned(sizeof(void *)); typedef struct epoch_context *epoch_context_t; + +struct epoch_tracker { + void *datap[3]; +#ifdef INVARIANTS + int datai[5]; +#else + int datai[1]; +#endif +} __aligned(sizeof(void *)); + +typedef struct epoch_tracker *epoch_tracker_t; + +/* + * A section object may be passed to every begin-end pair to allow for + * forward progress guarantees with-in prolonged active sections. 
+ * + * We can't include ck_epoch.h so we define our own variant here and + * then CTASSERT that it's the same size in subr_epoch.c + */ + + + epoch_t epoch_alloc(int flags); void epoch_free(epoch_t epoch); -void epoch_enter(epoch_t epoch); -void epoch_enter_preempt_internal(epoch_t epoch, struct thread *td); -void epoch_exit(epoch_t epoch); -void epoch_exit_preempt_internal(epoch_t epoch, struct thread *td); void epoch_wait(epoch_t epoch); void epoch_wait_preempt(epoch_t epoch); void epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t)); -int in_epoch(void); +int in_epoch(epoch_t epoch); #ifdef _KERNEL DPCPU_DECLARE(int, epoch_cb_count); DPCPU_DECLARE(struct grouptask, epoch_cb_task); +#define EPOCH_MAGIC0 0xFADECAFEF00DD00D +#define EPOCH_MAGIC1 0xBADDBABEDEEDFEED -static __inline void -epoch_enter_preempt(epoch_t epoch) -{ - struct thread *td; - int nesting __unused; +void epoch_enter_preempt_KBI(epoch_t epoch, epoch_tracker_t et); +void epoch_exit_preempt_KBI(epoch_t epoch, epoch_tracker_t et); +void epoch_enter_KBI(epoch_t epoch); +void epoch_exit_KBI(epoch_t epoch); - td = curthread; - nesting = td->td_epochnest++; -#ifndef INVARIANTS - if (nesting == 0) -#endif - epoch_enter_preempt_internal(epoch, td); -} - -static __inline void -epoch_exit_preempt(epoch_t epoch) -{ - struct thread *td; - - td = curthread; - MPASS(td->td_epochnest); - if (td->td_epochnest-- == 1) - epoch_exit_preempt_internal(epoch, td); -} -#endif /* _KERNEL */ + +#ifdef KLD_MODULE +#define epoch_enter_preempt(e, t) epoch_enter_preempt_KBI((e), (t)) +#define epoch_exit_preempt(e, t) epoch_exit_preempt_KBI((e), (t)) +#define epoch_enter(e) epoch_enter_KBI((e)) +#define epoch_exit(e) epoch_exit_KBI((e)) +#else +#include <sys/epoch_private.h> +#endif /* KLD_MODULE */ + +#endif /* _KERNEL */ #endif Index: sys/sys/epoch_private.h =================================================================== --- /dev/null +++ sys/sys/epoch_private.h @@ -0,0 +1,207 @@ +/*- + *
SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2018, Matthew Macy + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_EPOCH_PRIVATE_H_ +#define _SYS_EPOCH_PRIVATE_H_ +#ifndef _KERNEL +#error "no user serviceable parts" +#else +#include +#ifndef TD_CRITNEST +#include "offset.inc" +#endif + +#include +#include + +#ifdef __amd64__ +#define EPOCH_ALIGN CACHE_LINE_SIZE*2 +#else +#define EPOCH_ALIGN CACHE_LINE_SIZE +#endif + +typedef struct epoch_thread { +#ifdef INVARIANTS + uint64_t et_magic_pre; +#endif + TAILQ_ENTRY(epoch_thread) et_link; /* Epoch queue. 
*/ + struct thread *et_td; /* pointer to thread in section */ + ck_epoch_section_t et_section; /* epoch section object */ +#ifdef INVARIANTS + uint64_t et_magic_post; +#endif +} *epoch_thread_t; +TAILQ_HEAD (epoch_tdlist, epoch_thread); + +typedef struct epoch_record { + ck_epoch_record_t er_record; + volatile struct epoch_tdlist er_tdlist; + volatile uint32_t er_gen; + uint32_t er_cpuid; +} __aligned(EPOCH_ALIGN) *epoch_record_t; + +struct epoch { + struct ck_epoch e_epoch __aligned(EPOCH_ALIGN); + struct epoch_record *e_pcpu_dom[MAXMEMDOM] __aligned(EPOCH_ALIGN); + int e_idx; + int e_flags; + struct epoch_record *e_pcpu[0]; +}; + +#define INIT_CHECK(epoch) \ + do { \ + if (__predict_false((epoch) == NULL)) \ + return; \ + } while (0) + +static __inline void +epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et) +{ + struct epoch_record *er; + struct epoch_thread *etd; + caddr_t ptd; + u_char *td_priority, *td_pre_epoch_prio; + u_char *td_epochnest, *td_critnest; + int *td_pinned; + MPASS(cold || epoch != NULL); + INIT_CHECK(epoch); + etd = (void *)et; +#ifdef INVARIANTS + MPASS(epoch->e_flags & EPOCH_PREEMPT); + etd->et_magic_pre = EPOCH_MAGIC0; + etd->et_magic_post = EPOCH_MAGIC1; +#endif + ptd = (caddr_t)curthread; + etd->et_td = (void*)ptd; + td_critnest = ptd + TD_CRITNEST; + td_epochnest = ptd + TD_EPOCHNEST; + td_pinned = (int *)(ptd + TD_PINNED); + (*td_critnest)++; + (*td_epochnest)++; + (*td_pinned)++; + __compiler_membar(); + + td_priority = (u_char *)ptd + TD_PRIORITY; + td_pre_epoch_prio = (u_char *)ptd + TD_PRE_EPOCH_PRIO; + *td_pre_epoch_prio = *td_priority; + er = epoch->e_pcpu[curcpu]; + TAILQ_INSERT_TAIL(&er->er_tdlist, etd, et_link); + ck_epoch_begin(&er->er_record, (ck_epoch_section_t *)&etd->et_section); + critical_exit(); +} + +static __inline void +epoch_enter(epoch_t epoch) +{ + ck_epoch_record_t *record; + caddr_t ptd; + u_char *td_epochnest, *td_critnest; + + MPASS(cold || epoch != NULL); + INIT_CHECK(epoch); + ptd = (caddr_t)curthread; 
+ + td_critnest = ptd + TD_CRITNEST; + td_epochnest = ptd + TD_EPOCHNEST; + (*td_critnest)++; + (*td_epochnest)++; + __compiler_membar(); + record = &epoch->e_pcpu[curcpu]->er_record; + ck_epoch_begin(record, NULL); +} + +static __inline void +epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et) +{ + struct epoch_record *er; + struct epoch_thread *etd; + u_char *td_epochnest, *td_critnest; + u_char *td_priority, *td_pre_epoch_prio; + int *td_pinned; + caddr_t ptd; + + INIT_CHECK(epoch); + ptd = (caddr_t)curthread; + td_critnest = ptd + TD_CRITNEST; + td_epochnest = ptd + TD_EPOCHNEST; + td_pinned = (int *)(ptd + TD_PINNED); + (*td_critnest)++; + __compiler_membar(); + + er = epoch->e_pcpu[curcpu]; + MPASS(epoch->e_flags & EPOCH_PREEMPT); + etd = (void *)et; +#ifdef INVARIANTS + MPASS(etd != NULL); + MPASS(etd->et_td == (struct thread *)ptd); + MPASS(etd->et_magic_pre == EPOCH_MAGIC0); + MPASS(etd->et_magic_post == EPOCH_MAGIC1); + etd->et_magic_pre = 0; + etd->et_magic_post = 0; + etd->et_td = (void*)0xDEADBEEF; +#endif + ck_epoch_end(&er->er_record, + (ck_epoch_section_t *)&etd->et_section); + TAILQ_REMOVE(&er->er_tdlist, etd, et_link); + er->er_gen++; + td_priority = (u_char *)ptd + TD_PRIORITY; + td_pre_epoch_prio = (u_char *)ptd + TD_PRE_EPOCH_PRIO; + if (__predict_false(*td_pre_epoch_prio != *td_priority)) { + struct thread *td; + + td = (struct thread *)ptd; + thread_lock(td); + sched_prio(td, *td_pre_epoch_prio); + thread_unlock(td); + } + MPASS(*td_epochnest); + MPASS(*td_pinned); + (*td_epochnest)--; + (*td_pinned)--; + critical_exit(); +} + +static __inline void +epoch_exit(epoch_t epoch) +{ + ck_epoch_record_t *record; + u_char *td_epochnest; + caddr_t ptd; + + INIT_CHECK(epoch); + ptd = (caddr_t)curthread; + td_epochnest = ptd + TD_EPOCHNEST; + MPASS(*td_epochnest); + (*td_epochnest)--; + record = &epoch->e_pcpu[curcpu]->er_record; + ck_epoch_end(record, NULL); + critical_exit(); +} +#endif /* _KERNEL */ +#endif /* _SYS_EPOCH_PRIVATE_H_ */ Index: 
sys/sys/pmckern.h =================================================================== --- sys/sys/pmckern.h +++ sys/sys/pmckern.h @@ -201,11 +201,12 @@ /* Hook invocation; for use within the kernel */ #define PMC_CALL_HOOK(t, cmd, arg) \ -do { \ - epoch_enter_preempt(global_epoch_preempt); \ +do { \ + struct epoch_tracker et; \ + epoch_enter_preempt(global_epoch_preempt, &et); \ if (pmc_hook != NULL) \ (pmc_hook)((t), (cmd), (arg)); \ - epoch_exit_preempt(global_epoch_preempt); \ + epoch_exit_preempt(global_epoch_preempt, &et); \ } while (0) /* Hook invocation that needs an exclusive lock */ Index: sys/sys/proc.h =================================================================== --- sys/sys/proc.h +++ sys/sys/proc.h @@ -74,19 +74,6 @@ #include #endif - -/* - * A section object may be passed to every begin-end pair to allow for - * forward progress guarantees with-in prolonged active sections. - * - * We can't include ck_epoch.h so we define our own variant here and - * then CTASSERT that it's the same size in subr_epoch.c - */ -struct epoch_section { - unsigned int bucket; -}; -typedef struct epoch_section epoch_section_t; - /* * One structure allocated per session. * @@ -373,8 +360,6 @@ int td_lastcpu; /* (t) Last cpu we were on. */ int td_oncpu; /* (t) Which cpu we are on. */ void *td_lkpi_task; /* LinuxKPI task struct pointer */ - TAILQ_ENTRY(thread) td_epochq; /* (t) Epoch queue. */ - epoch_section_t td_epoch_section; /* (t) epoch section object */ int td_pmcpend; };