Index: head/sys/kern/kern_intr.c =================================================================== --- head/sys/kern/kern_intr.c (revision 347429) +++ head/sys/kern/kern_intr.c (revision 347430) @@ -1,1609 +1,1628 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997, Stefan Esser * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_usage_prof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif /* * Describe an interrupt thread. There is one of these per interrupt event. */ struct intr_thread { struct intr_event *it_event; struct thread *it_thread; /* Kernel thread. */ int it_flags; /* (j) IT_* flags. */ int it_need; /* Needs service. */ }; /* Interrupt thread flags kept in it_flags */ #define IT_DEAD 0x000001 /* Thread is waiting to exit. */ #define IT_WAIT 0x000002 /* Thread is waiting for completion. */ struct intr_entropy { struct thread *td; uintptr_t event; }; struct intr_event *clk_intr_event; struct intr_event *tty_intr_event; void *vm_ih; struct proc *intrproc; static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); static int intr_storm_threshold = 1000; SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RWTUN, &intr_storm_threshold, 0, "Number of consecutive interrupts before storm protection is enabled"); static TAILQ_HEAD(, intr_event) event_list = TAILQ_HEAD_INITIALIZER(event_list); static struct mtx event_lock; MTX_SYSINIT(intr_event_list, &event_lock, "intr event list", MTX_DEF); static void intr_event_update(struct intr_event *ie); static int intr_event_schedule_thread(struct intr_event *ie); static struct intr_thread *ithread_create(const char *name); static void ithread_destroy(struct intr_thread *ithread); static void ithread_execute_handlers(struct proc *p, struct intr_event *ie); static void ithread_loop(void *); static void ithread_update(struct intr_thread *ithd); static void start_softintr(void *); /* Map an interrupt type to an ithread priority. */ u_char intr_priority(enum intr_type flags) { u_char pri; flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET | INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV); switch (flags) { case INTR_TYPE_TTY: pri = PI_TTY; break; case INTR_TYPE_BIO: pri = PI_DISK; break; case INTR_TYPE_NET: pri = PI_NET; break; case INTR_TYPE_CAM: pri = PI_DISK; break; case INTR_TYPE_AV: pri = PI_AV; break; case INTR_TYPE_CLK: pri = PI_REALTIME; break; case INTR_TYPE_MISC: pri = PI_DULL; /* don't care */ break; default: /* We didn't specify an interrupt level. */ panic("intr_priority: no interrupt type in flags"); } return pri; } /* * Update an ithread based on the associated intr_event. */ static void ithread_update(struct intr_thread *ithd) { struct intr_event *ie; struct thread *td; u_char pri; ie = ithd->it_event; td = ithd->it_thread; mtx_assert(&ie->ie_lock, MA_OWNED); /* Determine the overall priority of this event. */ if (CK_SLIST_EMPTY(&ie->ie_handlers)) pri = PRI_MAX_ITHD; else pri = CK_SLIST_FIRST(&ie->ie_handlers)->ih_pri; /* Update name and priority. */ strlcpy(td->td_name, ie->ie_fullname, sizeof(td->td_name)); #ifdef KTR sched_clear_tdname(td); #endif thread_lock(td); sched_prio(td, pri); thread_unlock(td); } /* * Regenerate the full name of an interrupt event and update its priority. */ static void intr_event_update(struct intr_event *ie) { struct intr_handler *ih; char *last; int missed, space; /* Start off with no entropy and just the name of the event. */ mtx_assert(&ie->ie_lock, MA_OWNED); strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); ie->ie_flags &= ~IE_ENTROPY; missed = 0; space = 1; /* Run through all the handlers updating values. */ CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) { if (strlen(ie->ie_fullname) + strlen(ih->ih_name) + 1 < sizeof(ie->ie_fullname)) { strcat(ie->ie_fullname, " "); strcat(ie->ie_fullname, ih->ih_name); space = 0; } else missed++; if (ih->ih_flags & IH_ENTROPY) ie->ie_flags |= IE_ENTROPY; } /* * If there is only one handler and its name is too long, just copy in * as much of the end of the name (includes the unit number) as will * fit. Otherwise, we have multiple handlers and not all of the names * will fit. Add +'s to indicate missing names. If we run out of room * and still have +'s to add, change the last character from a + to a *. */ if (missed == 1 && space == 1) { ih = CK_SLIST_FIRST(&ie->ie_handlers); missed = strlen(ie->ie_fullname) + strlen(ih->ih_name) + 2 - sizeof(ie->ie_fullname); strcat(ie->ie_fullname, (missed == 0) ? " " : "-"); strcat(ie->ie_fullname, &ih->ih_name[missed]); missed = 0; } last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2]; while (missed-- > 0) { if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) { if (*last == '+') { *last = '*'; break; } else *last = '+'; } else if (space) { strcat(ie->ie_fullname, " +"); space = 0; } else strcat(ie->ie_fullname, "+"); } /* * If this event has an ithread, update it's priority and * name. */ if (ie->ie_thread != NULL) ithread_update(ie->ie_thread); CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname); } int intr_event_create(struct intr_event **event, void *source, int flags, int irq, void (*pre_ithread)(void *), void (*post_ithread)(void *), void (*post_filter)(void *), int (*assign_cpu)(void *, int), const char *fmt, ...) { struct intr_event *ie; va_list ap; /* The only valid flag during creation is IE_SOFT. */ if ((flags & ~IE_SOFT) != 0) return (EINVAL); ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO); ie->ie_source = source; ie->ie_pre_ithread = pre_ithread; ie->ie_post_ithread = post_ithread; ie->ie_post_filter = post_filter; ie->ie_assign_cpu = assign_cpu; ie->ie_flags = flags; ie->ie_irq = irq; ie->ie_cpu = NOCPU; CK_SLIST_INIT(&ie->ie_handlers); mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF); va_start(ap, fmt); vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap); va_end(ap); strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); mtx_lock(&event_lock); TAILQ_INSERT_TAIL(&event_list, ie, ie_list); mtx_unlock(&event_lock); if (event != NULL) *event = ie; CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name); return (0); } /* * Bind an interrupt event to the specified CPU. Note that not all * platforms support binding an interrupt to a CPU. For those * platforms this request will fail. Using a cpu id of NOCPU unbinds * the interrupt event. */ static int _intr_event_bind(struct intr_event *ie, int cpu, bool bindirq, bool bindithread) { lwpid_t id; int error; /* Need a CPU to bind to. */ if (cpu != NOCPU && CPU_ABSENT(cpu)) return (EINVAL); if (ie->ie_assign_cpu == NULL) return (EOPNOTSUPP); error = priv_check(curthread, PRIV_SCHED_CPUSET_INTR); if (error) return (error); /* * If we have any ithreads try to set their mask first to verify * permissions, etc. */ if (bindithread) { mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); error = cpuset_setithread(id, cpu); if (error) return (error); } else mtx_unlock(&ie->ie_lock); } if (bindirq) error = ie->ie_assign_cpu(ie->ie_source, cpu); if (error) { if (bindithread) { mtx_lock(&ie->ie_lock); if (ie->ie_thread != NULL) { cpu = ie->ie_cpu; id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); (void)cpuset_setithread(id, cpu); } else mtx_unlock(&ie->ie_lock); } return (error); } if (bindirq) { mtx_lock(&ie->ie_lock); ie->ie_cpu = cpu; mtx_unlock(&ie->ie_lock); } return (error); } /* * Bind an interrupt event to the specified CPU. For supported platforms, any * associated ithreads as well as the primary interrupt context will be bound * to the specificed CPU. */ int intr_event_bind(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, true, true)); } /* * Bind an interrupt event to the specified CPU, but do not bind associated * ithreads. */ int intr_event_bind_irqonly(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, true, false)); } /* * Bind an interrupt event's ithread to the specified CPU. */ int intr_event_bind_ithread(struct intr_event *ie, int cpu) { return (_intr_event_bind(ie, cpu, false, true)); } +/* + * Bind an interrupt event's ithread to the specified cpuset. + */ +int +intr_event_bind_ithread_cpuset(struct intr_event *ie, cpuset_t *cs) +{ + lwpid_t id; + + mtx_lock(&ie->ie_lock); + if (ie->ie_thread != NULL) { + id = ie->ie_thread->it_thread->td_tid; + mtx_unlock(&ie->ie_lock); + return (cpuset_setthread(id, cs)); + } else { + mtx_unlock(&ie->ie_lock); + } + return (ENODEV); +} + static struct intr_event * intr_lookup(int irq) { struct intr_event *ie; mtx_lock(&event_lock); TAILQ_FOREACH(ie, &event_list, ie_list) if (ie->ie_irq == irq && (ie->ie_flags & IE_SOFT) == 0 && CK_SLIST_FIRST(&ie->ie_handlers) != NULL) break; mtx_unlock(&event_lock); return (ie); } int intr_setaffinity(int irq, int mode, void *m) { struct intr_event *ie; cpuset_t *mask; int cpu, n; mask = m; cpu = NOCPU; /* * If we're setting all cpus we can unbind. Otherwise make sure * only one cpu is in the set. */ if (CPU_CMP(cpuset_root, mask)) { for (n = 0; n < CPU_SETSIZE; n++) { if (!CPU_ISSET(n, mask)) continue; if (cpu != NOCPU) return (EINVAL); cpu = n; } } ie = intr_lookup(irq); if (ie == NULL) return (ESRCH); switch (mode) { case CPU_WHICH_IRQ: return (intr_event_bind(ie, cpu)); case CPU_WHICH_INTRHANDLER: return (intr_event_bind_irqonly(ie, cpu)); case CPU_WHICH_ITHREAD: return (intr_event_bind_ithread(ie, cpu)); default: return (EINVAL); } } int intr_getaffinity(int irq, int mode, void *m) { struct intr_event *ie; struct thread *td; struct proc *p; cpuset_t *mask; lwpid_t id; int error; mask = m; ie = intr_lookup(irq); if (ie == NULL) return (ESRCH); error = 0; CPU_ZERO(mask); switch (mode) { case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: mtx_lock(&ie->ie_lock); if (ie->ie_cpu == NOCPU) CPU_COPY(cpuset_root, mask); else CPU_SET(ie->ie_cpu, mask); mtx_unlock(&ie->ie_lock); break; case CPU_WHICH_ITHREAD: mtx_lock(&ie->ie_lock); if (ie->ie_thread == NULL) { mtx_unlock(&ie->ie_lock); CPU_COPY(cpuset_root, mask); } else { id = ie->ie_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, NULL); if (error != 0) return (error); CPU_COPY(&td->td_cpuset->cs_mask, mask); PROC_UNLOCK(p); } default: return (EINVAL); } return (0); } int intr_event_destroy(struct intr_event *ie) { mtx_lock(&event_lock); mtx_lock(&ie->ie_lock); if (!CK_SLIST_EMPTY(&ie->ie_handlers)) { mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); return (EBUSY); } TAILQ_REMOVE(&event_list, ie, ie_list); #ifndef notyet if (ie->ie_thread != NULL) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); mtx_destroy(&ie->ie_lock); free(ie, M_ITHREAD); return (0); } static struct intr_thread * ithread_create(const char *name) { struct intr_thread *ithd; struct thread *td; int error; ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO); error = kproc_kthread_add(ithread_loop, ithd, &intrproc, &td, RFSTOPPED | RFHIGHPID, 0, "intr", "%s", name); if (error) panic("kproc_create() failed with %d", error); thread_lock(td); sched_class(td, PRI_ITHD); TD_SET_IWAIT(td); thread_unlock(td); td->td_pflags |= TDP_ITHREAD; ithd->it_thread = td; CTR2(KTR_INTR, "%s: created %s", __func__, name); return (ithd); } static void ithread_destroy(struct intr_thread *ithread) { struct thread *td; CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name); td = ithread->it_thread; thread_lock(td); ithread->it_flags |= IT_DEAD; if (TD_AWAITING_INTR(td)) { TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } thread_unlock(td); } int intr_event_add_handler(struct intr_event *ie, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep) { struct intr_handler *ih, *temp_ih; struct intr_handler **prevptr; struct intr_thread *it; if (ie == NULL || name == NULL || (handler == NULL && filter == NULL)) return (EINVAL); /* Allocate and populate an interrupt handler structure. */ ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO); ih->ih_filter = filter; ih->ih_handler = handler; ih->ih_argument = arg; strlcpy(ih->ih_name, name, sizeof(ih->ih_name)); ih->ih_event = ie; ih->ih_pri = pri; if (flags & INTR_EXCL) ih->ih_flags = IH_EXCLUSIVE; if (flags & INTR_MPSAFE) ih->ih_flags |= IH_MPSAFE; if (flags & INTR_ENTROPY) ih->ih_flags |= IH_ENTROPY; /* We can only have one exclusive handler in a event. */ mtx_lock(&ie->ie_lock); if (!CK_SLIST_EMPTY(&ie->ie_handlers)) { if ((flags & INTR_EXCL) || (CK_SLIST_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) { mtx_unlock(&ie->ie_lock); free(ih, M_ITHREAD); return (EINVAL); } } /* Create a thread if we need one. */ while (ie->ie_thread == NULL && handler != NULL) { if (ie->ie_flags & IE_ADDING_THREAD) msleep(ie, &ie->ie_lock, 0, "ithread", 0); else { ie->ie_flags |= IE_ADDING_THREAD; mtx_unlock(&ie->ie_lock); it = ithread_create("intr: newborn"); mtx_lock(&ie->ie_lock); ie->ie_flags &= ~IE_ADDING_THREAD; ie->ie_thread = it; it->it_event = ie; ithread_update(it); wakeup(ie); } } /* Add the new handler to the event in priority order. */ CK_SLIST_FOREACH_PREVPTR(temp_ih, prevptr, &ie->ie_handlers, ih_next) { if (temp_ih->ih_pri > ih->ih_pri) break; } CK_SLIST_INSERT_PREVPTR(prevptr, temp_ih, ih, ih_next); intr_event_update(ie); CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, ie->ie_name); mtx_unlock(&ie->ie_lock); if (cookiep != NULL) *cookiep = ih; return (0); } /* * Append a description preceded by a ':' to the name of the specified * interrupt handler. */ int intr_event_describe_handler(struct intr_event *ie, void *cookie, const char *descr) { struct intr_handler *ih; size_t space; char *start; mtx_lock(&ie->ie_lock); #ifdef INVARIANTS CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih == cookie) break; } if (ih == NULL) { mtx_unlock(&ie->ie_lock); panic("handler %p not found in interrupt event %p", cookie, ie); } #endif ih = cookie; /* * Look for an existing description by checking for an * existing ":". This assumes device names do not include * colons. If one is found, prepare to insert the new * description at that point. If one is not found, find the * end of the name to use as the insertion point. */ start = strchr(ih->ih_name, ':'); if (start == NULL) start = strchr(ih->ih_name, 0); /* * See if there is enough remaining room in the string for the * description + ":". The "- 1" leaves room for the trailing * '\0'. The "+ 1" accounts for the colon. */ space = sizeof(ih->ih_name) - (start - ih->ih_name) - 1; if (strlen(descr) + 1 > space) { mtx_unlock(&ie->ie_lock); return (ENOSPC); } /* Append a colon followed by the description. */ *start = ':'; strcpy(start + 1, descr); intr_event_update(ie); mtx_unlock(&ie->ie_lock); return (0); } /* * Return the ie_source field from the intr_event an intr_handler is * associated with. */ void * intr_handler_source(void *cookie) { struct intr_handler *ih; struct intr_event *ie; ih = (struct intr_handler *)cookie; if (ih == NULL) return (NULL); ie = ih->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", ih->ih_name)); return (ie->ie_source); } /* * If intr_event_handle() is running in the ISR context at the time of the call, * then wait for it to complete. */ static void intr_event_barrier(struct intr_event *ie) { int phase; mtx_assert(&ie->ie_lock, MA_OWNED); phase = ie->ie_phase; /* * Switch phase to direct future interrupts to the other active counter. * Make sure that any preceding stores are visible before the switch. */ KASSERT(ie->ie_active[!phase] == 0, ("idle phase has activity")); atomic_store_rel_int(&ie->ie_phase, !phase); /* * This code cooperates with wait-free iteration of ie_handlers * in intr_event_handle. * Make sure that the removal and the phase update are not reordered * with the active count check. * Note that no combination of acquire and release fences can provide * that guarantee as Store->Load sequences can always be reordered. */ atomic_thread_fence_seq_cst(); /* * Now wait on the inactive phase. * The acquire fence is needed so that that all post-barrier accesses * are after the check. */ while (ie->ie_active[phase] > 0) cpu_spinwait(); atomic_thread_fence_acq(); } static void intr_handler_barrier(struct intr_handler *handler) { struct intr_event *ie; ie = handler->ih_event; mtx_assert(&ie->ie_lock, MA_OWNED); KASSERT((handler->ih_flags & IH_DEAD) == 0, ("update for a removed handler")); if (ie->ie_thread == NULL) { intr_event_barrier(ie); return; } if ((handler->ih_flags & IH_CHANGED) == 0) { handler->ih_flags |= IH_CHANGED; intr_event_schedule_thread(ie); } while ((handler->ih_flags & IH_CHANGED) != 0) msleep(handler, &ie->ie_lock, 0, "ih_barr", 0); } /* * Sleep until an ithread finishes executing an interrupt handler. * * XXX Doesn't currently handle interrupt filters or fast interrupt * handlers. This is intended for compatibility with linux drivers * only. Do not use in BSD code. */ void _intr_drain(int irq) { struct intr_event *ie; struct intr_thread *ithd; struct thread *td; ie = intr_lookup(irq); if (ie == NULL) return; if (ie->ie_thread == NULL) return; ithd = ie->ie_thread; td = ithd->it_thread; /* * We set the flag and wait for it to be cleared to avoid * long delays with potentially busy interrupt handlers * were we to only sample TD_AWAITING_INTR() every tick. */ thread_lock(td); if (!TD_AWAITING_INTR(td)) { ithd->it_flags |= IT_WAIT; while (ithd->it_flags & IT_WAIT) { thread_unlock(td); pause("idrain", 1); thread_lock(td); } } thread_unlock(td); return; } int intr_event_remove_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; struct intr_handler *ih; struct intr_handler **prevptr; #ifdef notyet int dead; #endif if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); mtx_lock(&ie->ie_lock); CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, ie->ie_name); CK_SLIST_FOREACH_PREVPTR(ih, prevptr, &ie->ie_handlers, ih_next) { if (ih == handler) break; } if (ih == NULL) { panic("interrupt handler \"%s\" not found in " "interrupt event \"%s\"", handler->ih_name, ie->ie_name); } /* * If there is no ithread, then directly remove the handler. Note that * intr_event_handle() iterates ie_handlers in a lock-less fashion, so * care needs to be taken to keep ie_handlers consistent and to free * the removed handler only when ie_handlers is quiescent. */ if (ie->ie_thread == NULL) { CK_SLIST_REMOVE_PREVPTR(prevptr, ih, ih_next); intr_event_barrier(ie); intr_event_update(ie); mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } /* * Let the interrupt thread do the job. * The interrupt source is disabled when the interrupt thread is * running, so it does not have to worry about interaction with * intr_event_handle(). */ KASSERT((handler->ih_flags & IH_DEAD) == 0, ("duplicate handle remove")); handler->ih_flags |= IH_DEAD; intr_event_schedule_thread(ie); while (handler->ih_flags & IH_DEAD) msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); intr_event_update(ie); #ifdef notyet /* * XXX: This could be bad in the case of ppbus(8). Also, I think * this could lead to races of stale data when servicing an * interrupt. */ dead = 1; CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) { if (ih->ih_handler != NULL) { dead = 0; break; } } if (dead) { ithread_destroy(ie->ie_thread); ie->ie_thread = NULL; } #endif mtx_unlock(&ie->ie_lock); free(handler, M_ITHREAD); return (0); } int intr_event_suspend_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); mtx_lock(&ie->ie_lock); handler->ih_flags |= IH_SUSP; intr_handler_barrier(handler); mtx_unlock(&ie->ie_lock); return (0); } int intr_event_resume_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; if (handler == NULL) return (EINVAL); ie = handler->ih_event; KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", handler->ih_name)); /* * intr_handler_barrier() acts not only as a barrier, * it also allows to check for any pending interrupts. */ mtx_lock(&ie->ie_lock); handler->ih_flags &= ~IH_SUSP; intr_handler_barrier(handler); mtx_unlock(&ie->ie_lock); return (0); } static int intr_event_schedule_thread(struct intr_event *ie) { struct intr_entropy entropy; struct intr_thread *it; struct thread *td; struct thread *ctd; /* * If no ithread or no handlers, then we have a stray interrupt. */ if (ie == NULL || CK_SLIST_EMPTY(&ie->ie_handlers) || ie->ie_thread == NULL) return (EINVAL); ctd = curthread; it = ie->ie_thread; td = it->it_thread; /* * If any of the handlers for this ithread claim to be good * sources of entropy, then gather some. */ if (ie->ie_flags & IE_ENTROPY) { entropy.event = (uintptr_t)ie; entropy.td = ctd; random_harvest_queue(&entropy, sizeof(entropy), RANDOM_INTERRUPT); } KASSERT(td->td_proc != NULL, ("ithread %s has no process", ie->ie_name)); /* * Set it_need to tell the thread to keep running if it is already * running. Then, lock the thread and see if we actually need to * put it on the runqueue. * * Use store_rel to arrange that the store to ih_need in * swi_sched() is before the store to it_need and prepare for * transfer of this order to loads in the ithread. */ atomic_store_rel_int(&it->it_need, 1); thread_lock(td); if (TD_AWAITING_INTR(td)) { CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, td->td_proc->p_pid, td->td_name); TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } else { CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", __func__, td->td_proc->p_pid, td->td_name, it->it_need, td->td_state); } thread_unlock(td); return (0); } /* * Allow interrupt event binding for software interrupt handlers -- a no-op, * since interrupts are generated in software rather than being directed by * a PIC. */ static int swi_assign_cpu(void *arg, int cpu) { return (0); } /* * Add a software interrupt handler to a specified event. If a given event * is not specified, then a new event is created. */ int swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler, void *arg, int pri, enum intr_type flags, void **cookiep) { struct intr_event *ie; int error; if (flags & INTR_ENTROPY) return (EINVAL); ie = (eventp != NULL) ? *eventp : NULL; if (ie != NULL) { if (!(ie->ie_flags & IE_SOFT)) return (EINVAL); } else { error = intr_event_create(&ie, NULL, IE_SOFT, 0, NULL, NULL, NULL, swi_assign_cpu, "swi%d:", pri); if (error) return (error); if (eventp != NULL) *eventp = ie; } error = intr_event_add_handler(ie, name, NULL, handler, arg, PI_SWI(pri), flags, cookiep); return (error); } /* * Schedule a software interrupt thread. */ void swi_sched(void *cookie, int flags) { struct intr_handler *ih = (struct intr_handler *)cookie; struct intr_event *ie = ih->ih_event; struct intr_entropy entropy; int error __unused; CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name, ih->ih_need); entropy.event = (uintptr_t)ih; entropy.td = curthread; random_harvest_queue(&entropy, sizeof(entropy), RANDOM_SWI); /* * Set ih_need for this handler so that if the ithread is already * running it will execute this handler on the next pass. Otherwise, * it will execute it the next time it runs. */ ih->ih_need = 1; if (!(flags & SWI_DELAY)) { VM_CNT_INC(v_soft); error = intr_event_schedule_thread(ie); KASSERT(error == 0, ("stray software interrupt")); } } /* * Remove a software interrupt handler. Currently this code does not * remove the associated interrupt event if it becomes empty. Calling code * may do so manually via intr_event_destroy(), but that's not really * an optimal interface. */ int swi_remove(void *cookie) { return (intr_event_remove_handler(cookie)); } static void intr_event_execute_handlers(struct proc *p, struct intr_event *ie) { struct intr_handler *ih, *ihn, *ihp; ihp = NULL; CK_SLIST_FOREACH_SAFE(ih, &ie->ie_handlers, ih_next, ihn) { /* * If this handler is marked for death, remove it from * the list of handlers and wake up the sleeper. */ if (ih->ih_flags & IH_DEAD) { mtx_lock(&ie->ie_lock); if (ihp == NULL) CK_SLIST_REMOVE_HEAD(&ie->ie_handlers, ih_next); else CK_SLIST_REMOVE_AFTER(ihp, ih_next); ih->ih_flags &= ~IH_DEAD; wakeup(ih); mtx_unlock(&ie->ie_lock); continue; } /* * Now that we know that the current element won't be removed * update the previous element. */ ihp = ih; if ((ih->ih_flags & IH_CHANGED) != 0) { mtx_lock(&ie->ie_lock); ih->ih_flags &= ~IH_CHANGED; wakeup(ih); mtx_unlock(&ie->ie_lock); } /* Skip filter only handlers */ if (ih->ih_handler == NULL) continue; /* Skip suspended handlers */ if ((ih->ih_flags & IH_SUSP) != 0) continue; /* * For software interrupt threads, we only execute * handlers that have their need flag set. Hardware * interrupt threads always invoke all of their handlers. * * ih_need can only be 0 or 1. Failed cmpset below * means that there is no request to execute handlers, * so a retry of the cmpset is not needed. */ if ((ie->ie_flags & IE_SOFT) != 0 && atomic_cmpset_int(&ih->ih_need, 1, 0) == 0) continue; /* Execute this handler. */ CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x", __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument, ih->ih_name, ih->ih_flags); if (!(ih->ih_flags & IH_MPSAFE)) mtx_lock(&Giant); ih->ih_handler(ih->ih_argument); if (!(ih->ih_flags & IH_MPSAFE)) mtx_unlock(&Giant); } } static void ithread_execute_handlers(struct proc *p, struct intr_event *ie) { /* Interrupt handlers should not sleep. */ if (!(ie->ie_flags & IE_SOFT)) THREAD_NO_SLEEPING(); intr_event_execute_handlers(p, ie); if (!(ie->ie_flags & IE_SOFT)) THREAD_SLEEPING_OK(); /* * Interrupt storm handling: * * If this interrupt source is currently storming, then throttle * it to only fire the handler once per clock tick. * * If this interrupt source is not currently storming, but the * number of back to back interrupts exceeds the storm threshold, * then enter storming mode. */ if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold && !(ie->ie_flags & IE_SOFT)) { /* Report the message only once every second. */ if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) { printf( "interrupt storm detected on \"%s\"; throttling interrupt source\n", ie->ie_name); } pause("istorm", 1); } else ie->ie_count++; /* * Now that all the handlers have had a chance to run, reenable * the interrupt source. */ if (ie->ie_post_ithread != NULL) ie->ie_post_ithread(ie->ie_source); } /* * This is the main code for interrupt threads. */ static void ithread_loop(void *arg) { struct intr_thread *ithd; struct intr_event *ie; struct thread *td; struct proc *p; int wake; td = curthread; p = td->td_proc; ithd = (struct intr_thread *)arg; KASSERT(ithd->it_thread == td, ("%s: ithread and proc linkage out of sync", __func__)); ie = ithd->it_event; ie->ie_count = 0; wake = 0; /* * As long as we have interrupts outstanding, go through the * list of handlers, giving each one a go at it. */ for (;;) { /* * If we are an orphaned thread, then just die. */ if (ithd->it_flags & IT_DEAD) { CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__, p->p_pid, td->td_name); free(ithd, M_ITHREAD); kthread_exit(); } /* * Service interrupts. If another interrupt arrives while * we are running, it will set it_need to note that we * should make another pass. * * The load_acq part of the following cmpset ensures * that the load of ih_need in ithread_execute_handlers() * is ordered after the load of it_need here. */ while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0) ithread_execute_handlers(p, ie); WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread"); mtx_assert(&Giant, MA_NOTOWNED); /* * Processed all our interrupts. Now get the sched * lock. This may take a while and it_need may get * set again, so we have to check it again. */ thread_lock(td); if (atomic_load_acq_int(&ithd->it_need) == 0 && (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) { TD_SET_IWAIT(td); ie->ie_count = 0; mi_switch(SW_VOL | SWT_IWAIT, NULL); } if (ithd->it_flags & IT_WAIT) { wake = 1; ithd->it_flags &= ~IT_WAIT; } thread_unlock(td); if (wake) { wakeup(ithd); wake = 0; } } } /* * Main interrupt handling body. * * Input: * o ie: the event connected to this interrupt. * o frame: some archs (i.e. i386) pass a frame to some. * handlers as their main argument. * Return value: * o 0: everything ok. * o EINVAL: stray interrupt. */ int intr_event_handle(struct intr_event *ie, struct trapframe *frame) { struct intr_handler *ih; struct trapframe *oldframe; struct thread *td; int phase; int ret; bool filter, thread; td = curthread; #ifdef KSTACK_USAGE_PROF intr_prof_stack_use(td, frame); #endif /* An interrupt with no event or handlers is a stray interrupt. */ if (ie == NULL || CK_SLIST_EMPTY(&ie->ie_handlers)) return (EINVAL); /* * Execute fast interrupt handlers directly. * To support clock handlers, if a handler registers * with a NULL argument, then we pass it a pointer to * a trapframe as its argument. */ td->td_intr_nesting_level++; filter = false; thread = false; ret = 0; critical_enter(); oldframe = td->td_intr_frame; td->td_intr_frame = frame; phase = ie->ie_phase; atomic_add_int(&ie->ie_active[phase], 1); /* * This fence is required to ensure that no later loads are * re-ordered before the ie_active store. */ atomic_thread_fence_seq_cst(); CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) { if ((ih->ih_flags & IH_SUSP) != 0) continue; if (ih->ih_filter == NULL) { thread = true; continue; } CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__, ih->ih_filter, ih->ih_argument == NULL ? frame : ih->ih_argument, ih->ih_name); if (ih->ih_argument == NULL) ret = ih->ih_filter(frame); else ret = ih->ih_filter(ih->ih_argument); KASSERT(ret == FILTER_STRAY || ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 && (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0), ("%s: incorrect return value %#x from %s", __func__, ret, ih->ih_name)); filter = filter || ret == FILTER_HANDLED; /* * Wrapper handler special handling: * * in some particular cases (like pccard and pccbb), * the _real_ device handler is wrapped in a couple of * functions - a filter wrapper and an ithread wrapper. * In this case (and just in this case), the filter wrapper * could ask the system to schedule the ithread and mask * the interrupt source if the wrapped handler is composed * of just an ithread handler. * * TODO: write a generic wrapper to avoid people rolling * their own. */ if (!thread) { if (ret == FILTER_SCHEDULE_THREAD) thread = true; } } atomic_add_rel_int(&ie->ie_active[phase], -1); td->td_intr_frame = oldframe; if (thread) { if (ie->ie_pre_ithread != NULL) ie->ie_pre_ithread(ie->ie_source); } else { if (ie->ie_post_filter != NULL) ie->ie_post_filter(ie->ie_source); } /* Schedule the ithread if needed. */ if (thread) { int error __unused; error = intr_event_schedule_thread(ie); KASSERT(error == 0, ("bad stray interrupt")); } critical_exit(); td->td_intr_nesting_level--; #ifdef notyet /* The interrupt is not aknowledged by any filter and has no ithread. */ if (!thread && !filter) return (EINVAL); #endif return (0); } #ifdef DDB /* * Dump details about an interrupt handler */ static void db_dump_intrhand(struct intr_handler *ih) { int comma; db_printf("\t%-10s ", ih->ih_name); switch (ih->ih_pri) { case PI_REALTIME: db_printf("CLK "); break; case PI_AV: db_printf("AV "); break; case PI_TTY: db_printf("TTY "); break; case PI_NET: db_printf("NET "); break; case PI_DISK: db_printf("DISK"); break; case PI_DULL: db_printf("DULL"); break; default: if (ih->ih_pri >= PI_SOFT) db_printf("SWI "); else db_printf("%4u", ih->ih_pri); break; } db_printf(" "); if (ih->ih_filter != NULL) { db_printf("[F]"); db_printsym((uintptr_t)ih->ih_filter, DB_STGY_PROC); } if (ih->ih_handler != NULL) { if (ih->ih_filter != NULL) db_printf(","); db_printf("[H]"); db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC); } db_printf("(%p)", ih->ih_argument); if (ih->ih_need || (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD | IH_MPSAFE)) != 0) { db_printf(" {"); comma = 0; if (ih->ih_flags & IH_EXCLUSIVE) { if (comma) db_printf(", "); db_printf("EXCL"); comma = 1; } if (ih->ih_flags & IH_ENTROPY) { if (comma) db_printf(", "); db_printf("ENTROPY"); comma = 1; } if (ih->ih_flags & IH_DEAD) { if (comma) db_printf(", "); db_printf("DEAD"); comma = 1; } if (ih->ih_flags & IH_MPSAFE) { if (comma) db_printf(", "); db_printf("MPSAFE"); comma = 1; } if (ih->ih_need) { if (comma) db_printf(", "); db_printf("NEED"); } db_printf("}"); } db_printf("\n"); } /* * Dump details about a event. */ void db_dump_intr_event(struct intr_event *ie, int handlers) { struct intr_handler *ih; struct intr_thread *it; int comma; db_printf("%s ", ie->ie_fullname); it = ie->ie_thread; if (it != NULL) db_printf("(pid %d)", it->it_thread->td_proc->p_pid); else db_printf("(no thread)"); if ((ie->ie_flags & (IE_SOFT | IE_ENTROPY | IE_ADDING_THREAD)) != 0 || (it != NULL && it->it_need)) { db_printf(" {"); comma = 0; if (ie->ie_flags & IE_SOFT) { db_printf("SOFT"); comma = 1; } if (ie->ie_flags & IE_ENTROPY) { if (comma) db_printf(", "); db_printf("ENTROPY"); comma = 1; } if (ie->ie_flags & IE_ADDING_THREAD) { if (comma) db_printf(", "); db_printf("ADDING_THREAD"); comma = 1; } if (it != NULL && it->it_need) { if (comma) db_printf(", "); db_printf("NEED"); } db_printf("}"); } db_printf("\n"); if (handlers) CK_SLIST_FOREACH(ih, &ie->ie_handlers, ih_next) db_dump_intrhand(ih); } /* * Dump data about interrupt handlers */ DB_SHOW_COMMAND(intr, db_show_intr) { struct intr_event *ie; int all, verbose; verbose = strchr(modif, 'v') != NULL; all = strchr(modif, 'a') != NULL; TAILQ_FOREACH(ie, &event_list, ie_list) { if (!all && CK_SLIST_EMPTY(&ie->ie_handlers)) continue; db_dump_intr_event(ie, verbose); if (db_pager_quit) break; } } #endif /* DDB */ /* * Start standard software interrupt threads */ static void start_softintr(void *dummy) { if (swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, INTR_MPSAFE, &vm_ih)) panic("died while creating vm swi ithread"); } SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL); /* * Sysctls used by systat and others: hw.intrnames and hw.intrcnt. * The data for this machine dependent, and the declarations are in machine * dependent code. The layout of intrnames and intrcnt however is machine * independent. * * We do not know the length of intrcnt and intrnames at compile time, so * calculate things at run time. */ static int sysctl_intrnames(SYSCTL_HANDLER_ARGS) { return (sysctl_handle_opaque(oidp, intrnames, sintrnames, req)); } SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_intrnames, "", "Interrupt Names"); static int sysctl_intrcnt(SYSCTL_HANDLER_ARGS) { #ifdef SCTL_MASK32 uint32_t *intrcnt32; unsigned i; int error; if (req->flags & SCTL_MASK32) { if (!req->oldptr) return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req)); intrcnt32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT); if (intrcnt32 == NULL) return (ENOMEM); for (i = 0; i < sintrcnt / sizeof (u_long); i++) intrcnt32[i] = intrcnt[i]; error = sysctl_handle_opaque(oidp, intrcnt32, sintrcnt / 2, req); free(intrcnt32, M_TEMP); return (error); } #endif return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req)); } SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_intrcnt, "", "Interrupt Counts"); #ifdef DDB /* * DDB command to dump the interrupt statistics. */ DB_SHOW_COMMAND(intrcnt, db_show_intrcnt) { u_long *i; char *cp; u_int j; cp = intrnames; j = 0; for (i = intrcnt; j < (sintrcnt / sizeof(u_long)) && !db_pager_quit; i++, j++) { if (*cp == '\0') break; if (*i != 0) db_printf("%s\t%lu\n", cp, *i); cp += strlen(cp) + 1; } } #endif Index: head/sys/netinet/tcp_hpts.c =================================================================== --- head/sys/netinet/tcp_hpts.c (revision 347429) +++ head/sys/netinet/tcp_hpts.c (revision 347430) @@ -1,1902 +1,1952 @@ /*- * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" /** * Some notes about usage. * * The tcp_hpts system is designed to provide a high precision timer * system for tcp. Its main purpose is to provide a mechanism for * pacing packets out onto the wire. It can be used in two ways * by a given TCP stack (and those two methods can be used simultaneously). * * First, and probably the main thing its used by Rack and BBR for, it can * be used to call tcp_output() of a transport stack at some time in the future. * The normal way this is done is that tcp_output() of the stack schedules * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The * slot is the time from now that the stack wants to be called but it * must be converted to tcp_hpts's notion of slot. This is done with * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical * call from the tcp_output() routine might look like: * * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550)); * * The above would schedule tcp_ouput() to be called in 550 useconds. * Note that if using this mechanism the stack will want to add near * its top a check to prevent unwanted calls (from user land or the * arrival of incoming ack's). So it would add something like: * * if (inp->inp_in_hpts) * return; * * to prevent output processing until the time alotted has gone by. * Of course this is a bare bones example and the stack will probably * have more consideration then just the above. * * Now the tcp_hpts system will call tcp_output in one of two forms, * it will first check to see if the stack as defined a * tfb_tcp_output_wtime() function, if so that is the routine it * will call, if that function is not defined then it will call the * tfb_tcp_output() function. The only difference between these * two calls is that the former passes the time in to the function * so the function does not have to access the time (which tcp_hpts * already has). What these functions do is of course totally up * to the individual tcp stack. * * Now the second function (actually two functions I guess :D) * the tcp_hpts system provides is the ability to either abort * a connection (later) or process input on a connection. * Why would you want to do this? To keep processor locality. * * So in order to use the input redirection function the * stack changes its tcp_do_segment() routine to instead * of process the data call the function: * * tcp_queue_pkt_to_input() * * You will note that the arguments to this function look * a lot like tcp_do_segments's arguments. This function * will assure that the tcp_hpts system will * call the functions tfb_tcp_hpts_do_segment() from the * correct CPU. Note that multiple calls can get pushed * into the tcp_hpts system this will be indicated by * the next to last argument to tfb_tcp_hpts_do_segment() * (nxt_pkt). If nxt_pkt is a 1 then another packet is * coming. If nxt_pkt is a 0 then this is the last call * that the tcp_hpts system has available for the tcp stack. * * The other point of the input system is to be able to safely * drop a tcp connection without worrying about the recursive * locking that may be occuring on the INP_WLOCK. So if * a stack wants to drop a connection it calls: * * tcp_set_inp_to_drop(tp, ETIMEDOUT) * * To schedule the tcp_hpts system to call * * tcp_drop(tp, drop_reason) * * at a future point. This is quite handy to prevent locking * issues when dropping connections. * */ #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef tcpdebug #include #endif /* tcpdebug */ #ifdef tcp_offload #include #endif #include "opt_rss.h" MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); #ifdef RSS #include #include static int tcp_bind_threads = 1; #else -static int tcp_bind_threads = 0; +static int tcp_bind_threads = 2; #endif TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG; TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size); static struct tcp_hptsi tcp_pace; static void tcp_wakehpts(struct tcp_hpts_entry *p); static void tcp_wakeinput(struct tcp_hpts_entry *p); static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick); static void tcp_hpts_thread(void *ctx); static void tcp_init_hptsi(void *st); int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP; static int32_t tcp_hpts_callout_skip_swi = 0; SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls"); #define timersub(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ if ((vvp)->tv_usec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_usec += 1000000; \ } \ } while (0) static int32_t logging_on = 0; static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2); static int32_t tcp_hpts_precision = 120; +struct hpts_domain_info { + int count; + int cpu[MAXCPU]; +}; + +struct hpts_domain_info hpts_domains[MAXMEMDOM]; + SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW, &tcp_hpts_precision, 120, "Value for PRE() precision of callout"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, &logging_on, 0, "Turn on logging if compiled in"); counter_u64_t hpts_loops; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD, &hpts_loops, "Number of times hpts had to loop to catch up"); counter_u64_t back_tosleep; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD, &back_tosleep, "Number of times hpts found no tcbs"); static int32_t in_newts_every_tcb = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW, &in_newts_every_tcb, 0, "Do we have a new cts every tcb we process for input"); static int32_t in_ts_percision = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW, &in_ts_percision, 0, "Do we use percise timestamp for clients on input"); static int32_t out_newts_every_tcb = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW, &out_newts_every_tcb, 0, "Do we have a new cts every tcb we process for output"); static int32_t out_ts_percision = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW, &out_ts_percision, 0, "Do we use a percise timestamp for every output cts"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW, &hpts_sleep_max, 0, "The maximum time the hpts will sleep <1 - 254>"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW, &tcp_min_hptsi_time, 0, "The minimum time the hpts must sleep before processing more slots"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW, &tcp_hpts_callout_skip_swi, 0, "Do we have the callout call directly to the hpts?"); static void __tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot, uint32_t ticknow, int32_t line) { struct hpts_log *pl; HPTS_MTX_ASSERT(hpts); if (hpts->p_log == NULL) return; pl = &hpts->p_log[hpts->p_log_at]; hpts->p_log_at++; if (hpts->p_log_at >= hpts->p_logsize) { hpts->p_log_at = 0; hpts->p_log_wrapped = 1; } pl->inp = inp; if (inp) { pl->t_paceslot = inp->inp_hptsslot; pl->t_hptsreq = inp->inp_hpts_request; pl->p_onhpts = inp->inp_in_hpts; pl->p_oninput = inp->inp_in_input; } else { pl->t_paceslot = 0; pl->t_hptsreq = 0; pl->p_onhpts = 0; pl->p_oninput = 0; } pl->is_notempty = 1; pl->event = event; pl->line = line; pl->cts = tcp_get_usecs(NULL); pl->p_curtick = hpts->p_curtick; pl->p_prevtick = hpts->p_prevtick; pl->p_on_queue_cnt = hpts->p_on_queue_cnt; pl->ticknow = ticknow; pl->slot_req = slot; pl->p_nxt_slot = hpts->p_nxt_slot; pl->p_cur_slot = hpts->p_cur_slot; pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time; pl->p_flags = (hpts->p_cpu & 0x7f); pl->p_flags <<= 7; pl->p_flags |= (hpts->p_num & 0x7f); pl->p_flags <<= 2; if (hpts->p_hpts_active) { pl->p_flags |= HPTS_HPTS_ACTIVE; } } #define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__) static void hpts_timeout_swi(void *arg) { struct tcp_hpts_entry *hpts; hpts = (struct tcp_hpts_entry *)arg; swi_sched(hpts->ie_cookie, 0); } static void hpts_timeout_dir(void *arg) { tcp_hpts_thread(arg); } static inline void hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear) { #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx) == 0) { /* We don't own the mutex? */ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); } if (hpts->p_cpu != inp->inp_hpts_cpu) { /* It is not the right cpu/mutex? */ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); } if (inp->inp_in_hpts == 0) { /* We are not on the hpts? */ panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp); } if (TAILQ_EMPTY(head) && (hpts->p_on_queue_cnt != 0)) { /* We should not be empty with a queue count */ panic("%s hpts:%p hpts bucket empty but cnt:%d", __FUNCTION__, hpts, hpts->p_on_queue_cnt); } #endif TAILQ_REMOVE(head, inp, inp_hpts); hpts->p_on_queue_cnt--; if (hpts->p_on_queue_cnt < 0) { /* Count should not go negative .. */ #ifdef INVARIANTS panic("Hpts goes negative inp:%p hpts:%p", inp, hpts); #endif hpts->p_on_queue_cnt = 0; } if (clear) { inp->inp_hpts_request = 0; inp->inp_in_hpts = 0; } } static inline void hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref) { #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx) == 0) { /* We don't own the mutex? */ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); } if (hpts->p_cpu != inp->inp_hpts_cpu) { /* It is not the right cpu/mutex? */ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); } if ((noref == 0) && (inp->inp_in_hpts == 1)) { /* We are already on the hpts? */ panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp); } #endif TAILQ_INSERT_TAIL(head, inp, inp_hpts); inp->inp_in_hpts = 1; hpts->p_on_queue_cnt++; if (noref == 0) { in_pcbref(inp); } } static inline void hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear) { #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx) == 0) { /* We don't own the mutex? */ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); } if (hpts->p_cpu != inp->inp_input_cpu) { /* It is not the right cpu/mutex? */ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); } if (inp->inp_in_input == 0) { /* We are not on the input hpts? */ panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp); } #endif TAILQ_REMOVE(&hpts->p_input, inp, inp_input); hpts->p_on_inqueue_cnt--; if (hpts->p_on_inqueue_cnt < 0) { #ifdef INVARIANTS panic("Hpts in goes negative inp:%p hpts:%p", inp, hpts); #endif hpts->p_on_inqueue_cnt = 0; } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { /* We should not be empty with a queue count */ panic("%s hpts:%p in_hpts input empty but cnt:%d", __FUNCTION__, hpts, hpts->p_on_inqueue_cnt); } #endif if (clear) inp->inp_in_input = 0; } static inline void hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line) { #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx) == 0) { /* We don't own the mutex? */ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); } if (hpts->p_cpu != inp->inp_input_cpu) { /* It is not the right cpu/mutex? */ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); } if (inp->inp_in_input == 1) { /* We are already on the input hpts? */ panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp); } #endif TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input); inp->inp_in_input = 1; hpts->p_on_inqueue_cnt++; in_pcbref(inp); } static int sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS) { struct tcp_hpts_entry *hpts; size_t sz; int32_t logging_was, i; int32_t error = 0; /* * HACK: Turn off logging so no locks are required this really needs * a memory barrier :) */ logging_was = logging_on; logging_on = 0; if (!req->oldptr) { /* How much? */ sz = 0; for (i = 0; i < tcp_pace.rp_num_hptss; i++) { hpts = tcp_pace.rp_ent[i]; if (hpts->p_log == NULL) continue; sz += (sizeof(struct hpts_log) * hpts->p_logsize); } error = SYSCTL_OUT(req, 0, sz); } else { for (i = 0; i < tcp_pace.rp_num_hptss; i++) { hpts = tcp_pace.rp_ent[i]; if (hpts->p_log == NULL) continue; if (hpts->p_log_wrapped) sz = (sizeof(struct hpts_log) * hpts->p_logsize); else sz = (sizeof(struct hpts_log) * hpts->p_log_at); error = SYSCTL_OUT(req, hpts->p_log, sz); } } logging_on = logging_was; return error; } SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log"); static void tcp_wakehpts(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); swi_sched(hpts->ie_cookie, 0); if (hpts->p_hpts_active == 2) { /* Rare sleeping on a ENOBUF */ wakeup_one(hpts); } } static void tcp_wakeinput(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); swi_sched(hpts->ie_cookie, 0); if (hpts->p_hpts_active == 2) { /* Rare sleeping on a ENOBUF */ wakeup_one(hpts); } } struct tcp_hpts_entry * tcp_cur_hpts(struct inpcb *inp) { int32_t hpts_num; struct tcp_hpts_entry *hpts; hpts_num = inp->inp_hpts_cpu; hpts = tcp_pace.rp_ent[hpts_num]; return (hpts); } struct tcp_hpts_entry * tcp_hpts_lock(struct inpcb *inp) { struct tcp_hpts_entry *hpts; int32_t hpts_num; again: hpts_num = inp->inp_hpts_cpu; hpts = tcp_pace.rp_ent[hpts_num]; #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__); } #endif mtx_lock(&hpts->p_mtx); if (hpts_num != inp->inp_hpts_cpu) { mtx_unlock(&hpts->p_mtx); goto again; } return (hpts); } struct tcp_hpts_entry * tcp_input_lock(struct inpcb *inp) { struct tcp_hpts_entry *hpts; int32_t hpts_num; again: hpts_num = inp->inp_input_cpu; hpts = tcp_pace.rp_ent[hpts_num]; #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__); } #endif mtx_lock(&hpts->p_mtx); if (hpts_num != inp->inp_input_cpu) { mtx_unlock(&hpts->p_mtx); goto again; } return (hpts); } static void tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line) { int32_t add_freed; if (inp->inp_flags2 & INP_FREED) { /* * Need to play a special trick so that in_pcbrele_wlocked * does not return 1 when it really should have returned 0. */ add_freed = 1; inp->inp_flags2 &= ~INP_FREED; } else { add_freed = 0; } #ifndef INP_REF_DEBUG if (in_pcbrele_wlocked(inp)) { /* * This should not happen. We have the inpcb referred to by * the main socket (why we are called) and the hpts. It * should always return 0. */ panic("inpcb:%p release ret 1", inp); } #else if (__in_pcbrele_wlocked(inp, line)) { /* * This should not happen. We have the inpcb referred to by * the main socket (why we are called) and the hpts. It * should always return 0. */ panic("inpcb:%p release ret 1", inp); } #endif if (add_freed) { inp->inp_flags2 |= INP_FREED; } } static void tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line) { if (inp->inp_in_hpts) { hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1); tcp_remove_hpts_ref(inp, hpts, line); } } static void tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line) { HPTS_MTX_ASSERT(hpts); if (inp->inp_in_input) { hpts_sane_input_remove(hpts, inp, 1); tcp_remove_hpts_ref(inp, hpts, line); } } /* * Called normally with the INP_LOCKED but it * does not matter, the hpts lock is the key * but the lock order allows us to hold the * INP lock and then get the hpts lock. * * Valid values in the flags are * HPTS_REMOVE_OUTPUT - remove from the output of the hpts. * HPTS_REMOVE_INPUT - remove from the input of the hpts. * Note that you can or both values together and get two * actions. */ void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line) { struct tcp_hpts_entry *hpts; INP_WLOCK_ASSERT(inp); if (flags & HPTS_REMOVE_OUTPUT) { hpts = tcp_hpts_lock(inp); tcp_hpts_remove_locked_output(hpts, inp, flags, line); mtx_unlock(&hpts->p_mtx); } if (flags & HPTS_REMOVE_INPUT) { hpts = tcp_input_lock(inp); tcp_hpts_remove_locked_input(hpts, inp, flags, line); mtx_unlock(&hpts->p_mtx); } } static inline int hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus) { return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS); } static int tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref) { int32_t need_wake = 0; uint32_t ticknow = 0; HPTS_MTX_ASSERT(hpts); if (inp->inp_in_hpts == 0) { /* Ok we need to set it on the hpts in the current slot */ if (hpts->p_hpts_active == 0) { /* A sleeping hpts we want in next slot to run */ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0, hpts_tick(hpts, 1)); } inp->inp_hptsslot = hpts_tick(hpts, 1); inp->inp_hpts_request = 0; if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow); } need_wake = 1; } else if ((void *)inp == hpts->p_inp) { /* * We can't allow you to go into the same slot we * are in. We must put you out. */ inp->inp_hptsslot = hpts->p_nxt_slot; } else inp->inp_hptsslot = hpts->p_cur_slot; hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); inp->inp_hpts_request = 0; if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0); } if (need_wake) { /* * Activate the hpts if it is sleeping and its * timeout is not 1. */ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow); } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); } } return (need_wake); } int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line) { int32_t ret; struct tcp_hpts_entry *hpts; INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0); mtx_unlock(&hpts->p_mtx); return (ret); } static void tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line, struct hpts_diag *diag, int32_t noref) { int32_t need_new_to = 0; int32_t need_wakeup = 0; uint32_t largest_slot; uint32_t ticknow = 0; uint32_t slot_calc; HPTS_MTX_ASSERT(hpts); if (diag) { memset(diag, 0, sizeof(struct hpts_diag)); diag->p_hpts_active = hpts->p_hpts_active; diag->p_nxt_slot = hpts->p_nxt_slot; diag->p_cur_slot = hpts->p_cur_slot; diag->slot_req = slot; } if ((inp->inp_in_hpts == 0) || noref) { inp->inp_hpts_request = slot; if (slot == 0) { /* Immediate */ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref); return; } if (hpts->p_hpts_active) { /* * Its slot - 1 since nxt_slot is the next tick that * will go off since the hpts is awake */ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0); } /* * We want to make sure that we don't place a inp in * the range of p_cur_slot <-> p_nxt_slot. If we * take from p_nxt_slot to the end, plus p_cur_slot * and then take away 2, we will know how many is * the max slots we can use. */ if (hpts->p_nxt_slot > hpts->p_cur_slot) { /* * Non-wrap case nxt_slot <-> cur_slot we * don't want to land in. So the diff gives * us what is taken away from the number of * slots. */ largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot); } else if (hpts->p_nxt_slot == hpts->p_cur_slot) { largest_slot = NUM_OF_HPTSI_SLOTS - 2; } else { /* * Wrap case so the diff gives us the number * of slots that we can land in. */ largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot; } /* * We take away two so we never have a problem (20 * usec's) out of 1024000 usecs */ largest_slot -= 2; if (inp->inp_hpts_request > largest_slot) { /* * Restrict max jump of slots and remember * leftover */ slot = largest_slot; inp->inp_hpts_request -= largest_slot; } else { /* This one will run when we hit it */ inp->inp_hpts_request = 0; } if (hpts->p_nxt_slot == hpts->p_cur_slot) slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS; else slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS; if (slot_calc == hpts->p_cur_slot) { #ifdef INVARIANTS /* TSNH */ panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n", hpts, slot_calc, slot, largest_slot); #endif if (slot_calc) slot_calc--; else slot_calc = NUM_OF_HPTSI_SLOTS - 1; } inp->inp_hptsslot = slot_calc; if (diag) { diag->inp_hptsslot = inp->inp_hptsslot; } } else { /* * The hpts is sleeping, we need to figure out where * it will wake up at and if we need to reschedule * its time-out. */ uint32_t have_slept, yet_to_sleep; uint32_t slot_now; struct timeval tv; ticknow = tcp_gethptstick(&tv); slot_now = ticknow % NUM_OF_HPTSI_SLOTS; /* * The user wants to be inserted at (slot_now + * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up. */ largest_slot = NUM_OF_HPTSI_SLOTS - 2; if (inp->inp_hpts_request > largest_slot) { /* Adjust the residual in inp_hpts_request */ slot = largest_slot; inp->inp_hpts_request -= largest_slot; } else { /* No residual it all fits */ inp->inp_hpts_request = 0; } inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS; if (diag) { diag->slot_now = slot_now; diag->inp_hptsslot = inp->inp_hptsslot; diag->p_on_min_sleep = hpts->p_on_min_sleep; } if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow); } /* Now do we need to restart the hpts's timer? */ if (TSTMP_GT(ticknow, hpts->p_curtick)) have_slept = ticknow - hpts->p_curtick; else have_slept = 0; if (have_slept < hpts->p_hpts_sleep_time) { /* This should be what happens */ yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; } else { /* We are over-due */ yet_to_sleep = 0; need_wakeup = 1; } if (diag) { diag->have_slept = have_slept; diag->yet_to_sleep = yet_to_sleep; diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) { /* * We need to reschedule the hptss time-out. */ hpts->p_hpts_sleep_time = slot; need_new_to = slot * HPTS_TICKS_PER_USEC; } } hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow); } /* * Now how far is the hpts sleeping to? if active is 1, its * up and ticking we do nothing, otherwise we may need to * reschedule its callout if need_new_to is set from above. */ if (need_wakeup) { if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0); } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); if (diag) { diag->need_new_to = 0; diag->co_ret = 0xffff0000; } } else if (need_new_to) { int32_t co_ret; struct timeval tv; sbintime_t sb; tv.tv_sec = 0; tv.tv_usec = 0; while (need_new_to > HPTS_USEC_IN_SEC) { tv.tv_sec++; need_new_to -= HPTS_USEC_IN_SEC; } tv.tv_usec = need_new_to; sb = tvtosbt(tv); if (tcp_hpts_callout_skip_swi == 0) { co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); } else { co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_dir, hpts, hpts->p_cpu, C_PREL(tcp_hpts_precision)); } if (diag) { diag->need_new_to = need_new_to; diag->co_ret = co_ret; } } } else { #ifdef INVARIANTS panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp); #endif } } uint32_t tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){ struct tcp_hpts_entry *hpts; uint32_t slot_on, cts; struct timeval tv; /* * We now return the next-slot the hpts will be on, beyond its * current run (if up) or where it was when it stopped if it is * sleeping. */ INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); if (in_ts_percision) microuptime(&tv); else getmicrouptime(&tv); cts = tcp_tv_to_usectick(&tv); tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0); slot_on = hpts->p_nxt_slot; mtx_unlock(&hpts->p_mtx); return (slot_on); } uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){ return (tcp_hpts_insert_diag(inp, slot, line, NULL)); } int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line) { int32_t retval = 0; HPTS_MTX_ASSERT(hpts); if (inp->inp_in_input == 0) { /* Ok we need to set it on the hpts in the current slot */ hpts_sane_input_insert(hpts, inp, line); retval = 1; if (hpts->p_hpts_active == 0) { /* * Activate the hpts if it is sleeping. */ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0); } retval = 2; hpts->p_direct_wake = 1; tcp_wakeinput(hpts); } } else if (hpts->p_hpts_active == 0) { retval = 4; hpts->p_direct_wake = 1; tcp_wakeinput(hpts); } return (retval); } void tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, int32_t tlen, int32_t drop_hdrlen, uint8_t iptos) { /* Setup packet for input first */ INP_WLOCK_ASSERT(tp->t_inpcb); m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t)); m->m_pkthdr.pace_tlen = (uint16_t) tlen; m->m_pkthdr.pace_drphdrlen = drop_hdrlen; m->m_pkthdr.pace_tos = iptos; m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0); if (tp->t_in_pkt == NULL) { tp->t_in_pkt = m; tp->t_tail_pkt = m; } else { tp->t_tail_pkt->m_nextpkt = m; tp->t_tail_pkt = m; } } int32_t __tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line){ struct tcp_hpts_entry *hpts; int32_t ret; tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos); hpts = tcp_input_lock(tp->t_inpcb); ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line); mtx_unlock(&hpts->p_mtx); return (ret); } void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line) { struct tcp_hpts_entry *hpts; struct tcpcb *tp; tp = intotcpcb(inp); hpts = tcp_input_lock(tp->t_inpcb); if (inp->inp_in_input == 0) { /* Ok we need to set it on the hpts in the current slot */ hpts_sane_input_insert(hpts, inp, line); if (hpts->p_hpts_active == 0) { /* * Activate the hpts if it is sleeping. */ hpts->p_direct_wake = 1; tcp_wakeinput(hpts); } } else if (hpts->p_hpts_active == 0) { hpts->p_direct_wake = 1; tcp_wakeinput(hpts); } inp->inp_hpts_drop_reas = reason; mtx_unlock(&hpts->p_mtx); } static uint16_t hpts_random_cpu(struct inpcb *inp){ /* * No flow type set distribute the load randomly. */ uint16_t cpuid; uint32_t ran; /* * If one has been set use it i.e. we want both in and out on the * same hpts. */ if (inp->inp_input_cpu_set) { return (inp->inp_input_cpu); } else if (inp->inp_hpts_cpu_set) { return (inp->inp_hpts_cpu); } /* Nothing set use a random number */ ran = arc4random(); cpuid = (ran & 0xffff) % mp_ncpus; return (cpuid); } static uint16_t hpts_cpuid(struct inpcb *inp){ u_int cpuid; +#ifdef NUMA + struct hpts_domain_info *di; +#endif - /* * If one has been set use it i.e. we want both in and out on the * same hpts. */ if (inp->inp_input_cpu_set) { return (inp->inp_input_cpu); } else if (inp->inp_hpts_cpu_set) { return (inp->inp_hpts_cpu); } /* If one is set the other must be the same */ #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) return (hpts_random_cpu(inp)); else return (cpuid); #else /* * We don't have a flowid -> cpuid mapping, so cheat and just map * unknown cpuids to curcpu. Not the best, but apparently better * than defaulting to swi 0. */ - if (inp->inp_flowtype != M_HASHTYPE_NONE) { + + if (inp->inp_flowtype == M_HASHTYPE_NONE) + return (hpts_random_cpu(inp)); + /* + * Hash to a thread based on the flowid. If we are using numa, + * then restrict the hash to the numa domain where the inp lives. + */ +#ifdef NUMA + if (tcp_bind_threads == 2 && inp->inp_numa_domain != M_NODOM) { + di = &hpts_domains[inp->inp_numa_domain]; + cpuid = di->cpu[inp->inp_flowid % di->count]; + } else +#endif cpuid = inp->inp_flowid % mp_ncpus; - return (cpuid); - } - cpuid = hpts_random_cpu(inp); + return (cpuid); #endif } /* * Do NOT try to optimize the processing of inp's * by first pulling off all the inp's into a temporary * list (e.g. TAILQ_CONCAT). If you do that the subtle * interactions of switching CPU's will kill because of * problems in the linked list manipulation. Basically * you would switch cpu's with the hpts mutex locked * but then while you were processing one of the inp's * some other one that you switch will get a new * packet on the different CPU. It will insert it * on the new hptss input list. Creating a temporary * link in the inp will not fix it either, since * the other hpts will be doing the same thing and * you will both end up using the temporary link. * * You will die in an ASSERT for tailq corruption if you * run INVARIANTS or you will die horribly without * INVARIANTS in some unknown way with a corrupt linked * list. */ static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv) { struct mbuf *m, *n; struct tcpcb *tp; struct inpcb *inp; uint16_t drop_reason; int16_t set_cpu; uint32_t did_prefetch = 0; int32_t ti_locked = TI_UNLOCKED; struct epoch_tracker et; HPTS_MTX_ASSERT(hpts); while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) { HPTS_MTX_ASSERT(hpts); hpts_sane_input_remove(hpts, inp, 0); if (inp->inp_input_cpu_set == 0) { set_cpu = 1; } else { set_cpu = 0; } hpts->p_inp = inp; drop_reason = inp->inp_hpts_drop_reas; inp->inp_in_input = 0; mtx_unlock(&hpts->p_mtx); CURVNET_SET(inp->inp_vnet); if (drop_reason) { INP_INFO_RLOCK_ET(&V_tcbinfo, et); ti_locked = TI_RLOCKED; } else { ti_locked = TI_UNLOCKED; } INP_WLOCK(inp); if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { out: hpts->p_inp = NULL; if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } if (in_pcbrele_wlocked(inp) == 0) { INP_WUNLOCK(inp); } ti_locked = TI_UNLOCKED; CURVNET_RESTORE(); mtx_lock(&hpts->p_mtx); continue; } tp = intotcpcb(inp); if ((tp == NULL) || (tp->t_inpcb == NULL)) { goto out; } if (drop_reason) { /* This tcb is being destroyed for drop_reason */ m = tp->t_in_pkt; if (m) n = m->m_nextpkt; else n = NULL; tp->t_in_pkt = NULL; while (m) { m_freem(m); m = n; if (m) n = m->m_nextpkt; } tp = tcp_drop(tp, drop_reason); INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); if (tp == NULL) { INP_WLOCK(inp); } if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); CURVNET_RESTORE(); mtx_lock(&hpts->p_mtx); continue; } if (set_cpu) { /* * Setup so the next time we will move to the right * CPU. This should be a rare event. It will * sometimes happens when we are the client side * (usually not the server). Somehow tcp_output() * gets called before the tcp_do_segment() sets the * intial state. This means the r_cpu and r_hpts_cpu * is 0. We get on the hpts, and then tcp_input() * gets called setting up the r_cpu to the correct * value. The hpts goes off and sees the mis-match. * We simply correct it here and the CPU will switch * to the new hpts nextime the tcb gets added to the * the hpts (not this time) :-) */ tcp_set_hpts(inp); } m = tp->t_in_pkt; n = NULL; if (m != NULL && (m->m_pkthdr.pace_lock == TI_RLOCKED || tp->t_state != TCPS_ESTABLISHED)) { ti_locked = TI_RLOCKED; INP_INFO_RLOCK_ET(&V_tcbinfo, et); m = tp->t_in_pkt; } if (in_newts_every_tcb) { if (in_ts_percision) microuptime(tv); else getmicrouptime(tv); } if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } /* Any input work to do, if so do it first */ if ((m != NULL) && (m == tp->t_in_pkt)) { struct tcphdr *th; int32_t tlen, drop_hdrlen, nxt_pkt; uint8_t iptos; n = m->m_nextpkt; tp->t_in_pkt = tp->t_tail_pkt = NULL; while (m) { th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff); tlen = m->m_pkthdr.pace_tlen; drop_hdrlen = m->m_pkthdr.pace_drphdrlen; iptos = m->m_pkthdr.pace_tos; m->m_nextpkt = NULL; if (n) nxt_pkt = 1; else nxt_pkt = 0; inp->inp_input_calls = 1; if (tp->t_fb->tfb_tcp_hpts_do_segment) { /* Use the hpts specific do_segment */ (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket, tp, drop_hdrlen, tlen, iptos, nxt_pkt, tv); } else { /* Use the default do_segment */ (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket, tp, drop_hdrlen, tlen, iptos); } if (ti_locked == TI_RLOCKED) INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); /* * Do segment returns unlocked we need the * lock again but we also need some kasserts * here. */ INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); m = n; if (m) n = m->m_nextpkt; if (m != NULL && m->m_pkthdr.pace_lock == TI_RLOCKED) { INP_INFO_RLOCK_ET(&V_tcbinfo, et); ti_locked = TI_RLOCKED; } else ti_locked = TI_UNLOCKED; INP_WLOCK(inp); /* * Since we have an opening here we must * re-check if the tcb went away while we * were getting the lock(s). */ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { while (m) { m_freem(m); m = n; if (m) n = m->m_nextpkt; } goto out; } /* * Now that we hold the INP lock, check if * we need to upgrade our lock. */ if (ti_locked == TI_UNLOCKED && (tp->t_state != TCPS_ESTABLISHED)) { ti_locked = TI_RLOCKED; INP_INFO_RLOCK_ET(&V_tcbinfo, et); } } /** end while(m) */ } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */ if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); if (ti_locked == TI_RLOCKED) INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); ti_locked = TI_UNLOCKED; mtx_lock(&hpts->p_mtx); hpts->p_inp = NULL; CURVNET_RESTORE(); } } static int tcp_hpts_est_run(struct tcp_hpts_entry *hpts) { int32_t ticks_to_run; if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) { ticks_to_run = hpts->p_curtick - hpts->p_prevtick; if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) { ticks_to_run = NUM_OF_HPTSI_SLOTS - 2; } } else { if (hpts->p_prevtick == hpts->p_curtick) { /* This happens when we get woken up right away */ return (-1); } ticks_to_run = 1; } /* Set in where we will be when we catch up */ hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS; if (hpts->p_nxt_slot == hpts->p_cur_slot) { panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d", hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run); } return (ticks_to_run); } static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick) { struct tcpcb *tp; struct inpcb *inp = NULL, *ninp; struct timeval tv; int32_t ticks_to_run, i, error, tick_now, interum_tick; int32_t paced_cnt = 0; int32_t did_prefetch = 0; int32_t prefetch_ninp = 0; int32_t prefetch_tp = 0; uint32_t cts; int16_t set_cpu; HPTS_MTX_ASSERT(hpts); hpts->p_curtick = tcp_tv_to_hptstick(ctick); cts = tcp_tv_to_usectick(ctick); memcpy(&tv, ctick, sizeof(struct timeval)); hpts->p_cur_slot = hpts_tick(hpts, 1); /* Figure out if we had missed ticks */ again: HPTS_MTX_ASSERT(hpts); ticks_to_run = tcp_hpts_est_run(hpts); if (!TAILQ_EMPTY(&hpts->p_input)) { tcp_input_data(hpts, &tv); } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { panic("tp:%p in_hpts input empty but cnt:%d", hpts, hpts->p_on_inqueue_cnt); } #endif HPTS_MTX_ASSERT(hpts); /* Reset the ticks to run and time if we need too */ interum_tick = tcp_gethptstick(&tv); if (interum_tick != hpts->p_curtick) { /* Save off the new time we execute to */ *ctick = tv; hpts->p_curtick = interum_tick; cts = tcp_tv_to_usectick(&tv); hpts->p_cur_slot = hpts_tick(hpts, 1); ticks_to_run = tcp_hpts_est_run(hpts); } if (ticks_to_run == -1) { goto no_run; } if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0); } if (hpts->p_on_queue_cnt == 0) { goto no_one; } HPTS_MTX_ASSERT(hpts); for (i = 0; i < ticks_to_run; i++) { /* * Calculate our delay, if there are no extra ticks there * was not any */ hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC; HPTS_MTX_ASSERT(hpts); while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { /* For debugging */ if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i); } hpts->p_inp = inp; paced_cnt++; if (hpts->p_cur_slot != inp->inp_hptsslot) { panic("Hpts:%p inp:%p slot mis-aligned %u vs %u", hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot); } /* Now pull it */ if (inp->inp_hpts_cpu_set == 0) { set_cpu = 1; } else { set_cpu = 0; } hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0); if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { /* We prefetch the next inp if possible */ kern_prefetch(ninp, &prefetch_ninp); prefetch_ninp = 1; } if (inp->inp_hpts_request) { /* * This guy is deferred out further in time * then our wheel had on it. Push him back * on the wheel. */ int32_t remaining_slots; remaining_slots = ticks_to_run - (i + 1); if (inp->inp_hpts_request > remaining_slots) { /* * Keep INVARIANTS happy by clearing * the flag */ tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1); hpts->p_inp = NULL; continue; } inp->inp_hpts_request = 0; } /* * We clear the hpts flag here after dealing with * remaining slots. This way anyone looking with the * TCB lock will see its on the hpts until just * before we unlock. */ inp->inp_in_hpts = 0; mtx_unlock(&hpts->p_mtx); INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { mtx_lock(&hpts->p_mtx); if (logging_on) tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1); hpts->p_inp = NULL; continue; } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { out_now: #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__); } #endif INP_WUNLOCK(inp); mtx_lock(&hpts->p_mtx); if (logging_on) tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3); hpts->p_inp = NULL; continue; } tp = intotcpcb(inp); if ((tp == NULL) || (tp->t_inpcb == NULL)) { goto out_now; } if (set_cpu) { /* * Setup so the next time we will move to * the right CPU. This should be a rare * event. It will sometimes happens when we * are the client side (usually not the * server). Somehow tcp_output() gets called * before the tcp_do_segment() sets the * intial state. This means the r_cpu and * r_hpts_cpu is 0. We get on the hpts, and * then tcp_input() gets called setting up * the r_cpu to the correct value. The hpts * goes off and sees the mis-match. We * simply correct it here and the CPU will * switch to the new hpts nextime the tcb * gets added to the the hpts (not this one) * :-) */ tcp_set_hpts(inp); } if (out_newts_every_tcb) { struct timeval sv; if (out_ts_percision) microuptime(&sv); else getmicrouptime(&sv); cts = tcp_tv_to_usectick(&sv); } CURVNET_SET(inp->inp_vnet); /* * There is a hole here, we get the refcnt on the * inp so it will still be preserved but to make * sure we can get the INP we need to hold the p_mtx * above while we pull out the tp/inp, as long as * fini gets the lock first we are assured of having * a sane INP we can lock and test. */ #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx before tcp-output:%d", hpts, __LINE__); } #endif if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } inp->inp_hpts_calls = 1; if (tp->t_fb->tfb_tcp_output_wtime != NULL) { error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv); } else { error = tp->t_fb->tfb_tcp_output(tp); } if (ninp && ninp->inp_ppcb) { /* * If we have a nxt inp, see if we can * prefetch its ppcb. Note this may seem * "risky" since we have no locks (other * than the previous inp) and there no * assurance that ninp was not pulled while * we were processing inp and freed. If this * occured it could mean that either: * * a) Its NULL (which is fine we won't go * here) b) Its valid (which is cool we * will prefetch it) c) The inp got * freed back to the slab which was * reallocated. Then the piece of memory was * re-used and something else (not an * address) is in inp_ppcb. If that occurs * we don't crash, but take a TLB shootdown * performance hit (same as if it was NULL * and we tried to pre-fetch it). * * Considering that the likelyhood of is * quite rare we will take a risk on doing * this. If performance drops after testing * we can always take this out. NB: the * kern_prefetch on amd64 actually has * protection against a bad address now via * the DMAP_() tests. This will prevent the * TLB hit, and instead if occurs just * cause us to load cache with a useless * address (to us). */ kern_prefetch(ninp->inp_ppcb, &prefetch_tp); prefetch_tp = 1; } INP_WUNLOCK(inp); INP_UNLOCK_ASSERT(inp); CURVNET_RESTORE(); #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__); } #endif mtx_lock(&hpts->p_mtx); if (logging_on) tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4); hpts->p_inp = NULL; } HPTS_MTX_ASSERT(hpts); hpts->p_inp = NULL; hpts->p_cur_slot++; if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) { hpts->p_cur_slot = 0; } } no_one: HPTS_MTX_ASSERT(hpts); hpts->p_prevtick = hpts->p_curtick; hpts->p_delayed_by = 0; /* * Check to see if we took an excess amount of time and need to run * more ticks (if we did not hit eno-bufs). */ /* Re-run any input that may be there */ (void)tcp_gethptstick(&tv); if (!TAILQ_EMPTY(&hpts->p_input)) { tcp_input_data(hpts, &tv); } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { panic("tp:%p in_hpts input empty but cnt:%d", hpts, hpts->p_on_inqueue_cnt); } #endif tick_now = tcp_gethptstick(&tv); if (SEQ_GT(tick_now, hpts->p_prevtick)) { struct timeval res; /* Did we really spend a full tick or more in here? */ timersub(&tv, ctick, &res); if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) { counter_u64_add(hpts_loops, 1); if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now); } *ctick = res; hpts->p_curtick = tick_now; goto again; } } no_run: { uint32_t t = 0, i, fnd = 0; if (hpts->p_on_queue_cnt) { /* * Find next slot that is occupied and use that to * be the sleep time. */ for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) { if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) { fnd = 1; break; } t = (t + 1) % NUM_OF_HPTSI_SLOTS; } if (fnd) { hpts->p_hpts_sleep_time = i; } else { counter_u64_add(back_tosleep, 1); #ifdef INVARIANTS panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt); #endif hpts->p_on_queue_cnt = 0; goto non_found; } t++; } else { /* No one on the wheel sleep for all but 2 slots */ non_found: if (hpts_sleep_max == 0) hpts_sleep_max = 1; hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max); t = 0; } if (logging_on) { tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC)); } } } void __tcp_set_hpts(struct inpcb *inp, int32_t line) { struct tcp_hpts_entry *hpts; INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); if ((inp->inp_in_hpts == 0) && (inp->inp_hpts_cpu_set == 0)) { inp->inp_hpts_cpu = hpts_cpuid(inp); inp->inp_hpts_cpu_set = 1; } mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if ((inp->inp_input_cpu_set == 0) && (inp->inp_in_input == 0)) { inp->inp_input_cpu = hpts_cpuid(inp); inp->inp_input_cpu_set = 1; } mtx_unlock(&hpts->p_mtx); } uint16_t tcp_hpts_delayedby(struct inpcb *inp){ return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by); } static void tcp_hpts_thread(void *ctx) { struct tcp_hpts_entry *hpts; struct timeval tv; sbintime_t sb; hpts = (struct tcp_hpts_entry *)ctx; mtx_lock(&hpts->p_mtx); if (hpts->p_direct_wake) { /* Signaled by input */ if (logging_on) tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1); callout_stop(&hpts->co); } else { /* Timed out */ if (callout_pending(&hpts->co) || !callout_active(&hpts->co)) { if (logging_on) tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2); mtx_unlock(&hpts->p_mtx); return; } callout_deactivate(&hpts->co); if (logging_on) tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3); } hpts->p_hpts_active = 1; (void)tcp_gethptstick(&tv); tcp_hptsi(hpts, &tv); HPTS_MTX_ASSERT(hpts); tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC; if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) { tv.tv_usec = tcp_min_hptsi_time; hpts->p_on_min_sleep = 1; } else { /* Clear the min sleep flag */ hpts->p_on_min_sleep = 0; } hpts->p_hpts_active = 0; sb = tvtosbt(tv); if (tcp_hpts_callout_skip_swi == 0) { callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); } else { callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_dir, hpts, hpts->p_cpu, C_PREL(tcp_hpts_precision)); } hpts->p_direct_wake = 0; mtx_unlock(&hpts->p_mtx); } #undef timersub static void tcp_init_hptsi(void *st) { int32_t i, j, error, bound = 0, created = 0; size_t sz, asz; struct timeval tv; sbintime_t sb; struct tcp_hpts_entry *hpts; + struct pcpu *pc; + cpuset_t cs; char unit[16]; uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU; + int count, domain; tcp_pace.rp_proc = NULL; tcp_pace.rp_num_hptss = ncpus; hpts_loops = counter_u64_alloc(M_WAITOK); back_tosleep = counter_u64_alloc(M_WAITOK); sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; for (i = 0; i < tcp_pace.rp_num_hptss; i++) { tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), M_TCPHPTS, M_WAITOK | M_ZERO); tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK); hpts = tcp_pace.rp_ent[i]; /* * Init all the hpts structures that are not specifically * zero'd by the allocations. Also lets attach them to the * appropriate sysctl block as well. */ mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts", MTX_DEF | MTX_DUPOK); TAILQ_INIT(&hpts->p_input); for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { TAILQ_INIT(&hpts->p_hptss[j]); } sysctl_ctx_init(&hpts->hpts_ctx); sprintf(unit, "%d", i); hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), OID_AUTO, unit, CTLFLAG_RW, 0, ""); SYSCTL_ADD_INT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "in_qcnt", CTLFLAG_RD, &hpts->p_on_inqueue_cnt, 0, "Count TCB's awaiting input processing"); SYSCTL_ADD_INT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "out_qcnt", CTLFLAG_RD, &hpts->p_on_queue_cnt, 0, "Count TCB's awaiting output processing"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "active", CTLFLAG_RD, &hpts->p_hpts_active, 0, "Is the hpts active"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curslot", CTLFLAG_RD, &hpts->p_cur_slot, 0, "What the current slot is if active"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curtick", CTLFLAG_RD, &hpts->p_curtick, 0, "What the current tick on if active"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "logsize", CTLFLAG_RD, &hpts->p_logsize, 0, "Hpts logging buffer size"); hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2; hpts->p_num = i; hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv); hpts->p_prevtick -= 1; hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS; hpts->p_cpu = 0xffff; hpts->p_nxt_slot = 1; hpts->p_logsize = tcp_hpts_logging_size; if (hpts->p_logsize) { sz = (sizeof(struct hpts_log) * hpts->p_logsize); hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); } callout_init(&hpts->co, 1); } + + /* Don't try to bind to NUMA domains if we don't have any */ + if (vm_ndomains == 1 && tcp_bind_threads == 2) + tcp_bind_threads = 0; + /* * Now lets start ithreads to handle the hptss. */ CPU_FOREACH(i) { hpts = tcp_pace.rp_ent[i]; hpts->p_cpu = i; error = swi_add(&hpts->ie, "hpts", tcp_hpts_thread, (void *)hpts, SWI_NET, INTR_MPSAFE, &hpts->ie_cookie); if (error) { panic("Can't add hpts:%p i:%d err:%d", hpts, i, error); } created++; - if (tcp_bind_threads) { + if (tcp_bind_threads == 1) { if (intr_event_bind(hpts->ie, i) == 0) bound++; + } else if (tcp_bind_threads == 2) { + pc = pcpu_find(i); + domain = pc->pc_domain; + CPU_COPY(&cpuset_domain[domain], &cs); + if (intr_event_bind_ithread_cpuset(hpts->ie, &cs) + == 0) { + bound++; + count = hpts_domains[domain].count; + hpts_domains[domain].cpu[count] = i; + hpts_domains[domain].count++; + } } tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC; sb = tvtosbt(tv); if (tcp_hpts_callout_skip_swi == 0) { callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); } else { callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_dir, hpts, hpts->p_cpu, C_PREL(tcp_hpts_precision)); } } - printf("TCP Hpts created %d swi interrupt thread and bound %d\n", - created, bound); - return; + /* + * If we somehow have an empty domain, fall back to choosing + * among all htps threads. + */ + for (i = 0; i < vm_ndomains; i++) { + if (hpts_domains[i].count == 0) { + tcp_bind_threads = 0; + break; + } + } + + printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n", + created, bound, + tcp_bind_threads == 2 ? "NUMA domains" : "cpus"); } SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL); MODULE_VERSION(tcphpts, 1); Index: head/sys/sys/interrupt.h =================================================================== --- head/sys/sys/interrupt.h (revision 347429) +++ head/sys/sys/interrupt.h (revision 347430) @@ -1,201 +1,203 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997, Stefan Esser * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_INTERRUPT_H_ #define _SYS_INTERRUPT_H_ #include #include #include struct intr_event; struct intr_thread; struct trapframe; /* * Describe a hardware interrupt handler. * * Multiple interrupt handlers for a specific event can be chained * together. */ struct intr_handler { driver_filter_t *ih_filter; /* Filter handler function. */ driver_intr_t *ih_handler; /* Threaded handler function. */ void *ih_argument; /* Argument to pass to handlers. */ int ih_flags; char ih_name[MAXCOMLEN + 1]; /* Name of handler. */ struct intr_event *ih_event; /* Event we are connected to. */ int ih_need; /* Needs service. */ CK_SLIST_ENTRY(intr_handler) ih_next; /* Next handler for this event. */ u_char ih_pri; /* Priority of this handler. */ struct intr_thread *ih_thread; /* Ithread for filtered handler. */ }; /* Interrupt handle flags kept in ih_flags */ #define IH_EXCLUSIVE 0x00000002 /* Exclusive interrupt. */ #define IH_ENTROPY 0x00000004 /* Device is a good entropy source. */ #define IH_DEAD 0x00000008 /* Handler should be removed. */ #define IH_SUSP 0x00000010 /* Device is powered down. */ #define IH_CHANGED 0x40000000 /* Handler state is changed. */ #define IH_MPSAFE 0x80000000 /* Handler does not need Giant. */ /* * Describe an interrupt event. An event holds a list of handlers. * The 'pre_ithread', 'post_ithread', 'post_filter', and 'assign_cpu' * hooks are used to invoke MD code for certain operations. * * The 'pre_ithread' hook is called when an interrupt thread for * handlers without filters is scheduled. It is responsible for * ensuring that 1) the system won't be swamped with an interrupt * storm from the associated source while the ithread runs and 2) the * current CPU is able to receive interrupts from other interrupt * sources. The first is usually accomplished by disabling * level-triggered interrupts until the ithread completes. The second * is accomplished on some platforms by acknowledging the interrupt * via an EOI. * * The 'post_ithread' hook is invoked when an ithread finishes. It is * responsible for ensuring that the associated interrupt source will * trigger an interrupt when it is asserted in the future. Usually * this is implemented by enabling a level-triggered interrupt that * was previously disabled via the 'pre_ithread' hook. * * The 'post_filter' hook is invoked when a filter handles an * interrupt. It is responsible for ensuring that the current CPU is * able to receive interrupts again. On some platforms this is done * by acknowledging the interrupts via an EOI. * * The 'assign_cpu' hook is used to bind an interrupt source to a * specific CPU. If the interrupt cannot be bound, this function may * return an error. * * Note that device drivers may also use interrupt events to manage * multiplexing interrupt interrupt handler into handlers for child * devices. In that case, the above hooks are not used. The device * can create an event for its interrupt resource and register child * event handlers with that event. It can then use * intr_event_execute_handlers() to execute non-filter handlers. * Currently filter handlers are not supported by this, but that can * be added by splitting out the filter loop from intr_event_handle() * if desired. */ struct intr_event { TAILQ_ENTRY(intr_event) ie_list; CK_SLIST_HEAD(, intr_handler) ie_handlers; /* Interrupt handlers. */ char ie_name[MAXCOMLEN + 1]; /* Individual event name. */ char ie_fullname[MAXCOMLEN + 1]; struct mtx ie_lock; void *ie_source; /* Cookie used by MD code. */ struct intr_thread *ie_thread; /* Thread we are connected to. */ void (*ie_pre_ithread)(void *); void (*ie_post_ithread)(void *); void (*ie_post_filter)(void *); int (*ie_assign_cpu)(void *, int); int ie_flags; int ie_count; /* Loop counter. */ int ie_warncnt; /* Rate-check interrupt storm warns. */ struct timeval ie_warntm; int ie_irq; /* Physical irq number if !SOFT. */ int ie_cpu; /* CPU this event is bound to. */ volatile int ie_phase; /* Switched to establish a barrier. */ volatile int ie_active[2]; /* Filters in ISR context. */ }; /* Interrupt event flags kept in ie_flags. */ #define IE_SOFT 0x000001 /* Software interrupt. */ #define IE_ENTROPY 0x000002 /* Interrupt is an entropy source. */ #define IE_ADDING_THREAD 0x000004 /* Currently building an ithread. */ /* Flags to pass to sched_swi. */ #define SWI_DELAY 0x2 /* * Software interrupt numbers in priority order. The priority determines * the priority of the corresponding interrupt thread. */ #define SWI_TTY 0 #define SWI_NET 1 #define SWI_CAMBIO 2 #define SWI_VM 3 #define SWI_CLOCK 4 #define SWI_TQ_FAST 5 #define SWI_TQ 6 #define SWI_TQ_GIANT 6 struct proc; extern struct intr_event *tty_intr_event; extern struct intr_event *clk_intr_event; extern void *vm_ih; /* Counts and names for statistics (defined in MD code). */ #if defined(__amd64__) || defined(__i386__) || defined(__powerpc__) extern u_long *intrcnt; /* counts for for each device and stray */ extern char *intrnames; /* string table containing device names */ #else extern u_long intrcnt[]; /* counts for for each device and stray */ extern char intrnames[]; /* string table containing device names */ #endif extern size_t sintrcnt; /* size of intrcnt table */ extern size_t sintrnames; /* size of intrnames table */ #ifdef DDB void db_dump_intr_event(struct intr_event *ie, int handlers); #endif u_char intr_priority(enum intr_type flags); int intr_event_add_handler(struct intr_event *ie, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep); int intr_event_bind(struct intr_event *ie, int cpu); int intr_event_bind_irqonly(struct intr_event *ie, int cpu); int intr_event_bind_ithread(struct intr_event *ie, int cpu); +int intr_event_bind_ithread_cpuset(struct intr_event *ie, + cpuset_t *mask); int intr_event_create(struct intr_event **event, void *source, int flags, int irq, void (*pre_ithread)(void *), void (*post_ithread)(void *), void (*post_filter)(void *), int (*assign_cpu)(void *, int), const char *fmt, ...) __printflike(9, 10); int intr_event_describe_handler(struct intr_event *ie, void *cookie, const char *descr); int intr_event_destroy(struct intr_event *ie); int intr_event_handle(struct intr_event *ie, struct trapframe *frame); int intr_event_remove_handler(void *cookie); int intr_event_suspend_handler(void *cookie); int intr_event_resume_handler(void *cookie); int intr_getaffinity(int irq, int mode, void *mask); void *intr_handler_source(void *cookie); int intr_setaffinity(int irq, int mode, void *mask); void _intr_drain(int irq); /* Linux compat only. */ int swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler, void *arg, int pri, enum intr_type flags, void **cookiep); void swi_sched(void *cookie, int flags); int swi_remove(void *cookie); #endif