diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index e9a16cd0b36e..18388ae5f232 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -1,393 +1,395 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007, 2022 The FreeBSD Foundation * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_hwpmc_hooks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef VIMAGE #include #endif #ifdef HWPMC_HOOKS #include #endif #ifdef EPOCH_TRACE #include #endif +volatile uint32_t __read_frequently hpts_that_need_softclock = 0; + void (*tcp_hpts_softclock)(void); /* * Define the code needed before returning to user mode, for trap and * syscall. */ void userret(struct thread *td, struct trapframe *frame) { struct proc *p = td->td_proc; CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid, td->td_name); KASSERT((p->p_flag & P_WEXIT) == 0, ("Exiting process returns to usermode")); #ifdef DIAGNOSTIC /* * Check that we called signotify() enough. For * multi-threaded processes, where signal distribution might * change due to other threads changing sigmask, the check is * racy and cannot be performed reliably. * If current process is vfork child, indicated by P_PPWAIT, then * issignal() ignores stops, so we block the check to avoid * classifying pending signals. 
*/ if (p->p_numthreads == 1) { PROC_LOCK(p); thread_lock(td); if ((p->p_flag & P_PPWAIT) == 0 && (td->td_pflags & TDP_SIGFASTBLOCK) == 0 && SIGPENDING(td) && !td_ast_pending(td, TDA_AST) && !td_ast_pending(td, TDA_SIG)) { thread_unlock(td); panic( "failed to set signal flags for ast p %p " "td %p td_ast %#x fl %#x", p, td, td->td_ast, td->td_flags); } thread_unlock(td); PROC_UNLOCK(p); } #endif /* * Charge system time if profiling. */ if (__predict_false(p->p_flag & P_PROFIL)) addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio); #ifdef HWPMC_HOOKS if (PMC_THREAD_HAS_SAMPLES(td)) PMC_CALL_HOOK(td, PMC_FN_THR_USERRET, NULL); #endif /* * Calling tcp_hpts_softclock() here allows us to avoid frequent, * expensive callouts that trash the cache and lead to a much higher * number of interrupts and context switches. Testing on busy web * servers at Netflix has shown that this improves CPU use by 7% over * relying only on callouts to drive HPTS, and also results in idle * power savings on mostly idle servers. * This was inspired by the paper "Soft Timers: Efficient Microsecond * Software Timer Support for Network Processing" * by Mohit Aron and Peter Druschel. */ tcp_hpts_softclock(); /* * Let the scheduler adjust our priority etc. */ sched_userret(td); /* * Check for misbehavior. * * In case there is a callchain tracing ongoing because of * hwpmc(4), skip the scheduler pinning check. * hwpmc(4) subsystem, infact, will collect callchain informations * at ast() checkpoint, which is past userret(). */ WITNESS_WARN(WARN_PANIC, NULL, "userret: returning"); KASSERT(td->td_critnest == 0, ("userret: Returning in a critical section")); KASSERT(td->td_locks == 0, ("userret: Returning with %d locks held", td->td_locks)); KASSERT(td->td_rw_rlocks == 0, ("userret: Returning with %d rwlocks held in read mode", td->td_rw_rlocks)); KASSERT(td->td_sx_slocks == 0, ("userret: Returning with %d sx locks held in shared mode", td->td_sx_slocks)); KASSERT(td->td_lk_slocks == 0, ("userret: Returning with %d lockmanager locks held in shared mode", td->td_lk_slocks)); KASSERT((td->td_pflags & TDP_NOFAULTING) == 0, ("userret: Returning with pagefaults disabled")); if (__predict_false(!THREAD_CAN_SLEEP())) { #ifdef EPOCH_TRACE epoch_trace_list(curthread); #endif KASSERT(0, ("userret: Returning with sleep disabled")); } KASSERT(td->td_pinned == 0 || (td->td_pflags & TDP_CALLCHAIN) != 0, ("userret: Returning with pinned thread")); KASSERT(td->td_vp_reserved == NULL, ("userret: Returning with preallocated vnode")); KASSERT((td->td_flags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("userret: Returning with stop signals deferred")); KASSERT(td->td_vslock_sz == 0, ("userret: Returning with vslock-wired space")); #ifdef VIMAGE /* Unfortunately td_vnet_lpush needs VNET_DEBUG. */ VNET_ASSERT(curvnet == NULL, ("%s: Returning on td %p (pid %d, %s) with vnet %p set in %s", __func__, td, p->p_pid, td->td_name, curvnet, (td->td_vnet_lpush != NULL) ? 
td->td_vnet_lpush : "N/A")); #endif } static void ast_prep(struct thread *td, int tda __unused) { VM_CNT_INC(v_trap); td->td_pticks = 0; if (td->td_cowgen != atomic_load_int(&td->td_proc->p_cowgen)) thread_cow_update(td); } struct ast_entry { int ae_flags; int ae_tdp; void (*ae_f)(struct thread *td, int ast); }; _Static_assert(TDAI(TDA_MAX) <= UINT_MAX, "Too many ASTs"); static struct ast_entry ast_entries[TDA_MAX] __read_mostly = { [TDA_AST] = { .ae_f = ast_prep, .ae_flags = ASTR_UNCOND}, }; void ast_register(int ast, int flags, int tdp, void (*f)(struct thread *, int asts)) { struct ast_entry *ae; MPASS(ast < TDA_MAX); MPASS((flags & ASTR_TDP) == 0 || ((flags & ASTR_ASTF_REQUIRED) != 0 && __bitcount(tdp) == 1)); ae = &ast_entries[ast]; MPASS(ae->ae_f == NULL); ae->ae_flags = flags; ae->ae_tdp = tdp; atomic_interrupt_fence(); ae->ae_f = f; } /* * XXXKIB Note that the deregistration of an AST handler does not * drain threads possibly executing it, which affects unloadable * modules. The issue is either handled by the subsystem using * handlers, or simply ignored. Fixing the problem is considered not * worth the overhead. */ void ast_deregister(int ast) { struct ast_entry *ae; MPASS(ast < TDA_MAX); ae = &ast_entries[ast]; MPASS(ae->ae_f != NULL); ae->ae_f = NULL; atomic_interrupt_fence(); ae->ae_flags = 0; ae->ae_tdp = 0; } void ast_sched_locked(struct thread *td, int tda) { THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(tda < TDA_MAX); td->td_ast |= TDAI(tda); } void ast_unsched_locked(struct thread *td, int tda) { THREAD_LOCK_ASSERT(td, MA_OWNED); MPASS(tda < TDA_MAX); td->td_ast &= ~TDAI(tda); } void ast_sched(struct thread *td, int tda) { thread_lock(td); ast_sched_locked(td, tda); thread_unlock(td); } void ast_sched_mask(struct thread *td, int ast) { thread_lock(td); td->td_ast |= ast; thread_unlock(td); } static bool ast_handler_calc_tdp_run(struct thread *td, const struct ast_entry *ae) { return ((ae->ae_flags & ASTR_TDP) == 0 || (td->td_pflags & ae->ae_tdp) != 0); } /* * Process an asynchronous software trap. */ static void ast_handler(struct thread *td, struct trapframe *framep, bool dtor) { struct ast_entry *ae; void (*f)(struct thread *td, int asts); int a, td_ast; bool run; if (framep != NULL) { kmsan_mark(framep, sizeof(*framep), KMSAN_STATE_INITED); td->td_frame = framep; } if (__predict_true(!dtor)) { WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode"); mtx_assert(&Giant, MA_NOTOWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); /* * This updates the td_ast for the checks below in one * atomic operation with turning off all scheduled AST's. * If another AST is triggered while we are handling the * AST's saved in td_ast, the td_ast is again non-zero and * ast() will be called again. */ thread_lock(td); td_ast = td->td_ast; td->td_ast = 0; thread_unlock(td); } else { /* * The td thread's td_lock is not guaranteed to exist, * the thread might be not initialized enough when it's * destructor is called. It is safe to read and * update td_ast without locking since the thread is * not runnable or visible to other threads. 
*/ td_ast = td->td_ast; td->td_ast = 0; } CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); KASSERT(framep == NULL || TRAPF_USERMODE(framep), ("ast in kernel mode")); for (a = 0; a < nitems(ast_entries); a++) { ae = &ast_entries[a]; f = ae->ae_f; if (f == NULL) continue; atomic_interrupt_fence(); run = false; if (__predict_false(framep == NULL)) { if ((ae->ae_flags & ASTR_KCLEAR) != 0) run = ast_handler_calc_tdp_run(td, ae); } else { if ((ae->ae_flags & ASTR_UNCOND) != 0) run = true; else if ((ae->ae_flags & ASTR_ASTF_REQUIRED) != 0 && (td_ast & TDAI(a)) != 0) run = ast_handler_calc_tdp_run(td, ae); } if (run) f(td, td_ast); } } void ast(struct trapframe *framep) { struct thread *td; td = curthread; ast_handler(td, framep, false); userret(td, framep); } void ast_kclear(struct thread *td) { ast_handler(td, NULL, td != curthread); } const char * syscallname(struct proc *p, u_int code) { static const char unknown[] = "unknown"; struct sysentvec *sv; sv = p->p_sysent; if (sv->sv_syscallnames == NULL || code >= sv->sv_size) return (unknown); return (sv->sv_syscallnames[code]); } diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index 8c4d2d41a3eb..222f8e0229dd 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -1,2089 +1,2097 @@ /*- * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" /** * Some notes about usage. * * The tcp_hpts system is designed to provide a high precision timer * system for tcp. Its main purpose is to provide a mechanism for * pacing packets out onto the wire. It can be used in two ways * by a given TCP stack (and those two methods can be used simultaneously). * * First, and probably the main thing its used by Rack and BBR, it can * be used to call tcp_output() of a transport stack at some time in the future. * The normal way this is done is that tcp_output() of the stack schedules * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The * slot is the time from now that the stack wants to be called but it * must be converted to tcp_hpts's notion of slot. This is done with * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. 
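 * (Each slot on the wheel is HPTS_TICKS_PER_SLOT, i.e. 10, usec wide,
 * so, for illustration, a 550 usec request works out to roughly 55
 * slots; the exact rounding is whatever the conversion macro does.)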
So a typical * call from the tcp_output() routine might look like: * * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550)); * * The above would schedule tcp_output() to be called in 550 useconds. * Note that if using this mechanism the stack will want to add near * its top a check to prevent unwanted calls (from user land or the * arrival of incoming ack's). So it would add something like: * * if (tcp_in_hpts(inp)) * return; * * to prevent output processing until the time alotted has gone by. * Of course this is a bare bones example and the stack will probably * have more consideration then just the above. * * In order to run input queued segments from the HPTS context the * tcp stack must define an input function for * tfb_do_queued_segments(). This function understands * how to dequeue a array of packets that were input and * knows how to call the correct processing routine. * * Locking in this is important as well so most likely the * stack will need to define the tfb_do_segment_nounlock() * splitting tfb_do_segment() into two parts. The main processing * part that does not unlock the INP and returns a value of 1 or 0. * It returns 0 if all is well and the lock was not released. It * returns 1 if we had to destroy the TCB (a reset received etc). * The remains of tfb_do_segment() then become just a simple call * to the tfb_do_segment_nounlock() function and check the return * code and possibly unlock. * * The stack must also set the flag on the INP that it supports this * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recoginizes * this flag as well and will queue packets when it is set. * There are other flags as well INP_MBUF_QUEUE_READY and * INP_DONT_SACK_QUEUE. The first flag tells the LRO code * that we are in the pacer for output so there is no * need to wake up the hpts system to get immediate * input. The second tells the LRO code that its okay * if a SACK arrives you can still defer input and let * the current hpts timer run (this is usually set when * a rack timer is up so we know SACK's are happening * on the connection already and don't want to wakeup yet). * * There is a common functions within the rack_bbr_common code * version i.e. ctf_do_queued_segments(). This function * knows how to take the input queue of packets from tp->t_inqueue * and process them digging out all the arguments, calling any bpf tap and * calling into tfb_do_segment_nounlock(). The common * function (ctf_do_queued_segments()) requires that * you have defined the tfb_do_segment_nounlock() as * described above. */ #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #include #endif #define TCPSTATES /* for logging */ #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef tcp_offload #include #endif /* * The hpts uses a 102400 wheel. The wheel * defines the time in 10 usec increments (102400 x 10). * This gives a range of 10usec - 1024ms to place * an entry within. If the user requests more than * 1.024 second, a remaineder is attached and the hpts * when seeing the remainder will re-insert the * inpcb forward in time from where it is until * the remainder is zero. 
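 *
 * As a worked example (illustrative numbers only): a request for 2.5
 * seconds is 250000 slots, which cannot fit on a 102400 slot wheel.
 * The connection is placed as far out as the wheel allows, the slots
 * that did not fit are kept in t_hpts_request, and each time the pacer
 * reaches the entry it pushes it forward again until that remainder
 * drains to zero; only then is tcp_output() actually called.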
*/ #define NUM_OF_HPTSI_SLOTS 102400 /* Each hpts has its own p_mtx which is used for locking */ #define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) #define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx) #define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx) struct tcp_hpts_entry { /* Cache line 0x00 */ struct mtx p_mtx; /* Mutex for hpts */ struct timeval p_mysleep; /* Our min sleep time */ uint64_t syscall_cnt; uint64_t sleeping; /* What the actual sleep was (if sleeping) */ uint16_t p_hpts_active; /* Flag that says hpts is awake */ uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ uint32_t p_curtick; /* Tick in 10 us the hpts is going to */ uint32_t p_runningslot; /* Current tick we are at if we are running */ uint32_t p_prev_slot; /* Previous slot we were on */ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ uint32_t p_nxt_slot; /* The next slot outside the current range of * slots that the hpts is running on. */ int32_t p_on_queue_cnt; /* Count on queue in this hpts */ uint32_t p_lasttick; /* Last tick before the current one */ uint8_t p_direct_wake :1, /* boolean */ p_on_min_sleep:1, /* boolean */ p_hpts_wake_scheduled:1, /* boolean */ - p_avail:5; + hit_callout_thresh:1, + p_avail:4; uint8_t p_fill[3]; /* Fill to 32 bits */ /* Cache line 0x40 */ struct hptsh { TAILQ_HEAD(, tcpcb) head; uint32_t count; uint32_t gencnt; } *p_hptss; /* Hptsi wheel */ uint32_t p_hpts_sleep_time; /* Current sleep interval having a max * of 255ms */ uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */ uint32_t saved_lasttick; /* for logging */ uint32_t saved_curtick; /* for logging */ uint32_t saved_curslot; /* for logging */ uint32_t saved_prev_slot; /* for logging */ uint32_t p_delayed_by; /* How much were we delayed by */ /* Cache line 0x80 */ struct sysctl_ctx_list hpts_ctx; struct sysctl_oid *hpts_root; struct intr_event *ie; void *ie_cookie; uint16_t p_num; /* The hpts number one per cpu */ uint16_t p_cpu; /* The hpts CPU */ /* There is extra space in here */ /* Cache line 0x100 */ struct callout co __aligned(CACHE_LINE_SIZE); } __aligned(CACHE_LINE_SIZE); static struct tcp_hptsi { struct cpu_group **grps; struct tcp_hpts_entry **rp_ent; /* Array of hptss */ uint32_t *cts_last_ran; uint32_t grp_cnt; uint32_t rp_num_hptss; /* Number of hpts threads */ } tcp_pace; static MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); #ifdef RSS static int tcp_bind_threads = 1; #else static int tcp_bind_threads = 2; #endif static int tcp_use_irq_cpu = 0; static int hpts_does_tp_logging = 0; static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout); static void tcp_hpts_thread(void *ctx); int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP; static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD; static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP; static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP; SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP Hpts controls"); SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "TCP Hpts statistics"); #define timersub(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ if ((vvp)->tv_usec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_usec += 1000000; \ } \ } while (0) static int32_t tcp_hpts_precision = 120; static struct hpts_domain_info { int count; int cpu[MAXCPU]; } hpts_domains[MAXMEMDOM]; counter_u64_t hpts_hopelessly_behind; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, 
OID_AUTO, hopeless, CTLFLAG_RD, &hpts_hopelessly_behind, "Number of times hpts could not catch up and was behind hopelessly"); counter_u64_t hpts_loops; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, loops, CTLFLAG_RD, &hpts_loops, "Number of times hpts had to loop to catch up"); counter_u64_t back_tosleep; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, no_tcbsfound, CTLFLAG_RD, &back_tosleep, "Number of times hpts found no tcbs"); counter_u64_t combined_wheel_wrap; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD, &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); counter_u64_t wheel_wrap; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, wheel_wrap, CTLFLAG_RD, &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); counter_u64_t hpts_direct_call; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_call, CTLFLAG_RD, &hpts_direct_call, "Number of times hpts was called by syscall/trap or other entry"); counter_u64_t hpts_wake_timeout; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, timeout_wakeup, CTLFLAG_RD, &hpts_wake_timeout, "Number of times hpts threads woke up via the callout expiring"); counter_u64_t hpts_direct_awakening; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_awakening, CTLFLAG_RD, &hpts_direct_awakening, "Number of times hpts threads woke up via the callout expiring"); counter_u64_t hpts_back_tosleep; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, back_tosleep, CTLFLAG_RD, &hpts_back_tosleep, "Number of times hpts threads woke up via the callout expiring and went back to sleep no work"); counter_u64_t cpu_uses_flowid; counter_u64_t cpu_uses_random; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_flowid, CTLFLAG_RD, &cpu_uses_flowid, "Number of times when setting cpuid we used the flowid field"); SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_random, CTLFLAG_RD, &cpu_uses_random, "Number of times when setting cpuid we used the a random value"); TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); TUNABLE_INT("net.inet.tcp.use_irq", &tcp_use_irq_cpu); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD, &tcp_bind_threads, 2, "Thread Binding tunable"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_irq, CTLFLAG_RD, &tcp_use_irq_cpu, 0, "Use of irq CPU tunable"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW, &tcp_hpts_precision, 120, "Value for PRE() precision of callout"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW, &conn_cnt_thresh, 0, "How many connections (below) make us use the callout based mechanism"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, &hpts_does_tp_logging, 0, "Do we add to any tp that has logging on pacer logs"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW, &dynamic_min_sleep, 250, "What is the dynamic minsleep value?"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW, &dynamic_max_sleep, 5000, "What is the dynamic maxsleep value?"); static int32_t max_pacer_loops = 10; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW, &max_pacer_loops, 10, "What is the maximum number of times the pacer will loop trying to catch up"); #define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2) static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED; static int sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = hpts_sleep_max; 
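	/*
	 * The value is in wheel slots (10 usec units); per the checks below
	 * it must lie between dynamic_min_sleep / HPTS_TICKS_PER_SLOT and
	 * HPTS_MAX_SLEEP_ALLOWED (half the wheel).  For example, setting
	 * net.inet.tcp.hpts.maxsleep to 51200 would cap the pacer's sleep
	 * at 512 ms.
	 */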
error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if ((new < (dynamic_min_sleep/HPTS_TICKS_PER_SLOT)) || (new > HPTS_MAX_SLEEP_ALLOWED)) error = EINVAL; else hpts_sleep_max = new; } return (error); } static int sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = tcp_min_hptsi_time; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < LOWEST_SLEEP_ALLOWED) error = EINVAL; else tcp_min_hptsi_time = new; } return (error); } SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLTYPE_UINT | CTLFLAG_RW, &hpts_sleep_max, 0, &sysctl_net_inet_tcp_hpts_max_sleep, "IU", "Maximum time hpts will sleep in slots"); SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLTYPE_UINT | CTLFLAG_RW, &tcp_min_hptsi_time, 0, &sysctl_net_inet_tcp_hpts_min_sleep, "IU", "The minimum time the hpts must sleep before processing more slots"); static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP; static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP; static int tcp_hpts_no_wake_over_thresh = 1; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW, &ticks_indicate_more_sleep, 0, "If we only process this many or less on a timeout, we need longer sleep on the next callout"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW, &ticks_indicate_less_sleep, 0, "If we process this many or more on a timeout, we need less sleep on the next callout"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW, &tcp_hpts_no_wake_over_thresh, 0, "When we are over the threshold on the pacer do we prohibit wakeups?"); static uint16_t hpts_random_cpu(void) { uint16_t cpuid; uint32_t ran; ran = arc4random(); cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss); return (cpuid); } static void tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, int slots_to_run, int idx, int from_callout) { union tcp_log_stackspecific log; /* * Unused logs are * 64 bit - delRate, rttProp, bw_inuse * 16 bit - cwnd_gain * 8 bit - bbr_state, bbr_substate, inhpts; */ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = hpts->p_nxt_slot; log.u_bbr.flex2 = hpts->p_cur_slot; log.u_bbr.flex3 = hpts->p_prev_slot; log.u_bbr.flex4 = idx; log.u_bbr.flex5 = hpts->p_curtick; log.u_bbr.flex6 = hpts->p_on_queue_cnt; log.u_bbr.flex7 = hpts->p_cpu; log.u_bbr.flex8 = (uint8_t)from_callout; log.u_bbr.inflight = slots_to_run; log.u_bbr.applimited = hpts->overidden_sleep; log.u_bbr.delivered = hpts->saved_curtick; log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); log.u_bbr.epoch = hpts->saved_curslot; log.u_bbr.lt_epoch = hpts->saved_prev_slot; log.u_bbr.pkts_out = hpts->p_delayed_by; log.u_bbr.lost = hpts->p_hpts_sleep_time; log.u_bbr.pacing_gain = hpts->p_cpu; log.u_bbr.pkt_epoch = hpts->p_runningslot; log.u_bbr.use_lt_bw = 1; TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, BBR_LOG_HPTSDIAG, 0, 0, &log, false, tv); } static void tcp_wakehpts(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) { hpts->p_direct_wake = 0; return; } if (hpts->p_hpts_wake_scheduled == 0) { hpts->p_hpts_wake_scheduled = 1; swi_sched(hpts->ie_cookie, 0); } } static void hpts_timeout_swi(void *arg) { struct tcp_hpts_entry *hpts; hpts = (struct tcp_hpts_entry *)arg; swi_sched(hpts->ie_cookie, 0); } static void tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts) { struct inpcb 
*inp = tptoinpcb(tp); struct hptsh *hptsh; INP_WLOCK_ASSERT(inp); HPTS_MTX_ASSERT(hpts); MPASS(hpts->p_cpu == tp->t_hpts_cpu); MPASS(!(inp->inp_flags & INP_DROPPED)); hptsh = &hpts->p_hptss[tp->t_hpts_slot]; if (tp->t_in_hpts == IHPTS_NONE) { tp->t_in_hpts = IHPTS_ONQUEUE; in_pcbref(inp); } else if (tp->t_in_hpts == IHPTS_MOVING) { tp->t_in_hpts = IHPTS_ONQUEUE; } else MPASS(tp->t_in_hpts == IHPTS_ONQUEUE); tp->t_hpts_gencnt = hptsh->gencnt; TAILQ_INSERT_TAIL(&hptsh->head, tp, t_hpts); hptsh->count++; hpts->p_on_queue_cnt++; } static struct tcp_hpts_entry * tcp_hpts_lock(struct tcpcb *tp) { struct tcp_hpts_entry *hpts; INP_LOCK_ASSERT(tptoinpcb(tp)); hpts = tcp_pace.rp_ent[tp->t_hpts_cpu]; HPTS_LOCK(hpts); return (hpts); } static void tcp_hpts_release(struct tcpcb *tp) { bool released __diagused; tp->t_in_hpts = IHPTS_NONE; released = in_pcbrele_wlocked(tptoinpcb(tp)); MPASS(released == false); } /* * Initialize tcpcb to get ready for use with HPTS. We will know which CPU * is preferred on the first incoming packet. Before that avoid crowding * a single CPU with newborn connections and use a random one. * This initialization is normally called on a newborn tcpcb, but potentially * can be called once again if stack is switched. In that case we inherit CPU * that the previous stack has set, be it random or not. In extreme cases, * e.g. syzkaller fuzzing, a tcpcb can already be in HPTS in IHPTS_MOVING state * and has never received a first packet. */ void tcp_hpts_init(struct tcpcb *tp) { if (__predict_true(tp->t_hpts_cpu == HPTS_CPU_NONE)) { tp->t_hpts_cpu = hpts_random_cpu(); MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET)); } } /* * Called normally with the INP_LOCKED but it * does not matter, the hpts lock is the key * but the lock order allows us to hold the * INP lock and then get the hpts lock. */ void tcp_hpts_remove(struct tcpcb *tp) { struct tcp_hpts_entry *hpts; struct hptsh *hptsh; INP_WLOCK_ASSERT(tptoinpcb(tp)); hpts = tcp_hpts_lock(tp); if (tp->t_in_hpts == IHPTS_ONQUEUE) { hptsh = &hpts->p_hptss[tp->t_hpts_slot]; tp->t_hpts_request = 0; if (__predict_true(tp->t_hpts_gencnt == hptsh->gencnt)) { TAILQ_REMOVE(&hptsh->head, tp, t_hpts); MPASS(hptsh->count > 0); hptsh->count--; MPASS(hpts->p_on_queue_cnt > 0); hpts->p_on_queue_cnt--; tcp_hpts_release(tp); } else { /* * tcp_hptsi() now owns the TAILQ head of this inp. * Can't TAILQ_REMOVE, just mark it. */ #ifdef INVARIANTS struct tcpcb *tmp; TAILQ_FOREACH(tmp, &hptsh->head, t_hpts) MPASS(tmp != tp); #endif tp->t_in_hpts = IHPTS_MOVING; tp->t_hpts_slot = -1; } } else if (tp->t_in_hpts == IHPTS_MOVING) { /* * Handle a special race condition: * tcp_hptsi() moves inpcb to detached tailq * tcp_hpts_remove() marks as IHPTS_MOVING, slot = -1 * tcp_hpts_insert() sets slot to a meaningful value * tcp_hpts_remove() again (we are here!), then in_pcbdrop() * tcp_hptsi() finds pcb with meaningful slot and INP_DROPPED */ tp->t_hpts_slot = -1; } HPTS_UNLOCK(hpts); } static inline int hpts_slot(uint32_t wheel_slot, uint32_t plus) { /* * Given a slot on the wheel, what slot * is that plus ticks out? */ KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot)); return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS); } static inline int tick_to_wheel(uint32_t cts_in_wticks) { /* * Given a timestamp in ticks (so by * default to get it to a real time one * would multiply by 10.. i.e the number * of ticks in a slot) map it to our limited * space wheel. 
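 *
 * For example, a timestamp of 205000 wheel ticks maps to slot
 * 205000 % 102400 = 200: the wheel has wrapped twice and we are
 * 2 ms (200 slots x 10 usec) into the current revolution.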
*/ return (cts_in_wticks % NUM_OF_HPTSI_SLOTS); } static inline int hpts_slots_diff(int prev_slot, int slot_now) { /* * Given two slots that are someplace * on our wheel. How far are they apart? */ if (slot_now > prev_slot) return (slot_now - prev_slot); else if (slot_now == prev_slot) /* * Special case, same means we can go all of our * wheel less one slot. */ return (NUM_OF_HPTSI_SLOTS - 1); else return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now); } /* * Given a slot on the wheel that is the current time * mapped to the wheel (wheel_slot), what is the maximum * distance forward that can be obtained without * wrapping past either prev_slot or running_slot * depending on the htps state? Also if passed * a uint32_t *, fill it with the slot location. * * Note if you do not give this function the current * time (that you think it is) mapped to the wheel slot * then the results will not be what you expect and * could lead to invalid inserts. */ static inline int32_t max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot) { uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel; if ((hpts->p_hpts_active == 1) && (hpts->p_wheel_complete == 0)) { end_slot = hpts->p_runningslot; /* Back up one tick */ if (end_slot == 0) end_slot = NUM_OF_HPTSI_SLOTS - 1; else end_slot--; if (target_slot) *target_slot = end_slot; } else { /* * For the case where we are * not active, or we have * completed the pass over * the wheel, we can use the * prev tick and subtract one from it. This puts us * as far out as possible on the wheel. */ end_slot = hpts->p_prev_slot; if (end_slot == 0) end_slot = NUM_OF_HPTSI_SLOTS - 1; else end_slot--; if (target_slot) *target_slot = end_slot; /* * Now we have close to the full wheel left minus the * time it has been since the pacer went to sleep. Note * that wheel_tick, passed in, should be the current time * from the perspective of the caller, mapped to the wheel. */ if (hpts->p_prev_slot != wheel_slot) dis_to_travel = hpts_slots_diff(hpts->p_prev_slot, wheel_slot); else dis_to_travel = 1; /* * dis_to_travel in this case is the space from when the * pacer stopped (p_prev_slot) and where our wheel_slot * is now. To know how many slots we can put it in we * subtract from the wheel size. We would not want * to place something after p_prev_slot or it will * get ran too soon. */ return (NUM_OF_HPTSI_SLOTS - dis_to_travel); } /* * So how many slots are open between p_runningslot -> p_cur_slot * that is what is currently un-available for insertion. Special * case when we are at the last slot, this gets 1, so that * the answer to how many slots are available is all but 1. */ if (hpts->p_runningslot == hpts->p_cur_slot) dis_to_travel = 1; else dis_to_travel = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot); /* * How long has the pacer been running? */ if (hpts->p_cur_slot != wheel_slot) { /* The pacer is a bit late */ pacer_to_now = hpts_slots_diff(hpts->p_cur_slot, wheel_slot); } else { /* The pacer is right on time, now == pacers start time */ pacer_to_now = 0; } /* * To get the number left we can insert into we simply * subtract the distance the pacer has to run from how * many slots there are. */ avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel; /* * Now how many of those we will eat due to the pacer's * time (p_cur_slot) of start being behind the * real time (wheel_slot)? */ if (avail_on_wheel <= pacer_to_now) { /* * Wheel wrap, we can't fit on the wheel, that * is unusual the system must be way overloaded! 
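 * (Concretely: if the pacer still has 100000 slots of its arc left
 * to run, only 2400 slots lie outside that arc; if real time has
 * already advanced 3000 slots past the arc's end, none of those 2400
 * can safely be used.)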
* Insert into the assured slot, and return special * "0". */ counter_u64_add(combined_wheel_wrap, 1); *target_slot = hpts->p_nxt_slot; return (0); } else { /* * We know how many slots are open * on the wheel (the reverse of what * is left to run. Take away the time * the pacer started to now (wheel_slot) * and that tells you how many slots are * open that can be inserted into that won't * be touched by the pacer until later. */ return (avail_on_wheel - pacer_to_now); } } #ifdef INVARIANTS static void check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp, uint32_t hptsslot, int line) { /* * Sanity checks for the pacer with invariants * on insert. */ KASSERT(hptsslot < NUM_OF_HPTSI_SLOTS, ("hpts:%p tp:%p slot:%d > max", hpts, tp, hptsslot)); if ((hpts->p_hpts_active) && (hpts->p_wheel_complete == 0)) { /* * If the pacer is processing a arc * of the wheel, we need to make * sure we are not inserting within * that arc. */ int distance, yet_to_run; distance = hpts_slots_diff(hpts->p_runningslot, hptsslot); if (hpts->p_runningslot != hpts->p_cur_slot) yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot); else yet_to_run = 0; /* processing last slot */ KASSERT(yet_to_run <= distance, ("hpts:%p tp:%p slot:%d " "distance:%d yet_to_run:%d rs:%d cs:%d", hpts, tp, hptsslot, distance, yet_to_run, hpts->p_runningslot, hpts->p_cur_slot)); } } #endif uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag) { struct tcp_hpts_entry *hpts; struct timeval tv; uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0; int32_t wheel_slot, maxslots; bool need_wakeup = false; INP_WLOCK_ASSERT(tptoinpcb(tp)); MPASS(!(tptoinpcb(tp)->inp_flags & INP_DROPPED)); MPASS(!(tp->t_in_hpts == IHPTS_ONQUEUE)); /* * We now return the next-slot the hpts will be on, beyond its * current run (if up) or where it was when it stopped if it is * sleeping. */ hpts = tcp_hpts_lock(tp); microuptime(&tv); if (diag) { memset(diag, 0, sizeof(struct hpts_diag)); diag->p_hpts_active = hpts->p_hpts_active; diag->p_prev_slot = hpts->p_prev_slot; diag->p_runningslot = hpts->p_runningslot; diag->p_nxt_slot = hpts->p_nxt_slot; diag->p_cur_slot = hpts->p_cur_slot; diag->p_curtick = hpts->p_curtick; diag->p_lasttick = hpts->p_lasttick; diag->slot_req = slot; diag->p_on_min_sleep = hpts->p_on_min_sleep; diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } if (slot == 0) { /* Ok we need to set it on the hpts in the current slot */ tp->t_hpts_request = 0; if ((hpts->p_hpts_active == 0) || (hpts->p_wheel_complete)) { /* * A sleeping hpts we want in next slot to run * note that in this state p_prev_slot == p_cur_slot */ tp->t_hpts_slot = hpts_slot(hpts->p_prev_slot, 1); if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0)) need_wakeup = true; } else tp->t_hpts_slot = hpts->p_runningslot; if (__predict_true(tp->t_in_hpts != IHPTS_MOVING)) tcp_hpts_insert_internal(tp, hpts); if (need_wakeup) { /* * Activate the hpts if it is sleeping and its * timeout is not 1. */ hpts->p_direct_wake = 1; tcp_wakehpts(hpts); } slot_on = hpts->p_nxt_slot; HPTS_UNLOCK(hpts); return (slot_on); } /* Get the current time relative to the wheel */ wheel_cts = tcp_tv_to_hptstick(&tv); /* Map it onto the wheel */ wheel_slot = tick_to_wheel(wheel_cts); /* Now what's the max we can place it at? 
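 *
 * For illustration: if max_slots_available() reports 60000 open slots
 * and the caller asked for 70000, the entry is parked at last_slot and
 * t_hpts_request keeps the 10000 slots that did not fit; if 60000 or
 * fewer were requested it simply lands at wheel_slot + slot.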
*/ maxslots = max_slots_available(hpts, wheel_slot, &last_slot); if (diag) { diag->wheel_slot = wheel_slot; diag->maxslots = maxslots; diag->wheel_cts = wheel_cts; } if (maxslots == 0) { /* The pacer is in a wheel wrap behind, yikes! */ if (slot > 1) { /* * Reduce by 1 to prevent a forever loop in * case something else is wrong. Note this * probably does not hurt because the pacer * if its true is so far behind we will be * > 1second late calling anyway. */ slot--; } tp->t_hpts_slot = last_slot; tp->t_hpts_request = slot; } else if (maxslots >= slot) { /* It all fits on the wheel */ tp->t_hpts_request = 0; tp->t_hpts_slot = hpts_slot(wheel_slot, slot); } else { /* It does not fit */ tp->t_hpts_request = slot - maxslots; tp->t_hpts_slot = last_slot; } if (diag) { diag->slot_remaining = tp->t_hpts_request; diag->inp_hptsslot = tp->t_hpts_slot; } #ifdef INVARIANTS check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line); #endif if (__predict_true(tp->t_in_hpts != IHPTS_MOVING)) tcp_hpts_insert_internal(tp, hpts); if ((hpts->p_hpts_active == 0) && (tp->t_hpts_request == 0) && (hpts->p_on_min_sleep == 0)) { /* * The hpts is sleeping and NOT on a minimum * sleep time, we need to figure out where * it will wake up at and if we need to reschedule * its time-out. */ uint32_t have_slept, yet_to_sleep; /* Now do we need to restart the hpts's timer? */ have_slept = hpts_slots_diff(hpts->p_prev_slot, wheel_slot); if (have_slept < hpts->p_hpts_sleep_time) yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; else { /* We are over-due */ yet_to_sleep = 0; need_wakeup = 1; } if (diag) { diag->have_slept = have_slept; diag->yet_to_sleep = yet_to_sleep; } if (yet_to_sleep && (yet_to_sleep > slot)) { /* * We need to reschedule the hpts's time-out. */ hpts->p_hpts_sleep_time = slot; need_new_to = slot * HPTS_TICKS_PER_SLOT; } } /* * Now how far is the hpts sleeping to? if active is 1, its * up and ticking we do nothing, otherwise we may need to * reschedule its callout if need_new_to is set from above. */ if (need_wakeup) { hpts->p_direct_wake = 1; tcp_wakehpts(hpts); if (diag) { diag->need_new_to = 0; diag->co_ret = 0xffff0000; } } else if (need_new_to) { int32_t co_ret; struct timeval tv; sbintime_t sb; tv.tv_sec = 0; tv.tv_usec = 0; while (need_new_to > HPTS_USEC_IN_SEC) { tv.tv_sec++; need_new_to -= HPTS_USEC_IN_SEC; } tv.tv_usec = need_new_to; sb = tvtosbt(tv); co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); if (diag) { diag->need_new_to = need_new_to; diag->co_ret = co_ret; } } slot_on = hpts->p_nxt_slot; HPTS_UNLOCK(hpts); return (slot_on); } static uint16_t hpts_cpuid(struct tcpcb *tp, int *failed) { struct inpcb *inp = tptoinpcb(tp); u_int cpuid; #ifdef NUMA struct hpts_domain_info *di; #endif *failed = 0; if (tp->t_flags2 & TF2_HPTS_CPU_SET) { return (tp->t_hpts_cpu); } /* * If we are using the irq cpu set by LRO or * the driver then it overrides all other domains. */ if (tcp_use_irq_cpu) { if (tp->t_lro_cpu == HPTS_CPU_NONE) { *failed = 1; return (0); } return (tp->t_lro_cpu); } /* If one is set the other must be the same */ #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) return (hpts_random_cpu()); else return (cpuid); #endif /* * We don't have a flowid -> cpuid mapping, so cheat and just map * unknown cpuids to curcpu. Not the best, but apparently better * than defaulting to swi 0. 
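 * (The code below actually falls back to hpts_random_cpu() in that
 * case.  When a flowid is present, a hypothetical flowid of 0x1234 on
 * a 16-CPU NUMA domain hashes to di->cpu[0x1234 % 16], i.e. the fifth
 * CPU listed for that domain.)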
*/ if (inp->inp_flowtype == M_HASHTYPE_NONE) { counter_u64_add(cpu_uses_random, 1); return (hpts_random_cpu()); } /* * Hash to a thread based on the flowid. If we are using numa, * then restrict the hash to the numa domain where the inp lives. */ #ifdef NUMA if ((vm_ndomains == 1) || (inp->inp_numa_domain == M_NODOM)) { #endif cpuid = inp->inp_flowid % mp_ncpus; #ifdef NUMA } else { /* Hash into the cpu's that use that domain */ di = &hpts_domains[inp->inp_numa_domain]; cpuid = di->cpu[inp->inp_flowid % di->count]; } #endif counter_u64_add(cpu_uses_flowid, 1); return (cpuid); } static void tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt) { uint32_t t = 0, i; if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) { /* * Find next slot that is occupied and use that to * be the sleep time. */ for (i = 0, t = hpts_slot(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) { if (TAILQ_EMPTY(&hpts->p_hptss[t].head) == 0) { break; } t = (t + 1) % NUM_OF_HPTSI_SLOTS; } KASSERT((i != NUM_OF_HPTSI_SLOTS), ("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt)); hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max); } else { /* No one on the wheel sleep for all but 400 slots or sleep max */ hpts->p_hpts_sleep_time = hpts_sleep_max; } } static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout) { struct tcpcb *tp; struct timeval tv; int32_t slots_to_run, i, error; int32_t loop_cnt = 0; int32_t did_prefetch = 0; int32_t prefetch_tp = 0; int32_t wrap_loop_cnt = 0; int32_t slot_pos_of_endpoint = 0; int32_t orig_exit_slot; int8_t completed_measure = 0, seen_endpoint = 0; HPTS_MTX_ASSERT(hpts); NET_EPOCH_ASSERT(); /* record previous info for any logging */ hpts->saved_lasttick = hpts->p_lasttick; hpts->saved_curtick = hpts->p_curtick; hpts->saved_curslot = hpts->p_cur_slot; hpts->saved_prev_slot = hpts->p_prev_slot; hpts->p_lasttick = hpts->p_curtick; hpts->p_curtick = tcp_gethptstick(&tv); tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv); orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); if ((hpts->p_on_queue_cnt == 0) || (hpts->p_lasttick == hpts->p_curtick)) { /* * No time has yet passed, * or nothing to do. */ hpts->p_prev_slot = hpts->p_cur_slot; hpts->p_lasttick = hpts->p_curtick; goto no_run; } again: hpts->p_wheel_complete = 0; HPTS_MTX_ASSERT(hpts); slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot); if (((hpts->p_curtick - hpts->p_lasttick) > ((NUM_OF_HPTSI_SLOTS-1) * HPTS_TICKS_PER_SLOT)) && (hpts->p_on_queue_cnt != 0)) { /* * Wheel wrap is occuring, basically we * are behind and the distance between * run's has spread so much it has exceeded * the time on the wheel (1.024 seconds). This * is ugly and should NOT be happening. We * need to run the entire wheel. We last processed * p_prev_slot, so that needs to be the last slot * we run. The next slot after that should be our * reserved first slot for new, and then starts * the running position. Now the problem is the * reserved "not to yet" place does not exist * and there may be inp's in there that need * running. We can merge those into the * first slot at the head. 
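 *
 * As a sketch with concrete numbers: if we last stopped at p_prev_slot
 * 500, then p_nxt_slot becomes 501 (the reserved slot), p_runningslot
 * becomes 502, anything already queued on 501 is re-tagged and
 * concatenated onto 502 below, and the full 102399 slot arc is run.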
*/ wrap_loop_cnt++; hpts->p_nxt_slot = hpts_slot(hpts->p_prev_slot, 1); hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 2); /* * Adjust p_cur_slot to be where we are starting from * hopefully we will catch up (fat chance if something * is broken this bad :( ) */ hpts->p_cur_slot = hpts->p_prev_slot; /* * The next slot has guys to run too, and that would * be where we would normally start, lets move them into * the next slot (p_prev_slot + 2) so that we will * run them, the extra 10usecs of late (by being * put behind) does not really matter in this situation. */ TAILQ_FOREACH(tp, &hpts->p_hptss[hpts->p_nxt_slot].head, t_hpts) { MPASS(tp->t_hpts_slot == hpts->p_nxt_slot); MPASS(tp->t_hpts_gencnt == hpts->p_hptss[hpts->p_nxt_slot].gencnt); MPASS(tp->t_in_hpts == IHPTS_ONQUEUE); /* * Update gencnt and nextslot accordingly to match * the new location. This is safe since it takes both * the INP lock and the pacer mutex to change the * t_hptsslot and t_hpts_gencnt. */ tp->t_hpts_gencnt = hpts->p_hptss[hpts->p_runningslot].gencnt; tp->t_hpts_slot = hpts->p_runningslot; } TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot].head, &hpts->p_hptss[hpts->p_nxt_slot].head, t_hpts); hpts->p_hptss[hpts->p_runningslot].count += hpts->p_hptss[hpts->p_nxt_slot].count; hpts->p_hptss[hpts->p_nxt_slot].count = 0; hpts->p_hptss[hpts->p_nxt_slot].gencnt++; slots_to_run = NUM_OF_HPTSI_SLOTS - 1; counter_u64_add(wheel_wrap, 1); } else { /* * Nxt slot is always one after p_runningslot though * its not used usually unless we are doing wheel wrap. */ hpts->p_nxt_slot = hpts->p_prev_slot; hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 1); } if (hpts->p_on_queue_cnt == 0) { goto no_one; } for (i = 0; i < slots_to_run; i++) { struct tcpcb *tp, *ntp; TAILQ_HEAD(, tcpcb) head = TAILQ_HEAD_INITIALIZER(head); struct hptsh *hptsh; uint32_t runningslot; /* * Calculate our delay, if there are no extra ticks there * was not any (i.e. if slots_to_run == 1, no delay). */ hpts->p_delayed_by = (slots_to_run - (i + 1)) * HPTS_TICKS_PER_SLOT; runningslot = hpts->p_runningslot; hptsh = &hpts->p_hptss[runningslot]; TAILQ_SWAP(&head, &hptsh->head, tcpcb, t_hpts); hpts->p_on_queue_cnt -= hptsh->count; hptsh->count = 0; hptsh->gencnt++; HPTS_UNLOCK(hpts); TAILQ_FOREACH_SAFE(tp, &head, t_hpts, ntp) { struct inpcb *inp = tptoinpcb(tp); bool set_cpu; if (ntp != NULL) { /* * If we have a next tcpcb, see if we can * prefetch it. Note this may seem * "risky" since we have no locks (other * than the previous inp) and there no * assurance that ntp was not pulled while * we were processing tp and freed. If this * occurred it could mean that either: * * a) Its NULL (which is fine we won't go * here) b) Its valid (which is cool we * will prefetch it) c) The inp got * freed back to the slab which was * reallocated. Then the piece of memory was * re-used and something else (not an * address) is in inp_ppcb. If that occurs * we don't crash, but take a TLB shootdown * performance hit (same as if it was NULL * and we tried to pre-fetch it). * * Considering that the likelyhood of is * quite rare we will take a risk on doing * this. If performance drops after testing * we can always take this out. NB: the * kern_prefetch on amd64 actually has * protection against a bad address now via * the DMAP_() tests. This will prevent the * TLB hit, and instead if occurs just * cause us to load cache with a useless * address (to us). * * XXXGL: this comment and the prefetch action * could be outdated after tp == inp change. 
*/ kern_prefetch(ntp, &prefetch_tp); prefetch_tp = 1; } /* For debugging */ if (seen_endpoint == 0) { seen_endpoint = 1; orig_exit_slot = slot_pos_of_endpoint = runningslot; } else if (completed_measure == 0) { /* Record the new position */ orig_exit_slot = runningslot; } INP_WLOCK(inp); if ((tp->t_flags2 & TF2_HPTS_CPU_SET) == 0) { set_cpu = true; } else { set_cpu = false; } if (__predict_false(tp->t_in_hpts == IHPTS_MOVING)) { if (tp->t_hpts_slot == -1) { tp->t_in_hpts = IHPTS_NONE; if (in_pcbrele_wlocked(inp) == false) INP_WUNLOCK(inp); } else { HPTS_LOCK(hpts); tcp_hpts_insert_internal(tp, hpts); HPTS_UNLOCK(hpts); INP_WUNLOCK(inp); } continue; } MPASS(tp->t_in_hpts == IHPTS_ONQUEUE); MPASS(!(inp->inp_flags & INP_DROPPED)); KASSERT(runningslot == tp->t_hpts_slot, ("Hpts:%p inp:%p slot mis-aligned %u vs %u", hpts, inp, runningslot, tp->t_hpts_slot)); if (tp->t_hpts_request) { /* * This guy is deferred out further in time * then our wheel had available on it. * Push him back on the wheel or run it * depending. */ uint32_t maxslots, last_slot, remaining_slots; remaining_slots = slots_to_run - (i + 1); if (tp->t_hpts_request > remaining_slots) { HPTS_LOCK(hpts); /* * How far out can we go? */ maxslots = max_slots_available(hpts, hpts->p_cur_slot, &last_slot); if (maxslots >= tp->t_hpts_request) { /* We can place it finally to * be processed. */ tp->t_hpts_slot = hpts_slot( hpts->p_runningslot, tp->t_hpts_request); tp->t_hpts_request = 0; } else { /* Work off some more time */ tp->t_hpts_slot = last_slot; tp->t_hpts_request -= maxslots; } tcp_hpts_insert_internal(tp, hpts); HPTS_UNLOCK(hpts); INP_WUNLOCK(inp); continue; } tp->t_hpts_request = 0; /* Fall through we will so do it now */ } tcp_hpts_release(tp); if (set_cpu) { /* * Setup so the next time we will move to * the right CPU. This should be a rare * event. It will sometimes happens when we * are the client side (usually not the * server). Somehow tcp_output() gets called * before the tcp_do_segment() sets the * intial state. This means the r_cpu and * r_hpts_cpu is 0. We get on the hpts, and * then tcp_input() gets called setting up * the r_cpu to the correct value. The hpts * goes off and sees the mis-match. We * simply correct it here and the CPU will * switch to the new hpts nextime the tcb * gets added to the hpts (not this one) * :-) */ tcp_set_hpts(tp); } CURVNET_SET(inp->inp_vnet); /* Lets do any logging that we might want to */ if (hpts_does_tp_logging && tcp_bblogging_on(tp)) { tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout); } if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } /* * We set TF2_HPTS_CALLS before any possible output. * The contract with the transport is that if it cares * about hpts calling it should clear the flag. That * way next time it is called it will know it is hpts. * * We also only call tfb_do_queued_segments() * tcp_output(). It is expected that if segments are * queued and come in that the final input mbuf will * cause a call to output if it is needed so we do * not need a second call to tcp_output(). So we do * one or the other but not both. */ tp->t_flags2 |= TF2_HPTS_CALLS; if ((tp->t_flags2 & TF2_SUPPORTS_MBUFQ) && !STAILQ_EMPTY(&tp->t_inqueue)) { error = (*tp->t_fb->tfb_do_queued_segments)(tp, 0); /* * A non-zero return for input queue processing * is the lock is released and most likely the * inp is gone. 
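 *
 * This mirrors the tfb_do_segment_nounlock() contract described in the
 * header comment; a stack's tfb_do_segment() wrapper ends up roughly as
 * (hypothetical sketch, arguments elided):
 *
 *	if (tp->t_fb->tfb_do_segment_nounlock(...) == 1)
 *		return;
 *	INP_WUNLOCK(tptoinpcb(tp));
 *
 * where a return of 1 means the TCB was destroyed and the lock was
 * already released.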
*/ if (error) goto skip_pacing; } else error = tcp_output(tp); if (error < 0) goto skip_pacing; INP_WUNLOCK(inp); skip_pacing: CURVNET_RESTORE(); } if (seen_endpoint) { /* * We now have a accurate distance between * slot_pos_of_endpoint <-> orig_exit_slot * to tell us how late we were, orig_exit_slot * is where we calculated the end of our cycle to * be when we first entered. */ completed_measure = 1; } HPTS_LOCK(hpts); hpts->p_runningslot++; if (hpts->p_runningslot >= NUM_OF_HPTSI_SLOTS) { hpts->p_runningslot = 0; } } no_one: HPTS_MTX_ASSERT(hpts); hpts->p_delayed_by = 0; /* * Check to see if we took an excess amount of time and need to run * more ticks (if we did not hit eno-bufs). */ hpts->p_prev_slot = hpts->p_cur_slot; hpts->p_lasttick = hpts->p_curtick; if ((from_callout == 0) || (loop_cnt > max_pacer_loops)) { /* * Something is serious slow we have * looped through processing the wheel * and by the time we cleared the * needs to run max_pacer_loops time * we still needed to run. That means * the system is hopelessly behind and * can never catch up :( * * We will just lie to this thread * and let it thing p_curtick is * correct. When it next awakens * it will find itself further behind. */ if (from_callout) counter_u64_add(hpts_hopelessly_behind, 1); goto no_run; } hpts->p_curtick = tcp_gethptstick(&tv); hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); if (seen_endpoint == 0) { /* We saw no endpoint but we may be looping */ orig_exit_slot = hpts->p_cur_slot; } if ((wrap_loop_cnt < 2) && (hpts->p_lasttick != hpts->p_curtick)) { counter_u64_add(hpts_loops, 1); loop_cnt++; goto again; } no_run: tcp_pace.cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv); /* * Set flag to tell that we are done for * any slot input that happens during * input. */ hpts->p_wheel_complete = 1; /* * Now did we spend too long running input and need to run more ticks? * Note that if wrap_loop_cnt < 2 then we should have the conditions * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt * is greater than 2, then the condtion most likely are *not* true. * Also if we are called not from the callout, we don't run the wheel * multiple times so the slots may not align either. 
*/ KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) || (wrap_loop_cnt >= 2) || (from_callout == 0)), ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, hpts->p_prev_slot, hpts->p_cur_slot)); KASSERT(((hpts->p_lasttick == hpts->p_curtick) || (wrap_loop_cnt >= 2) || (from_callout == 0)), ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, hpts->p_lasttick, hpts->p_curtick)); if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) { hpts->p_curtick = tcp_gethptstick(&tv); counter_u64_add(hpts_loops, 1); hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); goto again; } if (from_callout){ tcp_hpts_set_max_sleep(hpts, wrap_loop_cnt); } if (seen_endpoint) return(hpts_slots_diff(slot_pos_of_endpoint, orig_exit_slot)); else return (0); } void __tcp_set_hpts(struct tcpcb *tp, int32_t line) { struct tcp_hpts_entry *hpts; int failed; INP_WLOCK_ASSERT(tptoinpcb(tp)); hpts = tcp_hpts_lock(tp); if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) { tp->t_hpts_cpu = hpts_cpuid(tp, &failed); if (failed == 0) tp->t_flags2 |= TF2_HPTS_CPU_SET; } mtx_unlock(&hpts->p_mtx); } static struct tcp_hpts_entry * tcp_choose_hpts_to_run(void) { int i, oldest_idx, start, end; uint32_t cts, time_since_ran, calc; cts = tcp_get_usecs(NULL); time_since_ran = 0; /* Default is all one group */ start = 0; end = tcp_pace.rp_num_hptss; /* * If we have more than one L3 group figure out which one * this CPU is in. */ if (tcp_pace.grp_cnt > 1) { for (i = 0; i < tcp_pace.grp_cnt; i++) { if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) { start = tcp_pace.grps[i]->cg_first; end = (tcp_pace.grps[i]->cg_last + 1); break; } } } oldest_idx = -1; for (i = start; i < end; i++) { if (TSTMP_GT(cts, tcp_pace.cts_last_ran[i])) calc = cts - tcp_pace.cts_last_ran[i]; else calc = 0; if (calc > time_since_ran) { oldest_idx = i; time_since_ran = calc; } } if (oldest_idx >= 0) return(tcp_pace.rp_ent[oldest_idx]); else return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]); } static void __tcp_run_hpts(void) { struct epoch_tracker et; struct tcp_hpts_entry *hpts; int ticks_ran; hpts = tcp_choose_hpts_to_run(); if (hpts->p_hpts_active) { /* Already active */ return; } if (mtx_trylock(&hpts->p_mtx) == 0) { /* Someone else got the lock */ return; } NET_EPOCH_ENTER(et); if (hpts->p_hpts_active) goto out_with_mtx; hpts->syscall_cnt++; counter_u64_add(hpts_direct_call, 1); hpts->p_hpts_active = 1; ticks_ran = tcp_hptsi(hpts, 0); /* We may want to adjust the sleep values here */ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { if (ticks_ran > ticks_indicate_less_sleep) { struct timeval tv; sbintime_t sb; hpts->p_mysleep.tv_usec /= 2; if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) hpts->p_mysleep.tv_usec = dynamic_min_sleep; /* Reschedule with new to value */ tcp_hpts_set_max_sleep(hpts, 0); tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT; /* Validate its in the right ranges */ if (tv.tv_usec < hpts->p_mysleep.tv_usec) { hpts->overidden_sleep = tv.tv_usec; tv.tv_usec = hpts->p_mysleep.tv_usec; } else if (tv.tv_usec > dynamic_max_sleep) { /* Lets not let sleep get above this value */ hpts->overidden_sleep = tv.tv_usec; tv.tv_usec = dynamic_max_sleep; } /* * In this mode the timer is a backstop to * all the userret/lro_flushes so we use * the dynamic value and set the on_min_sleep * flag so we will not be awoken. 
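 *
 * For example, with the defaults suggested by the dyn_minsleep and
 * dyn_maxsleep sysctls above, a consistently busy pacer halves
 * p_mysleep 2000 -> 1000 -> 500 -> 250 usec and then stays clamped at
 * the minimum, while a mostly idle one doubles back up until it hits
 * the 5000 usec ceiling.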
*/ sb = tvtosbt(tv); /* Store off to make visible the actual sleep time */ hpts->sleeping = tv.tv_usec; callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); } else if (ticks_ran < ticks_indicate_more_sleep) { /* For the further sleep, don't reschedule hpts */ hpts->p_mysleep.tv_usec *= 2; if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) hpts->p_mysleep.tv_usec = dynamic_max_sleep; } hpts->p_on_min_sleep = 1; } hpts->p_hpts_active = 0; out_with_mtx: HPTS_MTX_ASSERT(hpts); mtx_unlock(&hpts->p_mtx); NET_EPOCH_EXIT(et); } static void tcp_hpts_thread(void *ctx) { struct tcp_hpts_entry *hpts; struct epoch_tracker et; struct timeval tv; sbintime_t sb; int ticks_ran; hpts = (struct tcp_hpts_entry *)ctx; mtx_lock(&hpts->p_mtx); if (hpts->p_direct_wake) { /* Signaled by input or output with low occupancy count. */ callout_stop(&hpts->co); counter_u64_add(hpts_direct_awakening, 1); } else { /* Timed out, the normal case. */ counter_u64_add(hpts_wake_timeout, 1); if (callout_pending(&hpts->co) || !callout_active(&hpts->co)) { mtx_unlock(&hpts->p_mtx); return; } } callout_deactivate(&hpts->co); hpts->p_hpts_wake_scheduled = 0; NET_EPOCH_ENTER(et); if (hpts->p_hpts_active) { /* * We are active already. This means that a syscall * trap or LRO is running in behalf of hpts. In that case * we need to double our timeout since there seems to be * enough activity in the system that we don't need to * run as often (if we were not directly woken). */ if (hpts->p_direct_wake == 0) { counter_u64_add(hpts_back_tosleep, 1); if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { hpts->p_mysleep.tv_usec *= 2; if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) hpts->p_mysleep.tv_usec = dynamic_max_sleep; tv.tv_usec = hpts->p_mysleep.tv_usec; hpts->p_on_min_sleep = 1; } else { /* * Here we have low count on the wheel, but * somehow we still collided with one of the * connections. Lets go back to sleep for a * min sleep time, but clear the flag so we * can be awoken by insert. */ hpts->p_on_min_sleep = 0; tv.tv_usec = tcp_min_hptsi_time; } } else { /* * Directly woken most likely to reset the * callout time. */ tv.tv_sec = 0; tv.tv_usec = hpts->p_mysleep.tv_usec; } goto back_to_sleep; } hpts->sleeping = 0; hpts->p_hpts_active = 1; ticks_ran = tcp_hptsi(hpts, 1); tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT; + if ((hpts->p_on_queue_cnt > conn_cnt_thresh) && (hpts->hit_callout_thresh == 0)) { + hpts->hit_callout_thresh = 1; + atomic_add_int(&hpts_that_need_softclock, 1); + } else if ((hpts->p_on_queue_cnt <= conn_cnt_thresh) && (hpts->hit_callout_thresh == 1)) { + hpts->hit_callout_thresh = 0; + atomic_subtract_int(&hpts_that_need_softclock, 1); + } if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { if(hpts->p_direct_wake == 0) { /* * Only adjust sleep time if we were * called from the callout i.e. direct_wake == 0. 
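 *
 * (Relatedly, the hit_callout_thresh accounting above is what keeps the
 * new hpts_that_need_softclock counter, declared in subr_trap.c, in
 * step with how many pacers are over conn_cnt_thresh.  The intent,
 * presumably, is that the userret/softclock path can bail out early
 * while the counter is zero, e.g. something along the lines of
 *
 *	if (atomic_load_int(&hpts_that_need_softclock) == 0)
 *		return;
 *
 * in the softclock entry point; that consumer is not part of the hunk
 * shown here.)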
*/ if (ticks_ran < ticks_indicate_more_sleep) { hpts->p_mysleep.tv_usec *= 2; if (hpts->p_mysleep.tv_usec > dynamic_max_sleep) hpts->p_mysleep.tv_usec = dynamic_max_sleep; } else if (ticks_ran > ticks_indicate_less_sleep) { hpts->p_mysleep.tv_usec /= 2; if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) hpts->p_mysleep.tv_usec = dynamic_min_sleep; } } if (tv.tv_usec < hpts->p_mysleep.tv_usec) { hpts->overidden_sleep = tv.tv_usec; tv.tv_usec = hpts->p_mysleep.tv_usec; } else if (tv.tv_usec > dynamic_max_sleep) { /* Lets not let sleep get above this value */ hpts->overidden_sleep = tv.tv_usec; tv.tv_usec = dynamic_max_sleep; } /* * In this mode the timer is a backstop to * all the userret/lro_flushes so we use * the dynamic value and set the on_min_sleep * flag so we will not be awoken. */ hpts->p_on_min_sleep = 1; } else if (hpts->p_on_queue_cnt == 0) { /* * No one on the wheel, please wake us up * if you insert on the wheel. */ hpts->p_on_min_sleep = 0; hpts->overidden_sleep = 0; } else { /* * We hit here when we have a low number of * clients on the wheel (our else clause). * We may need to go on min sleep, if we set * the flag we will not be awoken if someone * is inserted ahead of us. Clearing the flag * means we can be awoken. This is "old mode" * where the timer is what runs hpts mainly. */ if (tv.tv_usec < tcp_min_hptsi_time) { /* * Yes on min sleep, which means * we cannot be awoken. */ hpts->overidden_sleep = tv.tv_usec; tv.tv_usec = tcp_min_hptsi_time; hpts->p_on_min_sleep = 1; } else { /* Clear the min sleep flag */ hpts->overidden_sleep = 0; hpts->p_on_min_sleep = 0; } } HPTS_MTX_ASSERT(hpts); hpts->p_hpts_active = 0; back_to_sleep: hpts->p_direct_wake = 0; sb = tvtosbt(tv); /* Store off to make visible the actual sleep time */ hpts->sleeping = tv.tv_usec; callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); NET_EPOCH_EXIT(et); mtx_unlock(&hpts->p_mtx); } #undef timersub static int32_t hpts_count_level(struct cpu_group *cg) { int32_t count_l3, i; count_l3 = 0; if (cg->cg_level == CG_SHARE_L3) count_l3++; /* Walk all the children looking for L3 */ for (i = 0; i < cg->cg_children; i++) { count_l3 += hpts_count_level(&cg->cg_child[i]); } return (count_l3); } static void hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_group *cg) { int32_t idx, i; idx = *at; if (cg->cg_level == CG_SHARE_L3) { grps[idx] = cg; idx++; if (idx == max) { *at = idx; return; } } *at = idx; /* Walk all the children looking for L3 */ for (i = 0; i < cg->cg_children; i++) { hpts_gather_grps(grps, at, max, &cg->cg_child[i]); } } static void tcp_hpts_mod_load(void) { struct cpu_group *cpu_top; int32_t error __diagused; int32_t i, j, bound = 0, created = 0; size_t sz, asz; struct timeval tv; sbintime_t sb; struct tcp_hpts_entry *hpts; struct pcpu *pc; char unit[16]; uint32_t ncpus = mp_ncpus ? 
mp_ncpus : MAXCPU; int count, domain; #ifdef SMP cpu_top = smp_topo(); #else cpu_top = NULL; #endif tcp_pace.rp_num_hptss = ncpus; hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); hpts_loops = counter_u64_alloc(M_WAITOK); back_tosleep = counter_u64_alloc(M_WAITOK); combined_wheel_wrap = counter_u64_alloc(M_WAITOK); wheel_wrap = counter_u64_alloc(M_WAITOK); hpts_wake_timeout = counter_u64_alloc(M_WAITOK); hpts_direct_awakening = counter_u64_alloc(M_WAITOK); hpts_back_tosleep = counter_u64_alloc(M_WAITOK); hpts_direct_call = counter_u64_alloc(M_WAITOK); cpu_uses_flowid = counter_u64_alloc(M_WAITOK); cpu_uses_random = counter_u64_alloc(M_WAITOK); sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss); tcp_pace.cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK); tcp_pace.grp_cnt = 0; if (cpu_top == NULL) { tcp_pace.grp_cnt = 1; } else { /* Find out how many cache level 3 domains we have */ count = 0; tcp_pace.grp_cnt = hpts_count_level(cpu_top); if (tcp_pace.grp_cnt == 0) { tcp_pace.grp_cnt = 1; } sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *)); tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK); /* Now populate the groups */ if (tcp_pace.grp_cnt == 1) { /* * All we need is the top level all cpu's are in * the same cache so when we use grp[0]->cg_mask * with the cg_first <-> cg_last it will include * all cpu's in it. The level here is probably * zero which is ok. */ tcp_pace.grps[0] = cpu_top; } else { /* * Here we must find all the level three cache domains * and setup our pointers to them. */ count = 0; hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top); } } asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; for (i = 0; i < tcp_pace.rp_num_hptss; i++) { tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), M_TCPHPTS, M_WAITOK | M_ZERO); tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK); hpts = tcp_pace.rp_ent[i]; /* * Init all the hpts structures that are not specifically * zero'd by the allocations. Also lets attach them to the * appropriate sysctl block as well. 
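 *
 * Each pacer gets its own node under net.inet.tcp.hpts.<unit>, so for
 * example `sysctl net.inet.tcp.hpts.0.out_qcnt` reports how many
 * TCBs the first wheel currently has queued.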
*/ mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts", MTX_DEF | MTX_DUPOK); for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { TAILQ_INIT(&hpts->p_hptss[j].head); hpts->p_hptss[j].count = 0; hpts->p_hptss[j].gencnt = 0; } sysctl_ctx_init(&hpts->hpts_ctx); sprintf(unit, "%d", i); hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), OID_AUTO, unit, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); SYSCTL_ADD_INT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "out_qcnt", CTLFLAG_RD, &hpts->p_on_queue_cnt, 0, "Count TCB's awaiting output processing"); SYSCTL_ADD_U16(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "active", CTLFLAG_RD, &hpts->p_hpts_active, 0, "Is the hpts active"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curslot", CTLFLAG_RD, &hpts->p_cur_slot, 0, "What the current running pacers goal"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "runtick", CTLFLAG_RD, &hpts->p_runningslot, 0, "What the running pacers current slot is"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curtick", CTLFLAG_RD, &hpts->p_curtick, 0, "What the running pacers last tick mapped to the wheel was"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "lastran", CTLFLAG_RD, &tcp_pace.cts_last_ran[i], 0, "The last usec tick that this hpts ran"); SYSCTL_ADD_LONG(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "cur_min_sleep", CTLFLAG_RD, &hpts->p_mysleep.tv_usec, "What the running pacers is using for p_mysleep.tv_usec"); SYSCTL_ADD_U64(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "now_sleeping", CTLFLAG_RD, &hpts->sleeping, 0, "What the running pacers is actually sleeping for"); SYSCTL_ADD_U64(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "syscall_cnt", CTLFLAG_RD, &hpts->syscall_cnt, 0, "How many times we had syscalls on this hpts"); hpts->p_hpts_sleep_time = hpts_sleep_max; hpts->p_num = i; hpts->p_curtick = tcp_gethptstick(&tv); tcp_pace.cts_last_ran[i] = tcp_tv_to_usectick(&tv); hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); hpts->p_cpu = 0xffff; hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1); callout_init(&hpts->co, 1); } /* Don't try to bind to NUMA domains if we don't have any */ if (vm_ndomains == 1 && tcp_bind_threads == 2) tcp_bind_threads = 0; /* * Now lets start ithreads to handle the hptss. 
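 *
 * tcp_bind_threads == 1 pins each swi thread to its own CPU, while
 * tcp_bind_threads == 2 binds it to the cpuset of the L3 group found
 * above (and is knocked back to 0 just before this point when there
 * is only a single memory domain).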
*/ for (i = 0; i < tcp_pace.rp_num_hptss; i++) { hpts = tcp_pace.rp_ent[i]; hpts->p_cpu = i; error = swi_add(&hpts->ie, "hpts", tcp_hpts_thread, (void *)hpts, SWI_NET, INTR_MPSAFE, &hpts->ie_cookie); KASSERT(error == 0, ("Can't add hpts:%p i:%d err:%d", hpts, i, error)); created++; hpts->p_mysleep.tv_sec = 0; hpts->p_mysleep.tv_usec = tcp_min_hptsi_time; if (tcp_bind_threads == 1) { if (intr_event_bind(hpts->ie, i) == 0) bound++; } else if (tcp_bind_threads == 2) { /* Find the group for this CPU (i) and bind into it */ for (j = 0; j < tcp_pace.grp_cnt; j++) { if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) { if (intr_event_bind_ithread_cpuset(hpts->ie, &tcp_pace.grps[j]->cg_mask) == 0) { bound++; pc = pcpu_find(i); domain = pc->pc_domain; count = hpts_domains[domain].count; hpts_domains[domain].cpu[count] = i; hpts_domains[domain].count++; break; } } } } tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT; hpts->sleeping = tv.tv_usec; sb = tvtosbt(tv); callout_reset_sbt_on(&hpts->co, sb, 0, hpts_timeout_swi, hpts, hpts->p_cpu, (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); } /* * If we somehow have an empty domain, fall back to choosing * among all htps threads. */ for (i = 0; i < vm_ndomains; i++) { if (hpts_domains[i].count == 0) { tcp_bind_threads = 0; break; } } tcp_hpts_softclock = __tcp_run_hpts; tcp_lro_hpts_init(); printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n", created, bound, tcp_bind_threads == 2 ? "NUMA domains" : "cpus"); } static void tcp_hpts_mod_unload(void) { int rv __diagused; tcp_lro_hpts_uninit(); atomic_store_ptr(&tcp_hpts_softclock, NULL); for (int i = 0; i < tcp_pace.rp_num_hptss; i++) { struct tcp_hpts_entry *hpts = tcp_pace.rp_ent[i]; rv = callout_drain(&hpts->co); MPASS(rv != 0); rv = swi_remove(hpts->ie_cookie); MPASS(rv == 0); rv = sysctl_ctx_free(&hpts->hpts_ctx); MPASS(rv == 0); mtx_destroy(&hpts->p_mtx); free(hpts->p_hptss, M_TCPHPTS); free(hpts, M_TCPHPTS); } free(tcp_pace.rp_ent, M_TCPHPTS); free(tcp_pace.cts_last_ran, M_TCPHPTS); #ifdef SMP free(tcp_pace.grps, M_TCPHPTS); #endif counter_u64_free(hpts_hopelessly_behind); counter_u64_free(hpts_loops); counter_u64_free(back_tosleep); counter_u64_free(combined_wheel_wrap); counter_u64_free(wheel_wrap); counter_u64_free(hpts_wake_timeout); counter_u64_free(hpts_direct_awakening); counter_u64_free(hpts_back_tosleep); counter_u64_free(hpts_direct_call); counter_u64_free(cpu_uses_flowid); counter_u64_free(cpu_uses_random); } static int tcp_hpts_modevent(module_t mod, int what, void *arg) { switch (what) { case MOD_LOAD: tcp_hpts_mod_load(); return (0); case MOD_QUIESCE: /* * Since we are a dependency of TCP stack modules, they should * already be unloaded, and the HPTS ring is empty. However, * function pointer manipulations aren't 100% safe. Although, * tcp_hpts_mod_unload() use atomic(9) the userret() doesn't. * Thus, allow only forced unload of HPTS. */ return (EBUSY); case MOD_UNLOAD: tcp_hpts_mod_unload(); return (0); default: return (EINVAL); }; } static moduledata_t tcp_hpts_module = { .name = "tcphpts", .evhand = tcp_hpts_modevent, }; DECLARE_MODULE(tcphpts, tcp_hpts_module, SI_SUB_SOFTINTR, SI_ORDER_ANY); MODULE_VERSION(tcphpts, 1); diff --git a/sys/sys/systm.h b/sys/sys/systm.h index 435e6b61c904..6c2b466c4cfe 100644 --- a/sys/sys/systm.h +++ b/sys/sys/systm.h @@ -1,594 +1,596 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. 
* (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_SYSTM_H_ #define _SYS_SYSTM_H_ #include #include #include #include #include /* for people using printf mainly */ #include #include __NULLABILITY_PRAGMA_PUSH #ifdef _KERNEL extern int cold; /* nonzero if we are doing a cold boot */ extern int suspend_blocked; /* block suspend due to pending shutdown */ extern int rebooting; /* kern_reboot() has been called. */ extern char version[]; /* system version */ extern char compiler_version[]; /* compiler version */ extern char copyright[]; /* system copyright */ extern int kstack_pages; /* number of kernel stack pages */ extern u_long pagesizes[]; /* supported page sizes */ extern long physmem; /* physical memory */ extern long realmem; /* 'real' memory */ extern char *rootdevnames[2]; /* names of possible root devices */ extern int boothowto; /* reboot flags, from console subsystem */ extern int bootverbose; /* nonzero to print verbose messages */ extern int maxusers; /* system tune hint */ extern int ngroups_max; /* max # of supplemental groups */ extern int vm_guest; /* Running as virtual machine guest? */ extern u_long maxphys; /* max raw I/O transfer size */ /* * Detected virtual machine guest types. The intention is to expand * and/or add to the VM_GUEST_VM type if specific VM functionality is * ever implemented (e.g. vendor-specific paravirtualization features). * Keep in sync with vm_guest_sysctl_names[]. */ enum VM_GUEST { VM_GUEST_NO = 0, VM_GUEST_VM, VM_GUEST_XEN, VM_GUEST_HV, VM_GUEST_VMWARE, VM_GUEST_KVM, VM_GUEST_BHYVE, VM_GUEST_VBOX, VM_GUEST_PARALLELS, VM_GUEST_LAST }; #endif /* KERNEL */ /* * Align variables. 
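 *
 * For instance, the hpts_that_need_softclock counter declared below
 * is tagged __read_frequently: every return to user mode reads it,
 * while only the hpts threads ever modify it.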
*/ #define __read_mostly __section(".data.read_mostly") #define __read_frequently __section(".data.read_frequently") #define __exclusive_cache_line __aligned(CACHE_LINE_SIZE) \ __section(".data.exclusive_cache_line") #if defined(_STANDALONE) struct ucred; #endif #ifdef _KERNEL #include /* MAXCPU */ #include /* curthread */ #include extern bool scheduler_stopped; /* * If we have already panic'd and this is the thread that called * panic(), then don't block on any mutexes but silently succeed. * Otherwise, the kernel will deadlock since the scheduler isn't * going to run the thread that holds any lock we need. */ #define SCHEDULER_STOPPED() __predict_false(scheduler_stopped) extern int osreldate; extern const void *zero_region; /* address space maps to a zeroed page */ extern int unmapped_buf_allowed; #ifdef __LP64__ #define IOSIZE_MAX iosize_max() #define DEVFS_IOSIZE_MAX devfs_iosize_max() #else #define IOSIZE_MAX SSIZE_MAX #define DEVFS_IOSIZE_MAX SSIZE_MAX #endif /* * General function declarations. */ struct inpcb; struct lock_object; struct malloc_type; struct mtx; struct proc; struct socket; struct thread; struct tty; struct ucred; struct uio; struct _jmp_buf; struct trapframe; struct eventtimer; int setjmp(struct _jmp_buf *) __returns_twice; void longjmp(struct _jmp_buf *, int) __dead2; int dumpstatus(vm_offset_t addr, off_t count); int nullop(void); int eopnotsupp(void); int ureadc(int, struct uio *); void hashdestroy(void *, struct malloc_type *, u_long); void *hashinit(int count, struct malloc_type *type, u_long *hashmask); void *hashinit_flags(int count, struct malloc_type *type, u_long *hashmask, int flags); #define HASH_NOWAIT 0x00000001 #define HASH_WAITOK 0x00000002 void *phashinit(int count, struct malloc_type *type, u_long *nentries); void *phashinit_flags(int count, struct malloc_type *type, u_long *nentries, int flags); void cpu_flush_dcache(void *, size_t); void cpu_rootconf(void); void critical_enter_KBI(void); void critical_exit_KBI(void); void critical_exit_preempt(void); void init_param1(void); void init_param2(long physpages); void init_static_kenv(char *, size_t); void tablefull(const char *); /* * Allocate per-thread "current" state in the linuxkpi */ extern int (*lkpi_alloc_current)(struct thread *, int); int linux_alloc_current_noop(struct thread *, int); #if (defined(KLD_MODULE) && !defined(KLD_TIED)) || defined(KTR_CRITICAL) || !defined(_KERNEL) || defined(GENOFFSET) #define critical_enter() critical_enter_KBI() #define critical_exit() critical_exit_KBI() #else static __inline void critical_enter(void) { struct thread_lite *td; td = (struct thread_lite *)curthread; td->td_critnest++; atomic_interrupt_fence(); } static __inline void critical_exit(void) { struct thread_lite *td; td = (struct thread_lite *)curthread; KASSERT(td->td_critnest != 0, ("critical_exit: td_critnest == 0")); atomic_interrupt_fence(); td->td_critnest--; atomic_interrupt_fence(); if (__predict_false(td->td_owepreempt)) critical_exit_preempt(); } #endif #ifdef EARLY_PRINTF typedef void early_putc_t(int ch); extern early_putc_t *early_putc; #define CHECK_EARLY_PRINTF(x) \ __CONCAT(early_printf_, EARLY_PRINTF) == __CONCAT(early_printf_, x) #define early_printf_1 1 #define early_printf_mvebu 2 #define early_printf_ns8250 3 #define early_printf_pl011 4 #define early_printf_snps 5 #define early_printf_sbi 6 #else #define CHECK_EARLY_PRINTF(x) 0 #endif int kvprintf(char const *, void (*)(int, void*), void *, int, __va_list) __printflike(1, 0); void log(int, const char *, ...) 
__printflike(2, 3); void log_console(struct uio *); void vlog(int, const char *, __va_list) __printflike(2, 0); int asprintf(char **ret, struct malloc_type *mtp, const char *format, ...) __printflike(3, 4); int printf(const char *, ...) __printflike(1, 2); int snprintf(char *, size_t, const char *, ...) __printflike(3, 4); int sprintf(char *buf, const char *, ...) __printflike(2, 3); int uprintf(const char *, ...) __printflike(1, 2); int vprintf(const char *, __va_list) __printflike(1, 0); int vasprintf(char **ret, struct malloc_type *mtp, const char *format, __va_list ap) __printflike(3, 0); int vsnprintf(char *, size_t, const char *, __va_list) __printflike(3, 0); int vsnrprintf(char *, size_t, int, const char *, __va_list) __printflike(4, 0); int vsprintf(char *buf, const char *, __va_list) __printflike(2, 0); int sscanf(const char *, char const * _Nonnull, ...) __scanflike(2, 3); int vsscanf(const char * _Nonnull, char const * _Nonnull, __va_list) __scanflike(2, 0); long strtol(const char *, char **, int); u_long strtoul(const char *, char **, int); quad_t strtoq(const char *, char **, int); u_quad_t strtouq(const char *, char **, int); void tprintf(struct proc *p, int pri, const char *, ...) __printflike(3, 4); void vtprintf(struct proc *, int, const char *, __va_list) __printflike(3, 0); void hexdump(const void *ptr, int length, const char *hdr, int flags); #define HD_COLUMN_MASK 0xff #define HD_DELIM_MASK 0xff00 #define HD_OMIT_COUNT (1 << 16) #define HD_OMIT_HEX (1 << 17) #define HD_OMIT_CHARS (1 << 18) #define ovbcopy(f, t, l) bcopy((f), (t), (l)) void explicit_bzero(void * _Nonnull, size_t); void *memset(void * _Nonnull buf, int c, size_t len); void *memcpy(void * _Nonnull to, const void * _Nonnull from, size_t len); void *memmove(void * _Nonnull dest, const void * _Nonnull src, size_t n); int memcmp(const void *b1, const void *b2, size_t len); #ifdef SAN_NEEDS_INTERCEPTORS #define SAN_INTERCEPTOR(func) \ __CONCAT(SAN_INTERCEPTOR_PREFIX, __CONCAT(_, func)) void *SAN_INTERCEPTOR(memset)(void *, int, size_t); void *SAN_INTERCEPTOR(memcpy)(void *, const void *, size_t); void *SAN_INTERCEPTOR(memmove)(void *, const void *, size_t); int SAN_INTERCEPTOR(memcmp)(const void *, const void *, size_t); #ifndef SAN_RUNTIME #define bcopy(from, to, len) SAN_INTERCEPTOR(memmove)((to), (from), (len)) #define bzero(buf, len) SAN_INTERCEPTOR(memset)((buf), 0, (len)) #define bcmp(b1, b2, len) SAN_INTERCEPTOR(memcmp)((b1), (b2), (len)) #define memset(buf, c, len) SAN_INTERCEPTOR(memset)((buf), (c), (len)) #define memcpy(to, from, len) SAN_INTERCEPTOR(memcpy)((to), (from), (len)) #define memmove(dest, src, n) SAN_INTERCEPTOR(memmove)((dest), (src), (n)) #define memcmp(b1, b2, len) SAN_INTERCEPTOR(memcmp)((b1), (b2), (len)) #endif /* !SAN_RUNTIME */ #else /* !SAN_NEEDS_INTERCEPTORS */ #define bcopy(from, to, len) __builtin_memmove((to), (from), (len)) #define bzero(buf, len) __builtin_memset((buf), 0, (len)) #define bcmp(b1, b2, len) __builtin_memcmp((b1), (b2), (len)) #define memset(buf, c, len) __builtin_memset((buf), (c), (len)) #define memcpy(to, from, len) __builtin_memcpy((to), (from), (len)) #define memmove(dest, src, n) __builtin_memmove((dest), (src), (n)) #define memcmp(b1, b2, len) __builtin_memcmp((b1), (b2), (len)) #endif /* SAN_NEEDS_INTERCEPTORS */ void *memset_early(void * _Nonnull buf, int c, size_t len); #define bzero_early(buf, len) memset_early((buf), 0, (len)) void *memcpy_early(void * _Nonnull to, const void * _Nonnull from, size_t len); void *memmove_early(void * _Nonnull dest, 
const void * _Nonnull src, size_t n); #define bcopy_early(from, to, len) memmove_early((to), (from), (len)) #define copystr(src, dst, len, outlen) ({ \ size_t __r, __len, *__outlen; \ \ __len = (len); \ __outlen = (outlen); \ __r = strlcpy((dst), (src), __len); \ if (__outlen != NULL) \ *__outlen = ((__r >= __len) ? __len : __r + 1); \ ((__r >= __len) ? ENAMETOOLONG : 0); \ }) int __result_use_check copyinstr(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len, size_t * __restrict lencopied); int __result_use_check copyin(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len); int __result_use_check copyin_nofault(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len); int __result_use_or_ignore_check copyout(const void * _Nonnull __restrict kaddr, void * __restrict udaddr, size_t len); int __result_use_or_ignore_check copyout_nofault( const void * _Nonnull __restrict kaddr, void * __restrict udaddr, size_t len); #ifdef SAN_NEEDS_INTERCEPTORS int SAN_INTERCEPTOR(copyin)(const void *, void *, size_t); int SAN_INTERCEPTOR(copyinstr)(const void *, void *, size_t, size_t *); int SAN_INTERCEPTOR(copyout)(const void *, void *, size_t); #ifndef SAN_RUNTIME #define copyin(u, k, l) SAN_INTERCEPTOR(copyin)((u), (k), (l)) #define copyinstr(u, k, l, lc) SAN_INTERCEPTOR(copyinstr)((u), (k), (l), (lc)) #define copyout(k, u, l) SAN_INTERCEPTOR(copyout)((k), (u), (l)) #endif /* !SAN_RUNTIME */ #endif /* SAN_NEEDS_INTERCEPTORS */ int fubyte(volatile const void *base); long fuword(volatile const void *base); int fuword16(volatile const void *base); int32_t fuword32(volatile const void *base); int64_t fuword64(volatile const void *base); int __result_use_check fueword(volatile const void *base, long *val); int __result_use_check fueword32(volatile const void *base, int32_t *val); int __result_use_check fueword64(volatile const void *base, int64_t *val); int __result_use_or_ignore_check subyte(volatile void *base, int byte); int __result_use_or_ignore_check suword(volatile void *base, long word); int __result_use_or_ignore_check suword16(volatile void *base, int word); int __result_use_or_ignore_check suword32(volatile void *base, int32_t word); int __result_use_or_ignore_check suword64(volatile void *base, int64_t word); uint32_t casuword32(volatile uint32_t *base, uint32_t oldval, uint32_t newval); u_long casuword(volatile u_long *p, u_long oldval, u_long newval); int casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp, uint32_t newval); int casueword(volatile u_long *p, u_long oldval, u_long *oldvalp, u_long newval); #if defined(SAN_NEEDS_INTERCEPTORS) && !defined(KCSAN) int SAN_INTERCEPTOR(fubyte)(volatile const void *base); int SAN_INTERCEPTOR(fuword16)(volatile const void *base); int SAN_INTERCEPTOR(fueword)(volatile const void *base, long *val); int SAN_INTERCEPTOR(fueword32)(volatile const void *base, int32_t *val); int SAN_INTERCEPTOR(fueword64)(volatile const void *base, int64_t *val); int SAN_INTERCEPTOR(subyte)(volatile void *base, int byte); int SAN_INTERCEPTOR(suword)(volatile void *base, long word); int SAN_INTERCEPTOR(suword16)(volatile void *base, int word); int SAN_INTERCEPTOR(suword32)(volatile void *base, int32_t word); int SAN_INTERCEPTOR(suword64)(volatile void *base, int64_t word); int SAN_INTERCEPTOR(casueword32)(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp, uint32_t newval); int SAN_INTERCEPTOR(casueword)(volatile u_long *p, u_long oldval, u_long *oldvalp, u_long newval); #ifndef 
SAN_RUNTIME #define fubyte(b) SAN_INTERCEPTOR(fubyte)((b)) #define fuword16(b) SAN_INTERCEPTOR(fuword16)((b)) #define fueword(b, v) SAN_INTERCEPTOR(fueword)((b), (v)) #define fueword32(b, v) SAN_INTERCEPTOR(fueword32)((b), (v)) #define fueword64(b, v) SAN_INTERCEPTOR(fueword64)((b), (v)) #define subyte(b, w) SAN_INTERCEPTOR(subyte)((b), (w)) #define suword(b, w) SAN_INTERCEPTOR(suword)((b), (w)) #define suword16(b, w) SAN_INTERCEPTOR(suword16)((b), (w)) #define suword32(b, w) SAN_INTERCEPTOR(suword32)((b), (w)) #define suword64(b, w) SAN_INTERCEPTOR(suword64)((b), (w)) #define casueword32(b, o, p, n) SAN_INTERCEPTOR(casueword32)((b), (o), (p), (n)) #define casueword(b, o, p, n) SAN_INTERCEPTOR(casueword)((b), (o), (p), (n)) #endif /* !SAN_RUNTIME */ #endif /* SAN_NEEDS_INTERCEPTORS && !KCSAN */ int sysbeep(int hertz, sbintime_t duration); void hardclock(int cnt, int usermode); void hardclock_sync(int cpu); void statclock(int cnt, int usermode); void profclock(int cnt, int usermode, uintfptr_t pc); int hardclockintr(void); void startprofclock(struct proc *); void stopprofclock(struct proc *); void cpu_startprofclock(void); void cpu_stopprofclock(void); void suspendclock(void); void resumeclock(void); sbintime_t cpu_idleclock(void); void cpu_activeclock(void); void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt); void cpu_et_frequency(struct eventtimer *et, uint64_t newfreq); extern int cpu_disable_c2_sleep; extern int cpu_disable_c3_sleep; extern void (*tcp_hpts_softclock)(void); +extern volatile uint32_t __read_frequently hpts_that_need_softclock; + #define tcp_hpts_softclock() do { \ - if (tcp_hpts_softclock != NULL) \ + if (hpts_that_need_softclock > 0) \ tcp_hpts_softclock(); \ } while (0) char *kern_getenv(const char *name); void freeenv(char *env); int getenv_int(const char *name, int *data); int getenv_uint(const char *name, unsigned int *data); int getenv_long(const char *name, long *data); int getenv_ulong(const char *name, unsigned long *data); int getenv_string(const char *name, char *data, int size); int getenv_int64(const char *name, int64_t *data); int getenv_uint64(const char *name, uint64_t *data); int getenv_quad(const char *name, quad_t *data); int getenv_bool(const char *name, bool *data); bool getenv_is_true(const char *name); bool getenv_is_false(const char *name); int kern_setenv(const char *name, const char *value); int kern_unsetenv(const char *name); int testenv(const char *name); int getenv_array(const char *name, void *data, int size, int *psize, int type_size, bool allow_signed); #define GETENV_UNSIGNED false /* negative numbers not allowed */ #define GETENV_SIGNED true /* negative numbers allowed */ typedef uint64_t (cpu_tick_f)(void); void set_cputicker(cpu_tick_f *func, uint64_t freq, bool isvariable); extern cpu_tick_f *cpu_ticks; uint64_t cpu_tickrate(void); uint64_t cputick2usec(uint64_t tick); #include /* Initialize the world */ void consinit(void); void cpu_initclocks(void); void cpu_initclocks_bsp(void); void cpu_initclocks_ap(void); void usrinfoinit(void); /* Finalize the world */ void kern_reboot(int) __dead2; void shutdown_nice(int); /* Stubs for obsolete functions that used to be for interrupt management */ static __inline intrmask_t splhigh(void) { return 0; } static __inline intrmask_t splimp(void) { return 0; } static __inline intrmask_t splnet(void) { return 0; } static __inline intrmask_t spltty(void) { return 0; } static __inline void splx(intrmask_t ipl __unused) { return; } /* * Common `proc' functions are declared here so that 
proc.h can be included * less often. */ int _sleep(const void * _Nonnull chan, struct lock_object *lock, int pri, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define msleep(chan, mtx, pri, wmesg, timo) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), \ tick_sbt * (timo), 0, C_HARDCLOCK) #define msleep_sbt(chan, mtx, pri, wmesg, bt, pr, flags) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (bt), (pr), \ (flags)) int msleep_spin_sbt(const void * _Nonnull chan, struct mtx *mtx, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define msleep_spin(chan, mtx, wmesg, timo) \ msleep_spin_sbt((chan), (mtx), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); static __inline int pause(const char *wmesg, int timo) { return (pause_sbt(wmesg, tick_sbt * timo, 0, C_HARDCLOCK)); } #define pause_sig(wmesg, timo) \ pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK | C_CATCH) #define tsleep(chan, pri, wmesg, timo) \ _sleep((chan), NULL, (pri), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) #define tsleep_sbt(chan, pri, wmesg, bt, pr, flags) \ _sleep((chan), NULL, (pri), (wmesg), (bt), (pr), (flags)) void wakeup(const void *chan); void wakeup_one(const void *chan); void wakeup_any(const void *chan); /* * Common `struct cdev *' stuff are declared here to avoid #include poisoning */ struct cdev; dev_t dev2udev(struct cdev *x); const char *devtoname(struct cdev *cdev); #ifdef __LP64__ size_t devfs_iosize_max(void); size_t iosize_max(void); #endif int poll_no_poll(int events); /* XXX: Should be void nanodelay(u_int nsec); */ void DELAY(int usec); int kcmp_cmp(uintptr_t a, uintptr_t b); /* Root mount holdback API */ struct root_hold_token { int flags; const char *who; TAILQ_ENTRY(root_hold_token) list; }; struct root_hold_token *root_mount_hold(const char *identifier); void root_mount_hold_token(const char *identifier, struct root_hold_token *h); void root_mount_rel(struct root_hold_token *h); int root_mounted(void); /* * Unit number allocation API. (kern/subr_unit.c) */ struct unrhdr; #define UNR_NO_MTX ((void *)(uintptr_t)-1) struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex); void init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex); void delete_unrhdr(struct unrhdr *uh); void clear_unrhdr(struct unrhdr *uh); void clean_unrhdr(struct unrhdr *uh); void clean_unrhdrl(struct unrhdr *uh); int alloc_unr(struct unrhdr *uh); int alloc_unr_specific(struct unrhdr *uh, u_int item); int alloc_unrl(struct unrhdr *uh); void free_unr(struct unrhdr *uh, u_int item); void *create_iter_unr(struct unrhdr *uh); int next_iter_unr(void *handle); void free_iter_unr(void *handle); struct unrhdr64 { uint64_t counter; }; static __inline void new_unrhdr64(struct unrhdr64 *unr64, uint64_t low) { unr64->counter = low; } static __inline uint64_t alloc_unr64(struct unrhdr64 *unr64) { return (atomic_fetchadd_64(&unr64->counter, 1)); } void intr_prof_stack_use(struct thread *td, struct trapframe *frame); void counted_warning(unsigned *counter, const char *msg); /* * APIs to manage deprecation and obsolescence. 
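 *
 * Typical use is something like gone_in(15, "foo is deprecated"),
 * which logs a warning at run time; building with NO_OBSOLETE_CODE
 * turns any such marker into a compile-time error.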
*/ void _gone_in(int major, const char *msg); void _gone_in_dev(device_t dev, int major, const char *msg); #ifdef NO_OBSOLETE_CODE #define __gone_ok(m, msg) \ _Static_assert(m < P_OSREL_MAJOR(__FreeBSD_version)), \ "Obsolete code: " msg); #else #define __gone_ok(m, msg) #endif #define gone_in(major, msg) __gone_ok(major, msg) _gone_in(major, msg) #define gone_in_dev(dev, major, msg) __gone_ok(major, msg) _gone_in_dev(dev, major, msg) #ifdef INVARIANTS #define __diagused #else #define __diagused __unused #endif #ifdef WITNESS #define __witness_used #else #define __witness_used __unused #endif #endif /* _KERNEL */ __NULLABILITY_PRAGMA_POP #endif /* !_SYS_SYSTM_H_ */
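/*
 * Stand-alone model (illustration only, not part of this patch) of the
 * handshake added above: each pacer flips a private hit_callout_thresh
 * flag as p_on_queue_cnt crosses conn_cnt_thresh and keeps the shared
 * hpts_that_need_softclock count in step, so the userret() macro can
 * skip the indirect call while no pacer is busy.  All names below are
 * local to this model and do not exist in the tree.
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint model_need_softclock;	/* mirrors hpts_that_need_softclock */

struct model_hpts {
	int	on_queue_cnt;			/* mirrors p_on_queue_cnt */
	bool	hit_callout_thresh;
};

/* Called by the model pacer after it updates on_queue_cnt. */
static void
model_update_thresh(struct model_hpts *h, int conn_cnt_thresh)
{
	if (h->on_queue_cnt > conn_cnt_thresh && !h->hit_callout_thresh) {
		h->hit_callout_thresh = true;
		atomic_fetch_add(&model_need_softclock, 1);
	} else if (h->on_queue_cnt <= conn_cnt_thresh &&
	    h->hit_callout_thresh) {
		h->hit_callout_thresh = false;
		atomic_fetch_sub(&model_need_softclock, 1);
	}
}

/* Stands in for the tcp_hpts_softclock() macro used by userret(). */
static void
model_userret(void (*softclock)(void))
{
	/* A cheap read replaces an unconditional indirect call. */
	if (atomic_load_explicit(&model_need_softclock,
	    memory_order_relaxed) > 0)
		softclock();
}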