Index: sys/kern/subr_trap.c =================================================================== --- sys/kern/subr_trap.c +++ sys/kern/subr_trap.c @@ -140,6 +140,16 @@ #ifdef HWPMC_HOOKS if (PMC_THREAD_HAS_SAMPLES(td)) PMC_CALL_HOOK(td, PMC_FN_THR_USERRET, NULL); +#endif +#ifdef TCPHPTS + /* + * @gallatin is adamant that this needs to go here, I + * am not so sure. Running hpts is a lot like + * a lro_flush() that happens while a user process + * is running. But he may know best so I will go + * with his view of accounting. :-) + */ + tcp_run_hpts(); #endif /* * Let the scheduler adjust our priority etc. Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -258,6 +258,7 @@ volatile uint32_t inp_in_input; /* on input hpts (lock b) */ #endif volatile uint16_t inp_hpts_cpu; /* Lock (i) */ + volatile uint16_t inp_irq_cpu; /* Set by LRO on behalf of, or by, the driver */ u_int inp_refcount; /* (i) refcount */ int inp_flags; /* (i) generic IP/datagram flags */ int inp_flags2; /* (i) generic IP/datagram flags #2*/ @@ -266,7 +267,8 @@ inp_input_cpu_set : 1, /* on input hpts (i) */ inp_hpts_calls :1, /* (i) from output hpts */ inp_input_calls :1, /* (i) from input hpts */ - inp_spare_bits2 : 4; + inp_irq_cpu_set :1, /* (i) from LRO/Driver */ + inp_spare_bits2 : 3; uint8_t inp_numa_domain; /* numa domain */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct socket *inp_socket; /* (i) back pointer to socket */ Index: sys/netinet/tcp_hpts.h =================================================================== --- sys/netinet/tcp_hpts.h +++ sys/netinet/tcp_hpts.h @@ -44,7 +44,7 @@ TAILQ_HEAD(hptsh, inpcb); /* Number of useconds in a hpts tick */ -#define HPTS_TICKS_PER_USEC 10 +#define HPTS_TICKS_PER_SLOT 10 #define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1) #define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) #define HPTS_USEC_IN_SEC 1000000 @@ -56,7 +56,7 @@ uint32_t p_nxt_slot; /* bbr->flex1 x */ uint32_t p_cur_slot; /* bbr->flex2 x */ uint32_t p_prev_slot; /* bbr->delivered */ - uint32_t p_runningtick; /* bbr->inflight */ + uint32_t p_runningslot; /* bbr->inflight */ uint32_t slot_req; /* bbr->flex3 x */ uint32_t inp_hptsslot; /* bbr->flex4 x */ uint32_t slot_remaining; /* bbr->flex5 x */ @@ -64,8 +64,8 @@ uint32_t hpts_sleep_time; /* bbr->applimited x */ uint32_t yet_to_sleep; /* bbr->lt_epoch x */ uint32_t need_new_to; /* bbr->flex6 x */ - uint32_t wheel_tick; /* bbr->bw_inuse x */ - uint32_t maxticks; /* bbr->delRate x */ + uint32_t wheel_slot; /* bbr->bw_inuse x */ + uint32_t maxslots; /* bbr->delRate x */ uint32_t wheel_cts; /* bbr->rttProp x */ int32_t co_ret; /* bbr->pkts_out x */ uint32_t p_curtick; /* upper bbr->cur_del_rate */ @@ -83,16 +83,20 @@ #define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */ #define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) +#define DEFAULT_CONNECTION_THRESHOLD 100 + #ifdef _KERNEL /* Each hpts has its own p_mtx which is used for locking */ struct tcp_hpts_entry { /* Cache line 0x00 */ struct mtx p_mtx; /* Mutex for hpts */ + struct timeval p_mysleep; /* Our min sleep time */ + uint64_t syscall_cnt; + uint64_t sleeping; /* What the actual sleep was (if sleeping) */ uint16_t p_hpts_active; /* Flag that says hpts is awake */ - uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */ uint8_t p_wheel_complete; /* have we completed the wheel arc walk?
*/ uint32_t p_curtick; /* Tick in 10 us the hpts is going to */ - uint32_t p_runningtick; /* Current tick we are at if we are running */ + uint32_t p_runningslot; /* Current slot we are at if we are running */ uint32_t p_prev_slot; /* Previous slot we were on */ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ uint32_t p_nxt_slot; /* The next slot outside the current range of @@ -101,7 +105,8 @@ uint32_t p_lasttick; /* Last tick before the current one */ uint8_t p_direct_wake :1, /* boolean */ p_on_min_sleep:1, /* boolean */ - p_avail:6; + p_hpts_wake_scheduled:1, /* boolean */ + p_avail:5; uint8_t p_fill[3]; /* Fill to 32 bits */ /* Cache line 0x40 */ void *p_inp; @@ -109,8 +114,6 @@ /* Hptsi wheel */ struct hptsh *p_hptss; int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */ - uint32_t hit_no_enobuf; - uint32_t p_dyn_adjust; uint32_t p_hpts_sleep_time; /* Current sleep interval having a max * of 255ms */ uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */ @@ -134,6 +137,7 @@ struct tcp_hptsi { struct proc *rp_proc; /* Process structure for hpts */ struct tcp_hpts_entry **rp_ent; /* Array of hptss */ + uint32_t *cts_last_ran; uint32_t rp_num_hptss; /* Number of hpts threads */ }; @@ -155,10 +159,37 @@ * be sent when a TCB is still around must be * sent from a routine like tcp_respond(). */ +#define LOWEST_SLEEP_ALLOWED 50 #define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep * this determines min granularity of the - * hpts. If 0, granularity is 10useconds at - * the cost of more CPU (context switching). */ + * hpts. If 1, granularity is 10 useconds at + * the cost of more CPU (context switching). + * Note do not set this to 0. + */ +#define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP +#define DYNAMIC_MAX_SLEEP 100000 /* 100ms */ +/* Number of connections at which we start aligning to the cpu from syscalls */ +#define OLDEST_THRESHOLD 1200 +/* Thresholds for raising/lowering sleep */ +#define TICKS_INDICATE_MORE_SLEEP 100 /* This would be 1ms */ +#define TICKS_INDICATE_LESS_SLEEP 1000 /* This would indicate 10ms */ +/** + * + * Dynamic adjustment of sleeping times is done in "new" mode + * where we are depending on syscall returns and lro returns + * to push hpts forward mainly and the timer is only a backstop. + * + * When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh + * then we do a dynamic adjustment on the time we sleep. + * Our threshold is the lateness (in ticks) of the first client + * served. If that lateness is greater than + * ticks_indicate_less_sleep (1000 ticks, i.e. 10ms of work), the + * actual sleep time is cut in half, bounded below by + * dynamic_min_sleep. If it is less than + * ticks_indicate_more_sleep (100 ticks or 1000 usecs), the sleep + * time is doubled, bounded above by dynamic_max_sleep. + * + */ + + #ifdef _KERNEL #define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp); @@ -215,43 +246,61 @@ void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line); #define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__) +void tcp_run_hpts(void); + +uint16_t hpts_random_cpu(struct inpcb *inp); + extern int32_t tcp_min_hptsi_time; -static __inline uint32_t -tcp_tv_to_hptstick(struct timeval *sv) -{ - return ((sv->tv_sec * 100000) + (sv->tv_usec / 10)); -} +#endif /* _KERNEL */ +/* + * The following functions should also be available + * to userspace as well.
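+ * + * As a hypothetical sanity check (illustrative values only, assuming the + * default HPTS_TICKS_PER_SLOT of 10): with tv = { .tv_sec = 1, .tv_usec = 25 }, + * tcp_tv_to_hptstick() returns 100002 (100000 ticks per second plus 25 / 10), + * tcp_tv_to_usectick() and tcp_tv_to_lusectick() return 1000025, and + * tcp_tv_to_mssectick() returns 1000.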
*/ static __inline uint32_t -tcp_gethptstick(struct timeval *sv) +tcp_tv_to_hptstick(const struct timeval *sv) { - struct timeval tv; - - if (sv == NULL) - sv = &tv; - microuptime(sv); - return (tcp_tv_to_hptstick(sv)); + return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT)); } static __inline uint32_t -tcp_tv_to_usectick(struct timeval *sv) +tcp_tv_to_usectick(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); } static __inline uint32_t -tcp_tv_to_mssectick(struct timeval *sv) +tcp_tv_to_mssectick(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC))); } +static __inline uint64_t +tcp_tv_to_lusectick(const struct timeval *sv) +{ + return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); +} + +#ifdef _KERNEL + static __inline void tcp_hpts_unlock(struct tcp_hpts_entry *hpts) { mtx_unlock(&hpts->p_mtx); } +static __inline uint32_t +tcp_gethptstick(struct timeval *sv) +{ + struct timeval tv; + + if (sv == NULL) + sv = &tv; + microuptime(sv); + return (tcp_tv_to_hptstick(sv)); +} + static __inline uint32_t tcp_get_usecs(struct timeval *tv) { Index: sys/netinet/tcp_hpts.c =================================================================== --- sys/netinet/tcp_hpts.c +++ sys/netinet/tcp_hpts.c @@ -193,23 +193,29 @@ #else static int tcp_bind_threads = 2; #endif -TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); - +static int tcp_use_irq_cpu = 0; static struct tcp_hptsi tcp_pace; +static uint32_t *cts_last_ran; static int hpts_does_tp_logging = 0; +static int hpts_use_assigned_cpu = 1; +static int32_t hpts_uses_oldest = OLDEST_THRESHOLD; -static void tcp_wakehpts(struct tcp_hpts_entry *p); -static void tcp_wakeinput(struct tcp_hpts_entry *p); static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); -static void tcp_hptsi(struct tcp_hpts_entry *hpts); +static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout); static void tcp_hpts_thread(void *ctx); static void tcp_init_hptsi(void *st); int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP; -static int32_t tcp_hpts_callout_skip_swi = 0; +static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD; +static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP; +static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP; + + SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP Hpts controls"); +SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "TCP Hpts statistics"); #define timersub(tvp, uvp, vvp) \ do { \ @@ -230,44 +236,92 @@ struct hpts_domain_info hpts_domains[MAXMEMDOM]; -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW, - &tcp_hpts_precision, 120, - "Value for PRE() precision of callout"); - counter_u64_t hpts_hopelessly_behind; -SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD, +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD, &hpts_hopelessly_behind, "Number of times hpts could not catch up and was behind hopelessly"); counter_u64_t hpts_loops; -SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD, +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, loops, CTLFLAG_RD, &hpts_loops, "Number of times hpts had to loop to catch up"); counter_u64_t back_tosleep; -SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD, +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, no_tcbsfound, CTLFLAG_RD, &back_tosleep, "Number of times hpts
found no tcbs"); counter_u64_t combined_wheel_wrap; -SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD, +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD, &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); counter_u64_t wheel_wrap; -SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD, +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, wheel_wrap, CTLFLAG_RD, &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); -static int32_t out_ts_percision = 0; +counter_u64_t hpts_direct_call; +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_call, CTLFLAG_RD, + &hpts_direct_call, "Number of times hpts was called by syscall/trap or other entry"); + +counter_u64_t hpts_wake_timeout; + +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, timeout_wakeup, CTLFLAG_RD, + &hpts_wake_timeout, "Number of times hpts threads woke up via the callout expiring"); + +counter_u64_t hpts_direct_awakening; + +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_awakening, CTLFLAG_RD, + &hpts_direct_awakening, "Number of times hpts threads woke up via the callout expiring"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW, - &out_ts_percision, 0, - "Do we use a percise timestamp for every output cts"); +counter_u64_t hpts_back_tosleep; + +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, back_tosleep, CTLFLAG_RD, + &hpts_back_tosleep, "Number of times hpts threads woke up via the callout expiring and went back to sleep no work"); + +counter_u64_t cpu_uses_flowid; +counter_u64_t cpu_uses_random; + +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_flowid, CTLFLAG_RD, + &cpu_uses_flowid, "Number of times when setting cpuid we used the flowid field"); +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_random, CTLFLAG_RD, + &cpu_uses_random, "Number of times when setting cpuid we used the a random value"); + +TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); +TUNABLE_INT("net.inet.tcp.use_irq", &tcp_use_irq_cpu); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD, + &tcp_bind_threads, 2, + "Thread Binding tunable"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_irq, CTLFLAG_RD, + &tcp_use_irq_cpu, 0, + "Use of irq CPU tunable"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW, + &tcp_hpts_precision, 120, + "Value for PRE() precision of callout"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW, + &conn_cnt_thresh, 0, + "How many connections (below) make us use the callout based mechanism"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, &hpts_does_tp_logging, 0, "Do we add to any tp that has logging on pacer logs"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_assigned_cpu, CTLFLAG_RW, + &hpts_use_assigned_cpu, 0, + "Do we start any hpts timer on the assigned cpu?"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_oldest, CTLFLAG_RW, + &hpts_uses_oldest, OLDEST_THRESHOLD, + "Do syscalls look for the hpts that has been the longest since running (or just use cpu no if 0)?"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW, + &dynamic_min_sleep, 250, + "What is the dynamic minsleep value?"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW, + &dynamic_max_sleep, 5000, + "What is the dynamic maxsleep value?"); + + + + static int32_t max_pacer_loops = 10; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, 
loopmax, CTLFLAG_RW, @@ -287,7 +341,7 @@ new = hpts_sleep_max; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { - if ((new < (NUM_OF_HPTSI_SLOTS / 4)) || + if ((new < dynamic_min_sleep) || (new > HPTS_MAX_SLEEP_ALLOWED)) error = EINVAL; else @@ -296,26 +350,60 @@ return (error); } +static int +sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS) +{ + int error; + uint32_t new; + + new = tcp_min_hptsi_time; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error == 0 && req->newptr) { + if (new < LOWEST_SLEEP_ALLOWED) + error = EINVAL; + else + tcp_min_hptsi_time = new; + } + return (error); +} + SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &hpts_sleep_max, 0, &sysctl_net_inet_tcp_hpts_max_sleep, "IU", "Maximum time hpts will sleep"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep, + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &tcp_min_hptsi_time, 0, + &sysctl_net_inet_tcp_hpts_min_sleep, "IU", "The minimum time the hpts must sleep before processing more slots"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW, - &tcp_hpts_callout_skip_swi, 0, - "Do we have the callout call directly to the hpts?"); +static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP; +static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP; +static int tcp_hpts_no_wake_over_thresh = 1; + +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW, + &ticks_indicate_more_sleep, 0, + "If we only process this many or less on a timeout, we need longer sleep on the next callout"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW, + &ticks_indicate_less_sleep, 0, + "If we process this many or more on a timeout, we need less sleep on the next callout"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW, + &tcp_hpts_no_wake_over_thresh, 0, + "When we are over the threshold on the pacer do we prohibit wakeups?"); static void tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, - int ticks_to_run, int idx) + int slots_to_run, int idx, int from_callout) { union tcp_log_stackspecific log; - + /* + * Unused logs are + * 64 bit - delRate, rttProp, bw_inuse + * 16 bit - cwnd_gain + * 8 bit - bbr_state, bbr_substate, inhpts, ininput; + */ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = hpts->p_nxt_slot; log.u_bbr.flex2 = hpts->p_cur_slot; @@ -323,8 +411,9 @@ log.u_bbr.flex4 = idx; log.u_bbr.flex5 = hpts->p_curtick; log.u_bbr.flex6 = hpts->p_on_queue_cnt; - log.u_bbr.use_lt_bw = 1; - log.u_bbr.inflight = ticks_to_run; + log.u_bbr.flex7 = hpts->p_cpu; + log.u_bbr.flex8 = (uint8_t)from_callout; + log.u_bbr.inflight = slots_to_run; log.u_bbr.applimited = hpts->overidden_sleep; log.u_bbr.delivered = hpts->saved_curtick; log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); @@ -332,7 +421,9 @@ log.u_bbr.lt_epoch = hpts->saved_prev_slot; log.u_bbr.pkts_out = hpts->p_delayed_by; log.u_bbr.lost = hpts->p_hpts_sleep_time; - log.u_bbr.cur_del_rate = hpts->p_runningtick; + log.u_bbr.pacing_gain = hpts->p_cpu; + log.u_bbr.pkt_epoch = hpts->p_runningslot; + log.u_bbr.use_lt_bw = 1; TCP_LOG_EVENTP(tp, NULL, &tp->t_inpcb->inp_socket->so_rcv, &tp->t_inpcb->inp_socket->so_snd, @@ -341,47 +432,40 @@ } static void -hpts_timeout_swi(void *arg) +tcp_wakehpts(struct tcp_hpts_entry *hpts) { - struct tcp_hpts_entry *hpts; + HPTS_MTX_ASSERT(hpts); - hpts = (struct tcp_hpts_entry *)arg; - 
swi_sched(hpts->ie_cookie, 0); + if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) { + hpts->p_direct_wake = 0; + return; + } + if (hpts->p_hpts_wake_scheduled == 0) { + hpts->p_hpts_wake_scheduled = 1; + swi_sched(hpts->ie_cookie, 0); + } } static void -hpts_timeout_dir(void *arg) +hpts_timeout_swi(void *arg) { - tcp_hpts_thread(arg); + struct tcp_hpts_entry *hpts; + + hpts = (struct tcp_hpts_entry *)arg; + swi_sched(hpts->ie_cookie, 0); } static inline void hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear) { -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx) == 0) { - /* We don't own the mutex? */ - panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); - } - if (hpts->p_cpu != inp->inp_hpts_cpu) { - /* It is not the right cpu/mutex? */ - panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); - } - if (inp->inp_in_hpts == 0) { - /* We are not on the hpts? */ - panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp); - } -#endif + HPTS_MTX_ASSERT(hpts); + KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp)); + KASSERT(inp->inp_in_hpts != 0, ("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp)); TAILQ_REMOVE(head, inp, inp_hpts); hpts->p_on_queue_cnt--; - if (hpts->p_on_queue_cnt < 0) { - /* Count should not go negative .. */ -#ifdef INVARIANTS - panic("Hpts goes negative inp:%p hpts:%p", - inp, hpts); -#endif - hpts->p_on_queue_cnt = 0; - } + KASSERT(hpts->p_on_queue_cnt >= 0, + ("Hpts goes negative inp:%p hpts:%p", + inp, hpts)); if (clear) { inp->inp_hpts_request = 0; inp->inp_in_hpts = 0; @@ -391,20 +475,13 @@ static inline void hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref) { -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx) == 0) { - /* We don't own the mutex? */ - panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); - } - if (hpts->p_cpu != inp->inp_hpts_cpu) { - /* It is not the right cpu/mutex? */ - panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); - } - if ((noref == 0) && (inp->inp_in_hpts == 1)) { - /* We are already on the hpts? */ - panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp); - } -#endif + HPTS_MTX_ASSERT(hpts); + KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, + ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp)); + KASSERT(((noref == 1) && (inp->inp_in_hpts == 1)) || + ((noref == 0) && (inp->inp_in_hpts == 0)), + ("%s: hpts:%p inp:%p already on the hpts?", + __FUNCTION__, hpts, inp)); TAILQ_INSERT_TAIL(head, inp, inp_hpts); inp->inp_in_hpts = 1; hpts->p_on_queue_cnt++; @@ -416,37 +493,20 @@ static inline void hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear) { -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx) == 0) { - /* We don't own the mutex? */ - panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); - } - if (hpts->p_cpu != inp->inp_input_cpu) { - /* It is not the right cpu/mutex? */ - panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); - } - if (inp->inp_in_input == 0) { - /* We are not on the input hpts? 
*/ - panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp); - } -#endif + HPTS_MTX_ASSERT(hpts); + KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, + ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp)); + KASSERT(inp->inp_in_input != 0, + ("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp)); TAILQ_REMOVE(&hpts->p_input, inp, inp_input); hpts->p_on_inqueue_cnt--; - if (hpts->p_on_inqueue_cnt < 0) { -#ifdef INVARIANTS - panic("Hpts in goes negative inp:%p hpts:%p", - inp, hpts); -#endif - hpts->p_on_inqueue_cnt = 0; - } -#ifdef INVARIANTS - if (TAILQ_EMPTY(&hpts->p_input) && - (hpts->p_on_inqueue_cnt != 0)) { - /* We should not be empty with a queue count */ - panic("%s hpts:%p in_hpts input empty but cnt:%d", - __FUNCTION__, hpts, hpts->p_on_inqueue_cnt); - } -#endif + KASSERT(hpts->p_on_inqueue_cnt >= 0, + ("Hpts in goes negative inp:%p hpts:%p", + inp, hpts)); + KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) || + ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))), + ("%s hpts:%p input cnt (p_on_inqueue):%d and queue state mismatch", + __FUNCTION__, hpts, hpts->p_on_inqueue_cnt)); if (clear) inp->inp_in_input = 0; } @@ -454,46 +514,17 @@ static inline void hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line) { -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx) == 0) { - /* We don't own the mutex? */ - panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); - } - if (hpts->p_cpu != inp->inp_input_cpu) { - /* It is not the right cpu/mutex? */ - panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); - } - if (inp->inp_in_input == 1) { - /* We are already on the input hpts? */ - panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp); - } -#endif + HPTS_MTX_ASSERT(hpts); + KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, + ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp)); + KASSERT(inp->inp_in_input == 0, + ("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp)); TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input); inp->inp_in_input = 1; hpts->p_on_inqueue_cnt++; in_pcbref(inp); } -static void -tcp_wakehpts(struct tcp_hpts_entry *hpts) -{ - HPTS_MTX_ASSERT(hpts); - if (hpts->p_hpts_wake_scheduled == 0) { - hpts->p_hpts_wake_scheduled = 1; - swi_sched(hpts->ie_cookie, 0); - } -} - -static void -tcp_wakeinput(struct tcp_hpts_entry *hpts) -{ - HPTS_MTX_ASSERT(hpts); - if (hpts->p_hpts_wake_scheduled == 0) { - hpts->p_hpts_wake_scheduled = 1; - swi_sched(hpts->ie_cookie, 0); - } -} - struct tcp_hpts_entry * tcp_cur_hpts(struct inpcb *inp) { @@ -514,12 +545,9 @@ again: hpts_num = inp->inp_hpts_cpu; hpts = tcp_pace.rp_ent[hpts_num]; -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx)) { - panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); - } -#endif + KASSERT(mtx_owned(&hpts->p_mtx) == 0, + ("Hpts:%p owns mtx prior-to lock line:%d", + hpts, __LINE__)); mtx_lock(&hpts->p_mtx); if (hpts_num != inp->inp_hpts_cpu) { mtx_unlock(&hpts->p_mtx); @@ -537,12 +565,9 @@ again: hpts_num = inp->inp_input_cpu; hpts = tcp_pace.rp_ent[hpts_num]; -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx)) { - panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); - } -#endif + KASSERT(mtx_owned(&hpts->p_mtx) == 0, + ("Hpts:%p owns mtx prior-to lock line:%d", + hpts, __LINE__)); mtx_lock(&hpts->p_mtx); if (hpts_num != inp->inp_input_cpu) { mtx_unlock(&hpts->p_mtx); @@ -555,6 +580,7 @@ 
tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line) { int32_t add_freed; + int32_t ret; if (inp->inp_flags2 & INP_FREED) { /* @@ -567,26 +593,11 @@ add_freed = 0; } #ifndef INP_REF_DEBUG - if (in_pcbrele_wlocked(inp)) { - /* - * This should not happen. We have the inpcb referred to by - * the main socket (why we are called) and the hpts. It - * should always return 0. - */ - panic("inpcb:%p release ret 1", - inp); - } + ret = in_pcbrele_wlocked(inp); #else - if (__in_pcbrele_wlocked(inp, line)) { - /* - * This should not happen. We have the inpcb referred to by - * the main socket (why we are called) and the hpts. It - * should always return 0. - */ - panic("inpcb:%p release ret 1", - inp); - } + ret = __in_pcbrele_wlocked(inp, line); #endif + KASSERT(ret != 1, ("inpcb:%p release ret 1", inp)); if (add_freed) { inp->inp_flags2 |= INP_FREED; } @@ -642,73 +653,76 @@ } static inline int -hpts_tick(uint32_t wheel_tick, uint32_t plus) +hpts_slot(uint32_t wheel_slot, uint32_t plus) { /* * Given a slot on the wheel, what slot * is that plus ticks out? */ - KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick)); - return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS); + KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid slot %u not on wheel", wheel_slot)); + return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS); } static inline int tick_to_wheel(uint32_t cts_in_wticks) { /* - * Given a timestamp in wheel ticks (10usec inc's) - * map it to our limited space wheel. + * Given a timestamp in ticks (to get it back + * to real time, multiply by 10 by default, + * i.e. by the number of ticks in a slot), + * map it onto our limited space wheel. */ return (cts_in_wticks % NUM_OF_HPTSI_SLOTS); } static inline int -hpts_ticks_diff(int prev_tick, int tick_now) +hpts_slots_diff(int prev_slot, int slot_now) { /* - * Given two ticks that are someplace + * Given two slots that are someplace * on our wheel. How far are they apart? */ - if (tick_now > prev_tick) - return (tick_now - prev_tick); - else if (tick_now == prev_tick) + if (slot_now > prev_slot) + return (slot_now - prev_slot); + else if (slot_now == prev_slot) /* * Special case, same means we can go all of our * wheel less one slot. */ return (NUM_OF_HPTSI_SLOTS - 1); else - return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now); + return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now); } /* - * Given a tick on the wheel that is the current time - * mapped to the wheel (wheel_tick), what is the maximum + * Given a slot on the wheel that is the current time + * mapped to the wheel (wheel_slot), what is the maximum * distance forward that can be obtained without - * wrapping past either prev_tick or running_tick + * wrapping past either prev_slot or running_slot * depending on the htps state? Also if passed - * a uint32_t *, fill it with the tick location. + * a uint32_t *, fill it with the slot location. * * Note if you do not give this function the current - * time (that you think it is) mapped to the wheel + * time (that you think it is) mapped to the wheel slot * then the results will not be what you expect and * could lead to invalid inserts.
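+ * + * For example (hypothetical numbers): if the hpts is asleep with + * p_prev_slot == 100 and the caller's current time maps to + * wheel_slot == 110, then dis_to_travel is 10, *target_slot is set + * to 99 (one before p_prev_slot), and NUM_OF_HPTSI_SLOTS - 10 slots + * are reported available.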
*/ static inline int32_t -max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick) +max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot) { - uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel; + uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel; if ((hpts->p_hpts_active == 1) && (hpts->p_wheel_complete == 0)) { - end_tick = hpts->p_runningtick; + end_slot = hpts->p_runningslot; /* Back up one tick */ - if (end_tick == 0) - end_tick = NUM_OF_HPTSI_SLOTS - 1; + if (end_slot == 0) + end_slot = NUM_OF_HPTSI_SLOTS - 1; else - end_tick--; - if (target_tick) - *target_tick = end_tick; + end_slot--; + if (target_slot) + *target_slot = end_slot; } else { /* * For the case where we are @@ -718,26 +732,26 @@ * prev tick and subtract one from it. This puts us * as far out as possible on the wheel. */ - end_tick = hpts->p_prev_slot; - if (end_tick == 0) - end_tick = NUM_OF_HPTSI_SLOTS - 1; + end_slot = hpts->p_prev_slot; + if (end_slot == 0) + end_slot = NUM_OF_HPTSI_SLOTS - 1; else - end_tick--; - if (target_tick) - *target_tick = end_tick; + end_slot--; + if (target_slot) + *target_slot = end_slot; /* * Now we have close to the full wheel left minus the * time it has been since the pacer went to sleep. Note * that wheel_tick, passed in, should be the current time * from the perspective of the caller, mapped to the wheel. */ - if (hpts->p_prev_slot != wheel_tick) - dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick); + if (hpts->p_prev_slot != wheel_slot) + dis_to_travel = hpts_slots_diff(hpts->p_prev_slot, wheel_slot); else dis_to_travel = 1; /* * dis_to_travel in this case is the space from when the - * pacer stopped (p_prev_slot) and where our wheel_tick + * pacer stopped (p_prev_slot) and where our wheel_slot * is now. To know how many slots we can put it in we * subtract from the wheel size. We would not want * to place something after p_prev_slot or it will @@ -746,21 +760,21 @@ return (NUM_OF_HPTSI_SLOTS - dis_to_travel); } /* - * So how many slots are open between p_runningtick -> p_cur_slot + * So how many slots are open between p_runningslot -> p_cur_slot * that is what is currently un-available for insertion. Special * case when we are at the last slot, this gets 1, so that * the answer to how many slots are available is all but 1. */ - if (hpts->p_runningtick == hpts->p_cur_slot) + if (hpts->p_runningslot == hpts->p_cur_slot) dis_to_travel = 1; else - dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot); + dis_to_travel = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot); /* * How long has the pacer been running? */ - if (hpts->p_cur_slot != wheel_tick) { + if (hpts->p_cur_slot != wheel_slot) { /* The pacer is a bit late */ - pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick); + pacer_to_now = hpts_slots_diff(hpts->p_cur_slot, wheel_slot); } else { /* The pacer is right on time, now == pacers start time */ pacer_to_now = 0; @@ -774,24 +788,24 @@ /* * Now how many of those we will eat due to the pacer's * time (p_cur_slot) of start being behind the - * real time (wheel_tick)? + * real time (wheel_slot)? */ if (avail_on_wheel <= pacer_to_now) { /* * Wheel wrap, we can't fit on the wheel, that * is unusual the system must be way overloaded! - * Insert into the assured tick, and return special + * Insert into the assured slot, and return special * "0". 
*/ counter_u64_add(combined_wheel_wrap, 1); - *target_tick = hpts->p_nxt_slot; + *target_slot = hpts->p_nxt_slot; return (0); } else { /* * We know how many slots are open * on the wheel (the reverse of what * is left to run. Take away the time - * the pacer started to now (wheel_tick) + * the pacer started to now (wheel_slot) * and that tells you how many slots are * open that can be inserted into that won't * be touched by the pacer until later. @@ -815,7 +829,7 @@ * A sleeping hpts we want in next slot to run * note that in this state p_prev_slot == p_cur_slot */ - inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1); + inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1); if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0)) need_wake = 1; } else if ((void *)inp == hpts->p_inp) { @@ -827,7 +841,7 @@ */ inp->inp_hptsslot = hpts->p_nxt_slot; } else - inp->inp_hptsslot = hpts->p_runningtick; + inp->inp_hptsslot = hpts->p_runningslot; hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); if (need_wake) { /* @@ -862,9 +876,9 @@ * Sanity checks for the pacer with invariants * on insert. */ - if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS) - panic("hpts:%p inp:%p slot:%d > max", - hpts, inp, inp_hptsslot); + KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS, + ("hpts:%p inp:%p slot:%d > max", + hpts, inp, inp_hptsslot)); if ((hpts->p_hpts_active) && (hpts->p_wheel_complete == 0)) { /* @@ -875,17 +889,16 @@ */ int distance, yet_to_run; - distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot); - if (hpts->p_runningtick != hpts->p_cur_slot) - yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot); + distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot); + if (hpts->p_runningslot != hpts->p_cur_slot) + yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot); else yet_to_run = 0; /* processing last slot */ - if (yet_to_run > distance) { - panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d", - hpts, inp, inp_hptsslot, - distance, yet_to_run, - hpts->p_runningtick, hpts->p_cur_slot); - } + KASSERT(yet_to_run <= distance, + ("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d", + hpts, inp, inp_hptsslot, + distance, yet_to_run, + hpts->p_runningslot, hpts->p_cur_slot)); } } #endif @@ -895,8 +908,9 @@ struct hpts_diag *diag, struct timeval *tv) { uint32_t need_new_to = 0; - uint32_t wheel_cts, last_tick; - int32_t wheel_tick, maxticks; + uint32_t wheel_cts; + int32_t wheel_slot, maxslots, last_slot; + int cpu; int8_t need_wakeup = 0; HPTS_MTX_ASSERT(hpts); @@ -904,7 +918,7 @@ memset(diag, 0, sizeof(struct hpts_diag)); diag->p_hpts_active = hpts->p_hpts_active; diag->p_prev_slot = hpts->p_prev_slot; - diag->p_runningtick = hpts->p_runningtick; + diag->p_runningslot = hpts->p_runningslot; diag->p_nxt_slot = hpts->p_nxt_slot; diag->p_cur_slot = hpts->p_cur_slot; diag->p_curtick = hpts->p_curtick; @@ -913,131 +927,120 @@ diag->p_on_min_sleep = hpts->p_on_min_sleep; diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } - if (inp->inp_in_hpts == 0) { - if (slot == 0) { - /* Immediate */ - tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0); - return; - } - /* Get the current time relative to the wheel */ - wheel_cts = tcp_tv_to_hptstick(tv); - /* Map it onto the wheel */ - wheel_tick = tick_to_wheel(wheel_cts); - /* Now what's the max we can place it at? 
*/ - maxticks = max_ticks_available(hpts, wheel_tick, &last_tick); - if (diag) { - diag->wheel_tick = wheel_tick; - diag->maxticks = maxticks; - diag->wheel_cts = wheel_cts; + KASSERT(inp->inp_in_hpts == 0, ("Hpts:%p tp:%p already on hpts and add?", hpts, inp)); + if (slot == 0) { + /* Immediate */ + tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0); + return; + } + /* Get the current time relative to the wheel */ + wheel_cts = tcp_tv_to_hptstick(tv); + /* Map it onto the wheel */ + wheel_slot = tick_to_wheel(wheel_cts); + /* Now what's the max we can place it at? */ + maxslots = max_slots_available(hpts, wheel_slot, &last_slot); + if (diag) { + diag->wheel_slot = wheel_slot; + diag->maxslots = maxslots; + diag->wheel_cts = wheel_cts; + } + if (maxslots == 0) { + /* The pacer is in a wheel wrap behind, yikes! */ + if (slot > 1) { + /* + * Reduce by 1 to prevent a forever loop in + * case something else is wrong. Note this + * probably does not hurt because the pacer, + * if this is true, is so far behind we will be + * > 1 second late calling anyway. + */ + slot--; } - if (maxticks == 0) { - /* The pacer is in a wheel wrap behind, yikes! */ - if (slot > 1) { - /* - * Reduce by 1 to prevent a forever loop in - * case something else is wrong. Note this - * probably does not hurt because the pacer - * if its true is so far behind we will be - * > 1second late calling anyway. - */ - slot--; - } - inp->inp_hptsslot = last_tick; - inp->inp_hpts_request = slot; - } else if (maxticks >= slot) { - /* It all fits on the wheel */ - inp->inp_hpts_request = 0; - inp->inp_hptsslot = hpts_tick(wheel_tick, slot); - } else { - /* It does not fit */ - inp->inp_hpts_request = slot - maxticks; - inp->inp_hptsslot = last_tick; + inp->inp_hptsslot = last_slot; + inp->inp_hpts_request = slot; + } else if (maxslots >= slot) { + /* It all fits on the wheel */ + inp->inp_hpts_request = 0; + inp->inp_hptsslot = hpts_slot(wheel_slot, slot); + } else { + /* It does not fit */ + inp->inp_hpts_request = slot - maxslots; + inp->inp_hptsslot = last_slot; + } + if (diag) { + diag->slot_remaining = inp->inp_hpts_request; + diag->inp_hptsslot = inp->inp_hptsslot; + } +#ifdef INVARIANTS + check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line); +#endif + hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0); + if ((hpts->p_hpts_active == 0) && + (inp->inp_hpts_request == 0) && + (hpts->p_on_min_sleep == 0)) { + /* + * The hpts is sleeping and NOT on a minimum + * sleep time, we need to figure out where + * it will wake up at and if we need to reschedule + * its time-out. + */ + uint32_t have_slept, yet_to_sleep; + + /* Now do we need to restart the hpts's timer?
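+ * For example (hypothetical numbers): if the hpts planned to sleep + * for 1000 slots (p_hpts_sleep_time) and have_slept works out to + * 400, then yet_to_sleep is 600; a new entry needing service in + * 100 slots (slot == 100) triggers the reschedule below, since + * waiting 600 slots would oversleep it.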
*/ + have_slept = hpts_slots_diff(hpts->p_prev_slot, wheel_slot); + if (have_slept < hpts->p_hpts_sleep_time) + yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; + else { + /* We are over-due */ + yet_to_sleep = 0; + need_wakeup = 1; } if (diag) { - diag->slot_remaining = inp->inp_hpts_request; - diag->inp_hptsslot = inp->inp_hptsslot; + diag->have_slept = have_slept; + diag->yet_to_sleep = yet_to_sleep; } -#ifdef INVARIANTS - check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line); -#endif - hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0); - if ((hpts->p_hpts_active == 0) && - (inp->inp_hpts_request == 0) && - (hpts->p_on_min_sleep == 0)) { + if (yet_to_sleep && + (yet_to_sleep > slot)) { /* - * The hpts is sleeping and not on a minimum - * sleep time, we need to figure out where - * it will wake up at and if we need to reschedule - * its time-out. + * We need to reschedule the hpts's time-out. */ - uint32_t have_slept, yet_to_sleep; - - /* Now do we need to restart the hpts's timer? */ - have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick); - if (have_slept < hpts->p_hpts_sleep_time) - yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; - else { - /* We are over-due */ - yet_to_sleep = 0; - need_wakeup = 1; - } - if (diag) { - diag->have_slept = have_slept; - diag->yet_to_sleep = yet_to_sleep; - } - if (yet_to_sleep && - (yet_to_sleep > slot)) { - /* - * We need to reschedule the hpts's time-out. - */ - hpts->p_hpts_sleep_time = slot; - need_new_to = slot * HPTS_TICKS_PER_USEC; - } + hpts->p_hpts_sleep_time = slot; + need_new_to = slot * HPTS_TICKS_PER_SLOT; } - /* - * Now how far is the hpts sleeping to? if active is 1, its - * up and ticking we do nothing, otherwise we may need to - * reschedule its callout if need_new_to is set from above. - */ - if (need_wakeup) { - hpts->p_direct_wake = 1; - tcp_wakehpts(hpts); - if (diag) { - diag->need_new_to = 0; - diag->co_ret = 0xffff0000; - } - } else if (need_new_to) { - int32_t co_ret; - struct timeval tv; - sbintime_t sb; + } + /* + * Now how far is the hpts sleeping to? if active is 1, its + * up and ticking we do nothing, otherwise we may need to + * reschedule its callout if need_new_to is set from above. + */ + if (need_wakeup) { + hpts->p_direct_wake = 1; + tcp_wakehpts(hpts); + if (diag) { + diag->need_new_to = 0; + diag->co_ret = 0xffff0000; + } + } else if (need_new_to) { + int32_t co_ret; + struct timeval tv; + sbintime_t sb; - tv.tv_sec = 0; - tv.tv_usec = 0; - while (need_new_to > HPTS_USEC_IN_SEC) { - tv.tv_sec++; - need_new_to -= HPTS_USEC_IN_SEC; - } - tv.tv_usec = need_new_to; - sb = tvtosbt(tv); - if (tcp_hpts_callout_skip_swi == 0) { - co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); - } else { - co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_dir, hpts, - hpts->p_cpu, - C_PREL(tcp_hpts_precision)); - } - if (diag) { - diag->need_new_to = need_new_to; - diag->co_ret = co_ret; - } + tv.tv_sec = 0; + tv.tv_usec = 0; + while (need_new_to > HPTS_USEC_IN_SEC) { + tv.tv_sec++; + need_new_to -= HPTS_USEC_IN_SEC; + } + tv.tv_usec = need_new_to; + sb = tvtosbt(tv); + cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? 
hpts->p_cpu : curcpu; + co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, + hpts_timeout_swi, hpts, cpu, + (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); + if (diag) { + diag->need_new_to = need_new_to; + diag->co_ret = co_ret; } - } else { -#ifdef INVARIANTS - panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp); -#endif } } @@ -1066,6 +1069,7 @@ __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){ return (tcp_hpts_insert_diag(inp, slot, line, NULL)); } + int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line) { @@ -1076,18 +1080,20 @@ /* Ok we need to set it on the hpts in the current slot */ hpts_sane_input_insert(hpts, inp, line); retval = 1; - if (hpts->p_hpts_active == 0) { + if ((hpts->p_hpts_active == 0) && + (hpts->p_on_min_sleep == 0)){ /* * Activate the hpts if it is sleeping. */ retval = 2; hpts->p_direct_wake = 1; - tcp_wakeinput(hpts); + tcp_wakehpts(hpts); } - } else if (hpts->p_hpts_active == 0) { + } else if ((hpts->p_hpts_active == 0) && + (hpts->p_on_min_sleep == 0)){ retval = 4; hpts->p_direct_wake = 1; - tcp_wakeinput(hpts); + tcp_wakehpts(hpts); } return (retval); } @@ -1115,22 +1121,24 @@ if (inp->inp_in_input == 0) { /* Ok we need to set it on the hpts in the current slot */ hpts_sane_input_insert(hpts, inp, line); - if (hpts->p_hpts_active == 0) { + if ((hpts->p_hpts_active == 0) && + (hpts->p_on_min_sleep == 0)){ /* * Activate the hpts if it is sleeping. */ hpts->p_direct_wake = 1; - tcp_wakeinput(hpts); + tcp_wakehpts(hpts); } - } else if (hpts->p_hpts_active == 0) { + } else if ((hpts->p_hpts_active == 0) && + (hpts->p_on_min_sleep == 0)){ hpts->p_direct_wake = 1; - tcp_wakeinput(hpts); + tcp_wakehpts(hpts); } inp->inp_hpts_drop_reas = reason; mtx_unlock(&hpts->p_mtx); } -static uint16_t +uint16_t hpts_random_cpu(struct inpcb *inp){ /* * No flow type set distribute the load randomly. @@ -1149,18 +1157,19 @@ } /* Nothing set use a random number */ ran = arc4random(); - cpuid = (ran & 0xffff) % mp_ncpus; + cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss); return (cpuid); } static uint16_t -hpts_cpuid(struct inpcb *inp) +hpts_cpuid(struct inpcb *inp, int *failed) { u_int cpuid; #if !defined(RSS) && defined(NUMA) struct hpts_domain_info *di; #endif + *failed = 0; /* * If one has been set use it i.e. we want both in and out on the * same hpts. @@ -1170,6 +1179,17 @@ } else if (inp->inp_hpts_cpu_set) { return (inp->inp_hpts_cpu); } + /* + * If we are using the irq cpu set by LRO or + * the driver then it overrides all other domains. + */ + if (tcp_use_irq_cpu) { + if (inp->inp_irq_cpu_set == 0) { + *failed = 1; + return(0); + } + return(inp->inp_irq_cpu); + } /* If one is set the other must be the same */ #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); @@ -1183,9 +1203,10 @@ * unknown cpuids to curcpu. Not the best, but apparently better * than defaulting to swi 0. */ - - if (inp->inp_flowtype == M_HASHTYPE_NONE) + if (inp->inp_flowtype == M_HASHTYPE_NONE) { + counter_u64_add(cpu_uses_random, 1); return (hpts_random_cpu(inp)); + } /* * Hash to a thread based on the flowid. If we are using numa, * then restrict the hash to the numa domain where the inp lives. 
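To make the selection order in hpts_cpuid() above concrete, the sketch below is a rough userland rendering of the non-RSS fallback it reduces to; example_hpts_cpuid(), has_flowid, and ncpus are hypothetical stand-ins (not kernel names) for the M_HASHTYPE_NONE check and mp_ncpus, and the result only applies when no inp_hpts_cpu/inp_input_cpu pin exists and no inp_irq_cpu overrides it via tcp_use_irq_cpu:

#include <stdint.h>
#include <stdlib.h>	/* arc4random() on FreeBSD */

/* Hypothetical sketch of the fallback mapping, not the kernel code. */
static uint16_t
example_hpts_cpuid(uint32_t flowid, int has_flowid, uint16_t ncpus)
{

	if (has_flowid == 0)
		return ((uint16_t)(arc4random() % ncpus));	/* the cpu_uses_random path */
	return ((uint16_t)(flowid % ncpus));			/* the cpu_uses_flowid path */
}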
@@ -1197,7 +1218,7 @@ } else #endif cpuid = inp->inp_flowid % mp_ncpus; - + counter_u64_add(cpu_uses_flowid, 1); return (cpuid); #endif } @@ -1323,7 +1344,7 @@ kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } - if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) { + if ((tp->t_fb->tfb_do_queued_segments != NULL) && tp->t_in_pkt) { if (inp->inp_in_input) tcp_hpts_remove(inp, HPTS_REMOVE_INPUT); dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0); @@ -1357,23 +1378,51 @@ } static void -tcp_hptsi(struct tcp_hpts_entry *hpts) +tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt) +{ + uint32_t t = 0, i, fnd = 0; + + if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) { + /* + * Find next slot that is occupied and use that to + * be the sleep time. + */ + for (i = 0, t = hpts_slot(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) { + if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) { + fnd = 1; + break; + } + t = (t + 1) % NUM_OF_HPTSI_SLOTS; + } + KASSERT(fnd != 0, ("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt)); + hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max); + } else { + /* No one on the wheel sleep for all but 400 slots or sleep max */ + hpts->p_hpts_sleep_time = hpts_sleep_max; + } +} + +static int32_t +tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout) { struct tcpcb *tp; struct inpcb *inp = NULL, *ninp; struct timeval tv; - int32_t ticks_to_run, i, error; + uint64_t total_slots_processed = 0; + int32_t slots_to_run, i, error; int32_t paced_cnt = 0; int32_t loop_cnt = 0; int32_t did_prefetch = 0; int32_t prefetch_ninp = 0; int32_t prefetch_tp = 0; int32_t wrap_loop_cnt = 0; + int32_t slot_pos_of_endpoint = 0; + int32_t orig_exit_slot; int16_t set_cpu; + int8_t completed_measure = 0, seen_endpoint = 0; HPTS_MTX_ASSERT(hpts); NET_EPOCH_ASSERT(); - /* record previous info for any logging */ hpts->saved_lasttick = hpts->p_lasttick; hpts->saved_curtick = hpts->p_curtick; @@ -1382,7 +1431,8 @@ hpts->p_lasttick = hpts->p_curtick; hpts->p_curtick = tcp_gethptstick(&tv); - hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv); + orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); if ((hpts->p_on_queue_cnt == 0) || (hpts->p_lasttick == hpts->p_curtick)) { /* @@ -1396,8 +1446,9 @@ again: hpts->p_wheel_complete = 0; HPTS_MTX_ASSERT(hpts); - ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot); - if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) && + slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot); + if (((hpts->p_curtick - hpts->p_lasttick) > + ((NUM_OF_HPTSI_SLOTS-1) * HPTS_TICKS_PER_SLOT)) && (hpts->p_on_queue_cnt != 0)) { /* * Wheel wrap is occuring, basically we @@ -1416,8 +1467,8 @@ * first slot at the head. */ wrap_loop_cnt++; - hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1); - hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2); + hpts->p_nxt_slot = hpts_slot(hpts->p_prev_slot, 1); + hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 2); /* * Adjust p_cur_slot to be where we are starting from * hopefully we will catch up (fat chance if something @@ -1438,58 +1489,61 @@ * INP lock and the pacer mutex to change the inp_hptsslot. 
*/ TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) { - inp->inp_hptsslot = hpts->p_runningtick; + inp->inp_hptsslot = hpts->p_runningslot; } #endif - TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick], + TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot], &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts); - ticks_to_run = NUM_OF_HPTSI_SLOTS - 1; + slots_to_run = NUM_OF_HPTSI_SLOTS - 1; counter_u64_add(wheel_wrap, 1); } else { /* - * Nxt slot is always one after p_runningtick though + * Nxt slot is always one after p_runningslot though * its not used usually unless we are doing wheel wrap. */ hpts->p_nxt_slot = hpts->p_prev_slot; - hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1); - } -#ifdef INVARIANTS - if (TAILQ_EMPTY(&hpts->p_input) && - (hpts->p_on_inqueue_cnt != 0)) { - panic("tp:%p in_hpts input empty but cnt:%d", - hpts, hpts->p_on_inqueue_cnt); + hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 1); } -#endif + KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) || + ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))), + ("%s hpts:%p in_hpts cnt:%d and queue state mismatch", + __FUNCTION__, hpts, hpts->p_on_inqueue_cnt)); HPTS_MTX_ASSERT(hpts); if (hpts->p_on_queue_cnt == 0) { goto no_one; } HPTS_MTX_ASSERT(hpts); - for (i = 0; i < ticks_to_run; i++) { + for (i = 0; i < slots_to_run; i++) { /* * Calculate our delay, if there are no extra ticks there - * was not any (i.e. if ticks_to_run == 1, no delay). + * was not any (i.e. if slots_to_run == 1, no delay). */ - hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC; + hpts->p_delayed_by = (slots_to_run - (i + 1)) * HPTS_TICKS_PER_SLOT; HPTS_MTX_ASSERT(hpts); - while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) { + while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningslot])) != NULL) { + HPTS_MTX_ASSERT(hpts); /* For debugging */ + if (seen_endpoint == 0) { + seen_endpoint = 1; + orig_exit_slot = slot_pos_of_endpoint = hpts->p_runningslot; + } else if (completed_measure == 0) { + /* Record the new position */ + orig_exit_slot = hpts->p_runningslot; + } + total_slots_processed++; hpts->p_inp = inp; paced_cnt++; -#ifdef INVARIANTS - if (hpts->p_runningtick != inp->inp_hptsslot) { - panic("Hpts:%p inp:%p slot mis-aligned %u vs %u", - hpts, inp, hpts->p_runningtick, inp->inp_hptsslot); - } -#endif + KASSERT(hpts->p_runningslot == inp->inp_hptsslot, + ("Hpts:%p inp:%p slot mis-aligned %u vs %u", + hpts, inp, hpts->p_runningslot, inp->inp_hptsslot)); /* Now pull it */ if (inp->inp_hpts_cpu_set == 0) { set_cpu = 1; } else { set_cpu = 0; } - hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0); - if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) { + hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningslot], 0); + if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningslot])) != NULL) { /* We prefetch the next inp if possible */ kern_prefetch(ninp, &prefetch_ninp); prefetch_ninp = 1; @@ -1501,22 +1555,22 @@ * Push him back on the wheel or run it * depending. */ - uint32_t maxticks, last_tick, remaining_slots; + uint32_t maxslots, last_slot, remaining_slots; - remaining_slots = ticks_to_run - (i + 1); + remaining_slots = slots_to_run - (i + 1); if (inp->inp_hpts_request > remaining_slots) { /* * How far out can we go? 
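+ * For example (hypothetical numbers): an inp that still owes + * inp_hpts_request == 250000 slots when only maxslots == 90000 are + * open is parked at last_slot with its request cut to 160000, and + * later wheel passes work off the remainder.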
*/ - maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick); - if (maxticks >= inp->inp_hpts_request) { + maxslots = max_slots_available(hpts, hpts->p_cur_slot, &last_slot); + if (maxslots >= inp->inp_hpts_request) { /* we can place it finally to be processed */ - inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request); + inp->inp_hptsslot = hpts_slot(hpts->p_runningslot, inp->inp_hpts_request); inp->inp_hpts_request = 0; } else { /* Work off some more time */ - inp->inp_hptsslot = last_tick; - inp->inp_hpts_request-= maxticks; + inp->inp_hptsslot = last_slot; + inp->inp_hpts_request -= maxslots; } hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1); hpts->p_inp = NULL; @@ -1542,12 +1596,9 @@ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { out_now: -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx)) { - panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); - } -#endif + KASSERT(mtx_owned(&hpts->p_mtx) == 0, + ("Hpts:%p owns mtx prior-to lock line:%d", + hpts, __LINE__)); INP_WUNLOCK(inp); mtx_lock(&hpts->p_mtx); hpts->p_inp = NULL; @@ -1582,7 +1633,7 @@ #endif /* Lets do any logging that we might want to */ if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { - tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i); + tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout); } /* * There is a hole here, we get the refcnt on the * tp in hpts_remove_hpts_ref, wait no, copy: we get the refcnt on the * inp so it will still be preserved but to * sure we can get at it when we are done we need * to do a couple of things. First if it is a dropped inp we * fini gets the lock first we are assured of having * a sane INP we can lock and test. */ -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx)) { - panic("Hpts:%p owns mtx before tcp-output:%d", - hpts, __LINE__); - } -#endif + KASSERT(mtx_owned(&hpts->p_mtx) == 0, + ("Hpts:%p owns mtx prior-to tcp_output call line:%d", + hpts, __LINE__)); + if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } @@ -1653,20 +1702,27 @@ CURVNET_RESTORE(); #endif INP_UNLOCK_ASSERT(inp); - KASSERT(mtx_owned(&hpts->p_mtx) == 0, wait -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx)) { - panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); - } -#endif + KASSERT(mtx_owned(&hpts->p_mtx) == 0, + ("Hpts:%p owns mtx prior-to lock line:%d", + hpts, __LINE__)); mtx_lock(&hpts->p_mtx); hpts->p_inp = NULL; } + if (seen_endpoint) { + /* + * We now have an accurate distance between + * slot_pos_of_endpoint <-> orig_exit_slot + * to tell us how late we were, orig_exit_slot + * is where we calculated the end of our cycle to + * be when we first entered. + */ + completed_measure = 1; + } HPTS_MTX_ASSERT(hpts); hpts->p_inp = NULL; - hpts->p_runningtick++; - if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) { - hpts->p_runningtick = 0; + hpts->p_runningslot++; + if (hpts->p_runningslot >= NUM_OF_HPTSI_SLOTS) { + hpts->p_runningslot = 0; } } no_one: @@ -1676,16 +1732,13 @@ * Check to see if we took an excess amount of time and need to run * more ticks (if we did not hit eno-bufs).
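+ * (When we are called from the syscall/LRO path, from_callout is 0, + * so we deliberately do not loop again below; the backstop callout + * picks up anything left behind.)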
*/ -#ifdef INVARIANTS - if (TAILQ_EMPTY(&hpts->p_input) && - (hpts->p_on_inqueue_cnt != 0)) { - panic("tp:%p in_hpts input empty but cnt:%d", - hpts, hpts->p_on_inqueue_cnt); - } -#endif + KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) || + ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))), + ("%s hpts:%p in_hpts cnt:%d queue state mismatch", + __FUNCTION__, hpts, hpts->p_on_inqueue_cnt)); hpts->p_prev_slot = hpts->p_cur_slot; hpts->p_lasttick = hpts->p_curtick; - if (loop_cnt > max_pacer_loops) { + if ((from_callout == 0) || (loop_cnt > max_pacer_loops)) { /* * Something is serious slow we have * looped through processing the wheel * and by the time we cleared the * needs to run max_pacer_loops time * so we have a loss. We need to * have the hpts go forward and pick * up from where it is currently to * get back on track. The down-side is * that TCB's will be late being * correct. When it next awakens * it will find itself further behind. */ - counter_u64_add(hpts_hopelessly_behind, 1); + if (from_callout) + counter_u64_add(hpts_hopelessly_behind, 1); goto no_run; } hpts->p_curtick = tcp_gethptstick(&tv); hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + if (seen_endpoint == 0) { + /* We saw no endpoint but we may be looping */ + orig_exit_slot = hpts->p_cur_slot; + } if ((wrap_loop_cnt < 2) && (hpts->p_lasttick != hpts->p_curtick)) { counter_u64_add(hpts_loops, 1); @@ -1712,6 +1770,7 @@ goto again; } no_run: + cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv); /* * Set flag to tell that we are done for * any slot input that happens during @@ -1725,76 +1784,58 @@ if (!TAILQ_EMPTY(&hpts->p_input)) { tcp_input_data(hpts, &tv); /* - * Now did we spend too long running - * input and need to run more ticks? + * Now did we spend too long running input and need to run more ticks? + * Note that if wrap_loop_cnt < 2 then we should have the conditions + * in the KASSERTs true. But if the wheel is behind, i.e. wrap_loop_cnt + * is 2 or more, then the conditions most likely are *not* true. Also, + * if we are not called from the callout, we don't run the wheel multiple + * times, so the slots may not align either. */ - KASSERT(hpts->p_prev_slot == hpts->p_cur_slot, + KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) || + (wrap_loop_cnt >= 2) || (from_callout == 0)), ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, hpts->p_prev_slot, hpts->p_cur_slot)); - KASSERT(hpts->p_lasttick == hpts->p_curtick, + KASSERT(((hpts->p_lasttick == hpts->p_curtick) + || (wrap_loop_cnt >= 2) || (from_callout == 0)), ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, hpts->p_lasttick, hpts->p_curtick)); - hpts->p_curtick = tcp_gethptstick(&tv); - if (hpts->p_lasttick != hpts->p_curtick) { + if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) { + hpts->p_curtick = tcp_gethptstick(&tv); counter_u64_add(hpts_loops, 1); hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); goto again; } } - { - uint32_t t = 0, i, fnd = 0; - - if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) { - /* - * Find next slot that is occupied and use that to - * be the sleep time.
- */ - for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) { - if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) { - fnd = 1; - break; - } - t = (t + 1) % NUM_OF_HPTSI_SLOTS; - } - if (fnd) { - hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max); - } else { -#ifdef INVARIANTS - panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt); -#endif - counter_u64_add(back_tosleep, 1); - hpts->p_on_queue_cnt = 0; - goto non_found; - } - } else if (wrap_loop_cnt >= 2) { - /* Special case handling */ - hpts->p_hpts_sleep_time = tcp_min_hptsi_time; - } else { - /* No one on the wheel sleep for all but 400 slots or sleep max */ - non_found: - hpts->p_hpts_sleep_time = hpts_sleep_max; - } + if (from_callout){ + tcp_hpts_set_max_sleep(hpts, wrap_loop_cnt); } + if (seen_endpoint) + return(hpts_slots_diff(slot_pos_of_endpoint, orig_exit_slot)); + else + return (0); } void __tcp_set_hpts(struct inpcb *inp, int32_t line) { struct tcp_hpts_entry *hpts; + int failed; INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); if ((inp->inp_in_hpts == 0) && (inp->inp_hpts_cpu_set == 0)) { - inp->inp_hpts_cpu = hpts_cpuid(inp); - inp->inp_hpts_cpu_set = 1; + inp->inp_hpts_cpu = hpts_cpuid(inp, &failed); + if (failed == 0) + inp->inp_hpts_cpu_set = 1; } mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if ((inp->inp_input_cpu_set == 0) && (inp->inp_in_input == 0)) { - inp->inp_input_cpu = hpts_cpuid(inp); - inp->inp_input_cpu_set = 1; + inp->inp_input_cpu = hpts_cpuid(inp, &failed); + if (failed == 0) + inp->inp_input_cpu_set = 1; } mtx_unlock(&hpts->p_mtx); } @@ -1804,6 +1845,127 @@ return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by); } +static void +__tcp_run_hpts(struct tcp_hpts_entry *hpts) +{ + int ticks_ran; + + if (hpts->p_hpts_active) { + /* Already active */ + return; + } + if (mtx_trylock(&hpts->p_mtx) == 0) { + /* Someone else got the lock */ + return; + } + if (hpts->p_hpts_active) + goto out_with_mtx; + hpts->syscall_cnt++; + counter_u64_add(hpts_direct_call, 1); + hpts->p_hpts_active = 1; + ticks_ran = tcp_hptsi(hpts, 0); + /* We may want to adjust the sleep values here */ + if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { + if (ticks_ran > ticks_indicate_less_sleep) { + struct timeval tv; + sbintime_t sb; + int cpu; + + hpts->p_mysleep.tv_usec /= 2; + if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) + hpts->p_mysleep.tv_usec = dynamic_min_sleep; + /* Reschedule with new to value */ + tcp_hpts_set_max_sleep(hpts, 0); + tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT; + /* Validate its in the right ranges */ + if (tv.tv_usec < hpts->p_mysleep.tv_usec) { + hpts->overidden_sleep = tv.tv_usec; + tv.tv_usec = hpts->p_mysleep.tv_usec; + } else if (tv.tv_usec > dynamic_max_sleep) { + /* Lets not let sleep get above this value */ + hpts->overidden_sleep = tv.tv_usec; + tv.tv_usec = dynamic_max_sleep; + } + /* + * In this mode the timer is a backstop to + * all the userret/lro_flushes so we use + * the dynamic value and set the on_min_sleep + * flag so we will not be awoken. + */ + sb = tvtosbt(tv); + cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? 
@@ -1804,6 +1845,127 @@
 	return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
 }
 
+static void
+__tcp_run_hpts(struct tcp_hpts_entry *hpts)
+{
+	int ticks_ran;
+
+	if (hpts->p_hpts_active) {
+		/* Already active */
+		return;
+	}
+	if (mtx_trylock(&hpts->p_mtx) == 0) {
+		/* Someone else got the lock */
+		return;
+	}
+	if (hpts->p_hpts_active)
+		goto out_with_mtx;
+	hpts->syscall_cnt++;
+	counter_u64_add(hpts_direct_call, 1);
+	hpts->p_hpts_active = 1;
+	ticks_ran = tcp_hptsi(hpts, 0);
+	/* We may want to adjust the sleep values here */
+	if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+		if (ticks_ran > ticks_indicate_less_sleep) {
+			struct timeval tv;
+			sbintime_t sb;
+			int cpu;
+
+			hpts->p_mysleep.tv_usec /= 2;
+			if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
+				hpts->p_mysleep.tv_usec = dynamic_min_sleep;
+			/* Reschedule with new to value */
+			tcp_hpts_set_max_sleep(hpts, 0);
+			tv.tv_sec = 0;
+			tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+			/* Validate it's in the right ranges */
+			if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
+				hpts->overidden_sleep = tv.tv_usec;
+				tv.tv_usec = hpts->p_mysleep.tv_usec;
+			} else if (tv.tv_usec > dynamic_max_sleep) {
+				/* Let's not let sleep get above this value */
+				hpts->overidden_sleep = tv.tv_usec;
+				tv.tv_usec = dynamic_max_sleep;
+			}
+			/*
+			 * In this mode the timer is a backstop to
+			 * all the userret/lro_flushes so we use
+			 * the dynamic value and set the on_min_sleep
+			 * flag so we will not be awoken.
+			 */
+			sb = tvtosbt(tv);
+			cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+			/* Store off to make visible the actual sleep time */
+			hpts->sleeping = tv.tv_usec;
+			callout_reset_sbt_on(&hpts->co, sb, 0,
+			    hpts_timeout_swi, hpts, cpu,
+			    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+		} else if (ticks_ran < ticks_indicate_more_sleep) {
+			/* For the further sleep, don't reschedule hpts */
+			hpts->p_mysleep.tv_usec *= 2;
+			if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+				hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+		}
+		hpts->p_on_min_sleep = 1;
+	}
+	hpts->p_hpts_active = 0;
+out_with_mtx:
+	HPTS_MTX_ASSERT(hpts);
+	mtx_unlock(&hpts->p_mtx);
+}
+
+static struct tcp_hpts_entry *
+tcp_choose_hpts_to_run(void)
+{
+	int i, oldest_idx;
+	uint32_t cts, time_since_ran, calc;
+
+	if ((hpts_uses_oldest == 0) ||
+	    ((hpts_uses_oldest > 1) &&
+	     (tcp_pace.rp_ent[(tcp_pace.rp_num_hptss - 1)]->p_on_queue_cnt >= hpts_uses_oldest))) {
+		/*
+		 * We have either disabled the feature (0), or
+		 * we have crossed over the oldest threshold on the
+		 * last hpts. We use the last one for simplification
+		 * since we don't want to use the first one (it may
+		 * have starting connections that have not settled
+		 * on the cpu yet).
+		 */
+		return (tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+	}
+	/* Let's find the oldest hpts to attempt to run */
+	cts = tcp_get_usecs(NULL);
+	time_since_ran = 0;
+	oldest_idx = -1;
+	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+		if (TSTMP_GT(cts, cts_last_ran[i]))
+			calc = cts - cts_last_ran[i];
+		else
+			calc = 0;
+		if (calc > time_since_ran) {
+			oldest_idx = i;
+			time_since_ran = calc;
+		}
+	}
+	if (oldest_idx >= 0)
+		return (tcp_pace.rp_ent[oldest_idx]);
+	else
+		return (tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+}
+
+void
+tcp_run_hpts(void)
+{
+	struct tcp_hpts_entry *hpts;
+	struct epoch_tracker et;
+
+	NET_EPOCH_ENTER(et);
+	hpts = tcp_choose_hpts_to_run();
+	__tcp_run_hpts(hpts);
+	NET_EPOCH_EXIT(et);
+}
+
 static void
 tcp_hpts_thread(void *ctx)
 {
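The fallback scan in tcp_choose_hpts_to_run() above stays correct across 32-bit timestamp wrap because TSTMP_GT compares via signed subtraction. A standalone sketch of the same oldest-first selection, runnable in userland; TS_GT() and oldest_index() are illustrative names, not the kernel symbols:

    #include <stdint.h>
    #include <stdio.h>

    /* Wrap-safe "a is after b" on 32-bit usec stamps (cf. TSTMP_GT). */
    #define TS_GT(a, b) ((int32_t)((a) - (b)) > 0)

    /* Return the index whose last-ran stamp is furthest in the past. */
    static int
    oldest_index(uint32_t now, const uint32_t *last_ran, int n)
    {
        uint32_t age, oldest_age = 0;
        int i, oldest = -1;

        for (i = 0; i < n; i++) {
            age = TS_GT(now, last_ran[i]) ? now - last_ran[i] : 0;
            if (age > oldest_age) {
                oldest = i;
                oldest_age = age;
            }
        }
        return (oldest); /* -1 means no entry ran in the past */
    }

    int
    main(void)
    {
        uint32_t last[3] = { 1000, 400, 900 };

        printf("%d\n", oldest_index(1100, last, 3)); /* prints 1 */
        return (0);
    }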
@@ -1811,51 +1973,142 @@
 	struct epoch_tracker et;
 	struct timeval tv;
 	sbintime_t sb;
+	int cpu, ticks_ran;
 
 	hpts = (struct tcp_hpts_entry *)ctx;
 	mtx_lock(&hpts->p_mtx);
 	if (hpts->p_direct_wake) {
-		/* Signaled by input */
+		/* Signaled by input or output with low occupancy count. */
 		callout_stop(&hpts->co);
+		counter_u64_add(hpts_direct_awakening, 1);
 	} else {
-		/* Timed out */
+		/* Timed out, the normal case. */
+		counter_u64_add(hpts_wake_timeout, 1);
 		if (callout_pending(&hpts->co) ||
 		    !callout_active(&hpts->co)) {
 			mtx_unlock(&hpts->p_mtx);
 			return;
 		}
-		callout_deactivate(&hpts->co);
 	}
+	callout_deactivate(&hpts->co);
 	hpts->p_hpts_wake_scheduled = 0;
-	hpts->p_hpts_active = 1;
 	NET_EPOCH_ENTER(et);
-	tcp_hptsi(hpts);
-	NET_EPOCH_EXIT(et);
-	HPTS_MTX_ASSERT(hpts);
+	if (hpts->p_hpts_active) {
+		/*
+		 * We are active already. This means that a syscall
+		 * trap or LRO is running on behalf of hpts. In that case
+		 * we need to double our timeout since there seems to be
+		 * enough activity in the system that we don't need to
+		 * run as often (if we were not directly woken).
+		 */
+		if (hpts->p_direct_wake == 0) {
+			counter_u64_add(hpts_back_tosleep, 1);
+			tv.tv_sec = 0;
+			if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+				hpts->p_mysleep.tv_usec *= 2;
+				if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+					hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+				tv.tv_usec = hpts->p_mysleep.tv_usec;
+				hpts->p_on_min_sleep = 1;
+			} else {
+				/*
+				 * Here we have low count on the wheel, but
+				 * somehow we still collided with one of the
+				 * connections. Let's go back to sleep for a
+				 * min sleep time, but clear the flag so we
+				 * can be awoken by insert.
+				 */
+				hpts->p_on_min_sleep = 0;
+				tv.tv_usec = tcp_min_hptsi_time;
+			}
+		} else {
+			/*
+			 * Directly woken most likely to reset the
+			 * callout time.
+			 */
+			tv.tv_sec = 0;
+			tv.tv_usec = hpts->p_mysleep.tv_usec;
+		}
+		goto back_to_sleep;
+	}
+	hpts->sleeping = 0;
+	hpts->p_hpts_active = 1;
+	ticks_ran = tcp_hptsi(hpts, 1);
 	tv.tv_sec = 0;
-	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
-	if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
-		hpts->overidden_sleep = tv.tv_usec;
-		tv.tv_usec = tcp_min_hptsi_time;
+	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+	if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+		if (hpts->p_direct_wake == 0) {
+			/*
+			 * Only adjust sleep time if we were
+			 * called from the callout, i.e. direct_wake == 0.
+			 */
+			if (ticks_ran < ticks_indicate_more_sleep) {
+				hpts->p_mysleep.tv_usec *= 2;
+				if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+					hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+			} else if (ticks_ran > ticks_indicate_less_sleep) {
+				hpts->p_mysleep.tv_usec /= 2;
+				if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
+					hpts->p_mysleep.tv_usec = dynamic_min_sleep;
+			}
+		}
+		if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
+			hpts->overidden_sleep = tv.tv_usec;
+			tv.tv_usec = hpts->p_mysleep.tv_usec;
+		} else if (tv.tv_usec > dynamic_max_sleep) {
+			/* Let's not let sleep get above this value */
+			hpts->overidden_sleep = tv.tv_usec;
+			tv.tv_usec = dynamic_max_sleep;
+		}
+		/*
+		 * In this mode the timer is a backstop to
+		 * all the userret/lro_flushes so we use
+		 * the dynamic value and set the on_min_sleep
+		 * flag so we will not be awoken.
+		 */
 		hpts->p_on_min_sleep = 1;
-	} else {
-		/* Clear the min sleep flag */
-		hpts->overidden_sleep = 0;
+	} else if (hpts->p_on_queue_cnt == 0) {
+		/*
+		 * No one on the wheel, please wake us up
+		 * if you insert on the wheel.
+		 */
 		hpts->p_on_min_sleep = 0;
-	}
-	hpts->p_hpts_active = 0;
-	sb = tvtosbt(tv);
-	if (tcp_hpts_callout_skip_swi == 0) {
-		callout_reset_sbt_on(&hpts->co, sb, 0,
-		    hpts_timeout_swi, hpts, hpts->p_cpu,
-		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+		hpts->overidden_sleep = 0;
 	} else {
-		callout_reset_sbt_on(&hpts->co, sb, 0,
-		    hpts_timeout_dir, hpts,
-		    hpts->p_cpu,
-		    C_PREL(tcp_hpts_precision));
+		/*
+		 * We hit here when we have a low number of
+		 * clients on the wheel (our else clause).
+		 * We may need to go on min sleep, if we set
+		 * the flag we will not be awoken if someone
+		 * is inserted ahead of us. Clearing the flag
+		 * means we can be awoken. This is "old mode"
+		 * where the timer is what runs hpts mainly.
+		 */
+		if (tv.tv_usec < tcp_min_hptsi_time) {
+			/*
+			 * Yes on min sleep, which means
+			 * we cannot be awoken.
+			 */
+			hpts->overidden_sleep = tv.tv_usec;
+			tv.tv_usec = tcp_min_hptsi_time;
+			hpts->p_on_min_sleep = 1;
+		} else {
+			/* Clear the min sleep flag */
+			hpts->overidden_sleep = 0;
+			hpts->p_on_min_sleep = 0;
+		}
 	}
+	HPTS_MTX_ASSERT(hpts);
+	hpts->p_hpts_active = 0;
+back_to_sleep:
 	hpts->p_direct_wake = 0;
+	sb = tvtosbt(tv);
+	cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+	/* Store off to make visible the actual sleep time */
+	hpts->sleeping = tv.tv_usec;
+	callout_reset_sbt_on(&hpts->co, sb, 0,
+	    hpts_timeout_swi, hpts, cpu,
+	    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+	NET_EPOCH_EXIT(et);
 	mtx_unlock(&hpts->p_mtx);
 }
@@ -1873,7 +2126,7 @@
 	cpuset_t cs;
 	char unit[16];
 	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
-	int count, domain;
+	int count, domain, cpu;
 
 	tcp_pace.rp_proc = NULL;
 	tcp_pace.rp_num_hptss = ncpus;
@@ -1882,8 +2135,18 @@
 	back_tosleep = counter_u64_alloc(M_WAITOK);
 	combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
 	wheel_wrap = counter_u64_alloc(M_WAITOK);
+	hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
+	hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
+	hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
+	hpts_direct_call = counter_u64_alloc(M_WAITOK);
+	cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
+	cpu_uses_random = counter_u64_alloc(M_WAITOK);
+
+	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
 	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+	sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
+	cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
 	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
 	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
@@ -1933,19 +2196,41 @@
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "runtick", CTLFLAG_RD,
-		    &hpts->p_runningtick, 0,
+		    &hpts->p_runningslot, 0,
 		    "What the running pacers current slot is");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "curtick", CTLFLAG_RD,
 		    &hpts->p_curtick, 0,
 		    "What the running pacers last tick mapped to the wheel was");
+		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+		    SYSCTL_CHILDREN(hpts->hpts_root),
+		    OID_AUTO, "lastran", CTLFLAG_RD,
+		    &cts_last_ran[i], 0,
+		    "The last usec tick that this hpts ran");
+		SYSCTL_ADD_U64(&hpts->hpts_ctx,
+		    SYSCTL_CHILDREN(hpts->hpts_root),
+		    OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
+		    &hpts->p_mysleep.tv_usec, 0,
+		    "What the running pacer is using for p_mysleep.tv_usec");
+		SYSCTL_ADD_U64(&hpts->hpts_ctx,
+		    SYSCTL_CHILDREN(hpts->hpts_root),
+		    OID_AUTO, "now_sleeping", CTLFLAG_RD,
+		    &hpts->sleeping, 0,
+		    "What the running pacer is actually sleeping for");
+		SYSCTL_ADD_U64(&hpts->hpts_ctx,
+		    SYSCTL_CHILDREN(hpts->hpts_root),
+		    OID_AUTO, "syscall_cnt", CTLFLAG_RD,
+		    &hpts->syscall_cnt, 0,
+		    "How many times we had syscalls on this hpts");
+
 		hpts->p_hpts_sleep_time = hpts_sleep_max;
 		hpts->p_num = i;
 		hpts->p_curtick = tcp_gethptstick(&tv);
+		cts_last_ran[i] = tcp_tv_to_usectick(&tv);
 		hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
 		hpts->p_cpu = 0xffff;
-		hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
+		hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
 		callout_init(&hpts->co, 1);
 	}
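Each pacer's starting position above comes from turning wall time into 10-usec hpts ticks and then mapping the tick count onto the wheel. A userland sketch of that mapping, assuming a simple modulo placement for tick_to_wheel(); WHEEL_SLOTS is an illustrative size, not the kernel's NUM_OF_HPTSI_SLOTS:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/time.h>

    #define WHEEL_SLOTS 2048 /* illustrative wheel size */

    /* One hpts tick is 10 us, so each second contributes 100000 ticks. */
    static uint32_t
    tv_to_ticks(const struct timeval *tv)
    {
        return ((uint32_t)(tv->tv_sec * 100000 + tv->tv_usec / 10));
    }

    /* Assumed modulo mapping of a tick count onto a wheel slot. */
    static uint32_t
    tick_to_slot(uint32_t tick)
    {
        return (tick % WHEEL_SLOTS);
    }

    int
    main(void)
    {
        struct timeval tv = { .tv_sec = 1, .tv_usec = 250 };
        uint32_t tick = tv_to_ticks(&tv);

        printf("tick %u -> slot %u\n", tick, tick_to_slot(tick));
        return (0);
    }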
@@ -1956,17 +2241,18 @@
 	/*
 	 * Now lets start ithreads to handle the hptss.
 	 */
-	CPU_FOREACH(i) {
+	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 		hpts = tcp_pace.rp_ent[i];
 		hpts->p_cpu = i;
 		error = swi_add(&hpts->ie, "hpts",
 		    tcp_hpts_thread, (void *)hpts,
 		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
-		if (error) {
-			panic("Can't add hpts:%p i:%d err:%d",
-			    hpts, i, error);
-		}
+		KASSERT(error == 0,
+		    ("Can't add hpts:%p i:%d err:%d",
+		    hpts, i, error));
 		created++;
+		hpts->p_mysleep.tv_sec = 0;
+		hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
 		if (tcp_bind_threads == 1) {
 			if (intr_event_bind(hpts->ie, i) == 0)
 				bound++;
@@ -1983,18 +2269,13 @@
 			}
 		}
 		tv.tv_sec = 0;
-		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
+		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+		hpts->sleeping = tv.tv_usec;
 		sb = tvtosbt(tv);
-		if (tcp_hpts_callout_skip_swi == 0) {
-			callout_reset_sbt_on(&hpts->co, sb, 0,
-			    hpts_timeout_swi, hpts, hpts->p_cpu,
-			    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
-		} else {
-			callout_reset_sbt_on(&hpts->co, sb, 0,
-			    hpts_timeout_dir, hpts,
-			    hpts->p_cpu,
-			    C_PREL(tcp_hpts_precision));
-		}
+		cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+		callout_reset_sbt_on(&hpts->co, sb, 0,
+		    hpts_timeout_swi, hpts, cpu,
+		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 	}
 	/*
 	 * If we somehow have an empty domain, fall back to choosing
@@ -2006,11 +2287,13 @@
 			break;
 		}
 	}
-	printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n", created, bound, tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
+#ifdef INVARIANTS
+	printf("HPTS is in INVARIANT mode!!\n");
+#endif
 }
 
-SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
+SYSINIT(tcphptsi, SI_SUB_SOFTINTR, SI_ORDER_ANY, tcp_init_hptsi, NULL);
 
 MODULE_VERSION(tcphpts, 1);
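The initial callout arming above converts a sleep expressed in wheel slots directly into tv_usec, which is safe because the sleep ceiling keeps the product well under one second. A normalized version of the conversion for reference; slots_to_tv() and SLOT_USECS are illustrative names, with SLOT_USECS mirroring HPTS_TICKS_PER_SLOT:

    #include <stdio.h>
    #include <sys/time.h>

    #define SLOT_USECS 10 /* one wheel slot is 10 us */

    /* Convert a sleep expressed in wheel slots into a timeval. */
    static struct timeval
    slots_to_tv(unsigned int slots)
    {
        struct timeval tv;
        unsigned long us = (unsigned long)slots * SLOT_USECS;

        tv.tv_sec = us / 1000000;
        tv.tv_usec = us % 1000000;
        return (tv);
    }

    int
    main(void)
    {
        struct timeval tv = slots_to_tv(25600); /* 25600 slots = 256 ms */

        printf("%ld.%06lds\n", (long)tv.tv_sec, (long)tv.tv_usec);
        return (0);
    }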
Index: sys/netinet/tcp_lro.h
===================================================================
--- sys/netinet/tcp_lro.h
+++ sys/netinet/tcp_lro.h
@@ -56,6 +56,11 @@
 #define TSTMP_LRO	0x0100
 #define TSTMP_HDWR	0x0200
 #define HAS_TSTMP	0x0400
+/*
+ * Default number of interrupts on the same cpu in a row
+ * that will cause us to declare an "affinity" cpu.
+ */
+#define TCP_LRO_CPU_DECLARATION_THRESH 50
 
 struct inpcb;
 
@@ -162,12 +167,15 @@
 	unsigned	lro_mbuf_count;
 	unsigned	lro_mbuf_max;
 	unsigned short	lro_ackcnt_lim;	/* max # of aggregated ACKs */
+	unsigned short	lro_cpu;	/* Guess at the cpu we have affinity to */
 	unsigned	lro_length_lim;	/* max len of aggregated data */
 
-	u_long		lro_hashsz;
+	uint32_t	lro_last_cpu;
+	uint32_t	lro_cnt_of_same_cpu;
 	struct lro_head	*lro_hash;
 	struct lro_head	lro_active;
 	struct lro_head	lro_free;
+	uint8_t		lro_cpu_is_set;	/* Flag to say it's ok to set the CPU on the inp */
 };
 
 struct tcp_ackent {
Index: sys/netinet/tcp_lro.c
===================================================================
--- sys/netinet/tcp_lro.c
+++ sys/netinet/tcp_lro.c
@@ -107,6 +107,11 @@
     CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
     "default number of LRO entries");
 
+static uint32_t tcp_lro_cpu_set_thresh = TCP_LRO_CPU_DECLARATION_THRESH;
+SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_cpu_threshold,
+    CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_cpu_set_thresh, 0,
+    "Number of interrupts in a row on the same CPU that will make us declare an 'affinity' cpu");
+
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
     &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
@@ -631,12 +636,13 @@
 		log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
 	}
 	log.u_bbr.inflight = th_seq;
+	log.u_bbr.delivered = th_ack;
 	log.u_bbr.timeStamp = cts;
 	log.u_bbr.epoch = le->next_seq;
-	log.u_bbr.delivered = th_ack;
 	log.u_bbr.lt_epoch = le->ack_seq;
 	log.u_bbr.pacing_gain = th_win;
 	log.u_bbr.cwnd_gain = le->window;
+	log.u_bbr.lost = curcpu;
 	log.u_bbr.cur_del_rate = (uintptr_t)m;
 	log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
 	bintime2timeval(&lc->lro_last_queue_time, &btv);
@@ -1273,7 +1279,10 @@
 		INP_WUNLOCK(inp);
 		return (TCP_LRO_CANNOT);
 	}
-
+	if ((inp->inp_irq_cpu_set == 0) && (lc->lro_cpu_is_set == 1)) {
+		inp->inp_irq_cpu = lc->lro_last_cpu;
+		inp->inp_irq_cpu_set = 1;
+	}
 	/* Check if the transport doesn't support the needed optimizations. */
 	if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
 		INP_WUNLOCK(inp);
@@ -1445,7 +1454,17 @@
 	/* check if no mbufs to flush */
 	if (lc->lro_mbuf_count == 0)
 		goto done;
-
+	if (lc->lro_cpu_is_set == 0) {
+		if (lc->lro_last_cpu == curcpu) {
+			lc->lro_cnt_of_same_cpu++;
+			/* Have we reached the threshold to declare a cpu? */
+			if (lc->lro_cnt_of_same_cpu > tcp_lro_cpu_set_thresh)
+				lc->lro_cpu_is_set = 1;
+		} else {
+			lc->lro_last_cpu = curcpu;
+			lc->lro_cnt_of_same_cpu = 0;
+		}
+	}
 	CURVNET_SET(lc->ifp->if_vnet);
 
 	/* get current time */
@@ -1486,6 +1505,9 @@
 	/* flush active streams */
 	tcp_lro_rx_done(lc);
 
+#ifdef TCPHPTS
+	tcp_run_hpts();
+#endif
 	lc->lro_mbuf_count = 0;
 }
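The flush-side logic above declares an affinity CPU only after a run of consecutive flushes lands on the same CPU; any bounce resets the count. The same state machine in standalone form; struct affinity and note_flush_cpu() are illustrative names, with CPU_DECL_THRESH mirroring TCP_LRO_CPU_DECLARATION_THRESH:

    #include <stdint.h>
    #include <stdio.h>

    #define CPU_DECL_THRESH 50 /* mirrors TCP_LRO_CPU_DECLARATION_THRESH */

    struct affinity {
        uint32_t last_cpu;     /* cf. lro_last_cpu */
        uint32_t same_cpu_cnt; /* cf. lro_cnt_of_same_cpu */
        uint8_t  cpu_is_set;   /* cf. lro_cpu_is_set */
    };

    /* Run at each flush with the current CPU; a bounce resets the count. */
    static void
    note_flush_cpu(struct affinity *a, uint32_t cpu)
    {
        if (a->cpu_is_set)
            return;
        if (a->last_cpu == cpu) {
            a->same_cpu_cnt++;
            if (a->same_cpu_cnt > CPU_DECL_THRESH)
                a->cpu_is_set = 1;
        } else {
            a->last_cpu = cpu;
            a->same_cpu_cnt = 0;
        }
    }

    int
    main(void)
    {
        struct affinity a = { 0, 0, 0 };

        for (int i = 0; i < 60; i++)
            note_flush_cpu(&a, 3);
        printf("cpu %u set %u\n", a.last_cpu, a.cpu_is_set);
        return (0);
    }

Once declared, the guess is copied onto the inpcb (inp_irq_cpu) so hpts can schedule the connection near its interrupt CPU.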
Index: sys/netinet/tcp_stacks/bbr.c
===================================================================
--- sys/netinet/tcp_stacks/bbr.c
+++ sys/netinet/tcp_stacks/bbr.c
@@ -2429,10 +2429,10 @@
 	log.u_bbr.pkts_out = diag->co_ret;
 	log.u_bbr.applimited = diag->hpts_sleep_time;
 	log.u_bbr.delivered = diag->p_prev_slot;
-	log.u_bbr.inflight = diag->p_runningtick;
-	log.u_bbr.bw_inuse = diag->wheel_tick;
+	log.u_bbr.inflight = diag->p_runningslot;
+	log.u_bbr.bw_inuse = diag->wheel_slot;
 	log.u_bbr.rttProp = diag->wheel_cts;
-	log.u_bbr.delRate = diag->maxticks;
+	log.u_bbr.delRate = diag->maxslots;
 	log.u_bbr.cur_del_rate = diag->p_curtick;
 	log.u_bbr.cur_del_rate <<= 32;
 	log.u_bbr.cur_del_rate |= diag->p_lasttick;
Index: sys/netinet/tcp_stacks/rack.c
===================================================================
--- sys/netinet/tcp_stacks/rack.c
+++ sys/netinet/tcp_stacks/rack.c
@@ -5609,11 +5609,11 @@
 	log.u_bbr.pkts_out = diag->co_ret;
 	log.u_bbr.applimited = diag->hpts_sleep_time;
 	log.u_bbr.delivered = diag->p_prev_slot;
-	log.u_bbr.inflight = diag->p_runningtick;
-	log.u_bbr.bw_inuse = diag->wheel_tick;
+	log.u_bbr.inflight = diag->p_runningslot;
+	log.u_bbr.bw_inuse = diag->wheel_slot;
 	log.u_bbr.rttProp = diag->wheel_cts;
 	log.u_bbr.timeStamp = cts;
-	log.u_bbr.delRate = diag->maxticks;
+	log.u_bbr.delRate = diag->maxslots;
 	log.u_bbr.cur_del_rate = diag->p_curtick;
 	log.u_bbr.cur_del_rate <<= 32;
 	log.u_bbr.cur_del_rate |= diag->p_lasttick;
@@ -5707,22 +5707,22 @@
 			 * on the clock. We always have a min
 			 * 10 slots (10 x 10 i.e. 100 usecs).
 			 */
-			if (slot <= HPTS_TICKS_PER_USEC) {
+			if (slot <= HPTS_TICKS_PER_SLOT) {
 				/* We gain delay */
-				rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot);
-				slot = HPTS_TICKS_PER_USEC;
+				rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
+				slot = HPTS_TICKS_PER_SLOT;
 			} else {
 				/* We take off some */
-				rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC);
-				slot = HPTS_TICKS_PER_USEC;
+				rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
+				slot = HPTS_TICKS_PER_SLOT;
 			}
 		} else {
 			slot -= rack->r_ctl.rc_agg_delayed;
 			rack->r_ctl.rc_agg_delayed = 0;
 			/* Make sure we have 100 useconds at minimum */
-			if (slot < HPTS_TICKS_PER_USEC) {
-				rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot;
-				slot = HPTS_TICKS_PER_USEC;
+			if (slot < HPTS_TICKS_PER_SLOT) {
+				rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
+				slot = HPTS_TICKS_PER_SLOT;
 			}
 			if (rack->r_ctl.rc_agg_delayed == 0)
 				rack->r_late = 0;
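A standalone illustration of the banked-delay arithmetic in the rack.c hunk above (the second branch, where accumulated lateness is paid down out of the requested slot and whatever the slot floor forces the connection to keep is banked again); pay_down() and MIN_SLOT are illustrative names, with MIN_SLOT mirroring HPTS_TICKS_PER_SLOT:

    #include <stdint.h>
    #include <stdio.h>

    #define MIN_SLOT 10 /* one wheel slot, in usecs */

    /* Subtract banked delay from a requested slot, enforcing the floor. */
    static uint32_t
    pay_down(uint32_t slot, uint32_t *agg_delayed)
    {
        slot -= *agg_delayed;
        *agg_delayed = 0;
        if (slot < MIN_SLOT) {
            /* Bank what the floor forces us to keep. */
            *agg_delayed = MIN_SLOT - slot;
            slot = MIN_SLOT;
        }
        return (slot);
    }

    int
    main(void)
    {
        uint32_t agg = 4, slot;

        slot = pay_down(12, &agg); /* 12 - 4 = 8, floored to 10; agg = 2 */
        printf("slot %u agg %u\n", slot, agg);
        return (0);
    }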