D31083.id.diff
Index: sys/kern/subr_trap.c
===================================================================
--- sys/kern/subr_trap.c
+++ sys/kern/subr_trap.c
@@ -140,6 +140,16 @@
#ifdef HWPMC_HOOKS
if (PMC_THREAD_HAS_SAMPLES(td))
PMC_CALL_HOOK(td, PMC_FN_THR_USERRET, NULL);
+#endif
+#ifdef TCPHPTS
+ /*
+ * @gallatin is adamant that this needs to go here, I
+ * am not so sure. Running hpts is a lot like
+ * a lro_flush() that happens while a user process
+ * is running. But he may know best so I will go
+ * with his view of accounting. :-)
+ */
+ tcp_run_hpts();
#endif
/*
* Let the scheduler adjust our priority etc.
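The hunk above is the entire integration point on the syscall path: every return to userspace gives the pacer a chance to run before the thread leaves the kernel, which turns the periodic callout into a backstop rather than the driver. A rough userland sketch of that shape, with a stub in place of the real pacer (only tcp_run_hpts is a symbol this diff actually adds; everything else is scaffolding):

#include <stdio.h>

/* Stub standing in for the pacer entry point this diff adds. */
static void tcp_run_hpts(void) { printf("hpts: opportunistic run\n"); }

/* Sketch of the userret() shape: hpts piggybacks on syscall return. */
static void userret_sketch(void)
{
#ifdef TCPHPTS
	tcp_run_hpts();
#endif
	/* ... scheduler priority adjustment etc. follows ... */
}

int main(void) { userret_sketch(); return (0); }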
Index: sys/netinet/in_pcb.h
===================================================================
--- sys/netinet/in_pcb.h
+++ sys/netinet/in_pcb.h
@@ -258,6 +258,7 @@
volatile uint32_t inp_in_input; /* on input hpts (lock b) */
#endif
volatile uint16_t inp_hpts_cpu; /* Lock (i) */
+ volatile uint16_t inp_irq_cpu; /* Set by LRO on behalf of, or by, the driver */
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
@@ -266,7 +267,8 @@
inp_input_cpu_set : 1, /* on input hpts (i) */
inp_hpts_calls :1, /* (i) from output hpts */
inp_input_calls :1, /* (i) from input hpts */
- inp_spare_bits2 : 4;
+ inp_irq_cpu_set :1, /* (i) from LRO/Driver */
+ inp_spare_bits2 : 3;
uint8_t inp_numa_domain; /* numa domain */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct socket *inp_socket; /* (i) back pointer to socket */
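The two new inpcb fields let the receive path record which CPU the NIC interrupt ran on, so hpts_cpuid() can later pin pacing there. A minimal sketch of the producer side, assuming a driver/LRO receive routine; the two field names come from this diff, while the cut-down struct, the helper, and how the cpu id is obtained are all illustrative:

/* Cut-down stand-in for struct inpcb; only the field names are real. */
struct inpcb_sketch {
	volatile unsigned short inp_irq_cpu;	/* irq cpu recorded for hpts */
	unsigned int inp_irq_cpu_set : 1;	/* recorded yet? */
};

/* Hypothetical driver/LRO helper: remember the interrupt CPU once. */
static void
lro_record_irq_cpu(struct inpcb_sketch *inp, unsigned short cpu)
{
	if (inp->inp_irq_cpu_set == 0) {
		inp->inp_irq_cpu = cpu;
		inp->inp_irq_cpu_set = 1;
	}
}

int main(void)
{
	struct inpcb_sketch inp = { 0, 0 };

	lro_record_irq_cpu(&inp, 3);	/* e.g. RX queue 3's interrupt CPU */
	return (!(inp.inp_irq_cpu == 3 && inp.inp_irq_cpu_set == 1));
}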
Index: sys/netinet/tcp_hpts.h
===================================================================
--- sys/netinet/tcp_hpts.h
+++ sys/netinet/tcp_hpts.h
@@ -44,7 +44,7 @@
TAILQ_HEAD(hptsh, inpcb);
/* Number of useconds in a hpts tick */
-#define HPTS_TICKS_PER_USEC 10
+#define HPTS_TICKS_PER_SLOT 10
#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
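The rename above is mechanical, but the arithmetic is worth pinning down: a slot is 10 usec, HPTS_MS_TO_SLOTS(x) yields the slots in x milliseconds plus one guard slot, and HPTS_USEC_TO_SLOTS rounds microseconds up to whole slots. A small self-check using only the definitions above:

#include <assert.h>

#define HPTS_TICKS_PER_SLOT 10
#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)

int main(void)
{
	assert(HPTS_MS_TO_SLOTS(1) == 101);	/* 1ms = 100 slots + 1 guard */
	assert(HPTS_USEC_TO_SLOTS(25) == 3);	/* 25us rounds up to 3 slots */
	assert(HPTS_USEC_TO_SLOTS(30) == 3);	/* exact multiples don't over-round */
	return (0);
}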
@@ -56,7 +56,7 @@
uint32_t p_nxt_slot; /* bbr->flex1 x */
uint32_t p_cur_slot; /* bbr->flex2 x */
uint32_t p_prev_slot; /* bbr->delivered */
- uint32_t p_runningtick; /* bbr->inflight */
+ uint32_t p_runningslot; /* bbr->inflight */
uint32_t slot_req; /* bbr->flex3 x */
uint32_t inp_hptsslot; /* bbr->flex4 x */
uint32_t slot_remaining; /* bbr->flex5 x */
@@ -64,8 +64,8 @@
uint32_t hpts_sleep_time; /* bbr->applimited x */
uint32_t yet_to_sleep; /* bbr->lt_epoch x */
uint32_t need_new_to; /* bbr->flex6 x */
- uint32_t wheel_tick; /* bbr->bw_inuse x */
- uint32_t maxticks; /* bbr->delRate x */
+ uint32_t wheel_slot; /* bbr->bw_inuse x */
+ uint32_t maxslots; /* bbr->delRate x */
uint32_t wheel_cts; /* bbr->rttProp x */
int32_t co_ret; /* bbr->pkts_out x */
uint32_t p_curtick; /* upper bbr->cur_del_rate */
@@ -83,16 +83,20 @@
#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */
#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
+#define DEFAULT_CONNECTION_THESHOLD 100
+
#ifdef _KERNEL
/* Each hpts has its own p_mtx which is used for locking */
struct tcp_hpts_entry {
/* Cache line 0x00 */
struct mtx p_mtx; /* Mutex for hpts */
+ struct timeval p_mysleep; /* Our min sleep time */
+ uint64_t syscall_cnt;
+ uint64_t sleeping; /* What the actual sleep was (if sleeping) */
uint16_t p_hpts_active; /* Flag that says hpts is awake */
- uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */
uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
- uint32_t p_runningtick; /* Current tick we are at if we are running */
+ uint32_t p_runningslot; /* Current slot we are at if we are running */
uint32_t p_prev_slot; /* Previous slot we were on */
uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
uint32_t p_nxt_slot; /* The next slot outside the current range of
@@ -101,7 +105,8 @@
uint32_t p_lasttick; /* Last tick before the current one */
uint8_t p_direct_wake :1, /* boolean */
p_on_min_sleep:1, /* boolean */
- p_avail:6;
+ p_hpts_wake_scheduled:1, /* boolean */
+ p_avail:5;
uint8_t p_fill[3]; /* Fill to 32 bits */
/* Cache line 0x40 */
void *p_inp;
@@ -109,8 +114,6 @@
/* Hptsi wheel */
struct hptsh *p_hptss;
int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
- uint32_t hit_no_enobuf;
- uint32_t p_dyn_adjust;
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
* of 255ms */
uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */
@@ -134,6 +137,7 @@
struct tcp_hptsi {
struct proc *rp_proc; /* Process structure for hpts */
struct tcp_hpts_entry **rp_ent; /* Array of hptss */
+ uint32_t *cts_last_ran;
uint32_t rp_num_hptss; /* Number of hpts threads */
};
@@ -155,10 +159,37 @@
* be sent when a TCB is still around must be
* sent from a routine like tcp_respond().
*/
+#define LOWEST_SLEEP_ALLOWED 50
#define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep
* this determines min granularity of the
- * hpts. If 0, granularity is 10useconds at
- * the cost of more CPU (context switching). */
+ * hpts. If 1, granularity is 10useconds at
+ * the cost of more CPU (context switching).
+ * Note: do not set this to 0.
+ */
+#define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP
+#define DYNAMIC_MAX_SLEEP 100000 /* 100ms */
+/* Number of connections at which we start aligning to the cpu from syscalls */
+#define OLDEST_THRESHOLD 1200
+/* Thresholds for raising/lowering sleep */
+#define TICKS_INDICATE_MORE_SLEEP 100 /* This would be 1ms */
+#define TICKS_INDICATE_LESS_SLEEP 1000 /* This would indicate 10ms */
+/**
+ *
+ * Dynamic adjustment of sleeping times is done in "new" mode
+ * where we are depending on syscall returns and lro returns
+ * to push hpts forward mainly and the timer is only a backstop.
+ *
+ * When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh
+ * then we do a dynamic adjustment on the time we sleep.
+ * Our measure is the lateness of the first client served (in
+ * ticks): if a run covers more than ticks_indicate_less_sleep
+ * (1000 ticks or 10ms), the actual sleep time is cut in half;
+ * if it covers fewer than ticks_indicate_more_sleep (100 ticks
+ * or 1ms), the sleep time is doubled instead.
+ *
+ */
+
+
#ifdef _KERNEL
#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp);
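The sleep-tuning constants above feed the policy implemented later in __tcp_run_hpts(): a multiplicative backoff on the backstop timer, clamped between the dynamic min and max. A userland sketch of just that policy; the kernel plumbing (p_mysleep storage, callout rescheduling) is omitted:

#include <assert.h>

#define DYNAMIC_MIN_SLEEP 250		/* usec */
#define DYNAMIC_MAX_SLEEP 100000	/* usec */
#define TICKS_INDICATE_MORE_SLEEP 100	/* ran few slots: sleep longer */
#define TICKS_INDICATE_LESS_SLEEP 1000	/* ran many slots: sleep less */

/* Sketch of the adjustment __tcp_run_hpts() applies after each run. */
static long
adjust_sleep_us(long cur_sleep_us, int ticks_ran)
{
	if (ticks_ran > TICKS_INDICATE_LESS_SLEEP) {
		/* We ran long (we were late): halve, clamp to the floor. */
		cur_sleep_us /= 2;
		if (cur_sleep_us < DYNAMIC_MIN_SLEEP)
			cur_sleep_us = DYNAMIC_MIN_SLEEP;
	} else if (ticks_ran < TICKS_INDICATE_MORE_SLEEP) {
		/* Little work found: double, clamp to the ceiling. */
		cur_sleep_us *= 2;
		if (cur_sleep_us > DYNAMIC_MAX_SLEEP)
			cur_sleep_us = DYNAMIC_MAX_SLEEP;
	}
	return (cur_sleep_us);
}

int main(void)
{
	assert(adjust_sleep_us(250, 2000) == 250);	/* busy: pinned at floor */
	assert(adjust_sleep_us(800, 50) == 1600);	/* idle: doubled */
	return (0);
}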
@@ -215,43 +246,61 @@
void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
+void tcp_run_hpts(void);
+
+uint16_t hpts_random_cpu(struct inpcb *inp);
+
extern int32_t tcp_min_hptsi_time;
-static __inline uint32_t
-tcp_tv_to_hptstick(struct timeval *sv)
-{
- return ((sv->tv_sec * 100000) + (sv->tv_usec / 10));
-}
+#endif /* _KERNEL */
+/*
+ * The following functions should be available
+ * to userspace as well.
+ */
static __inline uint32_t
-tcp_gethptstick(struct timeval *sv)
+tcp_tv_to_hptstick(const struct timeval *sv)
{
- struct timeval tv;
-
- if (sv == NULL)
- sv = &tv;
- microuptime(sv);
- return (tcp_tv_to_hptstick(sv));
+ return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT));
}
static __inline uint32_t
-tcp_tv_to_usectick(struct timeval *sv)
+tcp_tv_to_usectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
}
static __inline uint32_t
-tcp_tv_to_mssectick(struct timeval *sv)
+tcp_tv_to_mssectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
}
+static __inline uint64_t
+tcp_tv_to_lusectick(const struct timeval *sv)
+{
+ return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
+}
+
+#ifdef _KERNEL
+
static __inline void
tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
{
mtx_unlock(&hpts->p_mtx);
}
+static __inline uint32_t
+tcp_gethptstick(struct timeval *sv)
+{
+ struct timeval tv;
+
+ if (sv == NULL)
+ sv = &tv;
+ microuptime(sv);
+ return (tcp_tv_to_hptstick(sv));
+}
+
static __inline uint32_t
tcp_get_usecs(struct timeval *tv)
{
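With the conversion helpers now visible outside _KERNEL, their semantics are easy to spot-check: tcp_tv_to_hptstick() counts 10-usec ticks, truncating any sub-tick remainder. The function below restates the inline above as a standalone test:

#include <assert.h>
#include <sys/time.h>

#define HPTS_TICKS_PER_SLOT 10

static unsigned int
tv_to_hptstick(const struct timeval *sv)
{
	return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT));
}

int main(void)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 123456 };

	/* 2s = 200000 ten-usec ticks; 123456us truncates to 12345 ticks. */
	assert(tv_to_hptstick(&tv) == 212345);
	return (0);
}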
Index: sys/netinet/tcp_hpts.c
===================================================================
--- sys/netinet/tcp_hpts.c
+++ sys/netinet/tcp_hpts.c
@@ -193,23 +193,29 @@
#else
static int tcp_bind_threads = 2;
#endif
-TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
-
+static int tcp_use_irq_cpu = 0;
static struct tcp_hptsi tcp_pace;
+static uint32_t *cts_last_ran;
static int hpts_does_tp_logging = 0;
+static int hpts_use_assigned_cpu = 1;
+static int32_t hpts_uses_oldest = OLDEST_THRESHOLD;
-static void tcp_wakehpts(struct tcp_hpts_entry *p);
-static void tcp_wakeinput(struct tcp_hpts_entry *p);
static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
-static void tcp_hptsi(struct tcp_hpts_entry *hpts);
+static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);
int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
-static int32_t tcp_hpts_callout_skip_swi = 0;
+static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD;
+static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
+static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP;
+
+
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"TCP Hpts controls");
+SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "TCP Hpts statistics");
#define timersub(tvp, uvp, vvp) \
do { \
@@ -230,44 +236,92 @@
struct hpts_domain_info hpts_domains[MAXMEMDOM];
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
- &tcp_hpts_precision, 120,
- "Value for PRE() precision of callout");
-
counter_u64_t hpts_hopelessly_behind;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
&hpts_hopelessly_behind,
"Number of times hpts could not catch up and was behind hopelessly");
counter_u64_t hpts_loops;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, loops, CTLFLAG_RD,
&hpts_loops, "Number of times hpts had to loop to catch up");
counter_u64_t back_tosleep;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
&back_tosleep, "Number of times hpts found no tcbs");
counter_u64_t combined_wheel_wrap;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
&combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
counter_u64_t wheel_wrap;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, wheel_wrap, CTLFLAG_RD,
&wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
-static int32_t out_ts_percision = 0;
+counter_u64_t hpts_direct_call;
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_call, CTLFLAG_RD,
+ &hpts_direct_call, "Number of times hpts was called by syscall/trap or other entry");
+
+counter_u64_t hpts_wake_timeout;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, timeout_wakeup, CTLFLAG_RD,
+ &hpts_wake_timeout, "Number of times hpts threads woke up via the callout expiring");
+
+counter_u64_t hpts_direct_awakening;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_awakening, CTLFLAG_RD,
+ &hpts_direct_awakening, "Number of times hpts threads were awoken via a direct wakeup");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
- &out_ts_percision, 0,
- "Do we use a percise timestamp for every output cts");
+counter_u64_t hpts_back_tosleep;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, back_tosleep, CTLFLAG_RD,
+ &hpts_back_tosleep, "Number of times hpts threads woke up via the callout expiring and went back to sleep, having found no work");
+
+counter_u64_t cpu_uses_flowid;
+counter_u64_t cpu_uses_random;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_flowid, CTLFLAG_RD,
+ &cpu_uses_flowid, "Number of times when setting cpuid we used the flowid field");
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_random, CTLFLAG_RD,
+ &cpu_uses_random, "Number of times when setting cpuid we used a random value");
+
+TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
+TUNABLE_INT("net.inet.tcp.use_irq", &tcp_use_irq_cpu);
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD,
+ &tcp_bind_threads, 2,
+ "Thread Binding tunable");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_irq, CTLFLAG_RD,
+ &tcp_use_irq_cpu, 0,
+ "Use of irq CPU tunable");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
+ &tcp_hpts_precision, 120,
+ "Value for PRE() precision of callout");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW,
+ &conn_cnt_thresh, 0,
+ "How many connections (below) make us use the callout based mechanism");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
&hpts_does_tp_logging, 0,
"Do we add to any tp that has logging on pacer logs");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_assigned_cpu, CTLFLAG_RW,
+ &hpts_use_assigned_cpu, 0,
+ "Do we start any hpts timer on the assigned cpu?");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_oldest, CTLFLAG_RW,
+ &hpts_uses_oldest, OLDEST_THRESHOLD,
+ "Do syscalls look for the hpts that has been the longest since running (or just use cpu no if 0)?");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW,
+ &dynamic_min_sleep, 250,
+ "What is the dynamic minsleep value?");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW,
+ &dynamic_max_sleep, 5000,
+ "What is the dynamic maxsleep value?");
+
static int32_t max_pacer_loops = 10;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
@@ -287,7 +341,7 @@
new = hpts_sleep_max;
error = sysctl_handle_int(oidp, &new, 0, req);
if (error == 0 && req->newptr) {
- if ((new < (NUM_OF_HPTSI_SLOTS / 4)) ||
+ if ((new < dynamic_min_sleep) ||
(new > HPTS_MAX_SLEEP_ALLOWED))
error = EINVAL;
else
@@ -296,26 +350,60 @@
return (error);
}
+static int
+sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ uint32_t new;
+
+ new = tcp_min_hptsi_time;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error == 0 && req->newptr) {
+ if (new < LOWEST_SLEEP_ALLOWED)
+ error = EINVAL;
+ else
+ tcp_min_hptsi_time = new;
+ }
+ return (error);
+}
+
SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&hpts_sleep_max, 0,
&sysctl_net_inet_tcp_hpts_max_sleep, "IU",
"Maximum time hpts will sleep");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
+SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep,
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&tcp_min_hptsi_time, 0,
+ &sysctl_net_inet_tcp_hpts_min_sleep, "IU",
"The minimum time the hpts must sleep before processing more slots");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
- &tcp_hpts_callout_skip_swi, 0,
- "Do we have the callout call directly to the hpts?");
+static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP;
+static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP;
+static int tcp_hpts_no_wake_over_thresh = 1;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW,
+ &ticks_indicate_more_sleep, 0,
+ "If we only process this many or less on a timeout, we need longer sleep on the next callout");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW,
+ &ticks_indicate_less_sleep, 0,
+ "If we process this many or more on a timeout, we need less sleep on the next callout");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
+ &tcp_hpts_no_wake_over_thresh, 0,
+ "When we are over the threshold on the pacer do we prohibit wakeups?");
static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
- int ticks_to_run, int idx)
+ int slots_to_run, int idx, int from_callout)
{
union tcp_log_stackspecific log;
-
+ /*
+ * Unused logs are
+ * 64 bit - delRate, rttProp, bw_inuse
+ * 16 bit - cwnd_gain
+ * 8 bit - bbr_state, bbr_substate, inhpts, ininput;
+ */
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.flex1 = hpts->p_nxt_slot;
log.u_bbr.flex2 = hpts->p_cur_slot;
@@ -323,8 +411,9 @@
log.u_bbr.flex4 = idx;
log.u_bbr.flex5 = hpts->p_curtick;
log.u_bbr.flex6 = hpts->p_on_queue_cnt;
- log.u_bbr.use_lt_bw = 1;
- log.u_bbr.inflight = ticks_to_run;
+ log.u_bbr.flex7 = hpts->p_cpu;
+ log.u_bbr.flex8 = (uint8_t)from_callout;
+ log.u_bbr.inflight = slots_to_run;
log.u_bbr.applimited = hpts->overidden_sleep;
log.u_bbr.delivered = hpts->saved_curtick;
log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
@@ -332,7 +421,9 @@
log.u_bbr.lt_epoch = hpts->saved_prev_slot;
log.u_bbr.pkts_out = hpts->p_delayed_by;
log.u_bbr.lost = hpts->p_hpts_sleep_time;
- log.u_bbr.cur_del_rate = hpts->p_runningtick;
+ log.u_bbr.pacing_gain = hpts->p_cpu;
+ log.u_bbr.pkt_epoch = hpts->p_runningslot;
+ log.u_bbr.use_lt_bw = 1;
TCP_LOG_EVENTP(tp, NULL,
&tp->t_inpcb->inp_socket->so_rcv,
&tp->t_inpcb->inp_socket->so_snd,
@@ -341,47 +432,40 @@
}
static void
-hpts_timeout_swi(void *arg)
+tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
- struct tcp_hpts_entry *hpts;
+ HPTS_MTX_ASSERT(hpts);
- hpts = (struct tcp_hpts_entry *)arg;
- swi_sched(hpts->ie_cookie, 0);
+ if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
+ hpts->p_direct_wake = 0;
+ return;
+ }
+ if (hpts->p_hpts_wake_scheduled == 0) {
+ hpts->p_hpts_wake_scheduled = 1;
+ swi_sched(hpts->ie_cookie, 0);
+ }
}
static void
-hpts_timeout_dir(void *arg)
+hpts_timeout_swi(void *arg)
{
- tcp_hpts_thread(arg);
+ struct tcp_hpts_entry *hpts;
+
+ hpts = (struct tcp_hpts_entry *)arg;
+ swi_sched(hpts->ie_cookie, 0);
}
static inline void
hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
{
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx) == 0) {
- /* We don't own the mutex? */
- panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
- }
- if (hpts->p_cpu != inp->inp_hpts_cpu) {
- /* It is not the right cpu/mutex? */
- panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
- }
- if (inp->inp_in_hpts == 0) {
- /* We are not on the hpts? */
- panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
- }
-#endif
+ HPTS_MTX_ASSERT(hpts);
+ KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
+ KASSERT(inp->inp_in_hpts != 0, ("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp));
TAILQ_REMOVE(head, inp, inp_hpts);
hpts->p_on_queue_cnt--;
- if (hpts->p_on_queue_cnt < 0) {
- /* Count should not go negative .. */
-#ifdef INVARIANTS
- panic("Hpts goes negative inp:%p hpts:%p",
- inp, hpts);
-#endif
- hpts->p_on_queue_cnt = 0;
- }
+ KASSERT(hpts->p_on_queue_cnt >= 0,
+ ("Hpts goes negative inp:%p hpts:%p",
+ inp, hpts));
if (clear) {
inp->inp_hpts_request = 0;
inp->inp_in_hpts = 0;
@@ -391,20 +475,13 @@
static inline void
hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
{
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx) == 0) {
- /* We don't own the mutex? */
- panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
- }
- if (hpts->p_cpu != inp->inp_hpts_cpu) {
- /* It is not the right cpu/mutex? */
- panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
- }
- if ((noref == 0) && (inp->inp_in_hpts == 1)) {
- /* We are already on the hpts? */
- panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
- }
-#endif
+ HPTS_MTX_ASSERT(hpts);
+ KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
+ ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
+ KASSERT(((noref == 1) && (inp->inp_in_hpts == 1)) ||
+ ((noref == 0) && (inp->inp_in_hpts == 0)),
+ ("%s: hpts:%p inp:%p already on the hpts?",
+ __FUNCTION__, hpts, inp));
TAILQ_INSERT_TAIL(head, inp, inp_hpts);
inp->inp_in_hpts = 1;
hpts->p_on_queue_cnt++;
@@ -416,37 +493,20 @@
static inline void
hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
{
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx) == 0) {
- /* We don't own the mutex? */
- panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
- }
- if (hpts->p_cpu != inp->inp_input_cpu) {
- /* It is not the right cpu/mutex? */
- panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
- }
- if (inp->inp_in_input == 0) {
- /* We are not on the input hpts? */
- panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
- }
-#endif
+ HPTS_MTX_ASSERT(hpts);
+ KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
+ ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
+ KASSERT(inp->inp_in_input != 0,
+ ("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp));
TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
hpts->p_on_inqueue_cnt--;
- if (hpts->p_on_inqueue_cnt < 0) {
-#ifdef INVARIANTS
- panic("Hpts in goes negative inp:%p hpts:%p",
- inp, hpts);
-#endif
- hpts->p_on_inqueue_cnt = 0;
- }
-#ifdef INVARIANTS
- if (TAILQ_EMPTY(&hpts->p_input) &&
- (hpts->p_on_inqueue_cnt != 0)) {
- /* We should not be empty with a queue count */
- panic("%s hpts:%p in_hpts input empty but cnt:%d",
- __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
- }
-#endif
+ KASSERT(hpts->p_on_inqueue_cnt >= 0,
+ ("Hpts in goes negative inp:%p hpts:%p",
+ inp, hpts));
+ KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
+ ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
+ ("%s hpts:%p input cnt (p_on_inqueue):%d and queue state mismatch",
+ __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
if (clear)
inp->inp_in_input = 0;
}
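A pattern this diff repeats throughout the file: open-coded #ifdef INVARIANTS / panic() blocks collapse into single KASSERT() calls, which compile to nothing in non-INVARIANTS kernels. A userland approximation of the macro, only to show the shape (this stub is not the kernel's definition):

#include <stdio.h>
#include <stdlib.h>

#ifdef INVARIANTS
#define KASSERT(exp, msg) do { if (!(exp)) { printf msg; abort(); } } while (0)
#else
#define KASSERT(exp, msg) do { } while (0)	/* vanishes in normal builds */
#endif

int main(void)
{
	int cnt = 1;

	cnt--;	/* a balanced remove: the count stays non-negative */
	/* One line replaces the old #ifdef INVARIANTS / panic() block. */
	KASSERT(cnt >= 0, ("count went negative: %d\n", cnt));
	return (0);
}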
@@ -454,46 +514,17 @@
static inline void
hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
{
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx) == 0) {
- /* We don't own the mutex? */
- panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
- }
- if (hpts->p_cpu != inp->inp_input_cpu) {
- /* It is not the right cpu/mutex? */
- panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
- }
- if (inp->inp_in_input == 1) {
- /* We are already on the input hpts? */
- panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
- }
-#endif
+ HPTS_MTX_ASSERT(hpts);
+ KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
+ ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
+ KASSERT(inp->inp_in_input == 0,
+ ("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp));
TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
inp->inp_in_input = 1;
hpts->p_on_inqueue_cnt++;
in_pcbref(inp);
}
-static void
-tcp_wakehpts(struct tcp_hpts_entry *hpts)
-{
- HPTS_MTX_ASSERT(hpts);
- if (hpts->p_hpts_wake_scheduled == 0) {
- hpts->p_hpts_wake_scheduled = 1;
- swi_sched(hpts->ie_cookie, 0);
- }
-}
-
-static void
-tcp_wakeinput(struct tcp_hpts_entry *hpts)
-{
- HPTS_MTX_ASSERT(hpts);
- if (hpts->p_hpts_wake_scheduled == 0) {
- hpts->p_hpts_wake_scheduled = 1;
- swi_sched(hpts->ie_cookie, 0);
- }
-}
-
struct tcp_hpts_entry *
tcp_cur_hpts(struct inpcb *inp)
{
@@ -514,12 +545,9 @@
again:
hpts_num = inp->inp_hpts_cpu;
hpts = tcp_pace.rp_ent[hpts_num];
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__));
mtx_lock(&hpts->p_mtx);
if (hpts_num != inp->inp_hpts_cpu) {
mtx_unlock(&hpts->p_mtx);
@@ -537,12 +565,9 @@
again:
hpts_num = inp->inp_input_cpu;
hpts = tcp_pace.rp_ent[hpts_num];
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__));
mtx_lock(&hpts->p_mtx);
if (hpts_num != inp->inp_input_cpu) {
mtx_unlock(&hpts->p_mtx);
@@ -555,6 +580,7 @@
tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
{
int32_t add_freed;
+ int32_t ret;
if (inp->inp_flags2 & INP_FREED) {
/*
@@ -567,26 +593,11 @@
add_freed = 0;
}
#ifndef INP_REF_DEBUG
- if (in_pcbrele_wlocked(inp)) {
- /*
- * This should not happen. We have the inpcb referred to by
- * the main socket (why we are called) and the hpts. It
- * should always return 0.
- */
- panic("inpcb:%p release ret 1",
- inp);
- }
+ ret = in_pcbrele_wlocked(inp);
#else
- if (__in_pcbrele_wlocked(inp, line)) {
- /*
- * This should not happen. We have the inpcb referred to by
- * the main socket (why we are called) and the hpts. It
- * should always return 0.
- */
- panic("inpcb:%p release ret 1",
- inp);
- }
+ ret = __in_pcbrele_wlocked(inp, line);
#endif
+ KASSERT(ret != 1, ("inpcb:%p release ret 1", inp));
if (add_freed) {
inp->inp_flags2 |= INP_FREED;
}
@@ -642,73 +653,76 @@
}
static inline int
-hpts_tick(uint32_t wheel_tick, uint32_t plus)
+hpts_slot(uint32_t wheel_slot, uint32_t plus)
{
/*
* Given a slot on the wheel, what slot
* is that plus ticks out?
*/
- KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick));
- return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS);
+ KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid slot %u not on wheel", wheel_slot));
+ return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);
}
static inline int
tick_to_wheel(uint32_t cts_in_wticks)
{
/*
- * Given a timestamp in wheel ticks (10usec inc's)
- * map it to our limited space wheel.
+ * Given a timestamp in ticks (to get it to
+ * real time, multiply by 10, i.e. the number
+ * of ticks in a slot), map it onto our
+ * limited space wheel.
*/
return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
}
static inline int
-hpts_ticks_diff(int prev_tick, int tick_now)
+hpts_slots_diff(int prev_slot, int slot_now)
{
/*
- * Given two ticks that are someplace
+ * Given two slots that are someplace
* on our wheel. How far are they apart?
*/
- if (tick_now > prev_tick)
- return (tick_now - prev_tick);
- else if (tick_now == prev_tick)
+ if (slot_now > prev_slot)
+ return (slot_now - prev_slot);
+ else if (slot_now == prev_slot)
/*
* Special case, same means we can go all of our
* wheel less one slot.
*/
return (NUM_OF_HPTSI_SLOTS - 1);
else
- return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now);
+ return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now);
}
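hpts_slot() and hpts_slots_diff() above are plain modular arithmetic over the wheel, including the special case where equal slots mean a full arc. A standalone self-check; the wheel size here assumes the stock value of NUM_OF_HPTSI_SLOTS (defined elsewhere in tcp_hpts.h):

#include <assert.h>

#define NUM_OF_HPTSI_SLOTS 102400	/* assumed stock wheel size */

static int
hpts_slot(unsigned int wheel_slot, unsigned int plus)
{
	return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);
}

static int
hpts_slots_diff(int prev_slot, int slot_now)
{
	if (slot_now > prev_slot)
		return (slot_now - prev_slot);
	else if (slot_now == prev_slot)
		return (NUM_OF_HPTSI_SLOTS - 1);	/* same slot: full arc */
	else
		return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now);
}

int main(void)
{
	/* Advancing past the end of the wheel wraps to the start. */
	assert(hpts_slot(NUM_OF_HPTSI_SLOTS - 1, 2) == 1);
	/* Distance accounts for the wrap as well. */
	assert(hpts_slots_diff(NUM_OF_HPTSI_SLOTS - 10, 5) == 15);
	return (0);
}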
/*
- * Given a tick on the wheel that is the current time
- * mapped to the wheel (wheel_tick), what is the maximum
+ * Given a slot on the wheel that is the current time
+ * mapped to the wheel (wheel_slot), what is the maximum
* distance forward that can be obtained without
- * wrapping past either prev_tick or running_tick
+ * wrapping past either prev_slot or running_slot
* depending on the htps state? Also if passed
- * a uint32_t *, fill it with the tick location.
+ * a uint32_t *, fill it with the slot location.
*
* Note if you do not give this function the current
- * time (that you think it is) mapped to the wheel
+ * time (that you think it is) mapped to the wheel slot
* then the results will not be what you expect and
* could lead to invalid inserts.
*/
static inline int32_t
-max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick)
+max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot)
{
- uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel;
+ uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel;
if ((hpts->p_hpts_active == 1) &&
(hpts->p_wheel_complete == 0)) {
- end_tick = hpts->p_runningtick;
+ end_slot = hpts->p_runningslot;
/* Back up one tick */
- if (end_tick == 0)
- end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ if (end_slot == 0)
+ end_slot = NUM_OF_HPTSI_SLOTS - 1;
else
- end_tick--;
- if (target_tick)
- *target_tick = end_tick;
+ end_slot--;
+ if (target_slot)
+ *target_slot = end_slot;
} else {
/*
* For the case where we are
@@ -718,26 +732,26 @@
* prev tick and subtract one from it. This puts us
* as far out as possible on the wheel.
*/
- end_tick = hpts->p_prev_slot;
- if (end_tick == 0)
- end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ end_slot = hpts->p_prev_slot;
+ if (end_slot == 0)
+ end_slot = NUM_OF_HPTSI_SLOTS - 1;
else
- end_tick--;
- if (target_tick)
- *target_tick = end_tick;
+ end_slot--;
+ if (target_slot)
+ *target_slot = end_slot;
/*
* Now we have close to the full wheel left minus the
* time it has been since the pacer went to sleep. Note
 * that wheel_slot, passed in, should be the current time
* from the perspective of the caller, mapped to the wheel.
*/
- if (hpts->p_prev_slot != wheel_tick)
- dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+ if (hpts->p_prev_slot != wheel_slot)
+ dis_to_travel = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
else
dis_to_travel = 1;
/*
* dis_to_travel in this case is the space from when the
- * pacer stopped (p_prev_slot) and where our wheel_tick
+ * pacer stopped (p_prev_slot) and where our wheel_slot
* is now. To know how many slots we can put it in we
* subtract from the wheel size. We would not want
* to place something after p_prev_slot or it will
@@ -746,21 +760,21 @@
return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
}
/*
- * So how many slots are open between p_runningtick -> p_cur_slot
+ * So how many slots are open between p_runningslot -> p_cur_slot
* that is what is currently un-available for insertion. Special
* case when we are at the last slot, this gets 1, so that
* the answer to how many slots are available is all but 1.
*/
- if (hpts->p_runningtick == hpts->p_cur_slot)
+ if (hpts->p_runningslot == hpts->p_cur_slot)
dis_to_travel = 1;
else
- dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ dis_to_travel = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
/*
* How long has the pacer been running?
*/
- if (hpts->p_cur_slot != wheel_tick) {
+ if (hpts->p_cur_slot != wheel_slot) {
/* The pacer is a bit late */
- pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick);
+ pacer_to_now = hpts_slots_diff(hpts->p_cur_slot, wheel_slot);
} else {
/* The pacer is right on time, now == pacers start time */
pacer_to_now = 0;
@@ -774,24 +788,24 @@
/*
* Now how many of those we will eat due to the pacer's
* time (p_cur_slot) of start being behind the
- * real time (wheel_tick)?
+ * real time (wheel_slot)?
*/
if (avail_on_wheel <= pacer_to_now) {
/*
* Wheel wrap, we can't fit on the wheel, that
* is unusual the system must be way overloaded!
- * Insert into the assured tick, and return special
+ * Insert into the assured slot, and return special
* "0".
*/
counter_u64_add(combined_wheel_wrap, 1);
- *target_tick = hpts->p_nxt_slot;
+ *target_slot = hpts->p_nxt_slot;
return (0);
} else {
/*
* We know how many slots are open
* on the wheel (the reverse of what
* is left to run. Take away the time
- * the pacer started to now (wheel_tick)
+ * the pacer started to now (wheel_slot)
* and that tells you how many slots are
* open that can be inserted into that won't
* be touched by the pacer until later.
@@ -815,7 +829,7 @@
* A sleeping hpts we want in next slot to run
* note that in this state p_prev_slot == p_cur_slot
*/
- inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1);
+ inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1);
if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
need_wake = 1;
} else if ((void *)inp == hpts->p_inp) {
@@ -827,7 +841,7 @@
*/
inp->inp_hptsslot = hpts->p_nxt_slot;
} else
- inp->inp_hptsslot = hpts->p_runningtick;
+ inp->inp_hptsslot = hpts->p_runningslot;
hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
if (need_wake) {
/*
@@ -862,9 +876,9 @@
* Sanity checks for the pacer with invariants
* on insert.
*/
- if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS)
- panic("hpts:%p inp:%p slot:%d > max",
- hpts, inp, inp_hptsslot);
+ KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS,
+ ("hpts:%p inp:%p slot:%d > max",
+ hpts, inp, inp_hptsslot));
if ((hpts->p_hpts_active) &&
(hpts->p_wheel_complete == 0)) {
/*
@@ -875,17 +889,16 @@
*/
int distance, yet_to_run;
- distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot);
- if (hpts->p_runningtick != hpts->p_cur_slot)
- yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot);
+ if (hpts->p_runningslot != hpts->p_cur_slot)
+ yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
else
yet_to_run = 0; /* processing last slot */
- if (yet_to_run > distance) {
- panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
- hpts, inp, inp_hptsslot,
- distance, yet_to_run,
- hpts->p_runningtick, hpts->p_cur_slot);
- }
+ KASSERT(yet_to_run <= distance,
+ ("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
+ hpts, inp, inp_hptsslot,
+ distance, yet_to_run,
+ hpts->p_runningslot, hpts->p_cur_slot));
}
}
#endif
@@ -895,8 +908,9 @@
struct hpts_diag *diag, struct timeval *tv)
{
uint32_t need_new_to = 0;
- uint32_t wheel_cts, last_tick;
- int32_t wheel_tick, maxticks;
+ uint32_t wheel_cts;
+ int32_t wheel_slot, maxslots, last_slot;
+ int cpu;
int8_t need_wakeup = 0;
HPTS_MTX_ASSERT(hpts);
@@ -904,7 +918,7 @@
memset(diag, 0, sizeof(struct hpts_diag));
diag->p_hpts_active = hpts->p_hpts_active;
diag->p_prev_slot = hpts->p_prev_slot;
- diag->p_runningtick = hpts->p_runningtick;
+ diag->p_runningslot = hpts->p_runningslot;
diag->p_nxt_slot = hpts->p_nxt_slot;
diag->p_cur_slot = hpts->p_cur_slot;
diag->p_curtick = hpts->p_curtick;
@@ -913,131 +927,120 @@
diag->p_on_min_sleep = hpts->p_on_min_sleep;
diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
}
- if (inp->inp_in_hpts == 0) {
- if (slot == 0) {
- /* Immediate */
- tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
- return;
- }
- /* Get the current time relative to the wheel */
- wheel_cts = tcp_tv_to_hptstick(tv);
- /* Map it onto the wheel */
- wheel_tick = tick_to_wheel(wheel_cts);
- /* Now what's the max we can place it at? */
- maxticks = max_ticks_available(hpts, wheel_tick, &last_tick);
- if (diag) {
- diag->wheel_tick = wheel_tick;
- diag->maxticks = maxticks;
- diag->wheel_cts = wheel_cts;
+ KASSERT(inp->inp_in_hpts == 0, ("Hpts:%p tp:%p already on hpts and add?", hpts, inp));
+ if (slot == 0) {
+ /* Immediate */
+ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
+ return;
+ }
+ /* Get the current time relative to the wheel */
+ wheel_cts = tcp_tv_to_hptstick(tv);
+ /* Map it onto the wheel */
+ wheel_slot = tick_to_wheel(wheel_cts);
+ /* Now what's the max we can place it at? */
+ maxslots = max_slots_available(hpts, wheel_slot, &last_slot);
+ if (diag) {
+ diag->wheel_slot = wheel_slot;
+ diag->maxslots = maxslots;
+ diag->wheel_cts = wheel_cts;
+ }
+ if (maxslots == 0) {
+ /* The pacer is in a wheel wrap behind, yikes! */
+ if (slot > 1) {
+ /*
+ * Reduce by 1 to prevent a forever loop in
+ * case something else is wrong. Note this
+ * probably does not hurt because if it is
+ * true the pacer is so far behind we will be
+ * > 1 second late calling anyway.
+ */
+ slot--;
}
- if (maxticks == 0) {
- /* The pacer is in a wheel wrap behind, yikes! */
- if (slot > 1) {
- /*
- * Reduce by 1 to prevent a forever loop in
- * case something else is wrong. Note this
- * probably does not hurt because the pacer
- * if its true is so far behind we will be
- * > 1second late calling anyway.
- */
- slot--;
- }
- inp->inp_hptsslot = last_tick;
- inp->inp_hpts_request = slot;
- } else if (maxticks >= slot) {
- /* It all fits on the wheel */
- inp->inp_hpts_request = 0;
- inp->inp_hptsslot = hpts_tick(wheel_tick, slot);
- } else {
- /* It does not fit */
- inp->inp_hpts_request = slot - maxticks;
- inp->inp_hptsslot = last_tick;
+ inp->inp_hptsslot = last_slot;
+ inp->inp_hpts_request = slot;
+ } else if (maxslots >= slot) {
+ /* It all fits on the wheel */
+ inp->inp_hpts_request = 0;
+ inp->inp_hptsslot = hpts_slot(wheel_slot, slot);
+ } else {
+ /* It does not fit */
+ inp->inp_hpts_request = slot - maxslots;
+ inp->inp_hptsslot = last_slot;
+ }
+ if (diag) {
+ diag->slot_remaining = inp->inp_hpts_request;
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ }
+#ifdef INVARIANTS
+ check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
+#endif
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
+ if ((hpts->p_hpts_active == 0) &&
+ (inp->inp_hpts_request == 0) &&
+ (hpts->p_on_min_sleep == 0)) {
+ /*
+ * The hpts is sleeping and NOT on a minimum
+ * sleep time, we need to figure out where
+ * it will wake up at and if we need to reschedule
+ * its time-out.
+ */
+ uint32_t have_slept, yet_to_sleep;
+
+ /* Now do we need to restart the hpts's timer? */
+ have_slept = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
+ if (have_slept < hpts->p_hpts_sleep_time)
+ yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
+ else {
+ /* We are over-due */
+ yet_to_sleep = 0;
+ need_wakeup = 1;
}
if (diag) {
- diag->slot_remaining = inp->inp_hpts_request;
- diag->inp_hptsslot = inp->inp_hptsslot;
+ diag->have_slept = have_slept;
+ diag->yet_to_sleep = yet_to_sleep;
}
-#ifdef INVARIANTS
- check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
-#endif
- hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
- if ((hpts->p_hpts_active == 0) &&
- (inp->inp_hpts_request == 0) &&
- (hpts->p_on_min_sleep == 0)) {
+ if (yet_to_sleep &&
+ (yet_to_sleep > slot)) {
/*
- * The hpts is sleeping and not on a minimum
- * sleep time, we need to figure out where
- * it will wake up at and if we need to reschedule
- * its time-out.
+ * We need to reschedule the hpts's time-out.
*/
- uint32_t have_slept, yet_to_sleep;
-
- /* Now do we need to restart the hpts's timer? */
- have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
- if (have_slept < hpts->p_hpts_sleep_time)
- yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
- else {
- /* We are over-due */
- yet_to_sleep = 0;
- need_wakeup = 1;
- }
- if (diag) {
- diag->have_slept = have_slept;
- diag->yet_to_sleep = yet_to_sleep;
- }
- if (yet_to_sleep &&
- (yet_to_sleep > slot)) {
- /*
- * We need to reschedule the hpts's time-out.
- */
- hpts->p_hpts_sleep_time = slot;
- need_new_to = slot * HPTS_TICKS_PER_USEC;
- }
+ hpts->p_hpts_sleep_time = slot;
+ need_new_to = slot * HPTS_TICKS_PER_SLOT;
}
- /*
- * Now how far is the hpts sleeping to? if active is 1, its
- * up and ticking we do nothing, otherwise we may need to
- * reschedule its callout if need_new_to is set from above.
- */
- if (need_wakeup) {
- hpts->p_direct_wake = 1;
- tcp_wakehpts(hpts);
- if (diag) {
- diag->need_new_to = 0;
- diag->co_ret = 0xffff0000;
- }
- } else if (need_new_to) {
- int32_t co_ret;
- struct timeval tv;
- sbintime_t sb;
+ }
+ /*
+ * Now how far out is the hpts sleeping? If active is 1, it is
+ * up and ticking and we do nothing; otherwise we may need to
+ * reschedule its callout if need_new_to is set from above.
+ */
+ if (need_wakeup) {
+ hpts->p_direct_wake = 1;
+ tcp_wakehpts(hpts);
+ if (diag) {
+ diag->need_new_to = 0;
+ diag->co_ret = 0xffff0000;
+ }
+ } else if (need_new_to) {
+ int32_t co_ret;
+ struct timeval tv;
+ sbintime_t sb;
- tv.tv_sec = 0;
- tv.tv_usec = 0;
- while (need_new_to > HPTS_USEC_IN_SEC) {
- tv.tv_sec++;
- need_new_to -= HPTS_USEC_IN_SEC;
- }
- tv.tv_usec = need_new_to;
- sb = tvtosbt(tv);
- if (tcp_hpts_callout_skip_swi == 0) {
- co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- } else {
- co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_dir, hpts,
- hpts->p_cpu,
- C_PREL(tcp_hpts_precision));
- }
- if (diag) {
- diag->need_new_to = need_new_to;
- diag->co_ret = co_ret;
- }
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ while (need_new_to > HPTS_USEC_IN_SEC) {
+ tv.tv_sec++;
+ need_new_to -= HPTS_USEC_IN_SEC;
+ }
+ tv.tv_usec = need_new_to;
+ sb = tvtosbt(tv);
+ cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+ co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ if (diag) {
+ diag->need_new_to = need_new_to;
+ diag->co_ret = co_ret;
}
- } else {
-#ifdef INVARIANTS
- panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp);
-#endif
}
}
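From a pacing stack's point of view, the function above reduces to: convert a delay in microseconds to wheel slots and queue the connection that far ahead of now, with any overflow parked in inp_hpts_request. A sketch of the caller-side arithmetic with a stub in place of the real insert call, which takes a write-locked struct inpcb *; the 480us delay is a made-up example:

#include <stdio.h>

#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)	/* from tcp_hpts.h above */

/* Stub standing in for tcp_hpts_insert(inp, slot). */
static void
hpts_insert_stub(unsigned int slot)
{
	printf("queued %u slots (%u usec) out\n", slot, slot * 10);
}

int main(void)
{
	unsigned int usecs_to_wait = 480;	/* hypothetical pacing delay */

	/* (480 + 9) / 10 = 48 wheel slots ahead of "now". */
	hpts_insert_stub(HPTS_USEC_TO_SLOTS(usecs_to_wait));
	return (0);
}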
@@ -1066,6 +1069,7 @@
__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
return (tcp_hpts_insert_diag(inp, slot, line, NULL));
}
+
int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
{
@@ -1076,18 +1080,20 @@
/* Ok we need to set it on the hpts in the current slot */
hpts_sane_input_insert(hpts, inp, line);
retval = 1;
- if (hpts->p_hpts_active == 0) {
+ if ((hpts->p_hpts_active == 0) &&
+ (hpts->p_on_min_sleep == 0)){
/*
* Activate the hpts if it is sleeping.
*/
retval = 2;
hpts->p_direct_wake = 1;
- tcp_wakeinput(hpts);
+ tcp_wakehpts(hpts);
}
- } else if (hpts->p_hpts_active == 0) {
+ } else if ((hpts->p_hpts_active == 0) &&
+ (hpts->p_on_min_sleep == 0)){
retval = 4;
hpts->p_direct_wake = 1;
- tcp_wakeinput(hpts);
+ tcp_wakehpts(hpts);
}
return (retval);
}
@@ -1115,22 +1121,24 @@
if (inp->inp_in_input == 0) {
/* Ok we need to set it on the hpts in the current slot */
hpts_sane_input_insert(hpts, inp, line);
- if (hpts->p_hpts_active == 0) {
+ if ((hpts->p_hpts_active == 0) &&
+ (hpts->p_on_min_sleep == 0)){
/*
* Activate the hpts if it is sleeping.
*/
hpts->p_direct_wake = 1;
- tcp_wakeinput(hpts);
+ tcp_wakehpts(hpts);
}
- } else if (hpts->p_hpts_active == 0) {
+ } else if ((hpts->p_hpts_active == 0) &&
+ (hpts->p_on_min_sleep == 0)){
hpts->p_direct_wake = 1;
- tcp_wakeinput(hpts);
+ tcp_wakehpts(hpts);
}
inp->inp_hpts_drop_reas = reason;
mtx_unlock(&hpts->p_mtx);
}
-static uint16_t
+uint16_t
hpts_random_cpu(struct inpcb *inp){
/*
* No flow type set distribute the load randomly.
@@ -1149,18 +1157,19 @@
}
/* Nothing set use a random number */
ran = arc4random();
- cpuid = (ran & 0xffff) % mp_ncpus;
+ cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
return (cpuid);
}
static uint16_t
-hpts_cpuid(struct inpcb *inp)
+hpts_cpuid(struct inpcb *inp, int *failed)
{
u_int cpuid;
#if !defined(RSS) && defined(NUMA)
struct hpts_domain_info *di;
#endif
+ *failed = 0;
/*
* If one has been set use it i.e. we want both in and out on the
* same hpts.
@@ -1170,6 +1179,17 @@
} else if (inp->inp_hpts_cpu_set) {
return (inp->inp_hpts_cpu);
}
+ /*
+ * If we are using the irq cpu set by LRO or
+ * the driver then it overrides all other domains.
+ */
+ if (tcp_use_irq_cpu) {
+ if (inp->inp_irq_cpu_set == 0) {
+ *failed = 1;
+ return (0);
+ }
+ return (inp->inp_irq_cpu);
+ }
/* If one is set the other must be the same */
#ifdef RSS
cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
@@ -1183,9 +1203,10 @@
* unknown cpuids to curcpu. Not the best, but apparently better
* than defaulting to swi 0.
*/
-
- if (inp->inp_flowtype == M_HASHTYPE_NONE)
+ if (inp->inp_flowtype == M_HASHTYPE_NONE) {
+ counter_u64_add(cpu_uses_random, 1);
return (hpts_random_cpu(inp));
+ }
/*
* Hash to a thread based on the flowid. If we are using numa,
* then restrict the hash to the numa domain where the inp lives.
@@ -1197,7 +1218,7 @@
} else
#endif
cpuid = inp->inp_flowid % mp_ncpus;
-
+ counter_u64_add(cpu_uses_flowid, 1);
return (cpuid);
#endif
}
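After this change hpts_cpuid() resolves a CPU in order: an already-set per-connection cpu, the recorded interrupt cpu (when tcp_use_irq_cpu is on, reporting failure if none has been recorded yet), RSS or flowid hashing, and finally a random pick. The non-NUMA flowid branch is a plain modulo; a self-check of that arithmetic with an assumed CPU count:

#include <assert.h>

int main(void)
{
	unsigned int mp_ncpus = 8;		/* assumed CPU count */
	unsigned int inp_flowid = 0x9e3779b9;	/* example flow hash */

	/* Same mapping the flowid path above uses. */
	unsigned int cpuid = inp_flowid % mp_ncpus;

	assert(cpuid == 1);	/* 2654435769 mod 8 */
	assert(cpuid < mp_ncpus);
	return (0);
}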
@@ -1323,7 +1344,7 @@
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
}
- if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+ if ((tp->t_fb->tfb_do_queued_segments != NULL) && tp->t_in_pkt) {
if (inp->inp_in_input)
tcp_hpts_remove(inp, HPTS_REMOVE_INPUT);
dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
@@ -1357,23 +1378,51 @@
}
static void
-tcp_hptsi(struct tcp_hpts_entry *hpts)
+tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)
+{
+ uint32_t t = 0, i, fnd = 0;
+
+ if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
+ /*
+ * Find next slot that is occupied and use that to
+ * be the sleep time.
+ */
+ for (i = 0, t = hpts_slot(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
+ if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
+ fnd = 1;
+ break;
+ }
+ t = (t + 1) % NUM_OF_HPTSI_SLOTS;
+ }
+ KASSERT(fnd != 0, ("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt));
+ hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
+ } else {
+ /* No one on the wheel sleep for all but 400 slots or sleep max */
+ hpts->p_hpts_sleep_time = hpts_sleep_max;
+ }
+}
+
+static int32_t
+tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout)
{
struct tcpcb *tp;
struct inpcb *inp = NULL, *ninp;
struct timeval tv;
- int32_t ticks_to_run, i, error;
+ uint64_t total_slots_processed = 0;
+ int32_t slots_to_run, i, error;
int32_t paced_cnt = 0;
int32_t loop_cnt = 0;
int32_t did_prefetch = 0;
int32_t prefetch_ninp = 0;
int32_t prefetch_tp = 0;
int32_t wrap_loop_cnt = 0;
+ int32_t slot_pos_of_endpoint = 0;
+ int32_t orig_exit_slot;
int16_t set_cpu;
+ int8_t completed_measure = 0, seen_endpoint = 0;
HPTS_MTX_ASSERT(hpts);
NET_EPOCH_ASSERT();
-
/* record previous info for any logging */
hpts->saved_lasttick = hpts->p_lasttick;
hpts->saved_curtick = hpts->p_curtick;
@@ -1382,7 +1431,8 @@
hpts->p_lasttick = hpts->p_curtick;
hpts->p_curtick = tcp_gethptstick(&tv);
- hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
+ orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
if ((hpts->p_on_queue_cnt == 0) ||
(hpts->p_lasttick == hpts->p_curtick)) {
/*
@@ -1396,8 +1446,9 @@
again:
hpts->p_wheel_complete = 0;
HPTS_MTX_ASSERT(hpts);
- ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot);
- if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) &&
+ slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot);
+ if (((hpts->p_curtick - hpts->p_lasttick) >
+ ((NUM_OF_HPTSI_SLOTS-1) * HPTS_TICKS_PER_SLOT)) &&
(hpts->p_on_queue_cnt != 0)) {
/*
 * Wheel wrap is occurring, basically we
@@ -1416,8 +1467,8 @@
* first slot at the head.
*/
wrap_loop_cnt++;
- hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1);
- hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2);
+ hpts->p_nxt_slot = hpts_slot(hpts->p_prev_slot, 1);
+ hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 2);
/*
* Adjust p_cur_slot to be where we are starting from
* hopefully we will catch up (fat chance if something
@@ -1438,58 +1489,61 @@
* INP lock and the pacer mutex to change the inp_hptsslot.
*/
TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) {
- inp->inp_hptsslot = hpts->p_runningtick;
+ inp->inp_hptsslot = hpts->p_runningslot;
}
#endif
- TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick],
+ TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot],
&hpts->p_hptss[hpts->p_nxt_slot], inp_hpts);
- ticks_to_run = NUM_OF_HPTSI_SLOTS - 1;
+ slots_to_run = NUM_OF_HPTSI_SLOTS - 1;
counter_u64_add(wheel_wrap, 1);
} else {
/*
- * Nxt slot is always one after p_runningtick though
+ * Nxt slot is always one after p_runningslot though
* its not used usually unless we are doing wheel wrap.
*/
hpts->p_nxt_slot = hpts->p_prev_slot;
- hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1);
- }
-#ifdef INVARIANTS
- if (TAILQ_EMPTY(&hpts->p_input) &&
- (hpts->p_on_inqueue_cnt != 0)) {
- panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
+ hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 1);
}
-#endif
+ KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
+ ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
+ ("%s hpts:%p in_hpts cnt:%d and queue state mismatch",
+ __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
HPTS_MTX_ASSERT(hpts);
if (hpts->p_on_queue_cnt == 0) {
goto no_one;
}
HPTS_MTX_ASSERT(hpts);
- for (i = 0; i < ticks_to_run; i++) {
+ for (i = 0; i < slots_to_run; i++) {
/*
* Calculate our delay, if there are no extra ticks there
- * was not any (i.e. if ticks_to_run == 1, no delay).
+ * was not any (i.e. if slots_to_run == 1, no delay).
*/
- hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
+ hpts->p_delayed_by = (slots_to_run - (i + 1)) * HPTS_TICKS_PER_SLOT;
HPTS_MTX_ASSERT(hpts);
- while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
+ while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningslot])) != NULL) {
+ HPTS_MTX_ASSERT(hpts);
/* For debugging */
+ if (seen_endpoint == 0) {
+ seen_endpoint = 1;
+ orig_exit_slot = slot_pos_of_endpoint = hpts->p_runningslot;
+ } else if (completed_measure == 0) {
+ /* Record the new position */
+ orig_exit_slot = hpts->p_runningslot;
+ }
+ total_slots_processed++;
hpts->p_inp = inp;
paced_cnt++;
-#ifdef INVARIANTS
- if (hpts->p_runningtick != inp->inp_hptsslot) {
- panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
- hpts, inp, hpts->p_runningtick, inp->inp_hptsslot);
- }
-#endif
+ KASSERT(hpts->p_runningslot == inp->inp_hptsslot,
+ ("Hpts:%p inp:%p slot mis-aligned %u vs %u",
+ hpts, inp, hpts->p_runningslot, inp->inp_hptsslot));
/* Now pull it */
if (inp->inp_hpts_cpu_set == 0) {
set_cpu = 1;
} else {
set_cpu = 0;
}
- hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0);
- if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningslot], 0);
+ if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningslot])) != NULL) {
/* We prefetch the next inp if possible */
kern_prefetch(ninp, &prefetch_ninp);
prefetch_ninp = 1;
@@ -1501,22 +1555,22 @@
* Push him back on the wheel or run it
* depending.
*/
- uint32_t maxticks, last_tick, remaining_slots;
+ uint32_t maxslots, last_slot, remaining_slots;
- remaining_slots = ticks_to_run - (i + 1);
+ remaining_slots = slots_to_run - (i + 1);
if (inp->inp_hpts_request > remaining_slots) {
/*
* How far out can we go?
*/
- maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick);
- if (maxticks >= inp->inp_hpts_request) {
+ maxslots = max_slots_available(hpts, hpts->p_cur_slot, &last_slot);
+ if (maxslots >= inp->inp_hpts_request) {
/* we can place it finally to be processed */
- inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request);
+ inp->inp_hptsslot = hpts_slot(hpts->p_runningslot, inp->inp_hpts_request);
inp->inp_hpts_request = 0;
} else {
/* Work off some more time */
- inp->inp_hptsslot = last_tick;
- inp->inp_hpts_request-= maxticks;
+ inp->inp_hptsslot = last_slot;
+ inp->inp_hpts_request-= maxslots;
}
hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1);
hpts->p_inp = NULL;
@@ -1542,12 +1596,9 @@
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
(inp->inp_flags2 & INP_FREED)) {
out_now:
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__));
INP_WUNLOCK(inp);
mtx_lock(&hpts->p_mtx);
hpts->p_inp = NULL;
@@ -1582,7 +1633,7 @@
#endif
/* Lets do any logging that we might want to */
if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
- tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i);
+ tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout);
}
/*
* There is a hole here, we get the refcnt on the
@@ -1592,12 +1643,10 @@
* fini gets the lock first we are assured of having
* a sane INP we can lock and test.
*/
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx before tcp-output:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to tcp_output call line:%d",
+ hpts, __LINE__));
+
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
@@ -1653,20 +1702,27 @@
CURVNET_RESTORE();
#endif
INP_UNLOCK_ASSERT(inp);
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__));
mtx_lock(&hpts->p_mtx);
hpts->p_inp = NULL;
}
+ if (seen_endpoint) {
+ /*
+ * We now have an accurate distance between
+ * slot_pos_of_endpoint <-> orig_exit_slot
+ * to tell us how late we were, orig_exit_slot
+ * is where we calculated the end of our cycle to
+ * be when we first entered.
+ */
+ completed_measure = 1;
+ }
HPTS_MTX_ASSERT(hpts);
hpts->p_inp = NULL;
- hpts->p_runningtick++;
- if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) {
- hpts->p_runningtick = 0;
+ hpts->p_runningslot++;
+ if (hpts->p_runningslot >= NUM_OF_HPTSI_SLOTS) {
+ hpts->p_runningslot = 0;
}
}
no_one:
@@ -1676,16 +1732,13 @@
* Check to see if we took an excess amount of time and need to run
* more ticks (if we did not hit eno-bufs).
*/
-#ifdef INVARIANTS
- if (TAILQ_EMPTY(&hpts->p_input) &&
- (hpts->p_on_inqueue_cnt != 0)) {
- panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
- }
-#endif
+ KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
+ ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
+ ("%s hpts:%p in_hpts cnt:%d queue state mismatch",
+ __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
hpts->p_prev_slot = hpts->p_cur_slot;
hpts->p_lasttick = hpts->p_curtick;
- if (loop_cnt > max_pacer_loops) {
+ if ((from_callout == 0) || (loop_cnt > max_pacer_loops)) {
/*
 * Something is seriously slow; we have
* looped through processing the wheel
@@ -1700,11 +1753,16 @@
* correct. When it next awakens
* it will find itself further behind.
*/
- counter_u64_add(hpts_hopelessly_behind, 1);
+ if (from_callout)
+ counter_u64_add(hpts_hopelessly_behind, 1);
goto no_run;
}
hpts->p_curtick = tcp_gethptstick(&tv);
hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ if (seen_endpoint == 0) {
+ /* We saw no endpoint but we may be looping */
+ orig_exit_slot = hpts->p_cur_slot;
+ }
if ((wrap_loop_cnt < 2) &&
(hpts->p_lasttick != hpts->p_curtick)) {
counter_u64_add(hpts_loops, 1);
@@ -1712,6 +1770,7 @@
goto again;
}
no_run:
+ cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
/*
* Set flag to tell that we are done for
* any slot input that happens during
@@ -1725,76 +1784,58 @@
if (!TAILQ_EMPTY(&hpts->p_input)) {
tcp_input_data(hpts, &tv);
/*
- * Now did we spend too long running
- * input and need to run more ticks?
+ * Now did we spend too long running input and need to run more ticks?
+	 * Note that if wrap_loop_cnt < 2 then we should have the conditions
+	 * in the KASSERTs true. But if the wheel is behind, i.e. wrap_loop_cnt
+	 * is 2 or more, then the conditions most likely are *not* true. Also
+	 * if we are not called from the callout, we don't run the wheel multiple
+	 * times so the slots may not align either.
*/
- KASSERT(hpts->p_prev_slot == hpts->p_cur_slot,
+ KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) ||
+ (wrap_loop_cnt >= 2) || (from_callout == 0)),
("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
hpts->p_prev_slot, hpts->p_cur_slot));
- KASSERT(hpts->p_lasttick == hpts->p_curtick,
+ KASSERT(((hpts->p_lasttick == hpts->p_curtick)
+ || (wrap_loop_cnt >= 2) || (from_callout == 0)),
("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
hpts->p_lasttick, hpts->p_curtick));
- hpts->p_curtick = tcp_gethptstick(&tv);
- if (hpts->p_lasttick != hpts->p_curtick) {
+ if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) {
+ hpts->p_curtick = tcp_gethptstick(&tv);
counter_u64_add(hpts_loops, 1);
hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
goto again;
}
}
- {
- uint32_t t = 0, i, fnd = 0;
-
- if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
- /*
- * Find next slot that is occupied and use that to
- * be the sleep time.
- */
- for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
- if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
- fnd = 1;
- break;
- }
- t = (t + 1) % NUM_OF_HPTSI_SLOTS;
- }
- if (fnd) {
- hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
- } else {
-#ifdef INVARIANTS
- panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
-#endif
- counter_u64_add(back_tosleep, 1);
- hpts->p_on_queue_cnt = 0;
- goto non_found;
- }
- } else if (wrap_loop_cnt >= 2) {
- /* Special case handling */
- hpts->p_hpts_sleep_time = tcp_min_hptsi_time;
- } else {
- /* No one on the wheel sleep for all but 400 slots or sleep max */
- non_found:
- hpts->p_hpts_sleep_time = hpts_sleep_max;
- }
+ if (from_callout) {
+ tcp_hpts_set_max_sleep(hpts, wrap_loop_cnt);
}
+ if (seen_endpoint)
+ return (hpts_slots_diff(slot_pos_of_endpoint, orig_exit_slot));
+ else
+ return (0);
}
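
tcp_hptsi() now reports how far it walked the wheel, measured from the slot where the first endpoint was seen to the slot at exit. hpts_slots_diff() is not shown in this excerpt; a minimal wrap-aware sketch of such a helper, assuming a wheel of NUM_OF_HPTSI_SLOTS entries, could be:

    static int32_t
    hpts_slots_diff(int32_t prev_slot, int32_t slot_now)
    {
        /* Distance walked forward around the wheel, wrap-aware. */
        if (slot_now >= prev_slot)
            return (slot_now - prev_slot);
        return (NUM_OF_HPTSI_SLOTS - prev_slot + slot_now);
    }
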
void
__tcp_set_hpts(struct inpcb *inp, int32_t line)
{
struct tcp_hpts_entry *hpts;
+ int failed;
INP_WLOCK_ASSERT(inp);
hpts = tcp_hpts_lock(inp);
if ((inp->inp_in_hpts == 0) &&
(inp->inp_hpts_cpu_set == 0)) {
- inp->inp_hpts_cpu = hpts_cpuid(inp);
- inp->inp_hpts_cpu_set = 1;
+ inp->inp_hpts_cpu = hpts_cpuid(inp, &failed);
+ if (failed == 0)
+ inp->inp_hpts_cpu_set = 1;
}
mtx_unlock(&hpts->p_mtx);
hpts = tcp_input_lock(inp);
if ((inp->inp_input_cpu_set == 0) &&
(inp->inp_in_input == 0)) {
- inp->inp_input_cpu = hpts_cpuid(inp);
- inp->inp_input_cpu_set = 1;
+ inp->inp_input_cpu = hpts_cpuid(inp, &failed);
+ if (failed == 0)
+ inp->inp_input_cpu_set = 1;
}
mtx_unlock(&hpts->p_mtx);
}
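
hpts_cpuid() gains an out-parameter so that the *_cpu_set bit is latched only when a CPU could actually be chosen; on failure the selection is simply retried on a later call. The body of hpts_cpuid() is outside this excerpt (the new cpu_uses_flowid/cpu_uses_random counters hint at its policy); purely as a sketch of the contract, with the internals assumed rather than taken from this diff:

    static uint16_t
    hpts_cpuid(struct inpcb *inp, int *failed)
    {
        *failed = 0;
        /* Prefer a CPU that LRO/the driver has already declared. */
        if (inp->inp_irq_cpu_set)
            return (inp->inp_irq_cpu);
        /* No stable hint yet: report failure with a soft answer. */
        *failed = 1;
        return (curcpu % tcp_pace.rp_num_hptss);
    }
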
@@ -1804,6 +1845,127 @@
return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
}
+static void
+__tcp_run_hpts(struct tcp_hpts_entry *hpts)
+{
+ int ticks_ran;
+
+ if (hpts->p_hpts_active) {
+ /* Already active */
+ return;
+ }
+ if (mtx_trylock(&hpts->p_mtx) == 0) {
+ /* Someone else got the lock */
+ return;
+ }
+ if (hpts->p_hpts_active)
+ goto out_with_mtx;
+ hpts->syscall_cnt++;
+ counter_u64_add(hpts_direct_call, 1);
+ hpts->p_hpts_active = 1;
+ ticks_ran = tcp_hptsi(hpts, 0);
+ /* We may want to adjust the sleep values here */
+ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+ if (ticks_ran > ticks_indicate_less_sleep) {
+ struct timeval tv;
+ sbintime_t sb;
+ int cpu;
+
+ hpts->p_mysleep.tv_usec /= 2;
+ if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_min_sleep;
+ /* Reschedule with the new timeout value */
+ tcp_hpts_set_max_sleep(hpts, 0);
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ /* Validate it is in the right range */
+ if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = hpts->p_mysleep.tv_usec;
+ } else if (tv.tv_usec > dynamic_max_sleep) {
+ /* Let's not let sleep get above this value */
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = dynamic_max_sleep;
+ }
+ /*
+ * In this mode the timer is a backstop to
+ * all the userret/lro_flushes so we use
+ * the dynamic value and set the on_min_sleep
+ * flag so we will not be awoken.
+ */
+ sb = tvtosbt(tv);
+ cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+ /* Store off to make visible the actual sleep time */
+ hpts->sleeping = tv.tv_usec;
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ } else if (ticks_ran < ticks_indicate_more_sleep) {
+ /* For a longer sleep, don't reschedule hpts */
+ hpts->p_mysleep.tv_usec *= 2;
+ if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+ }
+ hpts->p_on_min_sleep = 1;
+ }
+ hpts->p_hpts_active = 0;
+out_with_mtx:
+ HPTS_MTX_ASSERT(hpts);
+ mtx_unlock(&hpts->p_mtx);
+}
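
An illustrative walk through the busy branch above may help; the numbers are assumptions, not committed defaults:

    /*
     * All values in usec. Suppose ticks_ran exceeded
     * ticks_indicate_less_sleep and p_mysleep.tv_usec was 2000:
     *
     *   p_mysleep.tv_usec: 2000 / 2 = 1000 (still >= dynamic_min_sleep)
     *   p_hpts_sleep_time: say 25 slots -> 25 * HPTS_TICKS_PER_SLOT = 250
     *   250 < 1000, so overidden_sleep records 250 and the callout
     *   is re-armed for the 1000 usec floor instead.
     */
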
+
+static struct tcp_hpts_entry *
+tcp_choose_hpts_to_run(void)
+{
+ int i, oldest_idx;
+ uint32_t cts, time_since_ran, calc;
+
+ if ((hpts_uses_oldest == 0) ||
+ ((hpts_uses_oldest > 1) &&
+ (tcp_pace.rp_ent[(tcp_pace.rp_num_hptss-1)]->p_on_queue_cnt >= hpts_uses_oldest))) {
+ /*
+ * We have either disabled the feature (0), or
+ * we have crossed over the oldest threshold on the
+ * last hpts. We use the last one for simplicity,
+ * since we don't want to use the first one (it may
+ * have newly started connections that have not settled
+ * on a cpu yet).
+ */
+ return (tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+ }
+ /* Let's find the oldest hpts to attempt to run */
+ cts = tcp_get_usecs(NULL);
+ time_since_ran = 0;
+ oldest_idx = -1;
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+ if (TSTMP_GT(cts, cts_last_ran[i]))
+ calc = cts - cts_last_ran[i];
+ else
+ calc = 0;
+ if (calc > time_since_ran) {
+ oldest_idx = i;
+ time_since_ran = calc;
+ }
+ }
+ if (oldest_idx >= 0)
+ return (tcp_pace.rp_ent[oldest_idx]);
+ else
+ return (tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+}
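
The age scan refuses negative deltas rather than trusting raw subtraction across a 32-bit wrap; TSTMP_GT() from tcp_seq.h is the usual wrap-safe signed comparison, roughly:

    #define TSTMP_GT(a, b)  ((int32_t)((a) - (b)) > 0) /* a later than b */

If no pacer shows a positive age, oldest_idx stays -1 and the curcpu-indexed pacer is used as the fallback.
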
+
+void
+tcp_run_hpts(void)
+{
+ struct tcp_hpts_entry *hpts;
+ struct epoch_tracker et;
+
+ NET_EPOCH_ENTER(et);
+ hpts = tcp_choose_hpts_to_run();
+ __tcp_run_hpts(hpts);
+ NET_EPOCH_EXIT(et);
+}
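
tcp_run_hpts() is the opportunistic entry point and enters the net epoch itself, so callers only need the option guard. The LRO flush path later in this diff invokes it exactly this way:

    #ifdef TCPHPTS
        tcp_run_hpts();
    #endif
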
+
static void
tcp_hpts_thread(void *ctx)
{
@@ -1811,51 +1973,142 @@
struct epoch_tracker et;
struct timeval tv;
sbintime_t sb;
+ int cpu, ticks_ran;
hpts = (struct tcp_hpts_entry *)ctx;
mtx_lock(&hpts->p_mtx);
if (hpts->p_direct_wake) {
- /* Signaled by input */
+ /* Signaled by input or output with low occupancy count. */
callout_stop(&hpts->co);
+ counter_u64_add(hpts_direct_awakening, 1);
} else {
- /* Timed out */
+ /* Timed out, the normal case. */
+ counter_u64_add(hpts_wake_timeout, 1);
if (callout_pending(&hpts->co) ||
!callout_active(&hpts->co)) {
mtx_unlock(&hpts->p_mtx);
return;
}
- callout_deactivate(&hpts->co);
}
+ callout_deactivate(&hpts->co);
hpts->p_hpts_wake_scheduled = 0;
- hpts->p_hpts_active = 1;
NET_EPOCH_ENTER(et);
- tcp_hptsi(hpts);
- NET_EPOCH_EXIT(et);
- HPTS_MTX_ASSERT(hpts);
+ if (hpts->p_hpts_active) {
+ /*
+ * We are active already. This means that a syscall
+ * trap or LRO is running on behalf of hpts. In that case
+ * we need to double our timeout since there seems to be
+ * enough activity in the system that we don't need to
+ * run as often (if we were not directly woken).
+ */
+ if (hpts->p_direct_wake == 0) {
+ counter_u64_add(hpts_back_tosleep, 1);
+ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+ hpts->p_mysleep.tv_usec *= 2;
+ if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+ tv.tv_usec = hpts->p_mysleep.tv_usec;
+ hpts->p_on_min_sleep = 1;
+ } else {
+ /*
+ * Here we have a low count on the wheel, but
+ * somehow we still collided with one of the
+ * connections. Let's go back to sleep for a
+ * min sleep time, but clear the flag so we
+ * can be awoken by an insert.
+ */
+ hpts->p_on_min_sleep = 0;
+ tv.tv_usec = tcp_min_hptsi_time;
+ }
+ } else {
+ /*
+ * We were woken directly, most likely to reset
+ * the callout time.
+ */
+ tv.tv_sec = 0;
+ tv.tv_usec = hpts->p_mysleep.tv_usec;
+ }
+ goto back_to_sleep;
+ }
+ hpts->sleeping = 0;
+ hpts->p_hpts_active = 1;
+ ticks_ran = tcp_hptsi(hpts, 1);
tv.tv_sec = 0;
- tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
- if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
- hpts->overidden_sleep = tv.tv_usec;
- tv.tv_usec = tcp_min_hptsi_time;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+ if (hpts->p_direct_wake == 0) {
+ /*
+ * Only adjust sleep time if we were
+ * called from the callout i.e. direct_wake == 0.
+ */
+ if (ticks_ran < ticks_indicate_more_sleep) {
+ hpts->p_mysleep.tv_usec *= 2;
+ if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+ } else if (ticks_ran > ticks_indicate_less_sleep) {
+ hpts->p_mysleep.tv_usec /= 2;
+ if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_min_sleep;
+ }
+ }
+ if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = hpts->p_mysleep.tv_usec;
+ } else if (tv.tv_usec > dynamic_max_sleep) {
+ /* Let's not let sleep get above this value */
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = dynamic_max_sleep;
+ }
+ /*
+ * In this mode the timer is a backstop to
+ * all the userret/lro_flushes so we use
+ * the dynamic value and set the on_min_sleep
+ * flag so we will not be awoken.
+ */
hpts->p_on_min_sleep = 1;
- } else {
- /* Clear the min sleep flag */
- hpts->overidden_sleep = 0;
+ } else if (hpts->p_on_queue_cnt == 0) {
+ /*
+ * No one on the wheel, please wake us up
+ * if you insert on the wheel.
+ */
hpts->p_on_min_sleep = 0;
- }
- hpts->p_hpts_active = 0;
- sb = tvtosbt(tv);
- if (tcp_hpts_callout_skip_swi == 0) {
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ hpts->overidden_sleep = 0;
} else {
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_dir, hpts,
- hpts->p_cpu,
- C_PREL(tcp_hpts_precision));
+ /*
+ * We hit here when we have a low number of
+ * clients on the wheel (our else clause).
+ * We may need to go on min sleep; if we set
+ * the flag, we will not be awoken if someone
+ * is inserted ahead of us. Clearing the flag
+ * means we can be awoken. This is "old mode"
+ * where the timer is what runs hpts mainly.
+ */
+ if (tv.tv_usec < tcp_min_hptsi_time) {
+ /*
+ * Yes on min sleep, which means
+ * we cannot be awoken.
+ */
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = tcp_min_hptsi_time;
+ hpts->p_on_min_sleep = 1;
+ } else {
+ /* Clear the min sleep flag */
+ hpts->overidden_sleep = 0;
+ hpts->p_on_min_sleep = 0;
+ }
}
+ HPTS_MTX_ASSERT(hpts);
+ hpts->p_hpts_active = 0;
+back_to_sleep:
hpts->p_direct_wake = 0;
+ sb = tvtosbt(tv);
+ cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+ /* Store off to make visible the actual sleep time */
+ hpts->sleeping = tv.tv_usec;
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ NET_EPOCH_EXIT(et);
mtx_unlock(&hpts->p_mtx);
}
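
The thread now chooses its next sleep in three regimes keyed off p_on_queue_cnt; condensed from the logic above for reference:

    /*
     * p_on_queue_cnt           sleep chosen                          wakeable by insert?
     * -----------------------  ------------------------------------  --------------------
     * >= conn_cnt_thresh       clamped to [p_mysleep, dynamic_max]   no (min-sleep set)
     * == 0                     wheel-derived p_hpts_sleep_time       yes
     * low but nonzero          floored at tcp_min_hptsi_time         only above the floor
     */
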
@@ -1873,7 +2126,7 @@
cpuset_t cs;
char unit[16];
uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
- int count, domain;
+ int count, domain, cpu;
tcp_pace.rp_proc = NULL;
tcp_pace.rp_num_hptss = ncpus;
@@ -1882,8 +2135,18 @@
back_tosleep = counter_u64_alloc(M_WAITOK);
combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
wheel_wrap = counter_u64_alloc(M_WAITOK);
+ hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
+ hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
+ hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
+ hpts_direct_call = counter_u64_alloc(M_WAITOK);
+ cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
+ cpu_uses_random = counter_u64_alloc(M_WAITOK);
+
sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+ sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
+ cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
@@ -1933,19 +2196,41 @@
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "runtick", CTLFLAG_RD,
- &hpts->p_runningtick, 0,
+ &hpts->p_runningslot, 0,
"What the running pacers current slot is");
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "curtick", CTLFLAG_RD,
&hpts->p_curtick, 0,
"What the running pacers last tick mapped to the wheel was");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "lastran", CTLFLAG_RD,
+ &cts_last_ran[i], 0,
+ "The last usec tick that this hpts ran");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
+ &hpts->p_mysleep.tv_usec, 0,
+ "What the running pacers is using for p_mysleep.tv_usec");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "now_sleeping", CTLFLAG_RD,
+ &hpts->sleeping, 0,
+ "What the running pacers is actually sleeping for");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "syscall_cnt", CTLFLAG_RD,
+ &hpts->syscall_cnt, 0,
+ "How many times we had syscalls on this hpts");
+
hpts->p_hpts_sleep_time = hpts_sleep_max;
hpts->p_num = i;
hpts->p_curtick = tcp_gethptstick(&tv);
+ cts_last_ran[i] = tcp_tv_to_usectick(&tv);
hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
hpts->p_cpu = 0xffff;
- hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
+ hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
callout_init(&hpts->co, 1);
}
@@ -1956,17 +2241,18 @@
/*
* Now lets start ithreads to handle the hptss.
*/
- CPU_FOREACH(i) {
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
hpts = tcp_pace.rp_ent[i];
hpts->p_cpu = i;
error = swi_add(&hpts->ie, "hpts",
tcp_hpts_thread, (void *)hpts,
SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
- if (error) {
- panic("Can't add hpts:%p i:%d err:%d",
- hpts, i, error);
- }
+ KASSERT(error == 0,
+ ("Can't add hpts:%p i:%d err:%d",
+ hpts, i, error));
created++;
+ hpts->p_mysleep.tv_sec = 0;
+ hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
if (tcp_bind_threads == 1) {
if (intr_event_bind(hpts->ie, i) == 0)
bound++;
@@ -1983,18 +2269,13 @@
}
}
tv.tv_sec = 0;
- tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ hpts->sleeping = tv.tv_usec;
sb = tvtosbt(tv);
- if (tcp_hpts_callout_skip_swi == 0) {
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- } else {
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_dir, hpts,
- hpts->p_cpu,
- C_PREL(tcp_hpts_precision));
- }
+ cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
}
/*
* If we somehow have an empty domain, fall back to choosing
@@ -2006,11 +2287,13 @@
break;
}
}
-
printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
created, bound,
tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
+#ifdef INVARIANTS
+ printf("HPTS is in INVARIANT mode!!\n");
+#endif
}
-SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
+SYSINIT(tcphptsi, SI_SUB_SOFTINTR, SI_ORDER_ANY, tcp_init_hptsi, NULL);
MODULE_VERSION(tcphpts, 1);
Index: sys/netinet/tcp_lro.h
===================================================================
--- sys/netinet/tcp_lro.h
+++ sys/netinet/tcp_lro.h
@@ -56,6 +56,11 @@
#define TSTMP_LRO 0x0100
#define TSTMP_HDWR 0x0200
#define HAS_TSTMP 0x0400
+/*
+ * Default number of interrupts on the same cpu in a row
+ * that will cause us to declare a "affinity cpu".
+ */
+#define TCP_LRO_CPU_DECLARATION_THRESH 50
struct inpcb;
@@ -162,12 +167,15 @@
unsigned lro_mbuf_count;
unsigned lro_mbuf_max;
unsigned short lro_ackcnt_lim; /* max # of aggregated ACKs */
+ unsigned short lro_cpu; /* Guess at the cpu we have affinity to */
unsigned lro_length_lim; /* max len of aggregated data */
-
u_long lro_hashsz;
+ uint32_t lro_last_cpu;
+ uint32_t lro_cnt_of_same_cpu;
struct lro_head *lro_hash;
struct lro_head lro_active;
struct lro_head lro_free;
+ uint8_t lro_cpu_is_set; /* Flag to say it's ok to set the CPU on the inp */
};
struct tcp_ackent {
Index: sys/netinet/tcp_lro.c
===================================================================
--- sys/netinet/tcp_lro.c
+++ sys/netinet/tcp_lro.c
@@ -107,6 +107,11 @@
CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
"default number of LRO entries");
+static uint32_t tcp_lro_cpu_set_thresh = TCP_LRO_CPU_DECLARATION_THRESH;
+SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_cpu_threshold,
+ CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_cpu_set_thresh, 0,
+ "Number of interrups in a row on the same CPU that will make us declare an 'affinity' cpu?");
+
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
&tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
@@ -631,12 +636,13 @@
log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
}
log.u_bbr.inflight = th_seq;
+ log.u_bbr.delivered = th_ack;
log.u_bbr.timeStamp = cts;
log.u_bbr.epoch = le->next_seq;
- log.u_bbr.delivered = th_ack;
log.u_bbr.lt_epoch = le->ack_seq;
log.u_bbr.pacing_gain = th_win;
log.u_bbr.cwnd_gain = le->window;
+ log.u_bbr.lost = curcpu;
log.u_bbr.cur_del_rate = (uintptr_t)m;
log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
bintime2timeval(&lc->lro_last_queue_time, &btv);
@@ -1273,7 +1279,10 @@
INP_WUNLOCK(inp);
return (TCP_LRO_CANNOT);
}
-
+ if ((inp->inp_irq_cpu_set == 0) && (lc->lro_cpu_is_set == 1)) {
+ inp->inp_irq_cpu = lc->lro_last_cpu;
+ inp->inp_irq_cpu_set = 1;
+ }
/* Check if the transport doesn't support the needed optimizations. */
if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
INP_WUNLOCK(inp);
@@ -1445,7 +1454,17 @@
/* check if no mbufs to flush */
if (lc->lro_mbuf_count == 0)
goto done;
-
+ if (lc->lro_cpu_is_set == 0) {
+ if (lc->lro_last_cpu == curcpu) {
+ lc->lro_cnt_of_same_cpu++;
+ /* Have we reached the threshold to declare an affinity cpu? */
+ if (lc->lro_cnt_of_same_cpu > tcp_lro_cpu_set_thresh)
+ lc->lro_cpu_is_set = 1;
+ } else {
+ lc->lro_last_cpu = curcpu;
+ lc->lro_cnt_of_same_cpu = 0;
+ }
+ }
CURVNET_SET(lc->ifp->if_vnet);
/* get current time */
@@ -1486,6 +1505,9 @@
/* flush active streams */
tcp_lro_rx_done(lc);
+#ifdef TCPHPTS
+ tcp_run_hpts();
+#endif
lc->lro_mbuf_count = 0;
}
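
The flush-time sampling above declares an affinity CPU only after a long run of flushes on the same CPU, and any migration restarts the count. A standalone model of that hysteresis, using the lro_ctrl fields added in this diff (the helper name is hypothetical):

    static void
    lro_note_cpu(struct lro_ctrl *lc, uint32_t cpu, uint32_t thresh)
    {
        if (lc->lro_cpu_is_set)
            return;                 /* affinity already declared */
        if (lc->lro_last_cpu == cpu) {
            /* Another flush on the same CPU; count toward the threshold. */
            if (++lc->lro_cnt_of_same_cpu > thresh)
                lc->lro_cpu_is_set = 1;
        } else {
            /* Migrated: restart the run on the new CPU. */
            lc->lro_last_cpu = cpu;
            lc->lro_cnt_of_same_cpu = 0;
        }
    }

With thresh = 50, the flag latches once more than 50 consecutive same-CPU flushes have been counted.
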
Index: sys/netinet/tcp_stacks/bbr.c
===================================================================
--- sys/netinet/tcp_stacks/bbr.c
+++ sys/netinet/tcp_stacks/bbr.c
@@ -2429,10 +2429,10 @@
log.u_bbr.pkts_out = diag->co_ret;
log.u_bbr.applimited = diag->hpts_sleep_time;
log.u_bbr.delivered = diag->p_prev_slot;
- log.u_bbr.inflight = diag->p_runningtick;
- log.u_bbr.bw_inuse = diag->wheel_tick;
+ log.u_bbr.inflight = diag->p_runningslot;
+ log.u_bbr.bw_inuse = diag->wheel_slot;
log.u_bbr.rttProp = diag->wheel_cts;
- log.u_bbr.delRate = diag->maxticks;
+ log.u_bbr.delRate = diag->maxslots;
log.u_bbr.cur_del_rate = diag->p_curtick;
log.u_bbr.cur_del_rate <<= 32;
log.u_bbr.cur_del_rate |= diag->p_lasttick;
Index: sys/netinet/tcp_stacks/rack.c
===================================================================
--- sys/netinet/tcp_stacks/rack.c
+++ sys/netinet/tcp_stacks/rack.c
@@ -5609,11 +5609,11 @@
log.u_bbr.pkts_out = diag->co_ret;
log.u_bbr.applimited = diag->hpts_sleep_time;
log.u_bbr.delivered = diag->p_prev_slot;
- log.u_bbr.inflight = diag->p_runningtick;
- log.u_bbr.bw_inuse = diag->wheel_tick;
+ log.u_bbr.inflight = diag->p_runningslot;
+ log.u_bbr.bw_inuse = diag->wheel_slot;
log.u_bbr.rttProp = diag->wheel_cts;
log.u_bbr.timeStamp = cts;
- log.u_bbr.delRate = diag->maxticks;
+ log.u_bbr.delRate = diag->maxslots;
log.u_bbr.cur_del_rate = diag->p_curtick;
log.u_bbr.cur_del_rate <<= 32;
log.u_bbr.cur_del_rate |= diag->p_lasttick;
@@ -5707,22 +5707,22 @@
* on the clock. We always have a min
* 10 slots (10 x 10 i.e. 100 usecs).
*/
- if (slot <= HPTS_TICKS_PER_USEC) {
+ if (slot <= HPTS_TICKS_PER_SLOT) {
/* We gain delay */
- rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot);
- slot = HPTS_TICKS_PER_USEC;
+ rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
+ slot = HPTS_TICKS_PER_SLOT;
} else {
/* We take off some */
- rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC);
- slot = HPTS_TICKS_PER_USEC;
+ rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
+ slot = HPTS_TICKS_PER_SLOT;
}
} else {
slot -= rack->r_ctl.rc_agg_delayed;
rack->r_ctl.rc_agg_delayed = 0;
/* Make sure we have 100 useconds at minimum */
- if (slot < HPTS_TICKS_PER_USEC) {
- rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot;
- slot = HPTS_TICKS_PER_USEC;
+ if (slot < HPTS_TICKS_PER_SLOT) {
+ rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
+ slot = HPTS_TICKS_PER_SLOT;
}
if (rack->r_ctl.rc_agg_delayed == 0)
rack->r_late = 0;
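
With HPTS_TICKS_PER_SLOT equal to 10, a numeric walk through the rc_agg_delayed bookkeeping above:

    /*
     * Branch that forces slot to HPTS_TICKS_PER_SLOT:
     *   slot = 8  -> rc_agg_delayed += (10 - 8); slot = 10 (delay grows)
     *   slot = 40 -> rc_agg_delayed -= (40 - 10); slot = 10 (delay repaid)
     * Else path:
     *   slot = 40, rc_agg_delayed = 25 -> slot = 15, rc_agg_delayed = 0
     *   slot = 12, rc_agg_delayed = 8  -> slot = 4, floored to 10 with
     *   rc_agg_delayed = 6; r_late clears only once rc_agg_delayed
     *   drains to 0.
     */
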
