D31083.id.diff
Index: sys/kern/subr_trap.c
===================================================================
--- sys/kern/subr_trap.c
+++ sys/kern/subr_trap.c
@@ -140,6 +140,16 @@
#ifdef HWPMC_HOOKS
if (PMC_THREAD_HAS_SAMPLES(td))
PMC_CALL_HOOK(td, PMC_FN_THR_USERRET, NULL);
+#endif
+#ifdef TCPHPTS
+ /*
+	 * @gallatin is adamant that this needs to go here, I
+ * am not so sure. Running hpts is a lot like
+ * a lro_flush() that happens while a user process
+ * is running. But he may know best so I will go
+ * with his view of accounting. :-)
+ */
+ tcp_run_hpts();
#endif
/*
* Let the scheduler adjust our priority etc.
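
Note: the hunk above makes every return to userspace an opportunity to drive the pacer, with the hpts callout demoted to a backstop. A minimal sketch of the resulting call path, assuming tcp_run_hpts() simply dispatches to the helpers added later in this diff (the body shown is illustrative, not the literal patch):

	void
	tcp_run_hpts(void)
	{
		struct tcp_hpts_entry *hpts;
		struct epoch_tracker et;

		NET_EPOCH_ENTER(et);	/* tcp_hptsi() asserts the net epoch */
		hpts = tcp_choose_hpts_to_run();	/* this cpu's wheel, or the longest-idle one */
		__tcp_run_hpts(hpts);	/* trylock and drain: tcp_hptsi(hpts, 0) */
		NET_EPOCH_EXIT(et);
	}
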
Index: sys/netinet/in_pcb.h
===================================================================
--- sys/netinet/in_pcb.h
+++ sys/netinet/in_pcb.h
@@ -258,6 +258,7 @@
volatile uint32_t inp_in_input; /* on input hpts (lock b) */
#endif
volatile uint16_t inp_hpts_cpu; /* Lock (i) */
+	volatile uint16_t  inp_irq_cpu;	/* Set by LRO or the driver */
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
@@ -266,7 +267,8 @@
inp_input_cpu_set : 1, /* on input hpts (i) */
inp_hpts_calls :1, /* (i) from output hpts */
inp_input_calls :1, /* (i) from input hpts */
- inp_spare_bits2 : 4;
+ inp_irq_cpu_set :1, /* (i) from LRO/Driver */
+ inp_spare_bits2 : 3;
uint8_t inp_numa_domain; /* numa domain */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct socket *inp_socket; /* (i) back pointer to socket */
Index: sys/netinet/tcp_hpts.h
===================================================================
--- sys/netinet/tcp_hpts.h
+++ sys/netinet/tcp_hpts.h
@@ -44,7 +44,7 @@
TAILQ_HEAD(hptsh, inpcb);
/* Number of useconds in a hpts tick */
-#define HPTS_TICKS_PER_USEC 10
+#define HPTS_TICKS_PER_SLOT 10
#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
@@ -56,7 +56,7 @@
uint32_t p_nxt_slot; /* bbr->flex1 x */
uint32_t p_cur_slot; /* bbr->flex2 x */
uint32_t p_prev_slot; /* bbr->delivered */
- uint32_t p_runningtick; /* bbr->inflight */
+ uint32_t p_runningslot; /* bbr->inflight */
uint32_t slot_req; /* bbr->flex3 x */
uint32_t inp_hptsslot; /* bbr->flex4 x */
uint32_t slot_remaining; /* bbr->flex5 x */
@@ -64,8 +64,8 @@
uint32_t hpts_sleep_time; /* bbr->applimited x */
uint32_t yet_to_sleep; /* bbr->lt_epoch x */
uint32_t need_new_to; /* bbr->flex6 x */
- uint32_t wheel_tick; /* bbr->bw_inuse x */
- uint32_t maxticks; /* bbr->delRate x */
+ uint32_t wheel_slot; /* bbr->bw_inuse x */
+ uint32_t maxslots; /* bbr->delRate x */
uint32_t wheel_cts; /* bbr->rttProp x */
int32_t co_ret; /* bbr->pkts_out x */
uint32_t p_curtick; /* upper bbr->cur_del_rate */
@@ -83,16 +83,20 @@
#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */
#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
+#define DEFAULT_CONNECTION_THESHOLD 100
+
#ifdef _KERNEL
/* Each hpts has its own p_mtx which is used for locking */
struct tcp_hpts_entry {
/* Cache line 0x00 */
struct mtx p_mtx; /* Mutex for hpts */
+ struct timeval p_mysleep; /* Our min sleep time */
+ uint64_t syscall_cnt;
+ uint64_t sleeping; /* What the actual sleep was (if sleeping) */
uint16_t p_hpts_active; /* Flag that says hpts is awake */
- uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */
uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
- uint32_t p_runningtick; /* Current tick we are at if we are running */
+ uint32_t p_runningslot; /* Current tick we are at if we are running */
uint32_t p_prev_slot; /* Previous slot we were on */
uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
uint32_t p_nxt_slot; /* The next slot outside the current range of
@@ -101,7 +105,8 @@
uint32_t p_lasttick; /* Last tick before the current one */
uint8_t p_direct_wake :1, /* boolean */
p_on_min_sleep:1, /* boolean */
- p_avail:6;
+ p_hpts_wake_scheduled:1, /* boolean */
+ p_avail:5;
uint8_t p_fill[3]; /* Fill to 32 bits */
/* Cache line 0x40 */
void *p_inp;
@@ -109,8 +114,6 @@
/* Hptsi wheel */
struct hptsh *p_hptss;
int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
- uint32_t hit_no_enobuf;
- uint32_t p_dyn_adjust;
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
* of 255ms */
 	uint32_t overidden_sleep;	/* what was overridden by min-sleep for logging */
@@ -134,6 +137,7 @@
struct tcp_hptsi {
struct proc *rp_proc; /* Process structure for hpts */
struct tcp_hpts_entry **rp_ent; /* Array of hptss */
+ uint32_t *cts_last_ran;
uint32_t rp_num_hptss; /* Number of hpts threads */
};
@@ -155,10 +159,37 @@
* be sent when a TCB is still around must be
* sent from a routine like tcp_respond().
*/
+#define LOWEST_SLEEP_ALLOWED 50
#define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep
* this determines min granularity of the
- * hpts. If 0, granularity is 10useconds at
- * the cost of more CPU (context switching). */
+					 * hpts. If 1, granularity is 10 microseconds at
+					 * the cost of more CPU (context switching).
+					 * Note: do not set this to 0.
+					 */
+#define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP
+#define DYNAMIC_MAX_SLEEP 100000 /* 100ms */
+/* Number of connections when we start aligning to the cpu from syscalls */
+#define OLDEST_THRESHOLD 1200
+/* Thresholds for raising/lowering sleep */
+#define TICKS_INDICATE_MORE_SLEEP 100 /* This would be 1ms */
+#define TICKS_INDICATE_LESS_SLEEP 1000 /* This would indicate 10ms */
+/**
+ *
+ * Dynamic adjustment of sleeping times is done in "new" mode
+ * where we depend on syscall returns and lro returns to push
+ * hpts forward mainly, and the timer is only a backstop.
+ *
+ * When we are in the "new" mode, i.e. conn_cnt > conn_cnt_thresh,
+ * we make a dynamic adjustment on the time we sleep.
+ * If the number of slots the wheel covered on a run (ticks_ran)
+ * is greater than ticks_indicate_less_sleep (1000 ticks, i.e.
+ * 10ms worth of slots), we woke too late and the actual sleep
+ * time is cut in half, bounded below by dynamic_min_sleep. If
+ * ticks_ran is less than ticks_indicate_more_sleep (100 ticks,
+ * or 1ms worth of slots), there was little to do and the sleep
+ * time is doubled, bounded above by dynamic_max_sleep.
+ *
+ */
+
+
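
Concretely, the adjustment described above reduces to the following sketch of what __tcp_run_hpts() does in tcp_hpts.c later in this diff (hpts_adjust_sleep is an illustrative name, not a function in the patch):

	static void
	hpts_adjust_sleep(struct tcp_hpts_entry *hpts, int ticks_ran)
	{
		if (ticks_ran > ticks_indicate_less_sleep) {
			/* Woken too late: sleep half as long next time. */
			hpts->p_mysleep.tv_usec /= 2;
			if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
				hpts->p_mysleep.tv_usec = dynamic_min_sleep;
		} else if (ticks_ran < ticks_indicate_more_sleep) {
			/* Little to do: back off and let syscalls drive us. */
			hpts->p_mysleep.tv_usec *= 2;
			if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
				hpts->p_mysleep.tv_usec = dynamic_max_sleep;
		}
	}
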
#ifdef _KERNEL
#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp);
@@ -215,43 +246,61 @@
void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
+void tcp_run_hpts(void);
+
+uint16_t hpts_random_cpu(struct inpcb *inp);
+
extern int32_t tcp_min_hptsi_time;
-static __inline uint32_t
-tcp_tv_to_hptstick(struct timeval *sv)
-{
- return ((sv->tv_sec * 100000) + (sv->tv_usec / 10));
-}
+#endif /* _KERNEL */
+/*
+ * The following functions should also be available
+ * to userspace as well.
+ */
static __inline uint32_t
-tcp_gethptstick(struct timeval *sv)
+tcp_tv_to_hptstick(const struct timeval *sv)
{
- struct timeval tv;
-
- if (sv == NULL)
- sv = &tv;
- microuptime(sv);
- return (tcp_tv_to_hptstick(sv));
+ return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT));
}
static __inline uint32_t
-tcp_tv_to_usectick(struct timeval *sv)
+tcp_tv_to_usectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
}
static __inline uint32_t
-tcp_tv_to_mssectick(struct timeval *sv)
+tcp_tv_to_mssectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
}
+static __inline uint64_t
+tcp_tv_to_lusectick(const struct timeval *sv)
+{
+ return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
+}
+
+#ifdef _KERNEL
+
static __inline void
tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
{
mtx_unlock(&hpts->p_mtx);
}
+static __inline uint32_t
+tcp_gethptstick(struct timeval *sv)
+{
+ struct timeval tv;
+
+ if (sv == NULL)
+ sv = &tv;
+ microuptime(sv);
+ return (tcp_tv_to_hptstick(sv));
+}
+
static __inline uint32_t
tcp_get_usecs(struct timeval *tv)
{
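
Because the hunk above deliberately exposes the conversion helpers outside #ifdef _KERNEL, they can be exercised from a userland unit test. A small sketch, assuming the header can be included from userspace (the include path is illustrative):

	#include <assert.h>
	#include <sys/time.h>
	#include <netinet/tcp_hpts.h>	/* path assumed for illustration */

	int
	main(void)
	{
		/* One slot is 10 usec, so one second is 100000 wheel ticks. */
		struct timeval tv = { .tv_sec = 1, .tv_usec = 250 };

		assert(tcp_tv_to_hptstick(&tv) == 100025);	/* 100000 + 250/10 */
		assert(tcp_tv_to_usectick(&tv) == 1000250);
		assert(tcp_tv_to_lusectick(&tv) == 1000250);
		return (0);
	}
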
Index: sys/netinet/tcp_hpts.c
===================================================================
--- sys/netinet/tcp_hpts.c
+++ sys/netinet/tcp_hpts.c
@@ -193,23 +193,29 @@
#else
static int tcp_bind_threads = 2;
#endif
-TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
-
+static int tcp_use_irq_cpu = 0;
static struct tcp_hptsi tcp_pace;
+static uint32_t *cts_last_ran;
static int hpts_does_tp_logging = 0;
+static int hpts_use_assigned_cpu = 1;
+static int32_t hpts_uses_oldest = OLDEST_THRESHOLD;
-static void tcp_wakehpts(struct tcp_hpts_entry *p);
-static void tcp_wakeinput(struct tcp_hpts_entry *p);
static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
-static void tcp_hptsi(struct tcp_hpts_entry *hpts);
+static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);
int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
-static int32_t tcp_hpts_callout_skip_swi = 0;
+static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD;
+static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
+static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP;
+
+
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"TCP Hpts controls");
+SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "TCP Hpts statistics");
#define timersub(tvp, uvp, vvp) \
do { \
@@ -230,44 +236,92 @@
struct hpts_domain_info hpts_domains[MAXMEMDOM];
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
- &tcp_hpts_precision, 120,
- "Value for PRE() precision of callout");
-
counter_u64_t hpts_hopelessly_behind;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
&hpts_hopelessly_behind,
"Number of times hpts could not catch up and was behind hopelessly");
counter_u64_t hpts_loops;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, loops, CTLFLAG_RD,
&hpts_loops, "Number of times hpts had to loop to catch up");
counter_u64_t back_tosleep;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
&back_tosleep, "Number of times hpts found no tcbs");
counter_u64_t combined_wheel_wrap;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
&combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
counter_u64_t wheel_wrap;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, wheel_wrap, CTLFLAG_RD,
&wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
-static int32_t out_ts_percision = 0;
+counter_u64_t hpts_direct_call;
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_call, CTLFLAG_RD,
+ &hpts_direct_call, "Number of times hpts was called by syscall/trap or other entry");
+
+counter_u64_t hpts_wake_timeout;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, timeout_wakeup, CTLFLAG_RD,
+ &hpts_wake_timeout, "Number of times hpts threads woke up via the callout expiring");
+
+counter_u64_t hpts_direct_awakening;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_awakening, CTLFLAG_RD,
+ &hpts_direct_awakening, "Number of times hpts threads woke up via the callout expiring");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
- &out_ts_percision, 0,
- "Do we use a percise timestamp for every output cts");
+counter_u64_t hpts_back_tosleep;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, back_tosleep, CTLFLAG_RD,
+ &hpts_back_tosleep, "Number of times hpts threads woke up via the callout expiring and went back to sleep no work");
+
+counter_u64_t cpu_uses_flowid;
+counter_u64_t cpu_uses_random;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_flowid, CTLFLAG_RD,
+ &cpu_uses_flowid, "Number of times when setting cpuid we used the flowid field");
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_random, CTLFLAG_RD,
+    &cpu_uses_random, "Number of times when setting cpuid we used a random value");
+
+TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
+TUNABLE_INT("net.inet.tcp.use_irq", &tcp_use_irq_cpu);
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD,
+ &tcp_bind_threads, 2,
+ "Thread Binding tunable");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_irq, CTLFLAG_RD,
+ &tcp_use_irq_cpu, 0,
+ "Use of irq CPU tunable");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
+ &tcp_hpts_precision, 120,
+ "Value for PRE() precision of callout");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW,
+ &conn_cnt_thresh, 0,
+ "How many connections (below) make us use the callout based mechanism");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
&hpts_does_tp_logging, 0,
"Do we add to any tp that has logging on pacer logs");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_assigned_cpu, CTLFLAG_RW,
+ &hpts_use_assigned_cpu, 0,
+ "Do we start any hpts timer on the assigned cpu?");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_oldest, CTLFLAG_RW,
+ &hpts_uses_oldest, OLDEST_THRESHOLD,
+    "Do syscalls look for the hpts that has gone the longest without running (or just use the cpu number if 0)?");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW,
+ &dynamic_min_sleep, 250,
+ "What is the dynamic minsleep value?");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW,
+ &dynamic_max_sleep, 5000,
+ "What is the dynamic maxsleep value?");
+
+
+
+
static int32_t max_pacer_loops = 10;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
@@ -287,7 +341,7 @@
new = hpts_sleep_max;
error = sysctl_handle_int(oidp, &new, 0, req);
if (error == 0 && req->newptr) {
- if ((new < (NUM_OF_HPTSI_SLOTS / 4)) ||
+ if ((new < dynamic_min_sleep) ||
(new > HPTS_MAX_SLEEP_ALLOWED))
error = EINVAL;
else
@@ -296,26 +350,60 @@
return (error);
}
+static int
+sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ uint32_t new;
+
+ new = tcp_min_hptsi_time;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error == 0 && req->newptr) {
+ if (new < LOWEST_SLEEP_ALLOWED)
+ error = EINVAL;
+ else
+ tcp_min_hptsi_time = new;
+ }
+ return (error);
+}
+
SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&hpts_sleep_max, 0,
&sysctl_net_inet_tcp_hpts_max_sleep, "IU",
"Maximum time hpts will sleep");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
+SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep,
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&tcp_min_hptsi_time, 0,
+ &sysctl_net_inet_tcp_hpts_min_sleep, "IU",
"The minimum time the hpts must sleep before processing more slots");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
- &tcp_hpts_callout_skip_swi, 0,
- "Do we have the callout call directly to the hpts?");
+static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP;
+static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP;
+static int tcp_hpts_no_wake_over_thresh = 1;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW,
+ &ticks_indicate_more_sleep, 0,
+ "If we only process this many or less on a timeout, we need longer sleep on the next callout");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW,
+ &ticks_indicate_less_sleep, 0,
+ "If we process this many or more on a timeout, we need less sleep on the next callout");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
+ &tcp_hpts_no_wake_over_thresh, 0,
+ "When we are over the threshold on the pacer do we prohibit wakeups?");
static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
- int ticks_to_run, int idx)
+ int slots_to_run, int idx, int from_callout)
{
union tcp_log_stackspecific log;
-
+ /*
+ * Unused logs are
+ * 64 bit - delRate, rttProp, bw_inuse
+ * 16 bit - cwnd_gain
+ * 8 bit - bbr_state, bbr_substate, inhpts, ininput;
+ */
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.flex1 = hpts->p_nxt_slot;
log.u_bbr.flex2 = hpts->p_cur_slot;
@@ -323,8 +411,9 @@
log.u_bbr.flex4 = idx;
log.u_bbr.flex5 = hpts->p_curtick;
log.u_bbr.flex6 = hpts->p_on_queue_cnt;
- log.u_bbr.use_lt_bw = 1;
- log.u_bbr.inflight = ticks_to_run;
+ log.u_bbr.flex7 = hpts->p_cpu;
+ log.u_bbr.flex8 = (uint8_t)from_callout;
+ log.u_bbr.inflight = slots_to_run;
log.u_bbr.applimited = hpts->overidden_sleep;
log.u_bbr.delivered = hpts->saved_curtick;
log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
@@ -332,7 +421,9 @@
log.u_bbr.lt_epoch = hpts->saved_prev_slot;
log.u_bbr.pkts_out = hpts->p_delayed_by;
log.u_bbr.lost = hpts->p_hpts_sleep_time;
- log.u_bbr.cur_del_rate = hpts->p_runningtick;
+ log.u_bbr.pacing_gain = hpts->p_cpu;
+ log.u_bbr.pkt_epoch = hpts->p_runningslot;
+ log.u_bbr.use_lt_bw = 1;
TCP_LOG_EVENTP(tp, NULL,
&tp->t_inpcb->inp_socket->so_rcv,
&tp->t_inpcb->inp_socket->so_snd,
@@ -341,47 +432,40 @@
}
static void
-hpts_timeout_swi(void *arg)
+tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
- struct tcp_hpts_entry *hpts;
+ HPTS_MTX_ASSERT(hpts);
- hpts = (struct tcp_hpts_entry *)arg;
- swi_sched(hpts->ie_cookie, 0);
+ if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
+ hpts->p_direct_wake = 0;
+ return;
+ }
+ if (hpts->p_hpts_wake_scheduled == 0) {
+ hpts->p_hpts_wake_scheduled = 1;
+ swi_sched(hpts->ie_cookie, 0);
+ }
}
static void
-hpts_timeout_dir(void *arg)
+hpts_timeout_swi(void *arg)
{
- tcp_hpts_thread(arg);
+ struct tcp_hpts_entry *hpts;
+
+ hpts = (struct tcp_hpts_entry *)arg;
+ swi_sched(hpts->ie_cookie, 0);
}
static inline void
hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
{
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx) == 0) {
- /* We don't own the mutex? */
- panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
- }
- if (hpts->p_cpu != inp->inp_hpts_cpu) {
- /* It is not the right cpu/mutex? */
- panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
- }
- if (inp->inp_in_hpts == 0) {
- /* We are not on the hpts? */
- panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
- }
-#endif
+ HPTS_MTX_ASSERT(hpts);
+ KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
+ KASSERT(inp->inp_in_hpts != 0, ("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp));
TAILQ_REMOVE(head, inp, inp_hpts);
hpts->p_on_queue_cnt--;
- if (hpts->p_on_queue_cnt < 0) {
- /* Count should not go negative .. */
-#ifdef INVARIANTS
- panic("Hpts goes negative inp:%p hpts:%p",
- inp, hpts);
-#endif
- hpts->p_on_queue_cnt = 0;
- }
+ KASSERT(hpts->p_on_queue_cnt >= 0,
+ ("Hpts goes negative inp:%p hpts:%p",
+ inp, hpts));
if (clear) {
inp->inp_hpts_request = 0;
inp->inp_in_hpts = 0;
@@ -391,20 +475,13 @@
static inline void
hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
{
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx) == 0) {
- /* We don't own the mutex? */
- panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
- }
- if (hpts->p_cpu != inp->inp_hpts_cpu) {
- /* It is not the right cpu/mutex? */
- panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
- }
- if ((noref == 0) && (inp->inp_in_hpts == 1)) {
- /* We are already on the hpts? */
- panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
- }
-#endif
+ HPTS_MTX_ASSERT(hpts);
+ KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
+ ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
+ KASSERT(((noref == 1) && (inp->inp_in_hpts == 1)) ||
+ ((noref == 0) && (inp->inp_in_hpts == 0)),
+ ("%s: hpts:%p inp:%p already on the hpts?",
+ __FUNCTION__, hpts, inp));
TAILQ_INSERT_TAIL(head, inp, inp_hpts);
inp->inp_in_hpts = 1;
hpts->p_on_queue_cnt++;
@@ -416,37 +493,20 @@
static inline void
hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
{
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx) == 0) {
- /* We don't own the mutex? */
- panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
- }
- if (hpts->p_cpu != inp->inp_input_cpu) {
- /* It is not the right cpu/mutex? */
- panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
- }
- if (inp->inp_in_input == 0) {
- /* We are not on the input hpts? */
- panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
- }
-#endif
+ HPTS_MTX_ASSERT(hpts);
+ KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
+ ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
+ KASSERT(inp->inp_in_input != 0,
+ ("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp));
TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
hpts->p_on_inqueue_cnt--;
- if (hpts->p_on_inqueue_cnt < 0) {
-#ifdef INVARIANTS
- panic("Hpts in goes negative inp:%p hpts:%p",
- inp, hpts);
-#endif
- hpts->p_on_inqueue_cnt = 0;
- }
-#ifdef INVARIANTS
- if (TAILQ_EMPTY(&hpts->p_input) &&
- (hpts->p_on_inqueue_cnt != 0)) {
- /* We should not be empty with a queue count */
- panic("%s hpts:%p in_hpts input empty but cnt:%d",
- __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
- }
-#endif
+ KASSERT(hpts->p_on_inqueue_cnt >= 0,
+ ("Hpts in goes negative inp:%p hpts:%p",
+ inp, hpts));
+ KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
+ ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
+ ("%s hpts:%p input cnt (p_on_inqueue):%d and queue state mismatch",
+ __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
if (clear)
inp->inp_in_input = 0;
}
@@ -454,46 +514,17 @@
static inline void
hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
{
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx) == 0) {
- /* We don't own the mutex? */
- panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
- }
- if (hpts->p_cpu != inp->inp_input_cpu) {
- /* It is not the right cpu/mutex? */
- panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
- }
- if (inp->inp_in_input == 1) {
- /* We are already on the input hpts? */
- panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
- }
-#endif
+ HPTS_MTX_ASSERT(hpts);
+ KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
+ ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
+ KASSERT(inp->inp_in_input == 0,
+ ("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp));
TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
inp->inp_in_input = 1;
hpts->p_on_inqueue_cnt++;
in_pcbref(inp);
}
-static void
-tcp_wakehpts(struct tcp_hpts_entry *hpts)
-{
- HPTS_MTX_ASSERT(hpts);
- if (hpts->p_hpts_wake_scheduled == 0) {
- hpts->p_hpts_wake_scheduled = 1;
- swi_sched(hpts->ie_cookie, 0);
- }
-}
-
-static void
-tcp_wakeinput(struct tcp_hpts_entry *hpts)
-{
- HPTS_MTX_ASSERT(hpts);
- if (hpts->p_hpts_wake_scheduled == 0) {
- hpts->p_hpts_wake_scheduled = 1;
- swi_sched(hpts->ie_cookie, 0);
- }
-}
-
struct tcp_hpts_entry *
tcp_cur_hpts(struct inpcb *inp)
{
@@ -514,12 +545,9 @@
again:
hpts_num = inp->inp_hpts_cpu;
hpts = tcp_pace.rp_ent[hpts_num];
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__));
mtx_lock(&hpts->p_mtx);
if (hpts_num != inp->inp_hpts_cpu) {
mtx_unlock(&hpts->p_mtx);
@@ -537,12 +565,9 @@
again:
hpts_num = inp->inp_input_cpu;
hpts = tcp_pace.rp_ent[hpts_num];
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__));
mtx_lock(&hpts->p_mtx);
if (hpts_num != inp->inp_input_cpu) {
mtx_unlock(&hpts->p_mtx);
@@ -555,6 +580,7 @@
tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
{
int32_t add_freed;
+ int32_t ret;
if (inp->inp_flags2 & INP_FREED) {
/*
@@ -567,26 +593,11 @@
add_freed = 0;
}
#ifndef INP_REF_DEBUG
- if (in_pcbrele_wlocked(inp)) {
- /*
- * This should not happen. We have the inpcb referred to by
- * the main socket (why we are called) and the hpts. It
- * should always return 0.
- */
- panic("inpcb:%p release ret 1",
- inp);
- }
+ ret = in_pcbrele_wlocked(inp);
#else
- if (__in_pcbrele_wlocked(inp, line)) {
- /*
- * This should not happen. We have the inpcb referred to by
- * the main socket (why we are called) and the hpts. It
- * should always return 0.
- */
- panic("inpcb:%p release ret 1",
- inp);
- }
+ ret = __in_pcbrele_wlocked(inp, line);
#endif
+ KASSERT(ret != 1, ("inpcb:%p release ret 1", inp));
if (add_freed) {
inp->inp_flags2 |= INP_FREED;
}
@@ -642,73 +653,76 @@
}
static inline int
-hpts_tick(uint32_t wheel_tick, uint32_t plus)
+hpts_slot(uint32_t wheel_slot, uint32_t plus)
{
/*
* Given a slot on the wheel, what slot
* is that plus ticks out?
*/
- KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick));
- return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS);
+ KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot));
+ return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);
}
static inline int
tick_to_wheel(uint32_t cts_in_wticks)
{
/*
- * Given a timestamp in wheel ticks (10usec inc's)
- * map it to our limited space wheel.
+	 * Given a timestamp in ticks (each tick is 10 usec, so
+	 * multiply by HPTS_TICKS_PER_SLOT to convert back to real
+	 * time), map it onto our limited-space wheel.
*/
return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
}
static inline int
-hpts_ticks_diff(int prev_tick, int tick_now)
+hpts_slots_diff(int prev_slot, int slot_now)
{
/*
- * Given two ticks that are someplace
+ * Given two slots that are someplace
* on our wheel. How far are they apart?
*/
- if (tick_now > prev_tick)
- return (tick_now - prev_tick);
- else if (tick_now == prev_tick)
+ if (slot_now > prev_slot)
+ return (slot_now - prev_slot);
+ else if (slot_now == prev_slot)
/*
* Special case, same means we can go all of our
* wheel less one slot.
*/
return (NUM_OF_HPTSI_SLOTS - 1);
else
- return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now);
+ return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now);
}
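
A worked example of the wrap-around arithmetic above (slot numbers chosen for illustration; the wheel size is NUM_OF_HPTSI_SLOTS):

	/*
	 * hpts_slots_diff(10, 14) == 4			simple forward distance
	 * hpts_slots_diff(14, 14) == NUM_OF_HPTSI_SLOTS - 1
	 *						same slot: nearly a full wheel
	 * hpts_slots_diff(14, 10) == (NUM_OF_HPTSI_SLOTS - 14) + 10
	 *						slot_now wrapped past slot 0
	 */
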
/*
- * Given a tick on the wheel that is the current time
- * mapped to the wheel (wheel_tick), what is the maximum
+ * Given a slot on the wheel that is the current time
+ * mapped to the wheel (wheel_slot), what is the maximum
* distance forward that can be obtained without
- * wrapping past either prev_tick or running_tick
+ * wrapping past either prev_slot or running_slot
 * depending on the hpts state? Also if passed
- * a uint32_t *, fill it with the tick location.
+ * a uint32_t *, fill it with the slot location.
*
* Note if you do not give this function the current
- * time (that you think it is) mapped to the wheel
+ * time (that you think it is) mapped to the wheel slot
* then the results will not be what you expect and
* could lead to invalid inserts.
*/
static inline int32_t
-max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick)
+max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot)
{
- uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel;
+ uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel;
if ((hpts->p_hpts_active == 1) &&
(hpts->p_wheel_complete == 0)) {
- end_tick = hpts->p_runningtick;
+ end_slot = hpts->p_runningslot;
/* Back up one tick */
- if (end_tick == 0)
- end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ if (end_slot == 0)
+ end_slot = NUM_OF_HPTSI_SLOTS - 1;
else
- end_tick--;
- if (target_tick)
- *target_tick = end_tick;
+ end_slot--;
+ if (target_slot)
+ *target_slot = end_slot;
} else {
/*
* For the case where we are
@@ -718,26 +732,26 @@
* prev tick and subtract one from it. This puts us
* as far out as possible on the wheel.
*/
- end_tick = hpts->p_prev_slot;
- if (end_tick == 0)
- end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ end_slot = hpts->p_prev_slot;
+ if (end_slot == 0)
+ end_slot = NUM_OF_HPTSI_SLOTS - 1;
else
- end_tick--;
- if (target_tick)
- *target_tick = end_tick;
+ end_slot--;
+ if (target_slot)
+ *target_slot = end_slot;
/*
* Now we have close to the full wheel left minus the
* time it has been since the pacer went to sleep. Note
* that wheel_tick, passed in, should be the current time
* from the perspective of the caller, mapped to the wheel.
*/
- if (hpts->p_prev_slot != wheel_tick)
- dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+ if (hpts->p_prev_slot != wheel_slot)
+ dis_to_travel = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
else
dis_to_travel = 1;
/*
* dis_to_travel in this case is the space from when the
- * pacer stopped (p_prev_slot) and where our wheel_tick
+ * pacer stopped (p_prev_slot) and where our wheel_slot
* is now. To know how many slots we can put it in we
* subtract from the wheel size. We would not want
* to place something after p_prev_slot or it will
@@ -746,21 +760,21 @@
return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
}
/*
- * So how many slots are open between p_runningtick -> p_cur_slot
+ * So how many slots are open between p_runningslot -> p_cur_slot
* that is what is currently un-available for insertion. Special
* case when we are at the last slot, this gets 1, so that
* the answer to how many slots are available is all but 1.
*/
- if (hpts->p_runningtick == hpts->p_cur_slot)
+ if (hpts->p_runningslot == hpts->p_cur_slot)
dis_to_travel = 1;
else
- dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ dis_to_travel = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
/*
* How long has the pacer been running?
*/
- if (hpts->p_cur_slot != wheel_tick) {
+ if (hpts->p_cur_slot != wheel_slot) {
/* The pacer is a bit late */
- pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick);
+ pacer_to_now = hpts_slots_diff(hpts->p_cur_slot, wheel_slot);
} else {
/* The pacer is right on time, now == pacers start time */
pacer_to_now = 0;
@@ -774,24 +788,24 @@
/*
* Now how many of those we will eat due to the pacer's
* time (p_cur_slot) of start being behind the
- * real time (wheel_tick)?
+ * real time (wheel_slot)?
*/
if (avail_on_wheel <= pacer_to_now) {
/*
* Wheel wrap, we can't fit on the wheel, that
* is unusual the system must be way overloaded!
- * Insert into the assured tick, and return special
+ * Insert into the assured slot, and return special
* "0".
*/
counter_u64_add(combined_wheel_wrap, 1);
- *target_tick = hpts->p_nxt_slot;
+ *target_slot = hpts->p_nxt_slot;
return (0);
} else {
/*
* We know how many slots are open
* on the wheel (the reverse of what
* is left to run. Take away the time
- * the pacer started to now (wheel_tick)
+ * the pacer started to now (wheel_slot)
* and that tells you how many slots are
* open that can be inserted into that won't
* be touched by the pacer until later.
@@ -815,7 +829,7 @@
* A sleeping hpts we want in next slot to run
* note that in this state p_prev_slot == p_cur_slot
*/
- inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1);
+ inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1);
if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
need_wake = 1;
} else if ((void *)inp == hpts->p_inp) {
@@ -827,7 +841,7 @@
*/
inp->inp_hptsslot = hpts->p_nxt_slot;
} else
- inp->inp_hptsslot = hpts->p_runningtick;
+ inp->inp_hptsslot = hpts->p_runningslot;
hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
if (need_wake) {
/*
@@ -862,9 +876,9 @@
* Sanity checks for the pacer with invariants
* on insert.
*/
- if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS)
- panic("hpts:%p inp:%p slot:%d > max",
- hpts, inp, inp_hptsslot);
+ KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS,
+ ("hpts:%p inp:%p slot:%d > max",
+ hpts, inp, inp_hptsslot));
if ((hpts->p_hpts_active) &&
(hpts->p_wheel_complete == 0)) {
/*
@@ -875,17 +889,16 @@
*/
int distance, yet_to_run;
- distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot);
- if (hpts->p_runningtick != hpts->p_cur_slot)
- yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot);
+ if (hpts->p_runningslot != hpts->p_cur_slot)
+ yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
else
yet_to_run = 0; /* processing last slot */
- if (yet_to_run > distance) {
- panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
- hpts, inp, inp_hptsslot,
- distance, yet_to_run,
- hpts->p_runningtick, hpts->p_cur_slot);
- }
+ KASSERT(yet_to_run <= distance,
+ ("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
+ hpts, inp, inp_hptsslot,
+ distance, yet_to_run,
+ hpts->p_runningslot, hpts->p_cur_slot));
}
}
#endif
@@ -895,8 +908,9 @@
struct hpts_diag *diag, struct timeval *tv)
{
uint32_t need_new_to = 0;
- uint32_t wheel_cts, last_tick;
- int32_t wheel_tick, maxticks;
+ uint32_t wheel_cts;
+ int32_t wheel_slot, maxslots, last_slot;
+ int cpu;
int8_t need_wakeup = 0;
HPTS_MTX_ASSERT(hpts);
@@ -904,7 +918,7 @@
memset(diag, 0, sizeof(struct hpts_diag));
diag->p_hpts_active = hpts->p_hpts_active;
diag->p_prev_slot = hpts->p_prev_slot;
- diag->p_runningtick = hpts->p_runningtick;
+ diag->p_runningslot = hpts->p_runningslot;
diag->p_nxt_slot = hpts->p_nxt_slot;
diag->p_cur_slot = hpts->p_cur_slot;
diag->p_curtick = hpts->p_curtick;
@@ -913,131 +927,120 @@
diag->p_on_min_sleep = hpts->p_on_min_sleep;
diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
}
- if (inp->inp_in_hpts == 0) {
- if (slot == 0) {
- /* Immediate */
- tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
- return;
- }
- /* Get the current time relative to the wheel */
- wheel_cts = tcp_tv_to_hptstick(tv);
- /* Map it onto the wheel */
- wheel_tick = tick_to_wheel(wheel_cts);
- /* Now what's the max we can place it at? */
- maxticks = max_ticks_available(hpts, wheel_tick, &last_tick);
- if (diag) {
- diag->wheel_tick = wheel_tick;
- diag->maxticks = maxticks;
- diag->wheel_cts = wheel_cts;
+ KASSERT(inp->inp_in_hpts == 0, ("Hpts:%p tp:%p already on hpts and add?", hpts, inp));
+ if (slot == 0) {
+ /* Immediate */
+ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
+ return;
+ }
+ /* Get the current time relative to the wheel */
+ wheel_cts = tcp_tv_to_hptstick(tv);
+ /* Map it onto the wheel */
+ wheel_slot = tick_to_wheel(wheel_cts);
+ /* Now what's the max we can place it at? */
+ maxslots = max_slots_available(hpts, wheel_slot, &last_slot);
+ if (diag) {
+ diag->wheel_slot = wheel_slot;
+ diag->maxslots = maxslots;
+ diag->wheel_cts = wheel_cts;
+ }
+ if (maxslots == 0) {
+ /* The pacer is in a wheel wrap behind, yikes! */
+ if (slot > 1) {
+ /*
+ * Reduce by 1 to prevent a forever loop in
+ * case something else is wrong. Note this
+		 * probably does not hurt because if this is
+		 * true the pacer is so far behind that we
+		 * will be > 1 second late calling anyway.
+ */
+ slot--;
}
- if (maxticks == 0) {
- /* The pacer is in a wheel wrap behind, yikes! */
- if (slot > 1) {
- /*
- * Reduce by 1 to prevent a forever loop in
- * case something else is wrong. Note this
- * probably does not hurt because the pacer
- * if its true is so far behind we will be
- * > 1second late calling anyway.
- */
- slot--;
- }
- inp->inp_hptsslot = last_tick;
- inp->inp_hpts_request = slot;
- } else if (maxticks >= slot) {
- /* It all fits on the wheel */
- inp->inp_hpts_request = 0;
- inp->inp_hptsslot = hpts_tick(wheel_tick, slot);
- } else {
- /* It does not fit */
- inp->inp_hpts_request = slot - maxticks;
- inp->inp_hptsslot = last_tick;
+ inp->inp_hptsslot = last_slot;
+ inp->inp_hpts_request = slot;
+ } else if (maxslots >= slot) {
+ /* It all fits on the wheel */
+ inp->inp_hpts_request = 0;
+ inp->inp_hptsslot = hpts_slot(wheel_slot, slot);
+ } else {
+ /* It does not fit */
+ inp->inp_hpts_request = slot - maxslots;
+ inp->inp_hptsslot = last_slot;
+ }
+ if (diag) {
+ diag->slot_remaining = inp->inp_hpts_request;
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ }
+#ifdef INVARIANTS
+ check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
+#endif
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
+ if ((hpts->p_hpts_active == 0) &&
+ (inp->inp_hpts_request == 0) &&
+ (hpts->p_on_min_sleep == 0)) {
+ /*
+ * The hpts is sleeping and NOT on a minimum
+ * sleep time, we need to figure out where
+ * it will wake up at and if we need to reschedule
+ * its time-out.
+ */
+ uint32_t have_slept, yet_to_sleep;
+
+ /* Now do we need to restart the hpts's timer? */
+ have_slept = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
+ if (have_slept < hpts->p_hpts_sleep_time)
+ yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
+ else {
+ /* We are over-due */
+ yet_to_sleep = 0;
+ need_wakeup = 1;
}
if (diag) {
- diag->slot_remaining = inp->inp_hpts_request;
- diag->inp_hptsslot = inp->inp_hptsslot;
+ diag->have_slept = have_slept;
+ diag->yet_to_sleep = yet_to_sleep;
}
-#ifdef INVARIANTS
- check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
-#endif
- hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
- if ((hpts->p_hpts_active == 0) &&
- (inp->inp_hpts_request == 0) &&
- (hpts->p_on_min_sleep == 0)) {
+ if (yet_to_sleep &&
+ (yet_to_sleep > slot)) {
/*
- * The hpts is sleeping and not on a minimum
- * sleep time, we need to figure out where
- * it will wake up at and if we need to reschedule
- * its time-out.
+ * We need to reschedule the hpts's time-out.
*/
- uint32_t have_slept, yet_to_sleep;
-
- /* Now do we need to restart the hpts's timer? */
- have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
- if (have_slept < hpts->p_hpts_sleep_time)
- yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
- else {
- /* We are over-due */
- yet_to_sleep = 0;
- need_wakeup = 1;
- }
- if (diag) {
- diag->have_slept = have_slept;
- diag->yet_to_sleep = yet_to_sleep;
- }
- if (yet_to_sleep &&
- (yet_to_sleep > slot)) {
- /*
- * We need to reschedule the hpts's time-out.
- */
- hpts->p_hpts_sleep_time = slot;
- need_new_to = slot * HPTS_TICKS_PER_USEC;
- }
+ hpts->p_hpts_sleep_time = slot;
+ need_new_to = slot * HPTS_TICKS_PER_SLOT;
}
- /*
- * Now how far is the hpts sleeping to? if active is 1, its
- * up and ticking we do nothing, otherwise we may need to
- * reschedule its callout if need_new_to is set from above.
- */
- if (need_wakeup) {
- hpts->p_direct_wake = 1;
- tcp_wakehpts(hpts);
- if (diag) {
- diag->need_new_to = 0;
- diag->co_ret = 0xffff0000;
- }
- } else if (need_new_to) {
- int32_t co_ret;
- struct timeval tv;
- sbintime_t sb;
+ }
+ /*
+ * Now how far is the hpts sleeping to? if active is 1, its
+ * up and ticking we do nothing, otherwise we may need to
+ * reschedule its callout if need_new_to is set from above.
+ */
+ if (need_wakeup) {
+ hpts->p_direct_wake = 1;
+ tcp_wakehpts(hpts);
+ if (diag) {
+ diag->need_new_to = 0;
+ diag->co_ret = 0xffff0000;
+ }
+ } else if (need_new_to) {
+ int32_t co_ret;
+ struct timeval tv;
+ sbintime_t sb;
- tv.tv_sec = 0;
- tv.tv_usec = 0;
- while (need_new_to > HPTS_USEC_IN_SEC) {
- tv.tv_sec++;
- need_new_to -= HPTS_USEC_IN_SEC;
- }
- tv.tv_usec = need_new_to;
- sb = tvtosbt(tv);
- if (tcp_hpts_callout_skip_swi == 0) {
- co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- } else {
- co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_dir, hpts,
- hpts->p_cpu,
- C_PREL(tcp_hpts_precision));
- }
- if (diag) {
- diag->need_new_to = need_new_to;
- diag->co_ret = co_ret;
- }
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ while (need_new_to > HPTS_USEC_IN_SEC) {
+ tv.tv_sec++;
+ need_new_to -= HPTS_USEC_IN_SEC;
+ }
+ tv.tv_usec = need_new_to;
+ sb = tvtosbt(tv);
+ cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+ co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ if (diag) {
+ diag->need_new_to = need_new_to;
+ diag->co_ret = co_ret;
}
- } else {
-#ifdef INVARIANTS
- panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp);
-#endif
}
}
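
A worked example of the placement logic in tcp_hpts_insert_diag() above (the numbers are illustrative): suppose a connection asks to be paced 5000 slots (50 ms) out, but only 3000 slots of the wheel are insertable without passing the pacer:

	/*
	 * maxslots = 3000, slot = 5000
	 *   -> inp_hpts_request = 5000 - 3000 = 2000	deferred remainder
	 *   -> inp_hptsslot     = last_slot		parked at the far edge
	 *
	 * When the pacer reaches last_slot, tcp_hptsi() re-inserts the inp
	 * another 2000 slots out (see the remaining_slots handling below).
	 */
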
@@ -1066,6 +1069,7 @@
__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
return (tcp_hpts_insert_diag(inp, slot, line, NULL));
}
+
int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
{
@@ -1076,18 +1080,20 @@
/* Ok we need to set it on the hpts in the current slot */
hpts_sane_input_insert(hpts, inp, line);
retval = 1;
- if (hpts->p_hpts_active == 0) {
+ if ((hpts->p_hpts_active == 0) &&
+	    (hpts->p_on_min_sleep == 0)) {
/*
* Activate the hpts if it is sleeping.
*/
retval = 2;
hpts->p_direct_wake = 1;
- tcp_wakeinput(hpts);
+ tcp_wakehpts(hpts);
}
- } else if (hpts->p_hpts_active == 0) {
+ } else if ((hpts->p_hpts_active == 0) &&
+	    (hpts->p_on_min_sleep == 0)) {
retval = 4;
hpts->p_direct_wake = 1;
- tcp_wakeinput(hpts);
+ tcp_wakehpts(hpts);
}
return (retval);
}
@@ -1115,22 +1121,24 @@
if (inp->inp_in_input == 0) {
/* Ok we need to set it on the hpts in the current slot */
hpts_sane_input_insert(hpts, inp, line);
- if (hpts->p_hpts_active == 0) {
+ if ((hpts->p_hpts_active == 0) &&
+		    (hpts->p_on_min_sleep == 0)) {
/*
* Activate the hpts if it is sleeping.
*/
hpts->p_direct_wake = 1;
- tcp_wakeinput(hpts);
+ tcp_wakehpts(hpts);
}
- } else if (hpts->p_hpts_active == 0) {
+ } else if ((hpts->p_hpts_active == 0) &&
+	} else if ((hpts->p_hpts_active == 0) &&
+	    (hpts->p_on_min_sleep == 0)) {
hpts->p_direct_wake = 1;
- tcp_wakeinput(hpts);
+ tcp_wakehpts(hpts);
}
inp->inp_hpts_drop_reas = reason;
mtx_unlock(&hpts->p_mtx);
}
-static uint16_t
+uint16_t
hpts_random_cpu(struct inpcb *inp){
/*
* No flow type set distribute the load randomly.
@@ -1149,18 +1157,19 @@
}
/* Nothing set use a random number */
ran = arc4random();
- cpuid = (ran & 0xffff) % mp_ncpus;
+ cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
return (cpuid);
}
static uint16_t
-hpts_cpuid(struct inpcb *inp)
+hpts_cpuid(struct inpcb *inp, int *failed)
{
u_int cpuid;
#if !defined(RSS) && defined(NUMA)
struct hpts_domain_info *di;
#endif
+ *failed = 0;
/*
* If one has been set use it i.e. we want both in and out on the
* same hpts.
@@ -1170,6 +1179,17 @@
} else if (inp->inp_hpts_cpu_set) {
return (inp->inp_hpts_cpu);
}
+ /*
+ * If we are using the irq cpu set by LRO or
+ * the driver then it overrides all other domains.
+ */
+ if (tcp_use_irq_cpu) {
+ if (inp->inp_irq_cpu_set == 0) {
+ *failed = 1;
+			return (0);
+		}
+		return (inp->inp_irq_cpu);
+ }
/* If one is set the other must be the same */
#ifdef RSS
cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
@@ -1183,9 +1203,10 @@
* unknown cpuids to curcpu. Not the best, but apparently better
* than defaulting to swi 0.
*/
-
- if (inp->inp_flowtype == M_HASHTYPE_NONE)
+ if (inp->inp_flowtype == M_HASHTYPE_NONE) {
+ counter_u64_add(cpu_uses_random, 1);
return (hpts_random_cpu(inp));
+ }
/*
* Hash to a thread based on the flowid. If we are using numa,
* then restrict the hash to the numa domain where the inp lives.
@@ -1197,7 +1218,7 @@
} else
#endif
cpuid = inp->inp_flowid % mp_ncpus;
-
+ counter_u64_add(cpu_uses_flowid, 1);
return (cpuid);
#endif
}
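
To summarize the precedence the rewritten hpts_cpuid() applies (a reading of the code above, highest priority first):

	/*
	 * 1. inp_input_cpu_set / inp_hpts_cpu_set -> reuse the stored cpu.
	 * 2. tcp_use_irq_cpu -> inp_irq_cpu as set by LRO or the driver;
	 *    if it was never set, *failed = 1 and the caller must not
	 *    latch a cpu yet (see __tcp_set_hpts() below).
	 * 3. RSS, if compiled in -> rss_hash2cpuid().
	 * 4. A valid flowid (NUMA-restricted when enabled) -> hash,
	 *    counted in cpusel_flowid.
	 * 5. No flowtype -> hpts_random_cpu(), counted in cpusel_random.
	 */
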
@@ -1323,7 +1344,7 @@
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
}
- if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+ if ((tp->t_fb->tfb_do_queued_segments != NULL) && tp->t_in_pkt) {
if (inp->inp_in_input)
tcp_hpts_remove(inp, HPTS_REMOVE_INPUT);
dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
@@ -1357,23 +1378,51 @@
}
static void
-tcp_hptsi(struct tcp_hpts_entry *hpts)
+tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)
+{
+ uint32_t t = 0, i, fnd = 0;
+
+ if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
+ /*
+ * Find next slot that is occupied and use that to
+ * be the sleep time.
+ */
+ for (i = 0, t = hpts_slot(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
+ if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
+ fnd = 1;
+ break;
+ }
+ t = (t + 1) % NUM_OF_HPTSI_SLOTS;
+ }
+ KASSERT(fnd != 0, ("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt));
+ hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
+ } else {
+		/* No one on the wheel; sleep for all but 400 slots, or sleep max */
+ hpts->p_hpts_sleep_time = hpts_sleep_max;
+ }
+}
+
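
A quick worked example of the sleep-time selection above (numbers illustrative):

	/*
	 * If the nearest occupied slot is 37 positions ahead of p_cur_slot,
	 * the loop exits with i == 36, so p_hpts_sleep_time becomes
	 * min(37, hpts_sleep_max) slots; callers convert that to a timeout
	 * of 37 * HPTS_TICKS_PER_SLOT = 370 usec (see __tcp_run_hpts()).
	 */
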
+static int32_t
+tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout)
{
struct tcpcb *tp;
struct inpcb *inp = NULL, *ninp;
struct timeval tv;
- int32_t ticks_to_run, i, error;
+ uint64_t total_slots_processed = 0;
+ int32_t slots_to_run, i, error;
int32_t paced_cnt = 0;
int32_t loop_cnt = 0;
int32_t did_prefetch = 0;
int32_t prefetch_ninp = 0;
int32_t prefetch_tp = 0;
int32_t wrap_loop_cnt = 0;
+ int32_t slot_pos_of_endpoint = 0;
+ int32_t orig_exit_slot;
int16_t set_cpu;
+ int8_t completed_measure = 0, seen_endpoint = 0;
HPTS_MTX_ASSERT(hpts);
NET_EPOCH_ASSERT();
-
/* record previous info for any logging */
hpts->saved_lasttick = hpts->p_lasttick;
hpts->saved_curtick = hpts->p_curtick;
@@ -1382,7 +1431,8 @@
hpts->p_lasttick = hpts->p_curtick;
hpts->p_curtick = tcp_gethptstick(&tv);
- hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
+ orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
if ((hpts->p_on_queue_cnt == 0) ||
(hpts->p_lasttick == hpts->p_curtick)) {
/*
@@ -1396,8 +1446,9 @@
again:
hpts->p_wheel_complete = 0;
HPTS_MTX_ASSERT(hpts);
- ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot);
- if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) &&
+ slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot);
+ if (((hpts->p_curtick - hpts->p_lasttick) >
+ ((NUM_OF_HPTSI_SLOTS-1) * HPTS_TICKS_PER_SLOT)) &&
(hpts->p_on_queue_cnt != 0)) {
/*
		 * Wheel wrap is occurring, basically we
@@ -1416,8 +1467,8 @@
* first slot at the head.
*/
wrap_loop_cnt++;
- hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1);
- hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2);
+ hpts->p_nxt_slot = hpts_slot(hpts->p_prev_slot, 1);
+ hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 2);
/*
* Adjust p_cur_slot to be where we are starting from
* hopefully we will catch up (fat chance if something
@@ -1438,58 +1489,61 @@
* INP lock and the pacer mutex to change the inp_hptsslot.
*/
TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) {
- inp->inp_hptsslot = hpts->p_runningtick;
+ inp->inp_hptsslot = hpts->p_runningslot;
}
#endif
- TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick],
+ TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot],
&hpts->p_hptss[hpts->p_nxt_slot], inp_hpts);
- ticks_to_run = NUM_OF_HPTSI_SLOTS - 1;
+ slots_to_run = NUM_OF_HPTSI_SLOTS - 1;
counter_u64_add(wheel_wrap, 1);
} else {
/*
- * Nxt slot is always one after p_runningtick though
+ * Nxt slot is always one after p_runningslot though
* its not used usually unless we are doing wheel wrap.
*/
hpts->p_nxt_slot = hpts->p_prev_slot;
- hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1);
- }
-#ifdef INVARIANTS
- if (TAILQ_EMPTY(&hpts->p_input) &&
- (hpts->p_on_inqueue_cnt != 0)) {
- panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
+ hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 1);
}
-#endif
+ KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
+ ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
+ ("%s hpts:%p in_hpts cnt:%d and queue state mismatch",
+ __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
HPTS_MTX_ASSERT(hpts);
if (hpts->p_on_queue_cnt == 0) {
goto no_one;
}
HPTS_MTX_ASSERT(hpts);
- for (i = 0; i < ticks_to_run; i++) {
+ for (i = 0; i < slots_to_run; i++) {
/*
* Calculate our delay, if there are no extra ticks there
- * was not any (i.e. if ticks_to_run == 1, no delay).
+ * was not any (i.e. if slots_to_run == 1, no delay).
*/
- hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
+ hpts->p_delayed_by = (slots_to_run - (i + 1)) * HPTS_TICKS_PER_SLOT;
HPTS_MTX_ASSERT(hpts);
- while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
+ while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningslot])) != NULL) {
+ HPTS_MTX_ASSERT(hpts);
/* For debugging */
+ if (seen_endpoint == 0) {
+ seen_endpoint = 1;
+ orig_exit_slot = slot_pos_of_endpoint = hpts->p_runningslot;
+ } else if (completed_measure == 0) {
+ /* Record the new position */
+ orig_exit_slot = hpts->p_runningslot;
+ }
+ total_slots_processed++;
hpts->p_inp = inp;
paced_cnt++;
-#ifdef INVARIANTS
- if (hpts->p_runningtick != inp->inp_hptsslot) {
- panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
- hpts, inp, hpts->p_runningtick, inp->inp_hptsslot);
- }
-#endif
+ KASSERT(hpts->p_runningslot == inp->inp_hptsslot,
+ ("Hpts:%p inp:%p slot mis-aligned %u vs %u",
+ hpts, inp, hpts->p_runningslot, inp->inp_hptsslot));
/* Now pull it */
if (inp->inp_hpts_cpu_set == 0) {
set_cpu = 1;
} else {
set_cpu = 0;
}
- hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0);
- if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningslot], 0);
+ if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningslot])) != NULL) {
/* We prefetch the next inp if possible */
kern_prefetch(ninp, &prefetch_ninp);
prefetch_ninp = 1;
@@ -1501,22 +1555,22 @@
* Push him back on the wheel or run it
* depending.
*/
- uint32_t maxticks, last_tick, remaining_slots;
+ uint32_t maxslots, last_slot, remaining_slots;
- remaining_slots = ticks_to_run - (i + 1);
+ remaining_slots = slots_to_run - (i + 1);
if (inp->inp_hpts_request > remaining_slots) {
/*
* How far out can we go?
*/
- maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick);
- if (maxticks >= inp->inp_hpts_request) {
+ maxslots = max_slots_available(hpts, hpts->p_cur_slot, &last_slot);
+ if (maxslots >= inp->inp_hpts_request) {
/* we can place it finally to be processed */
- inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request);
+ inp->inp_hptsslot = hpts_slot(hpts->p_runningslot, inp->inp_hpts_request);
inp->inp_hpts_request = 0;
} else {
/* Work off some more time */
- inp->inp_hptsslot = last_tick;
- inp->inp_hpts_request-= maxticks;
+ inp->inp_hptsslot = last_slot;
+					inp->inp_hpts_request -= maxslots;
}
hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1);
hpts->p_inp = NULL;
@@ -1542,12 +1596,9 @@
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
(inp->inp_flags2 & INP_FREED)) {
out_now:
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__));
INP_WUNLOCK(inp);
mtx_lock(&hpts->p_mtx);
hpts->p_inp = NULL;
@@ -1582,7 +1633,7 @@
#endif
/* Lets do any logging that we might want to */
if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
- tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i);
+ tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout);
}
/*
* There is a hole here, we get the refcnt on the
@@ -1592,12 +1643,10 @@
* fini gets the lock first we are assured of having
* a sane INP we can lock and test.
*/
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx before tcp-output:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to tcp_output call line:%d",
+ hpts, __LINE__));
+
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
@@ -1653,20 +1702,27 @@
CURVNET_RESTORE();
#endif
INP_UNLOCK_ASSERT(inp);
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__));
mtx_lock(&hpts->p_mtx);
hpts->p_inp = NULL;
}
+ if (seen_endpoint) {
+ /*
+		 * We now have an accurate distance between
+ * slot_pos_of_endpoint <-> orig_exit_slot
+ * to tell us how late we were, orig_exit_slot
+ * is where we calculated the end of our cycle to
+ * be when we first entered.
+ */
+ completed_measure = 1;
+ }
HPTS_MTX_ASSERT(hpts);
hpts->p_inp = NULL;
- hpts->p_runningtick++;
- if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) {
- hpts->p_runningtick = 0;
+ hpts->p_runningslot++;
+ if (hpts->p_runningslot >= NUM_OF_HPTSI_SLOTS) {
+ hpts->p_runningslot = 0;
}
}
no_one:
@@ -1676,16 +1732,13 @@
* Check to see if we took an excess amount of time and need to run
* more ticks (if we did not hit eno-bufs).
*/
-#ifdef INVARIANTS
- if (TAILQ_EMPTY(&hpts->p_input) &&
- (hpts->p_on_inqueue_cnt != 0)) {
- panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
- }
-#endif
+ KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
+ ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
+ ("%s hpts:%p in_hpts cnt:%d queue state mismatch",
+ __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
hpts->p_prev_slot = hpts->p_cur_slot;
hpts->p_lasttick = hpts->p_curtick;
- if (loop_cnt > max_pacer_loops) {
+ if ((from_callout == 0) || (loop_cnt > max_pacer_loops)) {
/*
		 * Something is seriously slow; we have
* looped through processing the wheel
@@ -1700,11 +1753,16 @@
* correct. When it next awakens
* it will find itself further behind.
*/
- counter_u64_add(hpts_hopelessly_behind, 1);
+ if (from_callout)
+ counter_u64_add(hpts_hopelessly_behind, 1);
goto no_run;
}
hpts->p_curtick = tcp_gethptstick(&tv);
hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ if (seen_endpoint == 0) {
+ /* We saw no endpoint but we may be looping */
+ orig_exit_slot = hpts->p_cur_slot;
+ }
if ((wrap_loop_cnt < 2) &&
(hpts->p_lasttick != hpts->p_curtick)) {
counter_u64_add(hpts_loops, 1);
@@ -1712,6 +1770,7 @@
goto again;
}
no_run:
+ cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
/*
* Set flag to tell that we are done for
* any slot input that happens during
@@ -1725,76 +1784,58 @@
if (!TAILQ_EMPTY(&hpts->p_input)) {
tcp_input_data(hpts, &tv);
/*
- * Now did we spend too long running
- * input and need to run more ticks?
+ * Now did we spend too long running input and need to run more ticks?
+ * Note that if wrap_loop_cnt < 2 then we should have the conditions
+		 * in the KASSERTs true. But if the wheel is behind, i.e. wrap_loop_cnt
+		 * is greater than 2, then the conditions most likely are *not* true. Also,
+		 * if we are not called from the callout, we don't run the wheel multiple
+		 * times, so the slots may not align either.
*/
- KASSERT(hpts->p_prev_slot == hpts->p_cur_slot,
+ KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) ||
+ (wrap_loop_cnt >= 2) || (from_callout == 0)),
("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
hpts->p_prev_slot, hpts->p_cur_slot));
- KASSERT(hpts->p_lasttick == hpts->p_curtick,
+ KASSERT(((hpts->p_lasttick == hpts->p_curtick)
+ || (wrap_loop_cnt >= 2) || (from_callout == 0)),
("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
hpts->p_lasttick, hpts->p_curtick));
- hpts->p_curtick = tcp_gethptstick(&tv);
- if (hpts->p_lasttick != hpts->p_curtick) {
+ if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) {
+ hpts->p_curtick = tcp_gethptstick(&tv);
counter_u64_add(hpts_loops, 1);
hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
goto again;
}
}
- {
- uint32_t t = 0, i, fnd = 0;
-
- if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
- /*
- * Find next slot that is occupied and use that to
- * be the sleep time.
- */
- for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
- if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
- fnd = 1;
- break;
- }
- t = (t + 1) % NUM_OF_HPTSI_SLOTS;
- }
- if (fnd) {
- hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
- } else {
-#ifdef INVARIANTS
- panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
-#endif
- counter_u64_add(back_tosleep, 1);
- hpts->p_on_queue_cnt = 0;
- goto non_found;
- }
- } else if (wrap_loop_cnt >= 2) {
- /* Special case handling */
- hpts->p_hpts_sleep_time = tcp_min_hptsi_time;
- } else {
- /* No one on the wheel sleep for all but 400 slots or sleep max */
- non_found:
- hpts->p_hpts_sleep_time = hpts_sleep_max;
- }
+ if (from_callout) {
+ tcp_hpts_set_max_sleep(hpts, wrap_loop_cnt);
}
+ if (seen_endpoint)
+ return (hpts_slots_diff(slot_pos_of_endpoint, orig_exit_slot));
+ else
+ return (0);
}
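tcp_hptsi() now reports how much of the wheel a run covered: the return value is the slot distance from where the first endpoint was processed to the slot at exit. The hpts_slots_diff() helper lives in an earlier hunk of this diff; as a rough sketch of what a wrap-aware slot distance looks like (the name suffix and edge-case handling here are illustrative, not the committed helper):

static int32_t
hpts_slots_diff_sketch(int32_t prev_slot, int32_t slot_now)
{
	/* Wrap-aware distance walking forward from prev_slot to slot_now. */
	if (slot_now >= prev_slot)
		return (slot_now - prev_slot);
	/* We wrapped past the end of the wheel; add the wheel size back. */
	return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now);
}

The callers below use this count (ticks_ran) to decide whether the pacer should sleep more or less the next time around.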
void
__tcp_set_hpts(struct inpcb *inp, int32_t line)
{
struct tcp_hpts_entry *hpts;
+ int failed;
INP_WLOCK_ASSERT(inp);
hpts = tcp_hpts_lock(inp);
if ((inp->inp_in_hpts == 0) &&
(inp->inp_hpts_cpu_set == 0)) {
- inp->inp_hpts_cpu = hpts_cpuid(inp);
- inp->inp_hpts_cpu_set = 1;
+ inp->inp_hpts_cpu = hpts_cpuid(inp, &failed);
+ if (failed == 0)
+ inp->inp_hpts_cpu_set = 1;
}
mtx_unlock(&hpts->p_mtx);
hpts = tcp_input_lock(inp);
if ((inp->inp_input_cpu_set == 0) &&
(inp->inp_in_input == 0)) {
- inp->inp_input_cpu = hpts_cpuid(inp);
- inp->inp_input_cpu_set = 1;
+ inp->inp_input_cpu = hpts_cpuid(inp, &failed);
+ if (failed == 0)
+ inp->inp_input_cpu_set = 1;
}
mtx_unlock(&hpts->p_mtx);
}
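Note the new failed out-parameter: the CPU choice is only latched (inp_hpts_cpu_set / inp_input_cpu_set) when hpts_cpuid() actually had enough information to pick one. The selector itself is in an earlier hunk; a hedged sketch of the shape suggested by the inp_irq_cpu fields this diff adds (the preference order and the fallback are assumptions):

static uint16_t
hpts_cpuid_sketch(struct inpcb *inp, int *failed)
{
	*failed = 0;
	/* Assumption: prefer the CPU LRO/the driver recorded interrupts on. */
	if (inp->inp_irq_cpu_set)
		return (inp->inp_irq_cpu);
	/* No affinity information yet; tell the caller we punted. */
	*failed = 1;
	return (inp->inp_hpts_cpu);
}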
@@ -1804,6 +1845,127 @@
return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
}
+static void
+__tcp_run_hpts(struct tcp_hpts_entry *hpts)
+{
+ int ticks_ran;
+
+ if (hpts->p_hpts_active) {
+ /* Already active */
+ return;
+ }
+ if (mtx_trylock(&hpts->p_mtx) == 0) {
+ /* Someone else got the lock */
+ return;
+ }
+ if (hpts->p_hpts_active)
+ goto out_with_mtx;
+ hpts->syscall_cnt++;
+ counter_u64_add(hpts_direct_call, 1);
+ hpts->p_hpts_active = 1;
+ ticks_ran = tcp_hptsi(hpts, 0);
+ /* We may want to adjust the sleep values here */
+ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+ if (ticks_ran > ticks_indicate_less_sleep) {
+ struct timeval tv;
+ sbintime_t sb;
+ int cpu;
+
+ hpts->p_mysleep.tv_usec /= 2;
+ if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_min_sleep;
+ /* Reschedule with new to value */
+ tcp_hpts_set_max_sleep(hpts, 0);
+ tv.tv_sec = 0;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ /* Validate it's in the right range */
+ if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = hpts->p_mysleep.tv_usec;
+ } else if (tv.tv_usec > dynamic_max_sleep) {
+ /* Let's not let sleep get above this value */
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = dynamic_max_sleep;
+ }
+ /*
+ * In this mode the timer is a backstop to
+ * all the userret/lro_flushes so we use
+ * the dynamic value and set the on_min_sleep
+ * flag so we will not be awoken.
+ */
+ sb = tvtosbt(tv);
+ cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+ /* Store off to make the actual sleep time visible */
+ hpts->sleeping = tv.tv_usec;
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ } else if (ticks_ran < ticks_indicate_more_sleep) {
+ /* For a longer sleep, don't reschedule the hpts callout */
+ hpts->p_mysleep.tv_usec *= 2;
+ if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+ }
+ hpts->p_on_min_sleep = 1;
+ }
+ hpts->p_hpts_active = 0;
+out_with_mtx:
+ HPTS_MTX_ASSERT(hpts);
+ mtx_unlock(&hpts->p_mtx);
+}
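The sleep adjustment above is multiplicative in both directions: halve p_mysleep when a run covered more than ticks_indicate_less_sleep slots, double it when it covered fewer than ticks_indicate_more_sleep, clamped to [dynamic_min_sleep, dynamic_max_sleep]. The same policy recurs in tcp_hpts_thread() below; isolated as a sketch (names taken from this diff, the helper itself is illustrative):

static void
hpts_adapt_sleep_sketch(struct timeval *mysleep, int ticks_ran)
{
	if (ticks_ran > ticks_indicate_less_sleep) {
		/* Busy: cut the sleep time in half, respecting the floor. */
		mysleep->tv_usec /= 2;
		if (mysleep->tv_usec < dynamic_min_sleep)
			mysleep->tv_usec = dynamic_min_sleep;
	} else if (ticks_ran < ticks_indicate_more_sleep) {
		/* Idle: double the sleep time, respecting the ceiling. */
		mysleep->tv_usec *= 2;
		if (mysleep->tv_usec > dynamic_max_sleep)
			mysleep->tv_usec = dynamic_max_sleep;
	}
}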
+
+static struct tcp_hpts_entry *
+tcp_choose_hpts_to_run(void)
+{
+ int i, oldest_idx;
+ uint32_t cts, time_since_ran, calc;
+
+ if ((hpts_uses_oldest == 0) ||
+ ((hpts_uses_oldest > 1) &&
+ (tcp_pace.rp_ent[(tcp_pace.rp_num_hptss-1)]->p_on_queue_cnt >= hpts_uses_oldest))) {
+ /*
+ * We have either disabled the feature (0), or
+ * we have crossed over the oldest threshold on the
+ * last hpts. We use the last one for simplicity
+ * since we don't want to use the first one (it may
+ * have starting connections that have not settled
+ * on the cpu yet).
+ */
+ return (tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+ }
+ /* Let's find the oldest hpts to attempt to run */
+ cts = tcp_get_usecs(NULL);
+ time_since_ran = 0;
+ oldest_idx = -1;
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+ if (TSTMP_GT(cts, cts_last_ran[i]))
+ calc = cts - cts_last_ran[i];
+ else
+ calc = 0;
+ if (calc > time_since_ran) {
+ oldest_idx = i;
+ time_since_ran = calc;
+ }
+ }
+ if (oldest_idx >= 0)
+ return (tcp_pace.rp_ent[oldest_idx]);
+ else
+ return (tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+}
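The scan above ranks pacers by cts - cts_last_ran[i], using TSTMP_GT to keep the comparison well defined across 32-bit wrap. TSTMP_GT is the standard macro from tcp_seq.h, not part of this diff:

#define	TSTMP_GT(a, b)	((int)((a) - (b)) > 0)	/* wrap-safe "a after b" */

For example, with cts = 5 just after a wrap and cts_last_ran[i] = 0xfffffff0, the signed difference is 21, so that pacer is still correctly ranked as the stalest.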
+
+void
+tcp_run_hpts(void)
+{
+ struct tcp_hpts_entry *hpts;
+ struct epoch_tracker et;
+
+ NET_EPOCH_ENTER(et);
+ hpts = tcp_choose_hpts_to_run();
+ __tcp_run_hpts(hpts);
+ NET_EPOCH_EXIT(et);
+}
+
static void
tcp_hpts_thread(void *ctx)
{
@@ -1811,51 +1973,142 @@
struct epoch_tracker et;
struct timeval tv;
sbintime_t sb;
+ int cpu, ticks_ran;
hpts = (struct tcp_hpts_entry *)ctx;
mtx_lock(&hpts->p_mtx);
if (hpts->p_direct_wake) {
- /* Signaled by input */
+ /* Signaled by input or output with low occupancy count. */
callout_stop(&hpts->co);
+ counter_u64_add(hpts_direct_awakening, 1);
} else {
- /* Timed out */
+ /* Timed out, the normal case. */
+ counter_u64_add(hpts_wake_timeout, 1);
if (callout_pending(&hpts->co) ||
!callout_active(&hpts->co)) {
mtx_unlock(&hpts->p_mtx);
return;
}
- callout_deactivate(&hpts->co);
}
+ callout_deactivate(&hpts->co);
hpts->p_hpts_wake_scheduled = 0;
- hpts->p_hpts_active = 1;
NET_EPOCH_ENTER(et);
- tcp_hptsi(hpts);
- NET_EPOCH_EXIT(et);
- HPTS_MTX_ASSERT(hpts);
+ if (hpts->p_hpts_active) {
+ /*
+ * We are active already. This means that a syscall
+ * trap or LRO is running on behalf of hpts. In that case
+ * we need to double our timeout since there seems to be
+ * enough activity in the system that we don't need to
+ * run as often (if we were not directly woken).
+ */
+ tv.tv_sec = 0;
+ if (hpts->p_direct_wake == 0) {
+ counter_u64_add(hpts_back_tosleep, 1);
+ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+ hpts->p_mysleep.tv_usec *= 2;
+ if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+ tv.tv_usec = hpts->p_mysleep.tv_usec;
+ hpts->p_on_min_sleep = 1;
+ } else {
+ /*
+ * Here we have low count on the wheel, but
+ * somehow we still collided with one of the
+ * connections. Let's go back to sleep for a
+ * min sleep time, but clear the flag so we
+ * can be awoken by insert.
+ */
+ hpts->p_on_min_sleep = 0;
+ tv.tv_usec = tcp_min_hptsi_time;
+ }
+ } else {
+ /*
+ * Directly woken most likely to reset the
+ * callout time.
+ */
+ tv.tv_sec = 0;
+ tv.tv_usec = hpts->p_mysleep.tv_usec;
+ }
+ goto back_to_sleep;
+ }
+ hpts->sleeping = 0;
+ hpts->p_hpts_active = 1;
+ ticks_ran = tcp_hptsi(hpts, 1);
tv.tv_sec = 0;
- tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
- if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
- hpts->overidden_sleep = tv.tv_usec;
- tv.tv_usec = tcp_min_hptsi_time;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+ if (hpts->p_direct_wake == 0) {
+ /*
+ * Only adjust sleep time if we were
+ * called from the callout i.e. direct_wake == 0.
+ */
+ if (ticks_ran < ticks_indicate_more_sleep) {
+ hpts->p_mysleep.tv_usec *= 2;
+ if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+ } else if (ticks_ran > ticks_indicate_less_sleep) {
+ hpts->p_mysleep.tv_usec /= 2;
+ if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_min_sleep;
+ }
+ }
+ if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = hpts->p_mysleep.tv_usec;
+ } else if (tv.tv_usec > dynamic_max_sleep) {
+ /* Let's not let sleep get above this value */
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = dynamic_max_sleep;
+ }
+ /*
+ * In this mode the timer is a backstop to
+ * all the userret/lro_flushes so we use
+ * the dynamic value and set the on_min_sleep
+ * flag so we will not be awoken.
+ */
hpts->p_on_min_sleep = 1;
- } else {
- /* Clear the min sleep flag */
- hpts->overidden_sleep = 0;
+ } else if (hpts->p_on_queue_cnt == 0) {
+ /*
+ * No one on the wheel, please wake us up
+ * if you insert on the wheel.
+ */
hpts->p_on_min_sleep = 0;
- }
- hpts->p_hpts_active = 0;
- sb = tvtosbt(tv);
- if (tcp_hpts_callout_skip_swi == 0) {
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ hpts->overidden_sleep = 0;
} else {
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_dir, hpts,
- hpts->p_cpu,
- C_PREL(tcp_hpts_precision));
+ /*
+ * We hit here when we have a low number of
+ * clients on the wheel (our else clause).
+ * We may need to go on min sleep; if we set
+ * the flag we will not be awoken if someone
+ * is inserted ahead of us. Clearing the flag
+ * means we can be awoken. This is "old mode"
+ * where the timer is what runs hpts mainly.
+ */
+ if (tv.tv_usec < tcp_min_hptsi_time) {
+ /*
+ * Yes on min sleep, which means
+ * we cannot be awoken.
+ */
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = tcp_min_hptsi_time;
+ hpts->p_on_min_sleep = 1;
+ } else {
+ /* Clear the min sleep flag */
+ hpts->overidden_sleep = 0;
+ hpts->p_on_min_sleep = 0;
+ }
}
+ HPTS_MTX_ASSERT(hpts);
+ hpts->p_hpts_active = 0;
+back_to_sleep:
hpts->p_direct_wake = 0;
+ sb = tvtosbt(tv);
+ cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+ /* Store off to make the actual sleep time visible */
+ hpts->sleeping = tv.tv_usec;
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ NET_EPOCH_EXIT(et);
mtx_unlock(&hpts->p_mtx);
}
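After a run, the thread lands in one of three sleep regimes; condensed as a comment (a paraphrase of the branches above, not a behavioral change):

/*
 * Sleep regimes when tcp_hpts_thread() finishes a run:
 *
 * 1) p_on_queue_cnt >= conn_cnt_thresh: dynamic mode. Clamp the
 *    sleep into [p_mysleep, dynamic_max_sleep] and set
 *    p_on_min_sleep; the userret/lro_flush paths drive the wheel
 *    and the timer is only a backstop.
 * 2) p_on_queue_cnt == 0: empty wheel. Clear p_on_min_sleep so
 *    the first insert wakes us immediately.
 * 3) Low occupancy: legacy mode. Enforce tcp_min_hptsi_time,
 *    setting or clearing p_on_min_sleep accordingly.
 */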
@@ -1873,7 +2126,7 @@
cpuset_t cs;
char unit[16];
uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
- int count, domain;
+ int count, domain, cpu;
tcp_pace.rp_proc = NULL;
tcp_pace.rp_num_hptss = ncpus;
@@ -1882,8 +2135,18 @@
back_tosleep = counter_u64_alloc(M_WAITOK);
combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
wheel_wrap = counter_u64_alloc(M_WAITOK);
+ hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
+ hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
+ hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
+ hpts_direct_call = counter_u64_alloc(M_WAITOK);
+ cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
+ cpu_uses_random = counter_u64_alloc(M_WAITOK);
+
sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+ sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
+ cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
@@ -1933,19 +2196,41 @@
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "runtick", CTLFLAG_RD,
- &hpts->p_runningtick, 0,
+ &hpts->p_runningslot, 0,
"What the running pacers current slot is");
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "curtick", CTLFLAG_RD,
&hpts->p_curtick, 0,
"What the running pacers last tick mapped to the wheel was");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "lastran", CTLFLAG_RD,
+ &cts_last_ran[i], 0,
+ "The last usec tick that this hpts ran");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
+ &hpts->p_mysleep.tv_usec, 0,
+ "What the running pacers is using for p_mysleep.tv_usec");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "now_sleeping", CTLFLAG_RD,
+ &hpts->sleeping, 0,
+ "What the running pacers is actually sleeping for");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "syscall_cnt", CTLFLAG_RD,
+ &hpts->syscall_cnt, 0,
+ "How many times we had syscalls on this hpts");
+
hpts->p_hpts_sleep_time = hpts_sleep_max;
hpts->p_num = i;
hpts->p_curtick = tcp_gethptstick(&tv);
+ cts_last_ran[i] = tcp_tv_to_usectick(&tv);
hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
hpts->p_cpu = 0xffff;
- hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
+ hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
callout_init(&hpts->co, 1);
}
@@ -1956,17 +2241,18 @@
/*
* Now lets start ithreads to handle the hptss.
*/
- CPU_FOREACH(i) {
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
hpts = tcp_pace.rp_ent[i];
hpts->p_cpu = i;
error = swi_add(&hpts->ie, "hpts",
tcp_hpts_thread, (void *)hpts,
SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
- if (error) {
- panic("Can't add hpts:%p i:%d err:%d",
- hpts, i, error);
- }
+ KASSERT(error == 0,
+ ("Can't add hpts:%p i:%d err:%d",
+ hpts, i, error));
created++;
+ hpts->p_mysleep.tv_sec = 0;
+ hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
if (tcp_bind_threads == 1) {
if (intr_event_bind(hpts->ie, i) == 0)
bound++;
@@ -1983,18 +2269,13 @@
}
}
tv.tv_sec = 0;
- tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ hpts->sleeping = tv.tv_usec;
sb = tvtosbt(tv);
- if (tcp_hpts_callout_skip_swi == 0) {
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- } else {
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_dir, hpts,
- hpts->p_cpu,
- C_PREL(tcp_hpts_precision));
- }
+ cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
}
/*
* If we somehow have an empty domain, fall back to choosing
@@ -2006,11 +2287,13 @@
break;
}
}
-
printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
created, bound,
tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
+#ifdef INVARIANTS
+ printf("HPTS is in INVARIANT mode!!\n");
+#endif
}
-SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
+SYSINIT(tcphptsi, SI_SUB_SOFTINTR, SI_ORDER_ANY, tcp_init_hptsi, NULL);
MODULE_VERSION(tcphpts, 1);
Index: sys/netinet/tcp_lro.h
===================================================================
--- sys/netinet/tcp_lro.h
+++ sys/netinet/tcp_lro.h
@@ -56,6 +56,11 @@
#define TSTMP_LRO 0x0100
#define TSTMP_HDWR 0x0200
#define HAS_TSTMP 0x0400
+/*
+ * Default number of interrupts on the same cpu in a row
+ * that will cause us to declare an "affinity cpu".
+ */
+#define TCP_LRO_CPU_DECLARATION_THRESH 50
struct inpcb;
@@ -162,12 +167,15 @@
unsigned lro_mbuf_count;
unsigned lro_mbuf_max;
unsigned short lro_ackcnt_lim; /* max # of aggregated ACKs */
+ unsigned short lro_cpu; /* Guess at the cpu we have affinity to */
unsigned lro_length_lim; /* max len of aggregated data */
-
u_long lro_hashsz;
+ uint32_t lro_last_cpu;
+ uint32_t lro_cnt_of_same_cpu;
struct lro_head *lro_hash;
struct lro_head lro_active;
struct lro_head lro_free;
+ uint8_t lro_cpu_is_set; /* Flag to say it's ok to set the CPU on the inp */
};
struct tcp_ackent {
Index: sys/netinet/tcp_lro.c
===================================================================
--- sys/netinet/tcp_lro.c
+++ sys/netinet/tcp_lro.c
@@ -107,6 +107,11 @@
CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
"default number of LRO entries");
+static uint32_t tcp_lro_cpu_set_thresh = TCP_LRO_CPU_DECLARATION_THRESH;
+SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_cpu_threshold,
+ CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_cpu_set_thresh, 0,
+ "Number of interrups in a row on the same CPU that will make us declare an 'affinity' cpu?");
+
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
&tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
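Since lro_cpu_threshold is declared CTLFLAG_RDTUN, it is a boot-time tunable rather than a runtime knob; for example (the value shown is illustrative, the default being TCP_LRO_CPU_DECLARATION_THRESH, i.e. 50):

# /boot/loader.conf
net.inet.tcp.lro.lro_cpu_threshold="100"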
@@ -631,12 +636,13 @@
log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
}
log.u_bbr.inflight = th_seq;
+ log.u_bbr.delivered = th_ack;
log.u_bbr.timeStamp = cts;
log.u_bbr.epoch = le->next_seq;
- log.u_bbr.delivered = th_ack;
log.u_bbr.lt_epoch = le->ack_seq;
log.u_bbr.pacing_gain = th_win;
log.u_bbr.cwnd_gain = le->window;
+ log.u_bbr.lost = curcpu;
log.u_bbr.cur_del_rate = (uintptr_t)m;
log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
bintime2timeval(&lc->lro_last_queue_time, &btv);
@@ -1273,7 +1279,10 @@
INP_WUNLOCK(inp);
return (TCP_LRO_CANNOT);
}
-
+ if ((inp->inp_irq_cpu_set == 0) && (lc->lro_cpu_is_set == 1)) {
+ inp->inp_irq_cpu = lc->lro_last_cpu;
+ inp->inp_irq_cpu_set = 1;
+ }
/* Check if the transport doesn't support the needed optimizations. */
if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
INP_WUNLOCK(inp);
@@ -1445,7 +1454,17 @@
/* check if no mbufs to flush */
if (lc->lro_mbuf_count == 0)
goto done;
-
+ if (lc->lro_cpu_is_set == 0) {
+ if (lc->lro_last_cpu == curcpu) {
+ lc->lro_cnt_of_same_cpu++;
+ /* Have we reached the threshold to declare a cpu? */
+ if (lc->lro_cnt_of_same_cpu > tcp_lro_cpu_set_thresh)
+ lc->lro_cpu_is_set = 1;
+ } else {
+ lc->lro_last_cpu = curcpu;
+ lc->lro_cnt_of_same_cpu = 0;
+ }
+ }
CURVNET_SET(lc->ifp->if_vnet);
/* get current time */
@@ -1486,6 +1505,9 @@
/* flush active streams */
tcp_lro_rx_done(lc);
+#ifdef TCPHPTS
+ tcp_run_hpts();
+#endif
lc->lro_mbuf_count = 0;
}
Index: sys/netinet/tcp_stacks/bbr.c
===================================================================
--- sys/netinet/tcp_stacks/bbr.c
+++ sys/netinet/tcp_stacks/bbr.c
@@ -2429,10 +2429,10 @@
log.u_bbr.pkts_out = diag->co_ret;
log.u_bbr.applimited = diag->hpts_sleep_time;
log.u_bbr.delivered = diag->p_prev_slot;
- log.u_bbr.inflight = diag->p_runningtick;
- log.u_bbr.bw_inuse = diag->wheel_tick;
+ log.u_bbr.inflight = diag->p_runningslot;
+ log.u_bbr.bw_inuse = diag->wheel_slot;
log.u_bbr.rttProp = diag->wheel_cts;
- log.u_bbr.delRate = diag->maxticks;
+ log.u_bbr.delRate = diag->maxslots;
log.u_bbr.cur_del_rate = diag->p_curtick;
log.u_bbr.cur_del_rate <<= 32;
log.u_bbr.cur_del_rate |= diag->p_lasttick;
Index: sys/netinet/tcp_stacks/rack.c
===================================================================
--- sys/netinet/tcp_stacks/rack.c
+++ sys/netinet/tcp_stacks/rack.c
@@ -5609,11 +5609,11 @@
log.u_bbr.pkts_out = diag->co_ret;
log.u_bbr.applimited = diag->hpts_sleep_time;
log.u_bbr.delivered = diag->p_prev_slot;
- log.u_bbr.inflight = diag->p_runningtick;
- log.u_bbr.bw_inuse = diag->wheel_tick;
+ log.u_bbr.inflight = diag->p_runningslot;
+ log.u_bbr.bw_inuse = diag->wheel_slot;
log.u_bbr.rttProp = diag->wheel_cts;
log.u_bbr.timeStamp = cts;
- log.u_bbr.delRate = diag->maxticks;
+ log.u_bbr.delRate = diag->maxslots;
log.u_bbr.cur_del_rate = diag->p_curtick;
log.u_bbr.cur_del_rate <<= 32;
log.u_bbr.cur_del_rate |= diag->p_lasttick;
@@ -5707,22 +5707,22 @@
* on the clock. We always have a min
* 10 slots (10 x 10 i.e. 100 usecs).
*/
- if (slot <= HPTS_TICKS_PER_USEC) {
+ if (slot <= HPTS_TICKS_PER_SLOT) {
/* We gain delay */
- rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot);
- slot = HPTS_TICKS_PER_USEC;
+ rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
+ slot = HPTS_TICKS_PER_SLOT;
} else {
/* We take off some */
- rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC);
- slot = HPTS_TICKS_PER_USEC;
+ rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
+ slot = HPTS_TICKS_PER_SLOT;
}
} else {
slot -= rack->r_ctl.rc_agg_delayed;
rack->r_ctl.rc_agg_delayed = 0;
/* Make sure we have 100 useconds at minimum */
- if (slot < HPTS_TICKS_PER_USEC) {
- rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot;
- slot = HPTS_TICKS_PER_USEC;
+ if (slot < HPTS_TICKS_PER_SLOT) {
+ rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
+ slot = HPTS_TICKS_PER_SLOT;
}
if (rack->r_ctl.rc_agg_delayed == 0)
rack->r_late = 0;
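A worked example of the delay accounting above (the outer condition that selects between these two branches sits outside the visible hunk):

/*
 * With HPTS_TICKS_PER_SLOT == 10:
 *
 *   Carrying delay, request slot = 6:  6 <= 10, so we bank
 *   (10 - 6) = 4 more delay and pace at the floor of 10.
 *
 *   Carrying delay = 30, request slot = 25:  25 > 10, so we work
 *   (25 - 10) = 15 off the debt (rc_agg_delayed becomes 15) and
 *   still pace at 10.
 *
 *   Carrying delay = 4, request slot = 25 (else branch): slot
 *   becomes 21 and the debt clears; 21 >= 10, so no floor
 *   adjustment is needed and r_late can be cleared.
 */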