Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -234,22 +234,21 @@ * fields can *not* be collapsed into a signal bit field. */ #if defined(__amd64__) || defined(__i386__) - volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */ - volatile uint8_t inp_in_input; /* on input hpts (lock b) */ + uint8_t inp_in_hpts; /* on output hpts (lock b) */ + uint8_t inp_in_dropq; /* on hpts drop queue (lock b) */ #else - volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */ - volatile uint32_t inp_in_input; /* on input hpts (lock b) */ + uint32_t inp_in_hpts; /* on output hpts (lock b) */ + uint32_t inp_in_dropq; /* on hpts drop queue (lock b) */ #endif volatile uint16_t inp_hpts_cpu; /* Lock (i) */ volatile uint16_t inp_irq_cpu; /* Set by LRO in behalf of or the driver */ u_int inp_refcount; /* (i) refcount */ int inp_flags; /* (i) generic IP/datagram flags */ int inp_flags2; /* (i) generic IP/datagram flags #2*/ - volatile uint16_t inp_input_cpu; /* Lock (i) */ - volatile uint8_t inp_hpts_cpu_set :1, /* on output hpts (i) */ - inp_input_cpu_set : 1, /* on input hpts (i) */ + uint16_t inp_dropq_cpu; /* Lock (i) */ + uint8_t inp_hpts_cpu_set :1, /* on output hpts (i) */ + inp_dropq_cpu_set : 1, /* inp_dropq_cpu assigned (i) */ inp_hpts_calls :1, /* (i) from output hpts */ - inp_input_calls :1, /* (i) from input hpts */ inp_irq_cpu_set :1, /* (i) from LRO/Driver */ inp_spare_bits2 : 3; uint8_t inp_numa_domain; /* numa domain */ @@ -257,7 +256,8 @@ struct socket *inp_socket; /* (i) back pointer to socket */ uint32_t inp_hptsslot; /* Hpts wheel slot this tcb is Lock(i&b) */ uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */ - TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */ + uint32_t inp_dropq_gencnt; /* hpts p_dropq_gencnt at enqueue */ + TAILQ_ENTRY(inpcb) inp_dropq; /* hpts drop queue next lock(b) */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct ucred *inp_cred; /* 
(c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -629,7 +629,7 @@ * If using hpts lets drop a random number in so * not all new connections fall on the same CPU. */ - inp->inp_hpts_cpu = inp->inp_input_cpu = hpts_random_cpu(inp); + inp->inp_hpts_cpu = inp->inp_dropq_cpu = hpts_random_cpu(inp); #endif refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */ INP_WLOCK(inp); @@ -1760,7 +1760,7 @@ MPASS(inp->inp_flags & INP_FREED); MPASS(inp->inp_socket == NULL); MPASS(inp->inp_in_hpts == 0); - MPASS(inp->inp_in_input == 0); + MPASS(inp->inp_in_dropq == 0); INP_RUNLOCK(inp); uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); return (true); @@ -1778,7 +1778,7 @@ MPASS(inp->inp_flags & INP_FREED); MPASS(inp->inp_socket == NULL); MPASS(inp->inp_in_hpts == 0); - MPASS(inp->inp_in_input == 0); + MPASS(inp->inp_in_dropq == 0); INP_WUNLOCK(inp); uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); return (true); Index: sys/netinet/tcp_hpts.h =================================================================== --- sys/netinet/tcp_hpts.h +++ sys/netinet/tcp_hpts.h @@ -116,9 +116,9 @@ #ifdef _KERNEL #define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__) void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line); -#define HPTS_REMOVE_INPUT 0x01 +#define HPTS_REMOVE_DROPQ 0x01 #define HPTS_REMOVE_OUTPUT 0x02 -#define HPTS_REMOVE_ALL (HPTS_REMOVE_INPUT | HPTS_REMOVE_OUTPUT) +#define HPTS_REMOVE_ALL (HPTS_REMOVE_DROPQ | HPTS_REMOVE_OUTPUT) static inline bool tcp_in_hpts(struct inpcb *inp) @@ -160,8 +160,7 @@ void __tcp_set_hpts(struct inpcb *inp, int32_t line); #define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__) -void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line); -#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__) +void 
tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason); void tcp_run_hpts(void); Index: sys/netinet/tcp_hpts.c =================================================================== --- sys/netinet/tcp_hpts.c +++ sys/netinet/tcp_hpts.c @@ -62,15 +62,7 @@ * Of course this is a bare bones example and the stack will probably * have more consideration then just the above. * - * Now the second function (actually two functions I guess :D) - * the tcp_hpts system provides is the ability to either abort - * a connection (later) or process input on a connection. - * Why would you want to do this? To keep processor locality - * and or not have to worry about untangling any recursive - * locks. The input function now is hooked to the new LRO - * system as well. - * - * In order to use the input redirection function the + * In order to run input queued segments from the HPTS context the * tcp stack must define an input function for * tfb_do_queued_segments(). This function understands * how to dequeue a array of packets that were input and @@ -109,6 +101,10 @@ * you have defined the tfb_do_segment_nounlock() as * described above. * + * Now the second function the tcp_hpts system provides is the ability + * to abort a connection later. Why would you want to do this? + * To not have to worry about untangling any recursive locks. + * * The second feature of the input side of hpts is the * dropping of a connection. This is due to the way that * locking may have occured on the INP_WLOCK. 
So if @@ -202,6 +198,8 @@ /* Each hpts has its own p_mtx which is used for locking */ #define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) +#define HPTS_LOCK(hpts) mtx_lock(&(hpts)->p_mtx) +#define HPTS_UNLOCK(hpts) mtx_unlock(&(hpts)->p_mtx) TAILQ_HEAD(hptsh, inpcb); struct tcp_hpts_entry { /* Cache line 0x00 */ @@ -226,10 +224,11 @@ uint8_t p_fill[3]; /* Fill to 32 bits */ /* Cache line 0x40 */ void *p_inp; - struct hptsh p_input; /* For the tcp-input runner */ + TAILQ_HEAD(, inpcb) p_dropq; /* Delayed drop queue */ /* Hptsi wheel */ struct hptsh *p_hptss; - int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */ + uint32_t p_dropq_cnt; /* Count on drop queue */ + uint32_t p_dropq_gencnt; uint32_t p_hpts_sleep_time; /* Current sleep interval having a max * of 255ms */ uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */ @@ -270,7 +269,6 @@ static int hpts_use_assigned_cpu = 1; static int32_t hpts_uses_oldest = OLDEST_THRESHOLD; -static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout); static void tcp_hpts_thread(void *ctx); static void tcp_init_hptsi(void *st); @@ -558,41 +556,6 @@ } } -static inline void -hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear) -{ - HPTS_MTX_ASSERT(hpts); - KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, - ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp)); - KASSERT(inp->inp_in_input != 0, - ("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp)); - TAILQ_REMOVE(&hpts->p_input, inp, inp_input); - hpts->p_on_inqueue_cnt--; - KASSERT(hpts->p_on_inqueue_cnt >= 0, - ("Hpts in goes negative inp:%p hpts:%p", - inp, hpts)); - KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) || - ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))), - ("%s hpts:%p input cnt (p_on_inqueue):%d and queue state mismatch", - 
__FUNCTION__, hpts, hpts->p_on_inqueue_cnt)); - if (clear) - inp->inp_in_input = 0; -} - -static inline void -hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line) -{ - HPTS_MTX_ASSERT(hpts); - KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, - ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp)); - KASSERT(inp->inp_in_input == 0, - ("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp)); - TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input); - inp->inp_in_input = 1; - hpts->p_on_inqueue_cnt++; - in_pcbref(inp); -} - static struct tcp_hpts_entry * tcp_hpts_lock(struct inpcb *inp) { @@ -614,19 +577,19 @@ } static struct tcp_hpts_entry * -tcp_input_lock(struct inpcb *inp) +tcp_dropq_lock(struct inpcb *inp) { struct tcp_hpts_entry *hpts; int32_t hpts_num; again: - hpts_num = inp->inp_input_cpu; + hpts_num = inp->inp_dropq_cpu; hpts = tcp_pace.rp_ent[hpts_num]; KASSERT(mtx_owned(&hpts->p_mtx) == 0, ("Hpts:%p owns mtx prior-to lock line:%d", hpts, __LINE__)); mtx_lock(&hpts->p_mtx); - if (hpts_num != inp->inp_input_cpu) { + if (hpts_num != inp->inp_dropq_cpu) { mtx_unlock(&hpts->p_mtx); goto again; } @@ -652,13 +615,38 @@ } static void -tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line) +tcp_dropq_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp) { + bool released __diagused; + HPTS_MTX_ASSERT(hpts); - if (inp->inp_in_input) { - hpts_sane_input_remove(hpts, inp, 1); - tcp_remove_hpts_ref(inp, hpts, line); + INP_WLOCK_ASSERT(inp); + + if (inp->inp_in_dropq != IHPTS_ONQUEUE) + return; + + MPASS(hpts->p_cpu == inp->inp_dropq_cpu); + if (__predict_true(inp->inp_dropq_gencnt == hpts->p_dropq_gencnt)) { + TAILQ_REMOVE(&hpts->p_dropq, inp, inp_dropq); + MPASS(hpts->p_dropq_cnt > 0); + hpts->p_dropq_cnt--; + inp->inp_in_dropq = IHPTS_NONE; + released = in_pcbrele_wlocked(inp); + MPASS(released == false); + } else { + /* + * tcp_delayed_drop() now owns the TAILQ head of 
this inp. + * Can't TAILQ_REMOVE, just mark it. + */ +#ifdef INVARIANTS + struct inpcb *tmp; + + TAILQ_FOREACH(tmp, &hpts->p_dropq, inp_dropq) + MPASS(tmp != inp); +#endif + inp->inp_in_dropq = IHPTS_MOVING; } + } /* @@ -669,7 +657,7 @@ * * Valid values in the flags are * HPTS_REMOVE_OUTPUT - remove from the output of the hpts. - * HPTS_REMOVE_INPUT - remove from the input of the hpts. + * HPTS_REMOVE_DROPQ - remove from the drop queue of the hpts. * Note that you can use one or both values together * and get two actions. */ @@ -684,9 +672,9 @@ tcp_hpts_remove_locked_output(hpts, inp, flags, line); mtx_unlock(&hpts->p_mtx); } - if (flags & HPTS_REMOVE_INPUT) { - hpts = tcp_input_lock(inp); - tcp_hpts_remove_locked_input(hpts, inp, flags, line); + if (flags & HPTS_REMOVE_DROPQ) { + hpts = tcp_dropq_lock(inp); + tcp_dropq_remove(hpts, inp); mtx_unlock(&hpts->p_mtx); } } @@ -1097,31 +1085,29 @@ } void -__tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line) +tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason) { struct tcp_hpts_entry *hpts; - struct tcpcb *tp; + struct tcpcb *tp = intotcpcb(inp); - tp = intotcpcb(inp); - hpts = tcp_input_lock(tp->t_inpcb); - if (inp->inp_in_input == 0) { - /* Ok we need to set it on the hpts in the current slot */ - hpts_sane_input_insert(hpts, inp, line); - if ((hpts->p_hpts_active == 0) && - (hpts->p_on_min_sleep == 0)){ - /* - * Activate the hpts if it is sleeping. 
- */ - hpts->p_direct_wake = 1; - tcp_wakehpts(hpts); - } - } else if ((hpts->p_hpts_active == 0) && - (hpts->p_on_min_sleep == 0)){ + INP_WLOCK_ASSERT(inp); + inp->inp_hpts_drop_reas = reason; + if (inp->inp_in_dropq != IHPTS_NONE) + return; + hpts = tcp_dropq_lock(tp->t_inpcb); + MPASS(hpts->p_cpu == inp->inp_dropq_cpu); + + TAILQ_INSERT_TAIL(&hpts->p_dropq, inp, inp_dropq); + inp->inp_in_dropq = IHPTS_ONQUEUE; + inp->inp_dropq_gencnt = hpts->p_dropq_gencnt; + hpts->p_dropq_cnt++; + in_pcbref(inp); + + if ((hpts->p_hpts_active == 0) && (hpts->p_on_min_sleep == 0)){ hpts->p_direct_wake = 1; tcp_wakehpts(hpts); } - inp->inp_hpts_drop_reas = reason; - mtx_unlock(&hpts->p_mtx); + HPTS_UNLOCK(hpts); } static uint16_t @@ -1136,8 +1122,8 @@ * If one has been set use it i.e. we want both in and out on the * same hpts. */ - if (inp->inp_input_cpu_set) { - return (inp->inp_input_cpu); + if (inp->inp_dropq_cpu_set) { + return (inp->inp_dropq_cpu); } else if (inp->inp_hpts_cpu_set) { return (inp->inp_hpts_cpu); } @@ -1160,8 +1146,8 @@ * If one has been set use it i.e. we want both in and out on the * same hpts. */ - if (inp->inp_input_cpu_set) { - return (inp->inp_input_cpu); + if (inp->inp_dropq_cpu_set) { + return (inp->inp_dropq_cpu); } else if (inp->inp_hpts_cpu_set) { return (inp->inp_hpts_cpu); } @@ -1249,117 +1235,50 @@ * list. 
*/ static void -tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv) +tcp_delayed_drop(struct tcp_hpts_entry *hpts) { + TAILQ_HEAD(, inpcb) head = TAILQ_HEAD_INITIALIZER(head); + struct inpcb *inp, *tmp; struct tcpcb *tp; - struct inpcb *inp; - uint16_t drop_reason; - int16_t set_cpu; - uint32_t did_prefetch = 0; - int dropped; HPTS_MTX_ASSERT(hpts); NET_EPOCH_ASSERT(); - while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) { - HPTS_MTX_ASSERT(hpts); - hpts_sane_input_remove(hpts, inp, 0); - if (inp->inp_input_cpu_set == 0) { - set_cpu = 1; - } else { - set_cpu = 0; - } - hpts->p_inp = inp; - drop_reason = inp->inp_hpts_drop_reas; - inp->inp_in_input = 0; - mtx_unlock(&hpts->p_mtx); + TAILQ_SWAP(&head, &hpts->p_dropq, inpcb, inp_dropq); + hpts->p_dropq_cnt = 0; + hpts->p_dropq_gencnt++; + HPTS_UNLOCK(hpts); + + TAILQ_FOREACH_SAFE(inp, &head, inp_dropq, tmp) { INP_WLOCK(inp); -#ifdef VIMAGE - CURVNET_SET(inp->inp_vnet); -#endif - if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { -out: - hpts->p_inp = NULL; - if (in_pcbrele_wlocked(inp) == 0) { + MPASS(inp->inp_hpts_drop_reas != 0); + if (__predict_false(inp->inp_in_dropq == IHPTS_MOVING)) { + inp->inp_in_dropq = IHPTS_NONE; + if (in_pcbrele_wlocked(inp) == false) INP_WUNLOCK(inp); - } -#ifdef VIMAGE - CURVNET_RESTORE(); -#endif - mtx_lock(&hpts->p_mtx); continue; } - tp = intotcpcb(inp); - if ((tp == NULL) || (tp->t_inpcb == NULL)) { - goto out; - } - if (drop_reason) { - /* This tcb is being destroyed for drop_reason */ - tcp_drop_in_pkts(tp); - tp = tcp_drop(tp, drop_reason); - if (tp == NULL) { - INP_WLOCK(inp); - } - if (in_pcbrele_wlocked(inp) == 0) + MPASS(inp->inp_in_dropq == IHPTS_ONQUEUE); + inp->inp_in_dropq = IHPTS_NONE; + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { + if (in_pcbrele_wlocked(inp) == false) INP_WUNLOCK(inp); -#ifdef VIMAGE - CURVNET_RESTORE(); -#endif - mtx_lock(&hpts->p_mtx); continue; } - if (set_cpu) { - /* - * Setup so the next time we will move to the right - 
* CPU. This should be a rare event. It will - * sometimes happens when we are the client side - * (usually not the server). Somehow tcp_output() - * gets called before the tcp_do_segment() sets the - * intial state. This means the r_cpu and r_hpts_cpu - * is 0. We get on the hpts, and then tcp_input() - * gets called setting up the r_cpu to the correct - * value. The hpts goes off and sees the mis-match. - * We simply correct it here and the CPU will switch - * to the new hpts nextime the tcb gets added to the - * the hpts (not this time) :-) - */ - tcp_set_hpts(inp); - } - if (tp->t_fb_ptr != NULL) { - kern_prefetch(tp->t_fb_ptr, &did_prefetch); - did_prefetch = 1; - } - if ((tp->t_fb->tfb_do_queued_segments != NULL) && tp->t_in_pkt) { - if (inp->inp_in_input) - tcp_hpts_remove(inp, HPTS_REMOVE_INPUT); - dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0); - if (dropped) { - /* Re-acquire the wlock so we can release the reference */ - INP_WLOCK(inp); - } - } else if (tp->t_in_pkt) { - /* - * We reach here only if we had a - * stack that supported INP_SUPPORTS_MBUFQ - * and then somehow switched to a stack that - * does not. The packets are basically stranded - * and would hang with the connection until - * cleanup without this code. Its not the - * best way but I know of no other way to - * handle it since the stack needs functions - * it does not have to handle queued packets. 
- */ + CURVNET_SET(inp->inp_vnet); + if (__predict_true((tp = intotcpcb(inp)) != NULL)) { + MPASS(tp->t_inpcb == inp); tcp_drop_in_pkts(tp); + tp = tcp_drop(tp, inp->inp_hpts_drop_reas); + if (tp == NULL) + INP_WLOCK(inp); } - if (in_pcbrele_wlocked(inp) == 0) + if (in_pcbrele_wlocked(inp) == false) INP_WUNLOCK(inp); - INP_UNLOCK_ASSERT(inp); -#ifdef VIMAGE CURVNET_RESTORE(); -#endif - mtx_lock(&hpts->p_mtx); - hpts->p_inp = NULL; } + + mtx_lock(&hpts->p_mtx); /* XXXGL */ } static void @@ -1489,10 +1408,10 @@ hpts->p_nxt_slot = hpts->p_prev_slot; hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 1); } - KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) || - ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))), + KASSERT((((TAILQ_EMPTY(&hpts->p_dropq) != 0) && (hpts->p_dropq_cnt == 0)) || + ((TAILQ_EMPTY(&hpts->p_dropq) == 0) && (hpts->p_dropq_cnt > 0))), ("%s hpts:%p in_hpts cnt:%d and queue state mismatch", - __FUNCTION__, hpts, hpts->p_on_inqueue_cnt)); + __FUNCTION__, hpts, hpts->p_dropq_cnt)); HPTS_MTX_ASSERT(hpts); if (hpts->p_on_queue_cnt == 0) { goto no_one; @@ -1716,10 +1635,10 @@ * Check to see if we took an excess amount of time and need to run * more ticks (if we did not hit eno-bufs). */ - KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) || - ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))), + KASSERT((((TAILQ_EMPTY(&hpts->p_dropq) != 0) && (hpts->p_dropq_cnt == 0)) || + ((TAILQ_EMPTY(&hpts->p_dropq) == 0) && (hpts->p_dropq_cnt > 0))), ("%s hpts:%p in_hpts cnt:%d queue state mismatch", - __FUNCTION__, hpts, hpts->p_on_inqueue_cnt)); + __FUNCTION__, hpts, hpts->p_dropq_cnt)); hpts->p_prev_slot = hpts->p_cur_slot; hpts->p_lasttick = hpts->p_curtick; if ((from_callout == 0) || (loop_cnt > max_pacer_loops)) { @@ -1765,31 +1684,30 @@ * Run any input that may be there not covered * in running data. 
*/ - if (!TAILQ_EMPTY(&hpts->p_input)) { - tcp_input_data(hpts, &tv); - /* - * Now did we spend too long running input and need to run more ticks? - * Note that if wrap_loop_cnt < 2 then we should have the conditions - * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt - * is greater than 2, then the condtion most likely are *not* true. Also - * if we are called not from the callout, we don't run the wheel multiple - * times so the slots may not align either. - */ - KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) || - (wrap_loop_cnt >= 2) || (from_callout == 0)), - ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, - hpts->p_prev_slot, hpts->p_cur_slot)); - KASSERT(((hpts->p_lasttick == hpts->p_curtick) - || (wrap_loop_cnt >= 2) || (from_callout == 0)), - ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, - hpts->p_lasttick, hpts->p_curtick)); - if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) { - hpts->p_curtick = tcp_gethptstick(&tv); - counter_u64_add(hpts_loops, 1); - hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); - goto again; - } + tcp_delayed_drop(hpts); + /* + * Now did we spend too long running input and need to run more ticks? + * Note that if wrap_loop_cnt < 2 then we should have the conditions + * in the KASSERT's true. But if the wheel is behind i.e. wrap_loop_cnt + * is greater than 2, then the condtion most likely are *not* true. + * Also if we are called not from the callout, we don't run the wheel + * multiple times so the slots may not align either. 
+ */ + KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) || + (wrap_loop_cnt >= 2) || (from_callout == 0)), + ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, + hpts->p_prev_slot, hpts->p_cur_slot)); + KASSERT(((hpts->p_lasttick == hpts->p_curtick) + || (wrap_loop_cnt >= 2) || (from_callout == 0)), + ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, + hpts->p_lasttick, hpts->p_curtick)); + if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) { + hpts->p_curtick = tcp_gethptstick(&tv); + counter_u64_add(hpts_loops, 1); + hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + goto again; } + if (from_callout){ tcp_hpts_set_max_sleep(hpts, wrap_loop_cnt); } @@ -1814,12 +1732,12 @@ inp->inp_hpts_cpu_set = 1; } mtx_unlock(&hpts->p_mtx); - hpts = tcp_input_lock(inp); - if ((inp->inp_input_cpu_set == 0) && - (inp->inp_in_input == 0)) { - inp->inp_input_cpu = hpts_cpuid(inp, &failed); + hpts = tcp_dropq_lock(inp); + if ((inp->inp_dropq_cpu_set == 0) && + (inp->inp_in_dropq == 0)) { + inp->inp_dropq_cpu = hpts_cpuid(inp, &failed); if (failed == 0) - inp->inp_input_cpu_set = 1; + inp->inp_dropq_cpu_set = 1; } mtx_unlock(&hpts->p_mtx); } @@ -2140,7 +2058,7 @@ */ mtx_init(&hpts->p_mtx, "tcp_hpts_lck", "hpts", MTX_DEF | MTX_DUPOK); - TAILQ_INIT(&hpts->p_input); + TAILQ_INIT(&hpts->p_dropq); for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { TAILQ_INIT(&hpts->p_hptss[j]); } @@ -2155,8 +2073,8 @@ - SYSCTL_ADD_INT(&hpts->hpts_ctx, + SYSCTL_ADD_UINT(&hpts->hpts_ctx, /* p_dropq_cnt is uint32_t, export as unsigned */ SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "in_qcnt", CTLFLAG_RD, - &hpts->p_on_inqueue_cnt, 0, - "Count TCB's awaiting input processing"); + &hpts->p_dropq_cnt, 0, + "Count TCB's awaiting delayed drop"); SYSCTL_ADD_INT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "out_qcnt", CTLFLAG_RD, Index: sys/netinet/tcp_lro.c =================================================================== --- sys/netinet/tcp_lro.c +++ sys/netinet/tcp_lro.c @@ -1354,7 +1354,7 @@ if (le->m_head != NULL) { counter_u64_add(tcp_inp_lro_direct_queue, 1); 
tcp_lro_log(tp, lc, le, NULL, 22, 1, - inp->inp_flags2, inp->inp_in_input, 1); + inp->inp_flags2, inp->inp_in_dropq, 1); tcp_queue_pkts(inp, tp, le); } if (should_wake) { Index: sys/netinet/tcp_stacks/bbr.c =================================================================== --- sys/netinet/tcp_stacks/bbr.c +++ sys/netinet/tcp_stacks/bbr.c @@ -1884,7 +1884,7 @@ l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain; l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain; l->inhpts = tcp_in_hpts(bbr->rc_inp); - l->ininput = bbr->rc_inp->inp_in_input; + l->ininput = bbr->rc_inp->inp_in_dropq; l->use_lt_bw = bbr->rc_lt_use_bw; l->pkts_out = bbr->r_ctl.rc_flight_at_input; l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch; Index: sys/netinet/tcp_stacks/rack.c =================================================================== --- sys/netinet/tcp_stacks/rack.c +++ sys/netinet/tcp_stacks/rack.c @@ -2295,7 +2295,7 @@ log.u_bbr.flex6 = rsm->r_end; log.u_bbr.flex8 = mod; log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; @@ -2330,7 +2330,7 @@ else log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; @@ -2355,7 +2355,7 @@ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; log.u_bbr.flex8 = to_num; log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; log.u_bbr.flex2 = rack->rc_rack_rtt; @@ -2394,7 +2394,7 @@ memset(&log.u_bbr, 
0, sizeof(log.u_bbr)); log.u_bbr.flex8 = flag; log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; log.u_bbr.cur_del_rate = (uint64_t)prev; log.u_bbr.delRate = (uint64_t)rsm; log.u_bbr.rttProp = (uint64_t)next; @@ -2439,7 +2439,7 @@ struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; log.u_bbr.flex1 = t; log.u_bbr.flex2 = len; log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt; @@ -2589,7 +2589,7 @@ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; log.u_bbr.flex1 = line; log.u_bbr.flex2 = tick; log.u_bbr.flex3 = tp->t_maxunacktime; @@ -2616,7 +2616,7 @@ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; log.u_bbr.flex1 = slot; if (rack->rack_no_prr) log.u_bbr.flex2 = 0; @@ -2718,7 +2718,7 @@ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; log.u_bbr.flex1 = slot; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = reason; @@ -2751,7 +2751,7 @@ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; log.u_bbr.flex1 = line; log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; log.u_bbr.flex3 = flags_on_entry; @@ -13329,7 +13329,7 @@ #endif memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + 
log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; if (rack->rack_no_prr == 0) log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; else @@ -14321,7 +14321,7 @@ #endif memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; if (rack->rack_no_prr == 0) log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; else @@ -15612,7 +15612,7 @@ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; log.u_bbr.flex1 = error; log.u_bbr.flex2 = flags; log.u_bbr.flex3 = rsm_is_null; @@ -16128,7 +16128,7 @@ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; if (rack->rack_no_prr) log.u_bbr.flex1 = 0; else @@ -16629,7 +16629,7 @@ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; if (rack->rack_no_prr) log.u_bbr.flex1 = 0; else @@ -18801,7 +18801,7 @@ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); - log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.ininput = rack->rc_inp->inp_in_dropq; if (rack->rack_no_prr) log.u_bbr.flex1 = 0; else