diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -145,7 +145,6 @@
  * lock is to be obtained and SMR section exited.
  *
  * Key:
- * (b) - Protected by the hpts lock.
  * (c) - Constant after initialization
  * (e) - Protected by the SMR section
  * (i) - Protected by the inpcb lock
@@ -154,51 +153,6 @@
  * (s) - Protected by another subsystem's locks
  * (x) - Undefined locking
  *
- * Notes on the tcp_hpts:
- *
- * First Hpts lock order is
- * 1) INP_WLOCK()
- * 2) HPTS_LOCK() i.e. hpts->pmtx
- *
- * To insert a TCB on the hpts you *must* be holding the INP_WLOCK().
- * You may check the inp->inp_in_hpts flag without the hpts lock.
- * The hpts is the only one that will clear this flag holding
- * only the hpts lock. This means that in your tcp_output()
- * routine when you test for the inp_in_hpts flag to be 1
- * it may be transitioning to 0 (by the hpts).
- * That's ok since that will just mean an extra call to tcp_output
- * that most likely will find the call you executed
- * (when the mis-match occurred) will have put the TCB back
- * on the hpts and it will return. If your
- * call did not add the inp back to the hpts then you will either
- * over-send or the cwnd will block you from sending more.
- *
- * Note you should also be holding the INP_WLOCK() when you
- * call the remove from the hpts as well. Though usually
- * you are either doing this from a timer, where you need and have
- * the INP_WLOCK() or from destroying your TCB where again
- * you should already have the INP_WLOCK().
- *
- * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and
- * inp_input_cpu_set fields are controlled completely by
- * the hpts. Do not ever set these. The inp_hpts_cpu_set
- * and inp_input_cpu_set fields indicate if the hpts has
- * setup the respective cpu field. It is advised if this
- * field is 0, to enqueue the packet with the appropriate
- * hpts_immediate() call. If the _set field is 1, then
- * you may compare the inp_*_cpu field to the curcpu and
- * may want to again insert onto the hpts if these fields
- * are not equal (i.e. you are not on the expected CPU).
- *
- * A note on inp_hpts_calls and inp_input_calls, these
- * flags are set when the hpts calls either the output
- * or do_segment routines respectively. If the routine
- * being called wants to use this, then it needs to
- * clear the flag before returning. The hpts will not
- * clear the flag. The flags can be used to tell if
- * the hpts is the function calling the respective
- * routine.
- *
  * A few other notes:
  *
  * When a read lock is held, stability of the field is guaranteed; to write
@@ -218,41 +172,15 @@
 	CK_LIST_ENTRY(inpcb) inp_hash;	/* (w:h/r:e) hash list */
 	struct rwlock inp_lock;
 	/* Cache line #2 (amd64) */
-#define	inp_start_zero	inp_hpts
+#define	inp_start_zero	inp_refcount
 #define	inp_zero_size	(sizeof(struct inpcb) - \
 			    offsetof(struct inpcb, inp_start_zero))
-	TAILQ_ENTRY(inpcb) inp_hpts;	/* pacing out queue next lock(b) */
-	uint32_t inp_hpts_gencnt;	/* XXXGL */
-	uint32_t inp_hpts_request;	/* Current hpts request, zero if
-					 * fits in the pacing window (i&b). */
-	/*
-	 * Note the next fields are protected by a
-	 * different lock (hpts-lock). This means that
-	 * they must correspond in size to the smallest
-	 * protectable bit field (uint8_t on x86, and
-	 * other platfomrs potentially uint32_t?). Also
-	 * since CPU switches can occur at different times the two
-	 * fields can *not* be collapsed into a signal bit field.
-	 */
-#if defined(__amd64__) || defined(__i386__)
-	uint8_t inp_in_hpts; /* on output hpts (lock b) */
-#else
-	uint32_t inp_in_hpts; /* on output hpts (lock b) */
-#endif
-	volatile uint16_t inp_hpts_cpu;	/* Lock (i) */
-	volatile uint16_t inp_irq_cpu;	/* Set by LRO in behalf of or the driver */
 	u_int	inp_refcount;		/* (i) refcount */
 	int	inp_flags;		/* (i) generic IP/datagram flags */
 	int	inp_flags2;		/* (i) generic IP/datagram flags #2*/
-	uint8_t inp_hpts_cpu_set :1,	/* on output hpts (i) */
-		inp_hpts_calls :1,	/* (i) from output hpts */
-		inp_irq_cpu_set :1,	/* (i) from LRO/Driver */
-		inp_spare_bits2 : 3;
 	uint8_t inp_numa_domain;	/* numa domain */
 	void	*inp_ppcb;		/* (i) pointer to per-protocol pcb */
 	struct	socket *inp_socket;	/* (i) back pointer to socket */
-	int32_t inp_hptsslot;		/* Hpts wheel slot this tcb is Lock(i&b) */
-	uint32_t inp_hpts_drop_reas;	/* reason we are dropping the PCB (lock i&b) */
 	struct	inpcbinfo *inp_pcbinfo;	/* (c) PCB list info */
 	struct	ucred	*inp_cred;	/* (c) cache of socket cred */
 	u_int32_t inp_flow;		/* (i) IPv6 flow information */
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -1690,7 +1690,6 @@
 
 	MPASS(inp->inp_flags & INP_FREED);
 	MPASS(inp->inp_socket == NULL);
-	MPASS(inp->inp_in_hpts == 0);
 	INP_RUNLOCK(inp);
 	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
 	return (true);
@@ -1707,7 +1706,6 @@
 
 	MPASS(inp->inp_flags & INP_FREED);
 	MPASS(inp->inp_socket == NULL);
-	MPASS(inp->inp_in_hpts == 0);
 	INP_WUNLOCK(inp);
 	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
 	return (true);
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -113,8 +113,9 @@
 
 #ifdef _KERNEL
 
-void tcp_hpts_remove(struct inpcb *);
-bool tcp_in_hpts(struct inpcb *);
+void tcp_hpts_init(struct tcpcb *);
+void tcp_hpts_remove(struct tcpcb *);
+bool tcp_in_hpts(struct tcpcb *);
 
 /*
  * To insert a TCB on the hpts you *must* be holding the
@@ -140,20 +141,18 @@
  * that INP_WLOCK() or from destroying your TCB where again
  * you should already have the INP_WLOCK().
  */
-uint32_t tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line,
+uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line,
			      struct hpts_diag *diag);
 
 #define	tcp_hpts_insert(inp, slot)	\
 	tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL)
 
-void __tcp_set_hpts(struct inpcb *inp, int32_t line);
+void __tcp_set_hpts(struct tcpcb *tp, int32_t line);
 #define	tcp_set_hpts(a)	__tcp_set_hpts(a, __LINE__)
 
 void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason);
 
 void tcp_run_hpts(void);
 
-uint16_t hpts_random_cpu(struct inpcb *inp);
-
 extern int32_t tcp_min_hptsi_time;
 
 #endif /* _KERNEL */
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -199,7 +199,7 @@
 	uint8_t p_fill[3];	  /* Fill to 32 bits */
 	/* Cache line 0x40 */
 	struct hptsh {
-		TAILQ_HEAD(, inpcb)	head;
+		TAILQ_HEAD(, tcpcb)	head;
 		uint32_t		count;
 		uint32_t		gencnt;
 	} *p_hptss;			/* Hptsi wheel */
@@ -273,12 +273,6 @@
 	int cpu[MAXCPU];
 } hpts_domains[MAXMEMDOM];
 
-enum {
-	IHPTS_NONE = 0,
-	IHPTS_ONQUEUE,
-	IHPTS_MOVING,
-};
-
 counter_u64_t hpts_hopelessly_behind;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
@@ -426,6 +420,17 @@
     &tcp_hpts_no_wake_over_thresh, 0,
     "When we are over the threshold on the pacer do we prohibit wakeups?");
 
+static uint16_t
+hpts_random_cpu(void)
+{
+	uint16_t cpuid;
+	uint32_t ran;
+
+	ran = arc4random();
+	cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
+	return (cpuid);
+}
+
 static void
 tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
	     int slots_to_run, int idx, int from_callout)
@@ -489,54 +494,67 @@
 }
 
 static void
-inp_hpts_insert(struct inpcb *inp, struct tcp_hpts_entry *hpts)
+tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
 {
+	struct inpcb *inp = tptoinpcb(tp);
 	struct hptsh *hptsh;
 
 	INP_WLOCK_ASSERT(inp);
 	HPTS_MTX_ASSERT(hpts);
-	MPASS(hpts->p_cpu == inp->inp_hpts_cpu);
+	MPASS(hpts->p_cpu == tp->t_hpts_cpu);
 	MPASS(!(inp->inp_flags & INP_DROPPED));
 
-	hptsh = &hpts->p_hptss[inp->inp_hptsslot];
+	hptsh = &hpts->p_hptss[tp->t_hpts_slot];
 
-	if (inp->inp_in_hpts == IHPTS_NONE) {
-		inp->inp_in_hpts = IHPTS_ONQUEUE;
+	if (tp->t_in_hpts == IHPTS_NONE) {
+		tp->t_in_hpts = IHPTS_ONQUEUE;
 		in_pcbref(inp);
-	} else if (inp->inp_in_hpts == IHPTS_MOVING) {
-		inp->inp_in_hpts = IHPTS_ONQUEUE;
+	} else if (tp->t_in_hpts == IHPTS_MOVING) {
+		tp->t_in_hpts = IHPTS_ONQUEUE;
 	} else
-		MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
-	inp->inp_hpts_gencnt = hptsh->gencnt;
+		MPASS(tp->t_in_hpts == IHPTS_ONQUEUE);
+	tp->t_hpts_gencnt = hptsh->gencnt;
 
-	TAILQ_INSERT_TAIL(&hptsh->head, inp, inp_hpts);
+	TAILQ_INSERT_TAIL(&hptsh->head, tp, t_hpts);
 	hptsh->count++;
 	hpts->p_on_queue_cnt++;
 }
 
 static struct tcp_hpts_entry *
-tcp_hpts_lock(struct inpcb *inp)
+tcp_hpts_lock(struct tcpcb *tp)
 {
 	struct tcp_hpts_entry *hpts;
 
-	INP_LOCK_ASSERT(inp);
+	INP_LOCK_ASSERT(tptoinpcb(tp));
 
-	hpts = tcp_pace.rp_ent[inp->inp_hpts_cpu];
+	hpts = tcp_pace.rp_ent[tp->t_hpts_cpu];
 	HPTS_LOCK(hpts);
 
 	return (hpts);
 }
 
 static void
-inp_hpts_release(struct inpcb *inp)
+tcp_hpts_release(struct tcpcb *tp)
 {
 	bool released __diagused;
 
-	inp->inp_in_hpts = IHPTS_NONE;
-	released = in_pcbrele_wlocked(inp);
+	tp->t_in_hpts = IHPTS_NONE;
+	released = in_pcbrele_wlocked(tptoinpcb(tp));
 	MPASS(released == false);
 }
 
+/*
+ * Initialize newborn tcpcb to get ready for use with HPTS.
+ */
+void
+tcp_hpts_init(struct tcpcb *tp)
+{
+
+	tp->t_hpts_cpu = hpts_random_cpu();
+	tp->t_lro_cpu = HPTS_CPU_NONE;
+	MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET));
+}
+
 /*
  * Called normally with the INP_LOCKED but it
  * does not matter, the hpts lock is the key
@@ -544,39 +562,39 @@
  * INP lock and then get the hpts lock.
  */
 void
-tcp_hpts_remove(struct inpcb *inp)
+tcp_hpts_remove(struct tcpcb *tp)
 {
 	struct tcp_hpts_entry *hpts;
 	struct hptsh *hptsh;
 
-	INP_WLOCK_ASSERT(inp);
+	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
-	hpts = tcp_hpts_lock(inp);
-	if (inp->inp_in_hpts == IHPTS_ONQUEUE) {
-		hptsh = &hpts->p_hptss[inp->inp_hptsslot];
-		inp->inp_hpts_request = 0;
-		if (__predict_true(inp->inp_hpts_gencnt == hptsh->gencnt)) {
-			TAILQ_REMOVE(&hptsh->head, inp, inp_hpts);
+	hpts = tcp_hpts_lock(tp);
+	if (tp->t_in_hpts == IHPTS_ONQUEUE) {
+		hptsh = &hpts->p_hptss[tp->t_hpts_slot];
+		tp->t_hpts_request = 0;
+		if (__predict_true(tp->t_hpts_gencnt == hptsh->gencnt)) {
+			TAILQ_REMOVE(&hptsh->head, tp, t_hpts);
 			MPASS(hptsh->count > 0);
 			hptsh->count--;
 			MPASS(hpts->p_on_queue_cnt > 0);
 			hpts->p_on_queue_cnt--;
-			inp_hpts_release(inp);
+			tcp_hpts_release(tp);
 		} else {
 			/*
 			 * tcp_hptsi() now owns the TAILQ head of this inp.
 			 * Can't TAILQ_REMOVE, just mark it.
 			 */
#ifdef INVARIANTS
-			struct inpcb *tmp;
+			struct tcpcb *tmp;
 
-			TAILQ_FOREACH(tmp, &hptsh->head, inp_hpts)
-				MPASS(tmp != inp);
+			TAILQ_FOREACH(tmp, &hptsh->head, t_hpts)
+				MPASS(tmp != tp);
#endif
-			inp->inp_in_hpts = IHPTS_MOVING;
-			inp->inp_hptsslot = -1;
+			tp->t_in_hpts = IHPTS_MOVING;
+			tp->t_hpts_slot = -1;
 		}
-	} else if (inp->inp_in_hpts == IHPTS_MOVING) {
+	} else if (tp->t_in_hpts == IHPTS_MOVING) {
 		/*
 		 * Handle a special race condition:
 		 * tcp_hptsi() moves inpcb to detached tailq
@@ -585,16 +603,16 @@
 		 * tcp_hpts_remove() again (we are here!), then in_pcbdrop()
 		 * tcp_hptsi() finds pcb with meaningful slot and INP_DROPPED
 		 */
-		inp->inp_hptsslot = -1;
+		tp->t_hpts_slot = -1;
 	}
 	HPTS_UNLOCK(hpts);
 }
 
 bool
-tcp_in_hpts(struct inpcb *inp)
+tcp_in_hpts(struct tcpcb *tp)
 {
 
-	return (inp->inp_in_hpts == IHPTS_ONQUEUE);
+	return (tp->t_in_hpts == IHPTS_ONQUEUE);
 }
 
 static inline int
@@ -762,15 +780,15 @@
 
#ifdef INVARIANTS
 static void
-check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
+check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
+    uint32_t hptsslot, int line)
 {
 	/*
 	 * Sanity checks for the pacer with invariants
 	 * on insert.
 	 */
-	KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS,
-		("hpts:%p inp:%p slot:%d > max",
-		 hpts, inp, inp_hptsslot));
+	KASSERT(hptsslot < NUM_OF_HPTSI_SLOTS,
+	    ("hpts:%p tp:%p slot:%d > max", hpts, tp, hptsslot));
 	if ((hpts->p_hpts_active) &&
 	    (hpts->p_wheel_complete == 0)) {
 		/*
@@ -781,22 +799,21 @@
 		 */
 		int distance, yet_to_run;
 
-		distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot);
+		distance = hpts_slots_diff(hpts->p_runningslot, hptsslot);
 		if (hpts->p_runningslot != hpts->p_cur_slot)
 			yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
 		else
 			yet_to_run = 0;	/* processing last slot */
-		KASSERT(yet_to_run <= distance,
-			("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
-			 hpts, inp, inp_hptsslot,
-			 distance, yet_to_run,
-			 hpts->p_runningslot, hpts->p_cur_slot));
+		KASSERT(yet_to_run <= distance, ("hpts:%p tp:%p slot:%d "
+		    "distance:%d yet_to_run:%d rs:%d cs:%d", hpts, tp,
+		    hptsslot, distance, yet_to_run, hpts->p_runningslot,
+		    hpts->p_cur_slot));
 	}
 }
#endif
 
 uint32_t
-tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag)
 {
 	struct tcp_hpts_entry *hpts;
 	struct timeval tv;
@@ -804,16 +821,16 @@
 	int32_t wheel_slot, maxslots;
 	bool need_wakeup = false;
 
-	INP_WLOCK_ASSERT(inp);
-	MPASS(!tcp_in_hpts(inp));
-	MPASS(!(inp->inp_flags & INP_DROPPED));
+	INP_WLOCK_ASSERT(tptoinpcb(tp));
+	MPASS(!(tptoinpcb(tp)->inp_flags & INP_DROPPED));
+	MPASS(!tcp_in_hpts(tp));
 
 	/*
 	 * We now return the next-slot the hpts will be on, beyond its
 	 * current run (if up) or where it was when it stopped if it is
 	 * sleeping.
 	 */
-	hpts = tcp_hpts_lock(inp);
+	hpts = tcp_hpts_lock(tp);
 	microuptime(&tv);
 	if (diag) {
 		memset(diag, 0, sizeof(struct hpts_diag));
@@ -830,20 +847,20 @@
 	}
 	if (slot == 0) {
 		/* Ok we need to set it on the hpts in the current slot */
-		inp->inp_hpts_request = 0;
+		tp->t_hpts_request = 0;
 		if ((hpts->p_hpts_active == 0) || (hpts->p_wheel_complete)) {
 			/*
 			 * A sleeping hpts we want in next slot to run
 			 * note that in this state p_prev_slot == p_cur_slot
 			 */
-			inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1);
+			tp->t_hpts_slot = hpts_slot(hpts->p_prev_slot, 1);
 			if ((hpts->p_on_min_sleep == 0) &&
 			    (hpts->p_hpts_active == 0))
 				need_wakeup = true;
 		} else
-			inp->inp_hptsslot = hpts->p_runningslot;
-		if (__predict_true(inp->inp_in_hpts != IHPTS_MOVING))
-			inp_hpts_insert(inp, hpts);
+			tp->t_hpts_slot = hpts->p_runningslot;
+		if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))
+			tcp_hpts_insert_internal(tp, hpts);
 		if (need_wakeup) {
 			/*
 			 * Activate the hpts if it is sleeping and its
@@ -880,28 +897,28 @@
 		 */
 		slot--;
 	}
-	inp->inp_hptsslot = last_slot;
-	inp->inp_hpts_request = slot;
+	tp->t_hpts_slot = last_slot;
+	tp->t_hpts_request = slot;
 	} else if (maxslots >= slot) {
 		/* It all fits on the wheel */
-		inp->inp_hpts_request = 0;
-		inp->inp_hptsslot = hpts_slot(wheel_slot, slot);
+		tp->t_hpts_request = 0;
+		tp->t_hpts_slot = hpts_slot(wheel_slot, slot);
 	} else {
 		/* It does not fit */
-		inp->inp_hpts_request = slot - maxslots;
-		inp->inp_hptsslot = last_slot;
+		tp->t_hpts_request = slot - maxslots;
+		tp->t_hpts_slot = last_slot;
 	}
 	if (diag) {
-		diag->slot_remaining = inp->inp_hpts_request;
-		diag->inp_hptsslot = inp->inp_hptsslot;
+		diag->slot_remaining = tp->t_hpts_request;
+		diag->inp_hptsslot = tp->t_hpts_slot;
 	}
#ifdef INVARIANTS
-	check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
+	check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line);
#endif
-	if (__predict_true(inp->inp_in_hpts != IHPTS_MOVING))
-		inp_hpts_insert(inp, hpts);
+	if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))
+		tcp_hpts_insert_internal(tp, hpts);
 	if ((hpts->p_hpts_active == 0) &&
-	    (inp->inp_hpts_request == 0) &&
+	    (tp->t_hpts_request == 0) &&
 	    (hpts->p_on_min_sleep == 0)) {
 		/*
 		 * The hpts is sleeping and NOT on a minimum
@@ -972,54 +989,35 @@
 	return (slot_on);
 }
 
-uint16_t
-hpts_random_cpu(struct inpcb *inp){
-	/*
-	 * No flow type set distribute the load randomly.
-	 */
-	uint16_t cpuid;
-	uint32_t ran;
-
-	/*
-	 * Shortcut if it is already set. XXXGL: does it happen?
-	 */
-	if (inp->inp_hpts_cpu_set) {
-		return (inp->inp_hpts_cpu);
-	}
-	/* Nothing set use a random number */
-	ran = arc4random();
-	cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
-	return (cpuid);
-}
-
 static uint16_t
-hpts_cpuid(struct inpcb *inp, int *failed)
+hpts_cpuid(struct tcpcb *tp, int *failed)
 {
+	struct inpcb *inp = tptoinpcb(tp);
 	u_int cpuid;
#ifdef NUMA
 	struct hpts_domain_info *di;
#endif
 
 	*failed = 0;
-	if (inp->inp_hpts_cpu_set) {
-		return (inp->inp_hpts_cpu);
+	if (tp->t_flags2 & TF2_HPTS_CPU_SET) {
+		return (tp->t_hpts_cpu);
 	}
 	/*
 	 * If we are using the irq cpu set by LRO or
 	 * the driver then it overrides all other domains.
 	 */
 	if (tcp_use_irq_cpu) {
-		if (inp->inp_irq_cpu_set == 0) {
+		if (tp->t_lro_cpu == HPTS_CPU_NONE) {
 			*failed = 1;
-			return(0);
+			return (0);
 		}
-		return(inp->inp_irq_cpu);
+		return (tp->t_lro_cpu);
 	}
 	/* If one is set the other must be the same */
#ifdef RSS
 	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 	if (cpuid == NETISR_CPUID_NONE)
-		return (hpts_random_cpu(inp));
+		return (hpts_random_cpu());
 	else
 		return (cpuid);
#endif
@@ -1030,7 +1028,7 @@
 	 */
 	if (inp->inp_flowtype == M_HASHTYPE_NONE) {
 		counter_u64_add(cpu_uses_random, 1);
-		return (hpts_random_cpu(inp));
+		return (hpts_random_cpu());
 	}
 	/*
 	 * Hash to a thread based on the flowid.  If we are using numa,
@@ -1081,12 +1079,10 @@
 tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout)
 {
 	struct tcpcb *tp;
-	struct inpcb *inp;
 	struct timeval tv;
 	int32_t slots_to_run, i, error;
 	int32_t loop_cnt = 0;
 	int32_t did_prefetch = 0;
-	int32_t prefetch_ninp = 0;
 	int32_t prefetch_tp = 0;
 	int32_t wrap_loop_cnt = 0;
 	int32_t slot_pos_of_endpoint = 0;
@@ -1154,25 +1150,25 @@
 		 * run them, the extra 10usecs of late (by being
 		 * put behind) does not really matter in this situation.
 		 */
-		TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot].head,
-		    inp_hpts) {
-			MPASS(inp->inp_hptsslot == hpts->p_nxt_slot);
-			MPASS(inp->inp_hpts_gencnt ==
+		TAILQ_FOREACH(tp, &hpts->p_hptss[hpts->p_nxt_slot].head,
+		    t_hpts) {
+			MPASS(tp->t_hpts_slot == hpts->p_nxt_slot);
+			MPASS(tp->t_hpts_gencnt ==
			    hpts->p_hptss[hpts->p_nxt_slot].gencnt);
-			MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
+			MPASS(tp->t_in_hpts == IHPTS_ONQUEUE);
 			/*
 			 * Update gencnt and nextslot accordingly to match
 			 * the new location. This is safe since it takes both
 			 * the INP lock and the pacer mutex to change the
-			 * inp_hptsslot and inp_hpts_gencnt.
+			 * t_hpts_slot and t_hpts_gencnt.
 			 */
-			inp->inp_hpts_gencnt =
+			tp->t_hpts_gencnt =
			    hpts->p_hptss[hpts->p_runningslot].gencnt;
-			inp->inp_hptsslot = hpts->p_runningslot;
+			tp->t_hpts_slot = hpts->p_runningslot;
 		}
 		TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot].head,
-		    &hpts->p_hptss[hpts->p_nxt_slot].head, inp_hpts);
+		    &hpts->p_hptss[hpts->p_nxt_slot].head, t_hpts);
 		hpts->p_hptss[hpts->p_runningslot].count +=
		    hpts->p_hptss[hpts->p_nxt_slot].count;
 		hpts->p_hptss[hpts->p_nxt_slot].count = 0;
@@ -1191,8 +1187,8 @@
 		goto no_one;
 	}
 	for (i = 0; i < slots_to_run; i++) {
-		struct inpcb *inp, *ninp;
-		TAILQ_HEAD(, inpcb) head = TAILQ_HEAD_INITIALIZER(head);
+		struct tcpcb *tp, *ntp;
+		TAILQ_HEAD(, tcpcb) head = TAILQ_HEAD_INITIALIZER(head);
 		struct hptsh *hptsh;
 		uint32_t runningslot;
@@ -1205,20 +1201,54 @@
 		runningslot = hpts->p_runningslot;
 		hptsh = &hpts->p_hptss[runningslot];
-		TAILQ_SWAP(&head, &hptsh->head, inpcb, inp_hpts);
+		TAILQ_SWAP(&head, &hptsh->head, tcpcb, t_hpts);
 		hpts->p_on_queue_cnt -= hptsh->count;
 		hptsh->count = 0;
 		hptsh->gencnt++;
 		HPTS_UNLOCK(hpts);
 
-		TAILQ_FOREACH_SAFE(inp, &head, inp_hpts, ninp) {
+		TAILQ_FOREACH_SAFE(tp, &head, t_hpts, ntp) {
+			struct inpcb *inp = tptoinpcb(tp);
 			bool set_cpu;
 
-			if (ninp != NULL) {
-				/* We prefetch the next inp if possible */
-				kern_prefetch(ninp, &prefetch_ninp);
-				prefetch_ninp = 1;
+			if (ntp != NULL) {
+				/*
+				 * If we have a next tcpcb, see if we can
+				 * prefetch it. Note this may seem
+				 * "risky" since we have no locks (other
+				 * than the previous inp) and there is no
+				 * assurance that ntp was not pulled while
+				 * we were processing tp and freed. If this
+				 * occurred it could mean that either:
+				 *
+				 * a) It's NULL (which is fine we won't go
+				 * here) b) It's valid (which is cool we
+				 * will prefetch it) c) The inp got
+				 * freed back to the slab which was
+				 * reallocated. Then the piece of memory was
+				 * re-used and something else (not an
+				 * address) is in inp_ppcb. If that occurs
+				 * we don't crash, but take a TLB shootdown
+				 * performance hit (same as if it was NULL
+				 * and we tried to pre-fetch it).
+				 *
+				 * Considering that the likelihood of this is
+				 * quite rare we will take a risk on doing
+				 * this. If performance drops after testing
+				 * we can always take this out. NB: the
+				 * kern_prefetch on amd64 actually has
+				 * protection against a bad address now via
+				 * the DMAP_() tests. This will prevent the
+				 * TLB hit, and instead, if it occurs, just
+				 * cause us to load cache with a useless
+				 * address (to us).
+				 *
+				 * XXXGL: this comment and the prefetch action
+				 * could be outdated after tp == inp change.
+				 */
+				kern_prefetch(ntp, &prefetch_tp);
+				prefetch_tp = 1;
 			}
 
 			/* For debugging */
@@ -1232,33 +1262,33 @@
 			}
 
 			INP_WLOCK(inp);
-			if (inp->inp_hpts_cpu_set == 0) {
+			if ((tp->t_flags2 & TF2_HPTS_CPU_SET) == 0) {
 				set_cpu = true;
 			} else {
 				set_cpu = false;
 			}
 
-			if (__predict_false(inp->inp_in_hpts == IHPTS_MOVING)) {
-				if (inp->inp_hptsslot == -1) {
-					inp->inp_in_hpts = IHPTS_NONE;
+			if (__predict_false(tp->t_in_hpts == IHPTS_MOVING)) {
+				if (tp->t_hpts_slot == -1) {
+					tp->t_in_hpts = IHPTS_NONE;
 					if (in_pcbrele_wlocked(inp) == false)
						INP_WUNLOCK(inp);
 				} else {
					HPTS_LOCK(hpts);
-					inp_hpts_insert(inp, hpts);
+					tcp_hpts_insert_internal(tp, hpts);
					HPTS_UNLOCK(hpts);
					INP_WUNLOCK(inp);
 				}
				continue;
 			}
 
-			MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
+			MPASS(tp->t_in_hpts == IHPTS_ONQUEUE);
 			MPASS(!(inp->inp_flags & INP_DROPPED));
-			KASSERT(runningslot == inp->inp_hptsslot,
+			KASSERT(runningslot == tp->t_hpts_slot,
				("Hpts:%p inp:%p slot mis-aligned %u vs %u",
-				 hpts, inp, runningslot, inp->inp_hptsslot));
+				 hpts, inp, runningslot, tp->t_hpts_slot));
 
-			if (inp->inp_hpts_request) {
+			if (tp->t_hpts_request) {
 				/*
				 * This guy is deferred out further in time
				 * then our wheel had available on it.
				 * Push him back on the wheel or run it
				 * depending.
				 */
				uint32_t maxslots, last_slot, remaining_slots;
 
				remaining_slots = slots_to_run - (i + 1);
-				if (inp->inp_hpts_request > remaining_slots) {
+				if (tp->t_hpts_request > remaining_slots) {
					HPTS_LOCK(hpts);
					/*
					 * How far out can we go?
					 */
					maxslots = max_slots_available(hpts,
					    hpts->p_cur_slot, &last_slot);
-					if (maxslots >= inp->inp_hpts_request) {
+					if (maxslots >= tp->t_hpts_request) {
						/* We can place it finally to
						 * be processed.  */
-						inp->inp_hptsslot = hpts_slot(
+						tp->t_hpts_slot = hpts_slot(
						    hpts->p_runningslot,
-						    inp->inp_hpts_request);
-						inp->inp_hpts_request = 0;
+						    tp->t_hpts_request);
+						tp->t_hpts_request = 0;
					} else {
						/* Work off some more time */
-						inp->inp_hptsslot = last_slot;
-						inp->inp_hpts_request -=
+						tp->t_hpts_slot = last_slot;
+						tp->t_hpts_request -=
						    maxslots;
					}
-					inp_hpts_insert(inp, hpts);
+					tcp_hpts_insert_internal(tp, hpts);
					HPTS_UNLOCK(hpts);
					INP_WUNLOCK(inp);
					continue;
				}
-				inp->inp_hpts_request = 0;
+				tp->t_hpts_request = 0;
				/* Fall through we will so do it now */
 			}
 
-			inp_hpts_release(inp);
-			tp = intotcpcb(inp);
-			MPASS(tp);
+			tcp_hpts_release(tp);
 			if (set_cpu) {
				/*
				 * Setup so the next time we will move to
@@ -1318,7 +1346,7 @@
				 * gets added to the hpts (not this one)
				 * :-)
				 */
-				tcp_set_hpts(inp);
+				tcp_set_hpts(tp);
 			}
 			CURVNET_SET(inp->inp_vnet);
 			/* Lets do any logging that we might want to */
@@ -1331,16 +1359,17 @@
				did_prefetch = 1;
 			}
 			/*
-			 * We set inp_hpts_calls to 1 before any possible output.
-			 * The contract with the transport is that if it cares about
-			 * hpts calling it should clear the flag. That way next time
-			 * it is called it will know it is hpts.
+			 * We set TF2_HPTS_CALLS before any possible output.
+			 * The contract with the transport is that if it cares
+			 * about hpts calling it should clear the flag. That
+			 * way next time it is called it will know it is hpts.
 			 *
-			 * We also only call tfb_do_queued_segments() tcp_output()
-			 * it is expected that if segments are queued and come in that
-			 * the final input mbuf will cause a call to output if it is needed.
+			 * We also only call tfb_do_queued_segments() or
+			 * tcp_output(). It is expected that if segments are
+			 * queued and come in that the final input mbuf will
+			 * cause a call to output if it is needed.
 			 */
-			inp->inp_hpts_calls = 1;
+			tp->t_flags2 |= TF2_HPTS_CALLS;
 			if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) &&
			    !STAILQ_EMPTY(&tp->t_inqueue)) {
				error = (*tp->t_fb->tfb_do_queued_segments)(tp, 0);
				if (error) {
					/* The input killed the connection */
					goto skip_pacing;
				}
 			}
@@ -1353,44 +1382,6 @@
				if (error < 0)
					goto skip_pacing;
 			}
-			if (ninp) {
-				/*
-				 * If we have a nxt inp, see if we can
-				 * prefetch it. Note this may seem
-				 * "risky" since we have no locks (other
-				 * than the previous inp) and there no
-				 * assurance that ninp was not pulled while
-				 * we were processing inp and freed. If this
-				 * occurred it could mean that either:
-				 *
-				 * a) Its NULL (which is fine we won't go
-				 * here) b) Its valid (which is cool we
-				 * will prefetch it) c) The inp got
-				 * freed back to the slab which was
-				 * reallocated. Then the piece of memory was
-				 * re-used and something else (not an
-				 * address) is in inp_ppcb. If that occurs
-				 * we don't crash, but take a TLB shootdown
-				 * performance hit (same as if it was NULL
-				 * and we tried to pre-fetch it).
-				 *
-				 * Considering that the likelyhood of is
-				 * quite rare we will take a risk on doing
-				 * this. If performance drops after testing
-				 * we can always take this out. NB: the
-				 * kern_prefetch on amd64 actually has
-				 * protection against a bad address now via
-				 * the DMAP_() tests. This will prevent the
-				 * TLB hit, and instead if occurs just
-				 * cause us to load cache with a useless
-				 * address (to us).
-				 *
-				 * XXXGL: with tcpcb == inpcb, I'm unsure this
-				 * prefetch is still correct and useful.
-				 */
-				kern_prefetch(ninp, &prefetch_tp);
-				prefetch_tp = 1;
-			}
 			INP_WUNLOCK(inp);
 		skip_pacing:
 			CURVNET_RESTORE();
@@ -1492,18 +1483,18 @@
 }
 
 void
-__tcp_set_hpts(struct inpcb *inp, int32_t line)
+__tcp_set_hpts(struct tcpcb *tp, int32_t line)
 {
 	struct tcp_hpts_entry *hpts;
 	int failed;
 
-	INP_WLOCK_ASSERT(inp);
-	hpts = tcp_hpts_lock(inp);
-	if ((inp->inp_in_hpts == 0) &&
-	    (inp->inp_hpts_cpu_set == 0)) {
-		inp->inp_hpts_cpu = hpts_cpuid(inp, &failed);
+	INP_WLOCK_ASSERT(tptoinpcb(tp));
+
+	hpts = tcp_hpts_lock(tp);
+	if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) {
+		tp->t_hpts_cpu = hpts_cpuid(tp, &failed);
 		if (failed == 0)
-			inp->inp_hpts_cpu_set = 1;
+			tp->t_flags2 |= TF2_HPTS_CPU_SET;
 	}
 	mtx_unlock(&hpts->p_mtx);
 }
diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
--- a/sys/netinet/tcp_lro.c
+++ b/sys/netinet/tcp_lro.c
@@ -1380,10 +1380,8 @@
 		INP_WUNLOCK(inp);
 		return (TCP_LRO_CANNOT);
 	}
-	if ((inp->inp_irq_cpu_set == 0) && (lc->lro_cpu_is_set == 1)) {
-		inp->inp_irq_cpu = lc->lro_last_cpu;
-		inp->inp_irq_cpu_set = 1;
-	}
+	if (tp->t_lro_cpu == HPTS_CPU_NONE && lc->lro_cpu_is_set == 1)
+		tp->t_lro_cpu = lc->lro_last_cpu;
 	/* Check if the transport doesn't support the needed optimizations. */
 	if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
 		INP_WUNLOCK(inp);
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -739,7 +739,7 @@
 	int32_t delay_calc = 0;
 	uint32_t prev_delay = 0;
 
-	if (tcp_in_hpts(inp)) {
+	if (tcp_in_hpts(tp)) {
 		/* A previous call is already set up */
 		return;
 	}
@@ -904,14 +904,14 @@
 		inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
 		bbr->rc_pacer_started = cts;
 
-		(void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(slot),
+		(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot),
					   __LINE__, &diag);
 		bbr->rc_timer_first = 0;
 		bbr->bbr_timer_src = frm;
 		bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1);
 		bbr_log_hpts_diag(bbr, cts, &diag);
 	} else if (hpts_timeout) {
-		(void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout),
+		(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
					   __LINE__, &diag);
 		/*
 		 * We add the flag here as well if the slot is set,
@@ -1050,8 +1050,8 @@
 	 */
 wrong_timer:
 	if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) {
-		if (tcp_in_hpts(inp))
-			tcp_hpts_remove(inp);
+		if (tcp_in_hpts(tp))
+			tcp_hpts_remove(tp);
 		bbr_timer_cancel(bbr, __LINE__, cts);
 		bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val,
		    0);
@@ -1875,7 +1875,7 @@
 	l->lt_epoch = bbr->r_ctl.rc_lt_epoch;
 	l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain;
 	l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain;
-	l->inhpts = tcp_in_hpts(bbr->rc_inp);
+	l->inhpts = tcp_in_hpts(bbr->rc_tp);
 	l->use_lt_bw = bbr->rc_lt_use_bw;
 	l->pkts_out = bbr->r_ctl.rc_flight_at_input;
 	l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch;
@@ -2496,7 +2496,7 @@
 	log.u_bbr.flex2 = to;
 	log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags;
 	log.u_bbr.flex4 = slot;
-	log.u_bbr.flex5 = bbr->rc_inp->inp_hptsslot;
+	log.u_bbr.flex5 = bbr->rc_tp->t_hpts_slot;
 	log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur);
 	log.u_bbr.pkts_out = bbr->rc_inp->inp_flags2;
 	log.u_bbr.flex8 = which;
@@ -3953,7 +3953,7 @@
 	bbr->rc_tlp_rtx_out = 0;
 	bbr->r_ctl.recovery_lr = bbr->r_ctl.rc_pkt_epoch_loss_rate;
 	tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime);
-	if (tcp_in_hpts(bbr->rc_inp) &&
+	if (tcp_in_hpts(bbr->rc_tp) &&
	    ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) == 0)) {
 		/*
 		 * When we enter recovery, we need to restart
@@ -5209,7 +5209,7 @@
 		left = bbr->r_ctl.rc_timer_exp - cts;
 		ret = -3;
 		bbr_log_to_processing(bbr, cts, ret, left, hpts_calling);
-		tcp_hpts_insert(tptoinpcb(tp), HPTS_USEC_TO_SLOTS(left));
+		tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(left));
 		return (1);
 	}
 	bbr->rc_tmr_stopped = 0;
@@ -5240,7 +5240,7 @@
 	if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
 		uint8_t hpts_removed = 0;
 
-		if (tcp_in_hpts(bbr->rc_inp) &&
+		if (tcp_in_hpts(bbr->rc_tp) &&
		    (bbr->rc_timer_first == 1)) {
 			/*
 			 * If we are canceling timer's when we have the
@@ -5248,7 +5248,7 @@
 			 * must remove ourselves from the hpts.
 			 */
 			hpts_removed = 1;
-			tcp_hpts_remove(bbr->rc_inp);
+			tcp_hpts_remove(bbr->rc_tp);
 			if (bbr->r_ctl.rc_last_delay_val) {
				/* Update the last hptsi delay too */
				uint32_t time_since_send;
@@ -7920,8 +7920,8 @@
			 * don't want to transfer forward the time
			 * for our sum's calculations.
			 */
-			if (tcp_in_hpts(bbr->rc_inp)) {
-				tcp_hpts_remove(bbr->rc_inp);
+			if (tcp_in_hpts(bbr->rc_tp)) {
+				tcp_hpts_remove(bbr->rc_tp);
				bbr->rc_timer_first = 0;
				bbr->r_ctl.rc_hpts_flags = 0;
				bbr->r_ctl.rc_last_delay_val = 0;
@@ -9854,8 +9854,8 @@
 		/* We enter in persists, set the flag appropriately */
 		bbr->rc_in_persist = 1;
 	}
-	if (tcp_in_hpts(bbr->rc_inp)) {
-		tcp_hpts_remove(bbr->rc_inp);
+	if (tcp_in_hpts(bbr->rc_tp)) {
+		tcp_hpts_remove(bbr->rc_tp);
 	}
 }
 
@@ -11437,7 +11437,7 @@
 		}
 		/* Set the flag */
 		bbr->r_is_v6 = (inp->inp_vflag & INP_IPV6) != 0;
-		tcp_set_hpts(inp);
+		tcp_set_hpts(tp);
 		sack_filter_clear(&bbr->r_ctl.bbr_sf, th->th_ack);
 	}
 	if (thflags & TH_ACK) {
@@ -11546,7 +11546,7 @@
 	 */
 	if ((tp->snd_max == tp->snd_una) &&
	    ((tp->t_flags & TF_DELACK) == 0) &&
-	    (tcp_in_hpts(bbr->rc_inp)) &&
+	    (tcp_in_hpts(tp)) &&
	    (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
 		/*
 		 * keep alive not needed if we are hptsi
@@ -11554,8 +11554,8 @@
 		 */
 		;
 	} else {
-		if (tcp_in_hpts(bbr->rc_inp)) {
-			tcp_hpts_remove(bbr->rc_inp);
+		if (tcp_in_hpts(tp)) {
+			tcp_hpts_remove(tp);
 			if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
			    (TSTMP_GT(lcts, bbr->rc_pacer_started))) {
				uint32_t del;
@@ -11582,8 +11582,8 @@
 		bbr_timer_audit(tp, bbr, lcts, &so->so_snd);
 	}
 	/* Clear the flag, it may have been cleared by output but we may not have  */
-	if ((nxt_pkt == 0) && (inp->inp_hpts_calls))
-		inp->inp_hpts_calls = 0;
+	if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS))
+		tp->t_flags2 &= ~TF2_HPTS_CALLS;
 	/* Do we have a new state */
 	if (bbr->r_state != tp->t_state)
 		bbr_set_state(tp, bbr, tiwin);
@@ -11842,7 +11842,7 @@
 	int32_t slot = 0;
 	struct inpcb *inp;
 	struct sockbuf *sb;
-	uint32_t hpts_calling;
+	bool hpts_calling;
#ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int32_t isipv6;
@@ -11853,8 +11853,8 @@
 	memcpy(&bbr->rc_tv, tv, sizeof(struct timeval));
 	cts = tcp_tv_to_usectick(&bbr->rc_tv);
 	inp = bbr->rc_inp;
-	hpts_calling = inp->inp_hpts_calls;
-	inp->inp_hpts_calls = 0;
+	hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS);
+	tp->t_flags2 &= ~TF2_HPTS_CALLS;
 	so = inp->inp_socket;
 	sb = &so->so_snd;
 	if (tp->t_nic_ktls_xmit)
@@ -11884,7 +11884,7 @@
 	}
#endif
 	if (((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
-	    tcp_in_hpts(inp)) {
+	    tcp_in_hpts(tp)) {
 		/*
 		 * We are on the hpts for some timer but not hptsi output.
 		 * Possibly remove from the hpts so we can send/recv etc.
@@ -11913,7 +11913,7 @@
				return (0);
			}
 		}
-		tcp_hpts_remove(inp);
+		tcp_hpts_remove(tp);
 		bbr_timer_cancel(bbr, __LINE__, cts);
 	}
 	if (bbr->r_ctl.rc_last_delay_val) {
@@ -11929,9 +11929,9 @@
 	if ((bbr->r_timer_override) ||
	    (tp->t_state < TCPS_ESTABLISHED)) {
 		/* Timeouts or early states are exempt */
-		if (tcp_in_hpts(inp))
-			tcp_hpts_remove(inp);
-	} else if (tcp_in_hpts(inp)) {
+		if (tcp_in_hpts(tp))
+			tcp_hpts_remove(tp);
+	} else if (tcp_in_hpts(tp)) {
 		if ((bbr->r_ctl.rc_last_delay_val) &&
		    (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
		    delay_calc) {
@@ -11943,10 +11943,10 @@
 			 */
 			counter_u64_add(bbr_out_size[TCP_MSS_ACCT_LATE], 1);
 			bbr->r_ctl.rc_last_delay_val = 0;
-			tcp_hpts_remove(inp);
+			tcp_hpts_remove(tp);
 		} else if (tp->t_state == TCPS_CLOSED) {
 			bbr->r_ctl.rc_last_delay_val = 0;
-			tcp_hpts_remove(inp);
+			tcp_hpts_remove(tp);
 		} else {
 			/*
 			 * On the hpts, you shall not pass! even if ACKNOW
@@ -14088,7 +14088,7 @@
 	inp->inp_flags2 |= INP_CANNOT_DO_ECN;
 	inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
 	tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
-	if (inp->inp_in_hpts) {
+	if (tp->t_in_hpts > IHPTS_NONE) {
 		return;
 	}
 	bbr = (struct tcp_bbr *)tp->t_fb_ptr;
@@ -14109,7 +14109,7 @@
 		}
 	} else
 		toval = HPTS_TICKS_PER_SLOT;
-	(void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval),
+	(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval),
				   __LINE__, &diag);
 	bbr_log_hpts_diag(bbr, cts, &diag);
 }
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -2568,7 +2568,7 @@
 		log.u_bbr.flex5 = rsm->r_start;
 		log.u_bbr.flex6 = rsm->r_end;
 		log.u_bbr.flex8 = mod;
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
@@ -2594,7 +2594,7 @@
 		log.u_bbr.flex2 = to;
 		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex4 = slot;
-		log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
+		log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot;
 		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
 		log.u_bbr.flex7 = rack->rc_in_persist;
 		log.u_bbr.flex8 = which;
@@ -2602,7 +2602,7 @@
 			log.u_bbr.pkts_out = 0;
 		else
 			log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
@@ -2629,7 +2629,7 @@
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.flex8 = to_num;
 		log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
 		log.u_bbr.flex2 = rack->rc_rack_rtt;
@@ -2667,7 +2667,7 @@
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 		log.u_bbr.flex8 = flag;
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.cur_del_rate = (uint64_t)prev;
 		log.u_bbr.delRate = (uint64_t)rsm;
 		log.u_bbr.rttProp = (uint64_t)next;
@@ -2711,7 +2711,7 @@
 		union tcp_log_stackspecific log;
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.flex1 = t;
 		log.u_bbr.flex2 = len;
 		log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt;
@@ -2883,7 +2883,7 @@
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.flex1 = line;
 		log.u_bbr.flex2 = tick;
 		log.u_bbr.flex3 = tp->t_maxunacktime;
@@ -2909,7 +2909,7 @@
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.flex1 = slot;
 		if (rack->rack_no_prr)
 			log.u_bbr.flex2 = 0;
@@ -2957,7 +2957,7 @@
 		log.u_bbr.flex7 <<= 1;
 		log.u_bbr.flex7 |= rack->r_wanted_output;	/* Do we want output */
 		log.u_bbr.flex8 = rack->rc_in_persist;
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
 		log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
@@ -3010,7 +3010,7 @@
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.flex1 = slot;
 		log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
 		log.u_bbr.flex4 = reason;
@@ -3043,7 +3043,7 @@
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.flex1 = line;
 		log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
 		log.u_bbr.flex3 = flags_on_entry;
@@ -4893,7 +4893,7 @@
				       rack->r_ctl.rc_app_limited_cnt, 0, 0, 10, __LINE__, NULL, quality);
 	}
-	if (tcp_in_hpts(rack->rc_inp) &&
+	if (tcp_in_hpts(rack->rc_tp) &&
	    (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
 		/*
 		 * Ok we can't trust the pacer in this case
@@ -4903,7 +4903,7 @@
 		 * Stop the pacer and clear up all the aggregate
 		 * delays etc.
 		 */
-		tcp_hpts_remove(rack->rc_inp);
+		tcp_hpts_remove(rack->rc_tp);
 		rack->r_ctl.rc_hpts_flags = 0;
 		rack->r_ctl.rc_last_output_to = 0;
 	}
@@ -6495,8 +6495,8 @@
 	struct timeval tv;
 	uint32_t t_time;
 
-	if (tcp_in_hpts(rack->rc_inp)) {
-		tcp_hpts_remove(rack->rc_inp);
+	if (tcp_in_hpts(rack->rc_tp)) {
+		tcp_hpts_remove(rack->rc_tp);
 		rack->r_ctl.rc_hpts_flags = 0;
 	}
#ifdef NETFLIX_SHARED_CWND
@@ -6634,7 +6634,7 @@
	    (tp->t_state == TCPS_LISTEN)) {
 		return;
 	}
-	if (tcp_in_hpts(inp)) {
+	if (tcp_in_hpts(tp)) {
 		/* Already on the pacer */
 		return;
 	}
@@ -6885,12 +6885,12 @@
			 * Arrange for the hpts to kick back in after the
			 * t-o if the t-o does not cause a send.
			 */
-			(void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout),
+			(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
						   __LINE__, &diag);
			rack_log_hpts_diag(rack, us_cts, &diag, &tv);
			rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
 		} else {
-			(void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(slot),
+			(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot),
						   __LINE__, &diag);
			rack_log_hpts_diag(rack, us_cts, &diag, &tv);
			rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
@@ -6905,7 +6905,7 @@
 		 * at the start of this block) are good enough.
 		 */
 		rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
-		(void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout),
+		(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
					   __LINE__, &diag);
 		rack_log_hpts_diag(rack, us_cts, &diag, &tv);
 		rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
@@ -8028,7 +8028,7 @@
 		rack->rc_inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
 		ret = -3;
 		left = rack->r_ctl.rc_timer_exp - cts;
-		tcp_hpts_insert(tptoinpcb(tp), HPTS_MS_TO_SLOTS(left));
+		tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left));
 		rack_log_to_processing(rack, cts, ret, left);
 		return (1);
 	}
@@ -8069,7 +8069,7 @@
 	if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
	    ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) ||
	     ((tp->snd_max - tp->snd_una) == 0))) {
-		tcp_hpts_remove(rack->rc_inp);
+		tcp_hpts_remove(rack->rc_tp);
 		hpts_removed = 1;
 		/* If we were not delayed cancel out the flag. */
 		if ((tp->snd_max - tp->snd_una) == 0)
@@ -8078,14 +8078,14 @@
 	}
 	if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
 		rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
-		if (tcp_in_hpts(rack->rc_inp) &&
+		if (tcp_in_hpts(rack->rc_tp) &&
		    ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
 			/*
 			 * Canceling timer's when we have no output being
 			 * paced. We also must remove ourselves from the
 			 * hpts.
 			 */
-			tcp_hpts_remove(rack->rc_inp);
+			tcp_hpts_remove(rack->rc_tp);
 			hpts_removed = 1;
 		}
 		rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
@@ -8113,8 +8113,8 @@
 		/* We enter in persists, set the flag appropriately */
 		rack->rc_in_persist = 1;
 	}
-	if (tcp_in_hpts(rack->rc_inp)) {
-		tcp_hpts_remove(rack->rc_inp);
+	if (tcp_in_hpts(rack->rc_tp)) {
+		tcp_hpts_remove(rack->rc_tp);
 	}
 }
@@ -11383,7 +11383,7 @@
	    (entered_recovery == 0)) {
 		rack_update_prr(tp, rack, changed, th_ack);
 		if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) &&
-		     ((tcp_in_hpts(rack->rc_inp) == 0) &&
+		     ((tcp_in_hpts(rack->rc_tp) == 0) &&
		      ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) {
 			/*
 			 * If you are pacing output you don't want
@@ -14572,7 +14572,7 @@
 		inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
 	if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
 		rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP;
-	if (inp->inp_in_hpts) {
+	if (tp->t_in_hpts > IHPTS_NONE) {
 		/* Strange */
 		return;
 	}
@@ -14593,7 +14593,7 @@
 		}
 	} else
 		toval = HPTS_TICKS_PER_SLOT;
-	(void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval),
+	(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval),
				   __LINE__, &diag);
 	rack_log_hpts_diag(rack, cts, &diag, &tv);
 }
@@ -15190,7 +15190,7 @@
 	if (tov) {
 		struct hpts_diag diag;
 
-		(void)tcp_hpts_insert_diag(rack->rc_inp, HPTS_USEC_TO_SLOTS(tov),
+		(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov),
					   __LINE__, &diag);
 		rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time);
 	}
@@ -15476,7 +15476,7 @@
 		 * We will force the hpts to be stopped if any, and restart
 		 * with the slot set to what was in the saved slot.
 		 */
-		if (tcp_in_hpts(rack->rc_inp)) {
+		if (tcp_in_hpts(rack->rc_tp)) {
 			if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
				uint32_t us_cts;
@@ -15487,7 +15487,7 @@
				}
				rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
 			}
-			tcp_hpts_remove(rack->rc_inp);
+			tcp_hpts_remove(rack->rc_tp);
 		}
 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
 		rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
@@ -15568,7 +15568,7 @@
 		}
#endif
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		if (rack->rack_no_prr == 0)
 			log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
 		else
@@ -16427,8 +16427,8 @@
 		}
 		did_out = 1;
 	}
-	if (rack->rc_inp->inp_hpts_calls)
-		rack->rc_inp->inp_hpts_calls = 0;
+	if (tp->t_flags2 & TF2_HPTS_CALLS)
+		tp->t_flags2 &= ~TF2_HPTS_CALLS;
 	rack_free_trim(rack);
#ifdef TCP_ACCOUNTING
 	sched_unpin();
@@ -16662,7 +16662,7 @@
 		}
#endif
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		if (rack->rack_no_prr == 0)
 			log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
 		else
@@ -16889,7 +16889,7 @@
#endif
			return (1);
 		}
-		tcp_set_hpts(inp);
+		tcp_set_hpts(tp);
 		sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
 	}
 	if (thflags & TH_FIN)
@@ -16988,7 +16988,7 @@
 		rack_free_trim(rack);
 	} else if ((no_output == 1) &&
		   (nxt_pkt == 0) &&
-		   (tcp_in_hpts(rack->rc_inp) == 0)) {
+		   (tcp_in_hpts(rack->rc_tp) == 0)) {
 		/*
 		 * We are not in hpts and we had a pacing timer up. Use
 		 * the remaining time (slot_remaining) to restart the timer.
@@ -16998,8 +16998,8 @@
 		rack_free_trim(rack);
 	}
 	/* Clear the flag, it may have been cleared by output but we may not have  */
-	if ((nxt_pkt == 0) && (inp->inp_hpts_calls))
-		inp->inp_hpts_calls = 0;
+	if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS))
+		tp->t_flags2 &= ~TF2_HPTS_CALLS;
 	/* Update any rounds needed */
 	if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp))
 		rack_log_hystart_event(rack, high_seq, 8);
@@ -17033,13 +17033,13 @@
 	/* We could not send (probably in the hpts but stopped the timer earlier)? */
 	if ((tp->snd_max == tp->snd_una) &&
	    ((tp->t_flags & TF_DELACK) == 0) &&
-	    (tcp_in_hpts(rack->rc_inp)) &&
+	    (tcp_in_hpts(rack->rc_tp)) &&
	    (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
 		/* keep alive not needed if we are hptsi output yet */
 		;
 	} else {
 		int late = 0;
-		if (tcp_in_hpts(inp)) {
+		if (tcp_in_hpts(tp)) {
 			if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
				us_cts = tcp_get_usecs(NULL);
				if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
@@ -17049,7 +17049,7 @@
					late = 1;
				rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
 			}
-			tcp_hpts_remove(inp);
+			tcp_hpts_remove(tp);
 		}
 		if (late && (did_out == 0)) {
 			/*
@@ -18063,7 +18063,7 @@
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.flex1 = error;
 		log.u_bbr.flex2 = flags;
 		log.u_bbr.flex3 = rsm_is_null;
@@ -18328,7 +18328,7 @@
 		err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
#endif
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.flex1 = p_rate;
 		log.u_bbr.flex2 = p_queue;
 		log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
@@ -18393,7 +18393,7 @@
out:
 	if (tcp_bblogging_on(tp)) {
 		memset(&log, 0, sizeof(log));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		log.u_bbr.flex1 = p_rate;
 		log.u_bbr.flex2 = p_queue;
 		log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
@@ -18758,7 +18758,7 @@
 		counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
 	}
 	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-	log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+	log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 	if (rack->rack_no_prr)
 		log.u_bbr.flex1 = 0;
 	else
@@ -19291,7 +19291,7 @@
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		if (rack->rack_no_prr)
 			log.u_bbr.flex1 = 0;
 		else
@@ -19623,7 +19623,7 @@
 	uint32_t cts, ms_cts, delayed, early;
 	uint16_t add_flag = RACK_SENT_SP;
 	/* The doing_tlp flag will be set by the actual rack_timeout_tlp() */
-	uint8_t hpts_calling, doing_tlp = 0;
+	uint8_t doing_tlp = 0;
 	uint32_t cwnd_to_use, pace_max_seg;
 	int32_t do_a_prefetch = 0;
 	int32_t prefetch_rsm = 0;
@@ -19641,7 +19641,7 @@
 	struct ip6_hdr *ip6 = NULL;
 	int32_t isipv6;
#endif
-	bool hw_tls = false;
+	bool hpts_calling, hw_tls = false;
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(inp);
@@ -19652,8 +19652,8 @@
 	sched_pin();
 	ts_val = get_cyclecount();
#endif
-	hpts_calling = inp->inp_hpts_calls;
-	rack->rc_inp->inp_hpts_calls = 0;
+	hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS);
+	tp->t_flags2 &= ~TF2_HPTS_CALLS;
#ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE) {
#ifdef TCP_ACCOUNTING
 		sched_unpin();
#endif
@@ -19696,7 +19696,7 @@
 	cts = tcp_get_usecs(&tv);
 	ms_cts = tcp_tv_to_mssectick(&tv);
 	if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
-	    tcp_in_hpts(rack->rc_inp)) {
+	    tcp_in_hpts(rack->rc_tp)) {
 		/*
 		 * We are on the hpts for some timer but not hptsi output.
 		 * Remove from the hpts unconditionally.
@@ -19730,7 +19730,7 @@
 		}
 	}
 	if (rack->rc_in_persist) {
-		if (tcp_in_hpts(rack->rc_inp) == 0) {
+		if (tcp_in_hpts(rack->rc_tp) == 0) {
 			/* Timer is not running */
 			rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
 		}
@@ -19742,7 +19742,7 @@
 	if ((rack->rc_ack_required == 1) &&
	    (rack->r_timer_override == 0)){
 		/* A timeout occurred and no ack has arrived */
-		if (tcp_in_hpts(rack->rc_inp) == 0) {
+		if (tcp_in_hpts(rack->rc_tp) == 0) {
 			/* Timer is not running */
 			rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
 		}
@@ -19756,9 +19756,9 @@
	    (delayed) ||
	    (tp->t_state < TCPS_ESTABLISHED)) {
 		rack->rc_ack_can_sendout_data = 0;
-		if (tcp_in_hpts(rack->rc_inp))
-			tcp_hpts_remove(rack->rc_inp);
-	} else if (tcp_in_hpts(rack->rc_inp)) {
+		if (tcp_in_hpts(rack->rc_tp))
+			tcp_hpts_remove(rack->rc_tp);
+	} else if (tcp_in_hpts(rack->rc_tp)) {
 		/*
 		 * On the hpts you can't pass even if ACKNOW is on, we will
 		 * when the hpts fires.
@@ -21672,7 +21672,7 @@
 		union tcp_log_stackspecific log;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
+		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
 		if (rack->rack_no_prr)
 			log.u_bbr.flex1 = 0;
 		else
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -2148,7 +2148,7 @@
 		struct timeval tv;
 
 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
-		log.u_bbr.inhpts = inp->inp_in_hpts;
+		log.u_bbr.inhpts = tcp_in_hpts(tp);
 		log.u_bbr.flex8 = 4;
 		log.u_bbr.pkts_out = tp->t_maxseg;
 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
@@ -2315,11 +2315,7 @@
 	 */
 	inp->inp_ip_ttl = V_ip_defttl;
#ifdef TCPHPTS
-	/*
-	 * If using hpts lets drop a random number in so
-	 * not all new connections fall on the same CPU.
-	 */
-	inp->inp_hpts_cpu = hpts_random_cpu(inp);
+	tcp_hpts_init(tp);
#endif
#ifdef TCPPCAP
 	/*
@@ -2438,6 +2434,7 @@
 
 	if (tp->t_fb->tfb_tcp_fb_fini)
 		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
+	MPASS(!tcp_in_hpts(tp));
 
 	/*
 	 * If we got enough samples through the srtt filter,
@@ -2530,7 +2527,7 @@
 		tp->t_tfo_pending = NULL;
 	}
#ifdef TCPHPTS
-	tcp_hpts_remove(inp);
+	tcp_hpts_remove(tp);
#endif
 	in_pcbdrop(inp);
 	TCPSTAT_INC(tcps_closed);
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1712,7 +1712,7 @@
			 */
#ifdef TCPHPTS
			/* Assure that we are not on any hpts */
-			tcp_hpts_remove(tptoinpcb(tp));
+			tcp_hpts_remove(tp);
#endif
			if (blk->tfb_tcp_fb_init) {
				error = (*blk->tfb_tcp_fb_init)(tp, &ptr);
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -314,6 +314,23 @@
 	sbintime_t t_timers[TT_N];
 	sbintime_t t_precisions[TT_N];
 
+	/* HPTS. Used by BBR and Rack stacks. See tcp_hpts.c for more info. */
+	TAILQ_ENTRY(tcpcb) t_hpts;	/* linkage to HPTS ring */
+	STAILQ_HEAD(, mbuf) t_inqueue;	/* HPTS input packets queue */
+	uint32_t t_hpts_request;	/* Current hpts request, zero if
+					 * fits in the pacing window. */
+	uint32_t t_hpts_slot;		/* HPTS wheel slot this tcb is on. */
+	uint32_t t_hpts_drop_reas;	/* Reason we are dropping the pcb. */
+	uint32_t t_hpts_gencnt;
+	uint16_t t_hpts_cpu;		/* CPU chosen by hpts_cpuid(). */
+	uint16_t t_lro_cpu;		/* CPU derived from LRO. */
+#define	HPTS_CPU_NONE	((uint16_t)-1)
+	enum {
+		IHPTS_NONE = 0,
+		IHPTS_ONQUEUE,
+		IHPTS_MOVING,
+	} t_in_hpts;			/* Is it linked into HPTS? */
+
 	uint32_t t_maxseg:24,		/* maximum segment size */
		_t_logstate:8;		/* State of "black box" logging */
 	uint32_t t_port:16,		/* Tunneling (over udp) port */
@@ -355,7 +372,6 @@
 	int	t_segqlen;		/* segment reassembly queue length */
 	uint32_t t_segqmbuflen;		/* total reassembly queue byte length */
 	struct	tsegqe_head t_segq;	/* segment reassembly queue */
-	STAILQ_HEAD(, mbuf) t_inqueue;	/* HPTS input queue */
 	uint32_t snd_ssthresh;		/* snd_cwnd size threshold for
					 * for slow start exponential to
					 * linear switch
@@ -832,9 +848,11 @@
#define	TF2_ECN_SND_CWR		0x00000040 /* ECN CWR in queue */
#define	TF2_ECN_SND_ECE		0x00000080 /* ECN ECE in queue */
#define	TF2_ACE_PERMIT		0x00000100 /* Accurate ECN mode */
+#define	TF2_HPTS_CPU_SET	0x00000200 /* t_hpts_cpu is not random */
#define	TF2_FBYTES_COMPLETE	0x00000400 /* We have first bytes in and out */
#define	TF2_ECN_USE_ECT1	0x00000800 /* Use ECT(1) marking on session */
#define	TF2_TCP_ACCOUNTING	0x00010000 /* Do TCP accounting */
+#define	TF2_HPTS_CALLS		0x00020000 /* tcp_output() called via HPTS */
 
 /*
  * Structure to hold TCP options that are only used during segment
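
Editor's note on the resulting API (not part of the patch): after this change
all HPTS scheduling state lives in the tcpcb, so a pacing stack never touches
inpcb fields to get on or off the wheel. The sketch below distills the call
pattern visible in the bbr.c and rack.c hunks above. It is a minimal
illustration only: my_stack_arm_pacer() and my_stack_output() are hypothetical
names, while tcp_in_hpts(), tcp_set_hpts(), tcp_hpts_insert(),
tcp_hpts_remove(), HPTS_USEC_TO_SLOTS() and the TF2_* flags all come from the
patched headers. The INP_WLOCK must be held around all of these calls, per the
comment retained in tcp_hpts.h.

	/* Arm the pacer `usecs` microseconds out, once. */
	static void
	my_stack_arm_pacer(struct tcpcb *tp, uint32_t usecs)
	{
		INP_WLOCK_ASSERT(tptoinpcb(tp));

		if (tcp_in_hpts(tp))	/* t_in_hpts == IHPTS_ONQUEUE */
			return;		/* a previous call is already set up */
		/*
		 * Bind to a CPU on first use; once hpts_cpuid() succeeds,
		 * TF2_HPTS_CPU_SET keeps t_hpts_cpu from changing again.
		 */
		tcp_set_hpts(tp);
		(void)tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(usecs));
	}

	/*
	 * Output path: consume the "called from hpts" hint. The hpts sets
	 * TF2_HPTS_CALLS before calling into the transport; the transport
	 * must clear the flag itself, the hpts never does.
	 */
	static void
	my_stack_output(struct tcpcb *tp)
	{
		bool hpts_calling;

		INP_WLOCK_ASSERT(tptoinpcb(tp));

		hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS);
		tp->t_flags2 &= ~TF2_HPTS_CALLS;
		if (!hpts_calling && tcp_in_hpts(tp)) {
			/*
			 * A user or timer initiated send may preempt a
			 * pending pacer slot, as bbr_output() does when it
			 * is on the hpts for a timer but not for pacing.
			 */
			tcp_hpts_remove(tp);
		}
		/* ... build and transmit segments here ... */
	}

A design note implied by the diff: tcp_hpts_insert_diag() asserts
!tcp_in_hpts(tp), so the tcp_in_hpts() check before inserting is mandatory,
not an optimization, and the IHPTS_MOVING state exists only so that
tcp_hptsi() and tcp_hpts_remove() can race safely over a detached tailq.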