diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -5285,37 +5285,13 @@ } } -static void -bbr_timer_stop(struct tcpcb *tp, uint32_t timer_type) +static int +bbr_stopall(struct tcpcb *tp) { struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; bbr->rc_all_timers_stopped = 1; - return; -} - -/* - * stop all timers always returning 0. - */ -static int -bbr_stopall(struct tcpcb *tp) -{ - return (0); -} - -static void -bbr_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) -{ - return; -} - -/* - * return true if a bbr timer (rack or tlp) is active. - */ -static int -bbr_timer_active(struct tcpcb *tp, uint32_t timer_type) -{ return (0); } @@ -14168,9 +14144,6 @@ .tfb_tcp_fb_init = bbr_init, .tfb_tcp_fb_fini = bbr_fini, .tfb_tcp_timer_stop_all = bbr_stopall, - .tfb_tcp_timer_activate = bbr_timer_activate, - .tfb_tcp_timer_active = bbr_timer_active, - .tfb_tcp_timer_stop = bbr_timer_stop, .tfb_tcp_rexmit_tmr = bbr_remxt_tmr, .tfb_tcp_handoff_ok = bbr_handoff_ok, .tfb_tcp_mtu_chg = bbr_mtu_chg, diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -489,10 +489,6 @@ static int rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt); static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); static int32_t rack_stopall(struct tcpcb *tp); -static void -rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, - uint32_t delta); -static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type); static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type); static uint32_t @@ -5910,9 +5906,6 @@ */ struct rack_sendmap *rsm; - if (tp->tt_flags & TT_STOPPED) { - return (1); - } counter_u64_add(rack_to_tot, 1); if (rack->r_state && (rack->r_state != 
tp->t_state)) rack_set_state(tp, rack); @@ -6123,9 +6116,6 @@ uint32_t out, avail; int collapsed_win = 0; - if (tp->tt_flags & TT_STOPPED) { - return (1); - } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { /* Its not time yet */ return (0); @@ -6312,9 +6302,7 @@ static int rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { - if (tp->tt_flags & TT_STOPPED) { - return (1); - } + rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; @@ -6337,9 +6325,6 @@ struct tcptemp *t_template; int32_t retval = 1; - if (tp->tt_flags & TT_STOPPED) { - return (1); - } if (rack->rc_in_persist == 0) return (0); if (ctf_progress_timeout_check(tp, false)) { @@ -6425,9 +6410,6 @@ struct tcptemp *t_template; struct inpcb *inp = tptoinpcb(tp); - if (tp->tt_flags & TT_STOPPED) { - return (1); - } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); /* @@ -6654,9 +6636,6 @@ int32_t retval = 0; bool isipv6; - if (tp->tt_flags & TT_STOPPED) { - return (1); - } if ((tp->t_flags & TF_GPUTINPROG) && (tp->t_rxtshift)) { /* @@ -7060,12 +7039,6 @@ rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); } -static void -rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) -{ - return; -} - static int rack_stopall(struct tcpcb *tp) { @@ -7075,18 +7048,6 @@ return (0); } -static void -rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) -{ - return; -} - -static int -rack_timer_active(struct tcpcb *tp, uint32_t timer_type) -{ - return (0); -} - static void rack_stop_all_timers(struct tcpcb *tp) { @@ -20307,9 +20268,6 @@ .tfb_tcp_fb_init = rack_init, .tfb_tcp_fb_fini = rack_fini, .tfb_tcp_timer_stop_all = rack_stopall, - .tfb_tcp_timer_activate = rack_timer_activate, - .tfb_tcp_timer_active = rack_timer_active, - .tfb_tcp_timer_stop = rack_timer_stop, .tfb_tcp_rexmit_tmr = rack_remxt_tmr, .tfb_tcp_handoff_ok = rack_handoff_ok, .tfb_tcp_mtu_chg = 
rack_mtu_change, diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1194,22 +1194,6 @@ *num_names = 0; return (EINVAL); } - if (blk->tfb_tcp_timer_stop_all || - blk->tfb_tcp_timer_activate || - blk->tfb_tcp_timer_active || - blk->tfb_tcp_timer_stop) { - /* - * If you define one timer function you - * must have them all. - */ - if ((blk->tfb_tcp_timer_stop_all == NULL) || - (blk->tfb_tcp_timer_activate == NULL) || - (blk->tfb_tcp_timer_active == NULL) || - (blk->tfb_tcp_timer_stop == NULL)) { - *num_names = 0; - return (EINVAL); - } - } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { *num_names = 0; @@ -2227,12 +2211,9 @@ #endif /* INET6 */ V_tcp_mssdflt; - /* Set up our timeouts. */ - callout_init(&tp->tt_rexmt, 1); - callout_init(&tp->tt_persist, 1); - callout_init(&tp->tt_keep, 1); - callout_init(&tp->tt_2msl, 1); - callout_init(&tp->tt_delack, 1); + callout_init_rw(&tp->t_callout, &inp->inp_lock, CALLOUT_RETURNUNLOCKED); + for (int i = 0; i < TT_N; i++) + tp->t_timers[i] = SBT_MAX; switch (V_tcp_do_rfc1323) { case 0: @@ -2301,13 +2282,6 @@ if (V_tcp_do_lrd) tp->t_flags |= TF_LRD; - /* - * XXXGL: this self-reference might be pointless. It will go away - * when the TCP timers are properly locked and could never fire after - * tcp_discardcb(). - */ - in_pcbref(inp); - return (tp); } @@ -2341,32 +2315,15 @@ tcp_discardcb(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); + struct socket *so = tptosocket(tp); +#ifdef INET6 + bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0; +#endif INP_WLOCK_ASSERT(inp); - /* - * Make sure that all of our timers are stopped before we delete the - * PCB. - * - * If stopping a timer fails, we schedule a discard function in same - * callout, and the last discard function called will take care of - * deleting the tcpcb. 
- */ - tp->tt_draincnt = 0; - tcp_timer_stop(tp, TT_REXMT); - tcp_timer_stop(tp, TT_PERSIST); - tcp_timer_stop(tp, TT_KEEP); - tcp_timer_stop(tp, TT_2MSL); - tcp_timer_stop(tp, TT_DELACK); + tcp_timer_stop(tp); if (tp->t_fb->tfb_tcp_timer_stop_all) { - /* - * Call the stop-all function of the methods, - * this function should call the tcp_timer_stop() - * method with each of the function specific timeouts. - * That stop will be called via the tfb_tcp_timer_stop() - * which should use the async drain function of the - * callout system (see tcp_var.h). - */ tp->t_fb->tfb_tcp_timer_stop_all(tp); } @@ -2402,23 +2359,7 @@ #endif CC_ALGO(tp) = NULL; - if (tp->tt_draincnt == 0) - tcp_freecb(tp); -} -bool -tcp_freecb(struct tcpcb *tp) -{ - struct inpcb *inp = tptoinpcb(tp); - struct socket *so = tptosocket(tp); -#ifdef INET6 - bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0; -#endif - - INP_WLOCK_ASSERT(inp); - MPASS(tp->tt_draincnt == 0); - - /* We own the last reference on tcpcb, let's free it. */ #ifdef TCP_BLACKBOX tcp_log_tcpcbfini(tp); #endif @@ -2489,8 +2430,6 @@ } refcount_release(&tp->t_fb->tfb_refcnt); - - return (in_pcbrele_wlocked(inp)); } /* @@ -3940,17 +3879,17 @@ (tp->t_flags2 & TF2_ACE_PERMIT) ? 
2 : 0; now = getsbinuptime(); -#define COPYTIMER(ttt) do { \ - if (callout_active(&tp->ttt)) \ - xt->ttt = (tp->ttt.c_time - now) / SBT_1MS; \ - else \ - xt->ttt = 0; \ +#define COPYTIMER(which,where) do { \ + if (tp->t_timers[which] != SBT_MAX) \ + xt->where = (tp->t_timers[which] - now) / SBT_1MS; \ + else \ + xt->where = 0; \ } while (0) - COPYTIMER(tt_delack); - COPYTIMER(tt_rexmt); - COPYTIMER(tt_persist); - COPYTIMER(tt_keep); - COPYTIMER(tt_2msl); + COPYTIMER(TT_DELACK, tt_delack); + COPYTIMER(TT_REXMT, tt_rexmt); + COPYTIMER(TT_PERSIST, tt_persist); + COPYTIMER(TT_KEEP, tt_keep); + COPYTIMER(TT_2MSL, tt_2msl); #undef COPYTIMER xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz; diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h --- a/sys/netinet/tcp_timer.h +++ b/sys/netinet/tcp_timer.h @@ -145,18 +145,6 @@ #ifdef _KERNEL -/* - * Flags for the tcpcb's tt_flags field. - */ -#define TT_DELACK 0x0001 -#define TT_REXMT 0x0002 -#define TT_PERSIST 0x0004 -#define TT_KEEP 0x0008 -#define TT_2MSL 0x0010 -#define TT_MASK (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL) - -#define TT_STOPPED 0x00010000 - #define TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit) #define TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle) #define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl) @@ -205,13 +193,6 @@ VNET_DECLARE(int, tcp_msl); #define V_tcp_msl VNET(tcp_msl) -void tcp_timer_init(void); -void tcp_timer_2msl(void *xtp); -void tcp_timer_keep(void *xtp); -void tcp_timer_persist(void *xtp); -void tcp_timer_rexmt(void *xtp); -void tcp_timer_delack(void *xtp); - #endif /* _KERNEL */ #endif /* !_NETINET_TCP_TIMER_H_ */ diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -243,104 +243,86 @@ /* * TCP timer processing. + * + * Each connection has 5 timers associated with it, which can be scheduled + * simultaneously. 
They all are serviced by one callout tcp_timer_enter(). + * This function executes the next timer via tcp_timersw[] vector. Each + * timer is supposed to return 'true' unless the connection was destroyed. + * On 'true', tcp_timer_enter() schedules the callout for the next timer. */ -void -tcp_timer_delack(void *xtp) -{ - struct epoch_tracker et; - struct tcpcb *tp = xtp; - struct inpcb *inp = tptoinpcb(tp); - - INP_WLOCK(inp); - CURVNET_SET(inp->inp_vnet); - - if (callout_pending(&tp->tt_delack) || - !callout_active(&tp->tt_delack)) { - INP_WUNLOCK(inp); - CURVNET_RESTORE(); - return; - } - callout_deactivate(&tp->tt_delack); - if ((inp->inp_flags & INP_DROPPED) != 0) { - INP_WUNLOCK(inp); - CURVNET_RESTORE(); - return; - } - tp->t_flags |= TF_ACKNOW; - TCPSTAT_INC(tcps_delack); - NET_EPOCH_ENTER(et); - (void) tcp_output_unlock(tp); - NET_EPOCH_EXIT(et); - CURVNET_RESTORE(); -} +typedef bool tcp_timer_t(struct tcpcb *); +static tcp_timer_t tcp_timer_delack; +static tcp_timer_t tcp_timer_2msl; +static tcp_timer_t tcp_timer_keep; +static tcp_timer_t tcp_timer_persist; +static tcp_timer_t tcp_timer_rexmt; + +static tcp_timer_t * const tcp_timersw[TT_N] = { + [TT_DELACK] = tcp_timer_delack, + [TT_REXMT] = tcp_timer_rexmt, + [TT_PERSIST] = tcp_timer_persist, + [TT_KEEP] = tcp_timer_keep, + [TT_2MSL] = tcp_timer_2msl, +}; /* - * Call tcp_close() from a callout context. + * tcp_output_locked() is a timer specific variation of call to tcp_output(), + * see tcp_var.h for the rest. It handles drop request from advanced stacks, + * but keeps tcpcb locked unless tcp_drop() destroyed it. + * Returns true if tcpcb is valid and locked. 
*/ -static void -tcp_timer_close(struct tcpcb *tp) +static inline bool +tcp_output_locked(struct tcpcb *tp) { - struct epoch_tracker et; - struct inpcb *inp = tptoinpcb(tp); + int rv; - INP_WLOCK_ASSERT(inp); + INP_WLOCK_ASSERT(tptoinpcb(tp)); - NET_EPOCH_ENTER(et); - tp = tcp_close(tp); - NET_EPOCH_EXIT(et); - if (tp != NULL) - INP_WUNLOCK(inp); + if ((rv = tp->t_fb->tfb_tcp_output(tp)) < 0) { + KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, + ("TCP stack %s requested tcp_drop(%p)", + tp->t_fb->tfb_tcp_block_name, tp)); + tp = tcp_drop(tp, rv); + } + + return (tp != NULL); } -/* - * Call tcp_drop() from a callout context. - */ -static void -tcp_timer_drop(struct tcpcb *tp) +static bool +tcp_timer_delack(struct tcpcb *tp) { struct epoch_tracker et; +#if defined(INVARIANTS) || defined(VIMAGE) struct inpcb *inp = tptoinpcb(tp); +#endif + bool rv; INP_WLOCK_ASSERT(inp); + CURVNET_SET(inp->inp_vnet); + tp->t_flags |= TF_ACKNOW; + TCPSTAT_INC(tcps_delack); NET_EPOCH_ENTER(et); - tp = tcp_drop(tp, ETIMEDOUT); + rv = tcp_output_locked(tp); NET_EPOCH_EXIT(et); - if (tp != NULL) - INP_WUNLOCK(inp); + CURVNET_RESTORE(); + + return (rv); } -void -tcp_timer_2msl(void *xtp) +static bool +tcp_timer_2msl(struct tcpcb *tp) { - struct tcpcb *tp = xtp; struct inpcb *inp = tptoinpcb(tp); -#ifdef TCPDEBUG - int ostate; + bool close = false; - ostate = tp->t_state; -#endif + INP_WLOCK_ASSERT(inp); - INP_WLOCK(inp); + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); CURVNET_SET(inp->inp_vnet); - tcp_log_end_status(tp, TCP_EI_STATUS_2MSL); tcp_free_sackholes(tp); - if (callout_pending(&tp->tt_2msl) || - !callout_active(&tp->tt_2msl)) { - INP_WUNLOCK(inp); - CURVNET_RESTORE(); - return; - } - callout_deactivate(&tp->tt_2msl); - if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - CURVNET_RESTORE(); - return; - } - KASSERT((tp->tt_flags & TT_STOPPED) == 0, - ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); /* * 2 MSL timeout in shutdown went off. 
If we're closed but * still waiting for peer to close and connection has been idle @@ -354,69 +336,41 @@ * XXXGL: check if inp_socket shall always be !NULL here? */ if (tp->t_state == TCPS_TIME_WAIT) { - tcp_timer_close(tp); - CURVNET_RESTORE(); - return; + close = true; } else if (tp->t_state == TCPS_FIN_WAIT_2 && tcp_fast_finwait2_recycle && inp->inp_socket && (inp->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); - tcp_timer_close(tp); - CURVNET_RESTORE(); - return; + close = true; } else { - if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { - callout_reset(&tp->tt_2msl, - TP_KEEPINTVL(tp), tcp_timer_2msl, tp); - } else { - tcp_timer_close(tp); - CURVNET_RESTORE(); - return; - } + if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) + tcp_timer_activate(tp, TT_2MSL, TP_KEEPINTVL(tp)); + else + close = true; } + if (close) { + struct epoch_tracker et; -#ifdef TCPDEBUG - if (tptosocket(tp)->so_options & SO_DEBUG) - tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, - PRU_SLOWTIMO); -#endif - TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); - - INP_WUNLOCK(inp); + NET_EPOCH_ENTER(et); + tp = tcp_close(tp); + NET_EPOCH_EXIT(et); + } CURVNET_RESTORE(); + + return (tp != NULL); } -void -tcp_timer_keep(void *xtp) +static bool +tcp_timer_keep(struct tcpcb *tp) { struct epoch_tracker et; - struct tcpcb *tp = xtp; struct inpcb *inp = tptoinpcb(tp); struct tcptemp *t_template; -#ifdef TCPDEBUG - int ostate; - ostate = tp->t_state; -#endif + INP_WLOCK_ASSERT(inp); - INP_WLOCK(inp); + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); CURVNET_SET(inp->inp_vnet); - - if (callout_pending(&tp->tt_keep) || - !callout_active(&tp->tt_keep)) { - INP_WUNLOCK(inp); - CURVNET_RESTORE(); - return; - } - callout_deactivate(&tp->tt_keep); - if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - CURVNET_RESTORE(); - return; - } - KASSERT((tp->tt_flags & TT_STOPPED) == 0, - ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); - /* * Because we don't regularly 
reset the keepalive callout in * the ESTABLISHED state, it may be that we don't actually need @@ -428,11 +382,10 @@ idletime = ticks - tp->t_rcvtime; if (idletime < TP_KEEPIDLE(tp)) { - callout_reset(&tp->tt_keep, - TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp); - INP_WUNLOCK(inp); + tcp_timer_activate(tp, TT_KEEP, + TP_KEEPIDLE(tp) - idletime); CURVNET_RESTORE(); - return; + return (true); } } @@ -470,38 +423,22 @@ NET_EPOCH_EXIT(et); free(t_template, M_TEMP); } - callout_reset(&tp->tt_keep, TP_KEEPINTVL(tp), - tcp_timer_keep, tp); + tcp_timer_activate(tp, TT_KEEP, TP_KEEPINTVL(tp)); } else - callout_reset(&tp->tt_keep, TP_KEEPIDLE(tp), - tcp_timer_keep, tp); + tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); -#ifdef TCPDEBUG - if (inp->inp_socket->so_options & SO_DEBUG) - tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, - PRU_SLOWTIMO); -#endif - TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); - INP_WUNLOCK(inp); CURVNET_RESTORE(); - return; + return (true); dropit: TCPSTAT_INC(tcps_keepdrops); NET_EPOCH_ENTER(et); tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); tp = tcp_drop(tp, ETIMEDOUT); - -#ifdef TCPDEBUG - if (tp != NULL && (tptosocket(tp)->so_options & SO_DEBUG)) - tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, - PRU_SLOWTIMO); -#endif - TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); NET_EPOCH_EXIT(et); - if (tp != NULL) - INP_WUNLOCK(inp); CURVNET_RESTORE(); + + return (tp != NULL); } /* @@ -529,37 +466,19 @@ return true; } -void -tcp_timer_persist(void *xtp) +static bool +tcp_timer_persist(struct tcpcb *tp) { struct epoch_tracker et; - struct tcpcb *tp = xtp; +#if defined(INVARIANTS) || defined(VIMAGE) struct inpcb *inp = tptoinpcb(tp); - bool progdrop; - int outrv; -#ifdef TCPDEBUG - int ostate; - - ostate = tp->t_state; #endif + bool progdrop, rv; - INP_WLOCK(inp); - CURVNET_SET(inp->inp_vnet); + INP_WLOCK_ASSERT(inp); - if (callout_pending(&tp->tt_persist) || - !callout_active(&tp->tt_persist)) { - INP_WUNLOCK(inp); - 
CURVNET_RESTORE(); - return; - } - callout_deactivate(&tp->tt_persist); - if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - CURVNET_RESTORE(); - return; - } - KASSERT((tp->tt_flags & TT_STOPPED) == 0, - ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); + CURVNET_SET(inp->inp_vnet); /* * Persistence timer into zero window. * Force a byte to be output, if possible. @@ -581,9 +500,7 @@ if (!progdrop) TCPSTAT_INC(tcps_persistdrop); tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); - tcp_timer_drop(tp); - CURVNET_RESTORE(); - return; + goto dropit; } /* * If the user has closed the socket then drop a persisting @@ -593,57 +510,39 @@ (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { TCPSTAT_INC(tcps_persistdrop); tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); - tcp_timer_drop(tp); - CURVNET_RESTORE(); - return; + goto dropit; } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; NET_EPOCH_ENTER(et); - outrv = tcp_output_nodrop(tp); - tp->t_flags &= ~TF_FORCEDATA; + if ((rv = tcp_output_locked(tp))) + tp->t_flags &= ~TF_FORCEDATA; + NET_EPOCH_EXIT(et); + CURVNET_RESTORE(); -#ifdef TCPDEBUG - if (tp != NULL && tptosocket(tp)->so_options & SO_DEBUG) - tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); -#endif - TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); - (void) tcp_unlock_or_drop(tp, outrv); + return (rv); + +dropit: + NET_EPOCH_ENTER(et); + tp = tcp_drop(tp, ETIMEDOUT); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); + + return (tp != NULL); } -void -tcp_timer_rexmt(void * xtp) +static bool +tcp_timer_rexmt(struct tcpcb *tp) { struct epoch_tracker et; - struct tcpcb *tp = xtp; struct inpcb *inp = tptoinpcb(tp); - int rexmt, outrv; - bool isipv6; -#ifdef TCPDEBUG - int ostate; + int rexmt; + bool isipv6, rv; - ostate = tp->t_state; -#endif + INP_WLOCK_ASSERT(inp); - INP_WLOCK(inp); + TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); CURVNET_SET(inp->inp_vnet); - - if (callout_pending(&tp->tt_rexmt) || - 
!callout_active(&tp->tt_rexmt)) { - INP_WUNLOCK(inp); - CURVNET_RESTORE(); - return; - } - callout_deactivate(&tp->tt_rexmt); - if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - CURVNET_RESTORE(); - return; - } - KASSERT((tp->tt_flags & TT_STOPPED) == 0, - ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); tcp_free_sackholes(tp); TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false); if (tp->t_fb->tfb_tcp_rexmit_tmr) { @@ -664,9 +563,12 @@ TCPSTAT_INC(tcps_timeoutdrop); tp->t_rxtshift = TCP_MAXRXTSHIFT; tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); - tcp_timer_drop(tp); + NET_EPOCH_ENTER(et); + tp = tcp_drop(tp, ETIMEDOUT); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); - return; + + return (tp != NULL); } if (tp->t_state == TCPS_SYN_SENT) { /* @@ -883,159 +785,124 @@ cc_cong_signal(tp, NULL, CC_RTO); NET_EPOCH_ENTER(et); - outrv = tcp_output_nodrop(tp); -#ifdef TCPDEBUG - if (tp != NULL && (tptosocket(tp)->so_options & SO_DEBUG)) - tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, - PRU_SLOWTIMO); -#endif - TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); - (void) tcp_unlock_or_drop(tp, outrv); + rv = tcp_output_locked(tp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); + + return (rv); +} + +static inline tt_which +tcp_timer_next(struct tcpcb *tp) +{ + tt_which i, rv; + sbintime_t sbt; + + for (i = 0, rv = TT_N, sbt = SBT_MAX; i < TT_N; i++) + if (tp->t_timers[i] < sbt) { + sbt = tp->t_timers[i]; + rv = i; + } + + return (rv); +} + +static void +tcp_timer_enter(void *xtp) +{ + struct tcpcb *tp = xtp; + struct inpcb *inp = tptoinpcb(tp); + tt_which which; + + INP_WLOCK_ASSERT(inp); + MPASS((curthread->td_pflags & TDP_INTCPCALLOUT) == 0); + + curthread->td_pflags |= TDP_INTCPCALLOUT; + + which = tcp_timer_next(tp); + MPASS(which < TT_N); + tp->t_timers[which] = SBT_MAX; + + if (tcp_timersw[which](tp)) { + if ((which = tcp_timer_next(tp)) != TT_N) { + callout_reset_sbt_on(&tp->t_callout, + tp->t_timers[which], 0, tcp_timer_enter, tp, + 
inp_to_cpuid(inp), C_ABSOLUTE); + } + INP_WUNLOCK(inp); + } + + curthread->td_pflags &= ~TDP_INTCPCALLOUT; } +/* + * Activate or stop (delta == 0) a TCP timer. + */ void -tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) +tcp_timer_activate(struct tcpcb *tp, tt_which which, u_int delta) { - struct callout *t_callout; - callout_func_t *f_callout; struct inpcb *inp = tptoinpcb(tp); - int cpu = inp_to_cpuid(inp); + sbintime_t precision; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return; #endif - if (tp->tt_flags & TT_STOPPED) - return; + INP_WLOCK_ASSERT(inp); - switch (timer_type) { - case TT_DELACK: - t_callout = &tp->tt_delack; - f_callout = tcp_timer_delack; - break; - case TT_REXMT: - t_callout = &tp->tt_rexmt; - f_callout = tcp_timer_rexmt; - break; - case TT_PERSIST: - t_callout = &tp->tt_persist; - f_callout = tcp_timer_persist; - break; - case TT_KEEP: - t_callout = &tp->tt_keep; - f_callout = tcp_timer_keep; - break; - case TT_2MSL: - t_callout = &tp->tt_2msl; - f_callout = tcp_timer_2msl; - break; - default: - if (tp->t_fb->tfb_tcp_timer_activate) { - tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); - return; - } - panic("tp %p bad timer_type %#x", tp, timer_type); - } - if (delta == 0) { - callout_stop(t_callout); - } else { - callout_reset_on(t_callout, delta, f_callout, tp, cpu); - } -} + if (delta > 0) + callout_when(tick_sbt * delta, 0, C_HARDCLOCK, + &tp->t_timers[which], &precision); + else + tp->t_timers[which] = SBT_MAX; -int -tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) -{ - struct callout *t_callout; - - switch (timer_type) { - case TT_DELACK: - t_callout = &tp->tt_delack; - break; - case TT_REXMT: - t_callout = &tp->tt_rexmt; - break; - case TT_PERSIST: - t_callout = &tp->tt_persist; - break; - case TT_KEEP: - t_callout = &tp->tt_keep; - break; - case TT_2MSL: - t_callout = &tp->tt_2msl; - break; - default: - if (tp->t_fb->tfb_tcp_timer_active) { - return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); 
- } - panic("tp %p bad timer_type %#x", tp, timer_type); - } - return callout_active(t_callout); + if ((which = tcp_timer_next(tp)) != TT_N) + callout_reset_sbt_on(&tp->t_callout, tp->t_timers[which], 0, + tcp_timer_enter, tp, inp_to_cpuid(inp), C_ABSOLUTE); + else + callout_stop(&tp->t_callout); } -static void -tcp_timer_discard(void *ptp) +bool +tcp_timer_active(struct tcpcb *tp, tt_which which) { - struct epoch_tracker et; - struct tcpcb *tp = (struct tcpcb *)ptp; - struct inpcb *inp = tptoinpcb(tp); - INP_WLOCK(inp); - CURVNET_SET(inp->inp_vnet); - NET_EPOCH_ENTER(et); + INP_WLOCK_ASSERT(tptoinpcb(tp)); - KASSERT((tp->tt_flags & TT_STOPPED) != 0, - ("%s: tcpcb has to be stopped here", __func__)); - if (--tp->tt_draincnt > 0 || - tcp_freecb(tp) == false) - INP_WUNLOCK(inp); - NET_EPOCH_EXIT(et); - CURVNET_RESTORE(); + return (tp->t_timers[which] != SBT_MAX); } +/* + * Stop all timers associated with tcpcb. + * + * Called only on tcpcb destruction. The tcpcb shall already be dropped from + * the pcb lookup database and socket is not losing the last reference. + * + * XXXGL: unfortunately our callout(9) is not able to fully stop a locked + * callout even when only two threads are involved: the callout itself and the + * thread that does callout_stop(). See where softclock_call_cc() swaps the + * callwheel lock to callout lock and then checks cc_exec_cancel(). This is + * the race window. If it happens, the tcp_timer_enter() won't be executed, + * however pcb lock will be locked and released, hence we can't free memory. + * Until callout(9) is improved, just keep retrying. In my profiling I've seen + * such event happening less than 1 time per hour with 20-30 Gbit/s of traffic. 
+ */ void -tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) +tcp_timer_stop(struct tcpcb *tp) { - struct callout *t_callout; - - tp->tt_flags |= TT_STOPPED; - switch (timer_type) { - case TT_DELACK: - t_callout = &tp->tt_delack; - break; - case TT_REXMT: - t_callout = &tp->tt_rexmt; - break; - case TT_PERSIST: - t_callout = &tp->tt_persist; - break; - case TT_KEEP: - t_callout = &tp->tt_keep; - break; - case TT_2MSL: - t_callout = &tp->tt_2msl; - break; - default: - if (tp->t_fb->tfb_tcp_timer_stop) { - /* - * XXXrrs we need to look at this with the - * stop case below (flags). - */ - tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); - return; - } - panic("tp %p bad timer_type %#x", tp, timer_type); - } + struct inpcb *inp = tptoinpcb(tp); - if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { - /* - * Can't stop the callout, defer tcpcb actual deletion - * to the last one. We do this using the async drain - * function and incrementing the count in - */ - tp->tt_draincnt++; + INP_WLOCK_ASSERT(inp); + + if (curthread->td_pflags & TDP_INTCPCALLOUT) { + int stopped __diagused; + + stopped = callout_stop(&tp->t_callout); + MPASS(stopped == 0); + } else while(__predict_false(callout_stop(&tp->t_callout) == 0)) { + INP_WUNLOCK(inp); + kern_yield(PRI_UNCHANGED); + INP_WLOCK(inp); } } diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -3072,10 +3072,8 @@ TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); db_print_indent(indent); - db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", - &tp->tt_rexmt, &tp->tt_persist, &tp->tt_keep); - db_printf("tt_2msl: %p tt_delack: %p\n", &tp->tt_2msl, - &tp->tt_delack); + db_printf("t_callout: %p t_timers: %p\n", + &tp->t_callout, &tp->t_timers); db_print_indent(indent); db_printf("t_state: %d (", tp->t_state); diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -126,6 +126,15 
@@ STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); +typedef enum { + TT_DELACK = 0, + TT_REXMT, + TT_PERSIST, + TT_KEEP, + TT_2MSL, + TT_N, +} tt_which; + /* * Tcp control block, one per tcp connection. */ @@ -137,13 +146,8 @@ struct tcp_function_block *t_fb;/* TCP function call block */ void *t_fb_ptr; /* Pointer to t_fb specific data */ - struct callout tt_rexmt; /* retransmit timer */ - struct callout tt_persist; /* retransmit persistence */ - struct callout tt_keep; /* keepalive */ - struct callout tt_2msl; /* 2*msl TIME_WAIT timer */ - struct callout tt_delack; /* delayed ACK timer */ - uint32_t tt_flags; /* Timers flags */ - uint32_t tt_draincnt; /* Count being drained */ + struct callout t_callout; + sbintime_t t_timers[TT_N]; uint32_t t_maxseg:24, /* maximum segment size */ t_logstate:8; /* State of "black box" logging */ @@ -370,10 +374,6 @@ void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); - void (*tfb_tcp_timer_activate)(struct tcpcb *, - uint32_t, u_int); - int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t); - void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t); void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); void (*tfb_tcp_mtu_chg)(struct tcpcb *); @@ -1086,7 +1086,6 @@ struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); -bool tcp_freecb(struct tcpcb *); void tcp_twstart(struct tcpcb *); int tcp_ctloutput(struct socket *, struct sockopt *); void tcp_fini(void *); @@ -1186,9 +1185,9 @@ struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, uint16_t, void *, void *); -void tcp_timer_activate(struct tcpcb *, uint32_t, u_int); -int tcp_timer_active(struct tcpcb *, uint32_t); -void tcp_timer_stop(struct tcpcb *, uint32_t); +void tcp_timer_activate(struct tcpcb *, tt_which, u_int); +bool tcp_timer_active(struct tcpcb *, tt_which); +void tcp_timer_stop(struct 
tcpcb *); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); int inp_to_cpuid(struct inpcb *inp); /* diff --git a/sys/sys/proc.h b/sys/sys/proc.h --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -557,7 +557,7 @@ #define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */ #define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */ #define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */ -#define TDP_UNUSED0 0x20000000 /* UNUSED */ +#define TDP_INTCPCALLOUT 0x20000000 /* used by netinet/tcp_timer.c */ #define TDP_EXECVMSPC 0x40000000 /* Execve destroyed old vmspace */ #define TDP_SIGFASTPENDING 0x80000000 /* Pending signal due to sigfastblock */