Changeset View
Standalone View
sys/netinet/tcp_timer.c
Show First 20 Lines • Show All 237 Lines • ▼ Show 20 Lines | |||||
int tcp_backoff[TCP_MAXRXTSHIFT + 1] = | int tcp_backoff[TCP_MAXRXTSHIFT + 1] = | ||||
{ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; | { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; | ||||
int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ | int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ | ||||
/* | /* | ||||
* TCP timer processing. | * TCP timer processing. | ||||
* | |||||
* Each connection has 5 timers associated with it, which can be scheduled | |||||
* simultaneously. They all are serviced by one callout tcp_timer_enter(). | |||||
* This function executes the next timer via tcp_timersw[] vector. Each | |||||
* timer is supposed to return 'true' unless the connection was destroyed. | |||||
* In the former case tcp_timer_enter() will schedule callout for next timer. | |||||
*/ | */ | ||||
void | typedef bool tcp_timer_t(struct tcpcb *); | ||||
tcp_timer_delack(void *xtp) | static tcp_timer_t tcp_timer_delack; | ||||
{ | static tcp_timer_t tcp_timer_2msl; | ||||
struct epoch_tracker et; | static tcp_timer_t tcp_timer_keep; | ||||
struct tcpcb *tp = xtp; | static tcp_timer_t tcp_timer_persist; | ||||
struct inpcb *inp = tptoinpcb(tp); | static tcp_timer_t tcp_timer_rexmt; | ||||
INP_WLOCK(inp); | static tcp_timer_t * const tcp_timersw[TT_N] = { | ||||
CURVNET_SET(inp->inp_vnet); | [TT_DELACK] = tcp_timer_delack, | ||||
[TT_REXMT] = tcp_timer_rexmt, | |||||
[TT_PERSIST] = tcp_timer_persist, | |||||
[TT_KEEP] = tcp_timer_keep, | |||||
[TT_2MSL] = tcp_timer_2msl, | |||||
}; | |||||
if (callout_pending(&tp->tt_delack) || | |||||
!callout_active(&tp->tt_delack)) { | |||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | |||||
callout_deactivate(&tp->tt_delack); | |||||
if ((inp->inp_flags & INP_DROPPED) != 0) { | |||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | |||||
tp->t_flags |= TF_ACKNOW; | |||||
TCPSTAT_INC(tcps_delack); | |||||
NET_EPOCH_ENTER(et); | |||||
(void) tcp_output_unlock(tp); | |||||
NET_EPOCH_EXIT(et); | |||||
CURVNET_RESTORE(); | |||||
} | |||||
/* | /* | ||||
Call tcp_close() from a callout context. | * tcp_output_locked() is a timer specific variation of call to tcp_output(), | ||||
* see tcp_var.h for the rest. It handles drop request from advanced stacks, | |||||
* but keeps tcpcb locked unless tcp_drop() destroyed it. | |||||
* Returns true if tcpcb is valid and locked. | |||||
*/ | */ | ||||
static void | static inline bool | ||||
tcp_timer_close(struct tcpcb *tp) | tcp_output_locked(struct tcpcb *tp) | ||||
{ | { | ||||
struct epoch_tracker et; | int rv; | ||||
struct inpcb *inp = tptoinpcb(tp); | |||||
INP_WLOCK_ASSERT(inp); | INP_WLOCK_ASSERT(tptoinpcb(tp)); | ||||
NET_EPOCH_ENTER(et); | if ((rv = tp->t_fb->tfb_tcp_output(tp)) < 0) { | ||||
tp = tcp_close(tp); | KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, | ||||
NET_EPOCH_EXIT(et); | ("TCP stack %s requested tcp_drop(%p)", | ||||
if (tp != NULL) | tp->t_fb->tfb_tcp_block_name, tp)); | ||||
INP_WUNLOCK(inp); | tp = tcp_drop(tp, rv); | ||||
} | } | ||||
/* | return (tp != NULL); | ||||
* Call tcp_drop() from a callout context. | } | ||||
*/ | |||||
static void | static bool | ||||
tcp_timer_drop(struct tcpcb *tp) | tcp_timer_delack(struct tcpcb *tp) | ||||
{ | { | ||||
struct epoch_tracker et; | struct epoch_tracker et; | ||||
#if defined(INVARIANTS) || defined(VIMAGE) | |||||
struct inpcb *inp = tptoinpcb(tp); | struct inpcb *inp = tptoinpcb(tp); | ||||
#endif | |||||
bool rv; | |||||
INP_WLOCK_ASSERT(inp); | INP_WLOCK_ASSERT(inp); | ||||
CURVNET_SET(inp->inp_vnet); | |||||
tp->t_flags |= TF_ACKNOW; | |||||
TCPSTAT_INC(tcps_delack); | |||||
NET_EPOCH_ENTER(et); | NET_EPOCH_ENTER(et); | ||||
tp = tcp_drop(tp, ETIMEDOUT); | rv = tcp_output_locked(tp); | ||||
NET_EPOCH_EXIT(et); | NET_EPOCH_EXIT(et); | ||||
if (tp != NULL) | CURVNET_RESTORE(); | ||||
INP_WUNLOCK(inp); | |||||
return (rv); | |||||
} | } | ||||
void | static bool | ||||
tcp_timer_2msl(void *xtp) | tcp_timer_2msl(struct tcpcb *tp) | ||||
{ | { | ||||
struct tcpcb *tp = xtp; | |||||
struct inpcb *inp = tptoinpcb(tp); | struct inpcb *inp = tptoinpcb(tp); | ||||
#ifdef TCPDEBUG | bool close = false; | ||||
int ostate; | |||||
ostate = tp->t_state; | INP_WLOCK_ASSERT(inp); | ||||
#endif | |||||
INP_WLOCK(inp); | TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); | ||||
CURVNET_SET(inp->inp_vnet); | CURVNET_SET(inp->inp_vnet); | ||||
tcp_log_end_status(tp, TCP_EI_STATUS_2MSL); | tcp_log_end_status(tp, TCP_EI_STATUS_2MSL); | ||||
tcp_free_sackholes(tp); | tcp_free_sackholes(tp); | ||||
if (callout_pending(&tp->tt_2msl) || | |||||
!callout_active(&tp->tt_2msl)) { | |||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | |||||
callout_deactivate(&tp->tt_2msl); | |||||
if (inp->inp_flags & INP_DROPPED) { | |||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | |||||
KASSERT((tp->tt_flags & TT_STOPPED) == 0, | |||||
("%s: tp %p tcpcb can't be stopped here", __func__, tp)); | |||||
/* | /* | ||||
* 2 MSL timeout in shutdown went off. If we're closed but | * 2 MSL timeout in shutdown went off. If we're closed but | ||||
* still waiting for peer to close and connection has been idle | * still waiting for peer to close and connection has been idle | ||||
* too long delete connection control block. Otherwise, check | * too long delete connection control block. Otherwise, check | ||||
* again in a bit. | * again in a bit. | ||||
* | * | ||||
* If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, | * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, | ||||
* there's no point in hanging onto FIN_WAIT_2 socket. Just close it. | * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. | ||||
* Ignore fact that there were recent incoming segments. | * Ignore fact that there were recent incoming segments. | ||||
* | * | ||||
* XXXGL: check if inp_socket shall always be !NULL here? | * XXXGL: check if inp_socket shall always be !NULL here? | ||||
*/ | */ | ||||
if (tp->t_state == TCPS_TIME_WAIT) { | if (tp->t_state == TCPS_TIME_WAIT) { | ||||
tcp_timer_close(tp); | close = true; | ||||
CURVNET_RESTORE(); | |||||
return; | |||||
} else if (tp->t_state == TCPS_FIN_WAIT_2 && | } else if (tp->t_state == TCPS_FIN_WAIT_2 && | ||||
tcp_fast_finwait2_recycle && inp->inp_socket && | tcp_fast_finwait2_recycle && inp->inp_socket && | ||||
(inp->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { | (inp->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { | ||||
TCPSTAT_INC(tcps_finwait2_drops); | TCPSTAT_INC(tcps_finwait2_drops); | ||||
tcp_timer_close(tp); | close = true; | ||||
CURVNET_RESTORE(); | |||||
return; | |||||
} else { | } else { | ||||
if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { | if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) | ||||
callout_reset(&tp->tt_2msl, | tcp_timer_activate(tp, TT_2MSL, TP_KEEPINTVL(tp)); | ||||
TP_KEEPINTVL(tp), tcp_timer_2msl, tp); | else | ||||
} else { | close = true; | ||||
tcp_timer_close(tp); | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | } | ||||
} | if (close) { | ||||
struct epoch_tracker et; | |||||
#ifdef TCPDEBUG | NET_EPOCH_ENTER(et); | ||||
if (tptosocket(tp)->so_options & SO_DEBUG) | tp = tcp_close(tp); | ||||
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, | NET_EPOCH_EXIT(et); | ||||
PRU_SLOWTIMO); | } | ||||
#endif | |||||
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); | |||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | CURVNET_RESTORE(); | ||||
return (tp != NULL); | |||||
} | } | ||||
void | static bool | ||||
tcp_timer_keep(void *xtp) | tcp_timer_keep(struct tcpcb *tp) | ||||
{ | { | ||||
struct epoch_tracker et; | struct epoch_tracker et; | ||||
struct tcpcb *tp = xtp; | |||||
struct inpcb *inp = tptoinpcb(tp); | struct inpcb *inp = tptoinpcb(tp); | ||||
struct tcptemp *t_template; | struct tcptemp *t_template; | ||||
#ifdef TCPDEBUG | |||||
int ostate; | |||||
ostate = tp->t_state; | INP_WLOCK_ASSERT(inp); | ||||
#endif | |||||
INP_WLOCK(inp); | TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); | ||||
CURVNET_SET(inp->inp_vnet); | CURVNET_SET(inp->inp_vnet); | ||||
if (callout_pending(&tp->tt_keep) || | |||||
!callout_active(&tp->tt_keep)) { | |||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | |||||
callout_deactivate(&tp->tt_keep); | |||||
if (inp->inp_flags & INP_DROPPED) { | |||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | |||||
KASSERT((tp->tt_flags & TT_STOPPED) == 0, | |||||
("%s: tp %p tcpcb can't be stopped here", __func__, tp)); | |||||
/* | /* | ||||
* Because we don't regularly reset the keepalive callout in | * Because we don't regularly reset the keepalive callout in | ||||
* the ESTABLISHED state, it may be that we don't actually need | * the ESTABLISHED state, it may be that we don't actually need | ||||
* to send a keepalive yet. If that occurs, schedule another | * to send a keepalive yet. If that occurs, schedule another | ||||
* call for the next time the keepalive timer might expire. | * call for the next time the keepalive timer might expire. | ||||
*/ | */ | ||||
if (TCPS_HAVEESTABLISHED(tp->t_state)) { | if (TCPS_HAVEESTABLISHED(tp->t_state)) { | ||||
u_int idletime; | u_int idletime; | ||||
idletime = ticks - tp->t_rcvtime; | idletime = ticks - tp->t_rcvtime; | ||||
if (idletime < TP_KEEPIDLE(tp)) { | if (idletime < TP_KEEPIDLE(tp)) { | ||||
callout_reset(&tp->tt_keep, | tcp_timer_activate(tp, TT_KEEP, | ||||
TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp); | TP_KEEPIDLE(tp) - idletime); | ||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | CURVNET_RESTORE(); | ||||
return; | return (true); | ||||
} | } | ||||
} | } | ||||
/* | /* | ||||
* Keep-alive timer went off; send something | * Keep-alive timer went off; send something | ||||
* or drop connection if idle for too long. | * or drop connection if idle for too long. | ||||
*/ | */ | ||||
TCPSTAT_INC(tcps_keeptimeo); | TCPSTAT_INC(tcps_keeptimeo); | ||||
Show All 21 Lines | if ((V_tcp_always_keepalive || | ||||
if (t_template) { | if (t_template) { | ||||
NET_EPOCH_ENTER(et); | NET_EPOCH_ENTER(et); | ||||
tcp_respond(tp, t_template->tt_ipgen, | tcp_respond(tp, t_template->tt_ipgen, | ||||
&t_template->tt_t, (struct mbuf *)NULL, | &t_template->tt_t, (struct mbuf *)NULL, | ||||
tp->rcv_nxt, tp->snd_una - 1, 0); | tp->rcv_nxt, tp->snd_una - 1, 0); | ||||
NET_EPOCH_EXIT(et); | NET_EPOCH_EXIT(et); | ||||
free(t_template, M_TEMP); | free(t_template, M_TEMP); | ||||
} | } | ||||
callout_reset(&tp->tt_keep, TP_KEEPINTVL(tp), | tcp_timer_activate(tp, TT_KEEP, TP_KEEPINTVL(tp)); | ||||
tcp_timer_keep, tp); | |||||
} else | } else | ||||
callout_reset(&tp->tt_keep, TP_KEEPIDLE(tp), | tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); | ||||
tcp_timer_keep, tp); | |||||
#ifdef TCPDEBUG | |||||
if (inp->inp_socket->so_options & SO_DEBUG) | |||||
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, | |||||
PRU_SLOWTIMO); | |||||
#endif | |||||
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); | |||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | CURVNET_RESTORE(); | ||||
return; | return (true); | ||||
dropit: | dropit: | ||||
TCPSTAT_INC(tcps_keepdrops); | TCPSTAT_INC(tcps_keepdrops); | ||||
NET_EPOCH_ENTER(et); | NET_EPOCH_ENTER(et); | ||||
tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); | tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); | ||||
tp = tcp_drop(tp, ETIMEDOUT); | tp = tcp_drop(tp, ETIMEDOUT); | ||||
#ifdef TCPDEBUG | |||||
if (tp != NULL && (tptosocket(tp)->so_options & SO_DEBUG)) | |||||
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, | |||||
PRU_SLOWTIMO); | |||||
#endif | |||||
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); | |||||
NET_EPOCH_EXIT(et); | NET_EPOCH_EXIT(et); | ||||
if (tp != NULL) | |||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | CURVNET_RESTORE(); | ||||
return (tp != NULL); | |||||
} | } | ||||
/* | /* | ||||
* Has this session exceeded the maximum time without seeing a substantive | * Has this session exceeded the maximum time without seeing a substantive | ||||
* acknowledgement? If so, return true; otherwise false. | * acknowledgement? If so, return true; otherwise false. | ||||
*/ | */ | ||||
static bool | static bool | ||||
tcp_maxunacktime_check(struct tcpcb *tp) | tcp_maxunacktime_check(struct tcpcb *tp) | ||||
Show All 11 Lines | tcp_maxunacktime_check(struct tcpcb *tp) | ||||
if (TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks)) | if (TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks)) | ||||
return false; | return false; | ||||
/* We exceeded the timer. */ | /* We exceeded the timer. */ | ||||
TCPSTAT_INC(tcps_progdrops); | TCPSTAT_INC(tcps_progdrops); | ||||
return true; | return true; | ||||
} | } | ||||
void | static bool | ||||
tcp_timer_persist(void *xtp) | tcp_timer_persist(struct tcpcb *tp) | ||||
{ | { | ||||
struct epoch_tracker et; | struct epoch_tracker et; | ||||
struct tcpcb *tp = xtp; | #if defined(INVARIANTS) || defined(VIMAGE) | ||||
struct inpcb *inp = tptoinpcb(tp); | struct inpcb *inp = tptoinpcb(tp); | ||||
bool progdrop; | |||||
int outrv; | |||||
#ifdef TCPDEBUG | |||||
int ostate; | |||||
ostate = tp->t_state; | |||||
#endif | #endif | ||||
bool progdrop, rv; | |||||
INP_WLOCK(inp); | INP_WLOCK_ASSERT(inp); | ||||
CURVNET_SET(inp->inp_vnet); | |||||
if (callout_pending(&tp->tt_persist) || | TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); | ||||
!callout_active(&tp->tt_persist)) { | CURVNET_SET(inp->inp_vnet); | ||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | |||||
callout_deactivate(&tp->tt_persist); | |||||
if (inp->inp_flags & INP_DROPPED) { | |||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | |||||
KASSERT((tp->tt_flags & TT_STOPPED) == 0, | |||||
("%s: tp %p tcpcb can't be stopped here", __func__, tp)); | |||||
/* | /* | ||||
* Persistence timer into zero window. | * Persistence timer into zero window. | ||||
* Force a byte to be output, if possible. | * Force a byte to be output, if possible. | ||||
*/ | */ | ||||
TCPSTAT_INC(tcps_persisttimeo); | TCPSTAT_INC(tcps_persisttimeo); | ||||
/* | /* | ||||
* Hack: if the peer is dead/unreachable, we do not | * Hack: if the peer is dead/unreachable, we do not | ||||
* time out if the window is closed. After a full | * time out if the window is closed. After a full | ||||
* backoff, drop the connection if the idle time | * backoff, drop the connection if the idle time | ||||
* (no responses to probes) reaches the maximum | * (no responses to probes) reaches the maximum | ||||
* backoff that we would use if retransmitting. | * backoff that we would use if retransmitting. | ||||
* Also, drop the connection if we haven't been making | * Also, drop the connection if we haven't been making | ||||
* progress. | * progress. | ||||
*/ | */ | ||||
progdrop = tcp_maxunacktime_check(tp); | progdrop = tcp_maxunacktime_check(tp); | ||||
if (progdrop || (tp->t_rxtshift == TCP_MAXRXTSHIFT && | if (progdrop || (tp->t_rxtshift == TCP_MAXRXTSHIFT && | ||||
(ticks - tp->t_rcvtime >= tcp_maxpersistidle || | (ticks - tp->t_rcvtime >= tcp_maxpersistidle || | ||||
ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) { | ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) { | ||||
if (!progdrop) | if (!progdrop) | ||||
TCPSTAT_INC(tcps_persistdrop); | TCPSTAT_INC(tcps_persistdrop); | ||||
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); | tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); | ||||
tcp_timer_drop(tp); | goto dropit; | ||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | } | ||||
/* | /* | ||||
* If the user has closed the socket then drop a persisting | * If the user has closed the socket then drop a persisting | ||||
* connection after a much reduced timeout. | * connection after a much reduced timeout. | ||||
*/ | */ | ||||
if (tp->t_state > TCPS_CLOSE_WAIT && | if (tp->t_state > TCPS_CLOSE_WAIT && | ||||
(ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { | (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { | ||||
TCPSTAT_INC(tcps_persistdrop); | TCPSTAT_INC(tcps_persistdrop); | ||||
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); | tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); | ||||
tcp_timer_drop(tp); | goto dropit; | ||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | } | ||||
tcp_setpersist(tp); | tcp_setpersist(tp); | ||||
tp->t_flags |= TF_FORCEDATA; | tp->t_flags |= TF_FORCEDATA; | ||||
NET_EPOCH_ENTER(et); | NET_EPOCH_ENTER(et); | ||||
outrv = tcp_output_nodrop(tp); | if ((rv = tcp_output_locked(tp))) | ||||
tp->t_flags &= ~TF_FORCEDATA; | tp->t_flags &= ~TF_FORCEDATA; | ||||
NET_EPOCH_EXIT(et); | |||||
CURVNET_RESTORE(); | |||||
#ifdef TCPDEBUG | return (rv); | ||||
if (tp != NULL && tptosocket(tp)->so_options & SO_DEBUG) | |||||
tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); | dropit: | ||||
#endif | NET_EPOCH_ENTER(et); | ||||
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); | tp = tcp_drop(tp, ETIMEDOUT); | ||||
(void) tcp_unlock_or_drop(tp, outrv); | |||||
NET_EPOCH_EXIT(et); | NET_EPOCH_EXIT(et); | ||||
CURVNET_RESTORE(); | CURVNET_RESTORE(); | ||||
return (tp != NULL); | |||||
} | } | ||||
void | static bool | ||||
tcp_timer_rexmt(void * xtp) | tcp_timer_rexmt(struct tcpcb *tp) | ||||
{ | { | ||||
struct epoch_tracker et; | struct epoch_tracker et; | ||||
struct tcpcb *tp = xtp; | |||||
struct inpcb *inp = tptoinpcb(tp); | struct inpcb *inp = tptoinpcb(tp); | ||||
int rexmt, outrv; | int rexmt; | ||||
bool isipv6; | bool isipv6, rv; | ||||
#ifdef TCPDEBUG | |||||
int ostate; | |||||
ostate = tp->t_state; | INP_WLOCK_ASSERT(inp); | ||||
#endif | |||||
INP_WLOCK(inp); | TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); | ||||
CURVNET_SET(inp->inp_vnet); | CURVNET_SET(inp->inp_vnet); | ||||
if (callout_pending(&tp->tt_rexmt) || | |||||
!callout_active(&tp->tt_rexmt)) { | |||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | |||||
callout_deactivate(&tp->tt_rexmt); | |||||
if (inp->inp_flags & INP_DROPPED) { | |||||
INP_WUNLOCK(inp); | |||||
CURVNET_RESTORE(); | |||||
return; | |||||
} | |||||
KASSERT((tp->tt_flags & TT_STOPPED) == 0, | |||||
("%s: tp %p tcpcb can't be stopped here", __func__, tp)); | |||||
tcp_free_sackholes(tp); | tcp_free_sackholes(tp); | ||||
TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false); | TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false); | ||||
if (tp->t_fb->tfb_tcp_rexmit_tmr) { | if (tp->t_fb->tfb_tcp_rexmit_tmr) { | ||||
/* The stack has a timer action too. */ | /* The stack has a timer action too. */ | ||||
(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); | (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); | ||||
} | } | ||||
/* | /* | ||||
* Retransmission timer went off. Message has not | * Retransmission timer went off. Message has not | ||||
* been acked within retransmit interval. Back off | * been acked within retransmit interval. Back off | ||||
* to a longer retransmit interval and retransmit one segment. | * to a longer retransmit interval and retransmit one segment. | ||||
* | * | ||||
* If we've either exceeded the maximum number of retransmissions, | * If we've either exceeded the maximum number of retransmissions, | ||||
* or we've gone long enough without making progress, then drop | * or we've gone long enough without making progress, then drop | ||||
* the session. | * the session. | ||||
*/ | */ | ||||
if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || tcp_maxunacktime_check(tp)) { | if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || tcp_maxunacktime_check(tp)) { | ||||
if (tp->t_rxtshift > TCP_MAXRXTSHIFT) | if (tp->t_rxtshift > TCP_MAXRXTSHIFT) | ||||
TCPSTAT_INC(tcps_timeoutdrop); | TCPSTAT_INC(tcps_timeoutdrop); | ||||
tp->t_rxtshift = TCP_MAXRXTSHIFT; | tp->t_rxtshift = TCP_MAXRXTSHIFT; | ||||
tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); | tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); | ||||
tcp_timer_drop(tp); | NET_EPOCH_ENTER(et); | ||||
tp = tcp_drop(tp, ETIMEDOUT); | |||||
NET_EPOCH_EXIT(et); | |||||
CURVNET_RESTORE(); | CURVNET_RESTORE(); | ||||
return; | |||||
return (tp != NULL); | |||||
} | } | ||||
if (tp->t_state == TCPS_SYN_SENT) { | if (tp->t_state == TCPS_SYN_SENT) { | ||||
/* | /* | ||||
* If the SYN was retransmitted, indicate CWND to be | * If the SYN was retransmitted, indicate CWND to be | ||||
* limited to 1 segment in cc_conn_init(). | * limited to 1 segment in cc_conn_init(). | ||||
*/ | */ | ||||
tp->snd_cwnd = 1; | tp->snd_cwnd = 1; | ||||
} else if (tp->t_rxtshift == 1) { | } else if (tp->t_rxtshift == 1) { | ||||
▲ Show 20 Lines • Show All 200 Lines • ▼ Show 20 Lines | #endif | ||||
tp->t_flags |= TF_ACKNOW; | tp->t_flags |= TF_ACKNOW; | ||||
/* | /* | ||||
* If timing a segment in this window, stop the timer. | * If timing a segment in this window, stop the timer. | ||||
*/ | */ | ||||
tp->t_rtttime = 0; | tp->t_rtttime = 0; | ||||
cc_cong_signal(tp, NULL, CC_RTO); | cc_cong_signal(tp, NULL, CC_RTO); | ||||
NET_EPOCH_ENTER(et); | NET_EPOCH_ENTER(et); | ||||
outrv = tcp_output_nodrop(tp); | rv = tcp_output_locked(tp); | ||||
#ifdef TCPDEBUG | |||||
if (tp != NULL && (tptosocket(tp)->so_options & SO_DEBUG)) | |||||
tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, | |||||
PRU_SLOWTIMO); | |||||
#endif | |||||
TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); | |||||
(void) tcp_unlock_or_drop(tp, outrv); | |||||
NET_EPOCH_EXIT(et); | NET_EPOCH_EXIT(et); | ||||
CURVNET_RESTORE(); | CURVNET_RESTORE(); | ||||
return (rv); | |||||
} | } | ||||
void | static inline tt_which | ||||
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) | tcp_timer_next(struct tcpcb *tp) | ||||
{ | { | ||||
struct callout *t_callout; | tt_which i, rv; | ||||
callout_func_t *f_callout; | sbintime_t sbt; | ||||
struct inpcb *inp = tptoinpcb(tp); | |||||
int cpu = inp_to_cpuid(inp); | |||||
#ifdef TCP_OFFLOAD | for (i = 0, rv = TT_N, sbt = SBT_MAX; i < TT_N; i++) | ||||
if (tp->t_flags & TF_TOE) | if (tp->t_timers[i] < sbt) { | ||||
return; | sbt = tp->t_timers[i]; | ||||
#endif | rv = i; | ||||
} | |||||
if (tp->tt_flags & TT_STOPPED) | return (rv); | ||||
return; | |||||
switch (timer_type) { | |||||
case TT_DELACK: | |||||
t_callout = &tp->tt_delack; | |||||
f_callout = tcp_timer_delack; | |||||
break; | |||||
case TT_REXMT: | |||||
t_callout = &tp->tt_rexmt; | |||||
f_callout = tcp_timer_rexmt; | |||||
break; | |||||
case TT_PERSIST: | |||||
t_callout = &tp->tt_persist; | |||||
f_callout = tcp_timer_persist; | |||||
break; | |||||
case TT_KEEP: | |||||
t_callout = &tp->tt_keep; | |||||
f_callout = tcp_timer_keep; | |||||
break; | |||||
case TT_2MSL: | |||||
t_callout = &tp->tt_2msl; | |||||
f_callout = tcp_timer_2msl; | |||||
break; | |||||
default: | |||||
if (tp->t_fb->tfb_tcp_timer_activate) { | |||||
tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); | |||||
return; | |||||
} | } | ||||
panic("tp %p bad timer_type %#x", tp, timer_type); | |||||
} | |||||
if (delta == 0) { | |||||
callout_stop(t_callout); | |||||
} else { | |||||
callout_reset_on(t_callout, delta, f_callout, tp, cpu); | |||||
} | |||||
} | |||||
int | static void | ||||
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) | tcp_timer_enter(void *xtp) | ||||
{ | { | ||||
struct callout *t_callout; | struct tcpcb *tp = xtp; | ||||
struct inpcb *inp = tptoinpcb(tp); | |||||
tt_which which; | |||||
switch (timer_type) { | INP_WLOCK_ASSERT(inp); | ||||
case TT_DELACK: | MPASS((curthread->td_pflags & TDP_INTCPCALLOUT) == 0); | ||||
t_callout = &tp->tt_delack; | |||||
break; | curthread->td_pflags |= TDP_INTCPCALLOUT; | ||||
case TT_REXMT: | |||||
t_callout = &tp->tt_rexmt; | which = tcp_timer_next(tp); | ||||
break; | MPASS(which < TT_N); | ||||
case TT_PERSIST: | tp->t_timers[which] = SBT_MAX; | ||||
t_callout = &tp->tt_persist; | |||||
break; | if (tcp_timersw[which](tp)) { | ||||
case TT_KEEP: | if ((which = tcp_timer_next(tp)) != TT_N) { | ||||
t_callout = &tp->tt_keep; | callout_reset_sbt_on(&tp->t_callout, | ||||
break; | tp->t_timers[which], 0, tcp_timer_enter, tp, | ||||
case TT_2MSL: | inp_to_cpuid(inp), C_ABSOLUTE); | ||||
t_callout = &tp->tt_2msl; | |||||
break; | |||||
default: | |||||
if (tp->t_fb->tfb_tcp_timer_active) { | |||||
return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); | |||||
} | } | ||||
panic("tp %p bad timer_type %#x", tp, timer_type); | INP_WUNLOCK(inp); | ||||
} | } | ||||
return callout_active(t_callout); | |||||
curthread->td_pflags &= ~TDP_INTCPCALLOUT; | |||||
} | } | ||||
static void | /* | ||||
tcp_timer_discard(void *ptp) | * Activate or stop (ticks == 0) a TCP timer. | ||||
*/ | |||||
void | |||||
tcp_timer_activate(struct tcpcb *tp, tt_which which, u_int ticks) | |||||
{ | { | ||||
struct epoch_tracker et; | |||||
struct tcpcb *tp = (struct tcpcb *)ptp; | |||||
struct inpcb *inp = tptoinpcb(tp); | struct inpcb *inp = tptoinpcb(tp); | ||||
INP_WLOCK(inp); | #ifdef TCP_OFFLOAD | ||||
kib: Shouldn't you assert that TDP_INCALLOUT is not set there?
If not, perhaps you need to… | |||||
CURVNET_SET(inp->inp_vnet); | if (tp->t_flags & TF_TOE) | ||||
NET_EPOCH_ENTER(et); | return; | ||||
#endif | |||||
Not Done Inline ActionsDon't you need to execute all timers which t_timers value is less or equal to the current moment? kib: Don't you need to execute all timers which t_timers value is less or equal to the current… | |||||
Done Inline ActionsIf two timers alias to the same sbt, or if we race with callout thread, providing sbt argument from the past, the callout(9) system will self correct and schedule to next slot. It is in the very beginning of callout_cc_add(). glebius: If two timers alias to the same sbt, or if we race with callout thread, providing sbt argument… | |||||
Not Done Inline ActionsMight be, but why go through all the innards of callout_reset(), in particular, take at least two spinlocks etc? kib: Might be, but why go through all the innards of callout_reset(), in particular, take at least… | |||||
Done Inline Actions
The probability of two timers aliasing to the same sbt is very low. The cost of rechecking for aliased timer every time would eat more CPU cycles than going through callout_reset spinlocks once in a while. P.S. We might rethink that if we reduce precision of TCP timers. glebius: > Might be, but why go through all the innards of callout_reset(), in particular, take at least… | |||||
KASSERT((tp->tt_flags & TT_STOPPED) != 0, | INP_WLOCK_ASSERT(inp); | ||||
("%s: tcpcb has to be stopped here", __func__)); | |||||
if (--tp->tt_draincnt > 0 || | if (ticks > 0) | ||||
tcp_freecb(tp) == false) | tp->t_timers[which] = sbinuptime() + tick_sbt * ticks; | ||||
hselaskyUnsubmitted Not Done Inline ActionsHi, It might be an idea to align sbinuptime() to tick_sbt, before doing that addition. This way you avoid firing the timer more than needed. Instead increase kern.hz ? --HPS hselasky: Hi,
It might be an idea to align sbinuptime() to tick_sbt, before doing that addition. This… | |||||
glebiusAuthorUnsubmitted Done Inline Actions
Sorry, can't understand. What do you mean with "align sbinuptime() to tick_sbt"? glebius: > It might be an idea to align sbinuptime() to tick_sbt, before doing that addition. This way… | |||||
hselaskyUnsubmitted Not Done Inline ActionsThis code: sbinuptime() + tick_sbt * ticks; Should be written like: sbt_t sbt = sbinuptime(); sbt - (sbt % tick_sbt) + tick_sbt * ticks; Please also use another variable name than "ticks", because that is the name of a global variable!
I think you don't see that the timer code will fire multiple timeouts if you don't align the absolute timeout value you pass to callout_reset_sbt(). When you don't align the sbt value to tick_sbt, you risk the the "random" remainder, will lead to "N * kern.hz" callout IRQ's on the given CPU instead of only limited by "kern.hz". @mav : Can you help explain this obvious thing? hselasky: This code:
```
sbinuptime() + tick_sbt * ticks;
```
Should be written like:
```
sbt_t sbt =… | |||||
glebiusAuthorUnsubmitted Not Done Inline Actions
glebius: > Hi,
>
> It might be an idea to align sbinuptime() to tick_sbt, before doing that addition. | |||||
INP_WUNLOCK(inp); | else | ||||
NET_EPOCH_EXIT(et); | tp->t_timers[which] = SBT_MAX; | ||||
CURVNET_RESTORE(); | |||||
if ((which = tcp_timer_next(tp)) != TT_N) | |||||
callout_reset_sbt_on(&tp->t_callout, tp->t_timers[which], 0, | |||||
tcp_timer_enter, tp, inp_to_cpuid(inp), C_ABSOLUTE); | |||||
else | |||||
callout_stop(&tp->t_callout); | |||||
} | } | ||||
void | bool | ||||
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) | tcp_timer_active(struct tcpcb *tp, tt_which which) | ||||
{ | { | ||||
struct callout *t_callout; | |||||
tp->tt_flags |= TT_STOPPED; | INP_WLOCK_ASSERT(tptoinpcb(tp)); | ||||
switch (timer_type) { | |||||
case TT_DELACK: | return (tp->t_timers[which] != SBT_MAX); | ||||
t_callout = &tp->tt_delack; | |||||
break; | |||||
case TT_REXMT: | |||||
t_callout = &tp->tt_rexmt; | |||||
break; | |||||
case TT_PERSIST: | |||||
t_callout = &tp->tt_persist; | |||||
break; | |||||
case TT_KEEP: | |||||
t_callout = &tp->tt_keep; | |||||
break; | |||||
case TT_2MSL: | |||||
t_callout = &tp->tt_2msl; | |||||
break; | |||||
default: | |||||
if (tp->t_fb->tfb_tcp_timer_stop) { | |||||
/* | |||||
* XXXrrs we need to look at this with the | |||||
* stop case below (flags). | |||||
*/ | |||||
tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); | |||||
return; | |||||
} | } | ||||
panic("tp %p bad timer_type %#x", tp, timer_type); | |||||
} | |||||
if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { | |||||
/* | /* | ||||
* Can't stop the callout, defer tcpcb actual deletion | * Stop all timers associated with tcpcb. | ||||
* to the last one. We do this using the async drain | * | ||||
* function and incrementing the count in | * Called only on tcpcb destruction. The tcpcb shall already be dropped from | ||||
* the pcb lookup database and socket is not losing the last reference. | |||||
* | |||||
* XXXGL: unfortunately our callout(9) is not able to fully stop a locked | |||||
* callout even when only two threads are involved: the callout itself and the | |||||
* thread that does callout_stop(). See where softclock_call_cc() swaps the | |||||
* callwheel lock to callout lock and then checks cc_exec_cancel(). This is | |||||
* the race window. If it happens, the tcp_timer_enter() won't be executed, | |||||
* however pcb lock will be locked and released, hence we can't free memory. | |||||
* Until callout(9) is improved, just keep retrying. In my profiling I've seen | |||||
* such event happening less than 1 time per hour with 20-30 Gbit/s of traffic. | |||||
*/ | */ | ||||
tp->tt_draincnt++; | void | ||||
tcp_timer_stop(struct tcpcb *tp) | |||||
{ | |||||
struct inpcb *inp = tptoinpcb(tp); | |||||
INP_WLOCK_ASSERT(inp); | |||||
if (curthread->td_pflags & TDP_INTCPCALLOUT) { | |||||
int stopped __diagused; | |||||
stopped = callout_stop(&tp->t_callout); | |||||
MPASS(stopped == 0); | |||||
} else while(__predict_false(callout_stop(&tp->t_callout) == 0)) { | |||||
hselaskyUnsubmitted Not Done Inline ActionsThis is wrong and might theoretically lead to live locks. Why not teach the callout subsystem about SMR and NET-EPOCH? hselasky: This is wrong and might theoretically lead to live locks. Why not teach the callout subsystem… | |||||
INP_WUNLOCK(inp); | |||||
kern_yield(PRI_UNCHANGED); | |||||
INP_WLOCK(inp); | |||||
} | } | ||||
} | } |
Shouldn't you assert that TDP_INCALLOUT is not set there?
If not, perhaps you need to curthread_pflags_set()/restore().