Page MenuHomeFreeBSD

Restore_multi_threaded_callouts_in_the_TCP_stack

Authored By
hselasky
Jan 20 2015, 7:22 AM
Size
12 KB
Referenced Files
None
Subscribers
None

Restore_multi_threaded_callouts_in_the_TCP_stack

Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c (revision 277315)
+++ sys/netinet/tcp_subr.c (working copy)
@@ -790,11 +790,16 @@
V_tcp_mssdflt;
/* Set up our timeouts. */
- callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE);
- callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE);
- callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE);
- callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE);
- callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE);
+ callout_init_rw(&tp->t_timers->tt_rexmt,
+ &inp->inp_lock, CALLOUT_RETURNUNLOCKED);
+ callout_init_rw(&tp->t_timers->tt_persist,
+ &inp->inp_lock, CALLOUT_RETURNUNLOCKED);
+ callout_init_rw(&tp->t_timers->tt_keep,
+ &inp->inp_lock, CALLOUT_RETURNUNLOCKED);
+ callout_init_rw(&tp->t_timers->tt_2msl,
+ &inp->inp_lock, CALLOUT_RETURNUNLOCKED);
+ callout_init_rw(&tp->t_timers->tt_delack,
+ &inp->inp_lock, CALLOUT_RETURNUNLOCKED);
if (V_tcp_do_rfc1323)
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
@@ -888,6 +893,32 @@
}
/*
+ * Switch locks to avoid LOR
+ * Return values:
+ * 0: No lock switching needed
+ * 1: Success
+ * 2: Failure and all locks dropped
+ */
+int
+tcp_switch_locks(struct inpcb *inp)
+{
+ INP_WLOCK_ASSERT(inp);
+
+ if (rw_wowned(&V_tcbinfo.ipi_lock))
+ return (0);
+
+ in_pcbref(inp);
+ INP_WUNLOCK(inp);
+ INP_INFO_WLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (2);
+ }
+ return (1);
+}
+
+/*
* Drop a TCP connection, reporting
* the specified error. If connection is synchronized,
* then send a RST to peer.
@@ -896,10 +927,17 @@
tcp_drop(struct tcpcb *tp, int errno)
{
struct socket *so = tp->t_inpcb->inp_socket;
+ struct tcpcb *retval;
+ int do_unlock;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tp->t_inpcb);
+ do_unlock = tcp_switch_locks(tp->t_inpcb);
+ if (do_unlock == 2)
+ return (NULL);
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tcp_state_change(tp, TCPS_CLOSED);
(void) tcp_output(tp);
@@ -909,9 +947,34 @@
if (errno == ETIMEDOUT && tp->t_softerror)
errno = tp->t_softerror;
so->so_error = errno;
- return (tcp_close(tp));
+ retval = tcp_close(tp);
+ if (do_unlock)
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ return (retval);
}
+static void
+tcp_callout_drain(void *arg)
+{
+ struct tcpcb *tp = arg;
+ struct inpcb *inp;
+
+ inp = tp->t_inpcb;
+
+ INP_WLOCK(inp);
+ if (--(tp->t_calloutref) != 0) {
+ INP_WUNLOCK(inp);
+ return;
+ }
+ inp->inp_ppcb = NULL;
+ if (in_pcbrele_wlocked(inp) == 0)
+ INP_WUNLOCK(inp);
+
+ CC_ALGO(tp) = NULL;
+ tp->t_inpcb = NULL;
+ uma_zfree(V_tcpcb_zone, tp);
+}
+
void
tcp_discardcb(struct tcpcb *tp)
{
@@ -924,26 +987,25 @@
INP_WLOCK_ASSERT(inp);
/*
- * Make sure that all of our timers are stopped before we delete the
- * PCB.
- *
- * XXXRW: Really, we would like to use callout_drain() here in order
- * to avoid races experienced in tcp_timer.c where a timer is already
- * executing at this point. However, we can't, both because we're
- * running in a context where we can't sleep, and also because we
- * hold locks required by the timers. What we instead need to do is
- * test to see if callout_drain() is required, and if so, defer some
- * portion of the remainder of tcp_discardcb() to an asynchronous
- * context that can callout_drain() and then continue. Some care
- * will be required to ensure that no further processing takes place
- * on the tcpcb, even though it hasn't been freed (a flag?).
+ * Make sure that all of our timers are stopped before we
+ * delete the PCB:
*/
- callout_stop(&tp->t_timers->tt_rexmt);
- callout_stop(&tp->t_timers->tt_persist);
- callout_stop(&tp->t_timers->tt_keep);
- callout_stop(&tp->t_timers->tt_2msl);
- callout_stop(&tp->t_timers->tt_delack);
+ tp->t_calloutref = 6; /* set initial value */
+
+ if (callout_drain_async(&tp->t_timers->tt_rexmt, &tcp_callout_drain, tp) == 0)
+ tp->t_calloutref--;
+ if (callout_drain_async(&tp->t_timers->tt_persist, &tcp_callout_drain, tp) == 0)
+ tp->t_calloutref--;
+ if (callout_drain_async(&tp->t_timers->tt_keep, &tcp_callout_drain, tp) == 0)
+ tp->t_calloutref--;
+ if (callout_drain_async(&tp->t_timers->tt_2msl, &tcp_callout_drain, tp) == 0)
+ tp->t_calloutref--;
+ if (callout_drain_async(&tp->t_timers->tt_delack, &tcp_callout_drain, tp) == 0)
+ tp->t_calloutref--;
+ /* this ref is taken by the final tcp_callout_drain() */
+ in_pcbref(inp);
+
/*
* If we got enough samples through the srtt filter,
* save the rtt and rttvar in the routing entry.
@@ -1017,10 +1079,7 @@
khelp_destroy_osd(tp->osd);
- CC_ALGO(tp) = NULL;
- inp->inp_ppcb = NULL;
- tp->t_inpcb = NULL;
- uma_zfree(V_tcpcb_zone, tp);
+ tcp_callout_drain(tp);
}
/*
@@ -1032,10 +1091,16 @@
{
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
+ int do_unlock;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
+ do_unlock = tcp_switch_locks(tp->t_inpcb);
+ if (do_unlock == 2)
+ return (NULL);
+
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
#ifdef TCP_OFFLOAD
if (tp->t_state == TCPS_LISTEN)
tcp_offload_listen_stop(tp);
@@ -1054,8 +1119,10 @@
SOCK_LOCK(so);
so->so_state &= ~SS_PROTOREF;
sofree(so);
- return (NULL);
+ tp = NULL;
}
+ if (do_unlock)
+ INP_INFO_WUNLOCK(&V_tcbinfo);
return (tp);
}
Index: sys/netinet/tcp_timer.c
===================================================================
--- sys/netinet/tcp_timer.c (revision 277315)
+++ sys/netinet/tcp_timer.c (working copy)
@@ -273,25 +273,7 @@
CURVNET_SET(tp->t_vnet);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- CURVNET_RESTORE();
- return;
- }
- INP_WLOCK(inp);
- if (callout_pending(&tp->t_timers->tt_delack) ||
- !callout_active(&tp->t_timers->tt_delack)) {
- INP_WUNLOCK(inp);
- CURVNET_RESTORE();
- return;
- }
+
callout_deactivate(&tp->t_timers->tt_delack);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
@@ -317,37 +299,11 @@
ostate = tp->t_state;
#endif
- /*
- * XXXRW: Does this actually happen?
- */
- INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
- INP_WLOCK(inp);
tcp_free_sackholes(tp);
- if (callout_pending(&tp->t_timers->tt_2msl) ||
- !callout_active(&tp->t_timers->tt_2msl)) {
- INP_WUNLOCK(tp->t_inpcb);
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
callout_deactivate(&tp->t_timers->tt_2msl);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -383,7 +339,6 @@
#endif
if (tp != NULL)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
@@ -399,33 +354,10 @@
ostate = tp->t_state;
#endif
- INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
- INP_WLOCK(inp);
- if (callout_pending(&tp->t_timers->tt_keep) ||
- !callout_active(&tp->t_timers->tt_keep)) {
- INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
callout_deactivate(&tp->t_timers->tt_keep);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -472,7 +404,6 @@
PRU_SLOWTIMO);
#endif
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
@@ -487,7 +418,6 @@
#endif
if (tp != NULL)
INP_WUNLOCK(tp->t_inpcb);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
@@ -502,33 +432,11 @@
ostate = tp->t_state;
#endif
- INP_INFO_WLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
- INP_WLOCK(inp);
- if (callout_pending(&tp->t_timers->tt_persist) ||
- !callout_active(&tp->t_timers->tt_persist)) {
- INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+
callout_deactivate(&tp->t_timers->tt_persist);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -573,7 +481,6 @@
#endif
if (tp != NULL)
INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
@@ -583,7 +490,7 @@
struct tcpcb *tp = xtp;
CURVNET_SET(tp->t_vnet);
int rexmt;
- int headlocked;
+ int do_unlock = 0;
struct inpcb *inp;
#ifdef TCPDEBUG
int ostate;
@@ -590,34 +497,10 @@
ostate = tp->t_state;
#endif
-
- INP_INFO_RLOCK(&V_tcbinfo);
inp = tp->t_inpcb;
- /*
- * XXXRW: While this assert is in fact correct, bugs in the tcpcb
- * tear-down mean we need it as a work-around for races between
- * timers and tcp_discardcb().
- *
- * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL"));
- */
- if (inp == NULL) {
- tcp_timer_race++;
- INP_INFO_RUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
- INP_WLOCK(inp);
- if (callout_pending(&tp->t_timers->tt_rexmt) ||
- !callout_active(&tp->t_timers->tt_rexmt)) {
- INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
callout_deactivate(&tp->t_timers->tt_rexmt);
if ((inp->inp_flags & INP_DROPPED) != 0) {
INP_WUNLOCK(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
return;
}
@@ -630,30 +513,18 @@
if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
tp->t_rxtshift = TCP_MAXRXTSHIFT;
TCPSTAT_INC(tcps_timeoutdrop);
- in_pcbref(inp);
- INP_INFO_RUNLOCK(&V_tcbinfo);
- INP_WUNLOCK(inp);
- INP_INFO_WLOCK(&V_tcbinfo);
- INP_WLOCK(inp);
- if (in_pcbrele_wlocked(inp)) {
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ do_unlock = tcp_switch_locks(inp);
+ if (do_unlock == 2) {
CURVNET_RESTORE();
return;
}
- if (inp->inp_flags & INP_DROPPED) {
- INP_WUNLOCK(inp);
- INP_INFO_WUNLOCK(&V_tcbinfo);
- CURVNET_RESTORE();
- return;
- }
+ if (inp->inp_flags & INP_DROPPED)
+ goto out;
tp = tcp_drop(tp, tp->t_softerror ?
tp->t_softerror : ETIMEDOUT);
- headlocked = 1;
goto out;
}
- INP_INFO_RUNLOCK(&V_tcbinfo);
- headlocked = 0;
if (tp->t_state == TCPS_SYN_SENT) {
/*
* If the SYN was retransmitted, indicate CWND to be
@@ -843,7 +714,7 @@
#endif
if (tp != NULL)
INP_WUNLOCK(inp);
- if (headlocked)
+ if (do_unlock)
INP_INFO_WUNLOCK(&V_tcbinfo);
CURVNET_RESTORE();
}
@@ -856,6 +727,8 @@
struct inpcb *inp = tp->t_inpcb;
int cpu = inp_to_cpuid(inp);
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
return;
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h (revision 277315)
+++ sys/netinet/tcp_var.h (working copy)
@@ -153,6 +153,7 @@
u_long max_sndwnd; /* largest window peer has offered */
int t_softerror; /* possible error not yet reported */
+ u_char t_calloutref; /* callout drain references */
/* out-of-band data */
char t_oobflags; /* have some */
char t_iobc; /* input character */
@@ -654,6 +655,7 @@
void tcp_twclose(struct tcptw *, int);
void tcp_ctlinput(int, struct sockaddr *, void *);
int tcp_ctloutput(struct socket *, struct sockopt *);
+int tcp_switch_locks(struct inpcb *inp);
struct tcpcb *
tcp_drop(struct tcpcb *, int);
void tcp_drain(void);

File Metadata

Mime Type
text/plain; charset=utf-8
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
72967
Default Alt Text
Restore_multi_threaded_callouts_in_the_TCP_stack (12 KB)

Event Timeline