Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F161594270
D32897.id98211.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
16 KB
Referenced Files
None
Subscribers
None
D32897.id98211.diff
View Options
Index: sys/netinet/tcp_stacks/rack.c
===================================================================
--- sys/netinet/tcp_stacks/rack.c
+++ sys/netinet/tcp_stacks/rack.c
@@ -205,6 +205,7 @@
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_do_hystart = 0;
+static int32_t rack_apply_rtt_with_reduced_conf = 0;
static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
@@ -343,6 +344,10 @@
counter_u64_t rack_per_timer_hole;
counter_u64_t rack_large_ackcmp;
counter_u64_t rack_small_ackcmp;
+counter_u64_t rack_persists_sends;
+counter_u64_t rack_persists_acks;
+counter_u64_t rack_persists_loss;
+counter_u64_t rack_persists_lost_ends;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
@@ -772,6 +777,10 @@
counter_u64_zero(rack_per_timer_hole);
counter_u64_zero(rack_large_ackcmp);
counter_u64_zero(rack_small_ackcmp);
+ counter_u64_zero(rack_persists_sends);
+ counter_u64_zero(rack_persists_acks);
+ counter_u64_zero(rack_persists_loss);
+ counter_u64_zero(rack_persists_lost_ends);
#ifdef INVARIANTS
counter_u64_zero(rack_adjust_map_bw);
#endif
@@ -1412,6 +1421,11 @@
&rack_tcp_accounting, 0,
"Should we turn on TCP accounting for all rack sessions?");
#endif
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
+ &rack_apply_rtt_with_reduced_conf, 0,
+ "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
@@ -1774,6 +1788,30 @@
OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD,
&rack_large_ackcmp,
"Number of TCP connections with large mbuf's for compressed acks");
+ rack_persists_sends = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_sends", CTLFLAG_RD,
+ &rack_persists_sends,
+ "Number of times we sent a persist probe");
+ rack_persists_acks = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_acks", CTLFLAG_RD,
+ &rack_persists_acks,
+ "Number of times a persist probe was acked");
+ rack_persists_loss = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_loss", CTLFLAG_RD,
+ &rack_persists_loss,
+ "Number of times we detected a lost persist probe (no ack)");
+ rack_persists_lost_ends = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_loss_ends", CTLFLAG_RD,
+ &rack_persists_lost_ends,
+ "Number of lost persist probe (no ack) that the run ended with a PERSIST abort");
rack_small_ackcmp = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -2938,6 +2976,10 @@
counter_u64_free(rack_per_timer_hole);
counter_u64_free(rack_large_ackcmp);
counter_u64_free(rack_small_ackcmp);
+ counter_u64_free(rack_persists_sends);
+ counter_u64_free(rack_persists_acks);
+ counter_u64_free(rack_persists_loss);
+ counter_u64_free(rack_persists_lost_ends);
#ifdef INVARIANTS
counter_u64_free(rack_adjust_map_bw);
#endif
@@ -5623,6 +5665,9 @@
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
rack_timer_cancel(tp, rack, cts, __LINE__);
+ rack->r_ctl.persist_lost_ends = 0;
+ rack->probe_not_answered = 0;
+ rack->forced_ack = 0;
tp->t_rxtshift = 0;
RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
@@ -6494,6 +6539,7 @@
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
tcp_set_inp_to_drop(inp, ETIMEDOUT);
+ counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
return (1);
}
KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
@@ -6515,6 +6561,7 @@
retval = 1;
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+ counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
goto out;
}
if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
@@ -6531,6 +6578,7 @@
KMOD_TCPSTAT_INC(tcps_persistdrop);
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+ counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
goto out;
}
t_template = tcpip_maketemplate(rack->rc_inp);
@@ -6539,7 +6587,12 @@
if (rack->forced_ack == 0) {
rack->forced_ack = 1;
rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+ } else {
+ rack->probe_not_answered = 1;
+ counter_u64_add(rack_persists_loss, 1);
+ rack->r_ctl.persist_lost_ends++;
}
+ counter_u64_add(rack_persists_sends, 1);
tcp_respond(tp, t_template->tt_ipgen,
&t_template->tt_t, (struct mbuf *)NULL,
tp->rcv_nxt, tp->snd_una - 1, 0);
@@ -6602,6 +6655,8 @@
if (rack->forced_ack == 0) {
rack->forced_ack = 1;
rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+ } else {
+ rack->probe_not_answered = 1;
}
tcp_respond(tp, t_template->tt_ipgen,
&t_template->tt_t, (struct mbuf *)NULL,
@@ -10301,6 +10356,14 @@
INP_WLOCK_ASSERT(tp->t_inpcb);
acked = BYTES_THIS_ACK(tp, th);
+ if (acked) {
+ /*
+ * Any time the cum-ack moves forward, clear the
+ * probe-not-answered flag tied to keep-alives.
+ * Persist mode clears its own flag on entry.
+ */
+ rack->probe_not_answered = 0;
+ }
KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
/*
@@ -13374,6 +13437,61 @@
}
+static void
+rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
+{
+ uint32_t us_rtt;
+ /*
+ * A persist or keep-alive was forced out, update our
+ * min rtt time. Note we now worry about lost responses.
+ * When a subsequent keep-alive or persist times out
+ * and forced_ack is still on, then the last probe
+ * was not responded to. In such cases we have a
+ * sysctl that controls the behavior. Either we apply
+ * the rtt but with reduced confidence (0). Or we just
+ * plain don't apply the rtt estimate. Having data flow
+ * will clear the probe_not_answered flag i.e. cum-ack
+ * move forward <or> exiting and reentering persists.
+ */
+
+ rack->forced_ack = 0;
+ rack->rc_tp->t_rxtshift = 0;
+ if ((rack->rc_in_persist &&
+ (tiwin == rack->rc_tp->snd_wnd)) ||
+ (rack->rc_in_persist == 0)) {
+ /*
+ * In persists only apply the RTT update if this is
+ * a response to our window probe. And that
+ * means the rwnd sent must match the current
+ * snd_wnd. If it does not, then we got a
+ * window update ack instead. For keepalive
+ * we allow the answer no matter what the window.
+ *
+ * Note that if the probe_not_answered is set then
+ * the forced_ack_ts is the oldest one i.e. the first
+ * probe sent that might have been lost. This assures
+ * us that if we do calculate an RTT it is the longer
+ * one, not an artificially short one.
+ */
+ if (rack->rc_in_persist)
+ counter_u64_add(rack_persists_acks, 1);
+ us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
+ if (us_rtt == 0)
+ us_rtt = 1;
+ if (rack->probe_not_answered == 0) {
+ rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+ tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
+ } else {
+ /* We have a retransmitted probe here too */
+ if (rack_apply_rtt_with_reduced_conf) {
+ rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+ tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1);
+ }
+ }
+ }
+}
+
+
static int
rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
{
@@ -13483,7 +13601,7 @@
} else if (SEQ_GT(ae->ack, high_seq)) {
/* Case A */
ae->ack_val_set = ACK_CUMACK;
- } else if (tiwin == the_win) {
+ } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){
/* Case D */
ae->ack_val_set = ACK_DUPACK;
} else {
@@ -13596,6 +13714,18 @@
rack_strike_dupack(rack);
} else if (ae->ack_val_set == ACK_RWND) {
/* Case C */
+ if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
+ ts.tv_sec = ae->timestamp / 1000000000;
+ ts.tv_nsec = ae->timestamp % 1000000000;
+ rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
+ rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
+ } else {
+ rack->r_ctl.act_rcv_time = *tv;
+ }
+ if (rack->forced_ack) {
+ rack_handle_probe_response(rack, tiwin,
+ tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
+ }
win_up_req = 1;
win_upd_ack = ae->ack;
win_seq = ae->seq;
@@ -13677,6 +13807,11 @@
#endif
acked_amount = acked = (high_seq - tp->snd_una);
if (acked) {
+ /*
+ * Clear the probe not answered flag
+ * since cum-ack moved forward.
+ */
+ rack->probe_not_answered = 0;
if (rack->sack_attack_disable == 0)
rack_do_decay(rack);
if (acked >= segsiz) {
@@ -14432,31 +14567,7 @@
}
rack_clear_rate_sample(rack);
if (rack->forced_ack) {
- uint32_t us_rtt;
-
- /*
- * A persist or keep-alive was forced out, update our
- * min rtt time. Note we do not worry about lost
- * retransmissions since KEEP-ALIVES and persists
- * are usually way long on times of sending (though
- * if we were really paranoid or worried we could
- * at least use timestamps if available to validate).
- */
- rack->forced_ack = 0;
- if (tiwin == tp->snd_wnd) {
- /*
- * Only apply the RTT update if this is
- * a response to our window probe. And that
- * means the rwnd sent must match the current
- * snd_wnd. If it does not, then we got a
- * window update ack instead.
- */
- us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
- if (us_rtt == 0)
- us_rtt = 1;
- rack_apply_updated_usrtt(rack, us_rtt, us_cts);
- tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
- }
+ rack_handle_probe_response(rack, tiwin, us_cts);
}
/*
* This is the one exception case where we set the rack state
Index: sys/netinet/tcp_stacks/tcp_rack.h
===================================================================
--- sys/netinet/tcp_stacks/tcp_rack.h
+++ sys/netinet/tcp_stacks/tcp_rack.h
@@ -496,6 +496,7 @@
uint32_t challenge_ack_cnt;
uint32_t rc_min_to; /* Socket option value Lock(a) */
uint32_t rc_pkt_delay; /* Socket option value Lock(a) */
+ uint32_t persist_lost_ends;
struct newreno rc_saved_beta; /*
* For newreno cc:
* rc_saved_cc are the values we have had
@@ -567,7 +568,8 @@
rc_last_tlp_past_cumack: 1,
rc_last_sent_tlp_seq_valid: 1,
rc_last_sent_tlp_past_cumack: 1,
- avail_bytes : 3;
+ probe_not_answered: 1,
+ avail_bytes : 2;
uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */
rtt_limit_mul : 4, /* muliply this by low rtt */
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -1748,6 +1748,7 @@
struct mbuf *optm;
struct udphdr *uh = NULL;
struct tcphdr *nth;
+ struct tcp_log_buffer *lgb;
u_char *optp;
#ifdef INET6
struct ip6_hdr *ip6;
@@ -1756,6 +1757,7 @@
int optlen, tlen, win, ulen;
bool incl_opts;
uint16_t port;
+ int output_ret;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
NET_EPOCH_ASSERT();
@@ -2086,11 +2088,26 @@
TCP_PROBE3(debug__output, tp, th, m);
if (flags & TH_RST)
TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);
+ if ((tp != NULL) && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = tp->t_inpcb->inp_in_hpts;
+ log.u_bbr.ininput = tp->t_inpcb->inp_in_input;
+ log.u_bbr.flex8 = 4;
+ log.u_bbr.pkts_out = tp->t_maxseg;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.delivered = 0;
+ lgb = tcp_log_event_(tp, nth, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ } else
+ lgb = NULL;
#ifdef INET6
if (isipv6) {
TCP_PROBE5(send, NULL, tp, ip6, tp, nth);
- (void)ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
+ output_ret = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
@@ -2099,9 +2116,13 @@
#ifdef INET
{
TCP_PROBE5(send, NULL, tp, ip, tp, nth);
- (void)ip_output(m, NULL, NULL, 0, NULL, inp);
+ output_ret = ip_output(m, NULL, NULL, 0, NULL, inp);
}
#endif
+ if (lgb) {
+ lgb->tlb_errno = output_ret;
+ lgb = NULL;
+ }
}
/*
@@ -2137,8 +2158,9 @@
*/
CC_LIST_RLOCK();
KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!"));
- CC_ALGO(tp) = CC_DEFAULT();
+ CC_ALGO(tp) = CC_DEFAULT_ALGO();
CC_LIST_RUNLOCK();
+
/*
* The tcpcb will hold a reference on its inpcb until tcp_discardcb()
* is called.
@@ -2147,7 +2169,7 @@
tp->t_inpcb = inp;
if (CC_ALGO(tp)->cb_init != NULL)
- if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
+ if (CC_ALGO(tp)->cb_init(tp->ccv, NULL) > 0) {
if (tp->t_fb->tfb_tcp_fb_fini)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
in_pcbrele_wlocked(inp);
@@ -2240,25 +2262,23 @@
}
/*
- * Switch the congestion control algorithm back to NewReno for any active
- * control blocks using an algorithm which is about to go away.
- * This ensures the CC framework can allow the unload to proceed without leaving
- * any dangling pointers which would trigger a panic.
- * Returning non-zero would inform the CC framework that something went wrong
- * and it would be unsafe to allow the unload to proceed. However, there is no
- * way for this to occur with this implementation so we always return zero.
+ * Switch the congestion control algorithm back to Vnet default for any active
+ * control blocks using an algorithm which is about to go away. If the algorithm
+ * has a cb_init function and it fails (no memory) then the operation fails and
+ * the unload will not succeed.
+ *
*/
int
tcp_ccalgounload(struct cc_algo *unload_algo)
{
- struct cc_algo *tmpalgo;
+ struct cc_algo *oldalgo, *newalgo;
struct inpcb *inp;
struct tcpcb *tp;
VNET_ITERATOR_DECL(vnet_iter);
/*
* Check all active control blocks across all network stacks and change
- * any that are using "unload_algo" back to NewReno. If "unload_algo"
+ * any that are using "unload_algo" back to the vnet default. If "unload_algo"
* requires cleanup code to be run, call it.
*/
VNET_LIST_RLOCK();
@@ -2272,6 +2292,7 @@
* therefore don't enter the loop below until the connection
* list has stabilised.
*/
+ newalgo = CC_DEFAULT_ALGO();
CK_LIST_FOREACH(inp, &V_tcb, inp_list) {
INP_WLOCK(inp);
/* Important to skip tcptw structs. */
@@ -2280,24 +2301,48 @@
/*
* By holding INP_WLOCK here, we are assured
* that the connection is not currently
- * executing inside the CC module's functions
- * i.e. it is safe to make the switch back to
- * NewReno.
+ * executing inside the CC module's functions.
+ * We attempt to switch to the vnet's default,
+ * if the init fails then we fail the whole
+ * operation and the module unload will fail.
*/
if (CC_ALGO(tp) == unload_algo) {
- tmpalgo = CC_ALGO(tp);
- if (tmpalgo->cb_destroy != NULL)
- tmpalgo->cb_destroy(tp->ccv);
- CC_DATA(tp) = NULL;
- /*
- * NewReno may allocate memory on
- * demand for certain stateful
- * configuration as needed, but is
- * coded to never fail on memory
- * allocation failure so it is a safe
- * fallback.
- */
- CC_ALGO(tp) = &newreno_cc_algo;
+ struct cc_var cc_mem;
+ int err;
+
+ oldalgo = CC_ALGO(tp);
+ memset(&cc_mem, 0, sizeof(cc_mem));
+ cc_mem.ccvc.tcp = tp;
+ if (newalgo->cb_init == NULL) {
+ /*
+ * No init function, so we can skip
+ * the dance around a possible failure.
+ */
+ CC_DATA(tp) = NULL;
+ goto proceed;
+ }
+ err = (newalgo->cb_init)(&cc_mem, NULL);
+ if (err) {
+ /*
+ * Presumably out of memory; the caller
+ * will need to try again.
+ */
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ VNET_LIST_RUNLOCK();
+ return (err);
+ }
+proceed:
+ if (oldalgo->cb_destroy != NULL)
+ oldalgo->cb_destroy(tp->ccv);
+ CC_ALGO(tp) = newalgo;
+ memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var));
+ if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (CC_ALGO(tp)->conn_init != NULL)) {
+ /* Yep run the connection init for the new CC */
+ CC_ALGO(tp)->conn_init(tp->ccv);
+ }
}
}
INP_WUNLOCK(inp);
@@ -2306,7 +2351,6 @@
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK();
-
return (0);
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Jul 6, 4:55 AM (9 h, 40 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
34736545
Default Alt Text
D32897.id98211.diff (16 KB)
Attached To
Mode
D32897: tcp: Rack may still calculate long RTT on persists probes.
Attached
Detach File
Event Timeline
Log In to Comment