Page MenuHomeFreeBSD

D32897.id98211.diff
No OneTemporary

D32897.id98211.diff

Index: sys/netinet/tcp_stacks/rack.c
===================================================================
--- sys/netinet/tcp_stacks/rack.c
+++ sys/netinet/tcp_stacks/rack.c
@@ -205,6 +205,7 @@
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_do_hystart = 0;
+static int32_t rack_apply_rtt_with_reduced_conf = 0;
static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
@@ -343,6 +344,10 @@
counter_u64_t rack_per_timer_hole;
counter_u64_t rack_large_ackcmp;
counter_u64_t rack_small_ackcmp;
+counter_u64_t rack_persists_sends;
+counter_u64_t rack_persists_acks;
+counter_u64_t rack_persists_loss;
+counter_u64_t rack_persists_lost_ends;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
@@ -772,6 +777,10 @@
counter_u64_zero(rack_per_timer_hole);
counter_u64_zero(rack_large_ackcmp);
counter_u64_zero(rack_small_ackcmp);
+ counter_u64_zero(rack_persists_sends);
+ counter_u64_zero(rack_persists_acks);
+ counter_u64_zero(rack_persists_loss);
+ counter_u64_zero(rack_persists_lost_ends);
#ifdef INVARIANTS
counter_u64_zero(rack_adjust_map_bw);
#endif
@@ -1412,6 +1421,11 @@
&rack_tcp_accounting, 0,
"Should we turn on TCP accounting for all rack sessions?");
#endif
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
+ &rack_apply_rtt_with_reduced_conf, 0,
+ "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
@@ -1774,6 +1788,30 @@
OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD,
&rack_large_ackcmp,
"Number of TCP connections with large mbuf's for compressed acks");
+ rack_persists_sends = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_sends", CTLFLAG_RD,
+ &rack_persists_sends,
+ "Number of times we sent a persist probe");
+ rack_persists_acks = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_acks", CTLFLAG_RD,
+ &rack_persists_acks,
+ "Number of times a persist probe was acked");
+ rack_persists_loss = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_loss", CTLFLAG_RD,
+ &rack_persists_loss,
+ "Number of times we detected a lost persist probe (no ack)");
+ rack_persists_lost_ends = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_loss_ends", CTLFLAG_RD,
+ &rack_persists_lost_ends,
+ "Number of lost persist probe (no ack) that the run ended with a PERSIST abort");
rack_small_ackcmp = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -2938,6 +2976,10 @@
counter_u64_free(rack_per_timer_hole);
counter_u64_free(rack_large_ackcmp);
counter_u64_free(rack_small_ackcmp);
+ counter_u64_free(rack_persists_sends);
+ counter_u64_free(rack_persists_acks);
+ counter_u64_free(rack_persists_loss);
+ counter_u64_free(rack_persists_lost_ends);
#ifdef INVARIANTS
counter_u64_free(rack_adjust_map_bw);
#endif
@@ -5623,6 +5665,9 @@
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
rack_timer_cancel(tp, rack, cts, __LINE__);
+ rack->r_ctl.persist_lost_ends = 0;
+ rack->probe_not_answered = 0;
+ rack->forced_ack = 0;
tp->t_rxtshift = 0;
RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
@@ -6494,6 +6539,7 @@
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
tcp_set_inp_to_drop(inp, ETIMEDOUT);
+ counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
return (1);
}
KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
@@ -6515,6 +6561,7 @@
retval = 1;
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+ counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
goto out;
}
if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
@@ -6531,6 +6578,7 @@
KMOD_TCPSTAT_INC(tcps_persistdrop);
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+ counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
goto out;
}
t_template = tcpip_maketemplate(rack->rc_inp);
@@ -6539,7 +6587,12 @@
if (rack->forced_ack == 0) {
rack->forced_ack = 1;
rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+ } else {
+ rack->probe_not_answered = 1;
+ counter_u64_add(rack_persists_loss, 1);
+ rack->r_ctl.persist_lost_ends++;
}
+ counter_u64_add(rack_persists_sends, 1);
tcp_respond(tp, t_template->tt_ipgen,
&t_template->tt_t, (struct mbuf *)NULL,
tp->rcv_nxt, tp->snd_una - 1, 0);
@@ -6602,6 +6655,8 @@
if (rack->forced_ack == 0) {
rack->forced_ack = 1;
rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+ } else {
+ rack->probe_not_answered = 1;
}
tcp_respond(tp, t_template->tt_ipgen,
&t_template->tt_t, (struct mbuf *)NULL,
@@ -10301,6 +10356,14 @@
INP_WLOCK_ASSERT(tp->t_inpcb);
acked = BYTES_THIS_ACK(tp, th);
+ if (acked) {
+ /*
+ * Any time the cum-ack moves forward, clear the
+ * keep-alive tied probe_not_answered flag. Persist
+ * mode clears its own on entry.
+ */
+ rack->probe_not_answered = 0;
+ }
KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
/*
@@ -13374,6 +13437,61 @@
}
+static void
+rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
+{
+ uint32_t us_rtt;
+ /*
+ * A persist or keep-alive probe was forced out (forced_ack
+ * is set), so update our min rtt time. Note we now worry
+ * about lost responses: when a subsequent keep-alive or
+ * persist times out and forced_ack is still on, the last
+ * probe was not responded to and probe_not_answered is set.
+ * In such cases a sysctl controls the behavior: either we
+ * apply the rtt but with reduced confidence (0), or we just
+ * plain don't apply the rtt estimate. Having data flow
+ * will clear the probe_not_answered flag i.e. the cum-ack
+ * moves forward <or> we exit and re-enter persists.
+ */
+
+ rack->forced_ack = 0;
+ rack->rc_tp->t_rxtshift = 0;
+ if ((rack->rc_in_persist &&
+ (tiwin == rack->rc_tp->snd_wnd)) ||
+ (rack->rc_in_persist == 0)) {
+ /*
+ * In persists only apply the RTT update if this is
+ * a response to our window probe, which means the
+ * rwnd sent (tiwin) must match the current snd_wnd.
+ * If it does not, then we got a window update ack
+ * instead. For keep-alive (not in persist) we allow
+ * the answer no matter what the window.
+ *
+ * Note that if probe_not_answered is set then
+ * forced_ack_ts is the oldest one i.e. that of the
+ * first probe sent, which might have been lost. This
+ * assures us that if we do calculate an RTT it is the
+ * longer estimate, not some short thing.
+ */
+ if (rack->rc_in_persist)
+ counter_u64_add(rack_persists_acks, 1);
+ us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
+ if (us_rtt == 0)
+ us_rtt = 1;
+ if (rack->probe_not_answered == 0) {
+ rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+ tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
+ } else {
+ /* An earlier probe went unanswered; apply only if the sysctl allows */
+ if (rack_apply_rtt_with_reduced_conf) {
+ rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+ tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1);
+ }
+ }
+ }
+}
+
+
static int
rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
{
@@ -13483,7 +13601,7 @@
} else if (SEQ_GT(ae->ack, high_seq)) {
/* Case A */
ae->ack_val_set = ACK_CUMACK;
- } else if (tiwin == the_win) {
+ } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){
/* Case D */
ae->ack_val_set = ACK_DUPACK;
} else {
@@ -13596,6 +13714,18 @@
rack_strike_dupack(rack);
} else if (ae->ack_val_set == ACK_RWND) {
/* Case C */
+ if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
+ ts.tv_sec = ae->timestamp / 1000000000;
+ ts.tv_nsec = ae->timestamp % 1000000000;
+ rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
+ rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
+ } else {
+ rack->r_ctl.act_rcv_time = *tv;
+ }
+ if (rack->forced_ack) {
+ rack_handle_probe_response(rack, tiwin,
+ tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
+ }
win_up_req = 1;
win_upd_ack = ae->ack;
win_seq = ae->seq;
@@ -13677,6 +13807,11 @@
#endif
acked_amount = acked = (high_seq - tp->snd_una);
if (acked) {
+ /*
+ * Clear the probe not answered flag
+ * since cum-ack moved forward.
+ */
+ rack->probe_not_answered = 0;
if (rack->sack_attack_disable == 0)
rack_do_decay(rack);
if (acked >= segsiz) {
@@ -14432,31 +14567,7 @@
}
rack_clear_rate_sample(rack);
if (rack->forced_ack) {
- uint32_t us_rtt;
-
- /*
- * A persist or keep-alive was forced out, update our
- * min rtt time. Note we do not worry about lost
- * retransmissions since KEEP-ALIVES and persists
- * are usually way long on times of sending (though
- * if we were really paranoid or worried we could
- * at least use timestamps if available to validate).
- */
- rack->forced_ack = 0;
- if (tiwin == tp->snd_wnd) {
- /*
- * Only apply the RTT update if this is
- * a response to our window probe. And that
- * means the rwnd sent must match the current
- * snd_wnd. If it does not, then we got a
- * window update ack instead.
- */
- us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
- if (us_rtt == 0)
- us_rtt = 1;
- rack_apply_updated_usrtt(rack, us_rtt, us_cts);
- tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
- }
+ rack_handle_probe_response(rack, tiwin, us_cts);
}
/*
* This is the one exception case where we set the rack state
Index: sys/netinet/tcp_stacks/tcp_rack.h
===================================================================
--- sys/netinet/tcp_stacks/tcp_rack.h
+++ sys/netinet/tcp_stacks/tcp_rack.h
@@ -496,6 +496,7 @@
uint32_t challenge_ack_cnt;
uint32_t rc_min_to; /* Socket option value Lock(a) */
uint32_t rc_pkt_delay; /* Socket option value Lock(a) */
+ uint32_t persist_lost_ends;
struct newreno rc_saved_beta; /*
* For newreno cc:
* rc_saved_cc are the values we have had
@@ -567,7 +568,8 @@
rc_last_tlp_past_cumack: 1,
rc_last_sent_tlp_seq_valid: 1,
rc_last_sent_tlp_past_cumack: 1,
- avail_bytes : 3;
+ probe_not_answered: 1,
+ avail_bytes : 2;
uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */
rtt_limit_mul : 4, /* muliply this by low rtt */
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -1748,6 +1748,7 @@
struct mbuf *optm;
struct udphdr *uh = NULL;
struct tcphdr *nth;
+ struct tcp_log_buffer *lgb;
u_char *optp;
#ifdef INET6
struct ip6_hdr *ip6;
@@ -1756,6 +1757,7 @@
int optlen, tlen, win, ulen;
bool incl_opts;
uint16_t port;
+ int output_ret;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
NET_EPOCH_ASSERT();
@@ -2086,11 +2088,26 @@
TCP_PROBE3(debug__output, tp, th, m);
if (flags & TH_RST)
TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);
+ if ((tp != NULL) && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = tp->t_inpcb->inp_in_hpts;
+ log.u_bbr.ininput = tp->t_inpcb->inp_in_input;
+ log.u_bbr.flex8 = 4;
+ log.u_bbr.pkts_out = tp->t_maxseg;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.delivered = 0;
+ lgb = tcp_log_event_(tp, nth, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ } else
+ lgb = NULL;
#ifdef INET6
if (isipv6) {
TCP_PROBE5(send, NULL, tp, ip6, tp, nth);
- (void)ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
+ output_ret = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
@@ -2099,9 +2116,13 @@
#ifdef INET
{
TCP_PROBE5(send, NULL, tp, ip, tp, nth);
- (void)ip_output(m, NULL, NULL, 0, NULL, inp);
+ output_ret = ip_output(m, NULL, NULL, 0, NULL, inp);
}
#endif
+ if (lgb) {
+ lgb->tlb_errno = output_ret;
+ lgb = NULL;
+ }
}
/*
@@ -2137,8 +2158,9 @@
*/
CC_LIST_RLOCK();
KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!"));
- CC_ALGO(tp) = CC_DEFAULT();
+ CC_ALGO(tp) = CC_DEFAULT_ALGO();
CC_LIST_RUNLOCK();
+
/*
* The tcpcb will hold a reference on its inpcb until tcp_discardcb()
* is called.
@@ -2147,7 +2169,7 @@
tp->t_inpcb = inp;
if (CC_ALGO(tp)->cb_init != NULL)
- if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
+ if (CC_ALGO(tp)->cb_init(tp->ccv, NULL) > 0) {
if (tp->t_fb->tfb_tcp_fb_fini)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
in_pcbrele_wlocked(inp);
@@ -2240,25 +2262,23 @@
}
/*
- * Switch the congestion control algorithm back to NewReno for any active
- * control blocks using an algorithm which is about to go away.
- * This ensures the CC framework can allow the unload to proceed without leaving
- * any dangling pointers which would trigger a panic.
- * Returning non-zero would inform the CC framework that something went wrong
- * and it would be unsafe to allow the unload to proceed. However, there is no
- * way for this to occur with this implementation so we always return zero.
+ * Switch the congestion control algorithm back to the vnet default for any
+ * active control blocks using an algorithm which is about to go away. If the
+ * default algorithm has a cb_init function and it fails (e.g. no memory) then
+ * the operation fails and the unload will not succeed.
+ *
*/
int
tcp_ccalgounload(struct cc_algo *unload_algo)
{
- struct cc_algo *tmpalgo;
+ struct cc_algo *oldalgo, *newalgo;
struct inpcb *inp;
struct tcpcb *tp;
VNET_ITERATOR_DECL(vnet_iter);
/*
* Check all active control blocks across all network stacks and change
- * any that are using "unload_algo" back to NewReno. If "unload_algo"
+ * any that are using "unload_algo" back to the vnet default. If "unload_algo"
* requires cleanup code to be run, call it.
*/
VNET_LIST_RLOCK();
@@ -2272,6 +2292,7 @@
* therefore don't enter the loop below until the connection
* list has stabilised.
*/
+ newalgo = CC_DEFAULT_ALGO();
CK_LIST_FOREACH(inp, &V_tcb, inp_list) {
INP_WLOCK(inp);
/* Important to skip tcptw structs. */
@@ -2280,24 +2301,48 @@
/*
* By holding INP_WLOCK here, we are assured
* that the connection is not currently
- * executing inside the CC module's functions
- * i.e. it is safe to make the switch back to
- * NewReno.
+ * executing inside the CC module's functions.
+ * We attempt to switch to the vnet's default;
+ * if the init fails then we fail the whole
+ * operation and the module unload will fail.
*/
if (CC_ALGO(tp) == unload_algo) {
- tmpalgo = CC_ALGO(tp);
- if (tmpalgo->cb_destroy != NULL)
- tmpalgo->cb_destroy(tp->ccv);
- CC_DATA(tp) = NULL;
- /*
- * NewReno may allocate memory on
- * demand for certain stateful
- * configuration as needed, but is
- * coded to never fail on memory
- * allocation failure so it is a safe
- * fallback.
- */
- CC_ALGO(tp) = &newreno_cc_algo;
+ struct cc_var cc_mem;
+ int err;
+
+ oldalgo = CC_ALGO(tp);
+ memset(&cc_mem, 0, sizeof(cc_mem));
+ cc_mem.ccvc.tcp = tp;
+ if (newalgo->cb_init == NULL) {
+ /*
+ * No init we can skip the
+ * dance around a possible failure.
+ */
+ CC_DATA(tp) = NULL;
+ goto proceed;
+ }
+ err = (newalgo->cb_init)(&cc_mem, NULL);
+ if (err) {
+ /*
+ * Presumably no memory; the caller will
+ * need to try again.
+ */
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ VNET_LIST_RUNLOCK();
+ return (err);
+ }
+proceed:
+ if (oldalgo->cb_destroy != NULL)
+ oldalgo->cb_destroy(tp->ccv);
+ CC_ALGO(tp) = newalgo;
+ memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var));
+ if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (CC_ALGO(tp)->conn_init != NULL)) {
+ /* Yep run the connection init for the new CC */
+ CC_ALGO(tp)->conn_init(tp->ccv);
+ }
}
}
INP_WUNLOCK(inp);
@@ -2306,7 +2351,6 @@
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK();
-
return (0);
}

File Metadata

Mime Type
text/plain
Expires
Mon, Jul 6, 4:55 AM (9 h, 40 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
34736545
Default Alt Text
D32897.id98211.diff (16 KB)

Event Timeline