Page MenuHomeFreeBSD

D32897.id98355.diff
No OneTemporary

D32897.id98355.diff

diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -205,6 +205,7 @@
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_do_hystart = 0;
+static int32_t rack_apply_rtt_with_reduced_conf = 0;
static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
@@ -343,6 +344,10 @@
counter_u64_t rack_per_timer_hole;
counter_u64_t rack_large_ackcmp;
counter_u64_t rack_small_ackcmp;
+counter_u64_t rack_persists_sends;
+counter_u64_t rack_persists_acks;
+counter_u64_t rack_persists_loss;
+counter_u64_t rack_persists_lost_ends;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
@@ -772,6 +777,10 @@
counter_u64_zero(rack_per_timer_hole);
counter_u64_zero(rack_large_ackcmp);
counter_u64_zero(rack_small_ackcmp);
+ counter_u64_zero(rack_persists_sends);
+ counter_u64_zero(rack_persists_acks);
+ counter_u64_zero(rack_persists_loss);
+ counter_u64_zero(rack_persists_lost_ends);
#ifdef INVARIANTS
counter_u64_zero(rack_adjust_map_bw);
#endif
@@ -1412,6 +1421,11 @@
&rack_tcp_accounting, 0,
"Should we turn on TCP accounting for all rack sessions?");
#endif
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
+ &rack_apply_rtt_with_reduced_conf, 0,
+ "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
@@ -1774,6 +1788,30 @@
OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD,
&rack_large_ackcmp,
"Number of TCP connections with large mbuf's for compressed acks");
+ rack_persists_sends = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_sends", CTLFLAG_RD,
+ &rack_persists_sends,
+ "Number of times we sent a persist probe");
+ rack_persists_acks = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_acks", CTLFLAG_RD,
+ &rack_persists_acks,
+ "Number of times a persist probe was acked");
+ rack_persists_loss = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_loss", CTLFLAG_RD,
+ &rack_persists_loss,
+ "Number of times we detected a lost persist probe (no ack)");
+ rack_persists_lost_ends = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_loss_ends", CTLFLAG_RD,
+ &rack_persists_lost_ends,
+ "Number of lost persist probe (no ack) that the run ended with a PERSIST abort");
rack_small_ackcmp = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -2938,6 +2976,10 @@
counter_u64_free(rack_per_timer_hole);
counter_u64_free(rack_large_ackcmp);
counter_u64_free(rack_small_ackcmp);
+ counter_u64_free(rack_persists_sends);
+ counter_u64_free(rack_persists_acks);
+ counter_u64_free(rack_persists_loss);
+ counter_u64_free(rack_persists_lost_ends);
#ifdef INVARIANTS
counter_u64_free(rack_adjust_map_bw);
#endif
@@ -5623,6 +5665,9 @@
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
rack_timer_cancel(tp, rack, cts, __LINE__);
+ rack->r_ctl.persist_lost_ends = 0;
+ rack->probe_not_answered = 0;
+ rack->forced_ack = 0;
tp->t_rxtshift = 0;
RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
@@ -6494,6 +6539,7 @@
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
tcp_set_inp_to_drop(inp, ETIMEDOUT);
+ counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
return (1);
}
KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
@@ -6515,6 +6561,7 @@
retval = 1;
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+ counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
goto out;
}
if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
@@ -6531,6 +6578,7 @@
KMOD_TCPSTAT_INC(tcps_persistdrop);
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+ counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
goto out;
}
t_template = tcpip_maketemplate(rack->rc_inp);
@@ -6539,7 +6587,12 @@
if (rack->forced_ack == 0) {
rack->forced_ack = 1;
rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+ } else {
+ rack->probe_not_answered = 1;
+ counter_u64_add(rack_persists_loss, 1);
+ rack->r_ctl.persist_lost_ends++;
}
+ counter_u64_add(rack_persists_sends, 1);
tcp_respond(tp, t_template->tt_ipgen,
&t_template->tt_t, (struct mbuf *)NULL,
tp->rcv_nxt, tp->snd_una - 1, 0);
@@ -6602,6 +6655,8 @@
if (rack->forced_ack == 0) {
rack->forced_ack = 1;
rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+ } else {
+ rack->probe_not_answered = 1;
}
tcp_respond(tp, t_template->tt_ipgen,
&t_template->tt_t, (struct mbuf *)NULL,
@@ -10301,6 +10356,14 @@
INP_WLOCK_ASSERT(tp->t_inpcb);
acked = BYTES_THIS_ACK(tp, th);
+ if (acked) {
+ /*
+ * Any time the cum-ack moves forward, clear the
+ * probe-not-answered flag tied to keep-alives.
+ * Persists clears its own flag on entry.
+ */
+ rack->probe_not_answered = 0;
+ }
KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
/*
@@ -13374,6 +13437,61 @@
}
+static void
+rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
+{
+ uint32_t us_rtt;
+ /*
+ * Handle the answer to a forced-out persist or keep-alive
+ * probe by updating our min rtt time. Note that we now
+ * worry about lost responses: when a subsequent keep-alive
+ * or persist timer fires and forced_ack is still on, the
+ * previous probe was not responded to. In such cases a
+ * sysctl controls the behavior: either we apply the rtt
+ * with reduced confidence (0), or we do not apply the rtt
+ * estimate at all. Data flow clears the probe_not_answered
+ * flag, i.e. the cum-ack moving forward <or> exiting and
+ * re-entering persists.
+ */
+
+ rack->forced_ack = 0;
+ rack->rc_tp->t_rxtshift = 0;
+ if ((rack->rc_in_persist &&
+ (tiwin == rack->rc_tp->snd_wnd)) ||
+ (rack->rc_in_persist == 0)) {
+ /*
+ * In persists, only apply the RTT update if this is
+ * a response to our window probe, which means the
+ * rwnd sent must match the current snd_wnd. If it
+ * does not, then we got a window update ack instead.
+ * For keep-alives we allow the answer no matter what
+ * the window is.
+ *
+ * Note that if probe_not_answered is set then
+ * forced_ack_ts is the oldest one, i.e. that of the first
+ * probe sent that might have been lost. This assures
+ * us that if we do calculate an RTT it errs on the long
+ * side rather than being too short.
+ */
+ if (rack->rc_in_persist)
+ counter_u64_add(rack_persists_acks, 1);
+ us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
+ if (us_rtt == 0)
+ us_rtt = 1;
+ if (rack->probe_not_answered == 0) {
+ rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+ tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
+ } else {
+ /* An earlier probe went unanswered; the sysctl decides whether the rtt is still applied */
+ if (rack_apply_rtt_with_reduced_conf) {
+ rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+ tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1);
+ }
+ }
+ }
+}
+
+
static int
rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
{
@@ -13483,7 +13601,7 @@
} else if (SEQ_GT(ae->ack, high_seq)) {
/* Case A */
ae->ack_val_set = ACK_CUMACK;
- } else if (tiwin == the_win) {
+ } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){
/* Case D */
ae->ack_val_set = ACK_DUPACK;
} else {
@@ -13596,6 +13714,18 @@
rack_strike_dupack(rack);
} else if (ae->ack_val_set == ACK_RWND) {
/* Case C */
+ if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
+ ts.tv_sec = ae->timestamp / 1000000000;
+ ts.tv_nsec = ae->timestamp % 1000000000;
+ rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
+ rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
+ } else {
+ rack->r_ctl.act_rcv_time = *tv;
+ }
+ if (rack->forced_ack) {
+ rack_handle_probe_response(rack, tiwin,
+ tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
+ }
win_up_req = 1;
win_upd_ack = ae->ack;
win_seq = ae->seq;
@@ -13677,6 +13807,11 @@
#endif
acked_amount = acked = (high_seq - tp->snd_una);
if (acked) {
+ /*
+ * Clear the probe not answered flag
+ * since cum-ack moved forward.
+ */
+ rack->probe_not_answered = 0;
if (rack->sack_attack_disable == 0)
rack_do_decay(rack);
if (acked >= segsiz) {
@@ -14432,31 +14567,7 @@
}
rack_clear_rate_sample(rack);
if (rack->forced_ack) {
- uint32_t us_rtt;
-
- /*
- * A persist or keep-alive was forced out, update our
- * min rtt time. Note we do not worry about lost
- * retransmissions since KEEP-ALIVES and persists
- * are usually way long on times of sending (though
- * if we were really paranoid or worried we could
- * at least use timestamps if available to validate).
- */
- rack->forced_ack = 0;
- if (tiwin == tp->snd_wnd) {
- /*
- * Only apply the RTT update if this is
- * a response to our window probe. And that
- * means the rwnd sent must match the current
- * snd_wnd. If it does not, then we got a
- * window update ack instead.
- */
- us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
- if (us_rtt == 0)
- us_rtt = 1;
- rack_apply_updated_usrtt(rack, us_rtt, us_cts);
- tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
- }
+ rack_handle_probe_response(rack, tiwin, us_cts);
}
/*
* This is the one exception case where we set the rack state
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -496,6 +496,7 @@
uint32_t challenge_ack_cnt;
uint32_t rc_min_to; /* Socket option value Lock(a) */
uint32_t rc_pkt_delay; /* Socket option value Lock(a) */
+ uint32_t persist_lost_ends;
struct newreno rc_saved_beta; /*
* For newreno cc:
* rc_saved_cc are the values we have had
@@ -567,7 +568,8 @@
rc_last_tlp_past_cumack: 1,
rc_last_sent_tlp_seq_valid: 1,
rc_last_sent_tlp_past_cumack: 1,
- avail_bytes : 3;
+ probe_not_answered: 1,
+ avail_bytes : 2;
uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */
rtt_limit_mul : 4, /* muliply this by low rtt */
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1748,6 +1748,7 @@
struct mbuf *optm;
struct udphdr *uh = NULL;
struct tcphdr *nth;
+ struct tcp_log_buffer *lgb;
u_char *optp;
#ifdef INET6
struct ip6_hdr *ip6;
@@ -1756,6 +1757,7 @@
int optlen, tlen, win, ulen;
bool incl_opts;
uint16_t port;
+ int output_ret;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
NET_EPOCH_ASSERT();
@@ -2086,11 +2088,26 @@
TCP_PROBE3(debug__output, tp, th, m);
if (flags & TH_RST)
TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);
+ if ((tp != NULL) && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = tp->t_inpcb->inp_in_hpts;
+ log.u_bbr.ininput = tp->t_inpcb->inp_in_input;
+ log.u_bbr.flex8 = 4;
+ log.u_bbr.pkts_out = tp->t_maxseg;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.delivered = 0;
+ lgb = tcp_log_event_(tp, nth, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ } else
+ lgb = NULL;
#ifdef INET6
if (isipv6) {
TCP_PROBE5(send, NULL, tp, ip6, tp, nth);
- (void)ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
+ output_ret = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
@@ -2099,9 +2116,13 @@
#ifdef INET
{
TCP_PROBE5(send, NULL, tp, ip, tp, nth);
- (void)ip_output(m, NULL, NULL, 0, NULL, inp);
+ output_ret = ip_output(m, NULL, NULL, 0, NULL, inp);
}
#endif
+ if (lgb) {
+ lgb->tlb_errno = output_ret;
+ lgb = NULL;
+ }
}
/*

File Metadata

Mime Type
text/plain
Expires
Mon, Jan 27, 7:32 PM (7 h, 26 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
16207102
Default Alt Text
D32897.id98355.diff (12 KB)

Event Timeline