Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F108600966
D32897.id98355.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
12 KB
Referenced Files
None
Subscribers
None
D32897.id98355.diff
View Options
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -205,6 +205,7 @@
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_do_hystart = 0;
+static int32_t rack_apply_rtt_with_reduced_conf = 0;
static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
@@ -343,6 +344,10 @@
counter_u64_t rack_per_timer_hole;
counter_u64_t rack_large_ackcmp;
counter_u64_t rack_small_ackcmp;
+counter_u64_t rack_persists_sends;
+counter_u64_t rack_persists_acks;
+counter_u64_t rack_persists_loss;
+counter_u64_t rack_persists_lost_ends;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
@@ -772,6 +777,10 @@
counter_u64_zero(rack_per_timer_hole);
counter_u64_zero(rack_large_ackcmp);
counter_u64_zero(rack_small_ackcmp);
+ counter_u64_zero(rack_persists_sends);
+ counter_u64_zero(rack_persists_acks);
+ counter_u64_zero(rack_persists_loss);
+ counter_u64_zero(rack_persists_lost_ends);
#ifdef INVARIANTS
counter_u64_zero(rack_adjust_map_bw);
#endif
@@ -1412,6 +1421,11 @@
&rack_tcp_accounting, 0,
"Should we turn on TCP accounting for all rack sessions?");
#endif
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
+ &rack_apply_rtt_with_reduced_conf, 0,
+ "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
@@ -1774,6 +1788,30 @@
OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD,
&rack_large_ackcmp,
"Number of TCP connections with large mbuf's for compressed acks");
+ rack_persists_sends = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_sends", CTLFLAG_RD,
+ &rack_persists_sends,
+ "Number of times we sent a persist probe");
+ rack_persists_acks = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_acks", CTLFLAG_RD,
+ &rack_persists_acks,
+ "Number of times a persist probe was acked");
+ rack_persists_loss = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_loss", CTLFLAG_RD,
+ &rack_persists_loss,
+ "Number of times we detected a lost persist probe (no ack)");
+ rack_persists_lost_ends = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "persist_loss_ends", CTLFLAG_RD,
+ &rack_persists_lost_ends,
+ "Number of lost persist probe (no ack) that the run ended with a PERSIST abort");
rack_small_ackcmp = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -2938,6 +2976,10 @@
counter_u64_free(rack_per_timer_hole);
counter_u64_free(rack_large_ackcmp);
counter_u64_free(rack_small_ackcmp);
+ counter_u64_free(rack_persists_sends);
+ counter_u64_free(rack_persists_acks);
+ counter_u64_free(rack_persists_loss);
+ counter_u64_free(rack_persists_lost_ends);
#ifdef INVARIANTS
counter_u64_free(rack_adjust_map_bw);
#endif
@@ -5623,6 +5665,9 @@
if (rack->r_ctl.rc_went_idle_time == 0)
rack->r_ctl.rc_went_idle_time = 1;
rack_timer_cancel(tp, rack, cts, __LINE__);
+ rack->r_ctl.persist_lost_ends = 0;
+ rack->probe_not_answered = 0;
+ rack->forced_ack = 0;
tp->t_rxtshift = 0;
RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
@@ -6494,6 +6539,7 @@
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
tcp_set_inp_to_drop(inp, ETIMEDOUT);
+ counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
return (1);
}
KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
@@ -6515,6 +6561,7 @@
retval = 1;
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+ counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
goto out;
}
if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
@@ -6531,6 +6578,7 @@
KMOD_TCPSTAT_INC(tcps_persistdrop);
tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
+ counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
goto out;
}
t_template = tcpip_maketemplate(rack->rc_inp);
@@ -6539,7 +6587,12 @@
if (rack->forced_ack == 0) {
rack->forced_ack = 1;
rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+ } else {
+ rack->probe_not_answered = 1;
+ counter_u64_add(rack_persists_loss, 1);
+ rack->r_ctl.persist_lost_ends++;
}
+ counter_u64_add(rack_persists_sends, 1);
tcp_respond(tp, t_template->tt_ipgen,
&t_template->tt_t, (struct mbuf *)NULL,
tp->rcv_nxt, tp->snd_una - 1, 0);
@@ -6602,6 +6655,8 @@
if (rack->forced_ack == 0) {
rack->forced_ack = 1;
rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
+ } else {
+ rack->probe_not_answered = 1;
}
tcp_respond(tp, t_template->tt_ipgen,
&t_template->tt_t, (struct mbuf *)NULL,
@@ -10301,6 +10356,14 @@
INP_WLOCK_ASSERT(tp->t_inpcb);
acked = BYTES_THIS_ACK(tp, th);
+ if (acked) {
+ /*
+ * Any time we move the cum-ack forward, clear the
+ * keep-alive tied probe_not_answered flag. Persists
+ * clears its own flag on entry.
+ */
+ rack->probe_not_answered = 0;
+ }
KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
/*
@@ -13374,6 +13437,61 @@
}
+static void
+rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
+{
+ uint32_t us_rtt;
+ /*
+ * Handle the ack answering a forced-out persist or
+ * keep-alive probe: clear forced_ack and t_rxtshift and
+ * possibly update our rtt. tiwin is the window carried
+ * by the ack, us_cts a usec timestamp for it. We now
+ * worry about lost responses: if a later probe timed
+ * out with forced_ack still on, the earlier probe was
+ * not answered (probe_not_answered). A sysctl then picks
+ * between applying the rtt with reduced confidence (0)
+ * or not applying it at all. The flag is cleared when
+ * the cum-ack moves forward or persists is reentered.
+ */
+
+ rack->forced_ack = 0;
+ rack->rc_tp->t_rxtshift = 0;
+ if ((rack->rc_in_persist &&
+ (tiwin == rack->rc_tp->snd_wnd)) ||
+ (rack->rc_in_persist == 0)) {
+ /*
+ * In persists only apply the RTT update if this is
+ * a response to our window probe. And that
+ * means the rwnd sent must match the current
+ * snd_wnd. If it does not, then we got a
+ * window update ack instead. For keepalive
+ * we allow the answer no matter what the window.
+ *
+ * Note that if the probe_not_answered is set then
+ * the forced_ack_ts is the oldest one i.e. the first
+ * probe sent that might have been lost. This assures
+ * us that if we do calculate an RTT it is a longer
+ * one, not some short thing.
+ */
+ if (rack->rc_in_persist)
+ counter_u64_add(rack_persists_acks, 1);
+ us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
+ if (us_rtt == 0)
+ us_rtt = 1;
+ if (rack->probe_not_answered == 0) {
+ rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+ tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
+ } else {
+ /* An earlier probe went unanswered: apply the rtt (confidence 0) only if the sysctl allows */
+ if (rack_apply_rtt_with_reduced_conf) {
+ rack_apply_updated_usrtt(rack, us_rtt, us_cts);
+ tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1);
+ }
+ }
+ }
+}
+
+
static int
rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
{
@@ -13483,7 +13601,7 @@
} else if (SEQ_GT(ae->ack, high_seq)) {
/* Case A */
ae->ack_val_set = ACK_CUMACK;
- } else if (tiwin == the_win) {
+ } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){
/* Case D */
ae->ack_val_set = ACK_DUPACK;
} else {
@@ -13596,6 +13714,18 @@
rack_strike_dupack(rack);
} else if (ae->ack_val_set == ACK_RWND) {
/* Case C */
+ if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
+ ts.tv_sec = ae->timestamp / 1000000000;
+ ts.tv_nsec = ae->timestamp % 1000000000;
+ rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
+ rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
+ } else {
+ rack->r_ctl.act_rcv_time = *tv;
+ }
+ if (rack->forced_ack) {
+ rack_handle_probe_response(rack, tiwin,
+ tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time));
+ }
win_up_req = 1;
win_upd_ack = ae->ack;
win_seq = ae->seq;
@@ -13677,6 +13807,11 @@
#endif
acked_amount = acked = (high_seq - tp->snd_una);
if (acked) {
+ /*
+ * Clear the probe not answered flag
+ * since cum-ack moved forward.
+ */
+ rack->probe_not_answered = 0;
if (rack->sack_attack_disable == 0)
rack_do_decay(rack);
if (acked >= segsiz) {
@@ -14432,31 +14567,7 @@
}
rack_clear_rate_sample(rack);
if (rack->forced_ack) {
- uint32_t us_rtt;
-
- /*
- * A persist or keep-alive was forced out, update our
- * min rtt time. Note we do not worry about lost
- * retransmissions since KEEP-ALIVES and persists
- * are usually way long on times of sending (though
- * if we were really paranoid or worried we could
- * at least use timestamps if available to validate).
- */
- rack->forced_ack = 0;
- if (tiwin == tp->snd_wnd) {
- /*
- * Only apply the RTT update if this is
- * a response to our window probe. And that
- * means the rwnd sent must match the current
- * snd_wnd. If it does not, then we got a
- * window update ack instead.
- */
- us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
- if (us_rtt == 0)
- us_rtt = 1;
- rack_apply_updated_usrtt(rack, us_rtt, us_cts);
- tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
- }
+ rack_handle_probe_response(rack, tiwin, us_cts);
}
/*
* This is the one exception case where we set the rack state
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -496,6 +496,7 @@
uint32_t challenge_ack_cnt;
uint32_t rc_min_to; /* Socket option value Lock(a) */
uint32_t rc_pkt_delay; /* Socket option value Lock(a) */
+ uint32_t persist_lost_ends;
struct newreno rc_saved_beta; /*
* For newreno cc:
* rc_saved_cc are the values we have had
@@ -567,7 +568,8 @@
rc_last_tlp_past_cumack: 1,
rc_last_sent_tlp_seq_valid: 1,
rc_last_sent_tlp_past_cumack: 1,
- avail_bytes : 3;
+ probe_not_answered: 1,
+ avail_bytes : 2;
uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */
rtt_limit_mul : 4, /* muliply this by low rtt */
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1748,6 +1748,7 @@
struct mbuf *optm;
struct udphdr *uh = NULL;
struct tcphdr *nth;
+ struct tcp_log_buffer *lgb;
u_char *optp;
#ifdef INET6
struct ip6_hdr *ip6;
@@ -1756,6 +1757,7 @@
int optlen, tlen, win, ulen;
bool incl_opts;
uint16_t port;
+ int output_ret;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
NET_EPOCH_ASSERT();
@@ -2086,11 +2088,26 @@
TCP_PROBE3(debug__output, tp, th, m);
if (flags & TH_RST)
TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);
+ if ((tp != NULL) && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.inhpts = tp->t_inpcb->inp_in_hpts;
+ log.u_bbr.ininput = tp->t_inpcb->inp_in_input;
+ log.u_bbr.flex8 = 4;
+ log.u_bbr.pkts_out = tp->t_maxseg;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.delivered = 0;
+ lgb = tcp_log_event_(tp, nth, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ } else
+ lgb = NULL;
#ifdef INET6
if (isipv6) {
TCP_PROBE5(send, NULL, tp, ip6, tp, nth);
- (void)ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
+ output_ret = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
@@ -2099,9 +2116,13 @@
#ifdef INET
{
TCP_PROBE5(send, NULL, tp, ip, tp, nth);
- (void)ip_output(m, NULL, NULL, 0, NULL, inp);
+ output_ret = ip_output(m, NULL, NULL, 0, NULL, inp);
}
#endif
+ if (lgb) {
+ lgb->tlb_errno = output_ret;
+ lgb = NULL;
+ }
}
/*
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Jan 27, 7:32 PM (7 h, 26 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
16207102
Default Alt Text
D32897.id98355.diff (12 KB)
Attached To
Mode
D32897: tcp: Rack may still calculate long RTT on persists probes.
Attached
Detach File
Event Timeline
Log In to Comment