Page MenuHomeFreeBSD

D31506.diff
No OneTemporary

D31506.diff

diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -309,6 +309,7 @@
#define TCP_RACK_PACING_BETA 1138 /* Changing the beta for pacing */
#define TCP_RACK_PACING_BETA_ECN 1139 /* Changing the beta for ecn with pacing */
#define TCP_RACK_TIMER_SLOP 1140 /* Set or get the timer slop used */
+#define TCP_RACK_DSACK_OPT 1141 /* How do we setup rack timer DSACK options bit 1/2 */
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -234,7 +234,8 @@
TCP_LOG_HTTP_T, /* logging of http request tracking 61 */
TCP_LOG_ACCOUNTING, /* Log of TCP Accounting data 62 */
TCP_LOG_FSB, /* FSB information 63 */
- TCP_LOG_END /* End (keep at end) 64 */
+ RACK_DSACK_HANDLING, /* Handling of DSACK in rack for reordering window 64 */
+ TCP_LOG_END /* End (keep at end) 65 */
};
enum tcp_log_states {
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -217,6 +217,7 @@
static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
+static int32_t rack_dsack_std_based = 0x3; /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
@@ -1375,6 +1376,12 @@
&rack_tcp_accounting, 0,
"Should we turn on TCP accounting for all rack sessions?");
#endif
+
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_misc),
+ OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
+ &rack_dsack_std_based, 3,
+ "How do we process dsack with respect to rack timers, bit field, 3 is standards based?");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_misc),
OID_AUTO, "prr_addback_max", CTLFLAG_RW,
@@ -2071,6 +2078,44 @@
return (rack->r_ctl.rack_per_of_gp_ca);
}
+static void
+rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6)
+{
+ /*
+ * Types of logs (mod value)
+ * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit.
+ * 2 = a dsack round begins, persist is reset to 16.
+ * 3 = a dsack round ends
+ * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh
+ * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack
+ * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh.
+ */
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.flex1 = rack->rc_rack_tmr_std_based;
+ log.u_bbr.flex1 <<= 1;
+ log.u_bbr.flex1 |= rack->rc_rack_use_dsack;
+ log.u_bbr.flex1 <<= 1;
+ log.u_bbr.flex1 |= rack->rc_dsack_round_seen;
+ log.u_bbr.flex2 = rack->r_ctl.dsack_round_end;
+ log.u_bbr.flex3 = rack->r_ctl.num_dsack;
+ log.u_bbr.flex4 = flex4;
+ log.u_bbr.flex5 = flex5;
+ log.u_bbr.flex6 = flex6;
+ log.u_bbr.flex7 = rack->r_ctl.dsack_persist;
+ log.u_bbr.flex8 = mod;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ RACK_DSACK_HANDLING, 0,
+ 0, &log, false, &tv);
+ }
+}
+
static void
rack_log_hdwr_pacing(struct tcp_rack *rack,
uint64_t rate, uint64_t hw_rate, int line,
@@ -4862,6 +4907,13 @@
}
rack_log_to_prr(rack, 14, orig_cwnd);
tp->snd_recover = tp->snd_una;
+ if (rack->r_ctl.dsack_persist) {
+ rack->r_ctl.dsack_persist--;
+ if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
+ rack->r_ctl.num_dsack = 0;
+ }
+ rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
+ }
EXIT_RECOVERY(tp->t_flags);
}
@@ -5097,24 +5149,38 @@
} else {
lro = 0;
}
- thresh = srtt + rack->r_ctl.rc_pkt_delay;
- if (lro) {
+ if (rack->rc_rack_tmr_std_based == 0) {
+ thresh = srtt + rack->r_ctl.rc_pkt_delay;
+ } else {
+ /* Standards based pkt-delay is 1/4 srtt */
+ thresh = srtt + (srtt >> 2);
+ }
+ if (lro && (rack->rc_rack_tmr_std_based == 0)) {
/* It must be set, if not you get 1/4 rtt */
if (rack->r_ctl.rc_reorder_shift)
thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
else
thresh += (srtt >> 2);
- } else {
- thresh += 1;
}
- /* We don't let the rack timeout be above a RTO */
- if (thresh > rack->rc_tp->t_rxtcur) {
- thresh = rack->rc_tp->t_rxtcur;
+ if (rack->rc_rack_use_dsack &&
+ lro &&
+ (rack->r_ctl.num_dsack > 0)) {
+ /*
+ * We only increase the reordering window if we
+ * have seen reordering <and> we have a DSACK count.
+ */
+ thresh += rack->r_ctl.num_dsack * (srtt >> 2);
+ rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh);
+ }
+ /* SRTT * 2 is the ceiling */
+ if (thresh > (srtt * 2)) {
+ thresh = srtt * 2;
}
/* And we don't want it above the RTO max either */
if (thresh > rack_rto_max) {
thresh = rack_rto_max;
}
+ rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh);
return (thresh);
}
@@ -6219,6 +6285,13 @@
collapsed_win = 1;
goto need_retran;
}
+ if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) {
+ rack->r_ctl.dsack_persist--;
+ if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
+ rack->r_ctl.num_dsack = 0;
+ }
+ rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
+ }
if ((tp->t_flags & TF_GPUTINPROG) &&
(rack->r_ctl.rc_tlp_cnt_out == 1)) {
/*
@@ -6349,7 +6422,6 @@
TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
nrsm->r_in_tmap = 1;
}
- rsm->r_flags &= (~RACK_HAS_FIN);
rsm = nrsm;
}
rack->r_ctl.rc_tlpsend = rsm;
@@ -6755,6 +6827,13 @@
/* Nothing outstanding .. nothing to do */
return (0);
}
+ if (rack->r_ctl.dsack_persist) {
+ rack->r_ctl.dsack_persist--;
+ if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
+ rack->r_ctl.num_dsack = 0;
+ }
+ rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
+ }
/*
* Rack can only run one timer at a time, so we cannot
* run a KEEPINIT (gating SYN sending) and a retransmit
@@ -8218,6 +8297,22 @@
}
}
+static inline int
+is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm)
+{
+ if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) {
+ /* Behind our TLP definition or right at */
+ return (0);
+ }
+ if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) {
+ /* The start is beyond or right at our end of TLP definition */
+ return (0);
+ }
+ /* It has to be a sub-part of the original TLP recorded */
+ return (1);
+}
+
+
static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two)
@@ -8253,6 +8348,46 @@
/* Ok we have an ACK for some piece of this rsm */
if (rsm->r_start != start) {
if ((rsm->r_flags & RACK_ACKED) == 0) {
+ /*
+ * Before any splitting or hookery is
+ * done is it a TLP of interest i.e. rxt?
+ */
+ if ((rsm->r_flags & RACK_TLP) &&
+ (rsm->r_rtr_cnt > 1)) {
+ /*
+ * We are splitting a rxt TLP, check
+ * if we need to save off the start/end
+ */
+ if (rack->rc_last_tlp_acked_set &&
+ (is_rsm_inside_declared_tlp_block(rack, rsm))) {
+ /*
+ * We already turned this on since we are inside
+ * the previous one was a partially sack now we
+ * are getting another one (maybe all of it).
+ *
+ */
+ rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
+ /*
+ * Lets make sure we have all of it though.
+ */
+ if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
+ rack->r_ctl.last_tlp_acked_start = rsm->r_start;
+ rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ }
+ if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
+ rack->r_ctl.last_tlp_acked_end = rsm->r_end;
+ rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ }
+ } else {
+ rack->r_ctl.last_tlp_acked_start = rsm->r_start;
+ rack->r_ctl.last_tlp_acked_end = rsm->r_end;
+ rack->rc_last_tlp_past_cumack = 0;
+ rack->rc_last_tlp_acked_set = 1;
+ rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
+ }
+ }
/**
* Need to split this in two pieces the before and after,
* the before remains in the map, the after must be
@@ -8267,6 +8402,7 @@
* But before we start down that path lets
* see if the sack spans over on top of
* the next guy and it is already sacked.
+ *
*/
next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
if (next && (next->r_flags & RACK_ACKED) &&
@@ -8426,6 +8562,44 @@
* end |---------|
*/
if ((rsm->r_flags & RACK_ACKED) == 0) {
+ /*
+ * Is it a TLP of interest?
+ */
+ if ((rsm->r_flags & RACK_TLP) &&
+ (rsm->r_rtr_cnt > 1)) {
+ /*
+ * We are splitting a rxt TLP, check
+ * if we need to save off the start/end
+ */
+ if (rack->rc_last_tlp_acked_set &&
+ (is_rsm_inside_declared_tlp_block(rack, rsm))) {
+ /*
+ * We already turned this on since we are inside
+ * the previous one was a partially sack now we
+ * are getting another one (maybe all of it).
+ */
+ rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
+ /*
+ * Lets make sure we have all of it though.
+ */
+ if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
+ rack->r_ctl.last_tlp_acked_start = rsm->r_start;
+ rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ }
+ if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
+ rack->r_ctl.last_tlp_acked_end = rsm->r_end;
+ rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ }
+ } else {
+ rack->r_ctl.last_tlp_acked_start = rsm->r_start;
+ rack->r_ctl.last_tlp_acked_end = rsm->r_end;
+ rack->rc_last_tlp_past_cumack = 0;
+ rack->rc_last_tlp_acked_set = 1;
+ rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
+ }
+ }
rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
changed += (rsm->r_end - rsm->r_start);
rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
@@ -8441,7 +8615,6 @@
rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
rsm->r_flags |= RACK_ACKED;
- rsm->r_flags &= ~RACK_TLP;
if (rsm->r_in_tmap) {
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 0;
@@ -8473,8 +8646,47 @@
* end |--|
*/
if ((rsm->r_flags & RACK_ACKED) == 0) {
+ /*
+ * Is it a TLP of interest?
+ */
+ if ((rsm->r_flags & RACK_TLP) &&
+ (rsm->r_rtr_cnt > 1)) {
+ /*
+ * We are splitting a rxt TLP, check
+ * if we need to save off the start/end
+ */
+ if (rack->rc_last_tlp_acked_set &&
+ (is_rsm_inside_declared_tlp_block(rack, rsm))) {
+ /*
+ * We already turned this on since we are inside
+ * the previous one was a partially sack now we
+ * are getting another one (maybe all of it).
+ */
+ rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
+ /*
+ * Lets make sure we have all of it though.
+ */
+ if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
+ rack->r_ctl.last_tlp_acked_start = rsm->r_start;
+ rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ }
+ if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
+ rack->r_ctl.last_tlp_acked_end = rsm->r_end;
+ rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ }
+ } else {
+ rack->r_ctl.last_tlp_acked_start = rsm->r_start;
+ rack->r_ctl.last_tlp_acked_end = rsm->r_end;
+ rack->rc_last_tlp_past_cumack = 0;
+ rack->rc_last_tlp_acked_set = 1;
+ rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
+ }
+ }
prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
- if (prev && (prev->r_flags & RACK_ACKED)) {
+ if (prev &&
+ (prev->r_flags & RACK_ACKED)) {
/**
* Goal, we want the right remainder of rsm to shrink
* in place and span from (rsm->r_start = end) to rsm->r_end.
@@ -8488,6 +8700,9 @@
* prev |----------| (acked)
* rsm |-----| (non-acked)
* nrsm |-| (temporary)
+ *
+ * Note if either prev/rsm is a TLP we don't
+ * do this.
*/
nrsm = &stack_map;
memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
@@ -8534,6 +8749,41 @@
/* failed rrs what can we do but loose the sack info? */
goto out;
}
+ if ((rsm->r_flags & RACK_TLP) &&
+ (rsm->r_rtr_cnt > 1)) {
+ /*
+ * We are splitting a rxt TLP, check
+ * if we need to save off the start/end
+ */
+ if (rack->rc_last_tlp_acked_set &&
+ (is_rsm_inside_declared_tlp_block(rack, rsm))) {
+ /*
+ * We already turned this on since this block is inside
+ * the previous one was a partially sack now we
+ * are getting another one (maybe all of it).
+ */
+ rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
+ /*
+ * Lets make sure we have all of it though.
+ */
+ if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
+ rack->r_ctl.last_tlp_acked_start = rsm->r_start;
+ rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ }
+ if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
+ rack->r_ctl.last_tlp_acked_end = rsm->r_end;
+ rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ }
+ } else {
+ rack->r_ctl.last_tlp_acked_start = rsm->r_start;
+ rack->r_ctl.last_tlp_acked_end = rsm->r_end;
+ rack->rc_last_tlp_acked_set = 1;
+ rack->rc_last_tlp_past_cumack = 0;
+ rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
+ }
+ }
/**
* In this case nrsm becomes
* nrsm->r_start = end;
@@ -8584,7 +8834,6 @@
rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
rsm->r_flags |= RACK_ACKED;
- rsm->r_flags &= ~RACK_TLP;
rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
if (rsm->r_in_tmap) {
TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
@@ -8599,7 +8848,9 @@
moved++;
}
out:
- if (rsm && (rsm->r_flags & RACK_ACKED)) {
+ if (rsm &&
+ ((rsm->r_flags & RACK_TLP) == 0) &&
+ (rsm->r_flags & RACK_ACKED)) {
/*
* Now can we merge where we worked
* with either the previous or
@@ -8607,22 +8858,26 @@
*/
next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
while (next) {
- if (next->r_flags & RACK_ACKED) {
+ if (next->r_flags & RACK_TLP)
+ break;
+ if (next->r_flags & RACK_ACKED) {
/* yep this and next can be merged */
- rsm = rack_merge_rsm(rack, rsm, next);
- next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
- } else
- break;
+ rsm = rack_merge_rsm(rack, rsm, next);
+ next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ } else
+ break;
}
/* Now what about the previous? */
prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
while (prev) {
- if (prev->r_flags & RACK_ACKED) {
- /* yep the previous and this can be merged */
- rsm = rack_merge_rsm(rack, prev, rsm);
- prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
- } else
- break;
+ if (prev->r_flags & RACK_TLP)
+ break;
+ if (prev->r_flags & RACK_ACKED) {
+ /* yep the previous and this can be merged */
+ rsm = rack_merge_rsm(rack, prev, rsm);
+ prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm);
+ } else
+ break;
}
}
if (used_ref == 0) {
@@ -8744,6 +8999,55 @@
* RTT's.
*/
rack->r_wanted_output = 1;
+
+ /* Tend any TLP that has been marked for 1/2 the seq space (its old) */
+ if ((rack->rc_last_tlp_acked_set == 1)&&
+ (rack->rc_last_tlp_past_cumack == 1) &&
+ (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) {
+ /*
+ * We have reached the point where our last rack
+ * tlp retransmit sequence is ahead of the cum-ack.
+ * This can only happen when the cum-ack moves all
+ * the way around (its been a full 2^^31+1 bytes
+ * or more since we sent a retransmitted TLP). Lets
+ * turn off the valid flag since its not really valid.
+ *
+ * Note since sack's also turn on this event we have
+ * a complication, we have to wait to age it out until
+ * the cum-ack is by the TLP before checking which is
+ * what the next else clause does.
+ */
+ rack_log_dsack_event(rack, 9, __LINE__,
+ rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ rack->rc_last_tlp_acked_set = 0;
+ rack->rc_last_tlp_past_cumack = 0;
+ } else if ((rack->rc_last_tlp_acked_set == 1) &&
+ (rack->rc_last_tlp_past_cumack == 0) &&
+ (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) {
+ /*
+ * It is safe to start aging TLP's out.
+ */
+ rack->rc_last_tlp_past_cumack = 1;
+ }
+ /* We do the same for the tlp send seq as well */
+ if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
+ (rack->rc_last_sent_tlp_past_cumack == 1) &&
+ (SEQ_GT(rack->r_ctl.last_sent_tlp_seq, th_ack))) {
+ rack_log_dsack_event(rack, 9, __LINE__,
+ rack->r_ctl.last_sent_tlp_seq,
+ (rack->r_ctl.last_sent_tlp_seq +
+ rack->r_ctl.last_sent_tlp_len));
+ rack->rc_last_sent_tlp_seq_valid = 0;
+ rack->rc_last_sent_tlp_past_cumack = 0;
+ } else if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
+ (rack->rc_last_sent_tlp_past_cumack == 0) &&
+ (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) {
+ /*
+ * It is safe to start aging TLP's send.
+ */
+ rack->rc_last_sent_tlp_past_cumack = 1;
+ }
more:
rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
if (rsm == NULL) {
@@ -8778,6 +9082,48 @@
return;
}
rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack);
+
+ /* Now was it a retransmitted TLP? */
+ if ((rsm->r_flags & RACK_TLP) &&
+ (rsm->r_rtr_cnt > 1)) {
+ /*
+ * Yes, this rsm was a TLP and retransmitted, remember that
+ * since if a DSACK comes back on this we don't want
+ * to think of it as a reordered segment. This may
+ * get updated again with possibly even other TLPs
+ * in flight, but thats ok. Only when we don't send
+ * a retransmitted TLP for 1/2 the sequences space
+ * will it get turned off (above).
+ */
+ if (rack->rc_last_tlp_acked_set &&
+ (is_rsm_inside_declared_tlp_block(rack, rsm))) {
+ /*
+ * We already turned this on since the end matches,
+ * the previous one was a partially ack now we
+ * are getting another one (maybe all of it).
+ */
+ rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
+ /*
+ * Lets make sure we have all of it though.
+ */
+ if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
+ rack->r_ctl.last_tlp_acked_start = rsm->r_start;
+ rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ }
+ if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
+ rack->r_ctl.last_tlp_acked_end = rsm->r_end;
+ rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
+ rack->r_ctl.last_tlp_acked_end);
+ }
+ } else {
+ rack->rc_last_tlp_past_cumack = 1;
+ rack->r_ctl.last_tlp_acked_start = rsm->r_start;
+ rack->r_ctl.last_tlp_acked_end = rsm->r_end;
+ rack->rc_last_tlp_acked_set = 1;
+ rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
+ }
+ }
/* Now do we consume the whole thing? */
if (SEQ_GEQ(th_ack, rsm->r_end)) {
/* Its all consumed. */
@@ -9059,12 +9405,44 @@
rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end)
{
- uint32_t am;
+ uint32_t am, l_end;
if (SEQ_GT(end, start))
am = end - start;
else
am = 0;
+ if ((rack->rc_last_tlp_acked_set ) &&
+ (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) &&
+ (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) {
+ /*
+ * The DSACK is because of a TLP which we don't
+ * do anything with the reordering window over since
+ * it was not reordering that caused the DSACK but
+ * our previous retransmit TLP.
+ */
+ rack_log_dsack_event(rack, 7, __LINE__, start, end);
+ goto skip_dsack_round;
+ }
+ if (rack->rc_last_sent_tlp_seq_valid) {
+ l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len;
+ if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) &&
+ (SEQ_LEQ(end, l_end))) {
+ /*
+ * This dsack is from the last sent TLP, ignore it
+ * for reordering purposes.
+ */
+ rack_log_dsack_event(rack, 7, __LINE__, start, end);
+ goto skip_dsack_round;
+ }
+ }
+ if (rack->rc_dsack_round_seen == 0) {
+ rack->rc_dsack_round_seen = 1;
+ rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max;
+ rack->r_ctl.num_dsack++;
+ rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */
+ rack_log_dsack_event(rack, 2, __LINE__, 0, 0);
+ }
+skip_dsack_round:
/*
* We keep track of how many DSACK blocks we get
* after a recovery incident.
@@ -9233,9 +9611,9 @@
SEQ_LEQ(sack.end, tp->snd_max)) {
sack_blocks[num_sack_blks] = sack;
num_sack_blks++;
-#ifdef NETFLIX_STATS
} else if (SEQ_LEQ(sack.start, th_ack) &&
SEQ_LEQ(sack.end, th_ack)) {
+#ifdef NETFLIX_STATS
/*
* Its a D-SACK block.
*/
@@ -9244,6 +9622,14 @@
rack_note_dsack(rack, sack.start, sack.end);
}
}
+ if (rack->rc_dsack_round_seen) {
+ /* Is the dsack roound over? */
+ if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) {
+ /* Yes it is */
+ rack->rc_dsack_round_seen = 0;
+ rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
+ }
+ }
/*
* Sort the SACK blocks so we can update the rack scoreboard with
* just one pass.
@@ -9823,7 +10209,9 @@
RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
}
- if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) {
+ if ((th->th_ack == tp->snd_una) &&
+ (tiwin == tp->snd_wnd) &&
+ ((to->to_flags & TOF_SACK) == 0)) {
rack_strike_dupack(rack);
dup_ack_struck = 1;
}
@@ -12043,9 +12431,7 @@
#ifdef INET
struct ip *ip = NULL;
#endif
-#if defined(INET) || defined(INET6)
struct udphdr *udp = NULL;
-#endif
/* Ok lets fill in the fast block, it can only be used with no IP options! */
#ifdef INET6
@@ -12069,7 +12455,6 @@
ip6, rack->r_ctl.fsb.th);
} else
#endif /* INET6 */
-#ifdef INET
{
rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr);
ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
@@ -12089,7 +12474,6 @@
tp->t_port,
ip, rack->r_ctl.fsb.th);
}
-#endif
rack->r_fsb_inited = 1;
}
@@ -12245,6 +12629,14 @@
rack->r_ctl.rc_time_of_last_probertt = us_cts;
rack->r_ctl.challenge_ack_ts = tcp_ts_getticks();
rack->r_ctl.rc_time_probertt_starts = 0;
+ if (rack_dsack_std_based & 0x1) {
+ /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
+ rack->rc_rack_tmr_std_based = 1;
+ }
+ if (rack_dsack_std_based & 0x2) {
+ /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */
+ rack->rc_rack_use_dsack = 1;
+ }
/* We require at least one measurement, even if the sysctl is 0 */
if (rack_req_measurements)
rack->r_ctl.req_measurements = rack_req_measurements;
@@ -13122,7 +13514,6 @@
}
} else if (ae->ack_val_set == ACK_DUPACK) {
/* Case D */
-
rack_strike_dupack(rack);
} else if (ae->ack_val_set == ACK_RWND) {
/* Case C */
@@ -13172,6 +13563,14 @@
rack->r_ctl.act_rcv_time = *tv;
}
rack_process_to_cumack(tp, rack, ae->ack, cts, to);
+ if (rack->rc_dsack_round_seen) {
+ /* Is the dsack roound over? */
+ if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) {
+ /* Yes it is */
+ rack->rc_dsack_round_seen = 0;
+ rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
+ }
+ }
}
}
/* And lets be sure to commit the rtt measurements for this ack */
@@ -13609,13 +14008,6 @@
* the scale is zero.
*/
tiwin = th->th_win << tp->snd_scale;
- /*
- * Parse options on any incoming segment.
- */
- memset(&to, 0, sizeof(to));
- tcp_dooptions(&to, (u_char *)(th + 1),
- (th->th_off << 2) - sizeof(struct tcphdr),
- (thflags & TH_SYN) ? TO_SYN : 0);
#ifdef TCP_ACCOUNTING
if (thflags & TH_ACK) {
/*
@@ -13639,6 +14031,13 @@
ctf_fixed_maxseg(tp));
}
#endif
+ /*
+ * Parse options on any incoming segment.
+ */
+ memset(&to, 0, sizeof(to));
+ tcp_dooptions(&to, (u_char *)(th + 1),
+ (th->th_off << 2) - sizeof(struct tcphdr),
+ (thflags & TH_SYN) ? TO_SYN : 0);
NET_EPOCH_ASSERT();
INP_WLOCK_ASSERT(tp->t_inpcb);
KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
@@ -13769,14 +14168,6 @@
#endif
return (1);
}
-
- /*
- * Parse options on any incoming segment.
- */
- tcp_dooptions(&to, (u_char *)(th + 1),
- (th->th_off << 2) - sizeof(struct tcphdr),
- (thflags & TH_SYN) ? TO_SYN : 0);
-
/*
* If timestamps were negotiated during SYN/ACK and a
* segment without a timestamp is received, silently drop
@@ -15230,7 +15621,7 @@
struct tcpopt to;
u_char opt[TCP_MAXOLEN];
uint32_t hdrlen, optlen;
- int32_t slot, segsiz, max_val, tso = 0, error = 0, flags, ulen = 0;
+ int32_t slot, segsiz, max_val, tso = 0, error, flags, ulen = 0;
uint32_t us_cts;
uint32_t if_hw_tsomaxsegcount = 0, startseq;
uint32_t if_hw_tsomaxsegsize;
@@ -15250,10 +15641,13 @@
if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
goto failed;
}
- if (rsm->r_flags & RACK_TLP)
- doing_tlp = 1;
- else if (doing_tlp)
+ if (doing_tlp) {
+ /* Its a TLP add the flag, it may already be there but be sure */
rsm->r_flags |= RACK_TLP;
+ } else {
+ /* If it was a TLP it is not not on this retransmit */
+ rsm->r_flags &= ~RACK_TLP;
+ }
startseq = rsm->r_start;
segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
inp = rack->rc_inp;
@@ -15555,8 +15949,15 @@
rack->rc_tlp_in_progress = 1;
rack->r_ctl.rc_tlp_cnt_out++;
}
- if (error == 0)
+ if (error == 0) {
tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls);
+ if (doing_tlp) {
+ rack->rc_last_sent_tlp_past_cumack = 0;
+ rack->rc_last_sent_tlp_seq_valid = 1;
+ rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
+ rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
+ }
+ }
tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
rack->forced_ack = 0; /* If we send something zap the FA flag */
if (IN_FASTRECOVERY(tp->t_flags) && rsm)
@@ -15710,7 +16111,7 @@
u_char opt[TCP_MAXOLEN];
uint32_t hdrlen, optlen;
int cnt_thru = 1;
- int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error = 0, flags, ulen = 0;
+ int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, flags, ulen = 0;
uint32_t us_cts, s_soff;
uint32_t if_hw_tsomaxsegcount = 0, startseq;
uint32_t if_hw_tsomaxsegsize;
@@ -16123,9 +16524,9 @@
long tot_len_this_send = 0;
#ifdef INET
struct ip *ip = NULL;
+#endif
#ifdef TCPDEBUG
struct ipovly *ipov = NULL;
-#endif
#endif
struct udphdr *udp = NULL;
struct tcp_rack *rack;
@@ -16134,10 +16535,7 @@
uint8_t mark = 0;
uint8_t wanted_cookie = 0;
u_char opt[TCP_MAXOLEN];
- unsigned ipoptlen, optlen, hdrlen;
-#if defined(INET) || defined(INET6)
- unsigned ulen=0;
-#endif
+ unsigned ipoptlen, optlen, hdrlen, ulen=0;
uint32_t rack_seq;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
@@ -16420,7 +16818,6 @@
/* Retransmit timer */
rsm = rack->r_ctl.rc_resend;
rack->r_ctl.rc_resend = NULL;
- rsm->r_flags &= ~RACK_TLP;
len = rsm->r_end - rsm->r_start;
sack_rxmit = 1;
sendalot = 0;
@@ -16433,7 +16830,6 @@
len = segsiz;
} else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
/* We have a retransmit that takes precedence */
- rsm->r_flags &= ~RACK_TLP;
if ((!IN_FASTRECOVERY(tp->t_flags)) &&
((tp->t_flags & TF_WASFRECOVERY) == 0)) {
/* Enter recovery if not induced by a time-out */
@@ -16476,8 +16872,8 @@
* went off.
*/
rsm = rack->r_ctl.rc_tlpsend;
+ /* We are doing a TLP make sure the flag is preent */
rsm->r_flags |= RACK_TLP;
-
rack->r_ctl.rc_tlpsend = NULL;
sack_rxmit = 1;
tlen = rsm->r_end - rsm->r_start;
@@ -17837,29 +18233,21 @@
#endif
if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
#ifdef INET6
- if (isipv6) {
+ if (isipv6)
ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
- } else
+ else
#endif /* INET6 */
- {
-#ifdef INET
ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
-#endif
- }
th = rack->r_ctl.fsb.th;
udp = rack->r_ctl.fsb.udp;
if (udp) {
#ifdef INET6
- if (isipv6) {
+ if (isipv6)
ulen = hdrlen + len - sizeof(struct ip6_hdr);
- } else
+ else
#endif /* INET6 */
- {
-#ifdef INET
ulen = hdrlen + len - sizeof(struct ip);
- udp->uh_ulen = htons(ulen);
-#endif
- }
+ udp->uh_ulen = htons(ulen);
}
} else {
#ifdef INET6
@@ -17878,7 +18266,6 @@
} else
#endif /* INET6 */
{
-#ifdef INET
ip = mtod(m, struct ip *);
#ifdef TCPDEBUG
ipov = (struct ipovly *)ip;
@@ -17893,7 +18280,6 @@
} else
th = (struct tcphdr *)(ip + 1);
tcpip_fillheaders(inp, tp->t_port, ip, th);
-#endif /* INET */
}
}
/*
@@ -17932,15 +18318,11 @@
if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
(sack_rxmit == 0)) {
#ifdef INET6
- if (isipv6) {
+ if (isipv6)
ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
- } else
+ else
#endif
- {
-#ifdef INET
ip->ip_tos |= IPTOS_ECN_ECT0;
-#endif
- }
KMOD_TCPSTAT_INC(tcps_ecn_ect0);
/*
* Reply with proper ECN notifications.
@@ -18045,9 +18427,7 @@
ip6 = mtod(m, struct ip6_hdr *);
else
#endif /* INET6 */
-#ifdef INET
ip = mtod(m, struct ip *);
-#endif /* INET */
th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
/* If we have a udp header lets set it into the mbuf as well */
if (udp)
@@ -18172,7 +18552,10 @@
else
log.u_bbr.flex8 = 1;
} else {
- log.u_bbr.flex8 = 0;
+ if (doing_tlp)
+ log.u_bbr.flex8 = 3;
+ else
+ log.u_bbr.flex8 = 0;
}
log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
log.u_bbr.flex7 = mark;
@@ -18298,6 +18681,12 @@
*/
if (error == 0) {
tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls);
+ if (rsm && doing_tlp) {
+ rack->rc_last_sent_tlp_past_cumack = 0;
+ rack->rc_last_sent_tlp_seq_valid = 1;
+ rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
+ rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
+ }
rack->forced_ack = 0; /* If we send something zap the FA flag */
if (rsm && (doing_tlp == 0)) {
/* Set we retransmitted */
@@ -18343,9 +18732,12 @@
rack->r_ctl.rc_prr_sndcnt = 0;
}
sub_from_prr = 0;
- if (doing_tlp && (rsm == NULL)) {
- /* New send doing a TLP */
+ if (doing_tlp) {
+ /* Make sure the TLP is added */
add_flag |= RACK_TLP;
+ } else if (rsm) {
+ /* If its a resend without TLP then it must not have the flag */
+ rsm->r_flags &= ~RACK_TLP;
}
rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
rack_to_usec_ts(&tv),
@@ -18962,6 +19354,14 @@
rack_set_cc_pacing(rack);
} else
rack->rc_always_pace = 0;
+ if (rack_dsack_std_based & 0x1) {
+ /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
+ rack->rc_rack_tmr_std_based = 1;
+ }
+ if (rack_dsack_std_based & 0x2) {
+ /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */
+ rack->rc_rack_use_dsack = 1;
+ }
if (rack_use_cmp_acks)
rack->r_use_cmp_ack = 1;
else
@@ -19051,6 +19451,20 @@
switch (sopt_name) {
+ case TCP_RACK_DSACK_OPT:
+ RACK_OPTS_INC(tcp_rack_dsack_opt);
+ if (optval & 0x1) {
+ rack->rc_rack_tmr_std_based = 1;
+ } else {
+ rack->rc_rack_tmr_std_based = 0;
+ }
+ if (optval & 0x2) {
+ rack->rc_rack_use_dsack = 1;
+ } else {
+ rack->rc_rack_use_dsack = 0;
+ }
+ rack_log_dsack_event(rack, 5, __LINE__, 0, 0);
+ break;
case TCP_RACK_PACING_BETA:
RACK_OPTS_INC(tcp_rack_beta);
if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
@@ -19761,6 +20175,7 @@
case TCP_REC_ABC_VAL: /* URL:reclabc */
case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */
case TCP_DEFER_OPTIONS: /* URL:defer */
+ case TCP_RACK_DSACK_OPT: /* URL:dsack */
case TCP_RACK_PACING_BETA: /* URL:pacing_beta */
case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */
case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */
@@ -19939,6 +20354,15 @@
error = EINVAL;
}
break;
+ case TCP_RACK_DSACK_OPT:
+ optval = 0;
+ if (rack->rc_rack_tmr_std_based) {
+ optval |= 1;
+ }
+ if (rack->rc_rack_use_dsack) {
+ optval |= 2;
+ }
+ break;
case TCP_FAST_RSM_HACK:
optval = rack->fast_rsm_hack;
break;
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -247,6 +247,7 @@
uint64_t tcp_rack_beta;
uint64_t tcp_rack_beta_ecn;
uint64_t tcp_rack_timer_slop;
+ uint64_t tcp_rack_dsack_opt;
};
/* RTT shrink reasons */
@@ -384,12 +385,11 @@
uint32_t rc_prr_sndcnt; /* Prr sndcnt Lock(a) */
uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */
- uint32_t xxx_rc_last_tlp_seq; /* Last tlp sequence Lock(a) */
+ uint32_t last_sent_tlp_seq; /* Last tlp sequence that was retransmitted Lock(a) */
uint32_t rc_prr_delivered; /* during recovery prr var Lock(a) */
uint16_t rc_tlp_cnt_out; /* count of times we have sent a TLP without new data */
- uint16_t xxx_rc_tlp_seg_send_cnt; /* Number of times we have TLP sent
- * rc_last_tlp_seq Lock(a) */
+ uint16_t last_sent_tlp_len; /* Number of bytes in the last sent tlp */
uint32_t rc_loss_count; /* How many bytes have been retransmitted
* Lock(a) */
@@ -464,6 +464,8 @@
uint32_t rc_entry_gp_rtt; /* Entry to PRTT gp-rtt */
uint32_t rc_loss_at_start; /* At measurement window where was our lost value */
+ uint32_t dsack_round_end; /* In a round of seeing a DSACK */
+ uint32_t num_dsack; /* Count of dsack's seen (1 per window)*/
uint32_t forced_ack_ts;
uint32_t rc_lower_rtt_us_cts; /* Time our GP rtt was last lowered */
uint32_t rc_time_probertt_entered;
@@ -485,6 +487,8 @@
int32_t rc_scw_index;
uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */
uint32_t rc_last_timeout_snduna;
+ uint32_t last_tlp_acked_start;
+ uint32_t last_tlp_acked_end;
uint32_t challenge_ack_ts;
uint32_t challenge_ack_cnt;
uint32_t rc_min_to; /* Socket option value Lock(a) */
@@ -503,6 +507,7 @@
*/
uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */
uint16_t rc_reorder_shift; /* Socket option value Lock(a) */
+ uint8_t dsack_persist;
uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */
uint8_t num_measurements; /* Number of measurements (up to 0xff, we freeze at 0xff) */
uint8_t req_measurements; /* How many measurements are required? */
@@ -552,8 +557,14 @@
* Note this only happens if the cc name is newreno (CCALGONAME_NEWRENO).
*/
- avail :2;
- uint8_t avail_bytes;
+ rc_rack_tmr_std_based :1,
+ rc_rack_use_dsack: 1;
+ uint8_t rc_dsack_round_seen: 1,
+ rc_last_tlp_acked_set: 1,
+ rc_last_tlp_past_cumack: 1,
+ rc_last_sent_tlp_seq_valid: 1,
+ rc_last_sent_tlp_past_cumack: 1,
+ avail_bytes : 3;
uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */
uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */
rtt_limit_mul : 4, /* muliply this by low rtt */

File Metadata

Mime Type
text/plain
Expires
Thu, Jul 2, 5:46 PM (39 m, 50 s)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
34600054
Default Alt Text
D31506.diff (35 KB)

Event Timeline