diff --git a/sys/netinet/cc/cc_cubic.h b/sys/netinet/cc/cc_cubic.h --- a/sys/netinet/cc/cc_cubic.h +++ b/sys/netinet/cc/cc_cubic.h @@ -93,12 +93,18 @@ int64_t K; /* Sum of RTT samples across an epoch in usecs. */ int64_t sum_rtt_usecs; - /* cwnd at the most recent congestion event. */ - unsigned long max_cwnd; - /* cwnd at the previous congestion event. */ - unsigned long prev_max_cwnd; - /* A copy of prev_max_cwnd. Used for CC_RTO_ERR */ - unsigned long prev_max_cwnd_cp; + /* Size of cwnd just before cwnd was reduced in the last congestion event */ + uint64_t W_max; + /* An estimate for the congestion window in the Reno-friendly region */ + uint64_t W_est; + /* The cwnd at the beginning of the current congestion avoidance stage */ + uint64_t cwnd_epoch; + /* + * Size of cwnd at the time of setting ssthresh most recently, + * either upon exiting the first slow start, or just before cwnd + * was reduced in the last congestion event + */ + uint64_t cwnd_prior; /* various flags */ uint32_t flags; /* Minimum observed rtt in usecs. */ @@ -107,14 +113,18 @@ int mean_rtt_usecs; /* ACKs since last congestion event. */ int epoch_ack_count; - /* Timestamp (in ticks) of arriving in congestion avoidance from last - * congestion event. - */ - int t_last_cong; - /* Timestamp (in ticks) of a previous congestion event. Used for - * CC_RTO_ERR. - */ - int t_last_cong_prev; + /* Timestamp (in ticks) at which the current CA epoch started. */ + int t_epoch; + /* Timestamp (in ticks) at which the previous CA epoch started. */ + int undo_t_epoch; + /* Few variables to restore the state after RTO_ERR */ + int64_t undo_K; + uint64_t undo_cwnd_prior; + uint64_t undo_W_max; + uint64_t undo_W_est; + uint64_t undo_cwnd_epoch; + /* Number of congestion events experienced */ + uint64_t num_cong_events; uint32_t css_baseline_minrtt; uint32_t css_current_round_minrtt; uint32_t css_lastround_minrtt; @@ -149,7 +159,7 @@ } static __inline unsigned long -theoretical_cubic_cwnd(int ticks_since_cong, unsigned long wmax, uint32_t smss) +theoretical_cubic_cwnd(int ticks_since_epoch, unsigned long wmax, uint32_t smss) { double C, wmax_pkts; @@ -157,25 +167,25 @@ wmax_pkts = wmax / (double)smss; return (smss * (wmax_pkts + - (C * pow(ticks_since_cong / (double)hz - + (C * pow(ticks_since_epoch / (double)hz - theoretical_cubic_k(wmax_pkts) / pow(2, CUBIC_SHIFT), 3.0)))); } static __inline unsigned long -theoretical_reno_cwnd(int ticks_since_cong, int rtt_ticks, unsigned long wmax, +theoretical_reno_cwnd(int ticks_since_epoch, int rtt_ticks, unsigned long wmax, uint32_t smss) { - return ((wmax * 0.5) + ((ticks_since_cong / (float)rtt_ticks) * smss)); + return ((wmax * 0.5) + ((ticks_since_epoch / (float)rtt_ticks) * smss)); } static __inline unsigned long -theoretical_tf_cwnd(int ticks_since_cong, int rtt_ticks, unsigned long wmax, +theoretical_tf_cwnd(int ticks_since_epoch, int rtt_ticks, unsigned long wmax, uint32_t smss) { return ((wmax * 0.7) + ((3 * 0.3) / (2 - 0.3) * - (ticks_since_cong / (float)rtt_ticks) * smss)); + (ticks_since_epoch / (float)rtt_ticks) * smss)); } #endif /* !_KERNEL */ @@ -222,14 +232,14 @@ * XXXLAS: Characterise bounds for overflow. */ static __inline unsigned long -cubic_cwnd(int usecs_since_cong, unsigned long wmax, uint32_t smss, int64_t K) +cubic_cwnd(int usecs_since_epoch, unsigned long wmax, uint32_t smss, int64_t K) { int64_t cwnd; /* K is in fixed point form with CUBIC_SHIFT worth of precision. */ /* t - K, with CUBIC_SHIFT worth of precision. 
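	 * Both operands are brought to a common form: usecs_since_epoch is
	 * in usecs, while K is in seconds with CUBIC_SHIFT bits of
	 * fraction, so K is scaled by hz * tick (ticks/sec * usecs/tick =
	 * 1000000 usecs/sec).  The final division by hz * tick leaves
	 * t - K in seconds, still with CUBIC_SHIFT bits of fraction.
	 * E.g. (hypothetical values) with hz = 1000, tick = 1000, t = 2 s
	 * and K = 1.5 s (384 in fixed point):
	 * (2000000 << 8 - 384 * 1000000) / 1000000 = 128, i.e. 0.5 << 8.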
*/ - cwnd = (((int64_t)usecs_since_cong << CUBIC_SHIFT) - (K * hz * tick)) / + cwnd = (((int64_t)usecs_since_epoch << CUBIC_SHIFT) - (K * hz * tick)) / (hz * tick); if (cwnd > CUBED_ROOT_MAX_ULONG) @@ -266,7 +276,7 @@ * XXX: Not used */ static __inline unsigned long -reno_cwnd(int usecs_since_cong, int rtt_usecs, unsigned long wmax, +reno_cwnd(int usecs_since_epoch, int rtt_usecs, unsigned long wmax, uint32_t smss) { @@ -275,7 +285,7 @@ * W_tcp(t) deals with cwnd/wmax in pkts, so because our cwnd is in * bytes, we have to multiply by smss. */ - return (((wmax * RENO_BETA) + (((usecs_since_cong * smss) + return (((wmax * RENO_BETA) + (((usecs_since_epoch * smss) << CUBIC_SHIFT) / rtt_usecs)) >> CUBIC_SHIFT); } @@ -287,13 +297,13 @@ * the value of cwnd at the last congestion event. */ static __inline unsigned long -tf_cwnd(int usecs_since_cong, int rtt_usecs, unsigned long wmax, +tf_cwnd(int usecs_since_epoch, int rtt_usecs, unsigned long wmax, uint32_t smss) { /* Equation 4 of I-D. */ return (((wmax * CUBIC_BETA) + - (((THREE_X_PT3 * (unsigned long)usecs_since_cong * + (((THREE_X_PT3 * (unsigned long)usecs_since_epoch * (unsigned long)smss) << CUBIC_SHIFT) / (TWO_SUB_PT3 * rtt_usecs))) >> CUBIC_SHIFT); } diff --git a/sys/netinet/cc/cc_cubic.c b/sys/netinet/cc/cc_cubic.c --- a/sys/netinet/cc/cc_cubic.c +++ b/sys/netinet/cc/cc_cubic.c @@ -239,8 +239,8 @@ cubic_ack_received(struct cc_var *ccv, uint16_t type) { struct cubic *cubic_data; - unsigned long w_tf, w_cubic_next; - int usecs_since_cong; + unsigned long W_est, W_cubic; + int usecs_since_epoch; cubic_data = ccv->cc_data; cubic_record_rtt(ccv); @@ -272,57 +272,56 @@ /* RFC8312 Section 4.7 */ cubic_data->flags &= ~(CUBICFLAG_RTO_EVENT | CUBICFLAG_IN_SLOWSTART); - cubic_data->max_cwnd = CCV(ccv, snd_cwnd); + cubic_data->W_max = CCV(ccv, snd_cwnd); cubic_data->K = 0; } else if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART | CUBICFLAG_IN_APPLIMIT)) { cubic_data->flags &= ~(CUBICFLAG_IN_SLOWSTART | CUBICFLAG_IN_APPLIMIT); - cubic_data->t_last_cong = ticks; - cubic_data->K = cubic_k(cubic_data->max_cwnd / + cubic_data->t_epoch = ticks; + cubic_data->K = cubic_k(cubic_data->W_max / CCV(ccv, t_maxseg)); } - usecs_since_cong = (ticks - cubic_data->t_last_cong) * tick; - if (usecs_since_cong < 0) { + usecs_since_epoch = (ticks - cubic_data->t_epoch) * tick; + if (usecs_since_epoch < 0) { /* - * dragging t_last_cong along + * dragging t_epoch along */ - usecs_since_cong = INT_MAX; - cubic_data->t_last_cong = ticks - INT_MAX; + usecs_since_epoch = INT_MAX; + cubic_data->t_epoch = ticks - INT_MAX; } /* * The mean RTT is used to best reflect the equations in * the I-D. Using min_rtt in the tf_cwnd calculation - * causes w_tf to grow much faster than it should if the + * causes W_est to grow much faster than it should if the * RTT is dominated by network buffering rather than * propagation delay. */ - w_tf = tf_cwnd(usecs_since_cong, cubic_data->mean_rtt_usecs, - cubic_data->max_cwnd, CCV(ccv, t_maxseg)); + W_est = tf_cwnd(usecs_since_epoch, cubic_data->mean_rtt_usecs, + cubic_data->W_max, CCV(ccv, t_maxseg)); - w_cubic_next = cubic_cwnd(usecs_since_cong + - cubic_data->mean_rtt_usecs, - cubic_data->max_cwnd, - CCV(ccv, t_maxseg), - cubic_data->K); + W_cubic = cubic_cwnd(usecs_since_epoch + + cubic_data->mean_rtt_usecs, + cubic_data->W_max, + CCV(ccv, t_maxseg), + cubic_data->K); ccv->flags &= ~CCF_ABC_SENTAWND; - if (w_cubic_next < w_tf) { + if (W_cubic < W_est) { /* * TCP-friendly region, follow tf * cwnd growth. 
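			 * While the cubic curve W_cubic is below the
			 * Reno-friendly estimate W_est, grow cwnd toward
			 * W_est (RFC 8312, Section 4.2) so that CUBIC is
			 * never less aggressive than standard TCP.  Note
			 * that snd_cwnd is only ever raised here, and is
			 * clamped to INT_MAX.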
*/ - if (CCV(ccv, snd_cwnd) < w_tf) - CCV(ccv, snd_cwnd) = ulmin(w_tf, INT_MAX); - } else if (CCV(ccv, snd_cwnd) < w_cubic_next) { + if (CCV(ccv, snd_cwnd) < W_est) + CCV(ccv, snd_cwnd) = ulmin(W_est, INT_MAX); + } else if (CCV(ccv, snd_cwnd) < W_cubic) { /* * Concave or convex region, follow CUBIC * cwnd growth. * Only update snd_cwnd, if it doesn't shrink. */ - CCV(ccv, snd_cwnd) = ulmin(w_cubic_next, - INT_MAX); + CCV(ccv, snd_cwnd) = ulmin(W_cubic, INT_MAX); } /* @@ -330,12 +329,12 @@ * new cwnd limit at the start of a connection * (happens when hostcache has a relevant entry), * keep updating our current estimate of the - * max_cwnd. + * W_max. */ if (((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) && - cubic_data->max_cwnd < CCV(ccv, snd_cwnd)) { - cubic_data->max_cwnd = CCV(ccv, snd_cwnd); - cubic_data->K = cubic_k(cubic_data->max_cwnd / + cubic_data->W_max < CCV(ccv, snd_cwnd)) { + cubic_data->W_max = CCV(ccv, snd_cwnd); + cubic_data->K = cubic_k(cubic_data->W_max / CCV(ccv, t_maxseg)); } } @@ -348,7 +347,7 @@ /* * This is a CUBIC specific implementation of after_idle. * - Reset cwnd by calling New Reno implementation of after_idle. - * - Reset t_last_cong. + * - Reset t_epoch. */ static void cubic_after_idle(struct cc_var *ccv) @@ -357,8 +356,8 @@ cubic_data = ccv->cc_data; - cubic_data->max_cwnd = ulmax(cubic_data->max_cwnd, CCV(ccv, snd_cwnd)); - cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg)); + cubic_data->W_max = ulmax(cubic_data->W_max, CCV(ccv, snd_cwnd)); + cubic_data->K = cubic_k(cubic_data->W_max / CCV(ccv, t_maxseg)); if ((cubic_data->flags & CUBICFLAG_HYSTART_ENABLED) == 0) { /* * Re-enable hystart if we have been idle. @@ -368,7 +367,7 @@ cubic_log_hystart_event(ccv, cubic_data, 12, CCV(ccv, snd_ssthresh)); } newreno_cc_after_idle(ccv); - cubic_data->t_last_cong = ticks; + cubic_data->t_epoch = ticks; } static void @@ -397,7 +396,7 @@ cubic_data = ptr; /* Init some key variables with sensible defaults. */ - cubic_data->t_last_cong = ticks; + cubic_data->t_epoch = ticks; cubic_data->min_rtt_usecs = TCPTV_SRTTBASE; cubic_data->mean_rtt_usecs = 1; @@ -441,8 +440,8 @@ if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { cubic_ssthresh_update(ccv, mss); cubic_data->flags |= CUBICFLAG_CONG_EVENT; - cubic_data->t_last_cong = ticks; - cubic_data->K = cubic_k(cubic_data->max_cwnd / mss); + cubic_data->t_epoch = ticks; + cubic_data->K = cubic_k(cubic_data->W_max / mss); } ENTER_RECOVERY(CCV(ccv, t_flags)); } @@ -458,8 +457,8 @@ if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { cubic_ssthresh_update(ccv, mss); cubic_data->flags |= CUBICFLAG_CONG_EVENT; - cubic_data->t_last_cong = ticks; - cubic_data->K = cubic_k(cubic_data->max_cwnd / mss); + cubic_data->t_epoch = ticks; + cubic_data->K = cubic_k(cubic_data->W_max / mss); CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } @@ -468,22 +467,36 @@ case CC_RTO: /* RFC8312 Section 4.7 */ if (CCV(ccv, t_rxtshift) == 1) { - cubic_data->t_last_cong_prev = cubic_data->t_last_cong; - cubic_data->prev_max_cwnd_cp = cubic_data->prev_max_cwnd; + /* + * Remember the state only for the first RTO event. This + * will help us restore the state to the values seen + * at the most recent congestion avoidance stage before + * the current RTO event. 
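+			 * Subsequent timeouts (t_rxtshift > 1) leave this
+			 * snapshot untouched, so the CC_RTO_ERR case below
+			 * can roll t_epoch, K, W_max, W_est, cwnd_epoch and
+			 * cwnd_prior back to their pre-RTO values should the
+			 * timeout turn out to be spurious.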
+ */ + cubic_data->undo_t_epoch = cubic_data->t_epoch; + cubic_data->undo_cwnd_epoch = cubic_data->cwnd_epoch; + cubic_data->undo_W_est = cubic_data->W_est; + cubic_data->undo_cwnd_prior = cubic_data->cwnd_prior; + cubic_data->undo_W_max = cubic_data->W_max; + cubic_data->undo_K = cubic_data->K; } cubic_data->flags |= CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT; - cubic_data->prev_max_cwnd = cubic_data->max_cwnd; - CCV(ccv, snd_ssthresh) = ((uint64_t)CCV(ccv, snd_cwnd) * + cubic_data->undo_W_max = cubic_data->W_max; + cubic_data->num_cong_events++; + CCV(ccv, snd_ssthresh) = ((uint64_t)CCV(ccv, snd_cwnd) * CUBIC_BETA) >> CUBIC_SHIFT; CCV(ccv, snd_cwnd) = mss; break; case CC_RTO_ERR: cubic_data->flags &= ~(CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT); - cubic_data->max_cwnd = cubic_data->prev_max_cwnd; - cubic_data->prev_max_cwnd = cubic_data->prev_max_cwnd_cp; - cubic_data->t_last_cong = cubic_data->t_last_cong_prev; - cubic_data->K = cubic_k(cubic_data->max_cwnd / mss); + cubic_data->num_cong_events--; + cubic_data->K = cubic_data->undo_K; + cubic_data->cwnd_prior = cubic_data->undo_cwnd_prior; + cubic_data->W_max = cubic_data->undo_W_max; + cubic_data->W_est = cubic_data->undo_W_est; + cubic_data->cwnd_epoch = cubic_data->undo_cwnd_epoch; + cubic_data->t_epoch = cubic_data->undo_t_epoch; break; } } @@ -496,11 +509,11 @@ cubic_data = ccv->cc_data; /* - * Ensure we have a sane initial value for max_cwnd recorded. Without + * Ensure we have a sane initial value for W_max recorded. Without * this here bad things happen when entries from the TCP hostcache * get used. */ - cubic_data->max_cwnd = CCV(ccv, snd_cwnd); + cubic_data->W_max = CCV(ccv, snd_cwnd); } static int @@ -542,8 +555,8 @@ CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + CCV(ccv, t_maxseg); else - /* Update cwnd based on beta and adjusted max_cwnd. */ - CCV(ccv, snd_cwnd) = max(((uint64_t)cubic_data->max_cwnd * + /* Update cwnd based on beta and adjusted W_max. */ + CCV(ccv, snd_cwnd) = max(((uint64_t)cubic_data->W_max * CUBIC_BETA) >> CUBIC_SHIFT, 2 * CCV(ccv, t_maxseg)); } @@ -619,21 +632,21 @@ cwnd = CCV(ccv, snd_cwnd); /* Fast convergence heuristic. */ - if (cwnd < cubic_data->max_cwnd) { + if (cwnd < cubic_data->W_max) { cwnd = ((uint64_t)cwnd * CUBIC_FC_FACTOR) >> CUBIC_SHIFT; } - cubic_data->prev_max_cwnd = cubic_data->max_cwnd; - cubic_data->max_cwnd = cwnd; + cubic_data->undo_W_max = cubic_data->W_max; + cubic_data->W_max = cwnd; /* * On the first congestion event, set ssthresh to cwnd * 0.5 - * and reduce max_cwnd to cwnd * beta. This aligns the cubic concave + * and reduce W_max to cwnd * beta. This aligns the cubic concave * region appropriately. On subsequent congestion events, set * ssthresh to cwnd * beta. */ if ((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) { ssthresh = cwnd >> 1; - cubic_data->max_cwnd = ((uint64_t)cwnd * + cubic_data->W_max = ((uint64_t)cwnd * CUBIC_BETA) >> CUBIC_SHIFT; } else { ssthresh = ((uint64_t)cwnd *
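
For illustration, here is a minimal userspace sketch of the fixed-point
multiplicative decrease performed in cubic_ssthresh_update() above, for the
subsequent-congestion-event case.  The constants are taken from cc_cubic.h;
the cwnd and W_max inputs are made-up example values, and the program itself
is not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Fixed-point constants, as defined in cc_cubic.h. */
#define	CUBIC_SHIFT	8	/* 8 bits of fraction */
#define	CUBIC_BETA	179	/* ~0.7 << CUBIC_SHIFT */
#define	CUBIC_FC_FACTOR	217	/* ~0.85 << CUBIC_SHIFT */

int
main(void)
{
	/* Hypothetical example inputs, in bytes. */
	uint64_t cwnd = 100000;		/* cwnd when the loss was detected */
	uint64_t W_max = 120000;	/* cwnd at the last congestion event */

	/*
	 * Fast convergence: losing again while still below the previous
	 * W_max means the flow should cede bandwidth, so remember a
	 * further-reduced window (cwnd * ~0.85) as the new W_max.
	 */
	if (cwnd < W_max)
		cwnd = (cwnd * CUBIC_FC_FACTOR) >> CUBIC_SHIFT;
	W_max = cwnd;

	/* Multiplicative decrease: ssthresh = cwnd * ~0.7, in fixed point. */
	uint64_t ssthresh = (cwnd * CUBIC_BETA) >> CUBIC_SHIFT;

	printf("W_max = %ju, ssthresh = %ju\n",
	    (uintmax_t)W_max, (uintmax_t)ssthresh);
	return (0);
}

Fixed-point arithmetic is used because kernel code cannot rely on the FPU;
the floating-point theoretical_*_cwnd() helpers in cc_cubic.h are compiled
only when _KERNEL is not defined and serve purely as reference
implementations for the fixed-point versions.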