diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c index a34eefcf0066..d85ad4e9f4fd 100644 --- a/sys/netinet/cc/cc.c +++ b/sys/netinet/cc/cc.c @@ -1,758 +1,753 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2007-2008 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart and * James Healy, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This software was first released in 2007 by James Healy and Lawrence Stewart * whilst working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Have a sane default if no CC_DEFAULT is specified in the kernel config file. */ #ifndef CC_DEFAULT #define CC_DEFAULT "cubic" #endif uint32_t hystart_minrtt_thresh = 4000; uint32_t hystart_maxrtt_thresh = 16000; uint32_t hystart_n_rttsamples = 8; uint32_t hystart_css_growth_div = 4; uint32_t hystart_css_rounds = 5; uint32_t hystart_bblogs = 0; MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory"); /* * List of available cc algorithms on the current system. First element * is used as the system default CC algorithm. */ struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); /* Protects the cc_list TAILQ. 
*/ struct rwlock cc_list_lock; VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL; VNET_DEFINE(uint32_t, newreno_beta) = 50; #define V_newreno_beta VNET(newreno_beta) VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80; /* * Take a reference on algo. Asserts that the cc_list lock is held. */ void cc_refer(struct cc_algo *algo) { CC_LIST_LOCK_ASSERT(); refcount_acquire(&algo->cc_refcount); } /* * Drop a reference on algo. Asserts that the cc_list lock is held. */ void cc_release(struct cc_algo *algo) { CC_LIST_LOCK_ASSERT(); refcount_release(&algo->cc_refcount); } /* * Make algo the congestion control algorithm of tp and take a reference * on it, all under the cc_list read lock. */ void cc_attach(struct tcpcb *tp, struct cc_algo *algo) { /* * Attach the tcpcb to the algorithm. */ CC_LIST_RLOCK(); CC_ALGO(tp) = algo; cc_refer(algo); CC_LIST_RUNLOCK(); } /* * Clear the algorithm pointer of tp and drop a reference on the algorithm * it pointed at, all under the cc_list read lock. */ void cc_detach(struct tcpcb *tp) { struct cc_algo *algo; CC_LIST_RLOCK(); algo = CC_ALGO(tp); CC_ALGO(tp) = NULL; cc_release(algo); CC_LIST_RUNLOCK(); } /* * Sysctl handler to show and change the default CC algorithm. * A read reports the name of the current default (an empty string when * there is none). A write makes the named algorithm the default and * fails with ESRCH when no entry of cc_list has that name or the * matching entry is flagged CC_MODULE_BEING_REMOVED. */ static int cc_default_algo(SYSCTL_HANDLER_ARGS) { char default_cc[TCP_CA_NAME_MAX]; struct cc_algo *funcs; int error; /* Get the current default: */ CC_LIST_RLOCK(); if (CC_DEFAULT_ALGO() != NULL) strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc)); else memset(default_cc, 0, TCP_CA_NAME_MAX); CC_LIST_RUNLOCK(); error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) goto done; error = ESRCH; /* Find algo with specified name and set it to default. */ CC_LIST_RLOCK(); STAILQ_FOREACH(funcs, &cc_list, entries) { if (strncmp(default_cc, funcs->name, sizeof(default_cc))) continue; if (funcs->flags & CC_MODULE_BEING_REMOVED) { /* It is being removed, so it is not eligible */ continue; } V_default_cc_ptr = funcs; error = 0; break; } CC_LIST_RUNLOCK(); done: return (error); } /* * Sysctl handler to display the list of available CC algorithms. 
*/ static int cc_list_available(SYSCTL_HANDLER_ARGS) { struct cc_algo *algo; int error, nalgos; int linesz; char *buffer, *cp; size_t bufsz, outsz; error = nalgos = 0; /* First pass: count the algorithms so the output buffer can be sized. */ CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) { nalgos++; } CC_LIST_RUNLOCK(); if (nalgos == 0) { return (ENOENT); } bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1); buffer = malloc(bufsz, M_TEMP, M_WAITOK); cp = buffer; linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D', "PCB count"); cp += linesz; bufsz -= linesz; outsz = linesz; /* Second pass: one line per algorithm, an asterisk marks the default. */ CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) { linesz = snprintf(cp, bufsz, "%-16s%c %u\n", algo->name, (algo == CC_DEFAULT_ALGO()) ? '*' : ' ', algo->cc_refcount); /* The line did not fit in what is left of the buffer. */ if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } CC_LIST_RUNLOCK(); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } /* * Return the number of VNETs whose default algorithm has the same name * as remove_cc. Asserts that the cc_list lock is held. */ static int cc_check_default(struct cc_algo *remove_cc) { int cnt = 0; VNET_ITERATOR_DECL(vnet_iter); CC_LIST_LOCK_ASSERT(); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if ((CC_DEFAULT_ALGO() != NULL) && strncmp(CC_DEFAULT_ALGO()->name, remove_cc->name, TCP_CA_NAME_MAX) == 0) { cnt++; } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); return (cnt); } /* * Initialise CC subsystem on system boot. */ static void cc_init(void) { CC_LIST_LOCK_INIT(); STAILQ_INIT(&cc_list); } /* * Remove remove_cc from cc_list. Does no locking of its own. * Returns 0 on success, ENOENT if remove_cc is not on cc_list, and EBUSY * if it is still the default somewhere or still has references. */ static int cc_deregister_algo_locked(struct cc_algo *remove_cc) { struct cc_algo *funcs; int found = 0; /* This is unlikely to fail */ STAILQ_FOREACH(funcs, &cc_list, entries) { if (funcs == remove_cc) found = 1; } if (found == 0) { /* Nothing to remove? 
*/ return (ENOENT); } /* We assert it should have been MOD_QUIESCE'd */ KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED), ("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc)); if (cc_check_default(remove_cc)) { return(EBUSY); } if (remove_cc->cc_refcount != 0) { return (EBUSY); } /* Remove algo from cc_list so that new connections can't use it. */ STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries); return (0); } /* * Calls cc_deregister_algo_locked() with the cc_list write lock held. * Returns 0 on success, non-zero on failure. */ int cc_deregister_algo(struct cc_algo *remove_cc) { int ret; CC_LIST_WLOCK(); ret = cc_deregister_algo_locked(remove_cc); CC_LIST_WUNLOCK(); return (ret); } /* * Add add_cc to the tail of cc_list under the cc_list write lock. * Returns 0 on success, EEXIST if the same structure or another one with * the same name is already registered. */ int cc_register_algo(struct cc_algo *add_cc) { struct cc_algo *funcs; int err; err = 0; /* * Iterate over list of registered CC algorithms and make sure * we're not trying to add a duplicate. */ CC_LIST_WLOCK(); STAILQ_FOREACH(funcs, &cc_list, entries) { if (funcs == add_cc || strncmp(funcs->name, add_cc->name, TCP_CA_NAME_MAX) == 0) { err = EEXIST; break; } } /* Init its reference count */ if (err == 0) refcount_init(&add_cc->cc_refcount, 0); /* * The first loaded congestion control module will become * the default until we find the "CC_DEFAULT" defined in * the config (if we do). */ if (!err) { STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); if (strcmp(add_cc->name, CC_DEFAULT) == 0) { V_default_cc_ptr = add_cc; } else if (V_default_cc_ptr == NULL) { V_default_cc_ptr = add_cc; } } CC_LIST_WUNLOCK(); return (err); } /* * For every VNET other than the default one, copy the default algorithm * pointer currently set in vnet0. */ static void vnet_cc_sysinit(void *arg) { struct cc_algo *cc; if (IS_DEFAULT_VNET(curvnet)) return; CURVNET_SET(vnet0); cc = V_default_cc_ptr; CURVNET_RESTORE(); V_default_cc_ptr = cc; } VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_cc_sysinit, NULL); /* * Perform any necessary tasks before we exit congestion recovery. 
*/ void newreno_cc_post_recovery(struct cc_var *ccv) { int pipe; uint32_t mss = tcp_fixed_maxseg(ccv->tp); /* Nothing is changed unless the connection is in fast recovery. */ if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { /* * Fast recovery will conclude after returning from this * function. Window inflation should have left us with * approximately snd_ssthresh outstanding data. But in case we * would be inclined to send a burst, better to do it via the * slow start mechanism. - * - * XXXLAS: Find a way to do this without needing curack */ - if (V_tcp_do_newsack) - pipe = tcp_compute_pipe(ccv->tp); - else - pipe = CCV(ccv, snd_max) - ccv->curack; + pipe = tcp_compute_pipe(ccv->tp); if (pipe < CCV(ccv, snd_ssthresh)) /* * Ensure that cwnd does not collapse to 1 MSS under * adverse conditions. Implements RFC6582 */ CCV(ccv, snd_cwnd) = max(pipe, mss) + mss; else CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); } } /* * Adjust ssthresh and cwnd of a connection that has been idle; the * reasoning is given in the comment inside the function. */ void newreno_cc_after_idle(struct cc_var *ccv) { uint32_t rw; /* * If we've been idle for more than one retransmit timeout the old * congestion window is no longer current and we have to reduce it to * the restart window before we can transmit again. * * The restart window is the initial window or the last CWND, whichever * is smaller. * * This is done to prevent us from flooding the path with a full CWND at * wirespeed, overloading router and switch buffers along the way. * * See RFC5681 Section 4.1. "Restarting Idle Connections". * * In addition, per RFC2861 Section 2, the ssthresh is set to the * maximum of the former ssthresh or 3/4 of the old cwnd, to * not exit slow-start prematurely. */ rw = tcp_compute_initwnd(tcp_fixed_maxseg(ccv->tp)); CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2)); CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); } /* * Get a new congestion window size on a multiplicative decrease event. 
* */ u_int newreno_cc_cwnd_on_multiplicative_decrease(struct cc_var *ccv, uint32_t mss) { uint32_t cwin, factor; cwin = CCV(ccv, snd_cwnd); /* * Other TCP congestion controls use newreno_cc_cong_signal(), but * with their own private cc_data. Make sure the cc_data is used * correctly. */ factor = V_newreno_beta; /* * Take factor percent of cwnd, round it down to whole segments and * never return fewer than two segments. */ return max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 2) * mss; } /* * Perform any necessary tasks before we enter congestion recovery. * CC_NDUPACK and CC_ECN lower ssthresh to the multiplicative decrease * window (CC_ECN also sets cwnd to it) unless the matching recovery state * is already active; CC_RTO sets cwnd to one segment. Other signal types * are ignored. */ void newreno_cc_cong_signal(struct cc_var *ccv, ccsignal_t type) { uint32_t cwin, mss, pipe; mss = tcp_fixed_maxseg(ccv->tp); /* Catch algos which mistakenly leak private signal types. */ KASSERT((type & CC_SIGPRIVMASK) == 0, ("%s: congestion signal type 0x%08x is private\n", __func__, type)); cwin = newreno_cc_cwnd_on_multiplicative_decrease(ccv, mss); switch (type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) CCV(ccv, snd_ssthresh) = cwin; ENTER_RECOVERY(CCV(ccv, t_flags)); } break; case CC_ECN: if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { CCV(ccv, snd_ssthresh) = cwin; CCV(ccv, snd_cwnd) = cwin; ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } break; case CC_RTO: /* ssthresh is only recomputed when t_rxtshift is 1. */ if (CCV(ccv, t_rxtshift) == 1) { pipe = tcp_compute_pipe(ccv->tp); CCV(ccv, snd_ssthresh) = max(2, min(CCV(ccv, snd_wnd), pipe) / 2 / mss) * mss; } CCV(ccv, snd_cwnd) = mss; break; default: break; } } /* * Return the congestion window to use after an in-order ACK while in * congestion avoidance. Only computes the value; snd_cwnd is not written * here. */ u_int newreno_cc_cwnd_in_cong_avoid(struct cc_var *ccv) { u_int cw = CCV(ccv, snd_cwnd); u_int incr = tcp_fixed_maxseg(ccv->tp); KASSERT(cw > CCV(ccv, snd_ssthresh), ("congestion control state not in congestion avoidance\n")); /* * Regular in-order ACK, open the congestion window. * The congestion control state we're in is congestion avoidance. * * Check if ABC (RFC 3465) is enabled. * cong avoid: cwnd > ssthresh * * cong avoid and ABC (RFC 3465): * Grow cwnd linearly by maxseg per RTT for each * cwnd worth of ACKed data. 
* * cong avoid without ABC (RFC 5681): * Grow cwnd linearly by approximately maxseg per RTT using * maxseg^2 / cwnd per ACK as the increment. * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to * avoid capping cwnd. */ if (V_tcp_do_rfc3465) { if (ccv->flags & CCF_ABC_SENTAWND) ccv->flags &= ~CCF_ABC_SENTAWND; else incr = 0; } else incr = max((incr * incr / cw), 1); /* ABC is on by default, so incr equals 0 frequently. */ if (incr > 0) return min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); else return cw; } /* * Return the congestion window to use after an in-order ACK while in * slow start. Only computes the value; snd_cwnd is not written here. */ u_int newreno_cc_cwnd_in_slow_start(struct cc_var *ccv) { u_int cw = CCV(ccv, snd_cwnd); u_int mss = tcp_fixed_maxseg(ccv->tp); u_int incr = mss; KASSERT(cw <= CCV(ccv, snd_ssthresh), ("congestion control state not in slow start\n")); /* * Regular in-order ACK, open the congestion window. * The congestion control state we're in is slow start. * * slow start: cwnd <= ssthresh * * slow start and ABC (RFC 3465): * Grow cwnd exponentially by the amount of data * ACKed capping the max increment per ACK to * (abc_l_var * maxseg) bytes. * * slow start without ABC (RFC 5681): * Grow cwnd exponentially by maxseg per ACK. */ if (V_tcp_do_rfc3465) { /* * In slow-start with ABC enabled and no RTO in sight? * (Must not use abc_l_var > 1 if slow starting after * an RTO. On RTO, snd_nxt = snd_una, so the * snd_nxt == snd_max check is sufficient to * handle this). * * XXXLAS: Find a way to signal SS after RTO that * doesn't rely on tcpcb vars. */ uint16_t abc_val; if (ccv->flags & CCF_USE_LOCAL_ABC) abc_val = ccv->labc; else abc_val = V_tcp_abc_l_var; if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) incr = min(ccv->bytes_this_ack, ccv->nsegs * abc_val * mss); else incr = min(ccv->bytes_this_ack, mss); } /* ABC is on by default, so incr equals 0 frequently. 
*/ if (incr > 0) return min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); else return cw; } /* * For a CC_ACK outside recovery while the connection is cwnd limited, * set snd_cwnd from the congestion avoidance helper when cwnd is above * ssthresh and from the slow start helper otherwise. */ void newreno_cc_ack_received(struct cc_var *ccv, ccsignal_t type) { if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED)) { if (CCV(ccv, snd_cwnd) > CCV(ccv, snd_ssthresh)) { CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_cong_avoid(ccv); } else { CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_slow_start(ccv); } } } /* * Flag algo with CC_MODULE_BEING_REMOVED under the cc_list write lock. * Returns EBUSY, without setting the flag, when algo is a default. */ static int cc_stop_new_assignments(struct cc_algo *algo) { CC_LIST_WLOCK(); if (cc_check_default(algo)) { /* A default cannot be removed */ CC_LIST_WUNLOCK(); return (EBUSY); } algo->flags |= CC_MODULE_BEING_REMOVED; CC_LIST_WUNLOCK(); return (0); } /* * Handles kld related events. Returns 0 on success, non-zero on failure. * data is used as the struct cc_algo of the module. */ int cc_modevent(module_t mod, int event_type, void *data) { struct cc_algo *algo; int err; err = 0; algo = (struct cc_algo *)data; switch(event_type) { case MOD_LOAD: if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) { /* * A module must have a cc_data_sz function * even if it has no data it should return 0. */ printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n"); err = EINVAL; break; } if (algo->mod_init != NULL) err = algo->mod_init(); if (!err) err = cc_register_algo(algo); break; case MOD_SHUTDOWN: break; case MOD_QUIESCE: /* Stop any new assignments */ err = cc_stop_new_assignments(algo); break; case MOD_UNLOAD: /* * Deregister and remove the module from the list */ CC_LIST_WLOCK(); /* Even with -f we can't unload if it is the default */ if (cc_check_default(algo)) { /* A default cannot be removed */ CC_LIST_WUNLOCK(); return (EBUSY); } /* * If -f was used and users are still attached to * the algorithm things are going to go boom. 
*/ err = cc_deregister_algo_locked(algo); CC_LIST_WUNLOCK(); /* mod_destroy is only called after a successful deregistration. */ if ((err == 0) && (algo->mod_destroy != NULL)) { algo->mod_destroy(); } break; default: err = EINVAL; break; } return (err); } SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); /* Declare sysctl tree and populate it. */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Congestion control related settings"); SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, cc_default_algo, "A", "Default congestion control algorithm"); SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, cc_list_available, "A", "List available congestion control algorithms"); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hystartplusplus, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "New Reno related HyStart++ settings"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, minrtt_thresh, CTLFLAG_RW, &hystart_minrtt_thresh, 4000, "HyStarts++ minimum RTT thresh used in clamp (in microseconds)"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, maxrtt_thresh, CTLFLAG_RW, &hystart_maxrtt_thresh, 16000, "HyStarts++ maximum RTT thresh used in clamp (in microseconds)"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, n_rttsamples, CTLFLAG_RW, &hystart_n_rttsamples, 8, "The number of RTT samples that must be seen to consider HyStart++"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_growth_div, CTLFLAG_RW, &hystart_css_growth_div, 4, "The divisor to the growth when in Hystart++ CSS"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_rounds, CTLFLAG_RW, &hystart_css_rounds, 5, "The number of rounds HyStart++ lasts in CSS before falling to CA"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, bblogs, CTLFLAG_RW, &hystart_bblogs, 0, "Do we enable HyStart++ Black Box logs to be generated if BB logging is on"); VNET_DEFINE(int, cc_do_abe) = 0; 
SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cc_do_abe), 0, "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)"); VNET_DEFINE(int, cc_abe_frlossreduce) = 0; SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cc_abe_frlossreduce), 0, "Apply standard beta instead of ABE-beta during ECN-signalled congestion " "recovery episodes if loss also needs to be repaired"); diff --git a/sys/netinet/cc/cc_cubic.c b/sys/netinet/cc/cc_cubic.c index 82b0d9b7fa99..a2e72130fa88 100644 --- a/sys/netinet/cc/cc_cubic.c +++ b/sys/netinet/cc/cc_cubic.c @@ -1,728 +1,722 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2008-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Lawrence Stewart while studying at the Centre * for Advanced Internet Architectures, Swinburne University of Technology, made * possible in part by a grant from the Cisco University Research Program Fund * at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * An implementation of the CUBIC congestion control algorithm for FreeBSD, * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha. * Originally released as part of the NewTCP research project at Swinburne * University of Technology's Centre for Advanced Internet Architectures, * Melbourne, Australia, which was made possible in part by a grant from the * Cisco University Research Program Fund at Community Foundation Silicon * Valley. 
More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void cubic_ack_received(struct cc_var *ccv, ccsignal_t type); static void cubic_cb_destroy(struct cc_var *ccv); static int cubic_cb_init(struct cc_var *ccv, void *ptr); static void cubic_cong_signal(struct cc_var *ccv, ccsignal_t type); static void cubic_conn_init(struct cc_var *ccv); static int cubic_mod_init(void); static void cubic_post_recovery(struct cc_var *ccv); static void cubic_record_rtt(struct cc_var *ccv); static void cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg); static void cubic_after_idle(struct cc_var *ccv); static size_t cubic_data_sz(void); static void cubic_newround(struct cc_var *ccv, uint32_t round_cnt); static void cubic_rttsample(struct cc_var *ccv, uint32_t usec_rtt, uint32_t rxtcnt, uint32_t fas); /* Method table of the cubic algorithm. */ struct cc_algo cubic_cc_algo = { .name = "cubic", .ack_received = cubic_ack_received, .cb_destroy = cubic_cb_destroy, .cb_init = cubic_cb_init, .cong_signal = cubic_cong_signal, .conn_init = cubic_conn_init, .mod_init = cubic_mod_init, .post_recovery = cubic_post_recovery, .after_idle = cubic_after_idle, .cc_data_sz = cubic_data_sz, .rttsample = cubic_rttsample, .newround = cubic_newround }; /* * Emit a TCP_HYSTART black box log record describing the HyStart++ state * in cubicd. Does nothing when hystart_bblogs is 0 or black box logging * is off for the connection. */ static void cubic_log_hystart_event(struct cc_var *ccv, struct cubic *cubicd, uint8_t mod, uint32_t flex1) { /* * Types of logs (mod value) * 1 - rtt_thresh in flex1, checking to see if RTT is too great. * 2 - rtt is too great, rtt_thresh in flex1. * 3 - CSS is active incr in flex1 * 4 - A new round is beginning flex1 is round count * 5 - A new RTT measurement flex1 is the new measurement. * 6 - We enter CA ssthresh is also in flex1. * 7 - Socket option to change hystart executed opt.val in flex1. 
* 8 - Back out of CSS into SS, flex1 is the css_baseline_minrtt * 9 - We enter CA, via an ECN mark. * 10 - We enter CA, via a loss. * 11 - We have slipped out of SS into CA via cwnd growth. * 12 - After idle has re-enabled hystart++ */ struct tcpcb *tp; if (hystart_bblogs == 0) return; tp = ccv->tp; if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = flex1; log.u_bbr.flex2 = cubicd->css_current_round_minrtt; log.u_bbr.flex3 = cubicd->css_lastround_minrtt; log.u_bbr.flex4 = cubicd->css_rttsample_count; log.u_bbr.flex5 = cubicd->css_entered_at_round; log.u_bbr.flex6 = cubicd->css_baseline_minrtt; /* We only need bottom 16 bits of flags */ log.u_bbr.flex7 = cubicd->flags & 0x0000ffff; log.u_bbr.flex8 = mod; log.u_bbr.epoch = cubicd->css_current_round; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.lt_epoch = cubicd->css_fas_at_css_entry; log.u_bbr.pkts_out = cubicd->css_last_fas; log.u_bbr.delivered = cubicd->css_lowrtt_fas; log.u_bbr.pkt_epoch = ccv->flags; TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, TCP_HYSTART, 0, 0, &log, false, &tv); } } /* * Slow start growth of snd_cwnd for CUBIC. While HyStart++ is allowed and * enabled and CSS has not been entered yet, the per-round minimum RTTs * are compared to decide whether to enter CSS; once in CSS the increment * is divided by hystart_css_growth_div. */ static void cubic_does_slow_start(struct cc_var *ccv, struct cubic *cubicd) { /* * In slow-start with ABC enabled and no RTO in sight? * (Must not use abc_l_var > 1 if slow starting after * an RTO. On RTO, snd_nxt = snd_una, so the * snd_nxt == snd_max check is sufficient to * handle this). * * XXXLAS: Find a way to signal SS after RTO that * doesn't rely on tcpcb vars. */ u_int cw = CCV(ccv, snd_cwnd); uint32_t mss = tcp_fixed_maxseg(ccv->tp); u_int incr = mss; uint16_t abc_val; cubicd->flags |= CUBICFLAG_IN_SLOWSTART; if (ccv->flags & CCF_USE_LOCAL_ABC) abc_val = ccv->labc; else abc_val = V_tcp_abc_l_var; if ((ccv->flags & CCF_HYSTART_ALLOWED) && (cubicd->flags & CUBICFLAG_HYSTART_ENABLED) && ((cubicd->flags & CUBICFLAG_HYSTART_IN_CSS) == 0)) { /* * Hystart is allowed and still enabled and we are not yet * in CSS. 
Lets check to see if we can make a decision on * if we need to go into CSS. */ if ((cubicd->css_rttsample_count >= hystart_n_rttsamples) && (cubicd->css_current_round_minrtt != 0xffffffff) && (cubicd->css_lastround_minrtt != 0xffffffff)) { uint32_t rtt_thresh; /* Clamp (minrtt_thresh, lastround/8, maxrtt_thresh) */ rtt_thresh = (cubicd->css_lastround_minrtt >> 3); if (rtt_thresh < hystart_minrtt_thresh) rtt_thresh = hystart_minrtt_thresh; if (rtt_thresh > hystart_maxrtt_thresh) rtt_thresh = hystart_maxrtt_thresh; cubic_log_hystart_event(ccv, cubicd, 1, rtt_thresh); if (cubicd->css_current_round_minrtt >= (cubicd->css_lastround_minrtt + rtt_thresh)) { /* Enter CSS */ cubicd->flags |= CUBICFLAG_HYSTART_IN_CSS; cubicd->css_fas_at_css_entry = cubicd->css_lowrtt_fas; /* * The draft (v4) calls for us to set baseline to css_current_round_min * but that can cause an oscillation. We probably should be using * css_lastround_minrtt, but the authors insist that will cause * issues on exiting early. We will leave the draft version for now * but I suspect this is incorrect. */ cubicd->css_baseline_minrtt = cubicd->css_current_round_minrtt; cubicd->css_entered_at_round = cubicd->css_current_round; cubic_log_hystart_event(ccv, cubicd, 2, rtt_thresh); } } } if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) incr = min(ccv->bytes_this_ack, ccv->nsegs * abc_val * mss); else incr = min(ccv->bytes_this_ack, mss); /* Only if Hystart is enabled will the flag get set */ if (cubicd->flags & CUBICFLAG_HYSTART_IN_CSS) { incr /= hystart_css_growth_div; cubic_log_hystart_event(ccv, cubicd, 3, incr); } /* ABC is on by default, so incr equals 0 frequently. 
*/ if (incr > 0) CCV(ccv, snd_cwnd) = min((cw + incr), TCP_MAXWIN << CCV(ccv, snd_scale)); } /* * ACK handler. Records the RTT, then for a CC_ACK outside recovery while * cwnd limited either slow starts (cwnd <= ssthresh, or min_rtt_usecs * still at TCPTV_SRTTBASE) or sets cwnd from the TCP-friendly or cubic * window. A CC_ACK outside recovery that is not cwnd limited only sets * CUBICFLAG_IN_APPLIMIT. */ static void cubic_ack_received(struct cc_var *ccv, ccsignal_t type) { struct cubic *cubic_data; unsigned long W_est, W_cubic; int usecs_since_epoch; uint32_t mss = tcp_fixed_maxseg(ccv->tp); cubic_data = ccv->cc_data; cubic_record_rtt(ccv); /* * For a regular ACK and we're not in cong/fast recovery and * we're cwnd limited, always recalculate cwnd. */ if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED)) { /* Use the logic in NewReno ack_received() for slow start. */ if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) || cubic_data->min_rtt_usecs == TCPTV_SRTTBASE) { cubic_does_slow_start(ccv, cubic_data); } else { if (cubic_data->flags & CUBICFLAG_HYSTART_IN_CSS) { /* * We have slipped into CA with * CSS active. Deactivate all. */ /* Turn off the CSS flag */ cubic_data->flags &= ~CUBICFLAG_HYSTART_IN_CSS; /* Disable use of CSS in the future except long idle */ cubic_data->flags &= ~CUBICFLAG_HYSTART_ENABLED; cubic_log_hystart_event(ccv, cubic_data, 11, CCV(ccv, snd_ssthresh)); } if ((cubic_data->flags & CUBICFLAG_RTO_EVENT) && (cubic_data->flags & CUBICFLAG_IN_SLOWSTART)) { /* RFC8312 Section 4.7 */ cubic_data->flags &= ~(CUBICFLAG_RTO_EVENT | CUBICFLAG_IN_SLOWSTART); cubic_data->W_max = CCV(ccv, snd_cwnd); cubic_data->t_epoch = ticks; cubic_data->K = 0; } else if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART | CUBICFLAG_IN_APPLIMIT)) { cubic_data->flags &= ~(CUBICFLAG_IN_SLOWSTART | CUBICFLAG_IN_APPLIMIT); cubic_data->t_epoch = ticks; cubic_data->K = cubic_k(cubic_data->W_max / mss); } usecs_since_epoch = (ticks - cubic_data->t_epoch) * tick; if (usecs_since_epoch < 0) { /* * dragging t_epoch along */ usecs_since_epoch = INT_MAX; cubic_data->t_epoch = ticks - INT_MAX; } W_est = tf_cwnd(ccv); /* * The mean RTT is used to best reflect the equations in * the I-D. 
*/ W_cubic = cubic_cwnd(usecs_since_epoch + cubic_data->mean_rtt_usecs, cubic_data->W_max, mss, cubic_data->K); if (W_cubic < W_est) { /* * TCP-friendly region, follow tf * cwnd growth. */ CCV(ccv, snd_cwnd) = ulmin(W_est, INT_MAX); cubic_data->flags |= CUBICFLAG_IN_TF; } else if (CCV(ccv, snd_cwnd) < W_cubic) { /* * Concave or convex region, follow CUBIC * cwnd growth. * Only update snd_cwnd, if it doesn't shrink. */ CCV(ccv, snd_cwnd) = ulmin(W_cubic, INT_MAX); cubic_data->flags &= ~CUBICFLAG_IN_TF; } /* * If we're not in slow start and we're probing for a * new cwnd limit at the start of a connection * (happens when hostcache has a relevant entry), * keep updating our current estimate of the * W_max. */ if (((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) && cubic_data->W_max < CCV(ccv, snd_cwnd)) { cubic_data->W_max = CCV(ccv, snd_cwnd); cubic_data->K = cubic_k(cubic_data->W_max / mss); } } } else if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && !(ccv->flags & CCF_CWND_LIMITED)) { cubic_data->flags |= CUBICFLAG_IN_APPLIMIT; } } /* * This is a CUBIC specific implementation of after_idle. * - Raise W_max to the current cwnd if that is larger and recompute K. * - Re-enable HyStart++ if it had been disabled. * - Reset cwnd by calling New Reno implementation of after_idle. * - Reset t_epoch. */ static void cubic_after_idle(struct cc_var *ccv) { struct cubic *cubic_data; cubic_data = ccv->cc_data; cubic_data->W_max = ulmax(cubic_data->W_max, CCV(ccv, snd_cwnd)); cubic_data->K = cubic_k(cubic_data->W_max / tcp_fixed_maxseg(ccv->tp)); if ((cubic_data->flags & CUBICFLAG_HYSTART_ENABLED) == 0) { /* * Re-enable hystart if we have been idle. 
*/ cubic_data->flags &= ~CUBICFLAG_HYSTART_IN_CSS; cubic_data->flags |= CUBICFLAG_HYSTART_ENABLED; cubic_log_hystart_event(ccv, cubic_data, 12, CCV(ccv, snd_ssthresh)); } newreno_cc_after_idle(ccv); cubic_data->t_epoch = ticks; } /* Free the per-connection state stored in ccv->cc_data. */ static void cubic_cb_destroy(struct cc_var *ccv) { free(ccv->cc_data, M_CC_MEM); } /* Size in bytes of the per-connection state, struct cubic. */ static size_t cubic_data_sz(void) { return (sizeof(struct cubic)); } /* * Initialise the per-connection state and store it in ccv->cc_data. * The state lives in ptr when ptr is not NULL, otherwise in zeroed memory * allocated here. Returns 0 on success, ENOMEM if that allocation fails. */ static int cubic_cb_init(struct cc_var *ccv, void *ptr) { struct cubic *cubic_data; INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { cubic_data = malloc(sizeof(struct cubic), M_CC_MEM, M_NOWAIT|M_ZERO); if (cubic_data == NULL) return (ENOMEM); } else cubic_data = ptr; /* Init some key variables with sensible defaults. */ cubic_data->t_epoch = ticks; cubic_data->min_rtt_usecs = TCPTV_SRTTBASE; cubic_data->mean_rtt_usecs = 1; ccv->cc_data = cubic_data; cubic_data->flags = CUBICFLAG_HYSTART_ENABLED; /* At init set both to infinity */ cubic_data->css_lastround_minrtt = 0xffffffff; cubic_data->css_current_round_minrtt = 0xffffffff; cubic_data->css_current_round = 0; cubic_data->css_baseline_minrtt = 0xffffffff; cubic_data->css_rttsample_count = 0; cubic_data->css_entered_at_round = 0; cubic_data->css_fas_at_css_entry = 0; cubic_data->css_lowrtt_fas = 0; cubic_data->css_last_fas = 0; return (0); } /* * Perform any necessary tasks before we enter congestion recovery. 
*/
/*
 * Behaviour per signal type:
 *  - CC_NDUPACK: turn the HyStart flags off if HyStart was enabled.  Unless
 *    already in fast recovery, enter recovery; if additionally not yet in
 *    congestion recovery, first update ssthresh/W_max, flag a congestion
 *    event, restart the epoch and recompute K.
 *  - CC_ECN: turn the HyStart flags off if HyStart was enabled.  Unless
 *    already in congestion recovery, update ssthresh/W_max, flag a
 *    congestion event, restart the epoch, recompute K, set cwnd to ssthresh
 *    and enter congestion recovery.
 *  - CC_RTO: on the first timeout only (t_rxtshift == 1) save the epoch
 *    state in the undo_* members and derive ssthresh from
 *    min(snd_wnd, pipe) scaled by CUBIC_BETA (at least 2 segments, a
 *    multiple of mss).  Always flag the RTO and set cwnd to one mss.
 *  - CC_RTO_ERR: clear the congestion/RTO flags and restore the epoch state
 *    from the undo_* members.
 * Any other signal type is ignored.
 */
static void
cubic_cong_signal(struct cc_var *ccv, ccsignal_t type)
{
	struct cubic *cubic_data;
	uint32_t mss, pipe;

	cubic_data = ccv->cc_data;
	mss = tcp_fixed_maxseg(ccv->tp);

	switch (type) {
	case CC_NDUPACK:
		if (cubic_data->flags & CUBICFLAG_HYSTART_ENABLED) {
			/* Make sure the flags are all off; we had a loss. */
			cubic_data->flags &= ~CUBICFLAG_HYSTART_ENABLED;
			cubic_data->flags &= ~CUBICFLAG_HYSTART_IN_CSS;
			cubic_log_hystart_event(ccv, cubic_data, 10,
			    CCV(ccv, snd_ssthresh));
		}
		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
				cubic_ssthresh_update(ccv, mss);
				cubic_data->flags |= CUBICFLAG_CONG_EVENT;
				cubic_data->t_epoch = ticks;
				cubic_data->K = cubic_k(cubic_data->W_max / mss);
			}
			ENTER_RECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_ECN:
		if (cubic_data->flags & CUBICFLAG_HYSTART_ENABLED) {
			/* Make sure the flags are all off; we had a loss. */
			cubic_data->flags &= ~CUBICFLAG_HYSTART_ENABLED;
			cubic_data->flags &= ~CUBICFLAG_HYSTART_IN_CSS;
			cubic_log_hystart_event(ccv, cubic_data, 9,
			    CCV(ccv, snd_ssthresh));
		}
		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
			cubic_ssthresh_update(ccv, mss);
			cubic_data->flags |= CUBICFLAG_CONG_EVENT;
			cubic_data->t_epoch = ticks;
			cubic_data->K = cubic_k(cubic_data->W_max / mss);
			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_RTO:
		/* RFC8312 Section 4.7 */
		if (CCV(ccv, t_rxtshift) == 1) {
			/*
			 * Remember the state only for the first RTO event. This
			 * will help us restore the state to the values seen
			 * at the most recent congestion avoidance stage before
			 * the current RTO event.
			 */
			cubic_data->undo_t_epoch = cubic_data->t_epoch;
			cubic_data->undo_cwnd_epoch = cubic_data->cwnd_epoch;
			cubic_data->undo_W_max = cubic_data->W_max;
			cubic_data->undo_K = cubic_data->K;
			pipe = tcp_compute_pipe(ccv->tp);
			CCV(ccv, snd_ssthresh) = max(2,
			    (((uint64_t)min(CCV(ccv, snd_wnd),
			    pipe) * CUBIC_BETA) >> CUBIC_SHIFT) / mss) * mss;
		}
		cubic_data->flags |= CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT;
		CCV(ccv, snd_cwnd) = mss;
		break;

	case CC_RTO_ERR:
		/* Undo what the CC_RTO case above recorded. */
		cubic_data->flags &= ~(CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT);
		cubic_data->K = cubic_data->undo_K;
		cubic_data->W_max = cubic_data->undo_W_max;
		cubic_data->cwnd_epoch = cubic_data->undo_cwnd_epoch;
		cubic_data->t_epoch = cubic_data->undo_t_epoch;
		break;

	default:
		break;
	}
}

/*
 * Seed W_max from the connection's current cwnd.
 */
static void
cubic_conn_init(struct cc_var *ccv)
{
	struct cubic *cubic_data;

	cubic_data = ccv->cc_data;

	/*
	 * Ensure we have a sane initial value for W_max recorded. Without
	 * this here bad things happen when entries from the TCP hostcache
	 * get used.
	 */
	cubic_data->W_max = CCV(ccv, snd_cwnd);
}

/*
 * Module initialisation hook; there is nothing to set up, so it always
 * returns 0.
 */
static int
cubic_mod_init(void)
{
	return (0);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 *
 * When leaving fast recovery, cwnd is reset either from the pipe estimate
 * (pipe below ssthresh) or from W_max scaled by CUBIC_BETA.  Independently
 * of that, the epoch mean RTT is recomputed when samples were collected and
 * the per-epoch accumulators are cleared.
 *
 * NOTE: the lines starting with '-' or '+' inside this function are
 * unified-diff markers carried by this patch text.  The '-' lines (old code)
 * chose between tcp_compute_pipe() and snd_max - curack depending on
 * V_tcp_do_newsack; the '+' line (new code) always uses tcp_compute_pipe().
 */
static void
cubic_post_recovery(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	int pipe;
	uint32_t mss = tcp_fixed_maxseg(ccv->tp);

	cubic_data = ccv->cc_data;
	pipe = 0;

	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in
		 * the NewReno RFC. Otherwise, use the CUBIC method.
-		 *
-		 * XXXLAS: Find a way to do this without needing curack
		 */
-		if (V_tcp_do_newsack)
-			pipe = tcp_compute_pipe(ccv->tp);
-		else
-			pipe = CCV(ccv, snd_max) - ccv->curack;
-
+		pipe = tcp_compute_pipe(ccv->tp);
		if (pipe < CCV(ccv, snd_ssthresh))
			/*
			 * Ensure that cwnd does not collapse to 1 MSS under
			 * adverse conditions. Implements RFC6582
			 */
			CCV(ccv, snd_cwnd) = max(pipe, mss) + mss;
		else
			/* Update cwnd based on beta and adjusted W_max. */
			CCV(ccv, snd_cwnd) = max(((uint64_t)cubic_data->W_max *
			    CUBIC_BETA) >> CUBIC_SHIFT,
			    2 * mss);
	}

	/* Calculate the average RTT between congestion epochs. */
	if (cubic_data->epoch_ack_count > 0 &&
	    cubic_data->sum_rtt_usecs >= cubic_data->epoch_ack_count) {
		cubic_data->mean_rtt_usecs = (int)(cubic_data->sum_rtt_usecs /
		    cubic_data->epoch_ack_count);
	}

	/* Start the next epoch with empty RTT accumulators. */
	cubic_data->epoch_ack_count = 0;
	cubic_data->sum_rtt_usecs = 0;
}

/*
 * Record the min RTT and sum samples for the epoch average RTT calculation.
 */
static void
cubic_record_rtt(struct cc_var *ccv)
{
	struct cubic *cubic_data;
	uint32_t t_srtt_usecs;

	/* Ignore srtt until a min number of samples have been taken. */
	if (CCV(ccv, t_rttupdated) >= CUBIC_MIN_RTT_SAMPLES) {
		cubic_data = ccv->cc_data;
		t_srtt_usecs = tcp_get_srtt(ccv->tp,
		    TCP_TMR_GRANULARITY_USEC);
		/*
		 * Record the current SRTT as our minrtt if it's the smallest
		 * we've seen or minrtt is currently equal to its initialised
		 * value.
		 *
		 * XXXLAS: Should there be some hysteresis for minrtt?
		 */
		if ((t_srtt_usecs < cubic_data->min_rtt_usecs ||
		    cubic_data->min_rtt_usecs == TCPTV_SRTTBASE)) {
			/*
			 * A minimal rtt is a single unshifted tick of a ticks
			 * timer.
			 */
			cubic_data->min_rtt_usecs = max(tick >> TCP_RTT_SHIFT,
			    t_srtt_usecs);

			/*
			 * If the connection is within its first congestion
			 * epoch, ensure we prime mean_rtt_usecs with a
			 * reasonable value until the epoch average RTT is
			 * calculated in cubic_post_recovery().
			 */
			if (cubic_data->min_rtt_usecs >
			    cubic_data->mean_rtt_usecs)
				cubic_data->mean_rtt_usecs =
				    cubic_data->min_rtt_usecs;
		}

		/* Sum samples for epoch average RTT calculation. */
		cubic_data->sum_rtt_usecs += t_srtt_usecs;
		cubic_data->epoch_ack_count++;
	}
}

/*
 * Update the ssthresh in the event of congestion.
 *
 * maxseg is the segment size; it provides the 2 * maxseg floor applied to
 * ssthresh and is handed to the NewReno helper in the TCP-friendly case.
 * As a side effect the previous W_max is saved in undo_W_max and W_max is
 * replaced by the (possibly fast-convergence reduced) current cwnd.
 */
static void
cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg)
{
	struct cubic *cubic_data;
	uint32_t ssthresh;
	uint32_t cwnd;

	cubic_data = ccv->cc_data;
	cwnd = CCV(ccv, snd_cwnd);

	/* Fast convergence heuristic. */
	if (cwnd < cubic_data->W_max) {
		cwnd = ((uint64_t)cwnd * CUBIC_FC_FACTOR) >> CUBIC_SHIFT;
	}
	cubic_data->undo_W_max = cubic_data->W_max;
	cubic_data->W_max = cwnd;

	if (cubic_data->flags & CUBICFLAG_IN_TF) {
		/* If in the TCP friendly region, follow what newreno does */
		ssthresh = newreno_cc_cwnd_on_multiplicative_decrease(ccv, maxseg);

	} else if ((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) {
		/*
		 * On the first congestion event, set ssthresh to cwnd * 0.5
		 * and reduce W_max to cwnd * beta. This aligns the cubic
		 * concave region appropriately.
		 */
		ssthresh = cwnd >> 1;
		cubic_data->W_max = ((uint64_t)cwnd *
		    CUBIC_BETA) >> CUBIC_SHIFT;
	} else {
		/*
		 * On subsequent congestion events, set ssthresh to cwnd * beta.
		 */
		ssthresh = ((uint64_t)cwnd *
		    CUBIC_BETA) >> CUBIC_SHIFT;
	}
	/* Never let ssthresh drop below two segments. */
	CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * maxseg);
}

/*
 * Feed one RTT measurement into the HyStart per-round bookkeeping.
 *
 * usec_rtt is the RTT sample (presumably in microseconds, going by the
 * name -- confirm).  Samples with rxtcnt > 1 are ignored.  fas is stored in
 * css_last_fas and, when the sample is a new minimum for the current round,
 * in css_lowrtt_fas as well.  Once hystart_n_rttsamples samples have been
 * seen in the round and the round minimum is below css_baseline_minrtt, the
 * CSS flag is cleared and the baseline is reset.
 */
static void
cubic_rttsample(struct cc_var *ccv, uint32_t usec_rtt, uint32_t rxtcnt, uint32_t fas)
{
	struct cubic *cubicd;

	cubicd = ccv->cc_data;
	if (rxtcnt > 1) {
		/*
		 * Only look at RTT's that are non-ambiguous.
		 */
		return;
	}
	cubicd->css_rttsample_count++;
	cubicd->css_last_fas = fas;
	if (cubicd->css_current_round_minrtt > usec_rtt) {
		cubicd->css_current_round_minrtt = usec_rtt;
		cubicd->css_lowrtt_fas = cubicd->css_last_fas;
	}
	if ((cubicd->css_rttsample_count >= hystart_n_rttsamples) &&
	    (cubicd->css_current_round_minrtt != 0xffffffff) &&
	    (cubicd->css_current_round_minrtt < cubicd->css_baseline_minrtt) &&
	    (cubicd->css_lastround_minrtt != 0xffffffff)) {
		/*
		 * We were in CSS and the RTT is now less, we
		 * entered CSS erroneously.
		 */
		cubicd->flags &= ~CUBICFLAG_HYSTART_IN_CSS;
		cubic_log_hystart_event(ccv, cubicd, 8, cubicd->css_baseline_minrtt);
		cubicd->css_baseline_minrtt = 0xffffffff;
	}
	if (cubicd->flags & CUBICFLAG_HYSTART_ENABLED)
		cubic_log_hystart_event(ccv, cubicd, 5, usec_rtt);
}

/*
 * Start a new HyStart round numbered round_cnt.
 *
 * The minimum RTT of the round that just ended becomes
 * css_lastround_minrtt and the per-round sample state is reset.  If CSS has
 * been active for at least hystart_css_rounds rounds, then either
 * (CCF_HYSTART_CAN_SH_CWND set) ssthresh and cwnd are pulled back to the
 * recorded flight sizes and the CSS round count restarts, or ssthresh is
 * set to the current cwnd and HyStart is switched off.
 */
static void
cubic_newround(struct cc_var *ccv, uint32_t round_cnt)
{
	struct cubic *cubicd;

	cubicd = ccv->cc_data;
	/* We have entered a new round */
	cubicd->css_lastround_minrtt = cubicd->css_current_round_minrtt;
	cubicd->css_current_round_minrtt = 0xffffffff;
	cubicd->css_rttsample_count = 0;
	cubicd->css_current_round = round_cnt;
	if ((cubicd->flags & CUBICFLAG_HYSTART_IN_CSS) &&
	    ((round_cnt - cubicd->css_entered_at_round) >= hystart_css_rounds)) {
		/* Enter CA */
		if (ccv->flags & CCF_HYSTART_CAN_SH_CWND) {
			/*
			 * We engage more than snd_ssthresh, engage
			 * the brakes!! Though we will stay in SS to
			 * creep back up again, so lets leave CSS active
			 * and give us hystart_css_rounds more rounds.
			 */
			if (ccv->flags & CCF_HYSTART_CONS_SSTH) {
				CCV(ccv, snd_ssthresh) = ((cubicd->css_lowrtt_fas + cubicd->css_fas_at_css_entry) / 2);
			} else {
				CCV(ccv, snd_ssthresh) = cubicd->css_lowrtt_fas;
			}
			CCV(ccv, snd_cwnd) = cubicd->css_fas_at_css_entry;
			cubicd->css_entered_at_round = round_cnt;
		} else {
			CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
			/* Turn off the CSS flag */
			cubicd->flags &= ~CUBICFLAG_HYSTART_IN_CSS;
			/* Disable use of CSS in the future except long idle */
			cubicd->flags &= ~CUBICFLAG_HYSTART_ENABLED;
		}
		cubic_log_hystart_event(ccv, cubicd, 6, CCV(ccv, snd_ssthresh));
	}
	if (cubicd->flags & CUBICFLAG_HYSTART_ENABLED)
		cubic_log_hystart_event(ccv, cubicd, 4, round_cnt);
}

DECLARE_CC_MODULE(cubic, &cubic_cc_algo);
MODULE_VERSION(cubic, 2);
diff --git a/sys/netinet/cc/cc_htcp.c b/sys/netinet/cc/cc_htcp.c
index ab6165f2e720..569495144d50 100644
--- a/sys/netinet/cc/cc_htcp.c
+++ b/sys/netinet/cc/cc_htcp.c
@@ -1,549 +1,543 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2007-2008
 *	Swinburne University of
Technology, Melbourne, Australia * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart and * James Healy, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/

/*
 * An implementation of the H-TCP congestion control algorithm for FreeBSD,
 * based on the Internet Draft "draft-leith-tcp-htcp-06.txt" by Leith and
 * Shorten. Originally released as part of the NewTCP research project at
 * Swinburne University of Technology's Centre for Advanced Internet
 * Architectures, Melbourne, Australia, which was made possible in part by a
 * grant from the Cisco University Research Program Fund at Community Foundation
 * Silicon Valley. More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

/*
 * NOTE(review): the header names that belong to the #include directives
 * below are missing from this text.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* Fixed point math shifts. */
#define HTCP_SHIFT 8
#define HTCP_ALPHA_INC_SHIFT 4

/* Value alpha is given in htcp_cb_init(). */
#define HTCP_INIT_ALPHA 1
#define HTCP_DELTA_L hz		/* 1 sec in ticks. */
#define HTCP_MINBETA 128	/* 0.5 << HTCP_SHIFT. */
#define HTCP_MAXBETA 204	/* ~0.8 << HTCP_SHIFT. */
/* Bounds applied to the RTT scaling ratio in htcp_recalc_alpha(). */
#define HTCP_MINROWE 26		/* ~0.1 << HTCP_SHIFT. */
#define HTCP_MAXROWE 512	/* 2 << HTCP_SHIFT. */

/* RTT_ref (ms) used in the calculation of alpha if RTT scaling is enabled. */
#define HTCP_RTT_REF 100

/* Don't trust SRTT until this many samples have been taken. */
#define HTCP_MIN_RTT_SAMPLES 8

/*
 * HTCP_CALC_ALPHA performs a fixed point math calculation to determine the
 * value of alpha, based on the function defined in the HTCP spec.
 *
 * i.e. 1 + 10(delta - delta_l) + ((delta - delta_l) / 2) ^ 2
 *
 * "diff" is passed in to the macro as "delta - delta_l" and is expected to be
 * in units of ticks.
 *
 * The joyousness of fixed point maths means our function implementation looks a
 * little funky...
 *
 * In order to maintain some precision in the calculations, a fixed point shift
 * HTCP_ALPHA_INC_SHIFT is used to ensure the integer divisions don't
 * truncate the results too badly.
*
 * The "16" value is the "1" term in the alpha function shifted up by
 * HTCP_ALPHA_INC_SHIFT
 *
 * The "160" value is the "10" multiplier in the alpha function multiplied by
 * 2^HTCP_ALPHA_INC_SHIFT
 *
 * Specifying these as constants reduces the computations required. After
 * up-shifting all the terms in the function and performing the required
 * calculations, we down-shift the final result by HTCP_ALPHA_INC_SHIFT to
 * ensure it is back in the correct range.
 *
 * The "hz" terms are required as kernels can be configured to run with
 * different tick timers, which we have to adjust for in the alpha calculation
 * (which originally was defined in terms of seconds).
 *
 * We also have to be careful to constrain the value of diff such that it won't
 * overflow whilst performing the calculation. The middle term i.e. (160 * diff)
 * / hz is the limiting factor in the calculation. We must constrain diff to be
 * less than the max size of an int divided by the constant 160 figure
 * i.e. diff < INT_MAX / 160
 *
 * NB: Changing HTCP_ALPHA_INC_SHIFT will require you to MANUALLY update the
 * constants used in this function!
 */
#define HTCP_CALC_ALPHA(diff) \
((\
	(16) + \
	((160 * (diff)) / hz) + \
	(((diff) / hz) * (((diff) << HTCP_ALPHA_INC_SHIFT) / (4 * hz))) \
) >> HTCP_ALPHA_INC_SHIFT)

static void	htcp_ack_received(struct cc_var *ccv, ccsignal_t type);
static void	htcp_cb_destroy(struct cc_var *ccv);
static int	htcp_cb_init(struct cc_var *ccv, void *ptr);
static void	htcp_cong_signal(struct cc_var *ccv, ccsignal_t type);
static int	htcp_mod_init(void);
static void	htcp_post_recovery(struct cc_var *ccv);
static void	htcp_recalc_alpha(struct cc_var *ccv);
static void	htcp_recalc_beta(struct cc_var *ccv);
static void	htcp_record_rtt(struct cc_var *ccv);
static void	htcp_ssthresh_update(struct cc_var *ccv);
static size_t	htcp_data_sz(void);

/* Per-connection H-TCP state, stored in ccv->cc_data. */
struct htcp {
	/* cwnd before entering cong recovery. */
	unsigned long	prev_cwnd;
	/* cwnd additive increase parameter. */
	int		alpha;
	/* cwnd multiplicative decrease parameter. */
	int		beta;
	/* Largest rtt seen for the flow. */
	int		maxrtt;
	/* Shortest rtt seen for the flow. */
	int		minrtt;
	/* Time of last congestion event in ticks. */
	int		t_last_cong;
};

/* HTCP_RTT_REF converted to t_srtt units; computed in htcp_mod_init(). */
static int htcp_rtt_ref;
/*
 * The maximum number of ticks the value of diff can reach in
 * htcp_recalc_alpha() before alpha will stop increasing due to overflow.
 * See comment above HTCP_CALC_ALPHA for more info.
 */
static int htcp_max_diff = INT_MAX / ((1 << HTCP_ALPHA_INC_SHIFT) * 10);

/* Per-netstack vars. */
VNET_DEFINE_STATIC(u_int, htcp_adaptive_backoff) = 0;
VNET_DEFINE_STATIC(u_int, htcp_rtt_scaling) = 0;
#define	V_htcp_adaptive_backoff    VNET(htcp_adaptive_backoff)
#define	V_htcp_rtt_scaling    VNET(htcp_rtt_scaling)

/*
 * Hook table for the "htcp" algorithm.  after_idle reuses the NewReno
 * implementation; every other hook is defined in this file.
 */
struct cc_algo htcp_cc_algo = {
	.name = "htcp",
	.ack_received = htcp_ack_received,
	.cb_destroy = htcp_cb_destroy,
	.cb_init = htcp_cb_init,
	.cong_signal = htcp_cong_signal,
	.mod_init = htcp_mod_init,
	.post_recovery = htcp_post_recovery,
	.cc_data_sz = htcp_data_sz,
	.after_idle = newreno_cc_after_idle,
};

/*
 * Per-ACK processing: record the RTT and, for a qualifying ACK (see the
 * condition below), recompute beta and alpha and grow cwnd.  Growth is
 * delegated to NewReno while in slow start or while alpha is 1; otherwise
 * cwnd grows by alpha segments (ABC enabled) or by roughly alpha/cwnd
 * segments (ABC disabled).
 */
static void
htcp_ack_received(struct cc_var *ccv, ccsignal_t type)
{
	struct htcp *htcp_data;
	uint32_t mss = tcp_fixed_maxseg(ccv->tp);

	htcp_data = ccv->cc_data;
	htcp_record_rtt(ccv);

	/*
	 * Regular ACK and we're not in cong/fast recovery and we're cwnd
	 * limited and we're either not doing ABC or are slow starting or are
	 * doing ABC and we've sent a cwnd's worth of bytes.
	 */
	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
	    (ccv->flags & CCF_CWND_LIMITED) && (!V_tcp_do_rfc3465 ||
	    CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
	    (V_tcp_do_rfc3465 && ccv->flags & CCF_ABC_SENTAWND))) {
		htcp_recalc_beta(ccv);
		htcp_recalc_alpha(ccv);
		/*
		 * Use the logic in NewReno ack_received() for slow start and
		 * for the first HTCP_DELTA_L ticks after either the flow starts
		 * or a congestion event (when alpha equals 1).
		 */
		if (htcp_data->alpha == 1 ||
		    CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh))
			newreno_cc_ack_received(ccv, type);
		else {
			if (V_tcp_do_rfc3465) {
				/* Increment cwnd by alpha segments. */
				CCV(ccv, snd_cwnd) += htcp_data->alpha * mss;
				ccv->flags &= ~CCF_ABC_SENTAWND;
			} else
				/*
				 * Increment cwnd by alpha/cwnd segments to
				 * approximate an increase of alpha segments
				 * per RTT.
				 */
				CCV(ccv, snd_cwnd) += (((htcp_data->alpha <<
				    HTCP_SHIFT) / (max(1,
				    CCV(ccv, snd_cwnd) / mss))) * mss)
				    >> HTCP_SHIFT;
		}
	}
}

/*
 * Free the per-connection H-TCP state referenced by ccv->cc_data.
 */
static void
htcp_cb_destroy(struct cc_var *ccv)
{
	free(ccv->cc_data, M_CC_MEM);
}

/*
 * Return the size in bytes of the per-connection H-TCP state.
 */
static size_t
htcp_data_sz(void)
{
	return(sizeof(struct htcp));
}

/*
 * Initialise the per-connection H-TCP state and attach it to ccv->cc_data.
 *
 * ptr is either NULL, in which case a struct htcp is allocated with
 * M_NOWAIT, or memory supplied by the caller that is used in place.
 * The allocation is not zeroed; every member of struct htcp is assigned
 * explicitly below.  Returns 0 on success and ENOMEM if the allocation
 * fails.
 */
static int
htcp_cb_init(struct cc_var *ccv, void *ptr)
{
	struct htcp *htcp_data;

	INP_WLOCK_ASSERT(tptoinpcb(ccv->tp));
	if (ptr == NULL) {
		htcp_data = malloc(sizeof(struct htcp), M_CC_MEM, M_NOWAIT);
		if (htcp_data == NULL)
			return (ENOMEM);
	} else
		htcp_data = ptr;

	/* Init some key variables with sensible defaults. */
	htcp_data->alpha = HTCP_INIT_ALPHA;
	htcp_data->beta = HTCP_MINBETA;
	htcp_data->maxrtt = TCPTV_SRTTBASE;
	htcp_data->minrtt = TCPTV_SRTTBASE;
	htcp_data->prev_cwnd = 0;
	htcp_data->t_last_cong = ticks;

	ccv->cc_data = htcp_data;

	return (0);
}

/*
 * Perform any necessary tasks before we enter congestion recovery.
 *
 * NOTE(review): in the CC_NDUPACK case prev_cwnd is recorded while snd_cwnd
 * still holds its pre-reduction value, whereas in the CC_ECN case it is
 * recorded after snd_cwnd has been set to ssthresh.  Confirm that this
 * difference is intended.
 */
static void
htcp_cong_signal(struct cc_var *ccv, ccsignal_t type)
{
	struct htcp *htcp_data;
	uint32_t mss, pipe;

	htcp_data = ccv->cc_data;
	mss = tcp_fixed_maxseg(ccv->tp);

	switch (type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
				/*
				 * Apply hysteresis to maxrtt to ensure
				 * reductions in the RTT are reflected in our
				 * measurements.
				 */
				htcp_data->maxrtt = (htcp_data->minrtt +
				    (htcp_data->maxrtt - htcp_data->minrtt) *
				    95) / 100;
				htcp_ssthresh_update(ccv);
				htcp_data->t_last_cong = ticks;
				htcp_data->prev_cwnd = CCV(ccv, snd_cwnd);
			}
			ENTER_RECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_ECN:
		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
			/*
			 * Apply hysteresis to maxrtt to ensure reductions in
			 * the RTT are reflected in our measurements.
			 */
			htcp_data->maxrtt = (htcp_data->minrtt + (htcp_data->maxrtt -
			    htcp_data->minrtt) * 95) / 100;
			htcp_ssthresh_update(ccv);
			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
			htcp_data->t_last_cong = ticks;
			htcp_data->prev_cwnd = CCV(ccv, snd_cwnd);
			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
		}
		break;

	case CC_RTO:
		/*
		 * On the first timeout only, set ssthresh to half of
		 * min(snd_wnd, pipe), at least 2 segments and a multiple
		 * of mss.
		 */
		if (CCV(ccv, t_rxtshift) == 1) {
			pipe = tcp_compute_pipe(ccv->tp);
			CCV(ccv, snd_ssthresh) = max(2,
			    min(CCV(ccv, snd_wnd), pipe) / 2 / mss) * mss;
		}
		CCV(ccv, snd_cwnd) = mss;
		/*
		 * Grab the current time and record it so we know when the
		 * most recent congestion event was. Only record it when the
		 * timeout has fired more than once, as there is a reasonable
		 * chance the first one is a false alarm and may not indicate
		 * congestion.
		 */
		if (CCV(ccv, t_rxtshift) >= 2)
			htcp_data->t_last_cong = ticks;
		break;

	default:
		break;
	}
}

/*
 * Module initialisation hook: precompute htcp_rtt_ref.  Always returns 0.
 */
static int
htcp_mod_init(void)
{
	/*
	 * HTCP_RTT_REF is defined in ms, and t_srtt in the tcpcb is stored in
	 * units of TCP_RTT_SCALE*hz. Scale HTCP_RTT_REF to be in the same units
	 * as t_srtt.
	 */
	htcp_rtt_ref = (HTCP_RTT_REF * TCP_RTT_SCALE * hz) / 1000;
	return (0);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 *
 * NOTE: the lines starting with '-' or '+' inside this function are
 * unified-diff markers carried by this patch text.  The '-' lines (old code)
 * chose between tcp_compute_pipe() and snd_max - curack depending on
 * V_tcp_do_newsack; the '+' line (new code) always uses tcp_compute_pipe().
 */
static void
htcp_post_recovery(struct cc_var *ccv)
{
	int pipe;
	struct htcp *htcp_data;
	uint32_t mss = tcp_fixed_maxseg(ccv->tp);

	pipe = 0;
	htcp_data = ccv->cc_data;

	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
		/*
		 * If inflight data is less than ssthresh, set cwnd
		 * conservatively to avoid a burst of data, as suggested in the
		 * NewReno RFC. Otherwise, use the HTCP method.
-		 *
-		 * XXXLAS: Find a way to do this without needing curack
		 */
-		if (V_tcp_do_newsack)
-			pipe = tcp_compute_pipe(ccv->tp);
-		else
-			pipe = CCV(ccv, snd_max) - ccv->curack;
-
+		pipe = tcp_compute_pipe(ccv->tp);
		if (pipe < CCV(ccv, snd_ssthresh))
			/*
			 * Ensure that cwnd does not collapse to 1 MSS under
			 * adverse conditions. Implements RFC6582
			 */
			CCV(ccv, snd_cwnd) = max(pipe, mss) + mss;
		else
			/* beta * prev_cwnd, in whole segments (at least one). */
			CCV(ccv, snd_cwnd) = max(1, ((htcp_data->beta *
			    htcp_data->prev_cwnd / mss)
			    >> HTCP_SHIFT)) * mss;
	}
}

/*
 * Recompute the additive increase parameter alpha from the number of ticks
 * elapsed since the last congestion event.  alpha is left unchanged when
 * diff has reached htcp_max_diff.
 */
static void
htcp_recalc_alpha(struct cc_var *ccv)
{
	struct htcp *htcp_data;
	int alpha, diff, now;

	htcp_data = ccv->cc_data;
	now = ticks;

	/*
	 * If ticks has wrapped around (will happen approximately once every 49
	 * days on a machine with the default kern.hz=1000) and a flow straddles
	 * the wrap point, our alpha calcs will be completely wrong. We cut our
	 * losses and restart alpha from scratch by setting t_last_cong = now -
	 * HTCP_DELTA_L.
	 *
	 * This does not deflate our cwnd at all. It simply slows the rate cwnd
	 * is growing by until alpha regains the value it held prior to taking
	 * this drastic measure.
	 */
	if (now < htcp_data->t_last_cong)
		htcp_data->t_last_cong = now - HTCP_DELTA_L;

	diff = now - htcp_data->t_last_cong - HTCP_DELTA_L;

	/* Cap alpha if the value of diff would overflow HTCP_CALC_ALPHA(). */
	if (diff < htcp_max_diff) {
		/*
		 * If it has been more than HTCP_DELTA_L ticks since congestion,
		 * increase alpha according to the function defined in the spec.
		 */
		if (diff > 0) {
			alpha = HTCP_CALC_ALPHA(diff);

			/*
			 * Adaptive backoff fairness adjustment:
			 * 2 * (1 - beta) * alpha_raw
			 */
			if (V_htcp_adaptive_backoff)
				alpha = max(1, (2 * ((1 << HTCP_SHIFT) -
				    htcp_data->beta) * alpha) >> HTCP_SHIFT);

			/*
			 * RTT scaling: (RTT / RTT_ref) * alpha
			 * alpha will be the raw value from HTCP_CALC_ALPHA() if
			 * adaptive backoff is off, or the adjusted value if
			 * adaptive backoff is on.
			 */
			if (V_htcp_rtt_scaling)
				alpha = max(1, (min(max(HTCP_MINROWE,
				    (tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS) << HTCP_SHIFT) /
				    htcp_rtt_ref), HTCP_MAXROWE) * alpha)
				    >> HTCP_SHIFT);

		} else
			alpha = 1;

		htcp_data->alpha = alpha;
	}
}

/*
 * Recompute the multiplicative decrease parameter beta: the minrtt/maxrtt
 * ratio clamped to [HTCP_MINBETA, HTCP_MAXBETA] when adaptive backoff is
 * enabled and both RTT bounds have been recorded, HTCP_MINBETA otherwise.
 */
static void
htcp_recalc_beta(struct cc_var *ccv)
{
	struct htcp *htcp_data;

	htcp_data = ccv->cc_data;

	/*
	 * TCPTV_SRTTBASE is the initialised value of each connection's SRTT, so
	 * we only calc beta if the connection's SRTT has been changed from its
	 * initial value. beta is bounded to ensure it is always between
	 * HTCP_MINBETA and HTCP_MAXBETA.
	 */
	if (V_htcp_adaptive_backoff && htcp_data->minrtt != TCPTV_SRTTBASE &&
	    htcp_data->maxrtt != TCPTV_SRTTBASE)
		htcp_data->beta = min(max(HTCP_MINBETA,
		    (htcp_data->minrtt << HTCP_SHIFT) / htcp_data->maxrtt),
		    HTCP_MAXBETA);
	else
		htcp_data->beta = HTCP_MINBETA;
}

/*
 * Record the minimum and maximum RTT seen for the connection. These are used in
 * the calculation of beta if adaptive backoff is enabled.
 */
static void
htcp_record_rtt(struct cc_var *ccv)
{
	struct htcp *htcp_data;

	htcp_data = ccv->cc_data;

	/* XXXLAS: Should there be some hysteresis for minrtt? */

	/*
	 * Record the current SRTT as our minrtt if it's the smallest we've seen
	 * or minrtt is currently equal to its initialised value. Ignore SRTT
	 * until a min number of samples have been taken.
	 */
	if ((tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS) < htcp_data->minrtt ||
	    htcp_data->minrtt == TCPTV_SRTTBASE) &&
	    (CCV(ccv, t_rttupdated) >= HTCP_MIN_RTT_SAMPLES))
		htcp_data->minrtt = tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS);

	/*
	 * Record the current SRTT as our maxrtt if it's the largest we've
	 * seen. Ignore SRTT until a min number of samples have been taken.
	 */
	if (tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS) > htcp_data->maxrtt
	    && CCV(ccv, t_rttupdated) >= HTCP_MIN_RTT_SAMPLES)
		htcp_data->maxrtt = tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS);
}

/*
 * Update the ssthresh in the event of congestion.
*/
static void
htcp_ssthresh_update(struct cc_var *ccv)
{
	struct htcp *htcp_data;

	htcp_data = ccv->cc_data;

	/*
	 * On the first congestion event, set ssthresh to cwnd * 0.5, on
	 * subsequent congestion events, set it to cwnd * beta.
	 *
	 * The first event is recognised by ssthresh still being equal to
	 * TCP_MAXWIN << TCP_MAX_WINSHIFT.
	 */
	if (CCV(ccv, snd_ssthresh) == TCP_MAXWIN << TCP_MAX_WINSHIFT)
		CCV(ccv, snd_ssthresh) = ((u_long)CCV(ccv, snd_cwnd) *
		    HTCP_MINBETA) >> HTCP_SHIFT;
	else {
		/* Refresh beta before applying it. */
		htcp_recalc_beta(ccv);
		CCV(ccv, snd_ssthresh) = ((u_long)CCV(ccv, snd_cwnd) *
		    htcp_data->beta) >> HTCP_SHIFT;
	}
}

/*
 * Sysctl knobs backing V_htcp_adaptive_backoff and V_htcp_rtt_scaling.
 */
SYSCTL_DECL(_net_inet_tcp_cc_htcp);
SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, htcp, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "H-TCP related settings");
SYSCTL_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, adaptive_backoff,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(htcp_adaptive_backoff), 0,
    "enable H-TCP adaptive backoff");
SYSCTL_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, rtt_scaling,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(htcp_rtt_scaling), 0,
    "enable H-TCP RTT scaling");

DECLARE_CC_MODULE(htcp, &htcp_cc_algo);
MODULE_VERSION(htcp, 2);