diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c index 0c2d6cb0e86d..cd65d5a1a992 100644 --- a/sys/netinet/cc/cc.c +++ b/sys/netinet/cc/cc.c @@ -1,712 +1,713 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007-2008 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart and * James Healy, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This software was first released in 2007 by James Healy and Lawrence Stewart * whilst working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Have a sane default if no CC_DEFAULT is specified in the kernel config file. */ #ifndef CC_DEFAULT #define CC_DEFAULT "cubic" #endif uint32_t hystart_minrtt_thresh = 4000; uint32_t hystart_maxrtt_thresh = 16000; uint32_t hystart_n_rttsamples = 8; uint32_t hystart_css_growth_div = 4; uint32_t hystart_css_rounds = 5; uint32_t hystart_bblogs = 0; MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory"); /* * List of available cc algorithms on the current system. First element * is used as the system default CC algorithm. */ struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); /* Protects the cc_list TAILQ. 
*/ struct rwlock cc_list_lock; VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL; VNET_DEFINE(uint32_t, newreno_beta) = 50; #define V_newreno_beta VNET(newreno_beta) +VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80; /* * Take a reference on algo by bumping algo->cc_refcount. * CC_LIST_LOCK_ASSERT() is evaluated first. */ void cc_refer(struct cc_algo *algo) { CC_LIST_LOCK_ASSERT(); refcount_acquire(&algo->cc_refcount); } /* * Drop a reference on algo. The result of refcount_release() is ignored, * so nothing further happens here when the count reaches zero. */ void cc_release(struct cc_algo *algo) { CC_LIST_LOCK_ASSERT(); refcount_release(&algo->cc_refcount); } /* * Make algo the congestion control algorithm of tp and take a reference * on it. */ void cc_attach(struct tcpcb *tp, struct cc_algo *algo) { /* * Attach the tcpcb to the algorithm. */ CC_LIST_RLOCK(); CC_ALGO(tp) = algo; cc_refer(algo); CC_LIST_RUNLOCK(); } /* * Clear tp's algorithm pointer and release the reference on the * algorithm it pointed at. */ void cc_detach(struct tcpcb *tp) { struct cc_algo *algo; CC_LIST_RLOCK(); algo = CC_ALGO(tp); CC_ALGO(tp) = NULL; cc_release(algo); CC_LIST_RUNLOCK(); } /* * Sysctl handler to show and change the default CC algorithm. * * When a new name is written, returns ESRCH unless cc_list holds an entry * with that name that is not flagged CC_MODULE_BEING_REMOVED. */ static int cc_default_algo(SYSCTL_HANDLER_ARGS) { char default_cc[TCP_CA_NAME_MAX]; struct cc_algo *funcs; int error; /* Get the current default: */ CC_LIST_RLOCK(); if (CC_DEFAULT_ALGO() != NULL) strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc)); else memset(default_cc, 0, TCP_CA_NAME_MAX); CC_LIST_RUNLOCK(); error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) goto done; error = ESRCH; /* Find algo with specified name and set it to default. */ CC_LIST_RLOCK(); STAILQ_FOREACH(funcs, &cc_list, entries) { if (strncmp(default_cc, funcs->name, sizeof(default_cc))) continue; if (funcs->flags & CC_MODULE_BEING_REMOVED) { /* It's being removed, so it's not eligible */ continue; } V_default_cc_ptr = funcs; error = 0; break; } CC_LIST_RUNLOCK(); done: return (error); } /* * Sysctl handler to display the list of available CC algorithms. 
*/ static int cc_list_available(SYSCTL_HANDLER_ARGS) { struct cc_algo *algo; int error, nalgos; int linesz; char *buffer, *cp; size_t bufsz, outsz; error = nalgos = 0; /* First pass: count the entries so the output buffer can be sized. */ CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) { nalgos++; } CC_LIST_RUNLOCK(); if (nalgos == 0) { return (ENOENT); } bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1); buffer = malloc(bufsz, M_TEMP, M_WAITOK); cp = buffer; linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D', "PCB count"); cp += linesz; bufsz -= linesz; outsz = linesz; /* * Second pass: format one line per entry. The list lock was dropped * after counting, so a line that no longer fits yields EOVERFLOW. */ CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) { linesz = snprintf(cp, bufsz, "%-16s%c %u\n", algo->name, (algo == CC_DEFAULT_ALGO()) ? '*' : ' ', algo->cc_refcount); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } CC_LIST_RUNLOCK(); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } /* * Return the number of vnets whose default CC algorithm has the same * name as remove_cc. */ static int cc_check_default(struct cc_algo *remove_cc) { int cnt = 0; VNET_ITERATOR_DECL(vnet_iter); CC_LIST_LOCK_ASSERT(); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if ((CC_DEFAULT_ALGO() != NULL) && strncmp(CC_DEFAULT_ALGO()->name, remove_cc->name, TCP_CA_NAME_MAX) == 0) { cnt++; } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); return (cnt); } /* * Initialise CC subsystem on system boot. */ static void cc_init(void) { CC_LIST_LOCK_INIT(); STAILQ_INIT(&cc_list); } /* * Remove remove_cc from cc_list. * * Returns 0 on success, ENOENT if remove_cc is not on cc_list, or EBUSY * if it is still a default algorithm or its reference count is non-zero. */ static int cc_deregister_algo_locked(struct cc_algo *remove_cc) { struct cc_algo *funcs; int found = 0; /* This is unlikely to fail */ STAILQ_FOREACH(funcs, &cc_list, entries) { if (funcs == remove_cc) found = 1; } if (found == 0) { /* Nothing to remove? 
*/ return (ENOENT); } /* We assert it should have been MOD_QUIESCE'd */ KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED), ("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc)); if (cc_check_default(remove_cc)) { return(EBUSY); } if (remove_cc->cc_refcount != 0) { return (EBUSY); } /* Remove algo from cc_list so that new connections can't use it. */ STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries); return (0); } /* * Wrapper that runs cc_deregister_algo_locked() between CC_LIST_WLOCK() * and CC_LIST_WUNLOCK(). Returns 0 on success, non-zero (ENOENT or EBUSY) * on failure. */ int cc_deregister_algo(struct cc_algo *remove_cc) { int ret; CC_LIST_WLOCK(); ret = cc_deregister_algo_locked(remove_cc); CC_LIST_WUNLOCK(); return (ret); } /* * Returns 0 on success, non-zero on failure (EEXIST when add_cc or an * algorithm with the same name is already on cc_list). */ int cc_register_algo(struct cc_algo *add_cc) { struct cc_algo *funcs; int err; err = 0; /* * Iterate over list of registered CC algorithms and make sure * we're not trying to add a duplicate. */ CC_LIST_WLOCK(); STAILQ_FOREACH(funcs, &cc_list, entries) { if (funcs == add_cc || strncmp(funcs->name, add_cc->name, TCP_CA_NAME_MAX) == 0) { err = EEXIST; break; } } /* Init its reference count */ if (err == 0) refcount_init(&add_cc->cc_refcount, 0); /* * The first loaded congestion control module will become * the default until we find the "CC_DEFAULT" defined in * the config (if we do). */ if (!err) { STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); if (strcmp(add_cc->name, CC_DEFAULT) == 0) { V_default_cc_ptr = add_cc; } else if (V_default_cc_ptr == NULL) { V_default_cc_ptr = add_cc; } } CC_LIST_WUNLOCK(); return (err); } /* * For every vnet other than the default one, copy vnet0's default CC * algorithm pointer into the current vnet. */ static void vnet_cc_sysinit(void *arg) { struct cc_algo *cc; if (IS_DEFAULT_VNET(curvnet)) return; CURVNET_SET(vnet0); cc = V_default_cc_ptr; CURVNET_RESTORE(); V_default_cc_ptr = cc; } VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_cc_sysinit, NULL); /* * Perform any necessary tasks before we exit congestion recovery. 
*/ /* Only acts while IN_FASTRECOVERY() is true; otherwise cwnd is left alone. */ void newreno_cc_post_recovery(struct cc_var *ccv) { int pipe; if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { /* * Fast recovery will conclude after returning from this * function. Window inflation should have left us with * approximately snd_ssthresh outstanding data. But in case we * would be inclined to send a burst, better to do it via the * slow start mechanism. * * XXXLAS: Find a way to do this without needing curack */ if (V_tcp_do_newsack) pipe = tcp_compute_pipe(ccv->ccvc.tcp); else pipe = CCV(ccv, snd_max) - ccv->curack; if (pipe < CCV(ccv, snd_ssthresh)) /* * Ensure that cwnd does not collapse to 1 MSS under * adverse conditions. Implements RFC6582 */ CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + CCV(ccv, t_maxseg); else CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); } } /* * Shrink cwnd to the restart window and raise ssthresh after an idle * period, as explained in the comment inside the function. */ void newreno_cc_after_idle(struct cc_var *ccv) { uint32_t rw; /* * If we've been idle for more than one retransmit timeout the old * congestion window is no longer current and we have to reduce it to * the restart window before we can transmit again. * * The restart window is the initial window or the last CWND, whichever * is smaller. * * This is done to prevent us from flooding the path with a full CWND at * wirespeed, overloading router and switch buffers along the way. * * See RFC5681 Section 4.1. "Restarting Idle Connections". * * In addition, per RFC2861 Section 2, the ssthresh is set to the * maximum of the former ssthresh or 3/4 of the old cwnd, to * not exit slow-start prematurely. */ rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp)); /* cwnd - (cwnd >> 2) is the "3/4 of the old cwnd" term. */ CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2)); CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); } /* * Perform any necessary tasks before we enter congestion recovery. 
*/ void newreno_cc_cong_signal(struct cc_var *ccv, uint32_t type) { uint32_t cwin, factor; u_int mss; cwin = CCV(ccv, snd_cwnd); mss = tcp_fixed_maxseg(ccv->ccvc.tcp); /* * Other TCP congestion controls call this helper while keeping * their own private cc_data, so cc_data is not read here; the * back-off factor always comes from the global V_newreno_beta. */ factor = V_newreno_beta; /* Catch algos which mistakenly leak private signal types. */ KASSERT((type & CC_SIGPRIVMASK) == 0, ("%s: congestion signal type 0x%08x is private\n", __func__, type)); /* * New window: factor percent of cwnd, rounded down to whole * segments, but never fewer than two segments. */ cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 2) * mss; switch (type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) CCV(ccv, snd_ssthresh) = cwin; ENTER_RECOVERY(CCV(ccv, t_flags)); } break; case CC_ECN: if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { CCV(ccv, snd_ssthresh) = cwin; CCV(ccv, snd_cwnd) = cwin; ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } break; case CC_RTO: CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd), CCV(ccv, snd_cwnd)) / 2 / mss, 2) * mss; CCV(ccv, snd_cwnd) = mss; break; } } void newreno_cc_ack_received(struct cc_var *ccv, uint16_t type) { if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED)) { u_int cw = CCV(ccv, snd_cwnd); u_int incr = CCV(ccv, t_maxseg); /* * Regular in-order ACK, open the congestion window. * Method depends on which congestion control state we're * in (slow start or cong avoid) and if ABC (RFC 3465) is * enabled. * * slow start: cwnd <= ssthresh * cong avoid: cwnd > ssthresh * * slow start and ABC (RFC 3465): * Grow cwnd exponentially by the amount of data * ACKed capping the max increment per ACK to * (abc_l_var * maxseg) bytes. * * slow start without ABC (RFC 5681): * Grow cwnd exponentially by maxseg per ACK. * * cong avoid and ABC (RFC 3465): * Grow cwnd linearly by maxseg per RTT for each * cwnd worth of ACKed data. 
* * cong avoid without ABC (RFC 5681): * Grow cwnd linearly by approximately maxseg per RTT using * maxseg^2 / cwnd per ACK as the increment. * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to * avoid capping cwnd. */ if (cw > CCV(ccv, snd_ssthresh)) { if (V_tcp_do_rfc3465) { if (ccv->flags & CCF_ABC_SENTAWND) ccv->flags &= ~CCF_ABC_SENTAWND; else incr = 0; } else incr = max((incr * incr / cw), 1); } else if (V_tcp_do_rfc3465) { /* * In slow-start with ABC enabled and no RTO in sight? * (Must not use abc_l_var > 1 if slow starting after * an RTO. On RTO, snd_nxt = snd_una, so the * snd_nxt == snd_max check is sufficient to * handle this). * * XXXLAS: Find a way to signal SS after RTO that * doesn't rely on tcpcb vars. */ uint16_t abc_val; if (ccv->flags & CCF_USE_LOCAL_ABC) abc_val = ccv->labc; else abc_val = V_tcp_abc_l_var; if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) incr = min(ccv->bytes_this_ack, ccv->nsegs * abc_val * CCV(ccv, t_maxseg)); else incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); } /* ABC is on by default, so incr equals 0 frequently. */ if (incr > 0) CCV(ccv, snd_cwnd) = min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); } } /* * Flag algo as CC_MODULE_BEING_REMOVED. Fails with EBUSY, leaving the * flag untouched, while cc_check_default() reports algo as a default. */ static int cc_stop_new_assignments(struct cc_algo *algo) { CC_LIST_WLOCK(); if (cc_check_default(algo)) { /* A default cannot be removed */ CC_LIST_WUNLOCK(); return (EBUSY); } algo->flags |= CC_MODULE_BEING_REMOVED; CC_LIST_WUNLOCK(); return (0); } /* * Handles kld related events. Returns 0 on success, non-zero on failure. */ int cc_modevent(module_t mod, int event_type, void *data) { struct cc_algo *algo; int err; err = 0; algo = (struct cc_algo *)data; switch(event_type) { case MOD_LOAD: if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) { /* * A module must have a cc_data_sz function * even if it has no data it should return 0. 
*/ printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n"); err = EINVAL; break; } if (algo->mod_init != NULL) err = algo->mod_init(); /* * NOTE(review): if cc_register_algo() fails after a successful * mod_init(), mod_destroy() is not called on this path -- confirm * whether module init state needs unwinding. */ if (!err) err = cc_register_algo(algo); break; case MOD_SHUTDOWN: break; case MOD_QUIESCE: /* Stop any new assignments */ err = cc_stop_new_assignments(algo); break; case MOD_UNLOAD: /* * Deregister and remove the module from the list */ CC_LIST_WLOCK(); /* Even with -f we can't unload if it's the default */ if (cc_check_default(algo)) { /* A default cannot be removed */ CC_LIST_WUNLOCK(); return (EBUSY); } /* * If -f was used and users are still attached to * the algorithm things are going to go boom. */ err = cc_deregister_algo_locked(algo); CC_LIST_WUNLOCK(); if ((err == 0) && (algo->mod_destroy != NULL)) { algo->mod_destroy(); } break; default: err = EINVAL; break; } return (err); } SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); /* Declare sysctl tree and populate it. */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Congestion control related settings"); SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, cc_default_algo, "A", "Default congestion control algorithm"); SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, cc_list_available, "A", "List available congestion control algorithms"); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hystartplusplus, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "New Reno related HyStart++ settings"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, minrtt_thresh, CTLFLAG_RW, &hystart_minrtt_thresh, 4000, "HyStarts++ minimum RTT thresh used in clamp (in microseconds)"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, maxrtt_thresh, CTLFLAG_RW, &hystart_maxrtt_thresh, 16000, "HyStarts++ maximum RTT thresh used in clamp (in microseconds)"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, 
n_rttsamples, CTLFLAG_RW, &hystart_n_rttsamples, 8, "The number of RTT samples that must be seen to consider HyStart++"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_growth_div, CTLFLAG_RW, &hystart_css_growth_div, 4, "The divisor to the growth when in Hystart++ CSS"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_rounds, CTLFLAG_RW, &hystart_css_rounds, 5, "The number of rounds HyStart++ lasts in CSS before falling to CA"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, bblogs, CTLFLAG_RW, &hystart_bblogs, 0, "Do we enable HyStart++ Black Box logs to be generated if BB logging is on"); VNET_DEFINE(int, cc_do_abe) = 0; SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cc_do_abe), 0, "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)"); VNET_DEFINE(int, cc_abe_frlossreduce) = 0; SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cc_abe_frlossreduce), 0, "Apply standard beta instead of ABE-beta during ECN-signalled congestion " "recovery episodes if loss also needs to be repaired"); diff --git a/sys/netinet/cc/cc_newreno.c b/sys/netinet/cc/cc_newreno.c index 90895e0f6988..43f3d81389f3 100644 --- a/sys/netinet/cc/cc_newreno.c +++ b/sys/netinet/cc/cc_newreno.c @@ -1,604 +1,604 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. * Copyright (c) 2007-2008,2010,2014 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. 
* * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, James * Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ /* * This software was first released in 2007 by James Healy and Lawrence Stewart * whilst working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * More details are available at: * http://caia.swin.edu.au/urp/newtcp/ * * Dec 2014 garmitage@swin.edu.au * Borrowed code fragments from cc_cdg.c to add modifiable beta * via sysctls. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void newreno_cb_destroy(struct cc_var *ccv); static void newreno_ack_received(struct cc_var *ccv, uint16_t type); static void newreno_after_idle(struct cc_var *ccv); static void newreno_cong_signal(struct cc_var *ccv, uint32_t type); static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf); static void newreno_newround(struct cc_var *ccv, uint32_t round_cnt); static void newreno_rttsample(struct cc_var *ccv, uint32_t usec_rtt, uint32_t rxtcnt, uint32_t fas); static int newreno_cb_init(struct cc_var *ccv, void *); static size_t newreno_data_sz(void); VNET_DECLARE(uint32_t, newreno_beta); #define V_newreno_beta VNET(newreno_beta) -VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80; +VNET_DECLARE(uint32_t, newreno_beta_ecn); #define V_newreno_beta_ecn VNET(newreno_beta_ecn) struct cc_algo newreno_cc_algo = { .name = "newreno", .cb_destroy = newreno_cb_destroy, .ack_received = newreno_ack_received, .after_idle = newreno_after_idle, .cong_signal = newreno_cong_signal, .post_recovery = newreno_cc_post_recovery, .ctl_output = newreno_ctl_output, .newround = newreno_newround, .rttsample = newreno_rttsample, .cb_init = newreno_cb_init, .cc_data_sz = 
newreno_data_sz, }; /* * Emit a TCP_HYSTART black box log record for tp when hystart_bblogs is * non-zero and tp->t_logstate is not TCP_LOG_STATE_OFF. mod selects the * event type listed below; flex1 carries the event-specific value. */ static void newreno_log_hystart_event(struct cc_var *ccv, struct newreno *nreno, uint8_t mod, uint32_t flex1) { /* * Types of logs (mod value) * 1 - rtt_thresh in flex1, checking to see if RTT is too great. * 2 - rtt is too great, rtt_thresh in flex1. * 3 - CSS is active incr in flex1 * 4 - A new round is beginning flex1 is round count * 5 - A new RTT measurement flex1 is the new measurement. * 6 - We enter CA ssthresh is also in flex1. * 7 - Socket option to change hystart executed opt.val in flex1. * 8 - Back out of CSS into SS, flex1 is the css_baseline_minrtt * 9 - We enter CA, via an ECN mark. * 10 - We enter CA, via a loss. * 11 - We have slipped out of SS into CA via cwnd growth. * 12 - After idle has re-enabled hystart++ */ struct tcpcb *tp; if (hystart_bblogs == 0) return; tp = ccv->ccvc.tcp; if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = flex1; log.u_bbr.flex2 = nreno->css_current_round_minrtt; log.u_bbr.flex3 = nreno->css_lastround_minrtt; log.u_bbr.flex4 = nreno->css_rttsample_count; log.u_bbr.flex5 = nreno->css_entered_at_round; log.u_bbr.flex6 = nreno->css_baseline_minrtt; /* We only need bottom 16 bits of flags */ log.u_bbr.flex7 = nreno->newreno_flags & 0x0000ffff; log.u_bbr.flex8 = mod; log.u_bbr.epoch = nreno->css_current_round; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.lt_epoch = nreno->css_fas_at_css_entry; log.u_bbr.pkts_out = nreno->css_last_fas; log.u_bbr.delivered = nreno->css_lowrtt_fas; log.u_bbr.pkt_epoch = ccv->flags; TCP_LOG_EVENTP(tp, NULL, &tp->t_inpcb->inp_socket->so_rcv, &tp->t_inpcb->inp_socket->so_snd, TCP_HYSTART, 0, 0, &log, false, &tv); } } /* Size of the per-connection private state used by this algorithm. */ static size_t newreno_data_sz(void) { return (sizeof(struct newreno)); } /* * Set up the per-connection state. When ptr is NULL the state is * allocated here with M_NOWAIT (ENOMEM on failure); otherwise ptr is * adopted as ccv->cc_data. Returns 0 on success. */ static int newreno_cb_init(struct cc_var *ccv, void *ptr) { struct newreno *nreno; INP_WLOCK_ASSERT(ccv->ccvc.tcp->t_inpcb); if (ptr == NULL) { ccv->cc_data = malloc(sizeof(struct newreno), 
M_CC_MEM, M_NOWAIT); if (ccv->cc_data == NULL) return (ENOMEM); } else ccv->cc_data = ptr; nreno = (struct newreno *)ccv->cc_data; /* NB: nreno is not zeroed, so initialise all fields. */ nreno->beta = V_newreno_beta; nreno->beta_ecn = V_newreno_beta_ecn; /* * Start with the HyStart++ enabled flag set. * NOTE(review): the original rationale here was an unfinished * sentence about the socket option being strobed before any * loss has been hit -- confirm the intent. */ nreno->newreno_flags = CC_NEWRENO_HYSTART_ENABLED; /* At init set both to infinity */ nreno->css_lastround_minrtt = 0xffffffff; nreno->css_current_round_minrtt = 0xffffffff; nreno->css_current_round = 0; nreno->css_baseline_minrtt = 0xffffffff; nreno->css_rttsample_count = 0; nreno->css_entered_at_round = 0; nreno->css_fas_at_css_entry = 0; nreno->css_lowrtt_fas = 0; nreno->css_last_fas = 0; return (0); } /* Free the per-connection state held in ccv->cc_data. */ static void newreno_cb_destroy(struct cc_var *ccv) { free(ccv->cc_data, M_CC_MEM); } /* * ACK handler: grows cwnd as described in the comment below and, in slow * start, decides whether to enter the HyStart++ CSS phase. */ static void newreno_ack_received(struct cc_var *ccv, uint16_t type) { struct newreno *nreno; nreno = ccv->cc_data; if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED)) { u_int cw = CCV(ccv, snd_cwnd); u_int incr = CCV(ccv, t_maxseg); /* * Regular in-order ACK, open the congestion window. * Method depends on which congestion control state we're * in (slow start or cong avoid) and if ABC (RFC 3465) is * enabled. * * slow start: cwnd <= ssthresh * cong avoid: cwnd > ssthresh * * slow start and ABC (RFC 3465): * Grow cwnd exponentially by the amount of data * ACKed capping the max increment per ACK to * (abc_l_var * maxseg) bytes. * * slow start without ABC (RFC 5681): * Grow cwnd exponentially by maxseg per ACK. * * cong avoid and ABC (RFC 3465): * Grow cwnd linearly by maxseg per RTT for each * cwnd worth of ACKed data. * * cong avoid without ABC (RFC 5681): * Grow cwnd linearly by approximately maxseg per RTT using * maxseg^2 / cwnd per ACK as the increment. * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to * avoid capping cwnd. 
*/ if (cw > CCV(ccv, snd_ssthresh)) { if (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) { /* * We have slipped into CA with * CSS active. Deactivate all. */ /* Turn off the CSS flag */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; /* Disable use of CSS in the future except long idle */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_ENABLED; newreno_log_hystart_event(ccv, nreno, 11, CCV(ccv, snd_ssthresh)); } if (V_tcp_do_rfc3465) { if (ccv->flags & CCF_ABC_SENTAWND) ccv->flags &= ~CCF_ABC_SENTAWND; else incr = 0; } else incr = max((incr * incr / cw), 1); } else if (V_tcp_do_rfc3465) { /* * In slow-start with ABC enabled and no RTO in sight? * (Must not use abc_l_var > 1 if slow starting after * an RTO. On RTO, snd_nxt = snd_una, so the * snd_nxt == snd_max check is sufficient to * handle this). * * XXXLAS: Find a way to signal SS after RTO that * doesn't rely on tcpcb vars. */ uint16_t abc_val; if (ccv->flags & CCF_USE_LOCAL_ABC) abc_val = ccv->labc; else abc_val = V_tcp_abc_l_var; if ((ccv->flags & CCF_HYSTART_ALLOWED) && (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) && ((nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) == 0)) { /* * Hystart is allowed and still enabled and we are not yet * in CSS. Let's check to see if we can make a decision on * if we need to go into CSS. 
*/ if ((nreno->css_rttsample_count >= hystart_n_rttsamples) && (nreno->css_current_round_minrtt != 0xffffffff) && (nreno->css_lastround_minrtt != 0xffffffff)) { uint32_t rtt_thresh; /* Clamp (minrtt_thresh, lastround/8, maxrtt_thresh) */ rtt_thresh = (nreno->css_lastround_minrtt >> 3); if (rtt_thresh < hystart_minrtt_thresh) rtt_thresh = hystart_minrtt_thresh; if (rtt_thresh > hystart_maxrtt_thresh) rtt_thresh = hystart_maxrtt_thresh; newreno_log_hystart_event(ccv, nreno, 1, rtt_thresh); if (nreno->css_current_round_minrtt >= (nreno->css_lastround_minrtt + rtt_thresh)) { /* Enter CSS */ nreno->newreno_flags |= CC_NEWRENO_HYSTART_IN_CSS; nreno->css_fas_at_css_entry = nreno->css_lowrtt_fas; /* * The draft (v4) calls for us to set baseline to css_current_round_min * but that can cause an oscillation. We probably should be using * css_lastround_minrtt, but the authors insist that will cause * issues on exiting early. We will leave the draft version for now * but I suspect this is incorrect. */ nreno->css_baseline_minrtt = nreno->css_current_round_minrtt; nreno->css_entered_at_round = nreno->css_current_round; newreno_log_hystart_event(ccv, nreno, 2, rtt_thresh); } } } if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) incr = min(ccv->bytes_this_ack, ccv->nsegs * abc_val * CCV(ccv, t_maxseg)); else incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); /* Only if Hystart is enabled will the flag get set */ if (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) { /* While in CSS the increment is divided by hystart_css_growth_div. */ incr /= hystart_css_growth_div; newreno_log_hystart_event(ccv, nreno, 3, incr); } } /* ABC is on by default, so incr equals 0 frequently. */ if (incr > 0) CCV(ccv, snd_cwnd) = min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); } } /* * Apply the shared idle handling, then re-arm HyStart++ if it had been * disabled. */ static void newreno_after_idle(struct cc_var *ccv) { struct newreno *nreno; nreno = ccv->cc_data; newreno_cc_after_idle(ccv); if ((nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) == 0) { /* * Re-enable hystart if we have been idle. 
*/ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; nreno->newreno_flags |= CC_NEWRENO_HYSTART_ENABLED; newreno_log_hystart_event(ccv, nreno, 12, CCV(ccv, snd_ssthresh)); } } /* * Perform any necessary tasks before we enter congestion recovery. */ static void newreno_cong_signal(struct cc_var *ccv, uint32_t type) { struct newreno *nreno; uint32_t beta, beta_ecn, cwin, factor; u_int mss; cwin = CCV(ccv, snd_cwnd); mss = tcp_fixed_maxseg(ccv->ccvc.tcp); nreno = ccv->cc_data; beta = (nreno == NULL) ? V_newreno_beta : nreno->beta;; beta_ecn = (nreno == NULL) ? V_newreno_beta_ecn : nreno->beta_ecn; /* * Note that we only change the backoff for ECN if the * global sysctl V_cc_do_abe is set the stack itself * has set a flag in our newreno_flags (due to pacing) telling * us to use the lower valued back-off. */ if ((type == CC_ECN) && (V_cc_do_abe || ((nreno != NULL) && (nreno->newreno_flags & CC_NEWRENO_BETA_ECN_ENABLED)))) factor = beta_ecn; else factor = beta; /* Catch algos which mistakenly leak private signal types. 
*/ KASSERT((type & CC_SIGPRIVMASK) == 0, ("%s: congestion signal type 0x%08x is private\n", __func__, type)); cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 2) * mss; switch (type) { case CC_NDUPACK: if (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) { /* Make sure the flags are all off we had a loss */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_ENABLED; nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; newreno_log_hystart_event(ccv, nreno, 10, CCV(ccv, snd_ssthresh)); } if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (IN_CONGRECOVERY(CCV(ccv, t_flags) && V_cc_do_abe && V_cc_abe_frlossreduce)) { CCV(ccv, snd_ssthresh) = ((uint64_t)CCV(ccv, snd_ssthresh) * (uint64_t)beta) / (uint64_t)beta_ecn; } if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) CCV(ccv, snd_ssthresh) = cwin; ENTER_RECOVERY(CCV(ccv, t_flags)); } break; case CC_ECN: if (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) { /* Make sure the flags are all off we had a loss */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_ENABLED; nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; newreno_log_hystart_event(ccv, nreno, 9, CCV(ccv, snd_ssthresh)); } if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { CCV(ccv, snd_ssthresh) = cwin; CCV(ccv, snd_cwnd) = cwin; ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } break; case CC_RTO: CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd), CCV(ccv, snd_cwnd)) / 2 / mss, 2) * mss; CCV(ccv, snd_cwnd) = mss; break; } } static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf) { struct newreno *nreno; struct cc_newreno_opts *opt; if (sopt->sopt_valsize != sizeof(struct cc_newreno_opts)) return (EMSGSIZE); if (CC_ALGO(ccv->ccvc.tcp) != &newreno_cc_algo) return (ENOPROTOOPT); nreno = (struct newreno *)ccv->cc_data; opt = buf; switch (sopt->sopt_dir) { case SOPT_SET: switch (opt->name) { case CC_NEWRENO_BETA: nreno->beta = opt->val; break; case CC_NEWRENO_BETA_ECN: if ((!V_cc_do_abe) && ((nreno->newreno_flags & CC_NEWRENO_BETA_ECN) == 0)) return 
(EACCES); nreno->beta_ecn = opt->val; nreno->newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED; break; default: return (ENOPROTOOPT); } break; case SOPT_GET: switch (opt->name) { case CC_NEWRENO_BETA: opt->val = (nreno == NULL) ? V_newreno_beta : nreno->beta; break; case CC_NEWRENO_BETA_ECN: opt->val = (nreno == NULL) ? V_newreno_beta_ecn : nreno->beta_ecn; break; default: return (ENOPROTOOPT); } break; default: return (EINVAL); } return (0); } static int newreno_beta_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = *(uint32_t *)arg1; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL ) { if (arg1 == &VNET_NAME(newreno_beta_ecn) && !V_cc_do_abe) error = EACCES; else if (new == 0 || new > 100) error = EINVAL; else *(uint32_t *)arg1 = new; } return (error); } static void newreno_newround(struct cc_var *ccv, uint32_t round_cnt) { struct newreno *nreno; nreno = (struct newreno *)ccv->cc_data; /* We have entered a new round */ nreno->css_lastround_minrtt = nreno->css_current_round_minrtt; nreno->css_current_round_minrtt = 0xffffffff; nreno->css_rttsample_count = 0; nreno->css_current_round = round_cnt; if ((nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) && ((round_cnt - nreno->css_entered_at_round) >= hystart_css_rounds)) { /* Enter CA */ if (ccv->flags & CCF_HYSTART_CAN_SH_CWND) { /* * We engage more than snd_ssthresh, engage * the brakes!! Though we will stay in SS to * creep back up again, so lets leave CSS active * and give us hystart_css_rounds more rounds. 
*/ if (ccv->flags & CCF_HYSTART_CONS_SSTH) { CCV(ccv, snd_ssthresh) = ((nreno->css_lowrtt_fas + nreno->css_fas_at_css_entry) / 2); } else { CCV(ccv, snd_ssthresh) = nreno->css_lowrtt_fas; } CCV(ccv, snd_cwnd) = nreno->css_fas_at_css_entry; nreno->css_entered_at_round = round_cnt; } else { CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd); /* Turn off the CSS flag */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; /* Disable use of CSS in the future except long idle */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_ENABLED; } newreno_log_hystart_event(ccv, nreno, 6, CCV(ccv, snd_ssthresh)); } if (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) newreno_log_hystart_event(ccv, nreno, 4, round_cnt); } static void newreno_rttsample(struct cc_var *ccv, uint32_t usec_rtt, uint32_t rxtcnt, uint32_t fas) { struct newreno *nreno; nreno = (struct newreno *)ccv->cc_data; if (rxtcnt > 1) { /* * Only look at RTT's that are non-ambiguous. */ return; } nreno->css_rttsample_count++; nreno->css_last_fas = fas; if (nreno->css_current_round_minrtt > usec_rtt) { nreno->css_current_round_minrtt = usec_rtt; nreno->css_lowrtt_fas = nreno->css_last_fas; } if ((nreno->css_rttsample_count >= hystart_n_rttsamples) && (nreno->css_current_round_minrtt != 0xffffffff) && (nreno->css_current_round_minrtt < nreno->css_baseline_minrtt) && (nreno->css_lastround_minrtt != 0xffffffff)) { /* * We were in CSS and the RTT is now less, we * entered CSS erroneously. 
*/ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; newreno_log_hystart_event(ccv, nreno, 8, nreno->css_baseline_minrtt); nreno->css_baseline_minrtt = 0xffffffff; } if (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) newreno_log_hystart_event(ccv, nreno, 5, usec_rtt); } SYSCTL_DECL(_net_inet_tcp_cc_newreno); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, newreno, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "New Reno related settings"); SYSCTL_PROC(_net_inet_tcp_cc_newreno, OID_AUTO, beta, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(newreno_beta), 3, &newreno_beta_handler, "IU", "New Reno beta, specified as number between 1 and 100"); SYSCTL_PROC(_net_inet_tcp_cc_newreno, OID_AUTO, beta_ecn, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(newreno_beta_ecn), 3, &newreno_beta_handler, "IU", "New Reno beta ecn, specified as number between 1 and 100"); DECLARE_CC_MODULE(newreno, &newreno_cc_algo); MODULE_VERSION(newreno, 2);