diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c index 16d27a868bd3..c628be250abf 100644 --- a/sys/netinet/cc/cc.c +++ b/sys/netinet/cc/cc.c @@ -1,721 +1,763 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2007-2008 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart and * James Healy, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This software was first released in 2007 by James Healy and Lawrence Stewart * whilst working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Have a sane default if no CC_DEFAULT is specified in the kernel config file. */ #ifndef CC_DEFAULT #define CC_DEFAULT "cubic" #endif uint32_t hystart_minrtt_thresh = 4000; uint32_t hystart_maxrtt_thresh = 16000; uint32_t hystart_n_rttsamples = 8; uint32_t hystart_css_growth_div = 4; uint32_t hystart_css_rounds = 5; uint32_t hystart_bblogs = 0; MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory"); /* * List of available cc algorithms on the current system. First element * is used as the system default CC algorithm. */ struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); /* Protects the cc_list TAILQ. */ struct rwlock cc_list_lock; VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL; VNET_DEFINE(uint32_t, newreno_beta) = 50; #define V_newreno_beta VNET(newreno_beta) VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80; void cc_refer(struct cc_algo *algo) { CC_LIST_LOCK_ASSERT(); refcount_acquire(&algo->cc_refcount); } void cc_release(struct cc_algo *algo) { CC_LIST_LOCK_ASSERT(); refcount_release(&algo->cc_refcount); } void cc_attach(struct tcpcb *tp, struct cc_algo *algo) { /* * Attach the tcpcb to the algorithm. */ CC_LIST_RLOCK(); CC_ALGO(tp) = algo; cc_refer(algo); CC_LIST_RUNLOCK(); } void cc_detach(struct tcpcb *tp) { struct cc_algo *algo; CC_LIST_RLOCK(); algo = CC_ALGO(tp); CC_ALGO(tp) = NULL; cc_release(algo); CC_LIST_RUNLOCK(); } /* * Sysctl handler to show and change the default CC algorithm. */ static int cc_default_algo(SYSCTL_HANDLER_ARGS) { char default_cc[TCP_CA_NAME_MAX]; struct cc_algo *funcs; int error; /* Get the current default: */ CC_LIST_RLOCK(); if (CC_DEFAULT_ALGO() != NULL) strlcpy(default_cc, CC_DEFAULT_ALGO()->name, sizeof(default_cc)); else memset(default_cc, 0, TCP_CA_NAME_MAX); CC_LIST_RUNLOCK(); error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) goto done; error = ESRCH; /* Find algo with specified name and set it to default. */ CC_LIST_RLOCK(); STAILQ_FOREACH(funcs, &cc_list, entries) { if (strncmp(default_cc, funcs->name, sizeof(default_cc))) continue; if (funcs->flags & CC_MODULE_BEING_REMOVED) { /* Its being removed, its not eligible */ continue; } V_default_cc_ptr = funcs; error = 0; break; } CC_LIST_RUNLOCK(); done: return (error); } /* * Sysctl handler to display the list of available CC algorithms. */ static int cc_list_available(SYSCTL_HANDLER_ARGS) { struct cc_algo *algo; int error, nalgos; int linesz; char *buffer, *cp; size_t bufsz, outsz; error = nalgos = 0; CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) { nalgos++; } CC_LIST_RUNLOCK(); if (nalgos == 0) { return (ENOENT); } bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1); buffer = malloc(bufsz, M_TEMP, M_WAITOK); cp = buffer; linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D', "PCB count"); cp += linesz; bufsz -= linesz; outsz = linesz; CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) { linesz = snprintf(cp, bufsz, "%-16s%c %u\n", algo->name, (algo == CC_DEFAULT_ALGO()) ? '*' : ' ', algo->cc_refcount); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } CC_LIST_RUNLOCK(); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } /* * Return the number of times a proposed removal_cc is * being used as the default. */ static int cc_check_default(struct cc_algo *remove_cc) { int cnt = 0; VNET_ITERATOR_DECL(vnet_iter); CC_LIST_LOCK_ASSERT(); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if ((CC_DEFAULT_ALGO() != NULL) && strncmp(CC_DEFAULT_ALGO()->name, remove_cc->name, TCP_CA_NAME_MAX) == 0) { cnt++; } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); return (cnt); } /* * Initialise CC subsystem on system boot. */ static void cc_init(void) { CC_LIST_LOCK_INIT(); STAILQ_INIT(&cc_list); } /* * Returns non-zero on success, 0 on failure. */ static int cc_deregister_algo_locked(struct cc_algo *remove_cc) { struct cc_algo *funcs; int found = 0; /* This is unlikely to fail */ STAILQ_FOREACH(funcs, &cc_list, entries) { if (funcs == remove_cc) found = 1; } if (found == 0) { /* Nothing to remove? */ return (ENOENT); } /* We assert it should have been MOD_QUIESCE'd */ KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED), ("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc)); if (cc_check_default(remove_cc)) { return(EBUSY); } if (remove_cc->cc_refcount != 0) { return (EBUSY); } /* Remove algo from cc_list so that new connections can't use it. */ STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries); return (0); } /* * Returns non-zero on success, 0 on failure. */ int cc_deregister_algo(struct cc_algo *remove_cc) { int ret; CC_LIST_WLOCK(); ret = cc_deregister_algo_locked(remove_cc); CC_LIST_WUNLOCK(); return (ret); } /* * Returns 0 on success, non-zero on failure. */ int cc_register_algo(struct cc_algo *add_cc) { struct cc_algo *funcs; int err; err = 0; /* * Iterate over list of registered CC algorithms and make sure * we're not trying to add a duplicate. */ CC_LIST_WLOCK(); STAILQ_FOREACH(funcs, &cc_list, entries) { if (funcs == add_cc || strncmp(funcs->name, add_cc->name, TCP_CA_NAME_MAX) == 0) { err = EEXIST; break; } } /* Init its reference count */ if (err == 0) refcount_init(&add_cc->cc_refcount, 0); /* * The first loaded congestion control module will become * the default until we find the "CC_DEFAULT" defined in * the config (if we do). */ if (!err) { STAILQ_INSERT_TAIL(&cc_list, add_cc, entries); if (strcmp(add_cc->name, CC_DEFAULT) == 0) { V_default_cc_ptr = add_cc; } else if (V_default_cc_ptr == NULL) { V_default_cc_ptr = add_cc; } } CC_LIST_WUNLOCK(); return (err); } static void vnet_cc_sysinit(void *arg) { struct cc_algo *cc; if (IS_DEFAULT_VNET(curvnet)) return; CURVNET_SET(vnet0); cc = V_default_cc_ptr; CURVNET_RESTORE(); V_default_cc_ptr = cc; } VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_cc_sysinit, NULL); /* * Perform any necessary tasks before we exit congestion recovery. */ void newreno_cc_post_recovery(struct cc_var *ccv) { int pipe; if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { /* * Fast recovery will conclude after returning from this * function. Window inflation should have left us with * approximately snd_ssthresh outstanding data. But in case we * would be inclined to send a burst, better to do it via the * slow start mechanism. * * XXXLAS: Find a way to do this without needing curack */ if (V_tcp_do_newsack) pipe = tcp_compute_pipe(ccv->tp); else pipe = CCV(ccv, snd_max) - ccv->curack; if (pipe < CCV(ccv, snd_ssthresh)) /* * Ensure that cwnd does not collapse to 1 MSS under * adverse conditions. Implements RFC6582 */ CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + CCV(ccv, t_maxseg); else CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); } } void newreno_cc_after_idle(struct cc_var *ccv) { uint32_t rw; /* * If we've been idle for more than one retransmit timeout the old * congestion window is no longer current and we have to reduce it to * the restart window before we can transmit again. * * The restart window is the initial window or the last CWND, whichever * is smaller. * * This is done to prevent us from flooding the path with a full CWND at * wirespeed, overloading router and switch buffers along the way. * * See RFC5681 Section 4.1. "Restarting Idle Connections". * * In addition, per RFC2861 Section 2, the ssthresh is set to the * maximum of the former ssthresh or 3/4 of the old cwnd, to * not exit slow-start prematurely. */ rw = tcp_compute_initwnd(tcp_fixed_maxseg(ccv->tp)); CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2)); CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); } /* - * Perform any necessary tasks before we enter congestion recovery. - */ -void -newreno_cc_cong_signal(struct cc_var *ccv, ccsignal_t type) + * Get a new congestion window size on a multiplicative decrease event. + * */ +u_int +newreno_cc_cwnd_on_multiplicative_decrease(struct cc_var *ccv, uint32_t mss) { - uint32_t cwin, factor, mss, pipe; + uint32_t cwin, factor; cwin = CCV(ccv, snd_cwnd); - mss = tcp_fixed_maxseg(ccv->tp); /* * Other TCP congestion controls use newreno_cong_signal(), but * with their own private cc_data. Make sure the cc_data is used * correctly. */ factor = V_newreno_beta; + return max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 2) * mss; +} + +/* + * Perform any necessary tasks before we enter congestion recovery. + */ +void +newreno_cc_cong_signal(struct cc_var *ccv, ccsignal_t type) +{ + uint32_t cwin, mss, pipe; + + mss = tcp_fixed_maxseg(ccv->tp); + /* Catch algos which mistakenly leak private signal types. */ KASSERT((type & CC_SIGPRIVMASK) == 0, ("%s: congestion signal type 0x%08x is private\n", __func__, type)); - cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), - 2) * mss; + cwin = newreno_cc_cwnd_on_multiplicative_decrease(ccv, mss); switch (type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) CCV(ccv, snd_ssthresh) = cwin; ENTER_RECOVERY(CCV(ccv, t_flags)); } break; case CC_ECN: if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { CCV(ccv, snd_ssthresh) = cwin; CCV(ccv, snd_cwnd) = cwin; ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } break; case CC_RTO: if (CCV(ccv, t_rxtshift) == 1) { if (V_tcp_do_newsack) { pipe = tcp_compute_pipe(ccv->tp); } else { pipe = CCV(ccv, snd_max) - CCV(ccv, snd_fack) + CCV(ccv, sackhint.sack_bytes_rexmit); } CCV(ccv, snd_ssthresh) = max(2, min(CCV(ccv, snd_wnd), pipe) / 2 / mss) * mss; } CCV(ccv, snd_cwnd) = mss; break; default: break; } } -void -newreno_cc_ack_received(struct cc_var *ccv, ccsignal_t type) +u_int +newreno_cc_cwnd_in_cong_avoid(struct cc_var *ccv) { - if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && - (ccv->flags & CCF_CWND_LIMITED)) { - u_int cw = CCV(ccv, snd_cwnd); - u_int incr = CCV(ccv, t_maxseg); + u_int cw = CCV(ccv, snd_cwnd); + u_int incr = CCV(ccv, t_maxseg); + + KASSERT(cw > CCV(ccv, snd_ssthresh), + ("congestion control state not in congestion avoidance\n")); + /* + * Regular in-order ACK, open the congestion window. + * The congestion control state we're in is congestion avoidance. + * + * Check if ABC (RFC 3465) is enabled. + * cong avoid: cwnd > ssthresh + * + * cong avoid and ABC (RFC 3465): + * Grow cwnd linearly by maxseg per RTT for each + * cwnd worth of ACKed data. + * + * cong avoid without ABC (RFC 5681): + * Grow cwnd linearly by approximately maxseg per RTT using + * maxseg^2 / cwnd per ACK as the increment. + * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to + * avoid capping cwnd. + */ + if (V_tcp_do_rfc3465) { + if (ccv->flags & CCF_ABC_SENTAWND) + ccv->flags &= ~CCF_ABC_SENTAWND; + else + incr = 0; + } else + incr = max((incr * incr / cw), 1); + /* ABC is on by default, so incr equals 0 frequently. */ + if (incr > 0) + return min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); + else + return cw; +} + +u_int +newreno_cc_cwnd_in_slow_start(struct cc_var *ccv) +{ + u_int cw = CCV(ccv, snd_cwnd); + u_int incr = CCV(ccv, t_maxseg); + + KASSERT(cw <= CCV(ccv, snd_ssthresh), + ("congestion control state not in slow start\n")); + + /* + * Regular in-order ACK, open the congestion window. + * The congestion control state we're in is slow start. + * + * slow start: cwnd <= ssthresh + * + * slow start and ABC (RFC 3465): + * Grow cwnd exponentially by the amount of data + * ACKed capping the max increment per ACK to + * (abc_l_var * maxseg) bytes. + * + * slow start without ABC (RFC 5681): + * Grow cwnd exponentially by maxseg per ACK. + */ + if (V_tcp_do_rfc3465) { /* - * Regular in-order ACK, open the congestion window. - * Method depends on which congestion control state we're - * in (slow start or cong avoid) and if ABC (RFC 3465) is - * enabled. - * - * slow start: cwnd <= ssthresh - * cong avoid: cwnd > ssthresh - * - * slow start and ABC (RFC 3465): - * Grow cwnd exponentially by the amount of data - * ACKed capping the max increment per ACK to - * (abc_l_var * maxseg) bytes. - * - * slow start without ABC (RFC 5681): - * Grow cwnd exponentially by maxseg per ACK. - * - * cong avoid and ABC (RFC 3465): - * Grow cwnd linearly by maxseg per RTT for each - * cwnd worth of ACKed data. + * In slow-start with ABC enabled and no RTO in sight? + * (Must not use abc_l_var > 1 if slow starting after + * an RTO. On RTO, snd_nxt = snd_una, so the + * snd_nxt == snd_max check is sufficient to + * handle this). * - * cong avoid without ABC (RFC 5681): - * Grow cwnd linearly by approximately maxseg per RTT using - * maxseg^2 / cwnd per ACK as the increment. - * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to - * avoid capping cwnd. + * XXXLAS: Find a way to signal SS after RTO that + * doesn't rely on tcpcb vars. */ - if (cw > CCV(ccv, snd_ssthresh)) { - if (V_tcp_do_rfc3465) { - if (ccv->flags & CCF_ABC_SENTAWND) - ccv->flags &= ~CCF_ABC_SENTAWND; - else - incr = 0; - } else - incr = max((incr * incr / cw), 1); - } else if (V_tcp_do_rfc3465) { - /* - * In slow-start with ABC enabled and no RTO in sight? - * (Must not use abc_l_var > 1 if slow starting after - * an RTO. On RTO, snd_nxt = snd_una, so the - * snd_nxt == snd_max check is sufficient to - * handle this). - * - * XXXLAS: Find a way to signal SS after RTO that - * doesn't rely on tcpcb vars. - */ - uint16_t abc_val; - - if (ccv->flags & CCF_USE_LOCAL_ABC) - abc_val = ccv->labc; - else - abc_val = V_tcp_abc_l_var; - if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) - incr = min(ccv->bytes_this_ack, - ccv->nsegs * abc_val * - CCV(ccv, t_maxseg)); - else - incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + uint16_t abc_val; + if (ccv->flags & CCF_USE_LOCAL_ABC) + abc_val = ccv->labc; + else + abc_val = V_tcp_abc_l_var; + if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) + incr = min(ccv->bytes_this_ack, + ccv->nsegs * abc_val * CCV(ccv, t_maxseg)); + else + incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + } + /* ABC is on by default, so incr equals 0 frequently. */ + if (incr > 0) + return min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); + else + return cw; +} + +void +newreno_cc_ack_received(struct cc_var *ccv, ccsignal_t type) +{ + if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && + (ccv->flags & CCF_CWND_LIMITED)) { + if (CCV(ccv, snd_cwnd) > CCV(ccv, snd_ssthresh)) { + CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_cong_avoid(ccv); + } else { + CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_slow_start(ccv); } - /* ABC is on by default, so incr equals 0 frequently. */ - if (incr > 0) - CCV(ccv, snd_cwnd) = min(cw + incr, - TCP_MAXWIN << CCV(ccv, snd_scale)); } } static int cc_stop_new_assignments(struct cc_algo *algo) { CC_LIST_WLOCK(); if (cc_check_default(algo)) { /* A default cannot be removed */ CC_LIST_WUNLOCK(); return (EBUSY); } algo->flags |= CC_MODULE_BEING_REMOVED; CC_LIST_WUNLOCK(); return (0); } /* * Handles kld related events. Returns 0 on success, non-zero on failure. */ int cc_modevent(module_t mod, int event_type, void *data) { struct cc_algo *algo; int err; err = 0; algo = (struct cc_algo *)data; switch(event_type) { case MOD_LOAD: if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) { /* * A module must have a cc_data_sz function * even if it has no data it should return 0. */ printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n"); err = EINVAL; break; } if (algo->mod_init != NULL) err = algo->mod_init(); if (!err) err = cc_register_algo(algo); break; case MOD_SHUTDOWN: break; case MOD_QUIESCE: /* Stop any new assigments */ err = cc_stop_new_assignments(algo); break; case MOD_UNLOAD: /* * Deregister and remove the module from the list */ CC_LIST_WLOCK(); /* Even with -f we can't unload if its the default */ if (cc_check_default(algo)) { /* A default cannot be removed */ CC_LIST_WUNLOCK(); return (EBUSY); } /* * If -f was used and users are still attached to * the algorithm things are going to go boom. */ err = cc_deregister_algo_locked(algo); CC_LIST_WUNLOCK(); if ((err == 0) && (algo->mod_destroy != NULL)) { algo->mod_destroy(); } break; default: err = EINVAL; break; } return (err); } SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); /* Declare sysctl tree and populate it. */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Congestion control related settings"); SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, cc_default_algo, "A", "Default congestion control algorithm"); SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, cc_list_available, "A", "List available congestion control algorithms"); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hystartplusplus, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "New Reno related HyStart++ settings"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, minrtt_thresh, CTLFLAG_RW, &hystart_minrtt_thresh, 4000, "HyStarts++ minimum RTT thresh used in clamp (in microseconds)"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, maxrtt_thresh, CTLFLAG_RW, &hystart_maxrtt_thresh, 16000, "HyStarts++ maximum RTT thresh used in clamp (in microseconds)"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, n_rttsamples, CTLFLAG_RW, &hystart_n_rttsamples, 8, "The number of RTT samples that must be seen to consider HyStart++"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_growth_div, CTLFLAG_RW, &hystart_css_growth_div, 4, "The divisor to the growth when in Hystart++ CSS"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_rounds, CTLFLAG_RW, &hystart_css_rounds, 5, "The number of rounds HyStart++ lasts in CSS before falling to CA"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, bblogs, CTLFLAG_RW, &hystart_bblogs, 0, "Do we enable HyStart++ Black Box logs to be generated if BB logging is on"); VNET_DEFINE(int, cc_do_abe) = 0; SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cc_do_abe), 0, "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)"); VNET_DEFINE(int, cc_abe_frlossreduce) = 0; SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cc_abe_frlossreduce), 0, "Apply standard beta instead of ABE-beta during ECN-signalled congestion " "recovery episodes if loss also needs to be repaired"); diff --git a/sys/netinet/cc/cc.h b/sys/netinet/cc/cc.h index d78c61bde2fe..890bea69a14b 100644 --- a/sys/netinet/cc/cc.h +++ b/sys/netinet/cc/cc.h @@ -1,246 +1,249 @@ /*- * Copyright (c) 2007-2008 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart and * James Healy, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This software was first released in 2007 by James Healy and Lawrence Stewart * whilst working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #ifndef _NETINET_CC_CC_H_ #define _NETINET_CC_CC_H_ #ifdef _KERNEL MALLOC_DECLARE(M_CC_MEM); /* Global CC vars. */ extern STAILQ_HEAD(cc_head, cc_algo) cc_list; extern const int tcprexmtthresh; /* Per-netstack bits. */ VNET_DECLARE(struct cc_algo *, default_cc_ptr); #define V_default_cc_ptr VNET(default_cc_ptr) VNET_DECLARE(int, cc_do_abe); #define V_cc_do_abe VNET(cc_do_abe) VNET_DECLARE(int, cc_abe_frlossreduce); #define V_cc_abe_frlossreduce VNET(cc_abe_frlossreduce) /* Define the new net.inet.tcp.cc sysctl tree. */ #ifdef _SYS_SYSCTL_H_ SYSCTL_DECL(_net_inet_tcp_cc); #endif /* For CC modules that use hystart++ */ extern uint32_t hystart_lowcwnd; extern uint32_t hystart_minrtt_thresh; extern uint32_t hystart_maxrtt_thresh; extern uint32_t hystart_n_rttsamples; extern uint32_t hystart_css_growth_div; extern uint32_t hystart_css_rounds; extern uint32_t hystart_bblogs; /* CC housekeeping functions. */ int cc_register_algo(struct cc_algo *add_cc); int cc_deregister_algo(struct cc_algo *remove_cc); #endif /* _KERNEL */ #if defined(_KERNEL) || defined(_WANT_TCPCB) struct cc_var { void *cc_data; /* Per-connection private CC algorithm data. */ int bytes_this_ack; /* # bytes acked by the current ACK. */ tcp_seq curack; /* Most recent ACK. */ uint32_t flags; /* Flags for cc_var (see below) */ struct tcpcb *tp; /* Pointer to tcpcb */ uint16_t nsegs; /* # segments coalesced into current chain. */ uint8_t labc; /* Dont use system abc use passed in */ }; /* cc_var flags. */ #define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */ #define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */ #define CCF_USE_LOCAL_ABC 0x0004 /* Dont use the system l_abc val */ #define CCF_ACKNOW 0x0008 /* Will this ack be sent now? */ #define CCF_IPHDR_CE 0x0010 /* Does this packet set CE bit? */ #define CCF_TCPHDR_CWR 0x0020 /* Does this packet set CWR bit? */ #define CCF_UNUSED1 0x0040 #define CCF_UNUSED2 0x0080 #define CCF_UNUSED3 0x0100 #define CCF_UNUSED4 0x0200 #define CCF_HYSTART_ALLOWED 0x0400 /* If the CC supports it Hystart is allowed */ #define CCF_HYSTART_CAN_SH_CWND 0x0800 /* Can hystart when going CSS -> CA slam the cwnd */ #define CCF_HYSTART_CONS_SSTH 0x1000 /* Should hystart use the more conservative ssthresh */ #endif /* defined(_KERNEL) || defined(_WANT_TCPCB) */ typedef enum { #if defined(_KERNEL) || defined(_WANT_TCPCB) /* ACK types passed to the ack_received() hook. */ CC_ACK = 0x0001, /* Regular in sequence ACK. */ CC_DUPACK = 0x0002, /* Duplicate ACK. */ CC_PARTIALACK = 0x0004, /* Not yet. */ CC_SACK = 0x0008, /* Not yet. */ #endif /* defined(_KERNEL) || defined(_WANT_TCPCB) */ /* Congestion signal types passed to the cong_signal() hook. */ CC_ECN = 0x0100, /* ECN marked packet received. */ CC_RTO = 0x0200, /* RTO fired. */ CC_RTO_ERR = 0x0400, /* RTO fired in error. */ CC_NDUPACK = 0x0800, /* Threshold of dupack's reached. */ /* * The highest order 8 bits (0x01000000 - 0x80000000) are reserved * for CC algos to declare their own congestion signal types. */ CC_SIGPRIVMASK = 0xFF000000 /* Mask to check if sig is private. */ } ccsignal_t; #ifdef _KERNEL /* * Structure to hold data and function pointers that together represent a * congestion control algorithm. */ struct cc_algo { char name[TCP_CA_NAME_MAX]; /* Init global module state on kldload. */ int (*mod_init)(void); /* Cleanup global module state on kldunload. */ int (*mod_destroy)(void); /* Return the size of the void pointer the CC needs for state */ size_t (*cc_data_sz)(void); /* * Init CC state for a new control block. The CC * module may be passed a NULL ptr indicating that * it must allocate the memory. If it is passed a * non-null pointer it is pre-allocated memory by * the caller and the cb_init is expected to use that memory. * It is not expected to fail if memory is passed in and * all currently defined modules do not. */ int (*cb_init)(struct cc_var *ccv, void *ptr); /* Cleanup CC state for a terminating control block. */ void (*cb_destroy)(struct cc_var *ccv); /* Init variables for a newly established connection. */ void (*conn_init)(struct cc_var *ccv); /* Called on receipt of an ack. */ void (*ack_received)(struct cc_var *ccv, ccsignal_t type); /* Called on detection of a congestion signal. */ void (*cong_signal)(struct cc_var *ccv, ccsignal_t type); /* Called after exiting congestion recovery. */ void (*post_recovery)(struct cc_var *ccv); /* Called when data transfer resumes after an idle period. */ void (*after_idle)(struct cc_var *ccv); /* Called for an additional ECN processing apart from RFC3168. */ void (*ecnpkt_handler)(struct cc_var *ccv); /* Called when a new "round" begins, if the transport is tracking rounds. */ void (*newround)(struct cc_var *ccv, uint32_t round_cnt); /* * Called when a RTT sample is made (fas = flight at send, if you dont have it * send the cwnd in). */ void (*rttsample)(struct cc_var *ccv, uint32_t usec_rtt, uint32_t rxtcnt, uint32_t fas); /* Called for {get|set}sockopt() on a TCP socket with TCP_CCALGOOPT. */ int (*ctl_output)(struct cc_var *, struct sockopt *, void *); STAILQ_ENTRY (cc_algo) entries; u_int cc_refcount; uint8_t flags; }; #define CC_MODULE_BEING_REMOVED 0x01 /* The module is being removed */ /* Macro to obtain the CC algo's struct ptr. */ #define CC_ALGO(tp) ((tp)->t_cc) /* Macro to obtain the CC algo's data ptr. */ #define CC_DATA(tp) ((tp)->t_ccv.cc_data) /* Macro to obtain the system default CC algo's struct ptr. */ #define CC_DEFAULT_ALGO() V_default_cc_ptr extern struct rwlock cc_list_lock; #define CC_LIST_LOCK_INIT() rw_init(&cc_list_lock, "cc_list") #define CC_LIST_LOCK_DESTROY() rw_destroy(&cc_list_lock) #define CC_LIST_RLOCK() rw_rlock(&cc_list_lock) #define CC_LIST_RUNLOCK() rw_runlock(&cc_list_lock) #define CC_LIST_WLOCK() rw_wlock(&cc_list_lock) #define CC_LIST_WUNLOCK() rw_wunlock(&cc_list_lock) #define CC_LIST_LOCK_ASSERT() rw_assert(&cc_list_lock, RA_LOCKED) #define CC_ALGOOPT_LIMIT 2048 /* * These routines give NewReno behavior to the caller * they require no state and can be used by any other CC * module that wishes to use NewReno type behaviour (along * with anything else they may add on, pre or post call). */ void newreno_cc_post_recovery(struct cc_var *); void newreno_cc_after_idle(struct cc_var *); void newreno_cc_cong_signal(struct cc_var *, ccsignal_t); void newreno_cc_ack_received(struct cc_var *, ccsignal_t); +u_int newreno_cc_cwnd_on_multiplicative_decrease(struct cc_var *ccv, uint32_t mss); +u_int newreno_cc_cwnd_in_cong_avoid(struct cc_var *ccv); +u_int newreno_cc_cwnd_in_slow_start(struct cc_var *ccv); /* Called to temporarily keep an algo from going away during change */ void cc_refer(struct cc_algo *algo); /* Called to release the temporary hold */ void cc_release(struct cc_algo *algo); /* Called to attach a CC algorithm to a tcpcb */ void cc_attach(struct tcpcb *, struct cc_algo *); /* Called to detach a CC algorithm from a tcpcb */ void cc_detach(struct tcpcb *); #endif /* _KERNEL */ #endif /* _NETINET_CC_CC_H_ */