diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c index c628be250abf..eae5b7bca4d9 100644 --- a/sys/netinet/cc/cc.c +++ b/sys/netinet/cc/cc.c @@ -1,763 +1,764 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2007-2008 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart and * James Healy, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This software was first released in 2007 by James Healy and Lawrence Stewart * whilst working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Have a sane default if no CC_DEFAULT is specified in the kernel config file. */ #ifndef CC_DEFAULT #define CC_DEFAULT "cubic" #endif uint32_t hystart_minrtt_thresh = 4000; uint32_t hystart_maxrtt_thresh = 16000; uint32_t hystart_n_rttsamples = 8; uint32_t hystart_css_growth_div = 4; uint32_t hystart_css_rounds = 5; uint32_t hystart_bblogs = 0; MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory"); /* * List of available cc algorithms on the current system. First element * is used as the system default CC algorithm. */ struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list); /* Protects the cc_list TAILQ. 
*/
struct rwlock cc_list_lock;

/* Per-vnet pointer to the default CC algorithm; starts out NULL. */
VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL;

/*
 * NewReno beta, read through V_newreno_beta and applied as a percentage
 * (divided by 100) by newreno_cc_cwnd_on_multiplicative_decrease().
 */
VNET_DEFINE(uint32_t, newreno_beta) = 50;
#define V_newreno_beta VNET(newreno_beta)
/* Presumably the ECN counterpart of newreno_beta; not read in this file section. */
VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80;

/*
 * Take a reference on a CC algorithm by incrementing its cc_refcount.
 * CC_LIST_LOCK_ASSERT() checks the caller's hold on the cc_list lock.
 */
void
cc_refer(struct cc_algo *algo)
{
	CC_LIST_LOCK_ASSERT();
	refcount_acquire(&algo->cc_refcount);
}

/*
 * Drop a reference on a CC algorithm by decrementing its cc_refcount.
 * The return value of refcount_release() is deliberately ignored here;
 * nothing is freed when the count reaches zero.
 */
void
cc_release(struct cc_algo *algo)
{
	CC_LIST_LOCK_ASSERT();
	refcount_release(&algo->cc_refcount);
}

/*
 * Bind a CC algorithm to a tcpcb: store it in CC_ALGO(tp) and take a
 * reference on it, both while holding the cc_list read lock.
 */
void
cc_attach(struct tcpcb *tp, struct cc_algo *algo)
{
	/*
	 * Attach the tcpcb to the algorithm.
	 */
	CC_LIST_RLOCK();
	CC_ALGO(tp) = algo;
	cc_refer(algo);
	CC_LIST_RUNLOCK();
}

/*
 * Unbind the tcpcb from its CC algorithm: clear CC_ALGO(tp) and drop the
 * reference on the algorithm that was attached, under the cc_list read lock.
 *
 * NOTE(review): the previous algorithm is passed to cc_release() without a
 * NULL check, so this assumes CC_ALGO(tp) is always set on entry -- confirm
 * against callers.
 */
void
cc_detach(struct tcpcb *tp)
{
	struct cc_algo *algo;

	CC_LIST_RLOCK();
	algo = CC_ALGO(tp);
	CC_ALGO(tp) = NULL;
	cc_release(algo);
	CC_LIST_RUNLOCK();
}

/*
 * Sysctl handler to show and change the default CC algorithm.
 *
 * Reading reports the name of the current default (or an all-zero buffer
 * when no default is set).  Writing looks the supplied name up in cc_list
 * and, if a matching algorithm exists that is not flagged
 * CC_MODULE_BEING_REMOVED, makes it the default for the current vnet.
 *
 * Returns 0 on success, the error from sysctl_handle_string() if that
 * fails, or ESRCH when no eligible algorithm with the given name is found.
 */
static int
cc_default_algo(SYSCTL_HANDLER_ARGS)
{
	char default_cc[TCP_CA_NAME_MAX];
	struct cc_algo *funcs;
	int error;

	/* Get the current default: */
	CC_LIST_RLOCK();
	if (CC_DEFAULT_ALGO() != NULL)
		strlcpy(default_cc, CC_DEFAULT_ALGO()->name,
		    sizeof(default_cc));
	else
		memset(default_cc, 0, TCP_CA_NAME_MAX);
	CC_LIST_RUNLOCK();

	/* The list lock is not held across the sysctl string copy. */
	error = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);

	/* Check for error or no change */
	if (error != 0 || req->newptr == NULL)
		goto done;

	/* Assume failure until a matching, eligible algorithm is found. */
	error = ESRCH;
	/* Find algo with specified name and set it to default. */
	/*
	 * NOTE(review): V_default_cc_ptr is updated below while holding only
	 * the read lock -- confirm that this is intended.
	 */
	CC_LIST_RLOCK();
	STAILQ_FOREACH(funcs, &cc_list, entries) {
		if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
			continue;
		if (funcs->flags & CC_MODULE_BEING_REMOVED) {
			/* It's being removed, so it is not eligible. */
			continue;
		}
		V_default_cc_ptr = funcs;
		error = 0;
		break;
	}
	CC_LIST_RUNLOCK();
done:
	return (error);
}

/*
 * Sysctl handler to display the list of available CC algorithms.
*/
/*
 * Output format: a header line followed by one line per registered
 * algorithm giving its name, a '*' marker if it is the current default,
 * and its cc_refcount (shown under the "PCB count" heading).
 *
 * Returns ENOENT when no algorithm is registered, EOVERFLOW when the list
 * no longer fits the buffer sized from the first pass, otherwise the result
 * of sysctl_handle_string().
 */
static int
cc_list_available(SYSCTL_HANDLER_ARGS)
{
	struct cc_algo *algo;
	int error, nalgos;
	int linesz;
	char *buffer, *cp;
	size_t bufsz, outsz;

	error = nalgos = 0;
	/* First pass: count the entries so the buffer can be sized. */
	CC_LIST_RLOCK();
	STAILQ_FOREACH(algo, &cc_list, entries) {
		nalgos++;
	}
	CC_LIST_RUNLOCK();
	if (nalgos == 0) {
		return (ENOENT);
	}
	/* Room for nalgos + 2 lines of (TCP_CA_NAME_MAX + 13) + 1 bytes each. */
	bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1);
	/* Allocated with M_WAITOK after the list lock has been dropped. */
	buffer = malloc(bufsz, M_TEMP, M_WAITOK);
	cp = buffer;

	/* Header line; cp/bufsz track the unused tail, outsz the bytes written. */
	linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D',
	    "PCB count");
	cp += linesz;
	bufsz -= linesz;
	outsz = linesz;
	/*
	 * Second pass: the lock was released in between, so the list may
	 * differ from what was counted; the size check below turns an
	 * overlong list into EOVERFLOW instead of a buffer overrun.
	 */
	CC_LIST_RLOCK();
	STAILQ_FOREACH(algo, &cc_list, entries) {
		linesz = snprintf(cp, bufsz, "%-16s%c %u\n",
		    algo->name,
		    (algo == CC_DEFAULT_ALGO()) ? '*' : ' ',
		    algo->cc_refcount);
		/* snprintf() reports the untruncated length; >= means it did not fit. */
		if (linesz >= bufsz) {
			error = EOVERFLOW;
			break;
		}
		cp += linesz;
		bufsz -= linesz;
		outsz += linesz;
	}
	CC_LIST_RUNLOCK();
	/* outsz + 1 includes the terminating NUL written by snprintf(). */
	if (error == 0)
		error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
	free(buffer, M_TEMP);
	return (error);
}

/*
 * Return the number of vnets in which the algorithm proposed for removal
 * (remove_cc) is currently the default.  The comparison is by name, up to
 * TCP_CA_NAME_MAX characters, not by pointer.  CC_LIST_LOCK_ASSERT() checks
 * the caller's hold on the cc_list lock.
 */
static int
cc_check_default(struct cc_algo *remove_cc)
{
	int cnt = 0;
	VNET_ITERATOR_DECL(vnet_iter);

	CC_LIST_LOCK_ASSERT();

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		/* Switch vnet context so CC_DEFAULT_ALGO() reads this vnet's default. */
		CURVNET_SET(vnet_iter);
		if ((CC_DEFAULT_ALGO() != NULL) &&
		    strncmp(CC_DEFAULT_ALGO()->name,
			    remove_cc->name,
			    TCP_CA_NAME_MAX) == 0) {
			cnt++;
		}
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
	return (cnt);
}

/*
 * Initialise CC subsystem on system boot.
 */
static void
cc_init(void)
{
	CC_LIST_LOCK_INIT();
	STAILQ_INIT(&cc_list);
}

/*
 * Remove an algorithm from cc_list.  cc_deregister_algo() calls this with
 * the cc_list write lock held.
 *
 * Returns 0 on success, non-zero on failure: ENOENT if remove_cc is not on
 * cc_list, EBUSY if it is still the default in some vnet or its cc_refcount
 * is non-zero.
 */
static int
cc_deregister_algo_locked(struct cc_algo *remove_cc)
{
	struct cc_algo *funcs;
	int found = 0;

	/* This is unlikely to fail */
	STAILQ_FOREACH(funcs, &cc_list, entries) {
		if (funcs == remove_cc)
			found = 1;
	}
	if (found == 0) {
		/* Nothing to remove? */
		return (ENOENT);
	}
	/* We assert it should have been MOD_QUIESCE'd */
	KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED),
		("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc));
	/* Refuse to remove an algorithm that is still some vnet's default. */
	if (cc_check_default(remove_cc)) {
		return(EBUSY);
	}
	/* Refuse to remove an algorithm that still has references. */
	if (remove_cc->cc_refcount != 0) {
		return (EBUSY);
	}
	/* Remove algo from cc_list so that new connections can't use it. */
	STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries);
	return (0);
}

/*
 * Locking wrapper around cc_deregister_algo_locked(): takes the cc_list
 * write lock for the duration of the removal.
 *
 * Returns 0 on success, non-zero (ENOENT or EBUSY) on failure.
 */
int
cc_deregister_algo(struct cc_algo *remove_cc)
{
	int ret;

	CC_LIST_WLOCK();
	ret = cc_deregister_algo_locked(remove_cc);
	CC_LIST_WUNLOCK();
	return (ret);
}

/*
 * Add an algorithm to the tail of cc_list and initialise its reference
 * count to 0.
 *
 * Returns 0 on success, non-zero on failure (EEXIST if the same structure
 * or an algorithm with the same name is already registered).
 */
int
cc_register_algo(struct cc_algo *add_cc)
{
	struct cc_algo *funcs;
	int err;

	err = 0;

	/*
	 * Iterate over list of registered CC algorithms and make sure
	 * we're not trying to add a duplicate.
	 */
	CC_LIST_WLOCK();
	STAILQ_FOREACH(funcs, &cc_list, entries) {
		if (funcs == add_cc ||
		    strncmp(funcs->name, add_cc->name,
			    TCP_CA_NAME_MAX) == 0) {
			err = EEXIST;
			break;
		}
	}
	/* Init its reference count */
	if (err == 0)
		refcount_init(&add_cc->cc_refcount, 0);
	/*
	 * The first loaded congestion control module will become
	 * the default until we find the "CC_DEFAULT" defined in
	 * the config (if we do).
	 */
	if (!err) {
		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
		if (strcmp(add_cc->name, CC_DEFAULT) == 0) {
			V_default_cc_ptr = add_cc;
		} else if (V_default_cc_ptr == NULL) {
			V_default_cc_ptr = add_cc;
		}
	}
	CC_LIST_WUNLOCK();

	return (err);
}

/*
 * Per-vnet initialisation: for every vnet other than the default one, copy
 * vnet0's default CC algorithm pointer into the current vnet.  The arg
 * parameter is not used.
 */
static void
vnet_cc_sysinit(void *arg)
{
	struct cc_algo *cc;

	if (IS_DEFAULT_VNET(curvnet))
		return;

	/* Temporarily switch to vnet0 to read its default algorithm. */
	CURVNET_SET(vnet0);
	cc = V_default_cc_ptr;
	CURVNET_RESTORE();

	V_default_cc_ptr = cc;
}
VNET_SYSINIT(vnet_cc_sysinit, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
    vnet_cc_sysinit, NULL);

/*
 * Perform any necessary tasks before we exit congestion recovery.
*/ void newreno_cc_post_recovery(struct cc_var *ccv) { int pipe; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { /* * Fast recovery will conclude after returning from this * function. Window inflation should have left us with * approximately snd_ssthresh outstanding data. But in case we * would be inclined to send a burst, better to do it via the * slow start mechanism. * * XXXLAS: Find a way to do this without needing curack */ if (V_tcp_do_newsack) pipe = tcp_compute_pipe(ccv->tp); else pipe = CCV(ccv, snd_max) - ccv->curack; if (pipe < CCV(ccv, snd_ssthresh)) /* * Ensure that cwnd does not collapse to 1 MSS under * adverse conditions. Implements RFC6582 */ - CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + - CCV(ccv, t_maxseg); + CCV(ccv, snd_cwnd) = max(pipe, mss) + mss; else CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); } } void newreno_cc_after_idle(struct cc_var *ccv) { uint32_t rw; /* * If we've been idle for more than one retransmit timeout the old * congestion window is no longer current and we have to reduce it to * the restart window before we can transmit again. * * The restart window is the initial window or the last CWND, whichever * is smaller. * * This is done to prevent us from flooding the path with a full CWND at * wirespeed, overloading router and switch buffers along the way. * * See RFC5681 Section 4.1. "Restarting Idle Connections". * * In addition, per RFC2861 Section 2, the ssthresh is set to the * maximum of the former ssthresh or 3/4 of the old cwnd, to * not exit slow-start prematurely. */ rw = tcp_compute_initwnd(tcp_fixed_maxseg(ccv->tp)); CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2)); CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); } /* * Get a new congestion window size on a multiplicative decrease event. 
* */ u_int newreno_cc_cwnd_on_multiplicative_decrease(struct cc_var *ccv, uint32_t mss) { uint32_t cwin, factor; cwin = CCV(ccv, snd_cwnd); /* * Other TCP congestion controls use newreno_cong_signal(), but * with their own private cc_data. Make sure the cc_data is used * correctly. */ factor = V_newreno_beta; return max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 2) * mss; } /* * Perform any necessary tasks before we enter congestion recovery. */ void newreno_cc_cong_signal(struct cc_var *ccv, ccsignal_t type) { uint32_t cwin, mss, pipe; mss = tcp_fixed_maxseg(ccv->tp); /* Catch algos which mistakenly leak private signal types. */ KASSERT((type & CC_SIGPRIVMASK) == 0, ("%s: congestion signal type 0x%08x is private\n", __func__, type)); cwin = newreno_cc_cwnd_on_multiplicative_decrease(ccv, mss); switch (type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) CCV(ccv, snd_ssthresh) = cwin; ENTER_RECOVERY(CCV(ccv, t_flags)); } break; case CC_ECN: if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { CCV(ccv, snd_ssthresh) = cwin; CCV(ccv, snd_cwnd) = cwin; ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } break; case CC_RTO: if (CCV(ccv, t_rxtshift) == 1) { if (V_tcp_do_newsack) { pipe = tcp_compute_pipe(ccv->tp); } else { pipe = CCV(ccv, snd_max) - CCV(ccv, snd_fack) + CCV(ccv, sackhint.sack_bytes_rexmit); } CCV(ccv, snd_ssthresh) = max(2, min(CCV(ccv, snd_wnd), pipe) / 2 / mss) * mss; } CCV(ccv, snd_cwnd) = mss; break; default: break; } } u_int newreno_cc_cwnd_in_cong_avoid(struct cc_var *ccv) { u_int cw = CCV(ccv, snd_cwnd); - u_int incr = CCV(ccv, t_maxseg); + u_int incr = tcp_fixed_maxseg(ccv->tp); KASSERT(cw > CCV(ccv, snd_ssthresh), ("congestion control state not in congestion avoidance\n")); /* * Regular in-order ACK, open the congestion window. * The congestion control state we're in is congestion avoidance. * * Check if ABC (RFC 3465) is enabled. 
* cong avoid: cwnd > ssthresh * * cong avoid and ABC (RFC 3465): * Grow cwnd linearly by maxseg per RTT for each * cwnd worth of ACKed data. * * cong avoid without ABC (RFC 5681): * Grow cwnd linearly by approximately maxseg per RTT using * maxseg^2 / cwnd per ACK as the increment. * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to * avoid capping cwnd. */ if (V_tcp_do_rfc3465) { if (ccv->flags & CCF_ABC_SENTAWND) ccv->flags &= ~CCF_ABC_SENTAWND; else incr = 0; } else incr = max((incr * incr / cw), 1); /* ABC is on by default, so incr equals 0 frequently. */ if (incr > 0) return min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); else return cw; } u_int newreno_cc_cwnd_in_slow_start(struct cc_var *ccv) { u_int cw = CCV(ccv, snd_cwnd); - u_int incr = CCV(ccv, t_maxseg); + u_int mss = tcp_fixed_maxseg(ccv->tp); + u_int incr = mss; KASSERT(cw <= CCV(ccv, snd_ssthresh), ("congestion control state not in slow start\n")); /* * Regular in-order ACK, open the congestion window. * The congestion control state we're in is slow start. * * slow start: cwnd <= ssthresh * * slow start and ABC (RFC 3465): * Grow cwnd exponentially by the amount of data * ACKed capping the max increment per ACK to * (abc_l_var * maxseg) bytes. * * slow start without ABC (RFC 5681): * Grow cwnd exponentially by maxseg per ACK. */ if (V_tcp_do_rfc3465) { /* * In slow-start with ABC enabled and no RTO in sight? * (Must not use abc_l_var > 1 if slow starting after * an RTO. On RTO, snd_nxt = snd_una, so the * snd_nxt == snd_max check is sufficient to * handle this). * * XXXLAS: Find a way to signal SS after RTO that * doesn't rely on tcpcb vars. 
*/ uint16_t abc_val; if (ccv->flags & CCF_USE_LOCAL_ABC) abc_val = ccv->labc; else abc_val = V_tcp_abc_l_var; if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) incr = min(ccv->bytes_this_ack, - ccv->nsegs * abc_val * CCV(ccv, t_maxseg)); + ccv->nsegs * abc_val * mss); else - incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + incr = min(ccv->bytes_this_ack, mss); } /* ABC is on by default, so incr equals 0 frequently. */ if (incr > 0) return min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); else return cw; } void newreno_cc_ack_received(struct cc_var *ccv, ccsignal_t type) { if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED)) { if (CCV(ccv, snd_cwnd) > CCV(ccv, snd_ssthresh)) { CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_cong_avoid(ccv); } else { CCV(ccv, snd_cwnd) = newreno_cc_cwnd_in_slow_start(ccv); } } } static int cc_stop_new_assignments(struct cc_algo *algo) { CC_LIST_WLOCK(); if (cc_check_default(algo)) { /* A default cannot be removed */ CC_LIST_WUNLOCK(); return (EBUSY); } algo->flags |= CC_MODULE_BEING_REMOVED; CC_LIST_WUNLOCK(); return (0); } /* * Handles kld related events. Returns 0 on success, non-zero on failure. */ int cc_modevent(module_t mod, int event_type, void *data) { struct cc_algo *algo; int err; err = 0; algo = (struct cc_algo *)data; switch(event_type) { case MOD_LOAD: if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) { /* * A module must have a cc_data_sz function * even if it has no data it should return 0. 
*/ printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n"); err = EINVAL; break; } if (algo->mod_init != NULL) err = algo->mod_init(); if (!err) err = cc_register_algo(algo); break; case MOD_SHUTDOWN: break; case MOD_QUIESCE: /* Stop any new assigments */ err = cc_stop_new_assignments(algo); break; case MOD_UNLOAD: /* * Deregister and remove the module from the list */ CC_LIST_WLOCK(); /* Even with -f we can't unload if its the default */ if (cc_check_default(algo)) { /* A default cannot be removed */ CC_LIST_WUNLOCK(); return (EBUSY); } /* * If -f was used and users are still attached to * the algorithm things are going to go boom. */ err = cc_deregister_algo_locked(algo); CC_LIST_WUNLOCK(); if ((err == 0) && (algo->mod_destroy != NULL)) { algo->mod_destroy(); } break; default: err = EINVAL; break; } return (err); } SYSINIT(cc, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, cc_init, NULL); /* Declare sysctl tree and populate it. */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Congestion control related settings"); SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, cc_default_algo, "A", "Default congestion control algorithm"); SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, cc_list_available, "A", "List available congestion control algorithms"); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hystartplusplus, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "New Reno related HyStart++ settings"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, minrtt_thresh, CTLFLAG_RW, &hystart_minrtt_thresh, 4000, "HyStarts++ minimum RTT thresh used in clamp (in microseconds)"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, maxrtt_thresh, CTLFLAG_RW, &hystart_maxrtt_thresh, 16000, "HyStarts++ maximum RTT thresh used in clamp (in microseconds)"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, 
n_rttsamples, CTLFLAG_RW, &hystart_n_rttsamples, 8, "The number of RTT samples that must be seen to consider HyStart++"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_growth_div, CTLFLAG_RW, &hystart_css_growth_div, 4, "The divisor to the growth when in Hystart++ CSS"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, css_rounds, CTLFLAG_RW, &hystart_css_rounds, 5, "The number of rounds HyStart++ lasts in CSS before falling to CA"); SYSCTL_UINT(_net_inet_tcp_cc_hystartplusplus, OID_AUTO, bblogs, CTLFLAG_RW, &hystart_bblogs, 0, "Do we enable HyStart++ Black Box logs to be generated if BB logging is on"); VNET_DEFINE(int, cc_do_abe) = 0; SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cc_do_abe), 0, "Enable draft-ietf-tcpm-alternativebackoff-ecn (TCP Alternative Backoff with ECN)"); VNET_DEFINE(int, cc_abe_frlossreduce) = 0; SYSCTL_INT(_net_inet_tcp_cc, OID_AUTO, abe_frlossreduce, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cc_abe_frlossreduce), 0, "Apply standard beta instead of ABE-beta during ECN-signalled congestion " "recovery episodes if loss also needs to be repaired"); diff --git a/sys/netinet/cc/cc_cdg.c b/sys/netinet/cc/cc_cdg.c index 997d9435870f..5b1df76e71a2 100644 --- a/sys/netinet/cc/cc_cdg.c +++ b/sys/netinet/cc/cc_cdg.c @@ -1,727 +1,728 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009-2013 * Swinburne University of Technology, Melbourne, Australia * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by David Hayes, made * possible in part by a gift from The Cisco University Research Program Fund, * a corporate advised fund of Silicon Valley Community Foundation. Development * and testing were further assisted by a grant from the FreeBSD Foundation. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * CAIA Delay-Gradient (CDG) congestion control algorithm * * An implementation of the delay-gradient congestion control algorithm proposed * in the following paper: * * D. A. Hayes and G. Armitage, "Revisiting TCP Congestion Control using Delay * Gradients", in IFIP Networking, Valencia, Spain, 9-13 May 2011. * * Developed as part of the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia.
More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define CDG_VERSION "0.1" /* Private delay-gradient induced congestion control signal. */ #define CC_CDG_DELAY 0x01000000 /* NewReno window deflation factor on loss (as a percentage). */ #define RENO_BETA 50 /* Queue states. */ #define CDG_Q_EMPTY 1 #define CDG_Q_RISING 2 #define CDG_Q_FALLING 3 #define CDG_Q_FULL 4 #define CDG_Q_UNKNOWN 9999 /* Number of bit shifts used in probexp lookup table. */ #define EXP_PREC 15 /* Largest gradient represented in probexp lookup table. */ #define MAXGRAD 5 /* * Delay Precision Enhance - number of bit shifts used for qtrend related * integer arithmetic precision. */ #define D_P_E 7 struct qdiff_sample { long qdiff; STAILQ_ENTRY(qdiff_sample) qdiff_lnk; }; struct cdg { long max_qtrend; long min_qtrend; STAILQ_HEAD(minrtts_head, qdiff_sample) qdiffmin_q; STAILQ_HEAD(maxrtts_head, qdiff_sample) qdiffmax_q; long window_incr; /* rttcount for window increase when in congestion avoidance */ long rtt_count; /* maximum measured rtt within an rtt period */ int maxrtt_in_rtt; /* maximum measured rtt within prev rtt period */ int maxrtt_in_prevrtt; /* minimum measured rtt within an rtt period */ int minrtt_in_rtt; /* minimum measured rtt within prev rtt period */ int minrtt_in_prevrtt; /* consecutive congestion episode counter */ uint32_t consec_cong_cnt; /* when tracking a new reno type loss window */ uint32_t shadow_w; /* maximum number of samples in the moving average queue */ int sample_q_size; /* number of samples in the moving average queue */ int num_samples; /* estimate of the queue state of the path */ int queue_state; }; /* * Lookup table for: * (1 - exp(-x)) << EXP_PREC, where x = [0,MAXGRAD] in 2^-7 increments * * Note: 
probexp[0] is set to 10 (not 0) as a safety for very low increase * gradients. */ static const int probexp[641] = { 10,255,508,759,1008,1255,1501,1744,1985,2225,2463,2698,2932,3165,3395,3624, 3850,4075,4299,4520,4740,4958,5175,5389,5602,5814,6024,6232,6438,6643,6846, 7048,7248,7447,7644,7839,8033,8226,8417,8606,8794,8981,9166,9350,9532,9713, 9892,10070,10247,10422,10596,10769,10940,11110,11278,11445,11611,11776,11939, 12101,12262,12422,12580,12737,12893,13048,13201,13354,13505,13655,13803,13951, 14097,14243,14387,14530,14672,14813,14952,15091,15229,15365,15500,15635,15768, 15900,16032,16162,16291,16419,16547,16673,16798,16922,17046,17168,17289,17410, 17529,17648,17766,17882,17998,18113,18227,18340,18453,18564,18675,18784,18893, 19001,19108,19215,19320,19425,19529,19632,19734,19835,19936,20036,20135,20233, 20331,20427,20523,20619,20713,20807,20900,20993,21084,21175,21265,21355,21444, 21532,21619,21706,21792,21878,21962,22046,22130,22213,22295,22376,22457,22537, 22617,22696,22774,22852,22929,23006,23082,23157,23232,23306,23380,23453,23525, 23597,23669,23739,23810,23879,23949,24017,24085,24153,24220,24286,24352,24418, 24483,24547,24611,24675,24738,24800,24862,24924,24985,25045,25106,25165,25224, 25283,25341,25399,25456,25513,25570,25626,25681,25737,25791,25846,25899,25953, 26006,26059,26111,26163,26214,26265,26316,26366,26416,26465,26514,26563,26611, 26659,26707,26754,26801,26847,26893,26939,26984,27029,27074,27118,27162,27206, 27249,27292,27335,27377,27419,27460,27502,27543,27583,27624,27664,27703,27743, 27782,27821,27859,27897,27935,27973,28010,28047,28084,28121,28157,28193,28228, 28263,28299,28333,28368,28402,28436,28470,28503,28536,28569,28602,28634,28667, 28699,28730,28762,28793,28824,28854,28885,28915,28945,28975,29004,29034,29063, 29092,29120,29149,29177,29205,29232,29260,29287,29314,29341,29368,29394,29421, 29447,29472,29498,29524,29549,29574,29599,29623,29648,29672,29696,29720,29744, 
29767,29791,29814,29837,29860,29882,29905,29927,29949,29971,29993,30014,30036, 30057,30078,30099,30120,30141,30161,30181,30201,30221,30241,30261,30280,30300, 30319,30338,30357,30376,30394,30413,30431,30449,30467,30485,30503,30521,30538, 30555,30573,30590,30607,30624,30640,30657,30673,30690,30706,30722,30738,30753, 30769,30785,30800,30815,30831,30846,30861,30876,30890,30905,30919,30934,30948, 30962,30976,30990,31004,31018,31031,31045,31058,31072,31085,31098,31111,31124, 31137,31149,31162,31174,31187,31199,31211,31223,31235,31247,31259,31271,31283, 31294,31306,31317,31328,31339,31351,31362,31373,31383,31394,31405,31416,31426, 31436,31447,31457,31467,31477,31487,31497,31507,31517,31527,31537,31546,31556, 31565,31574,31584,31593,31602,31611,31620,31629,31638,31647,31655,31664,31673, 31681,31690,31698,31706,31715,31723,31731,31739,31747,31755,31763,31771,31778, 31786,31794,31801,31809,31816,31824,31831,31838,31846,31853,31860,31867,31874, 31881,31888,31895,31902,31908,31915,31922,31928,31935,31941,31948,31954,31960, 31967,31973,31979,31985,31991,31997,32003,32009,32015,32021,32027,32033,32038, 32044,32050,32055,32061,32066,32072,32077,32083,32088,32093,32098,32104,32109, 32114,32119,32124,32129,32134,32139,32144,32149,32154,32158,32163,32168,32173, 32177,32182,32186,32191,32195,32200,32204,32209,32213,32217,32222,32226,32230, 32234,32238,32242,32247,32251,32255,32259,32263,32267,32270,32274,32278,32282, 32286,32290,32293,32297,32301,32304,32308,32311,32315,32318,32322,32325,32329, 32332,32336,32339,32342,32346,32349,32352,32356,32359,32362,32365,32368,32371, 32374,32377,32381,32384,32387,32389,32392,32395,32398,32401,32404,32407,32410, 32412,32415,32418,32421,32423,32426,32429,32431,32434,32437,32439,32442,32444, 32447,32449,32452,32454,32457,32459,32461,32464,32466,32469,32471,32473,32476, 32478,32480,32482,32485,32487,32489,32491,32493,32495,32497,32500,32502,32504, 32506,32508,32510,32512,32514,32516,32518,32520,32522,32524,32526,32527,32529, 
32531,32533,32535,32537,32538,32540,32542,32544,32545,32547}; static uma_zone_t qdiffsample_zone; static int ertt_id; VNET_DEFINE_STATIC(uint32_t, cdg_alpha_inc); VNET_DEFINE_STATIC(uint32_t, cdg_beta_delay); VNET_DEFINE_STATIC(uint32_t, cdg_beta_loss); VNET_DEFINE_STATIC(uint32_t, cdg_smoothing_factor); VNET_DEFINE_STATIC(uint32_t, cdg_exp_backoff_scale); VNET_DEFINE_STATIC(uint32_t, cdg_consec_cong); VNET_DEFINE_STATIC(uint32_t, cdg_hold_backoff); #define V_cdg_alpha_inc VNET(cdg_alpha_inc) #define V_cdg_beta_delay VNET(cdg_beta_delay) #define V_cdg_beta_loss VNET(cdg_beta_loss) #define V_cdg_smoothing_factor VNET(cdg_smoothing_factor) #define V_cdg_exp_backoff_scale VNET(cdg_exp_backoff_scale) #define V_cdg_consec_cong VNET(cdg_consec_cong) #define V_cdg_hold_backoff VNET(cdg_hold_backoff) /* Function prototypes. */ static int cdg_mod_init(void); static int cdg_mod_destroy(void); static void cdg_conn_init(struct cc_var *ccv); static int cdg_cb_init(struct cc_var *ccv, void *ptr); static void cdg_cb_destroy(struct cc_var *ccv); static void cdg_cong_signal(struct cc_var *ccv, ccsignal_t signal_type); static void cdg_ack_received(struct cc_var *ccv, ccsignal_t ack_type); static size_t cdg_data_sz(void); struct cc_algo cdg_cc_algo = { .name = "cdg", .mod_init = cdg_mod_init, .ack_received = cdg_ack_received, .cb_destroy = cdg_cb_destroy, .cb_init = cdg_cb_init, .conn_init = cdg_conn_init, .cong_signal = cdg_cong_signal, .mod_destroy = cdg_mod_destroy, .cc_data_sz = cdg_data_sz, .post_recovery = newreno_cc_post_recovery, .after_idle = newreno_cc_after_idle, }; /* Vnet created and being initialised. 
*/ static void cdg_init_vnet(const void *unused __unused) { V_cdg_alpha_inc = 0; V_cdg_beta_delay = 70; V_cdg_beta_loss = 50; V_cdg_smoothing_factor = 8; V_cdg_exp_backoff_scale = 3; V_cdg_consec_cong = 5; V_cdg_hold_backoff = 5; } static int cdg_mod_init(void) { VNET_ITERATOR_DECL(v); ertt_id = khelp_get_id("ertt"); if (ertt_id <= 0) return (EINVAL); qdiffsample_zone = uma_zcreate("cdg_qdiffsample", sizeof(struct qdiff_sample), NULL, NULL, NULL, NULL, 0, 0); VNET_LIST_RLOCK(); VNET_FOREACH(v) { CURVNET_SET(v); cdg_init_vnet(NULL); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (0); } static int cdg_mod_destroy(void) { uma_zdestroy(qdiffsample_zone); return (0); } static size_t cdg_data_sz(void) { return (sizeof(struct cdg)); } static int cdg_cb_init(struct cc_var *ccv, void *ptr) { struct cdg *cdg_data; INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { cdg_data = malloc(sizeof(struct cdg), M_CC_MEM, M_NOWAIT); if (cdg_data == NULL) return (ENOMEM); } else { cdg_data = ptr; } cdg_data->shadow_w = 0; cdg_data->max_qtrend = 0; cdg_data->min_qtrend = 0; cdg_data->queue_state = CDG_Q_UNKNOWN; cdg_data->maxrtt_in_rtt = 0; cdg_data->maxrtt_in_prevrtt = 0; cdg_data->minrtt_in_rtt = INT_MAX; cdg_data->minrtt_in_prevrtt = 0; cdg_data->window_incr = 0; cdg_data->rtt_count = 0; cdg_data->consec_cong_cnt = 0; cdg_data->sample_q_size = V_cdg_smoothing_factor; cdg_data->num_samples = 0; STAILQ_INIT(&cdg_data->qdiffmin_q); STAILQ_INIT(&cdg_data->qdiffmax_q); ccv->cc_data = cdg_data; return (0); } static void cdg_conn_init(struct cc_var *ccv) { struct cdg *cdg_data = ccv->cc_data; /* * Initialise the shadow_cwnd in case we are competing with loss based * flows from the start */ cdg_data->shadow_w = CCV(ccv, snd_cwnd); } static void cdg_cb_destroy(struct cc_var *ccv) { struct cdg *cdg_data; struct qdiff_sample *qds, *qds_n; cdg_data = ccv->cc_data; qds = STAILQ_FIRST(&cdg_data->qdiffmin_q); while (qds != NULL) { qds_n = STAILQ_NEXT(qds, qdiff_lnk); 
uma_zfree(qdiffsample_zone,qds); qds = qds_n; } qds = STAILQ_FIRST(&cdg_data->qdiffmax_q); while (qds != NULL) { qds_n = STAILQ_NEXT(qds, qdiff_lnk); uma_zfree(qdiffsample_zone,qds); qds = qds_n; } free(ccv->cc_data, M_CC_MEM); } static int cdg_beta_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = *(uint32_t *)arg1; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (new == 0 || new > 100) error = EINVAL; else *(uint32_t *)arg1 = new; } return (error); } static int cdg_exp_backoff_scale_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = *(uint32_t *)arg1; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (new < 1) error = EINVAL; else *(uint32_t *)arg1 = new; } return (error); } static inline uint32_t cdg_window_decrease(struct cc_var *ccv, unsigned long owin, unsigned int beta) { return ((ulmin(CCV(ccv, snd_wnd), owin) * beta) / 100); } /* * Window increase function * This window increase function is independent of the initial window size * to ensure small window flows are not discriminated against (i.e. fairness). * It increases at 1pkt/rtt like Reno for alpha_inc rtts, and then 2pkts/rtt for * the next alpha_inc rtts, etc. */ static void cdg_window_increase(struct cc_var *ccv, int new_measurement) { struct cdg *cdg_data; int incr, s_w_incr; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); cdg_data = ccv->cc_data; incr = s_w_incr = 0; if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) { /* Slow start. */ - incr = CCV(ccv, t_maxseg); + incr = mss; s_w_incr = incr; cdg_data->window_incr = cdg_data->rtt_count = 0; } else { /* Congestion avoidance. 
*/ if (new_measurement) { - s_w_incr = CCV(ccv, t_maxseg); + s_w_incr = mss; if (V_cdg_alpha_inc == 0) { - incr = CCV(ccv, t_maxseg); + incr = mss; } else { if (++cdg_data->rtt_count >= V_cdg_alpha_inc) { cdg_data->window_incr++; cdg_data->rtt_count = 0; } - incr = CCV(ccv, t_maxseg) * + incr = mss * cdg_data->window_incr; } } } if (cdg_data->shadow_w > 0) cdg_data->shadow_w = ulmin(cdg_data->shadow_w + s_w_incr, TCP_MAXWIN << CCV(ccv, snd_scale)); CCV(ccv, snd_cwnd) = ulmin(CCV(ccv, snd_cwnd) + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); } static void cdg_cong_signal(struct cc_var *ccv, ccsignal_t signal_type) { struct cdg *cdg_data = ccv->cc_data; switch((int)signal_type) { case CC_CDG_DELAY: CCV(ccv, snd_ssthresh) = cdg_window_decrease(ccv, CCV(ccv, snd_cwnd), V_cdg_beta_delay); CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); CCV(ccv, snd_recover) = CCV(ccv, snd_max); cdg_data->window_incr = cdg_data->rtt_count = 0; ENTER_CONGRECOVERY(CCV(ccv, t_flags)); break; case CC_NDUPACK: /* * If already responding to congestion OR we have guessed no * queue in the path is full. */ if (IN_CONGRECOVERY(CCV(ccv, t_flags)) || cdg_data->queue_state < CDG_Q_FULL) { CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd); CCV(ccv, snd_recover) = CCV(ccv, snd_max); } else { /* * Loss is likely to be congestion related. We have * inferred a queue full state, so have shadow window * react to loss as NewReno would. */ if (cdg_data->shadow_w > 0) cdg_data->shadow_w = cdg_window_decrease(ccv, cdg_data->shadow_w, RENO_BETA); CCV(ccv, snd_ssthresh) = max(cdg_data->shadow_w, cdg_window_decrease(ccv, CCV(ccv, snd_cwnd), V_cdg_beta_loss)); cdg_data->window_incr = cdg_data->rtt_count = 0; } ENTER_RECOVERY(CCV(ccv, t_flags)); break; default: newreno_cc_cong_signal(ccv, signal_type); break; } } /* * Using a negative exponential probabilistic backoff so that sources with * varying RTTs which share the same link will, on average, have the same * probability of backoff over time. 
* * Prob_backoff = 1 - exp(-qtrend / V_cdg_exp_backoff_scale), where * V_cdg_exp_backoff_scale is the average qtrend for the exponential backoff. */ static inline int prob_backoff(long qtrend) { int backoff, idx; uint32_t p; backoff = (qtrend > ((MAXGRAD * V_cdg_exp_backoff_scale) << D_P_E)); if (!backoff) { if (V_cdg_exp_backoff_scale > 1) idx = (qtrend + V_cdg_exp_backoff_scale / 2) / V_cdg_exp_backoff_scale; else idx = qtrend; /* Backoff probability proportional to rate of queue growth. */ p = (UINT32_MAX / (1 << EXP_PREC)) * probexp[idx]; backoff = (prng32() < p); } return (backoff); } static inline void calc_moving_average(struct cdg *cdg_data, long qdiff_max, long qdiff_min) { struct qdiff_sample *qds; ++cdg_data->num_samples; if (cdg_data->num_samples > cdg_data->sample_q_size) { /* Minimum RTT. */ qds = STAILQ_FIRST(&cdg_data->qdiffmin_q); cdg_data->min_qtrend = cdg_data->min_qtrend + (qdiff_min - qds->qdiff) / cdg_data->sample_q_size; STAILQ_REMOVE_HEAD(&cdg_data->qdiffmin_q, qdiff_lnk); qds->qdiff = qdiff_min; STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds, qdiff_lnk); /* Maximum RTT. 
*/ qds = STAILQ_FIRST(&cdg_data->qdiffmax_q); cdg_data->max_qtrend = cdg_data->max_qtrend + (qdiff_max - qds->qdiff) / cdg_data->sample_q_size; STAILQ_REMOVE_HEAD(&cdg_data->qdiffmax_q, qdiff_lnk); qds->qdiff = qdiff_max; STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds, qdiff_lnk); --cdg_data->num_samples; } else { qds = uma_zalloc(qdiffsample_zone, M_NOWAIT); if (qds != NULL) { cdg_data->min_qtrend = cdg_data->min_qtrend + qdiff_min / cdg_data->sample_q_size; qds->qdiff = qdiff_min; STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds, qdiff_lnk); } qds = uma_zalloc(qdiffsample_zone, M_NOWAIT); if (qds) { cdg_data->max_qtrend = cdg_data->max_qtrend + qdiff_max / cdg_data->sample_q_size; qds->qdiff = qdiff_max; STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds, qdiff_lnk); } } } static void cdg_ack_received(struct cc_var *ccv, ccsignal_t ack_type) { struct cdg *cdg_data; struct ertt *e_t; long qdiff_max, qdiff_min; int congestion, new_measurement, slowstart; cdg_data = ccv->cc_data; e_t = (struct ertt *)khelp_get_osd(&CCV(ccv, t_osd), ertt_id); new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT; congestion = 0; cdg_data->maxrtt_in_rtt = imax(e_t->rtt, cdg_data->maxrtt_in_rtt); cdg_data->minrtt_in_rtt = imin(e_t->rtt, cdg_data->minrtt_in_rtt); if (new_measurement) { slowstart = (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)); /* * Update smoothed gradient measurements. Since we are only * using one measurement per RTT, use max or min rtt_in_rtt. * This is also less noisy than a sample RTT measurement. Max * RTT measurements can have trouble due to OS issues. */ if (cdg_data->maxrtt_in_prevrtt) { qdiff_max = ((long)(cdg_data->maxrtt_in_rtt - cdg_data->maxrtt_in_prevrtt) << D_P_E ); qdiff_min = ((long)(cdg_data->minrtt_in_rtt - cdg_data->minrtt_in_prevrtt) << D_P_E ); if (cdg_data->sample_q_size == 0) { cdg_data->max_qtrend = qdiff_max; cdg_data->min_qtrend = qdiff_min; } else calc_moving_average(cdg_data, qdiff_max, qdiff_min); /* Probabilistic backoff with respect to gradient. 
*/ if (slowstart && qdiff_min > 0) congestion = prob_backoff(qdiff_min); else if (cdg_data->min_qtrend > 0) congestion = prob_backoff(cdg_data->min_qtrend); else if (slowstart && qdiff_max > 0) congestion = prob_backoff(qdiff_max); else if (cdg_data->max_qtrend > 0) congestion = prob_backoff(cdg_data->max_qtrend); /* Update estimate of queue state. */ if (cdg_data->min_qtrend > 0 && cdg_data->max_qtrend <= 0) { cdg_data->queue_state = CDG_Q_FULL; } else if (cdg_data->min_qtrend >= 0 && cdg_data->max_qtrend < 0) { cdg_data->queue_state = CDG_Q_EMPTY; cdg_data->shadow_w = 0; } else if (cdg_data->min_qtrend > 0 && cdg_data->max_qtrend > 0) { cdg_data->queue_state = CDG_Q_RISING; } else if (cdg_data->min_qtrend < 0 && cdg_data->max_qtrend < 0) { cdg_data->queue_state = CDG_Q_FALLING; } if (cdg_data->min_qtrend < 0 || cdg_data->max_qtrend < 0) cdg_data->consec_cong_cnt = 0; } cdg_data->minrtt_in_prevrtt = cdg_data->minrtt_in_rtt; cdg_data->minrtt_in_rtt = INT_MAX; cdg_data->maxrtt_in_prevrtt = cdg_data->maxrtt_in_rtt; cdg_data->maxrtt_in_rtt = 0; e_t->flags &= ~ERTT_NEW_MEASUREMENT; } if (congestion) { cdg_data->consec_cong_cnt++; if (!IN_RECOVERY(CCV(ccv, t_flags))) { if (cdg_data->consec_cong_cnt <= V_cdg_consec_cong) cdg_cong_signal(ccv, CC_CDG_DELAY); else /* * We have been backing off but the queue is not * falling. Assume we are competing with * loss-based flows and don't back off for the * next V_cdg_hold_backoff RTT periods. */ if (cdg_data->consec_cong_cnt >= V_cdg_consec_cong + V_cdg_hold_backoff) cdg_data->consec_cong_cnt = 0; /* Won't see effect until 2nd RTT. */ cdg_data->maxrtt_in_prevrtt = 0; /* * Resync shadow window in case we are competing with a * loss based flow */ cdg_data->shadow_w = ulmax(CCV(ccv, snd_cwnd), cdg_data->shadow_w); } } else if (ack_type == CC_ACK) cdg_window_increase(ccv, new_measurement); } /* When a vnet is created and being initialised, init the per-stack CDG vars. 
*/ VNET_SYSINIT(cdg_init_vnet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, cdg_init_vnet, NULL); SYSCTL_DECL(_net_inet_tcp_cc_cdg); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, cdg, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "CAIA delay-gradient congestion control related settings"); SYSCTL_STRING(_net_inet_tcp_cc_cdg, OID_AUTO, version, CTLFLAG_RD, CDG_VERSION, sizeof(CDG_VERSION) - 1, "Current algorithm/implementation version number"); SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, alpha_inc, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_alpha_inc), 0, "Increment the window increase factor alpha by 1 MSS segment every " "alpha_inc RTTs during congestion avoidance mode."); SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_delay, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(cdg_beta_delay), 70, &cdg_beta_handler, "IU", "Delay-based window decrease factor as a percentage " "(on delay-based backoff, w = w * beta_delay / 100)"); SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_loss, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(cdg_beta_loss), 50, &cdg_beta_handler, "IU", "Loss-based window decrease factor as a percentage " "(on loss-based backoff, w = w * beta_loss / 100)"); SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, exp_backoff_scale, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(cdg_exp_backoff_scale), 2, &cdg_exp_backoff_scale_handler, "IU", "Scaling parameter for the probabilistic exponential backoff"); SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, smoothing_factor, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_smoothing_factor), 8, "Number of samples used for moving average smoothing (0 = no smoothing)"); SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_consec_cong, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_consec_cong), 5, "Number of consecutive delay-gradient based congestion episodes which will " "trigger loss based CC compatibility"); SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, 
loss_compete_hold_backoff, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_hold_backoff), 5, "Number of consecutive delay-gradient based congestion episodes to hold " "the window backoff for loss based CC compatibility"); DECLARE_CC_MODULE(cdg, &cdg_cc_algo); MODULE_VERSION(cdg, 2); MODULE_DEPEND(cdg, ertt, 1, 1, 1); diff --git a/sys/netinet/cc/cc_chd.c b/sys/netinet/cc/cc_chd.c index f48d1f0066e2..1d440f43578f 100644 --- a/sys/netinet/cc/cc_chd.c +++ b/sys/netinet/cc/cc_chd.c @@ -1,507 +1,509 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009-2010 * Swinburne University of Technology, Melbourne, Australia * Copyright (c) 2010-2011 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by David Hayes and * Lawrence Stewart, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, Melbourne, Australia by * David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * An implementation of the CAIA-Hamilton delay based congestion control * algorithm, based on "Improved coexistence and loss tolerance for delay based * TCP congestion control" by D. A. Hayes and G. Armitage., in 35th Annual IEEE * Conference on Local Computer Networks (LCN 2010), Denver, Colorado, USA, * 11-14 October 2010. * * Originally released as part of the NewTCP research project at Swinburne * University of Technology's Centre for Advanced Internet Architectures, * Melbourne, Australia, which was made possible in part by a grant from the * Cisco University Research Program Fund at Community Foundation Silicon * Valley. More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Private signal type for rate based congestion signal. * See <netinet/cc/cc.h> for appropriate bit-range to use for private signals. */ #define CC_CHD_DELAY 0x02000000 /* Largest possible number returned by prng32().
*/ #define RANDOM_MAX UINT32_MAX static void chd_ack_received(struct cc_var *ccv, ccsignal_t ack_type); static void chd_cb_destroy(struct cc_var *ccv); static int chd_cb_init(struct cc_var *ccv, void *ptr); static void chd_cong_signal(struct cc_var *ccv, ccsignal_t signal_type); static void chd_conn_init(struct cc_var *ccv); static int chd_mod_init(void); static size_t chd_data_sz(void); struct chd { /* * Shadow window - keeps track of what the NewReno congestion window * would have been if delay-based cwnd backoffs had not been made. This * functionality aids coexistence with loss-based TCP flows which may be * sharing links along the path. */ unsigned long shadow_w; /* * Loss-based TCP compatibility flag - When set, it turns on the shadow * window functionality. */ int loss_compete; /* The maximum round trip time seen within a measured rtt period. */ int maxrtt_in_rtt; /* The previous qdly that caused cwnd to backoff. */ int prev_backoff_qdly; }; static int ertt_id; VNET_DEFINE_STATIC(uint32_t, chd_qmin) = 5; VNET_DEFINE_STATIC(uint32_t, chd_pmax) = 50; VNET_DEFINE_STATIC(uint32_t, chd_loss_fair) = 1; VNET_DEFINE_STATIC(uint32_t, chd_use_max) = 1; VNET_DEFINE_STATIC(uint32_t, chd_qthresh) = 20; #define V_chd_qthresh VNET(chd_qthresh) #define V_chd_qmin VNET(chd_qmin) #define V_chd_pmax VNET(chd_pmax) #define V_chd_loss_fair VNET(chd_loss_fair) #define V_chd_use_max VNET(chd_use_max) struct cc_algo chd_cc_algo = { .name = "chd", .ack_received = chd_ack_received, .cb_destroy = chd_cb_destroy, .cb_init = chd_cb_init, .cong_signal = chd_cong_signal, .conn_init = chd_conn_init, .mod_init = chd_mod_init, .cc_data_sz = chd_data_sz, .after_idle = newreno_cc_after_idle, .post_recovery = newreno_cc_post_recovery, }; static __inline void chd_window_decrease(struct cc_var *ccv) { unsigned long win; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); - win = min(CCV(ccv, snd_wnd), CCV(ccv, snd_cwnd)) / CCV(ccv, t_maxseg); + win = min(CCV(ccv, snd_wnd), CCV(ccv, snd_cwnd)) / mss; win 
-= max((win / 2), 1); - CCV(ccv, snd_ssthresh) = max(win, 2) * CCV(ccv, t_maxseg); + CCV(ccv, snd_ssthresh) = max(win, 2) * mss; } /* * Probabilistic backoff function. Returns 1 if we should backoff or 0 * otherwise. The calculation of p is similar to the calculation of p in cc_hd. */ static __inline int should_backoff(int qdly, int maxqdly, struct chd *chd_data) { uint32_t rand, p; rand = prng32(); if (qdly < V_chd_qthresh) { chd_data->loss_compete = 0; p = (((RANDOM_MAX / 100) * V_chd_pmax) / (V_chd_qthresh - V_chd_qmin)) * (qdly - V_chd_qmin); } else { if (qdly > V_chd_qthresh) { p = (((RANDOM_MAX / 100) * V_chd_pmax) / (maxqdly - V_chd_qthresh)) * (maxqdly - qdly); if (V_chd_loss_fair && rand < p) chd_data->loss_compete = 1; } else { p = (RANDOM_MAX / 100) * V_chd_pmax; chd_data->loss_compete = 0; } } return (rand < p); } static __inline void chd_window_increase(struct cc_var *ccv, int new_measurement) { struct chd *chd_data; int incr; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); chd_data = ccv->cc_data; incr = 0; if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) { /* Adapted from NewReno slow start. */ if (V_tcp_do_rfc3465) { /* In slow-start with ABC enabled. */ if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) { /* Not due to RTO. */ incr = min(ccv->bytes_this_ack, - V_tcp_abc_l_var * CCV(ccv, t_maxseg)); + V_tcp_abc_l_var * mss); } else { /* Due to RTO. */ - incr = min(ccv->bytes_this_ack, - CCV(ccv, t_maxseg)); + incr = min(ccv->bytes_this_ack, mss); } } else - incr = CCV(ccv, t_maxseg); + incr = mss; } else { /* Congestion avoidance. */ if (V_tcp_do_rfc3465) { if (ccv->flags & CCF_ABC_SENTAWND) { ccv->flags &= ~CCF_ABC_SENTAWND; - incr = CCV(ccv, t_maxseg); + incr = mss; } } else if (new_measurement) - incr = CCV(ccv, t_maxseg); + incr = mss; } if (chd_data->shadow_w > 0) { /* Track NewReno window. 
*/ chd_data->shadow_w = min(chd_data->shadow_w + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); } CCV(ccv,snd_cwnd) = min(CCV(ccv, snd_cwnd) + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); } /* * All ACK signals are used for timing measurements to determine delay-based * congestion. However, window increases are only performed when * ack_type == CC_ACK. */ static void chd_ack_received(struct cc_var *ccv, ccsignal_t ack_type) { struct chd *chd_data; struct ertt *e_t; int backoff, new_measurement, qdly, rtt; e_t = khelp_get_osd(&CCV(ccv, t_osd), ertt_id); chd_data = ccv->cc_data; new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT; backoff = qdly = 0; chd_data->maxrtt_in_rtt = imax(e_t->rtt, chd_data->maxrtt_in_rtt); if (new_measurement) { /* * There is a new per RTT measurement, so check to see if there * is delay based congestion. */ rtt = V_chd_use_max ? chd_data->maxrtt_in_rtt : e_t->rtt; chd_data->maxrtt_in_rtt = 0; if (rtt && e_t->minrtt && !IN_RECOVERY(CCV(ccv, t_flags))) { qdly = rtt - e_t->minrtt; if (qdly > V_chd_qmin) { /* * Probabilistic delay based congestion * indication. */ backoff = should_backoff(qdly, e_t->maxrtt - e_t->minrtt, chd_data); } else chd_data->loss_compete = 0; } /* Reset per RTT measurement flag to start a new measurement. */ e_t->flags &= ~ERTT_NEW_MEASUREMENT; } if (backoff) { /* * Update shadow_w before delay based backoff. */ if (chd_data->loss_compete || qdly > chd_data->prev_backoff_qdly) { /* * Delay is higher than when we backed off previously, * so it is possible that this flow is competing with * loss based flows. */ chd_data->shadow_w = max(CCV(ccv, snd_cwnd), chd_data->shadow_w); } else { /* * Reset shadow_w, as it is probable that this flow is * not competing with loss based flows at the moment. */ chd_data->shadow_w = 0; } chd_data->prev_backoff_qdly = qdly; /* * Send delay-based congestion signal to the congestion signal * handler. 
*/ chd_cong_signal(ccv, CC_CHD_DELAY); } else if (ack_type == CC_ACK) chd_window_increase(ccv, new_measurement); } static void chd_cb_destroy(struct cc_var *ccv) { free(ccv->cc_data, M_CC_MEM); } size_t chd_data_sz(void) { return (sizeof(struct chd)); } static int chd_cb_init(struct cc_var *ccv, void *ptr) { struct chd *chd_data; INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { chd_data = malloc(sizeof(struct chd), M_CC_MEM, M_NOWAIT); if (chd_data == NULL) return (ENOMEM); } else chd_data = ptr; chd_data->shadow_w = 0; ccv->cc_data = chd_data; return (0); } static void chd_cong_signal(struct cc_var *ccv, ccsignal_t signal_type) { struct ertt *e_t; struct chd *chd_data; int qdly; e_t = khelp_get_osd(&CCV(ccv, t_osd), ertt_id); chd_data = ccv->cc_data; qdly = imax(e_t->rtt, chd_data->maxrtt_in_rtt) - e_t->minrtt; switch((int)signal_type) { case CC_CHD_DELAY: chd_window_decrease(ccv); /* Set new ssthresh. */ CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); CCV(ccv, snd_recover) = CCV(ccv, snd_max); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); break; case CC_NDUPACK: /* Packet loss. */ /* * Only react to loss as a congestion signal if qdly > * V_chd_qthresh. If qdly is less than qthresh, presume that * this is a non congestion related loss. If qdly is greater * than qthresh, assume that we are competing with loss based * tcp flows and restore window from any unnecessary backoffs, * before the decrease. */ if (!IN_RECOVERY(CCV(ccv, t_flags)) && qdly > V_chd_qthresh) { if (chd_data->loss_compete) { CCV(ccv, snd_cwnd) = max(CCV(ccv, snd_cwnd), chd_data->shadow_w); } chd_window_decrease(ccv); } else { /* * This loss isn't congestion related, or already * recovering from congestion. 
*/ CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd); CCV(ccv, snd_recover) = CCV(ccv, snd_max); } if (chd_data->shadow_w > 0) { + uint32_t mss = tcp_fixed_maxseg(ccv->tp); chd_data->shadow_w = max(chd_data->shadow_w / - CCV(ccv, t_maxseg) / 2, 2) * CCV(ccv, t_maxseg); + mss / 2, 2) * mss; } ENTER_FASTRECOVERY(CCV(ccv, t_flags)); break; default: newreno_cc_cong_signal(ccv, signal_type); break; } } static void chd_conn_init(struct cc_var *ccv) { struct chd *chd_data; chd_data = ccv->cc_data; chd_data->prev_backoff_qdly = 0; chd_data->maxrtt_in_rtt = 0; chd_data->loss_compete = 0; /* * Initialise the shadow_cwnd to be equal to snd_cwnd in case we are * competing with loss based flows from the start. */ chd_data->shadow_w = CCV(ccv, snd_cwnd); } static int chd_mod_init(void) { ertt_id = khelp_get_id("ertt"); if (ertt_id <= 0) { printf("%s: h_ertt module not found\n", __func__); return (ENOENT); } return (0); } static int chd_loss_fair_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_chd_loss_fair; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (new > 1) error = EINVAL; else V_chd_loss_fair = new; } return (error); } static int chd_pmax_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_chd_pmax; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (new == 0 || new > 100) error = EINVAL; else V_chd_pmax = new; } return (error); } static int chd_qthresh_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_chd_qthresh; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (new <= V_chd_qmin) error = EINVAL; else V_chd_qthresh = new; } return (error); } SYSCTL_DECL(_net_inet_tcp_cc_chd); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, chd, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "CAIA Hamilton delay-based congestion control related settings"); SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, loss_fair, CTLFLAG_VNET | CTLTYPE_UINT | 
CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(chd_loss_fair), 1, &chd_loss_fair_handler, "IU", "Flag to enable shadow window functionality."); SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, pmax, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(chd_pmax), 5, &chd_pmax_handler, "IU", "Per RTT maximum backoff probability as a percentage"); SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, queue_threshold, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(chd_qthresh), 20, &chd_qthresh_handler, "IU", "Queueing congestion threshold in ticks"); SYSCTL_UINT(_net_inet_tcp_cc_chd, OID_AUTO, queue_min, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(chd_qmin), 5, "Minimum queueing delay threshold in ticks"); SYSCTL_UINT(_net_inet_tcp_cc_chd, OID_AUTO, use_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(chd_use_max), 1, "Use the maximum RTT seen within the measurement period (RTT) " "as the basic delay measurement for the algorithm."); DECLARE_CC_MODULE(chd, &chd_cc_algo); MODULE_VERSION(chd, 2); MODULE_DEPEND(chd, ertt, 1, 1, 1); diff --git a/sys/netinet/cc/cc_cubic.c b/sys/netinet/cc/cc_cubic.c index c4b44d5c3660..3b134082a59b 100644 --- a/sys/netinet/cc/cc_cubic.c +++ b/sys/netinet/cc/cc_cubic.c @@ -1,740 +1,740 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2008-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Lawrence Stewart while studying at the Centre * for Advanced Internet Architectures, Swinburne University of Technology, made * possible in part by a grant from the Cisco University Research Program Fund * at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * An implementation of the CUBIC congestion control algorithm for FreeBSD, * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha. * Originally released as part of the NewTCP research project at Swinburne * University of Technology's Centre for Advanced Internet Architectures, * Melbourne, Australia, which was made possible in part by a grant from the * Cisco University Research Program Fund at Community Foundation Silicon * Valley. 
More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void cubic_ack_received(struct cc_var *ccv, ccsignal_t type); static void cubic_cb_destroy(struct cc_var *ccv); static int cubic_cb_init(struct cc_var *ccv, void *ptr); static void cubic_cong_signal(struct cc_var *ccv, ccsignal_t type); static void cubic_conn_init(struct cc_var *ccv); static int cubic_mod_init(void); static void cubic_post_recovery(struct cc_var *ccv); static void cubic_record_rtt(struct cc_var *ccv); static void cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg); static void cubic_after_idle(struct cc_var *ccv); static size_t cubic_data_sz(void); static void cubic_newround(struct cc_var *ccv, uint32_t round_cnt); static void cubic_rttsample(struct cc_var *ccv, uint32_t usec_rtt, uint32_t rxtcnt, uint32_t fas); struct cc_algo cubic_cc_algo = { .name = "cubic", .ack_received = cubic_ack_received, .cb_destroy = cubic_cb_destroy, .cb_init = cubic_cb_init, .cong_signal = cubic_cong_signal, .conn_init = cubic_conn_init, .mod_init = cubic_mod_init, .post_recovery = cubic_post_recovery, .after_idle = cubic_after_idle, .cc_data_sz = cubic_data_sz, .rttsample = cubic_rttsample, .newround = cubic_newround }; /* * Emit a TCP_HYSTART log record that snapshots the hystart fields of * cubicd together with the caller-supplied event type (mod) and value * (flex1). Does nothing when hystart_bblogs is 0 or when * tcp_bblogging_on() is false for the connection. */ static void cubic_log_hystart_event(struct cc_var *ccv, struct cubic *cubicd, uint8_t mod, uint32_t flex1) { /* * Types of logs (mod value) * 1 - rtt_thresh in flex1, checking to see if RTT is too great. * 2 - rtt is too great, rtt_thresh in flex1. * 3 - CSS is active incr in flex1 * 4 - A new round is beginning flex1 is round count * 5 - A new RTT measurement flex1 is the new measurement. * 6 - We enter CA ssthresh is also in flex1. * 7 - Socket option to change hystart executed opt.val in flex1.
* 8 - Back out of CSS into SS, flex1 is the css_baseline_minrtt * 9 - We enter CA, via an ECN mark. * 10 - We enter CA, via a loss. * 11 - We have slipped out of SS into CA via cwnd growth. * 12 - After idle has re-enabled hystart++ */ struct tcpcb *tp; if (hystart_bblogs == 0) return; tp = ccv->tp; if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = flex1; log.u_bbr.flex2 = cubicd->css_current_round_minrtt; log.u_bbr.flex3 = cubicd->css_lastround_minrtt; log.u_bbr.flex4 = cubicd->css_rttsample_count; log.u_bbr.flex5 = cubicd->css_entered_at_round; log.u_bbr.flex6 = cubicd->css_baseline_minrtt; /* We only need bottom 16 bits of flags */ log.u_bbr.flex7 = cubicd->flags & 0x0000ffff; log.u_bbr.flex8 = mod; log.u_bbr.epoch = cubicd->css_current_round; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.lt_epoch = cubicd->css_fas_at_css_entry; log.u_bbr.pkts_out = cubicd->css_last_fas; log.u_bbr.delivered = cubicd->css_lowrtt_fas; log.u_bbr.pkt_epoch = ccv->flags; TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, TCP_HYSTART, 0, 0, &log, false, &tv); } } static void cubic_does_slow_start(struct cc_var *ccv, struct cubic *cubicd) { /* * In slow-start with ABC enabled and no RTO in sight? * (Must not use abc_l_var > 1 if slow starting after * an RTO. On RTO, snd_nxt = snd_una, so the * snd_nxt == snd_max check is sufficient to * handle this). * * XXXLAS: Find a way to signal SS after RTO that * doesn't rely on tcpcb vars.
*/ u_int cw = CCV(ccv, snd_cwnd); - u_int incr = CCV(ccv, t_maxseg); + uint32_t mss = tcp_fixed_maxseg(ccv->tp); + u_int incr = mss; uint16_t abc_val; cubicd->flags |= CUBICFLAG_IN_SLOWSTART; if (ccv->flags & CCF_USE_LOCAL_ABC) abc_val = ccv->labc; else abc_val = V_tcp_abc_l_var; if ((ccv->flags & CCF_HYSTART_ALLOWED) && (cubicd->flags & CUBICFLAG_HYSTART_ENABLED) && ((cubicd->flags & CUBICFLAG_HYSTART_IN_CSS) == 0)) { /* * Hystart is allowed and still enabled and we are not yet * in CSS. Let us check to see if we can make a decision on * if we need to go into CSS. */ if ((cubicd->css_rttsample_count >= hystart_n_rttsamples) && (cubicd->css_current_round_minrtt != 0xffffffff) && (cubicd->css_lastround_minrtt != 0xffffffff)) { uint32_t rtt_thresh; /* Clamp (minrtt_thresh, lastround/8, maxrtt_thresh) */ rtt_thresh = (cubicd->css_lastround_minrtt >> 3); if (rtt_thresh < hystart_minrtt_thresh) rtt_thresh = hystart_minrtt_thresh; if (rtt_thresh > hystart_maxrtt_thresh) rtt_thresh = hystart_maxrtt_thresh; cubic_log_hystart_event(ccv, cubicd, 1, rtt_thresh); if (cubicd->css_current_round_minrtt >= (cubicd->css_lastround_minrtt + rtt_thresh)) { /* Enter CSS */ cubicd->flags |= CUBICFLAG_HYSTART_IN_CSS; cubicd->css_fas_at_css_entry = cubicd->css_lowrtt_fas; /* * The draft (v4) calls for us to set baseline to css_current_round_min * but that can cause an oscillation. We probably should be using * css_lastround_minrtt, but the authors insist that will cause * issues on exiting early. We will leave the draft version for now * but I suspect this is incorrect.
*/ cubicd->css_baseline_minrtt = cubicd->css_current_round_minrtt; cubicd->css_entered_at_round = cubicd->css_current_round; cubic_log_hystart_event(ccv, cubicd, 2, rtt_thresh); } } } if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) incr = min(ccv->bytes_this_ack, - ccv->nsegs * abc_val * - CCV(ccv, t_maxseg)); + ccv->nsegs * abc_val * mss); else - incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + incr = min(ccv->bytes_this_ack, mss); /* Only if Hystart is enabled will the flag get set */ if (cubicd->flags & CUBICFLAG_HYSTART_IN_CSS) { incr /= hystart_css_growth_div; cubic_log_hystart_event(ccv, cubicd, 3, incr); } /* ABC is on by default, so incr equals 0 frequently. */ if (incr > 0) CCV(ccv, snd_cwnd) = min((cw + incr), TCP_MAXWIN << CCV(ccv, snd_scale)); } static void cubic_ack_received(struct cc_var *ccv, ccsignal_t type) { struct cubic *cubic_data; unsigned long W_est, W_cubic; int usecs_since_epoch; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); cubic_data = ccv->cc_data; cubic_record_rtt(ccv); /* * For a regular ACK and we're not in cong/fast recovery and * we're cwnd limited, always recalculate cwnd. */ if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED)) { /* Use the logic in NewReno ack_received() for slow start. */ if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) || cubic_data->min_rtt_usecs == TCPTV_SRTTBASE) { cubic_does_slow_start(ccv, cubic_data); } else { if (cubic_data->flags & CUBICFLAG_HYSTART_IN_CSS) { /* * We have slipped into CA with * CSS active. Deactivate all.
*/ /* Turn off the CSS flag */ cubic_data->flags &= ~CUBICFLAG_HYSTART_IN_CSS; /* Disable use of CSS in the future except long idle */ cubic_data->flags &= ~CUBICFLAG_HYSTART_ENABLED; cubic_log_hystart_event(ccv, cubic_data, 11, CCV(ccv, snd_ssthresh)); } if ((cubic_data->flags & CUBICFLAG_RTO_EVENT) && (cubic_data->flags & CUBICFLAG_IN_SLOWSTART)) { /* RFC8312 Section 4.7 */ cubic_data->flags &= ~(CUBICFLAG_RTO_EVENT | CUBICFLAG_IN_SLOWSTART); cubic_data->W_max = CCV(ccv, snd_cwnd); cubic_data->t_epoch = ticks; cubic_data->K = 0; } else if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART | CUBICFLAG_IN_APPLIMIT)) { cubic_data->flags &= ~(CUBICFLAG_IN_SLOWSTART | CUBICFLAG_IN_APPLIMIT); cubic_data->t_epoch = ticks; - cubic_data->K = cubic_k(cubic_data->W_max / - CCV(ccv, t_maxseg)); + cubic_data->K = cubic_k(cubic_data->W_max / mss); } usecs_since_epoch = (ticks - cubic_data->t_epoch) * tick; if (usecs_since_epoch < 0) { /* * dragging t_epoch along */ usecs_since_epoch = INT_MAX; cubic_data->t_epoch = ticks - INT_MAX; } W_est = tf_cwnd(ccv); /* * The mean RTT is used to best reflect the equations in * the I-D. */ W_cubic = cubic_cwnd(usecs_since_epoch + cubic_data->mean_rtt_usecs, cubic_data->W_max, - CCV(ccv, t_maxseg), + tcp_fixed_maxseg(ccv->tp), cubic_data->K); if (W_cubic < W_est) { /* * TCP-friendly region, follow tf * cwnd growth. */ CCV(ccv, snd_cwnd) = ulmin(W_est, INT_MAX); cubic_data->flags |= CUBICFLAG_IN_TF; } else if (CCV(ccv, snd_cwnd) < W_cubic) { /* * Concave or convex region, follow CUBIC * cwnd growth. * Only update snd_cwnd, if it doesn't shrink. */ CCV(ccv, snd_cwnd) = ulmin(W_cubic, INT_MAX); cubic_data->flags &= ~CUBICFLAG_IN_TF; } /* * If we're not in slow start and we're probing for a * new cwnd limit at the start of a connection * (happens when hostcache has a relevant entry), * keep updating our current estimate of the * W_max. 
*/ if (((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) && cubic_data->W_max < CCV(ccv, snd_cwnd)) { cubic_data->W_max = CCV(ccv, snd_cwnd); cubic_data->K = cubic_k(cubic_data->W_max / - CCV(ccv, t_maxseg)); + tcp_fixed_maxseg(ccv->tp)); } } } else if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && !(ccv->flags & CCF_CWND_LIMITED)) { cubic_data->flags |= CUBICFLAG_IN_APPLIMIT; } } /* * This is a CUBIC specific implementation of after_idle. * - Reset cwnd by calling New Reno implementation of after_idle. * - Reset t_epoch. */ static void cubic_after_idle(struct cc_var *ccv) { struct cubic *cubic_data; cubic_data = ccv->cc_data; cubic_data->W_max = ulmax(cubic_data->W_max, CCV(ccv, snd_cwnd)); - cubic_data->K = cubic_k(cubic_data->W_max / CCV(ccv, t_maxseg)); + cubic_data->K = cubic_k(cubic_data->W_max / tcp_fixed_maxseg(ccv->tp)); if ((cubic_data->flags & CUBICFLAG_HYSTART_ENABLED) == 0) { /* * Re-enable hystart if we have been idle. */ cubic_data->flags &= ~CUBICFLAG_HYSTART_IN_CSS; cubic_data->flags |= CUBICFLAG_HYSTART_ENABLED; cubic_log_hystart_event(ccv, cubic_data, 12, CCV(ccv, snd_ssthresh)); } newreno_cc_after_idle(ccv); cubic_data->t_epoch = ticks; } static void cubic_cb_destroy(struct cc_var *ccv) { free(ccv->cc_data, M_CC_MEM); } static size_t cubic_data_sz(void) { return (sizeof(struct cubic)); } static int cubic_cb_init(struct cc_var *ccv, void *ptr) { struct cubic *cubic_data; INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { cubic_data = malloc(sizeof(struct cubic), M_CC_MEM, M_NOWAIT|M_ZERO); if (cubic_data == NULL) return (ENOMEM); } else cubic_data = ptr; /* Init some key variables with sensible defaults. 
*/ cubic_data->t_epoch = ticks; cubic_data->min_rtt_usecs = TCPTV_SRTTBASE; cubic_data->mean_rtt_usecs = 1; ccv->cc_data = cubic_data; cubic_data->flags = CUBICFLAG_HYSTART_ENABLED; /* At init set both to infinity */ cubic_data->css_lastround_minrtt = 0xffffffff; cubic_data->css_current_round_minrtt = 0xffffffff; cubic_data->css_current_round = 0; cubic_data->css_baseline_minrtt = 0xffffffff; cubic_data->css_rttsample_count = 0; cubic_data->css_entered_at_round = 0; cubic_data->css_fas_at_css_entry = 0; cubic_data->css_lowrtt_fas = 0; cubic_data->css_last_fas = 0; return (0); } /* * Perform any necessary tasks before we enter congestion recovery. */ static void cubic_cong_signal(struct cc_var *ccv, ccsignal_t type) { struct cubic *cubic_data; uint32_t mss, pipe; cubic_data = ccv->cc_data; mss = tcp_fixed_maxseg(ccv->tp); switch (type) { case CC_NDUPACK: if (cubic_data->flags & CUBICFLAG_HYSTART_ENABLED) { /* Make sure the flags are all off we had a loss */ cubic_data->flags &= ~CUBICFLAG_HYSTART_ENABLED; cubic_data->flags &= ~CUBICFLAG_HYSTART_IN_CSS; cubic_log_hystart_event(ccv, cubic_data, 10, CCV(ccv, snd_ssthresh)); } if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { cubic_ssthresh_update(ccv, mss); cubic_data->flags |= CUBICFLAG_CONG_EVENT; cubic_data->t_epoch = ticks; cubic_data->K = cubic_k(cubic_data->W_max / mss); } ENTER_RECOVERY(CCV(ccv, t_flags)); } break; case CC_ECN: if (cubic_data->flags & CUBICFLAG_HYSTART_ENABLED) { /* Make sure the flags are all off we had a loss */ cubic_data->flags &= ~CUBICFLAG_HYSTART_ENABLED; cubic_data->flags &= ~CUBICFLAG_HYSTART_IN_CSS; cubic_log_hystart_event(ccv, cubic_data, 9, CCV(ccv, snd_ssthresh)); } if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { cubic_ssthresh_update(ccv, mss); cubic_data->flags |= CUBICFLAG_CONG_EVENT; cubic_data->t_epoch = ticks; cubic_data->K = cubic_k(cubic_data->W_max / mss); CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } 
break; case CC_RTO: /* RFC8312 Section 4.7 */ if (CCV(ccv, t_rxtshift) == 1) { /* * Remember the state only for the first RTO event. This * will help us restore the state to the values seen * at the most recent congestion avoidance stage before * the current RTO event. */ cubic_data->undo_t_epoch = cubic_data->t_epoch; cubic_data->undo_cwnd_epoch = cubic_data->cwnd_epoch; cubic_data->undo_W_est = cubic_data->W_est; cubic_data->undo_cwnd_prior = cubic_data->cwnd_prior; cubic_data->undo_W_max = cubic_data->W_max; cubic_data->undo_K = cubic_data->K; if (V_tcp_do_newsack) { pipe = tcp_compute_pipe(ccv->tp); } else { pipe = CCV(ccv, snd_max) - CCV(ccv, snd_fack) + CCV(ccv, sackhint.sack_bytes_rexmit); } CCV(ccv, snd_ssthresh) = max(2, (((uint64_t)min(CCV(ccv, snd_wnd), pipe) * CUBIC_BETA) >> CUBIC_SHIFT) / mss) * mss; } cubic_data->flags |= CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT; cubic_data->undo_W_max = cubic_data->W_max; CCV(ccv, snd_cwnd) = mss; break; case CC_RTO_ERR: cubic_data->flags &= ~(CUBICFLAG_CONG_EVENT | CUBICFLAG_RTO_EVENT); cubic_data->K = cubic_data->undo_K; cubic_data->cwnd_prior = cubic_data->undo_cwnd_prior; cubic_data->W_max = cubic_data->undo_W_max; cubic_data->W_est = cubic_data->undo_W_est; cubic_data->cwnd_epoch = cubic_data->undo_cwnd_epoch; cubic_data->t_epoch = cubic_data->undo_t_epoch; break; default: break; } } static void cubic_conn_init(struct cc_var *ccv) { struct cubic *cubic_data; cubic_data = ccv->cc_data; /* * Ensure we have a sane initial value for W_max recorded. Without * this here bad things happen when entries from the TCP hostcache * get used. */ cubic_data->W_max = CCV(ccv, snd_cwnd); } static int cubic_mod_init(void) { return (0); } /* * Perform any necessary tasks before we exit congestion recovery. 
*/ static void cubic_post_recovery(struct cc_var *ccv) { struct cubic *cubic_data; int pipe; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); cubic_data = ccv->cc_data; pipe = 0; if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { /* * If inflight data is less than ssthresh, set cwnd * conservatively to avoid a burst of data, as suggested in * the NewReno RFC. Otherwise, use the CUBIC method. * * XXXLAS: Find a way to do this without needing curack */ if (V_tcp_do_newsack) pipe = tcp_compute_pipe(ccv->tp); else pipe = CCV(ccv, snd_max) - ccv->curack; if (pipe < CCV(ccv, snd_ssthresh)) /* * Ensure that cwnd does not collapse to 1 MSS under * adverse conditions. Implements RFC6582 */ - CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + - CCV(ccv, t_maxseg); + CCV(ccv, snd_cwnd) = max(pipe, mss) + mss; else /* Update cwnd based on beta and adjusted W_max. */ CCV(ccv, snd_cwnd) = max(((uint64_t)cubic_data->W_max * CUBIC_BETA) >> CUBIC_SHIFT, - 2 * CCV(ccv, t_maxseg)); + 2 * mss); } /* Calculate the average RTT between congestion epochs. */ if (cubic_data->epoch_ack_count > 0 && cubic_data->sum_rtt_usecs >= cubic_data->epoch_ack_count) { cubic_data->mean_rtt_usecs = (int)(cubic_data->sum_rtt_usecs / cubic_data->epoch_ack_count); } cubic_data->epoch_ack_count = 0; cubic_data->sum_rtt_usecs = 0; } /* * Record the min RTT and sum samples for the epoch average RTT calculation. */ static void cubic_record_rtt(struct cc_var *ccv) { struct cubic *cubic_data; uint32_t t_srtt_usecs; /* Ignore srtt until a min number of samples have been taken. */ if (CCV(ccv, t_rttupdated) >= CUBIC_MIN_RTT_SAMPLES) { cubic_data = ccv->cc_data; t_srtt_usecs = tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_USEC); /* * Record the current SRTT as our minrtt if it's the smallest * we've seen or minrtt is currently equal to its initialised * value. * * XXXLAS: Should there be some hysteresis for minrtt? 
*/ if ((t_srtt_usecs < cubic_data->min_rtt_usecs || cubic_data->min_rtt_usecs == TCPTV_SRTTBASE)) { /* A minimal rtt is a single unshifted tick of a ticks * timer. */ cubic_data->min_rtt_usecs = max(tick >> TCP_RTT_SHIFT, t_srtt_usecs); /* * If the connection is within its first congestion * epoch, ensure we prime mean_rtt_usecs with a * reasonable value until the epoch average RTT is * calculated in cubic_post_recovery(). */ if (cubic_data->min_rtt_usecs > cubic_data->mean_rtt_usecs) cubic_data->mean_rtt_usecs = cubic_data->min_rtt_usecs; } /* Sum samples for epoch average RTT calculation. */ cubic_data->sum_rtt_usecs += t_srtt_usecs; cubic_data->epoch_ack_count++; } } /* * Update the ssthresh in the event of congestion. */ static void cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg) { struct cubic *cubic_data; uint32_t ssthresh; uint32_t cwnd; cubic_data = ccv->cc_data; cwnd = CCV(ccv, snd_cwnd); /* Fast convergence heuristic. */ if (cwnd < cubic_data->W_max) { cwnd = ((uint64_t)cwnd * CUBIC_FC_FACTOR) >> CUBIC_SHIFT; } cubic_data->undo_W_max = cubic_data->W_max; cubic_data->W_max = cwnd; if (cubic_data->flags & CUBICFLAG_IN_TF) { /* If in the TCP friendly region, follow what newreno does */ ssthresh = newreno_cc_cwnd_on_multiplicative_decrease(ccv, maxseg); } else if ((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) { /* * On the first congestion event, set ssthresh to cwnd * 0.5 * and reduce W_max to cwnd * beta. This aligns the cubic * concave region appropriately. */ ssthresh = cwnd >> 1; cubic_data->W_max = ((uint64_t)cwnd * CUBIC_BETA) >> CUBIC_SHIFT; } else { /* * On subsequent congestion events, set ssthresh to cwnd * beta. */ ssthresh = ((uint64_t)cwnd * CUBIC_BETA) >> CUBIC_SHIFT; } CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * maxseg); } static void cubic_rttsample(struct cc_var *ccv, uint32_t usec_rtt, uint32_t rxtcnt, uint32_t fas) { struct cubic *cubicd; cubicd = ccv->cc_data; if (rxtcnt > 1) { /* * Only look at RTT's that are non-ambiguous. 
*/ return; } cubicd->css_rttsample_count++; cubicd->css_last_fas = fas; if (cubicd->css_current_round_minrtt > usec_rtt) { cubicd->css_current_round_minrtt = usec_rtt; cubicd->css_lowrtt_fas = cubicd->css_last_fas; } if ((cubicd->css_rttsample_count >= hystart_n_rttsamples) && (cubicd->css_current_round_minrtt != 0xffffffff) && (cubicd->css_current_round_minrtt < cubicd->css_baseline_minrtt) && (cubicd->css_lastround_minrtt != 0xffffffff)) { /* * We were in CSS and the RTT is now less, we * entered CSS erroneously. */ cubicd->flags &= ~CUBICFLAG_HYSTART_IN_CSS; cubic_log_hystart_event(ccv, cubicd, 8, cubicd->css_baseline_minrtt); cubicd->css_baseline_minrtt = 0xffffffff; } if (cubicd->flags & CUBICFLAG_HYSTART_ENABLED) cubic_log_hystart_event(ccv, cubicd, 5, usec_rtt); } static void cubic_newround(struct cc_var *ccv, uint32_t round_cnt) { struct cubic *cubicd; cubicd = ccv->cc_data; /* We have entered a new round */ cubicd->css_lastround_minrtt = cubicd->css_current_round_minrtt; cubicd->css_current_round_minrtt = 0xffffffff; cubicd->css_rttsample_count = 0; cubicd->css_current_round = round_cnt; if ((cubicd->flags & CUBICFLAG_HYSTART_IN_CSS) && ((round_cnt - cubicd->css_entered_at_round) >= hystart_css_rounds)) { /* Enter CA */ if (ccv->flags & CCF_HYSTART_CAN_SH_CWND) { /* * We engage more than snd_ssthresh, engage * the brakes!! Though we will stay in SS to * creep back up again, so lets leave CSS active * and give us hystart_css_rounds more rounds. 
*/ if (ccv->flags & CCF_HYSTART_CONS_SSTH) { CCV(ccv, snd_ssthresh) = ((cubicd->css_lowrtt_fas + cubicd->css_fas_at_css_entry) / 2); } else { CCV(ccv, snd_ssthresh) = cubicd->css_lowrtt_fas; } CCV(ccv, snd_cwnd) = cubicd->css_fas_at_css_entry; cubicd->css_entered_at_round = round_cnt; } else { CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd); /* Turn off the CSS flag */ cubicd->flags &= ~CUBICFLAG_HYSTART_IN_CSS; /* Disable use of CSS in the future except long idle */ cubicd->flags &= ~CUBICFLAG_HYSTART_ENABLED; } cubic_log_hystart_event(ccv, cubicd, 6, CCV(ccv, snd_ssthresh)); } if (cubicd->flags & CUBICFLAG_HYSTART_ENABLED) cubic_log_hystart_event(ccv, cubicd, 4, round_cnt); } DECLARE_CC_MODULE(cubic, &cubic_cc_algo); MODULE_VERSION(cubic, 2); diff --git a/sys/netinet/cc/cc_dctcp.c b/sys/netinet/cc/cc_dctcp.c index 06b2de11fd46..f43efe0e27c7 100644 --- a/sys/netinet/cc/cc_dctcp.c +++ b/sys/netinet/cc/cc_dctcp.c @@ -1,499 +1,500 @@ /*- * Copyright (c) 2007-2008 * Swinburne University of Technology, Melbourne, Australia * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2014 Midori Kato * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * An implementation of the DCTCP algorithm for FreeBSD, based on * "Data Center TCP (DCTCP)" by M. Alizadeh, A. Greenberg, D. A. Maltz, * J. Padhye, P. Patel, B. Prabhakar, S. Sengupta, and M. Sridharan., * in ACM Conference on SIGCOMM 2010, New York, USA, * Originally released as the contribution of Microsoft Research project. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DCTCP_SHIFT 10 #define MAX_ALPHA_VALUE (1<tp); dctcp_data = ccv->cc_data; if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT) { /* * DCTCP doesn't treat receipt of ECN marked packet as a * congestion event. Thus, DCTCP always executes the ACK * processing out of congestion recovery. */ if (IN_CONGRECOVERY(CCV(ccv, t_flags))) { EXIT_CONGRECOVERY(CCV(ccv, t_flags)); newreno_cc_ack_received(ccv, type); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } else newreno_cc_ack_received(ccv, type); if (type == CC_DUPACK) - bytes_acked = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + bytes_acked = min(ccv->bytes_this_ack, mss); if (type == CC_ACK) bytes_acked = ccv->bytes_this_ack; /* Update total bytes. */ dctcp_data->bytes_total += bytes_acked; /* Update total marked bytes. 
*/ if (dctcp_data->ece_curr) { //XXRMS: For fluid-model DCTCP, update //cwnd here during for RTT fairness if (!dctcp_data->ece_prev - && bytes_acked > CCV(ccv, t_maxseg)) { + && bytes_acked > mss) { dctcp_data->bytes_ecn += - (bytes_acked - CCV(ccv, t_maxseg)); + (bytes_acked - mss); } else dctcp_data->bytes_ecn += bytes_acked; dctcp_data->ece_prev = 1; } else { if (dctcp_data->ece_prev - && bytes_acked > CCV(ccv, t_maxseg)) - dctcp_data->bytes_ecn += CCV(ccv, t_maxseg); + && bytes_acked > mss) + dctcp_data->bytes_ecn += mss; dctcp_data->ece_prev = 0; } dctcp_data->ece_curr = 0; /* * Update the fraction of marked bytes at the end of * current window size. */ if (!IN_FASTRECOVERY(CCV(ccv, t_flags)) && SEQ_GT(ccv->curack, dctcp_data->save_sndnxt)) dctcp_update_alpha(ccv); } else newreno_cc_ack_received(ccv, type); } static size_t dctcp_data_sz(void) { return (sizeof(struct dctcp)); } static void dctcp_after_idle(struct cc_var *ccv) { struct dctcp *dctcp_data; if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT) { dctcp_data = ccv->cc_data; /* Initialize internal parameters after idle time */ dctcp_data->bytes_ecn = 0; dctcp_data->bytes_total = 0; dctcp_data->save_sndnxt = CCV(ccv, snd_nxt); dctcp_data->alpha = V_dctcp_alpha; dctcp_data->ece_curr = 0; dctcp_data->ece_prev = 0; dctcp_data->num_cong_events = 0; } newreno_cc_after_idle(ccv); } static void dctcp_cb_destroy(struct cc_var *ccv) { free(ccv->cc_data, M_CC_MEM); } static int dctcp_cb_init(struct cc_var *ccv, void *ptr) { struct dctcp *dctcp_data; INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { dctcp_data = malloc(sizeof(struct dctcp), M_CC_MEM, M_NOWAIT|M_ZERO); if (dctcp_data == NULL) return (ENOMEM); } else dctcp_data = ptr; /* Initialize some key variables with sensible defaults. */ dctcp_data->bytes_ecn = 0; dctcp_data->bytes_total = 0; /* * When alpha is set to 0 in the beginning, DCTCP sender transfers as * much data as possible until the value converges which may expand the * queueing delay at the switch. 
When alpha is set to 1, queueing delay * is kept small. * Throughput-sensitive applications should have alpha = 0 * Latency-sensitive applications should have alpha = 1 * * Note: DCTCP draft suggests initial alpha to be 1 but we've decided to * keep it 0 as default. */ dctcp_data->alpha = V_dctcp_alpha; dctcp_data->save_sndnxt = 0; dctcp_data->ce_prev = 0; dctcp_data->ece_curr = 0; dctcp_data->ece_prev = 0; dctcp_data->num_cong_events = 0; ccv->cc_data = dctcp_data; return (0); } /* * Perform any necessary tasks before we enter congestion recovery. */ static void dctcp_cong_signal(struct cc_var *ccv, ccsignal_t type) { struct dctcp *dctcp_data; uint32_t cwin, mss, pipe; if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT) { dctcp_data = ccv->cc_data; cwin = CCV(ccv, snd_cwnd); mss = tcp_fixed_maxseg(ccv->tp); switch (type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { CCV(ccv, snd_ssthresh) = max(cwin / 2, 2 * mss); dctcp_data->num_cong_events++; } else { /* cwnd has already updated as congestion * recovery. Reverse cwnd value using * snd_cwnd_prev and recalculate snd_ssthresh */ cwin = CCV(ccv, snd_cwnd_prev); CCV(ccv, snd_ssthresh) = max(cwin / 2, 2 * mss); } ENTER_RECOVERY(CCV(ccv, t_flags)); } break; case CC_ECN: /* * Save current snd_cwnd when the host encounters both * congestion recovery and fast recovery. 
*/ CCV(ccv, snd_cwnd_prev) = cwin; if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { if (V_dctcp_slowstart && dctcp_data->num_cong_events++ == 0) { CCV(ccv, snd_ssthresh) = max(cwin / 2, 2 * mss); dctcp_data->alpha = MAX_ALPHA_VALUE; dctcp_data->bytes_ecn = 0; dctcp_data->bytes_total = 0; dctcp_data->save_sndnxt = CCV(ccv, snd_nxt); } else CCV(ccv, snd_ssthresh) = max((cwin - (((uint64_t)cwin * dctcp_data->alpha) >> (DCTCP_SHIFT+1))), 2 * mss); CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } dctcp_data->ece_curr = 1; break; case CC_RTO: if (CCV(ccv, t_rxtshift) == 1) { if (V_tcp_do_newsack) { pipe = tcp_compute_pipe(ccv->tp); } else { pipe = CCV(ccv, snd_max) - CCV(ccv, snd_fack) + CCV(ccv, sackhint.sack_bytes_rexmit); } CCV(ccv, snd_ssthresh) = max(2, min(CCV(ccv, snd_wnd), pipe) / 2 / mss) * mss; } CCV(ccv, snd_cwnd) = mss; dctcp_update_alpha(ccv); - dctcp_data->save_sndnxt += CCV(ccv, t_maxseg); + dctcp_data->save_sndnxt += mss; dctcp_data->num_cong_events++; break; default: break; } } else newreno_cc_cong_signal(ccv, type); } static void dctcp_conn_init(struct cc_var *ccv) { struct dctcp *dctcp_data; dctcp_data = ccv->cc_data; if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT) { dctcp_data->save_sndnxt = CCV(ccv, snd_nxt); if (V_dctcp_ect1) CCV(ccv, t_flags2) |= TF2_ECN_USE_ECT1; } } /* * Perform any necessary tasks before we exit congestion recovery. */ static void dctcp_post_recovery(struct cc_var *ccv) { newreno_cc_post_recovery(ccv); if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT) dctcp_update_alpha(ccv); } /* * Execute an additional ECN processing using ECN field in IP header * and the CWR bit in TCP header. */ static void dctcp_ecnpkt_handler(struct cc_var *ccv) { struct dctcp *dctcp_data; uint32_t ccflag; int acknow; dctcp_data = ccv->cc_data; ccflag = ccv->flags; acknow = 0; /* * DCTCP responds with an ACK immediately when the CE state * in between this segment and the last segment has changed. 
*/ if (ccflag & CCF_IPHDR_CE) { if (!dctcp_data->ce_prev) { acknow = 1; dctcp_data->ce_prev = 1; CCV(ccv, t_flags2) |= TF2_ECN_SND_ECE; } } else { if (dctcp_data->ce_prev) { acknow = 1; dctcp_data->ce_prev = 0; CCV(ccv, t_flags2) &= ~TF2_ECN_SND_ECE; } } if ((acknow) || (ccflag & CCF_TCPHDR_CWR)) { ccv->flags |= CCF_ACKNOW; } else { ccv->flags &= ~CCF_ACKNOW; } } /* * Update the fraction of marked bytes represented as 'alpha'. * Also initialize several internal parameters at the end of this function. */ static void dctcp_update_alpha(struct cc_var *ccv) { struct dctcp *dctcp_data; int alpha_prev; dctcp_data = ccv->cc_data; alpha_prev = dctcp_data->alpha; dctcp_data->bytes_total = max(dctcp_data->bytes_total, 1); /* * Update alpha: alpha = (1 - g) * alpha + g * M. * Here: * g is weight factor * recommended to be set to 1/16 * small g = slow convergence between competitive DCTCP flows * large g = impacts low utilization of bandwidth at switches * M is fraction of marked segments in last RTT * updated every RTT * Alpha must be bounded to 0 - MAX_ALPHA_VALUE. 
*/ dctcp_data->alpha = ulmin(alpha_prev - (alpha_prev >> V_dctcp_shift_g) + ((uint64_t)dctcp_data->bytes_ecn << (DCTCP_SHIFT - V_dctcp_shift_g)) / dctcp_data->bytes_total, MAX_ALPHA_VALUE); /* Initialize internal parameters for next alpha calculation */ dctcp_data->bytes_ecn = 0; dctcp_data->bytes_total = 0; dctcp_data->save_sndnxt = CCV(ccv, snd_nxt); } static int dctcp_alpha_handler(SYSCTL_HANDLER_ARGS) { uint32_t new; int error; new = V_dctcp_alpha; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (new > MAX_ALPHA_VALUE) error = EINVAL; else V_dctcp_alpha = new; } return (error); } static int dctcp_shift_g_handler(SYSCTL_HANDLER_ARGS) { uint32_t new; int error; new = V_dctcp_shift_g; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (new > DCTCP_SHIFT) error = EINVAL; else V_dctcp_shift_g = new; } return (error); } static int dctcp_slowstart_handler(SYSCTL_HANDLER_ARGS) { uint32_t new; int error; new = V_dctcp_slowstart; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL) { if (new > 1) error = EINVAL; else V_dctcp_slowstart = new; } return (error); } SYSCTL_DECL(_net_inet_tcp_cc_dctcp); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, dctcp, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "dctcp congestion control related settings"); SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, alpha, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(dctcp_alpha), 0, &dctcp_alpha_handler, "IU", "dctcp alpha parameter at start of session"); SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, shift_g, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(dctcp_shift_g), 4, &dctcp_shift_g_handler, "IU", "dctcp shift parameter"); SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, slowstart, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(dctcp_slowstart), 0, &dctcp_slowstart_handler, "IU", "half CWND reduction after the first slow 
start"); SYSCTL_UINT(_net_inet_tcp_cc_dctcp, OID_AUTO, ect1, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(dctcp_ect1), 0, "Send DCTCP segments with ÍP ECT(0) or ECT(1)"); DECLARE_CC_MODULE(dctcp, &dctcp_cc_algo); MODULE_VERSION(dctcp, 2); diff --git a/sys/netinet/cc/cc_htcp.c b/sys/netinet/cc/cc_htcp.c index 49bf4d6142f1..13441bb49190 100644 --- a/sys/netinet/cc/cc_htcp.c +++ b/sys/netinet/cc/cc_htcp.c @@ -1,554 +1,555 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2007-2008 * Swinburne University of Technology, Melbourne, Australia * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart and * James Healy, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * An implementation of the H-TCP congestion control algorithm for FreeBSD, * based on the Internet Draft "draft-leith-tcp-htcp-06.txt" by Leith and * Shorten. Originally released as part of the NewTCP research project at * Swinburne University of Technology's Centre for Advanced Internet * Architectures, Melbourne, Australia, which was made possible in part by a * grant from the Cisco University Research Program Fund at Community Foundation * Silicon Valley. More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Fixed point math shifts. */ #define HTCP_SHIFT 8 #define HTCP_ALPHA_INC_SHIFT 4 #define HTCP_INIT_ALPHA 1 #define HTCP_DELTA_L hz /* 1 sec in ticks. */ #define HTCP_MINBETA 128 /* 0.5 << HTCP_SHIFT. */ #define HTCP_MAXBETA 204 /* ~0.8 << HTCP_SHIFT. */ #define HTCP_MINROWE 26 /* ~0.1 << HTCP_SHIFT. */ #define HTCP_MAXROWE 512 /* 2 << HTCP_SHIFT. */ /* RTT_ref (ms) used in the calculation of alpha if RTT scaling is enabled. */ #define HTCP_RTT_REF 100 /* Don't trust SRTT until this many samples have been taken. */ #define HTCP_MIN_RTT_SAMPLES 8 /* * HTCP_CALC_ALPHA performs a fixed point math calculation to determine the * value of alpha, based on the function defined in the HTCP spec. * * i.e. 
1 + 10(delta - delta_l) + ((delta - delta_l) / 2) ^ 2 * * "diff" is passed in to the macro as "delta - delta_l" and is expected to be * in units of ticks. * * The joyousness of fixed point maths means our function implementation looks a * little funky... * * In order to maintain some precision in the calculations, a fixed point shift * HTCP_ALPHA_INC_SHIFT is used to ensure the integer divisions don't * truncate the results too badly. * * The "16" value is the "1" term in the alpha function shifted up by * HTCP_ALPHA_INC_SHIFT * * The "160" value is the "10" multiplier in the alpha function multiplied by * 2^HTCP_ALPHA_INC_SHIFT * * Specifying these as constants reduces the computations required. After * up-shifting all the terms in the function and performing the required * calculations, we down-shift the final result by HTCP_ALPHA_INC_SHIFT to * ensure it is back in the correct range. * * The "hz" terms are required as kernels can be configured to run with * different tick timers, which we have to adjust for in the alpha calculation * (which originally was defined in terms of seconds). * * We also have to be careful to constrain the value of diff such that it won't * overflow whilst performing the calculation. The middle term i.e. (160 * diff) * / hz is the limiting factor in the calculation. We must constrain diff to be * less than the max size of an int divided by the constant 160 figure * i.e. diff < INT_MAX / 160 * * NB: Changing HTCP_ALPHA_INC_SHIFT will require you to MANUALLY update the * constants used in this function! 
*/ #define HTCP_CALC_ALPHA(diff) \ ((\ (16) + \ ((160 * (diff)) / hz) + \ (((diff) / hz) * (((diff) << HTCP_ALPHA_INC_SHIFT) / (4 * hz))) \ ) >> HTCP_ALPHA_INC_SHIFT) static void htcp_ack_received(struct cc_var *ccv, ccsignal_t type); static void htcp_cb_destroy(struct cc_var *ccv); static int htcp_cb_init(struct cc_var *ccv, void *ptr); static void htcp_cong_signal(struct cc_var *ccv, ccsignal_t type); static int htcp_mod_init(void); static void htcp_post_recovery(struct cc_var *ccv); static void htcp_recalc_alpha(struct cc_var *ccv); static void htcp_recalc_beta(struct cc_var *ccv); static void htcp_record_rtt(struct cc_var *ccv); static void htcp_ssthresh_update(struct cc_var *ccv); static size_t htcp_data_sz(void); struct htcp { /* cwnd before entering cong recovery. */ unsigned long prev_cwnd; /* cwnd additive increase parameter. */ int alpha; /* cwnd multiplicative decrease parameter. */ int beta; /* Largest rtt seen for the flow. */ int maxrtt; /* Shortest rtt seen for the flow. */ int minrtt; /* Time of last congestion event in ticks. */ int t_last_cong; }; static int htcp_rtt_ref; /* * The maximum number of ticks the value of diff can reach in * htcp_recalc_alpha() before alpha will stop increasing due to overflow. * See comment above HTCP_CALC_ALPHA for more info. */ static int htcp_max_diff = INT_MAX / ((1 << HTCP_ALPHA_INC_SHIFT) * 10); /* Per-netstack vars. 
*/ VNET_DEFINE_STATIC(u_int, htcp_adaptive_backoff) = 0; VNET_DEFINE_STATIC(u_int, htcp_rtt_scaling) = 0; #define V_htcp_adaptive_backoff VNET(htcp_adaptive_backoff) #define V_htcp_rtt_scaling VNET(htcp_rtt_scaling) struct cc_algo htcp_cc_algo = { .name = "htcp", .ack_received = htcp_ack_received, .cb_destroy = htcp_cb_destroy, .cb_init = htcp_cb_init, .cong_signal = htcp_cong_signal, .mod_init = htcp_mod_init, .post_recovery = htcp_post_recovery, .cc_data_sz = htcp_data_sz, .after_idle = newreno_cc_after_idle, }; static void htcp_ack_received(struct cc_var *ccv, ccsignal_t type) { struct htcp *htcp_data; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); htcp_data = ccv->cc_data; htcp_record_rtt(ccv); /* * Regular ACK and we're not in cong/fast recovery and we're cwnd * limited and we're either not doing ABC or are slow starting or are * doing ABC and we've sent a cwnd's worth of bytes. */ if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED) && (!V_tcp_do_rfc3465 || CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) || (V_tcp_do_rfc3465 && ccv->flags & CCF_ABC_SENTAWND))) { htcp_recalc_beta(ccv); htcp_recalc_alpha(ccv); /* * Use the logic in NewReno ack_received() for slow start and * for the first HTCP_DELTA_L ticks after either the flow starts * or a congestion event (when alpha equals 1). */ if (htcp_data->alpha == 1 || CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) newreno_cc_ack_received(ccv, type); else { if (V_tcp_do_rfc3465) { /* Increment cwnd by alpha segments. */ CCV(ccv, snd_cwnd) += htcp_data->alpha * - CCV(ccv, t_maxseg); + mss; ccv->flags &= ~CCF_ABC_SENTAWND; } else /* * Increment cwnd by alpha/cwnd segments to * approximate an increase of alpha segments * per RTT. 
*/ CCV(ccv, snd_cwnd) += (((htcp_data->alpha << HTCP_SHIFT) / (max(1, - CCV(ccv, snd_cwnd) / CCV(ccv, t_maxseg)))) * - CCV(ccv, t_maxseg)) >> HTCP_SHIFT; + CCV(ccv, snd_cwnd) / mss))) * + mss) >> HTCP_SHIFT; } } } static void htcp_cb_destroy(struct cc_var *ccv) { free(ccv->cc_data, M_CC_MEM); } static size_t htcp_data_sz(void) { return(sizeof(struct htcp)); } static int htcp_cb_init(struct cc_var *ccv, void *ptr) { struct htcp *htcp_data; INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { htcp_data = malloc(sizeof(struct htcp), M_CC_MEM, M_NOWAIT); if (htcp_data == NULL) return (ENOMEM); } else htcp_data = ptr; /* Init some key variables with sensible defaults. */ htcp_data->alpha = HTCP_INIT_ALPHA; htcp_data->beta = HTCP_MINBETA; htcp_data->maxrtt = TCPTV_SRTTBASE; htcp_data->minrtt = TCPTV_SRTTBASE; htcp_data->prev_cwnd = 0; htcp_data->t_last_cong = ticks; ccv->cc_data = htcp_data; return (0); } /* * Perform any necessary tasks before we enter congestion recovery. */ static void htcp_cong_signal(struct cc_var *ccv, ccsignal_t type) { struct htcp *htcp_data; uint32_t mss, pipe; htcp_data = ccv->cc_data; mss = tcp_fixed_maxseg(ccv->tp); switch (type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { /* * Apply hysteresis to maxrtt to ensure * reductions in the RTT are reflected in our * measurements. */ htcp_data->maxrtt = (htcp_data->minrtt + (htcp_data->maxrtt - htcp_data->minrtt) * 95) / 100; htcp_ssthresh_update(ccv); htcp_data->t_last_cong = ticks; htcp_data->prev_cwnd = CCV(ccv, snd_cwnd); } ENTER_RECOVERY(CCV(ccv, t_flags)); } break; case CC_ECN: if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { /* * Apply hysteresis to maxrtt to ensure reductions in * the RTT are reflected in our measurements. 
*/ htcp_data->maxrtt = (htcp_data->minrtt + (htcp_data->maxrtt - htcp_data->minrtt) * 95) / 100; htcp_ssthresh_update(ccv); CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); htcp_data->t_last_cong = ticks; htcp_data->prev_cwnd = CCV(ccv, snd_cwnd); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } break; case CC_RTO: if (CCV(ccv, t_rxtshift) == 1) { if (V_tcp_do_newsack) { pipe = tcp_compute_pipe(ccv->tp); } else { pipe = CCV(ccv, snd_max) - CCV(ccv, snd_fack) + CCV(ccv, sackhint.sack_bytes_rexmit); } CCV(ccv, snd_ssthresh) = max(2, min(CCV(ccv, snd_wnd), pipe) / 2 / mss) * mss; } CCV(ccv, snd_cwnd) = mss; /* * Grab the current time and record it so we know when the * most recent congestion event was. Only record it when the * timeout has fired more than once, as there is a reasonable * chance the first one is a false alarm and may not indicate * congestion. */ if (CCV(ccv, t_rxtshift) >= 2) htcp_data->t_last_cong = ticks; break; default: break; } } static int htcp_mod_init(void) { /* * HTCP_RTT_REF is defined in ms, and t_srtt in the tcpcb is stored in * units of TCP_RTT_SCALE*hz. Scale HTCP_RTT_REF to be in the same units * as t_srtt. */ htcp_rtt_ref = (HTCP_RTT_REF * TCP_RTT_SCALE * hz) / 1000; return (0); } /* * Perform any necessary tasks before we exit congestion recovery. */ static void htcp_post_recovery(struct cc_var *ccv) { int pipe; struct htcp *htcp_data; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); pipe = 0; htcp_data = ccv->cc_data; if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { /* * If inflight data is less than ssthresh, set cwnd * conservatively to avoid a burst of data, as suggested in the * NewReno RFC. Otherwise, use the HTCP method. * * XXXLAS: Find a way to do this without needing curack */ if (V_tcp_do_newsack) pipe = tcp_compute_pipe(ccv->tp); else pipe = CCV(ccv, snd_max) - ccv->curack; if (pipe < CCV(ccv, snd_ssthresh)) /* * Ensure that cwnd down not collape to 1 MSS under * adverse conditions. 
Implements RFC6582 */ - CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + - CCV(ccv, t_maxseg); + CCV(ccv, snd_cwnd) = max(pipe, mss) + mss; else CCV(ccv, snd_cwnd) = max(1, ((htcp_data->beta * - htcp_data->prev_cwnd / CCV(ccv, t_maxseg)) - >> HTCP_SHIFT)) * CCV(ccv, t_maxseg); + htcp_data->prev_cwnd / mss) + >> HTCP_SHIFT)) * mss; } } static void htcp_recalc_alpha(struct cc_var *ccv) { struct htcp *htcp_data; int alpha, diff, now; htcp_data = ccv->cc_data; now = ticks; /* * If ticks has wrapped around (will happen approximately once every 49 * days on a machine with the default kern.hz=1000) and a flow straddles * the wrap point, our alpha calcs will be completely wrong. We cut our * losses and restart alpha from scratch by setting t_last_cong = now - * HTCP_DELTA_L. * * This does not deflate our cwnd at all. It simply slows the rate cwnd * is growing by until alpha regains the value it held prior to taking * this drastic measure. */ if (now < htcp_data->t_last_cong) htcp_data->t_last_cong = now - HTCP_DELTA_L; diff = now - htcp_data->t_last_cong - HTCP_DELTA_L; /* Cap alpha if the value of diff would overflow HTCP_CALC_ALPHA(). */ if (diff < htcp_max_diff) { /* * If it has been more than HTCP_DELTA_L ticks since congestion, * increase alpha according to the function defined in the spec. */ if (diff > 0) { alpha = HTCP_CALC_ALPHA(diff); /* * Adaptive backoff fairness adjustment: * 2 * (1 - beta) * alpha_raw */ if (V_htcp_adaptive_backoff) alpha = max(1, (2 * ((1 << HTCP_SHIFT) - htcp_data->beta) * alpha) >> HTCP_SHIFT); /* * RTT scaling: (RTT / RTT_ref) * alpha * alpha will be the raw value from HTCP_CALC_ALPHA() if * adaptive backoff is off, or the adjusted value if * adaptive backoff is on. 
*/ if (V_htcp_rtt_scaling) alpha = max(1, (min(max(HTCP_MINROWE, (tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS) << HTCP_SHIFT) / htcp_rtt_ref), HTCP_MAXROWE) * alpha) >> HTCP_SHIFT); } else alpha = 1; htcp_data->alpha = alpha; } } static void htcp_recalc_beta(struct cc_var *ccv) { struct htcp *htcp_data; htcp_data = ccv->cc_data; /* * TCPTV_SRTTBASE is the initialised value of each connection's SRTT, so * we only calc beta if the connection's SRTT has been changed from its * initial value. beta is bounded to ensure it is always between * HTCP_MINBETA and HTCP_MAXBETA. */ if (V_htcp_adaptive_backoff && htcp_data->minrtt != TCPTV_SRTTBASE && htcp_data->maxrtt != TCPTV_SRTTBASE) htcp_data->beta = min(max(HTCP_MINBETA, (htcp_data->minrtt << HTCP_SHIFT) / htcp_data->maxrtt), HTCP_MAXBETA); else htcp_data->beta = HTCP_MINBETA; } /* * Record the minimum and maximum RTT seen for the connection. These are used in * the calculation of beta if adaptive backoff is enabled. */ static void htcp_record_rtt(struct cc_var *ccv) { struct htcp *htcp_data; htcp_data = ccv->cc_data; /* XXXLAS: Should there be some hysteresis for minrtt? */ /* * Record the current SRTT as our minrtt if it's the smallest we've seen * or minrtt is currently equal to its initialised value. Ignore SRTT * until a min number of samples have been taken. */ if ((tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS) < htcp_data->minrtt || htcp_data->minrtt == TCPTV_SRTTBASE) && (CCV(ccv, t_rttupdated) >= HTCP_MIN_RTT_SAMPLES)) htcp_data->minrtt = tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS); /* * Record the current SRTT as our maxrtt if it's the largest we've * seen. Ignore SRTT until a min number of samples have been taken. */ if (tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS) > htcp_data->maxrtt && CCV(ccv, t_rttupdated) >= HTCP_MIN_RTT_SAMPLES) htcp_data->maxrtt = tcp_get_srtt(ccv->tp, TCP_TMR_GRANULARITY_TICKS); } /* * Update the ssthresh in the event of congestion. 
*/ static void htcp_ssthresh_update(struct cc_var *ccv) { struct htcp *htcp_data; htcp_data = ccv->cc_data; /* * On the first congestion event, set ssthresh to cwnd * 0.5, on * subsequent congestion events, set it to cwnd * beta. */ if (CCV(ccv, snd_ssthresh) == TCP_MAXWIN << TCP_MAX_WINSHIFT) CCV(ccv, snd_ssthresh) = ((u_long)CCV(ccv, snd_cwnd) * HTCP_MINBETA) >> HTCP_SHIFT; else { htcp_recalc_beta(ccv); CCV(ccv, snd_ssthresh) = ((u_long)CCV(ccv, snd_cwnd) * htcp_data->beta) >> HTCP_SHIFT; } } SYSCTL_DECL(_net_inet_tcp_cc_htcp); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, htcp, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "H-TCP related settings"); SYSCTL_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, adaptive_backoff, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(htcp_adaptive_backoff), 0, "enable H-TCP adaptive backoff"); SYSCTL_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, rtt_scaling, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(htcp_rtt_scaling), 0, "enable H-TCP RTT scaling"); DECLARE_CC_MODULE(htcp, &htcp_cc_algo); MODULE_VERSION(htcp, 2); diff --git a/sys/netinet/cc/cc_newreno.c b/sys/netinet/cc/cc_newreno.c index 4515ac133c19..d7172fa78bc4 100644 --- a/sys/netinet/cc/cc_newreno.c +++ b/sys/netinet/cc/cc_newreno.c @@ -1,608 +1,608 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. * Copyright (c) 2007-2008,2010,2014 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, James * Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. 
* * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This software was first released in 2007 by James Healy and Lawrence Stewart * whilst working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. 
* More details are available at: * http://caia.swin.edu.au/urp/newtcp/ * * Dec 2014 garmitage@swin.edu.au * Borrowed code fragments from cc_cdg.c to add modifiable beta * via sysctls. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void newreno_cb_destroy(struct cc_var *ccv); static void newreno_ack_received(struct cc_var *ccv, ccsignal_t type); static void newreno_after_idle(struct cc_var *ccv); static void newreno_cong_signal(struct cc_var *ccv, ccsignal_t type); static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf); static void newreno_newround(struct cc_var *ccv, uint32_t round_cnt); static void newreno_rttsample(struct cc_var *ccv, uint32_t usec_rtt, uint32_t rxtcnt, uint32_t fas); static int newreno_cb_init(struct cc_var *ccv, void *); static size_t newreno_data_sz(void); VNET_DECLARE(uint32_t, newreno_beta); #define V_newreno_beta VNET(newreno_beta) VNET_DECLARE(uint32_t, newreno_beta_ecn); #define V_newreno_beta_ecn VNET(newreno_beta_ecn) struct cc_algo newreno_cc_algo = { .name = "newreno", .cb_destroy = newreno_cb_destroy, .ack_received = newreno_ack_received, .after_idle = newreno_after_idle, .cong_signal = newreno_cong_signal, .post_recovery = newreno_cc_post_recovery, .ctl_output = newreno_ctl_output, .newround = newreno_newround, .rttsample = newreno_rttsample, .cb_init = newreno_cb_init, .cc_data_sz = newreno_data_sz, }; static void newreno_log_hystart_event(struct cc_var *ccv, struct newreno *nreno, uint8_t mod, uint32_t flex1) { /* * Types of logs (mod value) * 1 - rtt_thresh in flex1, checking to see if RTT is to great. * 2 - rtt is too great, rtt_thresh in flex1. * 3 - CSS is active incr in flex1 * 4 - A new round is beginning flex1 is round count * 5 - A new RTT measurement flex1 is the new measurement. 
* 6 - We enter CA ssthresh is also in flex1. * 7 - Socket option to change hystart executed opt.val in flex1. * 8 - Back out of CSS into SS, flex1 is the css_baseline_minrtt * 9 - We enter CA, via an ECN mark. * 10 - We enter CA, via a loss. * 11 - We have slipped out of SS into CA via cwnd growth. * 12 - After idle has re-enabled hystart++ */ struct tcpcb *tp; if (hystart_bblogs == 0) return; tp = ccv->tp; if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = flex1; log.u_bbr.flex2 = nreno->css_current_round_minrtt; log.u_bbr.flex3 = nreno->css_lastround_minrtt; log.u_bbr.flex4 = nreno->css_rttsample_count; log.u_bbr.flex5 = nreno->css_entered_at_round; log.u_bbr.flex6 = nreno->css_baseline_minrtt; /* We only need bottom 16 bits of flags */ log.u_bbr.flex7 = nreno->newreno_flags & 0x0000ffff; log.u_bbr.flex8 = mod; log.u_bbr.epoch = nreno->css_current_round; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.lt_epoch = nreno->css_fas_at_css_entry; log.u_bbr.pkts_out = nreno->css_last_fas; log.u_bbr.delivered = nreno->css_lowrtt_fas; log.u_bbr.pkt_epoch = ccv->flags; TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, TCP_HYSTART, 0, 0, &log, false, &tv); } } static size_t newreno_data_sz(void) { return (sizeof(struct newreno)); } static int newreno_cb_init(struct cc_var *ccv, void *ptr) { struct newreno *nreno; INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { ccv->cc_data = malloc(sizeof(struct newreno), M_CC_MEM, M_NOWAIT); if (ccv->cc_data == NULL) return (ENOMEM); } else ccv->cc_data = ptr; nreno = (struct newreno *)ccv->cc_data; /* NB: nreno is not zeroed, so initialise all fields. 
*/ nreno->beta = V_newreno_beta; nreno->beta_ecn = V_newreno_beta_ecn; /* * We set the enabled flag so that if * the socket option gets strobed and * we have not hit a loss */ nreno->newreno_flags = CC_NEWRENO_HYSTART_ENABLED; /* At init set both to infinity */ nreno->css_lastround_minrtt = 0xffffffff; nreno->css_current_round_minrtt = 0xffffffff; nreno->css_current_round = 0; nreno->css_baseline_minrtt = 0xffffffff; nreno->css_rttsample_count = 0; nreno->css_entered_at_round = 0; nreno->css_fas_at_css_entry = 0; nreno->css_lowrtt_fas = 0; nreno->css_last_fas = 0; return (0); } static void newreno_cb_destroy(struct cc_var *ccv) { free(ccv->cc_data, M_CC_MEM); } static void newreno_ack_received(struct cc_var *ccv, ccsignal_t type) { struct newreno *nreno; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); nreno = ccv->cc_data; if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED)) { u_int cw = CCV(ccv, snd_cwnd); - u_int incr = CCV(ccv, t_maxseg); + u_int incr = mss; /* * Regular in-order ACK, open the congestion window. * Method depends on which congestion control state we're * in (slow start or cong avoid) and if ABC (RFC 3465) is * enabled. * * slow start: cwnd <= ssthresh * cong avoid: cwnd > ssthresh * * slow start and ABC (RFC 3465): * Grow cwnd exponentially by the amount of data * ACKed capping the max increment per ACK to * (abc_l_var * maxseg) bytes. * * slow start without ABC (RFC 5681): * Grow cwnd exponentially by maxseg per ACK. * * cong avoid and ABC (RFC 3465): * Grow cwnd linearly by maxseg per RTT for each * cwnd worth of ACKed data. * * cong avoid without ABC (RFC 5681): * Grow cwnd linearly by approximately maxseg per RTT using * maxseg^2 / cwnd per ACK as the increment. * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to * avoid capping cwnd. */ if (cw > CCV(ccv, snd_ssthresh)) { if (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) { /* * We have slipped into CA with * CSS active. Deactivate all. 
*/ /* Turn off the CSS flag */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; /* Disable use of CSS in the future except long idle */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_ENABLED; newreno_log_hystart_event(ccv, nreno, 11, CCV(ccv, snd_ssthresh)); } if (V_tcp_do_rfc3465) { if (ccv->flags & CCF_ABC_SENTAWND) ccv->flags &= ~CCF_ABC_SENTAWND; else incr = 0; } else incr = max((incr * incr / cw), 1); } else if (V_tcp_do_rfc3465) { /* * In slow-start with ABC enabled and no RTO in sight? * (Must not use abc_l_var > 1 if slow starting after * an RTO. On RTO, snd_nxt = snd_una, so the * snd_nxt == snd_max check is sufficient to * handle this). * * XXXLAS: Find a way to signal SS after RTO that * doesn't rely on tcpcb vars. */ uint16_t abc_val; if (ccv->flags & CCF_USE_LOCAL_ABC) abc_val = ccv->labc; else abc_val = V_tcp_abc_l_var; if ((ccv->flags & CCF_HYSTART_ALLOWED) && (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) && ((nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) == 0)) { /* * Hystart is allowed and still enabled and we are not yet * in CSS. Lets check to see if we can make a decision on * if we need to go into CSS. */ if ((nreno->css_rttsample_count >= hystart_n_rttsamples) && (nreno->css_current_round_minrtt != 0xffffffff) && (nreno->css_lastround_minrtt != 0xffffffff)) { uint32_t rtt_thresh; /* Clamp (minrtt_thresh, lastround/8, maxrtt_thresh) */ rtt_thresh = (nreno->css_lastround_minrtt >> 3); if (rtt_thresh < hystart_minrtt_thresh) rtt_thresh = hystart_minrtt_thresh; if (rtt_thresh > hystart_maxrtt_thresh) rtt_thresh = hystart_maxrtt_thresh; newreno_log_hystart_event(ccv, nreno, 1, rtt_thresh); if (nreno->css_current_round_minrtt >= (nreno->css_lastround_minrtt + rtt_thresh)) { /* Enter CSS */ nreno->newreno_flags |= CC_NEWRENO_HYSTART_IN_CSS; nreno->css_fas_at_css_entry = nreno->css_lowrtt_fas; /* * The draft (v4) calls for us to set baseline to css_current_round_min * but that can cause an oscillation. 
We probably shoudl be using * css_lastround_minrtt, but the authors insist that will cause * issues on exiting early. We will leave the draft version for now * but I suspect this is incorrect. */ nreno->css_baseline_minrtt = nreno->css_current_round_minrtt; nreno->css_entered_at_round = nreno->css_current_round; newreno_log_hystart_event(ccv, nreno, 2, rtt_thresh); } } } if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) incr = min(ccv->bytes_this_ack, - ccv->nsegs * abc_val * - CCV(ccv, t_maxseg)); + ccv->nsegs * abc_val * mss); else - incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + incr = min(ccv->bytes_this_ack, mss); /* Only if Hystart is enabled will the flag get set */ if (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) { incr /= hystart_css_growth_div; newreno_log_hystart_event(ccv, nreno, 3, incr); } } /* ABC is on by default, so incr equals 0 frequently. */ if (incr > 0) CCV(ccv, snd_cwnd) = min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); } } static void newreno_after_idle(struct cc_var *ccv) { struct newreno *nreno; nreno = ccv->cc_data; newreno_cc_after_idle(ccv); if ((nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) == 0) { /* * Re-enable hystart if we have been idle. */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; nreno->newreno_flags |= CC_NEWRENO_HYSTART_ENABLED; newreno_log_hystart_event(ccv, nreno, 12, CCV(ccv, snd_ssthresh)); } } /* * Perform any necessary tasks before we enter congestion recovery. */ static void newreno_cong_signal(struct cc_var *ccv, ccsignal_t type) { struct newreno *nreno; uint32_t beta, beta_ecn, cwin, factor, mss, pipe; cwin = CCV(ccv, snd_cwnd); mss = tcp_fixed_maxseg(ccv->tp); nreno = ccv->cc_data; beta = (nreno == NULL) ? V_newreno_beta : nreno->beta; beta_ecn = (nreno == NULL) ? 
V_newreno_beta_ecn : nreno->beta_ecn; /* * Note that we only change the backoff for ECN if the * global sysctl V_cc_do_abe is set the stack itself * has set a flag in our newreno_flags (due to pacing) telling * us to use the lower valued back-off. */ if ((type == CC_ECN) && (V_cc_do_abe || ((nreno != NULL) && (nreno->newreno_flags & CC_NEWRENO_BETA_ECN_ENABLED)))) factor = beta_ecn; else factor = beta; /* Catch algos which mistakenly leak private signal types. */ KASSERT((type & CC_SIGPRIVMASK) == 0, ("%s: congestion signal type 0x%08x is private\n", __func__, type)); cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 2) * mss; switch (type) { case CC_NDUPACK: if (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) { /* Make sure the flags are all off we had a loss */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_ENABLED; nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; newreno_log_hystart_event(ccv, nreno, 10, CCV(ccv, snd_ssthresh)); } if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (IN_CONGRECOVERY(CCV(ccv, t_flags) && V_cc_do_abe && V_cc_abe_frlossreduce)) { CCV(ccv, snd_ssthresh) = ((uint64_t)CCV(ccv, snd_ssthresh) * (uint64_t)beta) / (uint64_t)beta_ecn; } if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) CCV(ccv, snd_ssthresh) = cwin; ENTER_RECOVERY(CCV(ccv, t_flags)); } break; case CC_ECN: if (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) { /* Make sure the flags are all off we had a loss */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_ENABLED; nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; newreno_log_hystart_event(ccv, nreno, 9, CCV(ccv, snd_ssthresh)); } if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { CCV(ccv, snd_ssthresh) = cwin; CCV(ccv, snd_cwnd) = cwin; ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } break; case CC_RTO: if (CCV(ccv, t_rxtshift) == 1) { if (V_tcp_do_newsack) { pipe = tcp_compute_pipe(ccv->tp); } else { pipe = CCV(ccv, snd_max) - CCV(ccv, snd_fack) + CCV(ccv, sackhint.sack_bytes_rexmit); } CCV(ccv, snd_ssthresh) = max(2, 
((uint64_t)min(CCV(ccv, snd_wnd), pipe) * (uint64_t)factor) / (100ULL * (uint64_t)mss)) * mss; } CCV(ccv, snd_cwnd) = mss; break; default: break; } } static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf) { struct newreno *nreno; struct cc_newreno_opts *opt; if (sopt->sopt_valsize != sizeof(struct cc_newreno_opts)) return (EMSGSIZE); if (CC_ALGO(ccv->tp) != &newreno_cc_algo) return (ENOPROTOOPT); nreno = (struct newreno *)ccv->cc_data; opt = buf; switch (sopt->sopt_dir) { case SOPT_SET: switch (opt->name) { case CC_NEWRENO_BETA: nreno->beta = opt->val; break; case CC_NEWRENO_BETA_ECN: nreno->beta_ecn = opt->val; nreno->newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED; break; default: return (ENOPROTOOPT); } break; case SOPT_GET: switch (opt->name) { case CC_NEWRENO_BETA: opt->val = nreno->beta; break; case CC_NEWRENO_BETA_ECN: opt->val = nreno->beta_ecn; break; default: return (ENOPROTOOPT); } break; default: return (EINVAL); } return (0); } static int newreno_beta_handler(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = *(uint32_t *)arg1; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr != NULL ) { if (arg1 == &VNET_NAME(newreno_beta_ecn) && !V_cc_do_abe) error = EACCES; else if (new == 0 || new > 100) error = EINVAL; else *(uint32_t *)arg1 = new; } return (error); } static void newreno_newround(struct cc_var *ccv, uint32_t round_cnt) { struct newreno *nreno; nreno = (struct newreno *)ccv->cc_data; /* We have entered a new round */ nreno->css_lastround_minrtt = nreno->css_current_round_minrtt; nreno->css_current_round_minrtt = 0xffffffff; nreno->css_rttsample_count = 0; nreno->css_current_round = round_cnt; if ((nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) && ((round_cnt - nreno->css_entered_at_round) >= hystart_css_rounds)) { /* Enter CA */ if (ccv->flags & CCF_HYSTART_CAN_SH_CWND) { /* * We engage more than snd_ssthresh, engage * the brakes!! 
Though we will stay in SS to * creep back up again, so lets leave CSS active * and give us hystart_css_rounds more rounds. */ if (ccv->flags & CCF_HYSTART_CONS_SSTH) { CCV(ccv, snd_ssthresh) = ((nreno->css_lowrtt_fas + nreno->css_fas_at_css_entry) / 2); } else { CCV(ccv, snd_ssthresh) = nreno->css_lowrtt_fas; } CCV(ccv, snd_cwnd) = nreno->css_fas_at_css_entry; nreno->css_entered_at_round = round_cnt; } else { CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd); /* Turn off the CSS flag */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; /* Disable use of CSS in the future except long idle */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_ENABLED; } newreno_log_hystart_event(ccv, nreno, 6, CCV(ccv, snd_ssthresh)); } if (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) newreno_log_hystart_event(ccv, nreno, 4, round_cnt); } static void newreno_rttsample(struct cc_var *ccv, uint32_t usec_rtt, uint32_t rxtcnt, uint32_t fas) { struct newreno *nreno; nreno = (struct newreno *)ccv->cc_data; if (rxtcnt > 1) { /* * Only look at RTT's that are non-ambiguous. */ return; } nreno->css_rttsample_count++; nreno->css_last_fas = fas; if (nreno->css_current_round_minrtt > usec_rtt) { nreno->css_current_round_minrtt = usec_rtt; nreno->css_lowrtt_fas = nreno->css_last_fas; } if ((nreno->css_rttsample_count >= hystart_n_rttsamples) && (nreno->css_current_round_minrtt != 0xffffffff) && (nreno->css_current_round_minrtt < nreno->css_baseline_minrtt) && (nreno->css_lastround_minrtt != 0xffffffff)) { /* * We were in CSS and the RTT is now less, we * entered CSS erroneously. 
*/ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; newreno_log_hystart_event(ccv, nreno, 8, nreno->css_baseline_minrtt); nreno->css_baseline_minrtt = 0xffffffff; } if (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) newreno_log_hystart_event(ccv, nreno, 5, usec_rtt); } SYSCTL_DECL(_net_inet_tcp_cc_newreno); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, newreno, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "New Reno related settings"); SYSCTL_PROC(_net_inet_tcp_cc_newreno, OID_AUTO, beta, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(newreno_beta), 3, &newreno_beta_handler, "IU", "New Reno beta, specified as number between 1 and 100"); SYSCTL_PROC(_net_inet_tcp_cc_newreno, OID_AUTO, beta_ecn, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(newreno_beta_ecn), 3, &newreno_beta_handler, "IU", "New Reno beta ecn, specified as number between 1 and 100"); DECLARE_CC_MODULE(newreno, &newreno_cc_algo); MODULE_VERSION(newreno, 2); diff --git a/sys/netinet/cc/cc_vegas.c b/sys/netinet/cc/cc_vegas.c index 45d6b646bcba..2e24a717f869 100644 --- a/sys/netinet/cc/cc_vegas.c +++ b/sys/netinet/cc/cc_vegas.c @@ -1,312 +1,313 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009-2010 * Swinburne University of Technology, Melbourne, Australia * Copyright (c) 2010 Lawrence Stewart * Copyright (c) 2010-2011 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by David Hayes and * Lawrence Stewart, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, Melbourne, Australia by * David Hayes under sponsorship from the FreeBSD Foundation. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * An implementation of the Vegas congestion control algorithm for FreeBSD, * based on L. S. Brakmo and L. L. Peterson, "TCP Vegas: end to end congestion * avoidance on a global internet", IEEE J. Sel. Areas Commun., vol. 13, no. 8, * pp. 1465-1480, Oct. 1995. The original Vegas duplicate ack policy has not * been implemented, since clock ticks are not as coarse as they were (i.e. * 500ms) when Vegas was designed. Also, packets are timed once per RTT as in * the original paper. 
* * Originally released as part of the NewTCP research project at Swinburne * University of Technology's Centre for Advanced Internet Architectures, * Melbourne, Australia, which was made possible in part by a grant from the * Cisco University Research Program Fund at Community Foundation Silicon * Valley. More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Private signal type for rate based congestion signal. * See for appropriate bit-range to use for private signals. */ #define CC_VEGAS_RATE 0x04000000 static void vegas_ack_received(struct cc_var *ccv, ccsignal_t ack_type); static void vegas_cb_destroy(struct cc_var *ccv); static int vegas_cb_init(struct cc_var *ccv, void *ptr); static void vegas_cong_signal(struct cc_var *ccv, ccsignal_t signal_type); static void vegas_conn_init(struct cc_var *ccv); static int vegas_mod_init(void); static size_t vegas_data_sz(void); struct vegas { int slow_start_toggle; }; static int32_t ertt_id; VNET_DEFINE_STATIC(uint32_t, vegas_alpha) = 1; VNET_DEFINE_STATIC(uint32_t, vegas_beta) = 3; #define V_vegas_alpha VNET(vegas_alpha) #define V_vegas_beta VNET(vegas_beta) struct cc_algo vegas_cc_algo = { .name = "vegas", .ack_received = vegas_ack_received, .cb_destroy = vegas_cb_destroy, .cb_init = vegas_cb_init, .cong_signal = vegas_cong_signal, .conn_init = vegas_conn_init, .mod_init = vegas_mod_init, .cc_data_sz = vegas_data_sz, .after_idle = newreno_cc_after_idle, .post_recovery = newreno_cc_post_recovery, }; /* * The vegas window adjustment is done once every RTT, as indicated by the * ERTT_NEW_MEASUREMENT flag. This flag is reset once the new measurement data * has been used. 
*/ static void vegas_ack_received(struct cc_var *ccv, ccsignal_t ack_type) { struct ertt *e_t; struct vegas *vegas_data; long actual_tx_rate, expected_tx_rate, ndiff; + uint32_t mss = tcp_fixed_maxseg(ccv->tp); e_t = khelp_get_osd(&CCV(ccv, t_osd), ertt_id); vegas_data = ccv->cc_data; if (e_t->flags & ERTT_NEW_MEASUREMENT) { /* Once per RTT. */ if (e_t->minrtt && e_t->markedpkt_rtt) { expected_tx_rate = e_t->marked_snd_cwnd / e_t->minrtt; actual_tx_rate = e_t->bytes_tx_in_marked_rtt / e_t->markedpkt_rtt; ndiff = (expected_tx_rate - actual_tx_rate) * - e_t->minrtt / CCV(ccv, t_maxseg); + e_t->minrtt / mss; if (ndiff < V_vegas_alpha) { if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) { vegas_data->slow_start_toggle = vegas_data->slow_start_toggle ? 0 : 1; } else { vegas_data->slow_start_toggle = 0; CCV(ccv, snd_cwnd) = - min(CCV(ccv, snd_cwnd) + - CCV(ccv, t_maxseg), + min(CCV(ccv, snd_cwnd) + mss, TCP_MAXWIN << CCV(ccv, snd_scale)); } } else if (ndiff > V_vegas_beta) { /* Rate-based congestion. */ vegas_cong_signal(ccv, CC_VEGAS_RATE); vegas_data->slow_start_toggle = 0; } } e_t->flags &= ~ERTT_NEW_MEASUREMENT; } if (vegas_data->slow_start_toggle) newreno_cc_ack_received(ccv, ack_type); } static void vegas_cb_destroy(struct cc_var *ccv) { free(ccv->cc_data, M_CC_MEM); } static size_t vegas_data_sz(void) { return (sizeof(struct vegas)); } static int vegas_cb_init(struct cc_var *ccv, void *ptr) { struct vegas *vegas_data; INP_WLOCK_ASSERT(tptoinpcb(ccv->tp)); if (ptr == NULL) { vegas_data = malloc(sizeof(struct vegas), M_CC_MEM, M_NOWAIT); if (vegas_data == NULL) return (ENOMEM); } else vegas_data = ptr; vegas_data->slow_start_toggle = 1; ccv->cc_data = vegas_data; return (0); } /* * If congestion has been triggered triggered by the Vegas measured rates, it is * handled here, otherwise it falls back to newreno's congestion handling. 
*/
static void
vegas_cong_signal(struct cc_var *ccv, ccsignal_t signal_type)
{
	struct vegas *vegas_data;
	int presignalrecov;
+	uint32_t mss = tcp_fixed_maxseg(ccv->tp);

	vegas_data = ccv->cc_data;

	/* Remember whether recovery was already in progress on entry. */
	if (IN_RECOVERY(CCV(ccv, t_flags)))
		presignalrecov = 1;
	else
		presignalrecov = 0;

	switch((int)signal_type) {
	case CC_VEGAS_RATE:
		if (!IN_RECOVERY(CCV(ccv, t_flags))) {
			/*
			 * Take one mss off cwnd, never going below two mss.
			 * NOTE(review): mss is unsigned, so the subtraction
			 * presumably wraps if snd_cwnd is smaller than mss;
			 * confirm that cannot happen here.
			 */
-			CCV(ccv, snd_cwnd) = max(2 * CCV(ccv, t_maxseg),
-			    CCV(ccv, snd_cwnd) - CCV(ccv, t_maxseg));
+			CCV(ccv, snd_cwnd) = max(2 * mss,
+			    CCV(ccv, snd_cwnd) - mss);
			if (CCV(ccv, snd_cwnd) < CCV(ccv, snd_ssthresh))
				/* Exit slow start. */
				CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
		}
		break;

	default:
		newreno_cc_cong_signal(ccv, signal_type);
		break;
	}

	/*
	 * If handling this signal moved the connection into recovery, set the
	 * toggle according to whether cwnd is now below ssthresh.
	 */
	if (IN_RECOVERY(CCV(ccv, t_flags)) && !presignalrecov)
		vegas_data->slow_start_toggle =
		    (CCV(ccv, snd_cwnd) < CCV(ccv, snd_ssthresh)) ? 1 : 0;
}

/*
 * Connection start: set slow_start_toggle in the per-connection state.
 */
static void
vegas_conn_init(struct cc_var *ccv)
{
	struct vegas *vegas_data;

	vegas_data = ccv->cc_data;
	vegas_data->slow_start_toggle = 1;
}

/*
 * Module initialisation: look up the id of the "ertt" helper and keep it in
 * ertt_id.  Returns ENOENT (after printing a message) when khelp_get_id()
 * does not return a positive id, 0 otherwise.
 */
static int
vegas_mod_init(void)
{
	ertt_id = khelp_get_id("ertt");
	if (ertt_id <= 0) {
		printf("%s: h_ertt module not found\n", __func__);
		return (ENOENT);
	}
	return (0);
}

/*
 * Sysctl handler for alpha.
 *
 * The current value is copied into a local and passed to sysctl_handle_int().
 * On a write (req->newptr set) the new value is stored only if it is non-zero
 * and not greater than the current beta; otherwise EINVAL is returned.
 * NOTE(review): a value equal to beta is accepted here, while the sysctl
 * description string reads "0 < alpha < beta".
 */
static int
vegas_alpha_handler(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = V_vegas_alpha;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr != NULL) {
		if (new == 0 || new > V_vegas_beta)
			error = EINVAL;
		else
			V_vegas_alpha = new;
	}

	return (error);
}

/*
 * Sysctl handler for beta.
 *
 * Mirrors the alpha handler: on a write the new value is stored only if it is
 * non-zero and not smaller than the current alpha; otherwise EINVAL is
 * returned.
 * NOTE(review): a value equal to alpha is accepted here as well.
 */
static int
vegas_beta_handler(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = V_vegas_beta;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr != NULL) {
		if (new == 0 || new < V_vegas_alpha)
			error = EINVAL;
		else
			V_vegas_beta = new;
	}

	return (error);
}

/* Sysctl node for the Vegas settings, with the alpha and beta knobs. */
SYSCTL_DECL(_net_inet_tcp_cc_vegas);
SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, vegas,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "Vegas related settings");

SYSCTL_PROC(_net_inet_tcp_cc_vegas, OID_AUTO, alpha,
    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(vegas_alpha), 1, &vegas_alpha_handler, "IU",
    "vegas alpha, specified as number of \"buffers\" (0 < alpha < beta)");

SYSCTL_PROC(_net_inet_tcp_cc_vegas, OID_AUTO, beta,
    CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(vegas_beta), 3, &vegas_beta_handler, "IU",
    "vegas beta, specified as number of \"buffers\" (0 < alpha < beta)");

/* Register vegas_cc_algo as a CC module; the module depends on ertt. */
DECLARE_CC_MODULE(vegas, &vegas_cc_algo);
MODULE_VERSION(vegas, 2);
MODULE_DEPEND(vegas, ertt, 1, 1, 1);
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 854cce2a0cc1..be98d2e41f11 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -1,2186 +1,2186 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 * The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #ifdef KERN_TLS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include #define TCPOUTFLAGS #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include #include VNET_DEFINE(int, path_mtu_discovery) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(path_mtu_discovery), 1, "Enable Path MTU Discovery"); VNET_DEFINE(int, tcp_do_tso) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_tso), 0, "Enable TCP Segmentation Offload"); VNET_DEFINE(int, tcp_sendspace) = 1024*32; #define V_tcp_sendspace VNET(tcp_sendspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendspace), 0, "Initial send socket buffer size"); VNET_DEFINE(int, tcp_do_autosndbuf) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autosndbuf), 0, "Enable automatic send buffer sizing"); 
VNET_DEFINE(int, tcp_autosndbuf_inc) = 8*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_inc), 0,
    "Incrementor step size of automatic send buffer");

VNET_DEFINE(int, tcp_autosndbuf_max) = 2*1024*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autosndbuf_max), 0,
    "Max size of automatic send buffer");

VNET_DEFINE(int, tcp_sendbuf_auto_lowat) = 0;
#define	V_tcp_sendbuf_auto_lowat	VNET(tcp_sendbuf_auto_lowat)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto_lowat,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sendbuf_auto_lowat), 0,
    "Modify threshold for auto send buffer growth to account for SO_SNDLOWAT");

/*
 * Make sure that either retransmit or persist timer is set for SYN, FIN and
 * non-ACK.
 *
 * The assertion holds when len is 0 and neither TH_SYN nor TH_FIN is set in
 * th_flags, or when the TT_REXMT or the TT_PERSIST timer is active.
 */
#define TCP_XMIT_TIMER_ASSERT(tp, len, th_flags)			\
	KASSERT(((len) == 0 && ((th_flags) & (TH_SYN | TH_FIN)) == 0) ||\
	    tcp_timer_active((tp), TT_REXMT) ||				\
	    tcp_timer_active((tp), TT_PERSIST),				\
	    ("neither rexmt nor persist timer is set"))

#ifdef TCP_HHOOK
/*
 * Wrapper for the TCP established output helper hook.
 *
 * Does nothing unless at least one hook is registered on
 * HHOOK_TCP_EST_OUT (hhh_nhooks > 0).  Otherwise tp, th, to, len and tso are
 * packed into a struct tcp_hhook_data and passed to hhook_run_hooks() along
 * with the connection's t_osd.
 */
void
hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to, uint32_t len, int tso)
{
	struct tcp_hhook_data hhook_data;

	if (V_tcp_hhh[HHOOK_TCP_EST_OUT]->hhh_nhooks > 0) {
		hhook_data.tp = tp;
		hhook_data.th = th;
		hhook_data.to = to;
		hhook_data.len = len;
		hhook_data.tso = tso;

		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_OUT], &hhook_data,
		    &tp->t_osd);
	}
}
#endif

/*
 * CC wrapper hook functions
 *
 * cc_after_idle() calls the after_idle callback of the connection's CC
 * algorithm with &tp->t_ccv, if the algorithm provides that callback.
 */
void
cc_after_idle(struct tcpcb *tp)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (CC_ALGO(tp)->after_idle != NULL)
		CC_ALGO(tp)->after_idle(&tp->t_ccv);
}

/*
 * Tcp output routine: figure out what should be sent and send it.
*/ int tcp_default_output(struct tcpcb *tp) { struct socket *so = tptosocket(tp); struct inpcb *inp = tptoinpcb(tp); int32_t len; uint32_t recwin, sendwin; uint16_t flags; int off, error = 0; /* Keep compiler happy */ u_int if_hw_tsomaxsegcount = 0; u_int if_hw_tsomaxsegsize = 0; struct mbuf *m; struct ip *ip = NULL; struct tcphdr *th; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen, ulen; unsigned ipsec_optlen = 0; int idle, sendalot, curticks; int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso, mtu; struct tcpopt to; struct udphdr *udp = NULL; struct tcp_log_buffer *lgb; unsigned int wanted_cookie = 0; unsigned int dont_sendalot = 0; #ifdef INET6 struct ip6_hdr *ip6 = NULL; const bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif #ifdef KERN_TLS const bool hw_tls = tp->t_nic_ktls_xmit != 0; #else const bool hw_tls = false; #endif NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif /* * For TFO connections in SYN_SENT or SYN_RECEIVED, * only allow the initial SYN or SYN|ACK and those sent * by the retransmit timer. */ if ((tp->t_flags & TF_FASTOPEN) && ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) && SEQ_GT(tp->snd_max, tp->snd_una) && /* SYN or SYN|ACK sent */ (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ return (0); /* * Determine length of data that should be transmitted, * and flags that will be used. * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. 
*/ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (idle && (((ticks - tp->t_rcvtime) >= tp->t_rxtcur) || (tp->t_sndtime && ((ticks - tp->t_sndtime) >= tp->t_rxtcur)))) cc_after_idle(tp); tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } again: sendwin = 0; /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && (tp->sackhint.nexthole != NULL) && !IN_FASTRECOVERY(tp->t_flags)) { sendwin = tcp_sack_adjust(tp); } sendalot = 0; tso = 0; mtu = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd + sendwin); flags = tcp_outflags[tp->t_state]; /* * Send any SACK-generated retransmissions. If we're explicitly trying * to send out new data (when sendalot is 1), bypass this function. * If we retransmit in fast recovery mode, decrement snd_cwnd, since * we're replacing a (future) new transmission with a retransmission * now, and we previously incremented snd_cwnd in tcp_input(). */ /* * Still in sack recovery , reset rxmit flag to zero. */ sack_rxmit = 0; sack_bytes_rxmt = 0; len = 0; p = NULL; if ((tp->t_flags & TF_SACK_PERMIT) && (IN_FASTRECOVERY(tp->t_flags) || (SEQ_LT(tp->snd_nxt, tp->snd_max) && (tp->t_dupacks >= tcprexmtthresh))) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { int32_t cwin; if (IN_FASTRECOVERY(tp->t_flags)) { cwin = imax(sendwin - tcp_compute_pipe(tp), 0); } else { cwin = imax(sendwin - off, 0); } /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* * (At least) part of sack hole extends beyond * snd_recover. Check to see if we can rexmit data * for this hole. */ if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* * Can't rexmit any more data for this hole. 
* That data will be rexmitted in the next * sack recovery episode, when snd_recover * moves past p->rxmit. */ p = NULL; goto after_sack_rexmit; } else { /* Can rexmit part of the current hole */ len = SEQ_SUB(tp->snd_recover, p->rxmit); if (cwin <= len) { len = cwin; } else { sendalot = 1; } } } else { len = SEQ_SUB(p->end, p->rxmit); if (cwin <= len) { len = cwin; } else { sendalot = 1; } } /* we could have transmitted from the scoreboard, * but sendwin (expected flightsize) - pipe didn't * allow any transmission. * Bypass recalculating the possible transmission * length further down by setting sack_rxmit. * Wouldn't be here if there would have been * nothing in the scoreboard to transmit. */ sack_rxmit = 1; if (len > 0) { off = SEQ_SUB(p->rxmit, tp->snd_una); KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); } } after_sack_rexmit: /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; SOCK_SENDBUF_LOCK(so); /* * If in persist timeout with window of 0, send 1 byte. * Otherwise, if window is small but nonzero * and timer expired, we will send what we can * and go to transmit state. */ if (tp->t_flags & TF_FORCEDATA) { if (sendwin == 0) { /* * If we still have some data to send, then * clear the FIN bit. Usually this would * happen below when it realizes that we * aren't sending all the data. However, * if we have exactly 1 byte of unsent data, * then it won't clear the FIN bit below, * and if we are in persist state, we wind * up sending the packet without recording * that we sent the FIN bit. * * We can't just blindly clear the FIN bit, * because if we don't have any more data * to send then the probe will be the FIN * itself. 
*/ if (off < sbused(&so->so_snd)) flags &= ~TH_FIN; sendwin = 1; } else { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } } /* * If snd_nxt == snd_max and we have transmitted a FIN, the * offset will be > 0 even if so_snd.sb_cc is 0, resulting in * a negative length. This can also occur when TCP opens up * its congestion window while receiving additional duplicate * acks after fast-retransmit because TCP will reset snd_nxt * to snd_max after the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { if ((sack_bytes_rxmt == 0) || SEQ_LT(tp->snd_nxt, tp->snd_max)) { len = imin(sbavail(&so->so_snd), sendwin) - off; } else { /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ len = imin(sbavail(&so->so_snd) - off, sendwin - tcp_compute_pipe(tp)); } } /* * Lop off SYN bit if it has already been sent. However, if this * is SYN-SENT state and if segment contains data and if we don't * know that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; off--, len++; } /* * Be careful not to send data and/or FIN on SYN segments. * This measure is needed to prevent interoperability problems * with not fully conformant TCP implementations. 
*/ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } /* * On TFO sockets, ensure no data is sent in the following cases: * * - When retransmitting SYN|ACK on a passively-created socket * * - When retransmitting SYN on an actively created socket * * - When sending a zero-length cookie (cookie request) on an * actively created socket * * - When the socket is in the CLOSED state (RST is being sent) */ if ((tp->t_flags & TF_FASTOPEN) && (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || ((tp->t_state == TCPS_SYN_SENT) && (tp->t_tfo_client_cookie_len == 0)) || (flags & TH_RST))) len = 0; /* Without fast-open there should never be data sent on a SYN. */ if ((flags & TH_SYN) && !(tp->t_flags & TF_FASTOPEN)) { len = 0; } if (len <= 0) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, * len will be < 0. Otherwise, window shrank * after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't * close completely, just wait for an ACK. * * We also do a general check here to ensure that * we will set the persist timer when we have data * to send, but a 0-byte window. This makes sure * the persist timer is set even if the packet * hits one of the "goto send" lines below. */ len = 0; if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (off < (int) sbavail(&so->so_snd)) && !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; if (!tcp_timer_active(tp, TT_PERSIST)) tcp_setpersist(tp); } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); tcp_sndbuf_autoscale(tp, so, sendwin); /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. 
The * presence of TCP-MD5, IP options (IPsec), and possibly SACK * retransmits prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per generated * segment or packet. * * IPv4 handling has a clear separation of ip options and ip header * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does * the right thing below to provide length of just ip options and thus * checking for ipoptlen is enough to decide if ip options are present. */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6)) ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp); #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4)) ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp); #endif /* INET */ #endif /* IPSEC */ #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(inp); else #endif if (inp->inp_options) ipoptlen = inp->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; ipoptlen += ipsec_optlen; if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && ((sack_rxmit == 0) || V_tcp_sack_tso) && (ipoptlen == 0 || (ipoptlen == ipsec_optlen && (tp->t_flags2 & TF2_IPSEC_TSO) != 0)) && !(flags & TH_SYN)) tso = 1; if (SEQ_LT((sack_rxmit ? p->rxmit : tp->snd_nxt) + len, tp->snd_una + sbused(&so->so_snd))) { flags &= ~TH_FIN; } recwin = lmin(lmax(sbspace(&so->so_rcv), 0), (long)TCP_MAXWIN << tp->rcv_scale); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) * - This is the last buffer in a write()/send() and we are * either idle or running NODELAY * - we've timed out (e.g. 
persist timer) * - we have more then 1/2 the maximum send window's worth of * data (receiver may be limited the window size) * - we need to retransmit */ if (len) { if (len >= tp->t_maxseg) goto send; /* * As the TCP header options are now * considered when setting up the initial * window, we would not send the last segment * if we skip considering the option length here. * Note: this may not work when tcp headers change * very dynamically in the future. */ if ((((tp->t_flags & TF_SIGNATURE) ? PADTCPOLEN(TCPOLEN_SIGNATURE) : 0) + ((tp->t_flags & TF_RCVD_TSTMP) ? PADTCPOLEN(TCPOLEN_TIMESTAMP) : 0) + len) >= tp->t_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause * us to flush a buffer queued with moretocome. XXX * * note: the len + off check is almost certainly unnecessary. */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && (uint32_t)len + (uint32_t)off >= sbavail(&so->so_snd) && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; if (sack_rxmit) goto send; } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereas every read * from the receive buffer, no matter how small, causes a window * update to be sent. We also should avoid sending a flurry of * window updates when the socket buffer had queued a lot of data * and the application is doing small reads. 
* * Prevent a flurry of pointless window updates by only sending * an update when we can increase the advertized window by more * than 1/4th of the socket buffer capacity. When the buffer is * getting full or is very small be more aggressive and send an * update whenever we can increase by two mss sized segments. * In all other situations the ACK's to new incoming data will * carry further window increases. * * Don't send an independent window update if a delayed * ACK is pending (it will get piggy-backed on it) or the * remote side already has done a half-close and won't send * more data. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we could increase the window, * taking into account that we are limited by * TCP_MAXWIN << tp->rcv_scale. */ int32_t adv; int oldwin; adv = recwin; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); if (adv > oldwin) adv -= oldwin; else adv = 0; } else oldwin = 0; /* * If the new window size ends up being the same as or less * than the old size when it is scaled, then don't force * a window update. */ if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (int32_t)(2 * tp->t_maxseg) && (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || recwin <= (so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg || adv >= TCP_MAXWIN << tp->rcv_scale)) goto send; if (2 * adv >= (int32_t)so->so_rcv.sb_hiwat) goto send; } dontupdate: /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) goto send; if ((flags & TH_RST) || ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) goto send; if (SEQ_GT(tp->snd_up, tp->snd_una)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. 
*/ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; /* * In SACK, it is possible for tcp_output to fail to send a segment * after the retransmission timer has been turned off. Make sure * that the retransmission timer is set. */ if ((tp->t_flags & TF_SACK_PERMIT) && SEQ_GT(tp->snd_max, tp->snd_una) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); goto just_return; } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. The three ``states'' for the output side are: * idle not doing retransmits or persists * persisting to move a small or zero window * (re)transmitting and thereby not persisting * * tcp_timer_active(tp, TT_PERSIST) * is true when we are in persist state. * (tp->t_flags & TF_FORCEDATA) * is set when we are called to send a persist packet. * tcp_timer_active(tp, TT_REXMT) * is set when we are retransmitting * The output side is idle when both timers are zero. * * If send window is too small, there is data to transmit, and no * retransmit or persist is pending, then go to persist state. * If nothing happens soon, send when timer expires: * if window is nonzero, transmit what we can, * otherwise force out a byte. */ if (sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } /* * No reason to send a segment, just return. */ just_return: SOCK_SENDBUF_UNLOCK(so); return (0); send: SOCK_SENDBUF_LOCK_ASSERT(so); if (len > 0) { if (len >= tp->t_maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options * unless TCP set not to do any options. * NOTE: we assume that the IP/TCP header plus TCP options * always fit in a single mbuf, leaving room for a maximum * link header, i.e. 
* max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); else #endif hdrlen = sizeof (struct tcpiphdr); if (flags & TH_SYN) { tp->snd_nxt = tp->iss; } /* * Compute options for segment. * We only have to care about SYN and established connection * segments. Options for SYN-ACK segments are handled in TCP * syncache. */ to.to_flags = 0; if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. */ if (flags & TH_SYN) { to.to_mss = tcp_mssopt(&inp->inp_inc); if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; to.to_flags |= TOF_MSS; /* * On SYN or SYN|ACK transmits on TFO connections, * only include the TFO option if it is not a * retransmit, as the presence of the TFO option may * have caused the original SYN or SYN|ACK to have * been dropped by a middlebox. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_rxtshift == 0)) { if (tp->t_state == TCPS_SYN_RECEIVED) { to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = (u_int8_t *)&tp->t_tfo_cookie.server; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; } else if (tp->t_state == TCPS_SYN_SENT) { to.to_tfo_len = tp->t_tfo_client_cookie_len; to.to_tfo_cookie = tp->t_tfo_cookie.client; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; /* * If we wind up having more data to * send with the SYN than can fit in * one segment, don't send any more * until the SYN|ACK comes back from * the other end. */ dont_sendalot = 1; } } } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { curticks = tcp_ts_getticks(); to.to_tsval = curticks + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; if (tp->t_rxtshift == 1) tp->t_badrxtwin = curticks; } /* Set receive buffer autosizing timestamp. 
*/ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ /* * Check that TCP_MD5SIG is enabled in tcpcb to * account the size needed to set this TCP option. */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); /* * If we wanted a TFO option to be added, but it was unable * to fit, ensure no data is sent. */ if ((tp->t_flags & TF_FASTOPEN) && wanted_cookie && !(to.to_flags & TOF_FASTOPEN)) len = 0; } if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? */ SOCK_SENDBUF_UNLOCK(so); return (EHOSTUNREACH); } hdrlen += sizeof(struct udphdr); } /* * Adjust data length if insertion of options will * bump the packet length beyond the t_maxseg length. * Clear the FIN bit because we cut off the tail of * the segment. 
*/ if (len + optlen + ipoptlen > tp->t_maxseg) { flags &= ~TH_FIN; if (tso) { u_int if_hw_tsomax; u_int moff; int max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; /* * Limit a TSO burst to prevent it from * overflowing or exceeding the maximum length * allowed by the network interface: */ KASSERT(ipoptlen == ipsec_optlen, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = if_hw_tsomax - hdrlen - ipsec_optlen - max_linkhdr; if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; } } /* * Prevent the last segment from being * fractional unless the send sockbuf can be * emptied: */ max_len = tp->t_maxseg - optlen - ipsec_optlen; if (((uint32_t)off + (uint32_t)len) < sbavail(&so->so_snd)) { moff = len % max_len; if (moff != 0) { len -= moff; sendalot = 1; } } /* * In case there are too many small fragments * don't use TSO: */ if (len <= max_len) { len = max_len; sendalot = 1; tso = 0; } /* * Send the FIN in a separate segment * after the bulk sending is done. * We don't trust the TSO implementations * to clear the FIN flag on all but the * last segment. */ if (tp->t_flags & TF_NEEDFIN) sendalot = 1; } else { if (optlen + ipoptlen >= tp->t_maxseg) { /* * Since we don't have enough space to put * the IP header chain and the TCP header in * one packet as required by RFC 7112, don't * send it. Also ensure that at least one * byte of the payload can be put into the * TCP segment. 
*/ SOCK_SENDBUF_UNLOCK(so); error = EMSGSIZE; sack_rxmit = 0; goto out; } len = tp->t_maxseg - optlen - ipoptlen; sendalot = 1; if (dont_sendalot) sendalot = 0; } } else tso = 0; KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); /*#ifdef DIAGNOSTIC*/ #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); /*#endif*/ /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); /* * Grab a header mbuf, attaching a copy of data to * be transmitted, and initialize the header from * the template for sends on this connection. */ if (len) { struct mbuf *mb; struct sockbuf *msb; u_int moff; if ((tp->t_flags & TF_FORCEDATA) && len == 1) { TCPSTAT_INC(tcps_sndprobe); #ifdef STATS if (SEQ_LT(tp->snd_nxt, tp->snd_max)) stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); else stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif /* STATS */ } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); if (sack_rxmit) { TCPSTAT_INC(tcps_sack_rexmits); if (tso) { TCPSTAT_INC(tcps_sack_rexmits_tso); } TCPSTAT_ADD(tcps_sack_rexmit_bytes, len); } #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); #endif /* STATS */ } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); #ifdef STATS stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif /* STATS */ } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { SOCK_SENDBUF_UNLOCK(so); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf * to the offset in the 
socket buffer chain. */ mb = sbsndptr_noadv(&so->so_snd, off, &moff); if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { m_copydata(mb, moff, len, mtod(m, caddr_t) + hdrlen); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) sbsndptr_adv(&so->so_snd, mb, len); m->m_len += len; } else { int32_t old_len; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) msb = NULL; else msb = &so->so_snd; old_len = len; m->m_next = tcp_m_copym(mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, hw_tls); if (old_len != len) flags &= ~TH_FIN; if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy * shorten it to no longer need tso. Lets * not put on sendalot since we are low on * mbufs. */ tso = 0; } if (m->m_next == NULL) { SOCK_SENDBUF_UNLOCK(so); (void) m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } /* * If we're sending everything we've got, set PUSH. * (This will keep happy those implementations which only * give data to the user when a buffer fills or * a PUSH comes in.) */ if (((uint32_t)off + (uint32_t)len == sbused(&so->so_snd)) && !(flags & TH_SYN)) flags |= TH_PUSH; SOCK_SENDBUF_UNLOCK(so); } else { SOCK_SENDBUF_UNLOCK(so); if (tp->t_flags & TF_ACKNOW) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) TCPSTAT_INC(tcps_sndctrl); else if (SEQ_GT(tp->snd_up, tp->snd_una)) TCPSTAT_INC(tcps_sndurg); else TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCK_SENDBUF_UNLOCK_ASSERT(so); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + 
len - sizeof(struct ip6_hdr); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else { th = (struct tcphdr *)(ip6 + 1); } tcpip_fillheaders(inp, tp->t_port, ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, tp->t_port, ip, th); } /* * Fill in fields, remembering maximum advertised * window for use in delaying messages about window sizes. * If resending a FIN, be sure not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup * SYN packet. If we are on a retransmit, we may * resend those bits a number of times as per * RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { flags |= tcp_ecn_output_syn_sent(tp); } /* Also handle parallel SYN for ECN */ if ((TCPS_HAVERCVDSYN(tp->t_state)) && (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); if ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_flags2 & TF2_ECN_SND_ECE)) tp->t_flags2 &= ~TF2_ECN_SND_ECE; #ifdef INET6 if (isipv6) { ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << IPV6_FLOWLABEL_LEN); ip6->ip6_flow |= htonl(ect << IPV6_FLOWLABEL_LEN); } else #endif { ip->ip_tos &= ~IPTOS_ECN_MASK; ip->ip_tos |= ect; } } /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the * retransmitted packet, we want the sequence number * of the next unsent octet. So, if there is no data * (and no SYN or FIN), use snd_max instead of snd_nxt * when filling in ti_seq. 
But if we are in persist * state, snd_max might reflect one byte beyond the * right edge of the window, so use snd_nxt in that * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN|TH_FIN)) || tcp_timer_active(tp, TT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); } else { th->th_seq = htonl(p->rxmit); p->rxmit += len; /* * Lost Retransmission Detection * trigger resending of a (then * still existing) hole, when * fack acks recoverypoint. */ if ((tp->t_flags & TF_LRD) && SEQ_GEQ(p->rxmit, p->end)) p->rxmit = tp->snd_recover; tp->sackhint.sack_bytes_rexmit += len; } if (IN_RECOVERY(tp->t_flags)) { /* * Account all bytes transmitted while * IN_RECOVERY, simplifying PRR and * Lost Retransmit Detection */ tp->sackhint.prr_out += len; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } tcp_set_flags(th, flags); /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. * If a RST segment is sent, advertise a window of zero. */ if (flags & TH_RST) { recwin = 0; } else { if (recwin < (so->so_rcv.sb_hiwat / 4) && recwin < tp->t_maxseg) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (tp->rcv_adv - tp->rcv_nxt)) recwin = (tp->rcv_adv - tp->rcv_nxt); } /* * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. The * case is handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else { /* Avoid shrinking window with window scaling. */ recwin = roundup2(recwin, 1 << tp->rcv_scale); th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); } /* * Adjust the RXWIN0SENT flag - indicate that we have advertised * a 0 window. This may cause the remote transmitter to stall. 
This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is attempting * to read more data than can be buffered prior to transmitting on * the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); th->th_flags |= TH_URG; } else /* * If no urgent pointer to send, then we pull * the urgent pointer to the left edge of the send window * so that it doesn't drift into the send window on sequence * number wraparound. */ tp->snd_up = tp->snd_una; /* drag it along */ /* * Put TCP length in extended header, and then * checksum extended header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { /* * Calculate MD5 signature and put it into the place * determined before. * NOTE: since TCP options buffer doesn't point into * mbuf's data, calculate offset and use it. */ if (!TCPMD5_ENABLED() || (error = TCPMD5_OUTPUT(m, th, (u_char *)(th + 1) + (to.to_signature - opt))) != 0) { /* * Do not send segment if the calculation of MD5 * digest has failed. */ m_freem(m); goto out; } } #endif #ifdef INET6 if (isipv6) { /* * There is no need to fill in ip6_plen right now. * It will be filled later by ip6_output. 
*/ if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); } } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); } /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. */ if (tso) { KASSERT(len > tp->t_maxseg - optlen - ipsec_optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen - ipsec_optlen; } KASSERT(len + hdrlen == m_length(m, NULL), ("%s: mbuf chain shorter than expected: %d + %u != %u", __func__, len, hdrlen, m_length(m, NULL))); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif TCP_PROBE3(debug__output, tp, th, m); /* We're getting ready to send; log now. */ /* XXXMT: We are not honoring verbose logging. 
*/ if (tcp_bblogging_on(tp)) lgb = tcp_log_event(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, len, NULL, false, NULL, NULL, 0, NULL); else lgb = NULL; /* * Fill in IP length and desired time to live and * send to IP level. There should be a better way * to handle ttl and tos; we could keep them in * the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before checksum calculation, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. * Also, desired default hop limit might be changed via * Neighbor Discovery. */ ip6->ip6_hlim = in6_selecthlim(inp, NULL); /* * Set the packet size here for the benefit of DTrace probes. * ip6_output() will set it properly; it's supposed to include * the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); #ifdef TCPPCAP /* Save packet, if requested. */ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, inp->in6p_outputopts, &inp->inp_route6, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), NULL, NULL, inp); if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) mtu = inp->inp_route6.ro_nh->nh_mtu; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(inp, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every packet. * This might not be the best thing to do according to RFC3390 * Section 2. 
However the tcp hostcache migitates the problem * so it affects only the first tcp connection with a host. * * NB: Don't set DF on small MTU/MSS to have a safe fallback. */ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; if (tp->t_port == 0 || len < V_tcp_minmss) { ip->ip_off |= htons(IP_DF); } } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); #ifdef TCPPCAP /* Save packet, if requested. */ tcp_pcap_add(th, m, &(tp->t_outpkts)); #endif error = ip_output(m, inp->inp_options, &inp->inp_route, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, inp); if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) mtu = inp->inp_route.ro_nh->nh_mtu; } #endif /* INET */ if (lgb != NULL) { lgb->tlb_errno = error; lgb = NULL; } out: if (error == 0) tcp_account_for_send(tp, len, (tp->snd_nxt != tp->snd_max), 0, hw_tls); /* * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. In a closed * state just return. */ if (flags & TH_RST) { TCPSTAT_INC(tcps_sndtotal); return (0); } else if ((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) { tcp_seq startseq = tp->snd_nxt; /* * Advance snd_nxt over sequence space of this segment. */ if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } if (sack_rxmit) goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { /* * Update "made progress" indication if we just * added new data to an empty socket buffer. */ if (tp->snd_una == tp->snd_max) tp->t_acktime = ticks; tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. 
*/ tp->t_sndtime = ticks; if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; TCPSTAT_INC(tcps_segstimed); } #ifdef STATS if (!(tp->t_flags & TF_GPUTINPROG) && len) { tp->t_flags |= TF_GPUTINPROG; tp->gput_seq = startseq; tp->gput_ack = startseq + ulmin(sbavail(&so->so_snd) - off, sendwin); tp->gput_ts = tcp_ts_getticks(); } #endif /* STATS */ } /* * Set retransmit timer if not currently set, * and not doing a pure ack or a keep-alive probe. * Initial value for retransmit timer is smoothed * round-trip time + 2 * round-trip time variance. * Initialize shift counter which is used for backoff * of retransmit time. */ timer: if (!tcp_timer_active(tp, TT_REXMT) && ((sack_rxmit && tp->snd_nxt != tp->snd_max) || (tp->snd_nxt != tp->snd_una))) { if (tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); } else if (len == 0 && sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { /* * Avoid a situation where we do not set persist timer * after a zero window condition. For example: * 1) A -> B: packet with enough data to fill the window * 2) B -> A: ACK for #1 + new data (0 window * advertisement) * 3) A -> B: ACK for #2, 0 len packet * * In this case, A will not activate the persist timer, * because it chose to send a packet. Unless tcp_output * is called for some other reason (delayed ack timer, * another input packet from B, socket syscall), A will * not send zero window probes. * * So, if you send a 0-length packet, but there is data * in the socket buffer, and neither the rexmt or * persist timer is already set, then activate the * persist timer. */ tp->t_rxtshift = 0; tcp_setpersist(tp); } } else { /* * Persist case, update snd_max but since we are in * persist mode (no window) we do not update snd_nxt. 
*/ int xlen = len; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) tp->snd_max = tp->snd_nxt + xlen; } if ((error == 0) && (tp->rcv_numsacks > 0) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT)) { /* Clean up any DSACK's sent */ tcp_clean_dsack_blocks(tp); } if ((error == 0) && sack_rxmit && SEQ_LT(tp->snd_nxt, SEQ_MIN(p->rxmit, p->end))) { /* * When transmitting from SACK scoreboard * after an RTO, pull snd_nxt along. */ tp->snd_nxt = SEQ_MIN(p->rxmit, p->end); } if (error) { /* * We know that the packet was lost, so back out the * sequence number advance, if any. * * If the error is EPERM the packet got blocked by the * local firewall. Normally we should terminate the * connection but the blocking may have been spurious * due to a firewall reconfiguration cycle. So we treat * it like a packet loss and let the retransmit timer and * timeouts do their work over time. * XXX: It is a POLA question whether calling tcp_drop right * away would be the really correct behavior instead. */ if (((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) && ((flags & TH_SYN) == 0) && (error != EPERM)) { if (sack_rxmit) { p->rxmit = SEQ_MIN(p->end, p->rxmit) - len; tp->sackhint.sack_bytes_rexmit -= len; KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); KASSERT((flags & TH_FIN) == 0, ("error while FIN with SACK rxmit")); } else { tp->snd_nxt -= len; if (flags & TH_FIN) tp->snd_nxt--; } if (IN_RECOVERY(tp->t_flags)) tp->sackhint.prr_out -= len; } SOCK_SENDBUF_UNLOCK_ASSERT(so); /* Check gotos. */ switch (error) { case EACCES: case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: TCP_XMIT_TIMER_ASSERT(tp, len, flags); - tp->snd_cwnd = tp->t_maxseg; + tp->snd_cwnd = tcp_maxseg(tp); return (0); case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. 
* If TSO was active we either got an interface * without TSO capabilits or TSO was turned off. * If we obtained mtu from ip_output() then update * it and try again. */ if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } return (error); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; return (0); } /* FALLTHROUGH */ default: return (error); } } TCPSTAT_INC(tcps_sndtotal); /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, * then remember the size of the advertised window. * Any pending ACK has now been sent. */ if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); if (tcp_timer_active(tp, TT_DELACK)) tcp_timer_activate(tp, TT_DELACK, 0); if (sendalot) goto again; return (0); } void tcp_setpersist(struct tcpcb *tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; int maxunacktime; tp->t_flags &= ~TF_PREVVALID; if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* * If the state is already closed, don't bother. */ if (tp->t_state == TCPS_CLOSED) return; /* * Start/restart persistence timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], tcp_persmin, tcp_persmax); if (TP_MAXUNACKTIME(tp) && tp->t_acktime) { maxunacktime = tp->t_acktime + TP_MAXUNACKTIME(tp) - ticks; if (maxunacktime < 1) maxunacktime = 1; if (maxunacktime < tt) tt = maxunacktime; } tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < V_tcp_retries) tp->t_rxtshift++; } /* * Insert TCP options according to the supplied parameters to the place * optp in a consistent way. Can handle unaligned destinations. * * The order of the option processing is crucial for optimal packing and * alignment for the scarce option space. 
*
 * The optimal order for a SYN/SYN-ACK segment is:
 *   MSS (4) + NOP (1) + Window scale (3) + SACK permitted (2) +
 *   Timestamp (10) + Signature (18) = 38 bytes out of a maximum of 40.
 *
 * The SACK options should be last.  SACK blocks consume 8*n+2 bytes.
 * So a full size SACK blocks option is 34 bytes (with 4 SACK blocks).
 * At minimum we need 10 bytes (to generate 1 SACK block).  If both
 * TCP Timestamps (12 bytes) and TCP Signatures (18 bytes) are present,
 * we only have 10 bytes for SACK options (40 - (12 + 18)).
 */
/*
 * Arguments:
 *   to    Describes the options to emit; each bit set in to_flags below
 *         TOF_MAXOPT is processed in ascending bit order.  The structure
 *         is modified: to_mss, to_tsval and to_tsecr are converted to
 *         network byte order in place, to_signature is set to point at
 *         the signature bytes inside the output buffer, and TOF_SIGNATURE
 *         or TOF_FASTOPEN is cleared from to_flags when that option does
 *         not fit.
 *   optp  Destination for the encoded options.
 *         NOTE(review): presumably at least TCP_MAXOLEN bytes long;
 *         confirm against the callers.
 *
 * Returns the total number of option bytes written, including any NOP,
 * EOL and PAD bytes.
 */
int
tcp_addoptions(struct tcpopt *to, u_char *optp)
{
	u_int32_t mask, optlen = 0;

	for (mask = 1; mask < TOF_MAXOPT; mask <<= 1) {
		/* Skip option bits that were not requested. */
		if ((to->to_flags & mask) != mask)
			continue;
		/* The option space is completely used up. */
		if (optlen == TCP_MAXOLEN)
			break;
		/*
		 * In every case below, a "continue" skips the option and moves
		 * on to the next flag bit.  NOP bytes emitted by the alignment
		 * loop before the space check remain in the buffer.
		 */
		switch (to->to_flags & mask) {
		case TOF_MSS:
			/* Start the 4 byte MSS option on a 4 byte boundary. */
			while (optlen % 4) {
				optlen += TCPOLEN_NOP;
				*optp++ = TCPOPT_NOP;
			}
			if (TCP_MAXOLEN - optlen < TCPOLEN_MAXSEG)
				continue;
			optlen += TCPOLEN_MAXSEG;
			*optp++ = TCPOPT_MAXSEG;
			*optp++ = TCPOLEN_MAXSEG;
			/* to_mss is left in network byte order afterwards. */
			to->to_mss = htons(to->to_mss);
			bcopy((u_char *)&to->to_mss, optp, sizeof(to->to_mss));
			optp += sizeof(to->to_mss);
			break;
		case TOF_SCALE:
			/* Pad with NOPs until the running length is odd. */
			while (!optlen || optlen % 2 != 1) {
				optlen += TCPOLEN_NOP;
				*optp++ = TCPOPT_NOP;
			}
			if (TCP_MAXOLEN - optlen < TCPOLEN_WINDOW)
				continue;
			optlen += TCPOLEN_WINDOW;
			*optp++ = TCPOPT_WINDOW;
			*optp++ = TCPOLEN_WINDOW;
			*optp++ = to->to_wscale;
			break;
		case TOF_SACKPERM:
			/* Pad with NOPs until the running length is even. */
			while (optlen % 2) {
				optlen += TCPOLEN_NOP;
				*optp++ = TCPOPT_NOP;
			}
			if (TCP_MAXOLEN - optlen < TCPOLEN_SACK_PERMITTED)
				continue;
			optlen += TCPOLEN_SACK_PERMITTED;
			*optp++ = TCPOPT_SACK_PERMITTED;
			*optp++ = TCPOLEN_SACK_PERMITTED;
			break;
		case TOF_TS:
			/* Pad with NOPs until the running length is 2 modulo 4. */
			while (!optlen || optlen % 4 != 2) {
				optlen += TCPOLEN_NOP;
				*optp++ = TCPOPT_NOP;
			}
			if (TCP_MAXOLEN - optlen < TCPOLEN_TIMESTAMP)
				continue;
			optlen += TCPOLEN_TIMESTAMP;
			*optp++ = TCPOPT_TIMESTAMP;
			*optp++ = TCPOLEN_TIMESTAMP;
			/* Both timestamps stay in network byte order in *to. */
			to->to_tsval = htonl(to->to_tsval);
			to->to_tsecr = htonl(to->to_tsecr);
			bcopy((u_char *)&to->to_tsval, optp, sizeof(to->to_tsval));
			optp += sizeof(to->to_tsval);
			bcopy((u_char *)&to->to_tsecr, optp, sizeof(to->to_tsecr));
			optp += sizeof(to->to_tsecr);
			break;
		case TOF_SIGNATURE:
			{
			/* Payload size: option length minus kind and length bytes. */
			int siglen = TCPOLEN_SIGNATURE - 2;

			/* Pad with NOPs until the running length is 2 modulo 4. */
			while (!optlen || optlen % 4 != 2) {
				optlen += TCPOLEN_NOP;
				*optp++ = TCPOPT_NOP;
			}
			if (TCP_MAXOLEN - optlen < TCPOLEN_SIGNATURE) {
				/* Record that no signature option was emitted. */
				to->to_flags &= ~TOF_SIGNATURE;
				continue;
			}
			optlen += TCPOLEN_SIGNATURE;
			*optp++ = TCPOPT_SIGNATURE;
			*optp++ = TCPOLEN_SIGNATURE;
			/* Remember where the signature bytes start; zero them. */
			to->to_signature = optp;
			while (siglen--)
				 *optp++ = 0;
			break;
			}
		case TOF_SACK:
			{
			int sackblks = 0;
			struct sackblk *sack = (struct sackblk *)to->to_sacks;
			tcp_seq sack_seq;

			/* Pad with NOPs until the running length is 2 modulo 4. */
			while (!optlen || optlen % 4 != 2) {
				optlen += TCPOLEN_NOP;
				*optp++ = TCPOPT_NOP;
			}
			/* Need room for the header plus at least one block. */
			if (TCP_MAXOLEN - optlen < TCPOLEN_SACKHDR + TCPOLEN_SACK)
				continue;
			optlen += TCPOLEN_SACKHDR;
			*optp++ = TCPOPT_SACK;
			/* Emit as many of the requested blocks as still fit. */
			sackblks = min(to->to_nsacks,
					(TCP_MAXOLEN - optlen) / TCPOLEN_SACK);
			*optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK;
			while (sackblks--) {
				sack_seq = htonl(sack->start);
				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
				optp += sizeof(sack_seq);
				sack_seq = htonl(sack->end);
				bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
				optp += sizeof(sack_seq);
				optlen += TCPOLEN_SACK;
				sack++;
			}
			TCPSTAT_INC(tcps_sack_send_blocks);
			break;
			}
		case TOF_FASTOPEN:
			{
			int total_len;

			/* XXX is there any point to aligning this option? */
			total_len = TCPOLEN_FAST_OPEN_EMPTY + to->to_tfo_len;
			if (TCP_MAXOLEN - optlen < total_len) {
				/* Record that no Fast Open option was emitted. */
				to->to_flags &= ~TOF_FASTOPEN;
				continue;
			}
			*optp++ = TCPOPT_FAST_OPEN;
			*optp++ = total_len;
			/* A zero-length cookie yields an option with no payload. */
			if (to->to_tfo_len > 0) {
				bcopy(to->to_tfo_cookie, optp, to->to_tfo_len);
				optp += to->to_tfo_len;
			}
			optlen += total_len;
			break;
			}
		default:
			panic("%s: unknown TCP option type", __func__);
			break;
		}
	}

	/* Terminate and pad TCP options to a 4 byte boundary. */
	if (optlen % 4) {
		optlen += TCPOLEN_EOL;
		*optp++ = TCPOPT_EOL;
	}
	/*
	 * According to RFC 793 (STD0007):
	 *   "The content of the header beyond the End-of-Option option
	 *    must be header padding (i.e., zero)."
* and later: "The padding is composed of zeros." */ while (optlen % 4) { optlen += TCPOLEN_PAD; *optp++ = TCPOPT_PAD; } KASSERT(optlen <= TCP_MAXOLEN, ("%s: TCP options too long", __func__)); return (optlen); } /* * This is a copy of m_copym(), taking the TSO segment size/limit * constraints into account, and advancing the sndptr as it goes. */ struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls) { #ifdef KERN_TLS struct ktls_session *tls, *ntls; struct mbuf *start __diagused; #endif struct mbuf *n, **np; struct mbuf *top; int32_t off = off0; int32_t len = *plen; int32_t fragsize; int32_t len_cp = 0; int32_t *pkthdrlen; uint32_t mlen, frags; bool copyhdr; KASSERT(off >= 0, ("tcp_m_copym, negative off %d", off)); KASSERT(len >= 0, ("tcp_m_copym, negative len %d", len)); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = true; else copyhdr = false; while (off > 0) { KASSERT(m != NULL, ("tcp_m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; if ((sb) && (m == sb->sb_sndptr)) { sb->sb_sndptroff += m->m_len; sb->sb_sndptr = m->m_next; } m = m->m_next; } np = ⊤ top = NULL; pkthdrlen = NULL; #ifdef KERN_TLS if (hw_tls && (m->m_flags & M_EXTPG)) tls = m->m_epg_tls; else tls = NULL; start = m; #endif while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, ("tcp_m_copym, length > size of mbuf chain")); *plen = len_cp; if (pkthdrlen != NULL) *pkthdrlen = len_cp; break; } #ifdef KERN_TLS if (hw_tls) { if (m->m_flags & M_EXTPG) ntls = m->m_epg_tls; else ntls = NULL; /* * Avoid mixing TLS records with handshake * data or TLS records from different * sessions. 
*/ if (tls != ntls) { MPASS(m != start); *plen = len_cp; if (pkthdrlen != NULL) *pkthdrlen = len_cp; break; } } #endif mlen = min(len, m->m_len - off); if (seglimit) { /* * For M_EXTPG mbufs, add 3 segments * + 1 in case we are crossing page boundaries * + 2 in case the TLS hdr/trailer are used * It is cheaper to just add the segments * than it is to take the cache miss to look * at the mbuf ext_pgs state in detail. */ if (m->m_flags & M_EXTPG) { fragsize = min(segsize, PAGE_SIZE); frags = 3; } else { fragsize = segsize; frags = 0; } /* Break if we really can't fit anymore. */ if ((frags + 1) >= seglimit) { *plen = len_cp; if (pkthdrlen != NULL) *pkthdrlen = len_cp; break; } /* * Reduce size if you can't copy the whole * mbuf. If we can't copy the whole mbuf, also * adjust len so the loop will end after this * mbuf. */ if ((frags + howmany(mlen, fragsize)) >= seglimit) { mlen = (seglimit - frags - 1) * fragsize; len = mlen; *plen = len_cp + len; if (pkthdrlen != NULL) *pkthdrlen = *plen; } frags += howmany(mlen, fragsize); if (frags == 0) frags++; seglimit -= frags; KASSERT(seglimit > 0, ("%s: seglimit went too low", __func__)); } if (copyhdr) n = m_gethdr(M_NOWAIT, m->m_type); else n = m_get(M_NOWAIT, m->m_type); *np = n; if (n == NULL) goto nospace; if (copyhdr) { if (!m_dup_pkthdr(n, m, M_NOWAIT)) goto nospace; if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; pkthdrlen = &n->m_pkthdr.len; copyhdr = false; } n->m_len = mlen; len_cp += n->m_len; if (m->m_flags & (M_EXT | M_EXTPG)) { n->m_data = m->m_data + off; mb_dupcl(n, m); } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (u_int)n->m_len); if (sb && (sb->sb_sndptr == m) && ((n->m_len + off) >= m->m_len) && m->m_next) { sb->sb_sndptroff += m->m_len; sb->sb_sndptr = m->m_next; } off = 0; if (len != M_COPYALL) { len -= n->m_len; } m = m->m_next; np = &n->m_next; } return (top); nospace: m_freem(top); return (NULL); } void tcp_sndbuf_autoscale(struct tcpcb *tp, struct socket *so, 
uint32_t sendwin)
{
	int lowat;

	/*
	 * Send socket buffer auto-sizing.
	 *
	 * The send buffer is often a poor match for the path it serves
	 * (the delay bandwidth product).  A buffer that is too small caps
	 * throughput on links with high bandwidth and high delay (e.g.
	 * trans-continental/oceanic links), while one that is too large
	 * consumes real kernel memory, which matters with many connections
	 * on busy servers.
	 *
	 * The buffer is stepped up one notch when all of the following hold:
	 *  1. the receive window of the remote host, with a fudge factor
	 *     of 5/4th, is at least the send buffer size;
	 *  2. the send buffer is filled to at least 7/8th with data (so
	 *     there actually is data to make use of it);
	 *  3. the send buffer fill is below the maximal automatic size;
	 *  4. sendwin (slow start and congestion controlled) is at least
	 *     the buffer fill minus the bytes between snd_una and snd_nxt.
	 * When V_tcp_sendbuf_auto_lowat is set, the buffer's sb_lowat is
	 * subtracted from the thresholds of conditions 1 and 2.
	 *
	 * The receive window scaling factor of the remote host may stop
	 * the buffer from growing before it reaches its allowed maximum.
	 *
	 * Growth follows the slow start or congestion window directly and
	 * takes at most one step per received ACK.  This fast scaling can
	 * grow the send buffer beyond what is strictly necessary to make
	 * full use of a given delay*bandwidth product.  Testing has shown
	 * this not to be much of a problem: at worst we trade wasting
	 * available bandwidth (the non-use of it) for wasting some socket
	 * buffer memory.
	 *
	 * TODO: Shrink send buffer during idle periods together
	 * with congestion window.  Requires another timer.  Has to
	 * wait for upcoming tcp timer rewrite.
	 *
	 * XXXGL: should there be used sbused() or sbavail()?
	 */
	if (!V_tcp_do_autosndbuf || (so->so_snd.sb_flags & SB_AUTOSIZE) == 0)
		return;

	if (V_tcp_sendbuf_auto_lowat)
		lowat = so->so_snd.sb_lowat;
	else
		lowat = 0;

	/* 1. Peer window (plus fudge) must cover the current buffer size. */
	if ((tp->snd_wnd / 4 * 5) < so->so_snd.sb_hiwat - lowat)
		return;
	/* 2. The buffer must already be nearly full. */
	if (sbused(&so->so_snd) < (so->so_snd.sb_hiwat / 8 * 7) - lowat)
		return;
	/* 3. There must still be headroom below the automatic maximum. */
	if (sbused(&so->so_snd) >= V_tcp_autosndbuf_max)
		return;
	/* 4. The send window must cover what is left to send. */
	if (sendwin < (sbused(&so->so_snd) - (tp->snd_nxt - tp->snd_una)))
		return;

	/* Grow by one increment; stop auto-sizing if the reservation fails. */
	if (!sbreserve_locked(so, SO_SND,
	    min(so->so_snd.sb_hiwat + V_tcp_autosndbuf_inc,
	     V_tcp_autosndbuf_max), curthread))
		so->so_snd.sb_flags &= ~SB_AUTOSIZE;
}