Index: stable/12/sys/netinet/cc/cc_cdg.c
===================================================================
--- stable/12/sys/netinet/cc/cc_cdg.c	(revision 364376)
+++ stable/12/sys/netinet/cc/cc_cdg.c	(revision 364377)
@@ -1,718 +1,718 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009-2013
  * 	Swinburne University of Technology, Melbourne, Australia
  * All rights reserved.
  *
  * This software was developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by David Hayes, made
  * possible in part by a gift from The Cisco University Research Program Fund,
  * a corporate advised fund of Silicon Valley Community Foundation. Development
  * and testing were further assisted by a grant from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * CAIA Delay-Gradient (CDG) congestion control algorithm
  *
  * An implementation of the delay-gradient congestion control algorithm
  * proposed in the following paper:
  *
  * D. A. Hayes and G. Armitage, "Revisiting TCP Congestion Control using Delay
  * Gradients", in IFIP Networking, Valencia, Spain, 9-13 May 2011.
  *
  * Developed as part of the NewTCP research project at Swinburne University of
  * Technology's Centre for Advanced Internet Architectures, Melbourne,
  * Australia. More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/vnet.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/cc/cc.h>
 #include <netinet/cc/cc_module.h>
 
 #include <netinet/khelp/h_ertt.h>
 
 #include <vm/uma.h>
 
 #define	CDG_VERSION "0.1"
 
 /* Private delay-gradient induced congestion control signal. */
 #define	CC_CDG_DELAY 0x01000000
 
 /* NewReno window deflation factor on loss (as a percentage). */
 #define	RENO_BETA 50
 
 /* Queue states. */
 #define	CDG_Q_EMPTY	1
 #define	CDG_Q_RISING	2
 #define	CDG_Q_FALLING	3
 #define	CDG_Q_FULL	4
 #define	CDG_Q_UNKNOWN	9999
 
 /* Number of bit shifts used in probexp lookup table. */
 #define	EXP_PREC 15
 
 /* Largest gradient represented in probexp lookup table. */
 #define	MAXGRAD 5
 
 /*
  * Delay Precision Enhance - number of bit shifts used for qtrend related
  * integer arithmetic precision.
  */
 #define	D_P_E 7
 
 /*
  * One entry of a moving-average window: a single per-RTT RTT difference
  * (already shifted left by D_P_E), linked into either the qdiffmin_q or the
  * qdiffmax_q list of struct cdg. Entries come from qdiffsample_zone.
  */
 struct qdiff_sample {
 	/* the RTT difference sample held by this entry */
 	long qdiff;
 	/* singly-linked tail queue linkage */
 	STAILQ_ENTRY(qdiff_sample) qdiff_lnk;
 };
 
 /*
  * Per-connection CDG state, allocated in cdg_cb_init() and stored in
  * ccv->cc_data.
  */
 struct cdg {
 	/* smoothed gradient of the per-RTT maximum RTT (scaled by 2^D_P_E) */
 	long max_qtrend;
 	/* smoothed gradient of the per-RTT minimum RTT (scaled by 2^D_P_E) */
 	long min_qtrend;
 	/* moving-average windows of the min and max RTT difference samples */
 	STAILQ_HEAD(minrtts_head, qdiff_sample) qdiffmin_q;
 	STAILQ_HEAD(maxrtts_head, qdiff_sample) qdiffmax_q;
 	/* segments added to cwnd per RTT when alpha_inc is non-zero */
 	long window_incr;
 	/* rttcount for window increase when in congestion avoidance */
 	long rtt_count;
 	/* maximum measured rtt within an rtt period */
 	int maxrtt_in_rtt;
 	/* maximum measured rtt within prev rtt period */
 	int maxrtt_in_prevrtt;
 	/* minimum measured rtt within an rtt period */
 	int minrtt_in_rtt;
 	/* minimum measured rtt within prev rtt period */
 	int minrtt_in_prevrtt;
 	/* consecutive congestion episode counter */
 	uint32_t consec_cong_cnt;
 	/* when tracking a new reno type loss window */
 	uint32_t shadow_w;
 	/*
 	 * maximum number of samples in the moving average queue; copied from
 	 * V_cdg_smoothing_factor when the control block is created
 	 */
 	int sample_q_size;
 	/* number of samples in the moving average queue */
 	int num_samples;
 	/* estimate of the queue state of the path (one of CDG_Q_*) */
 	int queue_state;
 };
 
 /*
  * Lookup table for:
  *   (1 - exp(-x)) << EXP_PREC, where x = [0,MAXGRAD] in 2^-7 increments
  *
  * That is (MAXGRAD << D_P_E) + 1 = 641 entries, indexed in prob_backoff() by
  * the D_P_E-scaled gradient divided by the exp_backoff_scale tunable.
  *
  * Note: probexp[0] is set to 10 (not 0) as a safety for very low increase
  * gradients.
  */
 static const int probexp[641] = {
    10,255,508,759,1008,1255,1501,1744,1985,2225,2463,2698,2932,3165,3395,3624,
    3850,4075,4299,4520,4740,4958,5175,5389,5602,5814,6024,6232,6438,6643,6846,
    7048,7248,7447,7644,7839,8033,8226,8417,8606,8794,8981,9166,9350,9532,9713,
    9892,10070,10247,10422,10596,10769,10940,11110,11278,11445,11611,11776,11939,
    12101,12262,12422,12580,12737,12893,13048,13201,13354,13505,13655,13803,13951,
    14097,14243,14387,14530,14672,14813,14952,15091,15229,15365,15500,15635,15768,
    15900,16032,16162,16291,16419,16547,16673,16798,16922,17046,17168,17289,17410,
    17529,17648,17766,17882,17998,18113,18227,18340,18453,18564,18675,18784,18893,
    19001,19108,19215,19320,19425,19529,19632,19734,19835,19936,20036,20135,20233,
    20331,20427,20523,20619,20713,20807,20900,20993,21084,21175,21265,21355,21444,
    21532,21619,21706,21792,21878,21962,22046,22130,22213,22295,22376,22457,22537,
    22617,22696,22774,22852,22929,23006,23082,23157,23232,23306,23380,23453,23525,
    23597,23669,23739,23810,23879,23949,24017,24085,24153,24220,24286,24352,24418,
    24483,24547,24611,24675,24738,24800,24862,24924,24985,25045,25106,25165,25224,
    25283,25341,25399,25456,25513,25570,25626,25681,25737,25791,25846,25899,25953,
    26006,26059,26111,26163,26214,26265,26316,26366,26416,26465,26514,26563,26611,
    26659,26707,26754,26801,26847,26893,26939,26984,27029,27074,27118,27162,27206,
    27249,27292,27335,27377,27419,27460,27502,27543,27583,27624,27664,27703,27743,
    27782,27821,27859,27897,27935,27973,28010,28047,28084,28121,28157,28193,28228,
    28263,28299,28333,28368,28402,28436,28470,28503,28536,28569,28602,28634,28667,
    28699,28730,28762,28793,28824,28854,28885,28915,28945,28975,29004,29034,29063,
    29092,29120,29149,29177,29205,29232,29260,29287,29314,29341,29368,29394,29421,
    29447,29472,29498,29524,29549,29574,29599,29623,29648,29672,29696,29720,29744,
    29767,29791,29814,29837,29860,29882,29905,29927,29949,29971,29993,30014,30036,
    30057,30078,30099,30120,30141,30161,30181,30201,30221,30241,30261,30280,30300,
    30319,30338,30357,30376,30394,30413,30431,30449,30467,30485,30503,30521,30538,
    30555,30573,30590,30607,30624,30640,30657,30673,30690,30706,30722,30738,30753,
    30769,30785,30800,30815,30831,30846,30861,30876,30890,30905,30919,30934,30948,
    30962,30976,30990,31004,31018,31031,31045,31058,31072,31085,31098,31111,31124,
    31137,31149,31162,31174,31187,31199,31211,31223,31235,31247,31259,31271,31283,
    31294,31306,31317,31328,31339,31351,31362,31373,31383,31394,31405,31416,31426,
    31436,31447,31457,31467,31477,31487,31497,31507,31517,31527,31537,31546,31556,
    31565,31574,31584,31593,31602,31611,31620,31629,31638,31647,31655,31664,31673,
    31681,31690,31698,31706,31715,31723,31731,31739,31747,31755,31763,31771,31778,
    31786,31794,31801,31809,31816,31824,31831,31838,31846,31853,31860,31867,31874,
    31881,31888,31895,31902,31908,31915,31922,31928,31935,31941,31948,31954,31960,
    31967,31973,31979,31985,31991,31997,32003,32009,32015,32021,32027,32033,32038,
    32044,32050,32055,32061,32066,32072,32077,32083,32088,32093,32098,32104,32109,
    32114,32119,32124,32129,32134,32139,32144,32149,32154,32158,32163,32168,32173,
    32177,32182,32186,32191,32195,32200,32204,32209,32213,32217,32222,32226,32230,
    32234,32238,32242,32247,32251,32255,32259,32263,32267,32270,32274,32278,32282,
    32286,32290,32293,32297,32301,32304,32308,32311,32315,32318,32322,32325,32329,
    32332,32336,32339,32342,32346,32349,32352,32356,32359,32362,32365,32368,32371,
    32374,32377,32381,32384,32387,32389,32392,32395,32398,32401,32404,32407,32410,
    32412,32415,32418,32421,32423,32426,32429,32431,32434,32437,32439,32442,32444,
    32447,32449,32452,32454,32457,32459,32461,32464,32466,32469,32471,32473,32476,
    32478,32480,32482,32485,32487,32489,32491,32493,32495,32497,32500,32502,32504,
    32506,32508,32510,32512,32514,32516,32518,32520,32522,32524,32526,32527,32529,
    32531,32533,32535,32537,32538,32540,32542,32544,32545,32547};
 
 /*
  * UMA zone backing every struct qdiff_sample; created in cdg_mod_init() and
  * destroyed in cdg_mod_destroy().
  */
 static uma_zone_t qdiffsample_zone;
 
 /* malloc(9) type used for the per-connection struct cdg. */
 static MALLOC_DEFINE(M_CDG, "cdg data",
   "Per connection data required for the CDG congestion control algorithm");
 
 /* Khelp id of the "ertt" helper, looked up once in cdg_mod_init(). */
 static int ertt_id;
 
 /*
  * Per-vnet tunables. Their defaults are assigned in cdg_init_vnet() and each
  * is exposed through a sysctl under net.inet.tcp.cc.cdg.
  */
 VNET_DEFINE_STATIC(uint32_t, cdg_alpha_inc);
 VNET_DEFINE_STATIC(uint32_t, cdg_beta_delay);
 VNET_DEFINE_STATIC(uint32_t, cdg_beta_loss);
 VNET_DEFINE_STATIC(uint32_t, cdg_smoothing_factor);
 VNET_DEFINE_STATIC(uint32_t, cdg_exp_backoff_scale);
 VNET_DEFINE_STATIC(uint32_t, cdg_consec_cong);
 VNET_DEFINE_STATIC(uint32_t, cdg_hold_backoff);
 /* Accessors resolving each tunable in the current vnet. */
 #define	V_cdg_alpha_inc		VNET(cdg_alpha_inc)
 #define	V_cdg_beta_delay	VNET(cdg_beta_delay)
 #define	V_cdg_beta_loss		VNET(cdg_beta_loss)
 #define	V_cdg_smoothing_factor	VNET(cdg_smoothing_factor)
 #define	V_cdg_exp_backoff_scale	VNET(cdg_exp_backoff_scale)
 #define	V_cdg_consec_cong	VNET(cdg_consec_cong)
 #define	V_cdg_hold_backoff	VNET(cdg_hold_backoff)
 
 /* Forward declarations of the hooks wired into cdg_cc_algo. */
 
 /* Module load and unload. */
 static int cdg_mod_init(void);
 static int cdg_mod_destroy(void);
 
 /* Per-connection control block setup and teardown. */
 static int cdg_cb_init(struct cc_var *ccv);
 static void cdg_cb_destroy(struct cc_var *ccv);
 static void cdg_conn_init(struct cc_var *ccv);
 
 /* Per-connection events. */
 static void cdg_ack_received(struct cc_var *ccv, uint16_t ack_type);
 static void cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type);
 
 /*
  * Algorithm descriptor registered by DECLARE_CC_MODULE(). The post_recovery
  * and after_idle hooks are not listed here; cdg_mod_init() copies them from
  * newreno_cc_algo at load time.
  */
 struct cc_algo cdg_cc_algo = {
 	.name = "cdg",
 	.mod_init = cdg_mod_init,
 	.mod_destroy = cdg_mod_destroy,
 	.cb_init = cdg_cb_init,
 	.cb_destroy = cdg_cb_destroy,
 	.conn_init = cdg_conn_init,
 	.ack_received = cdg_ack_received,
 	.cong_signal = cdg_cong_signal
 };
 
 /*
  * Per-vnet initialiser: load the default value of every CDG tunable into the
  * vnet that is current when this runs. Invoked through VNET_SYSINIT and, for
  * each existing vnet, from cdg_mod_init().
  */
 static void
 cdg_init_vnet(const void *unused __unused)
 {
 
 	/* Window increase and decrease parameters. */
 	V_cdg_alpha_inc = 0;
 	V_cdg_beta_loss = 50;
 	V_cdg_beta_delay = 70;
 
 	/* Gradient smoothing and backoff probability scaling. */
 	V_cdg_exp_backoff_scale = 3;
 	V_cdg_smoothing_factor = 8;
 
 	/* Thresholds for coexisting with loss-based flows. */
 	V_cdg_hold_backoff = 5;
 	V_cdg_consec_cong = 5;
 }
 
 static int
 cdg_mod_init(void)
 {
 	VNET_ITERATOR_DECL(v);
 
 	ertt_id = khelp_get_id("ertt");
 	if (ertt_id <= 0)
 		return (EINVAL);
 
 	qdiffsample_zone = uma_zcreate("cdg_qdiffsample",
 	    sizeof(struct qdiff_sample), NULL, NULL, NULL, NULL, 0, 0);
 
 	VNET_LIST_RLOCK();
 	VNET_FOREACH(v) {
 		CURVNET_SET(v);
 		cdg_init_vnet(NULL);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK();
 
 	cdg_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
 	cdg_cc_algo.after_idle = newreno_cc_algo.after_idle;
 
 	return (0);
 }
 
 /*
  * Module unload hook: destroy the UMA zone that cdg_mod_init() created for
  * qdiff_sample entries. Always returns 0.
  */
 static int
 cdg_mod_destroy(void)
 {
 
 	uma_zdestroy(qdiffsample_zone);
 	return (0);
 }
 
 /*
  * Allocate and reset the per-connection CDG state and attach it to
  * ccv->cc_data.
  *
  * The allocation is non-sleeping; ENOMEM is returned when it fails and
  * ccv->cc_data is then left untouched. Returns 0 on success.
  */
 static int
 cdg_cb_init(struct cc_var *ccv)
 {
 	struct cdg *state;
 
 	state = malloc(sizeof(*state), M_CDG, M_NOWAIT);
 	if (state == NULL)
 		return (ENOMEM);
 
 	/* Empty moving-average windows, sized from the current tunable. */
 	STAILQ_INIT(&state->qdiffmin_q);
 	STAILQ_INIT(&state->qdiffmax_q);
 	state->num_samples = 0;
 	state->sample_q_size = V_cdg_smoothing_factor;
 
 	/* No gradient information yet. */
 	state->min_qtrend = 0;
 	state->max_qtrend = 0;
 	state->queue_state = CDG_Q_UNKNOWN;
 
 	/* Per-RTT extremes: the running minimum starts at INT_MAX. */
 	state->minrtt_in_rtt = INT_MAX;
 	state->minrtt_in_prevrtt = 0;
 	state->maxrtt_in_rtt = 0;
 	state->maxrtt_in_prevrtt = 0;
 
 	/* Window growth and congestion bookkeeping. */
 	state->rtt_count = 0;
 	state->window_incr = 0;
 	state->consec_cong_cnt = 0;
 	state->shadow_w = 0;
 
 	ccv->cc_data = state;
 
 	return (0);
 }
 
 /*
  * Connection-initialisation hook: seed the shadow window from the
  * connection's current congestion window.
  */
 static void
 cdg_conn_init(struct cc_var *ccv)
 {
 	struct cdg *cdg_data = ccv->cc_data;
 
 	/*
 	 * Initialise the shadow window (shadow_w) in case we are competing
 	 * with loss based flows from the start
 	 */
 	cdg_data->shadow_w = CCV(ccv, snd_cwnd);
 }
 
 static void
 cdg_cb_destroy(struct cc_var *ccv)
 {
 	struct cdg *cdg_data;
 	struct qdiff_sample *qds, *qds_n;
 
 	cdg_data = ccv->cc_data;
 
 	qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
 	while (qds != NULL) {
 		qds_n = STAILQ_NEXT(qds, qdiff_lnk);
 		uma_zfree(qdiffsample_zone,qds);
 		qds = qds_n;
 	}
 
 	qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
 	while (qds != NULL) {
 		qds_n = STAILQ_NEXT(qds, qdiff_lnk);
 		uma_zfree(qdiffsample_zone,qds);
 		qds = qds_n;
 	}
 
 	free(ccv->cc_data, M_CDG);
 }
 
 static int
 cdg_beta_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = *(uint32_t *)arg1;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (new == 0 || new > 100)
 			error = EINVAL;
 		else
 			*(uint32_t *)arg1 = new;
 	}
 
 	return (error);
 }
 
 static int
 cdg_exp_backoff_scale_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = *(uint32_t *)arg1;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (new < 1)
 			error = EINVAL;
 		else
 			*(uint32_t *)arg1 = new;
 	}
 
 	return (error);
 }
 
 /*
  * Return beta percent of the smaller of the connection's snd_wnd and owin.
  *
  * owin is the window being deflated and beta the multiplicative decrease
  * factor expressed as a percentage.
  */
 static inline uint32_t
 cdg_window_decrease(struct cc_var *ccv, unsigned long owin, unsigned int beta)
 {
 	unsigned long base;
 
 	base = ulmin(CCV(ccv, snd_wnd), owin);
 
 	return ((base * beta) / 100);
 }
 
 /*
  * Window increase function
  *
  * Growth is independent of the initial window size so that flows with small
  * windows are not discriminated against (i.e. fairness).
  *
  * - Slow start (cwnd <= ssthresh): cwnd and the shadow window grow by one
  *   segment per call, and the congestion-avoidance counters are reset.
  * - Congestion avoidance: growth happens only when new_measurement is set
  *   (once per RTT). The shadow window grows by one segment. With alpha_inc
  *   equal to 0 cwnd also grows by one segment; otherwise cwnd grows by
  *   window_incr segments, and window_incr is bumped every alpha_inc RTTs.
  *
  * NOTE(review): window_incr restarts from 0, so with alpha_inc > 1 the first
  * alpha_inc - 1 RTTs after a reset add nothing to cwnd - confirm this is
  * intended.
  *
  * Both windows are capped at TCP_MAXWIN << snd_scale. A shadow window of 0
  * (disabled) is left at 0.
  */
 static void
 cdg_window_increase(struct cc_var *ccv, int new_measurement)
 {
 	struct cdg *state = ccv->cc_data;
 	unsigned long max_win;
 	int cwnd_step = 0, shadow_step = 0;
 
 	if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) {
 		/* Slow start. */
 		cwnd_step = CCV(ccv, t_maxseg);
 		shadow_step = cwnd_step;
 		state->rtt_count = 0;
 		state->window_incr = 0;
 	} else if (new_measurement) {
 		/* Congestion avoidance, once per RTT. */
 		shadow_step = CCV(ccv, t_maxseg);
 		if (V_cdg_alpha_inc == 0) {
 			cwnd_step = CCV(ccv, t_maxseg);
 		} else {
 			state->rtt_count++;
 			if (state->rtt_count >= V_cdg_alpha_inc) {
 				state->window_incr++;
 				state->rtt_count = 0;
 			}
 			cwnd_step = CCV(ccv, t_maxseg) * state->window_incr;
 		}
 	}
 
 	max_win = TCP_MAXWIN << CCV(ccv, snd_scale);
 
 	if (state->shadow_w > 0)
 		state->shadow_w = ulmin(state->shadow_w + shadow_step,
 		    max_win);
 
 	CCV(ccv, snd_cwnd) = ulmin(CCV(ccv, snd_cwnd) + cwnd_step, max_win);
 }
 
 /*
  * React to a congestion signal.
  *
  * CC_CDG_DELAY (private, raised by cdg_ack_received()): deflate cwnd by
  * beta_delay, set ssthresh to the same value and enter congestion recovery.
  *
  * CC_NDUPACK: when a delay backoff is already in progress, or the inferred
  * queue state is below CDG_Q_FULL, ssthresh is set to the current cwnd.
  * Otherwise the loss is treated as congestion: the shadow window is halved
  * NewReno style and ssthresh becomes the larger of the shadow window and
  * cwnd deflated by beta_loss. Recovery is entered in both cases.
  *
  * Every other signal is handed to NewReno.
  */
 static void
 cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type)
 {
 	struct cdg *state = ccv->cc_data;
 	uint32_t deflated;
 
 	switch(signal_type) {
 	case CC_CDG_DELAY:
 		deflated = cdg_window_decrease(ccv, CCV(ccv, snd_cwnd),
 		    V_cdg_beta_delay);
 		CCV(ccv, snd_ssthresh) = deflated;
 		CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
 		CCV(ccv, snd_recover) = CCV(ccv, snd_max);
 		state->rtt_count = 0;
 		state->window_incr = 0;
 		ENTER_CONGRECOVERY(CCV(ccv, t_flags));
 		break;
 	case CC_NDUPACK:
 		if (!IN_CONGRECOVERY(CCV(ccv, t_flags)) &&
 		    state->queue_state >= CDG_Q_FULL) {
 			/*
 			 * Loss is likely to be congestion related. We have
 			 * inferred a queue full state, so have shadow window
 			 * react to loss as NewReno would.
 			 */
 			if (state->shadow_w > 0)
 				state->shadow_w = cdg_window_decrease(ccv,
 				    state->shadow_w, RENO_BETA);
 
 			deflated = cdg_window_decrease(ccv,
 			    CCV(ccv, snd_cwnd), V_cdg_beta_loss);
 			CCV(ccv, snd_ssthresh) = max(state->shadow_w,
 			    deflated);
 
 			state->rtt_count = 0;
 			state->window_incr = 0;
 		} else {
 			/*
 			 * Already responding to congestion OR we have guessed
 			 * no queue in the path is full.
 			 */
 			CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
 			CCV(ccv, snd_recover) = CCV(ccv, snd_max);
 		}
 		ENTER_RECOVERY(CCV(ccv, t_flags));
 		break;
 	default:
 		newreno_cc_algo.cong_signal(ccv, signal_type);
 		break;
 	}
 }
 
 /*
  * Decide at random whether to back off for the given gradient.
  *
  * A negative exponential is used so that sources with varying RTTs which
  * share the same link will, on average, have the same probability of
  * backoff over time:
  *
  * Prob_backoff = 1 - exp(-qtrend / V_cdg_exp_backoff_scale), where
  * V_cdg_exp_backoff_scale is the average qtrend for the exponential backoff.
  *
  * qtrend is a D_P_E-scaled gradient; callers only pass positive values.
  * Returns non-zero when the caller should back off.
  */
 static inline int
 prob_backoff(long qtrend)
 {
 	int idx, p;
 
 	/* Gradients beyond the range of the table always back off. */
 	if (qtrend > ((MAXGRAD * V_cdg_exp_backoff_scale) << D_P_E))
 		return (1);
 
 	/* Scale the gradient into a table index, rounding to nearest. */
 	if (V_cdg_exp_backoff_scale > 1)
 		idx = (qtrend + V_cdg_exp_backoff_scale / 2) /
 		    V_cdg_exp_backoff_scale;
 	else
 		idx = qtrend;
 
 	/*
 	 * Backoff probability proportional to rate of queue growth: stretch
 	 * the EXP_PREC-bit table value over [0, INT_MAX] and compare it with
 	 * a random draw.
 	 */
 	p = (INT_MAX / (1 << EXP_PREC)) * probexp[idx];
 
 	return (random() < p);
 }
 
 /*
  * Fold one pair of per-RTT gradient samples into the smoothed trends.
  *
  * min_qtrend and max_qtrend are running means over a window of
  * sample_q_size samples, held in qdiffmin_q and qdiffmax_q. While the window
  * is filling, each new sample is appended and contributes
  * qdiff / sample_q_size. Once it is full, the oldest entry is recycled: its
  * contribution is replaced by the new sample's and the entry moves to the
  * tail.
  *
  * The two queues always hold exactly num_samples entries each. A new pair
  * is committed only when both non-sleeping allocations succeed; on failure
  * the pair is dropped and no state changes. Counting a sample that was never
  * queued would later make the recycle path dereference the NULL head of an
  * empty queue, and would let the min and max windows drift apart.
  */
 static inline void
 calc_moving_average(struct cdg *cdg_data, long qdiff_max, long qdiff_min)
 {
 	struct qdiff_sample *qds_max, *qds_min;
 
 	/*
 	 * No usable window: behave as the unsmoothed case does and avoid
 	 * dividing by a non-positive window size.
 	 */
 	if (cdg_data->sample_q_size <= 0) {
 		cdg_data->max_qtrend = qdiff_max;
 		cdg_data->min_qtrend = qdiff_min;
 		return;
 	}
 
 	if (cdg_data->num_samples >= cdg_data->sample_q_size) {
 		/* Window full: recycle the oldest entry of each queue. */
 		qds_min = STAILQ_FIRST(&cdg_data->qdiffmin_q);
 		qds_max = STAILQ_FIRST(&cdg_data->qdiffmax_q);
 		if (qds_min == NULL || qds_max == NULL)
 			return;
 
 		/* Minimum RTT. */
 		cdg_data->min_qtrend = cdg_data->min_qtrend +
 		    (qdiff_min - qds_min->qdiff) / cdg_data->sample_q_size;
 		STAILQ_REMOVE_HEAD(&cdg_data->qdiffmin_q, qdiff_lnk);
 		qds_min->qdiff = qdiff_min;
 		STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds_min, qdiff_lnk);
 
 		/* Maximum RTT. */
 		cdg_data->max_qtrend = cdg_data->max_qtrend +
 		    (qdiff_max - qds_max->qdiff) / cdg_data->sample_q_size;
 		STAILQ_REMOVE_HEAD(&cdg_data->qdiffmax_q, qdiff_lnk);
 		qds_max->qdiff = qdiff_max;
 		STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds_max, qdiff_lnk);
 		return;
 	}
 
 	/* Window still filling: both entries are needed, or neither. */
 	qds_min = uma_zalloc(qdiffsample_zone, M_NOWAIT);
 	qds_max = uma_zalloc(qdiffsample_zone, M_NOWAIT);
 	if (qds_min == NULL || qds_max == NULL) {
 		if (qds_min != NULL)
 			uma_zfree(qdiffsample_zone, qds_min);
 		if (qds_max != NULL)
 			uma_zfree(qdiffsample_zone, qds_max);
 		return;
 	}
 
 	/* Minimum RTT. */
 	cdg_data->min_qtrend = cdg_data->min_qtrend +
 	    qdiff_min / cdg_data->sample_q_size;
 	qds_min->qdiff = qdiff_min;
 	STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds_min, qdiff_lnk);
 
 	/* Maximum RTT. */
 	cdg_data->max_qtrend = cdg_data->max_qtrend +
 	    qdiff_max / cdg_data->sample_q_size;
 	qds_max->qdiff = qdiff_max;
 	STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds_max, qdiff_lnk);
 
 	/* Count the pair only now that it is queued. */
 	++cdg_data->num_samples;
 }
 
 /*
  * ACK-processing hook: track the RTT extremes reported by the ertt helper,
  * and once per new ertt measurement update the gradient estimates, the
  * inferred queue state and the probabilistic backoff decision.
  *
  * When the backoff decision fires and the connection is not already in
  * recovery, either a CC_CDG_DELAY signal is raised or (after too many
  * consecutive episodes) the backoff is withheld. When it does not fire and
  * ack_type is CC_ACK, the window is grown via cdg_window_increase().
  */
 static void
 cdg_ack_received(struct cc_var *ccv, uint16_t ack_type)
 {
 	struct cdg *cdg_data;
 	struct ertt *e_t;
 	long qdiff_max, qdiff_min;
 	int congestion, new_measurement, slowstart;
 
 	cdg_data = ccv->cc_data;
 	e_t = (struct ertt *)khelp_get_osd(CCV(ccv, osd), ertt_id);
 	new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT;
 	congestion = 0;
 	/* Fold the current ertt RTT into this period's running max/min. */
 	cdg_data->maxrtt_in_rtt = imax(e_t->rtt, cdg_data->maxrtt_in_rtt);
 	cdg_data->minrtt_in_rtt = imin(e_t->rtt, cdg_data->minrtt_in_rtt);
 
 	if (new_measurement) {
 		slowstart = (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh));
 		/*
 		 * Update smoothed gradient measurements. Since we are only
 		 * using one measurement per RTT, use max or min rtt_in_rtt.
 		 * This is also less noisy than a sample RTT measurement. Max
 		 * RTT measurements can have trouble due to OS issues.
 		 *
 		 * maxrtt_in_prevrtt is 0 on a fresh connection and after a
 		 * backoff decision (see below), in which case there is no
 		 * previous period to difference against and this is skipped.
 		 */
 		if (cdg_data->maxrtt_in_prevrtt) {
 			qdiff_max = ((long)(cdg_data->maxrtt_in_rtt -
 			    cdg_data->maxrtt_in_prevrtt) << D_P_E );
 			qdiff_min = ((long)(cdg_data->minrtt_in_rtt -
 			    cdg_data->minrtt_in_prevrtt) << D_P_E );
 
 			/* A window size of 0 means no smoothing. */
 			if (cdg_data->sample_q_size == 0) {
 				cdg_data->max_qtrend = qdiff_max;
 				cdg_data->min_qtrend = qdiff_min;
 			} else
 				calc_moving_average(cdg_data, qdiff_max, qdiff_min);
 
 			/*
 			 * Probabilistic backoff with respect to gradient.
 			 * In slow start the raw per-RTT difference is used
 			 * ahead of the smoothed trend; the minimum-RTT based
 			 * values take precedence over the maximum-RTT ones.
 			 */
 			if (slowstart && qdiff_min > 0)
 				congestion = prob_backoff(qdiff_min);
 			else if (cdg_data->min_qtrend > 0)
 				congestion = prob_backoff(cdg_data->min_qtrend);
 			else if (slowstart && qdiff_max > 0)
 				congestion = prob_backoff(qdiff_max);
 			else if (cdg_data->max_qtrend > 0)
 				congestion = prob_backoff(cdg_data->max_qtrend);
 			
 			/*
 			 * Update estimate of queue state.
 			 * The tests are ordered: min > 0 with max < 0 is
 			 * classified as FULL because that test comes first.
 			 * Combinations matching none of the tests leave the
 			 * previous state in place.
 			 */
 			if (cdg_data->min_qtrend > 0 &&
 			    cdg_data->max_qtrend <= 0) {
 				cdg_data->queue_state = CDG_Q_FULL;
 			} else if (cdg_data->min_qtrend >= 0 &&
 			    cdg_data->max_qtrend < 0) {
 				cdg_data->queue_state = CDG_Q_EMPTY;
 				/* Empty queue: stop tracking a shadow window. */
 				cdg_data->shadow_w = 0;
 			} else if (cdg_data->min_qtrend > 0 &&
 			    cdg_data->max_qtrend > 0) {
 				cdg_data->queue_state = CDG_Q_RISING;
 			} else if (cdg_data->min_qtrend < 0 &&
 			    cdg_data->max_qtrend < 0) {
 				cdg_data->queue_state = CDG_Q_FALLING;
 			}
 
 			/* Any falling trend ends the run of congestion episodes. */
 			if (cdg_data->min_qtrend < 0 ||
 			    cdg_data->max_qtrend < 0)
 				cdg_data->consec_cong_cnt = 0;
 		}
 
 		/* Roll this period's extremes over and start a new period. */
 		cdg_data->minrtt_in_prevrtt = cdg_data->minrtt_in_rtt;
 		cdg_data->minrtt_in_rtt = INT_MAX;
 		cdg_data->maxrtt_in_prevrtt = cdg_data->maxrtt_in_rtt;
 		cdg_data->maxrtt_in_rtt = 0;
 		/* Consume the measurement so it is processed only once. */
 		e_t->flags &= ~ERTT_NEW_MEASUREMENT;
 	}
 
 	if (congestion) {
 		cdg_data->consec_cong_cnt++;
 		if (!IN_RECOVERY(CCV(ccv, t_flags))) {
 			/*
 			 * Note the unbraced else: it pairs with the
 			 * consec_cong_cnt test, and its body is the single
 			 * inner if statement. The two assignments that follow
 			 * run in both cases.
 			 */
 			if (cdg_data->consec_cong_cnt <= V_cdg_consec_cong)
 				cdg_cong_signal(ccv, CC_CDG_DELAY);
 			else
 				/*
 				 * We have been backing off but the queue is not
 				 * falling. Assume we are competing with
 				 * loss-based flows and don't back off for the
 				 * next V_cdg_hold_backoff RTT periods.
 				 */
 				if (cdg_data->consec_cong_cnt >=
 				    V_cdg_consec_cong + V_cdg_hold_backoff)
 					cdg_data->consec_cong_cnt = 0;
 
 			/* Won't see effect until 2nd RTT. */
 			cdg_data->maxrtt_in_prevrtt = 0;
 			/*
 			 * Resync shadow window in case we are competing with a
 			 * loss based flow
 			 */
 			cdg_data->shadow_w = ulmax(CCV(ccv, snd_cwnd),
 			    cdg_data->shadow_w);
 		}
 	} else if (ack_type == CC_ACK)
 		cdg_window_increase(ccv, new_measurement);
 }
 
 /* When a vnet is created and being initialised, init the per-stack CDG vars. */
 VNET_SYSINIT(cdg_init_vnet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
     cdg_init_vnet, NULL);
 
 /*
  * Tunables exported under net.inet.tcp.cc.cdg.
  *
  * The values in effect are the ones assigned by cdg_init_vnet(). For the
  * SYSCTL_PROC entries the numeric argument following the variable (70, 50,
  * 2) is not consulted: both handlers pass 0 to sysctl_handle_int() and never
  * read arg2. In particular the 2 given for exp_backoff_scale does not
  * override the default of 3.
  */
 SYSCTL_DECL(_net_inet_tcp_cc_cdg);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, cdg, CTLFLAG_RW, NULL,
     "CAIA delay-gradient congestion control related settings");
 
 SYSCTL_STRING(_net_inet_tcp_cc_cdg, OID_AUTO, version,
     CTLFLAG_RD, CDG_VERSION, sizeof(CDG_VERSION) - 1,
     "Current algorithm/implementation version number");
 
 /* 0 selects a fixed one-segment-per-RTT increase in cdg_window_increase(). */
 SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, alpha_inc,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_alpha_inc), 0,
     "Increment the window increase factor alpha by 1 MSS segment every "
     "alpha_inc RTTs during congestion avoidance mode.");
 
 /* Writes are validated by cdg_beta_handler(): accepted range is 1..100. */
 SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_delay,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(cdg_beta_delay), 70,
     &cdg_beta_handler, "IU",
     "Delay-based window decrease factor as a percentage "
     "(on delay-based backoff, w = w * beta_delay / 100)");
 
 /* Writes are validated by cdg_beta_handler(): accepted range is 1..100. */
 SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_loss,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(cdg_beta_loss), 50,
     &cdg_beta_handler, "IU",
     "Loss-based window decrease factor as a percentage "
     "(on loss-based backoff, w = w * beta_loss / 100)");
 
 /* Writes of 0 are rejected by cdg_exp_backoff_scale_handler(). */
 SYSCTL_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, exp_backoff_scale,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
     &VNET_NAME(cdg_exp_backoff_scale), 2, &cdg_exp_backoff_scale_handler, "IU",
     "Scaling parameter for the probabilistic exponential backoff");
 
 /*
  * Read once per connection in cdg_cb_init(); changing it does not resize
  * the window of a connection whose control block already exists.
  */
 SYSCTL_UINT(_net_inet_tcp_cc_cdg,  OID_AUTO, smoothing_factor,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_smoothing_factor), 8,
     "Number of samples used for moving average smoothing (0 = no smoothing)");
 
 SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_consec_cong,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_consec_cong), 5,
     "Number of consecutive delay-gradient based congestion episodes which will "
     "trigger loss based CC compatibility");
 
 SYSCTL_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_hold_backoff,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(cdg_hold_backoff), 5,
     "Number of consecutive delay-gradient based congestion episodes to hold "
     "the window backoff for loss based CC compatibility");
 
 DECLARE_CC_MODULE(cdg, &cdg_cc_algo);
-
+MODULE_VERSION(cdg, 1);
 MODULE_DEPEND(cdg, ertt, 1, 1, 1);
Index: stable/12/sys/netinet/cc/cc_chd.c
===================================================================
--- stable/12/sys/netinet/cc/cc_chd.c	(revision 364376)
+++ stable/12/sys/netinet/cc/cc_chd.c	(revision 364377)
@@ -1,496 +1,497 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009-2010
  *	Swinburne University of Technology, Melbourne, Australia
  * Copyright (c) 2010-2011 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by David Hayes and
  * Lawrence Stewart, made possible in part by a grant from the Cisco University
  * Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, Melbourne, Australia by
  * David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * An implementation of the CAIA-Hamilton delay based congestion control
  * algorithm, based on "Improved coexistence and loss tolerance for delay based
  * TCP congestion control" by D. A. Hayes and G. Armitage., in 35th Annual IEEE
  * Conference on Local Computer Networks (LCN 2010), Denver, Colorado, USA,
  * 11-14 October 2010.
  *
  * Originally released as part of the NewTCP research project at Swinburne
  * University of Technology's Centre for Advanced Internet Architectures,
  * Melbourne, Australia, which was made possible in part by a grant from the
  * Cisco University Research Program Fund at Community Foundation Silicon
  * Valley. More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/vnet.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/cc/cc.h>
 #include <netinet/cc/cc_module.h>
 
 #include <netinet/khelp/h_ertt.h>
 
 /*
  * Private signal type for the delay based congestion signal.
  * See <netinet/cc/cc.h> for the appropriate bit-range to use for private
  * signals.
  */
 #define	CC_CHD_DELAY	0x02000000
 
 /* Largest possible number returned by random(). */
 #define	RANDOM_MAX	INT_MAX
 
 static void	chd_ack_received(struct cc_var *ccv, uint16_t ack_type);
 static void	chd_cb_destroy(struct cc_var *ccv);
 static int	chd_cb_init(struct cc_var *ccv);
 static void	chd_cong_signal(struct cc_var *ccv, uint32_t signal_type);
 static void	chd_conn_init(struct cc_var *ccv);
 static int	chd_mod_init(void);
 
 /*
  * Per-connection CHD state. Allocated in chd_cb_init(), reached through
  * ccv->cc_data and released in chd_cb_destroy().
  */
 struct chd {
 	/*
 	 * Shadow window - keeps track of what the NewReno congestion window
 	 * would have been if delay-based cwnd backoffs had not been made. This
 	 * functionality aids coexistence with loss-based TCP flows which may be
 	 * sharing links along the path.
 	 *
 	 * A value of 0 means the shadow window is not currently being tracked
 	 * (see chd_window_increase() and the reset in chd_ack_received()).
 	 */
 	unsigned long shadow_w;
 	/*
 	 * Loss-based TCP compatibility flag - When set, it turns on the shadow
 	 * window functionality. Maintained by should_backoff() and
 	 * chd_ack_received().
 	 */
 	int loss_compete;
 	/* The maximum round trip time seen within a measured rtt period. */
 	int maxrtt_in_rtt;
 	/* The previous qdly that caused cwnd to backoff. */
 	int prev_backoff_qdly;
 };
 
 /* Khelp id of the "ertt" helper; looked up once in chd_mod_init(). */
 static int ertt_id;
 
 /*
  * Per-VNET tunables, exported through the net.inet.tcp.cc.chd sysctl node
  * declared near the end of this file.
  */
 /* Minimum queueing delay threshold in ticks. */
 VNET_DEFINE_STATIC(uint32_t, chd_qmin) = 5;
 /* Per RTT maximum backoff probability as a percentage. */
 VNET_DEFINE_STATIC(uint32_t, chd_pmax) = 50;
 /* Flag enabling the shadow window functionality. */
 VNET_DEFINE_STATIC(uint32_t, chd_loss_fair) = 1;
 /* Use the maximum RTT seen within the measurement period as the delay. */
 VNET_DEFINE_STATIC(uint32_t, chd_use_max) = 1;
 /* Queueing congestion threshold in ticks. */
 VNET_DEFINE_STATIC(uint32_t, chd_qthresh) = 20;
 #define	V_chd_qthresh	VNET(chd_qthresh)
 #define	V_chd_qmin	VNET(chd_qmin)
 #define	V_chd_pmax	VNET(chd_pmax)
 #define	V_chd_loss_fair	VNET(chd_loss_fair)
 #define	V_chd_use_max	VNET(chd_use_max)
 
 static MALLOC_DEFINE(M_CHD, "chd data",
     "Per connection data required for the CHD congestion control algorithm");
 
 /*
  * CHD's hook table. The after_idle and post_recovery hooks are not listed
  * here; chd_mod_init() copies them from newreno_cc_algo at module load.
  */
 struct cc_algo chd_cc_algo = {
 	.name = "chd",
 	.ack_received = chd_ack_received,
 	.cb_destroy = chd_cb_destroy,
 	.cb_init = chd_cb_init,
 	.cong_signal = chd_cong_signal,
 	.conn_init = chd_conn_init,
 	.mod_init = chd_mod_init
 };
 
 /*
  * Set snd_ssthresh to roughly half of the currently usable window (the
  * smaller of the send and congestion windows), working in whole segments and
  * never going below two segments. snd_cwnd itself is left to the caller.
  */
 static __inline void
 chd_window_decrease(struct cc_var *ccv)
 {
 	unsigned long win;
 
 	win = min(CCV(ccv, snd_wnd), CCV(ccv, snd_cwnd)) / CCV(ccv, t_maxseg);
 	/*
 	 * A usable window smaller than one segment yields win == 0. Taking
 	 * the minimum decrement of 1 away from that would wrap the unsigned
 	 * value and produce an enormous ssthresh rather than the intended
 	 * two segment floor, so only decrement a non-zero window.
 	 */
 	if (win > 0)
 		win -= max((win / 2), 1);
 	CCV(ccv, snd_ssthresh) = max(win, 2) * CCV(ccv, t_maxseg);
 }
 
 /*
  * Probabilistic backoff function. Returns 1 if we should backoff or 0
  * otherwise. The calculation of p is similar to the calculation of p in cc_hd.
  *
  * qdly is the current queueing delay and maxqdly the largest queueing delay
  * supplied by the caller; the only caller in this file invokes this when
  * qdly > V_chd_qmin. p is a threshold on the same scale as the value drawn
  * from random(): it grows linearly with (qdly - qmin) below qthresh, equals
  * pmax percent of RANDOM_MAX at qthresh, and shrinks linearly with
  * (maxqdly - qdly) above qthresh.
  *
  * Side effect: chd_data->loss_compete is cleared when qdly <= qthresh and set
  * when qdly > qthresh, loss_fair is enabled and the draw falls below p.
  *
  * NOTE(review): the divisor (maxqdly - V_chd_qthresh) relies on maxqdly being
  * greater than qthresh whenever qdly is; confirm ertt guarantees
  * maxrtt >= rtt.
  */
 static __inline int
 should_backoff(int qdly, int maxqdly, struct chd *chd_data)
 {
 	unsigned long p, rand;
 
 	rand = random();
 
 	if (qdly < V_chd_qthresh) {
 		/* Below the threshold: probability rises with delay. */
 		chd_data->loss_compete = 0;
 		p = (((RANDOM_MAX / 100) * V_chd_pmax) /
 		    (V_chd_qthresh - V_chd_qmin)) *
 		    (qdly - V_chd_qmin);
 	} else {
 		if (qdly > V_chd_qthresh) {
 			/* Above the threshold: probability falls with delay. */
 			p = (((RANDOM_MAX / 100) * V_chd_pmax) /
 			    (maxqdly - V_chd_qthresh)) *
 			    (maxqdly - qdly);
 			if (V_chd_loss_fair && rand < p)
 				chd_data->loss_compete = 1;
 		} else {
 			/* Exactly at the threshold: maximum probability. */
 			p = (RANDOM_MAX / 100) * V_chd_pmax;
 			chd_data->loss_compete = 0;
 		}
 	}
 
 	return (rand < p);
 }
 
 /*
  * Grow snd_cwnd (and the shadow window, when it is being tracked) in response
  * to an ACK. Slow start follows the NewReno rules; in congestion avoidance
  * the window grows by one segment per window's worth of data when ABC is on,
  * or per new RTT measurement when it is off. Both windows are capped at
  * TCP_MAXWIN << snd_scale.
  */
 static __inline void
 chd_window_increase(struct cc_var *ccv, int new_measurement)
 {
 	struct chd *state = ccv->cc_data;
 	const unsigned int win_cap = TCP_MAXWIN << CCV(ccv, snd_scale);
 	int grow = 0;
 
 	if (CCV(ccv, snd_cwnd) > CCV(ccv, snd_ssthresh)) {
 		/* Congestion avoidance. */
 		if (!V_tcp_do_rfc3465) {
 			if (new_measurement)
 				grow = CCV(ccv, t_maxseg);
 		} else if (ccv->flags & CCF_ABC_SENTAWND) {
 			ccv->flags &= ~CCF_ABC_SENTAWND;
 			grow = CCV(ccv, t_maxseg);
 		}
 	} else if (!V_tcp_do_rfc3465) {
 		/* Slow start without ABC: one segment per ACK. */
 		grow = CCV(ccv, t_maxseg);
 	} else if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) {
 		/* Slow start with ABC, not following an RTO. */
 		grow = min(ccv->bytes_this_ack,
 		    V_tcp_abc_l_var * CCV(ccv, t_maxseg));
 	} else {
 		/* Slow start with ABC, following an RTO. */
 		grow = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
 	}
 
 	/* Keep the NewReno shadow window in step while it is active. */
 	if (state->shadow_w > 0)
 		state->shadow_w = min(state->shadow_w + grow, win_cap);
 
 	CCV(ccv, snd_cwnd) = min(CCV(ccv, snd_cwnd) + grow, win_cap);
 }
 
 /*
  * All ACK signals are used for timing measurements to determine delay-based
  * congestion. However, window increases are only performed when
  * ack_type == CC_ACK.
  *
  * On each call the largest RTT seen in the current measurement period is
  * updated. When ertt flags a new per-RTT measurement, the queueing delay
  * (rtt - minrtt) is evaluated and should_backoff() decides whether to raise a
  * delay based congestion signal. Otherwise a regular ACK grows the window.
  */
 static void
 chd_ack_received(struct cc_var *ccv, uint16_t ack_type)
 {
 	struct chd *chd_data;
 	struct ertt *e_t;
 	int backoff, new_measurement, qdly, rtt;
 
 	e_t = khelp_get_osd(CCV(ccv, osd), ertt_id);
 	chd_data = ccv->cc_data;
 	new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT;
 	backoff = qdly = 0;
 
 	chd_data->maxrtt_in_rtt = imax(e_t->rtt, chd_data->maxrtt_in_rtt);
 
 	if (new_measurement) {
 		/*
 		 * There is a new per RTT measurement, so check to see if there
 		 * is delay based congestion.
 		 */
 		rtt = V_chd_use_max ? chd_data->maxrtt_in_rtt : e_t->rtt;
 		/* Start collecting the maximum for the next period. */
 		chd_data->maxrtt_in_rtt = 0;
 
 		/* Skip the check without valid samples or while recovering. */
 		if (rtt && e_t->minrtt && !IN_RECOVERY(CCV(ccv, t_flags))) {
 			qdly = rtt - e_t->minrtt;
 			if (qdly > V_chd_qmin) {
 				/*
 				 * Probabilistic delay based congestion
 				 * indication.
 				 */
 				backoff = should_backoff(qdly,
 				    e_t->maxrtt - e_t->minrtt, chd_data);
 			} else
 				chd_data->loss_compete = 0;
 		}
 		/* Reset per RTT measurement flag to start a new measurement. */
 		e_t->flags &= ~ERTT_NEW_MEASUREMENT;
 	}
 
 	if (backoff) {
 		/*
 		 * Update shadow_w before delay based backoff.
 		 */
 		if (chd_data->loss_compete ||
 		    qdly > chd_data->prev_backoff_qdly) {
 			/*
 			 * Delay is higher than when we backed off previously,
 			 * so it is possible that this flow is competing with
 			 * loss based flows.
 			 */
 			chd_data->shadow_w = max(CCV(ccv, snd_cwnd),
 			    chd_data->shadow_w);
 		} else {
 			/*
 			 * Reset shadow_w, as it is probable that this flow is
 			 * not competing with loss based flows at the moment.
 			 */
 			chd_data->shadow_w = 0;
 		}
 
 		chd_data->prev_backoff_qdly = qdly;
 		/*
 		 * Send delay-based congestion signal to the congestion signal
 		 * handler.
 		 */
 		chd_cong_signal(ccv, CC_CHD_DELAY);
 
 	} else if (ack_type == CC_ACK)
 		chd_window_increase(ccv, new_measurement);
 }
 
 /*
  * Release the per-connection CHD state that chd_cb_init() allocated from
  * M_CHD.
  */
 static void
 chd_cb_destroy(struct cc_var *ccv)
 {
 
 	free(ccv->cc_data, M_CHD);
 }
 
 static int
 chd_cb_init(struct cc_var *ccv)
 {
 	struct chd *chd_data;
 
 	chd_data = malloc(sizeof(struct chd), M_CHD, M_NOWAIT);
 	if (chd_data == NULL)
 		return (ENOMEM);
 
 	chd_data->shadow_w = 0;
 	ccv->cc_data = chd_data;
 
 	return (0);
 }
 
 /*
  * Congestion signal handler. Handles the private delay based signal
  * (CC_CHD_DELAY) and packet loss (CC_NDUPACK) itself; every other signal type
  * is passed on to NewReno's handler.
  */
 static void
 chd_cong_signal(struct cc_var *ccv, uint32_t signal_type)
 {
 	struct ertt *e_t;
 	struct chd *chd_data;
 	int qdly;
 
 	e_t = khelp_get_osd(CCV(ccv, osd), ertt_id);
 	chd_data = ccv->cc_data;
 	/* Current queueing delay estimate; only the CC_NDUPACK case uses it. */
 	qdly = imax(e_t->rtt, chd_data->maxrtt_in_rtt) - e_t->minrtt;
 
 	switch(signal_type) {
 	case CC_CHD_DELAY:
 		chd_window_decrease(ccv); /* Set new ssthresh. */
 		CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
 		CCV(ccv, snd_recover) = CCV(ccv, snd_max);
 		ENTER_CONGRECOVERY(CCV(ccv, t_flags));
 		break;
 
 	case CC_NDUPACK: /* Packet loss. */
 		/*
 		 * Only react to loss as a congestion signal if qdly >
 		 * V_chd_qthresh.  If qdly is less than qthresh, presume that
 		 * this is a non congestion related loss. If qdly is greater
 		 * than qthresh, assume that we are competing with loss based
 		 * tcp flows and restore window from any unnecessary backoffs,
 		 * before the decrease.
 		 */
 		if (!IN_RECOVERY(CCV(ccv, t_flags)) && qdly > V_chd_qthresh) {
 			if (chd_data->loss_compete) {
 				CCV(ccv, snd_cwnd) = max(CCV(ccv, snd_cwnd),
 				    chd_data->shadow_w);
 			}
 			chd_window_decrease(ccv);
 		} else {
 			/*
 			 * This loss isn't congestion related, or already
 			 * recovering from congestion.
 			 */
 			CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
 			CCV(ccv, snd_recover) = CCV(ccv, snd_max);
 		}
 
 		/* Halve the shadow window too, with a two segment floor. */
 		if (chd_data->shadow_w > 0) {
 			chd_data->shadow_w = max(chd_data->shadow_w /
 			    CCV(ccv, t_maxseg) / 2, 2) * CCV(ccv, t_maxseg);
 		}
 		ENTER_FASTRECOVERY(CCV(ccv, t_flags));
 		break;
 
 	default:
 		newreno_cc_algo.cong_signal(ccv, signal_type);
 	}
 }
 
 /*
  * Reset the per-connection CHD state for a new connection. The shadow window
  * starts out equal to snd_cwnd in case the flow is competing with loss based
  * flows from the very beginning.
  */
 static void
 chd_conn_init(struct cc_var *ccv)
 {
 	struct chd *state = ccv->cc_data;
 
 	state->shadow_w = CCV(ccv, snd_cwnd);
 	state->loss_compete = 0;
 	state->maxrtt_in_rtt = 0;
 	state->prev_backoff_qdly = 0;
 }
 
 /*
  * Module load hook. Looks up the "ertt" khelp module that CHD takes its RTT
  * measurements from and borrows NewReno's after_idle and post_recovery
  * hooks. Returns 0 on success, or ENOENT when ertt cannot be found.
  */
 static int
 chd_mod_init(void)
 {
 
 	ertt_id = khelp_get_id("ertt");
 	if (ertt_id > 0) {
 		chd_cc_algo.after_idle = newreno_cc_algo.after_idle;
 		chd_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
 		return (0);
 	}
 
 	printf("%s: h_ertt module not found\n", __func__);
 	return (ENOENT);
 }
 
 /*
  * Sysctl handler for the loss_fair flag. Reads report the current value;
  * writes are accepted only for 0 or 1, anything else yields EINVAL.
  */
 static int
 chd_loss_fair_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t val;
 	int rc;
 
 	val = V_chd_loss_fair;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	/* Nothing more to do on failure or for a plain read. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (val > 1)
 		return (EINVAL);
 
 	V_chd_loss_fair = val;
 	return (0);
 }
 
 /*
  * Sysctl handler for pmax. Reads report the current value; writes must be a
  * percentage in the range 1..100, anything else yields EINVAL.
  */
 static int
 chd_pmax_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t val;
 	int rc;
 
 	val = V_chd_pmax;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	/* Nothing more to do on failure or for a plain read. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (val == 0 || val > 100)
 		return (EINVAL);
 
 	V_chd_pmax = val;
 	return (0);
 }
 
 /*
  * Sysctl handler for queue_threshold. Reads report the current value; a
  * write must be strictly greater than the current queue_min, otherwise
  * EINVAL is returned.
  */
 static int
 chd_qthresh_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t val;
 	int rc;
 
 	val = V_chd_qthresh;
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	/* Nothing more to do on failure or for a plain read. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (val <= V_chd_qmin)
 		return (EINVAL);
 
 	V_chd_qthresh = val;
 	return (0);
 }
 
 /*
  * Sysctl tree for CHD: net.inet.tcp.cc.chd. loss_fair, pmax and
  * queue_threshold are range checked by the handlers above; queue_min and
  * use_max are exposed without validation.
  *
  * NOTE(review): queue_threshold's handler rejects values <= queue_min, but
  * queue_min itself can later be raised past queue_threshold since it has no
  * handler - confirm whether that ordering needs enforcing in both directions.
  */
 SYSCTL_DECL(_net_inet_tcp_cc_chd);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, chd, CTLFLAG_RW, NULL,
     "CAIA Hamilton delay-based congestion control related settings");
 
 SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, loss_fair,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
     &VNET_NAME(chd_loss_fair), 1, &chd_loss_fair_handler,
     "IU", "Flag to enable shadow window functionality.");
 
 /*
  * NOTE(review): the arg2 value here (5) differs from chd_pmax's initial value
  * of 50; chd_pmax_handler() does not read arg2, so it appears to be unused.
  */
 SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, pmax,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
     &VNET_NAME(chd_pmax), 5, &chd_pmax_handler,
     "IU", "Per RTT maximum backoff probability as a percentage");
 
 SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, queue_threshold,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
     &VNET_NAME(chd_qthresh), 20, &chd_qthresh_handler,
     "IU", "Queueing congestion threshold in ticks");
 
 SYSCTL_UINT(_net_inet_tcp_cc_chd, OID_AUTO, queue_min,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(chd_qmin), 5,
     "Minimum queueing delay threshold in ticks");
 
 SYSCTL_UINT(_net_inet_tcp_cc_chd,  OID_AUTO, use_max,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(chd_use_max), 1,
     "Use the maximum RTT seen within the measurement period (RTT) "
     "as the basic delay measurement for the algorithm.");
 
 DECLARE_CC_MODULE(chd, &chd_cc_algo);
+MODULE_VERSION(chd, 1);
 MODULE_DEPEND(chd, ertt, 1, 1, 1);
Index: stable/12/sys/netinet/cc/cc_cubic.c
===================================================================
--- stable/12/sys/netinet/cc/cc_cubic.c	(revision 364376)
+++ stable/12/sys/netinet/cc/cc_cubic.c	(revision 364377)
@@ -1,475 +1,476 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2008-2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Lawrence Stewart while studying at the Centre
  * for Advanced Internet Architectures, Swinburne University of Technology, made
  * possible in part by a grant from the Cisco University Research Program Fund
  * at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced
  * Internet Architectures, Swinburne University of Technology, Melbourne,
  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * An implementation of the CUBIC congestion control algorithm for FreeBSD,
  * based on the Internet Draft "draft-rhee-tcpm-cubic-02" by Rhee, Xu and Ha.
  * Originally released as part of the NewTCP research project at Swinburne
  * University of Technology's Centre for Advanced Internet Architectures,
  * Melbourne, Australia, which was made possible in part by a grant from the
  * Cisco University Research Program Fund at Community Foundation Silicon
  * Valley. More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/vnet.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/cc/cc.h>
 #include <netinet/cc/cc_cubic.h>
 #include <netinet/cc/cc_module.h>
 
 static void	cubic_ack_received(struct cc_var *ccv, uint16_t type);
 static void	cubic_cb_destroy(struct cc_var *ccv);
 static int	cubic_cb_init(struct cc_var *ccv);
 static void	cubic_cong_signal(struct cc_var *ccv, uint32_t type);
 static void	cubic_conn_init(struct cc_var *ccv);
 static int	cubic_mod_init(void);
 static void	cubic_post_recovery(struct cc_var *ccv);
 static void	cubic_record_rtt(struct cc_var *ccv);
 static void	cubic_ssthresh_update(struct cc_var *ccv);
 static void	cubic_after_idle(struct cc_var *ccv);
 
 /*
  * Per-connection CUBIC state. Allocated (zeroed) in cubic_cb_init(), reached
  * through ccv->cc_data and released in cubic_cb_destroy().
  */
 struct cubic {
 	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
 	int64_t		K;
 	/* Sum of RTT samples across an epoch in ticks. */
 	int64_t		sum_rtt_ticks;
 	/* cwnd at the most recent congestion event. */
 	unsigned long	max_cwnd;
 	/* cwnd at the previous congestion event. */
 	unsigned long	prev_max_cwnd;
 	/* various flags */
 	uint32_t	flags;
 #define CUBICFLAG_CONG_EVENT	0x00000001	/* congestion experienced */
 #define CUBICFLAG_IN_SLOWSTART	0x00000002	/* in slow start */
 #define CUBICFLAG_IN_APPLIMIT	0x00000004	/* application limited */
 	/* Minimum observed rtt in ticks (TCPTV_SRTTBASE until first sample). */
 	int		min_rtt_ticks;
 	/* Mean observed rtt between congestion epochs. */
 	int		mean_rtt_ticks;
 	/* ACKs since last congestion event. */
 	int		epoch_ack_count;
 	/* Time of last congestion event in ticks. */
 	int		t_last_cong;
 };
 
 static MALLOC_DEFINE(M_CUBIC, "cubic data",
     "Per connection data required for the CUBIC congestion control algorithm");
 
 /* CUBIC's hook table; every hook is implemented in this file. */
 struct cc_algo cubic_cc_algo = {
 	.name = "cubic",
 	.ack_received = cubic_ack_received,
 	.cb_destroy = cubic_cb_destroy,
 	.cb_init = cubic_cb_init,
 	.cong_signal = cubic_cong_signal,
 	.conn_init = cubic_conn_init,
 	.mod_init = cubic_mod_init,
 	.post_recovery = cubic_post_recovery,
 	.after_idle = cubic_after_idle,
 };
 
 /*
  * ACK hook. Records the RTT sample, then, for a regular ACK outside recovery
  * on a cwnd limited connection, grows snd_cwnd: NewReno's rules are used in
  * slow start (or before a minimum RTT is known), otherwise the window follows
  * whichever of the TCP-friendly and CUBIC window estimates applies. A regular
  * ACK on a connection that is not cwnd limited only marks the flow as
  * application limited.
  */
 static void
 cubic_ack_received(struct cc_var *ccv, uint16_t type)
 {
 	struct cubic *cubic_data;
 	unsigned long w_tf, w_cubic_next;
 	int ticks_since_cong;
 
 	cubic_data = ccv->cc_data;
 	cubic_record_rtt(ccv);
 
 	/*
 	 * Regular ACK and we're not in cong/fast recovery and we're cwnd
 	 * limited and we're either not doing ABC or are slow starting or are
 	 * doing ABC and we've sent a cwnd's worth of bytes.
 	 */
 	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
 	    (ccv->flags & CCF_CWND_LIMITED) && (!V_tcp_do_rfc3465 ||
 	    CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
 	    (V_tcp_do_rfc3465 && ccv->flags & CCF_ABC_SENTAWND))) {
 		/* Use the logic in NewReno ack_received() for slow start. */
 		if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
 		    cubic_data->min_rtt_ticks == TCPTV_SRTTBASE) {
 			cubic_data->flags |= CUBICFLAG_IN_SLOWSTART;
 			newreno_cc_algo.ack_received(ccv, type);
 		} else {
 			if ((ticks_since_cong =
 			    ticks - cubic_data->t_last_cong) < 0) {
 				/*
 				 * dragging t_last_cong along
 				 */
 				ticks_since_cong = INT_MAX;
 				cubic_data->t_last_cong = ticks - INT_MAX;
 			}
 
 			/*
 			 * First ACK after leaving slow start or an application
 			 * limited period: restart the epoch from now.
 			 *
 			 * NOTE(review): ticks_since_cong was computed above
 			 * from the old t_last_cong and is not refreshed after
 			 * this reset - confirm that is intended for this ACK.
 			 */
 			if (cubic_data->flags & (CUBICFLAG_IN_SLOWSTART |
 						 CUBICFLAG_IN_APPLIMIT)) {
 				cubic_data->flags &= ~(CUBICFLAG_IN_SLOWSTART |
 						       CUBICFLAG_IN_APPLIMIT);
 				cubic_data->t_last_cong = ticks;
 				cubic_data->K = 0;
 			}
 			/*
 			 * The mean RTT is used to best reflect the equations in
 			 * the I-D. Using min_rtt in the tf_cwnd calculation
 			 * causes w_tf to grow much faster than it should if the
 			 * RTT is dominated by network buffering rather than
 			 * propagation delay.
 			 */
 			w_tf = tf_cwnd(ticks_since_cong,
 			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
 			    CCV(ccv, t_maxseg));
 
 			w_cubic_next = cubic_cwnd(ticks_since_cong +
 			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
 			    CCV(ccv, t_maxseg), cubic_data->K);
 
 			ccv->flags &= ~CCF_ABC_SENTAWND;
 
 			if (w_cubic_next < w_tf) {
 				/*
 				 * TCP-friendly region, follow tf
 				 * cwnd growth.
 				 */
 				if (CCV(ccv, snd_cwnd) < w_tf)
 					CCV(ccv, snd_cwnd) = ulmin(w_tf, INT_MAX);
 			}
 
 			else if (CCV(ccv, snd_cwnd) < w_cubic_next) {
 				/*
 				 * Concave or convex region, follow CUBIC
 				 * cwnd growth.
 				 */
 				if (V_tcp_do_rfc3465)
 					CCV(ccv, snd_cwnd) = ulmin(w_cubic_next,
 					    INT_MAX);
 				else
 					CCV(ccv, snd_cwnd) += ulmax(1,
 					    ((ulmin(w_cubic_next, INT_MAX) -
 					    CCV(ccv, snd_cwnd)) *
 					    CCV(ccv, t_maxseg)) /
 					    CCV(ccv, snd_cwnd));
 			}
 
 			/*
 			 * If we're not in slow start and we're probing for a
 			 * new cwnd limit at the start of a connection
 			 * (happens when hostcache has a relevant entry),
 			 * keep updating our current estimate of the
 			 * max_cwnd.
 			 */
 			if (((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0) &&
 			    cubic_data->max_cwnd < CCV(ccv, snd_cwnd)) {
 				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
 				cubic_data->K = cubic_k(cubic_data->max_cwnd /
 				    CCV(ccv, t_maxseg));
 			}
 		}
 	} else if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
 	    !(ccv->flags & CCF_CWND_LIMITED)) {
 		cubic_data->flags |= CUBICFLAG_IN_APPLIMIT;
 	}
 }
 
 /*
  * CUBIC's after_idle hook.
  *   - Make sure max_cwnd is at least the cwnd in effect before the idle
  *     period and recompute K from it.
  *   - Let NewReno's after_idle reset cwnd.
  *   - Restart the congestion epoch from the current time.
  */
 static void
 cubic_after_idle(struct cc_var *ccv)
 {
 	struct cubic *cd = ccv->cc_data;
 
 	/* Must happen before NewReno rewrites snd_cwnd below. */
 	if (cd->max_cwnd < CCV(ccv, snd_cwnd))
 		cd->max_cwnd = CCV(ccv, snd_cwnd);
 	cd->K = cubic_k(cd->max_cwnd / CCV(ccv, t_maxseg));
 
 	newreno_cc_algo.after_idle(ccv);
 
 	cd->t_last_cong = ticks;
 }
 
 
 /*
  * Release the per-connection CUBIC state that cubic_cb_init() allocated from
  * M_CUBIC.
  */
 static void
 cubic_cb_destroy(struct cc_var *ccv)
 {
 	free(ccv->cc_data, M_CUBIC);
 }
 
 /*
  * Allocate zeroed per-connection CUBIC state, seed the fields that need a
  * non-zero starting value and attach it to ccv->cc_data. Returns 0 on
  * success or ENOMEM if the non-sleeping allocation fails.
  */
 static int
 cubic_cb_init(struct cc_var *ccv)
 {
 	struct cubic *cd;
 
 	cd = malloc(sizeof(*cd), M_CUBIC, M_NOWAIT | M_ZERO);
 	if (cd == NULL)
 		return (ENOMEM);
 
 	/* Sensible defaults for the key variables. */
 	cd->mean_rtt_ticks = 1;
 	cd->min_rtt_ticks = TCPTV_SRTTBASE;
 	cd->t_last_cong = ticks;
 
 	ccv->cc_data = cd;
 	return (0);
 }
 
 /*
  * Perform any necessary tasks before we enter congestion recovery.
  */
 static void
 cubic_cong_signal(struct cc_var *ccv, uint32_t type)
 {
 	struct cubic *cubic_data;
 
 	cubic_data = ccv->cc_data;
 
 	switch (type) {
 	case CC_NDUPACK:
 		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
 			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
 				cubic_ssthresh_update(ccv);
 				cubic_data->flags |= CUBICFLAG_CONG_EVENT;
 				cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
 				cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
 				cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg));
 			}
 			ENTER_RECOVERY(CCV(ccv, t_flags));
 		}
 		break;
 
 	case CC_ECN:
 		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
 			cubic_ssthresh_update(ccv);
 			cubic_data->flags |= CUBICFLAG_CONG_EVENT;
 			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
 			cubic_data->max_cwnd = CCV(ccv, snd_cwnd);
 			cubic_data->t_last_cong = ticks;
 			cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg));
 			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
 		}
 		break;
 
 	case CC_RTO:
 		/*
 		 * Grab the current time and record it so we know when the
 		 * most recent congestion event was. Only record it when the
 		 * timeout has fired more than once, as there is a reasonable
 		 * chance the first one is a false alarm and may not indicate
 		 * congestion.
 		 * This will put Cubic firmly into the concave / TCP friendly
 		 * region, for a slower ramp-up after two consecutive RTOs.
 		 */
 		if (CCV(ccv, t_rxtshift) >= 2) {
 			cubic_data->flags |= CUBICFLAG_CONG_EVENT;
 			cubic_data->t_last_cong = ticks;
 			cubic_data->max_cwnd = CCV(ccv, snd_cwnd_prev);
 			cubic_data->K = cubic_k(cubic_data->max_cwnd /
 						CCV(ccv, t_maxseg));
 		}
 		break;
 	}
 }
 
 /*
  * Connection setup hook. Records the initial snd_cwnd as max_cwnd so that a
  * sane value is in place; without it bad things happen when entries from
  * the TCP hostcache get used.
  */
 static void
 cubic_conn_init(struct cc_var *ccv)
 {
 	struct cubic *cd = ccv->cc_data;
 
 	cd->max_cwnd = CCV(ccv, snd_cwnd);
 }
 
 /* Module load hook. CUBIC needs no global setup, so this always succeeds. */
 static int
 cubic_mod_init(void)
 {
 	return (0);
 }
 
 /*
  * Perform any necessary tasks before we exit congestion recovery.
  *
  * Applies the fast convergence reduction to max_cwnd, picks the cwnd to
  * resume with when leaving fast recovery, restarts the congestion epoch,
  * folds the RTT samples gathered during the epoch into mean_rtt_ticks and
  * recomputes K.
  */
 static void
 cubic_post_recovery(struct cc_var *ccv)
 {
 	struct cubic *cubic_data;
 	int pipe;
 
 	cubic_data = ccv->cc_data;
 	pipe = 0;
 
 	/* Fast convergence heuristic. */
 	if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd)
 		cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR)
 		    >> CUBIC_SHIFT;
 
 	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
 		/*
 		 * If inflight data is less than ssthresh, set cwnd
 		 * conservatively to avoid a burst of data, as suggested in
 		 * the NewReno RFC. Otherwise, use the CUBIC method.
 		 *
 		 * XXXLAS: Find a way to do this without needing curack
 		 */
 		if (V_tcp_do_rfc6675_pipe)
 			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
 		else
 			pipe = CCV(ccv, snd_max) - ccv->curack;
 
 		if (pipe < CCV(ccv, snd_ssthresh))
 			/*
 			 * Ensure that cwnd does not collapse to 1 MSS under
 			 * adverse conditions. Implements RFC6582
 			 */
 			CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
 			    CCV(ccv, t_maxseg);
 		else
 			/* Update cwnd based on beta and adjusted max_cwnd. */
 			CCV(ccv, snd_cwnd) = max(((uint64_t)cubic_data->max_cwnd *
 			    CUBIC_BETA) >> CUBIC_SHIFT,
 			    2 * CCV(ccv, t_maxseg));
 	}
 	/* A new congestion epoch starts now. */
 	cubic_data->t_last_cong = ticks;
 
 	/* Calculate the average RTT between congestion epochs. */
 	if (cubic_data->epoch_ack_count > 0 &&
 	    cubic_data->sum_rtt_ticks >= cubic_data->epoch_ack_count) {
 		cubic_data->mean_rtt_ticks = (int)(cubic_data->sum_rtt_ticks /
 		    cubic_data->epoch_ack_count);
 	}
 
 	/* Reset the accumulators for the next epoch. */
 	cubic_data->epoch_ack_count = 0;
 	cubic_data->sum_rtt_ticks = 0;
 	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg));
 }
 
 /*
  * Track the minimum RTT and accumulate samples for the per-epoch mean RTT.
  * Nothing is recorded until the connection has gathered at least
  * CUBIC_MIN_RTT_SAMPLES RTT updates.
  */
 static void
 cubic_record_rtt(struct cc_var *ccv)
 {
 	struct cubic *cd;
 	int srtt_ticks;
 
 	/* Ignore srtt until a min number of samples have been taken. */
 	if (CCV(ccv, t_rttupdated) < CUBIC_MIN_RTT_SAMPLES)
 		return;
 
 	cd = ccv->cc_data;
 	srtt_ticks = CCV(ccv, t_srtt) / TCP_RTT_SCALE;
 
 	/*
 	 * Adopt the current SRTT as the minimum when no minimum has been
 	 * recorded yet (still at its initial value) or when this sample is
 	 * smaller than anything seen so far.
 	 *
 	 * XXXLAS: Should there be some hysteresis for minrtt?
 	 */
 	if (cd->min_rtt_ticks == TCPTV_SRTTBASE ||
 	    srtt_ticks < cd->min_rtt_ticks) {
 		cd->min_rtt_ticks = max(1, srtt_ticks);
 
 		/*
 		 * Within the first congestion epoch, keep mean_rtt_ticks
 		 * primed with a reasonable value until the epoch average is
 		 * calculated in cubic_post_recovery().
 		 */
 		if (cd->mean_rtt_ticks < cd->min_rtt_ticks)
 			cd->mean_rtt_ticks = cd->min_rtt_ticks;
 	}
 
 	/* Accumulate this sample for the epoch average. */
 	cd->sum_rtt_ticks += srtt_ticks;
 	cd->epoch_ack_count++;
 }
 
 /*
  * Update the ssthresh in the event of congestion.
  */
 static void
 cubic_ssthresh_update(struct cc_var *ccv)
 {
 	struct cubic *cubic_data;
 	uint32_t ssthresh;
 
 	cubic_data = ccv->cc_data;
 
 	/*
 	 * On the first congestion event, set ssthresh to cwnd * 0.5, on
 	 * subsequent congestion events, set it to cwnd * beta.
 	 */
 	if ((cubic_data->flags & CUBICFLAG_CONG_EVENT) == 0)
 		ssthresh = CCV(ccv, snd_cwnd) >> 1;
 	else
 		ssthresh = ((uint64_t)CCV(ccv, snd_cwnd) *
 		    CUBIC_BETA) >> CUBIC_SHIFT;
 	CCV(ccv, snd_ssthresh) = max(ssthresh, 2 * CCV(ccv, t_maxseg));
 }
 
 
 DECLARE_CC_MODULE(cubic, &cubic_cc_algo);
+MODULE_VERSION(cubic, 1);
Index: stable/12/sys/netinet/cc/cc_dctcp.c
===================================================================
--- stable/12/sys/netinet/cc/cc_dctcp.c	(revision 364376)
+++ stable/12/sys/netinet/cc/cc_dctcp.c	(revision 364377)
@@ -1,469 +1,470 @@
 /*-
  * Copyright (c) 2007-2008
  *	Swinburne University of Technology, Melbourne, Australia
  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2014 Midori Kato <katoon@sfc.wide.ad.jp>
  * Copyright (c) 2014 The FreeBSD Foundation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * An implementation of the DCTCP algorithm for FreeBSD, based on
  * "Data Center TCP (DCTCP)" by M. Alizadeh, A. Greenberg, D. A. Maltz,
  * J. Padhye, P. Patel, B. Prabhakar, S. Sengupta, and M. Sridharan.,
  * in ACM Conference on SIGCOMM 2010, New York, USA,
  * Originally released as the contribution of Microsoft Research project.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/vnet.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_var.h>
 #include <netinet/cc/cc.h>
 #include <netinet/cc/cc_module.h>
 
 /* Fixed-point scale for alpha: MAX_ALPHA_VALUE (1 << DCTCP_SHIFT) is 1.0. */
 #define DCTCP_SHIFT 10
 #define MAX_ALPHA_VALUE (1<<DCTCP_SHIFT)
 /* Alpha given to a connection in dctcp_cb_init() and dctcp_after_idle(). */
 VNET_DEFINE_STATIC(uint32_t, dctcp_alpha) = MAX_ALPHA_VALUE;
 #define V_dctcp_alpha	    VNET(dctcp_alpha)
 /* Right shift applied in dctcp_update_alpha(), i.e. g = 1 / 2^shift_g. */
 VNET_DEFINE_STATIC(uint32_t, dctcp_shift_g) = 4;
 #define	V_dctcp_shift_g	    VNET(dctcp_shift_g)
 /*
  * Non-zero: a CC_ECN signal that arrives while num_cong_events is still zero
  * halves cwnd instead of scaling it by alpha (see dctcp_cong_signal()).
  */
 VNET_DEFINE_STATIC(uint32_t, dctcp_slowstart) = 0;
 #define	V_dctcp_slowstart   VNET(dctcp_slowstart)
 
 /* Per-connection DCTCP state; allocated zeroed in dctcp_cb_init(). */
 struct dctcp {
 	uint32_t bytes_ecn;	  /* # of marked bytes during a RTT */
 	uint32_t bytes_total;	  /* # of acked bytes during a RTT */
 	int      alpha;		  /* marked fraction, scaled by MAX_ALPHA_VALUE */
 	int      ce_prev;	  /* CE state of the last segment */
 	tcp_seq  save_sndnxt;	  /* end sequence number of the current window */
 	int      ece_curr;	  /* set on CC_ECN, cleared by the next ACK */
 	int      ece_prev;	  /* value ece_curr had at the previous ACK */
 	uint32_t num_cong_events; /* # of congestion events */
 };
 
 static MALLOC_DEFINE(M_dctcp, "dctcp data",
     "Per connection data required for the dctcp algorithm");
 
 static void	dctcp_ack_received(struct cc_var *ccv, uint16_t type);
 static void	dctcp_after_idle(struct cc_var *ccv);
 static void	dctcp_cb_destroy(struct cc_var *ccv);
 static int	dctcp_cb_init(struct cc_var *ccv);
 static void	dctcp_cong_signal(struct cc_var *ccv, uint32_t type);
 static void	dctcp_conn_init(struct cc_var *ccv);
 static void	dctcp_post_recovery(struct cc_var *ccv);
 static void	dctcp_ecnpkt_handler(struct cc_var *ccv);
 static void	dctcp_update_alpha(struct cc_var *ccv);
 
 /*
  * Hook table handed to DECLARE_CC_MODULE() at the end of this file. Every
  * hook listed here is implemented in this file; several of them defer to
  * newreno_cc_algo when TF_ECN_PERMIT is not set on the connection.
  */
 struct cc_algo dctcp_cc_algo = {
 	.name = "dctcp",
 	.ack_received = dctcp_ack_received,
 	.cb_destroy = dctcp_cb_destroy,
 	.cb_init = dctcp_cb_init,
 	.cong_signal = dctcp_cong_signal,
 	.conn_init = dctcp_conn_init,
 	.post_recovery = dctcp_post_recovery,
 	.ecnpkt_handler = dctcp_ecnpkt_handler,
 	.after_idle = dctcp_after_idle,
 };
 
 /*
  * ACK hook.
  *
  * Without TF_ECN_PERMIT the ACK is simply passed to NewReno. With it, NewReno
  * is run with the congestion recovery flag temporarily cleared, the acked
  * and ECN-marked byte counters are advanced, and alpha is recomputed once
  * the ACK moves past save_sndnxt while not in fast recovery.
  */
 static void
 dctcp_ack_received(struct cc_var *ccv, uint16_t type)
 {
 	struct dctcp *dd;
 	uint32_t marked;
 	int acked, in_cong_rec;
 
 	if ((CCV(ccv, t_flags) & TF_ECN_PERMIT) == 0) {
 		newreno_cc_algo.ack_received(ccv, type);
 		return;
 	}
 	dd = ccv->cc_data;
 
 	/*
 	 * DCTCP does not treat an ECN mark as a congestion event, so NewReno's
 	 * ACK processing must not see the congestion recovery flag. The flag
 	 * is restored afterwards if it was set on entry.
 	 */
 	in_cong_rec = IN_CONGRECOVERY(CCV(ccv, t_flags)) != 0;
 	if (in_cong_rec) {
 		EXIT_CONGRECOVERY(CCV(ccv, t_flags));
 	}
 	newreno_cc_algo.ack_received(ccv, type);
 	if (in_cong_rec) {
 		ENTER_CONGRECOVERY(CCV(ccv, t_flags));
 	}
 
 	/* A duplicate ACK is credited with at most one segment. */
 	if (type == CC_ACK)
 		acked = ccv->bytes_this_ack;
 	else if (type == CC_DUPACK)
 		acked = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
 	else
 		acked = 0;
 	dd->bytes_total += acked;
 
 	/*
 	 * Work out how many of those bytes count as marked. When the ECE state
 	 * differs from the previous ACK and more than one segment was acked,
 	 * one segment's worth of bytes is attributed to the previous state.
 	 *
 	 * XXRMS: For fluid-model DCTCP, cwnd would be updated here for RTT
 	 * fairness.
 	 */
 	marked = 0;
 	if (dd->ece_curr) {
 		if (!dd->ece_prev && acked > CCV(ccv, t_maxseg))
 			marked = acked - CCV(ccv, t_maxseg);
 		else
 			marked = acked;
 	} else if (dd->ece_prev && acked > CCV(ccv, t_maxseg)) {
 		marked = CCV(ccv, t_maxseg);
 	}
 	dd->bytes_ecn += marked;
 	dd->ece_prev = dd->ece_curr ? 1 : 0;
 	dd->ece_curr = 0;
 
 	/* Close the measurement window once the ACK has moved past it. */
 	if (!IN_FASTRECOVERY(CCV(ccv, t_flags)) &&
 	    SEQ_GT(ccv->curack, dd->save_sndnxt))
 		dctcp_update_alpha(ccv);
 }
 
 /*
  * After-idle hook. On ECN connections the measurement state is restarted
  * (counters cleared, alpha back to V_dctcp_alpha, window restarted at
  * snd_nxt); ce_prev is left as it was. NewReno's after_idle always runs last.
  */
 static void
 dctcp_after_idle(struct cc_var *ccv)
 {
 	struct dctcp *dd;
 
 	if ((CCV(ccv, t_flags) & TF_ECN_PERMIT) != 0) {
 		dd = ccv->cc_data;
 
 		dd->alpha = V_dctcp_alpha;
 		dd->save_sndnxt = CCV(ccv, snd_nxt);
 		dd->bytes_total = dd->bytes_ecn = 0;
 		dd->ece_prev = dd->ece_curr = 0;
 		dd->num_cong_events = 0;
 	}
 
 	newreno_cc_algo.after_idle(ccv);
 }
 
 static void
 dctcp_cb_destroy(struct cc_var *ccv)
 {
 	free(ccv->cc_data, M_dctcp);
 }
 
 /*
  * Allocate and initialise the per-connection DCTCP state.
  *
  * Returns 0 on success with ccv->cc_data pointing at the new state, or
  * ENOMEM if the M_NOWAIT allocation fails (ccv->cc_data is then untouched).
  */
 static int
 dctcp_cb_init(struct cc_var *ccv)
 {
 	struct dctcp *dctcp_data;
 
 	/*
 	 * M_ZERO already clears every counter and flag (bytes_ecn,
 	 * bytes_total, save_sndnxt, ce_prev, ece_curr, ece_prev and
 	 * num_cong_events), so only alpha needs an explicit value.
 	 */
 	dctcp_data = malloc(sizeof(*dctcp_data), M_dctcp, M_NOWAIT | M_ZERO);
 	if (dctcp_data == NULL)
 		return (ENOMEM);
 
 	/*
 	 * When alpha is set to 0 in the beginning, a DCTCP sender transfers as
 	 * much data as possible until the value converges, which may expand
 	 * the queueing delay at the switch. When alpha is set to 1, queueing
 	 * delay is kept small.
 	 * Throughput-sensitive applications should have alpha = 0
 	 * Latency-sensitive applications should have alpha = 1
 	 *
 	 * The starting value is taken from the dctcp_alpha tunable, whose
 	 * compiled-in default is MAX_ALPHA_VALUE, i.e. alpha = 1, the initial
 	 * value the DCTCP draft suggests.
 	 */
 	dctcp_data->alpha = V_dctcp_alpha;
 
 	ccv->cc_data = dctcp_data;
 	return (0);
 }
 
 /*
  * Congestion hook, run before congestion recovery is entered.
  *
  * Connections without TF_ECN_PERMIT are handled entirely by NewReno. For ECN
  * connections:
  *   CC_NDUPACK - set ssthresh to half the (pre-ECN-reduction) window and
  *                enter recovery, unless fast recovery is already active.
  *   CC_ECN     - remember cwnd, and outside congestion recovery shrink the
  *                window in proportion to alpha (or by half, see
  *                V_dctcp_slowstart) before entering congestion recovery.
  *   CC_RTO     - request CWR, fold the counters into alpha and count the
  *                event.
  * ssthresh is never set below two segments.
  */
 static void
 dctcp_cong_signal(struct cc_var *ccv, uint32_t type)
 {
 	struct dctcp *dd;
 	u_int mss, wnd;
 
 	if ((CCV(ccv, t_flags) & TF_ECN_PERMIT) == 0) {
 		newreno_cc_algo.cong_signal(ccv, type);
 		return;
 	}
 
 	dd = ccv->cc_data;
 	wnd = CCV(ccv, snd_cwnd);
 	mss = CCV(ccv, t_maxseg);
 
 	switch (type) {
 	case CC_NDUPACK:
 		if (IN_FASTRECOVERY(CCV(ccv, t_flags)))
 			break;
 		if (IN_CONGRECOVERY(CCV(ccv, t_flags))) {
 			/*
 			 * cwnd was already cut when congestion recovery was
 			 * entered; base ssthresh on the window saved in
 			 * snd_cwnd_prev instead.
 			 */
 			wnd = CCV(ccv, snd_cwnd_prev);
 		} else {
 			dd->num_cong_events++;
 		}
 		CCV(ccv, snd_ssthresh) = max(wnd / 2, 2 * mss);
 		ENTER_RECOVERY(CCV(ccv, t_flags));
 		break;
 	case CC_ECN:
 		/*
 		 * Keep the current cwnd so that a later CC_NDUPACK during
 		 * congestion recovery can still see it.
 		 */
 		CCV(ccv, snd_cwnd_prev) = wnd;
 		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
 			if (V_dctcp_slowstart &&
 			    dd->num_cong_events++ == 0) {
 				/* Halve the window and restart measuring. */
 				CCV(ccv, snd_ssthresh) = max(wnd / 2, 2 * mss);
 				dd->alpha = MAX_ALPHA_VALUE;
 				dd->bytes_ecn = 0;
 				dd->bytes_total = 0;
 				dd->save_sndnxt = CCV(ccv, snd_nxt);
 			} else {
 				/* ssthresh = cwnd * (1 - alpha / 2). */
 				CCV(ccv, snd_ssthresh) =
 				    max((wnd - (((uint64_t)wnd *
 				    dd->alpha) >> (DCTCP_SHIFT + 1))),
 				    2 * mss);
 			}
 			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
 		}
 		dd->ece_curr = 1;
 		break;
 	case CC_RTO:
 		CCV(ccv, t_flags) |= TF_ECN_SND_CWR;
 		dctcp_update_alpha(ccv);
 		dd->save_sndnxt += CCV(ccv, t_maxseg);
 		dd->num_cong_events++;
 		break;
 	}
 }
 
 /*
  * Connection-init hook: on ECN connections, start the first measurement
  * window at the current snd_nxt. Nothing is done otherwise.
  */
 static void
 dctcp_conn_init(struct cc_var *ccv)
 {
 	struct dctcp *dd = ccv->cc_data;
 
 	if ((CCV(ccv, t_flags) & TF_ECN_PERMIT) == 0)
 		return;
 	dd->save_sndnxt = CCV(ccv, snd_nxt);
 }
 
 /*
  * Post-recovery hook, run before congestion recovery is left: NewReno's
  * handling comes first, then ECN connections fold the current counters into
  * alpha.
  */
 static void
 dctcp_post_recovery(struct cc_var *ccv)
 {
 	newreno_cc_algo.post_recovery(ccv);
 
 	if ((CCV(ccv, t_flags) & TF_ECN_PERMIT) == 0)
 		return;
 	dctcp_update_alpha(ccv);
 }
 
 /*
  * Execute an additional ECN processing using ECN field in IP header and the CWR
  * bit in TCP header.
  *
  * delay_ack == 0 - Delayed ACK disabled
  * delay_ack == 1 - Delayed ACK enabled
  */
 
 static void
 dctcp_ecnpkt_handler(struct cc_var *ccv)
 {
 	struct dctcp *dctcp_data;
 	uint32_t ccflag;
 	int delay_ack;
 
 	dctcp_data = ccv->cc_data;
 	ccflag = ccv->flags;
 	delay_ack = 1;
 
 	/*
 	 * DCTCP responds with an ACK immediately when the CE state
 	 * in between this segment and the last segment has changed.
 	 */
 	if (ccflag & CCF_IPHDR_CE) {
 		if (!dctcp_data->ce_prev && (ccflag & CCF_DELACK))
 			delay_ack = 0;
 		dctcp_data->ce_prev = 1;
 		CCV(ccv, t_flags) |= TF_ECN_SND_ECE;
 	} else {
 		if (dctcp_data->ce_prev && (ccflag & CCF_DELACK))
 			delay_ack = 0;
 		dctcp_data->ce_prev = 0;
 		CCV(ccv, t_flags) &= ~TF_ECN_SND_ECE;
 	}
 
 	/* DCTCP sets delayed ack when this segment sets the CWR flag. */
 	if ((ccflag & CCF_DELACK) && (ccflag & CCF_TCPHDR_CWR))
 		delay_ack = 1;
 
 	if (delay_ack == 0)
 		ccv->flags |= CCF_ACKNOW;
 }
 
 /*
  * Update the fraction of marked bytes represented as 'alpha'.
  * Afterwards the byte counters are cleared and the next measurement window
  * is started at the current snd_nxt.
  */
 static void
 dctcp_update_alpha(struct cc_var *ccv)
 {
 	struct dctcp *dctcp_data;
 	int alpha_prev;
 
 	dctcp_data = ccv->cc_data;
 	alpha_prev = dctcp_data->alpha;
 	/* bytes_total is the divisor below, so keep it at 1 or more. */
 	dctcp_data->bytes_total = max(dctcp_data->bytes_total, 1);
 
 	/*
 	 * Update alpha: alpha = (1 - g) * alpha + g * M.
 	 * Here:
 	 * g is the weight factor, applied as a right shift by V_dctcp_shift_g
 	 *	recommended to be set to 1/16
 	 *	small g = slow convergence between competitive DCTCP flows
 	 *	large g = impacts low utilization of bandwidth at switches
 	 * M is the fraction of marked bytes (bytes_ecn / bytes_total)
 	 *	updated every RTT
 	 * The result is capped at MAX_ALPHA_VALUE.
 	 */
 	dctcp_data->alpha = ulmin(alpha_prev - (alpha_prev >> V_dctcp_shift_g) +
 	    ((uint64_t)dctcp_data->bytes_ecn << (DCTCP_SHIFT - V_dctcp_shift_g)) /
 	    dctcp_data->bytes_total, MAX_ALPHA_VALUE);
 
 	/* Initialize internal parameters for next alpha calculation */
 	dctcp_data->bytes_ecn = 0;
 	dctcp_data->bytes_total = 0;
 	dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
 }
 
 /*
  * Sysctl handler for the initial alpha. A write is accepted only if the new
  * value does not exceed MAX_ALPHA_VALUE; otherwise EINVAL is returned and the
  * tunable keeps its old value.
  */
 static int
 dctcp_alpha_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t val = V_dctcp_alpha;
 	int rc;
 
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	/* Stop on a failed copy or on a plain read. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (val > MAX_ALPHA_VALUE)
 		return (EINVAL);
 
 	V_dctcp_alpha = val;
 	return (0);
 }
 
 /*
  * Sysctl handler for the gain shift. A write is accepted only if the new
  * value does not exceed DCTCP_SHIFT; otherwise EINVAL is returned and the
  * tunable keeps its old value.
  */
 static int
 dctcp_shift_g_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t val = V_dctcp_shift_g;
 	int rc;
 
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	/* Stop on a failed copy or on a plain read. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (val > DCTCP_SHIFT)
 		return (EINVAL);
 
 	V_dctcp_shift_g = val;
 	return (0);
 }
 
 /*
  * Sysctl handler for the slowstart switch. Only 0 and 1 are accepted; any
  * other value yields EINVAL and the tunable keeps its old value.
  */
 static int
 dctcp_slowstart_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t val = V_dctcp_slowstart;
 	int rc;
 
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	/* Stop on a failed copy or on a plain read. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (val > 1)
 		return (EINVAL);
 
 	V_dctcp_slowstart = val;
 	return (0);
 }
 
 /* Sysctl subtree holding the DCTCP tunables (a "dctcp" node under tcp.cc). */
 SYSCTL_DECL(_net_inet_tcp_cc_dctcp);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, dctcp, CTLFLAG_RW, NULL,
     "dctcp congestion control related settings");
 
 /* Initial alpha; the handler accepts 0 .. MAX_ALPHA_VALUE. */
 SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, alpha,
     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_alpha), 0,
     &dctcp_alpha_handler,
     "IU", "dctcp alpha parameter at start of session");
 
 /* Gain shift used when updating alpha; the handler accepts 0 .. DCTCP_SHIFT. */
 SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, shift_g,
     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_shift_g), 4,
     &dctcp_shift_g_handler,
     "IU", "dctcp shift parameter");
 
 /* Boolean switch; the handler accepts 0 or 1. */
 SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, slowstart,
     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_slowstart), 0,
     &dctcp_slowstart_handler,
     "IU", "half CWND reduction after the first slow start");
 
 DECLARE_CC_MODULE(dctcp, &dctcp_cc_algo);
+MODULE_VERSION(dctcp, 1);
Index: stable/12/sys/netinet/cc/cc_hd.c
===================================================================
--- stable/12/sys/netinet/cc/cc_hd.c	(revision 364376)
+++ stable/12/sys/netinet/cc/cc_hd.c	(revision 364377)
@@ -1,252 +1,253 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009-2010
  *	Swinburne University of Technology, Melbourne, Australia
  * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010-2011 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by David Hayes and
  * Lawrence Stewart, made possible in part by a grant from the Cisco University
  * Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, Melbourne, Australia by
  * David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * An implementation of the Hamilton Institute's delay-based congestion control
  * algorithm for FreeBSD, based on "A strategy for fair coexistence of loss and
  * delay-based congestion control algorithms," by L. Budzisz, R. Stanojevic, R.
  * Shorten, and F. Baker, IEEE Commun. Lett., vol. 13, no. 7, pp. 555--557, Jul.
  * 2009.
  *
  * Originally released as part of the NewTCP research project at Swinburne
  * University of Technology's Centre for Advanced Internet Architectures,
  * Melbourne, Australia, which was made possible in part by a grant from the
  * Cisco University Research Program Fund at Community Foundation Silicon
  * Valley. More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/vnet.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/cc/cc.h>
 #include <netinet/cc/cc_module.h>
 
 #include <netinet/khelp/h_ertt.h>
 
 /* Largest possible number returned by random(). */
 #define	RANDOM_MAX	INT_MAX
 
 static void	hd_ack_received(struct cc_var *ccv, uint16_t ack_type);
 static int	hd_mod_init(void);
 
 static int ertt_id;
 
 /*
  * Per-netstack tunables, exported through the sysctls at the end of the file:
  * hd_qthresh and hd_qmin are queueing delay thresholds in ticks, hd_pmax is
  * the maximum backoff probability as a percentage.
  */
 VNET_DEFINE_STATIC(uint32_t, hd_qthresh) = 20;
 VNET_DEFINE_STATIC(uint32_t, hd_qmin) = 5;
 VNET_DEFINE_STATIC(uint32_t, hd_pmax) = 5;
 #define	V_hd_qthresh	VNET(hd_qthresh)
 #define	V_hd_qmin	VNET(hd_qmin)
 #define	V_hd_pmax	VNET(hd_pmax)
 
 /*
  * Only ack_received and mod_init are set statically; hd_mod_init() fills in
  * after_idle, cong_signal and post_recovery from newreno_cc_algo.
  */
 struct cc_algo hd_cc_algo = {
 	.name = "hd",
 	.ack_received = hd_ack_received,
 	.mod_init = hd_mod_init
 };
 
 /*
  * Hamilton backoff function. Returns 1 if we should backoff or 0 otherwise.
  */
 static __inline int
 should_backoff(int qdly, int maxqdly)
 {
 	unsigned long p;
 
 	if (qdly < V_hd_qthresh) {
 		p = (((RANDOM_MAX / 100) * V_hd_pmax) /
 		    (V_hd_qthresh - V_hd_qmin)) * (qdly - V_hd_qmin);
 	} else {
 		if (qdly > V_hd_qthresh)
 			p = (((RANDOM_MAX / 100) * V_hd_pmax) /
 			    (maxqdly - V_hd_qthresh)) * (maxqdly - qdly);
 		else
 			p = (RANDOM_MAX / 100) * V_hd_pmax;
 	}
 
 	return (random() < p);
 }
 
 /*
  * ACK hook. For a CC_ACK received outside recovery, once the inferred
  * queueing delay (rtt - minrtt as reported by the ertt helper) exceeds
  * V_hd_qmin, should_backoff() decides whether to back off. A positive
  * decision is delivered to NewReno as a CC_ECN congestion signal (i.e. not a
  * packet loss) and the ACK gets no further processing. In every other case
  * the ACK is handled exactly as NewReno would.
  */
 static void
 hd_ack_received(struct cc_var *ccv, uint16_t ack_type)
 {
 	struct ertt *est;
 	int delay;
 
 	if (ack_type == CC_ACK) {
 		est = khelp_get_osd(CCV(ccv, osd), ertt_id);
 
 		/* Both RTT values must be non-zero before they are used. */
 		if (est->rtt && est->minrtt && V_hd_qthresh > 0 &&
 		    !IN_RECOVERY(CCV(ccv, t_flags))) {
 			delay = est->rtt - est->minrtt;
 
 			/* Probabilistic backoff of cwnd. */
 			if (delay > V_hd_qmin && should_backoff(delay,
 			    est->maxrtt - est->minrtt)) {
 				newreno_cc_algo.cong_signal(ccv, CC_ECN);
 				return;
 			}
 		}
 	}
 
 	newreno_cc_algo.ack_received(ccv, ack_type);
 }
 
 /*
  * Module-init hook: look up the "ertt" helper id and borrow the remaining
  * hooks from NewReno. Returns 0 on success, or ENOENT (after logging) when
  * khelp_get_id() does not return a positive id.
  */
 static int
 hd_mod_init(void)
 {
 
 	ertt_id = khelp_get_id("ertt");
 	if (ertt_id > 0) {
 		hd_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
 		hd_cc_algo.cong_signal = newreno_cc_algo.cong_signal;
 		hd_cc_algo.after_idle = newreno_cc_algo.after_idle;
 		return (0);
 	}
 
 	printf("%s: h_ertt module not found\n", __func__);
 	return (ENOENT);
 }
 
 /*
  * Sysctl handler for pmax. A write is accepted only for values 1 .. 100;
  * anything else yields EINVAL and the tunable keeps its old value.
  */
 static int
 hd_pmax_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t val = V_hd_pmax;
 	int rc;
 
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	/* Stop on a failed copy or on a plain read. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (val == 0 || val > 100)
 		return (EINVAL);
 
 	V_hd_pmax = val;
 	return (0);
 }
 
 /*
  * Sysctl handler for qmin. A write is rejected with EINVAL when the new value
  * is larger than the current V_hd_qthresh.
  */
 static int
 hd_qmin_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t val = V_hd_qmin;
 	int rc;
 
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	/* Stop on a failed copy or on a plain read. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (val > V_hd_qthresh)
 		return (EINVAL);
 
 	V_hd_qmin = val;
 	return (0);
 }
 
 /*
  * Sysctl handler for qthresh. A write is rejected with EINVAL when the new
  * value is zero or smaller than the current V_hd_qmin.
  */
 static int
 hd_qthresh_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t val = V_hd_qthresh;
 	int rc;
 
 	rc = sysctl_handle_int(oidp, &val, 0, req);
 	/* Stop on a failed copy or on a plain read. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 	if (val == 0 || val < V_hd_qmin)
 		return (EINVAL);
 
 	V_hd_qthresh = val;
 	return (0);
 }
 
 /* Sysctl subtree holding the HD tunables (an "hd" node under tcp.cc). */
 SYSCTL_DECL(_net_inet_tcp_cc_hd);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, hd, CTLFLAG_RW, NULL,
     "Hamilton delay-based congestion control related settings");
 
 /* The handler rejects 0 and values below the current queue_min. */
 SYSCTL_PROC(_net_inet_tcp_cc_hd, OID_AUTO, queue_threshold,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(hd_qthresh), 20,
     &hd_qthresh_handler, "IU", "queueing congestion threshold (qth) in ticks");
 
 /* The handler accepts 1 .. 100. */
 SYSCTL_PROC(_net_inet_tcp_cc_hd, OID_AUTO, pmax,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(hd_pmax), 5,
     &hd_pmax_handler, "IU",
     "per packet maximum backoff probability as a percentage");
 
 /* The handler rejects values above the current queue_threshold. */
 SYSCTL_PROC(_net_inet_tcp_cc_hd, OID_AUTO, queue_min,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(hd_qmin), 5,
     &hd_qmin_handler, "IU", "minimum queueing delay threshold (qmin) in ticks");
 
 DECLARE_CC_MODULE(hd, &hd_cc_algo);
+MODULE_VERSION(hd, 1);
 MODULE_DEPEND(hd, ertt, 1, 1, 1);
Index: stable/12/sys/netinet/cc/cc_htcp.c
===================================================================
--- stable/12/sys/netinet/cc/cc_htcp.c	(revision 364376)
+++ stable/12/sys/netinet/cc/cc_htcp.c	(revision 364377)
@@ -1,532 +1,533 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007-2008
  * 	Swinburne University of Technology, Melbourne, Australia
  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by Lawrence Stewart and
  * James Healy, made possible in part by a grant from the Cisco University
  * Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced
  * Internet Architectures, Swinburne University of Technology, Melbourne,
  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * An implementation of the H-TCP congestion control algorithm for FreeBSD,
  * based on the Internet Draft "draft-leith-tcp-htcp-06.txt" by Leith and
  * Shorten. Originally released as part of the NewTCP research project at
  * Swinburne University of Technology's Centre for Advanced Internet
  * Architectures, Melbourne, Australia, which was made possible in part by a
  * grant from the Cisco University Research Program Fund at Community Foundation
  * Silicon Valley. More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/vnet.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/cc/cc.h>
 #include <netinet/cc/cc_module.h>
 
 /* Fixed point math shifts. */
 #define HTCP_SHIFT 8
 #define HTCP_ALPHA_INC_SHIFT 4
 
 #define HTCP_INIT_ALPHA 1
 #define HTCP_DELTA_L hz		/* 1 sec in ticks. */
 #define HTCP_MINBETA 128	/* 0.5 << HTCP_SHIFT. */
 #define HTCP_MAXBETA 204	/* ~0.8 << HTCP_SHIFT. */
 #define HTCP_MINROWE 26		/* ~0.1 << HTCP_SHIFT. */
 #define HTCP_MAXROWE 512	/* 2 << HTCP_SHIFT. */
 
 /* RTT_ref (ms) used in the calculation of alpha if RTT scaling is enabled. */
 #define HTCP_RTT_REF 100
 
 /* Don't trust SRTT until this many samples have been taken. */
 #define HTCP_MIN_RTT_SAMPLES 8
 
 /*
  * HTCP_CALC_ALPHA performs a fixed point math calculation to determine the
  * value of alpha, based on the function defined in the HTCP spec.
  *
  * i.e. 1 + 10(delta - delta_l) + ((delta - delta_l) / 2) ^ 2
  *
  * "diff" is passed in to the macro as "delta - delta_l" and is expected to be
  * in units of ticks. Note that the macro evaluates "diff" more than once, so
  * the argument must not have side effects.
  *
  * The joyousness of fixed point maths means our function implementation looks
  * a little funky...
  *
  * In order to maintain some precision in the calculations, a fixed point shift
  * HTCP_ALPHA_INC_SHIFT is used to ensure the integer divisions don't
  * truncate the results too badly.
  *
  * The "16" value is the "1" term in the alpha function shifted up by
  * HTCP_ALPHA_INC_SHIFT
  *
  * The "160" value is the "10" multiplier in the alpha function multiplied by
  * 2^HTCP_ALPHA_INC_SHIFT
  *
  * Specifying these as constants reduces the computations required. After
  * up-shifting all the terms in the function and performing the required
  * calculations, we down-shift the final result by HTCP_ALPHA_INC_SHIFT to
  * ensure it is back in the correct range.
  *
  * The "hz" terms are required as kernels can be configured to run with
  * different tick timers, which we have to adjust for in the alpha calculation
  * (which originally was defined in terms of seconds).
  *
  * We also have to be careful to constrain the value of diff such that it won't
  * overflow whilst performing the calculation. The middle term i.e. (160 * diff)
  * / hz is the limiting factor in the calculation. We must constrain diff to be
  * less than the max size of an int divided by the constant 160 figure
  * i.e. diff < INT_MAX / 160
  *
  * NB: Changing HTCP_ALPHA_INC_SHIFT will require you to MANUALLY update the
  * constants used in this function!
  */
 #define HTCP_CALC_ALPHA(diff) \
 ((\
 	(16) + \
 	((160 * (diff)) / hz) + \
 	(((diff) / hz) * (((diff) << HTCP_ALPHA_INC_SHIFT) / (4 * hz))) \
 ) >> HTCP_ALPHA_INC_SHIFT)
 
 static void	htcp_ack_received(struct cc_var *ccv, uint16_t type);
 static void	htcp_cb_destroy(struct cc_var *ccv);
 static int	htcp_cb_init(struct cc_var *ccv);
 static void	htcp_cong_signal(struct cc_var *ccv, uint32_t type);
 static int	htcp_mod_init(void);
 static void	htcp_post_recovery(struct cc_var *ccv);
 static void	htcp_recalc_alpha(struct cc_var *ccv);
 static void	htcp_recalc_beta(struct cc_var *ccv);
 static void	htcp_record_rtt(struct cc_var *ccv);
 static void	htcp_ssthresh_update(struct cc_var *ccv);
 
 /* Per-connection H-TCP state; allocated and initialised in htcp_cb_init(). */
 struct htcp {
 	/* cwnd before entering cong recovery. */
 	unsigned long	prev_cwnd;
 	/* cwnd additive increase parameter (starts at HTCP_INIT_ALPHA). */
 	int		alpha;
 	/* cwnd multiplicative decrease parameter (starts at HTCP_MINBETA). */
 	int		beta;
 	/* Largest rtt seen for the flow (starts at TCPTV_SRTTBASE). */
 	int		maxrtt;
 	/* Shortest rtt seen for the flow (starts at TCPTV_SRTTBASE). */
 	int		minrtt;
 	/* Time of last congestion event in ticks. */
 	int		t_last_cong;
 };
 
 static int htcp_rtt_ref;
 /*
  * The maximum number of ticks the value of diff can reach in
  * htcp_recalc_alpha() before alpha will stop increasing due to overflow.
  * See comment above HTCP_CALC_ALPHA for more info.
  */
 static int htcp_max_diff = INT_MAX / ((1 << HTCP_ALPHA_INC_SHIFT) * 10);
 
 /* Per-netstack vars. */
 VNET_DEFINE_STATIC(u_int, htcp_adaptive_backoff) = 0;
 VNET_DEFINE_STATIC(u_int, htcp_rtt_scaling) = 0;
 #define	V_htcp_adaptive_backoff    VNET(htcp_adaptive_backoff)
 #define	V_htcp_rtt_scaling    VNET(htcp_rtt_scaling)
 
 static MALLOC_DEFINE(M_HTCP, "htcp data",
     "Per connection data required for the HTCP congestion control algorithm");
 
 /*
  * Hook table for H-TCP. after_idle is not listed here; htcp_mod_init()
  * copies it from newreno_cc_algo.
  */
 struct cc_algo htcp_cc_algo = {
 	.name = "htcp",
 	.ack_received = htcp_ack_received,
 	.cb_destroy = htcp_cb_destroy,
 	.cb_init = htcp_cb_init,
 	.cong_signal = htcp_cong_signal,
 	.mod_init = htcp_mod_init,
 	.post_recovery = htcp_post_recovery,
 };
 
 /*
  * ACK hook. The RTT is recorded for every ACK. cwnd is grown only for a
  * regular ACK outside recovery while the connection is cwnd limited (see the
  * condition below); NewReno's growth is used during slow start and while
  * alpha is 1, otherwise cwnd grows by alpha segments per RTT.
  */
 static void
 htcp_ack_received(struct cc_var *ccv, uint16_t type)
 {
 	struct htcp *htcp_data;
 	uint32_t cwnd_segs;
 
 	htcp_data = ccv->cc_data;
 	htcp_record_rtt(ccv);
 
 	/*
 	 * Regular ACK and we're not in cong/fast recovery and we're cwnd
 	 * limited and we're either not doing ABC or are slow starting or are
 	 * doing ABC and we've sent a cwnd's worth of bytes.
 	 */
 	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
 	    (ccv->flags & CCF_CWND_LIMITED) && (!V_tcp_do_rfc3465 ||
 	    CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
 	    (V_tcp_do_rfc3465 && ccv->flags & CCF_ABC_SENTAWND))) {
 		htcp_recalc_beta(ccv);
 		htcp_recalc_alpha(ccv);
 		/*
 		 * Use the logic in NewReno ack_received() for slow start and
 		 * for the first HTCP_DELTA_L ticks after either the flow starts
 		 * or a congestion event (when alpha equals 1).
 		 */
 		if (htcp_data->alpha == 1 ||
 		    CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh))
 			newreno_cc_algo.ack_received(ccv, type);
 		else {
 			if (V_tcp_do_rfc3465) {
 				/* Increment cwnd by alpha segments. */
 				CCV(ccv, snd_cwnd) += htcp_data->alpha *
 				    CCV(ccv, t_maxseg);
 				ccv->flags &= ~CCF_ABC_SENTAWND;
 			} else {
 				/*
 				 * Increment cwnd by alpha/cwnd segments to
 				 * approximate an increase of alpha segments
 				 * per RTT.
 				 *
 				 * cwnd expressed in whole segments is the
 				 * divisor; it is zero whenever cwnd is smaller
 				 * than one segment, so clamp it to 1 to avoid
 				 * dividing by zero.
 				 */
 				cwnd_segs = CCV(ccv, snd_cwnd) /
 				    CCV(ccv, t_maxseg);
 				if (cwnd_segs == 0)
 					cwnd_segs = 1;
 				CCV(ccv, snd_cwnd) += (((htcp_data->alpha <<
 				    HTCP_SHIFT) / cwnd_segs) *
 				    CCV(ccv, t_maxseg)) >> HTCP_SHIFT;
 			}
 		}
 	}
 }
 
 /* Release the per-connection state that htcp_cb_init() allocated. */
 static void
 htcp_cb_destroy(struct cc_var *ccv)
 {
 	struct htcp *state = ccv->cc_data;
 
 	free(state, M_HTCP);
 }
 
 /*
  * Allocate the per-connection H-TCP state and give every field its starting
  * value. Returns 0 on success, or ENOMEM if the M_NOWAIT allocation fails
  * (ccv->cc_data is then left untouched).
  */
 static int
 htcp_cb_init(struct cc_var *ccv)
 {
 	struct htcp *state;
 
 	state = malloc(sizeof(*state), M_HTCP, M_NOWAIT);
 	if (state == NULL)
 		return (ENOMEM);
 
 	/* The allocation is not zeroed, so every field is set explicitly. */
 	state->prev_cwnd = 0;
 	state->alpha = HTCP_INIT_ALPHA;
 	state->beta = HTCP_MINBETA;
 	state->minrtt = TCPTV_SRTTBASE;
 	state->maxrtt = TCPTV_SRTTBASE;
 	state->t_last_cong = ticks;
 
 	ccv->cc_data = state;
 
 	return (0);
 }
 
 /*
  * Congestion hook, run before congestion recovery is entered.
  *
  *   CC_NDUPACK - unless fast recovery is already active: if not in
  *                congestion recovery either, age maxrtt, update ssthresh
  *                and record the event time and cwnd; then enter recovery.
  *   CC_ECN     - outside congestion recovery: age maxrtt, update ssthresh,
  *                drop cwnd to ssthresh, record the event time and cwnd,
  *                and enter congestion recovery.
  *   CC_RTO     - record the event time once the timer has fired at least
  *                twice.
  *
  * NOTE(review): in the maxrtt hysteresis expression the division by 100
  * applies to minrtt as well as to the 95% span; confirm that this is the
  * intended formula.
  */
 static void
 htcp_cong_signal(struct cc_var *ccv, uint32_t type)
 {
 	struct htcp *hs = ccv->cc_data;
 
 	if (type == CC_NDUPACK) {
 		if (IN_FASTRECOVERY(CCV(ccv, t_flags)))
 			return;
 		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
 			/*
 			 * Apply hysteresis to maxrtt so that reductions in
 			 * the RTT show up in our measurements.
 			 */
 			hs->maxrtt = (hs->minrtt +
 			    (hs->maxrtt - hs->minrtt) * 95) / 100;
 			htcp_ssthresh_update(ccv);
 			hs->t_last_cong = ticks;
 			hs->prev_cwnd = CCV(ccv, snd_cwnd);
 		}
 		ENTER_RECOVERY(CCV(ccv, t_flags));
 	} else if (type == CC_ECN) {
 		if (IN_CONGRECOVERY(CCV(ccv, t_flags)))
 			return;
 		/* Same hysteresis as for CC_NDUPACK above. */
 		hs->maxrtt = (hs->minrtt +
 		    (hs->maxrtt - hs->minrtt) * 95) / 100;
 		htcp_ssthresh_update(ccv);
 		CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
 		hs->t_last_cong = ticks;
 		hs->prev_cwnd = CCV(ccv, snd_cwnd);
 		ENTER_CONGRECOVERY(CCV(ccv, t_flags));
 	} else if (type == CC_RTO) {
 		/*
 		 * The first timeout has a reasonable chance of being a false
 		 * alarm, so the time of the congestion event is only recorded
 		 * once the timer has fired more than once.
 		 */
 		if (CCV(ccv, t_rxtshift) >= 2)
 			hs->t_last_cong = ticks;
 	}
 }
 
 static int
 htcp_mod_init(void)
 {
 
 	htcp_cc_algo.after_idle = newreno_cc_algo.after_idle;
 
 	/*
 	 * HTCP_RTT_REF is defined in ms, and t_srtt in the tcpcb is stored in
 	 * units of TCP_RTT_SCALE*hz. Scale HTCP_RTT_REF to be in the same units
 	 * as t_srtt.
 	 */
 	htcp_rtt_ref = (HTCP_RTT_REF * TCP_RTT_SCALE * hz) / 1000;
 
 	return (0);
 }
 
 /*
  * Perform any necessary tasks before we exit congestion recovery.
  */
 static void
 htcp_post_recovery(struct cc_var *ccv)
 {
 	int pipe;
 	struct htcp *htcp_data;
 
 	pipe = 0;
 	htcp_data = ccv->cc_data;
 
 	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
 		/*
 		 * If inflight data is less than ssthresh, set cwnd
 		 * conservatively to avoid a burst of data, as suggested in the
 		 * NewReno RFC. Otherwise, use the HTCP method.
 		 *
 		 * XXXLAS: Find a way to do this without needing curack
 		 */
 		if (V_tcp_do_rfc6675_pipe)
 			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
 		else
 			pipe = CCV(ccv, snd_max) - ccv->curack;
 		
 		if (pipe < CCV(ccv, snd_ssthresh))
 			/*
 			 * Ensure that cwnd down not collape to 1 MSS under
 			 * adverse conditions. Implements RFC6582
 			 */
 			CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
 			    CCV(ccv, t_maxseg);
 		else
 			CCV(ccv, snd_cwnd) = max(1, ((htcp_data->beta *
 			    htcp_data->prev_cwnd / CCV(ccv, t_maxseg))
 			    >> HTCP_SHIFT)) * CCV(ccv, t_maxseg);
 	}
 }
 
 static void
 htcp_recalc_alpha(struct cc_var *ccv)
 {
 	struct htcp *htcp_data;
 	int alpha, diff, now;
 
 	htcp_data = ccv->cc_data;
 	now = ticks;
 
 	/*
 	 * If ticks has wrapped around (will happen approximately once every 49
 	 * days on a machine with the default kern.hz=1000) and a flow straddles
 	 * the wrap point, our alpha calcs will be completely wrong. We cut our
 	 * losses and restart alpha from scratch by setting t_last_cong = now -
 	 * HTCP_DELTA_L.
 	 *
 	 * This does not deflate our cwnd at all. It simply slows the rate cwnd
 	 * is growing by until alpha regains the value it held prior to taking
 	 * this drastic measure.
 	 */
 	if (now < htcp_data->t_last_cong)
 		htcp_data->t_last_cong = now - HTCP_DELTA_L;
 
 	diff = now - htcp_data->t_last_cong - HTCP_DELTA_L;
 
 	/* Cap alpha if the value of diff would overflow HTCP_CALC_ALPHA(). */
 	if (diff < htcp_max_diff) {
 		/*
 		 * If it has been more than HTCP_DELTA_L ticks since congestion,
 		 * increase alpha according to the function defined in the spec.
 		 */
 		if (diff > 0) {
 			alpha = HTCP_CALC_ALPHA(diff);
 
 			/*
 			 * Adaptive backoff fairness adjustment:
 			 * 2 * (1 - beta) * alpha_raw
 			 */
 			if (V_htcp_adaptive_backoff)
 				alpha = max(1, (2 * ((1 << HTCP_SHIFT) -
 				    htcp_data->beta) * alpha) >> HTCP_SHIFT);
 
 			/*
 			 * RTT scaling: (RTT / RTT_ref) * alpha
 			 * alpha will be the raw value from HTCP_CALC_ALPHA() if
 			 * adaptive backoff is off, or the adjusted value if
 			 * adaptive backoff is on.
 			 */
 			if (V_htcp_rtt_scaling)
 				alpha = max(1, (min(max(HTCP_MINROWE,
 				    (CCV(ccv, t_srtt) << HTCP_SHIFT) /
 				    htcp_rtt_ref), HTCP_MAXROWE) * alpha)
 				    >> HTCP_SHIFT);
 
 		} else
 			alpha = 1;
 
 		htcp_data->alpha = alpha;
 	}
 }
 
 static void
 htcp_recalc_beta(struct cc_var *ccv)
 {
 	struct htcp *htcp_data;
 
 	htcp_data = ccv->cc_data;
 
 	/*
 	 * TCPTV_SRTTBASE is the initialised value of each connection's SRTT, so
 	 * we only calc beta if the connection's SRTT has been changed from its
 	 * initial value. beta is bounded to ensure it is always between
 	 * HTCP_MINBETA and HTCP_MAXBETA.
 	 */
 	if (V_htcp_adaptive_backoff && htcp_data->minrtt != TCPTV_SRTTBASE &&
 	    htcp_data->maxrtt != TCPTV_SRTTBASE)
 		htcp_data->beta = min(max(HTCP_MINBETA,
 		    (htcp_data->minrtt << HTCP_SHIFT) / htcp_data->maxrtt),
 		    HTCP_MAXBETA);
 	else
 		htcp_data->beta = HTCP_MINBETA;
 }
 
 /*
  * Record the minimum and maximum RTT seen for the connection. These are used in
  * the calculation of beta if adaptive backoff is enabled.
  */
 static void
 htcp_record_rtt(struct cc_var *ccv)
 {
 	struct htcp *htcp_data;
 
 	htcp_data = ccv->cc_data;
 
 	/* XXXLAS: Should there be some hysteresis for minrtt? */
 
 	/*
 	 * Record the current SRTT as our minrtt if it's the smallest we've seen
 	 * or minrtt is currently equal to its initialised value. Ignore SRTT
 	 * until a min number of samples have been taken.
 	 */
 	if ((CCV(ccv, t_srtt) < htcp_data->minrtt ||
 	    htcp_data->minrtt == TCPTV_SRTTBASE) &&
 	    (CCV(ccv, t_rttupdated) >= HTCP_MIN_RTT_SAMPLES))
 		htcp_data->minrtt = CCV(ccv, t_srtt);
 
 	/*
 	 * Record the current SRTT as our maxrtt if it's the largest we've
 	 * seen. Ignore SRTT until a min number of samples have been taken.
 	 */
 	if (CCV(ccv, t_srtt) > htcp_data->maxrtt
 	    && CCV(ccv, t_rttupdated) >= HTCP_MIN_RTT_SAMPLES)
 		htcp_data->maxrtt = CCV(ccv, t_srtt);
 }
 
 /*
  * Update the ssthresh in the event of congestion.
  */
 static void
 htcp_ssthresh_update(struct cc_var *ccv)
 {
 	struct htcp *htcp_data;
 
 	htcp_data = ccv->cc_data;
 
 	/*
 	 * On the first congestion event, set ssthresh to cwnd * 0.5, on
 	 * subsequent congestion events, set it to cwnd * beta.
 	 */
 	if (CCV(ccv, snd_ssthresh) == TCP_MAXWIN << TCP_MAX_WINSHIFT)
 		CCV(ccv, snd_ssthresh) = ((u_long)CCV(ccv, snd_cwnd) *
 		    HTCP_MINBETA) >> HTCP_SHIFT;
 	else {
 		htcp_recalc_beta(ccv);
 		CCV(ccv, snd_ssthresh) = ((u_long)CCV(ccv, snd_cwnd) *
 		    htcp_data->beta) >> HTCP_SHIFT;
 	}
 }
 
 
 SYSCTL_DECL(_net_inet_tcp_cc_htcp);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, htcp, CTLFLAG_RW,
     NULL, "H-TCP related settings");
 SYSCTL_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, adaptive_backoff,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(htcp_adaptive_backoff), 0,
     "enable H-TCP adaptive backoff");
 SYSCTL_UINT(_net_inet_tcp_cc_htcp, OID_AUTO, rtt_scaling,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(htcp_rtt_scaling), 0,
     "enable H-TCP RTT scaling");
 
 DECLARE_CC_MODULE(htcp, &htcp_cc_algo);
+MODULE_VERSION(htcp, 1);
Index: stable/12/sys/netinet/cc/cc_newreno.c
===================================================================
--- stable/12/sys/netinet/cc/cc_newreno.c	(revision 364376)
+++ stable/12/sys/netinet/cc/cc_newreno.c	(revision 364377)
@@ -1,401 +1,402 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2007-2008,2010,2014
  *	Swinburne University of Technology, Melbourne, Australia.
  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by Lawrence Stewart, James
  * Healy and David Hayes, made possible in part by a grant from the Cisco
  * University Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced
  * Internet Architectures, Swinburne University of Technology, Melbourne,
  * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * This software was first released in 2007 by James Healy and Lawrence Stewart
  * whilst working on the NewTCP research project at Swinburne University of
  * Technology's Centre for Advanced Internet Architectures, Melbourne,
  * Australia, which was made possible in part by a grant from the Cisco
  * University Research Program Fund at Community Foundation Silicon Valley.
  * More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  *
  * Dec 2014 garmitage@swin.edu.au
  * Borrowed code fragments from cc_cdg.c to add modifiable beta
  * via sysctls.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/vnet.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_var.h>
 #include <netinet/cc/cc.h>
 #include <netinet/cc/cc_module.h>
 #include <netinet/cc/cc_newreno.h>
 
 static MALLOC_DEFINE(M_NEWRENO, "newreno data",
 	"newreno beta values");
 
 static void	newreno_cb_destroy(struct cc_var *ccv);
 static void	newreno_ack_received(struct cc_var *ccv, uint16_t type);
 static void	newreno_after_idle(struct cc_var *ccv);
 static void	newreno_cong_signal(struct cc_var *ccv, uint32_t type);
 static void	newreno_post_recovery(struct cc_var *ccv);
 static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf);
 
 VNET_DEFINE_STATIC(uint32_t, newreno_beta) = 50;
 VNET_DEFINE_STATIC(uint32_t, newreno_beta_ecn) = 80;
 #define V_newreno_beta VNET(newreno_beta)
 #define V_newreno_beta_ecn VNET(newreno_beta_ecn)
 
 struct cc_algo newreno_cc_algo = {
 	.name = "newreno",
 	.cb_destroy = newreno_cb_destroy,
 	.ack_received = newreno_ack_received,
 	.after_idle = newreno_after_idle,
 	.cong_signal = newreno_cong_signal,
 	.post_recovery = newreno_post_recovery,
 	.ctl_output = newreno_ctl_output,
 };
 
 struct newreno {
 	uint32_t beta;
 	uint32_t beta_ecn;
 };
 
 static inline struct newreno *
 newreno_malloc(struct cc_var *ccv)
 {
 	struct newreno *nreno;
 
 	nreno = malloc(sizeof(struct newreno), M_NEWRENO, M_NOWAIT);
 	if (nreno != NULL) {
 		/* NB: nreno is not zeroed, so initialise all fields. */
 		nreno->beta = V_newreno_beta;
 		nreno->beta_ecn = V_newreno_beta_ecn;
 		ccv->cc_data = nreno;
 	}
 
 	return (nreno);
 }
 
 static void
 newreno_cb_destroy(struct cc_var *ccv)
 {
 	free(ccv->cc_data, M_NEWRENO);
 }
 
 static void
 newreno_ack_received(struct cc_var *ccv, uint16_t type)
 {
 	if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
 	    (ccv->flags & CCF_CWND_LIMITED)) {
 		u_int cw = CCV(ccv, snd_cwnd);
 		u_int incr = CCV(ccv, t_maxseg);
 
 		/*
 		 * Regular in-order ACK, open the congestion window.
 		 * Method depends on which congestion control state we're
 		 * in (slow start or cong avoid) and if ABC (RFC 3465) is
 		 * enabled.
 		 *
 		 * slow start: cwnd <= ssthresh
 		 * cong avoid: cwnd > ssthresh
 		 *
 		 * slow start and ABC (RFC 3465):
 		 *   Grow cwnd exponentially by the amount of data
 		 *   ACKed capping the max increment per ACK to
 		 *   (abc_l_var * maxseg) bytes.
 		 *
 		 * slow start without ABC (RFC 5681):
 		 *   Grow cwnd exponentially by maxseg per ACK.
 		 *
 		 * cong avoid and ABC (RFC 3465):
 		 *   Grow cwnd linearly by maxseg per RTT for each
 		 *   cwnd worth of ACKed data.
 		 *
 		 * cong avoid without ABC (RFC 5681):
 		 *   Grow cwnd linearly by approximately maxseg per RTT using
 		 *   maxseg^2 / cwnd per ACK as the increment.
 		 *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
 		 *   avoid capping cwnd.
 		 */
 		if (cw > CCV(ccv, snd_ssthresh)) {
 			if (V_tcp_do_rfc3465) {
 				if (ccv->flags & CCF_ABC_SENTAWND)
 					ccv->flags &= ~CCF_ABC_SENTAWND;
 				else
 					incr = 0;
 			} else
 				incr = max((incr * incr / cw), 1);
 		} else if (V_tcp_do_rfc3465) {
 			/*
 			 * In slow-start with ABC enabled and no RTO in sight?
 			 * (Must not use abc_l_var > 1 if slow starting after
 			 * an RTO. On RTO, snd_nxt = snd_una, so the
 			 * snd_nxt == snd_max check is sufficient to
 			 * handle this).
 			 *
 			 * XXXLAS: Find a way to signal SS after RTO that
 			 * doesn't rely on tcpcb vars.
 			 */
 			if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
 				incr = min(ccv->bytes_this_ack,
 				    ccv->nsegs * V_tcp_abc_l_var *
 				    CCV(ccv, t_maxseg));
 			else
 				incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
 		}
 		/* ABC is on by default, so incr equals 0 frequently. */
 		if (incr > 0)
 			CCV(ccv, snd_cwnd) = min(cw + incr,
 			    TCP_MAXWIN << CCV(ccv, snd_scale));
 	}
 }
 
 static void
 newreno_after_idle(struct cc_var *ccv)
 {
 	int rw;
 
 	/*
 	 * If we've been idle for more than one retransmit timeout the old
 	 * congestion window is no longer current and we have to reduce it to
 	 * the restart window before we can transmit again.
 	 *
 	 * The restart window is the initial window or the last CWND, whichever
 	 * is smaller.
 	 *
 	 * This is done to prevent us from flooding the path with a full CWND at
 	 * wirespeed, overloading router and switch buffers along the way.
 	 *
 	 * See RFC5681 Section 4.1. "Restarting Idle Connections".
 	 */
 	if (V_tcp_do_rfc3390)
 		rw = min(4 * CCV(ccv, t_maxseg),
 		    max(2 * CCV(ccv, t_maxseg), 4380));
 	else
 		rw = CCV(ccv, t_maxseg) * 2;
 
 	CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd));
 }
 
 /*
  * Perform any necessary tasks before we enter congestion recovery.
  */
 static void
 newreno_cong_signal(struct cc_var *ccv, uint32_t type)
 {
 	struct newreno *nreno;
 	uint32_t beta, beta_ecn, cwin, factor;
 	u_int mss;
 
 	cwin = CCV(ccv, snd_cwnd);
 	mss = CCV(ccv, t_maxseg);
 	nreno = ccv->cc_data;
 	beta = (nreno == NULL) ? V_newreno_beta : nreno->beta;
 	beta_ecn = (nreno == NULL) ? V_newreno_beta_ecn : nreno->beta_ecn;
 	if (V_cc_do_abe && type == CC_ECN)
 		factor = beta_ecn;
 	else
 		factor = beta;
 
 	/* Catch algos which mistakenly leak private signal types. */
 	KASSERT((type & CC_SIGPRIVMASK) == 0,
 	    ("%s: congestion signal type 0x%08x is private\n", __func__, type));
 
 	cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss),
 	    2) * mss;
 
 	switch (type) {
 	case CC_NDUPACK:
 		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
 			if (IN_CONGRECOVERY(CCV(ccv, t_flags) &&
 			    V_cc_do_abe && V_cc_abe_frlossreduce)) {
 				CCV(ccv, snd_ssthresh) =
 				    ((uint64_t)CCV(ccv, snd_ssthresh) *
 				    (uint64_t)beta) /
 				    (100ULL * (uint64_t)beta_ecn);
 			}
 			if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
 				CCV(ccv, snd_ssthresh) = cwin;
 			ENTER_RECOVERY(CCV(ccv, t_flags));
 		}
 		break;
 	case CC_ECN:
 		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
 			CCV(ccv, snd_ssthresh) = cwin;
 			CCV(ccv, snd_cwnd) = cwin;
 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
 		}
 		break;
 	}
 }
 
 /*
  * Perform any necessary tasks before we exit congestion recovery.
  */
 static void
 newreno_post_recovery(struct cc_var *ccv)
 {
 	int pipe;
 
 	if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
 		/*
 		 * Fast recovery will conclude after returning from this
 		 * function. Window inflation should have left us with
 		 * approximately snd_ssthresh outstanding data. But in case we
 		 * would be inclined to send a burst, better to do it via the
 		 * slow start mechanism.
 		 *
 		 * XXXLAS: Find a way to do this without needing curack
 		 */
 		if (V_tcp_do_rfc6675_pipe)
 			pipe = tcp_compute_pipe(ccv->ccvc.tcp);
 		else
 			pipe = CCV(ccv, snd_max) - ccv->curack;
 
 		if (pipe < CCV(ccv, snd_ssthresh))
 			/*
 			 * Ensure that cwnd does not collapse to 1 MSS under
 			 * adverse conditons. Implements RFC6582
 			 */
 			CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
 			    CCV(ccv, t_maxseg);
 		else
 			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
 	}
 }
 
 static int
 newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf)
 {
 	struct newreno *nreno;
 	struct cc_newreno_opts *opt;
 
 	if (sopt->sopt_valsize != sizeof(struct cc_newreno_opts))
 		return (EMSGSIZE);
 
 	nreno = ccv->cc_data;
 	opt = buf;
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		/* We cannot set without cc_data memory. */
 		if (nreno == NULL) {
 			nreno = newreno_malloc(ccv);
 			if (nreno == NULL)
 				return (ENOMEM);
 		}
 		switch (opt->name) {
 		case CC_NEWRENO_BETA:
 			nreno->beta = opt->val;
 			break;
 		case CC_NEWRENO_BETA_ECN:
 			if (!V_cc_do_abe)
 				return (EACCES);
 			nreno->beta_ecn = opt->val;
 			break;
 		default:
 			return (ENOPROTOOPT);
 		}
 		break;
 	case SOPT_GET:
 		switch (opt->name) {
 		case CC_NEWRENO_BETA:
 			opt->val = (nreno == NULL) ?
 			    V_newreno_beta : nreno->beta;
 			break;
 		case CC_NEWRENO_BETA_ECN:
 			opt->val = (nreno == NULL) ?
 			    V_newreno_beta_ecn : nreno->beta_ecn;
 			break;
 		default:
 			return (ENOPROTOOPT);
 		}
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static int
 newreno_beta_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = *(uint32_t *)arg1;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL ) {
 		if (arg1 == &VNET_NAME(newreno_beta_ecn) && !V_cc_do_abe)
 			error = EACCES;
 		else if (new == 0 || new > 100)
 			error = EINVAL;
 		else
 			*(uint32_t *)arg1 = new;
 	}
 
 	return (error);
 }
 
 SYSCTL_DECL(_net_inet_tcp_cc_newreno);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, newreno, CTLFLAG_RW, NULL,
     "New Reno related settings");
 
 SYSCTL_PROC(_net_inet_tcp_cc_newreno, OID_AUTO, beta,
 	CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
 	&VNET_NAME(newreno_beta), 3, &newreno_beta_handler, "IU",
 	"New Reno beta, specified as number between 1 and 100");
 
 SYSCTL_PROC(_net_inet_tcp_cc_newreno, OID_AUTO, beta_ecn,
 	CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
 	&VNET_NAME(newreno_beta_ecn), 3, &newreno_beta_handler, "IU",
 	"New Reno beta ecn, specified as number between 1 and 100");
 
 DECLARE_CC_MODULE(newreno, &newreno_cc_algo);
+MODULE_VERSION(newreno, 1);
Index: stable/12/sys/netinet/cc/cc_vegas.c
===================================================================
--- stable/12/sys/netinet/cc/cc_vegas.c	(revision 364376)
+++ stable/12/sys/netinet/cc/cc_vegas.c	(revision 364377)
@@ -1,303 +1,304 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009-2010
  *	Swinburne University of Technology, Melbourne, Australia
  * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2010-2011 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, by David Hayes and
  * Lawrence Stewart, made possible in part by a grant from the Cisco University
  * Research Program Fund at Community Foundation Silicon Valley.
  *
  * Portions of this software were developed at the Centre for Advanced Internet
  * Architectures, Swinburne University of Technology, Melbourne, Australia by
  * David Hayes under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * An implementation of the Vegas congestion control algorithm for FreeBSD,
  * based on L. S. Brakmo and L. L. Peterson, "TCP Vegas: end to end congestion
  * avoidance on a global internet", IEEE J. Sel. Areas Commun., vol. 13, no. 8,
  * pp. 1465-1480, Oct. 1995. The original Vegas duplicate ack policy has not
  * been implemented, since clock ticks are not as coarse as they were (i.e.
  * 500ms) when Vegas was designed. Also, packets are timed once per RTT as in
  * the original paper.
  *
  * Originally released as part of the NewTCP research project at Swinburne
  * University of Technology's Centre for Advanced Internet Architectures,
  * Melbourne, Australia, which was made possible in part by a grant from the
  * Cisco University Research Program Fund at Community Foundation Silicon
  * Valley. More details are available at:
  *   http://caia.swin.edu.au/urp/newtcp/
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/vnet.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/cc/cc.h>
 #include <netinet/cc/cc_module.h>
 
 #include <netinet/khelp/h_ertt.h>
 
 /*
  * Private signal type for rate based congestion signal.
  * See <netinet/cc.h> for appropriate bit-range to use for private signals.
  */
 #define	CC_VEGAS_RATE	0x01000000
 
 static void	vegas_ack_received(struct cc_var *ccv, uint16_t ack_type);
 static void	vegas_cb_destroy(struct cc_var *ccv);
 static int	vegas_cb_init(struct cc_var *ccv);
 static void	vegas_cong_signal(struct cc_var *ccv, uint32_t signal_type);
 static void	vegas_conn_init(struct cc_var *ccv);
 static int	vegas_mod_init(void);
 
 struct vegas {
 	int slow_start_toggle;
 };
 
 static int32_t ertt_id;
 
 VNET_DEFINE_STATIC(uint32_t, vegas_alpha) = 1;
 VNET_DEFINE_STATIC(uint32_t, vegas_beta) = 3;
 #define	V_vegas_alpha	VNET(vegas_alpha)
 #define	V_vegas_beta	VNET(vegas_beta)
 
 static MALLOC_DEFINE(M_VEGAS, "vegas data",
     "Per connection data required for the Vegas congestion control algorithm");
 
 struct cc_algo vegas_cc_algo = {
 	.name = "vegas",
 	.ack_received = vegas_ack_received,
 	.cb_destroy = vegas_cb_destroy,
 	.cb_init = vegas_cb_init,
 	.cong_signal = vegas_cong_signal,
 	.conn_init = vegas_conn_init,
 	.mod_init = vegas_mod_init
 };
 
 /*
  * The vegas window adjustment is done once every RTT, as indicated by the
  * ERTT_NEW_MEASUREMENT flag. This flag is reset once the new measurment data
  * has been used.
  */
 static void
 vegas_ack_received(struct cc_var *ccv, uint16_t ack_type)
 {
 	struct ertt *e_t;
 	struct vegas *vegas_data;
 	long actual_tx_rate, expected_tx_rate, ndiff;
 
 	e_t = khelp_get_osd(CCV(ccv, osd), ertt_id);
 	vegas_data = ccv->cc_data;
 
 	if (e_t->flags & ERTT_NEW_MEASUREMENT) { /* Once per RTT. */
 		if (e_t->minrtt && e_t->markedpkt_rtt) {
 			expected_tx_rate = e_t->marked_snd_cwnd / e_t->minrtt;
 			actual_tx_rate = e_t->bytes_tx_in_marked_rtt /
 			    e_t->markedpkt_rtt;
 			ndiff = (expected_tx_rate - actual_tx_rate) *
 			    e_t->minrtt / CCV(ccv, t_maxseg);
 
 			if (ndiff < V_vegas_alpha) {
 				if (CCV(ccv, snd_cwnd) <=
 				    CCV(ccv, snd_ssthresh)) {
 					vegas_data->slow_start_toggle =
 					    vegas_data->slow_start_toggle ?
 					    0 : 1;
 				} else {
 					vegas_data->slow_start_toggle = 0;
 					CCV(ccv, snd_cwnd) =
 					    min(CCV(ccv, snd_cwnd) +
 					    CCV(ccv, t_maxseg),
 					    TCP_MAXWIN << CCV(ccv, snd_scale));
 				}
 			} else if (ndiff > V_vegas_beta) {
 				/* Rate-based congestion. */
 				vegas_cong_signal(ccv, CC_VEGAS_RATE);
 				vegas_data->slow_start_toggle = 0;
 			}
 		}
 		e_t->flags &= ~ERTT_NEW_MEASUREMENT;
 	}
 
 	if (vegas_data->slow_start_toggle)
 		newreno_cc_algo.ack_received(ccv, ack_type);
 }
 
 static void
 vegas_cb_destroy(struct cc_var *ccv)
 {
 	free(ccv->cc_data, M_VEGAS);
 }
 
 static int
 vegas_cb_init(struct cc_var *ccv)
 {
 	struct vegas *vegas_data;
 
 	vegas_data = malloc(sizeof(struct vegas), M_VEGAS, M_NOWAIT);
 
 	if (vegas_data == NULL)
 		return (ENOMEM);
 
 	vegas_data->slow_start_toggle = 1;
 	ccv->cc_data = vegas_data;
 
 	return (0);
 }
 
 /*
  * If congestion has been triggered triggered by the Vegas measured rates, it is
  * handled here, otherwise it falls back to newreno's congestion handling.
  */
 static void
 vegas_cong_signal(struct cc_var *ccv, uint32_t signal_type)
 {
 	struct vegas *vegas_data;
 	int presignalrecov;
 
 	vegas_data = ccv->cc_data;
 
 	if (IN_RECOVERY(CCV(ccv, t_flags)))
 		presignalrecov = 1;
 	else
 		presignalrecov = 0;
 
 	switch(signal_type) {
 	case CC_VEGAS_RATE:
 		if (!IN_RECOVERY(CCV(ccv, t_flags))) {
 			CCV(ccv, snd_cwnd) = max(2 * CCV(ccv, t_maxseg),
 			    CCV(ccv, snd_cwnd) - CCV(ccv, t_maxseg));
 			if (CCV(ccv, snd_cwnd) < CCV(ccv, snd_ssthresh))
 				/* Exit slow start. */
 				CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
 		}
 		break;
 
 	default:
 		newreno_cc_algo.cong_signal(ccv, signal_type);
 	}
 
 	if (IN_RECOVERY(CCV(ccv, t_flags)) && !presignalrecov)
 		vegas_data->slow_start_toggle =
 		    (CCV(ccv, snd_cwnd) < CCV(ccv, snd_ssthresh)) ? 1 : 0;
 }
 
 static void
 vegas_conn_init(struct cc_var *ccv)
 {
 	struct vegas *vegas_data;
 
 	vegas_data = ccv->cc_data;
 	vegas_data->slow_start_toggle = 1;
 }
 
 static int
 vegas_mod_init(void)
 {
 
 	ertt_id = khelp_get_id("ertt");
 	if (ertt_id <= 0) {
 		printf("%s: h_ertt module not found\n", __func__);
 		return (ENOENT);
 	}
 
 	vegas_cc_algo.after_idle = newreno_cc_algo.after_idle;
 	vegas_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
 
 	return (0);
 }
 
 static int
 vegas_alpha_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_vegas_alpha;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (new == 0 || new > V_vegas_beta)
 			error = EINVAL;
 		else
 			V_vegas_alpha = new;
 	}
 
 	return (error);
 }
 
 static int
 vegas_beta_handler(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_vegas_beta;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (new == 0 || new < V_vegas_alpha)
 			 error = EINVAL;
 		else
 			V_vegas_beta = new;
 	}
 
 	return (error);
 }
 
 SYSCTL_DECL(_net_inet_tcp_cc_vegas);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, vegas, CTLFLAG_RW, NULL,
     "Vegas related settings");
 
 SYSCTL_PROC(_net_inet_tcp_cc_vegas, OID_AUTO, alpha,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
     &VNET_NAME(vegas_alpha), 1, &vegas_alpha_handler, "IU",
     "vegas alpha, specified as number of \"buffers\" (0 < alpha < beta)");
 
 SYSCTL_PROC(_net_inet_tcp_cc_vegas, OID_AUTO, beta,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
     &VNET_NAME(vegas_beta), 3, &vegas_beta_handler, "IU",
     "vegas beta, specified as number of \"buffers\" (0 < alpha < beta)");
 
 DECLARE_CC_MODULE(vegas, &vegas_cc_algo);
+MODULE_VERSION(vegas, 1);
 MODULE_DEPEND(vegas, ertt, 1, 1, 1);
Index: stable/12
===================================================================
--- stable/12	(revision 364376)
+++ stable/12	(revision 364377)

Property changes on: stable/12
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r363380