D32693: tcp: Congestion control cleanup.
D32693.id97602.diff (42 KB)
Index: sys/netinet/cc/cc.h
===================================================================
--- sys/netinet/cc/cc.h
+++ sys/netinet/cc/cc.h
@@ -53,10 +53,11 @@
#ifdef _KERNEL
+MALLOC_DECLARE(M_CC_MEM);
+
/* Global CC vars. */
extern STAILQ_HEAD(cc_head, cc_algo) cc_list;
extern const int tcprexmtthresh;
-extern struct cc_algo newreno_cc_algo;
/* Per-netstack bits. */
VNET_DECLARE(struct cc_algo *, default_cc_ptr);
@@ -139,8 +140,19 @@
/* Cleanup global module state on kldunload. */
int (*mod_destroy)(void);
- /* Init CC state for a new control block. */
- int (*cb_init)(struct cc_var *ccv);
+ /* Return the size of the per-connection state the CC module needs. */
+ size_t (*cc_data_sz)(void);
+
+ /*
+ * Init CC state for a new control block. The CC
+ * module may be passed a NULL ptr, indicating that
+ * it must allocate the memory itself. If it is passed
+ * a non-NULL pointer, that memory was pre-allocated by
+ * the caller and cb_init is expected to use it; in that
+ * case cb_init is not expected to fail, and none of the
+ * currently defined modules do.
+ */
+ int (*cb_init)(struct cc_var *ccv, void *ptr);
/* Cleanup CC state for a terminating control block. */
void (*cb_destroy)(struct cc_var *ccv);
@@ -198,5 +210,16 @@
#define CC_ALGOOPT_LIMIT 2048
+/*
+ * These routines give NewReno behavior to the caller.
+ * They require no state and can be used by any other CC
+ * module that wishes NewReno-type behavior (along with
+ * anything else it may add, before or after the call).
+ */
+void common_cc_post_recovery(struct cc_var *);
+void common_cc_after_idle(struct cc_var *);
+void common_cc_cong_signal(struct cc_var *, uint32_t);
+void common_cc_ack_received(struct cc_var *, uint16_t);
+
#endif /* _KERNEL */
#endif /* _NETINET_CC_CC_H_ */
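For illustration, here is a minimal sketch (not part of this patch) of how a CC module satisfies the new cc_data_sz()/cb_init() contract described in the hunk above. It assumes the usual CC module includes (<netinet/cc/cc.h>, <netinet/cc/cc_module.h>, <sys/malloc.h>), and every foo_* identifier is hypothetical:

struct foo_cc {
	uint32_t shadow_cwnd;	/* hypothetical per-connection state */
};

static size_t
foo_data_sz(void)
{
	/* Report the per-connection state size so callers can pre-allocate it. */
	return (sizeof(struct foo_cc));
}

static int
foo_cb_init(struct cc_var *ccv, void *ptr)
{
	struct foo_cc *fc;

	if (ptr == NULL) {
		/* No pre-allocated memory: allocate here, which may fail. */
		fc = malloc(sizeof(struct foo_cc), M_CC_MEM, M_NOWAIT);
		if (fc == NULL)
			return (ENOMEM);
	} else {
		/* Memory pre-allocated by the caller: this path must not fail. */
		fc = ptr;
	}
	fc->shadow_cwnd = 0;
	ccv->cc_data = fc;
	return (0);
}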
Index: sys/netinet/cc/cc.c
===================================================================
--- sys/netinet/cc/cc.c
+++ sys/netinet/cc/cc.c
@@ -70,11 +70,16 @@
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
+#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
+#include <netinet/tcp_log_buf.h>
+#include <netinet/tcp_hpts.h>
#include <netinet/cc/cc.h>
-
#include <netinet/cc/cc_module.h>
+MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory");
+
+extern struct cc_algo newreno_cc_algo;
/*
* List of available cc algorithms on the current system. First element
* is used as the system default CC algorithm.
@@ -86,6 +91,8 @@
VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo;
+VNET_DECLARE(uint32_t, newreno_beta);
+#define V_newreno_beta VNET(newreno_beta)
/*
* Sysctl handler to show and change the default CC algorithm.
*/
@@ -176,12 +183,13 @@
}
/*
- * Reset the default CC algo to NewReno for any netstack which is using the algo
- * that is about to go away as its default.
+ * Return the number of vnets that are using the
+ * proposed remove_cc algorithm as their default.
*/
-static void
-cc_checkreset_default(struct cc_algo *remove_cc)
+static int
+cc_check_default(struct cc_algo *remove_cc)
{
+ int cnt = 0;
VNET_ITERATOR_DECL(vnet_iter);
CC_LIST_LOCK_ASSERT();
@@ -189,12 +197,15 @@
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- if (strncmp(CC_DEFAULT()->name, remove_cc->name,
- TCP_CA_NAME_MAX) == 0)
- V_default_cc_ptr = &newreno_cc_algo;
+ if (strncmp(CC_DEFAULT()->name,
+ remove_cc->name,
+ TCP_CA_NAME_MAX) == 0) {
+ cnt++;
+ }
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
+ return (cnt);
}
/*
@@ -226,23 +237,17 @@
CC_LIST_WLOCK();
STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
if (funcs == remove_cc) {
- cc_checkreset_default(remove_cc);
- STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
- err = 0;
+ if (cc_check_default(remove_cc)) {
+ err = EBUSY;
+ break;
+ }
+ err = tcp_ccalgounload(remove_cc);
+ if (err == 0)
+ STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
break;
}
}
CC_LIST_WUNLOCK();
-
- if (!err)
- /*
- * XXXLAS:
- * - We may need to handle non-zero return values in future.
- * - If we add CC framework support for protocols other than
- * TCP, we may want a more generic way to handle this step.
- */
- tcp_ccalgounload(remove_cc);
-
return (err);
}
@@ -276,6 +281,193 @@
return (err);
}
+/*
+ * Perform any necessary tasks before we exit congestion recovery.
+ */
+void
+common_cc_post_recovery(struct cc_var *ccv)
+{
+ int pipe;
+
+ if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
+ /*
+ * Fast recovery will conclude after returning from this
+ * function. Window inflation should have left us with
+ * approximately snd_ssthresh outstanding data. But in case we
+ * would be inclined to send a burst, better to do it via the
+ * slow start mechanism.
+ *
+ * XXXLAS: Find a way to do this without needing curack
+ */
+ if (V_tcp_do_newsack)
+ pipe = tcp_compute_pipe(ccv->ccvc.tcp);
+ else
+ pipe = CCV(ccv, snd_max) - ccv->curack;
+ if (pipe < CCV(ccv, snd_ssthresh))
+ /*
+ * Ensure that cwnd does not collapse to 1 MSS under
+ * adverse conditions. Implements RFC 6582.
+ */
+ CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
+ CCV(ccv, t_maxseg);
+ else
+ CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
+ }
+}
+
+void
+common_cc_after_idle(struct cc_var *ccv)
+{
+ uint32_t rw;
+ /*
+ * If we've been idle for more than one retransmit timeout the old
+ * congestion window is no longer current and we have to reduce it to
+ * the restart window before we can transmit again.
+ *
+ * The restart window is the initial window or the last CWND, whichever
+ * is smaller.
+ *
+ * This is done to prevent us from flooding the path with a full CWND at
+ * wirespeed, overloading router and switch buffers along the way.
+ *
+ * See RFC5681 Section 4.1. "Restarting Idle Connections".
+ *
+ * In addition, per RFC2861 Section 2, the ssthresh is set to the
+ * maximum of the former ssthresh or 3/4 of the old cwnd, to
+ * not exit slow-start prematurely.
+ */
+ rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp));
+
+ CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh),
+ CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2));
+
+ CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd));
+}
+
+/*
+ * Perform any necessary tasks before we enter congestion recovery.
+ */
+void
+common_cc_cong_signal(struct cc_var *ccv, uint32_t type)
+{
+ uint32_t cwin, factor;
+ u_int mss;
+
+ cwin = CCV(ccv, snd_cwnd);
+ mss = tcp_fixed_maxseg(ccv->ccvc.tcp);
+ /*
+ * This routine is shared by several CC modules, so it never
+ * looks at any module-private cc_data; the backoff always
+ * uses the system-wide NewReno beta.
+ */
+ factor = V_newreno_beta;
+
+ /* Catch algos which mistakenly leak private signal types. */
+ KASSERT((type & CC_SIGPRIVMASK) == 0,
+ ("%s: congestion signal type 0x%08x is private\n", __func__, type));
+
+ cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss),
+ 2) * mss;
+
+ switch (type) {
+ case CC_NDUPACK:
+ if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
+ if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
+ CCV(ccv, snd_ssthresh) = cwin;
+ ENTER_RECOVERY(CCV(ccv, t_flags));
+ }
+ break;
+ case CC_ECN:
+ if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
+ CCV(ccv, snd_ssthresh) = cwin;
+ CCV(ccv, snd_cwnd) = cwin;
+ ENTER_CONGRECOVERY(CCV(ccv, t_flags));
+ }
+ break;
+ case CC_RTO:
+ CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd),
+ CCV(ccv, snd_cwnd)) / 2 / mss,
+ 2) * mss;
+ CCV(ccv, snd_cwnd) = mss;
+ break;
+ }
+}
+
+void
+common_cc_ack_received(struct cc_var *ccv, uint16_t type)
+{
+ if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
+ (ccv->flags & CCF_CWND_LIMITED)) {
+ u_int cw = CCV(ccv, snd_cwnd);
+ u_int incr = CCV(ccv, t_maxseg);
+
+ /*
+ * Regular in-order ACK, open the congestion window.
+ * Method depends on which congestion control state we're
+ * in (slow start or cong avoid) and if ABC (RFC 3465) is
+ * enabled.
+ *
+ * slow start: cwnd <= ssthresh
+ * cong avoid: cwnd > ssthresh
+ *
+ * slow start and ABC (RFC 3465):
+ * Grow cwnd exponentially by the amount of data
+ * ACKed capping the max increment per ACK to
+ * (abc_l_var * maxseg) bytes.
+ *
+ * slow start without ABC (RFC 5681):
+ * Grow cwnd exponentially by maxseg per ACK.
+ *
+ * cong avoid and ABC (RFC 3465):
+ * Grow cwnd linearly by maxseg per RTT for each
+ * cwnd worth of ACKed data.
+ *
+ * cong avoid without ABC (RFC 5681):
+ * Grow cwnd linearly by approximately maxseg per RTT using
+ * maxseg^2 / cwnd per ACK as the increment.
+ * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
+ * avoid capping cwnd.
+ */
+ if (cw > CCV(ccv, snd_ssthresh)) {
+ if (V_tcp_do_rfc3465) {
+ if (ccv->flags & CCF_ABC_SENTAWND)
+ ccv->flags &= ~CCF_ABC_SENTAWND;
+ else
+ incr = 0;
+ } else
+ incr = max((incr * incr / cw), 1);
+ } else if (V_tcp_do_rfc3465) {
+ /*
+ * In slow-start with ABC enabled and no RTO in sight?
+ * (Must not use abc_l_var > 1 if slow starting after
+ * an RTO. On RTO, snd_nxt = snd_una, so the
+ * snd_nxt == snd_max check is sufficient to
+ * handle this).
+ *
+ * XXXLAS: Find a way to signal SS after RTO that
+ * doesn't rely on tcpcb vars.
+ */
+ uint16_t abc_val;
+
+ if (ccv->flags & CCF_USE_LOCAL_ABC)
+ abc_val = ccv->labc;
+ else
+ abc_val = V_tcp_abc_l_var;
+ if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
+ incr = min(ccv->bytes_this_ack,
+ ccv->nsegs * abc_val *
+ CCV(ccv, t_maxseg));
+ else
+ incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
+
+ }
+ /* ABC is on by default, so incr equals 0 frequently. */
+ if (incr > 0)
+ CCV(ccv, snd_cwnd) = min(cw + incr,
+ TCP_MAXWIN << CCV(ccv, snd_scale));
+ }
+}
+
/*
* Handles kld related events. Returns 0 on success, non-zero on failure.
*/
@@ -290,6 +482,15 @@
switch(event_type) {
case MOD_LOAD:
+ if ((algo->cc_data_sz == NULL) && (algo->cb_init != NULL)) {
+ /*
+ * A module that has a cb_init must also have a
+ * cc_data_sz function; if it needs no per-connection
+ * data it should return 0.
+ */
+ printf("Module Load Fails, it lacks a cc_data_sz() function but has a cb_init()!\n");
+ err = EINVAL;
+ break;
+ }
if (algo->mod_init != NULL)
err = algo->mod_init();
if (!err)
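Two worked examples (not part of the patch) of the arithmetic in the common routines added above. In common_cc_cong_signal(), with the default V_newreno_beta of 50, cwin = 20000 bytes and mss = 1460, the reduction is max((20000 * 50) / (100 * 1460), 2) * 1460 = max(6, 2) * 1460 = 8760 bytes, i.e. roughly half the old window, rounded down to whole segments and never below 2 * mss. In common_cc_ack_received(), during congestion avoidance without ABC, the same connection grows by incr = max(1460 * 1460 / 8760, 1) = 243 bytes per ACK; over the roughly six ACKs covering one congestion window (assuming one ACK per full segment) that adds about one maxseg per RTT.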
Index: sys/netinet/cc/cc_cdg.c
===================================================================
--- sys/netinet/cc/cc_cdg.c
+++ sys/netinet/cc/cc_cdg.c
@@ -197,10 +197,6 @@
32531,32533,32535,32537,32538,32540,32542,32544,32545,32547};
static uma_zone_t qdiffsample_zone;
-
-static MALLOC_DEFINE(M_CDG, "cdg data",
- "Per connection data required for the CDG congestion control algorithm");
-
static int ertt_id;
VNET_DEFINE_STATIC(uint32_t, cdg_alpha_inc);
@@ -222,10 +218,11 @@
static int cdg_mod_init(void);
static int cdg_mod_destroy(void);
static void cdg_conn_init(struct cc_var *ccv);
-static int cdg_cb_init(struct cc_var *ccv);
+static int cdg_cb_init(struct cc_var *ccv, void *ptr);
static void cdg_cb_destroy(struct cc_var *ccv);
static void cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type);
static void cdg_ack_received(struct cc_var *ccv, uint16_t ack_type);
+static size_t cdg_data_sz(void);
struct cc_algo cdg_cc_algo = {
.name = "cdg",
@@ -235,7 +232,10 @@
.cb_init = cdg_cb_init,
.conn_init = cdg_conn_init,
.cong_signal = cdg_cong_signal,
- .mod_destroy = cdg_mod_destroy
+ .mod_destroy = cdg_mod_destroy,
+ .cc_data_sz = cdg_data_sz,
+ .post_recovery = common_cc_post_recovery,
+ .after_idle = common_cc_after_idle,
};
/* Vnet created and being initialised. */
@@ -271,10 +271,6 @@
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK();
-
- cdg_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
- cdg_cc_algo.after_idle = newreno_cc_algo.after_idle;
-
return (0);
}
@@ -286,15 +282,24 @@
return (0);
}
+static size_t
+cdg_data_sz(void)
+{
+ return (sizeof(struct cdg));
+}
+
static int
-cdg_cb_init(struct cc_var *ccv)
+cdg_cb_init(struct cc_var *ccv, void *ptr)
{
struct cdg *cdg_data;
- cdg_data = malloc(sizeof(struct cdg), M_CDG, M_NOWAIT);
- if (cdg_data == NULL)
- return (ENOMEM);
-
+ if (ptr == NULL) {
+ cdg_data = malloc(sizeof(struct cdg), M_CC_MEM, M_NOWAIT);
+ if (cdg_data == NULL)
+ return (ENOMEM);
+ } else {
+ cdg_data = ptr;
+ }
cdg_data->shadow_w = 0;
cdg_data->max_qtrend = 0;
cdg_data->min_qtrend = 0;
@@ -350,7 +355,7 @@
qds = qds_n;
}
- free(ccv->cc_data, M_CDG);
+ free(ccv->cc_data, M_CC_MEM);
}
static int
@@ -484,7 +489,7 @@
ENTER_RECOVERY(CCV(ccv, t_flags));
break;
default:
- newreno_cc_algo.cong_signal(ccv, signal_type);
+ common_cc_cong_signal(ccv, signal_type);
break;
}
}
Index: sys/netinet/cc/cc_chd.c
===================================================================
--- sys/netinet/cc/cc_chd.c
+++ sys/netinet/cc/cc_chd.c
@@ -89,10 +89,11 @@
static void chd_ack_received(struct cc_var *ccv, uint16_t ack_type);
static void chd_cb_destroy(struct cc_var *ccv);
-static int chd_cb_init(struct cc_var *ccv);
+static int chd_cb_init(struct cc_var *ccv, void *ptr);
static void chd_cong_signal(struct cc_var *ccv, uint32_t signal_type);
static void chd_conn_init(struct cc_var *ccv);
static int chd_mod_init(void);
+static size_t chd_data_sz(void);
struct chd {
/*
@@ -126,8 +127,6 @@
#define V_chd_loss_fair VNET(chd_loss_fair)
#define V_chd_use_max VNET(chd_use_max)
-static MALLOC_DEFINE(M_CHD, "chd data",
- "Per connection data required for the CHD congestion control algorithm");
struct cc_algo chd_cc_algo = {
.name = "chd",
@@ -136,7 +135,10 @@
.cb_init = chd_cb_init,
.cong_signal = chd_cong_signal,
.conn_init = chd_conn_init,
- .mod_init = chd_mod_init
+ .mod_init = chd_mod_init,
+ .cc_data_sz = chd_data_sz,
+ .after_idle = common_cc_after_idle,
+ .post_recovery = common_cc_post_recovery,
};
static __inline void
@@ -304,18 +306,26 @@
static void
chd_cb_destroy(struct cc_var *ccv)
{
+ free(ccv->cc_data, M_CC_MEM);
+}
- free(ccv->cc_data, M_CHD);
+static size_t
+chd_data_sz(void)
+{
+ return (sizeof(struct chd));
}
static int
-chd_cb_init(struct cc_var *ccv)
+chd_cb_init(struct cc_var *ccv, void *ptr)
{
struct chd *chd_data;
- chd_data = malloc(sizeof(struct chd), M_CHD, M_NOWAIT);
- if (chd_data == NULL)
- return (ENOMEM);
+ if (ptr == NULL) {
+ chd_data = malloc(sizeof(struct chd), M_CC_MEM, M_NOWAIT);
+ if (chd_data == NULL)
+ return (ENOMEM);
+ } else
+ chd_data = ptr;
chd_data->shadow_w = 0;
ccv->cc_data = chd_data;
@@ -374,7 +384,7 @@
break;
default:
- newreno_cc_algo.cong_signal(ccv, signal_type);
+ common_cc_cong_signal(ccv, signal_type);
}
}
@@ -403,10 +413,6 @@
printf("%s: h_ertt module not found\n", __func__);
return (ENOENT);
}
-
- chd_cc_algo.after_idle = newreno_cc_algo.after_idle;
- chd_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
-
return (0);
}
Index: sys/netinet/cc/cc_cubic.c
===================================================================
--- sys/netinet/cc/cc_cubic.c
+++ sys/netinet/cc/cc_cubic.c
@@ -72,7 +72,7 @@
static void cubic_ack_received(struct cc_var *ccv, uint16_t type);
static void cubic_cb_destroy(struct cc_var *ccv);
-static int cubic_cb_init(struct cc_var *ccv);
+static int cubic_cb_init(struct cc_var *ccv, void *ptr);
static void cubic_cong_signal(struct cc_var *ccv, uint32_t type);
static void cubic_conn_init(struct cc_var *ccv);
static int cubic_mod_init(void);
@@ -80,6 +80,7 @@
static void cubic_record_rtt(struct cc_var *ccv);
static void cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg);
static void cubic_after_idle(struct cc_var *ccv);
+static size_t cubic_data_sz(void);
struct cubic {
/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
@@ -114,9 +115,6 @@
int t_last_cong_prev;
};
-static MALLOC_DEFINE(M_CUBIC, "cubic data",
- "Per connection data required for the CUBIC congestion control algorithm");
-
struct cc_algo cubic_cc_algo = {
.name = "cubic",
.ack_received = cubic_ack_received,
@@ -127,6 +125,7 @@
.mod_init = cubic_mod_init,
.post_recovery = cubic_post_recovery,
.after_idle = cubic_after_idle,
+ .cc_data_sz = cubic_data_sz
};
static void
@@ -149,7 +148,7 @@
if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) ||
cubic_data->min_rtt_ticks == TCPTV_SRTTBASE) {
cubic_data->flags |= CUBICFLAG_IN_SLOWSTART;
- newreno_cc_algo.ack_received(ccv, type);
+ common_cc_ack_received(ccv, type);
} else {
if ((cubic_data->flags & CUBICFLAG_RTO_EVENT) &&
(cubic_data->flags & CUBICFLAG_IN_SLOWSTART)) {
@@ -243,25 +242,33 @@
cubic_data->max_cwnd = ulmax(cubic_data->max_cwnd, CCV(ccv, snd_cwnd));
cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg));
- newreno_cc_algo.after_idle(ccv);
+ common_cc_after_idle(ccv);
cubic_data->t_last_cong = ticks;
}
static void
cubic_cb_destroy(struct cc_var *ccv)
{
- free(ccv->cc_data, M_CUBIC);
+ free(ccv->cc_data, M_CC_MEM);
+}
+
+static size_t
+cubic_data_sz(void)
+{
+ return (sizeof(struct cubic));
}
static int
-cubic_cb_init(struct cc_var *ccv)
+cubic_cb_init(struct cc_var *ccv, void *ptr)
{
struct cubic *cubic_data;
- cubic_data = malloc(sizeof(struct cubic), M_CUBIC, M_NOWAIT|M_ZERO);
-
- if (cubic_data == NULL)
- return (ENOMEM);
+ if (ptr == NULL) {
+ cubic_data = malloc(sizeof(struct cubic), M_CC_MEM, M_NOWAIT|M_ZERO);
+ if (cubic_data == NULL)
+ return (ENOMEM);
+ } else
+ cubic_data = ptr;
/* Init some key variables with sensible defaults. */
cubic_data->t_last_cong = ticks;
Index: sys/netinet/cc/cc_dctcp.c
===================================================================
--- sys/netinet/cc/cc_dctcp.c
+++ sys/netinet/cc/cc_dctcp.c
@@ -76,18 +76,16 @@
uint32_t num_cong_events; /* # of congestion events */
};
-static MALLOC_DEFINE(M_dctcp, "dctcp data",
- "Per connection data required for the dctcp algorithm");
-
static void dctcp_ack_received(struct cc_var *ccv, uint16_t type);
static void dctcp_after_idle(struct cc_var *ccv);
static void dctcp_cb_destroy(struct cc_var *ccv);
-static int dctcp_cb_init(struct cc_var *ccv);
+static int dctcp_cb_init(struct cc_var *ccv, void *ptr);
static void dctcp_cong_signal(struct cc_var *ccv, uint32_t type);
static void dctcp_conn_init(struct cc_var *ccv);
static void dctcp_post_recovery(struct cc_var *ccv);
static void dctcp_ecnpkt_handler(struct cc_var *ccv);
static void dctcp_update_alpha(struct cc_var *ccv);
+static size_t dctcp_data_sz(void);
struct cc_algo dctcp_cc_algo = {
.name = "dctcp",
@@ -99,6 +97,7 @@
.post_recovery = dctcp_post_recovery,
.ecnpkt_handler = dctcp_ecnpkt_handler,
.after_idle = dctcp_after_idle,
+ .cc_data_sz = dctcp_data_sz,
};
static void
@@ -117,10 +116,10 @@
*/
if (IN_CONGRECOVERY(CCV(ccv, t_flags))) {
EXIT_CONGRECOVERY(CCV(ccv, t_flags));
- newreno_cc_algo.ack_received(ccv, type);
+ common_cc_ack_received(ccv, type);
ENTER_CONGRECOVERY(CCV(ccv, t_flags));
} else
- newreno_cc_algo.ack_received(ccv, type);
+ common_cc_ack_received(ccv, type);
if (type == CC_DUPACK)
bytes_acked = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
@@ -158,7 +157,13 @@
SEQ_GT(ccv->curack, dctcp_data->save_sndnxt))
dctcp_update_alpha(ccv);
} else
- newreno_cc_algo.ack_received(ccv, type);
+ common_cc_ack_received(ccv, type);
+}
+
+static size_t
+dctcp_data_sz(void)
+{
+ return (sizeof(struct dctcp));
}
static void
@@ -179,25 +184,26 @@
dctcp_data->num_cong_events = 0;
}
- newreno_cc_algo.after_idle(ccv);
+ common_cc_after_idle(ccv);
}
static void
dctcp_cb_destroy(struct cc_var *ccv)
{
- free(ccv->cc_data, M_dctcp);
+ free(ccv->cc_data, M_CC_MEM);
}
static int
-dctcp_cb_init(struct cc_var *ccv)
+dctcp_cb_init(struct cc_var *ccv, void *ptr)
{
struct dctcp *dctcp_data;
- dctcp_data = malloc(sizeof(struct dctcp), M_dctcp, M_NOWAIT|M_ZERO);
-
- if (dctcp_data == NULL)
- return (ENOMEM);
-
+ if (ptr == NULL) {
+ dctcp_data = malloc(sizeof(struct dctcp), M_CC_MEM, M_NOWAIT|M_ZERO);
+ if (dctcp_data == NULL)
+ return (ENOMEM);
+ } else
+ dctcp_data = ptr;
/* Initialize some key variables with sensible defaults. */
dctcp_data->bytes_ecn = 0;
dctcp_data->bytes_total = 0;
@@ -292,7 +298,7 @@
break;
}
} else
- newreno_cc_algo.cong_signal(ccv, type);
+ common_cc_cong_signal(ccv, type);
}
static void
@@ -312,7 +318,7 @@
static void
dctcp_post_recovery(struct cc_var *ccv)
{
- newreno_cc_algo.post_recovery(ccv);
+ common_cc_post_recovery(ccv);
if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT)
dctcp_update_alpha(ccv);
Index: sys/netinet/cc/cc_hd.c
===================================================================
--- sys/netinet/cc/cc_hd.c
+++ sys/netinet/cc/cc_hd.c
@@ -84,6 +84,7 @@
static void hd_ack_received(struct cc_var *ccv, uint16_t ack_type);
static int hd_mod_init(void);
+static size_t hd_data_sz(void);
static int ertt_id;
@@ -97,9 +98,19 @@
struct cc_algo hd_cc_algo = {
.name = "hd",
.ack_received = hd_ack_received,
- .mod_init = hd_mod_init
+ .mod_init = hd_mod_init,
+ .cc_data_sz = hd_data_sz,
+ .after_idle = common_cc_after_idle,
+ .cong_signal = common_cc_cong_signal,
+ .post_recovery = common_cc_post_recovery,
};
+static size_t
+hd_data_sz(void)
+{
+ return (0);
+}
+
/*
* Hamilton backoff function. Returns 1 if we should backoff or 0 otherwise.
*/
@@ -150,14 +161,14 @@
* half cwnd and behave like an ECN (ie
* not a packet loss).
*/
- newreno_cc_algo.cong_signal(ccv,
+ common_cc_cong_signal(ccv,
CC_ECN);
return;
}
}
}
}
- newreno_cc_algo.ack_received(ccv, ack_type); /* As for NewReno. */
+ common_cc_ack_received(ccv, ack_type);
}
static int
@@ -169,11 +180,6 @@
printf("%s: h_ertt module not found\n", __func__);
return (ENOENT);
}
-
- hd_cc_algo.after_idle = newreno_cc_algo.after_idle;
- hd_cc_algo.cong_signal = newreno_cc_algo.cong_signal;
- hd_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
-
return (0);
}
Index: sys/netinet/cc/cc_htcp.c
===================================================================
--- sys/netinet/cc/cc_htcp.c
+++ sys/netinet/cc/cc_htcp.c
@@ -137,7 +137,7 @@
static void htcp_ack_received(struct cc_var *ccv, uint16_t type);
static void htcp_cb_destroy(struct cc_var *ccv);
-static int htcp_cb_init(struct cc_var *ccv);
+static int htcp_cb_init(struct cc_var *ccv, void *ptr);
static void htcp_cong_signal(struct cc_var *ccv, uint32_t type);
static int htcp_mod_init(void);
static void htcp_post_recovery(struct cc_var *ccv);
@@ -145,6 +145,7 @@
static void htcp_recalc_beta(struct cc_var *ccv);
static void htcp_record_rtt(struct cc_var *ccv);
static void htcp_ssthresh_update(struct cc_var *ccv);
+static size_t htcp_data_sz(void);
struct htcp {
/* cwnd before entering cong recovery. */
@@ -175,9 +176,6 @@
#define V_htcp_adaptive_backoff VNET(htcp_adaptive_backoff)
#define V_htcp_rtt_scaling VNET(htcp_rtt_scaling)
-static MALLOC_DEFINE(M_HTCP, "htcp data",
- "Per connection data required for the HTCP congestion control algorithm");
-
struct cc_algo htcp_cc_algo = {
.name = "htcp",
.ack_received = htcp_ack_received,
@@ -186,6 +184,8 @@
.cong_signal = htcp_cong_signal,
.mod_init = htcp_mod_init,
.post_recovery = htcp_post_recovery,
+ .cc_data_sz = htcp_data_sz,
+ .after_idle = common_cc_after_idle,
};
static void
@@ -214,7 +214,7 @@
*/
if (htcp_data->alpha == 1 ||
CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh))
- newreno_cc_algo.ack_received(ccv, type);
+ common_cc_ack_received(ccv, type);
else {
if (V_tcp_do_rfc3465) {
/* Increment cwnd by alpha segments. */
@@ -238,18 +238,26 @@
static void
htcp_cb_destroy(struct cc_var *ccv)
{
- free(ccv->cc_data, M_HTCP);
+ free(ccv->cc_data, M_CC_MEM);
+}
+
+static size_t
+htcp_data_sz(void)
+{
+ return (sizeof(struct htcp));
}
static int
-htcp_cb_init(struct cc_var *ccv)
+htcp_cb_init(struct cc_var *ccv, void *ptr)
{
struct htcp *htcp_data;
- htcp_data = malloc(sizeof(struct htcp), M_HTCP, M_NOWAIT);
-
- if (htcp_data == NULL)
- return (ENOMEM);
+ if (ptr == NULL) {
+ htcp_data = malloc(sizeof(struct htcp), M_CC_MEM, M_NOWAIT);
+ if (htcp_data == NULL)
+ return (ENOMEM);
+ } else
+ htcp_data = ptr;
/* Init some key variables with sensible defaults. */
htcp_data->alpha = HTCP_INIT_ALPHA;
@@ -333,16 +341,12 @@
static int
htcp_mod_init(void)
{
-
- htcp_cc_algo.after_idle = newreno_cc_algo.after_idle;
-
/*
* HTCP_RTT_REF is defined in ms, and t_srtt in the tcpcb is stored in
* units of TCP_RTT_SCALE*hz. Scale HTCP_RTT_REF to be in the same units
* as t_srtt.
*/
htcp_rtt_ref = (HTCP_RTT_REF * TCP_RTT_SCALE * hz) / 1000;
-
return (0);
}
Index: sys/netinet/cc/cc_newreno.c
===================================================================
--- sys/netinet/cc/cc_newreno.c
+++ sys/netinet/cc/cc_newreno.c
@@ -82,18 +82,15 @@
#include <netinet/cc/cc_module.h>
#include <netinet/cc/cc_newreno.h>
-static MALLOC_DEFINE(M_NEWRENO, "newreno data",
- "newreno beta values");
-
static void newreno_cb_destroy(struct cc_var *ccv);
static void newreno_ack_received(struct cc_var *ccv, uint16_t type);
static void newreno_after_idle(struct cc_var *ccv);
static void newreno_cong_signal(struct cc_var *ccv, uint32_t type);
-static void newreno_post_recovery(struct cc_var *ccv);
static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf);
static void newreno_newround(struct cc_var *ccv, uint32_t round_cnt);
static void newreno_rttsample(struct cc_var *ccv, uint32_t usec_rtt, uint32_t rxtcnt, uint32_t fas);
-static int newreno_cb_init(struct cc_var *ccv);
+static int newreno_cb_init(struct cc_var *ccv, void *);
+static size_t newreno_data_sz(void);
VNET_DEFINE(uint32_t, newreno_beta) = 50;
VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80;
@@ -106,11 +103,12 @@
.ack_received = newreno_ack_received,
.after_idle = newreno_after_idle,
.cong_signal = newreno_cong_signal,
- .post_recovery = newreno_post_recovery,
+ .post_recovery = common_cc_post_recovery,
.ctl_output = newreno_ctl_output,
.newround = newreno_newround,
.rttsample = newreno_rttsample,
.cb_init = newreno_cb_init,
+ .cc_data_sz = newreno_data_sz,
};
static uint32_t hystart_lowcwnd = 16;
@@ -167,14 +165,23 @@
}
}
+static size_t
+newreno_data_sz(void)
+{
+ return (sizeof(struct newreno));
+}
+
static int
-newreno_cb_init(struct cc_var *ccv)
+newreno_cb_init(struct cc_var *ccv, void *ptr)
{
struct newreno *nreno;
- ccv->cc_data = malloc(sizeof(struct newreno), M_NEWRENO, M_NOWAIT);
- if (ccv->cc_data == NULL)
- return (ENOMEM);
+ if (ptr == NULL) {
+ ccv->cc_data = malloc(sizeof(struct newreno), M_CC_MEM, M_NOWAIT);
+ if (ccv->cc_data == NULL)
+ return (ENOMEM);
+ } else
+ ccv->cc_data = ptr;
nreno = (struct newreno *)ccv->cc_data;
/* NB: nreno is not zeroed, so initialise all fields. */
nreno->beta = V_newreno_beta;
@@ -201,7 +208,7 @@
static void
newreno_cb_destroy(struct cc_var *ccv)
{
- free(ccv->cc_data, M_NEWRENO);
+ free(ccv->cc_data, M_CC_MEM);
}
static void
@@ -209,13 +216,7 @@
{
struct newreno *nreno;
- /*
- * Other TCP congestion controls use newreno_ack_received(), but
- * with their own private cc_data. Make sure the cc_data is used
- * correctly.
- */
- nreno = (CC_ALGO(ccv->ccvc.tcp) == &newreno_cc_algo) ? ccv->cc_data : NULL;
-
+ nreno = ccv->cc_data;
if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
(ccv->flags & CCF_CWND_LIMITED)) {
u_int cw = CCV(ccv, snd_cwnd);
@@ -249,8 +250,7 @@
* avoid capping cwnd.
*/
if (cw > CCV(ccv, snd_ssthresh)) {
- if ((nreno != NULL) &&
- (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS)) {
+ if (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) {
/*
* We have slipped into CA with
* CSS active. Deactivate all.
@@ -284,8 +284,7 @@
abc_val = ccv->labc;
else
abc_val = V_tcp_abc_l_var;
- if ((nreno != NULL) &&
- (nreno->newreno_flags & CC_NEWRENO_HYSTART_ALLOWED) &&
+ if ((nreno->newreno_flags & CC_NEWRENO_HYSTART_ALLOWED) &&
(nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) &&
((nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) == 0)) {
/*
@@ -323,8 +322,7 @@
incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
/* Only if Hystart is enabled will the flag get set */
- if ((nreno != NULL) &&
- (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS)) {
+ if (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) {
incr /= hystart_css_growth_div;
newreno_log_hystart_event(ccv, nreno, 3, incr);
}
@@ -340,39 +338,10 @@
newreno_after_idle(struct cc_var *ccv)
{
struct newreno *nreno;
- uint32_t rw;
-
- /*
- * Other TCP congestion controls use newreno_after_idle(), but
- * with their own private cc_data. Make sure the cc_data is used
- * correctly.
- */
- nreno = (CC_ALGO(ccv->ccvc.tcp) == &newreno_cc_algo) ? ccv->cc_data : NULL;
- /*
- * If we've been idle for more than one retransmit timeout the old
- * congestion window is no longer current and we have to reduce it to
- * the restart window before we can transmit again.
- *
- * The restart window is the initial window or the last CWND, whichever
- * is smaller.
- *
- * This is done to prevent us from flooding the path with a full CWND at
- * wirespeed, overloading router and switch buffers along the way.
- *
- * See RFC5681 Section 4.1. "Restarting Idle Connections".
- *
- * In addition, per RFC2861 Section 2, the ssthresh is set to the
- * maximum of the former ssthresh or 3/4 of the old cwnd, to
- * not exit slow-start prematurely.
- */
- rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp));
- CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh),
- CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2));
-
- CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd));
- if ((nreno != NULL) &&
- (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) == 0) {
+ nreno = ccv->cc_data;
+ common_cc_after_idle(ccv);
+ if ((nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) == 0) {
if (CCV(ccv, snd_cwnd) <= (hystart_lowcwnd * tcp_fixed_maxseg(ccv->ccvc.tcp))) {
/*
* Re-enable hystart if our cwnd has fallen below
@@ -396,12 +365,7 @@
cwin = CCV(ccv, snd_cwnd);
mss = tcp_fixed_maxseg(ccv->ccvc.tcp);
- /*
- * Other TCP congestion controls use newreno_cong_signal(), but
- * with their own private cc_data. Make sure the cc_data is used
- * correctly.
- */
- nreno = (CC_ALGO(ccv->ccvc.tcp) == &newreno_cc_algo) ? ccv->cc_data : NULL;
+ nreno = ccv->cc_data;
beta = (nreno == NULL) ? V_newreno_beta : nreno->beta;;
beta_ecn = (nreno == NULL) ? V_newreno_beta_ecn : nreno->beta_ecn;
/*
@@ -426,8 +390,7 @@
switch (type) {
case CC_NDUPACK:
- if ((nreno != NULL) &&
- (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED)) {
+ if (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) {
/* Make sure the flags are all off we had a loss */
nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_ENABLED;
nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS;
@@ -445,8 +408,7 @@
}
break;
case CC_ECN:
- if ((nreno != NULL) &&
- (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED)) {
+ if (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) {
/* Make sure the flags are all off we had a loss */
nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_ENABLED;
nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS;
@@ -466,41 +428,6 @@
}
}
-/*
- * Perform any necessary tasks before we exit congestion recovery.
- */
-static void
-newreno_post_recovery(struct cc_var *ccv)
-{
- int pipe;
-
- if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
- /*
- * Fast recovery will conclude after returning from this
- * function. Window inflation should have left us with
- * approximately snd_ssthresh outstanding data. But in case we
- * would be inclined to send a burst, better to do it via the
- * slow start mechanism.
- *
- * XXXLAS: Find a way to do this without needing curack
- */
- if (V_tcp_do_newsack)
- pipe = tcp_compute_pipe(ccv->ccvc.tcp);
- else
- pipe = CCV(ccv, snd_max) - ccv->curack;
-
- if (pipe < CCV(ccv, snd_ssthresh))
- /*
- * Ensure that cwnd does not collapse to 1 MSS under
- * adverse conditons. Implements RFC6582
- */
- CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
- CCV(ccv, t_maxseg);
- else
- CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
- }
-}
-
static int
newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf)
{
Index: sys/netinet/cc/cc_vegas.c
===================================================================
--- sys/netinet/cc/cc_vegas.c
+++ sys/netinet/cc/cc_vegas.c
@@ -87,10 +87,11 @@
static void vegas_ack_received(struct cc_var *ccv, uint16_t ack_type);
static void vegas_cb_destroy(struct cc_var *ccv);
-static int vegas_cb_init(struct cc_var *ccv);
+static int vegas_cb_init(struct cc_var *ccv, void *ptr);
static void vegas_cong_signal(struct cc_var *ccv, uint32_t signal_type);
static void vegas_conn_init(struct cc_var *ccv);
static int vegas_mod_init(void);
+static size_t vegas_data_sz(void);
struct vegas {
int slow_start_toggle;
@@ -103,9 +104,6 @@
#define V_vegas_alpha VNET(vegas_alpha)
#define V_vegas_beta VNET(vegas_beta)
-static MALLOC_DEFINE(M_VEGAS, "vegas data",
- "Per connection data required for the Vegas congestion control algorithm");
-
struct cc_algo vegas_cc_algo = {
.name = "vegas",
.ack_received = vegas_ack_received,
@@ -113,7 +111,10 @@
.cb_init = vegas_cb_init,
.cong_signal = vegas_cong_signal,
.conn_init = vegas_conn_init,
- .mod_init = vegas_mod_init
+ .mod_init = vegas_mod_init,
+ .cc_data_sz = vegas_data_sz,
+ .after_idle = common_cc_after_idle,
+ .post_recovery = common_cc_post_recovery,
};
/*
@@ -162,24 +163,32 @@
}
if (vegas_data->slow_start_toggle)
- newreno_cc_algo.ack_received(ccv, ack_type);
+ common_cc_ack_received(ccv, ack_type);
}
static void
vegas_cb_destroy(struct cc_var *ccv)
{
- free(ccv->cc_data, M_VEGAS);
+ free(ccv->cc_data, M_CC_MEM);
+}
+
+static size_t
+vegas_data_sz(void)
+{
+ return (sizeof(struct vegas));
}
static int
-vegas_cb_init(struct cc_var *ccv)
+vegas_cb_init(struct cc_var *ccv, void *ptr)
{
struct vegas *vegas_data;
- vegas_data = malloc(sizeof(struct vegas), M_VEGAS, M_NOWAIT);
-
- if (vegas_data == NULL)
- return (ENOMEM);
+ if (ptr == NULL) {
+ vegas_data = malloc(sizeof(struct vegas), M_CC_MEM, M_NOWAIT);
+ if (vegas_data == NULL)
+ return (ENOMEM);
+ } else
+ vegas_data = ptr;
vegas_data->slow_start_toggle = 1;
ccv->cc_data = vegas_data;
@@ -216,7 +225,7 @@
break;
default:
- newreno_cc_algo.cong_signal(ccv, signal_type);
+ common_cc_cong_signal(ccv, signal_type);
}
if (IN_RECOVERY(CCV(ccv, t_flags)) && !presignalrecov)
@@ -236,16 +245,11 @@
static int
vegas_mod_init(void)
{
-
ertt_id = khelp_get_id("ertt");
if (ertt_id <= 0) {
printf("%s: h_ertt module not found\n", __func__);
return (ENOENT);
}
-
- vegas_cc_algo.after_idle = newreno_cc_algo.after_idle;
- vegas_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
-
return (0);
}
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -2147,7 +2147,7 @@
tp->t_inpcb = inp;
if (CC_ALGO(tp)->cb_init != NULL)
- if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
+ if (CC_ALGO(tp)->cb_init(tp->ccv, NULL) > 0) {
if (tp->t_fb->tfb_tcp_fb_fini)
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
in_pcbrele_wlocked(inp);
@@ -2240,25 +2240,23 @@
}
/*
- * Switch the congestion control algorithm back to NewReno for any active
- * control blocks using an algorithm which is about to go away.
- * This ensures the CC framework can allow the unload to proceed without leaving
- * any dangling pointers which would trigger a panic.
- * Returning non-zero would inform the CC framework that something went wrong
- * and it would be unsafe to allow the unload to proceed. However, there is no
- * way for this to occur with this implementation so we always return zero.
+ * Switch the congestion control algorithm back to the vnet default for any
+ * active control blocks using an algorithm which is about to go away. If the
+ * new default algorithm has a cb_init function and it fails (out of memory),
+ * the operation fails and the unload will not succeed.
*/
int
tcp_ccalgounload(struct cc_algo *unload_algo)
{
- struct cc_algo *tmpalgo;
+ struct cc_algo *oldalgo, *newalgo;
struct inpcb *inp;
struct tcpcb *tp;
VNET_ITERATOR_DECL(vnet_iter);
/*
* Check all active control blocks across all network stacks and change
- * any that are using "unload_algo" back to NewReno. If "unload_algo"
+ * any that are using "unload_algo" back to its default. If "unload_algo"
* requires cleanup code to be run, call it.
*/
VNET_LIST_RLOCK();
@@ -2272,6 +2270,7 @@
* therefore don't enter the loop below until the connection
* list has stabilised.
*/
+ newalgo = CC_DEFAULT();
CK_LIST_FOREACH(inp, &V_tcb, inp_list) {
INP_WLOCK(inp);
/* Important to skip tcptw structs. */
@@ -2280,24 +2279,48 @@
/*
* By holding INP_WLOCK here, we are assured
* that the connection is not currently
- * executing inside the CC module's functions
- * i.e. it is safe to make the switch back to
- * NewReno.
+ * executing inside the CC module's functions.
+ * We attempt to switch to the vnet's default;
+ * if its cb_init fails, the whole operation
+ * fails and the module unload will not succeed.
*/
if (CC_ALGO(tp) == unload_algo) {
- tmpalgo = CC_ALGO(tp);
- if (tmpalgo->cb_destroy != NULL)
- tmpalgo->cb_destroy(tp->ccv);
- CC_DATA(tp) = NULL;
- /*
- * NewReno may allocate memory on
- * demand for certain stateful
- * configuration as needed, but is
- * coded to never fail on memory
- * allocation failure so it is a safe
- * fallback.
- */
- CC_ALGO(tp) = &newreno_cc_algo;
+ struct cc_var cc_mem;
+ int err;
+
+ oldalgo = CC_ALGO(tp);
+ memset(&cc_mem, 0, sizeof(cc_mem));
+ cc_mem.ccvc.tcp = tp;
+ if (newalgo->cb_init == NULL) {
+ /*
+ * No cb_init, so we can skip the
+ * dance around a possible allocation failure.
+ */
+ CC_DATA(tp) = NULL;
+ goto proceed;
+ }
+ err = (newalgo->cb_init)(&cc_mem, NULL);
+ if (err) {
+ /*
+ * Presumably out of memory; the caller
+ * will need to try again.
+ */
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ VNET_LIST_RUNLOCK();
+ return (err);
+ }
+proceed:
+ if (oldalgo->cb_destroy != NULL)
+ oldalgo->cb_destroy(tp->ccv);
+ CC_ALGO(tp) = newalgo;
+ memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var));
+ if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (CC_ALGO(tp)->conn_init != NULL)) {
+ /* Yep run the connection init for the new CC */
+ CC_ALGO(tp)->conn_init(tp->ccv);
+ }
}
}
INP_WUNLOCK(inp);
@@ -2306,7 +2329,6 @@
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK();
-
return (0);
}
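A usage note on the behavior change above (together with the EBUSY check added to cc_deregister_algo() earlier in this diff): a module can no longer be unloaded while any vnet still has it as its default. The default first has to be pointed at another loaded module (via the existing net.inet.tcp.cc.algorithm sysctl); active connections using the departing module are then migrated to that vnet default rather than being hard-wired back to NewReno, and the unload fails cleanly if the new default's cb_init() cannot allocate memory for one of them.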
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -2007,6 +2007,8 @@
}
#endif
+extern struct cc_algo newreno_cc_algo;
+
int
tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
{
@@ -2223,48 +2225,103 @@
break;
case TCP_CONGESTION:
+ {
+ struct cc_var cc_mem;
+ size_t mem_sz;
+ void *ptr = NULL;
+
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
if (error)
break;
buf[sopt->sopt_valsize] = '\0';
- INP_WLOCK_RECHECK(inp);
CC_LIST_RLOCK();
STAILQ_FOREACH(algo, &cc_list, entries)
if (strncmp(buf, algo->name,
TCP_CA_NAME_MAX) == 0)
break;
- CC_LIST_RUNLOCK();
if (algo == NULL) {
- INP_WUNLOCK(inp);
+ CC_LIST_RUNLOCK();
error = EINVAL;
break;
}
+do_over:
+ if (algo->cb_init != NULL) {
+ /* We can now pre-get the memory for the CC */
+ CC_LIST_RUNLOCK();
+ mem_sz = (*algo->cc_data_sz)();
+ if (mem_sz == 0)
+ goto no_mem_needed;
+ ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK);
+ CC_LIST_RLOCK();
+ STAILQ_FOREACH(algo, &cc_list, entries)
+ if (strncmp(buf, algo->name,
+ TCP_CA_NAME_MAX) == 0)
+ break;
+ if (algo == NULL) {
+ if (ptr)
+ free(ptr, M_CC_MEM);
+ CC_LIST_RUNLOCK();
+ error = EINVAL;
+ break;
+ }
+ } else {
+no_mem_needed:
+ mem_sz = 0;
+ ptr = NULL;
+ }
/*
- * We hold a write lock over the tcb so it's safe to
- * do these things without ordering concerns.
+ * Make sure it is all clean and zeroed, and also
+ * re-acquire the INP write lock.
*/
- if (CC_ALGO(tp)->cb_destroy != NULL)
- CC_ALGO(tp)->cb_destroy(tp->ccv);
- CC_DATA(tp) = NULL;
- CC_ALGO(tp) = algo;
+ memset(&cc_mem, 0, sizeof(cc_mem));
+ if (mem_sz != (*algo->cc_data_sz)()) {
+ if (ptr)
+ free(ptr, M_CC_MEM);
+ goto do_over;
+ }
+ if (ptr) {
+ memset(ptr, 0, mem_sz);
+ INP_WLOCK_RECHECK_CLEANUP(inp, free(ptr, M_CC_MEM));
+ } else
+ INP_WLOCK_RECHECK(inp);
+ CC_LIST_RUNLOCK();
+ cc_mem.ccvc.tcp = tp;
+ /*
+ * We once again hold a write lock over the tcb so it's
+ * safe to do these things without ordering concerns.
+ * Note here we init into stack memory.
+ */
+ if (algo->cb_init != NULL)
+ error = algo->cb_init(&cc_mem, ptr);
+ else
+ error = 0;
/*
- * If something goes pear shaped initialising the new
- * algo, fall back to newreno (which does not
- * require initialisation).
+ * The CC algorithms, when given their memory,
+ * should not fail; we could in theory have a
+ * KASSERT here.
*/
- if (algo->cb_init != NULL &&
- algo->cb_init(tp->ccv) != 0) {
- CC_ALGO(tp) = &newreno_cc_algo;
+ if (error == 0) {
/*
- * The only reason init should fail is
- * because of malloc.
+ * Touchdown, let's go ahead and move the
+ * connection to the new CC module by
+ * copying in the cc_mem after we call
+ * the old one's cleanup (if any).
*/
- error = ENOMEM;
- }
+ if (CC_ALGO(tp)->cb_destroy != NULL)
+ CC_ALGO(tp)->cb_destroy(tp->ccv);
+ memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var));
+ tp->cc_algo = algo;
+ /* Has the connection already passed the point where conn_init runs? */
+ if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) {
+ /* Yep run the connection init for the new CC */
+ CC_ALGO(tp)->conn_init(tp->ccv);
+ }
+ } else if (ptr)
+ free(ptr, M_CC_MEM);
INP_WUNLOCK(inp);
break;
-
+ }
case TCP_REUSPORT_LB_NUMA:
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, &optval, sizeof(optval),
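For context, a minimal userland sketch (not part of the patch) of what drives the TCP_CONGESTION path above; it uses only existing FreeBSD headers and assumes fd is an already-created TCP socket:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>

/*
 * Ask the kernel to switch the socket to the named CC module, e.g.
 * set_cc(fd, "cubic"). On failure -1 is returned with errno set;
 * EINVAL means no module of that name is loaded.
 */
static int
set_cc(int fd, const char *name)
{
	char buf[TCP_CA_NAME_MAX];

	strlcpy(buf, name, sizeof(buf));
	return (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, buf, strlen(buf)));
}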