sys/netinet/cc/cc.c

[… 64 lines elided …]
#include <sys/socketvar.h>
#include <sys/sysctl.h>

#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_hpts.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_module.h>

MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory");

extern struct cc_algo newreno_cc_algo;

/*
 * List of available cc algorithms on the current system. First element
 * is used as the system default CC algorithm.
 */
struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);

/* Protects the cc_list STAILQ. */
struct rwlock cc_list_lock;

VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo;

VNET_DECLARE(uint32_t, newreno_beta);
#define V_newreno_beta VNET(newreno_beta)

/*
 * Sysctl handler to show and change the default CC algorithm.
 */
static int
cc_default_algo(SYSCTL_HANDLER_ARGS)
{
    char default_cc[TCP_CA_NAME_MAX];
    struct cc_algo *funcs;

[… 131 lines elided …]

    STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
        if (funcs == remove_cc) {
            cc_checkreset_default(remove_cc);
            STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
            err = 0;
            break;
        }
    }
    CC_LIST_WUNLOCK();

lstewart: I don't want to lose the ability to (forcibly) unload a module which
is in use as a stack-configured default or by active connections. I'm ok with
refusing to unload when in use if a normal kldunload is called, but
kldunload -f should be a workable option and leave the system in a
sane/functional state.
rrs: I believe a -f will still work but you will likely crash the system. I
have seen this with the… I do *not* think it is unreasonable to insist that
you have enough brains to move the default to… As to tcp_ccalgounload()
failing, which it can (if the new default can't get memory), being… This is
mainly a developer thing and I really think it is not unreasonable to let you
do it twice or three times…

    if (!err)
        /*
         * XXXLAS:
         * - We may need to handle non-zero return values in future.
         * - If we add CC framework support for protocols other than
         *   TCP, we may want a more generic way to handle this step.
         */
        tcp_ccalgounload(remove_cc);

[… 27 lines elided …]

    if (!err)
        STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
    CC_LIST_WUNLOCK();

    return (err);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
void
common_cc_post_recovery(struct cc_var *ccv)
{
    int pipe;

    if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
        /*
         * Fast recovery will conclude after returning from this
         * function. Window inflation should have left us with
         * approximately snd_ssthresh outstanding data. But in case we
         * would be inclined to send a burst, better to do it via the
         * slow start mechanism.
         *
         * XXXLAS: Find a way to do this without needing curack
         */
        if (V_tcp_do_newsack)
            pipe = tcp_compute_pipe(ccv->ccvc.tcp);
        else
            pipe = CCV(ccv, snd_max) - ccv->curack;
        if (pipe < CCV(ccv, snd_ssthresh))
            /*
             * Ensure that cwnd does not collapse to 1 MSS under
             * adverse conditions. Implements RFC6582.
             */
            CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
                CCV(ccv, t_maxseg);
        else
            CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
    }
}
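
/*
 * Illustrative note for common_cc_post_recovery() above, with hypothetical
 * numbers: if snd_ssthresh = 20000, t_maxseg = 1460 and the computed pipe
 * is 10000 bytes, then pipe < ssthresh, so cwnd becomes
 * max(10000, 1460) + 1460 = 11460 and the window grows back towards
 * ssthresh via slow start instead of being released as one burst.
 */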

void
common_cc_after_idle(struct cc_var *ccv)
{
    uint32_t rw;

    /*
     * If we've been idle for more than one retransmit timeout the old
     * congestion window is no longer current and we have to reduce it to
     * the restart window before we can transmit again.
     *
     * The restart window is the initial window or the last CWND, whichever
     * is smaller.
     *
     * This is done to prevent us from flooding the path with a full CWND
     * at wirespeed, overloading router and switch buffers along the way.
     *
     * See RFC5681 Section 4.1. "Restarting Idle Connections".
     *
     * In addition, per RFC2861 Section 2, ssthresh is set to the maximum
     * of its former value or 3/4 of the old cwnd, to avoid exiting
     * slow start prematurely.
     */
    rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp));

    CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh),
        CCV(ccv, snd_cwnd) - (CCV(ccv, snd_cwnd) >> 2));

    CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd));
}
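
/*
 * Illustrative note for common_cc_after_idle() above, with hypothetical
 * numbers: if snd_cwnd = 100000 and snd_ssthresh = 40000 before the idle
 * period, ssthresh is raised to max(40000, 100000 - (100000 >> 2)) = 75000,
 * and cwnd is clamped to the restart window returned by
 * tcp_compute_initwnd() (e.g. 10 * 1460 = 14600 with a 1460-byte MSS), so
 * the connection slow-starts back up rather than bursting a full window.
 */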

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
void
common_cc_cong_signal(struct cc_var *ccv, uint32_t type)
{
    uint32_t cwin, factor;
    u_int mss;

    cwin = CCV(ccv, snd_cwnd);
    mss = tcp_fixed_maxseg(ccv->ccvc.tcp);
    /*
     * Other TCP congestion controls reuse this common handler, but with
     * their own private cc_data. Make sure the cc_data is used correctly.
     */
    factor = V_newreno_beta;

    /* Catch algos which mistakenly leak private signal types. */
    KASSERT((type & CC_SIGPRIVMASK) == 0,
        ("%s: congestion signal type 0x%08x is private\n", __func__, type));

    cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss),
        2) * mss;

    switch (type) {
    case CC_NDUPACK:
        if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
            if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
                CCV(ccv, snd_ssthresh) = cwin;
            ENTER_RECOVERY(CCV(ccv, t_flags));
        }
        break;
    case CC_ECN:
        if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
            CCV(ccv, snd_ssthresh) = cwin;
            CCV(ccv, snd_cwnd) = cwin;
            ENTER_CONGRECOVERY(CCV(ccv, t_flags));
        }
        break;
    case CC_RTO:
        CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd),
            CCV(ccv, snd_cwnd)) / 2 / mss, 2) * mss;
        CCV(ccv, snd_cwnd) = mss;
        break;
    }
}
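
/*
 * Illustrative note for common_cc_cong_signal() above, with hypothetical
 * numbers: if snd_cwnd = 40000, mss = 1460 and V_newreno_beta = 50, then
 * cwin = max((40000 * 50) / (100 * 1460), 2) * 1460 = 13 * 1460 = 18980,
 * i.e. roughly half the old window, rounded down to a multiple of the MSS
 * and floored at two segments.
 */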

void
common_cc_ack_received(struct cc_var *ccv, uint16_t type)
{
    if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
        (ccv->flags & CCF_CWND_LIMITED)) {
        u_int cw = CCV(ccv, snd_cwnd);
        u_int incr = CCV(ccv, t_maxseg);

        /*
         * Regular in-order ACK, open the congestion window.
         * Method depends on which congestion control state we're
         * in (slow start or cong avoid) and if ABC (RFC 3465) is
         * enabled.
         *
         * slow start: cwnd <= ssthresh
         * cong avoid: cwnd > ssthresh
         *
         * slow start and ABC (RFC 3465):
         *   Grow cwnd exponentially by the amount of data
         *   ACKed capping the max increment per ACK to
         *   (abc_l_var * maxseg) bytes.
         *
         * slow start without ABC (RFC 5681):
         *   Grow cwnd exponentially by maxseg per ACK.
         *
         * cong avoid and ABC (RFC 3465):
         *   Grow cwnd linearly by maxseg per RTT for each
         *   cwnd worth of ACKed data.
         *
         * cong avoid without ABC (RFC 5681):
         *   Grow cwnd linearly by approximately maxseg per RTT using
         *   maxseg^2 / cwnd per ACK as the increment.
         *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
         *   avoid capping cwnd.
         */
        if (cw > CCV(ccv, snd_ssthresh)) {
            if (V_tcp_do_rfc3465) {
                if (ccv->flags & CCF_ABC_SENTAWND)
                    ccv->flags &= ~CCF_ABC_SENTAWND;
                else
                    incr = 0;
            } else
                incr = max((incr * incr / cw), 1);
        } else if (V_tcp_do_rfc3465) {
            /*
             * In slow-start with ABC enabled and no RTO in sight?
             * (Must not use abc_l_var > 1 if slow starting after
             * an RTO. On RTO, snd_nxt = snd_una, so the
             * snd_nxt == snd_max check is sufficient to
             * handle this).
             *
             * XXXLAS: Find a way to signal SS after RTO that
             * doesn't rely on tcpcb vars.
             */
            uint16_t abc_val;

            if (ccv->flags & CCF_USE_LOCAL_ABC)
                abc_val = ccv->labc;
            else
                abc_val = V_tcp_abc_l_var;
            if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
                incr = min(ccv->bytes_this_ack,
                    ccv->nsegs * abc_val * CCV(ccv, t_maxseg));
            else
                incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
        }
        /* ABC is on by default, so incr equals 0 frequently. */
        if (incr > 0)
            CCV(ccv, snd_cwnd) = min(cw + incr,
                TCP_MAXWIN << CCV(ccv, snd_scale));
    }
}
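
/*
 * Illustrative note for common_cc_ack_received() above, with hypothetical
 * numbers: in congestion avoidance without ABC (V_tcp_do_rfc3465 == 0),
 * with t_maxseg = 1460 and snd_cwnd = 100000, each ACK grows cwnd by
 * max(1460 * 1460 / 100000, 1) = 21 bytes, which adds up to roughly one
 * maxseg per congestion window's worth of ACKs, i.e. per RTT.
 */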

/*
 * Handles kld related events. Returns 0 on success, non-zero on failure.
 */
int
cc_modevent(module_t mod, int event_type, void *data)
{
    struct cc_algo *algo;
    int err;

    err = 0;
    algo = (struct cc_algo *)data;
    switch (event_type) {
    case MOD_LOAD:
        if (algo->cc_data_sz == NULL) {
            /*
             * A module must provide a cc_data_sz function;
             * even if it has no data, it should return 0.
             */
            err = EINVAL;
            break;
        }
        if (algo->mod_init != NULL)
            err = algo->mod_init();
        if (!err)
            err = cc_register_algo(algo);
        break;
    case MOD_QUIESCE:
    case MOD_SHUTDOWN:

[… 42 lines elided …]
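
As a usage sketch of the cc_data_sz requirement enforced in cc_modevent()
above: the module and function names below are hypothetical, and the
size_t-returning callback signature is an assumption based on this diff
rather than a copy of any in-tree module.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>

#include <netinet/cc/cc.h>
#include <netinet/cc/cc_module.h>

/*
 * A stateless algorithm keeps no private per-connection data, so its
 * cc_data_sz callback reports zero bytes; cc_modevent() now rejects a
 * module that omits this callback with EINVAL.
 */
static size_t
example_cc_data_sz(void)
{
    return (0);
}

static struct cc_algo example_cc_algo = {
    .name = "example",
    .cc_data_sz = example_cc_data_sz,
    /* ack_received, cong_signal, etc. would be wired up here. */
};

DECLARE_CC_MODULE(example, &example_cc_algo);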