sys/netinet/cc/cc.c

[… 64 lines elided …]
#include <sys/socketvar.h>
#include <sys/sysctl.h>

#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_hpts.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_module.h>

MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory");

extern struct cc_algo newreno_cc_algo;

/*
 * List of available cc algorithms on the current system. First element
 * is used as the system default CC algorithm.
 */
struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);

/* Protects the cc_list STAILQ. */
struct rwlock cc_list_lock;

VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo;

VNET_DECLARE(uint32_t, newreno_beta);
#define V_newreno_beta VNET(newreno_beta)

/*
 * Sysctl handler to show and change the default CC algorithm.
 */
static int
cc_default_algo(SYSCTL_HANDLER_ARGS)
{
    char default_cc[TCP_CA_NAME_MAX];
    struct cc_algo *funcs;

[… 131 lines elided …]

    STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
        if (funcs == remove_cc) {
            cc_checkreset_default(remove_cc);
            STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
            err = 0;
            break;
        }
    }
    CC_LIST_WUNLOCK();

lstewart: I don't want to lose the ability to (forcibly) unload a module which
is in use as a stack-configured default or by active connections. I'm ok with
refusing to unload when in use if a normal kldunload is called, but
kldunload -f should be a workable option and leave the system in a
sane/functional state.
rrs: I believe a -f will still work but you will likely crash the system. I
have seen this with the… I do *not* think it is unreasonable to insist that
you have enough brains to move the default to… As to tcp_ccalgounload()
failing, which it can (if the new default can't get memory), being… This is
mainly a developer thing and I really think it is not unreasonable to let you
do it twice or three times…

    if (!err)
        /*
         * XXXLAS:
         * - We may need to handle non-zero return values in future.
         * - If we add CC framework support for protocols other than
         *   TCP, we may want a more generic way to handle this step.
         */
        tcp_ccalgounload(remove_cc);

[… 27 lines elided …]

    if (!err)
        STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
    CC_LIST_WUNLOCK();

    return (err);
}

/*
 * Perform any necessary tasks before we exit congestion recovery.
 */
void
common_cc_post_recovery(struct cc_var *ccv)
{
    int pipe;

    if (IN_FASTRECOVERY(CCV(ccv, t_flags))) {
        /*
         * Fast recovery will conclude after returning from this
         * function. Window inflation should have left us with
         * approximately snd_ssthresh outstanding data. But in case we
         * would be inclined to send a burst, better to do it via the
         * slow start mechanism.
         *
         * XXXLAS: Find a way to do this without needing curack
         */
        if (V_tcp_do_newsack)
            pipe = tcp_compute_pipe(ccv->ccvc.tcp);
        else
            pipe = CCV(ccv, snd_max) - ccv->curack;
        if (pipe < CCV(ccv, snd_ssthresh))
            /*
             * Ensure that cwnd does not collapse to 1 MSS under
             * adverse conditions. Implements RFC6582.
             */
            CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) +
                CCV(ccv, t_maxseg);
        else
            CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
    }
}
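
/*
 * Illustrative note for common_cc_post_recovery() above, with hypothetical
 * numbers: if snd_ssthresh = 20000, t_maxseg = 1460 and the computed pipe
 * is 10000 bytes, then pipe < ssthresh, so cwnd becomes
 * max(10000, 1460) + 1460 = 11460 and the window grows back towards
 * ssthresh via slow start instead of being released as one burst.
 */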

void
common_cc_after_idle(struct cc_var *ccv)
{
    uint32_t rw;

    /*
     * If we've been idle for more than one retransmit timeout the old
     * congestion window is no longer current and we have to reduce it to
     * the restart window before we can transmit again.
     *
     * The restart window is the initial window or the last CWND, whichever
     * is smaller.
     *
     * This is done to prevent us from flooding the path with a full CWND
     * at wirespeed, overloading router and switch buffers along the way.
     *
     * See RFC5681 Section 4.1. "Restarting Idle Connections".
     *
     * In addition, per RFC2861 Section 2, ssthresh is set to the maximum
     * of its former value or 3/4 of the old cwnd, to avoid exiting
     * slow start prematurely.
     */
    rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp));

    CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh),
        CCV(ccv, snd_cwnd) - (CCV(ccv, snd_cwnd) >> 2));

    CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd));
}
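
/*
 * Illustrative note for common_cc_after_idle() above, with hypothetical
 * numbers: if snd_cwnd = 100000 and snd_ssthresh = 40000 before the idle
 * period, ssthresh is raised to max(40000, 100000 - (100000 >> 2)) = 75000,
 * and cwnd is clamped to the restart window returned by
 * tcp_compute_initwnd() (e.g. 10 * 1460 = 14600 with a 1460-byte MSS), so
 * the connection slow-starts back up rather than bursting a full window.
 */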

/*
 * Perform any necessary tasks before we enter congestion recovery.
 */
void
common_cc_cong_signal(struct cc_var *ccv, uint32_t type)
{
    uint32_t cwin, factor;
    u_int mss;

    cwin = CCV(ccv, snd_cwnd);
    mss = tcp_fixed_maxseg(ccv->ccvc.tcp);
    /*
     * Other TCP congestion controls reuse this common handler, but with
     * their own private cc_data. Make sure the cc_data is used correctly.
     */
    factor = V_newreno_beta;

    /* Catch algos which mistakenly leak private signal types. */
    KASSERT((type & CC_SIGPRIVMASK) == 0,
        ("%s: congestion signal type 0x%08x is private\n", __func__, type));

    cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss),
        2) * mss;

    switch (type) {
    case CC_NDUPACK:
        if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
            if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
                CCV(ccv, snd_ssthresh) = cwin;
            ENTER_RECOVERY(CCV(ccv, t_flags));
        }
        break;
    case CC_ECN:
        if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
            CCV(ccv, snd_ssthresh) = cwin;
            CCV(ccv, snd_cwnd) = cwin;
            ENTER_CONGRECOVERY(CCV(ccv, t_flags));
        }
        break;
    case CC_RTO:
        CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd),
            CCV(ccv, snd_cwnd)) / 2 / mss, 2) * mss;
        CCV(ccv, snd_cwnd) = mss;
        break;
    }
}
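
/*
 * Illustrative note for common_cc_cong_signal() above, with hypothetical
 * numbers: if snd_cwnd = 40000, mss = 1460 and V_newreno_beta = 50, then
 * cwin = max((40000 * 50) / (100 * 1460), 2) * 1460 = 13 * 1460 = 18980,
 * i.e. roughly half the old window, rounded down to a multiple of the MSS
 * and floored at two segments.
 */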

void
common_cc_ack_received(struct cc_var *ccv, uint16_t type)
{
    if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) &&
        (ccv->flags & CCF_CWND_LIMITED)) {
        u_int cw = CCV(ccv, snd_cwnd);
        u_int incr = CCV(ccv, t_maxseg);

        /*
         * Regular in-order ACK, open the congestion window.
         * Method depends on which congestion control state we're
         * in (slow start or cong avoid) and if ABC (RFC 3465) is
         * enabled.
         *
         * slow start: cwnd <= ssthresh
         * cong avoid: cwnd > ssthresh
         *
         * slow start and ABC (RFC 3465):
         *   Grow cwnd exponentially by the amount of data
         *   ACKed capping the max increment per ACK to
         *   (abc_l_var * maxseg) bytes.
         *
         * slow start without ABC (RFC 5681):
         *   Grow cwnd exponentially by maxseg per ACK.
         *
         * cong avoid and ABC (RFC 3465):
         *   Grow cwnd linearly by maxseg per RTT for each
         *   cwnd worth of ACKed data.
         *
         * cong avoid without ABC (RFC 5681):
         *   Grow cwnd linearly by approximately maxseg per RTT using
         *   maxseg^2 / cwnd per ACK as the increment.
         *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
         *   avoid capping cwnd.
         */
        if (cw > CCV(ccv, snd_ssthresh)) {
            if (V_tcp_do_rfc3465) {
                if (ccv->flags & CCF_ABC_SENTAWND)
                    ccv->flags &= ~CCF_ABC_SENTAWND;
                else
                    incr = 0;
            } else
                incr = max((incr * incr / cw), 1);
        } else if (V_tcp_do_rfc3465) {
            /*
             * In slow-start with ABC enabled and no RTO in sight?
             * (Must not use abc_l_var > 1 if slow starting after
             * an RTO. On RTO, snd_nxt = snd_una, so the
             * snd_nxt == snd_max check is sufficient to
             * handle this).
             *
             * XXXLAS: Find a way to signal SS after RTO that
             * doesn't rely on tcpcb vars.
             */
            uint16_t abc_val;

            if (ccv->flags & CCF_USE_LOCAL_ABC)
                abc_val = ccv->labc;
            else
                abc_val = V_tcp_abc_l_var;
            if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
                incr = min(ccv->bytes_this_ack,
                    ccv->nsegs * abc_val * CCV(ccv, t_maxseg));
            else
                incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
        }
        /* ABC is on by default, so incr equals 0 frequently. */
        if (incr > 0)
            CCV(ccv, snd_cwnd) = min(cw + incr,
                TCP_MAXWIN << CCV(ccv, snd_scale));
    }
}
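
/*
 * Illustrative note for common_cc_ack_received() above, with hypothetical
 * numbers: in congestion avoidance without ABC (V_tcp_do_rfc3465 == 0),
 * with t_maxseg = 1460 and snd_cwnd = 100000, each ACK grows cwnd by
 * max(1460 * 1460 / 100000, 1) = 21 bytes, which adds up to roughly one
 * maxseg per congestion window's worth of ACKs, i.e. per RTT.
 */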

/*
 * Handles kld related events. Returns 0 on success, non-zero on failure.
 */
int
cc_modevent(module_t mod, int event_type, void *data)
{
    struct cc_algo *algo;
    int err;

    err = 0;
    algo = (struct cc_algo *)data;
    switch (event_type) {
    case MOD_LOAD:
        if (algo->cc_data_sz == NULL) {
            /*
             * A module must provide a cc_data_sz function;
             * even if it has no data, it should return 0.
             */
            err = EINVAL;
            break;
        }
        if (algo->mod_init != NULL)
            err = algo->mod_init();
        if (!err)
            err = cc_register_algo(algo);
        break;
    case MOD_QUIESCE:
    case MOD_SHUTDOWN:

[… 42 lines elided …]
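
As a usage sketch of the cc_data_sz requirement enforced in cc_modevent()
above: the module and function names below are hypothetical, and the
size_t-returning callback signature is an assumption based on this diff
rather than a copy of any in-tree module.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>

#include <netinet/cc/cc.h>
#include <netinet/cc/cc_module.h>

/*
 * A stateless algorithm keeps no private per-connection data, so its
 * cc_data_sz callback reports zero bytes; cc_modevent() now rejects a
 * module that omits this callback with EINVAL.
 */
static size_t
example_cc_data_sz(void)
{
    return (0);
}

static struct cc_algo example_cc_algo = {
    .name = "example",
    .cc_data_sz = example_cc_data_sz,
    /* ack_received, cong_signal, etc. would be wired up here. */
};

DECLARE_CC_MODULE(example, &example_cc_algo);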