Index: sys/netinet/cc/cc.h =================================================================== --- sys/netinet/cc/cc.h +++ sys/netinet/cc/cc.h @@ -53,10 +53,11 @@ #ifdef _KERNEL +MALLOC_DECLARE(M_CC_MEM); + /* Global CC vars. */ extern STAILQ_HEAD(cc_head, cc_algo) cc_list; extern const int tcprexmtthresh; -extern struct cc_algo newreno_cc_algo; /* Per-netstack bits. */ VNET_DECLARE(struct cc_algo *, default_cc_ptr); @@ -139,8 +140,19 @@ /* Cleanup global module state on kldunload. */ int (*mod_destroy)(void); - /* Init CC state for a new control block. */ - int (*cb_init)(struct cc_var *ccv); + /* Return the size of the void pointer the CC needs for state */ + size_t (*cc_data_sz)(void); + + /* + * Init CC state for a new control block. The CC + * module may be passed a NULL ptr indicating that + * it must allocate the memory. If it is passed a + * non-null pointer it is pre-allocated memory by + * the caller and the cb_init is expected to use that memory. + * It is not expected to fail if memory is passed in and + * all currently defined modules do not. + */ + int (*cb_init)(struct cc_var *ccv, void *ptr); /* Cleanup CC state for a terminating control block. */ void (*cb_destroy)(struct cc_var *ccv); @@ -198,5 +210,16 @@ #define CC_ALGOOPT_LIMIT 2048 +/* + * These routines give NewReno behavior to the caller + * they require no state and can be used by any other CC + * module that wishes to use NewReno type behaviour (along + * with anything else they may add on, pre or post call). + */ +void common_cc_post_recovery(struct cc_var *); +void common_cc_after_idle(struct cc_var *); +void common_cc_cong_signal(struct cc_var *, uint32_t ); +void common_cc_ack_received(struct cc_var *, uint16_t); + #endif /* _KERNEL */ #endif /* _NETINET_CC_CC_H_ */ Index: sys/netinet/cc/cc.c =================================================================== --- sys/netinet/cc/cc.c +++ sys/netinet/cc/cc.c @@ -70,11 +70,16 @@ #include #include #include +#include #include +#include +#include #include - #include +MALLOC_DEFINE(M_CC_MEM, "CC Mem", "Congestion Control State memory"); + +extern struct cc_algo newreno_cc_algo; /* * List of available cc algorithms on the current system. First element * is used as the system default CC algorithm. @@ -86,6 +91,8 @@ VNET_DEFINE(struct cc_algo *, default_cc_ptr) = &newreno_cc_algo; +VNET_DECLARE(uint32_t, newreno_beta); +#define V_newreno_beta VNET(newreno_beta) /* * Sysctl handler to show and change the default CC algorithm. */ @@ -276,6 +283,193 @@ return (err); } +/* + * Perform any necessary tasks before we exit congestion recovery. + */ +void +common_cc_post_recovery(struct cc_var *ccv) +{ + int pipe; + + if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { + /* + * Fast recovery will conclude after returning from this + * function. Window inflation should have left us with + * approximately snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do it via the + * slow start mechanism. + * + * XXXLAS: Find a way to do this without needing curack + */ + if (V_tcp_do_newsack) + pipe = tcp_compute_pipe(ccv->ccvc.tcp); + else + pipe = CCV(ccv, snd_max) - ccv->curack; + if (pipe < CCV(ccv, snd_ssthresh)) + /* + * Ensure that cwnd does not collapse to 1 MSS under + * adverse conditons. 
Implements RFC6582 + */ + CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + + CCV(ccv, t_maxseg); + else + CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); + } +} + +void +common_cc_after_idle(struct cc_var *ccv) +{ + uint32_t rw; + /* + * If we've been idle for more than one retransmit timeout the old + * congestion window is no longer current and we have to reduce it to + * the restart window before we can transmit again. + * + * The restart window is the initial window or the last CWND, whichever + * is smaller. + * + * This is done to prevent us from flooding the path with a full CWND at + * wirespeed, overloading router and switch buffers along the way. + * + * See RFC5681 Section 4.1. "Restarting Idle Connections". + * + * In addition, per RFC2861 Section 2, the ssthresh is set to the + * maximum of the former ssthresh or 3/4 of the old cwnd, to + * not exit slow-start prematurely. + */ + rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp)); + + CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), + CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2)); + + CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); +} + +/* + * Perform any necessary tasks before we enter congestion recovery. + */ +void +common_cc_cong_signal(struct cc_var *ccv, uint32_t type) +{ + uint32_t cwin, factor; + u_int mss; + + cwin = CCV(ccv, snd_cwnd); + mss = tcp_fixed_maxseg(ccv->ccvc.tcp); + /* + * Other TCP congestion controls use newreno_cong_signal(), but + * with their own private cc_data. Make sure the cc_data is used + * correctly. + */ + factor = V_newreno_beta; + + /* Catch algos which mistakenly leak private signal types. */ + KASSERT((type & CC_SIGPRIVMASK) == 0, + ("%s: congestion signal type 0x%08x is private\n", __func__, type)); + + cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), + 2) * mss; + + switch (type) { + case CC_NDUPACK: + if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { + if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) + CCV(ccv, snd_ssthresh) = cwin; + ENTER_RECOVERY(CCV(ccv, t_flags)); + } + break; + case CC_ECN: + if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { + CCV(ccv, snd_ssthresh) = cwin; + CCV(ccv, snd_cwnd) = cwin; + ENTER_CONGRECOVERY(CCV(ccv, t_flags)); + } + break; + case CC_RTO: + CCV(ccv, snd_ssthresh) = max(min(CCV(ccv, snd_wnd), + CCV(ccv, snd_cwnd)) / 2 / mss, + 2) * mss; + CCV(ccv, snd_cwnd) = mss; + break; + } +} + +void +common_cc_ack_received(struct cc_var *ccv, uint16_t type) +{ + if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && + (ccv->flags & CCF_CWND_LIMITED)) { + u_int cw = CCV(ccv, snd_cwnd); + u_int incr = CCV(ccv, t_maxseg); + + /* + * Regular in-order ACK, open the congestion window. + * Method depends on which congestion control state we're + * in (slow start or cong avoid) and if ABC (RFC 3465) is + * enabled. + * + * slow start: cwnd <= ssthresh + * cong avoid: cwnd > ssthresh + * + * slow start and ABC (RFC 3465): + * Grow cwnd exponentially by the amount of data + * ACKed capping the max increment per ACK to + * (abc_l_var * maxseg) bytes. + * + * slow start without ABC (RFC 5681): + * Grow cwnd exponentially by maxseg per ACK. + * + * cong avoid and ABC (RFC 3465): + * Grow cwnd linearly by maxseg per RTT for each + * cwnd worth of ACKed data. + * + * cong avoid without ABC (RFC 5681): + * Grow cwnd linearly by approximately maxseg per RTT using + * maxseg^2 / cwnd per ACK as the increment. + * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to + * avoid capping cwnd. 
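	 * As a worked illustration (assuming maxseg = 1460 bytes and one ACK
	 * per segment, neither of which comes from this change): at
	 * cwnd = 100 * maxseg the non-ABC congestion-avoidance increment is
	 * 1460 * 1460 / 146000 = 14 bytes per ACK (integer arithmetic), so
	 * the roughly 100 ACKs received per RTT grow cwnd by about 1400
	 * bytes, close to one maxseg per RTT as intended. The 1 byte floor
	 * only comes into play once cwnd exceeds maxseg^2, around 2.1 MB for
	 * this maxseg.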
+ */ + if (cw > CCV(ccv, snd_ssthresh)) { + if (V_tcp_do_rfc3465) { + if (ccv->flags & CCF_ABC_SENTAWND) + ccv->flags &= ~CCF_ABC_SENTAWND; + else + incr = 0; + } else + incr = max((incr * incr / cw), 1); + } else if (V_tcp_do_rfc3465) { + /* + * In slow-start with ABC enabled and no RTO in sight? + * (Must not use abc_l_var > 1 if slow starting after + * an RTO. On RTO, snd_nxt = snd_una, so the + * snd_nxt == snd_max check is sufficient to + * handle this). + * + * XXXLAS: Find a way to signal SS after RTO that + * doesn't rely on tcpcb vars. + */ + uint16_t abc_val; + + if (ccv->flags & CCF_USE_LOCAL_ABC) + abc_val = ccv->labc; + else + abc_val = V_tcp_abc_l_var; + if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) + incr = min(ccv->bytes_this_ack, + ccv->nsegs * abc_val * + CCV(ccv, t_maxseg)); + else + incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); + + } + /* ABC is on by default, so incr equals 0 frequently. */ + if (incr > 0) + CCV(ccv, snd_cwnd) = min(cw + incr, + TCP_MAXWIN << CCV(ccv, snd_scale)); + } +} + /* * Handles kld related events. Returns 0 on success, non-zero on failure. */ @@ -290,6 +484,14 @@ switch(event_type) { case MOD_LOAD: + if (algo->cc_data_sz == NULL) { + /* + * A module must have a cc_data_sz function + * even if it has no data it should return 0. + */ + err = EINVAL; + break; + } if (algo->mod_init != NULL) err = algo->mod_init(); if (!err) Index: sys/netinet/cc/cc_cdg.c =================================================================== --- sys/netinet/cc/cc_cdg.c +++ sys/netinet/cc/cc_cdg.c @@ -197,10 +197,6 @@ 32531,32533,32535,32537,32538,32540,32542,32544,32545,32547}; static uma_zone_t qdiffsample_zone; - -static MALLOC_DEFINE(M_CDG, "cdg data", - "Per connection data required for the CDG congestion control algorithm"); - static int ertt_id; VNET_DEFINE_STATIC(uint32_t, cdg_alpha_inc); @@ -222,10 +218,11 @@ static int cdg_mod_init(void); static int cdg_mod_destroy(void); static void cdg_conn_init(struct cc_var *ccv); -static int cdg_cb_init(struct cc_var *ccv); +static int cdg_cb_init(struct cc_var *ccv, void *ptr); static void cdg_cb_destroy(struct cc_var *ccv); static void cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type); static void cdg_ack_received(struct cc_var *ccv, uint16_t ack_type); +static size_t cdg_data_sz(void); struct cc_algo cdg_cc_algo = { .name = "cdg", @@ -235,7 +232,10 @@ .cb_init = cdg_cb_init, .conn_init = cdg_conn_init, .cong_signal = cdg_cong_signal, - .mod_destroy = cdg_mod_destroy + .mod_destroy = cdg_mod_destroy, + .cc_data_sz = cdg_data_sz, + .post_recovery = common_cc_post_recovery, + .after_idle = common_cc_after_idle, }; /* Vnet created and being initialised. 
*/ @@ -271,10 +271,6 @@ CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); - - cdg_cc_algo.post_recovery = newreno_cc_algo.post_recovery; - cdg_cc_algo.after_idle = newreno_cc_algo.after_idle; - return (0); } @@ -286,15 +282,24 @@ return (0); } +static size_t +cdg_data_sz(void) +{ + return (sizeof(struct cdg)); +} + static int -cdg_cb_init(struct cc_var *ccv) +cdg_cb_init(struct cc_var *ccv, void *ptr) { struct cdg *cdg_data; - cdg_data = malloc(sizeof(struct cdg), M_CDG, M_NOWAIT); - if (cdg_data == NULL) - return (ENOMEM); - + if (ptr == NULL) { + cdg_data = malloc(sizeof(struct cdg), M_CC_MEM, M_NOWAIT); + if (cdg_data == NULL) + return (ENOMEM); + } else { + cdg_data = ptr; + } cdg_data->shadow_w = 0; cdg_data->max_qtrend = 0; cdg_data->min_qtrend = 0; @@ -350,7 +355,7 @@ qds = qds_n; } - free(ccv->cc_data, M_CDG); + free(ccv->cc_data, M_CC_MEM); } static int @@ -484,7 +489,7 @@ ENTER_RECOVERY(CCV(ccv, t_flags)); break; default: - newreno_cc_algo.cong_signal(ccv, signal_type); + common_cc_cong_signal(ccv, signal_type); break; } } Index: sys/netinet/cc/cc_chd.c =================================================================== --- sys/netinet/cc/cc_chd.c +++ sys/netinet/cc/cc_chd.c @@ -89,10 +89,11 @@ static void chd_ack_received(struct cc_var *ccv, uint16_t ack_type); static void chd_cb_destroy(struct cc_var *ccv); -static int chd_cb_init(struct cc_var *ccv); +static int chd_cb_init(struct cc_var *ccv, void *ptr); static void chd_cong_signal(struct cc_var *ccv, uint32_t signal_type); static void chd_conn_init(struct cc_var *ccv); static int chd_mod_init(void); +static size_t chd_data_sz(void); struct chd { /* @@ -126,8 +127,6 @@ #define V_chd_loss_fair VNET(chd_loss_fair) #define V_chd_use_max VNET(chd_use_max) -static MALLOC_DEFINE(M_CHD, "chd data", - "Per connection data required for the CHD congestion control algorithm"); struct cc_algo chd_cc_algo = { .name = "chd", @@ -136,7 +135,10 @@ .cb_init = chd_cb_init, .cong_signal = chd_cong_signal, .conn_init = chd_conn_init, - .mod_init = chd_mod_init + .mod_init = chd_mod_init, + .cc_data_sz = chd_data_sz, + .after_idle = common_cc_after_idle, + .post_recovery = common_cc_post_recovery, }; static __inline void @@ -304,18 +306,26 @@ static void chd_cb_destroy(struct cc_var *ccv) { + free(ccv->cc_data, M_CC_MEM); +} - free(ccv->cc_data, M_CHD); +size_t +chd_data_sz(void) +{ + return (sizeof(struct chd)); } static int -chd_cb_init(struct cc_var *ccv) +chd_cb_init(struct cc_var *ccv, void *ptr) { struct chd *chd_data; - chd_data = malloc(sizeof(struct chd), M_CHD, M_NOWAIT); - if (chd_data == NULL) - return (ENOMEM); + if (ptr == NULL) { + chd_data = malloc(sizeof(struct chd), M_CC_MEM, M_NOWAIT); + if (chd_data == NULL) + return (ENOMEM); + } else + chd_data = ptr; chd_data->shadow_w = 0; ccv->cc_data = chd_data; @@ -374,7 +384,7 @@ break; default: - newreno_cc_algo.cong_signal(ccv, signal_type); + common_cc_cong_signal(ccv, signal_type); } } @@ -403,10 +413,6 @@ printf("%s: h_ertt module not found\n", __func__); return (ENOENT); } - - chd_cc_algo.after_idle = newreno_cc_algo.after_idle; - chd_cc_algo.post_recovery = newreno_cc_algo.post_recovery; - return (0); } Index: sys/netinet/cc/cc_cubic.c =================================================================== --- sys/netinet/cc/cc_cubic.c +++ sys/netinet/cc/cc_cubic.c @@ -72,7 +72,7 @@ static void cubic_ack_received(struct cc_var *ccv, uint16_t type); static void cubic_cb_destroy(struct cc_var *ccv); -static int cubic_cb_init(struct cc_var *ccv); +static int cubic_cb_init(struct cc_var *ccv, 
void *ptr); static void cubic_cong_signal(struct cc_var *ccv, uint32_t type); static void cubic_conn_init(struct cc_var *ccv); static int cubic_mod_init(void); @@ -80,6 +80,7 @@ static void cubic_record_rtt(struct cc_var *ccv); static void cubic_ssthresh_update(struct cc_var *ccv, uint32_t maxseg); static void cubic_after_idle(struct cc_var *ccv); +static size_t cubic_data_sz(void); struct cubic { /* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */ @@ -114,9 +115,6 @@ int t_last_cong_prev; }; -static MALLOC_DEFINE(M_CUBIC, "cubic data", - "Per connection data required for the CUBIC congestion control algorithm"); - struct cc_algo cubic_cc_algo = { .name = "cubic", .ack_received = cubic_ack_received, @@ -127,6 +125,7 @@ .mod_init = cubic_mod_init, .post_recovery = cubic_post_recovery, .after_idle = cubic_after_idle, + .cc_data_sz = cubic_data_sz }; static void @@ -149,7 +148,7 @@ if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh) || cubic_data->min_rtt_ticks == TCPTV_SRTTBASE) { cubic_data->flags |= CUBICFLAG_IN_SLOWSTART; - newreno_cc_algo.ack_received(ccv, type); + common_cc_ack_received(ccv, type); } else { if ((cubic_data->flags & CUBICFLAG_RTO_EVENT) && (cubic_data->flags & CUBICFLAG_IN_SLOWSTART)) { @@ -243,25 +242,33 @@ cubic_data->max_cwnd = ulmax(cubic_data->max_cwnd, CCV(ccv, snd_cwnd)); cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, t_maxseg)); - newreno_cc_algo.after_idle(ccv); + common_cc_after_idle(ccv); cubic_data->t_last_cong = ticks; } static void cubic_cb_destroy(struct cc_var *ccv) { - free(ccv->cc_data, M_CUBIC); + free(ccv->cc_data, M_CC_MEM); +} + +static size_t +cubic_data_sz(void) +{ + return (sizeof(struct cubic)); } static int -cubic_cb_init(struct cc_var *ccv) +cubic_cb_init(struct cc_var *ccv, void *ptr) { struct cubic *cubic_data; - cubic_data = malloc(sizeof(struct cubic), M_CUBIC, M_NOWAIT|M_ZERO); - - if (cubic_data == NULL) - return (ENOMEM); + if (ptr == NULL) { + cubic_data = malloc(sizeof(struct cubic), M_CC_MEM, M_NOWAIT|M_ZERO); + if (cubic_data == NULL) + return (ENOMEM); + } else + cubic_data = ptr; /* Init some key variables with sensible defaults. 
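	 * (A sketch of the caller side of the ptr handling above; the calls
	 * below mirror the TCP_CONGESTION handler in tcp_usrreq.c rather
	 * than code in this file, and "error"/"p" are illustrative locals.
	 * A caller may pre-allocate the module state with M_WAITOK while it
	 * is still allowed to sleep and then rely on cb_init() succeeding:
	 *
	 *	size_t sz = cubic_cc_algo.cc_data_sz();
	 *	void *p = (sz > 0) ? malloc(sz, M_CC_MEM, M_WAITOK | M_ZERO) : NULL;
	 *
	 *	error = cubic_cc_algo.cb_init(ccv, p);	// no ENOMEM when p != NULL
	 *
	 * Passing ptr == NULL keeps the old allocate-on-demand behaviour and
	 * may still fail with ENOMEM.)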
*/ cubic_data->t_last_cong = ticks; Index: sys/netinet/cc/cc_dctcp.c =================================================================== --- sys/netinet/cc/cc_dctcp.c +++ sys/netinet/cc/cc_dctcp.c @@ -76,18 +76,16 @@ uint32_t num_cong_events; /* # of congestion events */ }; -static MALLOC_DEFINE(M_dctcp, "dctcp data", - "Per connection data required for the dctcp algorithm"); - static void dctcp_ack_received(struct cc_var *ccv, uint16_t type); static void dctcp_after_idle(struct cc_var *ccv); static void dctcp_cb_destroy(struct cc_var *ccv); -static int dctcp_cb_init(struct cc_var *ccv); +static int dctcp_cb_init(struct cc_var *ccv, void *ptr); static void dctcp_cong_signal(struct cc_var *ccv, uint32_t type); static void dctcp_conn_init(struct cc_var *ccv); static void dctcp_post_recovery(struct cc_var *ccv); static void dctcp_ecnpkt_handler(struct cc_var *ccv); static void dctcp_update_alpha(struct cc_var *ccv); +static size_t dctcp_data_sz(void); struct cc_algo dctcp_cc_algo = { .name = "dctcp", @@ -99,6 +97,7 @@ .post_recovery = dctcp_post_recovery, .ecnpkt_handler = dctcp_ecnpkt_handler, .after_idle = dctcp_after_idle, + .cc_data_sz = dctcp_data_sz, }; static void @@ -117,10 +116,10 @@ */ if (IN_CONGRECOVERY(CCV(ccv, t_flags))) { EXIT_CONGRECOVERY(CCV(ccv, t_flags)); - newreno_cc_algo.ack_received(ccv, type); + common_cc_ack_received(ccv, type); ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } else - newreno_cc_algo.ack_received(ccv, type); + common_cc_ack_received(ccv, type); if (type == CC_DUPACK) bytes_acked = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); @@ -158,7 +157,13 @@ SEQ_GT(ccv->curack, dctcp_data->save_sndnxt)) dctcp_update_alpha(ccv); } else - newreno_cc_algo.ack_received(ccv, type); + common_cc_ack_received(ccv, type); +} + +static size_t +dctcp_data_sz(void) +{ + return (sizeof(struct dctcp)); } static void @@ -179,25 +184,26 @@ dctcp_data->num_cong_events = 0; } - newreno_cc_algo.after_idle(ccv); + common_cc_after_idle(ccv); } static void dctcp_cb_destroy(struct cc_var *ccv) { - free(ccv->cc_data, M_dctcp); + free(ccv->cc_data, M_CC_MEM); } static int -dctcp_cb_init(struct cc_var *ccv) +dctcp_cb_init(struct cc_var *ccv, void *ptr) { struct dctcp *dctcp_data; - dctcp_data = malloc(sizeof(struct dctcp), M_dctcp, M_NOWAIT|M_ZERO); - - if (dctcp_data == NULL) - return (ENOMEM); - + if (ptr == NULL) { + dctcp_data = malloc(sizeof(struct dctcp), M_CC_MEM, M_NOWAIT|M_ZERO); + if (dctcp_data == NULL) + return (ENOMEM); + } else + dctcp_data = ptr; /* Initialize some key variables with sensible defaults. 
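	 * A note on zeroing: the NULL path above relies on M_NOWAIT | M_ZERO,
	 * and the pre-allocated path keeps that property because the only
	 * caller passing a non-NULL ptr in this change, the TCP_CONGESTION
	 * handler in tcp_usrreq.c, memset()s the buffer to zero before
	 * calling cb_init().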
*/ dctcp_data->bytes_ecn = 0; dctcp_data->bytes_total = 0; @@ -292,7 +298,7 @@ break; } } else - newreno_cc_algo.cong_signal(ccv, type); + common_cc_cong_signal(ccv, type); } static void @@ -312,7 +318,7 @@ static void dctcp_post_recovery(struct cc_var *ccv) { - newreno_cc_algo.post_recovery(ccv); + common_cc_post_recovery(ccv); if (CCV(ccv, t_flags2) & TF2_ECN_PERMIT) dctcp_update_alpha(ccv); Index: sys/netinet/cc/cc_hd.c =================================================================== --- sys/netinet/cc/cc_hd.c +++ sys/netinet/cc/cc_hd.c @@ -84,6 +84,7 @@ static void hd_ack_received(struct cc_var *ccv, uint16_t ack_type); static int hd_mod_init(void); +static size_t hd_data_sz(void); static int ertt_id; @@ -97,9 +98,19 @@ struct cc_algo hd_cc_algo = { .name = "hd", .ack_received = hd_ack_received, - .mod_init = hd_mod_init + .mod_init = hd_mod_init, + .cc_data_sz = hd_data_sz, + .after_idle = common_cc_after_idle, + .cong_signal = common_cc_cong_signal, + .post_recovery = common_cc_post_recovery, }; +static size_t +hd_data_sz(void) +{ + return (0); +} + /* * Hamilton backoff function. Returns 1 if we should backoff or 0 otherwise. */ @@ -150,14 +161,14 @@ * half cwnd and behave like an ECN (ie * not a packet loss). */ - newreno_cc_algo.cong_signal(ccv, + common_cc_cong_signal(ccv, CC_ECN); return; } } } } - newreno_cc_algo.ack_received(ccv, ack_type); /* As for NewReno. */ + common_cc_ack_received(ccv, ack_type); } static int @@ -169,11 +180,6 @@ printf("%s: h_ertt module not found\n", __func__); return (ENOENT); } - - hd_cc_algo.after_idle = newreno_cc_algo.after_idle; - hd_cc_algo.cong_signal = newreno_cc_algo.cong_signal; - hd_cc_algo.post_recovery = newreno_cc_algo.post_recovery; - return (0); } Index: sys/netinet/cc/cc_htcp.c =================================================================== --- sys/netinet/cc/cc_htcp.c +++ sys/netinet/cc/cc_htcp.c @@ -137,7 +137,7 @@ static void htcp_ack_received(struct cc_var *ccv, uint16_t type); static void htcp_cb_destroy(struct cc_var *ccv); -static int htcp_cb_init(struct cc_var *ccv); +static int htcp_cb_init(struct cc_var *ccv, void *ptr); static void htcp_cong_signal(struct cc_var *ccv, uint32_t type); static int htcp_mod_init(void); static void htcp_post_recovery(struct cc_var *ccv); @@ -145,6 +145,7 @@ static void htcp_recalc_beta(struct cc_var *ccv); static void htcp_record_rtt(struct cc_var *ccv); static void htcp_ssthresh_update(struct cc_var *ccv); +static size_t htcp_data_sz(void); struct htcp { /* cwnd before entering cong recovery. */ @@ -175,9 +176,6 @@ #define V_htcp_adaptive_backoff VNET(htcp_adaptive_backoff) #define V_htcp_rtt_scaling VNET(htcp_rtt_scaling) -static MALLOC_DEFINE(M_HTCP, "htcp data", - "Per connection data required for the HTCP congestion control algorithm"); - struct cc_algo htcp_cc_algo = { .name = "htcp", .ack_received = htcp_ack_received, @@ -186,6 +184,8 @@ .cong_signal = htcp_cong_signal, .mod_init = htcp_mod_init, .post_recovery = htcp_post_recovery, + .cc_data_sz = htcp_data_sz, + .after_idle = common_cc_after_idle, }; static void @@ -214,7 +214,7 @@ */ if (htcp_data->alpha == 1 || CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) - newreno_cc_algo.ack_received(ccv, type); + common_cc_ack_received(ccv, type); else { if (V_tcp_do_rfc3465) { /* Increment cwnd by alpha segments. 
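	 * (Under ABC this works out to cwnd growing by roughly alpha * maxseg
	 * bytes per RTT; with alpha == 1 that is the plain NewReno increase,
	 * which is why that case is short-circuited to
	 * common_cc_ack_received() above.)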
*/ @@ -238,18 +238,26 @@ static void htcp_cb_destroy(struct cc_var *ccv) { - free(ccv->cc_data, M_HTCP); + free(ccv->cc_data, M_CC_MEM); +} + +static size_t +htcp_data_sz(void) +{ + return(sizeof(struct htcp)); } static int -htcp_cb_init(struct cc_var *ccv) +htcp_cb_init(struct cc_var *ccv, void *ptr) { struct htcp *htcp_data; - htcp_data = malloc(sizeof(struct htcp), M_HTCP, M_NOWAIT); - - if (htcp_data == NULL) - return (ENOMEM); + if (ptr == NULL) { + htcp_data = malloc(sizeof(struct htcp), M_CC_MEM, M_NOWAIT); + if (htcp_data == NULL) + return (ENOMEM); + } else + htcp_data = ptr; /* Init some key variables with sensible defaults. */ htcp_data->alpha = HTCP_INIT_ALPHA; @@ -333,16 +341,12 @@ static int htcp_mod_init(void) { - - htcp_cc_algo.after_idle = newreno_cc_algo.after_idle; - /* * HTCP_RTT_REF is defined in ms, and t_srtt in the tcpcb is stored in * units of TCP_RTT_SCALE*hz. Scale HTCP_RTT_REF to be in the same units * as t_srtt. */ htcp_rtt_ref = (HTCP_RTT_REF * TCP_RTT_SCALE * hz) / 1000; - return (0); } Index: sys/netinet/cc/cc_newreno.c =================================================================== --- sys/netinet/cc/cc_newreno.c +++ sys/netinet/cc/cc_newreno.c @@ -82,18 +82,15 @@ #include #include -static MALLOC_DEFINE(M_NEWRENO, "newreno data", - "newreno beta values"); - static void newreno_cb_destroy(struct cc_var *ccv); static void newreno_ack_received(struct cc_var *ccv, uint16_t type); static void newreno_after_idle(struct cc_var *ccv); static void newreno_cong_signal(struct cc_var *ccv, uint32_t type); -static void newreno_post_recovery(struct cc_var *ccv); static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf); static void newreno_newround(struct cc_var *ccv, uint32_t round_cnt); static void newreno_rttsample(struct cc_var *ccv, uint32_t usec_rtt, uint32_t rxtcnt, uint32_t fas); -static int newreno_cb_init(struct cc_var *ccv); +static int newreno_cb_init(struct cc_var *ccv, void *); +static size_t newreno_data_sz(void); VNET_DEFINE(uint32_t, newreno_beta) = 50; VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80; @@ -106,11 +103,12 @@ .ack_received = newreno_ack_received, .after_idle = newreno_after_idle, .cong_signal = newreno_cong_signal, - .post_recovery = newreno_post_recovery, + .post_recovery = common_cc_post_recovery, .ctl_output = newreno_ctl_output, .newround = newreno_newround, .rttsample = newreno_rttsample, .cb_init = newreno_cb_init, + .cc_data_sz = newreno_data_sz, }; static uint32_t hystart_lowcwnd = 16; @@ -167,14 +165,23 @@ } } +static size_t +newreno_data_sz(void) +{ + return (sizeof(struct newreno)); +} + static int -newreno_cb_init(struct cc_var *ccv) +newreno_cb_init(struct cc_var *ccv, void *ptr) { struct newreno *nreno; - ccv->cc_data = malloc(sizeof(struct newreno), M_NEWRENO, M_NOWAIT); - if (ccv->cc_data == NULL) - return (ENOMEM); + if (ptr == NULL) { + ccv->cc_data = malloc(sizeof(struct newreno), M_CC_MEM, M_NOWAIT); + if (ccv->cc_data == NULL) + return (ENOMEM); + } else + ccv->cc_data = ptr; nreno = (struct newreno *)ccv->cc_data; /* NB: nreno is not zeroed, so initialise all fields. */ nreno->beta = V_newreno_beta; @@ -201,7 +208,7 @@ static void newreno_cb_destroy(struct cc_var *ccv) { - free(ccv->cc_data, M_NEWRENO); + free(ccv->cc_data, M_CC_MEM); } static void @@ -209,13 +216,7 @@ { struct newreno *nreno; - /* - * Other TCP congestion controls use newreno_ack_received(), but - * with their own private cc_data. Make sure the cc_data is used - * correctly. 
- */ - nreno = (CC_ALGO(ccv->ccvc.tcp) == &newreno_cc_algo) ? ccv->cc_data : NULL; - + nreno = ccv->cc_data; if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED)) { u_int cw = CCV(ccv, snd_cwnd); @@ -249,8 +250,7 @@ * avoid capping cwnd. */ if (cw > CCV(ccv, snd_ssthresh)) { - if ((nreno != NULL) && - (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS)) { + if (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) { /* * We have slipped into CA with * CSS active. Deactivate all. @@ -284,8 +284,7 @@ abc_val = ccv->labc; else abc_val = V_tcp_abc_l_var; - if ((nreno != NULL) && - (nreno->newreno_flags & CC_NEWRENO_HYSTART_ALLOWED) && + if ((nreno->newreno_flags & CC_NEWRENO_HYSTART_ALLOWED) && (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) && ((nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) == 0)) { /* @@ -323,8 +322,7 @@ incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); /* Only if Hystart is enabled will the flag get set */ - if ((nreno != NULL) && - (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS)) { + if (nreno->newreno_flags & CC_NEWRENO_HYSTART_IN_CSS) { incr /= hystart_css_growth_div; newreno_log_hystart_event(ccv, nreno, 3, incr); } @@ -340,39 +338,10 @@ newreno_after_idle(struct cc_var *ccv) { struct newreno *nreno; - uint32_t rw; - - /* - * Other TCP congestion controls use newreno_after_idle(), but - * with their own private cc_data. Make sure the cc_data is used - * correctly. - */ - nreno = (CC_ALGO(ccv->ccvc.tcp) == &newreno_cc_algo) ? ccv->cc_data : NULL; - /* - * If we've been idle for more than one retransmit timeout the old - * congestion window is no longer current and we have to reduce it to - * the restart window before we can transmit again. - * - * The restart window is the initial window or the last CWND, whichever - * is smaller. - * - * This is done to prevent us from flooding the path with a full CWND at - * wirespeed, overloading router and switch buffers along the way. - * - * See RFC5681 Section 4.1. "Restarting Idle Connections". - * - * In addition, per RFC2861 Section 2, the ssthresh is set to the - * maximum of the former ssthresh or 3/4 of the old cwnd, to - * not exit slow-start prematurely. - */ - rw = tcp_compute_initwnd(tcp_maxseg(ccv->ccvc.tcp)); - CCV(ccv, snd_ssthresh) = max(CCV(ccv, snd_ssthresh), - CCV(ccv, snd_cwnd)-(CCV(ccv, snd_cwnd)>>2)); - - CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); - if ((nreno != NULL) && - (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) == 0) { + nreno = ccv->cc_data; + common_cc_after_idle(ccv); + if ((nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) == 0) { if (CCV(ccv, snd_cwnd) <= (hystart_lowcwnd * tcp_fixed_maxseg(ccv->ccvc.tcp))) { /* * Re-enable hystart if our cwnd has fallen below @@ -396,12 +365,7 @@ cwin = CCV(ccv, snd_cwnd); mss = tcp_fixed_maxseg(ccv->ccvc.tcp); - /* - * Other TCP congestion controls use newreno_cong_signal(), but - * with their own private cc_data. Make sure the cc_data is used - * correctly. - */ - nreno = (CC_ALGO(ccv->ccvc.tcp) == &newreno_cc_algo) ? ccv->cc_data : NULL; + nreno = ccv->cc_data; beta = (nreno == NULL) ? V_newreno_beta : nreno->beta;; beta_ecn = (nreno == NULL) ? 
V_newreno_beta_ecn : nreno->beta_ecn; /* @@ -426,8 +390,7 @@ switch (type) { case CC_NDUPACK: - if ((nreno != NULL) && - (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED)) { + if (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) { /* Make sure the flags are all off we had a loss */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_ENABLED; nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; @@ -445,8 +408,7 @@ } break; case CC_ECN: - if ((nreno != NULL) && - (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED)) { + if (nreno->newreno_flags & CC_NEWRENO_HYSTART_ENABLED) { /* Make sure the flags are all off we had a loss */ nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_ENABLED; nreno->newreno_flags &= ~CC_NEWRENO_HYSTART_IN_CSS; @@ -466,41 +428,6 @@ } } -/* - * Perform any necessary tasks before we exit congestion recovery. - */ -static void -newreno_post_recovery(struct cc_var *ccv) -{ - int pipe; - - if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { - /* - * Fast recovery will conclude after returning from this - * function. Window inflation should have left us with - * approximately snd_ssthresh outstanding data. But in case we - * would be inclined to send a burst, better to do it via the - * slow start mechanism. - * - * XXXLAS: Find a way to do this without needing curack - */ - if (V_tcp_do_newsack) - pipe = tcp_compute_pipe(ccv->ccvc.tcp); - else - pipe = CCV(ccv, snd_max) - ccv->curack; - - if (pipe < CCV(ccv, snd_ssthresh)) - /* - * Ensure that cwnd does not collapse to 1 MSS under - * adverse conditons. Implements RFC6582 - */ - CCV(ccv, snd_cwnd) = max(pipe, CCV(ccv, t_maxseg)) + - CCV(ccv, t_maxseg); - else - CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); - } -} - static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf) { Index: sys/netinet/cc/cc_vegas.c =================================================================== --- sys/netinet/cc/cc_vegas.c +++ sys/netinet/cc/cc_vegas.c @@ -87,10 +87,11 @@ static void vegas_ack_received(struct cc_var *ccv, uint16_t ack_type); static void vegas_cb_destroy(struct cc_var *ccv); -static int vegas_cb_init(struct cc_var *ccv); +static int vegas_cb_init(struct cc_var *ccv, void *ptr); static void vegas_cong_signal(struct cc_var *ccv, uint32_t signal_type); static void vegas_conn_init(struct cc_var *ccv); static int vegas_mod_init(void); +static size_t vegas_data_sz(void); struct vegas { int slow_start_toggle; @@ -103,9 +104,6 @@ #define V_vegas_alpha VNET(vegas_alpha) #define V_vegas_beta VNET(vegas_beta) -static MALLOC_DEFINE(M_VEGAS, "vegas data", - "Per connection data required for the Vegas congestion control algorithm"); - struct cc_algo vegas_cc_algo = { .name = "vegas", .ack_received = vegas_ack_received, @@ -113,7 +111,10 @@ .cb_init = vegas_cb_init, .cong_signal = vegas_cong_signal, .conn_init = vegas_conn_init, - .mod_init = vegas_mod_init + .mod_init = vegas_mod_init, + .cc_data_sz = vegas_data_sz, + .after_idle = common_cc_after_idle, + .post_recovery = common_cc_post_recovery, }; /* @@ -162,24 +163,32 @@ } if (vegas_data->slow_start_toggle) - newreno_cc_algo.ack_received(ccv, ack_type); + common_cc_ack_received(ccv, ack_type); } static void vegas_cb_destroy(struct cc_var *ccv) { - free(ccv->cc_data, M_VEGAS); + free(ccv->cc_data, M_CC_MEM); +} + +static size_t +vegas_data_sz(void) +{ + return (sizeof(struct vegas)); } static int -vegas_cb_init(struct cc_var *ccv) +vegas_cb_init(struct cc_var *ccv, void *ptr) { struct vegas *vegas_data; - vegas_data = malloc(sizeof(struct vegas), M_VEGAS, M_NOWAIT); - - if (vegas_data 
== NULL) - return (ENOMEM); + if (ptr == NULL) { + vegas_data = malloc(sizeof(struct vegas), M_CC_MEM, M_NOWAIT); + if (vegas_data == NULL) + return (ENOMEM); + } else + vegas_data = ptr; vegas_data->slow_start_toggle = 1; ccv->cc_data = vegas_data; @@ -216,7 +225,7 @@ break; default: - newreno_cc_algo.cong_signal(ccv, signal_type); + common_cc_cong_signal(ccv, signal_type); } if (IN_RECOVERY(CCV(ccv, t_flags)) && !presignalrecov) @@ -236,16 +245,11 @@ static int vegas_mod_init(void) { - ertt_id = khelp_get_id("ertt"); if (ertt_id <= 0) { printf("%s: h_ertt module not found\n", __func__); return (ENOENT); } - - vegas_cc_algo.after_idle = newreno_cc_algo.after_idle; - vegas_cc_algo.post_recovery = newreno_cc_algo.post_recovery; - return (0); } Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -2147,7 +2147,7 @@ tp->t_inpcb = inp; if (CC_ALGO(tp)->cb_init != NULL) - if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { + if (CC_ALGO(tp)->cb_init(tp->ccv, NULL) > 0) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); in_pcbrele_wlocked(inp); @@ -2239,6 +2239,8 @@ return (tp); /* XXX */ } +extern struct cc_algo newreno_cc_algo; + /* * Switch the congestion control algorithm back to NewReno for any active * control blocks using an algorithm which is about to go away. Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c +++ sys/netinet/tcp_usrreq.c @@ -2007,6 +2007,8 @@ } #endif +extern struct cc_algo newreno_cc_algo; + int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) { @@ -2223,48 +2225,91 @@ break; case TCP_CONGESTION: + { + struct cc_var cc_mem; + size_t mem_sz; + void *ptr = NULL; + INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1); if (error) break; buf[sopt->sopt_valsize] = '\0'; - INP_WLOCK_RECHECK(inp); CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) if (strncmp(buf, algo->name, TCP_CA_NAME_MAX) == 0) break; - CC_LIST_RUNLOCK(); if (algo == NULL) { - INP_WUNLOCK(inp); + CC_LIST_RUNLOCK(); error = EINVAL; break; } + mem_sz = (*algo->cc_data_sz)(); + CC_LIST_RUNLOCK(); + /* We can now pre-get the memory for the CC */ + if (mem_sz) + ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK); + else + ptr = NULL; /* - * We hold a write lock over the tcb so it's safe to - * do these things without ordering concerns. + * Make sure its all clean and zero and also get + * back the inplock. */ - if (CC_ALGO(tp)->cb_destroy != NULL) - CC_ALGO(tp)->cb_destroy(tp->ccv); - CC_DATA(tp) = NULL; - CC_ALGO(tp) = algo; + memset(&cc_mem, 0, sizeof(cc_mem)); + if (ptr) { + memset(ptr, 0, mem_sz); + INP_WLOCK_RECHECK_CLEANUP(inp, free(ptr, M_CC_MEM)); + } else + INP_WLOCK_RECHECK(inp); + cc_mem.ccvc.tcp = tp; + /* + * We once again hold a write lock over the tcb so it's + * safe to do these things without ordering concerns. + * + * XXXrrs: Note there is a danger here, where the + * CC module is unloaded before we init. I don't + * address this problem here since the whole CC + * module is weak in this area, i.e. if you unload + * a module it can be being used and boom the kernel + * will crash. The CC modules will need refcounting + * and require a mod-unload function that can be checked + * against if there is use which is beyond the scope + * of this work but should be addressed later. + * + * Note here we init into stack memory. 
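	 * For reference, the sequence implemented around this comment is:
	 *   1. look the algorithm up on cc_list and fetch cc_data_sz()
	 *      under CC_LIST_RLOCK();
	 *   2. drop the list lock (the inp lock was already released for
	 *      sooptcopyin()) and malloc(mem_sz, M_CC_MEM, M_WAITOK) if the
	 *      module needs per-connection state;
	 *   3. zero a stack struct cc_var and the new buffer, re-take the
	 *      inp lock and call algo->cb_init(&cc_mem, ptr);
	 *   4. on success, run the old module's cb_destroy(), copy cc_mem
	 *      over tp->ccv, switch tp->cc_algo, and call conn_init() if
	 *      the connection is already established;
	 *   5. on failure, free(ptr, M_CC_MEM) and leave the previous
	 *      module in place.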
+ */ + if (algo->cb_init != NULL) + error = algo->cb_init(&cc_mem, ptr); + else + error = 0; /* - * If something goes pear shaped initialising the new - * algo, fall back to newreno (which does not - * require initialisation). + * Without INVARIANT we must recover and we leave the + * previous CC module in place, by freeing the memory + * we allocated and letting the return error propagate + * to the caller. */ - if (algo->cb_init != NULL && - algo->cb_init(tp->ccv) != 0) { - CC_ALGO(tp) = &newreno_cc_algo; + if (error == 0) { /* - * The only reason init should fail is - * because of malloc. + * Touchdown, lets go ahead and move the + * connection to the new CC module by + * copying in the cc_mem after we call + * the old ones cleanup (if any). */ - error = ENOMEM; - } + if (CC_ALGO(tp)->cb_destroy != NULL) + CC_ALGO(tp)->cb_destroy(tp->ccv); + memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var)); + tp->cc_algo = algo; + /* Ok now are we where we have gotten past any conn_init? */ + if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) { + /* Yep run the connection init for the new CC */ + CC_ALGO(tp)->conn_init(tp->ccv); + } + } else if (ptr) + free(ptr, M_CC_MEM); INP_WUNLOCK(inp); break; - + } case TCP_REUSPORT_LB_NUMA: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval),