Add the TCP_CCALGOOPT socket option, which lets a caller get or set
congestion control algorithm specific parameters through a new ctl_output
hook in struct cc_algo.  struct cc_sockopt and the CC_SOCKOPT_DEFINE /
CC_SOCKOPT_RESET helpers are declared outside _KERNEL in netinet/cc.h,
netinet/cc is added to the include directory lists, and mod_cc.4, tcp.4
and mod_cc.9 are updated to describe the option.

Index: etc/mtree/BSD.include.dist
===================================================================
--- etc/mtree/BSD.include.dist
+++ etc/mtree/BSD.include.dist
@@ -270,6 +270,8 @@
         ..
     ..
     netinet
+        cc
+        ..
     ..
     netinet6
     ..
Index: include/Makefile
===================================================================
--- include/Makefile
+++ include/Makefile
@@ -51,7 +51,7 @@
 	geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \
 	geom/mirror geom/mountver geom/multipath geom/nop \
 	geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \
-	netgraph/atm netgraph/netflow \
+	netgraph/atm netgraph/netflow netinet/cc \
 	security/audit \
 	security/mac_biba security/mac_bsdextended security/mac_lomac \
 	security/mac_mls security/mac_partition \
Index: share/man/man4/mod_cc.4
===================================================================
--- share/man/man4/mod_cc.4
+++ share/man/man4/mod_cc.4
@@ -30,12 +30,16 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd September 15, 2011
+.Dd September 2, 2014
 .Dt MOD_CC 4
 .Os
 .Sh NAME
 .Nm mod_cc
 .Nd Modular congestion control
+.Sh SYNOPSIS
+.In netinet/cc.h
+.Fn CC_SOCKOPT_DEFINE "ccsopt" "valsize"
+.Fn CC_SOCKOPT_RESET "ccsopt"
 .Sh DESCRIPTION
 The modular congestion control framework allows the TCP implementation to
 dynamically change the congestion control algorithm used by new and existing
@@ -57,6 +61,56 @@
 MIB variable detailed in the
 .Sx MIB Variables
 section below.
+.Pp
+Algorithm specific parameters can be set or queried using the TCP_CCALGOOPT
+socket option (see
+.Xr tcp 4
+for details).
+Callers must pass a pointer to a
+.Vt struct cc_sockopt
+as
+.Va optval ,
+which has the following members:
+.Bd -literal -offset indent
+struct cc_sockopt {
+	char cc_name[TCP_CA_NAME_MAX];
+	socklen_t size;
+	int sopt_dir;
+	int sopt_name;
+	size_t sopt_valsize;
+	uint8_t sopt_val[];
+};
+.Ed
+.Pp
+The
+.Fn CC_SOCKOPT_DEFINE
+and
+.Fn CC_SOCKOPT_RESET
+macros provide convenience wrappers for working with
+.Vt struct cc_sockopt .
+Only the
+.Va cc_name ,
+.Va sopt_name ,
+.Va sopt_valsize
+and
+.Va sopt_val
+fields of
+.Vt struct cc_sockopt
+should be set explicitly by callers as required.
+If reusing a
+.Vt struct cc_sockopt
+within a scoped area of code, the
+.Fn CC_SOCKOPT_RESET
+macro should be used to reset certain struct members to appropriate values.
+.Pp
+The largest amount of data which can be passed through from/to the algorithm
+module's
+.Va ctl_output
+hook is dictated by the
+.Va valsize
+used in
+.Pa sys/netinet/tcp_usrreq.c
+for the temporary ccsopt variable (currently sizeof(uint32_t)).
 .Sh MIB Variables
 The framework exposes the following variables in the
 .Va net.inet.tcp.cc
Index: share/man/man4/tcp.4
===================================================================
--- share/man/man4/tcp.4
+++ share/man/man4/tcp.4
@@ -34,7 +34,7 @@
 .\"     From: @(#)tcp.4	8.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd November 8, 2013
+.Dd September 2, 2014
 .Dt TCP 4
 .Os
 .Sh NAME
@@ -137,6 +137,11 @@
 receive window size,
 and
 bandwidth-controlled window space.
+.It Dv TCP_CCALGOOPT
+Set or query congestion control algorithm specific parameters.
+See
+.Xr mod_cc 4
+for details.
 .It Dv TCP_CONGESTION
 Select or query the congestion control algorithm that TCP will use for the
 connection.
Index: share/man/man9/mod_cc.9
===================================================================
--- share/man/man9/mod_cc.9
+++ share/man/man9/mod_cc.9
@@ -31,7 +31,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd September 15, 2011
+.Dd September 2, 2014
 .Dt MOD_CC 9
 .Os
 .Sh NAME
@@ -74,6 +74,7 @@
 	void	(*cong_signal) (struct cc_var *ccv, uint32_t type);
 	void	(*post_recovery) (struct cc_var *ccv);
 	void	(*after_idle) (struct cc_var *ccv);
+	int	(*ctl_output)(struct cc_var *ccv, struct cc_sockopt *ccsopt);
 };
 .Ed
 .Pp
@@ -166,6 +167,23 @@
 It should be implemented to adjust state as required.
 .Pp
 The
+.Va ctl_output
+function is called when
+.Xr getsockopt 2
+or
+.Xr setsockopt 2
+is called on a
+.Xr tcp 4
+socket with the
+.Va optname
+argument set to TCP_CCALGOOPT and allows the caller to query or set algorithm
+specific parameters.
+See
+.Xr mod_cc 4
+for information on
+.Vt struct cc_sockopt .
+.Pp
+The
 .Fn DECLARE_CC_MODULE
 macro provides a convenient wrapper around the
 .Xr DECLARE_MODULE 9
Index: sys/netinet/cc.h
===================================================================
--- sys/netinet/cc.h
+++ sys/netinet/cc.h
@@ -54,6 +54,41 @@
 /* XXX: TCP_CA_NAME_MAX define lives in tcp.h for compat reasons. */
 #include <netinet/tcp.h>
 
+/*
+ * Shared structure with userspace for CC algorithm sockopt manipulation.
+ */
+struct cc_sockopt {
+	char	cc_name[TCP_CA_NAME_MAX]; /* CC algo the sockopt applies to. */
+	socklen_t size;		/* Struct size inc variable sopt_val len. */
+	int	sopt_dir;	/* SOPT_GET or SOPT_SET. */
+	int	sopt_name;	/* CC algo specific sockopt identifier. */
+	size_t	sopt_valsize;	/* Available space or data in sopt_val. */
+	uint8_t	sopt_val[];	/* Variable length data. */
+};
+
+/*
+ * CC_SOCKOPT_DEFINE(x, valsize) declares a byte buffer holding a struct
+ * cc_sockopt followed by valsize bytes of sopt_val, declares x as a pointer
+ * to it and sets x->size and x->sopt_valsize from the buffer's dimensions.
+ * CC_SOCKOPT_RESET(x) sets those two fields again, e.g. before x is reused.
+ * x must be a plain identifier because it is pasted into the buffer's name.
+ * XXX: the buffer is a uint8_t array accessed through a struct pointer;
+ * confirm its alignment is adequate on strict-alignment platforms.
+ */
+#define	CC_SOCKOPT_RESET(x)						\
+do {									\
+	(x)->size = sizeof(_ccsopt_ ## x);				\
+	(x)->sopt_valsize = sizeof(_ccsopt_ ## x) -			\
+	    sizeof(struct cc_sockopt);					\
+} while (0)
+
+#define	CC_SOCKOPT_DEFINE(x, valsize)					\
+	uint8_t _ccsopt_ ## x[(valsize) + sizeof(struct cc_sockopt)];	\
+	struct cc_sockopt *x = (struct cc_sockopt *)_ccsopt_ ## x;	\
+	CC_SOCKOPT_RESET(x)
+
+#ifdef _KERNEL
+
 /* Global CC vars. */
 extern STAILQ_HEAD(cc_head, cc_algo) cc_list;
 extern const int tcprexmtthresh;
@@ -143,6 +178,9 @@
 	/* Called when data transfer resumes after an idle period. */
 	void	(*after_idle)(struct cc_var *ccv);
 
+	/* Called for {get|set}sockopt() on a TCP socket with TCP_CCALGOOPT. */
+	int	(*ctl_output)(struct cc_var *ccv, struct cc_sockopt *ccsopt);
+
 	STAILQ_ENTRY (cc_algo) entries;
 };
 
@@ -164,4 +202,6 @@
 #define	CC_LIST_WUNLOCK()	rw_wunlock(&cc_list_lock)
 #define	CC_LIST_LOCK_ASSERT()	rw_assert(&cc_list_lock, RA_LOCKED)
 
+#endif /* _KERNEL */
+
 #endif /* _NETINET_CC_H_ */
Index: sys/netinet/tcp.h
===================================================================
--- sys/netinet/tcp.h
+++ sys/netinet/tcp.h
@@ -161,6 +161,7 @@
 #define	TCP_MD5SIG	16	/* use MD5 digests (RFC2385) */
 #define	TCP_INFO	32	/* retrieve tcp_info structure */
 #define	TCP_CONGESTION	64	/* get/set congestion control algorithm */
+#define	TCP_CCALGOOPT	65	/* get/set cc algorithm specific options */
 #define	TCP_KEEPINIT	128	/* N, time to establish connection */
 #define	TCP_KEEPIDLE	256	/* L,N,X start keeplives after this period */
 #define	TCP_KEEPINTVL	512	/* L,N interval between keepalives */
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -1303,6 +1303,7 @@
 	struct	tcp_info ti;
 	char buf[TCP_CA_NAME_MAX];
 	struct cc_algo *algo;
+	CC_SOCKOPT_DEFINE(ccsopt, sizeof(uint32_t));
 
 	error = 0;
 	inp = sotoinpcb(so);
@@ -1469,6 +1470,42 @@
 			CC_LIST_RUNLOCK();
 			goto unlock_and_done;
 
+		case TCP_CCALGOOPT:
+			/* Release the inpcb lock around the copyin. */
+			INP_WUNLOCK(inp);
+			socklen_t tmp_size = ccsopt->size;
+			error = sooptcopyin(sopt, ccsopt, ccsopt->size,
+			    sizeof(struct cc_sockopt));
+			if (error)
+				break;
+			if (ccsopt->size > tmp_size) {
+				/* Kernel ccsopt->sopt_val is too small. */
+				error = EMSGSIZE;
+				break;
+			}
+			/*
+			 * size and sopt_valsize now hold caller supplied
+			 * values; reject ones that do not fit the buffer.
+			 */
+			if (ccsopt->size < sizeof(struct cc_sockopt) ||
+			    ccsopt->sopt_valsize >
+			    ccsopt->size - sizeof(struct cc_sockopt)) {
+				error = EINVAL;
+				break;
+			}
+			INP_WLOCK_RECHECK(inp);
+
+			/* Bounded compare: cc_name may lack a terminator. */
+			if (strncmp(ccsopt->cc_name, CC_ALGO(tp)->name,
+			    TCP_CA_NAME_MAX) == 0 &&
+			    CC_ALGO(tp)->ctl_output != NULL) {
+				ccsopt->sopt_dir = SOPT_SET;
+				error = CC_ALGO(tp)->ctl_output(tp->ccv,
+				    ccsopt);
+			} else
+				error = ENOENT;
+			goto unlock_and_done;
+
 		case TCP_KEEPIDLE:
 		case TCP_KEEPINTVL:
 		case TCP_KEEPINIT:
@@ -1576,6 +1613,46 @@
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, buf, TCP_CA_NAME_MAX);
 			break;
+		case TCP_CCALGOOPT:
+			/* Release the inpcb lock around the copyin. */
+			INP_WUNLOCK(inp);
+			socklen_t tmp_size = ccsopt->size;
+			error = sooptcopyin(sopt, ccsopt, ccsopt->size,
+			    sizeof(struct cc_sockopt));
+			if (error)
+				break;
+			/*
+			 * If userspace ccsopt->sopt_val is larger than the
+			 * in-kernel ccsopt->sopt_val, use kernel's size.
+			 */
+			if (ccsopt->size > tmp_size)
+				CC_SOCKOPT_RESET(ccsopt);
+			/*
+			 * Otherwise the length fields are caller supplied;
+			 * reject values that do not fit the buffer.
+			 */
+			if (ccsopt->size < sizeof(struct cc_sockopt) ||
+			    ccsopt->sopt_valsize >
+			    ccsopt->size - sizeof(struct cc_sockopt)) {
+				error = EINVAL;
+				break;
+			}
+			INP_WLOCK_RECHECK(inp);
+
+			/* Bounded compare: cc_name may lack a terminator. */
+			if (strncmp(ccsopt->cc_name, CC_ALGO(tp)->name,
+			    TCP_CA_NAME_MAX) == 0 &&
+			    CC_ALGO(tp)->ctl_output != NULL) {
+				ccsopt->sopt_dir = SOPT_GET;
+				error = CC_ALGO(tp)->ctl_output(tp->ccv,
+				    ccsopt);
+			} else
+				error = ENOENT;
+			INP_WUNLOCK(inp);
+			if (!error)
+				error = sooptcopyout(sopt, ccsopt,
+				    ccsopt->size);
+			break;
 		case TCP_KEEPIDLE:
 		case TCP_KEEPINTVL:
 		case TCP_KEEPINIT: