Index: head/etc/mtree/BSD.include.dist =================================================================== --- head/etc/mtree/BSD.include.dist (revision 294539) +++ head/etc/mtree/BSD.include.dist (revision 294540) @@ -1,342 +1,344 @@ # $FreeBSD$ # # Please see the file src/etc/mtree/README before making changes to this file. # /set type=dir uname=root gname=wheel mode=0755 . arpa .. atf-c .. atf-c++ .. bsm .. bsnmp .. c++ 4.2 backward .. bits .. debug .. ext pb_ds detail basic_tree_policy .. bin_search_tree_ .. binary_heap_ .. binomial_heap_ .. binomial_heap_base_ .. cc_hash_table_map_ .. eq_fn .. gp_hash_table_map_ .. hash_fn .. left_child_next_sibling_heap_ .. list_update_map_ .. list_update_policy .. ov_tree_map_ .. pairing_heap_ .. pat_trie_ .. rb_tree_map_ .. rc_binomial_heap_ .. resize_policy .. splay_tree_ .. thin_heap_ .. tree_policy .. trie_policy .. unordered_iterator .. .. .. .. tr1 .. .. v1 experimental .. ext .. tr1 .. .. .. cam ata .. scsi .. .. crypto .. dev acpica .. agp .. an .. bktr .. ciss .. filemon .. firewire .. hwpmc .. ic .. iicbus .. io .. lmc .. mfi .. mpt mpilib .. .. nand .. nvme .. ofw .. pbio .. pci .. powermac_nvram .. ppbus .. smbus .. speaker .. usb .. utopia .. vkbd .. wi .. .. edit readline .. .. fs cuse .. devfs .. fdescfs .. msdosfs .. nandfs .. nfs .. nullfs .. procfs .. smbfs .. udf .. unionfs .. .. gcc 4.2 .. .. geom cache .. concat .. eli .. gate .. journal .. label .. mirror .. mountver .. multipath .. nop .. raid .. raid3 .. shsec .. stripe .. virstor .. .. gnu posix .. .. gssapi .. infiniband complib .. iba .. opensm .. vendor .. .. isofs cd9660 .. .. kadm5 .. krb5 .. lib80211 .. libmilter .. libxo .. lzma .. machine pc .. .. net altq .. .. net80211 .. netgraph atm .. bluetooth include .. .. netflow .. .. netinet + cc + .. .. netinet6 .. netipsec .. netnatm api .. msg .. saal .. sig .. .. netpfil pf .. .. netsmb .. nfs .. nfsclient .. nfsserver .. openssl .. pcap .. protocols .. rdma .. rpc .. rpcsvc .. security audit .. mac_biba .. mac_bsdextended .. mac_lomac .. mac_mls .. mac_partition .. .. ssp .. sys .. teken .. ufs ffs .. ufs .. .. vm .. xlocale .. .. Index: head/include/Makefile =================================================================== --- head/include/Makefile (revision 294539) +++ head/include/Makefile (revision 294540) @@ -1,384 +1,385 @@ # @(#)Makefile 8.2 (Berkeley) 1/4/94 # $FreeBSD$ # # Doing a "make install" builds /usr/include. .include CLEANFILES= osreldate.h version SUBDIR= arpa protocols rpcsvc rpc xlocale SUBDIR_PARALLEL= INCS= a.out.h ar.h assert.h bitstring.h complex.h cpio.h _ctype.h ctype.h \ db.h \ dirent.h dlfcn.h elf.h elf-hints.h err.h fmtmsg.h fnmatch.h fstab.h \ fts.h ftw.h getopt.h glob.h grp.h \ ieeefp.h ifaddrs.h \ inttypes.h iso646.h kenv.h langinfo.h libgen.h limits.h link.h \ locale.h malloc.h malloc_np.h memory.h monetary.h mpool.h mqueue.h \ ndbm.h netconfig.h \ netdb.h nl_types.h nlist.h nss.h nsswitch.h paths.h \ printf.h proc_service.h pthread.h \ pthread_np.h pwd.h ranlib.h readpassphrase.h regex.h \ res_update.h resolv.h runetype.h search.h semaphore.h setjmp.h \ signal.h spawn.h stab.h stdalign.h stdbool.h stddef.h \ stdnoreturn.h stdio.h stdlib.h string.h stringlist.h \ strings.h sysexits.h tar.h termios.h tgmath.h \ time.h timeconv.h timers.h ttyent.h \ uchar.h ulimit.h unistd.h utime.h utmpx.h uuid.h varargs.h \ wchar.h wctype.h wordexp.h xlocale.h .PATH: ${.CURDIR}/../contrib/libc-vis INCS+= vis.h MHDRS= float.h floatingpoint.h stdarg.h PHDRS= sched.h _semaphore.h LHDRS= aio.h errno.h fcntl.h linker_set.h poll.h stdatomic.h stdint.h \ syslog.h ucontext.h LDIRS= bsm cam geom net net80211 netgraph netinet netinet6 \ netipsec netnatm netsmb nfs nfsclient nfsserver sys vm LSUBDIRS= cam/ata cam/scsi \ dev/acpica dev/agp dev/an dev/bktr dev/ciss dev/filemon dev/firewire \ dev/hwpmc \ dev/ic dev/iicbus dev/io dev/lmc dev/mfi dev/nvme \ dev/ofw dev/pbio dev/pci ${_dev_powermac_nvram} dev/ppbus dev/smbus \ dev/speaker dev/utopia dev/vkbd dev/wi \ fs/devfs fs/fdescfs fs/msdosfs fs/nandfs fs/nfs fs/nullfs \ fs/procfs fs/smbfs fs/udf fs/unionfs \ geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \ geom/mirror geom/mountver geom/multipath geom/nop \ geom/raid geom/raid3 geom/shsec geom/stripe geom/virstor \ net/altq \ netgraph/atm netgraph/netflow \ + netinet/cc \ security/audit \ security/mac_biba security/mac_bsdextended security/mac_lomac \ security/mac_mls security/mac_partition \ ufs/ffs ufs/ufs LSUBSUBDIRS= dev/mpt/mpilib .if ${MK_BLUETOOTH} != "no" LSUBSUBDIRS+= netgraph/bluetooth/include .endif .if ${MK_CUSE} != "no" LSUBDIRS+= fs/cuse .endif .if ${MK_GSSAPI} != "no" SUBDIR+= gssapi INCS+= gssapi.h .endif .if ${MK_HESIOD} != "no" INCS+= hesiod.h .endif # Handle the #define aliases for libiconv .if ${MK_ICONV} == "yes" INCS+= iconv.h .endif .if ${MK_USB} != "no" LSUBDIRS+= dev/usb .endif .if ${MACHINE_ARCH} == "powerpc" || ${MACHINE_ARCH} == "powerpc64" _dev_powermac_nvram= dev/powermac_nvram .endif # Define SHARED to indicate whether you want symbolic links to the system # source (``symlinks''), or a separate copy (``copies''). ``symlinks'' is # probably only useful for developers and should be avoided if you do not # wish to tie your /usr/include and /usr/src together. #SHARED= symlinks SHARED?= copies INCS+= osreldate.h SYSDIR= ${.CURDIR}/../sys NEWVERS_SH= ${SYSDIR}/conf/newvers.sh PARAM_H= ${SYSDIR}/sys/param.h MK_OSRELDATE_SH= ${.CURDIR}/mk-osreldate.sh SYMLINKS+= ${INCLUDEDIR} ${LIBDIR}/include osreldate.h: ${NEWVERS_SH} ${PARAM_H} ${MK_OSRELDATE_SH} env NEWVERS_SH=${NEWVERS_SH} PARAMFILE=${PARAM_H} SYSDIR=${SYSDIR} \ sh ${MK_OSRELDATE_SH} .for i in ${LHDRS} INCSLINKS+= sys/$i ${INCLUDEDIR}/$i .endfor .for i in ${MHDRS} INCSLINKS+= machine/$i ${INCLUDEDIR}/$i .endfor .for i in ${PHDRS} INCSLINKS+= sys/$i ${INCLUDEDIR}/$i .endfor .if ${MACHINE} != ${MACHINE_CPUARCH} _MARCHS= ${MACHINE_CPUARCH} .endif .if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" _MARCHS+= x86 .endif .if ${MK_STAGING} == "yes" # tell bsd.incs.mk that we have it covered NO_STAGE_INCLUDES= .endif .include .if ${MK_STAGING} != "no" && !defined(_SKIP_BUILD) .if make(all) DESTDIR= ${STAGE_OBJTOP} all: stage_include installincludes: buildincludes buildincludes: stage_prep stage_prep: @mkdir -p ${DESTDIR}${INCLUDEDIR} @touch $@ stage_include: .dirdep installincludes @touch $@ .endif .endif installincludes: ${SHARED} ${SHARED}: compat # Take care of stale directory-level symlinks. compat: .for i in ${LDIRS} ${LSUBDIRS} machine ${_MARCHS} crypto if [ -L ${DESTDIR}${INCLUDEDIR}/$i ]; then \ rm -f ${DESTDIR}${INCLUDEDIR}/$i; \ fi .endfor mtree -deU ${MTREE_FOLLOWS_SYMLINKS} \ -f ${.CURDIR}/../etc/mtree/BSD.include.dist \ -p ${DESTDIR}${INCLUDEDIR} > /dev/null .if ${MK_DIRDEPS_BUILD} == "yes" @touch ${.TARGET} .endif copies: .for i in ${LDIRS} ${LSUBDIRS} ${LSUBSUBDIRS} crypto machine machine/pc \ ${_MARCHS} if [ -d ${DESTDIR}${INCLUDEDIR}/$i ]; then \ cd ${DESTDIR}${INCLUDEDIR}/$i; \ for h in *.h; do \ if [ -L $$h ]; then rm -f $$h; fi; \ done; \ fi .endfor .for i in ${LDIRS} ${LSUBDIRS:Ndev/agp:Ndev/acpica:Ndev/bktr:Ndev/nand:Ndev/pci} ${LSUBSUBDIRS} cd ${.CURDIR}/../sys; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 $i/*.h \ ${DESTDIR}${INCLUDEDIR}/$i .endfor cd ${.CURDIR}/../sys/dev/acpica; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 acpiio.h \ ${DESTDIR}${INCLUDEDIR}/dev/acpica; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 acpi_hpet.h \ ${DESTDIR}${INCLUDEDIR}/dev/acpica cd ${.CURDIR}/../sys/dev/agp; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 agpreg.h \ ${DESTDIR}${INCLUDEDIR}/dev/agp cd ${.CURDIR}/../sys/dev/bktr; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 ioctl_*.h \ ${DESTDIR}${INCLUDEDIR}/dev/bktr .if ${MK_NAND} != "no" cd ${.CURDIR}/../sys/dev/nand; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 nandsim.h \ ${DESTDIR}${INCLUDEDIR}/dev/nand; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 nand_dev.h \ ${DESTDIR}${INCLUDEDIR}/dev/nand .endif cd ${.CURDIR}/../sys/dev/pci; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 pcireg.h \ ${DESTDIR}${INCLUDEDIR}/dev/pci cd ${.CURDIR}/../sys/fs/cd9660/; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/isofs/cd9660 .if ${MK_IPFILTER} != "no" cd ${.CURDIR}/../sys/contrib/ipfilter/netinet; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/netinet .endif .if ${MK_PF} != "no" cd ${.CURDIR}/../sys/netpfil/pf; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/netpfil/pf .endif cd ${.CURDIR}/../sys/crypto; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 rijndael/rijndael.h \ ${DESTDIR}${INCLUDEDIR}/crypto cd ${.CURDIR}/../sys/opencrypto; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/crypto cd ${.CURDIR}/../sys/${MACHINE}/include; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/machine .if exists(${.CURDIR}/../sys/${MACHINE}/include/pc) cd ${.CURDIR}/../sys/${MACHINE}/include/pc; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/machine/pc .endif .for _MARCH in ${_MARCHS} .if exists(${.CURDIR}/../sys/${_MARCH}/include) ${INSTALL} -d -o ${BINOWN} -g ${BINGRP} -m 755 \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}; \ cd ${.CURDIR}/../sys/${_MARCH}/include; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/${_MARCH} .if exists(${.CURDIR}/../sys/${_MARCH}/include/pc) ${INSTALL} -d -o ${BINOWN} -g ${BINGRP} -m 755 \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}/pc; \ cd ${.CURDIR}/../sys/${_MARCH}/include/pc; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 *.h \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}/pc .endif .endif .endfor cd ${.CURDIR}/../sys/rpc; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 types.h \ ${DESTDIR}${INCLUDEDIR}/rpc cd ${.CURDIR}/../sys/teken; \ ${INSTALL} -C -o ${BINOWN} -g ${BINGRP} -m 444 teken.h \ ${DESTDIR}${INCLUDEDIR}/teken .if ${MK_DIRDEPS_BUILD} == "yes" @touch ${.OBJDIR}/${.TARGET} .endif symlinks: @${ECHO} "Setting up symlinks to kernel source tree..." .for i in ${LDIRS} cd ${.CURDIR}/../sys/$i; \ for h in *.h; do \ ${INSTALL_SYMLINK} ../../../sys/$i/$$h ${DESTDIR}${INCLUDEDIR}/$i; \ done .endfor .for i in ${LSUBDIRS:Ndev/agp:Ndev/acpica:Ndev/bktr:Ndev/nand:Ndev/pci} cd ${.CURDIR}/../sys/$i; \ for h in *.h; do \ ${INSTALL_SYMLINK} ../../../../sys/$i/$$h ${DESTDIR}${INCLUDEDIR}/$i; \ done .endfor cd ${.CURDIR}/../sys/dev/acpica; \ for h in acpiio.h acpi_hpet.h; do \ ${INSTALL_SYMLINK} ../../../../sys/dev/acpica/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/acpica; \ done cd ${.CURDIR}/../sys/dev/agp; \ for h in agpreg.h; do \ ${INSTALL_SYMLINK} ../../../../sys/dev/agp/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/agp; \ done cd ${.CURDIR}/../sys/dev/bktr; \ for h in ioctl_*.h; do \ ${INSTALL_SYMLINK} ../../../../sys/dev/bktr/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/bktr; \ done .if ${MK_NAND} != "no" cd ${.CURDIR}/../sys/dev/nand; \ for h in nandsim.h nand_dev.h; do \ ${INSTALL_SYMLINK} ../../../../sys/dev/nand/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/nand; \ done .endif cd ${.CURDIR}/../sys/dev/pci; \ for h in pcireg.h; do \ ${INSTALL_SYMLINK} ../../../../sys/dev/pci/$$h \ ${DESTDIR}${INCLUDEDIR}/dev/pci; \ done .for i in ${LSUBSUBDIRS} cd ${.CURDIR}/../sys/$i; \ for h in *.h; do \ ${INSTALL_SYMLINK} ../../../../../sys/$i/$$h ${DESTDIR}${INCLUDEDIR}/$i; \ done .endfor .if ${MK_IPFILTER} != "no" cd ${.CURDIR}/../sys/contrib/ipfilter/netinet; \ for h in *.h; do \ ${INSTALL_SYMLINK} ../../../sys/contrib/ipfilter/netinet/$$h \ ${DESTDIR}${INCLUDEDIR}/netinet; \ done .endif .if ${MK_PF} != "no" cd ${.CURDIR}/../sys/netpfil/pf; \ for h in *.h; do \ ${INSTALL_SYMLINK} ../../../../sys/netpfil/pf/$$h \ ${DESTDIR}${INCLUDEDIR}/netpfil/pf; \ done .endif cd ${.CURDIR}/../sys/crypto; \ for h in rijndael/rijndael.h; do \ ${INSTALL_SYMLINK} ../../../sys/crypto/$$h \ ${DESTDIR}${INCLUDEDIR}/crypto; \ done cd ${.CURDIR}/../sys/opencrypto; \ for h in *.h; do \ ${INSTALL_SYMLINK} ../../../sys/opencrypto/$$h \ ${DESTDIR}${INCLUDEDIR}/crypto; \ done cd ${.CURDIR}/../sys/${MACHINE}/include; \ for h in *.h; do \ ${INSTALL_SYMLINK} ../../../sys/${MACHINE}/include/$$h \ ${DESTDIR}${INCLUDEDIR}/machine; \ done .if exists(${.CURDIR}/../sys/${MACHINE}/include/pc) cd ${.CURDIR}/../sys/${MACHINE}/include/pc; \ for h in *.h; do \ ${INSTALL_SYMLINK} ../../../../sys/${MACHINE}/include/pc/$$h \ ${DESTDIR}${INCLUDEDIR}/machine/pc; \ done .endif .for _MARCH in ${_MARCHS} .if exists(${.CURDIR}/../sys/${_MARCH}/include) ${INSTALL} -d -o ${BINOWN} -g ${BINGRP} -m 755 \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}; \ cd ${.CURDIR}/../sys/${_MARCH}/include; \ for h in *.h; do \ ${INSTALL_SYMLINK} ../../../sys/${_MARCH}/include/$$h \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}; \ done .if exists(${.CURDIR}/../sys/${_MARCH}/include/pc) ${INSTALL} -d -o ${BINOWN} -g ${BINGRP} -m 755 \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}/pc; \ cd ${.CURDIR}/../sys/${_MARCH}/include/pc; \ for h in *.h; do \ ${INSTALL_SYMLINK} ../../../../sys/${_MARCH}/include/pc/$$h \ ${DESTDIR}${INCLUDEDIR}/${_MARCH}/pc; \ done .endif .endif .endfor cd ${.CURDIR}/../sys/fs/cd9660; \ for h in *.h; do \ ${INSTALL_SYMLINK} ../../../../sys/fs/cd9660/$$h \ ${DESTDIR}${INCLUDEDIR}/isofs/cd9660; \ done cd ${.CURDIR}/../sys/rpc; \ for h in types.h; do \ ${INSTALL_SYMLINK} ../../../sys/rpc/$$h \ ${DESTDIR}${INCLUDEDIR}/rpc; \ done .if ${MK_DIRDEPS_BUILD} == "yes" @touch ${.OBJDIR}/${.TARGET} .endif .if ${MACHINE} == "host" && !defined(_SKIP_BUILD) # we're here because we are building a sysroot... # we need MACHINE et al set correctly HOST_MACHINE!= uname -m HOST_MACHINE_ARCH!= uname -p MACHINE:= ${HOST_MACHINE} MACHINE_ARCH:= ${HOST_MACHINE_ARCH} .endif Index: head/share/man/man4/mod_cc.4 =================================================================== --- head/share/man/man4/mod_cc.4 (revision 294539) +++ head/share/man/man4/mod_cc.4 (revision 294540) @@ -1,119 +1,129 @@ .\" .\" Copyright (c) 2010-2011 The FreeBSD Foundation .\" All rights reserved. .\" .\" This documentation was written at the Centre for Advanced Internet .\" Architectures, Swinburne University of Technology, Melbourne, Australia by .\" David Hayes and Lawrence Stewart under sponsorship from the FreeBSD .\" Foundation. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR .\" ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd January 12, 2015 +.Dd January 21, 2016 .Dt MOD_CC 4 .Os .Sh NAME .Nm mod_cc .Nd Modular congestion control .Sh DESCRIPTION The modular congestion control framework allows the TCP implementation to dynamically change the congestion control algorithm used by new and existing connections. Algorithms are identified by a unique .Xr ascii 7 name. Algorithm modules can be compiled into the kernel or loaded as kernel modules using the .Xr kld 4 facility. .Pp The default algorithm is NewReno, and all connections use the default unless -explicitly overridden using the TCP_CONGESTION socket option (see +explicitly overridden using the +.Dv TCP_CONGESTION +socket option (see .Xr tcp 4 for details). The default can be changed using a .Xr sysctl 3 MIB variable detailed in the .Sx MIB Variables section below. +.Pp +Algorithm specific parameters can be set or queried using the +.Dv TCP_CCALGOOPT +socket option (see +.Xr tcp 4 +for details). +Callers must pass a pointer to an algorithm specific data, and specify +its size. .Sh MIB Variables The framework exposes the following variables in the .Va net.inet.tcp.cc branch of the .Xr sysctl 3 MIB: .Bl -tag -width ".Va available" .It Va available Read-only list of currently available congestion control algorithms by name. .It Va algorithm Returns the current default congestion control algorithm when read, and changes the default when set. When attempting to change the default algorithm, this variable should be set to one of the names listed by the .Va net.inet.tcp.cc.available MIB variable. .El .Sh SEE ALSO .Xr cc_cdg 4 , .Xr cc_chd 4 , .Xr cc_cubic 4 , .Xr cc_dctcp 4 , .Xr cc_hd 4 , .Xr cc_htcp 4 , .Xr cc_newreno 4 , .Xr cc_vegas 4 , .Xr tcp 4 , .Xr mod_cc 9 .Sh ACKNOWLEDGEMENTS Development and testing of this software were made possible in part by grants from the FreeBSD Foundation and Cisco University Research Program Fund at Community Foundation Silicon Valley. .Sh HISTORY The .Nm modular congestion control framework first appeared in .Fx 9.0 . .Pp The framework was first released in 2007 by James Healy and Lawrence Stewart whilst working on the NewTCP research project at Swinburne University of Technology's Centre for Advanced Internet Architectures, Melbourne, Australia, which was made possible in part by a grant from the Cisco University Research Program Fund at Community Foundation Silicon Valley. More details are available at: .Pp http://caia.swin.edu.au/urp/newtcp/ .Sh AUTHORS .An -nosplit The .Nm facility was written by .An Lawrence Stewart Aq Mt lstewart@FreeBSD.org , .An James Healy Aq Mt jimmy@deefa.com and .An David Hayes Aq Mt david.hayes@ieee.org . .Pp This manual page was written by .An David Hayes Aq Mt david.hayes@ieee.org and .An Lawrence Stewart Aq Mt lstewart@FreeBSD.org . Index: head/share/man/man4/tcp.4 =================================================================== --- head/share/man/man4/tcp.4 (revision 294539) +++ head/share/man/man4/tcp.4 (revision 294540) @@ -1,631 +1,636 @@ .\" Copyright (c) 1983, 1991, 1993 .\" The Regents of the University of California. .\" Copyright (c) 2010-2011 The FreeBSD Foundation .\" All rights reserved. .\" .\" Portions of this documentation were written at the Centre for Advanced .\" Internet Architectures, Swinburne University of Technology, Melbourne, .\" Australia by David Hayes under sponsorship from the FreeBSD Foundation. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd October 27, 2015 +.Dd January 21, 2016 .Dt TCP 4 .Os .Sh NAME .Nm tcp .Nd Internet Transmission Control Protocol .Sh SYNOPSIS .In sys/types.h .In sys/socket.h .In netinet/in.h .In netinet/tcp.h .Ft int .Fn socket AF_INET SOCK_STREAM 0 .Sh DESCRIPTION The .Tn TCP protocol provides reliable, flow-controlled, two-way transmission of data. It is a byte-stream protocol used to support the .Dv SOCK_STREAM abstraction. .Tn TCP uses the standard Internet address format and, in addition, provides a per-host collection of .Dq "port addresses" . Thus, each address is composed of an Internet address specifying the host and network, with a specific .Tn TCP port on the host identifying the peer entity. .Pp Sockets utilizing the .Tn TCP protocol are either .Dq active or .Dq passive . Active sockets initiate connections to passive sockets. By default, .Tn TCP sockets are created active; to create a passive socket, the .Xr listen 2 system call must be used after binding the socket with the .Xr bind 2 system call. Only passive sockets may use the .Xr accept 2 call to accept incoming connections. Only active sockets may use the .Xr connect 2 call to initiate connections. .Pp Passive sockets may .Dq underspecify their location to match incoming connection requests from multiple networks. This technique, termed .Dq "wildcard addressing" , allows a single server to provide service to clients on multiple networks. To create a socket which listens on all networks, the Internet address .Dv INADDR_ANY must be bound. The .Tn TCP port may still be specified at this time; if the port is not specified, the system will assign one. Once a connection has been established, the socket's address is fixed by the peer entity's location. The address assigned to the socket is the address associated with the network interface through which packets are being transmitted and received. Normally, this address corresponds to the peer entity's network. .Pp .Tn TCP supports a number of socket options which can be set with .Xr setsockopt 2 and tested with .Xr getsockopt 2 : .Bl -tag -width ".Dv TCP_CONGESTION" .It Dv TCP_INFO Information about a socket's underlying TCP session may be retrieved by passing the read-only option .Dv TCP_INFO to .Xr getsockopt 2 . It accepts a single argument: a pointer to an instance of .Vt "struct tcp_info" . .Pp This API is subject to change; consult the source to determine which fields are currently filled out by this option. .Fx specific additions include send window size, receive window size, and bandwidth-controlled window space. +.It Dv TCP_CCALGOOPT +Set or query congestion control algorithm specific parameters. +See +.Xr mod_cc 4 +for details. .It Dv TCP_CONGESTION Select or query the congestion control algorithm that TCP will use for the connection. See .Xr mod_cc 4 for details. .It Dv TCP_KEEPINIT This .Xr setsockopt 2 option accepts a per-socket timeout argument of .Vt "u_int" in seconds, for new, non-established .Tn TCP connections. For the global default in milliseconds see .Va keepinit in the .Sx MIB Variables section further down. .It Dv TCP_KEEPIDLE This .Xr setsockopt 2 option accepts an argument of .Vt "u_int" for the amount of time, in seconds, that the connection must be idle before keepalive probes (if enabled) are sent for the connection of this socket. If set on a listening socket, the value is inherited by the newly created socket upon .Xr accept 2 . For the global default in milliseconds see .Va keepidle in the .Sx MIB Variables section further down. .It Dv TCP_KEEPINTVL This .Xr setsockopt 2 option accepts an argument of .Vt "u_int" to set the per-socket interval, in seconds, between keepalive probes sent to a peer. If set on a listening socket, the value is inherited by the newly created socket upon .Xr accept 2 . For the global default in milliseconds see .Va keepintvl in the .Sx MIB Variables section further down. .It Dv TCP_KEEPCNT This .Xr setsockopt 2 option accepts an argument of .Vt "u_int" and allows a per-socket tuning of the number of probes sent, with no response, before the connection will be dropped. If set on a listening socket, the value is inherited by the newly created socket upon .Xr accept 2 . For the global default see the .Va keepcnt in the .Sx MIB Variables section further down. .It Dv TCP_NODELAY Under most circumstances, .Tn TCP sends data when it is presented; when outstanding data has not yet been acknowledged, it gathers small amounts of output to be sent in a single packet once an acknowledgement is received. For a small number of clients, such as window systems that send a stream of mouse events which receive no replies, this packetization may cause significant delays. The boolean option .Dv TCP_NODELAY defeats this algorithm. .It Dv TCP_MAXSEG By default, a sender- and .No receiver- Ns Tn TCP will negotiate among themselves to determine the maximum segment size to be used for each connection. The .Dv TCP_MAXSEG option allows the user to determine the result of this negotiation, and to reduce it if desired. .It Dv TCP_NOOPT .Tn TCP usually sends a number of options in each packet, corresponding to various .Tn TCP extensions which are provided in this implementation. The boolean option .Dv TCP_NOOPT is provided to disable .Tn TCP option use on a per-connection basis. .It Dv TCP_NOPUSH By convention, the .No sender- Ns Tn TCP will set the .Dq push bit, and begin transmission immediately (if permitted) at the end of every user call to .Xr write 2 or .Xr writev 2 . When this option is set to a non-zero value, .Tn TCP will delay sending any data at all until either the socket is closed, or the internal send buffer is filled. .It Dv TCP_MD5SIG This option enables the use of MD5 digests (also known as TCP-MD5) on writes to the specified socket. Outgoing traffic is digested; digests on incoming traffic are verified if the .Va net.inet.tcp.signature_verify_input sysctl is nonzero. The current default behavior for the system is to respond to a system advertising this option with TCP-MD5; this may change. .Pp One common use for this in a .Fx router deployment is to enable based routers to interwork with Cisco equipment at peering points. Support for this feature conforms to RFC 2385. Only IPv4 .Pq Dv AF_INET sessions are supported. .Pp In order for this option to function correctly, it is necessary for the administrator to add a tcp-md5 key entry to the system's security associations database (SADB) using the .Xr setkey 8 utility. This entry must have an SPI of 0x1000 and can therefore only be specified on a per-host basis at this time. .Pp If an SADB entry cannot be found for the destination, the outgoing traffic will have an invalid digest option prepended, and the following error message will be visible on the system console: .Em "tcp_signature_compute: SADB lookup failed for %d.%d.%d.%d" . .El .Pp The option level for the .Xr setsockopt 2 call is the protocol number for .Tn TCP , available from .Xr getprotobyname 3 , or .Dv IPPROTO_TCP . All options are declared in .In netinet/tcp.h . .Pp Options at the .Tn IP transport level may be used with .Tn TCP ; see .Xr ip 4 . Incoming connection requests that are source-routed are noted, and the reverse source route is used in responding. .Pp The default congestion control algorithm for .Tn TCP is .Xr cc_newreno 4 . Other congestion control algorithms can be made available using the .Xr mod_cc 4 framework. .Ss MIB Variables The .Tn TCP protocol implements a number of variables in the .Va net.inet.tcp branch of the .Xr sysctl 3 MIB. .Bl -tag -width ".Va TCPCTL_DO_RFC1323" .It Dv TCPCTL_DO_RFC1323 .Pq Va rfc1323 Implement the window scaling and timestamp options of RFC 1323 (default is true). .It Dv TCPCTL_MSSDFLT .Pq Va mssdflt The default value used for the maximum segment size .Pq Dq MSS when no advice to the contrary is received from MSS negotiation. .It Dv TCPCTL_SENDSPACE .Pq Va sendspace Maximum .Tn TCP send window. .It Dv TCPCTL_RECVSPACE .Pq Va recvspace Maximum .Tn TCP receive window. .It Va log_in_vain Log any connection attempts to ports where there is not a socket accepting connections. The value of 1 limits the logging to .Tn SYN (connection establishment) packets only. That of 2 results in any .Tn TCP packets to closed ports being logged. Any value unlisted above disables the logging (default is 0, i.e., the logging is disabled). .It Va msl The Maximum Segment Lifetime, in milliseconds, for a packet. .It Va keepinit Timeout, in milliseconds, for new, non-established .Tn TCP connections. The default is 75000 msec. .It Va keepidle Amount of time, in milliseconds, that the connection must be idle before keepalive probes (if enabled) are sent. The default is 7200000 msec (2 hours). .It Va keepintvl The interval, in milliseconds, between keepalive probes sent to remote machines, when no response is received on a .Va keepidle probe. The default is 75000 msec. .It Va keepcnt Number of probes sent, with no response, before a connection is dropped. The default is 8 packets. .It Va always_keepalive Assume that .Dv SO_KEEPALIVE is set on all .Tn TCP connections, the kernel will periodically send a packet to the remote host to verify the connection is still up. .It Va icmp_may_rst Certain .Tn ICMP unreachable messages may abort connections in .Tn SYN-SENT state. .It Va do_tcpdrain Flush packets in the .Tn TCP reassembly queue if the system is low on mbufs. .It Va blackhole If enabled, disable sending of RST when a connection is attempted to a port where there is not a socket accepting connections. See .Xr blackhole 4 . .It Va delayed_ack Delay ACK to try and piggyback it onto a data packet. .It Va delacktime Maximum amount of time, in milliseconds, before a delayed ACK is sent. .It Va path_mtu_discovery Enable Path MTU Discovery. .It Va tcbhashsize Size of the .Tn TCP control-block hash table (read-only). This may be tuned using the kernel option .Dv TCBHASHSIZE or by setting .Va net.inet.tcp.tcbhashsize in the .Xr loader 8 . .It Va pcbcount Number of active process control blocks (read-only). .It Va syncookies Determines whether or not .Tn SYN cookies should be generated for outbound .Tn SYN-ACK packets. .Tn SYN cookies are a great help during .Tn SYN flood attacks, and are enabled by default. (See .Xr syncookies 4 . ) .It Va isn_reseed_interval The interval (in seconds) specifying how often the secret data used in RFC 1948 initial sequence number calculations should be reseeded. By default, this variable is set to zero, indicating that no reseeding will occur. Reseeding should not be necessary, and will break .Dv TIME_WAIT recycling for a few minutes. .It Va rexmit_min , rexmit_slop Adjust the retransmit timer calculation for .Tn TCP . The slop is typically added to the raw calculation to take into account occasional variances that the .Tn SRTT (smoothed round-trip time) is unable to accommodate, while the minimum specifies an absolute minimum. While a number of .Tn TCP RFCs suggest a 1 second minimum, these RFCs tend to focus on streaming behavior, and fail to deal with the fact that a 1 second minimum has severe detrimental effects over lossy interactive connections, such as a 802.11b wireless link, and over very fast but lossy connections for those cases not covered by the fast retransmit code. For this reason, we use 200ms of slop and a near-0 minimum, which gives us an effective minimum of 200ms (similar to .Tn Linux ) . .It Va initcwnd_segments Enable the ability to specify initial congestion window in number of segments. The default value is 10 as suggested by RFC 6928. Changing the value on fly would not affect connections using congestion window from the hostcache. Caution: This regulates the burst of packets allowed to be sent in the first RTT. The value should be relative to the link capacity. Start with small values for lower-capacity links. Large bursts can cause buffer overruns and packet drops if routers have small buffers or the link is experiencing congestion. .It Va rfc3042 Enable the Limited Transmit algorithm as described in RFC 3042. It helps avoid timeouts on lossy links and also when the congestion window is small, as happens on short transfers. .It Va rfc3390 Enable support for RFC 3390, which allows for a variable-sized starting congestion window on new connections, depending on the maximum segment size. This helps throughput in general, but particularly affects short transfers and high-bandwidth large propagation-delay connections. .It Va sack.enable Enable support for RFC 2018, TCP Selective Acknowledgment option, which allows the receiver to inform the sender about all successfully arrived segments, allowing the sender to retransmit the missing segments only. .It Va sack.maxholes Maximum number of SACK holes per connection. Defaults to 128. .It Va sack.globalmaxholes Maximum number of SACK holes per system, across all connections. Defaults to 65536. .It Va maxtcptw When a TCP connection enters the .Dv TIME_WAIT state, its associated socket structure is freed, since it is of negligible size and use, and a new structure is allocated to contain a minimal amount of information necessary for sustaining a connection in this state, called the compressed TCP TIME_WAIT state. Since this structure is smaller than a socket structure, it can save a significant amount of system memory. The .Va net.inet.tcp.maxtcptw MIB variable controls the maximum number of these structures allocated. By default, it is initialized to .Va kern.ipc.maxsockets / 5. .It Va nolocaltimewait Suppress creating of compressed TCP TIME_WAIT states for connections in which both endpoints are local. .It Va fast_finwait2_recycle Recycle .Tn TCP .Dv FIN_WAIT_2 connections faster when the socket is marked as .Dv SBS_CANTRCVMORE (no user process has the socket open, data received on the socket cannot be read). The timeout used here is .Va finwait2_timeout . .It Va finwait2_timeout Timeout to use for fast recycling of .Tn TCP .Dv FIN_WAIT_2 connections. Defaults to 60 seconds. .It Va ecn.enable Enable support for TCP Explicit Congestion Notification (ECN). ECN allows a TCP sender to reduce the transmission rate in order to avoid packet drops. .It Va ecn.maxretries Number of retries (SYN or SYN/ACK retransmits) before disabling ECN on a specific connection. This is needed to help with connection establishment when a broken firewall is in the network path. .It Va pmtud_blackhole_detection Turn on automatic path MTU blackhole detection. In case of retransmits OS will lower the MSS to check if it's MTU problem. If current MSS is greater than configured value to try, it will be set to configured value, otherwise, MSS will be set to default values .Po Va net.inet.tcp.mssdflt and .Va net.inet.tcp.v6mssdflt .Pc . .It Va pmtud_blackhole_mss MSS to try for IPv4 if PMTU blackhole detection is turned on. .It Va v6pmtud_blackhole_mss MSS to try for IPv6 if PMTU blackhole detection is turned on. .It Va pmtud_blackhole_activated Number of times configured values were used in an attempt to downshift. .It Va pmtud_blackhole_activated_min_mss Number of times default MSS was used in an attempt to downshift. .It Va pmtud_blackhole_failed Number of connections for which retransmits continued even after MSS downshift. .El .Sh ERRORS A socket operation may fail with one of the following errors returned: .Bl -tag -width Er .It Bq Er EISCONN when trying to establish a connection on a socket which already has one; .It Bq Er ENOBUFS when the system runs out of memory for an internal data structure; .It Bq Er ETIMEDOUT when a connection was dropped due to excessive retransmissions; .It Bq Er ECONNRESET when the remote peer forces the connection to be closed; .It Bq Er ECONNREFUSED when the remote peer actively refuses connection establishment (usually because no process is listening to the port); .It Bq Er EADDRINUSE when an attempt is made to create a socket with a port which has already been allocated; .It Bq Er EADDRNOTAVAIL when an attempt is made to create a socket with a network address for which no network interface exists; .It Bq Er EAFNOSUPPORT when an attempt is made to bind or connect a socket to a multicast address. .El .Sh SEE ALSO .Xr getsockopt 2 , .Xr socket 2 , .Xr sysctl 3 , .Xr blackhole 4 , .Xr inet 4 , .Xr intro 4 , .Xr ip 4 , .Xr mod_cc 4 , .Xr siftr 4 , .Xr syncache 4 , .Xr setkey 8 .Rs .%A "V. Jacobson" .%A "R. Braden" .%A "D. Borman" .%T "TCP Extensions for High Performance" .%O "RFC 1323" .Re .Rs .%A "A. Heffernan" .%T "Protection of BGP Sessions via the TCP MD5 Signature Option" .%O "RFC 2385" .Re .Rs .%A "K. Ramakrishnan" .%A "S. Floyd" .%A "D. Black" .%T "The Addition of Explicit Congestion Notification (ECN) to IP" .%O "RFC 3168" .Re .Sh HISTORY The .Tn TCP protocol appeared in .Bx 4.2 . The RFC 1323 extensions for window scaling and timestamps were added in .Bx 4.4 . The .Dv TCP_INFO option was introduced in .Tn Linux 2.6 and is .Em subject to change . Index: head/share/man/man9/mod_cc.9 =================================================================== --- head/share/man/man9/mod_cc.9 (revision 294539) +++ head/share/man/man9/mod_cc.9 (revision 294540) @@ -1,334 +1,350 @@ .\" .\" Copyright (c) 2008-2009 Lawrence Stewart .\" Copyright (c) 2010-2011 The FreeBSD Foundation .\" All rights reserved. .\" .\" Portions of this documentation were written at the Centre for Advanced .\" Internet Architectures, Swinburne University of Technology, Melbourne, .\" Australia by David Hayes and Lawrence Stewart under sponsorship from the .\" FreeBSD Foundation. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR .\" ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd December 26, 2014 +.Dd January 21, 2016 .Dt MOD_CC 9 .Os .Sh NAME .Nm mod_cc , .Nm DECLARE_CC_MODULE , .Nm CCV .Nd Modular Congestion Control .Sh SYNOPSIS -.In netinet/cc.h +.In netinet/tcp.h +.In netinet/tcp_cc.h .In netinet/cc/cc_module.h .Fn DECLARE_CC_MODULE "ccname" "ccalgo" .Fn CCV "ccv" "what" .Sh DESCRIPTION The .Nm framework allows congestion control algorithms to be implemented as dynamically loadable kernel modules via the .Xr kld 4 facility. Transport protocols can select from the list of available algorithms on a connection-by-connection basis, or use the system default (see .Xr mod_cc 4 for more details). .Pp .Nm modules are identified by an .Xr ascii 7 name and set of hook functions encapsulated in a .Vt "struct cc_algo" , which has the following members: .Bd -literal -offset indent struct cc_algo { char name[TCP_CA_NAME_MAX]; int (*mod_init) (void); int (*mod_destroy) (void); int (*cb_init) (struct cc_var *ccv); void (*cb_destroy) (struct cc_var *ccv); void (*conn_init) (struct cc_var *ccv); void (*ack_received) (struct cc_var *ccv, uint16_t type); void (*cong_signal) (struct cc_var *ccv, uint32_t type); void (*post_recovery) (struct cc_var *ccv); void (*after_idle) (struct cc_var *ccv); + int (*ctl_output)(struct cc_var *, struct sockopt *, void *); }; .Ed .Pp The .Va name field identifies the unique name of the algorithm, and should be no longer than TCP_CA_NAME_MAX-1 characters in length (the TCP_CA_NAME_MAX define lives in .In netinet/tcp.h for compatibility reasons). .Pp The .Va mod_init function is called when a new module is loaded into the system but before the registration process is complete. It should be implemented if a module needs to set up some global state prior to being available for use by new connections. Returning a non-zero value from .Va mod_init will cause the loading of the module to fail. .Pp The .Va mod_destroy function is called prior to unloading an existing module from the kernel. It should be implemented if a module needs to clean up any global state before being removed from the kernel. The return value is currently ignored. .Pp The .Va cb_init function is called when a TCP control block .Vt struct tcpcb is created. It should be implemented if a module needs to allocate memory for storing private per-connection state. Returning a non-zero value from .Va cb_init will cause the connection set up to be aborted, terminating the connection as a result. .Pp The .Va cb_destroy function is called when a TCP control block .Vt struct tcpcb is destroyed. It should be implemented if a module needs to free memory allocated in .Va cb_init . .Pp The .Va conn_init function is called when a new connection has been established and variables are being initialised. It should be implemented to initialise congestion control algorithm variables for the newly established connection. .Pp The .Va ack_received function is called when a TCP acknowledgement (ACK) packet is received. Modules use the .Fa type argument as an input to their congestion management algorithms. The ACK types currently reported by the stack are CC_ACK and CC_DUPACK. CC_ACK indicates the received ACK acknowledges previously unacknowledged data. CC_DUPACK indicates the received ACK acknowledges data we have already received an ACK for. .Pp The .Va cong_signal function is called when a congestion event is detected by the TCP stack. Modules use the .Fa type argument as an input to their congestion management algorithms. The congestion event types currently reported by the stack are CC_ECN, CC_RTO, CC_RTO_ERR and CC_NDUPACK. CC_ECN is reported when the TCP stack receives an explicit congestion notification (RFC3168). CC_RTO is reported when the retransmission time out timer fires. CC_RTO_ERR is reported if the retransmission time out timer fired in error. CC_NDUPACK is reported if N duplicate ACKs have been received back-to-back, where N is the fast retransmit duplicate ack threshold (N=3 currently as per RFC5681). .Pp The .Va post_recovery function is called after the TCP connection has recovered from a congestion event. It should be implemented to adjust state as required. .Pp The .Va after_idle function is called when data transfer resumes after an idle period. It should be implemented to adjust state as required. +.Pp +The +.Va ctl_output +function is called when +.Xr getsockopt 2 +or +.Xr setsockopt 2 +is called on a +.Xr tcp 4 +socket with the +.Va struct sockopt +pointer forwarded unmodified from the TCP control, and a +.Va void * +pointer to algorithm specific argument. .Pp The .Fn DECLARE_CC_MODULE macro provides a convenient wrapper around the .Xr DECLARE_MODULE 9 macro, and is used to register a .Nm module with the .Nm framework. The .Fa ccname argument specifies the module's name. The .Fa ccalgo argument points to the module's .Vt struct cc_algo . .Pp .Nm modules must instantiate a .Vt struct cc_algo , but are only required to set the name field, and optionally any of the function pointers. The stack will skip calling any function pointer which is NULL, so there is no requirement to implement any of the function pointers. Using the C99 designated initialiser feature to set fields is encouraged. .Pp Each function pointer which deals with congestion control state is passed a pointer to a .Vt struct cc_var , which has the following members: .Bd -literal -offset indent struct cc_var { void *cc_data; int bytes_this_ack; tcp_seq curack; uint32_t flags; int type; union ccv_container { struct tcpcb *tcp; struct sctp_nets *sctp; } ccvc; }; .Ed .Pp .Vt struct cc_var groups congestion control related variables into a single, embeddable structure and adds a layer of indirection to accessing transport protocol control blocks. The eventual goal is to allow a single set of .Nm modules to be shared between all congestion aware transport protocols, though currently only .Xr tcp 4 is supported. .Pp To aid the eventual transition towards this goal, direct use of variables from the transport protocol's data structures is strongly discouraged. However, it is inevitable at the current time to require access to some of these variables, and so the .Fn CCV macro exists as a convenience accessor. The .Fa ccv argument points to the .Vt struct cc_var passed into the function by the .Nm framework. The .Fa what argument specifies the name of the variable to access. .Pp Apart from the .Va type and .Va ccv_container fields, the remaining fields in .Vt struct cc_var are for use by .Nm modules. .Pp The .Va cc_data field is available for algorithms requiring additional per-connection state to attach a dynamic memory pointer to. The memory should be allocated and attached in the module's .Va cb_init hook function. .Pp The .Va bytes_this_ack field specifies the number of new bytes acknowledged by the most recently received ACK packet. It is only valid in the .Va ack_received hook function. .Pp The .Va curack field specifies the sequence number of the most recently received ACK packet. It is only valid in the .Va ack_received , .Va cong_signal and .Va post_recovery hook functions. .Pp The .Va flags field is used to pass useful information from the stack to a .Nm module. The CCF_ABC_SENTAWND flag is relevant in .Va ack_received and is set when appropriate byte counting (RFC3465) has counted a window's worth of bytes has been sent. It is the module's responsibility to clear the flag after it has processed the signal. The CCF_CWND_LIMITED flag is relevant in .Va ack_received and is set when the connection's ability to send data is currently constrained by the value of the congestion window. Algorithms should use the absence of this flag being set to avoid accumulating a large difference between the congestion window and send window. .Sh SEE ALSO .Xr cc_cdg 4 , .Xr cc_chd 4 , .Xr cc_cubic 4 , .Xr cc_hd 4 , .Xr cc_htcp 4 , .Xr cc_newreno 4 , .Xr cc_vegas 4 , .Xr mod_cc 4 , .Xr tcp 4 .Sh ACKNOWLEDGEMENTS Development and testing of this software were made possible in part by grants from the FreeBSD Foundation and Cisco University Research Program Fund at Community Foundation Silicon Valley. .Sh FUTURE WORK Integrate with .Xr sctp 4 . .Sh HISTORY The modular Congestion Control (CC) framework first appeared in .Fx 9.0 . .Pp The framework was first released in 2007 by James Healy and Lawrence Stewart whilst working on the NewTCP research project at Swinburne University of Technology's Centre for Advanced Internet Architectures, Melbourne, Australia, which was made possible in part by a grant from the Cisco University Research Program Fund at Community Foundation Silicon Valley. More details are available at: .Pp http://caia.swin.edu.au/urp/newtcp/ .Sh AUTHORS .An -nosplit The .Nm framework was written by .An Lawrence Stewart Aq Mt lstewart@FreeBSD.org , .An James Healy Aq Mt jimmy@deefa.com and .An David Hayes Aq Mt david.hayes@ieee.org . .Pp This manual page was written by .An David Hayes Aq Mt david.hayes@ieee.org and .An Lawrence Stewart Aq Mt lstewart@FreeBSD.org . Index: head/sys/netinet/tcp.h =================================================================== --- head/sys/netinet/tcp.h (revision 294539) +++ head/sys/netinet/tcp.h (revision 294540) @@ -1,260 +1,261 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp.h 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #ifndef _NETINET_TCP_H_ #define _NETINET_TCP_H_ #include #include #if __BSD_VISIBLE typedef u_int32_t tcp_seq; #define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ #define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ /* * TCP header. * Per RFC 793, September, 1981. */ struct tcphdr { u_short th_sport; /* source port */ u_short th_dport; /* destination port */ tcp_seq th_seq; /* sequence number */ tcp_seq th_ack; /* acknowledgement number */ #if BYTE_ORDER == LITTLE_ENDIAN u_char th_x2:4, /* (unused) */ th_off:4; /* data offset */ #endif #if BYTE_ORDER == BIG_ENDIAN u_char th_off:4, /* data offset */ th_x2:4; /* (unused) */ #endif u_char th_flags; #define TH_FIN 0x01 #define TH_SYN 0x02 #define TH_RST 0x04 #define TH_PUSH 0x08 #define TH_ACK 0x10 #define TH_URG 0x20 #define TH_ECE 0x40 #define TH_CWR 0x80 #define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR) #define PRINT_TH_FLAGS "\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR" u_short th_win; /* window */ u_short th_sum; /* checksum */ u_short th_urp; /* urgent pointer */ }; #define TCPOPT_EOL 0 #define TCPOLEN_EOL 1 #define TCPOPT_PAD 0 /* padding after EOL */ #define TCPOLEN_PAD 1 #define TCPOPT_NOP 1 #define TCPOLEN_NOP 1 #define TCPOPT_MAXSEG 2 #define TCPOLEN_MAXSEG 4 #define TCPOPT_WINDOW 3 #define TCPOLEN_WINDOW 3 #define TCPOPT_SACK_PERMITTED 4 #define TCPOLEN_SACK_PERMITTED 2 #define TCPOPT_SACK 5 #define TCPOLEN_SACKHDR 2 #define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ #define TCPOPT_TIMESTAMP 8 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ #define TCPOLEN_SIGNATURE 18 #define TCPOPT_FAST_OPEN 34 #define TCPOLEN_FAST_OPEN_EMPTY 2 #define TCPOLEN_FAST_OPEN_MIN 6 #define TCPOLEN_FAST_OPEN_MAX 18 /* Miscellaneous constants */ #define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */ #define TCP_MAX_SACK 4 /* MAX # SACKs sent in any segment */ /* * The default maximum segment size (MSS) to be used for new TCP connections * when path MTU discovery is not enabled. * * RFC879 derives the default MSS from the largest datagram size hosts are * minimally required to handle directly or through IP reassembly minus the * size of the IP and TCP header. With IPv6 the minimum MTU is specified * in RFC2460. * * For IPv4 the MSS is 576 - sizeof(struct tcpiphdr) * For IPv6 the MSS is IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct tcphdr) * * We use explicit numerical definition here to avoid header pollution. */ #define TCP_MSS 536 #define TCP6_MSS 1220 /* * Limit the lowest MSS we accept for path MTU discovery and the TCP SYN MSS * option. Allowing low values of MSS can consume significant resources and * be used to mount a resource exhaustion attack. * Connections requesting lower MSS values will be rounded up to this value * and the IP_DF flag will be cleared to allow fragmentation along the path. * * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments. Setting * it to "0" disables the minmss check. * * The default value is fine for TCP across the Internet's smallest official * link MTU (256 bytes for AX.25 packet radio). However, a connection is very * unlikely to come across such low MTU interfaces these days (anno domini 2003). */ #define TCP_MINMSS 216 #define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ #define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ #define TCP_MAX_WINSHIFT 14 /* maximum window shift */ #define TCP_MAXBURST 4 /* maximum segments in a burst */ #define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ #define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) /* max space left for options */ #endif /* __BSD_VISIBLE */ /* * User-settable options (used with setsockopt). These are discrete * values and are not masked together. Some values appear to be * bitmasks for historical reasons. */ #define TCP_NODELAY 1 /* don't delay send to coalesce packets */ #if __BSD_VISIBLE #define TCP_MAXSEG 2 /* set maximum segment size */ #define TCP_NOPUSH 4 /* don't push last block of write */ #define TCP_NOOPT 8 /* don't use TCP options */ #define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ #define TCP_INFO 32 /* retrieve tcp_info structure */ #define TCP_CONGESTION 64 /* get/set congestion control algorithm */ +#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */ #define TCP_KEEPINIT 128 /* N, time to establish connection */ #define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */ #define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ #define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */ #define TCP_FASTOPEN 1025 /* enable TFO / was created via TFO */ #define TCP_PCAP_OUT 2048 /* number of output packets to keep */ #define TCP_PCAP_IN 4096 /* number of input packets to keep */ #define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */ /* Start of reserved space for third-party user-settable options. */ #define TCP_VENDOR SO_VENDOR #define TCP_CA_NAME_MAX 16 /* max congestion control name length */ #define TCPI_OPT_TIMESTAMPS 0x01 #define TCPI_OPT_SACK 0x02 #define TCPI_OPT_WSCALE 0x04 #define TCPI_OPT_ECN 0x08 #define TCPI_OPT_TOE 0x10 /* * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits * the caller to query certain information about the state of a TCP * connection. We provide an overlapping set of fields with the Linux * implementation, but since this is a fixed size structure, room has been * left for growth. In order to maximize potential future compatibility with * the Linux API, the same variable names and order have been adopted, and * padding left to make room for omitted fields in case they are added later. * * XXX: This is currently an unstable ABI/API, in that it is expected to * change. */ struct tcp_info { u_int8_t tcpi_state; /* TCP FSM state. */ u_int8_t __tcpi_ca_state; u_int8_t __tcpi_retransmits; u_int8_t __tcpi_probes; u_int8_t __tcpi_backoff; u_int8_t tcpi_options; /* Options enabled on conn. */ u_int8_t tcpi_snd_wscale:4, /* RFC1323 send shift value. */ tcpi_rcv_wscale:4; /* RFC1323 recv shift value. */ u_int32_t tcpi_rto; /* Retransmission timeout (usec). */ u_int32_t __tcpi_ato; u_int32_t tcpi_snd_mss; /* Max segment size for send. */ u_int32_t tcpi_rcv_mss; /* Max segment size for receive. */ u_int32_t __tcpi_unacked; u_int32_t __tcpi_sacked; u_int32_t __tcpi_lost; u_int32_t __tcpi_retrans; u_int32_t __tcpi_fackets; /* Times; measurements in usecs. */ u_int32_t __tcpi_last_data_sent; u_int32_t __tcpi_last_ack_sent; /* Also unimpl. on Linux? */ u_int32_t tcpi_last_data_recv; /* Time since last recv data. */ u_int32_t __tcpi_last_ack_recv; /* Metrics; variable units. */ u_int32_t __tcpi_pmtu; u_int32_t __tcpi_rcv_ssthresh; u_int32_t tcpi_rtt; /* Smoothed RTT in usecs. */ u_int32_t tcpi_rttvar; /* RTT variance in usecs. */ u_int32_t tcpi_snd_ssthresh; /* Slow start threshold. */ u_int32_t tcpi_snd_cwnd; /* Send congestion window. */ u_int32_t __tcpi_advmss; u_int32_t __tcpi_reordering; u_int32_t __tcpi_rcv_rtt; u_int32_t tcpi_rcv_space; /* Advertised recv window. */ /* FreeBSD extensions to tcp_info. */ u_int32_t tcpi_snd_wnd; /* Advertised send window. */ u_int32_t tcpi_snd_bwnd; /* No longer used. */ u_int32_t tcpi_snd_nxt; /* Next egress seqno */ u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ u_int32_t tcpi_snd_rexmitpack; /* Retransmitted packets */ u_int32_t tcpi_rcv_ooopack; /* Out-of-order packets */ u_int32_t tcpi_snd_zerowin; /* Zero-sized windows sent */ /* Padding to grow without breaking ABI. */ u_int32_t __tcpi_pad[26]; /* Padding. */ }; #endif #define TCP_FUNCTION_NAME_LEN_MAX 32 struct tcp_function_set { char function_set_name[TCP_FUNCTION_NAME_LEN_MAX]; uint32_t pcbcnt; }; #endif /* !_NETINET_TCP_H_ */ Index: head/sys/netinet/tcp_cc.h =================================================================== --- head/sys/netinet/tcp_cc.h (revision 294539) +++ head/sys/netinet/tcp_cc.h (revision 294540) @@ -1,175 +1,178 @@ /*- * Copyright (c) 2007-2008 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart and * James Healy, made possible in part by a grant from the Cisco University * Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * This software was first released in 2007 by James Healy and Lawrence Stewart * whilst working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * More details are available at: * http://caia.swin.edu.au/urp/newtcp/ */ #ifndef _NETINET_TCP_CC_H_ #define _NETINET_TCP_CC_H_ #if !defined(_KERNEL) #error "no user-servicable parts inside" #endif /* Global CC vars. */ extern STAILQ_HEAD(cc_head, cc_algo) cc_list; extern const int tcprexmtthresh; extern struct cc_algo newreno_cc_algo; /* Per-netstack bits. */ VNET_DECLARE(struct cc_algo *, default_cc_ptr); #define V_default_cc_ptr VNET(default_cc_ptr) /* Define the new net.inet.tcp.cc sysctl tree. */ SYSCTL_DECL(_net_inet_tcp_cc); /* CC housekeeping functions. */ int cc_register_algo(struct cc_algo *add_cc); int cc_deregister_algo(struct cc_algo *remove_cc); /* * Wrapper around transport structs that contain same-named congestion * control variables. Allows algos to be shared amongst multiple CC aware * transprots. */ struct cc_var { void *cc_data; /* Per-connection private CC algorithm data. */ int bytes_this_ack; /* # bytes acked by the current ACK. */ tcp_seq curack; /* Most recent ACK. */ uint32_t flags; /* Flags for cc_var (see below) */ int type; /* Indicates which ptr is valid in ccvc. */ union ccv_container { struct tcpcb *tcp; struct sctp_nets *sctp; } ccvc; }; /* cc_var flags. */ #define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */ #define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */ #define CCF_DELACK 0x0004 /* Is this ack delayed? */ #define CCF_ACKNOW 0x0008 /* Will this ack be sent now? */ #define CCF_IPHDR_CE 0x0010 /* Does this packet set CE bit? */ #define CCF_TCPHDR_CWR 0x0020 /* Does this packet set CWR bit? */ /* ACK types passed to the ack_received() hook. */ #define CC_ACK 0x0001 /* Regular in sequence ACK. */ #define CC_DUPACK 0x0002 /* Duplicate ACK. */ #define CC_PARTIALACK 0x0004 /* Not yet. */ #define CC_SACK 0x0008 /* Not yet. */ /* * Congestion signal types passed to the cong_signal() hook. The highest order 8 * bits (0x01000000 - 0x80000000) are reserved for CC algos to declare their own * congestion signal types. */ #define CC_ECN 0x00000001 /* ECN marked packet received. */ #define CC_RTO 0x00000002 /* RTO fired. */ #define CC_RTO_ERR 0x00000004 /* RTO fired in error. */ #define CC_NDUPACK 0x00000008 /* Threshold of dupack's reached. */ #define CC_SIGPRIVMASK 0xFF000000 /* Mask to check if sig is private. */ /* * Structure to hold data and function pointers that together represent a * congestion control algorithm. */ struct cc_algo { char name[TCP_CA_NAME_MAX]; /* Init global module state on kldload. */ int (*mod_init)(void); /* Cleanup global module state on kldunload. */ int (*mod_destroy)(void); /* Init CC state for a new control block. */ int (*cb_init)(struct cc_var *ccv); /* Cleanup CC state for a terminating control block. */ void (*cb_destroy)(struct cc_var *ccv); /* Init variables for a newly established connection. */ void (*conn_init)(struct cc_var *ccv); /* Called on receipt of an ack. */ void (*ack_received)(struct cc_var *ccv, uint16_t type); /* Called on detection of a congestion signal. */ void (*cong_signal)(struct cc_var *ccv, uint32_t type); /* Called after exiting congestion recovery. */ void (*post_recovery)(struct cc_var *ccv); /* Called when data transfer resumes after an idle period. */ void (*after_idle)(struct cc_var *ccv); /* Called for an additional ECN processing apart from RFC3168. */ void (*ecnpkt_handler)(struct cc_var *ccv); + /* Called for {get|set}sockopt() on a TCP socket with TCP_CCALGOOPT. */ + int (*ctl_output)(struct cc_var *, struct sockopt *, void *); + STAILQ_ENTRY (cc_algo) entries; }; /* Macro to obtain the CC algo's struct ptr. */ #define CC_ALGO(tp) ((tp)->cc_algo) /* Macro to obtain the CC algo's data ptr. */ #define CC_DATA(tp) ((tp)->ccv->cc_data) /* Macro to obtain the system default CC algo's struct ptr. */ #define CC_DEFAULT() V_default_cc_ptr extern struct rwlock cc_list_lock; #define CC_LIST_LOCK_INIT() rw_init(&cc_list_lock, "cc_list") #define CC_LIST_LOCK_DESTROY() rw_destroy(&cc_list_lock) #define CC_LIST_RLOCK() rw_rlock(&cc_list_lock) #define CC_LIST_RUNLOCK() rw_runlock(&cc_list_lock) #define CC_LIST_WLOCK() rw_wlock(&cc_list_lock) #define CC_LIST_WUNLOCK() rw_wunlock(&cc_list_lock) #define CC_LIST_LOCK_ASSERT() rw_assert(&cc_list_lock, RA_LOCKED) #endif /* _NETINET_TCP_CC_H_ */ Index: head/sys/netinet/tcp_usrreq.c =================================================================== --- head/sys/netinet/tcp_usrreq.c (revision 294539) +++ head/sys/netinet/tcp_usrreq.c (revision 294540) @@ -1,2275 +1,2301 @@ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2006-2007 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif #ifdef TCP_RFC7413 #include #endif #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef TCP_OFFLOAD #include #endif /* * TCP protocol interface to socket abstraction. */ static int tcp_attach(struct socket *); #ifdef INET static int tcp_connect(struct tcpcb *, struct sockaddr *, struct thread *td); #endif /* INET */ #ifdef INET6 static int tcp6_connect(struct tcpcb *, struct sockaddr *, struct thread *td); #endif /* INET6 */ static void tcp_disconnect(struct tcpcb *); static void tcp_usrclosed(struct tcpcb *); static void tcp_fill_info(struct tcpcb *, struct tcp_info *); #ifdef TCPDEBUG #define TCPDEBUG0 int ostate = 0 #define TCPDEBUG1() ostate = tp ? tp->t_state : 0 #define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ tcp_trace(TA_USER, ostate, tp, 0, 0, req) #else #define TCPDEBUG0 #define TCPDEBUG1() #define TCPDEBUG2(req) #endif /* * TCP attaches to socket via pru_attach(), reserving space, * and an internet control block. */ static int tcp_usr_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct tcpcb *tp = NULL; int error; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); TCPDEBUG1(); error = tcp_attach(so); if (error) goto out; if ((so->so_options & SO_LINGER) && so->so_linger == 0) so->so_linger = TCP_LINGERTIME; inp = sotoinpcb(so); tp = intotcpcb(inp); out: TCPDEBUG2(PRU_ATTACH); TCP_PROBE2(debug__user, tp, PRU_ATTACH); return error; } /* * tcp_detach is called when the socket layer loses its final reference * to the socket, be it a file descriptor reference, a reference from TCP, * etc. At this point, there is only one case in which we will keep around * inpcb state: time wait. * * This function can probably be re-absorbed back into tcp_usr_detach() now * that there is a single detach path. */ static void tcp_detach(struct socket *so, struct inpcb *inp) { struct tcpcb *tp; INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp")); KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so")); tp = intotcpcb(inp); if (inp->inp_flags & INP_TIMEWAIT) { /* * There are two cases to handle: one in which the time wait * state is being discarded (INP_DROPPED), and one in which * this connection will remain in timewait. In the former, * it is time to discard all state (except tcptw, which has * already been discarded by the timewait close code, which * should be further up the call stack somewhere). In the * latter case, we detach from the socket, but leave the pcb * present until timewait ends. * * XXXRW: Would it be cleaner to free the tcptw here? * * Astute question indeed, from twtcp perspective there are * three cases to consider: * * #1 tcp_detach is called at tcptw creation time by * tcp_twstart, then do not discard the newly created tcptw * and leave inpcb present until timewait ends * #2 tcp_detach is called at timewait end (or reuse) by * tcp_twclose, then the tcptw has already been discarded * (or reused) and inpcb is freed here * #3 tcp_detach is called() after timewait ends (or reuse) * (e.g. by soclose), then tcptw has already been discarded * (or reused) and inpcb is freed here * * In all three cases the tcptw should not be freed here. */ if (inp->inp_flags & INP_DROPPED) { KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && " "INP_DROPPED && tp != NULL")); in_pcbdetach(inp); in_pcbfree(inp); } else { in_pcbdetach(inp); INP_WUNLOCK(inp); } } else { /* * If the connection is not in timewait, we consider two * two conditions: one in which no further processing is * necessary (dropped || embryonic), and one in which TCP is * not yet done, but no longer requires the socket, so the * pcb will persist for the time being. * * XXXRW: Does the second case still occur? */ if (inp->inp_flags & INP_DROPPED || tp->t_state < TCPS_SYN_SENT) { tcp_discardcb(tp); in_pcbdetach(inp); in_pcbfree(inp); } else { in_pcbdetach(inp); INP_WUNLOCK(inp); } } } /* * pru_detach() detaches the TCP protocol from the socket. * If the protocol state is non-embryonic, then can't * do this directly: have to initiate a pru_disconnect(), * which may finish later; embryonic TCB's can just * be discarded here. */ static void tcp_usr_detach(struct socket *so) { struct inpcb *inp; int rlock = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); if (!INP_INFO_WLOCKED(&V_tcbinfo)) { INP_INFO_RLOCK(&V_tcbinfo); rlock = 1; } INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_detach: inp_socket == NULL")); tcp_detach(so, inp); if (rlock) INP_INFO_RUNLOCK(&V_tcbinfo); } #ifdef INET /* * Give the socket an address. */ static int tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in *sinp; sinp = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sinp)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. */ if (sinp->sin_family == AF_INET && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in6 *sin6p; sin6p = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof (*sin6p)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. */ if (sin6p->sin6_family == AF_INET6 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) return (EAFNOSUPPORT); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); INP_HASH_WLOCK(&V_tcbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) inp->inp_vflag |= INP_IPV4; else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } } #endif error = in6_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ #ifdef INET /* * Prepare to accept connections. */ static int tcp_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); INP_HASH_WLOCK(&V_tcbinfo); if (error == 0 && inp->inp_lport == 0) error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) tcp_offload_listen_start(tp); #endif } SOCK_UNLOCK(so); #ifdef TCP_RFC7413 if (tp->t_flags & TF_FASTOPEN) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); #endif out: TCPDEBUG2(PRU_LISTEN); TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); INP_HASH_WLOCK(&V_tcbinfo); if (error == 0 && inp->inp_lport == 0) { inp->inp_vflag &= ~INP_IPV4; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); } INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) tcp_offload_listen_start(tp); #endif } SOCK_UNLOCK(so); #ifdef TCP_RFC7413 if (tp->t_flags & TF_FASTOPEN) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); #endif out: TCPDEBUG2(PRU_LISTEN); TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ #ifdef INET /* * Initiate connection to peer. * Create a template for use in transmissions on this connection. * Enter SYN_SENT state, and mark socket as connecting. * Start keep-alive timer, and seed output sequence space. * Send initial segment on connection. */ static int tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in *sinp; sinp = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sinp)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. */ if (sinp->sin_family == AF_INET && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0) return (error); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) { error = EADDRINUSE; goto out; } if (inp->inp_flags & INP_DROPPED) { error = ECONNREFUSED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); if ((error = tcp_connect(tp, nam, td)) != 0) goto out; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in6 *sin6p; TCPDEBUG0; sin6p = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof (*sin6p)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. */ if (sin6p->sin6_family == AF_INET6 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) { error = EADDRINUSE; goto out; } if (inp->inp_flags & INP_DROPPED) { error = ECONNREFUSED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); #ifdef INET /* * XXXRW: Some confusion: V4/V6 flags relate to binding, and * therefore probably require the hash lock, which isn't held here. * Is this a significant problem? */ if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } in6_sin6_2_sin(&sin, sin6p); inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; if ((error = prison_remote_ip4(td->td_ucred, &sin.sin_addr)) != 0) goto out; if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) goto out; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out; #endif error = tp->t_fb->tfb_tcp_output(tp); goto out; } #endif inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; inp->inp_inc.inc_flags |= INC_ISIPV6; if ((error = prison_remote_ip6(td->td_ucred, &sin6p->sin6_addr)) != 0) goto out; if ((error = tcp6_connect(tp, nam, td)) != 0) goto out; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ /* * Initiate disconnect from peer. * If connection never passed embryonic stage, just drop; * else if don't need to let data drain, then can just drop anyways, * else have to begin TCP shutdown process: mark socket disconnecting, * drain unread data, state switch to reflect user close, and * send segment (e.g. FIN) to peer. Socket will be really disconnected * when peer sends FIN and acks ours. * * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. */ static int tcp_usr_disconnect(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; int error = 0; TCPDEBUG0; INP_INFO_RLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) goto out; if (inp->inp_flags & INP_DROPPED) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); tcp_disconnect(tp); out: TCPDEBUG2(PRU_DISCONNECT); TCP_PROBE2(debug__user, tp, PRU_DISCONNECT); INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } #ifdef INET /* * Accept a connection. Essentially all the work is done at higher levels; * just return the address of the peer, storing through addr. */ static int tcp_usr_accept(struct socket *so, struct sockaddr **nam) { int error = 0; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct in_addr addr; in_port_t port = 0; TCPDEBUG0; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* * We inline in_getpeeraddr and COMMON_END here, so that we can * copy the data of interest and defer the malloc until after we * release the lock. */ port = inp->inp_fport; addr = inp->inp_faddr; out: TCPDEBUG2(PRU_ACCEPT); TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); if (error == 0) *nam = in_sockaddr(port, &addr); return error; } #endif /* INET */ #ifdef INET6 static int tcp6_usr_accept(struct socket *so, struct sockaddr **nam) { struct inpcb *inp = NULL; int error = 0; struct tcpcb *tp = NULL; struct in_addr addr; struct in6_addr addr6; in_port_t port = 0; int v4 = 0; TCPDEBUG0; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* * We inline in6_mapped_peeraddr and COMMON_END here, so that we can * copy the data of interest and defer the malloc until after we * release the lock. */ if (inp->inp_vflag & INP_IPV4) { v4 = 1; port = inp->inp_fport; addr = inp->inp_faddr; } else { port = inp->inp_fport; addr6 = inp->in6p_faddr; } out: TCPDEBUG2(PRU_ACCEPT); TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); if (error == 0) { if (v4) *nam = in6_v4mapsin6_sockaddr(port, &addr); else *nam = in6_sockaddr(port, &addr6); } return error; } #endif /* INET6 */ /* * Mark the connection as being incapable of further output. */ static int tcp_usr_shutdown(struct socket *so) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; INP_INFO_RLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); socantsendmore(so); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_SHUTDOWN); TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } /* * After a receive, possibly send window update to peer. */ static int tcp_usr_rcvd(struct socket *so, int flags) { struct inpcb *inp; struct tcpcb *tp = NULL; int error = 0; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); #ifdef TCP_RFC7413 /* * For passively-created TFO connections, don't attempt a window * update while still in SYN_RECEIVED as this may trigger an early * SYN|ACK. It is preferable to have the SYN|ACK be sent along with * application response data, or failing that, when the DELACK timer * expires. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED)) goto out; #endif #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) tcp_offload_rcvd(tp); else #endif tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_RCVD); TCP_PROBE2(debug__user, tp, PRU_RCVD); INP_WUNLOCK(inp); return (error); } /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. Unlike the other * pru_*() routines, the mbuf chains are our responsibility. We * must either enqueue them or free them. The other pru_* routines * generally are caller-frees. */ static int tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; #ifdef INET6 int isipv6; #endif TCPDEBUG0; /* * We require the pcbinfo lock if we will close the socket as part of * this call. */ if (flags & PRUS_EOF) INP_INFO_RLOCK(&V_tcbinfo); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { if (control) m_freem(control); /* * In case of PRUS_NOTREADY, tcp_usr_ready() is responsible * for freeing memory. */ if (m && (flags & PRUS_NOTREADY) == 0) m_freem(m); error = ECONNRESET; goto out; } #ifdef INET6 isipv6 = nam && nam->sa_family == AF_INET6; #endif /* INET6 */ tp = intotcpcb(inp); TCPDEBUG1(); if (control) { /* TCP doesn't do control messages (rights, creds, etc) */ if (control->m_len) { m_freem(control); if (m) m_freem(m); error = EINVAL; goto out; } m_freem(control); /* empty control, just free it */ } if (!(flags & PRUS_OOB)) { sbappendstream(&so->so_snd, m, flags); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg using peer's cached MSS. */ #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET error = tcp_connect(tp, nam, td); #endif if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } if (flags & PRUS_EOF) { /* * Close the send side of the connection after * the data is sent. */ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); socantsendmore(so); tcp_usrclosed(tp); } if (!(inp->inp_flags & INP_DROPPED) && !(flags & PRUS_NOTREADY)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; error = tp->t_fb->tfb_tcp_output(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } } else { /* * XXXRW: PRUS_EOF not implemented with PRUS_OOB? */ SOCKBUF_LOCK(&so->so_snd); if (sbspace(&so->so_snd) < -512) { SOCKBUF_UNLOCK(&so->so_snd); m_freem(m); error = ENOBUFS; goto out; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ sbappendstream_locked(&so->so_snd, m, flags); SOCKBUF_UNLOCK(&so->so_snd); if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg using peer's cached MSS. */ #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET error = tcp_connect(tp, nam, td); #endif if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } tp->snd_up = tp->snd_una + sbavail(&so->so_snd); if (!(flags & PRUS_NOTREADY)) { tp->t_flags |= TF_FORCEDATA; error = tp->t_fb->tfb_tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; } } out: TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); INP_WUNLOCK(inp); if (flags & PRUS_EOF) INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } static int tcp_usr_ready(struct socket *so, struct mbuf *m, int count) { struct inpcb *inp; struct tcpcb *tp; int error; inp = sotoinpcb(so); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); for (int i = 0; i < count; i++) m = m_free(m); return (ECONNRESET); } tp = intotcpcb(inp); SOCKBUF_LOCK(&so->so_snd); error = sbready(&so->so_snd, m, count); SOCKBUF_UNLOCK(&so->so_snd); if (error == 0) error = tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(inp); return (error); } /* * Abort the TCP. Drop the connection abruptly. */ static void tcp_usr_abort(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_abort: inp_socket == NULL")); /* * If we still have full TCP state, and we're not dropped, drop. */ if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); TCPDEBUG1(); tcp_drop(tp, ECONNABORTED); TCPDEBUG2(PRU_ABORT); TCP_PROBE2(debug__user, tp, PRU_ABORT); } if (!(inp->inp_flags & INP_DROPPED)) { SOCK_LOCK(so); so->so_state |= SS_PROTOREF; SOCK_UNLOCK(so); inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); } /* * TCP socket is closed. Start friendly disconnect. */ static void tcp_usr_close(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_close: inp_socket == NULL")); /* * If we still have full TCP state, and we're not dropped, initiate * a disconnect. */ if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); TCPDEBUG1(); tcp_disconnect(tp); TCPDEBUG2(PRU_CLOSE); TCP_PROBE2(debug__user, tp, PRU_CLOSE); } if (!(inp->inp_flags & INP_DROPPED)) { SOCK_LOCK(so); so->so_state |= SS_PROTOREF; SOCK_UNLOCK(so); inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); } /* * Receive out-of-band data. */ static int tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); if ((so->so_oobmark == 0 && (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || tp->t_oobflags & TCPOOB_HADDATA) { error = EINVAL; goto out; } if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { error = EWOULDBLOCK; goto out; } m->m_len = 1; *mtod(m, caddr_t) = tp->t_iobc; if ((flags & MSG_PEEK) == 0) tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); out: TCPDEBUG2(PRU_RCVOOB); TCP_PROBE2(debug__user, tp, PRU_RCVOOB); INP_WUNLOCK(inp); return (error); } #ifdef INET struct pr_usrreqs tcp_usrreqs = { .pru_abort = tcp_usr_abort, .pru_accept = tcp_usr_accept, .pru_attach = tcp_usr_attach, .pru_bind = tcp_usr_bind, .pru_connect = tcp_usr_connect, .pru_control = in_control, .pru_detach = tcp_usr_detach, .pru_disconnect = tcp_usr_disconnect, .pru_listen = tcp_usr_listen, .pru_peeraddr = in_getpeeraddr, .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, .pru_ready = tcp_usr_ready, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = tcp_usr_close, }; #endif /* INET */ #ifdef INET6 struct pr_usrreqs tcp6_usrreqs = { .pru_abort = tcp_usr_abort, .pru_accept = tcp6_usr_accept, .pru_attach = tcp_usr_attach, .pru_bind = tcp6_usr_bind, .pru_connect = tcp6_usr_connect, .pru_control = in6_control, .pru_detach = tcp_usr_detach, .pru_disconnect = tcp_usr_disconnect, .pru_listen = tcp6_usr_listen, .pru_peeraddr = in6_mapped_peeraddr, .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, .pru_ready = tcp_usr_ready, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in6_mapped_sockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = tcp_usr_close, }; #endif /* INET6 */ #ifdef INET /* * Common subroutine to open a TCP connection to remote host specified * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local * port number if needed. Call in_pcbconnect_setup to do the routing and * to choose a local host address (interface). If there is an existing * incarnation of the same connection in TIME-WAIT state and if the remote * host was sending CC options and if the connection duration was < MSL, then * truncate the previous TIME-WAIT state and proceed. * Initialize connection parameters and enter SYN-SENT state. */ static int tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = tp->t_inpcb, *oinp; struct socket *so = inp->inp_socket; struct in_addr laddr; u_short lport; int error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) goto out; } /* * Cannot simply call in_pcbconnect, because there might be an * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. */ laddr = inp->inp_laddr; lport = inp->inp_lport; error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); if (error && oinp == NULL) goto out; if (oinp) { error = EADDRINUSE; goto out; } inp->inp_laddr = laddr; in_pcbrehash(inp); INP_HASH_WUNLOCK(&V_tcbinfo); /* * Compute window scaling to request: * Scale to fit into sweet spot. See tcp_syncache.c. * XXX: This should move to tcp_output(). */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); return 0; out: INP_HASH_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = tp->t_inpcb; int error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) goto out; } error = in6_pcbconnect(inp, nam, td->td_ucred); if (error != 0) goto out; INP_HASH_WUNLOCK(&V_tcbinfo); /* Compute window scaling to request. */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; soisconnecting(inp->inp_socket); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); return 0; out: INP_HASH_WUNLOCK(&V_tcbinfo); return error; } #endif /* INET6 */ /* * Export TCP internal state information via a struct tcp_info, based on the * Linux 2.6 API. Not ABI compatible as our constants are mapped differently * (TCP state machine, etc). We export all information using FreeBSD-native * constants -- for example, the numeric values for tcpi_state will differ * from Linux. */ static void tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) { INP_WLOCK_ASSERT(tp->t_inpcb); bzero(ti, sizeof(*ti)); ti->tcpi_state = tp->t_state; if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tp->t_flags & TF_SACK_PERMIT) ti->tcpi_options |= TCPI_OPT_SACK; if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { ti->tcpi_options |= TCPI_OPT_WSCALE; ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = tp->rcv_scale; } ti->tcpi_rto = tp->t_rxtcur * tick; ti->tcpi_last_data_recv = (long)(ticks - (int)tp->t_rcvtime) * tick; ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT; ti->tcpi_snd_ssthresh = tp->snd_ssthresh; ti->tcpi_snd_cwnd = tp->snd_cwnd; /* * FreeBSD-specific extension fields for tcp_info. */ ti->tcpi_rcv_space = tp->rcv_wnd; ti->tcpi_rcv_nxt = tp->rcv_nxt; ti->tcpi_snd_wnd = tp->snd_wnd; ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */ ti->tcpi_snd_nxt = tp->snd_nxt; ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_maxseg; if (tp->t_flags & TF_TOE) ti->tcpi_options |= TCPI_OPT_TOE; ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; ti->tcpi_rcv_ooopack = tp->t_rcvoopack; ti->tcpi_snd_zerowin = tp->t_sndzerowin; } /* * tcp_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. */ #define INP_WLOCK_RECHECK(inp) do { \ INP_WLOCK(inp); \ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ INP_WUNLOCK(inp); \ return (ECONNRESET); \ } \ tp = intotcpcb(inp); \ } while(0) int tcp_ctloutput(struct socket *so, struct sockopt *sopt) { int error; struct inpcb *inp; struct tcpcb *tp; struct tcp_function_block *blk; struct tcp_function_set fsn; error = 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); INP_WLOCK(inp); if (sopt->sopt_level != IPPROTO_TCP) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); } #endif return (error); } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); /* * Protect the TCP option TCP_FUNCTION_BLK so * that a sub-function can *never* overwrite this. */ if ((sopt->sopt_dir == SOPT_SET) && (sopt->sopt_name == TCP_FUNCTION_BLK)) { INP_WUNLOCK(inp); error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn); if (error) return (error); INP_WLOCK_RECHECK(inp); if (tp->t_state != TCPS_CLOSED) { /* * The user has advanced the state * past the initial point, we can't * switch since we are down the road * and a new set of functions may * not be compatibile. */ INP_WUNLOCK(inp); return(EINVAL); } blk = find_and_ref_tcp_functions(&fsn); if (blk == NULL) { INP_WUNLOCK(inp); return (ENOENT); } if (tp->t_fb != blk) { if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return (ENOENT); } /* * Release the old refcnt, the * lookup acquires a ref on the * new one. */ if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_fb = blk; if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } } #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, sopt->sopt_name); } #endif INP_WUNLOCK(inp); return (error); } else if ((sopt->sopt_dir == SOPT_GET) && (sopt->sopt_name == TCP_FUNCTION_BLK)) { strcpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name); fsn.pcbcnt = tp->t_fb->tfb_refcnt; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &fsn, sizeof fsn); return (error); } /* Pass in the INP locked, called must unlock it */ return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp)); } int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) { int error, opt, optval; u_int ui; struct tcp_info ti; struct cc_algo *algo; char *buf; - + + /* + * For TCP_CCALGOOPT forward the control to CC module, for both + * SOPT_SET and SOPT_GET. + */ + switch (sopt->sopt_name) { + case TCP_CCALGOOPT: + INP_WUNLOCK(inp); + buf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO); + error = sooptcopyin(sopt, buf, sopt->sopt_valsize, + sopt->sopt_valsize); + if (error) { + free(buf, M_TEMP); + return (error); + } + INP_WLOCK_RECHECK(inp); + if (CC_ALGO(tp)->ctl_output != NULL) + error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, buf); + else + error = ENOENT; + INP_WUNLOCK(inp); + if (error == 0 && sopt->sopt_dir == SOPT_GET) + error = sooptcopyout(sopt, buf, sopt->sopt_valsize); + free(buf, M_TEMP); + return (error); + } + switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { #ifdef TCP_SIGNATURE case TCP_MD5SIG: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval > 0) tp->t_flags |= TF_SIGNATURE; else tp->t_flags &= ~TF_SIGNATURE; goto unlock_and_done; #endif /* TCP_SIGNATURE */ case TCP_NODELAY: case TCP_NOOPT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { case TCP_NODELAY: opt = TF_NODELAY; break; case TCP_NOOPT: opt = TF_NOOPT; break; default: opt = 0; /* dead code to fool gcc */ break; } if (optval) tp->t_flags |= opt; else tp->t_flags &= ~opt; unlock_and_done: #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, sopt->sopt_name); } #endif INP_WUNLOCK(inp); break; case TCP_NOPUSH: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval) tp->t_flags |= TF_NOPUSH; else if (tp->t_flags & TF_NOPUSH) { tp->t_flags &= ~TF_NOPUSH; if (TCPS_HAVEESTABLISHED(tp->t_state)) error = tp->t_fb->tfb_tcp_output(tp); } goto unlock_and_done; case TCP_MAXSEG: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval > 0 && optval <= tp->t_maxseg && optval + 40 >= V_tcp_minmss) tp->t_maxseg = optval; else error = EINVAL; goto unlock_and_done; case TCP_INFO: INP_WUNLOCK(inp); error = EINVAL; break; case TCP_CONGESTION: INP_WUNLOCK(inp); buf = malloc(TCP_CA_NAME_MAX, M_TEMP, M_WAITOK|M_ZERO); error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX, 1); if (error) { free(buf, M_TEMP); break; } CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) if (strncmp(buf, algo->name, TCP_CA_NAME_MAX) == 0) break; CC_LIST_RUNLOCK(); free(buf, M_TEMP); if (algo == NULL) { error = EINVAL; break; } INP_WLOCK_RECHECK(inp); /* * We hold a write lock over the tcb so it's safe to * do these things without ordering concerns. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); CC_ALGO(tp) = algo; /* * If something goes pear shaped initialising the new * algo, fall back to newreno (which does not * require initialisation). */ if (algo->cb_init != NULL && algo->cb_init(tp->ccv) != 0) { CC_ALGO(tp) = &newreno_cc_algo; /* * The only reason init should fail is * because of malloc. */ error = ENOMEM; } INP_WUNLOCK(inp); break; case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); if (ui > (UINT_MAX / hz)) { error = EINVAL; break; } ui *= hz; INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { case TCP_KEEPIDLE: tp->t_keepidle = ui; /* * XXX: better check current remaining * timeout and "merge" it with new value. */ if ((tp->t_state > TCPS_LISTEN) && (tp->t_state <= TCPS_CLOSING)) tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); break; case TCP_KEEPINTVL: tp->t_keepintvl = ui; if ((tp->t_state == TCPS_FIN_WAIT_2) && (TP_MAXIDLE(tp) > 0)) tcp_timer_activate(tp, TT_2MSL, TP_MAXIDLE(tp)); break; case TCP_KEEPINIT: tp->t_keepinit = ui; if (tp->t_state == TCPS_SYN_RECEIVED || tp->t_state == TCPS_SYN_SENT) tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); break; } goto unlock_and_done; case TCP_KEEPCNT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); INP_WLOCK_RECHECK(inp); tp->t_keepcnt = ui; if ((tp->t_state == TCPS_FIN_WAIT_2) && (TP_MAXIDLE(tp) > 0)) tcp_timer_activate(tp, TT_2MSL, TP_MAXIDLE(tp)); goto unlock_and_done; #ifdef TCPPCAP case TCP_PCAP_OUT: case TCP_PCAP_IN: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval >= 0) tcp_pcap_set_sock_max(TCP_PCAP_OUT ? &(tp->t_outpkts) : &(tp->t_inpkts), optval); else error = EINVAL; goto unlock_and_done; #endif #ifdef TCP_RFC7413 case TCP_FASTOPEN: INP_WUNLOCK(inp); if (!V_tcp_fastopen_enabled) return (EPERM); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval) { tp->t_flags |= TF_FASTOPEN; if ((tp->t_state == TCPS_LISTEN) && (tp->t_tfo_pending == NULL)) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); } else tp->t_flags &= ~TF_FASTOPEN; goto unlock_and_done; #endif default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: tp = intotcpcb(inp); switch (sopt->sopt_name) { #ifdef TCP_SIGNATURE case TCP_MD5SIG: optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case TCP_NODELAY: optval = tp->t_flags & TF_NODELAY; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_MAXSEG: optval = tp->t_maxseg; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOOPT: optval = tp->t_flags & TF_NOOPT; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOPUSH: optval = tp->t_flags & TF_NOPUSH; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_INFO: tcp_fill_info(tp, &ti); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ti, sizeof ti); break; case TCP_CONGESTION: INP_WUNLOCK(inp); error = sooptcopyout(sopt, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); break; case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: case TCP_KEEPCNT: switch (sopt->sopt_name) { case TCP_KEEPIDLE: ui = tp->t_keepidle / hz; break; case TCP_KEEPINTVL: ui = tp->t_keepintvl / hz; break; case TCP_KEEPINIT: ui = tp->t_keepinit / hz; break; case TCP_KEEPCNT: ui = tp->t_keepcnt; break; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ui, sizeof(ui)); break; #ifdef TCPPCAP case TCP_PCAP_OUT: case TCP_PCAP_IN: optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ? &(tp->t_outpkts) : &(tp->t_inpkts)); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif #ifdef TCP_RFC7413 case TCP_FASTOPEN: optval = tp->t_flags & TF_FASTOPEN; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #undef INP_WLOCK_RECHECK /* * Attach TCP protocol to socket, allocating * internet protocol control block, tcp control block, * bufer space, and entering LISTEN state if to accept connections. */ static int tcp_attach(struct socket *so) { struct tcpcb *tp; struct inpcb *inp; int error; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace); if (error) return (error); } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; INP_INFO_RLOCK(&V_tcbinfo); error = in_pcballoc(so, &V_tcbinfo); if (error) { INP_INFO_RUNLOCK(&V_tcbinfo); return (error); } inp = sotoinpcb(so); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { inp->inp_vflag |= INP_IPV6; inp->in6p_hops = -1; /* use kernel default */ } else #endif inp->inp_vflag |= INP_IPV4; tp = tcp_newtcpcb(inp); if (tp == NULL) { in_pcbdetach(inp); in_pcbfree(inp); INP_INFO_RUNLOCK(&V_tcbinfo); return (ENOBUFS); } tp->t_state = TCPS_CLOSED; INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); return (0); } /* * Initiate (or continue) disconnect. * If embryonic state, just send reset (once). * If in ``let data drain'' option and linger null, just drop. * Otherwise (hard), mark socket disconnecting and drop * current input data; switch states based on user close, and * send segment to peer (with FIN). */ static void tcp_disconnect(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* * Neither tcp_close() nor tcp_drop() should return NULL, as the * socket is still open. */ if (tp->t_state < TCPS_ESTABLISHED) { tp = tcp_close(tp); KASSERT(tp != NULL, ("tcp_disconnect: tcp_close() returned NULL")); } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { tp = tcp_drop(tp, 0); KASSERT(tp != NULL, ("tcp_disconnect: tcp_drop() returned NULL")); } else { soisdisconnecting(so); sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) tp->t_fb->tfb_tcp_output(tp); } } /* * User issued close, and wish to trail through shutdown states: * if never received SYN, just forget it. If got a SYN from peer, * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. * If already got a FIN from peer, then almost done; go to LAST_ACK * state. In all other cases, have already sent FIN to peer (e.g. * after PRU_SHUTDOWN), and just have to play tedious game waiting * for peer to send FIN or not respond to keep-alives, etc. * We can let the user exit from the close as soon as the FIN is acked. */ static void tcp_usrclosed(struct tcpcb *tp) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); switch (tp->t_state) { case TCPS_LISTEN: #ifdef TCP_OFFLOAD tcp_offload_listen_stop(tp); #endif tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ case TCPS_CLOSED: tp = tcp_close(tp); /* * tcp_close() should never return NULL here as the socket is * still open. */ KASSERT(tp != NULL, ("tcp_usrclosed: tcp_close() returned NULL")); break; case TCPS_SYN_SENT: case TCPS_SYN_RECEIVED: tp->t_flags |= TF_NEEDFIN; break; case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_FIN_WAIT_1); break; case TCPS_CLOSE_WAIT: tcp_state_change(tp, TCPS_LAST_ACK); break; } if (tp->t_state >= TCPS_FIN_WAIT_2) { soisdisconnected(tp->t_inpcb->inp_socket); /* Prevent the connection hanging in FIN_WAIT_2 forever. */ if (tp->t_state == TCPS_FIN_WAIT_2) { int timeout; timeout = (tcp_fast_finwait2_recycle) ? tcp_finwait2_timeout : TP_MAXIDLE(tp); tcp_timer_activate(tp, TT_2MSL, timeout); } } } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_tstate(int t_state) { switch (t_state) { case TCPS_CLOSED: db_printf("TCPS_CLOSED"); return; case TCPS_LISTEN: db_printf("TCPS_LISTEN"); return; case TCPS_SYN_SENT: db_printf("TCPS_SYN_SENT"); return; case TCPS_SYN_RECEIVED: db_printf("TCPS_SYN_RECEIVED"); return; case TCPS_ESTABLISHED: db_printf("TCPS_ESTABLISHED"); return; case TCPS_CLOSE_WAIT: db_printf("TCPS_CLOSE_WAIT"); return; case TCPS_FIN_WAIT_1: db_printf("TCPS_FIN_WAIT_1"); return; case TCPS_CLOSING: db_printf("TCPS_CLOSING"); return; case TCPS_LAST_ACK: db_printf("TCPS_LAST_ACK"); return; case TCPS_FIN_WAIT_2: db_printf("TCPS_FIN_WAIT_2"); return; case TCPS_TIME_WAIT: db_printf("TCPS_TIME_WAIT"); return; default: db_printf("unknown"); return; } } static void db_print_tflags(u_int t_flags) { int comma; comma = 0; if (t_flags & TF_ACKNOW) { db_printf("%sTF_ACKNOW", comma ? ", " : ""); comma = 1; } if (t_flags & TF_DELACK) { db_printf("%sTF_DELACK", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NODELAY) { db_printf("%sTF_NODELAY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOOPT) { db_printf("%sTF_NOOPT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SENTFIN) { db_printf("%sTF_SENTFIN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_REQ_SCALE) { db_printf("%sTF_REQ_SCALE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RCVD_SCALE) { db_printf("%sTF_RECVD_SCALE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_REQ_TSTMP) { db_printf("%sTF_REQ_TSTMP", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RCVD_TSTMP) { db_printf("%sTF_RCVD_TSTMP", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SACK_PERMIT) { db_printf("%sTF_SACK_PERMIT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NEEDSYN) { db_printf("%sTF_NEEDSYN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NEEDFIN) { db_printf("%sTF_NEEDFIN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOPUSH) { db_printf("%sTF_NOPUSH", comma ? ", " : ""); comma = 1; } if (t_flags & TF_MORETOCOME) { db_printf("%sTF_MORETOCOME", comma ? ", " : ""); comma = 1; } if (t_flags & TF_LQ_OVERFLOW) { db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : ""); comma = 1; } if (t_flags & TF_LASTIDLE) { db_printf("%sTF_LASTIDLE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RXWIN0SENT) { db_printf("%sTF_RXWIN0SENT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FASTRECOVERY) { db_printf("%sTF_FASTRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_CONGRECOVERY) { db_printf("%sTF_CONGRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_WASFRECOVERY) { db_printf("%sTF_WASFRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SIGNATURE) { db_printf("%sTF_SIGNATURE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FORCEDATA) { db_printf("%sTF_FORCEDATA", comma ? ", " : ""); comma = 1; } if (t_flags & TF_TSO) { db_printf("%sTF_TSO", comma ? ", " : ""); comma = 1; } if (t_flags & TF_ECN_PERMIT) { db_printf("%sTF_ECN_PERMIT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FASTOPEN) { db_printf("%sTF_FASTOPEN", comma ? ", " : ""); comma = 1; } } static void db_print_toobflags(char t_oobflags) { int comma; comma = 0; if (t_oobflags & TCPOOB_HAVEDATA) { db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : ""); comma = 1; } if (t_oobflags & TCPOOB_HADDATA) { db_printf("%sTCPOOB_HADDATA", comma ? ", " : ""); comma = 1; } } static void db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, tp); indent += 2; db_print_indent(indent); db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); db_print_indent(indent); db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep); db_print_indent(indent); db_printf("tt_2msl: %p tt_delack: %p t_inpcb: %p\n", &tp->t_timers->tt_2msl, &tp->t_timers->tt_delack, tp->t_inpcb); db_print_indent(indent); db_printf("t_state: %d (", tp->t_state); db_print_tstate(tp->t_state); db_printf(")\n"); db_print_indent(indent); db_printf("t_flags: 0x%x (", tp->t_flags); db_print_tflags(tp->t_flags); db_printf(")\n"); db_print_indent(indent); db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: x0%08x\n", tp->snd_una, tp->snd_max, tp->snd_nxt); db_print_indent(indent); db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n", tp->snd_up, tp->snd_wl1, tp->snd_wl2); db_print_indent(indent); db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n", tp->iss, tp->irs, tp->rcv_nxt); db_print_indent(indent); db_printf("rcv_adv: 0x%08x rcv_wnd: %lu rcv_up: 0x%08x\n", tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); db_print_indent(indent); db_printf("snd_wnd: %lu snd_cwnd: %lu\n", tp->snd_wnd, tp->snd_cwnd); db_print_indent(indent); db_printf("snd_ssthresh: %lu snd_recover: " "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); db_printf("t_rcvtime: %u t_startime: %u\n", tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); db_printf("t_rttime: %u t_rtsq: 0x%08x\n", tp->t_rtttime, tp->t_rtseq); db_print_indent(indent); db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n", tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); db_print_indent(indent); db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin, tp->t_rttbest); db_print_indent(indent); db_printf("t_rttupdated: %lu max_sndwnd: %lu t_softerror: %d\n", tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror); db_print_indent(indent); db_printf("t_oobflags: 0x%x (", tp->t_oobflags); db_print_toobflags(tp->t_oobflags); db_printf(") t_iobc: 0x%02x\n", tp->t_iobc); db_print_indent(indent); db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n", tp->snd_scale, tp->rcv_scale, tp->request_r_scale); db_print_indent(indent); db_printf("ts_recent: %u ts_recent_age: %u\n", tp->ts_recent, tp->ts_recent_age); db_print_indent(indent); db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: " "%lu\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev); db_print_indent(indent); db_printf("snd_ssthresh_prev: %lu snd_recover_prev: 0x%08x " "t_badrxtwin: %u\n", tp->snd_ssthresh_prev, tp->snd_recover_prev, tp->t_badrxtwin); db_print_indent(indent); db_printf("snd_numholes: %d snd_holes first: %p\n", tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes)); db_print_indent(indent); db_printf("snd_fack: 0x%08x rcv_numsacks: %d sack_newdata: " "0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata); /* Skip sackblks, sackhint. */ db_print_indent(indent); db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n", tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt); } DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) { struct tcpcb *tp; if (!have_addr) { db_printf("usage: show tcpcb \n"); return; } tp = (struct tcpcb *)addr; db_print_tcpcb(tp, "tcpcb", 0); } #endif