Index: head/lib/libc/sys/getsockopt.2 =================================================================== --- head/lib/libc/sys/getsockopt.2 (revision 333698) +++ head/lib/libc/sys/getsockopt.2 (revision 333699) @@ -1,576 +1,578 @@ .\" Copyright (c) 1983, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)getsockopt.2 8.4 (Berkeley) 5/2/95 .\" $FreeBSD$ .\" -.Dd January 18, 2017 +.Dd May 9, 2018 .Dt GETSOCKOPT 2 .Os .Sh NAME .Nm getsockopt , .Nm setsockopt .Nd get and set options on sockets .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In sys/types.h .In sys/socket.h .Ft int .Fn getsockopt "int s" "int level" "int optname" "void * restrict optval" "socklen_t * restrict optlen" .Ft int .Fn setsockopt "int s" "int level" "int optname" "const void *optval" "socklen_t optlen" .Sh DESCRIPTION The .Fn getsockopt and .Fn setsockopt system calls manipulate the .Em options associated with a socket. Options may exist at multiple protocol levels; they are always present at the uppermost .Dq socket level. .Pp When manipulating socket options the level at which the option resides and the name of the option must be specified. To manipulate options at the socket level, .Fa level is specified as .Dv SOL_SOCKET . To manipulate options at any other level the protocol number of the appropriate protocol controlling the option is supplied. For example, to indicate that an option is to be interpreted by the .Tn TCP protocol, .Fa level should be set to the protocol number of .Tn TCP ; see .Xr getprotoent 3 . .Pp The .Fa optval and .Fa optlen arguments are used to access option values for .Fn setsockopt . For .Fn getsockopt they identify a buffer in which the value for the requested option(s) are to be returned. For .Fn getsockopt , .Fa optlen is a value-result argument, initially containing the size of the buffer pointed to by .Fa optval , and modified on return to indicate the actual size of the value returned. If no option value is to be supplied or returned, .Fa optval may be NULL. .Pp The .Fa optname argument and any specified options are passed uninterpreted to the appropriate protocol module for interpretation. The include file .In sys/socket.h contains definitions for socket level options, described below. Options at other protocol levels vary in format and name; consult the appropriate entries in section 4 of the manual. .Pp Most socket-level options utilize an .Vt int argument for .Fa optval . For .Fn setsockopt , the argument should be non-zero to enable a boolean option, or zero if the option is to be disabled. .Dv SO_LINGER uses a .Vt "struct linger" argument, defined in .In sys/socket.h , which specifies the desired state of the option and the linger interval (see below). .Dv SO_SNDTIMEO and .Dv SO_RCVTIMEO use a .Vt "struct timeval" argument, defined in .In sys/time.h . .Pp The following options are recognized at the socket level. For protocol-specific options, see protocol manual pages, e.g. .Xr ip 4 or .Xr tcp 4 . Except as noted, each may be examined with .Fn getsockopt and set with .Fn setsockopt . .Bl -column SO_ACCEPTFILTER -offset indent .It Dv SO_DEBUG Ta "enables recording of debugging information" .It Dv SO_REUSEADDR Ta "enables local address reuse" .It Dv SO_REUSEPORT Ta "enables duplicate address and port bindings" .It Dv SO_KEEPALIVE Ta "enables keep connections alive" .It Dv SO_DONTROUTE Ta "enables routing bypass for outgoing messages" .It Dv SO_LINGER Ta "linger on close if data present" .It Dv SO_BROADCAST Ta "enables permission to transmit broadcast messages" .It Dv SO_OOBINLINE Ta "enables reception of out-of-band data in band" .It Dv SO_SNDBUF Ta "set buffer size for output" .It Dv SO_RCVBUF Ta "set buffer size for input" .It Dv SO_SNDLOWAT Ta "set minimum count for output" .It Dv SO_RCVLOWAT Ta "set minimum count for input" .It Dv SO_SNDTIMEO Ta "set timeout value for output" .It Dv SO_RCVTIMEO Ta "set timeout value for input" .It Dv SO_ACCEPTFILTER Ta "set accept filter on listening socket" .It Dv SO_NOSIGPIPE Ta controls generation of .Dv SIGPIPE for the socket .It Dv SO_TIMESTAMP Ta "enables reception of a timestamp with datagrams" .It Dv SO_BINTIME Ta "enables reception of a timestamp with datagrams" .It Dv SO_ACCEPTCONN Ta "get listening status of the socket (get only)" .It Dv SO_TYPE Ta "get the type of the socket (get only)" .It Dv SO_PROTOCOL Ta "get the protocol number for the socket (get only)" .It Dv SO_PROTOTYPE Ta "SunOS alias for the Linux SO_PROTOCOL (get only)" .It Dv SO_ERROR Ta "get and clear error on the socket (get only)" .It Dv SO_SETFIB Ta "set the associated FIB (routing table) for the socket (set only)" .El .Pp The following options are recognized in .Fx : .Bl -column SO_LISTENINCQLEN -offset indent .It Dv SO_LABEL Ta "get MAC label of the socket (get only)" .It Dv SO_PEERLABEL Ta "get socket's peer's MAC label (get only)" .It Dv SO_LISTENQLIMIT Ta "get backlog limit of the socket (get only)" .It Dv SO_LISTENQLEN Ta "get complete queue length of the socket (get only)" .It Dv SO_LISTENINCQLEN Ta "get incomplete queue length of the socket (get only)" .It Dv SO_USER_COOKIE Ta "set the 'so_user_cookie' value for the socket (uint32_t, set only)" .It Dv SO_TS_CLOCK Ta "set specific format of timestamp returned by SO_TIMESTAMP" .It Dv SO_MAX_PACING_RATE Ta "set the maximum transmit rate in bytes per second for the socket" .El .Pp .Dv SO_DEBUG enables debugging in the underlying protocol modules. .Pp .Dv SO_REUSEADDR indicates that the rules used in validating addresses supplied in a .Xr bind 2 system call should allow reuse of local addresses. .Pp .Dv SO_REUSEPORT allows completely duplicate bindings by multiple processes if they all set .Dv SO_REUSEPORT before binding the port. This option permits multiple instances of a program to each receive UDP/IP multicast or broadcast datagrams destined for the bound port. .Pp .Dv SO_KEEPALIVE enables the periodic transmission of messages on a connected socket. Should the connected party fail to respond to these messages, the connection is considered broken and processes using the socket are notified via a .Dv SIGPIPE signal when attempting to send data. .Pp .Dv SO_DONTROUTE indicates that outgoing messages should bypass the standard routing facilities. Instead, messages are directed to the appropriate network interface according to the network portion of the destination address. .Pp .Dv SO_LINGER controls the action taken when unsent messages are queued on socket and a .Xr close 2 is performed. If the socket promises reliable delivery of data and .Dv SO_LINGER is set, the system will block the process on the .Xr close 2 attempt until it is able to transmit the data or until it decides it is unable to deliver the information (a timeout period, termed the linger interval, is specified in seconds in the .Fn setsockopt system call when .Dv SO_LINGER is requested). If .Dv SO_LINGER is disabled and a .Xr close 2 is issued, the system will process the close in a manner that allows the process to continue as quickly as possible. .Pp The option .Dv SO_BROADCAST requests permission to send broadcast datagrams on the socket. Broadcast was a privileged operation in earlier versions of the system. .Pp With protocols that support out-of-band data, the .Dv SO_OOBINLINE option requests that out-of-band data be placed in the normal data input queue as received; it will then be accessible with .Xr recv 2 or .Xr read 2 calls without the .Dv MSG_OOB flag. Some protocols always behave as if this option is set. .Pp .Dv SO_SNDBUF and .Dv SO_RCVBUF are options to adjust the normal buffer sizes allocated for output and input buffers, respectively. The buffer size may be increased for high-volume connections, or may be decreased to limit the possible backlog of incoming data. The system places an absolute maximum on these values, which is accessible through the .Xr sysctl 3 MIB variable .Dq Li kern.ipc.maxsockbuf . .Pp .Dv SO_SNDLOWAT is an option to set the minimum count for output operations. Most output operations process all of the data supplied by the call, delivering data to the protocol for transmission and blocking as necessary for flow control. Nonblocking output operations will process as much data as permitted subject to flow control without blocking, but will process no data if flow control does not allow the smaller of the low water mark value or the entire request to be processed. A .Xr select 2 operation testing the ability to write to a socket will return true only if the low water mark amount could be processed. The default value for .Dv SO_SNDLOWAT is set to a convenient size for network efficiency, often 1024. .Pp .Dv SO_RCVLOWAT is an option to set the minimum count for input operations. In general, receive calls will block until any (non-zero) amount of data is received, then return with the smaller of the amount available or the amount requested. The default value for .Dv SO_RCVLOWAT is 1. If .Dv SO_RCVLOWAT is set to a larger value, blocking receive calls normally wait until they have received the smaller of the low water mark value or the requested amount. Receive calls may still return less than the low water mark if an error occurs, a signal is caught, or the type of data next in the receive queue is different from that which was returned. .Pp .Dv SO_SNDTIMEO is an option to set a timeout value for output operations. It accepts a .Vt "struct timeval" argument with the number of seconds and microseconds used to limit waits for output operations to complete. If a send operation has blocked for this much time, it returns with a partial count or with the error .Er EWOULDBLOCK if no data were sent. In the current implementation, this timer is restarted each time additional data are delivered to the protocol, implying that the limit applies to output portions ranging in size from the low water mark to the high water mark for output. .Pp .Dv SO_RCVTIMEO is an option to set a timeout value for input operations. It accepts a .Vt "struct timeval" argument with the number of seconds and microseconds used to limit waits for input operations to complete. In the current implementation, this timer is restarted each time additional data are received by the protocol, and thus the limit is in effect an inactivity timer. If a receive operation has been blocked for this much time without receiving additional data, it returns with a short count or with the error .Er EWOULDBLOCK if no data were received. .Pp .Dv SO_SETFIB can be used to over-ride the default FIB (routing table) for the given socket. The value must be from 0 to one less than the number returned from the sysctl .Em net.fibs . .Pp .Dv SO_USER_COOKIE can be used to set the uint32_t so_user_cookie field in the socket. The value is an uint32_t, and can be used in the kernel code that manipulates traffic related to the socket. The default value for the field is 0. As an example, the value can be used as the skipto target or pipe number in .Nm ipfw/dummynet . .Pp .Dv SO_ACCEPTFILTER places an .Xr accept_filter 9 on the socket, which will filter incoming connections on a listening stream socket before being presented for .Xr accept 2 . Once more, .Xr listen 2 must be called on the socket before trying to install the filter on it, or else the .Fn setsockopt system call will fail. .Bd -literal struct accept_filter_arg { char af_name[16]; char af_arg[256-16]; }; .Ed .Pp The .Fa optval argument should point to a .Fa struct accept_filter_arg that will select and configure the .Xr accept_filter 9 . The .Fa af_name argument should be filled with the name of the accept filter that the application wishes to place on the listening socket. The optional argument .Fa af_arg can be passed to the accept filter specified by .Fa af_name to provide additional configuration options at attach time. Passing in an .Fa optval of NULL will remove the filter. .Pp The .Dv SO_NOSIGPIPE option controls generation of the .Dv SIGPIPE signal normally sent when writing to a connected socket where the other end has been closed returns with the error .Er EPIPE . .Pp If the .Dv SO_TIMESTAMP or .Dv SO_BINTIME option is enabled on a .Dv SOCK_DGRAM socket, the .Xr recvmsg 2 call will return a timestamp corresponding to when the datagram was received. The .Va msg_control field in the .Vt msghdr structure points to a buffer that contains a .Vt cmsghdr structure followed by a .Vt "struct timeval" for .Dv SO_TIMESTAMP and .Vt "struct bintime" for .Dv SO_BINTIME . The .Vt cmsghdr fields have the following values for TIMESTAMP by default: .Bd -literal cmsg_len = CMSG_LEN(sizeof(struct timeval)); cmsg_level = SOL_SOCKET; cmsg_type = SCM_TIMESTAMP; .Ed .Pp and for .Dv SO_BINTIME : .Bd -literal cmsg_len = CMSG_LEN(sizeof(struct bintime)); cmsg_level = SOL_SOCKET; cmsg_type = SCM_BINTIME; .Ed .Pp Additional timestamp types are available by following .Dv SO_TIMESTAMP with .Dv SO_TS_CLOCK , which requests a specific timestamp format to be returned instead of .Dv SCM_TIMESTAMP when .Dv SO_TIMESTAMP is enabled. These .Dv SO_TS_CLOCK values are recognized in .Fx : .Bl -column SO_TS_CLOCK -offset indent .It Dv SO_TS_REALTIME_MICRO Ta "realtime (SCM_TIMESTAMP, struct timeval), default" .It Dv SO_TS_BINTIME Ta "realtime (SCM_BINTIME, struct bintime)" .It Dv SO_TS_REALTIME Ta "realtime (SCM_REALTIME, struct timespec)" .It Dv SO_TS_MONOTONIC Ta "monotonic time (SCM_MONOTONIC, struct timespec)" .El .Pp .Dv SO_ACCEPTCONN , .Dv SO_TYPE , .Dv SO_PROTOCOL (and its alias .Dv SO_PROTOTYPE ) and .Dv SO_ERROR are options used only with .Fn getsockopt . .Dv SO_ACCEPTCONN returns whether the socket is currently accepting connections, that is, whether or not the .Xr listen 2 system call was invoked on the socket. .Dv SO_TYPE returns the type of the socket, such as .Dv SOCK_STREAM ; it is useful for servers that inherit sockets on startup. .Dv SO_PROTOCOL returns the protocol number for the socket, for .Dv AF_INET and .Dv AF_INET6 address families. .Dv SO_ERROR returns any pending error on the socket and clears the error status. It may be used to check for asynchronous errors on connected datagram sockets or for other asynchronous errors. .Pp Finally, .Dv SO_LABEL returns the MAC label of the socket. .Dv SO_PEERLABEL returns the MAC label of the socket's peer. Note that your kernel must be compiled with MAC support. See .Xr mac 3 for more information. .Dv SO_LISTENQLIMIT returns the maximal number of queued connections, as set by .Xr listen 2 . .Dv SO_LISTENQLEN returns the number of unaccepted complete connections. .Dv SO_LISTENINCQLEN returns the number of unaccepted incomplete connections. .Pp .Dv SO_MAX_PACING_RATE instruct the socket and underlying network adapter layers to limit the transfer rate to the given unsigned 32-bit value in bytes per second. .Sh RETURN VALUES .Rv -std .Sh ERRORS The call succeeds unless: .Bl -tag -width Er .It Bq Er EBADF The argument .Fa s is not a valid descriptor. .It Bq Er ENOTSOCK The argument .Fa s is a file, not a socket. .It Bq Er ENOPROTOOPT The option is unknown at the level indicated. .It Bq Er EFAULT The address pointed to by .Fa optval is not in a valid part of the process address space. For .Fn getsockopt , this error may also be returned if .Fa optlen is not in a valid part of the process address space. .It Bq Er EINVAL Installing an .Xr accept_filter 9 on a non-listening socket was attempted. +.It Bq Er ENOMEM +A memory allocation failed that was required to service the request. .El .Sh SEE ALSO .Xr ioctl 2 , .Xr listen 2 , .Xr recvmsg 2 , .Xr socket 2 , .Xr getprotoent 3 , .Xr mac 3 , .Xr sysctl 3 , .Xr ip 4 , .Xr ip6 4 , .Xr sctp 4 , .Xr tcp 4 , .Xr protocols 5 , .Xr sysctl 8 , .Xr accept_filter 9 , .Xr bintime 9 .Sh HISTORY The .Fn getsockopt and .Fn setsockopt system calls appeared in .Bx 4.2 . .Sh BUGS Several of the socket options should be handled at lower levels of the system. Index: head/share/man/man4/tcp.4 =================================================================== --- head/share/man/man4/tcp.4 (revision 333698) +++ head/share/man/man4/tcp.4 (revision 333699) @@ -1,676 +1,676 @@ .\" Copyright (c) 1983, 1991, 1993 .\" The Regents of the University of California. .\" Copyright (c) 2010-2011 The FreeBSD Foundation .\" All rights reserved. .\" .\" Portions of this documentation were written at the Centre for Advanced .\" Internet Architectures, Swinburne University of Technology, Melbourne, .\" Australia by David Hayes under sponsorship from the FreeBSD Foundation. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd February 6, 2017 +.Dd May 9, 2018 .Dt TCP 4 .Os .Sh NAME .Nm tcp .Nd Internet Transmission Control Protocol .Sh SYNOPSIS .In sys/types.h .In sys/socket.h .In netinet/in.h .In netinet/tcp.h .Ft int .Fn socket AF_INET SOCK_STREAM 0 .Sh DESCRIPTION The .Tn TCP protocol provides reliable, flow-controlled, two-way transmission of data. It is a byte-stream protocol used to support the .Dv SOCK_STREAM abstraction. .Tn TCP uses the standard Internet address format and, in addition, provides a per-host collection of .Dq "port addresses" . Thus, each address is composed of an Internet address specifying the host and network, with a specific .Tn TCP port on the host identifying the peer entity. .Pp Sockets utilizing the .Tn TCP protocol are either .Dq active or .Dq passive . Active sockets initiate connections to passive sockets. By default, .Tn TCP sockets are created active; to create a passive socket, the .Xr listen 2 system call must be used after binding the socket with the .Xr bind 2 system call. Only passive sockets may use the .Xr accept 2 call to accept incoming connections. Only active sockets may use the .Xr connect 2 call to initiate connections. .Pp Passive sockets may .Dq underspecify their location to match incoming connection requests from multiple networks. This technique, termed .Dq "wildcard addressing" , allows a single server to provide service to clients on multiple networks. To create a socket which listens on all networks, the Internet address .Dv INADDR_ANY must be bound. The .Tn TCP port may still be specified at this time; if the port is not specified, the system will assign one. Once a connection has been established, the socket's address is fixed by the peer entity's location. The address assigned to the socket is the address associated with the network interface through which packets are being transmitted and received. Normally, this address corresponds to the peer entity's network. .Pp .Tn TCP supports a number of socket options which can be set with .Xr setsockopt 2 and tested with .Xr getsockopt 2 : .Bl -tag -width ".Dv TCP_FUNCTION_BLK" .It Dv TCP_INFO Information about a socket's underlying TCP session may be retrieved by passing the read-only option .Dv TCP_INFO to .Xr getsockopt 2 . It accepts a single argument: a pointer to an instance of .Vt "struct tcp_info" . .Pp This API is subject to change; consult the source to determine which fields are currently filled out by this option. .Fx specific additions include send window size, receive window size, and bandwidth-controlled window space. .It Dv TCP_CCALGOOPT Set or query congestion control algorithm specific parameters. See .Xr mod_cc 4 for details. .It Dv TCP_CONGESTION Select or query the congestion control algorithm that TCP will use for the connection. See .Xr mod_cc 4 for details. .It Dv TCP_FUNCTION_BLK Select or query the set of functions that TCP will use for this connection. This allows a user to select an alternate TCP stack. The alternate TCP stack must already be loaded in the kernel. To list the available TCP stacks, see .Va functions_available in the .Sx MIB Variables section further down. To list the default TCP stack, see .Va functions_default in the .Sx MIB Variables section. .It Dv TCP_KEEPINIT This .Xr setsockopt 2 option accepts a per-socket timeout argument of .Vt "u_int" in seconds, for new, non-established .Tn TCP connections. For the global default in milliseconds see .Va keepinit in the .Sx MIB Variables section further down. .It Dv TCP_KEEPIDLE This .Xr setsockopt 2 option accepts an argument of .Vt "u_int" for the amount of time, in seconds, that the connection must be idle before keepalive probes (if enabled) are sent for the connection of this socket. If set on a listening socket, the value is inherited by the newly created socket upon .Xr accept 2 . For the global default in milliseconds see .Va keepidle in the .Sx MIB Variables section further down. .It Dv TCP_KEEPINTVL This .Xr setsockopt 2 option accepts an argument of .Vt "u_int" to set the per-socket interval, in seconds, between keepalive probes sent to a peer. If set on a listening socket, the value is inherited by the newly created socket upon .Xr accept 2 . For the global default in milliseconds see .Va keepintvl in the .Sx MIB Variables section further down. .It Dv TCP_KEEPCNT This .Xr setsockopt 2 option accepts an argument of .Vt "u_int" and allows a per-socket tuning of the number of probes sent, with no response, before the connection will be dropped. If set on a listening socket, the value is inherited by the newly created socket upon .Xr accept 2 . For the global default see the .Va keepcnt in the .Sx MIB Variables section further down. .It Dv TCP_NODELAY Under most circumstances, .Tn TCP sends data when it is presented; when outstanding data has not yet been acknowledged, it gathers small amounts of output to be sent in a single packet once an acknowledgement is received. For a small number of clients, such as window systems that send a stream of mouse events which receive no replies, this packetization may cause significant delays. The boolean option .Dv TCP_NODELAY defeats this algorithm. .It Dv TCP_MAXSEG By default, a sender- and .No receiver- Ns Tn TCP will negotiate among themselves to determine the maximum segment size to be used for each connection. The .Dv TCP_MAXSEG option allows the user to determine the result of this negotiation, and to reduce it if desired. .It Dv TCP_NOOPT .Tn TCP usually sends a number of options in each packet, corresponding to various .Tn TCP extensions which are provided in this implementation. The boolean option .Dv TCP_NOOPT is provided to disable .Tn TCP option use on a per-connection basis. .It Dv TCP_NOPUSH By convention, the .No sender- Ns Tn TCP will set the .Dq push bit, and begin transmission immediately (if permitted) at the end of every user call to .Xr write 2 or .Xr writev 2 . When this option is set to a non-zero value, .Tn TCP will delay sending any data at all until either the socket is closed, or the internal send buffer is filled. .It Dv TCP_MD5SIG This option enables the use of MD5 digests (also known as TCP-MD5) on writes to the specified socket. Outgoing traffic is digested; digests on incoming traffic are verified. When this option is enabled on a socket, all inbound and outgoing TCP segments must be signed with MD5 digests. .Pp One common use for this in a .Fx router deployment is to enable based routers to interwork with Cisco equipment at peering points. Support for this feature conforms to RFC 2385. .Pp In order for this option to function correctly, it is necessary for the administrator to add a tcp-md5 key entry to the system's security associations database (SADB) using the .Xr setkey 8 utility. This entry can only be specified on a per-host basis at this time. .Pp If an SADB entry cannot be found for the destination, the system does not send any outgoing segments and drops any inbound segments. .Pp Each dropped segment is taken into account in the TCP protocol statistics. .El .Pp The option level for the .Xr setsockopt 2 call is the protocol number for .Tn TCP , available from .Xr getprotobyname 3 , or .Dv IPPROTO_TCP . All options are declared in .In netinet/tcp.h . .Pp Options at the .Tn IP transport level may be used with .Tn TCP ; see .Xr ip 4 . Incoming connection requests that are source-routed are noted, and the reverse source route is used in responding. .Pp The default congestion control algorithm for .Tn TCP is .Xr cc_newreno 4 . Other congestion control algorithms can be made available using the .Xr mod_cc 4 framework. .Ss MIB Variables The .Tn TCP protocol implements a number of variables in the .Va net.inet.tcp branch of the .Xr sysctl 3 MIB. .Bl -tag -width ".Va TCPCTL_DO_RFC1323" .It Dv TCPCTL_DO_RFC1323 .Pq Va rfc1323 Implement the window scaling and timestamp options of RFC 1323 (default is true). .It Dv TCPCTL_MSSDFLT .Pq Va mssdflt The default value used for the maximum segment size .Pq Dq MSS when no advice to the contrary is received from MSS negotiation. .It Dv TCPCTL_SENDSPACE .Pq Va sendspace Maximum .Tn TCP send window. .It Dv TCPCTL_RECVSPACE .Pq Va recvspace Maximum .Tn TCP receive window. .It Va log_in_vain Log any connection attempts to ports where there is not a socket accepting connections. The value of 1 limits the logging to .Tn SYN (connection establishment) packets only. That of 2 results in any .Tn TCP packets to closed ports being logged. Any value unlisted above disables the logging (default is 0, i.e., the logging is disabled). .It Va msl The Maximum Segment Lifetime, in milliseconds, for a packet. .It Va keepinit Timeout, in milliseconds, for new, non-established .Tn TCP connections. The default is 75000 msec. .It Va keepidle Amount of time, in milliseconds, that the connection must be idle before keepalive probes (if enabled) are sent. The default is 7200000 msec (2 hours). .It Va keepintvl The interval, in milliseconds, between keepalive probes sent to remote machines, when no response is received on a .Va keepidle probe. The default is 75000 msec. .It Va keepcnt Number of probes sent, with no response, before a connection is dropped. The default is 8 packets. .It Va always_keepalive Assume that .Dv SO_KEEPALIVE is set on all .Tn TCP connections, the kernel will periodically send a packet to the remote host to verify the connection is still up. .It Va icmp_may_rst Certain .Tn ICMP unreachable messages may abort connections in .Tn SYN-SENT state. .It Va do_tcpdrain Flush packets in the .Tn TCP reassembly queue if the system is low on mbufs. .It Va blackhole If enabled, disable sending of RST when a connection is attempted to a port where there is not a socket accepting connections. See .Xr blackhole 4 . .It Va delayed_ack Delay ACK to try and piggyback it onto a data packet. .It Va delacktime Maximum amount of time, in milliseconds, before a delayed ACK is sent. .It Va path_mtu_discovery Enable Path MTU Discovery. .It Va tcbhashsize Size of the .Tn TCP control-block hash table (read-only). This may be tuned using the kernel option .Dv TCBHASHSIZE or by setting .Va net.inet.tcp.tcbhashsize in the .Xr loader 8 . .It Va pcbcount Number of active process control blocks (read-only). .It Va syncookies Determines whether or not .Tn SYN cookies should be generated for outbound .Tn SYN-ACK packets. .Tn SYN cookies are a great help during .Tn SYN flood attacks, and are enabled by default. (See .Xr syncookies 4 . ) .It Va isn_reseed_interval The interval (in seconds) specifying how often the secret data used in RFC 1948 initial sequence number calculations should be reseeded. By default, this variable is set to zero, indicating that no reseeding will occur. Reseeding should not be necessary, and will break .Dv TIME_WAIT recycling for a few minutes. .It Va rexmit_min , rexmit_slop Adjust the retransmit timer calculation for .Tn TCP . The slop is typically added to the raw calculation to take into account occasional variances that the .Tn SRTT (smoothed round-trip time) is unable to accommodate, while the minimum specifies an absolute minimum. While a number of .Tn TCP RFCs suggest a 1 second minimum, these RFCs tend to focus on streaming behavior, and fail to deal with the fact that a 1 second minimum has severe detrimental effects over lossy interactive connections, such as a 802.11b wireless link, and over very fast but lossy connections for those cases not covered by the fast retransmit code. For this reason, we use 200ms of slop and a near-0 minimum, which gives us an effective minimum of 200ms (similar to .Tn Linux ) . .It Va initcwnd_segments Enable the ability to specify initial congestion window in number of segments. The default value is 10 as suggested by RFC 6928. Changing the value on fly would not affect connections using congestion window from the hostcache. Caution: This regulates the burst of packets allowed to be sent in the first RTT. The value should be relative to the link capacity. Start with small values for lower-capacity links. Large bursts can cause buffer overruns and packet drops if routers have small buffers or the link is experiencing congestion. .It Va rfc3042 Enable the Limited Transmit algorithm as described in RFC 3042. It helps avoid timeouts on lossy links and also when the congestion window is small, as happens on short transfers. .It Va rfc3390 Enable support for RFC 3390, which allows for a variable-sized starting congestion window on new connections, depending on the maximum segment size. This helps throughput in general, but particularly affects short transfers and high-bandwidth large propagation-delay connections. .It Va sack.enable Enable support for RFC 2018, TCP Selective Acknowledgment option, which allows the receiver to inform the sender about all successfully arrived segments, allowing the sender to retransmit the missing segments only. .It Va sack.maxholes Maximum number of SACK holes per connection. Defaults to 128. .It Va sack.globalmaxholes Maximum number of SACK holes per system, across all connections. Defaults to 65536. .It Va maxtcptw When a TCP connection enters the .Dv TIME_WAIT state, its associated socket structure is freed, since it is of negligible size and use, and a new structure is allocated to contain a minimal amount of information necessary for sustaining a connection in this state, called the compressed TCP TIME_WAIT state. Since this structure is smaller than a socket structure, it can save a significant amount of system memory. The .Va net.inet.tcp.maxtcptw MIB variable controls the maximum number of these structures allocated. By default, it is initialized to .Va kern.ipc.maxsockets / 5. .It Va nolocaltimewait Suppress creating of compressed TCP TIME_WAIT states for connections in which both endpoints are local. .It Va fast_finwait2_recycle Recycle .Tn TCP .Dv FIN_WAIT_2 connections faster when the socket is marked as .Dv SBS_CANTRCVMORE (no user process has the socket open, data received on the socket cannot be read). The timeout used here is .Va finwait2_timeout . .It Va finwait2_timeout Timeout to use for fast recycling of .Tn TCP .Dv FIN_WAIT_2 connections. Defaults to 60 seconds. .It Va ecn.enable Enable support for TCP Explicit Congestion Notification (ECN). ECN allows a TCP sender to reduce the transmission rate in order to avoid packet drops. Settings: .Bl -tag -compact .It 0 Disable ECN. .It 1 Allow incoming connections to request ECN. Outgoing connections will request ECN. .It 2 Allow incoming connections to request ECN. Outgoing connections will not request ECN. .El .It Va ecn.maxretries Number of retries (SYN or SYN/ACK retransmits) before disabling ECN on a specific connection. This is needed to help with connection establishment when a broken firewall is in the network path. .It Va pmtud_blackhole_detection Turn on automatic path MTU blackhole detection. In case of retransmits OS will lower the MSS to check if it's MTU problem. If current MSS is greater than configured value to try, it will be set to configured value, otherwise, MSS will be set to default values .Po Va net.inet.tcp.mssdflt and .Va net.inet.tcp.v6mssdflt .Pc . .It Va pmtud_blackhole_mss MSS to try for IPv4 if PMTU blackhole detection is turned on. .It Va v6pmtud_blackhole_mss MSS to try for IPv6 if PMTU blackhole detection is turned on. .It Va pmtud_blackhole_activated Number of times configured values were used in an attempt to downshift. .It Va pmtud_blackhole_activated_min_mss Number of times default MSS was used in an attempt to downshift. .It Va pmtud_blackhole_failed Number of connections for which retransmits continued even after MSS downshift. .It Va functions_available List of available TCP function blocks (TCP stacks). .It Va functions_default The default TCP function block (TCP stack). .It Va functions_inherit_listen_socket_stack Determines whether to inherit listen socket's tcp stack or use the current system default tcp stack, as defined by .Va functions_default .Pc . Default is true. .It Va insecure_rst Use criteria defined in RFC793 instead of RFC5961 for accepting RST segments. Default is false. .It Va insecure_syn Use criteria defined in RFC793 instead of RFC5961 for accepting SYN segments. Default is false. .El .Sh ERRORS A socket operation may fail with one of the following errors returned: .Bl -tag -width Er .It Bq Er EISCONN when trying to establish a connection on a socket which already has one; -.It Bq Er ENOBUFS +.It Bo Er ENOBUFS Bc or Bo Er ENOMEM Bc when the system runs out of memory for an internal data structure; .It Bq Er ETIMEDOUT when a connection was dropped due to excessive retransmissions; .It Bq Er ECONNRESET when the remote peer forces the connection to be closed; .It Bq Er ECONNREFUSED when the remote peer actively refuses connection establishment (usually because no process is listening to the port); .It Bq Er EADDRINUSE when an attempt is made to create a socket with a port which has already been allocated; .It Bq Er EADDRNOTAVAIL when an attempt is made to create a socket with a network address for which no network interface exists; .It Bq Er EAFNOSUPPORT when an attempt is made to bind or connect a socket to a multicast address. .It Bq Er EINVAL when trying to change TCP function blocks at an invalid point in the session; .It Bq Er ENOENT when trying to use a TCP function block that is not available; .El .Sh SEE ALSO .Xr getsockopt 2 , .Xr socket 2 , .Xr sysctl 3 , .Xr blackhole 4 , .Xr inet 4 , .Xr intro 4 , .Xr ip 4 , .Xr mod_cc 4 , .Xr siftr 4 , .Xr syncache 4 , .Xr setkey 8 , .Xr tcp_functions 9 .Rs .%A "V. Jacobson" .%A "R. Braden" .%A "D. Borman" .%T "TCP Extensions for High Performance" .%O "RFC 1323" .Re .Rs .%A "A. Heffernan" .%T "Protection of BGP Sessions via the TCP MD5 Signature Option" .%O "RFC 2385" .Re .Rs .%A "K. Ramakrishnan" .%A "S. Floyd" .%A "D. Black" .%T "The Addition of Explicit Congestion Notification (ECN) to IP" .%O "RFC 3168" .Re .Sh HISTORY The .Tn TCP protocol appeared in .Bx 4.2 . The RFC 1323 extensions for window scaling and timestamps were added in .Bx 4.4 . The .Dv TCP_INFO option was introduced in .Tn Linux 2.6 and is .Em subject to change . Index: head/sys/netinet/cc/cc_newreno.c =================================================================== --- head/sys/netinet/cc/cc_newreno.c (revision 333698) +++ head/sys/netinet/cc/cc_newreno.c (revision 333699) @@ -1,376 +1,394 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. * Copyright (c) 2007-2008,2010,2014 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * All rights reserved. * * This software was developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, James * Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This software was first released in 2007 by James Healy and Lawrence Stewart * whilst working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * More details are available at: * http://caia.swin.edu.au/urp/newtcp/ * * Dec 2014 garmitage@swin.edu.au * Borrowed code fragments from cc_cdg.c to add modifiable beta * via sysctls. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NEWRENO, "newreno data", "newreno beta values"); #define CAST_PTR_INT(X) (*((int*)(X))) -static int newreno_cb_init(struct cc_var *ccv); +static void newreno_cb_destroy(struct cc_var *ccv); static void newreno_ack_received(struct cc_var *ccv, uint16_t type); static void newreno_after_idle(struct cc_var *ccv); static void newreno_cong_signal(struct cc_var *ccv, uint32_t type); static void newreno_post_recovery(struct cc_var *ccv); static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf); static VNET_DEFINE(uint32_t, newreno_beta) = 50; static VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80; #define V_newreno_beta VNET(newreno_beta) #define V_newreno_beta_ecn VNET(newreno_beta_ecn) struct cc_algo newreno_cc_algo = { .name = "newreno", - .cb_init = newreno_cb_init, + .cb_destroy = newreno_cb_destroy, .ack_received = newreno_ack_received, .after_idle = newreno_after_idle, .cong_signal = newreno_cong_signal, .post_recovery = newreno_post_recovery, .ctl_output = newreno_ctl_output, }; struct newreno { uint32_t beta; uint32_t beta_ecn; }; -int -newreno_cb_init(struct cc_var *ccv) +static inline struct newreno * +newreno_malloc(struct cc_var *ccv) { - struct newreno *nreno; + struct newreno *nreno; - nreno = malloc(sizeof(struct newreno), M_NEWRENO, M_NOWAIT|M_ZERO); + nreno = malloc(sizeof(struct newreno), M_NEWRENO, M_NOWAIT); if (nreno != NULL) { + /* NB: nreno is not zeroed, so initialise all fields. */ nreno->beta = V_newreno_beta; nreno->beta_ecn = V_newreno_beta_ecn; + ccv->cc_data = nreno; } - return (0); + return (nreno); } static void +newreno_cb_destroy(struct cc_var *ccv) +{ + + if (ccv->cc_data != NULL) + free(ccv->cc_data, M_NEWRENO); +} + +static void newreno_ack_received(struct cc_var *ccv, uint16_t type) { if (type == CC_ACK && !IN_RECOVERY(CCV(ccv, t_flags)) && (ccv->flags & CCF_CWND_LIMITED)) { u_int cw = CCV(ccv, snd_cwnd); u_int incr = CCV(ccv, t_maxseg); /* * Regular in-order ACK, open the congestion window. * Method depends on which congestion control state we're * in (slow start or cong avoid) and if ABC (RFC 3465) is * enabled. * * slow start: cwnd <= ssthresh * cong avoid: cwnd > ssthresh * * slow start and ABC (RFC 3465): * Grow cwnd exponentially by the amount of data * ACKed capping the max increment per ACK to * (abc_l_var * maxseg) bytes. * * slow start without ABC (RFC 5681): * Grow cwnd exponentially by maxseg per ACK. * * cong avoid and ABC (RFC 3465): * Grow cwnd linearly by maxseg per RTT for each * cwnd worth of ACKed data. * * cong avoid without ABC (RFC 5681): * Grow cwnd linearly by approximately maxseg per RTT using * maxseg^2 / cwnd per ACK as the increment. * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to * avoid capping cwnd. */ if (cw > CCV(ccv, snd_ssthresh)) { if (V_tcp_do_rfc3465) { if (ccv->flags & CCF_ABC_SENTAWND) ccv->flags &= ~CCF_ABC_SENTAWND; else incr = 0; } else incr = max((incr * incr / cw), 1); } else if (V_tcp_do_rfc3465) { /* * In slow-start with ABC enabled and no RTO in sight? * (Must not use abc_l_var > 1 if slow starting after * an RTO. On RTO, snd_nxt = snd_una, so the * snd_nxt == snd_max check is sufficient to * handle this). * * XXXLAS: Find a way to signal SS after RTO that * doesn't rely on tcpcb vars. */ if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) incr = min(ccv->bytes_this_ack, ccv->nsegs * V_tcp_abc_l_var * CCV(ccv, t_maxseg)); else incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); } /* ABC is on by default, so incr equals 0 frequently. */ if (incr > 0) CCV(ccv, snd_cwnd) = min(cw + incr, TCP_MAXWIN << CCV(ccv, snd_scale)); } } static void newreno_after_idle(struct cc_var *ccv) { int rw; /* * If we've been idle for more than one retransmit timeout the old * congestion window is no longer current and we have to reduce it to * the restart window before we can transmit again. * * The restart window is the initial window or the last CWND, whichever * is smaller. * * This is done to prevent us from flooding the path with a full CWND at * wirespeed, overloading router and switch buffers along the way. * * See RFC5681 Section 4.1. "Restarting Idle Connections". */ if (V_tcp_do_rfc3390) rw = min(4 * CCV(ccv, t_maxseg), max(2 * CCV(ccv, t_maxseg), 4380)); else rw = CCV(ccv, t_maxseg) * 2; CCV(ccv, snd_cwnd) = min(rw, CCV(ccv, snd_cwnd)); } /* * Perform any necessary tasks before we enter congestion recovery. */ static void newreno_cong_signal(struct cc_var *ccv, uint32_t type) { struct newreno *nreno; - uint32_t cwin, factor; + uint32_t beta, beta_ecn, cwin, factor; u_int mss; - factor = V_newreno_beta; - nreno = ccv->cc_data; - if (nreno != NULL) { - if (V_cc_do_abe) - factor = (type == CC_ECN ? nreno->beta_ecn: nreno->beta); - else - factor = nreno->beta; - } - cwin = CCV(ccv, snd_cwnd); mss = CCV(ccv, t_maxseg); + nreno = ccv->cc_data; + beta = (nreno == NULL) ? V_newreno_beta : nreno->beta; + beta_ecn = (nreno == NULL) ? V_newreno_beta_ecn : nreno->beta_ecn; + if (V_cc_do_abe && type == CC_ECN) + factor = beta_ecn; + else + factor = beta; /* Catch algos which mistakenly leak private signal types. */ KASSERT((type & CC_SIGPRIVMASK) == 0, ("%s: congestion signal type 0x%08x is private\n", __func__, type)); cwin = max(((uint64_t)cwin * (uint64_t)factor) / (100ULL * (uint64_t)mss), 2) * mss; switch (type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) { if (IN_CONGRECOVERY(CCV(ccv, t_flags) && V_cc_do_abe && V_cc_abe_frlossreduce)) { CCV(ccv, snd_ssthresh) = ((uint64_t)CCV(ccv, snd_ssthresh) * - (uint64_t)nreno->beta) / - (100ULL * (uint64_t)nreno->beta_ecn); + (uint64_t)beta) / + (100ULL * (uint64_t)beta_ecn); } if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) CCV(ccv, snd_ssthresh) = cwin; ENTER_RECOVERY(CCV(ccv, t_flags)); } break; case CC_ECN: if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) { CCV(ccv, snd_ssthresh) = cwin; CCV(ccv, snd_cwnd) = cwin; ENTER_CONGRECOVERY(CCV(ccv, t_flags)); } break; } } /* * Perform any necessary tasks before we exit congestion recovery. */ static void newreno_post_recovery(struct cc_var *ccv) { int pipe; - pipe = 0; if (IN_FASTRECOVERY(CCV(ccv, t_flags))) { /* * Fast recovery will conclude after returning from this * function. Window inflation should have left us with * approximately snd_ssthresh outstanding data. But in case we * would be inclined to send a burst, better to do it via the * slow start mechanism. * * XXXLAS: Find a way to do this without needing curack */ if (V_tcp_do_rfc6675_pipe) pipe = tcp_compute_pipe(ccv->ccvc.tcp); else pipe = CCV(ccv, snd_max) - ccv->curack; if (pipe < CCV(ccv, snd_ssthresh)) CCV(ccv, snd_cwnd) = pipe + CCV(ccv, t_maxseg); else CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh); } } -int +static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf) { struct newreno *nreno; struct cc_newreno_opts *opt; if (sopt->sopt_valsize != sizeof(struct cc_newreno_opts)) return (EMSGSIZE); nreno = ccv->cc_data; opt = buf; - + switch (sopt->sopt_dir) { case SOPT_SET: + /* We cannot set without cc_data memory. */ + if (nreno == NULL) { + nreno = newreno_malloc(ccv); + if (nreno == NULL) + return (ENOMEM); + } switch (opt->name) { case CC_NEWRENO_BETA: nreno->beta = opt->val; break; case CC_NEWRENO_BETA_ECN: if (!V_cc_do_abe) return (EACCES); nreno->beta_ecn = opt->val; break; default: return (ENOPROTOOPT); } + break; case SOPT_GET: switch (opt->name) { case CC_NEWRENO_BETA: - opt->val = nreno->beta; + opt->val = (nreno == NULL) ? + V_newreno_beta : nreno->beta; break; case CC_NEWRENO_BETA_ECN: - opt->val = nreno->beta_ecn; + opt->val = (nreno == NULL) ? + V_newreno_beta_ecn : nreno->beta_ecn; break; default: return (ENOPROTOOPT); } + break; default: return (EINVAL); } return (0); } static int newreno_beta_handler(SYSCTL_HANDLER_ARGS) { + if (req->newptr != NULL ) { if (arg1 == &VNET_NAME(newreno_beta_ecn) && !V_cc_do_abe) return (EACCES); if (CAST_PTR_INT(req->newptr) <= 0 || CAST_PTR_INT(req->newptr) > 100) return (EINVAL); } return (sysctl_handle_int(oidp, arg1, arg2, req)); } SYSCTL_DECL(_net_inet_tcp_cc_newreno); SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, newreno, CTLFLAG_RW, NULL, "New Reno related settings"); SYSCTL_PROC(_net_inet_tcp_cc_newreno, OID_AUTO, beta, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(newreno_beta), 3, &newreno_beta_handler, "IU", "New Reno beta, specified as number between 1 and 100"); SYSCTL_PROC(_net_inet_tcp_cc_newreno, OID_AUTO, beta_ecn, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, &VNET_NAME(newreno_beta_ecn), 3, &newreno_beta_handler, "IU", "New Reno beta ecn, specified as number between 1 and 100"); DECLARE_CC_MODULE(newreno, &newreno_cc_algo);