diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 13e48eef459b..564225f39c88 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1,3171 +1,3179 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * Copyright (c) 2006-2007 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif /* INET6 */ #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include #include #include /* * TCP protocol interface to socket abstraction. */ #ifdef INET static int tcp_connect(struct tcpcb *, struct sockaddr *, struct thread *td); #endif /* INET */ #ifdef INET6 static int tcp6_connect(struct tcpcb *, struct sockaddr *, struct thread *td); #endif /* INET6 */ static void tcp_disconnect(struct tcpcb *); static void tcp_usrclosed(struct tcpcb *); static void tcp_fill_info(struct tcpcb *, struct tcp_info *); static int tcp_pru_options_support(struct tcpcb *tp, int flags); #ifdef TCPDEBUG #define TCPDEBUG0 int ostate = 0 #define TCPDEBUG1() ostate = tp ? tp->t_state : 0 #define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ tcp_trace(TA_USER, ostate, tp, 0, 0, req) #else #define TCPDEBUG0 #define TCPDEBUG1() #define TCPDEBUG2(req) #endif /* * tcp_require_unique port requires a globally-unique source port for each * outgoing connection. The default is to require the 4-tuple to be unique. */ VNET_DEFINE(int, tcp_require_unique_port) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, require_unique_port, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_require_unique_port), 0, "Require globally-unique ephemeral port for outgoing connections"); #define V_tcp_require_unique_port VNET(tcp_require_unique_port) /* * TCP attaches to socket via pru_attach(), reserving space, * and an internet control block. */ static int tcp_usr_attach(struct socket *so, int proto, struct thread *td) { struct inpcb *inp; struct tcpcb *tp = NULL; int error; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); TCPDEBUG1(); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace); if (error) goto out; } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; error = in_pcballoc(so, &V_tcbinfo); if (error) goto out; inp = sotoinpcb(so); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; inp->in6p_hops = -1; /* use kernel default */ } else #endif inp->inp_vflag |= INP_IPV4; tp = tcp_newtcpcb(inp); if (tp == NULL) { error = ENOBUFS; in_pcbdetach(inp); in_pcbfree(inp); goto out; } tp->t_state = TCPS_CLOSED; INP_WUNLOCK(inp); TCPSTATES_INC(TCPS_CLOSED); out: TCPDEBUG2(PRU_ATTACH); TCP_PROBE2(debug__user, tp, PRU_ATTACH); return (error); } /* * tcp_usr_detach is called when the socket layer loses its final reference * to the socket, be it a file descriptor reference, a reference from TCP, * etc. At this point, there is only one case in which we will keep around * inpcb state: time wait. */ static void tcp_usr_detach(struct socket *so) { struct inpcb *inp; struct tcpcb *tp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); KASSERT(so->so_pcb == inp && inp->inp_socket == so, ("%s: socket %p inp %p mismatch", __func__, so, inp)); tp = intotcpcb(inp); if (inp->inp_flags & INP_TIMEWAIT) { /* * There are two cases to handle: one in which the time wait * state is being discarded (INP_DROPPED), and one in which * this connection will remain in timewait. In the former, * it is time to discard all state (except tcptw, which has * already been discarded by the timewait close code, which * should be further up the call stack somewhere). In the * latter case, we detach from the socket, but leave the pcb * present until timewait ends. * * XXXRW: Would it be cleaner to free the tcptw here? * * Astute question indeed, from twtcp perspective there are * four cases to consider: * * #1 tcp_usr_detach is called at tcptw creation time by * tcp_twstart, then do not discard the newly created tcptw * and leave inpcb present until timewait ends * #2 tcp_usr_detach is called at tcptw creation time by * tcp_twstart, but connection is local and tw will be * discarded immediately * #3 tcp_usr_detach is called at timewait end (or reuse) by * tcp_twclose, then the tcptw has already been discarded * (or reused) and inpcb is freed here * #4 tcp_usr_detach is called() after timewait ends (or reuse) * (e.g. by soclose), then tcptw has already been discarded * (or reused) and inpcb is freed here * * In all three cases the tcptw should not be freed here. */ if (inp->inp_flags & INP_DROPPED) { KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && " "INP_DROPPED && tp != NULL")); in_pcbdetach(inp); in_pcbfree(inp); } else { in_pcbdetach(inp); INP_WUNLOCK(inp); } } else { /* * If the connection is not in timewait, we consider two * two conditions: one in which no further processing is * necessary (dropped || embryonic), and one in which TCP is * not yet done, but no longer requires the socket, so the * pcb will persist for the time being. * * XXXRW: Does the second case still occur? */ if (inp->inp_flags & INP_DROPPED || tp->t_state < TCPS_SYN_SENT) { tcp_discardcb(tp); in_pcbdetach(inp); in_pcbfree(inp); } else { in_pcbdetach(inp); INP_WUNLOCK(inp); } } } #ifdef INET /* * Give the socket an address. */ static int tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in *sinp; sinp = (struct sockaddr_in *)nam; if (nam->sa_family != AF_INET) { /* * Preserve compatibility with old programs. */ if (nam->sa_family != AF_UNSPEC || nam->sa_len < offsetof(struct sockaddr_in, sin_zero) || sinp->sin_addr.s_addr != INADDR_ANY) return (EAFNOSUPPORT); nam->sa_family = AF_INET; } if (nam->sa_len != sizeof(*sinp)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. */ if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: TCPDEBUG2(PRU_BIND); TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in6 *sin6; u_char vflagsav; sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); if (nam->sa_len != sizeof(*sin6)) return (EINVAL); /* * Must check for multicast addresses and disallow binding * to them. */ if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EAFNOSUPPORT); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); INP_WLOCK(inp); vflagsav = inp->inp_vflag; if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); INP_HASH_WLOCK(&V_tcbinfo); inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) inp->inp_vflag |= INP_IPV4; else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; in6_sin6_2_sin(&sin, sin6); if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { error = EAFNOSUPPORT; INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; error = in_pcbbind(inp, (struct sockaddr *)&sin, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); goto out; } } #endif error = in6_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); out: if (error != 0) inp->inp_vflag = vflagsav; TCPDEBUG2(PRU_BIND); TCP_PROBE2(debug__user, tp, PRU_BIND); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ #ifdef INET /* * Prepare to accept connections. */ static int tcp_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); if (error != 0) { SOCK_UNLOCK(so); goto out; } if (inp->inp_lport == 0) { INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbbind(inp, NULL, td->td_ucred); INP_HASH_WUNLOCK(&V_tcbinfo); } if (error == 0) { tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) tcp_offload_listen_start(tp); #endif } else { solisten_proto_abort(so); } SOCK_UNLOCK(so); if (IS_FASTOPEN(tp->t_flags)) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); out: TCPDEBUG2(PRU_LISTEN); TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; u_char vflagsav; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = EINVAL; goto out; } vflagsav = inp->inp_vflag; tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); error = solisten_proto_check(so); if (error != 0) { SOCK_UNLOCK(so); goto out; } INP_HASH_WLOCK(&V_tcbinfo); if (inp->inp_lport == 0) { inp->inp_vflag &= ~INP_IPV4; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; error = in6_pcbbind(inp, NULL, td->td_ucred); } INP_HASH_WUNLOCK(&V_tcbinfo); if (error == 0) { tcp_state_change(tp, TCPS_LISTEN); solisten_proto(so, backlog); #ifdef TCP_OFFLOAD if ((so->so_options & SO_NO_OFFLOAD) == 0) tcp_offload_listen_start(tp); #endif } else { solisten_proto_abort(so); } SOCK_UNLOCK(so); if (IS_FASTOPEN(tp->t_flags)) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); if (error != 0) inp->inp_vflag = vflagsav; out: TCPDEBUG2(PRU_LISTEN); TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ #ifdef INET /* * Initiate connection to peer. * Create a template for use in transmissions on this connection. * Enter SYN_SENT state, and mark socket as connecting. * Start keep-alive timer, and seed output sequence space. * Send initial segment on connection. */ static int tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct epoch_tracker et; int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in *sinp; sinp = (struct sockaddr_in *)nam; if (nam->sa_family != AF_INET) return (EAFNOSUPPORT); if (nam->sa_len != sizeof (*sinp)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. */ if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) return (EAFNOSUPPORT); if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) return (EACCES); if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0) return (error); TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) { error = EADDRINUSE; goto out; } if (inp->inp_flags & INP_DROPPED) { error = ECONNREFUSED; goto out; } if (SOLISTENING(so)) { error = EOPNOTSUPP; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); NET_EPOCH_ENTER(et); if ((error = tcp_connect(tp, nam, td)) != 0) goto out_in_epoch; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out_in_epoch; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); error = tcp_output(tp); KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()" ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error)); out_in_epoch: NET_EPOCH_EXIT(et); out: TCPDEBUG2(PRU_CONNECT); TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct epoch_tracker et; int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct sockaddr_in6 *sin6; u_int8_t incflagsav; u_char vflagsav; TCPDEBUG0; sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_family != AF_INET6) return (EAFNOSUPPORT); if (nam->sa_len != sizeof (*sin6)) return (EINVAL); /* * Must disallow TCP ``connections'' to multicast addresses. */ if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EAFNOSUPPORT); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); INP_WLOCK(inp); vflagsav = inp->inp_vflag; incflagsav = inp->inp_inc.inc_flags; if (inp->inp_flags & INP_TIMEWAIT) { error = EADDRINUSE; goto out; } if (inp->inp_flags & INP_DROPPED) { error = ECONNREFUSED; goto out; } if (SOLISTENING(so)) { error = EINVAL; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); #ifdef INET /* * XXXRW: Some confusion: V4/V6 flags relate to binding, and * therefore probably require the hash lock, which isn't held here. * Is this a significant problem? */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { struct sockaddr_in sin; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV4) == 0) { error = EAFNOSUPPORT; goto out; } in6_sin6_2_sin(&sin, sin6); if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { error = EAFNOSUPPORT; goto out; } if (ntohl(sin.sin_addr.s_addr) == INADDR_BROADCAST) { error = EACCES; goto out; } if ((error = prison_remote_ip4(td->td_ucred, &sin.sin_addr)) != 0) goto out; inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; NET_EPOCH_ENTER(et); if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) goto out_in_epoch; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out_in_epoch; #endif error = tcp_output(tp); goto out_in_epoch; } else { if ((inp->inp_vflag & INP_IPV6) == 0) { error = EAFNOSUPPORT; goto out; } } #endif if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr)) != 0) goto out; inp->inp_vflag &= ~INP_IPV4; inp->inp_vflag |= INP_IPV6; inp->inp_inc.inc_flags |= INC_ISIPV6; if ((error = tcp6_connect(tp, nam, td)) != 0) goto out; #ifdef TCP_OFFLOAD if (registered_toedevs > 0 && (so->so_options & SO_NO_OFFLOAD) == 0 && (error = tcp_offload_connect(so, nam)) == 0) goto out; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); NET_EPOCH_ENTER(et); error = tcp_output(tp); #ifdef INET out_in_epoch: #endif NET_EPOCH_EXIT(et); out: KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()" ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error)); /* * If the implicit bind in the connect call fails, restore * the flags we modified. */ if (error != 0 && inp->inp_lport == 0) { inp->inp_vflag = vflagsav; inp->inp_inc.inc_flags = incflagsav; } TCPDEBUG2(PRU_CONNECT); TCP_PROBE2(debug__user, tp, PRU_CONNECT); INP_WUNLOCK(inp); return (error); } #endif /* INET6 */ /* * Initiate disconnect from peer. * If connection never passed embryonic stage, just drop; * else if don't need to let data drain, then can just drop anyways, * else have to begin TCP shutdown process: mark socket disconnecting, * drain unread data, state switch to reflect user close, and * send segment (e.g. FIN) to peer. Socket will be really disconnected * when peer sends FIN and acks ours. * * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. */ static int tcp_usr_disconnect(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; int error = 0; TCPDEBUG0; NET_EPOCH_ENTER(et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & INP_TIMEWAIT) goto out; if (inp->inp_flags & INP_DROPPED) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); tcp_disconnect(tp); out: TCPDEBUG2(PRU_DISCONNECT); TCP_PROBE2(debug__user, tp, PRU_DISCONNECT); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); return (error); } #ifdef INET /* * Accept a connection. Essentially all the work is done at higher levels; * just return the address of the peer, storing through addr. */ static int tcp_usr_accept(struct socket *so, struct sockaddr **nam) { int error = 0; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct in_addr addr; in_port_t port = 0; TCPDEBUG0; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* * We inline in_getpeeraddr and COMMON_END here, so that we can * copy the data of interest and defer the malloc until after we * release the lock. */ port = inp->inp_fport; addr = inp->inp_faddr; out: TCPDEBUG2(PRU_ACCEPT); TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); if (error == 0) *nam = in_sockaddr(port, &addr); return error; } #endif /* INET */ #ifdef INET6 static int tcp6_usr_accept(struct socket *so, struct sockaddr **nam) { struct inpcb *inp = NULL; int error = 0; struct tcpcb *tp = NULL; struct in_addr addr; struct in6_addr addr6; struct epoch_tracker et; in_port_t port = 0; int v4 = 0; TCPDEBUG0; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; goto out; } tp = intotcpcb(inp); TCPDEBUG1(); /* * We inline in6_mapped_peeraddr and COMMON_END here, so that we can * copy the data of interest and defer the malloc until after we * release the lock. */ if (inp->inp_vflag & INP_IPV4) { v4 = 1; port = inp->inp_fport; addr = inp->inp_faddr; } else { port = inp->inp_fport; addr6 = inp->in6p_faddr; } out: TCPDEBUG2(PRU_ACCEPT); TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); if (error == 0) { if (v4) *nam = in6_v4mapsin6_sockaddr(port, &addr); else *nam = in6_sockaddr(port, &addr6); } return error; } #endif /* INET6 */ /* * Mark the connection as being incapable of further output. */ static int tcp_usr_shutdown(struct socket *so) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); NET_EPOCH_ENTER(et); TCPDEBUG1(); socantsendmore(so); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) error = tcp_output_nodrop(tp); TCPDEBUG2(PRU_SHUTDOWN); TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); error = tcp_unlock_or_drop(tp, error); NET_EPOCH_EXIT(et); return (error); } /* * After a receive, possibly send window update to peer. */ static int tcp_usr_rcvd(struct socket *so, int flags) { struct epoch_tracker et; struct inpcb *inp; struct tcpcb *tp = NULL; int outrv = 0, error = 0; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); NET_EPOCH_ENTER(et); TCPDEBUG1(); /* * For passively-created TFO connections, don't attempt a window * update while still in SYN_RECEIVED as this may trigger an early * SYN|ACK. It is preferable to have the SYN|ACK be sent along with * application response data, or failing that, when the DELACK timer * expires. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) goto out; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) tcp_offload_rcvd(tp); else #endif outrv = tcp_output_nodrop(tp); out: TCPDEBUG2(PRU_RCVD); TCP_PROBE2(debug__user, tp, PRU_RCVD); (void) tcp_unlock_or_drop(tp, outrv); NET_EPOCH_EXIT(et); return (error); } /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. Unlike the other * pru_*() routines, the mbuf chains are our responsibility. We * must either enqueue them or free them. The other pru_* routines * generally are caller-frees. */ static int tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct epoch_tracker et; int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; #ifdef INET #ifdef INET6 struct sockaddr_in sin; #endif struct sockaddr_in *sinp; #endif #ifdef INET6 int isipv6; #endif u_int8_t incflagsav; u_char vflagsav; bool restoreflags; TCPDEBUG0; if (control != NULL) { /* TCP doesn't do control messages (rights, creds, etc) */ if (control->m_len) { m_freem(control); return (EINVAL); } m_freem(control); /* empty control, just free it */ } inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { if (m != NULL && (flags & PRUS_NOTREADY) == 0) m_freem(m); INP_WUNLOCK(inp); return (ECONNRESET); } vflagsav = inp->inp_vflag; incflagsav = inp->inp_inc.inc_flags; restoreflags = false; tp = intotcpcb(inp); NET_EPOCH_ENTER(et); if ((flags & PRUS_OOB) != 0 && (error = tcp_pru_options_support(tp, PRUS_OOB)) != 0) goto out; TCPDEBUG1(); if (nam != NULL && tp->t_state < TCPS_SYN_SENT) { if (tp->t_state == TCPS_LISTEN) { error = EINVAL; goto out; } switch (nam->sa_family) { #ifdef INET case AF_INET: sinp = (struct sockaddr_in *)nam; if (sinp->sin_len != sizeof(struct sockaddr_in)) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV6) != 0) { error = EAFNOSUPPORT; goto out; } if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { error = EAFNOSUPPORT; goto out; } if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) { error = EACCES; goto out; } if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr))) goto out; #ifdef INET6 isipv6 = 0; #endif break; #endif /* INET */ #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)nam; if (sin6->sin6_len != sizeof(*sin6)) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV6PROTO) == 0) { error = EAFNOSUPPORT; goto out; } if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { error = EAFNOSUPPORT; goto out; } if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { #ifdef INET if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { error = EINVAL; goto out; } if ((inp->inp_vflag & INP_IPV4) == 0) { error = EAFNOSUPPORT; goto out; } restoreflags = true; inp->inp_vflag &= ~INP_IPV6; sinp = &sin; in6_sin6_2_sin(sinp, sin6); if (IN_MULTICAST( ntohl(sinp->sin_addr.s_addr))) { error = EAFNOSUPPORT; goto out; } if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr))) goto out; isipv6 = 0; #else /* !INET */ error = EAFNOSUPPORT; goto out; #endif /* INET */ } else { if ((inp->inp_vflag & INP_IPV6) == 0) { error = EAFNOSUPPORT; goto out; } restoreflags = true; inp->inp_vflag &= ~INP_IPV4; inp->inp_inc.inc_flags |= INC_ISIPV6; if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr))) goto out; isipv6 = 1; } break; } #endif /* INET6 */ default: error = EAFNOSUPPORT; goto out; } } if (!(flags & PRUS_OOB)) { sbappendstream(&so->so_snd, m, flags); m = NULL; if (nam && tp->t_state < TCPS_SYN_SENT) { KASSERT(tp->t_state == TCPS_CLOSED, ("%s: tp %p is listening", __func__, tp)); /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg using peer's cached MSS. */ #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET error = tcp_connect(tp, (struct sockaddr *)sinp, td); #endif /* * The bind operation in tcp_connect succeeded. We * no longer want to restore the flags if later * operations fail. */ if (error == 0 || inp->inp_lport != 0) restoreflags = false; if (error) { /* m is freed if PRUS_NOTREADY is unset. */ sbflush(&so->so_snd); goto out; } if (IS_FASTOPEN(tp->t_flags)) tcp_fastopen_connect(tp); else { tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } } if (flags & PRUS_EOF) { /* * Close the send side of the connection after * the data is sent. */ socantsendmore(so); tcp_usrclosed(tp); } if (TCPS_HAVEESTABLISHED(tp->t_state) && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_out == 0) && (so->so_snd.sb_ccc > 0)) { tp->t_fbyte_out = ticks; if (tp->t_fbyte_out == 0) tp->t_fbyte_out = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } if (!(inp->inp_flags & INP_DROPPED) && !(flags & PRUS_NOTREADY)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; error = tcp_output_nodrop(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } } else { /* * XXXRW: PRUS_EOF not implemented with PRUS_OOB? */ SOCKBUF_LOCK(&so->so_snd); if (sbspace(&so->so_snd) < -512) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOBUFS; goto out; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ sbappendstream_locked(&so->so_snd, m, flags); SOCKBUF_UNLOCK(&so->so_snd); m = NULL; if (nam && tp->t_state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected, * initialize window to default value, and * initialize maxseg using peer's cached MSS. */ /* * Not going to contemplate SYN|URG */ if (IS_FASTOPEN(tp->t_flags)) tp->t_flags &= ~TF_FASTOPEN; #ifdef INET6 if (isipv6) error = tcp6_connect(tp, nam, td); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET error = tcp_connect(tp, (struct sockaddr *)sinp, td); #endif /* * The bind operation in tcp_connect succeeded. We * no longer want to restore the flags if later * operations fail. */ if (error == 0 || inp->inp_lport != 0) restoreflags = false; if (error != 0) { /* m is freed if PRUS_NOTREADY is unset. */ sbflush(&so->so_snd); goto out; } tp->snd_wnd = TTCP_CLIENT_SND_WND; tcp_mss(tp, -1); } tp->snd_up = tp->snd_una + sbavail(&so->so_snd); if ((flags & PRUS_NOTREADY) == 0) { tp->t_flags |= TF_FORCEDATA; error = tcp_output_nodrop(tp); tp->t_flags &= ~TF_FORCEDATA; } } TCP_LOG_EVENT(tp, NULL, &inp->inp_socket->so_rcv, &inp->inp_socket->so_snd, TCP_LOG_USERSEND, error, 0, NULL, false); out: /* * In case of PRUS_NOTREADY, the caller or tcp_usr_ready() is * responsible for freeing memory. */ if (m != NULL && (flags & PRUS_NOTREADY) == 0) m_freem(m); /* * If the request was unsuccessful and we changed flags, * restore the original flags. */ if (error != 0 && restoreflags) { inp->inp_vflag = vflagsav; inp->inp_inc.inc_flags = incflagsav; } TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB : ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); error = tcp_unlock_or_drop(tp, error); NET_EPOCH_EXIT(et); return (error); } static int tcp_usr_ready(struct socket *so, struct mbuf *m, int count) { struct epoch_tracker et; struct inpcb *inp; struct tcpcb *tp; int error; inp = sotoinpcb(so); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); mb_free_notready(m, count); return (ECONNRESET); } tp = intotcpcb(inp); SOCKBUF_LOCK(&so->so_snd); error = sbready(&so->so_snd, m, count); SOCKBUF_UNLOCK(&so->so_snd); if (error) { INP_WUNLOCK(inp); return (error); } NET_EPOCH_ENTER(et); error = tcp_output_unlock(tp); NET_EPOCH_EXIT(et); return (error); } /* * Abort the TCP. Drop the connection abruptly. */ static void tcp_usr_abort(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); NET_EPOCH_ENTER(et); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_abort: inp_socket == NULL")); /* * If we still have full TCP state, and we're not dropped, drop. */ if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); TCPDEBUG1(); tp = tcp_drop(tp, ECONNABORTED); if (tp == NULL) goto dropped; TCPDEBUG2(PRU_ABORT); TCP_PROBE2(debug__user, tp, PRU_ABORT); } if (!(inp->inp_flags & INP_DROPPED)) { SOCK_LOCK(so); so->so_state |= SS_PROTOREF; SOCK_UNLOCK(so); inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); dropped: NET_EPOCH_EXIT(et); } /* * TCP socket is closed. Start friendly disconnect. */ static void tcp_usr_close(struct socket *so) { struct inpcb *inp; struct tcpcb *tp = NULL; struct epoch_tracker et; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); NET_EPOCH_ENTER(et); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_close: inp_socket == NULL")); /* * If we still have full TCP state, and we're not dropped, initiate * a disconnect. */ if (!(inp->inp_flags & INP_TIMEWAIT) && !(inp->inp_flags & INP_DROPPED)) { tp = intotcpcb(inp); TCPDEBUG1(); tcp_disconnect(tp); TCPDEBUG2(PRU_CLOSE); TCP_PROBE2(debug__user, tp, PRU_CLOSE); } if (!(inp->inp_flags & INP_DROPPED)) { SOCK_LOCK(so); so->so_state |= SS_PROTOREF; SOCK_UNLOCK(so); inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); } static int tcp_pru_options_support(struct tcpcb *tp, int flags) { /* * If the specific TCP stack has a pru_options * specified then it does not always support * all the PRU_XX options and we must ask it. * If the function is not specified then all * of the PRU_XX options are supported. */ int ret = 0; if (tp->t_fb->tfb_pru_options) { ret = (*tp->t_fb->tfb_pru_options)(tp, flags); } return (ret); } /* * Receive out-of-band data. */ static int tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) { int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; TCPDEBUG0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNRESET; goto out; } tp = intotcpcb(inp); error = tcp_pru_options_support(tp, PRUS_OOB); if (error) { goto out; } TCPDEBUG1(); if ((so->so_oobmark == 0 && (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || tp->t_oobflags & TCPOOB_HADDATA) { error = EINVAL; goto out; } if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { error = EWOULDBLOCK; goto out; } m->m_len = 1; *mtod(m, caddr_t) = tp->t_iobc; if ((flags & MSG_PEEK) == 0) tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); out: TCPDEBUG2(PRU_RCVOOB); TCP_PROBE2(debug__user, tp, PRU_RCVOOB); INP_WUNLOCK(inp); return (error); } #ifdef INET struct pr_usrreqs tcp_usrreqs = { .pru_abort = tcp_usr_abort, .pru_accept = tcp_usr_accept, .pru_attach = tcp_usr_attach, .pru_bind = tcp_usr_bind, .pru_connect = tcp_usr_connect, .pru_control = in_control, .pru_detach = tcp_usr_detach, .pru_disconnect = tcp_usr_disconnect, .pru_listen = tcp_usr_listen, .pru_peeraddr = in_getpeeraddr, .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, .pru_ready = tcp_usr_ready, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in_getsockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = tcp_usr_close, }; #endif /* INET */ #ifdef INET6 struct pr_usrreqs tcp6_usrreqs = { .pru_abort = tcp_usr_abort, .pru_accept = tcp6_usr_accept, .pru_attach = tcp_usr_attach, .pru_bind = tcp6_usr_bind, .pru_connect = tcp6_usr_connect, .pru_control = in6_control, .pru_detach = tcp_usr_detach, .pru_disconnect = tcp_usr_disconnect, .pru_listen = tcp6_usr_listen, .pru_peeraddr = in6_mapped_peeraddr, .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, .pru_send = tcp_usr_send, .pru_ready = tcp_usr_ready, .pru_shutdown = tcp_usr_shutdown, .pru_sockaddr = in6_mapped_sockaddr, .pru_sosetlabel = in_pcbsosetlabel, .pru_close = tcp_usr_close, }; #endif /* INET6 */ #ifdef INET /* * Common subroutine to open a TCP connection to remote host specified * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local * port number if needed. Call in_pcbconnect_setup to do the routing and * to choose a local host address (interface). If there is an existing * incarnation of the same connection in TIME-WAIT state and if the remote * host was sending CC options and if the connection duration was < MSL, then * truncate the previous TIME-WAIT state and proceed. * Initialize connection parameters and enter SYN-SENT state. */ static int tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = tp->t_inpcb, *oinp; struct socket *so = inp->inp_socket; struct in_addr laddr; u_short lport; int error; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(&V_tcbinfo); if (V_tcp_require_unique_port && inp->inp_lport == 0) { error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) goto out; } /* * Cannot simply call in_pcbconnect, because there might be an * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. */ laddr = inp->inp_laddr; lport = inp->inp_lport; error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); if (error && oinp == NULL) goto out; if (oinp) { error = EADDRINUSE; goto out; } /* Handle initial bind if it hadn't been done in advance. */ if (inp->inp_lport == 0) { inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { inp->inp_lport = 0; error = EAGAIN; goto out; } } inp->inp_laddr = laddr; in_pcbrehash(inp); INP_HASH_WUNLOCK(&V_tcbinfo); /* * Compute window scaling to request: * Scale to fit into sweet spot. See tcp_syncache.c. * XXX: This should move to tcp_output(). */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; soisconnecting(so); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(&inp->inp_inc); if (tp->t_flags & TF_REQ_TSTMP) tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc); tcp_sendseqinit(tp); return 0; out: INP_HASH_WUNLOCK(&V_tcbinfo); return (error); } #endif /* INET */ #ifdef INET6 static int tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) { struct inpcb *inp = tp->t_inpcb; int error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(&V_tcbinfo); if (V_tcp_require_unique_port && inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); if (error) goto out; } error = in6_pcbconnect(inp, nam, td->td_ucred); if (error != 0) goto out; INP_HASH_WUNLOCK(&V_tcbinfo); /* Compute window scaling to request. */ while (tp->request_r_scale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << tp->request_r_scale) < sb_max) tp->request_r_scale++; soisconnecting(inp->inp_socket); TCPSTAT_INC(tcps_connattempt); tcp_state_change(tp, TCPS_SYN_SENT); tp->iss = tcp_new_isn(&inp->inp_inc); if (tp->t_flags & TF_REQ_TSTMP) tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc); tcp_sendseqinit(tp); return 0; out: INP_HASH_WUNLOCK(&V_tcbinfo); return error; } #endif /* INET6 */ /* * Export TCP internal state information via a struct tcp_info, based on the * Linux 2.6 API. Not ABI compatible as our constants are mapped differently * (TCP state machine, etc). We export all information using FreeBSD-native * constants -- for example, the numeric values for tcpi_state will differ * from Linux. */ static void tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) { INP_WLOCK_ASSERT(tp->t_inpcb); bzero(ti, sizeof(*ti)); ti->tcpi_state = tp->t_state; if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tp->t_flags & TF_SACK_PERMIT) ti->tcpi_options |= TCPI_OPT_SACK; if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { ti->tcpi_options |= TCPI_OPT_WSCALE; ti->tcpi_snd_wscale = tp->snd_scale; ti->tcpi_rcv_wscale = tp->rcv_scale; } if (tp->t_flags2 & TF2_ECN_PERMIT) ti->tcpi_options |= TCPI_OPT_ECN; ti->tcpi_rto = tp->t_rxtcur * tick; ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT; ti->tcpi_snd_ssthresh = tp->snd_ssthresh; ti->tcpi_snd_cwnd = tp->snd_cwnd; /* * FreeBSD-specific extension fields for tcp_info. */ ti->tcpi_rcv_space = tp->rcv_wnd; ti->tcpi_rcv_nxt = tp->rcv_nxt; ti->tcpi_snd_wnd = tp->snd_wnd; ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */ ti->tcpi_snd_nxt = tp->snd_nxt; ti->tcpi_snd_mss = tp->t_maxseg; ti->tcpi_rcv_mss = tp->t_maxseg; ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; ti->tcpi_rcv_ooopack = tp->t_rcvoopack; ti->tcpi_snd_zerowin = tp->t_sndzerowin; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { ti->tcpi_options |= TCPI_OPT_TOE; tcp_offload_tcp_info(tp, ti); } #endif } /* * tcp_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. */ #define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do { \ INP_WLOCK(inp); \ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ INP_WUNLOCK(inp); \ cleanup; \ return (ECONNRESET); \ } \ tp = intotcpcb(inp); \ } while(0) #define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */) -static int +int tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) { - struct tcpcb *tp = intotcpcb(inp); + struct socket *so = inp->inp_socket; + struct tcpcb *tp = intotcpcb(inp); int error = 0; MPASS(sopt->sopt_dir == SOPT_SET); INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, + ("inp_flags == %x", inp->inp_flags)); + KASSERT(so != NULL, ("inp_socket == NULL")); if (sopt->sopt_level != IPPROTO_TCP) { INP_WUNLOCK(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) - error = ip6_ctloutput(inp->inp_socket, sopt); + error = ip6_ctloutput(so, sopt); #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET - error = ip_ctloutput(inp->inp_socket, sopt); + error = ip_ctloutput(so, sopt); #endif /* * When an IP-level socket option affects TCP, pass control * down to stack tfb_tcp_ctloutput, otherwise return what * IP level returned. */ switch (sopt->sopt_level) { #ifdef INET6 case IPPROTO_IPV6: if ((inp->inp_vflag & INP_IPV6PROTO) == 0) return (error); switch (sopt->sopt_name) { case IPV6_TCLASS: /* Notify tcp stacks that care (e.g. RACK). */ break; case IPV6_USE_MIN_MTU: /* Update t_maxseg accordingly. */ break; default: return (error); } break; #endif #ifdef INET case IPPROTO_IP: switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos &= ~IPTOS_ECN_MASK; break; case IP_TTL: /* Notify tcp stacks that care (e.g. RACK). */ break; default: return (error); } break; #endif default: return (error); } INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } } else if (sopt->sopt_name == TCP_FUNCTION_BLK) { /* * Protect the TCP option TCP_FUNCTION_BLK so * that a sub-function can *never* overwrite this. */ struct tcp_function_set fsn; struct tcp_function_block *blk; INP_WUNLOCK(inp); error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn); if (error) return (error); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); blk = find_and_ref_tcp_functions(&fsn); if (blk == NULL) { INP_WUNLOCK(inp); return (ENOENT); } if (tp->t_fb == blk) { /* You already have this */ refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return (0); } if (tp->t_state != TCPS_CLOSED) { /* * The user has advanced the state * past the initial point, we may not * be able to switch. */ if (blk->tfb_tcp_handoff_ok != NULL) { /* * Does the stack provide a * query mechanism, if so it may * still be possible? */ error = (*blk->tfb_tcp_handoff_ok)(tp); } else error = EINVAL; if (error) { refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return(error); } } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { refcount_release(&blk->tfb_refcnt); INP_WUNLOCK(inp); return (ENOENT); } /* * Release the old refcnt, the * lookup acquired a ref on the * new one already. */ if (tp->t_fb->tfb_tcp_fb_fini) { struct epoch_tracker et; /* * Tell the stack to cleanup with 0 i.e. * the tcb is not going away. */ NET_EPOCH_ENTER(et); (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); NET_EPOCH_EXIT(et); } #ifdef TCPHPTS /* Assure that we are not on any hpts */ tcp_hpts_remove(tp->t_inpcb); #endif if (blk->tfb_tcp_fb_init) { error = (*blk->tfb_tcp_fb_init)(tp); if (error) { refcount_release(&blk->tfb_refcnt); if (tp->t_fb->tfb_tcp_fb_init) { if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) { /* Fall back failed, drop the connection */ INP_WUNLOCK(inp); - soabort(inp->inp_socket); - return(error); + soabort(so); + return (error); } } goto err_out; } } refcount_release(&tp->t_fb->tfb_refcnt); tp->t_fb = blk; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, sopt->sopt_name); } #endif err_out: INP_WUNLOCK(inp); return (error); } - tp = intotcpcb(inp); - /* Pass in the INP locked, callee must unlock it. */ return (tp->t_fb->tfb_tcp_ctloutput(inp, sopt)); } static int tcp_ctloutput_get(struct inpcb *inp, struct sockopt *sopt) { - int error = 0; - struct tcpcb *tp; + struct socket *so = inp->inp_socket; + struct tcpcb *tp = intotcpcb(inp); + int error = 0; MPASS(sopt->sopt_dir == SOPT_GET); INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, + ("inp_flags == %x", inp->inp_flags)); + KASSERT(so != NULL, ("inp_socket == NULL")); if (sopt->sopt_level != IPPROTO_TCP) { INP_WUNLOCK(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) - error = ip6_ctloutput(inp->inp_socket, sopt); + error = ip6_ctloutput(so, sopt); #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET - error = ip_ctloutput(inp->inp_socket, sopt); + error = ip_ctloutput(so, sopt); #endif return (error); } - tp = intotcpcb(inp); if (((sopt->sopt_name == TCP_FUNCTION_BLK) || (sopt->sopt_name == TCP_FUNCTION_ALIAS))) { struct tcp_function_set fsn; if (sopt->sopt_name == TCP_FUNCTION_ALIAS) { memset(&fsn, 0, sizeof(fsn)); find_tcp_function_alias(tp->t_fb, &fsn); } else { strncpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX); fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; } fsn.pcbcnt = tp->t_fb->tfb_refcnt; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &fsn, sizeof fsn); return (error); } /* Pass in the INP locked, callee must unlock it. */ return (tp->t_fb->tfb_tcp_ctloutput(inp, sopt)); } int tcp_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } if (sopt->sopt_dir == SOPT_SET) return (tcp_ctloutput_set(inp, sopt)); else if (sopt->sopt_dir == SOPT_GET) return (tcp_ctloutput_get(inp, sopt)); else panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir); } /* * If this assert becomes untrue, we need to change the size of the buf * variable in tcp_default_ctloutput(). */ #ifdef CTASSERT CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN); CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN); #endif #ifdef KERN_TLS static int copyin_tls_enable(struct sockopt *sopt, struct tls_enable *tls) { struct tls_enable_v0 tls_v0; int error; if (sopt->sopt_valsize == sizeof(tls_v0)) { error = sooptcopyin(sopt, &tls_v0, sizeof(tls_v0), sizeof(tls_v0)); if (error) return (error); memset(tls, 0, sizeof(*tls)); tls->cipher_key = tls_v0.cipher_key; tls->iv = tls_v0.iv; tls->auth_key = tls_v0.auth_key; tls->cipher_algorithm = tls_v0.cipher_algorithm; tls->cipher_key_len = tls_v0.cipher_key_len; tls->iv_len = tls_v0.iv_len; tls->auth_algorithm = tls_v0.auth_algorithm; tls->auth_key_len = tls_v0.auth_key_len; tls->flags = tls_v0.flags; tls->tls_vmajor = tls_v0.tls_vmajor; tls->tls_vminor = tls_v0.tls_vminor; return (0); } return (sooptcopyin(sopt, tls, sizeof(*tls), sizeof(*tls))); } #endif extern struct cc_algo newreno_cc_algo; static int tcp_congestion(struct inpcb *inp, struct sockopt *sopt) { struct cc_algo *algo; void *ptr = NULL; struct tcpcb *tp; struct cc_var cc_mem; char buf[TCP_CA_NAME_MAX]; size_t mem_sz; int error; INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1); if (error) return(error); buf[sopt->sopt_valsize] = '\0'; CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) if (strncmp(buf, algo->name, TCP_CA_NAME_MAX) == 0) { if (algo->flags & CC_MODULE_BEING_REMOVED) { /* We can't "see" modules being unloaded */ continue; } break; } if (algo == NULL) { CC_LIST_RUNLOCK(); return(ESRCH); } do_over: if (algo->cb_init != NULL) { /* We can now pre-get the memory for the CC */ mem_sz = (*algo->cc_data_sz)(); if (mem_sz == 0) { goto no_mem_needed; } CC_LIST_RUNLOCK(); ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK); CC_LIST_RLOCK(); STAILQ_FOREACH(algo, &cc_list, entries) if (strncmp(buf, algo->name, TCP_CA_NAME_MAX) == 0) break; if (algo == NULL) { if (ptr) free(ptr, M_CC_MEM); CC_LIST_RUNLOCK(); return(ESRCH); } } else { no_mem_needed: mem_sz = 0; ptr = NULL; } /* * Make sure its all clean and zero and also get * back the inplock. */ memset(&cc_mem, 0, sizeof(cc_mem)); if (mem_sz != (*algo->cc_data_sz)()) { if (ptr) free(ptr, M_CC_MEM); goto do_over; } INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); CC_LIST_RUNLOCK(); free(ptr, M_CC_MEM); return (ECONNRESET); } tp = intotcpcb(inp); if (ptr != NULL) memset(ptr, 0, mem_sz); CC_LIST_RUNLOCK(); cc_mem.ccvc.tcp = tp; /* * We once again hold a write lock over the tcb so it's * safe to do these things without ordering concerns. * Note here we init into stack memory. */ if (algo->cb_init != NULL) error = algo->cb_init(&cc_mem, ptr); else error = 0; /* * The CC algorithms, when given their memory * should not fail we could in theory have a * KASSERT here. */ if (error == 0) { /* * Touchdown, lets go ahead and move the * connection to the new CC module by * copying in the cc_mem after we call * the old ones cleanup (if any). */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(tp->ccv); memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var)); tp->cc_algo = algo; /* Ok now are we where we have gotten past any conn_init? */ if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) { /* Yep run the connection init for the new CC */ CC_ALGO(tp)->conn_init(tp->ccv); } } else if (ptr) free(ptr, M_CC_MEM); INP_WUNLOCK(inp); return (error); } int tcp_default_ctloutput(struct inpcb *inp, struct sockopt *sopt) { - struct tcpcb *tp; + struct socket *so = inp->inp_socket; + struct tcpcb *tp = intotcpcb(inp); int error, opt, optval; u_int ui; struct tcp_info ti; #ifdef KERN_TLS struct tls_enable tls; #endif char *pbuf, buf[TCP_LOG_ID_LEN]; #ifdef STATS struct statsblob *sbp; #endif size_t len; INP_WLOCK_ASSERT(inp); + KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0, + ("inp_flags == %x", inp->inp_flags)); + KASSERT(so != NULL, ("inp_socket == NULL")); - tp = intotcpcb(inp); switch (sopt->sopt_level) { #ifdef INET6 case IPPROTO_IPV6: MPASS(inp->inp_vflag & INP_IPV6PROTO); switch (sopt->sopt_name) { case IPV6_USE_MIN_MTU: tcp6_use_min_mtu(tp); /* FALLTHROUGH */ } INP_WUNLOCK(inp); return (0); #endif #ifdef INET case IPPROTO_IP: INP_WUNLOCK(inp); return (0); #endif } /* * For TCP_CCALGOOPT forward the control to CC module, for both * SOPT_SET and SOPT_GET. */ switch (sopt->sopt_name) { case TCP_CCALGOOPT: INP_WUNLOCK(inp); if (sopt->sopt_valsize > CC_ALGOOPT_LIMIT) return (EINVAL); pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO); error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize, sopt->sopt_valsize); if (error) { free(pbuf, M_TEMP); return (error); } INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP)); if (CC_ALGO(tp)->ctl_output != NULL) error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf); else error = ENOENT; INP_WUNLOCK(inp); if (error == 0 && sopt->sopt_dir == SOPT_GET) error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize); free(pbuf, M_TEMP); return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) case TCP_MD5SIG: if (!TCPMD5_ENABLED()) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = TCPMD5_PCBCTL(inp, sopt); if (error) return (error); goto unlock_and_done; #endif /* IPSEC */ case TCP_NODELAY: case TCP_NOOPT: case TCP_LRD: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { case TCP_NODELAY: opt = TF_NODELAY; break; case TCP_NOOPT: opt = TF_NOOPT; break; case TCP_LRD: opt = TF_LRD; break; default: opt = 0; /* dead code to fool gcc */ break; } if (optval) tp->t_flags |= opt; else tp->t_flags &= ~opt; unlock_and_done: #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, sopt->sopt_name); } #endif INP_WUNLOCK(inp); break; case TCP_NOPUSH: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval) tp->t_flags |= TF_NOPUSH; else if (tp->t_flags & TF_NOPUSH) { tp->t_flags &= ~TF_NOPUSH; if (TCPS_HAVEESTABLISHED(tp->t_state)) { struct epoch_tracker et; NET_EPOCH_ENTER(et); error = tcp_output_nodrop(tp); NET_EPOCH_EXIT(et); } } goto unlock_and_done; case TCP_REMOTE_UDP_ENCAPS_PORT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); if ((optval < TCP_TUNNELING_PORT_MIN) || (optval > TCP_TUNNELING_PORT_MAX)) { /* Its got to be in range */ return (EINVAL); } if ((V_tcp_udp_tunneling_port == 0) && (optval != 0)) { /* You have to have enabled a UDP tunneling port first */ return (EINVAL); } INP_WLOCK_RECHECK(inp); if (tp->t_state != TCPS_CLOSED) { /* You can't change after you are connected */ error = EINVAL; } else { /* Ok we are all good set the port */ tp->t_port = htons(optval); } goto unlock_and_done; case TCP_MAXSEG: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval > 0 && optval <= tp->t_maxseg && optval + 40 >= V_tcp_minmss) tp->t_maxseg = optval; else error = EINVAL; goto unlock_and_done; case TCP_INFO: INP_WUNLOCK(inp); error = EINVAL; break; case TCP_STATS: INP_WUNLOCK(inp); #ifdef STATS error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); if (optval > 0) sbp = stats_blob_alloc( V_tcp_perconn_stats_dflt_tpl, 0); else sbp = NULL; INP_WLOCK_RECHECK(inp); if ((tp->t_stats != NULL && sbp == NULL) || (tp->t_stats == NULL && sbp != NULL)) { struct statsblob *t = tp->t_stats; tp->t_stats = sbp; sbp = t; } INP_WUNLOCK(inp); stats_blob_destroy(sbp); #else return (EOPNOTSUPP); #endif /* !STATS */ break; case TCP_CONGESTION: error = tcp_congestion(inp, sopt); break; case TCP_REUSPORT_LB_NUMA: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); INP_WLOCK_RECHECK(inp); if (!error) error = in_pcblbgroup_numa(inp, optval); INP_WUNLOCK(inp); break; #ifdef KERN_TLS case TCP_TXTLS_ENABLE: INP_WUNLOCK(inp); error = copyin_tls_enable(sopt, &tls); if (error) break; - error = ktls_enable_tx(inp->inp_socket, &tls); + error = ktls_enable_tx(so, &tls); break; case TCP_TXTLS_MODE: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); INP_WLOCK_RECHECK(inp); - error = ktls_set_tx_mode(inp->inp_socket, ui); + error = ktls_set_tx_mode(so, ui); INP_WUNLOCK(inp); break; case TCP_RXTLS_ENABLE: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &tls, sizeof(tls), sizeof(tls)); if (error) break; - error = ktls_enable_rx(inp->inp_socket, &tls); + error = ktls_enable_rx(so, &tls); break; #endif case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); if (ui > (UINT_MAX / hz)) { error = EINVAL; break; } ui *= hz; INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { case TCP_KEEPIDLE: tp->t_keepidle = ui; /* * XXX: better check current remaining * timeout and "merge" it with new value. */ if ((tp->t_state > TCPS_LISTEN) && (tp->t_state <= TCPS_CLOSING)) tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); break; case TCP_KEEPINTVL: tp->t_keepintvl = ui; if ((tp->t_state == TCPS_FIN_WAIT_2) && (TP_MAXIDLE(tp) > 0)) tcp_timer_activate(tp, TT_2MSL, TP_MAXIDLE(tp)); break; case TCP_KEEPINIT: tp->t_keepinit = ui; if (tp->t_state == TCPS_SYN_RECEIVED || tp->t_state == TCPS_SYN_SENT) tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); break; } goto unlock_and_done; case TCP_KEEPCNT: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); if (error) return (error); INP_WLOCK_RECHECK(inp); tp->t_keepcnt = ui; if ((tp->t_state == TCPS_FIN_WAIT_2) && (TP_MAXIDLE(tp) > 0)) tcp_timer_activate(tp, TT_2MSL, TP_MAXIDLE(tp)); goto unlock_and_done; #ifdef TCPPCAP case TCP_PCAP_OUT: case TCP_PCAP_IN: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); if (optval >= 0) tcp_pcap_set_sock_max(TCP_PCAP_OUT ? &(tp->t_outpkts) : &(tp->t_inpkts), optval); else error = EINVAL; goto unlock_and_done; #endif case TCP_FASTOPEN: { struct tcp_fastopen tfo_optval; INP_WUNLOCK(inp); if (!V_tcp_fastopen_client_enable && !V_tcp_fastopen_server_enable) return (EPERM); error = sooptcopyin(sopt, &tfo_optval, sizeof(tfo_optval), sizeof(int)); if (error) return (error); INP_WLOCK_RECHECK(inp); if ((tp->t_state != TCPS_CLOSED) && (tp->t_state != TCPS_LISTEN)) { error = EINVAL; goto unlock_and_done; } if (tfo_optval.enable) { if (tp->t_state == TCPS_LISTEN) { if (!V_tcp_fastopen_server_enable) { error = EPERM; goto unlock_and_done; } if (tp->t_tfo_pending == NULL) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); } else { /* * If a pre-shared key was provided, * stash it in the client cookie * field of the tcpcb for use during * connect. */ if (sopt->sopt_valsize == sizeof(tfo_optval)) { memcpy(tp->t_tfo_cookie.client, tfo_optval.psk, TCP_FASTOPEN_PSK_LEN); tp->t_tfo_client_cookie_len = TCP_FASTOPEN_PSK_LEN; } } tp->t_flags |= TF_FASTOPEN; } else tp->t_flags &= ~TF_FASTOPEN; goto unlock_and_done; } #ifdef TCP_BLACKBOX case TCP_LOG: INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); INP_WLOCK_RECHECK(inp); error = tcp_log_state_change(tp, optval); goto unlock_and_done; case TCP_LOGBUF: INP_WUNLOCK(inp); error = EINVAL; break; case TCP_LOGID: INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0); if (error) break; buf[sopt->sopt_valsize] = '\0'; INP_WLOCK_RECHECK(inp); error = tcp_log_set_id(tp, buf); /* tcp_log_set_id() unlocks the INP. */ break; case TCP_LOGDUMP: case TCP_LOGDUMPID: INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0); if (error) break; buf[sopt->sopt_valsize] = '\0'; INP_WLOCK_RECHECK(inp); if (sopt->sopt_name == TCP_LOGDUMP) { error = tcp_log_dump_tp_logbuf(tp, buf, M_WAITOK, true); INP_WUNLOCK(inp); } else { tcp_log_dump_tp_bucket_logbufs(tp, buf); /* * tcp_log_dump_tp_bucket_logbufs() drops the * INP lock. */ } break; #endif default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: tp = intotcpcb(inp); switch (sopt->sopt_name) { #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) case TCP_MD5SIG: if (!TCPMD5_ENABLED()) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = TCPMD5_PCBCTL(inp, sopt); break; #endif case TCP_NODELAY: optval = tp->t_flags & TF_NODELAY; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_MAXSEG: optval = tp->t_maxseg; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_REMOTE_UDP_ENCAPS_PORT: optval = ntohs(tp->t_port); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOOPT: optval = tp->t_flags & TF_NOOPT; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_NOPUSH: optval = tp->t_flags & TF_NOPUSH; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; case TCP_INFO: tcp_fill_info(tp, &ti); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ti, sizeof ti); break; case TCP_STATS: { #ifdef STATS int nheld; TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0; error = 0; socklen_t outsbsz = sopt->sopt_valsize; if (tp->t_stats == NULL) error = ENOENT; else if (outsbsz >= tp->t_stats->cursz) outsbsz = tp->t_stats->cursz; else if (outsbsz >= sizeof(struct statsblob)) outsbsz = sizeof(struct statsblob); else error = EINVAL; INP_WUNLOCK(inp); if (error) break; sbp = sopt->sopt_val; nheld = atop(round_page(((vm_offset_t)sbp) + (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp)); vm_page_t ma[nheld]; if (vm_fault_quick_hold_pages( &curproc->p_vmspace->vm_map, (vm_offset_t)sbp, outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma, nheld) < 0) { error = EFAULT; break; } if ((error = copyin_nofault(&(sbp->flags), &sbflags, SIZEOF_MEMBER(struct statsblob, flags)))) goto unhold; INP_WLOCK_RECHECK(inp); error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats, sbflags | SB_CLONE_USRDSTNOFAULT); INP_WUNLOCK(inp); sopt->sopt_valsize = outsbsz; unhold: vm_page_unhold_pages(ma, nheld); #else INP_WUNLOCK(inp); error = EOPNOTSUPP; #endif /* !STATS */ break; } case TCP_CONGESTION: len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); INP_WUNLOCK(inp); error = sooptcopyout(sopt, buf, len + 1); break; case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: case TCP_KEEPCNT: switch (sopt->sopt_name) { case TCP_KEEPIDLE: ui = TP_KEEPIDLE(tp) / hz; break; case TCP_KEEPINTVL: ui = TP_KEEPINTVL(tp) / hz; break; case TCP_KEEPINIT: ui = TP_KEEPINIT(tp) / hz; break; case TCP_KEEPCNT: ui = TP_KEEPCNT(tp); break; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ui, sizeof(ui)); break; #ifdef TCPPCAP case TCP_PCAP_OUT: case TCP_PCAP_IN: optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ? &(tp->t_outpkts) : &(tp->t_inpkts)); INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif case TCP_FASTOPEN: optval = tp->t_flags & TF_FASTOPEN; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; #ifdef TCP_BLACKBOX case TCP_LOG: optval = tp->t_logstate; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case TCP_LOGBUF: /* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */ error = tcp_log_getlogbuf(sopt, tp); break; case TCP_LOGID: len = tcp_log_get_id(tp, buf); INP_WUNLOCK(inp); error = sooptcopyout(sopt, buf, len + 1); break; case TCP_LOGDUMP: case TCP_LOGDUMPID: INP_WUNLOCK(inp); error = EINVAL; break; #endif #ifdef KERN_TLS case TCP_TXTLS_MODE: - error = ktls_get_tx_mode(inp->inp_socket, &optval); + error = ktls_get_tx_mode(so, &optval); INP_WUNLOCK(inp); if (error == 0) error = sooptcopyout(sopt, &optval, sizeof(optval)); break; case TCP_RXTLS_MODE: - error = ktls_get_rx_mode(inp->inp_socket, &optval); + error = ktls_get_rx_mode(so, &optval); INP_WUNLOCK(inp); if (error == 0) error = sooptcopyout(sopt, &optval, sizeof(optval)); break; #endif case TCP_LRD: optval = tp->t_flags & TF_LRD; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #undef INP_WLOCK_RECHECK #undef INP_WLOCK_RECHECK_CLEANUP /* * Initiate (or continue) disconnect. * If embryonic state, just send reset (once). * If in ``let data drain'' option and linger null, just drop. * Otherwise (hard), mark socket disconnecting and drop * current input data; switch states based on user close, and * send segment to peer (with FIN). */ static void tcp_disconnect(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); /* * Neither tcp_close() nor tcp_drop() should return NULL, as the * socket is still open. */ if (tp->t_state < TCPS_ESTABLISHED && !(tp->t_state > TCPS_LISTEN && IS_FASTOPEN(tp->t_flags))) { tp = tcp_close(tp); KASSERT(tp != NULL, ("tcp_disconnect: tcp_close() returned NULL")); } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { tp = tcp_drop(tp, 0); KASSERT(tp != NULL, ("tcp_disconnect: tcp_drop() returned NULL")); } else { soisdisconnecting(so); sbflush(&so->so_rcv); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) /* Ignore stack's drop request, we already at it. */ (void)tcp_output_nodrop(tp); } } /* * User issued close, and wish to trail through shutdown states: * if never received SYN, just forget it. If got a SYN from peer, * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. * If already got a FIN from peer, then almost done; go to LAST_ACK * state. In all other cases, have already sent FIN to peer (e.g. * after PRU_SHUTDOWN), and just have to play tedious game waiting * for peer to send FIN or not respond to keep-alives, etc. * We can let the user exit from the close as soon as the FIN is acked. */ static void tcp_usrclosed(struct tcpcb *tp) { NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tp->t_inpcb); switch (tp->t_state) { case TCPS_LISTEN: #ifdef TCP_OFFLOAD tcp_offload_listen_stop(tp); #endif tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ case TCPS_CLOSED: tp = tcp_close(tp); /* * tcp_close() should never return NULL here as the socket is * still open. */ KASSERT(tp != NULL, ("tcp_usrclosed: tcp_close() returned NULL")); break; case TCPS_SYN_SENT: case TCPS_SYN_RECEIVED: tp->t_flags |= TF_NEEDFIN; break; case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_FIN_WAIT_1); break; case TCPS_CLOSE_WAIT: tcp_state_change(tp, TCPS_LAST_ACK); break; } if (tp->t_state >= TCPS_FIN_WAIT_2) { soisdisconnected(tp->t_inpcb->inp_socket); /* Prevent the connection hanging in FIN_WAIT_2 forever. */ if (tp->t_state == TCPS_FIN_WAIT_2) { int timeout; timeout = (tcp_fast_finwait2_recycle) ? tcp_finwait2_timeout : TP_MAXIDLE(tp); tcp_timer_activate(tp, TT_2MSL, timeout); } } } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_tstate(int t_state) { switch (t_state) { case TCPS_CLOSED: db_printf("TCPS_CLOSED"); return; case TCPS_LISTEN: db_printf("TCPS_LISTEN"); return; case TCPS_SYN_SENT: db_printf("TCPS_SYN_SENT"); return; case TCPS_SYN_RECEIVED: db_printf("TCPS_SYN_RECEIVED"); return; case TCPS_ESTABLISHED: db_printf("TCPS_ESTABLISHED"); return; case TCPS_CLOSE_WAIT: db_printf("TCPS_CLOSE_WAIT"); return; case TCPS_FIN_WAIT_1: db_printf("TCPS_FIN_WAIT_1"); return; case TCPS_CLOSING: db_printf("TCPS_CLOSING"); return; case TCPS_LAST_ACK: db_printf("TCPS_LAST_ACK"); return; case TCPS_FIN_WAIT_2: db_printf("TCPS_FIN_WAIT_2"); return; case TCPS_TIME_WAIT: db_printf("TCPS_TIME_WAIT"); return; default: db_printf("unknown"); return; } } static void db_print_tflags(u_int t_flags) { int comma; comma = 0; if (t_flags & TF_ACKNOW) { db_printf("%sTF_ACKNOW", comma ? ", " : ""); comma = 1; } if (t_flags & TF_DELACK) { db_printf("%sTF_DELACK", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NODELAY) { db_printf("%sTF_NODELAY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOOPT) { db_printf("%sTF_NOOPT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SENTFIN) { db_printf("%sTF_SENTFIN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_REQ_SCALE) { db_printf("%sTF_REQ_SCALE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RCVD_SCALE) { db_printf("%sTF_RECVD_SCALE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_REQ_TSTMP) { db_printf("%sTF_REQ_TSTMP", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RCVD_TSTMP) { db_printf("%sTF_RCVD_TSTMP", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SACK_PERMIT) { db_printf("%sTF_SACK_PERMIT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NEEDSYN) { db_printf("%sTF_NEEDSYN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NEEDFIN) { db_printf("%sTF_NEEDFIN", comma ? ", " : ""); comma = 1; } if (t_flags & TF_NOPUSH) { db_printf("%sTF_NOPUSH", comma ? ", " : ""); comma = 1; } if (t_flags & TF_MORETOCOME) { db_printf("%sTF_MORETOCOME", comma ? ", " : ""); comma = 1; } if (t_flags & TF_LQ_OVERFLOW) { db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : ""); comma = 1; } if (t_flags & TF_LASTIDLE) { db_printf("%sTF_LASTIDLE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_RXWIN0SENT) { db_printf("%sTF_RXWIN0SENT", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FASTRECOVERY) { db_printf("%sTF_FASTRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_CONGRECOVERY) { db_printf("%sTF_CONGRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_WASFRECOVERY) { db_printf("%sTF_WASFRECOVERY", comma ? ", " : ""); comma = 1; } if (t_flags & TF_SIGNATURE) { db_printf("%sTF_SIGNATURE", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FORCEDATA) { db_printf("%sTF_FORCEDATA", comma ? ", " : ""); comma = 1; } if (t_flags & TF_TSO) { db_printf("%sTF_TSO", comma ? ", " : ""); comma = 1; } if (t_flags & TF_FASTOPEN) { db_printf("%sTF_FASTOPEN", comma ? ", " : ""); comma = 1; } } static void db_print_tflags2(u_int t_flags2) { int comma; comma = 0; if (t_flags2 & TF2_ECN_PERMIT) { db_printf("%sTF2_ECN_PERMIT", comma ? ", " : ""); comma = 1; } } static void db_print_toobflags(char t_oobflags) { int comma; comma = 0; if (t_oobflags & TCPOOB_HAVEDATA) { db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : ""); comma = 1; } if (t_oobflags & TCPOOB_HADDATA) { db_printf("%sTCPOOB_HADDATA", comma ? ", " : ""); comma = 1; } } static void db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, tp); indent += 2; db_print_indent(indent); db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); db_print_indent(indent); db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep); db_print_indent(indent); db_printf("tt_2msl: %p tt_delack: %p t_inpcb: %p\n", &tp->t_timers->tt_2msl, &tp->t_timers->tt_delack, tp->t_inpcb); db_print_indent(indent); db_printf("t_state: %d (", tp->t_state); db_print_tstate(tp->t_state); db_printf(")\n"); db_print_indent(indent); db_printf("t_flags: 0x%x (", tp->t_flags); db_print_tflags(tp->t_flags); db_printf(")\n"); db_print_indent(indent); db_printf("t_flags2: 0x%x (", tp->t_flags2); db_print_tflags2(tp->t_flags2); db_printf(")\n"); db_print_indent(indent); db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: x0%08x\n", tp->snd_una, tp->snd_max, tp->snd_nxt); db_print_indent(indent); db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n", tp->snd_up, tp->snd_wl1, tp->snd_wl2); db_print_indent(indent); db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n", tp->iss, tp->irs, tp->rcv_nxt); db_print_indent(indent); db_printf("rcv_adv: 0x%08x rcv_wnd: %u rcv_up: 0x%08x\n", tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); db_print_indent(indent); db_printf("snd_wnd: %u snd_cwnd: %u\n", tp->snd_wnd, tp->snd_cwnd); db_print_indent(indent); db_printf("snd_ssthresh: %u snd_recover: " "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); db_printf("t_rcvtime: %u t_startime: %u\n", tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); db_printf("t_rttime: %u t_rtsq: 0x%08x\n", tp->t_rtttime, tp->t_rtseq); db_print_indent(indent); db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n", tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); db_print_indent(indent); db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin, tp->t_rttbest); db_print_indent(indent); db_printf("t_rttupdated: %lu max_sndwnd: %u t_softerror: %d\n", tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror); db_print_indent(indent); db_printf("t_oobflags: 0x%x (", tp->t_oobflags); db_print_toobflags(tp->t_oobflags); db_printf(") t_iobc: 0x%02x\n", tp->t_iobc); db_print_indent(indent); db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n", tp->snd_scale, tp->rcv_scale, tp->request_r_scale); db_print_indent(indent); db_printf("ts_recent: %u ts_recent_age: %u\n", tp->ts_recent, tp->ts_recent_age); db_print_indent(indent); db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: " "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev); db_print_indent(indent); db_printf("snd_ssthresh_prev: %u snd_recover_prev: 0x%08x " "t_badrxtwin: %u\n", tp->snd_ssthresh_prev, tp->snd_recover_prev, tp->t_badrxtwin); db_print_indent(indent); db_printf("snd_numholes: %d snd_holes first: %p\n", tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes)); db_print_indent(indent); db_printf("snd_fack: 0x%08x rcv_numsacks: %d\n", tp->snd_fack, tp->rcv_numsacks); /* Skip sackblks, sackhint. */ db_print_indent(indent); db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n", tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt); } DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) { struct tcpcb *tp; if (!have_addr) { db_printf("usage: show tcpcb \n"); return; } tp = (struct tcpcb *)addr; db_print_tcpcb(tp, "tcpcb", 0); } #endif diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 4f2db050799e..7fff6984f672 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1,1298 +1,1299 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 * $FreeBSD$ */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include #include #ifdef _KERNEL #include "opt_kern_tls.h" #include #include #include #endif #define TCP_END_BYTE_INFO 8 /* Bytes that makeup the "end information array" */ /* Types of ending byte info */ #define TCP_EI_EMPTY_SLOT 0 #define TCP_EI_STATUS_CLIENT_FIN 0x1 #define TCP_EI_STATUS_CLIENT_RST 0x2 #define TCP_EI_STATUS_SERVER_FIN 0x3 #define TCP_EI_STATUS_SERVER_RST 0x4 #define TCP_EI_STATUS_RETRAN 0x5 #define TCP_EI_STATUS_PROGRESS 0x6 #define TCP_EI_STATUS_PERSIST_MAX 0x7 #define TCP_EI_STATUS_KEEP_MAX 0x8 #define TCP_EI_STATUS_DATA_A_CLOSE 0x9 #define TCP_EI_STATUS_RST_IN_FRONT 0xa #define TCP_EI_STATUS_2MSL 0xb #define TCP_EI_STATUS_MAX_VALUE 0xb /************************************************/ /* Status bits we track to assure no duplicates, * the bits here are not used by the code but * for human representation. To check a bit we * take and shift over by 1 minus the value (1-8). */ /************************************************/ #define TCP_EI_BITS_CLIENT_FIN 0x001 #define TCP_EI_BITS_CLIENT_RST 0x002 #define TCP_EI_BITS_SERVER_FIN 0x004 #define TCP_EI_BITS_SERVER_RST 0x008 #define TCP_EI_BITS_RETRAN 0x010 #define TCP_EI_BITS_PROGRESS 0x020 #define TCP_EI_BITS_PRESIST_MAX 0x040 #define TCP_EI_BITS_KEEP_MAX 0x080 #define TCP_EI_BITS_DATA_A_CLO 0x100 #define TCP_EI_BITS_RST_IN_FR 0x200 /* a front state reset */ #define TCP_EI_BITS_2MS_TIMER 0x400 /* 2 MSL timer expired */ #if defined(_KERNEL) || defined(_WANT_TCPCB) /* TCP segment queue entry */ struct tseg_qent { TAILQ_ENTRY(tseg_qent) tqe_q; struct mbuf *tqe_m; /* mbuf contains packet */ struct mbuf *tqe_last; /* last mbuf in chain */ tcp_seq tqe_start; /* TCP Sequence number start */ int tqe_len; /* TCP segment data length */ uint32_t tqe_flags; /* The flags from tcp_get_flags() */ uint32_t tqe_mbuf_cnt; /* Count of mbuf overhead */ }; TAILQ_HEAD(tsegqe_head, tseg_qent); struct sackblk { tcp_seq start; /* start seq no. of sack block */ tcp_seq end; /* end seq no. */ }; struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int32_t sack_bytes_rexmit; tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ int32_t delivered_data; /* Newly acked data from last SACK */ int32_t sacked_bytes; /* Total sacked bytes reported by the * receiver via sack option */ uint32_t recover_fs; /* Flight Size at the start of Loss recovery */ uint32_t prr_delivered; /* Total bytes delivered using PRR */ uint32_t prr_out; /* Bytes sent during IN_RECOVERY */ }; #define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq) STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); /* * Tcp control block, one per tcp; fields: * Organized for 64 byte cacheline efficiency based * on common tcp_input/tcp_output processing. */ struct tcpcb { /* Cache line 1 */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ struct tcp_function_block *t_fb;/* TCP function call block */ void *t_fb_ptr; /* Pointer to t_fb specific data */ uint32_t t_maxseg:24, /* maximum segment size */ t_logstate:8; /* State of "black box" logging */ uint32_t t_port:16, /* Tunneling (over udp) port */ t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ t_fin_is_rst: 1, /* Are fin's treated as resets */ t_log_state_set: 1, bits_spare : 2; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ uint32_t snd_wnd; /* send window */ uint32_t snd_cwnd; /* congestion-controlled window */ uint32_t t_peakrate_thr; /* pre-calculated peak rate threshold */ /* Cache line 2 */ u_int32_t ts_offset; /* our timestamp offset */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rcv_numsacks; /* # distinct sack blks present */ u_int t_tsomax; /* TSO total burst length limit in bytes */ u_int t_tsomaxsegcount; /* TSO maximum segment count */ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ tcp_seq rcv_nxt; /* receive next */ tcp_seq rcv_adv; /* advertised window */ uint32_t rcv_wnd; /* receive window */ u_int t_flags2; /* More tcpcb flags storage */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ u_int32_t ts_recent; /* timestamp echo data */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char snd_limited; /* segments limited transmitted */ u_char request_r_scale; /* pending window scaling */ tcp_seq last_ack_sent; u_int t_rcvtime; /* inactivity time */ /* Cache line 3 */ tcp_seq rcv_up; /* receive urgent pointer */ int t_segqlen; /* segment reassembly queue length */ uint32_t t_segqmbuflen; /* Count of bytes mbufs on all entries */ struct tsegqe_head t_segq; /* segment reassembly queue */ struct mbuf *t_in_pkt; struct mbuf *t_tail_pkt; struct tcp_timer *t_timers; /* All the TCP timers in one struct */ struct vnet *t_vnet; /* back pointer to parent vnet */ uint32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ tcp_seq snd_wl1; /* window update seg seq number */ /* Cache line 4 */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq irs; /* initial receive sequence number */ tcp_seq iss; /* initial send sequence number */ u_int t_acktime; /* RACK and BBR incoming new data was acked */ u_int t_sndtime; /* time last data was sent */ u_int ts_recent_age; /* when last updated */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ uint16_t cl4_spare; /* Spare to adjust CL 4 */ char t_oobflags; /* have some */ char t_iobc; /* input character */ int t_rxtcur; /* current retransmit value (ticks) */ int t_rxtshift; /* log(2) of rexmt exp. backoff */ u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ u_int t_starttime; /* time connection was established */ u_int t_fbyte_in; /* ticks time when first byte queued in */ u_int t_fbyte_out; /* ticks time when first byte queued out */ u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ int t_blackhole_enter; /* when to enter blackhole detection */ int t_blackhole_exit; /* when to exit blackhole detection */ u_int t_rttmin; /* minimum rtt allowed */ u_int t_rttbest; /* best rtt we've seen */ int t_softerror; /* possible error not yet reported */ uint32_t max_sndwnd; /* largest window peer has offered */ /* Cache line 5 */ uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */ uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ int t_sndzerowin; /* zero-window updates sent */ u_long t_rttupdated; /* number of times rtt sampled */ int snd_numholes; /* number of holes seen by sender */ u_int t_badrxtwin; /* window for retransmit recovery */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observerved RTT */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ struct toedev *tod; /* toedev handling this connection */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ struct cc_algo *cc_algo; /* congestion control algorithm */ struct cc_var *ccv; /* congestion control specific vars */ struct osd *osd; /* storage for Khelp module data */ int t_bytes_acked; /* # bytes acked during current RTT */ u_int t_maxunacktime; u_int t_keepinit; /* time to establish connection */ u_int t_keepidle; /* time before keepalive probes begin */ u_int t_keepintvl; /* interval between keepalives */ u_int t_keepcnt; /* number of keepalives before close */ int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ int t_loglimit; /* Maximum number of log entries */ int64_t t_pacing_rate; /* bytes / sec, -1 => unlimited */ struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; const char *t_output_caller; /* Function that called tcp_output */ struct statsblob *t_stats; /* Per-connection stats */ uint32_t t_logsn; /* Log "serial number" */ uint32_t gput_ts; /* Time goodput measurement started */ tcp_seq gput_seq; /* Outbound measurement seq */ tcp_seq gput_ack; /* Inbound measurement ack */ int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */ uint32_t t_maxpeakrate; /* max peak rate set by user, in bytes/s */ uint32_t t_sndtlppack; /* tail loss probe packets sent */ uint64_t t_sndtlpbyte; /* total tail loss probe bytes sent */ uint64_t t_sndbytes; /* total bytes sent */ uint64_t t_snd_rxt_bytes; /* total bytes retransmitted */ uint32_t t_dsack_bytes; /* Total number of dsack bytes we have received */ uint32_t t_dsack_tlp_bytes; /* Total number of dsack bytes we have received for TLPs sent */ uint32_t t_dsack_pack; /* Total dsack packets we have recieved */ uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */ uint32_t t_end_info_status; /* Status flag of end info */ unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */ union { uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN]; uint64_t server; } t_tfo_cookie; /* TCP Fast Open cookie to send */ union { uint8_t t_end_info_bytes[TCP_END_BYTE_INFO]; uint64_t t_end_info; }; #ifdef TCPPCAP struct mbufq t_inpkts; /* List of saved input packets. */ struct mbufq t_outpkts; /* List of saved output packets. */ #endif }; #endif /* _KERNEL || _WANT_TCPCB */ #ifdef _KERNEL struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; /* Enable TCP/UDP tunneling port */ #define TCP_TUNNELING_PORT_MIN 0 #define TCP_TUNNELING_PORT_MAX 65535 #define TCP_TUNNELING_PORT_DEFAULT 0 /* Enable TCP/UDP tunneling port */ #define TCP_TUNNELING_OVERHEAD_MIN sizeof(struct udphdr) #define TCP_TUNNELING_OVERHEAD_MAX 1024 #define TCP_TUNNELING_OVERHEAD_DEFAULT TCP_TUNNELING_OVERHEAD_MIN /* Minimum map entries limit value, if set */ #define TCP_MIN_MAP_ENTRIES_LIMIT 128 /* * TODO: We yet need to brave plowing in * to tcp_input() and the pru_usrreq() block. * Right now these go to the old standards which * are somewhat ok, but in the long term may * need to be changed. If we do tackle tcp_input() * then we need to get rid of the tcp_do_segment() * function below. */ /* Flags for tcp functions */ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ #define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */ /* * If defining the optional tcp_timers, in the * tfb_tcp_timer_stop call you must use the * callout_async_drain() function with the * tcp_timer_discard callback. You should check * the return of callout_async_drain() and if 0 * increment tt_draincnt. Since the timer sub-system * does not know your callbacks you must provide a * stop_all function that loops through and calls * tcp_timer_stop() with each of your defined timers. * Adding a tfb_tcp_handoff_ok function allows the socket * option to change stacks to query you even if the * connection is in a later stage. You return 0 to * say you can take over and run your stack, you return * non-zero (an error number) to say no you can't. * If the function is undefined you can only change * in the early states (before connect or listen). * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being * destroyed, a zero indicates its transitioning to * another stack (via socket option). */ struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *); void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int); int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int, struct timeval *); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int, struct timeval *); int (*tfb_tcp_ctloutput)(struct inpcb *inp, struct sockopt *sopt); /* Optional memory allocation/free routine */ int (*tfb_tcp_fb_init)(struct tcpcb *); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); void (*tfb_tcp_timer_activate)(struct tcpcb *, uint32_t, u_int); int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t); void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t); void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); void (*tfb_tcp_mtu_chg)(struct tcpcb *); int (*tfb_pru_options)(struct tcpcb *, int); void (*tfb_hwtls_change)(struct tcpcb *, int); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; uint8_t tfb_id; }; struct tcp_function { TAILQ_ENTRY(tcp_function) tf_next; char tf_name[TCP_FUNCTION_NAME_LEN_MAX]; struct tcp_function_block *tf_fb; }; TAILQ_HEAD(tcp_funchead, tcp_function); struct tcpcb * tcp_drop(struct tcpcb *, int); #ifdef _NETINET_IN_PCB_H_ /* * tcp_output() * Handles tcp_drop request from advanced stacks and reports that inpcb is * gone with negative return code. * Drop in replacement for the default stack. */ static inline int tcp_output(struct tcpcb *tp) { int rv; INP_WLOCK_ASSERT(tp->t_inpcb); rv = tp->t_fb->tfb_tcp_output(tp); if (rv < 0) { KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); tp = tcp_drop(tp, -rv); if (tp) INP_WUNLOCK(tp->t_inpcb); } return (rv); } /* * tcp_output_unlock() * Always returns unlocked, handles drop request from advanced stacks. * Always returns positive error code. */ static inline int tcp_output_unlock(struct tcpcb *tp) { int rv; INP_WLOCK_ASSERT(tp->t_inpcb); rv = tp->t_fb->tfb_tcp_output(tp); if (rv < 0) { KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); rv = -rv; tp = tcp_drop(tp, rv); if (tp) INP_WUNLOCK(tp->t_inpcb); } else INP_WUNLOCK(tp->t_inpcb); return (rv); } /* * tcp_output_nodrop() * Always returns locked. It is caller's responsibility to run tcp_drop()! * Useful in syscall implementations, when we want to perform some logging * and/or tracing with tcpcb before calling tcp_drop(). To be used with * tcp_unlock_or_drop() later. * * XXXGL: maybe don't allow stacks to return a drop request at certain * TCP states? Why would it do in connect(2)? In recv(2)? */ static inline int tcp_output_nodrop(struct tcpcb *tp) { int rv; INP_WLOCK_ASSERT(tp->t_inpcb); rv = tp->t_fb->tfb_tcp_output(tp); KASSERT(rv >= 0 || tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, ("TCP stack %s requested tcp_drop(%p)", tp->t_fb->tfb_tcp_block_name, tp)); return (rv); } /* * tcp_unlock_or_drop() * Handle return code from tfb_tcp_output() after we have logged/traced, * to be used with tcp_output_nodrop(). */ static inline int tcp_unlock_or_drop(struct tcpcb *tp, int tcp_output_retval) { INP_WLOCK_ASSERT(tp->t_inpcb); if (tcp_output_retval < 0) { tcp_output_retval = -tcp_output_retval; if (tcp_drop(tp, tcp_output_retval) != NULL) INP_WUNLOCK(tp->t_inpcb); } else INP_WUNLOCK(tp->t_inpcb); return (tcp_output_retval); } #endif /* _NETINET_IN_PCB_H_ */ #endif /* _KERNEL */ /* * Flags and utility macros for the t_flags field. */ #define TF_ACKNOW 0x00000001 /* ack peer immediately */ #define TF_DELACK 0x00000002 /* ack, but try to delay it */ #define TF_NODELAY 0x00000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x00000008 /* don't use tcp options */ #define TF_SENTFIN 0x00000010 /* have sent FIN */ #define TF_REQ_SCALE 0x00000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x00000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x00000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x00000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x00000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x00000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x00000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x00001000 /* don't push */ #define TF_PREVVALID 0x00002000 /* saved values for bad rxmit valid * Note: accessing and restoring from * these may only be done in the 1st * RTO recovery round (t_rxtshift == 1) */ #define TF_WAKESOR 0x00004000 /* wake up receive socket */ #define TF_GPUTINPROG 0x00008000 /* Goodput measurement in progress */ #define TF_MORETOCOME 0x00010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x00020000 /* listen queue overflow */ #define TF_LASTIDLE 0x00040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x00080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x00100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x00200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x00400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x00800000 /* force out a byte */ #define TF_TSO 0x01000000 /* TSO enabled on this connection */ #define TF_TOE 0x02000000 /* this connection is offloaded */ #define TF_UNUSED0 0x04000000 /* unused */ #define TF_UNUSED1 0x08000000 /* unused */ #define TF_LRD 0x10000000 /* Lost Retransmission Detection */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ #define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY #define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY) #define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY #define EXIT_CONGRECOVERY(t_flags) t_flags &= ~TF_CONGRECOVERY #define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY)) #define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY) #define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY) #if defined(_KERNEL) && !defined(TCP_RFC7413) #define IS_FASTOPEN(t_flags) (false) #else #define IS_FASTOPEN(t_flags) (t_flags & TF_FASTOPEN) #endif #define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una) /* * Flags for the t_oobflags field. */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 /* * Flags for the extended TCP flags field, t_flags2 */ #define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */ #define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */ #define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */ #define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */ #define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */ #define TF2_ECN_PERMIT 0x00000020 /* connection ECN-ready */ #define TF2_ECN_SND_CWR 0x00000040 /* ECN CWR in queue */ #define TF2_ECN_SND_ECE 0x00000080 /* ECN ECE in queue */ #define TF2_ACE_PERMIT 0x00000100 /* Accurate ECN mode */ #define TF2_FBYTES_COMPLETE 0x00000400 /* We have first bytes in and out */ /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. * It's basically used to reduce the number of parameters * to tcp_dooptions and tcp_addoptions. * The binary order of the to_flags is relevant for packing of the * options in tcp_addoptions. */ struct tcpopt { u_int32_t to_flags; /* which options are present */ #define TOF_MSS 0x0001 /* maximum segment size */ #define TOF_SCALE 0x0002 /* window scaling */ #define TOF_SACKPERM 0x0004 /* SACK permitted */ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ #define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */ #define TOF_MAXOPT 0x0200 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ u_char *to_signature; /* pointer to the TCP-MD5 signature */ u_int8_t *to_tfo_cookie; /* pointer to the TFO cookie */ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ u_int8_t to_tfo_len; /* TFO cookie length */ u_int32_t to_spare; /* UTO */ }; /* * Flags for tcp_dooptions. */ #define TO_SYN 0x01 /* parse SYN-only options */ struct hc_metrics_lite { /* must stay in sync with hc_metrics */ uint32_t rmx_mtu; /* MTU for this path */ uint32_t rmx_ssthresh; /* outbound gateway buffer limit */ uint32_t rmx_rtt; /* estimated round trip time */ uint32_t rmx_rttvar; /* estimated rtt variance */ uint32_t rmx_cwnd; /* congestion window */ uint32_t rmx_sendpipe; /* outbound delay-bandwidth product */ uint32_t rmx_recvpipe; /* inbound delay-bandwidth product */ }; /* * Used by tcp_maxmtu() to communicate interface specific features * and limits at the time of connection setup. */ struct tcp_ifcap { int ifcap; u_int tsomax; u_int tsomaxsegcount; u_int tsomaxsegsize; }; #ifndef _NETINET_IN_PCB_H_ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ struct tcptw { struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */ uint32_t t_port:16, /* UDP port number if TCPoUDP */ t_unused:16; tcp_seq snd_nxt; tcp_seq rcv_nxt; u_short last_win; /* cached window value */ short tw_so_options; /* copy of so_options */ struct ucred *tw_cred; /* user credentials */ u_int32_t t_recent; u_int32_t ts_offset; /* our timestamp offset */ int tw_time; TAILQ_ENTRY(tcptw) tw_2msl; u_int tw_flags; /* tcpcb t_flags */ }; #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) #define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* * The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). * With these scales, srtt has 3 bits to the right of the binary point, * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the * binary point, and is smoothed with an ALPHA of 0.75. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ #define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ #define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ #define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ /* * The initial retransmission should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This version of the macro adapted from a paper by Lawrence * Brakmo and Larry Peterson which outlines a problem caused * by insufficient precision in the original implementation, * which results in inappropriately large RTO values for very * fast networks. */ #define TCP_REXMTVAL(tp) \ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* * TCP statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct tcpstat { uint64_t tcps_connattempt; /* connections initiated */ uint64_t tcps_accepts; /* connections accepted */ uint64_t tcps_connects; /* connections established */ uint64_t tcps_drops; /* connections dropped */ uint64_t tcps_conndrops; /* embryonic connections dropped */ uint64_t tcps_minmssdrops; /* average minmss too low drops */ uint64_t tcps_closed; /* conn. closed (includes drops) */ uint64_t tcps_segstimed; /* segs where we tried to get rtt */ uint64_t tcps_rttupdated; /* times we succeeded */ uint64_t tcps_delack; /* delayed acks sent */ uint64_t tcps_timeoutdrop; /* conn. dropped in rxmt timeout */ uint64_t tcps_rexmttimeo; /* retransmit timeouts */ uint64_t tcps_persisttimeo; /* persist timeouts */ uint64_t tcps_keeptimeo; /* keepalive timeouts */ uint64_t tcps_keepprobe; /* keepalive probes sent */ uint64_t tcps_keepdrops; /* connections dropped in keepalive */ uint64_t tcps_sndtotal; /* total packets sent */ uint64_t tcps_sndpack; /* data packets sent */ uint64_t tcps_sndbyte; /* data bytes sent */ uint64_t tcps_sndrexmitpack; /* data packets retransmitted */ uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */ uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ uint64_t tcps_sndacks; /* ack-only packets sent */ uint64_t tcps_sndprobe; /* window probes sent */ uint64_t tcps_sndurg; /* packets sent with URG only */ uint64_t tcps_sndwinup; /* window update-only packets sent */ uint64_t tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ uint64_t tcps_rcvtotal; /* total packets received */ uint64_t tcps_rcvpack; /* packets received in sequence */ uint64_t tcps_rcvbyte; /* bytes received in sequence */ uint64_t tcps_rcvbadsum; /* packets received with ccksum errs */ uint64_t tcps_rcvbadoff; /* packets received with bad offset */ uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */ uint64_t tcps_rcvshort; /* packets received too short */ uint64_t tcps_rcvduppack; /* duplicate-only packets received */ uint64_t tcps_rcvdupbyte; /* duplicate-only bytes received */ uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */ uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup. packets */ uint64_t tcps_rcvoopack; /* out-of-order packets received */ uint64_t tcps_rcvoobyte; /* out-of-order bytes received */ uint64_t tcps_rcvpackafterwin; /* packets with data after window */ uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */ uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */ uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */ uint64_t tcps_rcvdupack; /* rcvd duplicate acks */ uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */ uint64_t tcps_rcvackpack; /* rcvd ack packets */ uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */ uint64_t tcps_rcvwinupd; /* rcvd window update packets */ uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */ uint64_t tcps_predack; /* times hdr predict ok for acks */ uint64_t tcps_preddat; /* times hdr predict ok for data pkts */ uint64_t tcps_pcbcachemiss; uint64_t tcps_cachedrtt; /* times cached RTT in route updated */ uint64_t tcps_cachedrttvar; /* times cached rttvar updated */ uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */ uint64_t tcps_usedrtt; /* times RTT initialized from route */ uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */ uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/ uint64_t tcps_persistdrop; /* timeout in persist state */ uint64_t tcps_badsyn; /* bogus SYN, e.g. premature ACK */ uint64_t tcps_mturesent; /* resends due to MTU discovery */ uint64_t tcps_listendrop; /* listen queue overflows */ uint64_t tcps_badrst; /* ignored RSTs in the window */ uint64_t tcps_sc_added; /* entry added to syncache */ uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */ uint64_t tcps_sc_dropped; /* could not reply to packet */ uint64_t tcps_sc_completed; /* successful extraction of entry */ uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */ uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ uint64_t tcps_sc_reset; /* RST removed entry from syncache */ uint64_t tcps_sc_stale; /* timed out or listen socket gone */ uint64_t tcps_sc_aborted; /* syncache entry aborted */ uint64_t tcps_sc_badack; /* removed due to bad ACK */ uint64_t tcps_sc_unreach; /* ICMP unreachable received */ uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ uint64_t tcps_sack_lostrexmt; /* SACK lost retransmission recovered */ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ uint64_t tcps_ecn_ce; /* ECN Congestion Experienced */ uint64_t tcps_ecn_ect0; /* ECN Capable Transport */ uint64_t tcps_ecn_ect1; /* ECN Capable Transport */ uint64_t tcps_ecn_shs; /* ECN successful handshakes */ uint64_t tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ /* TCP_SIGNATURE related stats */ uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */ uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */ uint64_t tcps_sig_err_buildsig; /* Failed to make signature */ uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */ uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */ /* Path MTU Discovery Black Hole Detection related stats */ uint64_t tcps_pmtud_blackhole_activated; /* Black Hole Count */ uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */ uint64_t tcps_pmtud_blackhole_failed; /* Black Hole Failure Count */ uint64_t tcps_tunneled_pkts; /* Packets encap's in UDP received */ uint64_t tcps_tunneled_errs; /* Packets that had errors that were UDP encaped */ /* Dsack related stats */ uint64_t tcps_dsack_count; /* Number of ACKs arriving with DSACKs */ uint64_t tcps_dsack_bytes; /* Number of bytes DSACK'ed no TLP */ uint64_t tcps_dsack_tlp_bytes; /* Number of bytes DSACK'ed due to TLPs */ /* TCPS_TIME_WAIT usage stats */ uint64_t tcps_tw_recycles; /* Times time-wait was recycled. */ uint64_t tcps_tw_resets; /* Times time-wait sent a reset. */ uint64_t tcps_tw_responds; /* Times time-wait sent a valid ack. */ uint64_t _pad[6]; /* 3 UTO, 3 TBD */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL #define TI_UNLOCKED 1 #define TI_RLOCKED 2 #include VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define TCPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_add(int statnum, int val); #define KMOD_TCPSTAT_ADD(name, val) \ kmod_tcpstat_add(offsetof(struct tcpstat, name) / sizeof(uint64_t), val) #define KMOD_TCPSTAT_INC(name) KMOD_TCPSTAT_ADD(name, 1) /* * Running TCP connection count by state. */ VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]); #define V_tcps_states VNET(tcps_states) #define TCPSTATES_INC(state) counter_u64_add(V_tcps_states[state], 1) #define TCPSTATES_DEC(state) counter_u64_add(V_tcps_states[state], -1) /* * TCP specific helper hook point identifiers. */ #define HHOOK_TCP_EST_IN 0 #define HHOOK_TCP_EST_OUT 1 #define HHOOK_TCP_LAST HHOOK_TCP_EST_OUT struct tcp_hhook_data { struct tcpcb *tp; struct tcphdr *th; struct tcpopt *to; uint32_t len; int tso; tcp_seq curack; }; #ifdef TCP_HHOOK void hhook_run_tcp_est_out(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t len, int tso); #endif #endif /* * TCB structure exported to user-land via sysctl(3). * * Fields prefixed with "xt_" are unique to the export structure, and fields * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'. * * Legend: * (s) - used by userland utilities in src * (p) - used by utilities in ports * (3) - is known to be used by third party software not in ports * (n) - no known usage * * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been * included. Not all of our clients do. */ #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_) struct xtcpcb { ksize_t xt_len; /* length of this structure */ struct xinpcb xt_inp; char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */ char xt_logid[TCP_LOG_ID_LEN]; /* (s) */ char xt_cc[TCP_CA_NAME_MAX]; /* (s) */ int64_t spare64[6]; int32_t t_state; /* (s,p) */ uint32_t t_flags; /* (s,p) */ int32_t t_sndzerowin; /* (s) */ int32_t t_sndrexmitpack; /* (s) */ int32_t t_rcvoopack; /* (s) */ int32_t t_rcvtime; /* (s) */ int32_t tt_rexmt; /* (s) */ int32_t tt_persist; /* (s) */ int32_t tt_keep; /* (s) */ int32_t tt_2msl; /* (s) */ int32_t tt_delack; /* (s) */ int32_t t_logstate; /* (3) */ uint32_t t_snd_cwnd; /* (s) */ uint32_t t_snd_ssthresh; /* (s) */ uint32_t t_maxseg; /* (s) */ uint32_t t_rcv_wnd; /* (s) */ uint32_t t_snd_wnd; /* (s) */ uint32_t xt_ecn; /* (s) */ uint32_t t_dsack_bytes; /* (n) */ uint32_t t_dsack_tlp_bytes; /* (n) */ uint32_t t_dsack_pack; /* (n) */ uint16_t xt_encaps_port; /* (s) */ int16_t spare16; int32_t spare32[22]; } __aligned(8); #ifdef _KERNEL void tcp_inptoxtp(const struct inpcb *, struct xtcpcb *); #endif #endif /* * TCP function information (name-to-id mapping, aliases, and refcnt) * exported to user-land via sysctl(3). */ struct tcp_function_info { uint32_t tfi_refcnt; uint8_t tfi_id; char tfi_name[TCP_FUNCTION_NAME_LEN_MAX]; char tfi_alias[TCP_FUNCTION_NAME_LEN_MAX]; }; /* * Identifiers for TCP sysctl nodes */ #define TCPCTL_DO_RFC1323 1 /* use RFC-1323 extensions */ #define TCPCTL_MSSDFLT 3 /* MSS default */ #define TCPCTL_STATS 4 /* statistics */ #define TCPCTL_RTTDFLT 5 /* default RTT estimate */ #define TCPCTL_KEEPIDLE 6 /* keepalive idle timer */ #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* timeout for establishing syn */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_STATES 16 /* connection counts by TCP state */ #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); SYSCTL_DECL(_net_inet_tcp_sack); MALLOC_DECLARE(M_TCPLOG); #endif VNET_DECLARE(int, tcp_log_in_vain); #define V_tcp_log_in_vain VNET(tcp_log_in_vain) /* * Global TCP tunables shared between different stacks. * Please keep the list sorted. */ VNET_DECLARE(int, drop_synfin); VNET_DECLARE(int, path_mtu_discovery); VNET_DECLARE(int, tcp_abc_l_var); VNET_DECLARE(int, tcp_autorcvbuf_max); VNET_DECLARE(int, tcp_autosndbuf_inc); VNET_DECLARE(int, tcp_autosndbuf_max); VNET_DECLARE(int, tcp_delack_enabled); VNET_DECLARE(int, tcp_do_autorcvbuf); VNET_DECLARE(int, tcp_do_autosndbuf); VNET_DECLARE(int, tcp_do_ecn); VNET_DECLARE(int, tcp_do_lrd); VNET_DECLARE(int, tcp_do_prr); VNET_DECLARE(int, tcp_do_prr_conservative); VNET_DECLARE(int, tcp_do_newcwv); VNET_DECLARE(int, tcp_do_rfc1323); VNET_DECLARE(int, tcp_tolerate_missing_ts); VNET_DECLARE(int, tcp_do_rfc3042); VNET_DECLARE(int, tcp_do_rfc3390); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_do_newsack); VNET_DECLARE(int, tcp_do_sack); VNET_DECLARE(int, tcp_do_tso); VNET_DECLARE(int, tcp_ecn_maxretries); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, tcp_insecure_syn); VNET_DECLARE(uint32_t, tcp_map_entries_limit); VNET_DECLARE(uint32_t, tcp_map_split_limit); VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_mssdflt); #ifdef STATS VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl); VNET_DECLARE(int, tcp_perconn_stats_enable); #endif /* STATS */ VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, tcp_sack_globalholes); VNET_DECLARE(int, tcp_sack_globalmaxholes); VNET_DECLARE(int, tcp_sack_maxholes); VNET_DECLARE(int, tcp_sc_rst_sock_fail); VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(int, tcp_udp_tunneling_overhead); VNET_DECLARE(int, tcp_udp_tunneling_port); VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_do_lrd VNET(tcp_do_lrd) #define V_tcp_do_prr VNET(tcp_do_prr) #define V_tcp_do_prr_conservative VNET(tcp_do_prr_conservative) #define V_tcp_do_newcwv VNET(tcp_do_newcwv) #define V_drop_synfin VNET(drop_synfin) #define V_path_mtu_discovery VNET(path_mtu_discovery) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) #define V_tcp_autosndbuf_inc VNET(tcp_autosndbuf_inc) #define V_tcp_autosndbuf_max VNET(tcp_autosndbuf_max) #define V_tcp_delack_enabled VNET(tcp_delack_enabled) #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) #define V_tcp_do_autosndbuf VNET(tcp_do_autosndbuf) #define V_tcp_do_ecn VNET(tcp_do_ecn) #define V_tcp_do_rfc1323 VNET(tcp_do_rfc1323) #define V_tcp_tolerate_missing_ts VNET(tcp_tolerate_missing_ts) #define V_tcp_ts_offset_per_conn VNET(tcp_ts_offset_per_conn) #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) #define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define V_tcp_do_newsack VNET(tcp_do_newsack) #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_do_tso VNET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_insecure_rst VNET(tcp_insecure_rst) #define V_tcp_insecure_syn VNET(tcp_insecure_syn) #define V_tcp_map_entries_limit VNET(tcp_map_entries_limit) #define V_tcp_map_split_limit VNET(tcp_map_split_limit) #define V_tcp_minmss VNET(tcp_minmss) #define V_tcp_mssdflt VNET(tcp_mssdflt) #ifdef STATS #define V_tcp_perconn_stats_dflt_tpl VNET(tcp_perconn_stats_dflt_tpl) #define V_tcp_perconn_stats_enable VNET(tcp_perconn_stats_enable) #endif /* STATS */ #define V_tcp_recvspace VNET(tcp_recvspace) #define V_tcp_sack_globalholes VNET(tcp_sack_globalholes) #define V_tcp_sack_globalmaxholes VNET(tcp_sack_globalmaxholes) #define V_tcp_sack_maxholes VNET(tcp_sack_maxholes) #define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail) #define V_tcp_sendspace VNET(tcp_sendspace) #define V_tcp_udp_tunneling_overhead VNET(tcp_udp_tunneling_overhead) #define V_tcp_udp_tunneling_port VNET(tcp_udp_tunneling_port) #ifdef TCP_HHOOK VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]); #define V_tcp_hhh VNET(tcp_hhh) #endif int tcp_addoptions(struct tcpopt *, u_char *); int tcp_ccalgounload(struct cc_algo *unload_algo); struct tcpcb * tcp_close(struct tcpcb *); void tcp_discardcb(struct tcpcb *); bool tcp_freecb(struct tcpcb *); void tcp_twstart(struct tcpcb *); void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); void tcp_ctlinput_viaudp(int, struct sockaddr *, void *, void *); void tcp_drain(void); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *, const void *); char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *, const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *, struct mbuf *); void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); void tcp_dooptions(struct tcpopt *, u_char *, int, int); void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); void tcp_xmit_timer(struct tcpcb *, int); void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type); void cc_conn_init(struct tcpcb *tp); void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); void cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos); void cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos); void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); #ifdef TCP_HHOOK void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to); #endif int tcp_input(struct mbuf **, int *, int); int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int); int tcp_input_with_port(struct mbuf **, int *, int, uint16_t); void tcp_handle_wakeup(struct tcpcb *, struct socket *); void tcp_do_segment(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); int register_tcp_functions(struct tcp_function_block *blk, int wait); int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names); int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait); int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force); struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs); void tcp_switch_back_to_default(struct tcpcb *tp); struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *fs); int tcp_default_ctloutput(struct inpcb *inp, struct sockopt *sopt); +int tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt); extern counter_u64_t tcp_inp_lro_direct_queue; extern counter_u64_t tcp_inp_lro_wokeup_queue; extern counter_u64_t tcp_inp_lro_compressed; extern counter_u64_t tcp_inp_lro_locks_taken; extern counter_u64_t tcp_extra_mbuf; extern counter_u64_t tcp_would_have_but; extern counter_u64_t tcp_comp_total; extern counter_u64_t tcp_uncomp_total; extern counter_u64_t tcp_bad_csums; #ifdef NETFLIX_EXP_DETECTION /* Various SACK attack thresholds */ extern int32_t tcp_force_detection; extern int32_t tcp_sack_to_ack_thresh; extern int32_t tcp_sack_to_move_thresh; extern int32_t tcp_restoral_thresh; extern int32_t tcp_sad_decay_val; extern int32_t tcp_sad_pacing_interval; extern int32_t tcp_sad_low_pps; extern int32_t tcp_map_minimum; extern int32_t tcp_attack_on_turns_on_logging; #endif extern uint32_t tcp_ack_war_time_window; extern uint32_t tcp_ack_war_cnt; uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); void tcp6_use_min_mtu(struct tcpcb *); u_int tcp_maxseg(const struct tcpcb *); u_int tcp_fixed_maxseg(const struct tcpcb *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); int tcp_mssopt(struct in_conninfo *); struct inpcb * tcp_drop_syn_sent(struct inpcb *, int); struct tcpcb * tcp_newtcpcb(struct inpcb *); int tcp_default_output(struct tcpcb *); void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); void tcp_tw_init(void); #ifdef VIMAGE void tcp_tw_destroy(void); #endif void tcp_tw_zone_change(void); int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); void tcp_setpersist(struct tcpcb *); void tcp_slowtimo(void); void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp); struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, uint16_t, void *, void *); void tcp_timer_activate(struct tcpcb *, uint32_t, u_int); int tcp_timer_suspend(struct tcpcb *, uint32_t); void tcp_timers_unsuspend(struct tcpcb *, uint32_t); int tcp_timer_active(struct tcpcb *, uint32_t); void tcp_timer_stop(struct tcpcb *, uint32_t); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); int inp_to_cpuid(struct inpcb *inp); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ void tcp_hc_init(void); #ifdef VIMAGE void tcp_hc_destroy(void); #endif void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); uint32_t tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, uint32_t); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct pr_usrreqs tcp_usrreqs; uint32_t tcp_new_ts_offset(struct in_conninfo *); tcp_seq tcp_new_isn(struct in_conninfo *); int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); int tcp_dsack_block_exists(struct tcpcb *); void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_dsack_blocks(struct tcpcb *tp); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *); void tcp_lost_retransmission(struct tcpcb *, struct tcphdr *); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); void tcp_sack_lost_retransmission(struct tcpcb *, struct tcphdr *); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); uint32_t tcp_compute_initwnd(uint32_t); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, size_t seed_len); int tcp_can_enable_pacing(void); void tcp_decrement_paced_conn(void); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls); int tcp_stats_init(void); void tcp_log_end_status(struct tcpcb *tp, uint8_t status); static inline void tcp_fields_to_host(struct tcphdr *th) { th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); } static inline void tcp_fields_to_net(struct tcphdr *th) { th->th_seq = htonl(th->th_seq); th->th_ack = htonl(th->th_ack); th->th_win = htons(th->th_win); th->th_urp = htons(th->th_urp); } static inline uint16_t tcp_get_flags(const struct tcphdr *th) { return (((uint16_t)th->th_x2 << 8) | th->th_flags); } static inline void tcp_set_flags(struct tcphdr *th, uint16_t flags) { th->th_x2 = (flags >> 8) & 0x0f; th->th_flags = flags & 0xff; } static inline void tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt, uint8_t is_tlp, int hw_tls) { if (is_tlp) { tp->t_sndtlppack++; tp->t_sndtlpbyte += len; } /* To get total bytes sent you must add t_snd_rxt_bytes to t_sndbytes */ if (is_rxt) tp->t_snd_rxt_bytes += len; else tp->t_sndbytes += len; #ifdef KERN_TLS if (hw_tls && is_rxt && len != 0) { uint64_t rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL * (tp->t_snd_rxt_bytes + tp->t_sndbytes)); if (rexmit_percent > ktls_ifnet_max_rexmit_pct) ktls_disable_ifnet(tp); } #endif } #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */