diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index ea8bbea1b5ff..70aaca21a20f 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -1,3339 +1,3351 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #ifdef INET #include #include #endif #include #include #ifdef TCPHPTS #include #endif #include #include #ifdef INET6 #include #include #include #include #endif /* INET6 */ #include #endif #include #include #define INPCBLBGROUP_SIZMIN 8 #define INPCBLBGROUP_SIZMAX 256 #define INP_FREED 0x00000200 /* See in_pcb.h. */ /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Enable random ephemeral port allocation by default. 
*/ VNET_DEFINE(int, ipport_randomized) = 1; #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp, uint8_t numa_domain); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP Ports"); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); #ifdef RATELIMIT counter_u64_t rate_limit_new; counter_u64_t rate_limit_chg; counter_u64_t rate_limit_active; counter_u64_t rate_limit_alloc_fail; counter_u64_t rate_limit_set_ok; static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "IP Rate Limiting"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, &rate_limit_active, "Active rate limited connections"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, &rate_limit_alloc_fail, "Rate limited connection failures"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, &rate_limit_set_ok, "Rate limited setting succeeded"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, &rate_limit_new, "Total Rate limit new attempts"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, &rate_limit_chg, "Total Rate limited change attempts"); #endif /* RATELIMIT */ #endif /* INET */ VNET_DEFINE(uint32_t, in_pcbhashseed); static void in_pcbhashseed_init(void) { V_in_pcbhashseed = arc4random(); } VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, in_pcbhashseed_init, 0); static void in_pcbremhash(struct inpcb *); /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. 
*/ static struct inpcblbgroup * in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred, u_char vflag, uint16_t port, const union in_dependaddr *addr, int size, uint8_t numa_domain) { struct inpcblbgroup *grp; size_t bytes; bytes = __offsetof(struct inpcblbgroup, il_inp[size]); grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); if (grp == NULL) return (NULL); grp->il_cred = crhold(cred); grp->il_vflag = vflag; grp->il_lport = port; grp->il_numa_domain = numa_domain; grp->il_dependladdr = *addr; grp->il_inpsiz = size; CK_LIST_INSERT_HEAD(hdr, grp, il_list); return (grp); } static void in_pcblbgroup_free_deferred(epoch_context_t ctx) { struct inpcblbgroup *grp; grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); crfree(grp->il_cred); free(grp, M_PCB); } static void in_pcblbgroup_free(struct inpcblbgroup *grp) { CK_LIST_REMOVE(grp, il_list); NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); } static struct inpcblbgroup * in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, struct inpcblbgroup *old_grp, int size) { struct inpcblbgroup *grp; int i; grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag, old_grp->il_lport, &old_grp->il_dependladdr, size, old_grp->il_numa_domain); if (grp == NULL) return (NULL); KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, ("invalid new local group size %d and old local group count %d", grp->il_inpsiz, old_grp->il_inpcnt)); for (i = 0; i < old_grp->il_inpcnt; ++i) grp->il_inp[i] = old_grp->il_inp[i]; grp->il_inpcnt = old_grp->il_inpcnt; in_pcblbgroup_free(old_grp); return (grp); } /* * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] * and shrink group if possible. 
*/ static void in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, int i) { struct inpcblbgroup *grp, *new_grp; grp = *grpp; for (; i + 1 < grp->il_inpcnt; ++i) grp->il_inp[i] = grp->il_inp[i + 1]; grp->il_inpcnt--; if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && grp->il_inpcnt <= grp->il_inpsiz / 4) { /* Shrink this group. */ new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); if (new_grp != NULL) *grpp = new_grp; } } /* * Add PCB to load balance group for SO_REUSEPORT_LB option. */ static int in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) { const static struct timeval interval = { 60, 0 }; static struct timeval lastprint; struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); #ifdef INET6 /* * Don't allow IPv4 mapped INET6 wild socket. */ if ((inp->inp_vflag & INP_IPV4) && inp->inp_laddr.s_addr == INADDR_ANY && INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { return (0); } #endif idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; CK_LIST_FOREACH(grp, hdr, il_list) { if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison && grp->il_vflag == inp->inp_vflag && grp->il_lport == inp->inp_lport && grp->il_numa_domain == numa_domain && memcmp(&grp->il_dependladdr, &inp->inp_inc.inc_ie.ie_dependladdr, sizeof(grp->il_dependladdr)) == 0) { break; } } if (grp == NULL) { /* Create new load balance group. */ grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag, inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, INPCBLBGROUP_SIZMIN, numa_domain); if (grp == NULL) return (ENOBUFS); } else if (grp->il_inpcnt == grp->il_inpsiz) { if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { if (ratecheck(&lastprint, &interval)) printf("lb group port %d, limit reached\n", ntohs(grp->il_lport)); return (0); } /* Expand this local group. 
*/ grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); if (grp == NULL) return (ENOBUFS); } KASSERT(grp->il_inpcnt < grp->il_inpsiz, ("invalid local group size %d and count %d", grp->il_inpsiz, grp->il_inpcnt)); grp->il_inp[grp->il_inpcnt] = inp; grp->il_inpcnt++; return (0); } /* * Remove PCB from load balance group. */ static void in_pcbremlbgrouphash(struct inpcb *inp) { struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; int i; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; CK_LIST_FOREACH(grp, hdr, il_list) { for (i = 0; i < grp->il_inpcnt; ++i) { if (grp->il_inp[i] != inp) continue; if (grp->il_inpcnt == 1) { /* We are the last, free this local group. */ in_pcblbgroup_free(grp); } else { /* Pull up inpcbs, shrink group if possible. */ in_pcblbgroup_reorder(hdr, &grp, i); } return; } } } int in_pcblbgroup_numa(struct inpcb *inp, int arg) { struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; int err, i; uint8_t numa_domain; switch (arg) { case TCP_REUSPORT_LB_NUMA_NODOM: numa_domain = M_NODOM; break; case TCP_REUSPORT_LB_NUMA_CURDOM: numa_domain = PCPU_GET(domain); break; default: if (arg < 0 || arg >= vm_ndomains) return (EINVAL); numa_domain = arg; } err = 0; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; CK_LIST_FOREACH(grp, hdr, il_list) { for (i = 0; i < grp->il_inpcnt; ++i) { if (grp->il_inp[i] != inp) continue; if (grp->il_numa_domain == numa_domain) { goto abort_with_hash_wlock; } /* Remove it from the old group. */ in_pcbremlbgrouphash(inp); /* Add it to the new group based on numa domain. 
*/ in_pcbinslbgrouphash(inp, numa_domain); goto abort_with_hash_wlock; } } err = ENOENT; abort_with_hash_wlock: INP_HASH_WUNLOCK(pcbinfo); return (err); } /* Make sure it is safe to use hashinit(9) on CK_LIST. */ CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); /* * Initialize an inpcbinfo - a per-VNET instance of connections db. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, u_int hash_nelements, u_int porthash_nelements) { mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, NULL, MTX_DEF); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif CK_LIST_INIT(&pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); pcbinfo->ipi_zone = pcbstor->ips_zone; pcbinfo->ipi_portzone = pcbstor->ips_portzone; pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); } /* * Destroy an inpcbinfo. */ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, pcbinfo->ipi_lbgrouphashmask); mtx_destroy(&pcbinfo->ipi_hash_lock); mtx_destroy(&pcbinfo->ipi_lock); } /* * Initialize a pcbstorage - per protocol zones to allocate inpcbs. 
*/
static void inpcb_dtor(void *, int, void *);
static void inpcb_fini(void *, int);
/*
 * 'arg' is a struct inpcbstorage.  Creates the inpcb zone (SMR-enabled, with
 * the storage's ips_pcbinit as init hook) and the inpcbport zone, and makes
 * the port zone share the inpcb zone's SMR state.
 */
void
in_pcbstorage_init(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
	    sizeof(struct inpcb), NULL, inpcb_dtor, pcbstor->ips_pcbinit,
	    inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
	pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
	    sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	uma_zone_set_smr(pcbstor->ips_portzone,
	    uma_zone_get_smr(pcbstor->ips_zone));
}

/*
 * Destroy a pcbstorage - used by unloadable protocols.
 * Both zones created by in_pcbstorage_init() are destroyed.
 */
void
in_pcbstorage_destroy(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	uma_zdestroy(pcbstor->ips_zone);
	uma_zdestroy(pcbstor->ips_portzone);
}

/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * Returns 0 on success, ENOBUFS when the zone allocation fails, or the
 * error from mac_inpcb_init() / ipsec_init_pcbpolicy() when those are
 * compiled in and fail; on those paths the PCB is returned to the zone.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
	/* Only the MAC/IPSEC setup steps can fail after allocation. */
	int error;
#endif

	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Clear inp_zero_size bytes starting at inp_start_zero. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		/* Undo the MAC label set up just above. */
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
	/*
	 * Address family flags.  Note the preprocessor-spliced if/else: with
	 * both INET and INET6 the trailing INP_IPV4 assignment is the "else"
	 * arm of the AF_INET6 test; with INET only it is unconditional.
	 */
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
		else
			inp->inp_vflag |= INP_IPV4;
#endif
		if (V_ip6_auto_flowlabel)
			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
		inp->in6p_hops = -1;	/* use kernel default */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		inp->inp_vflag |= INP_IPV4;
#endif
	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
#ifdef TCPHPTS
	/*
	 * If using hpts lets drop a random number in so
	 * not all new connections fall on the same CPU.
	 */
	inp->inp_hpts_cpu = hpts_random_cpu(inp);
#endif
	refcount_init(&inp->inp_refcount, 1);	/* Reference from socket. */
	/* The write lock taken here is still held when we return. */
	INP_WLOCK(inp);
	/* Publish on the pcbinfo list and stamp a fresh generation count. */
	INP_INFO_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	/* Failure before the PCB became visible: give it back to the zone. */
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}

#ifdef INET
/*
 * Bind an unbound IPv4 PCB to the address/port in 'nam' ('nam' may be NULL).
 *
 * Returns EINVAL if the PCB already has a local port or address, the error
 * from in_pcbbind_setup() if that fails, and EAGAIN (after resetting the
 * local address and port) if in_pcbinshash() fails.  INP_ANONPORT is set
 * when no sockaddr or a zero port was supplied.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
	int anonport, error;

	KASSERT(nam == NULL || nam->sa_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, nam));
	KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, nam));
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	/* Evaluated before the setup call, which may modify nam's port. */
	anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, cred);
	if (error)
		return (error);
	if (in_pcbinshash(inp) != 0) {
		/* Roll back what in_pcbbind_setup() stored in the PCB. */
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		return (EAGAIN);
	}
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
#endif

#if defined(INET) || defined(INET6)
/*
 * Assign a local port like in_pcb_lport(), but also used with connect()
 * and a foreign address and port.  If fsa is non-NULL, choose a local port
 * that is unused with those, otherwise one that is completely unused.
 * lsa can be NULL for IPv6.
*/ int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr, faddr; #endif #ifdef INET6 struct in6_addr *laddr6, *faddr6; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { if (lsa != NULL) laddr = ((struct sockaddr_in *)lsa)->sin_addr; if (fsa != NULL) faddr = ((struct sockaddr_in *)fsa)->sin_addr; } #endif #ifdef INET6 laddr6 = NULL; if ((inp->inp_vflag & INP_IPV6) != 0) { if (lsa != NULL) laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; if (fsa != NULL) faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; } #endif tmpinp = NULL; lport = *lportp; if (V_ipport_randomized) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? 
*/ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); if (fsa != NULL) { #ifdef INET if (lsa->sa_family == AF_INET) { tmpinp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags, NULL, M_NODOM); } #endif #ifdef INET6 if (lsa->sa_family == AF_INET6) { tmpinp = in6_pcblookup_hash_locked(pcbinfo, faddr6, fport, laddr6, lport, lookupflags, NULL, M_NODOM); } #endif } else { #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, lookupflags, cred); #ifdef INET if (tmpinp == NULL && (inp->inp_vflag & INP_IPV4)) tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } } while (tmpinp != NULL); *lportp = lport; return (0); } /* * Select a local port (number) to use. */ int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct sockaddr_in laddr; if (laddrp) { bzero(&laddr, sizeof(laddr)); laddr.sin_family = AF_INET; laddr.sin_addr = *laddrp; } return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : NULL, lportp, NULL, 0, cred, lookupflags)); } /* * Return cached socket options. */ int inp_so_options(const struct inpcb *inp) { int so_options; so_options = 0; if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) so_options |= SO_REUSEPORT_LB; if ((inp->inp_flags2 & INP_REUSEPORT) != 0) so_options |= SO_REUSEPORT; if ((inp->inp_flags2 & INP_REUSEADDR) != 0) so_options |= SO_REUSEADDR; return (so_options); } #endif /* INET || INET6 */ /* * Check if a new BINDMULTI socket is allowed to be created. * * ni points to the new inp. * oi points to the existing inp. * * This checks whether the existing inp also has BINDMULTI and * whether the credentials match. 
*/
/* Returns 1 when the bind may proceed, 0 when it must be refused. */
int
in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
{
	/* Check permissions match */
	if ((ni->inp_flags2 & INP_BINDMULTI) &&
	    (ni->inp_cred->cr_uid !=
	    oi->inp_cred->cr_uid))
		return (0);

	/* Check the existing inp has BINDMULTI set */
	if ((ni->inp_flags2 & INP_BINDMULTI) &&
	    ((oi->inp_flags2 & INP_BINDMULTI) == 0))
		return (0);

	/*
	 * We're okay - either INP_BINDMULTI isn't set on ni, or
	 * it is and it matches the checks.
	 */
	return (1);
}

#ifdef INET
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB. Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 *
 * Error returns visible in this function: EINVAL (address already set while
 * nam is given, or an attempt to change a non-zero port), EADDRNOTAVAIL
 * (address is not local and INP_BINDANY is clear), EACCES (reserved port
 * without privilege), EADDRINUSE (conflicting PCB), plus whatever
 * prison_local_ip4() or in_pcb_lport() return.
 *
 * Note that the sockaddr behind nam may be modified: for a specific
 * non-multicast address sin_port and sin_zero are cleared.
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
    u_short *lportp, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct sockaddr_in *sin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct in_addr laddr;
	u_short lport = 0;
	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error;

	/*
	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
	 * so that we don't have to add to the (already messy) code below.
	 */
	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	laddr.s_addr = *laddrp;
	if (nam != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	/* Without any reuse option, wildcard-bound PCBs conflict as well. */
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (nam == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		sin = (struct sockaddr_in *)nam;
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		/* NB: lport is left as 0 if the port isn't being changed. */
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
			/*
			 * XXX: How to deal with SO_REUSEPORT_LB here?
			 * Treat same as SO_REUSEPORT for now.
			 */
			if ((so->so_options &
			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			sin->sin_port = 0;		/* yech... */
			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
			/*
			 * Is the address a local IP address?
			 * If INP_BINDANY is set, then the socket may be bound
			 * to any endpoint address, local or not.
			 */
			if ((inp->inp_flags & INP_BINDANY) == 0 &&
			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
				return (EADDRNOTAVAIL);
		}
		laddr = sin->sin_addr;
		if (lport) {
			struct inpcb *t;

			/* GROSS */
			if (ntohs(lport) <= V_ipport_reservedhigh &&
			    ntohs(lport) >= V_ipport_reservedlow &&
			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
				return (EACCES);
			/*
			 * First pass: only for callers lacking
			 * PRIV_NETINET_REUSEPORT binding a non-multicast
			 * address; compares owner uids with any PCB found by
			 * a wildcard lookup.
			 */
			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
				    lport, INPLOOKUP_WILDCARD, cred);
	/*
	 * XXX
	 * This entire block sorely needs a rewrite.
	 */
				if (t &&
				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
				    (so->so_type != SOCK_STREAM ||
				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
				     (t->inp_flags2 & INP_REUSEPORT) ||
				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
				    (inp->inp_cred->cr_uid !=
				     t->inp_cred->cr_uid))
					return (EADDRINUSE);

				/*
				 * If the socket is a BINDMULTI socket, then
				 * the credentials need to match and the
				 * original socket also has to have been bound
				 * with BINDMULTI.
				 */
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
			/*
			 * Second pass: applies to every caller and uses the
			 * lookup flags derived from the reuse options above.
			 */
			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
			    lport, lookupflags, cred);
			if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
			    (reuseport & inp_so_options(t)) == 0 &&
			    (reuseport_lb & inp_so_options(t)) == 0) {
				/*
				 * With INET6 the return below is guarded by
				 * the preprocessor-spliced "if"; without
				 * INET6 it is unconditional.
				 */
#ifdef INET6
				if (ntohl(sin->sin_addr.s_addr) !=
				    INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) !=
				    INADDR_ANY ||
				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
				    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
						return (EADDRINUSE);
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
		}
	}
	/* A port already present on entry wins over anything from nam. */
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		/* No port requested: allocate one automatically. */
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);
	}
	/* Outputs are written only on the success path. */
	*laddrp = laddr.s_addr;
	*lportp = lport;
	return (0);
}

/*
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
* If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, bool rehash) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { KASSERT(rehash == true, ("Rehashing required for unbound inps")); inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; if (rehash) { in_pcbrehash(inp); } else { in_pcbinshash(inp); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin, dst; struct nhop_object *nh; int error; NET_EPOCH_ASSERT(); KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); /* * Bypass source address selection and use the primary jail IP * if requested. */ if (!prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; nh = NULL; bzero(&dst, sizeof(dst)); sin = &dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. 
*/ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 0, NHR_NONE, 0); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ if (nh == NULL || nh->nh_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); if (ia == NULL) { ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); } if (ia == NULL) { error = ENETUNREACH; goto done; } if (!prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } ifp = ia->ia_ifp; ia = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (!prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)nh->nh_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. 
*/ sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)nh->nh_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = nh->nh_ifp; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { struct in_ifaddr *ia; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); if (!prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ia = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } } /* 3. 
As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	return (error);
}

/*
 * Set up for a connect from a socket to the specified address.
 * On entry, *laddrp and *lportp should contain the current local
 * address and port for the PCB; these are updated to the values
 * that should be placed in inp_laddr and inp_lport to complete
 * the connect.
 *
 * On success, *faddrp and *fportp will be set to the remote address
 * and port.  These are not updated in the error case.
 *
 * If the operation fails because the connection already exists,
 * *oinpp will be set to the PCB of that connection so that the
 * caller can decide to override it.  In all other cases, *oinpp
 * is set to NULL.
 *
 * Returns 0 on success.  Errors produced directly here are
 * EADDRNOTAVAIL (zero destination port, or no usable address on the
 * selected multicast interface) and EADDRINUSE (the 4-tuple is already
 * in the connection hash); errors from prison_get_ip4(), in_pcbladdr()
 * and in_pcb_lport_dest() are passed through unchanged.
 *
 * None of the four output pointers is written until every step has
 * succeeded; all work is done on local copies.
 */
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
    struct inpcb **oinpp, struct ucred *cred)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
	struct in_ifaddr *ia;
	struct inpcb *oinp;
	struct in_addr laddr, faddr;
	u_short lport, fport;
	int error;

	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	/*
	 * Because a global state change doesn't actually occur here, a read
	 * lock is sufficient.
	 */
	NET_EPOCH_ASSERT();
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	if (oinpp != NULL)
		*oinpp = NULL;
	/* A destination port of zero cannot be connected to. */
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	laddr.s_addr = *laddrp;
	lport = *lportp;
	faddr = sin->sin_addr;
	fport = sin->sin_port;
#ifdef ROUTE_MPATH
	/*
	 * NOTE(review): the flow hash is computed from laddr/faddr as they
	 * are at this point, i.e. before the source address and the
	 * INADDR_ANY/INADDR_BROADCAST destination substitutions below, and
	 * with a literal 0 in the local port position.
	 */
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (faddr.s_addr == INADDR_ANY) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			/*
			 * prison_get_ip4() is handed &faddr, so it may
			 * overwrite the address just chosen (presumably with
			 * the jail's own address -- confirm).
			 */
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
			    IFF_BROADCAST)
				faddr = satosin(&CK_STAILQ_FIRST(
				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		}
	}
	if (laddr.s_addr == INADDR_ANY) {
		/*
		 * The result of in_pcbladdr() is only examined after the
		 * multicast handling below, which may replace it with either
		 * 0 or EADDRNOTAVAIL.
		 */
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, prefer the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				ifp = imo->imo_multicast_ifp;
				/*
				 * First address on that interface which the
				 * credential's jail is allowed to use.
				 */
				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
					if (ia->ia_ifp == ifp &&
					    prison_check_ip4(cred,
					    &ia->ia_addr.sin_addr) == 0)
						break;
				}
				if (ia == NULL)
					error = EADDRNOTAVAIL;
				else {
					laddr = ia->ia_addr.sin_addr;
					error = 0;
				}
			}
		}
		if (error)
			return (error);
	}

	if (lport != 0) {
		/*
		 * The local port is already fixed: the only thing left to
		 * verify is that the full 4-tuple is not in use.
		 */
		oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
		    fport, laddr, lport, 0, NULL, M_NODOM);
		if (oinp != NULL) {
			if (oinpp != NULL)
				*oinpp = oinp;
			return (EADDRINUSE);
		}
	} else {
		/*
		 * No local port yet: let in_pcb_lport_dest() choose one for
		 * this local/foreign address pair.
		 */
		struct sockaddr_in lsin, fsin;

		bzero(&lsin, sizeof(lsin));
		bzero(&fsin, sizeof(fsin));
		lsin.sin_family = AF_INET;
		lsin.sin_addr = laddr;
		fsin.sin_family = AF_INET;
		fsin.sin_addr = faddr;
		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
		    &lport, (struct sockaddr *)& fsin, fport, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
	}
	/* Everything succeeded: publish the results to the caller. */
	*laddrp = laddr.s_addr;
	*lportp = lport;
	*faddrp = faddr.s_addr;
	*fportp = fport;
	return (0);
}

/*
 * Reset the foreign address and port of a PCB and move it to the hash
 * bucket matching the cleared values.  Both the pcb write lock and the
 * hash write lock must be held.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } #endif /* INET */ /* * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, with TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() is deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); #ifdef RATELIMIT if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); #endif inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * inpcb hash lookups are protected by SMR section. * * Once desired pcb has been found, switching from SMR section to a pcb * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK * here because SMR is a critical section. * In 99%+ cases inp_smr_lock() would obtain the lock immediately. */ static inline void inp_lock(struct inpcb *inp, const inp_lookup_t lock) { lock == INPLOOKUP_RLOCKPCB ? rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); } static inline void inp_unlock(struct inpcb *inp, const inp_lookup_t lock) { lock == INPLOOKUP_RLOCKPCB ? rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); } static inline int inp_trylock(struct inpcb *inp, const inp_lookup_t lock) { return (lock == INPLOOKUP_RLOCKPCB ? rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); } static inline bool in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) { return (lock == INPLOOKUP_RLOCKPCB ? 
in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
}

/*
 * Leave the SMR section while trying to take the requested lock on inp.
 *
 * This span is a unified-diff hunk: lines starting with '-' are the old
 * inp_smr_lock() header and flag tests, lines starting with '+' are their
 * replacements plus the new inp_smr_lock() wrapper.
 *
 * Must be called inside the pcbinfo's SMR section with 'lock' being either
 * INPLOOKUP_RLOCKPCB or INPLOOKUP_WLOCKPCB.  Every return path has exited
 * the SMR section.  On true the pcb is locked as requested; on false no
 * lock is held by the caller.  'ignflags' is the set of inp_flags bits
 * that make a pcb unacceptable to the caller.
 *
 * Fast path: a trylock succeeds.  Slow path: a temporary reference is
 * taken so that SMR can be exited and the lock acquired blocking; the
 * reference is dropped again with in_pcbrele(), which reports whether
 * that was the last reference (in which case the pcb is gone).  A
 * refcount that is already zero means the pcb is being destroyed.
 */
-bool
-inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
+static inline bool
+_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	if (__predict_true(inp_trylock(inp, lock))) {
-		if (__predict_false(inp->inp_flags & INP_FREED)) {
+		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * An inp acquired through refcount & lock certainly has not
		 * gone through uma_zfree().  However, it may already have
		 * gone through in_pcbfree() and have another reference that
		 * prevented its release by our in_pcbrele().
		 */
-		if (__predict_false(inp->inp_flags & INP_FREED)) {
+		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}

+bool
+inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
+{
+
+	/*
+	 * in_pcblookup() family of functions ignore not only freed entries,
+	 * that may be found due to lockless access to the hash, but dropped
+	 * entries, too.
+	 */
+	return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
+}
+
/*
 * inp_next() - inpcb hash/list traversal iterator
 *
 * Requires initialized struct inpcb_iterator for context.
 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
 *
 * - Iterator can have either write-lock or read-lock semantics, that can not
 *   be changed later.
 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
 *   a single hash slot.  Note: only rip_input() does the latter.
* - Iterator may have optional bool matching function.  The matching function
 *   will be executed for each inpcb in the SMR context, so it can not acquire
 *   locks and can safely access only immutable fields of inpcb.
 *
 * A fresh initialized iterator has NULL inpcb in its context and that
 * means that inp_next() call would return the very first inpcb on the list
 * locked with desired semantic.  In all following calls the context pointer
 * shall hold the current inpcb pointer.  The KPI user is not supposed to
 * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
 * and write NULL to its context.  After end of traversal an iterator can be
 * reused.
 *
 * List traversals have the following features/constraints:
 * - New entries won't be seen, as they are always added to the head of a list.
 * - Removed entries won't stop traversal as long as they are not added to
 *   a different list. This is violated by in_pcbrehash().
 */
/*
 * The II_LIST_* helpers pick the linkage to walk: the global pcb list
 * (inp_list) when 'hash' is INP_ALL_LIST, otherwise the chain of the hash
 * slot numbered 'hash' (inp_hash).
 */
#define	II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
#define	II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash))
/* Assert that 'inp' is held with the lock flavour the iterator uses. */
#define	II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
/*
 * Advance the iterator.  Returns the next pcb accepted by the optional
 * match function, locked according to ii->lock, or NULL once the list is
 * exhausted.  The pcb returned by the previous call (ii->inp) is unlocked
 * here before the new one is stored in the iterator.
 */
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
-			if (__predict_true(inp_smr_lock(inp, lock)))
+			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * The lock helper has exited the SMR section
				 * on failure, so it is re-entered here and
				 * the scan starts over from the list head.
				 *
				 * NOTE(review): the for-loop increment runs
				 * next and steps past this new head element
				 * without examining it -- confirm that this
				 * is intended.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		/*
		 * With a pcb in hand the SMR section was already left by the
		 * lock helper; only the empty-handed case exits it here.
		 */
		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	/* ii->inp is still locked by us and anchors the traversal. */
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		/* End of list: 'found' stores NULL into the iterator. */
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			/* Ours was the last reference; the pcb is gone. */
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/* Locked and not freed: fall through to 'found', outside SMR. */
	} else
		goto next;

found:
	/* Release the previously returned pcb and advance the iterator. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}

/*
 * in_pcbref() bumps the reference count on an inpcb in order to maintain
 * stability of an inpcb pointer despite the inpcb lock being released or
 * SMR section exited.
* * To free a reference later in_pcbrele_(r|w)locked() must be performed. */ void in_pcbref(struct inpcb *inp) { u_int old __diagused; old = refcount_acquire(&inp->inp_refcount); KASSERT(old > 0, ("%s: refcount 0", __func__)); } /* * Drop a refcount on an inpcb elevated using in_pcbref(), potentially * freeing the pcb, if the reference was very last. */ bool in_pcbrele_rlocked(struct inpcb *inp) { INP_RLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) return (false); MPASS(inp->inp_flags & INP_FREED); MPASS(inp->inp_socket == NULL); MPASS(inp->inp_in_hpts == 0); INP_RUNLOCK(inp); uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); return (true); } bool in_pcbrele_wlocked(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) return (false); MPASS(inp->inp_flags & INP_FREED); MPASS(inp->inp_socket == NULL); MPASS(inp->inp_in_hpts == 0); INP_WUNLOCK(inp); uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); return (true); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked. * Almost all work, including removal from global lists, is done in this * context, where the pcbinfo lock is held. 
*/
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * Set INP_FREED before anything else: the SMR lock helpers test this
	 * flag once they obtain the pcb lock and reject the pcb if it is set.
	 */
	inp->inp_flags |= INP_FREED;
	/* Unlink from the global pcb list under the pcbinfo lock. */
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
	/*
	 * The multicast option pointers are copied into locals because they
	 * are released only after in_pcbrele_wlocked(), which may already
	 * have returned the pcb memory to its zone.
	 */
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	imo = inp->inp_moptions;
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
	} else
		im6o = NULL;
#endif

	/*
	 * in_pcbrele_wlocked() unlocks the pcb itself when it drops the last
	 * reference.  If somebody else still holds a reference the pcb
	 * survives for now and has to be unlocked here.
	 */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
	/* Destruction is finalized in inpcb_dtor(). */
}

/*
 * Last step of pcb destruction (see in_pcbfree()): drop the credential
 * reference held by the pcb.  'mem' is the pcb itself; 'size' and 'arg' are
 * not used.  Presumably installed as the destructor of the pcb zone --
 * confirm at the zone creation site.
 */
static void
inpcb_dtor(void *mem, int size, void *arg)
{
	struct inpcb *inp = mem;

	crfree(inp->inp_cred);
#ifdef INVARIANTS
	/* Cleared only in INVARIANTS kernels. */
	inp->inp_cred = NULL;
#endif
}

/*
 * Different protocols initialize their inpcbs differently - giving
 * different name to the lock.  But they all are disposed the same.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp = mem;

	INP_LOCK_DESTROY(inp);
}

/*
 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
 * port reservation, and preventing it from being returned by inpcb lookups.
 *
 * It is used by TCP to mark an inpcb as unused and avoid future packet
 * delivery or event notification when a socket remains open but TCP has
 * closed.
This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); #ifdef INVARIANTS if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) MPASS(inp->inp_refcount > 1); #endif inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) in_pcbremhash(inp); } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif 
if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } static bool inp_v4_multi_match(const struct inpcb *inp, void *v __unused) { if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) return (true); else return (false); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, inp_v4_multi_match, NULL); struct inpcb *inp; struct in_multi *inm; struct in_mfilter *imf; struct ip_moptions *imo; IN_MULTI_LOCK_ASSERT(); while ((inp = inp_next(&inpi)) != NULL) { INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. * * XXX This can all be deferred to an epoch_call */ restart: IP_MFILTER_FOREACH(imf, &imo->imo_head) { if ((inm = imf->imf_inm) == NULL) continue; if (inm->inm_ifp != ifp) continue; ip_mfilter_remove(&imo->imo_head, imf); in_leavegroup_locked(inm, NULL); ip_mfilter_free(imf); goto restart; } } } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. 
*/
/*
 * Extra penalty given in the best-fit search to a candidate that has the
 * INP_IPV6 flag set (see the comment inside the loop below).
 */
#define INP_LOOKUP_MAPPED_PCB_COST	3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
	/*
	 * Best (lowest) penalty seen so far.  The initial value is above
	 * anything a candidate can score (mapped cost + 1 + 1 with INET6,
	 * 1 + 1 without), so the first acceptable candidate always wins
	 * initially.
	 */
#ifdef INET6
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport) {
				/*
				 * Found?  Only if the pcb's prison is equal
				 * to the caller's as far as IPv4 goes.
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbporthead *porthash;
		struct inpcbport *phd;
		struct inpcb *match = NULL;
		/*
		 * Best fit PCB lookup.
		 *
		 * First see if this local port is in use by looking on the
		 * port hash list.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(phd, porthash, phd_hash) {
			if (phd->phd_port == lport)
				break;
		}
		if (phd != NULL) {
			/*
			 * Port is in use by one or more PCBs. Look for best
			 * fit.
			 */
			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
				/*
				 * 'wildcard' is this candidate's penalty:
				 * lower is better, 0 is an exact match.
				 */
				wildcard = 0;
				if (!prison_equal_ip4(inp->inp_cred->cr_prison,
				    cred->cr_prison))
					continue;
#ifdef INET6
				/* XXX inp locking */
				if ((inp->inp_vflag & INP_IPV4) == 0)
					continue;
				/*
				 * We never select the PCB that has
				 * INP_IPV6 flag and is bound to :: if
				 * we have another PCB which is bound
				 * to 0.0.0.0.  If a PCB has the
				 * INP_IPV6 flag, then we set its cost
				 * higher than IPv4 only PCBs.
				 *
				 * Note that the case only happens
				 * when a socket is bound to ::, under
				 * the condition that the use of the
				 * mapped address is allowed.
				 */
				if ((inp->inp_vflag & INP_IPV6) != 0)
					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
				/* A connected pcb is a worse fit. */
				if (inp->inp_faddr.s_addr != INADDR_ANY)
					wildcard++;
				/*
				 * Local address: two different specific
				 * addresses never match; a specific address
				 * on one side and a wildcard on the other
				 * costs one point.
				 */
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY)
						wildcard++;
					else if (inp->inp_laddr.s_addr != laddr.s_addr)
						continue;
				} else {
					if (laddr.s_addr != INADDR_ANY)
						wildcard++;
				}
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					/* Cannot do better than exact. */
					if (matchwild == 0)
						break;
				}
			}
		}
		/* NULL when the port is unused or no candidate qualified. */
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST

/*
 * True when the caller has no NUMA preference (M_NODOM) or when the group
 * is bound to exactly the requested domain.
 */
static bool
in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain)
{
	return (domain == M_NODOM || domain == grp->il_numa_domain);
}

/*
 * Find the load-balancing group listening on (laddr, lport) and return one
 * of its member pcbs, chosen by hashing (faddr, lport, fport).  Returns
 * NULL when no group matches.  'domain' expresses a NUMA preference; it
 * only breaks ties between otherwise equal groups.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
    uint16_t fport, int lookupflags, int domain)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		/* A jailed group is skipped if its prison rejects laddr. */
		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_numa_match(grp, domain))
					/* This is a perfect match.
*/
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_numa_match(grp, domain)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY &&
		    (lookupflags & INPLOOKUP_WILDCARD) != 0) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_numa_match(grp, domain))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_numa_match(grp, domain)) {
				local_wild = grp;
			}
		}
	}

	/* Pick the best category that produced a candidate. */
	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);
out:
	/*
	 * Spread flows over the group members by hashing the packet's
	 * (faddr, lport, fport).  Assumes il_inpcnt is non-zero -- TODO
	 * confirm that empty groups never stay on the hash.
	 */
	return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
	    grp->il_inpcnt]);
}

/*
 * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
 * that the caller has either locked the hash list, which usually happens
 * for bind(2) operations, or is in SMR section, which happens when sorting
 * out incoming packets.
 *
 * The only bit accepted in 'lookupflags' is INPLOOKUP_WILDCARD.
 * 'numa_domain' is only forwarded to in_pcblookup_lbgroup().
 * NOTE(review): 'ifp' is not referenced anywhere in this function.
 * Returns the matching pcb, neither locked nor referenced, or NULL.
 */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    struct ifnet *ifp, uint8_t numa_domain)
{
	struct inpcbhead *head;
	struct inpcb *inp, *tmpinp;
	u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * First look for an exact match.
	 */
	tmpinp = NULL;
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
		/* XXX inp locking */
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			/*
			 * XXX We should be able to directly return
			 * the inp here, without any checks.
			 * Well unless both bound with SO_REUSEPORT?
			 */
			/*
			 * A jailed pcb wins immediately; the first
			 * non-jailed one is remembered and used only if no
			 * jailed match turns up in this bucket.
			 */
			if (prison_flag(inp->inp_cred, PR_IP4))
				return (inp);
			if (tmpinp == NULL)
				tmpinp = inp;
		}
	}
	if (tmpinp != NULL)
		return (tmpinp);

	/*
	 * Then look for a wildcard match, if requested.
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
		struct inpcb *local_wild_mapped = NULL;
#endif
		struct inpcb *jail_wild = NULL;
		int injail;

		/*
		 * First see if an LB group matches the request before scanning
		 * all sockets on this port.
		 */
		inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
		    fport, lookupflags, numa_domain);
		if (inp != NULL)
			return (inp);

		/*
		 * Order of socket selection - we always prefer jails.
		 *      1. jailed, non-wild.
		 *      2. jailed, wild.
		 *      3. non-jailed, non-wild.
		 *      4. non-jailed, wild.
		 */

		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY ||
			    inp->inp_lport != lport)
				continue;

			injail = prison_flag(inp->inp_cred, PR_IP4);
			if (injail) {
				if (prison_check_ip4_locked(
				    inp->inp_cred->cr_prison, &laddr) != 0)
					continue;
			} else {
				/*
				 * Once a non-jailed exact match is known,
				 * other non-jailed pcbs cannot improve on it;
				 * jailed ones are still examined.
				 */
				if (local_exact != NULL)
					continue;
			}

			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				if (injail)
					return (inp);
				else
					local_exact = inp;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
				/*
				 * The INP_IPV6PROTO test comes before the
				 * injail test, so any wildcard pcb with that
				 * flag is recorded here regardless of jail
				 * status.
				 */
				/* XXX inp locking, NULL check */
				if (inp->inp_vflag & INP_IPV6PROTO)
					local_wild_mapped = inp;
				else
#endif
					if (injail)
						jail_wild = inp;
					else
						local_wild = inp;
			}
		} /* LIST_FOREACH */
		if (jail_wild != NULL)
			return (jail_wild);
		if (local_exact != NULL)
			return (local_exact);
		if (local_wild != NULL)
			return (local_wild);
#ifdef INET6
		if (local_wild_mapped != NULL)
			return (local_wild_mapped);
#endif
	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */

	return (NULL);
}

/*
 * Lookup PCB in hash list, using pcbinfo tables.
This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, uint8_t numa_domain) { struct inpcb *inp; smr_enter(pcbinfo->ipi_smr); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); if (inp != NULL) { if (__predict_false(inp_smr_lock(inp, (lookupflags & INPLOOKUP_LOCKMASK)) == false)) inp = NULL; } else smr_exit(pcbinfo->ipi_smr); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp, M_NODOM)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp, m->m_pkthdr.numa_domain)); } #endif /* INET */ /* * Insert PCB onto various hash lists. 
*/
/*
 * Requires the pcb write lock and the hash write lock, and a pcb that is
 * not on the hash yet.  Returns 0 on success, the error reported by
 * in_pcbinslbgrouphash(), or ENOMEM when no port head could be allocated.
 * On failure the pcb is left on none of the lists.
 */
int
in_pcbinshash(struct inpcb *inp)
{
	struct inpcbhead *pcbhash;
	struct inpcbporthead *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
	    ("in_pcbinshash: INP_INHASHLIST"));

	/* Connection hash bucket, computed per address family. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6)
		pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
	else
#endif
	pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
	    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];

	pcbporthash = &pcbinfo->ipi_porthashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];

	/*
	 * Add entry to load balance group.
	 * Only do this if SO_REUSEPORT_LB is set.
	 */
	if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) {
		int error = in_pcbinslbgrouphash(inp, M_NODOM);
		if (error != 0)
			return (error);
	}

	/*
	 * Go through port list and look for a head for this lport.
	 */
	CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport)
			break;
	}

	/*
	 * If none exists, malloc one and tack it on.
	 */
	if (phd == NULL) {
		phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
		if (phd == NULL) {
			/* Back out the LB group insertion done above. */
			if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
				in_pcbremlbgrouphash(inp);
			return (ENOMEM);
		}
		phd->phd_port = inp->inp_lport;
		CK_LIST_INIT(&phd->phd_pcblist);
		CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}
	inp->inp_phd = phd;
	CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
	inp->inp_flags |= INP_INHASHLIST;

	return (0);
}

/*
 * Take a pcb off the connection hash, the port list and, if applicable,
 * its load balance group.  The caller holds the pcb write lock; the hash
 * write lock is taken and released here.  The port head is freed when this
 * pcb was the last one using the port.
 */
static void
in_pcbremhash(struct inpcb *inp)
{
	struct inpcbport *phd = inp->inp_phd;

	INP_WLOCK_ASSERT(inp);
	MPASS(inp->inp_flags & INP_INHASHLIST);

	INP_HASH_WLOCK(inp->inp_pcbinfo);
	if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
		in_pcbremlbgrouphash(inp);
	CK_LIST_REMOVE(inp, inp_hash);
	CK_LIST_REMOVE(inp, inp_portlist);
	if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
		CK_LIST_REMOVE(phd, phd_hash);
		uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
	}
	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
	inp->inp_flags &= ~INP_INHASHLIST;
}

/*
 * Move PCB to the proper hash bucket when { faddr, fport } have been
 * changed. NOTE: This does not handle the case of the lport changing (the
 * hashed port list would have to be updated as well), so the lport must
 * not change after in_pcbinshash() has been called.
 *
 * XXXGL: a race between this function and SMR-protected hash iterator
 * will lead to iterator traversing a possibly wrong hash list. However,
 * this race should have been here since change from rwlock to epoch.
*/ void in_pcbrehash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; else #endif head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; CK_LIST_REMOVE(inp, inp_hash); CK_LIST_INSERT_HEAD(head, inp, inp_hash); } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in_losing(struct inpcb *inp) { RO_INVALIDATE_CACHE(&inp->inp_route); return; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. 
*/
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* The pcb lock is taken before the socket lock. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}

/*
 * Function versions of the INP_*LOCK/INP_*UNLOCK macros.
 */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}

void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}

void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}

void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}

#ifdef INVARIANT_SUPPORT
/*
 * Note that, despite the generic name, this asserts that the pcb is
 * write-locked.
 */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}

void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
#endif

/*
 * Call func(inp, arg) for every pcb of 'pcbinfo'.  The pcbs are visited
 * with inp_next() using write-lock semantics, so func receives each pcb
 * write-locked and must not unlock it (see inp_next()).
 */
void
inp_apply_all(struct inpcbinfo *pcbinfo,
    void (*func)(struct inpcb *, void *), void *arg)
{
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;

	while ((inp = inp_next(&inpi)) != NULL)
		func(inp, arg);
}

/* Return the socket of a write-locked pcb. */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return (inp->inp_socket);
}

/* Return inp_ppcb of a write-locked pcb, cast to a TCP control block. */
struct tcpcb *
inp_inpcbtotcpcb(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return ((struct tcpcb *)inp->inp_ppcb);
}

/* Accessors for inp_ip_tos; neither of them asserts any lock. */
int
inp_ip_tos_get(const struct inpcb *inp)
{

	return (inp->inp_ip_tos);
}

void
inp_ip_tos_set(struct inpcb *inp, int val)
{

	inp->inp_ip_tos = val;
}

/*
 * Copy out the IPv4 4-tuple of a locked pcb.  Values are copied exactly as
 * stored in the pcb, without any byte order conversion.
 */
void
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
    uint32_t *faddr, uint16_t *fp)
{

	INP_LOCK_ASSERT(inp);
	*laddr = inp->inp_laddr.s_addr;
	*faddr = inp->inp_faddr.s_addr;
	*lp = inp->inp_lport;
	*fp = inp->inp_fport;
}

/* Function version of the sotoinpcb() macro. */
struct inpcb *
so_sotoinpcb(struct socket *so)
{

	return (sotoinpcb(so));
}

/*
 * Create an external-format (``xinpcb'') structure using the information in
 * the kernel-format in_pcb structure pointed to by inp.  This is done to
 * reduce the spew of irrelevant information over this interface, to isolate
 * user code from changes in the kernel structure, and potentially to provide
 * information-hiding if we decide that some of this information should be
 * hidden from users.
*/ void in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) { bzero(xi, sizeof(*xi)); xi->xi_len = sizeof(struct xinpcb); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi->xi_socket); bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); xi->inp_gencnt = inp->inp_gencnt; xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; xi->inp_flow = inp->inp_flow; xi->inp_flowid = inp->inp_flowid; xi->inp_flowtype = inp->inp_flowtype; xi->inp_flags = inp->inp_flags; xi->inp_flags2 = inp->inp_flags2; xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; xi->in6p_cksum = inp->in6p_cksum; xi->in6p_hops = inp->in6p_hops; xi->inp_ip_tos = inp->inp_ip_tos; xi->inp_vflag = inp->inp_vflag; xi->inp_ip_ttl = inp->inp_ip_ttl; xi->inp_ip_p = inp->inp_ip_p; xi->inp_ip_minttl = inp->inp_ip_minttl; } int sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, int (*ctloutput_set)(struct inpcb *, struct sockopt *)) { struct sockopt sopt; struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inp; struct sockopt_parameters *params; struct socket *so; int error; char buf[1024]; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen > sizeof(buf)) return (ENOMEM); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) return (error); if (req->newlen < sizeof(struct sockopt_parameters)) return (EINVAL); params = (struct sockopt_parameters *)buf; sopt.sopt_level = params->sop_level; sopt.sopt_name = params->sop_optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = params->sop_optval; sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); sopt.sopt_td = NULL; #ifdef INET6 if (params->sop_inc.inc_flags & INC_ISIPV6) { if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) params->sop_inc.inc6_laddr.s6_addr16[1] = htons(params->sop_inc.inc6_zoneid & 0xffff); if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) params->sop_inc.inc6_faddr.s6_addr16[1] = 
htons(params->sop_inc.inc6_zoneid & 0xffff); } #endif if (params->sop_inc.inc_lport != htons(0)) { if (params->sop_inc.inc_fport == htons(0)) inpi.hash = INP_PCBHASH_WILD(params->sop_inc.inc_lport, pcbinfo->ipi_hashmask); else #ifdef INET6 if (params->sop_inc.inc_flags & INC_ISIPV6) inpi.hash = INP6_PCBHASH( ¶ms->sop_inc.inc6_faddr, params->sop_inc.inc_lport, params->sop_inc.inc_fport, pcbinfo->ipi_hashmask); else #endif inpi.hash = INP_PCBHASH( ¶ms->sop_inc.inc_faddr, params->sop_inc.inc_lport, params->sop_inc.inc_fport, pcbinfo->ipi_hashmask); } while ((inp = inp_next(&inpi)) != NULL) if (inp->inp_gencnt == params->sop_id) { if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); return (ECONNRESET); } so = inp->inp_socket; KASSERT(so != NULL, ("inp_socket == NULL")); soref(so); error = (*ctloutput_set)(inp, &sopt); sorele(so); break; } if (inp == NULL) error = ESRCH; return (error); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? 
", " : ""); comma = 1; } if (inp_flags & INP_ORIGDSTADDR) { db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? 
", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); 
db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ #ifdef RATELIMIT /* * Modify TX rate limit based on the existing "inp->inp_snd_tag", * if any. */ int in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; struct m_snd_tag *mst; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); if (mst->sw->snd_tag_modify == NULL) { error = EOPNOTSUPP; } else { error = mst->sw->snd_tag_modify(mst, ¶ms); } return (error); } /* * Query existing TX rate limit based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); if (mst->sw->snd_tag_query == NULL) { error = EOPNOTSUPP; } else { error = mst->sw->snd_tag_query(mst, ¶ms); if (error == 0 && p_max_pacing_rate != NULL) *p_max_pacing_rate = params.rate_limit.max_rate; } return (error); } /* * Query existing TX queue level based on the existing * "inp->inp_snd_tag", if any. 
*/ int in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); if (mst->sw->snd_tag_query == NULL) return (EOPNOTSUPP); error = mst->sw->snd_tag_query(mst, ¶ms); if (error == 0 && p_txqueue_level != NULL) *p_txqueue_level = params.rate_limit.queue_level; return (error); } /* * Allocate a new TX rate limit send tag from the network interface * given by the "ifp" argument and save it in "inp->inp_snd_tag": */ int in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) { union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = (max_pacing_rate == -1U) ? IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.hdr.numa_domain = inp->inp_numa_domain, .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; int error; INP_WLOCK_ASSERT(inp); /* * If there is already a send tag, or the INP is being torn * down, allocating a new send tag is not allowed. Else send * tags may leak. 
*/ if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0) return (EINVAL); error = m_snd_tag_alloc(ifp, ¶ms, st); #ifdef INET if (error == 0) { counter_u64_add(rate_limit_set_ok, 1); counter_u64_add(rate_limit_active, 1); } else if (error != EOPNOTSUPP) counter_u64_add(rate_limit_alloc_fail, 1); #endif return (error); } void in_pcbdetach_tag(struct m_snd_tag *mst) { m_snd_tag_rele(mst); #ifdef INET counter_u64_add(rate_limit_active, -1); #endif } /* * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", * if any: */ void in_pcbdetach_txrtlmt(struct inpcb *inp) { struct m_snd_tag *mst; INP_WLOCK_ASSERT(inp); mst = inp->inp_snd_tag; inp->inp_snd_tag = NULL; if (mst == NULL) return; m_snd_tag_rele(mst); #ifdef INET counter_u64_add(rate_limit_active, -1); #endif } int in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) { int error; /* * If the existing send tag is for the wrong interface due to * a route change, first drop the existing tag. Set the * CHANGED flag so that we will keep trying to allocate a new * tag if we fail to allocate one this time. */ if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { in_pcbdetach_txrtlmt(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; } /* * NOTE: When attaching to a network interface a reference is * made to ensure the network interface doesn't go away until * all ratelimit connections are gone. The network interface * pointers compared below represent valid network interfaces, * except when comparing towards NULL. 
*/ if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { error = 0; } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); error = 0; } else if (inp->inp_snd_tag == NULL) { /* * In order to utilize packet pacing with RSS, we need * to wait until there is a valid RSS hash before we * can proceed: */ if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { error = EAGAIN; } else { error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); } } else { error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); } if (error == 0 || error == EOPNOTSUPP) inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; return (error); } /* * This function should be called when the INP_RATE_LIMIT_CHANGED flag * is set in the fast path and will attach/detach/modify the TX rate * limit send tag based on the socket's so_max_pacing_rate value. */ void in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) { struct socket *socket; uint32_t max_pacing_rate; bool did_upgrade; if (inp == NULL) return; socket = inp->inp_socket; if (socket == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* * NOTE: The so_max_pacing_rate value is read unlocked, * because atomic updates are not required since the variable * is checked at every mbuf we send. It is assumed that the * variable read itself will be atomic. */ max_pacing_rate = socket->so_max_pacing_rate; in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); if (did_upgrade) INP_DOWNGRADE(inp); } /* * Track route changes for TX rate limiting. 
*/ void in_pcboutput_eagain(struct inpcb *inp) { bool did_upgrade; if (inp == NULL) return; if (inp->inp_snd_tag == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* detach rate limiting */ in_pcbdetach_txrtlmt(inp); /* make sure new mbuf send tag allocation is made */ inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } #ifdef INET static void rl_init(void *st) { rate_limit_new = counter_u64_alloc(M_WAITOK); rate_limit_chg = counter_u64_alloc(M_WAITOK); rate_limit_active = counter_u64_alloc(M_WAITOK); rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); rate_limit_set_ok = counter_u64_alloc(M_WAITOK); } SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); #endif #endif /* RATELIMIT */ diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 370b947767ff..eeed49681ec6 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -1,4117 +1,4108 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007-2008,2010 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, * James Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. 
* * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #include #include #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include const int tcprexmtthresh = 3; VNET_DEFINE(int, tcp_log_in_vain) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_log_in_vain), 0, "Log all incoming TCP segments to closed ports"); VNET_DEFINE(int, blackhole) = 0; #define V_blackhole VNET(blackhole) SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(blackhole), 0, "Do not send RST on segments to closed ports"); VNET_DEFINE(bool, blackhole_local) = false; #define V_blackhole_local VNET(blackhole_local) SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, blackhole_local, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(blackhole_local), false, "Enforce net.inet.tcp.blackhole for locally originated packets"); VNET_DEFINE(int, tcp_delack_enabled) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_delack_enabled), 0, "Delay ACK to try and piggyback it onto a data packet"); VNET_DEFINE(int, drop_synfin) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_synfin), 0, "Drop TCP 
packets with SYN+FIN set"); VNET_DEFINE(int, tcp_do_prr_conservative) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr_conservative, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_prr_conservative), 0, "Do conservative Proportional Rate Reduction"); VNET_DEFINE(int, tcp_do_prr) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_prr), 1, "Enable Proportional Rate Reduction per RFC 6937"); VNET_DEFINE(int, tcp_do_lrd) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_lrd, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_lrd), 1, "Perform Lost Retransmission Detection"); VNET_DEFINE(int, tcp_do_newcwv) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_newcwv), 0, "Enable New Congestion Window Validation per RFC7661"); VNET_DEFINE(int, tcp_do_rfc3042) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3042), 0, "Enable RFC 3042 (Limited Transmit)"); VNET_DEFINE(int, tcp_do_rfc3390) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); VNET_DEFINE(int, tcp_initcwnd_segments) = 10; SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0, "Slow-start flight size (initial congestion window) in number of segments"); VNET_DEFINE(int, tcp_do_rfc3465) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, "Enable RFC 3465 (Appropriate Byte Counting)"); VNET_DEFINE(int, tcp_abc_l_var) = 2; SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); VNET_DEFINE(int, tcp_insecure_syn) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_syn), 0, "Follow RFC793 
instead of RFC5961 criteria for accepting SYN packets"); VNET_DEFINE(int, tcp_insecure_rst) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_rst), 0, "Follow RFC793 instead of RFC5961 criteria for accepting RST packets"); VNET_DEFINE(int, tcp_recvspace) = 1024*64; #define V_tcp_recvspace VNET(tcp_recvspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size"); VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); VNET_DEFINE(struct inpcbinfo, tcbinfo); /* * TCP statistics are stored in an array of counter(9)s, which size matches * size of struct tcpstat. TCP running connection count is a regular array. */ VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat, tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]); SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD | CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES, "TCP connection counts by TCP state"); /* * Kernel module interface for updating tcpstat. The first argument is an index * into tcpstat treated as an array. */ void kmod_tcpstat_add(int statnum, int val) { counter_u64_add(VNET(tcpstat)[statnum], val); } /* * Make sure that we only start a SACK loss recovery when * receiving a duplicate ACK with a SACK block, and also * complete SACK loss recovery in case the other end * reneges. 
*/
static bool inline
tcp_is_sack_recovery(struct tcpcb *tp, struct tcpopt *to)
{
	/*
	 * True when SACK is permitted on the connection and either the
	 * segment's options carry TOF_SACK or the scoreboard (snd_holes)
	 * is not empty.
	 */
	return ((tp->t_flags & TF_SACK_PERMIT) &&
	    ((to->to_flags & TOF_SACK) ||
	    (!TAILQ_EMPTY(&tp->snd_holes))));
}

#ifdef TCP_HHOOK
/*
 * Wrapper for the TCP established input helper hook.
 *
 * The hook data is only assembled and the hooks only run when at least
 * one hook is registered on HHOOK_TCP_EST_IN.
 */
void
hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
{
	struct tcp_hhook_data hhook_data;

	if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) {
		hhook_data.tp = tp;
		hhook_data.th = th;
		hhook_data.to = to;

		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data,
		    tp->osd);
	}
}
#endif

/*
 * CC wrapper hook functions
 */

/*
 * Account for an incoming ACK and pass it to the congestion control
 * algorithm.
 *
 * Records "nsegs" and the bytes covered by this ACK in tp->ccv, sets or
 * clears CCF_CWND_LIMITED, and for type CC_ACK maintains the
 * t_bytes_acked / CCF_ABC_SENTAWND byte counting state.  The algorithm's
 * ack_received method is called last, if it has one.  The inpcb must be
 * write-locked.
 */
void
cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs,
    uint16_t type)
{
#ifdef STATS
	int32_t gput;
#endif

	INP_WLOCK_ASSERT(tp->t_inpcb);

	tp->ccv->nsegs = nsegs;
	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
	/*
	 * Without newcwv: limited whenever cwnd <= snd_wnd.  With newcwv:
	 * additionally cwnd must be below twice tcp_compute_pipe().
	 */
	if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) ||
	    (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) &&
	     (tp->snd_cwnd < (tcp_compute_pipe(tp) * 2))))
		tp->ccv->flags |= CCF_CWND_LIMITED;
	else
		tp->ccv->flags &= ~CCF_CWND_LIMITED;

	if (type == CC_ACK) {
#ifdef STATS
		stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
		    ((int32_t)tp->snd_cwnd) - tp->snd_wnd);
		if (!IN_RECOVERY(tp->t_flags))
			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN,
			   tp->ccv->bytes_this_ack / (tcp_maxseg(tp) * nsegs));
		if ((tp->t_flags & TF_GPUTINPROG) &&
		    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
			/*
			 * Compute goodput in bits per millisecond.
			 */
			gput = (((int64_t)SEQ_SUB(th->th_ack, tp->gput_seq)) << 3) /
			    max(1, tcp_ts_getticks() - tp->gput_ts);
			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
			    gput);
			/*
			 * XXXLAS: This is a temporary hack, and should be
			 * chained off VOI_TCP_GPUT when stats(9) grows an API
			 * to deal with chained VOIs.
			 */
			if (tp->t_stats_gput_prev > 0)
				stats_voi_update_abs_s32(tp->t_stats,
				    VOI_TCP_GPUT_ND,
				    ((gput - tp->t_stats_gput_prev) * 100) /
				    tp->t_stats_gput_prev);
			tp->t_flags &= ~TF_GPUTINPROG;
			tp->t_stats_gput_prev = gput;
		}
#endif /* STATS */
		/*
		 * Above ssthresh, accumulate acked bytes and flag
		 * CCF_ABC_SENTAWND each time a full cwnd worth has been
		 * acked; at or below ssthresh the accumulator is reset.
		 */
		if (tp->snd_cwnd > tp->snd_ssthresh) {
			tp->t_bytes_acked += tp->ccv->bytes_this_ack;
			if (tp->t_bytes_acked >= tp->snd_cwnd) {
				tp->t_bytes_acked -= tp->snd_cwnd;
				tp->ccv->flags |= CCF_ABC_SENTAWND;
			}
		} else {
			tp->ccv->flags &= ~CCF_ABC_SENTAWND;
			tp->t_bytes_acked = 0;
		}
	}

	if (CC_ALGO(tp)->ack_received != NULL) {
		/* XXXLAS: Find a way to live without this */
		tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->ack_received(tp->ccv, type);
	}
#ifdef STATS
	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
#endif
}

/*
 * Initialize the congestion control state of a connection.
 *
 * Seeds t_srtt/t_rttvar/t_rxtcur and snd_ssthresh from the metrics that
 * tcp_hc_get() returns for the connection, where those metrics are
 * non-zero, picks the initial congestion window and finally calls the
 * algorithm's conn_init method, if it has one.  The inpcb must be
 * write-locked.
 */
void
cc_conn_init(struct tcpcb *tp)
{
	struct hc_metrics_lite metrics;
	struct inpcb *inp = tp->t_inpcb;
	u_int maxseg;
	int rtt;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	tcp_hc_get(&inp->inp_inc, &metrics);
	maxseg = tcp_maxseg(tp);

	/* A cached RTT is only used when no RTT estimate exists yet. */
	if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
		tp->t_srtt = rtt;
		tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
		TCPSTAT_INC(tcps_usedrtt);
		if (metrics.rmx_rttvar) {
			tp->t_rttvar = metrics.rmx_rttvar;
			TCPSTAT_INC(tcps_usedrttvar);
		} else {
			/* default variation is +- 1 rtt */
			tp->t_rttvar =
			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
		}
		TCPT_RANGESET(tp->t_rxtcur,
		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
		    tp->t_rttmin, TCPTV_REXMTMAX);
	}
	if (metrics.rmx_ssthresh) {
		/*
		 * There's some sort of gateway or interface
		 * buffer limit on the path.  Use this to set
		 * the slow start threshold, but set the
		 * threshold to no less than 2*mss.
		 */
		tp->snd_ssthresh = max(2 * maxseg, metrics.rmx_ssthresh);
		TCPSTAT_INC(tcps_usedssthresh);
	}

	/*
	 * Set the initial slow-start flight size.
	 *
	 * If a SYN or SYN/ACK was lost and retransmitted, we have to
	 * reduce the initial CWND to one segment as congestion is likely
	 * requiring us to be cautious.
	 */
	if (tp->snd_cwnd == 1)
		tp->snd_cwnd = maxseg;		/* SYN(-ACK) lost */
	else
		tp->snd_cwnd = tcp_compute_initwnd(maxseg);

	if (CC_ALGO(tp)->conn_init != NULL)
		CC_ALGO(tp)->conn_init(tp->ccv);
}

/*
 * React to a congestion signal of the given "type" (CC_NDUPACK, CC_ECN,
 * CC_RTO or CC_RTO_ERR): adjust the tcpcb state for that signal and then
 * call the algorithm's cong_signal method, if it has one.
 *
 * "th" is checked against NULL before curack is updated, but the CC_ECN
 * case dereferences it unconditionally.  The inpcb must be write-locked.
 */
void inline
cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
{
	INP_WLOCK_ASSERT(tp->t_inpcb);

#ifdef STATS
	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
#endif

	switch(type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(tp->t_flags)) {
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags2 & TF2_ECN_PERMIT)
				tp->t_flags2 |= TF2_ECN_SND_CWR;
		}
		break;
	case CC_ECN:
		if (!IN_CONGRECOVERY(tp->t_flags) ||
		    /*
		     * Allow ECN reaction on ACK to CWR, if
		     * that data segment was also CE marked.
		     */
		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
			EXIT_CONGRECOVERY(tp->t_flags);
			TCPSTAT_INC(tcps_ecn_rcwnd);
			tp->snd_recover = tp->snd_max + 1;
			if (tp->t_flags2 & TF2_ECN_PERMIT)
				tp->t_flags2 |= TF2_ECN_SND_CWR;
		}
		break;
	case CC_RTO:
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		EXIT_RECOVERY(tp->t_flags);
		if (tp->t_flags2 & TF2_ECN_PERMIT)
			tp->t_flags2 |= TF2_ECN_SND_CWR;
		break;
	case CC_RTO_ERR:
		TCPSTAT_INC(tcps_sndrexmitbad);
		/* RTO was unnecessary, so reset everything. */
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		tp->snd_recover = tp->snd_recover_prev;
		if (tp->t_flags & TF_WASFRECOVERY)
			ENTER_FASTRECOVERY(tp->t_flags);
		if (tp->t_flags & TF_WASCRECOVERY)
			ENTER_CONGRECOVERY(tp->t_flags);
		tp->snd_nxt = tp->snd_max;
		tp->t_flags &= ~TF_PREVVALID;
		tp->t_badrxtwin = 0;
		break;
	}

	if (CC_ALGO(tp)->cong_signal != NULL) {
		if (th != NULL)
			tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->cong_signal(tp->ccv, type);
	}
}

/*
 * Call the algorithm's post_recovery method, if it has one, and reset the
 * per-recovery accounting in the tcpcb.  The inpcb must be write-locked.
 */
void inline
cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
	INP_WLOCK_ASSERT(tp->t_inpcb);

	/* XXXLAS: KASSERT that we're in recovery? */

	if (CC_ALGO(tp)->post_recovery != NULL) {
		tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->post_recovery(tp->ccv);
	}
	/* XXXLAS: EXIT_RECOVERY ?
*/
	tp->t_bytes_acked = 0;
	tp->sackhint.delivered_data = 0;
	tp->sackhint.prr_out = 0;
}

/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 * following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window. We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment. We make sure by checking that the
 *	  segment size is not larger than the MSS.
 *	- Delayed acks are enabled (V_tcp_delack_enabled) or the connection
 *	  has TF_NEEDSYN set.
 */
#define DELAY_ACK(tp, tlen)						\
	((!tcp_timer_active(tp, TT_DELACK) &&				\
	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    (tlen <= tp->t_maxseg) &&					\
	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))

/*
 * Feed the ECN state of a received packet to the congestion control
 * algorithm.
 *
 * Does nothing unless the algorithm has an ecnpkt_handler method.
 * Otherwise CCF_IPHDR_CE is set from the ECN bits of "iptos" and
 * CCF_TCPHDR_CWR from TH_CWR in "flags" before the handler is called.  If
 * the handler leaves CCF_ACKNOW set, the delayed ACK timer is armed and
 * TF_ACKNOW is set.  The inpcb must be write-locked.
 */
void inline
cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos)
{
	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (CC_ALGO(tp)->ecnpkt_handler != NULL) {
		switch (iptos & IPTOS_ECN_MASK) {
		case IPTOS_ECN_CE:
			tp->ccv->flags |= CCF_IPHDR_CE;
			break;
		case IPTOS_ECN_ECT0:
			/* FALLTHROUGH */
		case IPTOS_ECN_ECT1:
			/* FALLTHROUGH */
		case IPTOS_ECN_NOTECT:
			tp->ccv->flags &= ~CCF_IPHDR_CE;
			break;
		}

		if (flags & TH_CWR)
			tp->ccv->flags |= CCF_TCPHDR_CWR;
		else
			tp->ccv->flags &= ~CCF_TCPHDR_CWR;

		CC_ALGO(tp)->ecnpkt_handler(tp->ccv);

		if (tp->ccv->flags & CCF_ACKNOW) {
			tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
			tp->t_flags |= TF_ACKNOW;
		}
	}
}

/* Same as cc_ecnpkt_handler_flags(), taking the flags from the TCP header. */
void inline
cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
{
	cc_ecnpkt_handler_flags(tp, tcp_get_flags(th), iptos);
}

/*
 * TCP input handling is split into multiple parts:
 *   tcp6_input is a thin wrapper around tcp_input for the extended
 *	ip6_protox[] call format in ip6_input
 *   tcp_input handles primary segment validation, inpcb lookup and
 *	SYN processing on listen sockets
 *   tcp_do_segment processes the ACK and text of the segment for
 *	establishing, established and closing connections
 */
#ifdef INET6
/*
 * IPv6 entry point: make sure the TCP header is contiguous in the first
 * mbuf, answer segments addressed to an anycast address with an ICMPv6
 * "address unreachable" error, and hand everything else to
 * tcp_input_with_port().  Returns IPPROTO_DONE when the packet was
 * consumed here.
 */
int
tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
	struct mbuf *m;
	struct in6_ifaddr *ia6;
	struct ip6_hdr *ip6;

	m = *mp;
	if (m->m_len < *offp + sizeof(struct tcphdr)) {
		m = m_pullup(m, *offp + sizeof(struct tcphdr));
		if (m == NULL) {
			/* m is NULL here, so the caller's pointer is cleared. */
			*mp = m;
			TCPSTAT_INC(tcps_rcvshort);
			return (IPPROTO_DONE);
		}
	}

	/*
	 * draft-itojun-ipv6-tcp-to-anycast
	 * better place to put this in?
	 */
	ip6 = mtod(m, struct ip6_hdr *);
	ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
	if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) {
		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
			    (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
		*mp = NULL;
		return (IPPROTO_DONE);
	}

	*mp = m;
	return (tcp_input_with_port(mp, offp, proto, port));
}

/* tcp6_input_with_port() with a port argument of 0. */
int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{

	return(tcp6_input_with_port(mp, offp, proto, 0));
}
#endif /* INET6 */

int
tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
	struct mbuf *m = *mp;
	struct tcphdr *th = NULL;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
	u_char *optp = NULL;
	int off0;
	int optlen = 0;
#ifdef INET
	int len;
	uint8_t ipttl;
#endif
	int tlen = 0, off;
	int drop_hdrlen;
	int thflags;
	int rstreason = 0;	/* For badport_bandlim accounting purposes */
	int lookupflag;
	uint8_t iptos;
	struct m_tag *fwd_tag = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6;
#else
	const void *ip6 = NULL;
#endif /* INET6 */
	struct tcpopt to;		/* options in this segment */
	char *s = NULL;			/* address and port logging */
#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif

	NET_EPOCH_ASSERT();

#ifdef INET6
	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ?
1 : 0; #endif off0 = *offp; m = *mp; *mp = NULL; to.to_flags = 0; TCPSTAT_INC(tcps_rcvtotal); #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (port) goto skip6_csum; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } skip6_csum: /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ goto drop; } iptos = IPV6_TRAFFIC_CLASS(ip6); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ntohs(ip->ip_len) - off0; iptos = ip->ip_tos; if (port) goto skip_csum; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { struct ipovly *ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. 
*/ len = off0 + tlen; ipttl = ip->ip_ttl; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); /* Reset length for SDT probes. */ ip->ip_len = htons(len); /* Reset TOS bits */ ip->ip_tos = iptos; /* Re-initialization for later version check */ ip->ip_ttl = ipttl; ip->ip_v = IPVERSION; ip->ip_hl = off0 >> 2; } skip_csum: if (th->th_sum && (port == 0)) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } } #endif /* INET */ /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { TCPSTAT_INC(tcps_rcvbadoff); goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { #ifdef INET6 if (isipv6) { if (m->m_len < off0 + off) { m = m_pullup(m, off0 + off); if (m == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); } } #endif optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = tcp_get_flags(th); /* * Convert TCP protocol specific fields to host format. */ tcp_fields_to_host(th); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. */ drop_hdrlen = off0 + off; /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 
*/ if ( #ifdef INET6 (isipv6 && (m->m_flags & M_IP6_NEXTHOP)) #ifdef INET || (!isipv6 && (m->m_flags & M_IP_NEXTHOP)) #endif #endif #if defined(INET) && !defined(INET6) (m->m_flags & M_IP_NEXTHOP) #endif ) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); /* * For initial SYN packets we don't need write lock on matching * PCB, be it a listening one or a synchronized one. The packet * shall not modify its state. */ lookupflag = INPLOOKUP_WILDCARD | ((thflags & (TH_ACK|TH_SYN)) == TH_SYN ? INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB); findpcb: #ifdef INET6 if (isipv6 && fwd_tag != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, lookupflag, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : th->th_dport, lookupflag, m->m_pkthdr.rcvif); } } else if (isipv6) { inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, lookupflag, m->m_pkthdr.rcvif, m); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET if (fwd_tag != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, lookupflag, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. 
*/ inp = in_pcblookup(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, lookupflag, m->m_pkthdr.rcvif); } } else inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, lookupflag, m->m_pkthdr.rcvif, m); #endif /* INET */ /* * If the INPCB does not exist then all data in the incoming * segment is discarded and an appropriate RST is sent back. * XXX MRT Send RST using which routing table? */ if (inp == NULL) { if (rstreason != 0) { /* We came here after second (safety) lookup. */ MPASS((lookupflag & INPLOOKUP_WILDCARD) == 0); goto dropwithreset; } /* * Log communication attempts to ports that are not * in use. */ if ((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) || V_tcp_log_in_vain == 2) { if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } /* * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ if (((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole == 2) && (V_blackhole_local || ( #ifdef INET6 isipv6 ? !in6_localaddr(&ip6->ip6_src) : #endif #ifdef INET !in_localip(ip->ip_src) #else true #endif ))) goto dropunlock; rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_LOCK_ASSERT(inp); - /* - * While waiting for inp lock during the lookup, another thread - * can have dropped the inpcb, in which case we need to loop back - * and try to find a new inpcb to deliver to. 
- */ - if (inp->inp_flags & INP_DROPPED) { - INP_UNLOCK(inp); - inp = NULL; - goto findpcb; - } + if ((inp->inp_flowtype == M_HASHTYPE_NONE) && (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) && ((inp->inp_socket == NULL) || !SOLISTENING(inp->inp_socket))) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); } #if defined(IPSEC) || defined(IPSEC_SUPPORT) #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6) && IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) { goto dropunlock; } #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) { goto dropunlock; } #endif /* INET */ #endif /* IPSEC */ /* * Check the minimum TTL for socket. */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6) { if (inp->inp_ip_minttl > ip6->ip6_hlim) goto dropunlock; } else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; } tp = intotcpcb(inp); switch (tp->t_state) { case TCPS_TIME_WAIT: /* * A previous connection in TIMEWAIT state is supposed to catch * stray or duplicate segments arriving late. If this segment * was a legitimate new connection attempt, the old INPCB gets * removed and we can try again to find a listening socket. */ tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) ? TO_SYN : 0); /* * tcp_twcheck unlocks the inp always, and frees the m if fails. */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; return (IPPROTO_DONE); case TCPS_CLOSED: /* * The TCPCB may no longer exist if the connection is winding * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. 
*/ rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) { rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_input(tp, m); m = NULL; /* consumed by the TOE driver */ goto dropunlock; } #endif #ifdef MAC if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif so = inp->inp_socket; KASSERT(so != NULL, ("%s: so == NULL", __func__)); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) { ostate = tp->t_state; #ifdef INET6 if (isipv6) { bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6)); } else #endif bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); tcp_savetcp = *th; } #endif /* TCPDEBUG */ /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. */ KASSERT(tp->t_state == TCPS_LISTEN || !SOLISTENING(so), ("%s: so accepting but tp %p not listening", __func__, tp)); if (tp->t_state == TCPS_LISTEN && SOLISTENING(so)) { struct in_conninfo inc; bzero(&inc, sizeof(inc)); #ifdef INET6 if (isipv6) { inc.inc_flags |= INC_ISIPV6; if (inp->inp_inc.inc_flags & INC_IPV6MINMTU) inc.inc_flags |= INC_IPV6MINMTU; inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else #endif { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; inc.inc_fibnum = so->so_fibnum; /* * Check for an existing connection attempt in syncache if * the flag is only ACK. A successful lookup creates a new * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { /* * Parse the TCP options here because * syncookies need access to the reflected * timestamp. */ tcp_dooptions(&to, optp, optlen, 0); /* * NB: syncache_expand() doesn't unlock inp. 
*/ rstreason = syncache_expand(&inc, &to, th, &so, m, port); if (rstreason < 0) { /* * A failing TCP MD5 signature comparison * must result in the segment being dropped * and must not produce any response back * to the sender. */ goto dropunlock; } else if (rstreason == 0) { /* * No syncache entry, or ACK was not for our * SYN/ACK. Do our protection against double * ACK. If peer sent us 2 ACKs, then for the * first one syncache_expand() successfully * converted syncache entry into a socket, * while we were waiting on the inpcb lock. We * don't want to sent RST for the second ACK, * so we perform second lookup without wildcard * match, hoping to find the new socket. If * the ACK is stray indeed, rstreason would * hint the above code that the lookup was a * second attempt. * * NB: syncache did its own logging * of the failure cause. */ INP_WUNLOCK(inp); rstreason = BANDLIM_RST_OPENPORT; lookupflag &= ~INPLOOKUP_WILDCARD; goto findpcb; } tfo_socket_result: if (so == NULL) { /* * We completed the 3-way handshake * but could not allocate a socket * either due to memory shortage, * listen queue length limits or * global socket limits. Send RST * or wait and have the remote end * retransmit the ACK for another * try. */ if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", s, __func__, V_tcp_sc_rst_sock_fail ? "sending RST" : "try again"); if (V_tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else goto dropunlock; } /* * Socket is created in state SYN_RECEIVED. * Unlock the listen socket, lock the newly * created socket and update the tp variable. * If we came here via jump to tfo_socket_result, * then listening socket is read-locked. */ INP_UNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); /* * New connection inpcb is already locked by * syncache_expand(). 
*/ INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); /* * Process the segment and the data it * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ TCP_PROBE5(receive, NULL, tp, m, tp, th); tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); return (IPPROTO_DONE); } /* * Segment flag validation for new connection attempts: * * Our (SYN|ACK) response was rejected. * Check with syncache and remove entry to prevent * retransmits. * * NB: syncache_chkrst does its own logging of failure * causes. */ if (thflags & TH_RST) { syncache_chkrst(&inc, th, m, port); goto dropunlock; } /* * We can't do anything without SYN. */ if ((thflags & TH_SYN) == 0) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * (SYN|ACK) is bogus on a listen socket. */ if (thflags & TH_ACK) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc, port); /* XXX: Not needed! */ TCPSTAT_INC(tcps_badsyn); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } /* * If the drop_synfin option is enabled, drop all * segments with both the SYN and FIN bits set. * This prevents e.g. nmap from identifying the * TCP/IP stack. * XXX: Poor reasoning. nmap has other methods * and is constantly refining its stack detection * strategies. * XXX: This is a violation of the TCP specification * and was used by RFC1644. */ if ((thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * Segment's flags are (SYN) or (SYN|FIN). 
* * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored * as they do not affect the state of the TCP FSM. * The data pointed to by TH_URG and th_urp is ignored. */ KASSERT((thflags & (TH_RST|TH_ACK)) == 0, ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); KASSERT(thflags & (TH_SYN), ("%s: Listen socket: TH_SYN not set", __func__)); INP_RLOCK_ASSERT(inp); #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. 
*/ if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia6 != NULL && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* INET6 */ /* * Basic sanity checks on incoming SYN requests: * Don't respond if the destination is a link layer * broadcast according to RFC1122 4.2.3.10, p. 104. * If it is from this socket it must be forged. * Don't respond if the source or destination is a * global or subnet broad- or multicast address. * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use * in_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from broad- or multicast " "link layer address ignored\n", s, __func__); goto dropunlock; } #ifdef INET6 if (isipv6) { if (th->th_dport == th->th_sport && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to/from self " "ignored\n", s, __func__); goto dropunlock; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to multicast " "address ignored\n", s, __func__); goto dropunlock; } } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (th->th_dport == th->th_sport && ip->ip_dst.s_addr == ip->ip_src.s_addr) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to self " "ignored\n", s, __func__); goto dropunlock; } if 
(IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to broad- " "or multicast address ignored\n", s, __func__); goto dropunlock; } } #endif /* * SYN appears to be valid. Create compressed TCP state * for syncache. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); tcp_dooptions(&to, optp, optlen, TO_SYN); if ((so = syncache_add(&inc, &to, th, inp, so, m, NULL, NULL, iptos, port)) != NULL) goto tfo_socket_result; /* * Entry added to syncache and mbuf consumed. * Only the listen socket is unlocked by syncache_add(). */ return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { /* * When a listen socket is torn down the SO_ACCEPTCONN * flag is removed first while connections are drained * from the accept queue in a unlock/lock cycle of the * ACCEPT_LOCK, opening a race condition allowing a SYN * attempt go through unhandled. */ goto dropunlock; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) { tcp_dooptions(&to, optp, optlen, thflags); if ((to.to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_nosigopt); goto dropunlock; } if (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to.to_signature) != 0) goto dropunlock; } #endif TCP_PROBE5(receive, NULL, tp, m, tp, th); /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. * * XXXGL: in case of a pure SYN arriving on existing connection * TCP stacks won't need to modify the PCB, they would either drop * the segment silently, or send a challenge ACK. 
However, we try * to upgrade the lock, because calling convention for stacks is * write-lock on PCB. If upgrade fails, drop the SYN. */ if ((lookupflag & INPLOOKUP_RLOCKPCB) && INP_TRY_UPGRADE(inp) == 0) goto dropunlock; tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); return (IPPROTO_DONE); dropwithreset: TCP_PROBE5(receive, NULL, tp, m, tp, th); if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_UNLOCK(inp); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); m = NULL; /* mbuf chain got consumed. */ goto drop; dropunlock: if (m != NULL) TCP_PROBE5(receive, NULL, tp, m, tp, th); if (inp != NULL) INP_UNLOCK(inp); drop: if (s != NULL) free(s, M_TCPLOG); if (m != NULL) m_freem(m); return (IPPROTO_DONE); } /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network * conditions at hand (delay bandwidth product). Setting the * buffer size too small limits throughput on links with high * bandwidth and high delay (eg. trans-continental/oceanic links). * * On the receive side the socket buffer memory is only rarely * used to any significant extent. This allows us to be much * more aggressive in scaling the receive socket buffer. For * the case that the buffer space is actually used to a large * extent and we run out of kernel memory we can simply drop * the new segments; TCP on the sender will just retransmit it * later. Setting the buffer size too big may only consume too * much kernel memory if the application doesn't read() from * the socket or packet loss or reordering makes use of the * reassembly queue. * * The criteria to step up the receive buffer one notch are: * 1. Application has not set receive buffer size with * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. * 2. the number of bytes received during 1/2 of an sRTT * is at least 3/8 of the current socket buffer size. * 3. 
receive buffer size has not hit maximal automatic size;
 *
 * If all of the criteria are met we increaset the socket buffer
 * by a 1/2 (bounded by the max).  This allows us to keep ahead
 * of slow-start but also makes it so our peer never gets limited
 * by our rwnd which we then open up causing a burst.
 *
 * This algorithm does two steps per RTT at most and only if
 * we receive a bulk stream w/o packet losses or reorderings.
 * Shrinking the buffer during idle times is not necessary as
 * it doesn't consume any memory when idle.
 *
 * TODO: Only step up if the application is actually serving
 * the buffer to better manage the socket buffer resources.
 */
/*
 * Interface notes for tcp_autorcvbuf():
 *
 *	m, th	only passed on to the receive__autoresize probe.
 *	so	socket whose receive buffer (so_rcv) is examined.
 *	tp	connection; rfbuf_ts and rfbuf_cnt are updated here.
 *	tlen	number of bytes added to tp->rfbuf_cnt while a measurement
 *		interval is still running.
 *
 * Returns the suggested new receive buffer size, or 0 if the buffer
 * should stay as it is.  The buffer itself is not resized here; that is
 * left to the caller.
 *
 * When a measurement interval ends, rfbuf_ts and rfbuf_cnt are reset to
 * 0.  Since a non-zero rfbuf_ts is required to evaluate an interval,
 * rfbuf_ts has to be set again outside this function before the next
 * step can happen.
 */
int
tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int tlen)
{
	int newsize = 0;

	/* Interval is over once more than half a smoothed RTT has passed. */
	if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
	    tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
	    TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
	    ((tp->t_srtt >> TCP_RTT_SHIFT)/2)) {
		/* Threshold is 3/8 of the current high-water mark. */
		if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 2)/ 4 * 3) &&
		    so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
			/* Grow by one half, capped at the configured max. */
			newsize = min((so->so_rcv.sb_hiwat +
			    (so->so_rcv.sb_hiwat/2)), V_tcp_autorcvbuf_max);
		}
		TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);

		/* Start over with next RTT. */
		tp->rfbuf_ts = 0;
		tp->rfbuf_cnt = 0;
	} else {
		tp->rfbuf_cnt += tlen;	/* add up */
	}
	return (newsize);
}

/*
 * Same as tcp_input_with_port() with the port argument fixed to 0.
 */
int
tcp_input(struct mbuf **mp, int *offp, int proto)
{
	return(tcp_input_with_port(mp, offp, proto, 0));
}

/*
 * Deliver a deferred receive-side wakeup: if TF_WAKESOR is set on the
 * connection, clear it and call sorwakeup_locked() on the socket.
 *
 * The caller must hold the inpcb write lock and, when TF_WAKESOR is set,
 * the receive socket buffer lock (both asserted).  Elsewhere in this file
 * sorwakeup_locked() is noted to do an implicit unlock.
 */
static void
tcp_handle_wakeup(struct tcpcb *tp)
{

	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (tp->t_flags & TF_WAKESOR) {
		struct socket *so = tp->t_inpcb->inp_socket;

		tp->t_flags &= ~TF_WAKESOR;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		sorwakeup_locked(so);
	}
}

void
tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos)
{
	uint16_t thflags;
	int acked, ourfinisacked, needoutput = 0, sack_changed;
	int rstreason, todrop, win, incforsyn = 0;
	uint32_t tiwin;
	uint16_t nsegs;
	char *s;
	struct in_conninfo *inc;
	struct mbuf *mfree;
	struct tcpopt to;
	int tfo_syn;
	u_int maxseg;

#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif
	thflags = tcp_get_flags(th);
	inc = &tp->t_inpcb->inp_inc;
	tp->sackhint.last_sack_ack = 0;
	sack_changed = 0;
	nsegs = max(1, m->m_pkthdr.lro_nsegs);

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(tp->t_inpcb);
	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
	    __func__));
	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
	    __func__));

#ifdef TCPPCAP
	/* Save segment, if requested. */
	tcp_pcap_add(th, m, &(tp->t_inpkts));
#endif
	TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
	    tlen, NULL, true);

	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: "
			    "SYN|FIN segment ignored (based on "
			    "sysctl setting)\n", s, __func__);
			free(s, M_TCPLOG);
		}
		goto drop;
	}

	/*
	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
	 * check SEQ.ACK first.
*/ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una) && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) cc_after_idle(tp); tp->t_rcvtime = ticks; if (thflags & TH_FIN) tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); /* * Scale up the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif /* * TCP ECN processing. */ if (tcp_ecn_input_segment(tp, thflags, tlen, tcp_packets_this_ack(tp, th->th_ack), iptos)) cc_cong_signal(tp, th, CC_ECN); /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if ((tp->t_flags & TF_SIGNATURE) != 0 && (to.to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_sigopt); /* XXX: should drop? */ } #endif /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; else if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && tp->t_badrxtwin != 0 && TSTMP_LT(to.to_tsecr, tp->t_badrxtwin)) cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. 
* According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { /* Handle parallel SYN for ECN */ tcp_ecn_input_parallel_syn(tp, thflags, iptos); if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE) && !(tp->t_flags & TF_NOOPT)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } else tp->t_flags &= ~TF_REQ_SCALE; /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if ((to.to_flags & TOF_TS) && (tp->t_flags & TF_REQ_TSTMP) && !(tp->t_flags & TF_NOOPT)) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } else tp->t_flags &= ~TF_REQ_TSTMP; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (!(to.to_flags & TOF_SACKPERM) || (tp->t_flags & TF_NOOPT))) tp->t_flags &= ~TF_SACK_PERMIT; if (IS_FASTOPEN(tp->t_flags)) { if ((to.to_flags & TOF_FASTOPEN) && !(tp->t_flags & TF_NOOPT)) { uint16_t mss; if (to.to_flags & TOF_MSS) mss = to.to_mss; else if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) mss = TCP6_MSS; else mss = TCP_MSS; tcp_fastopen_update_cache(tp, mss, to.to_tfo_len, to.to_tfo_cookie); } else tcp_fastopen_disable_path(tp); } } /* * If timestamps were negotiated during SYN/ACK and a * segment without a timestamp is received, silently drop * the segment, unless it is a RST segment or missing timestamps are * tolerated. * See section 3.2 of RFC 7323. 
*/ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if (((thflags & TH_RST) != 0) || V_tcp_tolerate_missing_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "segment processed normally\n", s, __func__); free(s, M_TCPLOG); } } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "segment silently dropped\n", s, __func__); free(s, M_TCPLOG); } goto drop; } } /* * If timestamps were not negotiated during SYN/ACK and a * segment with a timestamp is received, ignore the * timestamp and process the packet normally. * See section 3.2 of RFC 7323. */ if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment processed normally\n", s, __func__); free(s, M_TCPLOG); } } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. 
*/ if (tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && SEGQ_EMPTY(tp) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)) { /* * This is a pure ack for outstanding data. */ TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery without timestamps. */ if ((to.to_flags & TOF_TS) == 0 && tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && tp->t_badrxtwin != 0 && TSTMP_LT(ticks, tp->t_badrxtwin)) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. 
*/ hhook_run_tcp_est_in(tp, th, &to); #endif TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, nsegs, CC_ACK); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); /* * Clear t_acktime if remote side has ACKd * all data in the socket buffer. * Otherwise, update t_acktime if we received * a sufficiently large ACK. */ if (sbavail(&so->so_snd) == 0) tp->t_acktime = 0; else if (acked > 1) tp->t_acktime = ticks; if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); sowwakeup(so); if (sbavail(&so->so_snd)) (void) tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv)) { int newsize = 0; /* automatic sockbuf scaling */ /* * This is a pure, in-sequence data packet with * nothing on the reassembly queue and we have enough * buffer space to take it. 
*/ /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; if (tlen && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_in == 0)) { tp->t_fbyte_in = ticks; if (tp->t_fbyte_in == 0) tp->t_fbyte_in = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_ADD(tcps_rcvpack, nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. */ if (newsize) if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp, tlen)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. 
*/ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } if (IS_FASTOPEN(tp->t_flags)) { /* * When a TFO connection is in SYN_RECEIVED, the * only valid packets are the initial SYN, a * retransmit/copy of the initial SYN (possibly with * a subset of the original data), a valid ACK, a * FIN, or a RST. */ if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { rstreason = BANDLIM_RST_OPENPORT; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ if ((tcp_timer_active(tp, TT_DELACK) || tcp_timer_active(tp, TT_REXMT))) goto drop; } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) { goto drop; } } break; /* * If the state is SYN_SENT: * if seg contains a RST with valid ACK (SEQ.ACK has already * been verified), then drop the connection. * if seg contains a RST without an ACK, drop the seg. * if seg does not contain SYN, then drop the seg. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable. 
* if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, m, tp, th); tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); tp = tcp_drop(tp, ECONNREFUSED); } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { int tfo_partial_ack = 0; TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); tp->snd_una++; /* SYN is acked */ /* * If not all the data that was sent in the TFO SYN * has been acked, resend the remainder right away. */ if (IS_FASTOPEN(tp->t_flags) && (tp->snd_una != tp->snd_max)) { tp->snd_nxt = th->th_ack; tfo_partial_ack = 1; } /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; tcp_ecn_input_syn_sent(tp, thflags, iptos); /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_acktime = ticks; tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, m, tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. * If it succeeds, connection is * half-synchronized. 
* Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN); tcp_timer_activate(tp, TT_REXMT, 0); tcp_state_change(tp, TCPS_SYN_RECEIVED); } /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. */ if (thflags & TH_RST) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. * - If RST is in window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should * test against last_ack_sent instead of rcv_nxt. * Note 2: we handle special case of closed window, not * covered by the RFC. 
*/ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: /* FALLTHROUGH */ default: tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_RST); tp = tcp_close(tp); } } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } } goto drop; } /* * RFC5961 Section 4.2 * Send challenge ACK for any SYN in synchronized state. */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && tp->t_state != TCPS_SYN_RECEIVED) { TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; } else { tcp_ecn_input_syn_sent(tp, thflags, iptos); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. 
If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. 
*/ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } /* * DSACK - add SACK block for dropped range */ if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) { tcp_update_sack_list(tp, th->th_seq, th->th_seq + todrop); /* * ACK now, as the next in-sequence segment * will clear the DSACK block again */ tp->t_flags |= TF_ACKNOW; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((tp->t_flags & TF_CLOSED) && tlen) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " "after socket was closed, " "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); /* tcp_close will kill the inp pre-log the Reset */ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. 
*/ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else goto dropafterack; } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) { if (tp->t_state == TCPS_SYN_RECEIVED && IS_FASTOPEN(tp->t_flags)) { tp->snd_wnd = tiwin; cc_conn_init(tp); } goto step6; } else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. 
*/ case TCPS_SYN_RECEIVED: TCPSTAT_INC(tcps_connects); if (tp->t_flags & TF_SONOTCONN) { /* * Usually SYN_RECEIVED had been created from a LISTEN, * and solisten_enqueue() has already marked the socket * layer as connected. If it didn't, which can happen * only with an accept_filter(9), then the tp is marked * with TF_SONOTCONN. The other reason for this mark * to be set is a simultaneous open, a SYN_RECEIVED * that had been created from SYN_SENT. */ tp->t_flags &= ~TF_SONOTCONN; soisconnected(so); } /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->snd_wnd = tiwin; /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } if (tp->t_flags & TF_NEEDFIN) { tp->t_acktime = ticks; tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, m, tp, th); /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such * connections is not harmless as it would undo the * snd_cwnd reduction that occurs when a TFO SYN|ACK * is retransmitted. */ if (!IS_FASTOPEN(tp->t_flags)) cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* * Account for the ACK of our SYN prior to * regular ACK processing below, except for * simultaneous SYN, which is handled later. */ if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) incforsyn = 1; /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. 
*/ if (tlen == 0 && (thflags & TH_FIN) == 0) { (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tcp_handle_wakeup(tp); } tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } if (tcp_is_sack_recovery(tp, &to)) { if (((sack_changed = tcp_sack_doack(tp, &to, th->th_ack)) != 0) && (tp->t_flags & TF_LRD)) { tcp_sack_lost_retransmission(tp, th); } } else /* * Reset the value so that previous (valid) value * from the last ack with SACK doesn't get used. */ tp->sackhint.sacked_bytes = 0; #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); #endif if (SEQ_LEQ(th->th_ack, tp->snd_una)) { maxseg = tcp_maxseg(tp); if (tlen == 0 && (tiwin == tp->snd_wnd || (tp->t_flags & TF_SACK_PERMIT))) { /* * If this is the first time we've seen a * FIN from the remote, this is not a * duplicate and it needs to be processed * normally. This happens during a * simultaneous close. */ if ((thflags & TH_FIN) && (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { tp->t_dupacks = 0; break; } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change and FIN isn't set), * the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. 
* * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ /* * Following 2 kinds of acks should not affect * dupack counting: * 1) Old acks * 2) Acks with SACK but without any new SACK * information in them. These could result from * any anomaly in the network like a switch * duplicating packets or a possible DoS attack. */ if (th->th_ack != tp->snd_una || (tcp_is_sack_recovery(tp, &to) && !sack_changed)) break; else if (!tcp_timer_active(tp, TT_REXMT)) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { cc_ack_received(tp, th, nsegs, CC_DUPACK); if (V_tcp_do_prr && IN_FASTRECOVERY(tp->t_flags)) { tcp_do_prr_ack(tp, th, &to); } else if (tcp_is_sack_recovery(tp, &to) && IN_FASTRECOVERY(tp->t_flags)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ if (V_tcp_do_newsack) awnd = tcp_compute_pipe(tp); else awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += maxseg; (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh || (tp->t_flags & TF_SACK_PERMIT && V_tcp_do_newsack && tp->sackhint.sacked_bytes > (tcprexmtthresh - 1) * maxseg)) { enter_recovery: /* * Above is the RFC6675 trigger condition of * more than (dupthresh-1)*maxseg sacked data. 
* If the count of holes in the * scoreboard is >= dupthresh, we could * also enter loss recovery, but don't * have that value readily available. */ tp->t_dupacks = tcprexmtthresh; tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, or prr, check * to see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ if (V_tcp_do_prr || (tp->t_flags & TF_SACK_PERMIT)) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } /* Congestion signal before ack. */ cc_cong_signal(tp, th, CC_NDUPACK); cc_ack_received(tp, th, nsegs, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (V_tcp_do_prr) { /* * snd_ssthresh is already updated by * cc_cong_signal. */ if (tcp_is_sack_recovery(tp, &to)) { tp->sackhint.prr_delivered = tp->sackhint.sacked_bytes; } else { tp->sackhint.prr_delivered = imin(tp->snd_max - tp->snd_una, imin(INT_MAX / 65536, tp->t_dupacks) * maxseg); } tp->sackhint.recover_fs = max(1, tp->snd_nxt - tp->snd_una); } if (tcp_is_sack_recovery(tp, &to)) { TCPSTAT_INC( tcps_sack_recovery_episode); tp->snd_recover = tp->snd_nxt; tp->snd_cwnd = maxseg; (void) tcp_output(tp); if (SEQ_GT(th->th_ack, tp->snd_una)) goto resume_partialack; goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = maxseg; (void) tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { /* * Process first and second duplicate * ACKs. Each indicates a segment * leaving the network, creating room * for more. Make sure we can send a * packet on reception of each duplicate * ACK by increasing snd_cwnd by one * segment. Restore the original * snd_cwnd after packet transmission. 
*/ cc_ack_received(tp, th, nsegs, CC_DUPACK); uint32_t oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; int avail; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * maxseg; /* * Only call tcp_output when there * is new data available to be sent * or we need to send an ACK. */ SOCKBUF_LOCK(&so->so_snd); avail = sbavail(&so->so_snd) - (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0 || tp->t_flags & TF_ACKNOW) (void) tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } break; } else { /* * This ack is advancing the left edge, reset the * counter. */ tp->t_dupacks = 0; /* * If this ack also has new SACK info, increment the * counter as per rfc6675. The variable * sack_changed tracks all changes to the SACK * scoreboard, including when partial ACKs without * SACK options are received, and clear the scoreboard * from the left side. Such partial ACKs should not be * counted as dupacks here. */ if (tcp_is_sack_recovery(tp, &to) && sack_changed) { tp->t_dupacks++; /* limit overhead by setting maxseg last */ if (!IN_FASTRECOVERY(tp->t_flags) && (tp->sackhint.sacked_bytes > ((tcprexmtthresh - 1) * (maxseg = tcp_maxseg(tp))))) { goto enter_recovery; } } } resume_partialack: KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. 
*/ if (IN_FASTRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) if (V_tcp_do_prr && to.to_flags & TOF_SACK) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tcp_do_prr_ack(tp, th, &to); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); } else tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else cc_post_recovery(tp, th); } else if (IN_CONGRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (V_tcp_do_prr) { tp->sackhint.delivered_data = BYTES_THIS_ACK(tp, th); tp->snd_fack = th->th_ack; tcp_do_prr_ack(tp, th, &to); (void) tcp_output(tp); } } else cc_post_recovery(tp, th); } /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Adjust for the SYN bit in sequence space, * but don't account for it in cwnd calculations. * This is for the SYN_RECEIVED, non-simultaneous * SYN case. SYN_SENT and simultaneous SYN are * treated elsewhere. */ if (incforsyn) tp->snd_una++; acked = BYTES_THIS_ACK(tp, th); KASSERT(acked >= 0, ("%s: acked unexepectedly negative " "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__, tp->snd_una, th->th_ack, tp, m)); TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. 
Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && tp->t_badrxtwin != 0 && to.to_flags & TOF_TS && to.to_tsecr != 0 && TSTMP_LT(to.to_tsecr, tp->t_badrxtwin)) cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } SOCKBUF_LOCK(&so->so_snd); /* * Clear t_acktime if remote side has ACKd all data in the * socket buffer and FIN (if applicable). * Otherwise, update t_acktime if we received a sufficiently * large ACK. */ if ((tp->t_state <= TCPS_CLOSE_WAIT && acked == sbavail(&so->so_snd)) || acked > sbavail(&so->so_snd)) tp->t_acktime = 0; else if (acked > 1) tp->t_acktime = ticks; /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. 
*/ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) { SOCKBUF_UNLOCK(&so->so_snd); goto step6; } /* * Let the congestion control algorithm update congestion * control related information. This typically means increasing * the congestion window. */ cc_ack_received(tp, th, nsegs, CC_ACK); if (acked > sbavail(&so->so_snd)) { if (tp->snd_wnd >= sbavail(&so->so_snd)) tp->snd_wnd -= sbavail(&so->so_snd); else tp->snd_wnd = 0; mfree = sbcut_locked(&so->so_snd, (int)sbavail(&so->so_snd)); ourfinisacked = 1; } else { mfree = sbcut_locked(&so->so_snd, acked); if (tp->snd_wnd >= (uint32_t) acked) tp->snd_wnd -= acked; else tp->snd_wnd = 0; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); m_freem(mfree); /* Detect una wraparound. */ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* XXXLAS: Can this be moved up into cc_post_recovery? */ if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_RECOVERY(tp->t_flags); } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. 
*/ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { tcp_twstart(tp); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; } } step6: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. 
This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (uint32_t)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. 
*/ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && IS_FASTOPEN(tp->t_flags)); if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_rnxt = tp->rcv_nxt; int save_tlen = tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && SEGQ_EMPTY(tp) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { if (DELAY_ACK(tp, tlen) || tfo_syn) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; if (tlen && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_in == 0)) { tp->t_fbyte_in = ticks; if (tp->t_fbyte_in == 0) tp->t_fbyte_in = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } thflags = tcp_get_flags(th) & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m, 0); tp->t_flags |= TF_WAKESOR; } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs * when trimming from the head. 
*/ tcp_seq temp = save_start; thflags = tcp_reass(tp, th, &temp, &tlen, m); tp->t_flags |= TF_ACKNOW; } if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { /* * DSACK actually handled in the fastpath * above. */ tcp_update_sack_list(tp, save_start, save_start + save_tlen); } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { if ((tp->rcv_numsacks >= 1) && (tp->sackblks[0].end == save_start)) { /* * Partial overlap, recorded at todrop * above. */ tcp_update_sack_list(tp, tp->sackblks[0].start, tp->sackblks[0].end); } else { tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } } else if (tlen >= save_tlen) { /* Update of sackblks. */ tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } else if (tlen > 0) { tcp_update_dsack_list(tp, save_start, save_start + tlen); } } tcp_handle_wakeup(tp); #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); else len = so->so_rcv.sb_hiwat; #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* The socket upcall is handled by socantrcvmore. */ socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. 
*/ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: tcp_twstart(tp); return; } } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tcp_output(tp); check_delack: INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. 
*/ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); if (tp != NULL) { INP_WUNLOCK(tp->t_inpcb); } m_freem(m); }

/*
 * Issue RST and make ACK acceptable to originator of segment.
 * The mbuf must still include the original packet header.
 * tp may be NULL.
 *
 * No reset is generated, and the mbuf is simply freed, when:
 *  - the offending segment itself carries RST;
 *  - the mbuf is flagged M_BCAST or M_MCAST;
 *  - the IPv6 source or destination is multicast;
 *  - the IPv4 source or destination is multicast, the source is
 *    INADDR_BROADCAST, or in_broadcast() matches the destination on the
 *    receiving interface;
 *  - badport_bandlim(rstreason) returns a negative value.
 *
 * Otherwise the reply is built by tcp_respond(), which consumes the mbuf
 * chain: a plain RST derived from th_ack when the segment had ACK set, or
 * RST|ACK derived from th_seq + tlen when it did not (tlen is incremented
 * once for SYN and once for FIN before use).
 *
 * When tp is non-NULL the inpcb lock must be held by the caller; it is
 * only asserted here, never released.
 */
void
tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
    int tlen, int rstreason)
{
#ifdef INET
	struct ip *ip;
#endif
#ifdef INET6
	struct ip6_hdr *ip6;
#endif

	if (tp != NULL) {
		INP_LOCK_ASSERT(tp->t_inpcb);
	}

	/* Don't bother if destination was broadcast/multicast. */
	if ((tcp_get_flags(th) & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
		goto drop;
#ifdef INET6
	/* The IP version nibble selects the header layout to inspect. */
	if (mtod(m, struct ip *)->ip_v == 6) {
		ip6 = mtod(m, struct ip6_hdr *);
		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
			goto drop;
		/* IPv6 anycast check is done at tcp6_input() */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		ip = mtod(m, struct ip *);
		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
		    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
		    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
		    in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
			goto drop;
	}
#endif

	/* Perform bandwidth limiting. */
	if (badport_bandlim(rstreason) < 0)
		goto drop;

	/* tcp_respond consumes the mbuf chain. */
	if (tcp_get_flags(th) & TH_ACK) {
		tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
		    th->th_ack, TH_RST);
	} else {
		/* SYN and FIN each occupy one unit of sequence space. */
		if (tcp_get_flags(th) & TH_SYN)
			tlen++;
		if (tcp_get_flags(th) & TH_FIN)
			tlen++;
		tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
		    (tcp_seq)0, TH_RST|TH_ACK);
	}
	return;
drop:
	m_freem(m);
}

/*
 * Parse TCP options and place in tcpopt.
*/
/*
 * tcp_dooptions() walks cnt bytes of options starting at cp and records
 * what it recognises in *to; to->to_flags is cleared first.
 *
 *  - TCPOPT_EOL ends parsing; TCPOPT_NOP is a single byte.
 *  - Any other option needs a length byte that is >= 2 and <= the bytes
 *    remaining, otherwise parsing stops.
 *  - An option with an unexpected length, or one not permitted for this
 *    segment type, is skipped: "continue" inside the switch resumes the
 *    for loop, which still advances by optlen.
 *  - MSS, window scale, SACK-permitted and Fast Open are honoured only
 *    when TO_SYN is set in flags; SACK blocks only when it is not.
 *  - to_signature, to_sacks and to_tfo_cookie point into the caller's
 *    option buffer (cp + 2); nothing is copied for them.
 */
void
tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
{
	int opt, optlen;

	to->to_flags = 0;
	for (; cnt > 0; cnt -= optlen, cp += optlen) {
		opt = cp[0];
		if (opt == TCPOPT_EOL)
			break;
		if (opt == TCPOPT_NOP)
			optlen = 1;
		else {
			if (cnt < 2)
				break;
			optlen = cp[1];
			if (optlen < 2 || optlen > cnt)
				break;
		}
		switch (opt) {
		case TCPOPT_MAXSEG:
			if (optlen != TCPOLEN_MAXSEG)
				continue;
			if (!(flags & TO_SYN))
				continue;
			to->to_flags |= TOF_MSS;
			/* bcopy: the option payload may be unaligned. */
			bcopy((char *)cp + 2,
			    (char *)&to->to_mss, sizeof(to->to_mss));
			to->to_mss = ntohs(to->to_mss);
			break;
		case TCPOPT_WINDOW:
			if (optlen != TCPOLEN_WINDOW)
				continue;
			if (!(flags & TO_SYN))
				continue;
			to->to_flags |= TOF_SCALE;
			/* Clamp the advertised shift to the supported maximum. */
			to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
			break;
		case TCPOPT_TIMESTAMP:
			if (optlen != TCPOLEN_TIMESTAMP)
				continue;
			to->to_flags |= TOF_TS;
			bcopy((char *)cp + 2,
			    (char *)&to->to_tsval, sizeof(to->to_tsval));
			to->to_tsval = ntohl(to->to_tsval);
			bcopy((char *)cp + 6,
			    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
			to->to_tsecr = ntohl(to->to_tsecr);
			break;
		case TCPOPT_SIGNATURE:
			/*
			 * In order to reply to a host which has set the
			 * TCP_SIGNATURE option in its initial SYN, we have
			 * to record the fact that the option was observed
			 * here for the syncache code to perform the correct
			 * response.
			 */
			if (optlen != TCPOLEN_SIGNATURE)
				continue;
			to->to_flags |= TOF_SIGNATURE;
			to->to_signature = cp + 2;
			break;
		case TCPOPT_SACK_PERMITTED:
			if (optlen != TCPOLEN_SACK_PERMITTED)
				continue;
			if (!(flags & TO_SYN))
				continue;
			if (!V_tcp_do_sack)
				continue;
			to->to_flags |= TOF_SACKPERM;
			break;
		case TCPOPT_SACK:
			/* Payload must be a non-empty whole number of blocks. */
			if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
				continue;
			if (flags & TO_SYN)
				continue;
			to->to_flags |= TOF_SACK;
			to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
			to->to_sacks = cp + 2;
			TCPSTAT_INC(tcps_sack_rcv_blocks);
			break;
		case TCPOPT_FAST_OPEN:
			/*
			 * Cookie length validation is performed by the
			 * server side cookie checking code or the client
			 * side cookie cache update code.
			 */
			if (!(flags & TO_SYN))
				continue;
			if (!V_tcp_fastopen_client_enable &&
			    !V_tcp_fastopen_server_enable)
				continue;
			to->to_flags |= TOF_FASTOPEN;
			to->to_tfo_len = optlen - 2;
			/* An empty cookie is reported as a NULL pointer. */
			to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL;
			break;
		default:
			continue;
		}
	}
}

/*
 * Pull out of band byte out of a segment so
 * it doesn't appear in the user's data queue.
 * It is still reflected in the segment length for
 * sequencing purposes.
 *
 * The byte sits at offset off + th_urp - 1 from the start of the chain
 * headed by m.  It is saved in tp->t_iobc, TCPOOB_HAVEDATA is set, the
 * rest of that mbuf is shifted down over it and the mbuf length (plus
 * the packet-header length when M_PKTHDR is set on that mbuf) is
 * reduced by one.  Running off the end of the chain without finding
 * the byte panics.
 */
void
tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
    int off)
{
	int cnt = off + th->th_urp - 1;

	while (cnt >= 0) {
		if (m->m_len > cnt) {
			/* The urgent byte lives in this mbuf. */
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			INP_WLOCK_ASSERT(tp->t_inpcb);

			tp->t_iobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			/* Close the one-byte gap left by the removed byte. */
			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
			m->m_len--;
			if (m->m_flags & M_PKTHDR)
				m->m_pkthdr.len--;
			return;
		}
		/* Not here: skip this mbuf and keep looking. */
		cnt -= m->m_len;
		m = m->m_next;
		if (m == NULL)
			break;
	}
	panic("tcp_pulloutofband");
}

/*
 * Collect new round-trip time estimate
 * and update averages and current timeout.
 *
 * rtt is the new sample; the "rtt + 2" lower bound passed to
 * TCPT_RANGESET() and the "2 ticks" remark below suggest it is measured
 * in ticks -- TODO confirm against callers.  Besides t_srtt, t_rttvar
 * and t_rttbest, the call resets t_rtttime, t_rxtshift and t_softerror
 * and recomputes t_rxtcur.
 */
void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
	int delta;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	TCPSTAT_INC(tcps_rttupdated);
	tp->t_rttupdated++;
#ifdef STATS
	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT,
	    imax(0, rtt * 1000 / hz));
#endif
	if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) {
		/*
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 8).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).  Adjust rtt to origin 0.
		 *
		 * NOTE(review): 5 fractional bits would be a scale of 32,
		 * not 8; check the sentence above against the definition
		 * of TCP_RTT_SHIFT.
		 */
		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
			- (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));

		/* Never let the smoothed value reach zero or go negative. */
		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1;

		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 4 bits after the
		 * binary point (scaled by 16).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
		if (delta < 0)
			delta = -delta;
		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1;
		/* Track the smallest srtt + rttvar seen so far. */
		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = rtt << TCP_RTT_SHIFT;
		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
	}
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;

	/*
	 * the retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
		      max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}

/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing interface
 * without forcing IP to fragment.
If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window to be a single segment if the destination isn't local.
 * While looking at the routing entry, we also initialize other path-dependent
 * parameters from pre-set or cached values in the routing entry.
 *
 * NOTE that resulting t_maxseg doesn't include space for TCP options or
 * IP options, e.g. IPSEC data, since length of this data may vary, and
 * thus it is calculated for every segment separately in tcp_output().
 *
 * NOTE that this routine is only called when we process an incoming
 * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS
 * settings are handled in tcp_mssopt().
 *
 * Arguments:
 *  offer     - peer's MSS: 0 means the SYN carried no MSS option, -1 means
 *              no SYN has been seen yet; other values are raised to at
 *              least V_tcp_minmss.
 *  mtuoffer  - -1, or an MTU from which the offer is derived by
 *              subtracting the protocol header size; offer must then be -1
 *              (asserted).
 *  metricptr - optional; receives a copy of the host cache metrics, or is
 *              zeroed when the function returns early for lack of a route.
 *  cap       - handed through to tcp_maxmtu()/tcp_maxmtu6().
 *
 * NOTE(review): nothing in this function touches the congestion window,
 * so the sentence above about initializing it looks stale.
 */
void
tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
    struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap)
{
	int mss = 0;
	uint32_t maxmtu = 0;
	struct inpcb *inp = tp->t_inpcb;
	struct hc_metrics_lite metrics;
#ifdef INET6
	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	size_t min_protoh = isipv6 ?
			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
			    sizeof (struct tcpiphdr);
#else
	size_t min_protoh = sizeof(struct tcpiphdr);
#endif

	INP_WLOCK_ASSERT(tp->t_inpcb);

	/* A non-zero t_port adds the UDP tunneling overhead to the headers. */
	if (tp->t_port)
		min_protoh += V_tcp_udp_tunneling_overhead;
	if (mtuoffer != -1) {
		KASSERT(offer == -1, ("%s: conflict", __func__));
		offer = mtuoffer - min_protoh;
	}

	/* Initialize. */
#ifdef INET6
	if (isipv6) {
		maxmtu = tcp_maxmtu6(&inp->inp_inc, cap);
		tp->t_maxseg = V_tcp_v6mssdflt;
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		maxmtu = tcp_maxmtu(&inp->inp_inc, cap);
		tp->t_maxseg = V_tcp_mssdflt;
	}
#endif

	/*
	 * No route to sender, stay with default mss and return.
	 */
	if (maxmtu == 0) {
		/*
		 * In case we return early we need to initialize metrics
		 * to a defined state as tcp_hc_get() would do for us
		 * if there was no cache hit.
		 */
		if (metricptr != NULL)
			bzero(metricptr, sizeof(struct hc_metrics_lite));
		return;
	}

	/* What have we got? */
	switch (offer) {
		case 0:
			/*
			 * Offer == 0 means that there was no MSS on the SYN
			 * segment, in this case we use tcp_mssdflt as
			 * already assigned to t_maxseg above.
			 */
			offer = tp->t_maxseg;
			break;

		case -1:
			/*
			 * Offer == -1 means that we didn't receive SYN yet.
			 */
			/* FALLTHROUGH */

		default:
			/*
			 * Prevent DoS attack with too small MSS. Round up
			 * to at least minmss.
			 */
			offer = max(offer, V_tcp_minmss);
	}

	/*
	 * rmx information is now retrieved from tcp_hostcache.
	 */
	tcp_hc_get(&inp->inp_inc, &metrics);
	if (metricptr != NULL)
		bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));

	/*
	 * If there's a discovered mtu in tcp hostcache, use it.
	 * Else, use the link mtu.
	 */
	if (metrics.rmx_mtu)
		mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
	else {
#ifdef INET6
		if (isipv6) {
			mss = maxmtu - min_protoh;
			if (!V_path_mtu_discovery &&
			    !in6_localaddr(&inp->in6p_faddr))
				mss = min(mss, V_tcp_v6mssdflt);
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			mss = maxmtu - min_protoh;
			if (!V_path_mtu_discovery &&
			    !in_localaddr(inp->inp_faddr))
				mss = min(mss, V_tcp_mssdflt);
		}
#endif
		/*
		 * XXX - The above conditional (mss = maxmtu - min_protoh)
		 * probably violates the TCP spec.
		 * The problem is that, since we don't know the
		 * other end's MSS, we are supposed to use a conservative
		 * default.  But, if we do that, then MTU discovery will
		 * never actually take place, because the conservative
		 * default is much less than the MTUs typically seen
		 * on the Internet today.  For the moment, we'll sweep
		 * this under the carpet.
		 *
		 * The conservative default might not actually be a problem
		 * if the only case this occurs is when sending an initial
		 * SYN with options and data to a host we've never talked
		 * to before.  Then, they will reply with an MSS value which
		 * will get recorded and the new parameters should get
		 * recomputed.  For Further Study.
		 */
	}
	/* Never exceed what the peer offered. */
	mss = min(mss, offer);

	/*
	 * Sanity check: make sure that maxseg will be large
	 * enough to allow some data on segments even if
	 * all the option space is used (40 bytes).  Otherwise
	 * funny things may happen in tcp_output.
	 *
	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
	 */
	mss = max(mss, 64);

	tp->t_maxseg = mss;
}

/*
 * Compute the MSS via tcp_mss_update() and then size the socket buffers
 * around it: the send and receive buffers are grown to a multiple of the
 * mss (capped at sb_max), the mss is reduced when the send buffer is
 * smaller than it, and TSO parameters reported in the tcp_ifcap are copied
 * to the tcpcb when CSUM_TSO is advertised.
 */
void
tcp_mss(struct tcpcb *tp, int offer)
{
	int mss;
	uint32_t bufsize;
	struct inpcb *inp;
	struct socket *so;
	struct hc_metrics_lite metrics;
	struct tcp_ifcap cap;

	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));

	bzero(&cap, sizeof(cap));
	tcp_mss_update(tp, offer, -1, &metrics, &cap);

	mss = tp->t_maxseg;
	inp = tp->t_inpcb;

	/*
	 * If there's a pipesize, change the socket buffer to that size,
	 * don't change if sb_hiwat is different than default (then it
	 * has been changed on purpose with setsockopt).
	 * Make the socket buffers an integral number of mss units;
	 * if the mss is larger than the socket buffer, decrease the mss.
	 */
	so = inp->inp_socket;
	SOCKBUF_LOCK(&so->so_snd);
	if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.rmx_sendpipe)
		bufsize = metrics.rmx_sendpipe;
	else
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss)
		mss = bufsize;
	else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_snd.sb_hiwat)
			(void)sbreserve_locked(so, SO_SND, bufsize, NULL);
	}
	SOCKBUF_UNLOCK(&so->so_snd);
	/*
	 * Sanity check: make sure that maxseg will be large
	 * enough to allow some data on segments even if
	 * all the option space is used (40 bytes).  Otherwise
	 * funny things may happen in tcp_output.
	 *
	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
	 */
	tp->t_maxseg = max(mss, 64);

	/*
	 * NOTE(review): the receive side below rounds to the local "mss",
	 * not to the clamped t_maxseg; if the send buffer size was 0 the
	 * local mss is 0 here and roundup() would divide by zero --
	 * confirm that sb_hiwat can never be 0 at this point.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.rmx_recvpipe)
		bufsize = metrics.rmx_recvpipe;
	else
		bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_rcv.sb_hiwat)
			(void)sbreserve_locked(so, SO_RCV, bufsize, NULL);
	}
	SOCKBUF_UNLOCK(&so->so_rcv);

	/* Check the interface for TSO capabilities. */
	if (cap.ifcap & CSUM_TSO) {
		tp->t_flags |= TF_TSO;
		tp->t_tsomax = cap.tsomax;
		tp->t_tsomaxsegcount = cap.tsomaxsegcount;
		tp->t_tsomaxsegsize = cap.tsomaxsegsize;
	}
}

/*
 * Determine the MSS option to send on an outgoing SYN.
 *
 * Starts from the address family's default MSS.  When both the interface
 * MTU and a host cache MTU are known the smaller one is used; when only
 * one of them is known that one is used; in both cases the minimum
 * protocol header size is subtracted.  With neither known the default is
 * returned unchanged.
 */
int
tcp_mssopt(struct in_conninfo *inc)
{
	int mss = 0;
	uint32_t thcmtu = 0;
	uint32_t maxmtu = 0;
	size_t min_protoh;

	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		mss = V_tcp_v6mssdflt;
		maxmtu = tcp_maxmtu6(inc, NULL);
		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		mss = V_tcp_mssdflt;
		maxmtu = tcp_maxmtu(inc, NULL);
		min_protoh = sizeof(struct tcpiphdr);
	}
#endif
#if defined(INET6) || defined(INET)
	thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
#endif

	if (maxmtu && thcmtu)
		mss = min(maxmtu, thcmtu) - min_protoh;
	else if (maxmtu || thcmtu)
		/* Exactly one is non-zero, so max() selects it. */
		mss = max(maxmtu, thcmtu) - min_protoh;

	return (mss);
}

/*
 * Proportional Rate Reduction: on an ACK received during recovery, work
 * out how many whole segments (snd_cnt) may be sent in response and set
 * tp->snd_cwnd accordingly.  sackhint.prr_delivered is advanced by the
 * data this ACK is taken to have delivered, and sackhint.recover_fs is
 * latched on first use.  snd_cwnd never ends up below one maxseg.
 * The th argument is not referenced.
 */
void
tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
{
	int snd_cnt = 0, limit = 0, del_data = 0, pipe = 0;
	int maxseg = tcp_maxseg(tp);

	INP_WLOCK_ASSERT(tp->t_inpcb);

	/*
	 * Compute the amount of data that this ACK is indicating
	 * (del_data) and an estimate of how many bytes are in the
	 * network.
	 */
	if (tcp_is_sack_recovery(tp, to) ||
	    (IN_CONGRECOVERY(tp->t_flags) &&
	     !IN_FASTRECOVERY(tp->t_flags))) {
		del_data = tp->sackhint.delivered_data;
		if (V_tcp_do_newsack)
			pipe = tcp_compute_pipe(tp);
		else
			pipe = (tp->snd_nxt - tp->snd_fack) +
				tp->sackhint.sack_bytes_rexmit;
	} else {
		/* Credit one segment while below the bound checked here. */
		if (tp->sackhint.prr_delivered < (tcprexmtthresh * maxseg +
					     tp->snd_recover - tp->snd_una))
			del_data = maxseg;
		/* The imin() cap keeps t_dupacks * maxseg within int range. */
		pipe = imax(0, tp->snd_max - tp->snd_una -
			    imin(INT_MAX / 65536, tp->t_dupacks) * maxseg);
	}
	tp->sackhint.prr_delivered += del_data;
	/*
	 * Proportional Rate Reduction
	 */
	if (pipe >= tp->snd_ssthresh) {
		if (tp->sackhint.recover_fs == 0)
			tp->sackhint.recover_fs =
			    imax(1, tp->snd_nxt - tp->snd_una);
		snd_cnt = howmany((long)tp->sackhint.prr_delivered *
			    tp->snd_ssthresh, tp->sackhint.recover_fs) -
			    tp->sackhint.prr_out;
	} else {
		if (V_tcp_do_prr_conservative || (del_data == 0))
			limit = tp->sackhint.prr_delivered -
				tp->sackhint.prr_out;
		else
			limit = imax(tp->sackhint.prr_delivered -
				    tp->sackhint.prr_out, del_data) +
				    maxseg;
		snd_cnt = imin((tp->snd_ssthresh - pipe), limit);
	}
	/* Convert the byte budget to whole segments, never negative. */
	snd_cnt = imax(snd_cnt, 0) / maxseg;
	/*
	 * Send snd_cnt new data into the network in response to this ack.
	 * If there is going to be a SACK retransmission, adjust snd_cwnd
	 * accordingly.
	 */
	if (IN_FASTRECOVERY(tp->t_flags)) {
		if (tcp_is_sack_recovery(tp, to)) {
			tp->snd_cwnd = tp->snd_nxt - tp->snd_recover +
					    tp->sackhint.sack_bytes_rexmit +
					    (snd_cnt * maxseg);
		} else {
			tp->snd_cwnd = (tp->snd_max - tp->snd_una) +
					    (snd_cnt * maxseg);
		}
	} else if (IN_CONGRECOVERY(tp->t_flags))
		tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg);
	tp->snd_cwnd = imax(maxseg, tp->snd_cwnd);
}

/*
 * When a partial ack arrives, force the retransmission of the
 * next unacknowledged segment.  Do not clear tp->t_dupacks.
 * By setting snd_nxt to ti_ack, this forces retransmission timer to
 * be started again.
*/ void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; uint32_t ocwnd = tp->snd_cwnd; u_int maxseg = tcp_maxseg(tp); INP_WLOCK_ASSERT(tp->t_inpcb); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); else tp->snd_cwnd = 0; tp->snd_cwnd += maxseg; } int tcp_compute_pipe(struct tcpcb *tp) { if (tp->t_fb->tfb_compute_pipe == NULL) { return (tp->snd_max - tp->snd_una + tp->sackhint.sack_bytes_rexmit - tp->sackhint.sacked_bytes); } else { return((*tp->t_fb->tfb_compute_pipe)(tp)); } } uint32_t tcp_compute_initwnd(uint32_t maxseg) { /* * Calculate the Initial Window, also used as Restart Window * * RFC5681 Section 3.1 specifies the default conservative values. * RFC3390 specifies slightly more aggressive values. * RFC6928 increases it to ten segments. * Support for user specified value for initial flight size. */ if (V_tcp_initcwnd_segments) return min(V_tcp_initcwnd_segments * maxseg, max(2 * maxseg, V_tcp_initcwnd_segments * 1460)); else if (V_tcp_do_rfc3390) return min(4 * maxseg, max(2 * maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ if (maxseg > 2190) return (2 * maxseg); else if (maxseg > 1095) return (3 * maxseg); else return (4 * maxseg); } } diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index 4f7457d875e7..b0b9a812b3df 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -1,2065 +1,2063 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007, Myricom Inc. 
* Copyright (c) 2008, Intel Corporation. * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2016-2021 Mellanox Technologies. * All rights reserved. * * Portions of this software were developed by Bjoern Zeeb * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures"); #define TCP_LRO_TS_OPTION \ ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP) static void tcp_lro_rx_done(struct lro_ctrl *lc); static int tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_hash); #ifdef TCPHPTS static bool do_bpf_strip_and_compress(struct inpcb *, struct lro_ctrl *, struct lro_entry *, struct mbuf **, struct mbuf **, struct mbuf **, bool *, bool, bool, struct ifnet *, bool); #endif SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP LRO"); static long tcplro_stacks_wanting_mbufq; counter_u64_t tcp_inp_lro_direct_queue; counter_u64_t tcp_inp_lro_wokeup_queue; counter_u64_t tcp_inp_lro_compressed; counter_u64_t tcp_inp_lro_locks_taken; counter_u64_t tcp_extra_mbuf; counter_u64_t tcp_would_have_but; counter_u64_t tcp_comp_total; counter_u64_t tcp_uncomp_total; counter_u64_t tcp_bad_csums; static unsigned tcp_lro_entries = TCP_LRO_ENTRIES; SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries, CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0, "default number of LRO entries"); static uint32_t tcp_lro_cpu_set_thresh = TCP_LRO_CPU_DECLARATION_THRESH; SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_cpu_threshold, CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_cpu_set_thresh, 0, "Number of interrupts in a row on the same CPU that will make us declare an 'affinity' cpu?"); static uint32_t tcp_less_accurate_lro_ts = 0; SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_less_accurate, CTLFLAG_MPSAFE, 
&tcp_less_accurate_lro_ts, 0, "Do we trade off efficency by doing less timestamp operations for time accuracy?"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD, &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD, &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD, &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD, &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, extra_mbuf, CTLFLAG_RD, &tcp_extra_mbuf, "Number of times we had an extra compressed ack dropped into the tp"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, would_have_but, CTLFLAG_RD, &tcp_would_have_but, "Number of times we would have had an extra compressed, but mget failed"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, with_m_ackcmp, CTLFLAG_RD, &tcp_comp_total, "Number of mbufs queued with M_ACKCMP flags set"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, without_m_ackcmp, CTLFLAG_RD, &tcp_uncomp_total, "Number of mbufs queued without M_ACKCMP"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lro_badcsum, CTLFLAG_RD, &tcp_bad_csums, "Number of packets that the common code saw with bad csums");

/*
 * Atomically increment tcplro_stacks_wanting_mbufq.  Going by the counter's
 * name, this registers one more TCP stack that wants mbuf queueing from LRO.
 */
void
tcp_lro_reg_mbufq(void)
{
	atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1);
}

/*
 * Atomically decrement tcplro_stacks_wanting_mbufq; the counterpart of
 * tcp_lro_reg_mbufq().
 */
void
tcp_lro_dereg_mbufq(void)
{
	atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1);
}

/*
 * Link an entry at the head of both the control block's active list and
 * the given hash bucket.
 */
static __inline void
tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
    struct lro_entry *le)
{

	LIST_INSERT_HEAD(&lc->lro_active, le, next);
	LIST_INSERT_HEAD(bucket, le, hash_next);
}

/*
 * Unlink an entry from both lists tcp_lro_active_insert() put it on.
 */
static __inline void
tcp_lro_active_remove(struct lro_entry *le)
{

	LIST_REMOVE(le, next);		/* active list */
	LIST_REMOVE(le, hash_next);	/* hash bucket */
}

int
tcp_lro_init(struct lro_ctrl *lc)
{
	return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0));
}

/*
 * Initialize the LRO control structure "lc" for interface "ifp".
 *
 * Resets the statistics counters, creates the lookup hash table and
 * allocates one zeroed buffer that holds "lro_mbufs" sort slots followed
 * by "lro_entries" LRO entries. All entries are put on the free list.
 *
 * Returns 0 on success. If either allocation fails, "lc" is zeroed and
 * ENOMEM is returned.
 */
int
tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
    unsigned lro_entries, unsigned lro_mbufs)
{
	struct lro_entry *le;
	size_t size;
	unsigned i, elements;

	lc->lro_bad_csum = 0;
	lc->lro_queued = 0;
	lc->lro_flushed = 0;
	lc->lro_mbuf_count = 0;
	lc->lro_mbuf_max = lro_mbufs;
	lc->lro_cnt = lro_entries;
	lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
	lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
	lc->ifp = ifp;
	LIST_INIT(&lc->lro_free);
	LIST_INIT(&lc->lro_active);

	/* create hash table to accelerate entry lookup */
	/* The table is sized for the larger of the two element counts. */
	if (lro_entries > lro_mbufs)
		elements = lro_entries;
	else
		elements = lro_mbufs;
	lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz,
	    HASH_NOWAIT);
	if (lc->lro_hash == NULL) {
		memset(lc, 0, sizeof(*lc));
		return (ENOMEM);
	}

	/* compute size to allocate */
	size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
	    (lro_entries * sizeof(*le));
	lc->lro_mbuf_data = (struct lro_mbuf_sort *)
	    malloc(size, M_LRO, M_NOWAIT | M_ZERO);

	/* check for out of memory */
	if (lc->lro_mbuf_data == NULL) {
		free(lc->lro_hash, M_LRO);
		memset(lc, 0, sizeof(*lc));
		return (ENOMEM);
	}
	/* compute offset for LRO entries */
	/* The entries live directly behind the "lro_mbufs" sort slots. */
	le = (struct lro_entry *)
	    (lc->lro_mbuf_data + lro_mbufs);

	/* setup linked list */
	for (i = 0; i != lro_entries; i++)
		LIST_INSERT_HEAD(&lc->lro_free, le + i, next);

	return (0);
}

/* Layout of the VXLAN header as read by the parser below. */
struct vxlan_header {
	uint32_t	vxlh_flags;
	uint32_t	vxlh_vni;
};

/*
 * Parse one layer of headers starting at "ptr": an optional VXLAN header
 * (when "is_vxlan" is set), the Ethernet header with an optional VLAN
 * tag, an IPv4 or IPv6 header, and finally a UDP or TCP header.
 *
 * "mlen" is the number of bytes available starting at "ptr". When
 * "update_data" is true the parser structure is cleared first and its
 * "data" fields (addresses, ports, VLAN ID, VNI, lro_type) are filled in;
 * otherwise only the header pointers are refreshed and the previously
 * stored lro_type is asserted to match.
 *
 * Returns a pointer to the first byte after the UDP/TCP header and sets
 * parser->total_hdr_len, or returns NULL if the packet cannot be parsed
 * (too short, IPv4 options, IPv4 fragment, unsupported protocol).
 */
static inline void *
tcp_lro_low_level_parser(void *ptr, struct lro_parser *parser,
    bool update_data, bool is_vxlan, int mlen)
{
	const struct ether_vlan_header *eh;
	void *old;
	uint16_t eth_type;

	if (update_data)
		memset(parser, 0, sizeof(*parser));

	/* Remember the start so the total header length can be computed. */
	old = ptr;

	if (is_vxlan) {
		const struct vxlan_header *vxh;
		vxh = ptr;
		ptr = (uint8_t *)ptr + sizeof(*vxh);
		if (update_data) {
			parser->data.vxlan_vni =
			    vxh->vxlh_vni & htonl(0xffffff00);
		}
	}

	eh = ptr;
	if (__predict_false(eh->evl_encap_proto == htons(ETHERTYPE_VLAN))) {
		eth_type = eh->evl_proto;
		if (update_data) {
			/* strip priority and keep VLAN ID only */
			parser->data.vlan_id = eh->evl_tag & htons(EVL_VLID_MASK);
		}
		/* advance to next header */
		ptr = (uint8_t *)ptr + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
		mlen -= (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
	} else {
		eth_type = eh->evl_encap_proto;
		/* advance to next header */
		mlen -= ETHER_HDR_LEN;
		ptr = (uint8_t *)ptr + ETHER_HDR_LEN;
	}
	if (__predict_false(mlen <= 0))
		return (NULL);
	/* "eth_type" is in network byte order, hence the htons() labels. */
	switch (eth_type) {
#ifdef INET
	case htons(ETHERTYPE_IP):
		parser->ip4 = ptr;
		if (__predict_false(mlen < sizeof(struct ip)))
			return (NULL);
		/* Ensure there are no IPv4 options. */
		if ((parser->ip4->ip_hl << 2) != sizeof (*parser->ip4))
			break;
		/* .. and the packet is not fragmented. */
		if (parser->ip4->ip_off & htons(IP_MF|IP_OFFMASK))
			break;
		ptr = (uint8_t *)ptr + (parser->ip4->ip_hl << 2);
		mlen -= sizeof(struct ip);
		if (update_data) {
			parser->data.s_addr.v4 = parser->ip4->ip_src;
			parser->data.d_addr.v4 = parser->ip4->ip_dst;
		}
		switch (parser->ip4->ip_p) {
		case IPPROTO_UDP:
			if (__predict_false(mlen < sizeof(struct udphdr)))
				return (NULL);
			parser->udp = ptr;
			if (update_data) {
				parser->data.lro_type = LRO_TYPE_IPV4_UDP;
				parser->data.s_port = parser->udp->uh_sport;
				parser->data.d_port = parser->udp->uh_dport;
			} else {
				MPASS(parser->data.lro_type == LRO_TYPE_IPV4_UDP);
			}
			ptr = ((uint8_t *)ptr + sizeof(*parser->udp));
			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
			return (ptr);
		case IPPROTO_TCP:
			parser->tcp = ptr;
			if (__predict_false(mlen < sizeof(struct tcphdr)))
				return (NULL);
			if (update_data) {
				parser->data.lro_type = LRO_TYPE_IPV4_TCP;
				parser->data.s_port = parser->tcp->th_sport;
				parser->data.d_port = parser->tcp->th_dport;
			} else {
				MPASS(parser->data.lro_type == LRO_TYPE_IPV4_TCP);
			}
			/* The full TCP header, options included, must fit. */
			if (__predict_false(mlen < (parser->tcp->th_off << 2)))
				return (NULL);
			ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2);
			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
			return (ptr);
		default:
			break;
		}
		break;
#endif
#ifdef INET6
	case htons(ETHERTYPE_IPV6):
		parser->ip6 = ptr;
		if (__predict_false(mlen < sizeof(struct ip6_hdr)))
			return (NULL);
		ptr = (uint8_t *)ptr + sizeof(*parser->ip6);
		if (update_data) {
			parser->data.s_addr.v6 = parser->ip6->ip6_src;
			parser->data.d_addr.v6 = parser->ip6->ip6_dst;
		}
		mlen -= sizeof(struct ip6_hdr);
		/* Only UDP or TCP directly after the fixed header is accepted. */
		switch (parser->ip6->ip6_nxt) {
		case IPPROTO_UDP:
			if (__predict_false(mlen < sizeof(struct udphdr)))
				return (NULL);
			parser->udp = ptr;
			if (update_data) {
				parser->data.lro_type = LRO_TYPE_IPV6_UDP;
				parser->data.s_port = parser->udp->uh_sport;
				parser->data.d_port = parser->udp->uh_dport;
			} else {
				MPASS(parser->data.lro_type == LRO_TYPE_IPV6_UDP);
			}
			ptr = (uint8_t *)ptr + sizeof(*parser->udp);
			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
			return (ptr);
		case IPPROTO_TCP:
			if (__predict_false(mlen < sizeof(struct tcphdr)))
				return (NULL);
			parser->tcp = ptr;
			if (update_data) {
				parser->data.lro_type = LRO_TYPE_IPV6_TCP;
				parser->data.s_port = parser->tcp->th_sport;
				parser->data.d_port = parser->tcp->th_dport;
			} else {
				MPASS(parser->data.lro_type == LRO_TYPE_IPV6_TCP);
			}
			/* The full TCP header, options included, must fit. */
			if (__predict_false(mlen < (parser->tcp->th_off << 2)))
				return (NULL);
			ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2);
			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
			return (ptr);
		default:
			break;
		}
		break;
#endif
	default:
		break;
	}
	/* Invalid packet - cannot parse */
	return (NULL);
}

/*
 * Checksum flags that must all be set on an outer UDP packet before its
 * payload is parsed as VXLAN-encapsulated inner headers.
 */
static const int vxlan_csum = CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID;

/*
 * Parse the headers of mbuf "m". "po" receives the outer headers and
 * "pi" the inner (VXLAN-encapsulated) headers, if any.
 *
 * Returns the parser describing the TCP header to operate on ("pi" for
 * VXLAN-encapsulated TCP, "po" for plain TCP), or NULL when the packet
 * is not supported. All parsed headers must reside in the first mbuf.
 */
static inline struct lro_parser *
tcp_lro_parser(struct mbuf *m, struct lro_parser *po, struct lro_parser *pi,
    bool update_data)
{
	void *data_ptr;

	/* Try to parse outer headers first. */
	data_ptr = tcp_lro_low_level_parser(m->m_data, po, update_data, false,
	    m->m_len);
	if (data_ptr == NULL || po->total_hdr_len > m->m_len)
		return (NULL);

	if (update_data) {
		/* Store VLAN ID, if any.
*/
		/* A tag carried in the mbuf header replaces any parsed VLAN ID. */
		if (__predict_false(m->m_flags & M_VLANTAG)) {
			po->data.vlan_id =
			    htons(m->m_pkthdr.ether_vtag) & htons(EVL_VLID_MASK);
		}
		/* Store decrypted flag, if any. */
		if (__predict_false((m->m_pkthdr.csum_flags &
		    CSUM_TLS_MASK) == CSUM_TLS_DECRYPTED))
			po->data.lro_flags |= LRO_FLAG_DECRYPTED;
	}

	switch (po->data.lro_type) {
	case LRO_TYPE_IPV4_UDP:
	case LRO_TYPE_IPV6_UDP:
		/* Check for VXLAN headers. */
		if ((m->m_pkthdr.csum_flags & vxlan_csum) != vxlan_csum)
			break;

		/* Try to parse inner headers. */
		data_ptr = tcp_lro_low_level_parser(data_ptr, pi, update_data, true,
		    (m->m_len - ((caddr_t)data_ptr - m->m_data)));
		if (data_ptr == NULL || (pi->total_hdr_len + po->total_hdr_len) > m->m_len)
			break;

		/* Verify supported header types. */
		switch (pi->data.lro_type) {
		case LRO_TYPE_IPV4_TCP:
		case LRO_TYPE_IPV6_TCP:
			return (pi);
		default:
			break;
		}
		break;
	case LRO_TYPE_IPV4_TCP:
	case LRO_TYPE_IPV6_TCP:
		/* No encapsulation: leave the inner parser as LRO_TYPE_NONE. */
		if (update_data)
			memset(pi, 0, sizeof(*pi));
		return (po);
	default:
		break;
	}
	return (NULL);
}

/*
 * Trim link-layer padding from the mbuf chain "m".
 *
 * The expected frame length is the offset of the outer IP header from
 * m_data plus the IP datagram length taken from that header. Returns 0
 * if the packet header length already matches or after excess trailing
 * bytes were removed with m_adj(). Returns TCP_LRO_CANNOT if the chain
 * is shorter than the IP header claims or the outer type is not TCP.
 */
static inline int
tcp_lro_trim_mbuf_chain(struct mbuf *m, const struct lro_parser *po)
{
	int len;

	switch (po->data.lro_type) {
#ifdef INET
	case LRO_TYPE_IPV4_TCP:
		len = ((uint8_t *)po->ip4 - (uint8_t *)m->m_data) +
		    ntohs(po->ip4->ip_len);
		break;
#endif
#ifdef INET6
	case LRO_TYPE_IPV6_TCP:
		/* ip6_plen does not include the fixed IPv6 header itself. */
		len = ((uint8_t *)po->ip6 - (uint8_t *)m->m_data) +
		    ntohs(po->ip6->ip6_plen) + sizeof(*po->ip6);
		break;
#endif
	default:
		return (TCP_LRO_CANNOT);
	}

	/*
	 * If the frame is padded beyond the end of the IP packet,
	 * then trim the extra bytes off:
	 */
	if (__predict_true(m->m_pkthdr.len == len)) {
		return (0);
	} else if (m->m_pkthdr.len > len) {
		/* The count passed to m_adj() is negative here. */
		m_adj(m, len - m->m_pkthdr.len);
		return (0);
	}
	return (TCP_LRO_CANNOT);
}

/*
 * Return the TCP header of "m", located lro_tcp_h_off bytes after
 * m_data as recorded in the packet header.
 */
static struct tcphdr *
tcp_lro_get_th(struct mbuf *m)
{
	return ((struct tcphdr *)((uint8_t *)m->m_data + m->m_pkthdr.lro_tcp_h_off));
}

/*
 * Free a list of packets linked through m_nextpkt. Each packet is
 * unlinked before it is handed to m_freem().
 */
static void
lro_free_mbuf_chain(struct mbuf *m)
{
	struct mbuf *save;

	while (m) {
		save = m->m_nextpkt;
		m->m_nextpkt = NULL;
		m_freem(m);
		m = save;
	}
}

void
tcp_lro_free(struct
lro_ctrl *lc)
{
	/*
	 * Release everything owned by "lc": packets held by active
	 * entries, the hash table, any mbufs still queued in the sort
	 * array, and the combined sort-array/entry allocation.
	 */
	struct lro_entry *le;
	unsigned x;

	/* reset LRO free list */
	LIST_INIT(&lc->lro_free);

	/* free active mbufs, if any */
	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
		tcp_lro_active_remove(le);
		lro_free_mbuf_chain(le->m_head);
	}

	/* free hash table */
	free(lc->lro_hash, M_LRO);
	lc->lro_hash = NULL;
	lc->lro_hashsz = 0;

	/* free mbuf array, if any */
	for (x = 0; x != lc->lro_mbuf_count; x++)
		m_freem(lc->lro_mbuf_data[x].mb);
	lc->lro_mbuf_count = 0;

	/* free allocated memory, if any */
	free(lc->lro_mbuf_data, M_LRO);
	lc->lro_mbuf_data = NULL;
}

/*
 * Compute the 16-bit one's complement sum over the TCP header "th",
 * options included (th_off 32-bit words), leaving out the th_sum field.
 * The result is not complemented.
 */
static uint16_t
tcp_lro_rx_csum_tcphdr(const struct tcphdr *th)
{
	const uint16_t *ptr;
	uint32_t csum;
	uint16_t len;

	csum = -th->th_sum;	/* exclude checksum field */
	len = th->th_off;
	ptr = (const uint16_t *)th;
	/* Each iteration adds one 32-bit word as two 16-bit halves. */
	while (len--) {
		csum += *ptr;
		ptr++;
		csum += *ptr;
		ptr++;
	}
	/* Fold the carries back into the low 16 bits. */
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);

	return (csum);
}

/*
 * Derive a checksum covering only the TCP payload from "tcp_csum" by
 * adding the complements of the pseudo header sum and of the TCP header
 * sum, i.e. removing both contributions from it.
 */
static uint16_t
tcp_lro_rx_csum_data(const struct lro_parser *pa, uint16_t tcp_csum)
{
	uint32_t c;
	uint16_t cs;

	c = tcp_csum;

	switch (pa->data.lro_type) {
#ifdef INET6
	case LRO_TYPE_IPV6_TCP:
		/* Compute full pseudo IPv6 header checksum. */
		cs = in6_cksum_pseudo(pa->ip6, ntohs(pa->ip6->ip6_plen), pa->ip6->ip6_nxt, 0);
		break;
#endif
#ifdef INET
	case LRO_TYPE_IPV4_TCP:
		/* Compute full pseudo IPv4 header checksum. */
		cs = in_addword(ntohs(pa->ip4->ip_len) - sizeof(*pa->ip4), IPPROTO_TCP);
		cs = in_pseudo(pa->ip4->ip_src.s_addr, pa->ip4->ip_dst.s_addr, htons(cs));
		break;
#endif
	default:
		cs = 0;		/* Keep compiler happy. */
		break;
	}

	/* Complement checksum. */
	cs = ~cs;
	c += cs;

	/* Remove TCP header checksum. */
	cs = ~tcp_lro_rx_csum_tcphdr(pa->tcp);
	c += cs;

	/* Compute checksum remainder.
*/ while (c > 0xffff) c = (c >> 16) + (c & 0xffff); return (c); } static void tcp_lro_rx_done(struct lro_ctrl *lc) { struct lro_entry *le; while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } } static void tcp_lro_flush_active(struct lro_ctrl *lc) { struct lro_entry *le; /* * Walk through the list of le entries, and * any one that does have packets flush. This * is called because we have an inbound packet * (e.g. SYN) that has to have all others flushed * in front of it. Note we have to do the remove * because tcp_lro_flush() assumes that the entry * is being freed. This is ok it will just get * reallocated again like it was new. */ LIST_FOREACH(le, &lc->lro_active, next) { if (le->m_head != NULL) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } } } void tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) { struct lro_entry *le, *le_tmp; uint64_t now, tov; struct bintime bt; NET_EPOCH_ASSERT(); if (LIST_EMPTY(&lc->lro_active)) return; /* get timeout time and current time in ns */ binuptime(&bt); now = bintime2ns(&bt); tov = ((timeout->tv_sec * 1000000000) + (timeout->tv_usec * 1000)); LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { if (now >= (bintime2ns(&le->alloc_time) + tov)) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } } } #ifdef INET static int tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4) { uint16_t csum; /* Legacy IP has a header checksum that needs to be correct. 
*/
	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
		/* Hardware checked the header: accept its verdict. */
		if (__predict_false((m->m_pkthdr.csum_flags & CSUM_IP_VALID) == 0)) {
			lc->lro_bad_csum++;
			return (TCP_LRO_CANNOT);
		}
	} else {
		/* No hardware verdict: verify the header in software. */
		csum = in_cksum_hdr(ip4);
		if (__predict_false(csum != 0)) {
			lc->lro_bad_csum++;
			return (TCP_LRO_CANNOT);
		}
	}
	return (0);
}
#endif

#ifdef TCPHPTS
/*
 * Emit a TCP_LOG_LRO event for connection "tp" if logging is enabled on
 * it. "frm" identifies the call site and is stored in flex8; the other
 * arguments and selected fields of "le", its head mbuf and "m" are
 * copied into the generic log record. "m" may be NULL.
 */
static void
tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
    const struct lro_entry *le, const struct mbuf *m,
    int frm, int32_t tcp_data_len, uint32_t th_seq,
    uint32_t th_ack, uint16_t th_win)
{
	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv, btv;
		uint32_t cts;

		cts = tcp_get_usecs(&tv);
		memset(&log, 0, sizeof(union tcp_log_stackspecific));
		log.u_bbr.flex8 = frm;
		log.u_bbr.flex1 = tcp_data_len;
		if (m)
			log.u_bbr.flex2 = m->m_pkthdr.len;
		else
			log.u_bbr.flex2 = 0;
		if (le->m_head) {
			log.u_bbr.flex3 = le->m_head->m_pkthdr.lro_nsegs;
			log.u_bbr.flex4 = le->m_head->m_pkthdr.lro_tcp_d_len;
			log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
			log.u_bbr.delRate = le->m_head->m_flags;
			log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
		}
		log.u_bbr.inflight = th_seq;
		log.u_bbr.delivered = th_ack;
		log.u_bbr.timeStamp = cts;
		log.u_bbr.epoch = le->next_seq;
		log.u_bbr.lt_epoch = le->ack_seq;
		log.u_bbr.pacing_gain = th_win;
		log.u_bbr.cwnd_gain = le->window;
		log.u_bbr.lost = curcpu;
		log.u_bbr.cur_del_rate = (uintptr_t)m;
		log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
		bintime2timeval(&lc->lro_last_queue_time, &btv);
		log.u_bbr.flex6 = tcp_tv_to_usectick(&btv);
		log.u_bbr.flex7 = le->compressed;
		/*
		 * NOTE(review): pacing_gain was already set to th_win
		 * above; this assignment overwrites it, so th_win never
		 * reaches the log record. Confirm which field was meant.
		 */
		log.u_bbr.pacing_gain = le->uncompressed;
		if (in_epoch(net_epoch_preempt))
			log.u_bbr.inhpts = 1;
		else
			log.u_bbr.inhpts = 0;
		TCP_LOG_EVENTP(tp, NULL, &tp->t_inpcb->inp_socket->so_rcv,
		    &tp->t_inpcb->inp_socket->so_snd,
		    TCP_LOG_LRO, 0, 0, &log, false, &tv);
	}
}
#endif

/*
 * Store "value" into the 16-bit field "*ptr" and return in "*psum" the
 * folded one's complement difference (0xffff - old + new), i.e. the
 * amount by which a sum covering that field changes.
 */
static inline void
tcp_lro_assign_and_checksum_16(uint16_t *ptr, uint16_t value, uint16_t *psum)
{
	uint32_t csum;

	csum = 0xffff - *ptr + value;
	while (csum > 0xffff)
		csum =
(csum >> 16) + (csum & 0xffff);
	*ptr = value;
	*psum = csum;
}

/*
 * Rewrite the length fields (and, for TCP, the ACK, window and
 * timestamp fields) of the headers described by "pa" for a merged
 * packet carrying "payload_len" bytes after the transport header, and
 * patch the affected IPv4/TCP/UDP checksums incrementally.
 *
 * "delta_sum" is the checksum change contributed by the enclosed data:
 * the merged payload checksum for the innermost header, or the value
 * returned by the call for the inner headers when updating an outer
 * (encapsulating) header. "le" supplies the latest ACK/window/timestamp
 * and is only dereferenced on the TCP path.
 *
 * Returns the folded sum of all changes made, to be passed on as
 * "delta_sum" for the next outer header; 0 for unsupported types.
 *
 * temp[] holds the individual deltas:
 *   temp[0] IP length field, temp[1] IPv4 header checksum field,
 *   TCP: temp[2]/temp[3] header sum before/after, temp[4] th_sum field,
 *   UDP: temp[2] uh_ulen field, temp[3] uh_sum field.
 */
static uint16_t
tcp_lro_update_checksum(const struct lro_parser *pa, const struct lro_entry *le,
    uint16_t payload_len, uint16_t delta_sum)
{
	uint32_t csum;
	uint16_t tlen;
	uint16_t temp[5] = {};

	switch (pa->data.lro_type) {
	case LRO_TYPE_IPV4_TCP:
		/* Compute new IPv4 length. */
		tlen = (pa->ip4->ip_hl << 2) + (pa->tcp->th_off << 2) + payload_len;
		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]);

		/* Subtract delta from current IPv4 checksum. */
		csum = pa->ip4->ip_sum + 0xffff - temp[0];
		while (csum > 0xffff)
			csum = (csum >> 16) + (csum & 0xffff);
		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]);
		goto update_tcp_header;

	case LRO_TYPE_IPV6_TCP:
		/* Compute new IPv6 length. */
		tlen = (pa->tcp->th_off << 2) + payload_len;
		tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]);
		goto update_tcp_header;

	case LRO_TYPE_IPV4_UDP:
		/* Compute new IPv4 length. */
		tlen = (pa->ip4->ip_hl << 2) + sizeof(*pa->udp) + payload_len;
		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]);

		/* Subtract delta from current IPv4 checksum. */
		csum = pa->ip4->ip_sum + 0xffff - temp[0];
		while (csum > 0xffff)
			csum = (csum >> 16) + (csum & 0xffff);
		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]);
		goto update_udp_header;

	case LRO_TYPE_IPV6_UDP:
		/* Compute new IPv6 length. */
		tlen = sizeof(*pa->udp) + payload_len;
		tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]);
		goto update_udp_header;

	default:
		return (0);
	}

update_tcp_header:
	/* Compute current TCP header checksum. */
	temp[2] = tcp_lro_rx_csum_tcphdr(pa->tcp);

	/* Incorporate the latest ACK into the TCP header. */
	pa->tcp->th_ack = le->ack_seq;
	pa->tcp->th_win = le->window;

	/* Incorporate latest timestamp into the TCP header. */
	if (le->timestamp != 0) {
		uint32_t *ts_ptr;

		/* Word 0 is the option preamble, words 1 and 2 the values. */
		ts_ptr = (uint32_t *)(pa->tcp + 1);
		ts_ptr[1] = htonl(le->tsval);
		ts_ptr[2] = le->tsecr;
	}

	/* Compute new TCP header checksum. */
	temp[3] = tcp_lro_rx_csum_tcphdr(pa->tcp);

	/* Compute new TCP checksum. */
	csum = pa->tcp->th_sum + 0xffff - delta_sum +
	    0xffff - temp[0] + 0xffff - temp[3] + temp[2];
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);

	/* Assign new TCP checksum. */
	tcp_lro_assign_and_checksum_16(&pa->tcp->th_sum, csum, &temp[4]);

	/* Compute all modifications affecting next checksum. */
	csum = temp[0] + temp[1] + 0xffff - temp[2] +
	    temp[3] + temp[4] + delta_sum;
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);

	/* Return delta checksum to next stage, if any. */
	return (csum);

update_udp_header:
	tlen = sizeof(*pa->udp) + payload_len;
	/* Assign new UDP length and compute checksum delta. */
	tcp_lro_assign_and_checksum_16(&pa->udp->uh_ulen, htons(tlen), &temp[2]);

	/* Check if there is a UDP checksum. */
	if (__predict_false(pa->udp->uh_sum != 0)) {
		/* Compute new UDP checksum. */
		csum = pa->udp->uh_sum + 0xffff - delta_sum +
		    0xffff - temp[0] + 0xffff - temp[2];
		while (csum > 0xffff)
			csum = (csum >> 16) + (csum & 0xffff);
		/* Assign new UDP checksum. */
		tcp_lro_assign_and_checksum_16(&pa->udp->uh_sum, csum, &temp[3]);
	}

	/* Compute all modifications affecting next checksum. */
	csum = temp[0] + temp[1] + temp[2] + temp[3] + delta_sum;
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);

	/* Return delta checksum to next stage, if any. */
	return (csum);
}

/*
 * Hand the head packet of "le" to the interface input routine. If
 * segments were merged into it, the headers and checksums are brought
 * up to date first and the mbuf is marked as checksum-verified.
 */
static void
tcp_flush_out_entry(struct lro_ctrl *lc, struct lro_entry *le)
{
	/* Check if we need to recompute any checksums.
*/
	if (le->needs_merge) {
		uint16_t csum;

		/*
		 * An inner type other than LRO_TYPE_NONE means an
		 * encapsulated packet: update the inner headers first and
		 * feed the resulting delta into the outer header update.
		 */
		switch (le->inner.data.lro_type) {
		case LRO_TYPE_IPV4_TCP:
			csum = tcp_lro_update_checksum(&le->inner, le,
			    le->m_head->m_pkthdr.lro_tcp_d_len,
			    le->m_head->m_pkthdr.lro_tcp_d_csum);
			csum = tcp_lro_update_checksum(&le->outer, NULL,
			    le->m_head->m_pkthdr.lro_tcp_d_len +
			    le->inner.total_hdr_len, csum);
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
			le->m_head->m_pkthdr.csum_data = 0xffff;
			if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
				le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
			break;
		case LRO_TYPE_IPV6_TCP:
			csum = tcp_lro_update_checksum(&le->inner, le,
			    le->m_head->m_pkthdr.lro_tcp_d_len,
			    le->m_head->m_pkthdr.lro_tcp_d_csum);
			csum = tcp_lro_update_checksum(&le->outer, NULL,
			    le->m_head->m_pkthdr.lro_tcp_d_len +
			    le->inner.total_hdr_len, csum);
			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR;
			le->m_head->m_pkthdr.csum_data = 0xffff;
			if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
				le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
			break;
		case LRO_TYPE_NONE:
			/* Not encapsulated: only the outer headers exist. */
			switch (le->outer.data.lro_type) {
			case LRO_TYPE_IPV4_TCP:
				csum = tcp_lro_update_checksum(&le->outer, le,
				    le->m_head->m_pkthdr.lro_tcp_d_len,
				    le->m_head->m_pkthdr.lro_tcp_d_csum);
				le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
				    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
				le->m_head->m_pkthdr.csum_data = 0xffff;
				if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
					le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
				break;
			case LRO_TYPE_IPV6_TCP:
				csum = tcp_lro_update_checksum(&le->outer, le,
				    le->m_head->m_pkthdr.lro_tcp_d_len,
				    le->m_head->m_pkthdr.lro_tcp_d_csum);
				le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
				    CSUM_PSEUDO_HDR;
				le->m_head->m_pkthdr.csum_data = 0xffff;
				if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
					le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
				break;
			default:
				break;
			}
			break;
		default:
			break;
		}
	}

	/*
	 * Break any chain, this is not set to NULL on the singleton
	 * case m_nextpkt points to m_head. Other case set them
	 * m_nextpkt to NULL in push_and_replace.
	 */
	le->m_head->m_nextpkt = NULL;
	lc->lro_queued += le->m_head->m_pkthdr.lro_nsegs;
	(*lc->ifp->if_input)(lc->ifp, le->m_head);
}

/*
 * Make packet "m" the new head of entry "le" and reset the entry's
 * per-stream state (timestamp option, next expected sequence number,
 * ACK, window, flags) from its TCP header "th".
 */
static void
tcp_set_entry_to_mbuf(struct lro_ctrl *lc, struct lro_entry *le,
    struct mbuf *m, struct tcphdr *th)
{
	uint32_t *ts_ptr;
	uint16_t tcp_data_len;
	uint16_t tcp_opt_len;

	ts_ptr = (uint32_t *)(th + 1);
	tcp_opt_len = (th->th_off << 2);
	tcp_opt_len -= sizeof(*th);

	/* Check if there is a timestamp option. */
	if (tcp_opt_len == 0 ||
	    __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA ||
	    *ts_ptr != TCP_LRO_TS_OPTION)) {
		/* We failed to find the timestamp option. */
		le->timestamp = 0;
	} else {
		le->timestamp = 1;
		/* tsval is kept in host order, tsecr in network order. */
		le->tsval = ntohl(*(ts_ptr + 1));
		le->tsecr = *(ts_ptr + 2);
	}

	tcp_data_len = m->m_pkthdr.lro_tcp_d_len;

	/* Pull out TCP sequence numbers and window size. */
	le->next_seq = ntohl(th->th_seq) + tcp_data_len;
	le->ack_seq = th->th_ack;
	le->window = th->th_win;
	le->flags = tcp_get_flags(th);
	le->needs_merge = 0;

	/* Setup new data pointers. */
	le->m_head = m;
	le->m_tail = m_last(m);
}

/*
 * Deliver the current head of "le" to the stack and make "m" the new
 * head. The packets that were queued behind the old head are re-linked
 * behind "m".
 */
static void
tcp_push_and_replace(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m)
{
	struct lro_parser *pa;

	/*
	 * Push up the stack of the current entry
	 * and replace it with "m".
	 */
	struct mbuf *msave;

	/* Grab off the next and save it */
	msave = le->m_head->m_nextpkt;
	le->m_head->m_nextpkt = NULL;

	/* Now push out the old entry */
	tcp_flush_out_entry(lc, le);

	/* Re-parse new header, should not fail. */
	pa = tcp_lro_parser(m, &le->outer, &le->inner, false);
	KASSERT(pa != NULL,
	    ("tcp_push_and_replace: LRO parser failed on m=%p\n", m));

	/*
	 * Now to replace the data properly in the entry
	 * we have to reset the TCP header and
	 * other fields.
*/
	tcp_set_entry_to_mbuf(lc, le, m, pa->tcp);

	/* Restore the next list */
	m->m_nextpkt = msave;
}

/*
 * Account for segment "p" being merged into the head packet of "le":
 * the head's packet length, TCP data length, segment count and merged
 * data checksum are updated and the entry is marked as needing a
 * header/checksum rewrite.
 */
static void
tcp_lro_mbuf_append_pkthdr(struct lro_entry *le, const struct mbuf *p)
{
	struct mbuf *m;
	uint32_t csum;

	m = le->m_head;
	if (m->m_pkthdr.lro_nsegs == 1) {
		/* Compute relative checksum. */
		csum = p->m_pkthdr.lro_tcp_d_csum;
	} else {
		/* Merge TCP data checksums. */
		csum = (uint32_t)m->m_pkthdr.lro_tcp_d_csum +
		    (uint32_t)p->m_pkthdr.lro_tcp_d_csum;
		while (csum > 0xffff)
			csum = (csum >> 16) + (csum & 0xffff);
	}

	/* Update various counters. */
	m->m_pkthdr.len += p->m_pkthdr.lro_tcp_d_len;
	m->m_pkthdr.lro_tcp_d_csum = csum;
	m->m_pkthdr.lro_tcp_d_len += p->m_pkthdr.lro_tcp_d_len;
	m->m_pkthdr.lro_nsegs += p->m_pkthdr.lro_nsegs;
	le->needs_merge = 1;
}

/*
 * Merge the packets queued on "le" (linked through m_nextpkt behind
 * m_head) into as few packets as possible. Whenever the next packet
 * cannot be merged, the current head is delivered to the stack and
 * that packet becomes the new head. On return exactly one packet,
 * m_head, is left on the entry.
 */
static void
tcp_lro_condense(struct lro_ctrl *lc, struct lro_entry *le)
{
	/*
	 * Walk through the mbuf chain we
	 * have on tap and compress/condense
	 * as required.
	 */
	uint32_t *ts_ptr;
	struct mbuf *m;
	struct tcphdr *th;
	uint32_t tcp_data_len_total;
	uint32_t tcp_data_seg_total;
	uint16_t tcp_data_len;
	uint16_t tcp_opt_len;

	/*
	 * First we must check the lead (m_head)
	 * we must make sure that it is *not*
	 * something that should be sent up
	 * right away (sack etc).
	 */
again:
	m = le->m_head->m_nextpkt;
	if (m == NULL) {
		/* Just one left. */
		return;
	}

	th = tcp_lro_get_th(m);
	tcp_opt_len = (th->th_off << 2);
	tcp_opt_len -= sizeof(*th);
	ts_ptr = (uint32_t *)(th + 1);

	if (tcp_opt_len != 0 && __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA ||
	    *ts_ptr != TCP_LRO_TS_OPTION)) {
		/*
		 * Its not the timestamp. We can't
		 * use this guy as the head.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
		tcp_push_and_replace(lc, le, m);
		goto again;
	}
	if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH)) != 0) {
		/*
		 * Make sure that previously seen segments/ACKs are delivered
		 * before this segment, e.g. FIN.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
		tcp_push_and_replace(lc, le, m);
		goto again;
	}
	while ((m = le->m_head->m_nextpkt) != NULL) {
		/*
		 * condense m into le, first
		 * pull m out of the list.
		 */
		le->m_head->m_nextpkt = m->m_nextpkt;
		m->m_nextpkt = NULL;
		/* Setup my data */
		tcp_data_len = m->m_pkthdr.lro_tcp_d_len;
		th = tcp_lro_get_th(m);
		ts_ptr = (uint32_t *)(th + 1);
		tcp_opt_len = (th->th_off << 2);
		tcp_opt_len -= sizeof(*th);
		tcp_data_len_total = le->m_head->m_pkthdr.lro_tcp_d_len + tcp_data_len;
		tcp_data_seg_total = le->m_head->m_pkthdr.lro_nsegs + m->m_pkthdr.lro_nsegs;

		if (tcp_data_seg_total >= lc->lro_ackcnt_lim ||
		    tcp_data_len_total >= lc->lro_length_lim) {
			/* Flush now if appending will result in overflow. */
			tcp_push_and_replace(lc, le, m);
			goto again;
		}
		if (tcp_opt_len != 0 &&
		    __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA ||
		    *ts_ptr != TCP_LRO_TS_OPTION)) {
			/*
			 * Maybe a sack in the new one? We need to
			 * start all over after flushing the
			 * current le. We will go up to the beginning
			 * and flush it (calling the replace again possibly
			 * or just returning).
			 */
			tcp_push_and_replace(lc, le, m);
			goto again;
		}
		/* Any flag besides ACK and PUSH prevents merging. */
		if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH)) != 0) {
			tcp_push_and_replace(lc, le, m);
			goto again;
		}
		if (tcp_opt_len != 0) {
			uint32_t tsval = ntohl(*(ts_ptr + 1));
			/* Make sure timestamp values are increasing. */
			if (TSTMP_GT(le->tsval, tsval))  {
				tcp_push_and_replace(lc, le, m);
				goto again;
			}
			le->tsval = tsval;
			le->tsecr = *(ts_ptr + 2);
		}
		/* Try to append the new segment. */
		if (__predict_false(ntohl(th->th_seq) != le->next_seq ||
		    ((tcp_get_flags(th) & TH_ACK) !=
		    (le->flags & TH_ACK)) ||
		    (tcp_data_len == 0 &&
		    le->ack_seq == th->th_ack &&
		    le->window == th->th_win))) {
			/* Out of order packet, non-ACK + ACK or dup ACK. */
			tcp_push_and_replace(lc, le, m);
			goto again;
		}
		if (tcp_data_len != 0 ||
		    SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) {
			/* New data or a newer ACK: take over ACK and window. */
			le->next_seq += tcp_data_len;
			le->ack_seq = th->th_ack;
			le->window = th->th_win;
			le->needs_merge = 1;
		} else if (th->th_ack == le->ack_seq) {
			/* Same ACK: only take over a larger window. */
			if (WIN_GT(th->th_win, le->window)) {
				le->window = th->th_win;
				le->needs_merge = 1;
			}
		}

		/* A pure ACK has been absorbed into the entry state. */
		if (tcp_data_len == 0) {
			m_freem(m);
			continue;
		}

		/* Merge TCP data checksum and length to head mbuf. */
		tcp_lro_mbuf_append_pkthdr(le, m);

		/*
		 * Adjust the mbuf so that m_data points to the first byte of
		 * the ULP payload.  Adjust the mbuf to avoid complications and
		 * append new segment to existing mbuf chain.
		 */
		m_adj(m, m->m_pkthdr.len - tcp_data_len);
		m_demote_pkthdr(m);
		le->m_tail->m_next = m;
		le->m_tail = m_last(m);
	}
}

#ifdef TCPHPTS
/*
 * Move the packet list of "le" (m_head through m_last_mbuf) to the end
 * of the input queue of "tp" and clear it from the entry. The inpcb
 * write lock must be held.
 */
static void
tcp_queue_pkts(struct inpcb *inp, struct tcpcb *tp, struct lro_entry *le)
{
	INP_WLOCK_ASSERT(inp);
	if (tp->t_in_pkt == NULL) {
		/* Nothing yet there */
		tp->t_in_pkt = le->m_head;
		tp->t_tail_pkt = le->m_last_mbuf;
	} else {
		/* Already some there */
		tp->t_tail_pkt->m_nextpkt = le->m_head;
		tp->t_tail_pkt = le->m_last_mbuf;
	}
	le->m_head = NULL;
	le->m_last_mbuf = NULL;
}

/*
 * Return an mbuf into which a compressed ACK entry can be stored.
 *
 * If "can_append_old_cmp" is set and the last mbuf queued on the
 * connection is an M_ACKCMP mbuf with room for another tcp_ackent, that
 * mbuf is returned and "*new_m" is set to 0. Otherwise a new M_ACKCMP
 * mbuf is allocated and "*new_m" is set to 1. Returns NULL if there is
 * no tcpcb or the allocation fails; "*new_m" is left untouched then.
 */
static struct mbuf *
tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le,
    struct inpcb *inp, int32_t *new_m, bool can_append_old_cmp)
{
	struct tcpcb *tp;
	struct mbuf *m;

	tp = intotcpcb(inp);
	if (__predict_false(tp == NULL))
		return (NULL);

	/* Look at the last mbuf if any in queue */
	if (can_append_old_cmp) {
		m = tp->t_tail_pkt;
		if (m != NULL && (m->m_flags & M_ACKCMP) != 0) {
			if (M_TRAILINGSPACE(m) >= sizeof(struct tcp_ackent)) {
				tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0);
				*new_m = 0;
				counter_u64_add(tcp_extra_mbuf, 1);
				return (m);
			} else {
				/* Mark we ran out of space */
				inp->inp_flags2 |= INP_MBUF_L_ACKS;
			}
		}
	}
	/* Decide mbuf size.
*/
	tcp_lro_log(tp, lc, le, NULL, 21, 0, 0, 0, 0);
	/* Use a cluster once a previous ACK mbuf ran out of space. */
	if (inp->inp_flags2 & INP_MBUF_L_ACKS)
		m = m_getcl(M_NOWAIT, MT_DATA, M_ACKCMP | M_PKTHDR);
	else
		m = m_gethdr(M_NOWAIT, MT_DATA);

	if (__predict_false(m == NULL)) {
		counter_u64_add(tcp_would_have_but, 1);
		return (NULL);
	}
	counter_u64_add(tcp_comp_total, 1);
	m->m_pkthdr.rcvif = lc->ifp;
	m->m_flags |= M_ACKCMP;
	*new_m = 1;
	return (m);
}

/*
 * Look up the inpcb matching the addresses and ports stored in "pa",
 * requesting the pcb write lock (INPLOOKUP_WLOCKPCB). Returns NULL if
 * there is no match or "pa" does not describe a TCP packet.
 */
static struct inpcb *
tcp_lro_lookup(struct ifnet *ifp, struct lro_parser *pa)
{
	struct inpcb *inp;

	switch (pa->data.lro_type) {
#ifdef INET6
	case LRO_TYPE_IPV6_TCP:
		inp = in6_pcblookup(&V_tcbinfo,
		    &pa->data.s_addr.v6,
		    pa->data.s_port,
		    &pa->data.d_addr.v6,
		    pa->data.d_port,
		    INPLOOKUP_WLOCKPCB,
		    ifp);
		break;
#endif
#ifdef INET
	case LRO_TYPE_IPV4_TCP:
		inp = in_pcblookup(&V_tcbinfo,
		    pa->data.s_addr.v4,
		    pa->data.s_port,
		    pa->data.d_addr.v4,
		    pa->data.d_port,
		    INPLOOKUP_WLOCKPCB,
		    ifp);
		break;
#endif
	default:
		inp = NULL;
		break;
	}
	return (inp);
}

/*
 * Decide whether the segment "m" with TCP header "th" can be stored as
 * a compressed ACK.
 *
 * "*ppts" is set to the start of the option area when the only option
 * is the timestamp option in its standard layout, else to NULL.
 * "*other_opts" is set to true when any other options are present.
 */
static inline bool
tcp_lro_ack_valid(struct mbuf *m, struct tcphdr *th, uint32_t **ppts, bool *other_opts)
{
	/*
	 * This function returns two bits of valuable information.
	 * a) Is what is present capable of being ack-compressed,
	 *    we can ack-compress if there is no options or just
	 *    a timestamp option, and of course the th_flags must
	 *    be correct as well.
	 * b) Our other options present such as SACK. This is
	 *    used to determine if we want to wakeup or not.
	 */
	bool ret = true;

	switch (th->th_off << 2) {
	case (sizeof(*th) + TCPOLEN_TSTAMP_APPA):
		*ppts = (uint32_t *)(th + 1);
		/* Check if we have only one timestamp option. */
		if (**ppts == TCP_LRO_TS_OPTION)
			*other_opts = false;
		else {
			*other_opts = true;
			ret = false;
		}
		break;
	case (sizeof(*th)):
		/* No options. */
		*ppts = NULL;
		*other_opts = false;
		break;
	default:
		*ppts = NULL;
		*other_opts = true;
		ret = false;
		break;
	}
	/* For ACKCMP we only accept ACK, PUSH, ECE and CWR.
*/
	if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) != 0)
		ret = false;
	/* If it has data on it we cannot compress it */
	if (m->m_pkthdr.lro_tcp_d_len)
		ret = false;

	/* ACK flag must be set. */
	if (!(tcp_get_flags(th) & TH_ACK))
		ret = false;
	return (ret);
}

/*
 * Try to queue the packets of "le" directly on the matching TCP
 * connection (mbuf queue / compressed ACK path) instead of passing them
 * through the regular input path.
 *
 * Returns 0 when the packets were consumed, or TCP_LRO_CANNOT when the
 * caller has to fall back to the regular flush.
 */
static int
tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	struct mbuf **pp, *cmp, *mv_to;
	struct ifnet *lagg_ifp;
	bool bpf_req, lagg_bpf_req, should_wake, can_append_old_cmp;

	/* Check if packet doesn't belongs to our network interface. */
	if ((tcplro_stacks_wanting_mbufq == 0) ||
	    (le->outer.data.vlan_id != 0) ||
	    (le->inner.data.lro_type != LRO_TYPE_NONE))
		return (TCP_LRO_CANNOT);

#ifdef INET6
	/*
	 * Be proactive about unspecified IPv6 address in source. As
	 * we use all-zero to indicate unbounded/unconnected pcb,
	 * unspecified IPv6 address can be used to confuse us.
	 *
	 * Note that packets with unspecified IPv6 destination is
	 * already dropped in ip6_input.
	 */
	if (__predict_false(le->outer.data.lro_type == LRO_TYPE_IPV6_TCP &&
	    IN6_IS_ADDR_UNSPECIFIED(&le->outer.data.s_addr.v6)))
		return (TCP_LRO_CANNOT);

	if (__predict_false(le->inner.data.lro_type == LRO_TYPE_IPV6_TCP &&
	    IN6_IS_ADDR_UNSPECIFIED(&le->inner.data.s_addr.v6)))
		return (TCP_LRO_CANNOT);
#endif
	/* Lookup inp, if any. */
	/* On success the inpcb is returned write-locked. */
	inp = tcp_lro_lookup(lc->ifp,
	    (le->inner.data.lro_type == LRO_TYPE_NONE) ? &le->outer : &le->inner);
	if (inp == NULL)
		return (TCP_LRO_CANNOT);

	counter_u64_add(tcp_inp_lro_locks_taken, 1);

	/* Get TCP control structure. */
	tp = intotcpcb(inp);

	/* Check if the inp is dead, Jim. */
	/*
	 * NOTE(review): the lines below prefixed with '-' and '+' are
	 * unified-diff markers carried in this text (removed and added
	 * lines of the patch), not C operators.
	 */
-	if (tp == NULL ||
-	    (inp->inp_flags & INP_DROPPED) ||
-	    (tp->t_state == TCPS_TIME_WAIT)) {
+	if (tp->t_state == TCPS_TIME_WAIT) {
		INP_WUNLOCK(inp);
		return (TCP_LRO_CANNOT);
	}
	/* Record the receiving CPU on the inpcb once LRO has settled on one. */
	if ((inp->inp_irq_cpu_set == 0)  && (lc->lro_cpu_is_set == 1)) {
		inp->inp_irq_cpu = lc->lro_last_cpu;
		inp->inp_irq_cpu_set = 1;
	}
	/* Check if the transport doesn't support the needed optimizations. */
	if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
		INP_WUNLOCK(inp);
		return (TCP_LRO_CANNOT);
	}

	if (inp->inp_flags2 & INP_MBUF_QUEUE_READY)
		should_wake = false;
	else
		should_wake = true;
	/* Check if packets should be tapped to BPF. */
	bpf_req = bpf_peers_present(lc->ifp->if_bpf);
	lagg_bpf_req = false;
	lagg_ifp = NULL;
	if (lc->ifp->if_type == IFT_IEEE8023ADLAG ||
	    lc->ifp->if_type == IFT_INFINIBANDLAG) {
		struct lagg_port *lp = lc->ifp->if_lagg;
		struct lagg_softc *sc = lp->lp_softc;

		lagg_ifp = sc->sc_ifp;
		if (lagg_ifp != NULL)
			lagg_bpf_req = bpf_peers_present(lagg_ifp->if_bpf);
	}

	/* Strip and compress all the incoming packets. */
	can_append_old_cmp = true;
	cmp = NULL;
	/* "pp" always points at the link that holds the current packet. */
	for (pp = &le->m_head; *pp != NULL; ) {
		mv_to = NULL;
		if (do_bpf_strip_and_compress(inp, lc, le, pp,
			&cmp, &mv_to, &should_wake, bpf_req,
			lagg_bpf_req, lagg_ifp, can_append_old_cmp) == false) {
			/* Advance to next mbuf. */
			pp = &(*pp)->m_nextpkt;
			/*
			 * Once we have appended we can't look in the pending
			 * inbound packets for a compressed ack to append to.
			 */
			can_append_old_cmp = false;
			/*
			 * Once we append we also need to stop adding to any
			 * compressed ack we were remembering. A new cmp
			 * ack will be required.
			 */
			cmp = NULL;
			tcp_lro_log(tp, lc, le, NULL, 25, 0, 0, 0, 0);
		} else if (mv_to != NULL) {
			/* We are asked to move pp up */
			pp = &mv_to->m_nextpkt;
			tcp_lro_log(tp, lc, le, NULL, 24, 0, 0, 0, 0);
		} else
			tcp_lro_log(tp, lc, le, NULL, 26, 0, 0, 0, 0);
	}
	/* Update "m_last_mbuf", if any. */
	if (pp == &le->m_head)
		le->m_last_mbuf = *pp;
	else
		le->m_last_mbuf = __containerof(pp, struct mbuf, m_nextpkt);

	/* Check if any data mbufs left. */
	if (le->m_head != NULL) {
		counter_u64_add(tcp_inp_lro_direct_queue, 1);
		tcp_lro_log(tp, lc, le, NULL, 22, 1, inp->inp_flags2, 0, 1);
		tcp_queue_pkts(inp, tp, le);
	}
	if (should_wake) {
		/* Wakeup */
		counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
		/*
		 * After a non-zero return the inpcb is no longer
		 * touched or unlocked here.
		 */
		if ((*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0))
			inp = NULL;
	}
	if (inp != NULL)
		INP_WUNLOCK(inp);
	return (0);	/* Success. */
}
#endif

/*
 * Flush entry "le": deliver its packets, either directly to the TCP
 * connection (TCPHPTS path) or condensed through the interface input
 * routine, then clear the entry and return it to the free list. The
 * caller must already have removed "le" from the active list and must
 * be inside the network epoch.
 */
void
tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
{
	/* Only optimise if there are multiple packets waiting. */
#ifdef TCPHPTS
	int error;
#endif

	NET_EPOCH_ASSERT();
#ifdef TCPHPTS
	CURVNET_SET(lc->ifp->if_vnet);
	error = tcp_lro_flush_tcphpts(lc, le);
	CURVNET_RESTORE();
	if (error != 0) {
#endif
		tcp_lro_condense(lc, le);
		tcp_flush_out_entry(lc, le);
#ifdef TCPHPTS
	}
#endif
	lc->lro_flushed++;
	/* Zeroing the entry also wipes its list linkage before reuse. */
	bzero(le, sizeof(*le));
	LIST_INSERT_HEAD(&lc->lro_free, le, next);
}

/*
 * tcp_lro_msb_64(x) yields a value with only the most significant set
 * bit of "x" set. The fallback smears that bit into all lower
 * positions and then clears everything below it.
 */
#ifdef HAVE_INLINE_FLSLL
#define	tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1))
#else
static inline uint64_t
tcp_lro_msb_64(uint64_t x)
{
	x |= (x >> 1);
	x |= (x >> 2);
	x |= (x >> 4);
	x |= (x >> 8);
	x |= (x >> 16);
	x |= (x >> 32);
	return (x & ~(x >> 1));
}
#endif

/*
 * The tcp_lro_sort() routine is comparable to qsort(), except it has
 * a worst case complexity limit of O(MIN(N,64)*N), where N is the
 * number of elements to sort and 64 is the number of sequence bits
 * available. The algorithm is bit-slicing the 64-bit sequence number,
 * sorting one bit at a time from the most significant bit until the
 * least significant one, skipping the constant bits. This is
 * typically called a radix sort.
*/
static void
tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
{
	struct lro_mbuf_sort temp;
	uint64_t ones;
	uint64_t zeros;
	uint32_t x;
	uint32_t y;

repeat:
	/* for small arrays insertion sort is faster */
	if (size <= 12) {
		for (x = 1; x < size; x++) {
			temp = parray[x];
			for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
				parray[y] = parray[y - 1];
			parray[y] = temp;
		}
		return;
	}

	/* compute sequence bits which are constant */
	ones = 0;
	zeros = 0;
	for (x = 0; x != size; x++) {
		ones |= parray[x].seq;
		zeros |= ~parray[x].seq;
	}

	/* compute bits which are not constant into "ones" */
	ones &= zeros;
	/* All sequence values are equal: nothing left to sort. */
	if (ones == 0)
		return;

	/* pick the most significant bit which is not constant */
	ones = tcp_lro_msb_64(ones);

	/*
	 * Move entries having cleared sequence bits to the beginning
	 * of the array:
	 */
	for (x = y = 0; y != size; y++) {
		/* skip set bits */
		if (parray[y].seq & ones)
			continue;
		/* swap entries */
		temp = parray[x];
		parray[x] = parray[y];
		parray[y] = temp;
		x++;
	}

	/* The chosen bit differs between entries, so both parts are non-empty. */
	KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));

	/* sort zeros */
	tcp_lro_sort(parray, x);

	/* sort ones */
	/* Handled by looping instead of a second recursive call. */
	parray += x;
	size -= x;
	goto repeat;
}

/*
 * Feed all mbufs queued in the sort array of "lc" to the LRO engine,
 * grouped by stream, and flush the resulting entries. Must be called
 * from within the network epoch.
 */
void
tcp_lro_flush_all(struct lro_ctrl *lc)
{
	uint64_t seq;
	uint64_t nseq;
	unsigned x;

	NET_EPOCH_ASSERT();
	/* check if no mbufs to flush */
	if (lc->lro_mbuf_count == 0)
		goto done;
	/* Track whether this control block is always serviced by one CPU. */
	if (lc->lro_cpu_is_set == 0) {
		if (lc->lro_last_cpu == curcpu) {
			lc->lro_cnt_of_same_cpu++;
			/* Have we reached the threshold to declare a cpu?
*/ if (lc->lro_cnt_of_same_cpu > tcp_lro_cpu_set_thresh) lc->lro_cpu_is_set = 1; } else { lc->lro_last_cpu = curcpu; lc->lro_cnt_of_same_cpu = 0; } } CURVNET_SET(lc->ifp->if_vnet); /* get current time */ binuptime(&lc->lro_last_queue_time); /* sort all mbufs according to stream */ tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count); /* input data into LRO engine, stream by stream */ seq = 0; for (x = 0; x != lc->lro_mbuf_count; x++) { struct mbuf *mb; /* get mbuf */ mb = lc->lro_mbuf_data[x].mb; /* get sequence number, masking away the packet index */ nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24); /* check for new stream */ if (seq != nseq) { seq = nseq; /* flush active streams */ tcp_lro_rx_done(lc); } /* add packet to LRO engine */ if (tcp_lro_rx_common(lc, mb, 0, false) != 0) { /* Flush anything we have acummulated */ tcp_lro_flush_active(lc); /* input packet to network layer */ (*lc->ifp->if_input)(lc->ifp, mb); lc->lro_queued++; lc->lro_flushed++; } } CURVNET_RESTORE(); done: /* flush active streams */ tcp_lro_rx_done(lc); #ifdef TCPHPTS tcp_run_hpts(); #endif lc->lro_mbuf_count = 0; } #ifdef TCPHPTS static void build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m, uint32_t *ts_ptr, uint16_t iptos) { /* * Given a TCP ACK, summarize it down into the small TCP ACK * entry. */ ae->timestamp = m->m_pkthdr.rcv_tstmp; ae->flags = 0; if (m->m_flags & M_TSTMP_LRO) ae->flags |= TSTMP_LRO; else if (m->m_flags & M_TSTMP) ae->flags |= TSTMP_HDWR; ae->seq = ntohl(th->th_seq); ae->ack = ntohl(th->th_ack); ae->flags |= tcp_get_flags(th); if (ts_ptr != NULL) { ae->ts_value = ntohl(ts_ptr[1]); ae->ts_echo = ntohl(ts_ptr[2]); ae->flags |= HAS_TSTMP; } ae->win = ntohs(th->th_win); ae->codepoint = iptos; } /* * Do BPF tap for either ACK_CMP packets or MBUF QUEUE type packets * and strip all, but the IPv4/IPv6 header. 
*/ static bool do_bpf_strip_and_compress(struct inpcb *inp, struct lro_ctrl *lc, struct lro_entry *le, struct mbuf **pp, struct mbuf **cmp, struct mbuf **mv_to, bool *should_wake, bool bpf_req, bool lagg_bpf_req, struct ifnet *lagg_ifp, bool can_append_old_cmp) { union { void *ptr; struct ip *ip4; struct ip6_hdr *ip6; } l3; struct mbuf *m; struct mbuf *nm; struct tcphdr *th; struct tcp_ackent *ack_ent; uint32_t *ts_ptr; int32_t n_mbuf; bool other_opts, can_compress; uint8_t lro_type; uint16_t iptos; int tcp_hdr_offset; int idx; /* Get current mbuf. */ m = *pp; /* Let the BPF see the packet */ if (__predict_false(bpf_req)) ETHER_BPF_MTAP(lc->ifp, m); if (__predict_false(lagg_bpf_req)) ETHER_BPF_MTAP(lagg_ifp, m); tcp_hdr_offset = m->m_pkthdr.lro_tcp_h_off; lro_type = le->inner.data.lro_type; switch (lro_type) { case LRO_TYPE_NONE: lro_type = le->outer.data.lro_type; switch (lro_type) { case LRO_TYPE_IPV4_TCP: tcp_hdr_offset -= sizeof(*le->outer.ip4); m->m_pkthdr.lro_etype = ETHERTYPE_IP; break; case LRO_TYPE_IPV6_TCP: tcp_hdr_offset -= sizeof(*le->outer.ip6); m->m_pkthdr.lro_etype = ETHERTYPE_IPV6; break; default: goto compressed; } break; case LRO_TYPE_IPV4_TCP: tcp_hdr_offset -= sizeof(*le->outer.ip4); m->m_pkthdr.lro_etype = ETHERTYPE_IP; break; case LRO_TYPE_IPV6_TCP: tcp_hdr_offset -= sizeof(*le->outer.ip6); m->m_pkthdr.lro_etype = ETHERTYPE_IPV6; break; default: goto compressed; } MPASS(tcp_hdr_offset >= 0); m_adj(m, tcp_hdr_offset); m->m_flags |= M_LRO_EHDRSTRP; m->m_flags &= ~M_ACKCMP; m->m_pkthdr.lro_tcp_h_off -= tcp_hdr_offset; th = tcp_lro_get_th(m); th->th_sum = 0; /* TCP checksum is valid. */ /* Check if ACK can be compressed */ can_compress = tcp_lro_ack_valid(m, th, &ts_ptr, &other_opts); /* Now lets look at the should wake states */ if ((other_opts == true) && ((inp->inp_flags2 & INP_DONT_SACK_QUEUE) == 0)) { /* * If there are other options (SACK?) 
and the * tcp endpoint has not expressly told us it does * not care about SACKS, then we should wake up. */ *should_wake = true; } /* Is the ack compressable? */ if (can_compress == false) goto done; /* Does the TCP endpoint support ACK compression? */ if ((inp->inp_flags2 & INP_MBUF_ACKCMP) == 0) goto done; /* Lets get the TOS/traffic class field */ l3.ptr = mtod(m, void *); switch (lro_type) { case LRO_TYPE_IPV4_TCP: iptos = l3.ip4->ip_tos; break; case LRO_TYPE_IPV6_TCP: iptos = IPV6_TRAFFIC_CLASS(l3.ip6); break; default: iptos = 0; /* Keep compiler happy. */ break; } /* Now lets get space if we don't have some already */ if (*cmp == NULL) { new_one: nm = tcp_lro_get_last_if_ackcmp(lc, le, inp, &n_mbuf, can_append_old_cmp); if (__predict_false(nm == NULL)) goto done; *cmp = nm; if (n_mbuf) { /* * Link in the new cmp ack to our in-order place, * first set our cmp ack's next to where we are. */ nm->m_nextpkt = m; (*pp) = nm; /* * Set it up so mv_to is advanced to our * compressed ack. This way the caller can * advance pp to the right place. */ *mv_to = nm; /* * Advance it here locally as well. */ pp = &nm->m_nextpkt; } } else { /* We have one already we are working on */ nm = *cmp; if (M_TRAILINGSPACE(nm) < sizeof(struct tcp_ackent)) { /* We ran out of space */ inp->inp_flags2 |= INP_MBUF_L_ACKS; goto new_one; } } MPASS(M_TRAILINGSPACE(nm) >= sizeof(struct tcp_ackent)); counter_u64_add(tcp_inp_lro_compressed, 1); le->compressed++; /* We can add in to the one on the tail */ ack_ent = mtod(nm, struct tcp_ackent *); idx = (nm->m_len / sizeof(struct tcp_ackent)); build_ack_entry(&ack_ent[idx], th, m, ts_ptr, iptos); /* Bump the size of both pkt-hdr and len */ nm->m_len += sizeof(struct tcp_ackent); nm->m_pkthdr.len += sizeof(struct tcp_ackent); compressed: /* Advance to next mbuf before freeing. 
*/ *pp = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); return (true); done: counter_u64_add(tcp_uncomp_total, 1); le->uncompressed++; return (false); } #endif static struct lro_head * tcp_lro_rx_get_bucket(struct lro_ctrl *lc, struct mbuf *m, struct lro_parser *parser) { u_long hash; if (M_HASHTYPE_ISHASH(m)) { hash = m->m_pkthdr.flowid; } else { for (unsigned i = hash = 0; i != LRO_RAW_ADDRESS_MAX; i++) hash += parser->data.raw[i]; } return (&lc->lro_hash[hash % lc->lro_hashsz]); } static int tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_hash) { struct lro_parser pi; /* inner address data */ struct lro_parser po; /* outer address data */ struct lro_parser *pa; /* current parser for TCP stream */ struct lro_entry *le; struct lro_head *bucket; struct tcphdr *th; int tcp_data_len; int tcp_opt_len; int error; uint16_t tcp_data_sum; #ifdef INET /* Quickly decide if packet cannot be LRO'ed */ if (__predict_false(V_ipforwarding != 0)) return (TCP_LRO_CANNOT); #endif #ifdef INET6 /* Quickly decide if packet cannot be LRO'ed */ if (__predict_false(V_ip6_forwarding != 0)) return (TCP_LRO_CANNOT); #endif if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) != ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || (m->m_pkthdr.csum_data != 0xffff)) { /* * The checksum either did not have hardware offload * or it was a bad checksum. We can't LRO such * a packet. */ counter_u64_add(tcp_bad_csums, 1); return (TCP_LRO_CANNOT); } /* We expect a contiguous header [eh, ip, tcp]. */ pa = tcp_lro_parser(m, &po, &pi, true); if (__predict_false(pa == NULL)) return (TCP_LRO_NOT_SUPPORTED); /* We don't expect any padding. 
*/ error = tcp_lro_trim_mbuf_chain(m, pa); if (__predict_false(error != 0)) return (error); #ifdef INET switch (pa->data.lro_type) { case LRO_TYPE_IPV4_TCP: error = tcp_lro_rx_ipv4(lc, m, pa->ip4); if (__predict_false(error != 0)) return (error); break; default: break; } #endif /* If no hardware or arrival stamp on the packet add timestamp */ if ((m->m_flags & (M_TSTMP_LRO | M_TSTMP)) == 0) { m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); m->m_flags |= M_TSTMP_LRO; } /* Get pointer to TCP header. */ th = pa->tcp; /* Don't process SYN packets. */ if (__predict_false(tcp_get_flags(th) & TH_SYN)) return (TCP_LRO_CANNOT); /* Get total TCP header length and compute payload length. */ tcp_opt_len = (th->th_off << 2); tcp_data_len = m->m_pkthdr.len - ((uint8_t *)th - (uint8_t *)m->m_data) - tcp_opt_len; tcp_opt_len -= sizeof(*th); /* Don't process invalid TCP headers. */ if (__predict_false(tcp_opt_len < 0 || tcp_data_len < 0)) return (TCP_LRO_CANNOT); /* Compute TCP data only checksum. */ if (tcp_data_len == 0) tcp_data_sum = 0; /* no data, no checksum */ else if (__predict_false(csum != 0)) tcp_data_sum = tcp_lro_rx_csum_data(pa, ~csum); else tcp_data_sum = tcp_lro_rx_csum_data(pa, ~th->th_sum); /* Save TCP info in mbuf. */ m->m_nextpkt = NULL; m->m_pkthdr.rcvif = lc->ifp; m->m_pkthdr.lro_tcp_d_csum = tcp_data_sum; m->m_pkthdr.lro_tcp_d_len = tcp_data_len; m->m_pkthdr.lro_tcp_h_off = ((uint8_t *)th - (uint8_t *)m->m_data); m->m_pkthdr.lro_nsegs = 1; /* Get hash bucket. */ if (!use_hash) { bucket = &lc->lro_hash[0]; } else { bucket = tcp_lro_rx_get_bucket(lc, m, pa); } /* Try to find a matching previous segment. */ LIST_FOREACH(le, bucket, hash_next) { /* Compare addresses and ports. */ if (lro_address_compare(&po.data, &le->outer.data) == false || lro_address_compare(&pi.data, &le->inner.data) == false) continue; /* Check if no data and old ACK. 
*/ if (tcp_data_len == 0 && SEQ_LT(ntohl(th->th_ack), ntohl(le->ack_seq))) { m_freem(m); return (0); } /* Mark "m" in the last spot. */ le->m_last_mbuf->m_nextpkt = m; /* Now set the tail to "m". */ le->m_last_mbuf = m; return (0); } /* Try to find an empty slot. */ if (LIST_EMPTY(&lc->lro_free)) return (TCP_LRO_NO_ENTRIES); /* Start a new segment chain. */ le = LIST_FIRST(&lc->lro_free); LIST_REMOVE(le, next); tcp_lro_active_insert(lc, bucket, le); /* Make sure the headers are set. */ le->inner = pi; le->outer = po; /* Store time this entry was allocated. */ le->alloc_time = lc->lro_last_queue_time; tcp_set_entry_to_mbuf(lc, le, m, th); /* Now set the tail to "m". */ le->m_last_mbuf = m; return (0); } int tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) { int error; if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) != ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || (m->m_pkthdr.csum_data != 0xffff)) { /* * The checksum either did not have hardware offload * or it was a bad checksum. We can't LRO such * a packet. */ counter_u64_add(tcp_bad_csums, 1); return (TCP_LRO_CANNOT); } /* get current time */ binuptime(&lc->lro_last_queue_time); CURVNET_SET(lc->ifp->if_vnet); error = tcp_lro_rx_common(lc, m, csum, true); if (__predict_false(error != 0)) { /* * Flush anything we have acummulated * ahead of this packet that can't * be LRO'd. This preserves order. 
*/ tcp_lro_flush_active(lc); } CURVNET_RESTORE(); return (error); } void tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) { NET_EPOCH_ASSERT(); /* sanity checks */ if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL || lc->lro_mbuf_max == 0)) { /* packet drop */ m_freem(mb); return; } /* check if packet is not LRO capable */ if (__predict_false((lc->ifp->if_capenable & IFCAP_LRO) == 0)) { /* input packet to network layer */ (*lc->ifp->if_input) (lc->ifp, mb); return; } /* If no hardware or arrival stamp on the packet add timestamp */ if ((tcplro_stacks_wanting_mbufq > 0) && (tcp_less_accurate_lro_ts == 0) && ((mb->m_flags & M_TSTMP) == 0)) { /* Add in an LRO time since no hardware */ binuptime(&lc->lro_last_queue_time); mb->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); mb->m_flags |= M_TSTMP_LRO; } /* create sequence number */ lc->lro_mbuf_data[lc->lro_mbuf_count].seq = (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | (((uint64_t)mb->m_pkthdr.flowid) << 24) | ((uint64_t)lc->lro_mbuf_count); /* enter mbuf */ lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb; /* flush if array is full */ if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max)) tcp_lro_flush_all(lc); } /* end */ diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index b78967a0f20c..2363cdf75e1e 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1,4106 +1,4102 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include "opt_tcpdebug.h" #include #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #ifdef TCP_HHOOK #include #endif #ifdef KERN_TLS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #include #endif #include #ifdef INVARIANTS #define TCPSTATES #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCPDEBUG #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 static ip6proto_ctlinput_t tcp6_ctlinput; static udp_tun_icmp_t tcp6_ctlinput_viaudp; #endif VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif #ifdef NETFLIX_EXP_DETECTION /* Sack attack detection thresholds and such */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack_attack, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Sack Attack detection thresholds"); int32_t tcp_force_detection = 0; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection, CTLFLAG_RW, &tcp_force_detection, 0, "Do we force detection even if the INP has it off?"); int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh, CTLFLAG_RW, &tcp_sack_to_ack_thresh, 700, "Percentage of sacks to acks we must see above (10.1 percent is 101)?"); int32_t tcp_sack_to_move_thresh = 600; /* 60 % */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, move_thresh, CTLFLAG_RW, 
&tcp_sack_to_move_thresh, 600, "Percentage of sack moves we must see above (10.1 percent is 101)"); int32_t tcp_restoral_thresh = 650; /* 65 % (sack:2:ack -5%) */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, restore_thresh, CTLFLAG_RW, &tcp_restoral_thresh, 550, "Percentage of sack to ack percentage we must see below to restore(10.1 percent is 101)"); int32_t tcp_sad_decay_val = 800; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, decay_per, CTLFLAG_RW, &tcp_sad_decay_val, 800, "The decay percentage (10.1 percent equals 101 )"); int32_t tcp_map_minimum = 500; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, nummaps, CTLFLAG_RW, &tcp_map_minimum, 500, "Number of Map enteries before we start detection"); int32_t tcp_attack_on_turns_on_logging = 0; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, attacks_logged, CTLFLAG_RW, &tcp_attack_on_turns_on_logging, 0, "When we have a positive hit on attack, do we turn on logging?"); int32_t tcp_sad_pacing_interval = 2000; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_pacing_int, CTLFLAG_RW, &tcp_sad_pacing_interval, 2000, "What is the minimum pacing interval for a classified attacker?"); int32_t tcp_sad_low_pps = 100; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_low_pps, CTLFLAG_RW, &tcp_sad_low_pps, 100, "What is the input pps that below which we do not decay?"); #endif uint32_t tcp_ack_war_time_window = 1000; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow, CTLFLAG_RW, &tcp_ack_war_time_window, 1000, "If the tcp_stack does ack-war prevention how many milliseconds are in its time window?"); uint32_t tcp_ack_war_cnt = 5; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt, CTLFLAG_RW, &tcp_ack_war_cnt, 5, "If the tcp_stack does ack-war prevention how many acks can be sent in its time window?"); struct rwlock tcp_function_lock; static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if 
(new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); /* * As of June 2021, several TCP stacks violate RFC 7323 from September 2014. * Some stacks negotiate TS, but never send them after connection setup. Some * stacks negotiate TS, but don't send them when sending keep-alive segments. * These include modern widely deployed TCP stacks. * Therefore tolerating violations for now... 
*/ VNET_DEFINE(int, tcp_tolerate_missing_ts) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tolerate_missing_ts, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_tolerate_missing_ts), 0, "Tolerate missing TCP timestamps"); VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ts_offset_per_conn), 0, "Initialize TCP timestamps per connection instead of per host pair"); /* How many connections are pacing */ static volatile uint32_t number_of_tcp_connections_pacing = 0; static uint32_t shadow_num_connections = 0; static int tcp_pacing_limit = 10000; SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW, &tcp_pacing_limit, 1000, "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD, &shadow_num_connections, 0, "Number of TCP connections being paced"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); VNET_DEFINE_STATIC(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, 
isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) VNET_DEFINE(uint32_t, tcp_map_entries_limit) = 0; /* unlimited */ static int sysctl_net_inet_tcp_map_limit_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_tcp_map_entries_limit; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { /* only allow "0" and value > minimum */ if (new > 0 && new < TCP_MIN_MAP_ENTRIES_LIMIT) error = EINVAL; else V_tcp_map_entries_limit = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, map_limit, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_map_entries_limit), 0, &sysctl_net_inet_tcp_map_limit_check, "IU", "Total sendmap entries limit"); VNET_DEFINE(uint32_t, tcp_map_split_limit) = 0; /* unlimited */ SYSCTL_UINT(_net_inet_tcp, OID_AUTO, split_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_map_split_limit), 0, "Total sendmap split entries limit"); #ifdef TCP_HHOOK VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); #endif #define TS_OFFSET_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]); #define V_ts_offset_secret VNET(ts_offset_secret) static int tcp_default_fb_init(struct tcpcb *tp); static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged); static int tcp_default_handoff_ok(struct tcpcb *tp); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc(struct inpcb *, int); static struct inpcb *tcp_drop_syn_sent(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, 
const void *ip4hdr, const void *ip6hdr); static ipproto_ctlinput_t tcp_ctlinput; static udp_tun_icmp_t tcp_ctlinput_viaudp; static struct tcp_function_block tcp_def_funcblk = { .tfb_tcp_block_name = "freebsd", .tfb_tcp_output = tcp_default_output, .tfb_tcp_do_segment = tcp_do_segment, .tfb_tcp_ctloutput = tcp_default_ctloutput, .tfb_tcp_handoff_ok = tcp_default_handoff_ok, .tfb_tcp_fb_init = tcp_default_fb_init, .tfb_tcp_fb_fini = tcp_default_fb_fini, }; static int tcp_fb_cnt = 0; struct tcp_funchead t_functions; static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp) { TCPSTAT_INC(tcps_dsack_count); tp->t_dsack_pack++; if (tlp == 0) { if (SEQ_GT(end, start)) { tp->t_dsack_bytes += (end - start); TCPSTAT_ADD(tcps_dsack_bytes, (end - start)); } else { tp->t_dsack_tlp_bytes += (start - end); TCPSTAT_ADD(tcps_dsack_bytes, (start - end)); } } else { if (SEQ_GT(end, start)) { tp->t_dsack_bytes += (end - start); TCPSTAT_ADD(tcps_dsack_tlp_bytes, (end - start)); } else { tp->t_dsack_tlp_bytes += (start - end); TCPSTAT_ADD(tcps_dsack_tlp_bytes, (start - end)); } } } static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; struct tcp_function_block *blk=NULL; TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } return(blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { struct tcp_function_block *rblk=NULL; struct tcp_function *f; TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; if (s) { *s = f; } break; } } return (rblk); } struct tcp_function_block * find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) refcount_acquire(&blk->tfb_refcnt); 
rw_runlock(&tcp_function_lock); return(blk); } struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(rblk); } /* Find a matching alias for the given tcp_function_block. */ int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs) { struct tcp_function *f; int found; found = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { if ((f->tf_fb == blk) && (strncmp(f->tf_name, blk->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX) != 0)) { /* Matching function block with different name. */ strncpy(fs->function_set_name, f->tf_name, TCP_FUNCTION_NAME_LEN_MAX); found = 1; break; } } /* Null terminate the string appropriately. */ if (found) { fs->function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; } else { fs->function_set_name[0] = '\0'; } rw_runlock(&tcp_function_lock); return (found); } static struct tcp_function_block * find_and_ref_tcp_default_fb(void) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = tcp_func_set_ptr; refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return (rblk); } void tcp_switch_back_to_default(struct tcpcb *tp) { struct tcp_function_block *tfb; KASSERT(tp->t_fb != &tcp_def_funcblk, ("%s: called by the built-in default stack", __func__)); /* * Release the old stack. This function will either find a new one * or panic. */ if (tp->t_fb->tfb_tcp_fb_fini != NULL) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); /* * Now, we'll find a new function block to use. * Start by trying the current user-selected * default, unless this stack is the user-selected * default. */ tfb = find_and_ref_tcp_default_fb(); if (tfb == tp->t_fb) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Does the stack accept this connection? 
*/ if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL && (*tfb->tfb_tcp_handoff_ok)(tp)) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Try to use that stack. */ if (tfb != NULL) { /* Initialize the new stack. If it succeeds, we are done. */ tp->t_fb = tfb; if (tp->t_fb->tfb_tcp_fb_init == NULL || (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0) return; /* * Initialization failed. Release the reference count on * the stack. */ refcount_release(&tfb->tfb_refcnt); } /* * If that wasn't feasible, use the built-in default * stack which is not allowed to reject anyone. */ tfb = find_and_ref_tcp_fb(&tcp_def_funcblk); if (tfb == NULL) { /* there always should be a default */ panic("Can't refer to tcp_def_funcblk"); } if (tfb->tfb_tcp_handoff_ok != NULL) { if ((*tfb->tfb_tcp_handoff_ok) (tp)) { /* The default stack cannot say no */ panic("Default stack rejects a new session?"); } } tp->t_fb = tfb; if (tp->t_fb->tfb_tcp_fb_init != NULL && (*tp->t_fb->tfb_tcp_fb_init)(tp)) { /* The default stack cannot fail */ panic("Default stack initialization failed"); } } static bool tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { struct ip *iph; #ifdef INET6 struct ip6_hdr *ip6; #endif struct udphdr *uh; struct tcphdr *th; int thlen; uint16_t port; TCPSTAT_INC(tcps_tunneled_pkts); if ((m->m_flags & M_PKTHDR) == 0) { /* Can't handle one that is not a pkt hdr */ TCPSTAT_INC(tcps_tunneled_errs); goto out; } thlen = sizeof(struct tcphdr); if (m->m_len < off + sizeof(struct udphdr) + thlen && (m = m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) { TCPSTAT_INC(tcps_tunneled_errs); goto out; } iph = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)iph + off); th = (struct tcphdr *)(uh + 1); thlen = th->th_off << 2; if (m->m_len < off + sizeof(struct udphdr) + thlen) { m = m_pullup(m, off + sizeof(struct udphdr) + thlen); if (m == NULL) { TCPSTAT_INC(tcps_tunneled_errs); goto out; } else { iph = mtod(m, struct ip *); 
uh = (struct udphdr *)((caddr_t)iph + off); th = (struct tcphdr *)(uh + 1); } } m->m_pkthdr.tcp_tun_port = port = uh->uh_sport; bcopy(th, uh, m->m_len - off); m->m_len -= sizeof(struct udphdr); m->m_pkthdr.len -= sizeof(struct udphdr); /* * We use the same algorithm for * both UDP and TCP for c-sum. So * the code in tcp_input will skip * the checksum. So we do nothing * with the flag (m->m_pkthdr.csum_flags). */ switch (iph->ip_v) { #ifdef INET case IPVERSION: iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr)); tcp_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif #ifdef INET6 case IPV6_VERSION >> 4: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr)); tcp6_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif default: goto out; break; } return (true); out: m_freem(m); return (true); } static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { int error=ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; memset(&fs, 0, sizeof(fs)); rw_rlock(&tcp_function_lock); blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); if (blk) { /* Found him */ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) return(error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { error = ENOENT; goto done; } tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_default_tcp_functions, "A", "Set/get the default TCP functions"); static int sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) { int error, cnt, linesz; struct 
/*
 * Close the UDP tunneling sockets of the current vnet.
 *
 * Each of the IPv4 and IPv6 tunneling sockets that is currently set is
 * closed with soclose() and its pointer reset to NULL.
 *
 * NOTE(review): the original comment states that the sysctl caller is
 * assumed to hold the inp info lock for writing; no locking is visible
 * in this function -- confirm against the callers.
 */
static void
tcp_over_udp_stop(void)
{
#if defined(INET) || defined(INET6)
	struct socket *so;
#endif

#ifdef INET
	so = V_udp4_tun_socket;
	if (so != NULL) {
		soclose(so);
		V_udp4_tun_socket = NULL;
	}
#endif
#ifdef INET6
	so = V_udp6_tun_socket;
	if (so != NULL) {
		soclose(so);
		V_udp6_tun_socket = NULL;
	}
#endif
}
*/ memset(&sin6, 0, sizeof(struct sockaddr_in6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_port = htons(port); if ((ret = sobind(V_udp6_tun_socket, (struct sockaddr *)&sin6, curthread))) { tcp_over_udp_stop(); return (ret); } #endif return (0); } static int sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t old, new; old = V_tcp_udp_tunneling_port; new = old; error = sysctl_handle_int(oidp, &new, 0, req); if ((error == 0) && (req->newptr != NULL)) { if ((new < TCP_TUNNELING_PORT_MIN) || (new > TCP_TUNNELING_PORT_MAX)) { error = EINVAL; } else { V_tcp_udp_tunneling_port = new; if (old != 0) { tcp_over_udp_stop(); } if (new != 0) { error = tcp_over_udp_start(); } } } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_port), 0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU", "Tunneling port for tcp over udp"); VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT; static int sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_udp_tunneling_overhead; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if ((new < TCP_TUNNELING_OVERHEAD_MIN) || (new > TCP_TUNNELING_OVERHEAD_MAX)) error = EINVAL; else V_tcp_udp_tunneling_overhead = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_overhead), 0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU", "MSS reduction when using tcp over udp"); /* * Exports one (struct tcp_function_info) for each alias/name. */ static int sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS) { int cnt, error; struct tcp_function *f; struct tcp_function_info tfi; /* * We don't allow writes. 
*/ if (req->newptr != NULL) return (EINVAL); /* * Wire the old buffer so we can directly copy the functions to * user space without dropping the lock. */ if (req->oldptr != NULL) { error = sysctl_wire_old_buffer(req, 0); if (error) return (error); } /* * Walk the list and copy out matching entries. If INVARIANTS * is compiled in, also walk the list to verify the length of * the list matches what we have recorded. */ rw_rlock(&tcp_function_lock); cnt = 0; #ifndef INVARIANTS if (req->oldptr == NULL) { cnt = tcp_fb_cnt; goto skip_loop; } #endif TAILQ_FOREACH(f, &t_functions, tf_next) { #ifdef INVARIANTS cnt++; #endif if (req->oldptr != NULL) { bzero(&tfi, sizeof(tfi)); tfi.tfi_refcnt = f->tf_fb->tfb_refcnt; tfi.tfi_id = f->tf_fb->tfb_id; (void)strlcpy(tfi.tfi_alias, f->tf_name, sizeof(tfi.tfi_alias)); (void)strlcpy(tfi.tfi_name, f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name)); error = SYSCTL_OUT(req, &tfi, sizeof(tfi)); /* * Don't stop on error, as that is the * mechanism we use to accumulate length * information if the buffer was too short. */ } } KASSERT(cnt == tcp_fb_cnt, ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt)); #ifndef INVARIANTS skip_loop: #endif rw_runlock(&tcp_function_lock); if (req->oldptr == NULL) error = SYSCTL_OUT(req, NULL, (cnt + 1) * sizeof(struct tcp_function_info)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info, CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info", "List TCP function block name-to-ID mappings"); /* * tfb_tcp_handoff_ok() function for the default stack. * Note that we'll basically try to take all comers. */ static int tcp_default_handoff_ok(struct tcpcb *tp) { return (0); } /* * tfb_tcp_fb_init() function for the default stack. * * This handles making sure we have appropriate timers set if you are * transitioning a socket that has some amount of setup done. 
*
 * The init() function from the default can *never* return non-zero i.e.
 * it is required to always succeed since it is the stack of last resort!
 */
static int
tcp_default_fb_init(struct tcpcb *tp)
{
	struct inpcb *pcb;
	struct socket *sock;

	pcb = tp->t_inpcb;
	INP_WLOCK_ASSERT(pcb);

	KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
	    ("%s: connection %p in unexpected state %d", __func__, tp,
	    tp->t_state));

	/*
	 * Only states strictly between LISTEN and TIME_WAIT need timers
	 * fixed up here.  States up to and including LISTEN have nothing
	 * to arm, and we don't know what to do for unexpected states
	 * (which includes TIME_WAIT).
	 */
	if (!(tp->t_state > TCPS_LISTEN && tp->t_state < TCPS_TIME_WAIT))
		return (0);

	/*
	 * A transmission timer is wanted when the handshake has not
	 * completed, when the send buffer holds data, or when sent data is
	 * still unacknowledged -- but only if neither the retransmit nor
	 * the persist timer is already running.
	 */
	sock = pcb->inp_socket;
	if (!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&sock->so_snd) ||
	    tp->snd_una != tp->snd_max) {
		if (!tcp_timer_active(tp, TT_REXMT) &&
		    !tcp_timer_active(tp, TT_PERSIST)) {
			/*
			 * Persist applies only to an established session
			 * with a zero send window and more buffered data
			 * than is currently in flight; everything else gets
			 * the retransmit timer.
			 */
			if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
			    tp->snd_wnd != 0 ||
			    (int32_t)(tp->snd_nxt - tp->snd_una) >=
			    (int32_t)sbavail(&sock->so_snd))
				tcp_timer_activate(tp, TT_REXMT,
				    tp->t_rxtcur);
			else
				tcp_setpersist(tp);
		}
	}

	/*
	 * Arm the keepalive timer if it is not running yet: idle interval
	 * once established, init interval before that.
	 */
	if (!tcp_timer_active(tp, TT_KEEP)) {
		if (TCPS_HAVEESTABLISHED(tp->t_state))
			tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
		else
			tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
	}

	/*
	 * When taking over a connection that is in fast recovery, seed
	 * sackhint.recover_fs if it has not been set.
	 */
	if (IN_FASTRECOVERY(tp->t_flags) && tp->sackhint.recover_fs == 0)
		tp->sackhint.recover_fs = max(1, tp->snd_nxt - tp->snd_una);

	return (0);
}

/*
 * tfb_tcp_fb_fini() function for the default stack.
 *
 * This changes state as necessary (or prudent) to prepare for another stack
 * to assume responsibility for the connection.
*/ static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged) { INP_WLOCK_ASSERT(tp->t_inpcb); return; } /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif /* * XXX * Callouts should be moved into struct tcp directly. They are currently * separate because the tcpcb structure is exported to userland for sysctl * parsing purposes, which do not know about callouts. */ struct tcpcb_mem { struct tcpcb tcb; struct tcp_timer tt; struct cc_var ccv; #ifdef TCP_HHOOK struct osd osd; #endif }; VNET_DEFINE_STATIC(uma_zone_t, tcpcb_zone); #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) INPCBSTORAGE_DEFINE(tcpcbstor, "tcpinp", "tcp_inpcb", "tcp", "tcphash"); /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } static volatile int next_tcp_stack_id = 1; /* * Register a TCP function block with the name provided in the names * array. (Note that this function does NOT automatically register * blk->tfb_tcp_block_name as a stack name. Therefore, you should * explicitly include blk->tfb_tcp_block_name in the list of names if * you wish to register the stack with that name.) * * Either all name registrations will succeed or all will fail. 
If * a name registration fails, the function will update the num_names * argument to point to the array index of the name that encountered * the failure. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names) { struct tcp_function *n; struct tcp_function_set fs; int error, i; KASSERT(names != NULL && *num_names > 0, ("%s: Called with 0-length name list", __func__)); KASSERT(names != NULL, ("%s: Called with NULL name list", __func__)); KASSERT(rw_initialized(&tcp_function_lock), ("%s: called too early", __func__)); if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { /* * These functions are required and you * need a name. */ *num_names = 0; return (EINVAL); } if (blk->tfb_tcp_timer_stop_all || blk->tfb_tcp_timer_activate || blk->tfb_tcp_timer_active || blk->tfb_tcp_timer_stop) { /* * If you define one timer function you * must have them all. 
*/ if ((blk->tfb_tcp_timer_stop_all == NULL) || (blk->tfb_tcp_timer_activate == NULL) || (blk->tfb_tcp_timer_active == NULL) || (blk->tfb_tcp_timer_stop == NULL)) { *num_names = 0; return (EINVAL); } } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { *num_names = 0; return (EINVAL); } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1); for (i = 0; i < *num_names; i++) { n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); if (n == NULL) { error = ENOMEM; goto cleanup; } n->tf_fb = blk; (void)strlcpy(fs.function_set_name, names[i], sizeof(fs.function_set_name)); rw_wlock(&tcp_function_lock); if (find_tcp_functions_locked(&fs) != NULL) { /* Duplicate name space not allowed */ rw_wunlock(&tcp_function_lock); free(n, M_TCPFUNCTIONS); error = EALREADY; goto cleanup; } (void)strlcpy(n->tf_name, names[i], sizeof(n->tf_name)); TAILQ_INSERT_TAIL(&t_functions, n, tf_next); tcp_fb_cnt++; rw_wunlock(&tcp_function_lock); } return(0); cleanup: /* * Deregister the names we just added. Because registration failed * for names[i], we don't need to deregister that name. */ *num_names = i; rw_wlock(&tcp_function_lock); while (--i >= 0) { TAILQ_FOREACH(n, &t_functions, tf_next) { if (!strncmp(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX)) { TAILQ_REMOVE(&t_functions, n, tf_next); tcp_fb_cnt--; n->tf_fb = NULL; free(n, M_TCPFUNCTIONS); break; } } } rw_wunlock(&tcp_function_lock); return (error); } /* * Register a TCP function block using the name provided in the name * argument. * * Returns 0 on success, or an error code on failure. 
*/ int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait) { const char *name_list[1]; int num_names, rv; num_names = 1; if (name != NULL) name_list[0] = name; else name_list[0] = blk->tfb_tcp_block_name; rv = register_tcp_functions_as_names(blk, wait, name_list, &num_names); return (rv); } /* * Register a TCP function block using the name defined in * blk->tfb_tcp_block_name. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions(struct tcp_function_block *blk, int wait) { return (register_tcp_functions_as_name(blk, NULL, wait)); } /* * Deregister all names associated with a function block. This * functionally removes the function block from use within the system. * * When called with a true quiesce argument, mark the function block * as being removed so no more stacks will use it and determine * whether the removal would succeed. * * When called with a false quiesce argument, actually attempt the * removal. * * When called with a force argument, attempt to switch all TCBs to * use the default stack instead of returning EBUSY. * * Returns 0 on success (or if the removal would succeed, or an error * code on failure. */ int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force) { struct tcp_function *f; if (blk == &tcp_def_funcblk) { /* You can't un-register the default */ return (EPERM); } rw_wlock(&tcp_function_lock); if (blk == tcp_func_set_ptr) { /* You can't free the current default */ rw_wunlock(&tcp_function_lock); return (EBUSY); } /* Mark the block so no more stacks can use it. */ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; /* * If TCBs are still attached to the stack, attempt to switch them * to the default stack. 
*/ if (force && blk->tfb_refcnt) { struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); rw_wunlock(&tcp_function_lock); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); while ((inp = inp_next(&inpi)) != NULL) { tp = intotcpcb(inp); if (tp == NULL || tp->t_fb != blk) continue; tcp_switch_back_to_default(tp); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); rw_wlock(&tcp_function_lock); } if (blk->tfb_refcnt) { /* TCBs still attached. */ rw_wunlock(&tcp_function_lock); return (EBUSY); } if (quiesce) { /* Skip removal. */ rw_wunlock(&tcp_function_lock); return (0); } /* Remove any function names that map to this function block. */ while (find_tcp_fb_locked(blk, &f) != NULL) { TAILQ_REMOVE(&t_functions, f, tf_next); tcp_fb_cnt--; f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); } rw_wunlock(&tcp_function_lock); return (0); } static void tcp_drain(void) { struct epoch_tracker et; VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; NET_EPOCH_ENTER(et); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inpb; struct tcpcb *tcpb; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ while ((inpb = inp_next(&inpi)) != NULL) { if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); #ifdef TCP_BLACKBOX tcp_log_drain(tcpb); #endif #ifdef TCPPCAP if (tcp_pcap_aggressive_free) { /* Free the TCP PCAP queues. 
*/ tcp_pcap_drain(&(tcpb->t_inpkts)); tcp_pcap_drain(&(tcpb->t_outpkts)); } #endif } } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); NET_EPOCH_EXIT(et); } static void tcp_vnet_init(void *arg __unused) { #ifdef TCP_HHOOK if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); #endif #ifdef STATS if (tcp_stats_init()) printf("%s: WARNING: unable to initialise TCP stats\n", __func__); #endif in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize, tcp_tcbhashsize); /* * These have to be type stable for the benefit of the timers. */ V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_tcpcb_zone, maxsockets); uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached"); syncache_init(); tcp_hc_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); tcp_fastopen_init(); COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK); VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); V_tcp_msl = TCPTV_MSL; } VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_vnet_init, NULL); static void tcp_init(void *arg __unused) { const char *tcbhash_tuneable; int hashsize; tcp_reass_global_init(); /* XXX virtualize those below? 
*/ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_rexmit_initial = TCPTV_RTOBASE; if (tcp_rexmit_initial < 1) tcp_rexmit_initial = 1; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_persmin = TCPTV_PERSMIN; tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; /* Setup the tcp function block list */ TAILQ_INIT(&t_functions); rw_init(&tcp_function_lock, "tcp_func_lock"); register_tcp_functions(&tcp_def_funcblk, M_WAITOK); #ifdef TCP_BLACKBOX /* Initialize the TCP logging data. */ tcp_log_init(); #endif arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0); if (tcp_soreceive_stream) { #ifdef INET tcp_protosw.pr_soreceive = soreceive_stream; #endif #ifdef INET6 tcp6_protosw.pr_soreceive = soreceive_stream; #endif /* INET6 */ } #ifdef INET6 max_protohdr_grow(sizeof(struct ip6_hdr) + sizeof(struct tcphdr)); #else /* INET6 */ max_protohdr_grow(sizeof(struct tcpiphdr)); #endif /* INET6 */ ISN_LOCK_INIT(); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(vm_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT); EVENTHANDLER_REGISTER(mbuf_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT); tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK); tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK); tcp_inp_lro_compressed = counter_u64_alloc(M_WAITOK); tcp_inp_lro_locks_taken = counter_u64_alloc(M_WAITOK); tcp_extra_mbuf = counter_u64_alloc(M_WAITOK); tcp_would_have_but = counter_u64_alloc(M_WAITOK); tcp_comp_total = counter_u64_alloc(M_WAITOK); tcp_uncomp_total = counter_u64_alloc(M_WAITOK); tcp_bad_csums = counter_u64_alloc(M_WAITOK); #ifdef TCPPCAP tcp_pcap_init(); #endif hashsize = TCBHASHSIZE; tcbhash_tuneable = "net.inet.tcp.tcbhashsize"; TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); if (hashsize == 0) { /* * Auto tune 
the hash size based on maxsockets. * A perfect hash would have a 1:1 mapping * (hashsize = maxsockets) however it's been * suggested that O(2) average is better. */ hashsize = maketcp_hashsize(maxsockets / 4); /* * Our historical default is 512, * do not autotune lower than this. */ if (hashsize < 512) hashsize = 512; if (bootverbose) printf("%s: %s auto tuned to %d\n", __func__, tcbhash_tuneable, hashsize); } /* * We require a hashsize to be a power of two. * Previously if it was not a power of two we would just reset it * back to 512, which could be a nasty surprise if you did not notice * the error message. * Instead what we do is clip it to the closest power of two lower * than the specified hash value. */ if (!powerof2(hashsize)) { int oldhashsize = hashsize; hashsize = maketcp_hashsize(hashsize); /* prevent absurdly low value */ if (hashsize < 16) hashsize = 16; printf("%s: WARNING: TCB hash size not a power of 2, " "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } tcp_tcbhashsize = hashsize; #ifdef INET IPPROTO_REGISTER(IPPROTO_TCP, tcp_input, tcp_ctlinput); #endif #ifdef INET6 IP6PROTO_REGISTER(IPPROTO_TCP, tcp6_input, tcp6_ctlinput); #endif } SYSINIT(tcp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, tcp_init, NULL); #ifdef VIMAGE static void tcp_destroy(void *unused __unused) { int n; #ifdef TCP_HHOOK int error; #endif /* * All our processes are gone, all our sockets should be cleaned * up, which means, we should be past the tcp_discardcb() calls. * Sleep to let all tcpcb timers really disappear and cleanup. */ for (;;) { INP_INFO_WLOCK(&V_tcbinfo); n = V_tcbinfo.ipi_count; INP_INFO_WUNLOCK(&V_tcbinfo); if (n == 0) break; pause("tcpdes", hz / 10); } tcp_hc_destroy(); syncache_destroy(); in_pcbinfo_destroy(&V_tcbinfo); /* tcp_discardcb() clears the sack_holes up. */ uma_zdestroy(V_sack_hole_zone); uma_zdestroy(V_tcpcb_zone); /* * Cannot free the zone until all tcpcbs are released as we attach * the allocations to them. 
*/ tcp_fastopen_destroy(); COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES); VNET_PCPUSTAT_FREE(tcpstat); #ifdef TCP_HHOOK error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); } error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); } #endif } VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL); #endif void tcp_fini(void *xtp) { } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); if (port == 0) ip6->ip6_nxt = IPPROTO_TCP; else ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; if (port == 0) ip->ip_p = IPPROTO_TCP; else ip->ip_p = IPPROTO_UDP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } #endif /* INET */ th->th_sport = inp->inp_lport; 
th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_off = 5; tcp_set_flags(th, 0); th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then th must point to *inside* the mbuf. 
*/ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, uint16_t flags) { struct tcpopt to; struct inpcb *inp; struct ip *ip; struct mbuf *optm; struct udphdr *uh = NULL; struct tcphdr *nth; struct tcp_log_buffer *lgb; u_char *optp; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int optlen, tlen, win, ulen; int ect = 0; bool incl_opts; uint16_t port; int output_ret; #ifdef INVARIANTS int thflags = tcp_get_flags(th); #endif KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); NET_EPOCH_ASSERT(); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tp->t_inpcb; KASSERT(inp != NULL, ("tcp control block w/o inpcb")); INP_LOCK_ASSERT(inp); } else inp = NULL; if (m != NULL) { #ifdef INET6 if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP)) port = m->m_pkthdr.tcp_tun_port; else #endif if (ip && (ip->ip_p == IPPROTO_UDP)) port = m->m_pkthdr.tcp_tun_port; else port = 0; } else port = tp->t_port; incl_opts = false; win = 0; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > TCP_MAXWIN << tp->rcv_scale) win = TCP_MAXWIN << tp->rcv_scale; } if ((tp->t_flags & TF_NOOPT) == 0) incl_opts = true; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = 
htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else if ((!M_WRITABLE(m)) || (port != 0)) { struct mbuf *n; /* Can't reuse 'm', allocate a new mbuf. */ n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { m_freem(m); m_freem(n); return; } n->m_data += max_linkhdr; /* m_len is set later */ #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(n, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(n, struct ip6_hdr *); xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip)); ip = mtod(n, struct ip *); xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); xchg(nth->th_dport, nth->th_sport, uint16_t); th = nth; m_freem(m); m = n; } else { /* * reuse the mbuf. * XXX MRT We inherit the FIB, which is lucky. */ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* m_len is set later */ #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. 
*/ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } tlen = 0; #ifdef INET6 if (isipv6) tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tlen = sizeof (struct tcpiphdr); #endif if (port) tlen += sizeof (struct udphdr); #ifdef INVARIANTS m->m_len = 0; KASSERT(M_TRAILINGSPACE(m) >= tlen, ("Not enough trailing space for message (m=%p, need=%d, have=%ld)", m, tlen, (long)M_TRAILINGSPACE(m))); #endif m->m_len = tlen; to.to_flags = 0; if (incl_opts) { ect = tcp_ecn_output_established(tp, &flags, 0, false); /* Make sure we have room. */ if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { m->m_next = m_get(M_NOWAIT, MT_DATA); if (m->m_next) { optp = mtod(m->m_next, u_char *); optm = m->m_next; } else incl_opts = false; } else { optp = (u_char *) (nth + 1); optm = m; } } if (incl_opts) { /* Timestamps. */ if (tp->t_flags & TF_RCVD_TSTMP) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* Add the options. */ tlen += optlen = tcp_addoptions(&to, optp); /* Update m_len in the correct mbuf. 
*/ optm->m_len += optlen; } else optlen = 0; #ifdef INET6 if (isipv6) { if (uh) { ulen = tlen - sizeof(struct ip6_hdr); uh->uh_ulen = htons(ulen); } ip6->ip6_flow = htonl(ect << 20); ip6->ip6_vfc = IPV6_VERSION; if (port) ip6->ip6_nxt = IPPROTO_UDP; else ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (uh) { ulen = tlen - sizeof(struct ip); uh->uh_ulen = htons(ulen); } ip->ip_tos = ect; ip->ip_len = htons(tlen); ip->ip_ttl = V_ip_defttl; if (port) { ip->ip_p = IPPROTO_UDP; } else { ip->ip_p = IPPROTO_TCP; } if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); } #endif m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_LOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. */ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; tcp_set_flags(nth, flags); if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) { m_freem(m); return; } } #endif #ifdef INET6 if (isipv6) { if (port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); uh->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); nth->th_sum = 0; } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); nth->th_sum = in6_cksum_pseudo(ip6, tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); } ip6->ip6_hlim = in6_selecthlim(tp != NULL ? 
tp->t_inpcb : NULL, NULL); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (port) { uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); nth->th_sum = 0; } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); } } #endif /* INET */ #ifdef TCPDEBUG if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif TCP_PROBE3(debug__output, tp, th, m); if (flags & TH_RST) TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth); lgb = NULL; if ((tp != NULL) && (tp->t_logstate != TCP_LOG_STATE_OFF)) { if (INP_WLOCKED(inp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tp->t_inpcb->inp_in_hpts; log.u_bbr.flex8 = 4; log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.delivered = 0; lgb = tcp_log_event_(tp, nth, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 0, &log, false, NULL, NULL, 0, &tv); } else { /* * We can not log the packet, since we only own the * read lock, but a write lock is needed. The read lock * is not upgraded to a write lock, since only getting * the read lock was done intentionally to improve the * handling of SYN flooding attacks. * This happens only for pure SYN segments received in * the initial CLOSED state, or received in a more * advanced state than listen and the UDP encapsulation * port is unexpected. * The incoming SYN segments do not really belong to * the TCP connection and the handling does not change * the state of the TCP connection. Therefore, the * sending of the RST segments is not logged. Please * note that also the incoming SYN segments are not * logged. 
*
			 * The following code ensures that the above description
			 * is and stays correct.
			 */
			KASSERT((thflags & (TH_ACK|TH_SYN)) == TH_SYN &&
			    (tp->t_state == TCPS_CLOSED ||
			    (tp->t_state > TCPS_LISTEN && tp->t_port != port)),
			    ("%s: Logging of TCP segment with flags 0x%b and "
			    "UDP encapsulation port %u skipped in state %s",
			    __func__, thflags, PRINT_TH_FLAGS,
			    ntohs(port), tcpstates[tp->t_state]));
		}
	}

	if (flags & TH_ACK)
		TCPSTAT_INC(tcps_sndacks);
	else if (flags & (TH_SYN|TH_FIN|TH_RST))
		TCPSTAT_INC(tcps_sndctrl);
	TCPSTAT_INC(tcps_sndtotal);

#ifdef INET6
	if (isipv6) {
		TCP_PROBE5(send, NULL, tp, ip6, tp, nth);
		output_ret = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
	}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		TCP_PROBE5(send, NULL, tp, ip, tp, nth);
		output_ret = ip_output(m, NULL, NULL, 0, NULL, inp);
	}
#endif
	if (lgb != NULL)
		lgb->tlb_errno = output_ret;
}

/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 *
 * Returns the new tcpcb, or NULL when the allocation or one of the
 * congestion-control / helper / stack init hooks fails.  Every failure
 * path after in_pcbref() drops the inpcb reference again and releases
 * the reference taken on the TCP function block.
 */
struct tcpcb *
tcp_newtcpcb(struct inpcb *inp)
{
	struct tcpcb_mem *tm;
	struct tcpcb *tp;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	/* M_ZERO: every field not set below starts out as zero. */
	tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO);
	if (tm == NULL)
		return (NULL);
	tp = &tm->tcb;

	/* Initialise cc_var struct for this tcpcb. */
	tp->ccv = &tm->ccv;
	tp->ccv->type = IPPROTO_TCP;
	tp->ccv->ccvc.tcp = tp;
	/* Pick up the current default function block and pin it. */
	rw_rlock(&tcp_function_lock);
	tp->t_fb = tcp_func_set_ptr;
	refcount_acquire(&tp->t_fb->tfb_refcnt);
	rw_runlock(&tcp_function_lock);
	/*
	 * Use the current system default CC algorithm.
	 */
	cc_attach(tp, CC_DEFAULT_ALGO());

	/*
	 * The tcpcb will hold a reference on its inpcb until tcp_discardcb()
	 * is called.
	 */
	in_pcbref(inp);	/* Reference for tcpcb */
	tp->t_inpcb = inp;

	if (CC_ALGO(tp)->cb_init != NULL)
		if (CC_ALGO(tp)->cb_init(tp->ccv, NULL) > 0) {
			cc_detach(tp);
			if (tp->t_fb->tfb_tcp_fb_fini)
				(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
			in_pcbrele_wlocked(inp);
			refcount_release(&tp->t_fb->tfb_refcnt);
			uma_zfree(V_tcpcb_zone, tm);
			return (NULL);
		}

#ifdef TCP_HHOOK
	tp->osd = &tm->osd;
	/*
	 * NOTE(review): unlike the cb_init failure path above, this path
	 * does not call cc_detach() -- confirm whether the attachment made
	 * by cc_attach() has to be undone here as well.
	 */
	if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
		if (tp->t_fb->tfb_tcp_fb_fini)
			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
		in_pcbrele_wlocked(inp);
		refcount_release(&tp->t_fb->tfb_refcnt);
		uma_zfree(V_tcpcb_zone, tm);
		return (NULL);
	}
#endif

#ifdef VIMAGE
	tp->t_vnet = inp->inp_vnet;
#endif
	tp->t_timers = &tm->tt;
	TAILQ_INIT(&tp->t_segq);
	/* Start from the per-family default MSS. */
	tp->t_maxseg =
#ifdef INET6
		isipv6 ? V_tcp_v6mssdflt :
#endif /* INET6 */
		V_tcp_mssdflt;

	/* Set up our timeouts. */
	callout_init(&tp->t_timers->tt_rexmt, 1);
	callout_init(&tp->t_timers->tt_persist, 1);
	callout_init(&tp->t_timers->tt_keep, 1);
	callout_init(&tp->t_timers->tt_2msl, 1);
	callout_init(&tp->t_timers->tt_delack, 1);

	/*
	 * Which RFC 1323 options to request: 0 none, 2 window scaling
	 * only, 3 timestamps only, 1 and any other value both.
	 */
	switch (V_tcp_do_rfc1323) {
		case 0:
			break;
		default:
		case 1:
			tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
			break;
		case 2:
			tp->t_flags = TF_REQ_SCALE;
			break;
		case 3:
			tp->t_flags = TF_REQ_TSTMP;
			break;
	}
	if (V_tcp_do_sack)
		tp->t_flags |= TF_SACK_PERMIT;
	TAILQ_INIT(&tp->snd_holes);

	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = ((tcp_rexmit_initial - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
	tp->t_rttmin = tcp_rexmit_min;
	tp->t_rxtcur = tcp_rexmit_initial;
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->t_rcvtime = ticks;
	/*
	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
	 * because the socket may be bound to an IPv6 wildcard address,
	 * which may match an IPv4-mapped IPv6 address.
	 */
	inp->inp_ip_ttl = V_ip_defttl;
	inp->inp_ppcb = tp;
#ifdef TCPPCAP
	/*
	 * Init the TCP PCAP queues.
	 */
	tcp_pcap_tcpcb_init(tp);
#endif
#ifdef TCP_BLACKBOX
	/* Initialize the per-TCPCB log data. */
	tcp_log_tcpcbinit(tp);
#endif
	tp->t_pacing_rate = -1;
	if (tp->t_fb->tfb_tcp_fb_init) {
		/*
		 * NOTE(review): this failure path neither detaches the CC
		 * algorithm nor clears inp->inp_ppcb, which was set just
		 * above -- confirm that callers cope with that.
		 */
		if ((*tp->t_fb->tfb_tcp_fb_init)(tp)) {
			refcount_release(&tp->t_fb->tfb_refcnt);
			in_pcbrele_wlocked(inp);
			uma_zfree(V_tcpcb_zone, tm);
			return (NULL);
		}
	}
#ifdef STATS
	if (V_tcp_perconn_stats_enable == 1)
		tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0);
#endif
	if (V_tcp_do_lrd)
		tp->t_flags |= TF_LRD;
	return (tp);		/* XXX */
}

/*
 * Drop a TCP connection, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
 *
 * Must be called in the network epoch with the inpcb write-locked.
 * The error is stored in so_error; an ETIMEDOUT is replaced by the
 * pending soft error, if there is one.  Returns what tcp_close() returns.
 */
struct tcpcb *
tcp_drop(struct tcpcb *tp, int errno)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(tp->t_inpcb);

	if (TCPS_HAVERCVDSYN(tp->t_state)) {
		tcp_state_change(tp, TCPS_CLOSED);
		/* Don't use tcp_output() here due to possible recursion. */
		(void)tcp_output_nodrop(tp);
		TCPSTAT_INC(tcps_drops);
	} else
		TCPSTAT_INC(tcps_conndrops);
	if (errno == ETIMEDOUT && tp->t_softerror)
		errno = tp->t_softerror;
	so->so_error = errno;
	return (tcp_close(tp));
}

/*
 * Tear down everything that hangs off a tcpcb: stop the timers, flush
 * the reassembly queue, free the SACK holes, let the congestion-control
 * and helper frameworks clean up, and unhook the tcpcb from its inpcb.
 * The tcpcb itself is freed here only if no timer drain is pending
 * (tt_draincnt == 0 once all timers were asked to stop).
 *
 * The inpcb must be write-locked by the caller.
 */
void
tcp_discardcb(struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;

	INP_WLOCK_ASSERT(inp);

	/*
	 * Make sure that all of our timers are stopped before we delete the
	 * PCB.
	 *
	 * If stopping a timer fails, we schedule a discard function in same
	 * callout, and the last discard function called will take care of
	 * deleting the tcpcb.
	 */
	tp->t_timers->tt_draincnt = 0;
	tcp_timer_stop(tp, TT_REXMT);
	tcp_timer_stop(tp, TT_PERSIST);
	tcp_timer_stop(tp, TT_KEEP);
	tcp_timer_stop(tp, TT_2MSL);
	tcp_timer_stop(tp, TT_DELACK);
	if (tp->t_fb->tfb_tcp_timer_stop_all) {
		/*
		 * Call the stop-all function of the methods,
		 * this function should call the tcp_timer_stop()
		 * method with each of the function specific timeouts.
		 * That stop will be called via the tfb_tcp_timer_stop()
		 * which should use the async drain function of the
		 * callout system (see tcp_var.h).
		 */
		tp->t_fb->tfb_tcp_timer_stop_all(tp);
	}

	/* free the reassembly queue, if any */
	tcp_reass_flush(tp);

#ifdef TCP_OFFLOAD
	/* Disconnect offload device, if any. */
	if (tp->t_flags & TF_TOE)
		tcp_offload_detach(tp);
#endif

	tcp_free_sackholes(tp);

#ifdef TCPPCAP
	/* Free the TCP PCAP queues. */
	tcp_pcap_drain(&(tp->t_inpkts));
	tcp_pcap_drain(&(tp->t_outpkts));
#endif

	/* Allow the CC algorithm to clean up after itself. */
	if (CC_ALGO(tp)->cb_destroy != NULL)
		CC_ALGO(tp)->cb_destroy(tp->ccv);
	CC_DATA(tp) = NULL;
	/* Detach from the CC algorithm */
	cc_detach(tp);

#ifdef TCP_HHOOK
	khelp_destroy_osd(tp->osd);
#endif
#ifdef STATS
	stats_blob_destroy(tp->t_stats);
#endif

	CC_ALGO(tp) = NULL;
	inp->inp_ppcb = NULL;
	/* No drain pending: nobody else will free the tcpcb, so do it now. */
	if (tp->t_timers->tt_draincnt == 0) {
		bool released __diagused;

		released = tcp_freecb(tp);
		KASSERT(!released, ("%s: inp %p should not have been released "
		    "here", __func__, inp));
	}
}

/*
 * Free a tcpcb whose timers have all been drained (tt_draincnt == 0).
 *
 * Runs the stack's fini hook, saves the connection's RTT/ssthresh/cwnd
 * in the host cache when at least four RTT samples were collected and
 * the socket is still there, drops the reference on the function block,
 * returns the memory to the tcpcb zone and finally releases the tcpcb's
 * reference on the inpcb.
 *
 * The inpcb must be write-locked.  Returns the result of
 * in_pcbrele_wlocked(); judging by the KASSERT in tcp_discardcb(), true
 * means the inpcb itself was released.
 */
bool
tcp_freecb(struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef INET6
	bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif

	INP_WLOCK_ASSERT(inp);
	MPASS(tp->t_timers->tt_draincnt == 0);

	/* We own the last reference on tcpcb, let's free it. */
#ifdef TCP_BLACKBOX
	tcp_log_tcpcbfini(tp);
#endif
	TCPSTATES_DEC(tp->t_state);
	if (tp->t_fb->tfb_tcp_fb_fini)
		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as 4 rtt samples.
	 * 4 samples is enough for the srtt filter to converge
	 * to within enough % of the correct value; fewer samples
	 * and we could save a bogus rtt. The danger is not high
	 * as tcp quickly recovers from everything.
	 * XXX: Works very well but needs some more statistics!
	 *
	 * XXXRRS: Updating must be after the stack fini() since
	 * that may be converting some internal representation of
	 * say srtt etc into the general one used by other stacks.
	 * Let's also at least protect against the so being NULL
	 * as RW stated below.
	 */
	if ((tp->t_rttupdated >= 4) && (so != NULL)) {
		struct hc_metrics_lite metrics;
		uint32_t ssthresh;

		bzero(&metrics, sizeof(metrics));
		/*
		 * Update the ssthresh always when the conditions below
		 * are satisfied. This gives us better new start value
		 * for the congestion avoidance for new connections.
		 * ssthresh is only set if packet loss occurred on a session.
		 *
		 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
		 * being torn down.  Ideally this code would not use 'so'.
		 */
		ssthresh = tp->snd_ssthresh;
		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
			if (ssthresh < 2)
				ssthresh = 2;
			/* Per packet: payload plus IP and TCP headers. */
			ssthresh *= (tp->t_maxseg +
#ifdef INET6
			    (isipv6 ? sizeof (struct ip6_hdr) +
			    sizeof (struct tcphdr) :
#endif
			    sizeof (struct tcpiphdr)
#ifdef INET6
			    )
#endif
			    );
		} else
			ssthresh = 0;
		metrics.rmx_ssthresh = ssthresh;

		metrics.rmx_rtt = tp->t_srtt;
		metrics.rmx_rttvar = tp->t_rttvar;
		metrics.rmx_cwnd = tp->snd_cwnd;
		metrics.rmx_sendpipe = 0;
		metrics.rmx_recvpipe = 0;

		tcp_hc_update(&inp->inp_inc, &metrics);
	}

	refcount_release(&tp->t_fb->tfb_refcnt);
	/*
	 * NOTE(review): tcp_newtcpcb() allocates a struct tcpcb_mem and
	 * hands out &tm->tcb; freeing tp presumably relies on tcb being
	 * the first member -- confirm.
	 */
	uma_zfree(V_tcpcb_zone, tp);

	return (in_pcbrele_wlocked(inp));
}

/*
 * Attempt to close a TCP control block, marking it as dropped, and freeing
 * the socket if we hold the only reference.
*/
struct tcpcb *
tcp_close(struct tcpcb *tp)
{
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so;

	INP_WLOCK_ASSERT(inp);

#ifdef TCP_OFFLOAD
	if (tp->t_state == TCPS_LISTEN)
		tcp_offload_listen_stop(tp);
#endif
	/*
	 * This releases the TFO pending counter resource for TFO listen
	 * sockets as well as passively-created TFO sockets that transition
	 * from SYN_RECEIVED to CLOSED.
	 */
	if (tp->t_tfo_pending) {
		tcp_fastopen_decrement_counter(tp->t_tfo_pending);
		tp->t_tfo_pending = NULL;
	}
#ifdef TCPHPTS
	tcp_hpts_remove(inp);
#endif
	in_pcbdrop(inp);
	TCPSTAT_INC(tcps_closed);
	if (tp->t_state != TCPS_CLOSED)
		tcp_state_change(tp, TCPS_CLOSED);
	KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
	so = inp->inp_socket;
	soisdisconnected(so);
	/*
	 * If the inpcb holds the socket reference (INP_SOCKREF), give it
	 * up now.  In that case the inpcb is unlocked here and NULL is
	 * returned so that the caller does not touch tp or inp again.
	 */
	if (inp->inp_flags & INP_SOCKREF) {
		inp->inp_flags &= ~INP_SOCKREF;
		INP_WUNLOCK(inp);
		sorele(so);
		return (NULL);
	}
	/* Otherwise the inpcb is still write-locked on return. */
	return (tp);
}

/*
 * Notify a tcp user of an asynchronous error by recording it as a soft
 * error on the connection.
 *
 * The user is not woken up, since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added); the wakeup
 * calls are kept, disabled, at the end of the function.
 *
 * The inpcb must be write-locked.  Returns inp, or NULL when the
 * connection was dropped and tcp_drop() returned NULL.
 */
static struct inpcb *
tcp_notify(struct inpcb *inp, int error)
{
	struct tcpcb *tp;

	INP_WLOCK_ASSERT(inp);

	if (inp->inp_flags & INP_DROPPED)
		return (inp);

	tp = intotcpcb(inp);
	KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));

	/*
	 * Ignore some errors if we are hooked up.
	 * If connection hasn't completed, has retransmitted several times,
	 * and receives a second error, give up now.  This is better
	 * than waiting a long time to establish a connection that
	 * can never complete.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (error == EHOSTUNREACH || error == ENETUNREACH ||
	     error == EHOSTDOWN)) {
		/* Only forget the cached next hop; the error is ignored. */
		if (inp->inp_route.ro_nh) {
			NH_FREE(inp->inp_route.ro_nh);
			inp->inp_route.ro_nh = (struct nhop_object *)NULL;
		}
		return (inp);
	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
	    tp->t_softerror) {
		tp = tcp_drop(tp, error);
		if (tp != NULL)
			return (inp);
		else
			return (NULL);
	} else {
		tp->t_softerror = error;
		return (inp);
	}
#if 0
	wakeup( &so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
#endif
}

/*
 * Sysctl handler exporting the TCP PCB list (read-only).
 *
 * With no output buffer it only reports a size estimate.  Otherwise it
 * writes an xinpgen header, the entries produced by syncache_pcblist(),
 * one xtcpcb for every inpcb that is not newer than the header's
 * generation and that the caller is allowed to see, and finally a
 * second xinpgen with the then-current generation.
 */
static int
tcp_pcblist(SYSCTL_HANDLER_ARGS)
{
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
	    INPLOOKUP_RLOCKPCB);
	struct xinpgen xig;
	struct inpcb *inp;
	int error;

	if (req->newptr != NULL)
		return (EPERM);

	if (req->oldptr == NULL) {
		int n;

		/* Size probe: current count plus max(n / 8, 10) of slack. */
		n = V_tcbinfo.ipi_count +
		    counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
		return (error);

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = V_tcbinfo.ipi_count +
	    counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
	xig.xig_gen = V_tcbinfo.ipi_gencnt;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	error = syncache_pcblist(req);
	if (error)
		return (error);

	while ((inp = inp_next(&inpi)) != NULL) {
		if (inp->inp_gencnt <= xig.xig_gen &&
		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
			struct xtcpcb xt;

			tcp_inptoxtp(inp, &xt);
			error = SYSCTL_OUT(req, &xt, sizeof xt);
			if (error) {
				/*
				 * Leaving the iteration early; presumably
				 * inp_next() hands out the inpcb read-locked
				 * (INPLOOKUP_RLOCKPCB), so unlock it here.
				 */
				INP_RUNLOCK(inp);
				break;
			} else
				continue;
		}
	}

	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
*/
		xig.xig_gen = V_tcbinfo.ipi_gencnt;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = V_tcbinfo.ipi_count +
		    counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}

	return (error);
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
    NULL, 0, tcp_pcblist, "S,xtcpcb",
    "List of active TCP connections");

#ifdef INET
/*
 * Sysctl handler returning the credentials (struct xucred) of the owner
 * of an IPv4 TCP connection.
 *
 * The caller needs PRIV_NETINET_GETCRED and writes two sockaddr_in
 * describing the connection; addrs[1] and addrs[0] are handed to
 * in_pcblookup() in that order (presumably foreign, then local --
 * confirm against in_pcblookup()).  Fails with ENOENT when no matching
 * inpcb with a socket is found, or with the error from cr_canseeinpcb()
 * when the caller may not see it.
 */
static int
tcp_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in addrs[2];
	struct epoch_tracker et;
	struct inpcb *inp;
	int error;

	error = priv_check(req->td, PRIV_NETINET_GETCRED);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	/* The lookup returns the inpcb read-locked (INPLOOKUP_RLOCKPCB). */
	NET_EPOCH_ENTER(et);
	inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
	    addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL);
	NET_EPOCH_EXIT(et);
	if (inp != NULL) {
		if (inp->inp_socket == NULL)
			error = ENOENT;
		if (error == 0)
			error = cr_canseeinpcb(req->td->td_ucred, inp);
		if (error == 0)
			cru2x(inp->inp_cred, &xuc);
		INP_RUNLOCK(inp);
	} else
		error = ENOENT;
	/* xuc is only valid, and only copied out, when error is 0. */
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT,
    0, 0, tcp_getcred, "S,xucred",
    "Get the xucred of a TCP connection");
#endif /* INET */

#ifdef INET6
/*
 * IPv6 counterpart of tcp_getcred(): takes two sockaddr_in6 and returns
 * the xucred of the matching TCP connection.
 *
 * The scope of both addresses is embedded first.  If addrs[0] is an
 * IPv4-mapped address, addrs[1] must be one too (and INET must be
 * compiled in), otherwise EINVAL is returned; a mapped pair is looked
 * up as an IPv4 connection using the last four bytes of each address.
 */
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
	struct epoch_tracker et;
	struct xucred xuc;
	struct sockaddr_in6 addrs[2];
	struct inpcb *inp;
	int error;
#ifdef INET
	int mapped = 0;
#endif

	error = priv_check(req->td, PRIV_NETINET_GETCRED);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
	    (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
		return (error);
	}
	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
#ifdef INET
		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
			mapped = 1;
		else
#endif
			return (EINVAL);
	}

	NET_EPOCH_ENTER(et);
#ifdef INET
	if (mapped == 1)
		/* Bytes 12..15 of a mapped address hold the IPv4 address. */
		inp = in_pcblookup(&V_tcbinfo,
			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
			addrs[1].sin6_port,
			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
			addrs[0].sin6_port,
			INPLOOKUP_RLOCKPCB, NULL);
	else
#endif
		inp = in6_pcblookup(&V_tcbinfo,
			&addrs[1].sin6_addr, addrs[1].sin6_port,
			&addrs[0].sin6_addr, addrs[0].sin6_port,
			INPLOOKUP_RLOCKPCB, NULL);
	NET_EPOCH_EXIT(et);
	if (inp != NULL) {
		if (inp->inp_socket == NULL)
			error = ENOENT;
		if (error == 0)
			error = cr_canseeinpcb(req->td->td_ucred, inp);
		if (error == 0)
			cru2x(inp->inp_cred, &xuc);
		INP_RUNLOCK(inp);
	} else
		error = ENOENT;
	/* xuc is only valid, and only copied out, when error is 0. */
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}

SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT,
    0, 0, tcp6_getcred, "S,xucred",
    "Get the xucred of a TCP6 connection");
#endif /* INET6 */

#ifdef INET
/* Path MTU to try next when a fragmentation-needed message is received. */
static inline int
tcp_next_pmtu(const struct icmp *icp, const struct ip *ip)
{
	int mtu = ntohs(icp->icmp_nextmtu);

	/* If no alternative MTU was proposed, try the next smaller one.
*/
	if (!mtu)
		mtu = ip_next_mtu(ntohs(ip->ip_len), 1);
	if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr))
		mtu = V_tcp_minmss + sizeof(struct tcpiphdr);

	return (mtu);
}

/*
 * Handle an ICMP error that quotes a TCP segment we sent.
 *
 * icp is the ICMP message; the quoted IP and TCP headers are used to
 * find the connection.  port is the UDP encapsulation port in network
 * byte order (0 when the segment was not tunneled) and has to match
 * tp->t_port, otherwise the message is ignored.
 *
 * The errno derived from the ICMP type/code selects the reaction:
 * EMSGSIZE triggers path MTU discovery, ECONNREFUSED and (for
 * time-exceeded) EHOSTUNREACH may drop a SYN-SENT connection when
 * icmp_may_rst is set, everything else is recorded via tcp_notify().
 * The reaction only takes place if the quoted sequence number lies in
 * [snd_una, snd_max).  Without a matching inpcb the syncache gets a
 * chance to handle the error.
 */
static void
tcp_ctlinput_with_port(struct icmp *icp, uint16_t port)
{
	struct ip *ip;
	struct tcphdr *th;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct inpcb *(*notify)(struct inpcb *, int);
	struct in_conninfo inc;
	tcp_seq icmp_tcp_seq;
	int errno, mtu;

	errno = icmp_errmap(icp);
	switch (errno) {
	case 0:
		return;
	case EMSGSIZE:
		notify = tcp_mtudisc_notify;
		break;
	case ECONNREFUSED:
		if (V_icmp_may_rst)
			notify = tcp_drop_syn_sent;
		else
			notify = tcp_notify;
		break;
	case EHOSTUNREACH:
		if (V_icmp_may_rst && icp->icmp_type == ICMP_TIMXCEED)
			notify = tcp_drop_syn_sent;
		else
			notify = tcp_notify;
		break;
	default:
		notify = tcp_notify;
	}

	/* The quoted packet: our addresses are its source, the peer's its destination. */
	ip = &icp->icmp_ip;
	th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
	icmp_tcp_seq = th->th_seq;
	inp = in_pcblookup(&V_tcbinfo, ip->ip_dst, th->th_dport, ip->ip_src,
	    th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
	if (inp != NULL) {
-		if (!(inp->inp_flags & INP_DROPPED) &&
-		    !(inp->inp_socket == NULL)) {
+		if (inp->inp_socket != NULL) {
			tp = intotcpcb(inp);
#ifdef TCP_OFFLOAD
			if (tp->t_flags & TF_TOE && errno == EMSGSIZE) {
				/*
				 * MTU discovery for offloaded connections.  Let
				 * the TOE driver verify seq# and process it.
				 */
				mtu = tcp_next_pmtu(icp, ip);
				tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu);
				goto out;
			}
#endif
			if (tp->t_port != port) {
				goto out;
			}
			if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) &&
			    SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) {
				if (errno == EMSGSIZE) {
					/*
					 * MTU discovery: we got a needfrag and
					 * will potentially try a lower MTU.
					 */
					mtu = tcp_next_pmtu(icp, ip);

					/*
					 * Only process the offered MTU if it
					 * is smaller than the current one.
					 */
					if (mtu < tp->t_maxseg +
					    sizeof(struct tcpiphdr)) {
						bzero(&inc, sizeof(inc));
						inc.inc_faddr = ip->ip_dst;
						inc.inc_fibnum =
						    inp->inp_inc.inc_fibnum;
						tcp_hc_updatemtu(&inc, mtu);
						/*
						 * A NULL result means the inpcb
						 * must not be unlocked below.
						 */
						inp = tcp_mtudisc(inp, mtu);
					}
				} else
					inp = (*notify)(inp, errno);
			}
		}
	} else {
		bzero(&inc, sizeof(inc));
		inc.inc_fport = th->th_dport;
		inc.inc_lport = th->th_sport;
		inc.inc_faddr = ip->ip_dst;
		inc.inc_laddr = ip->ip_src;
		syncache_unreach(&inc, icmp_tcp_seq, port);
	}
out:
	if (inp != NULL)
		INP_WUNLOCK(inp);
}

/* ICMP error handler for plain (not UDP-encapsulated) TCP: port is 0. */
static void
tcp_ctlinput(struct icmp *icmp)
{
	tcp_ctlinput_with_port(icmp, htons(0));
}

/*
 * ICMP error handler for TCP tunneled over UDP.
 *
 * The quoted packet is IP / UDP / TCP.  After checking that enough of it
 * is present and that the UDP source port is our tunneling port, the
 * quoted TCP header is copied over the UDP header, the outer IP length
 * is reduced by the size of the UDP header, and the result is handed to
 * tcp_ctlinput_with_port() together with the quoted UDP destination port.
 */
static void
tcp_ctlinput_viaudp(udp_tun_icmp_param_t param)
{
	/* It's a tunneled TCP over UDP icmp */
	struct icmp *icmp = param.icmp;
	struct ip *outer_ip, *inner_ip;
	struct udphdr *udp;
	struct tcphdr *th, ttemp;
	int i_hlen, o_len;
	uint16_t port;

	/* Assumes the outer IP header has no options -- TODO confirm. */
	outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip));
	inner_ip = &icmp->icmp_ip;
	i_hlen = inner_ip->ip_hl << 2;
	o_len = ntohs(outer_ip->ip_len);
	/*
	 * Need the outer IP header, 8 more bytes (presumably the ICMP
	 * header), the inner IP and UDP headers, and the TCP header up to,
	 * but not including, th_ack.
	 */
	if (o_len <
	    (sizeof(struct ip) + 8 + i_hlen + sizeof(struct udphdr) + offsetof(struct tcphdr, th_ack))) {
		/* Not enough data present */
		return;
	}
	/* OK, let's strip out the inner udphdr header by copying up on top of it the tcp hdr */
	udp = (struct udphdr *)(((caddr_t)inner_ip) + i_hlen);
	if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) {
		return;
	}
	port = udp->uh_dport;
	th = (struct tcphdr *)(udp + 1);
	/*
	 * Source and destination overlap, hence the bounce through ttemp.
	 * NOTE(review): a full struct tcphdr is copied although the check
	 * above only guarantees the bytes before th_ack -- confirm that the
	 * buffer is always large enough.
	 */
	memcpy(&ttemp, th, sizeof(struct tcphdr));
	memcpy(udp, &ttemp, sizeof(struct tcphdr));
	/* Now adjust down the size of the outer IP header */
	o_len -= sizeof(struct udphdr);
	outer_ip->ip_len = htons(o_len);
	/* Now call in to the normal handling code */
	tcp_ctlinput_with_port(icmp, port);
}
#endif /* INET */

#ifdef INET6
/* Path MTU to try next when an ICMPv6 packet-too-big message is received. */
static inline int
tcp6_next_pmtu(const struct icmp6_hdr *icmp6)
{
	int mtu = ntohl(icmp6->icmp6_mtu);

	/*
	 * If no alternative MTU was proposed, or the proposed MTU was too
	 * small, set to the min.
	 */
	if (mtu < IPV6_MMTU)
		mtu = IPV6_MMTU - 8;	/* XXXNP: what is the adjustment for?
*/ return (mtu); } static void tcp6_ctlinput_with_port(struct ip6ctlparam *ip6cp, uint16_t port) { struct in6_addr *dst; struct inpcb *(*notify)(struct inpcb *, int); struct ip6_hdr *ip6; struct mbuf *m; struct inpcb *inp; struct tcpcb *tp; struct icmp6_hdr *icmp6; struct in_conninfo inc; struct tcp_ports { uint16_t th_sport; uint16_t th_dport; } t_ports; tcp_seq icmp_tcp_seq; unsigned int mtu; unsigned int off; int errno; icmp6 = ip6cp->ip6c_icmp6; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; dst = &ip6cp->ip6c_finaldst->sin6_addr; errno = icmp6_errmap(icmp6); switch (errno) { case 0: return; case EMSGSIZE: notify = tcp_mtudisc_notify; break; case ECONNREFUSED: if (V_icmp_may_rst) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; case EHOSTUNREACH: /* * There are only four ICMPs that may reset connection: * - administratively prohibited * - port unreachable * - time exceeded in transit * - unknown next header */ if (V_icmp_may_rst && ((icmp6->icmp6_type == ICMP6_DST_UNREACH && (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN || icmp6->icmp6_code == ICMP6_DST_UNREACH_NOPORT)) || (icmp6->icmp6_type == ICMP6_TIME_EXCEEDED && icmp6->icmp6_code == ICMP6_TIME_EXCEED_TRANSIT) || (icmp6->icmp6_type == ICMP6_PARAM_PROB && icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER))) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; default: notify = tcp_notify; } /* Check if we can safely get the ports from the tcp hdr */ if (m == NULL || (m->m_pkthdr.len < (int32_t) (off + sizeof(struct tcp_ports)))) { return; } bzero(&t_ports, sizeof(struct tcp_ports)); m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports); inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport, &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL); off += sizeof(struct tcp_ports); if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) { goto out; } m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq); if (inp != NULL) { - if (!(inp->inp_flags 
& INP_DROPPED) && - !(inp->inp_socket == NULL)) { + if (inp->inp_socket != NULL) { tp = intotcpcb(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE && errno == EMSGSIZE) { /* MTU discovery for offloaded connections. */ mtu = tcp6_next_pmtu(icmp6); tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu); goto out; } #endif if (tp->t_port != port) { goto out; } if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (errno == EMSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = tcp6_next_pmtu(icmp6); bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) goto out; /* * Only process the offered MTU if it * is smaller than the current one. */ if (mtu < tp->t_maxseg + sizeof (struct tcphdr) + sizeof (struct ip6_hdr)) { tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } else inp = (*notify)(inp, errno); } } } else { bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc_fport = t_ports.th_dport; inc.inc_lport = t_ports.th_sport; inc.inc6_faddr = *dst; inc.inc6_laddr = ip6->ip6_src; syncache_unreach(&inc, icmp_tcp_seq, port); } out: if (inp != NULL) INP_WUNLOCK(inp); } static void tcp6_ctlinput(struct ip6ctlparam *ctl) { tcp6_ctlinput_with_port(ctl, htons(0)); } static void tcp6_ctlinput_viaudp(udp_tun_icmp_param_t param) { struct ip6ctlparam *ip6cp = param.ip6cp; struct mbuf *m; struct udphdr *udp; uint16_t port; m = m_pulldown(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(struct udphdr), NULL); if (m == NULL) { return; } udp = mtod(m, struct udphdr *); if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { return; } port = udp->uh_dport; m_adj(m, sizeof(struct udphdr)); if ((m->m_flags & M_PKTHDR) == 0) { ip6cp->ip6c_m->m_pkthdr.len -= sizeof(struct udphdr); } /* Now 
call in to the normal handling code */ tcp6_ctlinput_with_port(ip6cp, port); } #endif /* INET6 */ static uint32_t tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len) { SIPHASH_CTX ctx; uint32_t hash[2]; KASSERT(len >= SIPHASH_KEY_LENGTH, ("%s: keylen %u too short ", __func__, len)); SipHash24_Init(&ctx); SipHash_SetKey(&ctx, (uint8_t *)key); SipHash_Update(&ctx, &inc->inc_fport, sizeof(uint16_t)); SipHash_Update(&ctx, &inc->inc_lport, sizeof(uint16_t)); switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: SipHash_Update(&ctx, &inc->inc_faddr, sizeof(struct in_addr)); SipHash_Update(&ctx, &inc->inc_laddr, sizeof(struct in_addr)); break; #endif #ifdef INET6 case INC_ISIPV6: SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(struct in6_addr)); SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(struct in6_addr)); break; #endif } SipHash_Final((uint8_t *)hash, &ctx); return (hash[0] ^ hash[1]); } uint32_t tcp_new_ts_offset(struct in_conninfo *inc) { struct in_conninfo inc_store, *local_inc; if (!V_tcp_ts_offset_per_conn) { memcpy(&inc_store, inc, sizeof(struct in_conninfo)); inc_store.inc_lport = 0; inc_store.inc_fport = 0; local_inc = &inc_store; } else { local_inc = inc; } return (tcp_keyed_hash(local_inc, V_ts_offset_secret, sizeof(V_ts_offset_secret))); } /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. 
*
 * Implementation details:
 *
 * Time is based off the system timer, and is corrected so that it
 * increases by one megabyte per second.  This allows for proper
 * recycling on high speed LANs while still leaving over an hour
 * before rollover.
 *
 * As reading the *exact* system time is too expensive to be done
 * whenever setting up a TCP connection, we increment the time
 * offset in two ways.  First, a small positive increment with a
 * random component is added to isn_offset for each connection that
 * is set up.  Second, whenever tcp_new_isn() notices that the tick
 * counter has advanced since the previous call, it
 * increments isn_offset as necessary so that sequence numbers
 * are incremented at approximately ISN_BYTES_PER_SECOND.  The
 * random positive increments serve only to ensure that the same
 * exact sequence number is never sent out twice (as could otherwise
 * happen when a port is recycled in less than the system tick
 * interval.)
 *
 * net.inet.tcp.isn_reseed_interval controls the number of seconds
 * between seeding of isn_secret.  This is normally set to zero,
 * as reseeding should not be necessary.
 *
 * Locking of the global variables isn_secret, isn_last, isn_last_reseed,
 * isn_offset, and isn_offset_old is performed using the ISN lock.  In
 * general, this means holding an exclusive (write) lock.
*/ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) #define ISN_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, isn_secret[ISN_SECRET_LENGTH]); VNET_DEFINE_STATIC(int, isn_last); VNET_DEFINE_STATIC(int, isn_last_reseed); VNET_DEFINE_STATIC(u_int32_t, isn_offset); VNET_DEFINE_STATIC(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct in_conninfo *inc) { tcp_seq new_isn; u_int32_t projected_offset; ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { arc4rand(&V_isn_secret, sizeof(V_isn_secret), 0); V_isn_last_reseed = ticks; } /* Compute the hash and return the ISN. */ new_isn = (tcp_seq)tcp_keyed_hash(inc, V_isn_secret, sizeof(V_isn_secret)); V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. 
*/ static struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); if (inp->inp_flags & INP_DROPPED) return (inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); if (IS_FASTOPEN(tp->t_flags)) tcp_fastopen_disable_path(tp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { return (tcp_mtudisc(inp, -1)); } static struct inpcb * tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); if (inp->inp_flags & INP_DROPPED) return (inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. */ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); if (tp->t_fb->tfb_tcp_mtu_chg != NULL) { /* * Conceptually the snd_nxt setting * and freeing sack holes should * be done by the default stacks * own tfb_tcp_mtu_chg(). */ tp->t_fb->tfb_tcp_mtu_chg(tp); } if (tcp_output(tp) < 0) return (NULL); else return (inp); } #ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. 
*/ uint32_t tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); if (inc->inc_faddr.s_addr != INADDR_ANY) { nh = fib4_lookup(inc->inc_fibnum, inc->inc_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } } return (maxmtu); } #endif /* INET */ #ifdef INET6 uint32_t tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct in6_addr dst6; uint32_t scopeid; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); if (inc->inc_flags & INC_IPV6MINMTU) return (IPV6_MMTU); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); nh = fib6_lookup(inc->inc_fibnum, &dst6, scopeid, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } } return (maxmtu); } /* * Handle setsockopt(IPV6_USE_MIN_MTU) by a TCP stack. * * XXXGL: we are updating inpcb here with INC_IPV6MINMTU flag. * The right place to do that is ip6_setpktopt() that has just been * executed. By the way it just filled ip6po_minmtu for us. 
*/ void tcp6_use_min_mtu(struct tcpcb *tp) { struct inpcb *inp = tp->t_inpcb; INP_WLOCK_ASSERT(inp); /* * In case of the IPV6_USE_MIN_MTU socket * option, the INC_IPV6MINMTU flag to announce * a corresponding MSS during the initial * handshake. If the TCP connection is not in * the front states, just reduce the MSS being * used. This avoids the sending of TCP * segments which will be fragmented at the * IPv6 layer. */ inp->inp_inc.inc_flags |= INC_IPV6MINMTU; if ((tp->t_state >= TCPS_SYN_SENT) && (inp->inp_inc.inc_flags & INC_ISIPV6)) { struct ip6_pktopts *opt; opt = inp->in6p_outputopts; if (opt != NULL && opt->ip6po_minmtu == IP6PO_MINMTU_ALL && tp->t_maxseg > TCP6_MSS) tp->t_maxseg = TCP6_MSS; } } #endif /* INET6 */ /* * Calculate effective SMSS per RFC5681 definition for a given TCP * connection at its current state, taking into account SACK and etc. */ u_int tcp_maxseg(const struct tcpcb *tp) { u_int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We might make mistakes with padding here in some edge cases, * but this is harmless, since result of tcp_maxseg() is used * only in cwnd and ssthresh estimations. 
*/ if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { optlen += TCPOLEN_SACKHDR; optlen += tp->rcv_numsacks * TCPOLEN_SACK; optlen = PADTCPOLEN(optlen); } } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PADTCPOLEN(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PADTCPOLEN(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } u_int tcp_fixed_maxseg(const struct tcpcb *tp) { int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We only consider fixed options that we would send every * time I.e. SACK is not considered. This is important * for cc modules to figure out what the modulo of the * cwnd should be. 
*/ #define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PAD(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PAD(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PAD(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. */ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; #ifdef INET struct sockaddr_in *fin = NULL, *lin = NULL; #endif struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); #ifdef INET fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; #endif break; } error = sa6_embedscope(fin6, 
V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { - if ((inp->inp_flags & INP_DROPPED) == 0 && - !SOLISTENING(inp->inp_socket)) { + if (!SOLISTENING(inp->inp_socket)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; NET_EPOCH_EXIT(et); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_drop, "", "Drop TCP connection"); static int tcp_sysctl_setsockopt(SYSCTL_HANDLER_ARGS) { return (sysctl_setsockopt(oidp, arg1, arg2, req, &V_tcbinfo, &tcp_ctloutput_set)); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, setsockopt, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, tcp_sysctl_setsockopt, "", "Set socket option for TCP endpoint"); #ifdef KERN_TLS static int sysctl_switch_tls(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. 
*/ struct sockaddr_storage addrs[2]; struct inpcb *inp; #ifdef INET struct sockaddr_in *fin = NULL, *lin = NULL; #endif struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); #ifdef INET fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; #endif break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } NET_EPOCH_EXIT(et); if (inp != NULL) { - if ((inp->inp_flags & INP_DROPPED) != 0 || - inp->inp_socket == NULL) { + 
if (inp->inp_socket == NULL) { error = ECONNRESET; INP_WUNLOCK(inp); } else { struct socket *so; so = inp->inp_socket; soref(so); error = ktls_set_tx_mode(so, arg2 == 0 ? TCP_TLS_MODE_SW : TCP_TLS_MODE_IFNET); INP_WUNLOCK(inp); sorele(so); } } else error = ESRCH; return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_sw_tls, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_switch_tls, "", "Switch TCP connection to SW TLS"); SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_ifnet_tls, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 1, sysctl_switch_tls, "", "Switch TCP connection to ifnet TLS"); #endif /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (V_tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? 
*/ if (tcp_log_debug == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; #ifdef INET const struct ip *ip = (const struct ip *)ip4hdr; #endif #ifdef INET6 const struct ip6_hdr *ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ #ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", tcp_get_flags(th), PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too 
long", __func__); return (s); } /* * A subroutine which makes it easy to track TCP state changes with DTrace. * This function shouldn't be called for t_state initializations that don't * correspond to actual TCP state transitions. */ void tcp_state_change(struct tcpcb *tp, int newstate) { #if defined(KDTRACE_HOOKS) int pstate = tp->t_state; #endif TCPSTATES_DEC(tp->t_state); TCPSTATES_INC(newstate); tp->t_state = newstate; TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); } /* * Create an external-format (``xtcpcb'') structure using the information in * the kernel-format tcpcb structure pointed to by tp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt) { struct tcpcb *tp = intotcpcb(inp); sbintime_t now; bzero(xt, sizeof(*xt)); xt->t_state = tp->t_state; xt->t_logstate = tp->t_logstate; xt->t_flags = tp->t_flags; xt->t_sndzerowin = tp->t_sndzerowin; xt->t_sndrexmitpack = tp->t_sndrexmitpack; xt->t_rcvoopack = tp->t_rcvoopack; xt->t_rcv_wnd = tp->rcv_wnd; xt->t_snd_wnd = tp->snd_wnd; xt->t_snd_cwnd = tp->snd_cwnd; xt->t_snd_ssthresh = tp->snd_ssthresh; xt->t_dsack_bytes = tp->t_dsack_bytes; xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes; xt->t_dsack_pack = tp->t_dsack_pack; xt->t_maxseg = tp->t_maxseg; xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 + (tp->t_flags2 & TF2_ACE_PERMIT) ? 
2 : 0; now = getsbinuptime(); #define COPYTIMER(ttt) do { \ if (callout_active(&tp->t_timers->ttt)) \ xt->ttt = (tp->t_timers->ttt.c_time - now) / \ SBT_1MS; \ else \ xt->ttt = 0; \ } while (0) COPYTIMER(tt_delack); COPYTIMER(tt_rexmt); COPYTIMER(tt_persist); COPYTIMER(tt_keep); COPYTIMER(tt_2msl); #undef COPYTIMER xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz; xt->xt_encaps_port = tp->t_port; bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack, TCP_FUNCTION_NAME_LEN_MAX); bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX); #ifdef TCP_BLACKBOX (void)tcp_log_get_id(tp, xt->xt_logid); #endif xt->xt_len = sizeof(struct xtcpcb); in_pcbtoxinpcb(inp, &xt->xt_inp); if (inp->inp_socket == NULL) xt->xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; } void tcp_log_end_status(struct tcpcb *tp, uint8_t status) { uint32_t bit, i; if ((tp == NULL) || (status > TCP_EI_STATUS_MAX_VALUE) || (status == 0)) { /* Invalid */ return; } if (status > (sizeof(uint32_t) * 8)) { /* Should this be a KASSERT? */ return; } bit = 1U << (status - 1); if (bit & tp->t_end_info_status) { /* already logged */ return; } for (i = 0; i < TCP_END_BYTE_INFO; i++) { if (tp->t_end_info_bytes[i] == TCP_EI_EMPTY_SLOT) { tp->t_end_info_bytes[i] = status; tp->t_end_info_status |= bit; break; } } } int tcp_can_enable_pacing(void) { if ((tcp_pacing_limit == -1) || (tcp_pacing_limit > number_of_tcp_connections_pacing)) { atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1); shadow_num_connections = number_of_tcp_connections_pacing; return (1); } else { return (0); } } static uint8_t tcp_pacing_warning = 0; void tcp_decrement_paced_conn(void) { uint32_t ret; ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1); shadow_num_connections = number_of_tcp_connections_pacing; KASSERT(ret != 0, ("tcp_paced_connection_exits -1 would cause wrap?")); if (ret == 0) { if (tcp_pacing_limit != -1) { printf("Warning all pacing is now disabled, count decrements invalidly!\n"); tcp_pacing_limit = 0; } else if 
(tcp_pacing_warning == 0) { printf("Warning pacing count is invalid, invalid decrement\n"); tcp_pacing_warning = 1; } } }