diff --git a/sys/netinet/in.c b/sys/netinet/in.c index 0cf148c38575..7d8cbc6725ad 100644 --- a/sys/netinet/in.c +++ b/sys/netinet/in.c @@ -1,1879 +1,1879 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (C) 2001 WIDE Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #define IN_HISTORICAL_NETS /* include class masks */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef MAC #include #endif static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct ucred *); static int in_difaddr_ioctl(u_long, caddr_t, struct ifnet *, struct ucred *); static int in_gifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct ucred *); static void in_socktrim(struct sockaddr_in *); static void in_purgemaddrs(struct ifnet *); static bool ia_need_loopback_route(const struct in_ifaddr *); VNET_DEFINE_STATIC(int, nosameprefix); #define V_nosameprefix VNET(nosameprefix) SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nosameprefix), 0, "Refuse to create same prefixes on different interfaces"); VNET_DEFINE_STATIC(bool, broadcast_lowest); #define V_broadcast_lowest VNET(broadcast_lowest) SYSCTL_BOOL(_net_inet_ip, OID_AUTO, broadcast_lowest, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(broadcast_lowest), 0, "Treat lowest address on a subnet (host 0) as broadcast"); VNET_DEFINE(bool, ip_allow_net240) = false; #define V_ip_allow_net240 VNET(ip_allow_net240) SYSCTL_BOOL(_net_inet_ip, OID_AUTO, allow_net240, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_allow_net240), 0, "Allow forwarding of and ICMP response to Experimental addresses, aka Class E (240/4)"); /* see https://datatracker.ietf.org/doc/draft-schoen-intarea-unicast-240 */ VNET_DEFINE(bool, ip_allow_net0) = false; SYSCTL_BOOL(_net_inet_ip, OID_AUTO, allow_net0, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_allow_net0), 0, "Allow forwarding of and ICMP response to addresses in network 0/8"); /* see https://datatracker.ietf.org/doc/draft-schoen-intarea-unicast-0 */ VNET_DEFINE(uint32_t, in_loopback_mask) = IN_LOOPBACK_MASK_DFLT; #define V_in_loopback_mask VNET(in_loopback_mask) static int sysctl_loopback_prefixlen(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_net_inet_ip, OID_AUTO, loopback_prefixlen, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_loopback_prefixlen, "I", "Prefix length of address space reserved for loopback"); /* see https://datatracker.ietf.org/doc/draft-schoen-intarea-unicast-127 */ VNET_DECLARE(struct inpcbinfo, ripcbinfo); #define V_ripcbinfo VNET(ripcbinfo) static struct sx in_control_sx; SX_SYSINIT(in_control_sx, &in_control_sx, "in_control"); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). */ int in_localaddr(struct in_addr in) { u_long i = ntohl(in.s_addr); struct in_ifaddr *ia; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((i & ia->ia_subnetmask) == ia->ia_subnet) return (1); } return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ bool in_localip(struct in_addr in) { struct in_ifaddr *ia; NET_EPOCH_ASSERT(); CK_LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr) return (true); return (false); } /* * Like in_localip(), but FIB-aware and carp(4)-aware. */ bool in_localip_fib(struct in_addr in, uint16_t fib) { struct in_ifaddr *ia; NET_EPOCH_ASSERT(); CK_LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr && (ia->ia_ifa.ifa_carp == NULL || carp_master_p(&ia->ia_ifa)) && ia->ia_ifa.ifa_ifp->if_fib == fib) return (true); return (false); } /* * Return 1 if an internet address is configured on an interface. */ int in_ifhasaddr(struct ifnet *ifp, struct in_addr in) { struct ifaddr *ifa; struct in_ifaddr *ia; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == in.s_addr) return (1); } return (0); } /* * Return a reference to the interface address which is different to * the supplied one but with same IP address value. */ static struct in_ifaddr * in_localip_more(struct in_ifaddr *original_ia) { struct epoch_tracker et; in_addr_t original_addr = IA_SIN(original_ia)->sin_addr.s_addr; uint32_t original_fib = original_ia->ia_ifa.ifa_ifp->if_fib; struct in_ifaddr *ia; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(ia, INADDR_HASH(original_addr), ia_hash) { in_addr_t addr = IA_SIN(ia)->sin_addr.s_addr; uint32_t fib = ia->ia_ifa.ifa_ifp->if_fib; if (!V_rt_add_addr_allfibs && (original_fib != fib)) continue; if ((original_ia != ia) && (original_addr == addr)) { ifa_ref(&ia->ia_ifa); NET_EPOCH_EXIT(et); return (ia); } } NET_EPOCH_EXIT(et); return (NULL); } /* * Tries to find first IPv4 address in the provided fib. * Prefers non-loopback addresses and return loopback IFF * @loopback_ok is set. * * Returns ifa or NULL. */ struct in_ifaddr * in_findlocal(uint32_t fibnum, bool loopback_ok) { struct in_ifaddr *ia = NULL, *ia_lo = NULL; NET_EPOCH_ASSERT(); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { uint32_t ia_fib = ia->ia_ifa.ifa_ifp->if_fib; if (!V_rt_add_addr_allfibs && (fibnum != ia_fib)) continue; if (!IN_LOOPBACK(ntohl(IA_SIN(ia)->sin_addr.s_addr))) break; if (loopback_ok) ia_lo = ia; } if (ia == NULL) ia = ia_lo; return (ia); } /* * Determine whether an IP address is in a reserved set of addresses * that may not be forwarded, or whether datagrams to that destination * may be forwarded. */ int in_canforward(struct in_addr in) { u_long i = ntohl(in.s_addr); if (IN_MULTICAST(i) || IN_LINKLOCAL(i) || IN_LOOPBACK(i)) return (0); if (IN_EXPERIMENTAL(i) && !V_ip_allow_net240) return (0); if (IN_ZERONET(i) && !V_ip_allow_net0) return (0); return (1); } /* * Sysctl to manage prefix of reserved loopback network; translate * to/from mask. The mask is always contiguous high-order 1 bits * followed by all 0 bits. */ static int sysctl_loopback_prefixlen(SYSCTL_HANDLER_ARGS) { int error, preflen; /* ffs is 1-based; compensate. */ preflen = 33 - ffs(V_in_loopback_mask); error = sysctl_handle_int(oidp, &preflen, 0, req); if (error || !req->newptr) return (error); if (preflen < 8 || preflen > 31) return (EINVAL); V_in_loopback_mask = 0xffffffff << (32 - preflen); return (0); } /* * Trim a mask in a sockaddr */ static void in_socktrim(struct sockaddr_in *ap) { char *cplim = (char *) &ap->sin_addr; char *cp = (char *) (&ap->sin_addr + 1); ap->sin_len = 0; while (--cp >= cplim) if (*cp) { (ap)->sin_len = cp - (char *) (ap) + 1; break; } } /* * Generic internet control operations (ioctl's). */ int in_control_ioctl(u_long cmd, void *data, struct ifnet *ifp, struct ucred *cred) { struct ifreq *ifr = (struct ifreq *)data; struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; int error; if (ifp == NULL) return (EADDRNOTAVAIL); /* * Filter out 4 ioctls we implement directly. Forward the rest * to specific functions and ifp->if_ioctl(). */ switch (cmd) { case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFDSTADDR: case SIOCGIFNETMASK: break; case SIOCGIFALIAS: sx_xlock(&in_control_sx); error = in_gifaddr_ioctl(cmd, data, ifp, cred); sx_xunlock(&in_control_sx); return (error); case SIOCDIFADDR: sx_xlock(&in_control_sx); error = in_difaddr_ioctl(cmd, data, ifp, cred); sx_xunlock(&in_control_sx); return (error); case OSIOCAIFADDR: /* 9.x compat */ case SIOCAIFADDR: sx_xlock(&in_control_sx); error = in_aifaddr_ioctl(cmd, data, ifp, cred); sx_xunlock(&in_control_sx); return (error); case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* We no longer support that old commands. */ return (EINVAL); default: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); return ((*ifp->if_ioctl)(ifp, cmd, data)); } if (addr->sin_addr.s_addr != INADDR_ANY && prison_check_ip4(cred, &addr->sin_addr) != 0) return (EADDRNOTAVAIL); /* * Find address for this interface, if it exists. If an * address was specified, find that one instead of the * first one on the interface, if possible. */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr) break; } if (ifa == NULL) CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET) { ia = (struct in_ifaddr *)ifa; if (prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0) break; } if (ifa == NULL) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } error = 0; switch (cmd) { case SIOCGIFADDR: *addr = ia->ia_addr; break; case SIOCGIFBRDADDR: if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EINVAL; break; } *addr = ia->ia_broadaddr; break; case SIOCGIFDSTADDR: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; break; } *addr = ia->ia_dstaddr; break; case SIOCGIFNETMASK: *addr = ia->ia_sockmask; break; } NET_EPOCH_EXIT(et); return (error); } int in_mask2len(struct in_addr *mask) { int x, y; u_char *p; p = (u_char *)mask; for (x = 0; x < sizeof(*mask); x++) { if (p[x] != 0xff) break; } y = 0; if (x < sizeof(*mask)) { for (y = 0; y < 8; y++) { if ((p[x] & (0x80 >> y)) == 0) break; } } return (x * 8 + y); } int in_control(struct socket *so, u_long cmd, void *data, struct ifnet *ifp, struct thread *td) { return (in_control_ioctl(cmd, data, ifp, td ? td->td_ucred : NULL)); } static int in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct ucred *cred) { const struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr; const struct sockaddr_in *mask = &ifra->ifra_mask; const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr; const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; bool iaIsFirst; int error = 0; error = priv_check_cred(cred, PRIV_NET_ADDIFADDR); if (error) return (error); /* * ifra_addr must be present and be of INET family. * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); if (broadaddr->sin_len != 0 && (broadaddr->sin_len != sizeof(struct sockaddr_in) || broadaddr->sin_family != AF_INET)) return (EINVAL); if (mask->sin_len != 0 && (mask->sin_len != sizeof(struct sockaddr_in) || mask->sin_family != AF_INET)) return (EINVAL); if ((ifp->if_flags & IFF_POINTOPOINT) && (dstaddr->sin_len != sizeof(struct sockaddr_in) || dstaddr->sin_addr.s_addr == INADDR_ANY)) return (EDESTADDRREQ); if (vhid != 0 && carp_attach_p == NULL) return (EPROTONOSUPPORT); #ifdef MAC /* Check if a MAC policy disallows setting the IPv4 address. */ error = mac_inet_check_add_addr(cred, &addr->sin_addr, ifp); if (error != 0) return (error); #endif /* * See whether address already exist. */ iaIsFirst = true; ia = NULL; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(cred, &addr->sin_addr) == 0) ia = it; else iaIsFirst = false; } NET_EPOCH_EXIT(et); if (ia != NULL) (void )in_difaddr_ioctl(cmd, data, ifp, cred); ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK); ia = (struct in_ifaddr *)ifa; ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; callout_init_rw(&ia->ia_garp_timer, &ifp->if_addr_lock, CALLOUT_RETURNUNLOCKED); ia->ia_ifp = ifp; ia->ia_addr = *addr; if (mask->sin_len != 0) { ia->ia_sockmask = *mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); } else { in_addr_t i = ntohl(addr->sin_addr.s_addr); /* * If netmask isn't supplied, use historical default. * This is deprecated for interfaces other than loopback * or point-to-point; warn in other cases. In the future * we should return an error rather than warning. */ if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) == 0) printf("%s: set address: WARNING: network mask " "should be specified; using historical default\n", ifp->if_xname); if (IN_CLASSA(i)) ia->ia_subnetmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) ia->ia_subnetmask = IN_CLASSB_NET; else ia->ia_subnetmask = IN_CLASSC_NET; ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask); } ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask; in_socktrim(&ia->ia_sockmask); if (ifp->if_flags & IFF_BROADCAST) { if (broadaddr->sin_len != 0) { ia->ia_broadaddr = *broadaddr; } else if (ia->ia_subnetmask == IN_RFC3021_MASK) { ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST; ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } else { ia->ia_broadaddr.sin_addr.s_addr = htonl(ia->ia_subnet | ~ia->ia_subnetmask); ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in); ia->ia_broadaddr.sin_family = AF_INET; } } if (ifp->if_flags & IFF_POINTOPOINT) ia->ia_dstaddr = *dstaddr; if (vhid != 0) { error = (*carp_attach_p)(&ia->ia_ifa, vhid); if (error) return (error); } /* if_addrhead is already referenced by ifa_alloc() */ IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(ifa); /* in_ifaddrhead */ sx_assert(&in_control_sx, SA_XLOCKED); CK_STAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link); CK_LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); /* * Give the interface a chance to initialize * if this is its first address, * and to validate the address if necessary. */ if (ifp->if_ioctl != NULL) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) goto fail1; } /* * Add route for the network. */ if (vhid == 0) { error = in_addprefix(ia); if (error) goto fail1; } /* * Add a loopback route to self. */ if (vhid == 0 && ia_need_loopback_route(ia)) { struct in_ifaddr *eia; eia = in_localip_more(ia); if (eia == NULL) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error) goto fail2; } else ifa_free(&eia->ia_ifa); } if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) { struct in_addr allhosts_addr; struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); error = in_joingroup(ifp, &allhosts_addr, NULL, &ii->ii_allhosts); } /* * Note: we don't need extra reference for ifa, since we called * with sx lock held, and ifaddr can not be deleted in concurrent * thread. */ EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, ifa, IFADDR_EVENT_ADD); return (error); fail2: if (vhid == 0) (void )in_scrubprefix(ia, LLE_STATIC); fail1: if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, false); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ sx_assert(&in_control_sx, SA_XLOCKED); CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); CK_LIST_REMOVE(ia, ia_hash); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (error); } static int in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct ucred *cred) { const struct ifreq *ifr = (struct ifreq *)data; const struct sockaddr_in *addr = (const struct sockaddr_in *) &ifr->ifr_addr; struct ifaddr *ifa; struct in_ifaddr *ia; bool deleteAny, iaIsLast; int error; if (cred != NULL) { error = priv_check_cred(cred, PRIV_NET_DELIFADDR); if (error) return (error); } if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) deleteAny = true; else deleteAny = false; iaIsLast = true; ia = NULL; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (deleteAny && ia == NULL && (cred == NULL || prison_check_ip4(cred, &it->ia_addr.sin_addr) == 0)) ia = it; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && (cred == NULL || prison_check_ip4(cred, &addr->sin_addr) == 0)) ia = it; if (it != ia) iaIsLast = false; } if (ia == NULL) { IF_ADDR_WUNLOCK(ifp); return (EADDRNOTAVAIL); } CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ sx_assert(&in_control_sx, SA_XLOCKED); CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link); CK_LIST_REMOVE(ia, ia_hash); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, LLE_STATIC); /* * in_ifadown gets rid of all the rest of * the routes. This is not quite the right * thing to do, but at least if we are running * a routing process they will come back. */ in_ifadown(&ia->ia_ifa, 1); if (ia->ia_ifa.ifa_carp) (*carp_detach_p)(&ia->ia_ifa, cmd == SIOCAIFADDR); /* * If this is the last IPv4 address configured on this * interface, leave the all-hosts group. * No state-change report need be transmitted. */ if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) { struct in_ifinfo *ii; ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]); if (ii->ii_allhosts) { (void)in_leavegroup(ii->ii_allhosts, NULL); ii->ii_allhosts = NULL; } } IF_ADDR_WLOCK(ifp); if (callout_stop(&ia->ia_garp_timer) == 1) { ifa_free(&ia->ia_ifa); } IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa, IFADDR_EVENT_DEL); ifa_free(&ia->ia_ifa); /* in_ifaddrhead */ return (0); } static int in_gifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct ucred *cred) { struct in_aliasreq *ifra = (struct in_aliasreq *)data; const struct sockaddr_in *addr = &ifra->ifra_addr; struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; /* * ifra_addr must be present and be of INET family. */ if (addr->sin_len != sizeof(struct sockaddr_in) || addr->sin_family != AF_INET) return (EINVAL); /* * See whether address exist. */ ia = NULL; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct in_ifaddr *it; if (ifa->ifa_addr->sa_family != AF_INET) continue; it = (struct in_ifaddr *)ifa; if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr && prison_check_ip4(cred, &addr->sin_addr) == 0) { ia = it; break; } } if (ia == NULL) { NET_EPOCH_EXIT(et); return (EADDRNOTAVAIL); } ifra->ifra_mask = ia->ia_sockmask; if ((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_family == AF_INET) ifra->ifra_dstaddr = ia->ia_dstaddr; else if ((ifp->if_flags & IFF_BROADCAST) && ia->ia_broadaddr.sin_family == AF_INET) ifra->ifra_broadaddr = ia->ia_broadaddr; else memset(&ifra->ifra_broadaddr, 0, sizeof(ifra->ifra_broadaddr)); NET_EPOCH_EXIT(et); return (0); } static int in_match_ifaddr(const struct rtentry *rt, const struct nhop_object *nh, void *arg) { if (nh->nh_ifa == (struct ifaddr *)arg) return (1); return (0); } static int in_handle_prefix_route(uint32_t fibnum, int cmd, struct sockaddr_in *dst, struct sockaddr_in *netmask, struct ifaddr *ifa, struct ifnet *ifp) { NET_EPOCH_ASSERT(); /* Prepare gateway */ struct sockaddr_dl_short sdl = { .sdl_family = AF_LINK, .sdl_len = sizeof(struct sockaddr_dl_short), .sdl_type = ifa->ifa_ifp->if_type, .sdl_index = ifa->ifa_ifp->if_index, }; struct rt_addrinfo info = { .rti_ifa = ifa, .rti_ifp = ifp, .rti_flags = RTF_PINNED | ((netmask != NULL) ? 0 : RTF_HOST), .rti_info = { [RTAX_DST] = (struct sockaddr *)dst, [RTAX_NETMASK] = (struct sockaddr *)netmask, [RTAX_GATEWAY] = (struct sockaddr *)&sdl, }, /* Ensure we delete the prefix IFF prefix ifa matches */ .rti_filter = in_match_ifaddr, .rti_filterdata = ifa, }; return (rib_handle_ifaddr_info(fibnum, cmd, &info)); } /* * Routing table interaction with interface addresses. * * In general, two types of routes needs to be installed: * a) "interface" or "prefix" route, telling user that the addresses * behind the ifa prefix are reached directly. * b) "loopback" route installed for the ifa address, telling user that * the address belongs to local system. * * Handling for (a) and (b) differs in multi-fib aspects, hence they * are implemented in different functions below. * * The cases above may intersect - /32 interface aliases results in * the same prefix produced by (a) and (b). This blurs the definition * of the "loopback" route and complicate interactions. The interaction * table is defined below. The case numbers are used in the multiple * functions below to refer to the particular test case. * * There can be multiple options: * 1) Adding address with prefix on non-p2p/non-loopback interface. * Example: 192.0.2.1/24. Action: * * add "prefix" route towards 192.0.2.0/24 via @ia interface, * using @ia as an address source. * * add "loopback" route towards 192.0.2.1 via V_loif, saving * @ia ifp in the gateway and using @ia as an address source. * * 2) Adding address with /32 mask to non-p2p/non-loopback interface. * Example: 192.0.2.2/32. Action: * * add "prefix" host route via V_loif, using @ia as an address source. * * 3) Adding address with or without prefix to p2p interface. * Example: 10.0.0.1/24->10.0.0.2. Action: * * add "prefix" host route towards 10.0.0.2 via this interface, using @ia * as an address source. Note: no sense in installing full /24 as the interface * is point-to-point. * * add "loopback" route towards 10.0.9.1 via V_loif, saving * @ia ifp in the gateway and using @ia as an address source. * * 4) Adding address with or without prefix to loopback interface. * Example: 192.0.2.1/24. Action: * * add "prefix" host route via @ia interface, using @ia as an address source. * Note: Skip installing /24 prefix as it would introduce TTL loop * for the traffic destined to these addresses. */ /* * Checks if @ia needs to install loopback route to @ia address via * ifa_maintain_loopback_route(). * * Return true on success. */ static bool ia_need_loopback_route(const struct in_ifaddr *ia) { struct ifnet *ifp = ia->ia_ifp; /* Case 4: Skip loopback interfaces */ if ((ifp->if_flags & IFF_LOOPBACK) || (ia->ia_addr.sin_addr.s_addr == INADDR_ANY)) return (false); /* Clash avoidance: Skip p2p interfaces with both addresses are equal */ if ((ifp->if_flags & IFF_POINTOPOINT) && ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) return (false); /* Case 2: skip /32 prefixes */ if (!(ifp->if_flags & IFF_POINTOPOINT) && (ia->ia_sockmask.sin_addr.s_addr == INADDR_BROADCAST)) return (false); return (true); } /* * Calculate "prefix" route corresponding to @ia. */ static void ia_getrtprefix(const struct in_ifaddr *ia, struct in_addr *prefix, struct in_addr *mask) { if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) { /* Case 3: return host route for dstaddr */ *prefix = ia->ia_dstaddr.sin_addr; mask->s_addr = INADDR_BROADCAST; } else if (ia->ia_ifp->if_flags & IFF_LOOPBACK) { /* Case 4: return host route for ifaddr */ *prefix = ia->ia_addr.sin_addr; mask->s_addr = INADDR_BROADCAST; } else { /* Cases 1,2: return actual ia prefix */ *prefix = ia->ia_addr.sin_addr; *mask = ia->ia_sockmask.sin_addr; prefix->s_addr &= mask->s_addr; } } /* * Adds or delete interface "prefix" route corresponding to @ifa. * Returns 0 on success or errno. */ static int in_handle_ifaddr_route(int cmd, struct in_ifaddr *ia) { struct ifaddr *ifa = &ia->ia_ifa; struct in_addr daddr, maddr; struct sockaddr_in *pmask; struct epoch_tracker et; int error; ia_getrtprefix(ia, &daddr, &maddr); struct sockaddr_in mask = { .sin_family = AF_INET, .sin_len = sizeof(struct sockaddr_in), .sin_addr = maddr, }; pmask = (maddr.s_addr != INADDR_BROADCAST) ? &mask : NULL; struct sockaddr_in dst = { .sin_family = AF_INET, .sin_len = sizeof(struct sockaddr_in), .sin_addr.s_addr = daddr.s_addr & maddr.s_addr, }; struct ifnet *ifp = ia->ia_ifp; if ((maddr.s_addr == INADDR_BROADCAST) && (!(ia->ia_ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)))) { /* Case 2: host route on broadcast interface */ ifp = V_loif; } uint32_t fibnum = ifa->ifa_ifp->if_fib; NET_EPOCH_ENTER(et); error = in_handle_prefix_route(fibnum, cmd, &dst, pmask, ifa, ifp); NET_EPOCH_EXIT(et); return (error); } /* * Check if we have a route for the given prefix already. */ static bool in_hasrtprefix(struct in_ifaddr *target) { struct epoch_tracker et; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; bool result = false; ia_getrtprefix(target, &prefix, &mask); /* Look for an existing address with the same prefix, mask, and fib */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { ia_getrtprefix(ia, &p, &m); if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib) continue; /* * If we got a matching prefix route inserted by other * interface address, we are done here. */ if (ia->ia_flags & IFA_ROUTE) { result = true; break; } } NET_EPOCH_EXIT(et); return (result); } int in_addprefix(struct in_ifaddr *target) { int error; if (in_hasrtprefix(target)) { if (V_nosameprefix) return (EEXIST); else { rt_addrmsg(RTM_ADD, &target->ia_ifa, target->ia_ifp->if_fib); return (0); } } /* * No-one seem to have this prefix route, so we try to insert it. */ rt_addrmsg(RTM_ADD, &target->ia_ifa, target->ia_ifp->if_fib); error = in_handle_ifaddr_route(RTM_ADD, target); if (!error) target->ia_flags |= IFA_ROUTE; return (error); } /* * Removes either all lle entries for given @ia, or lle * corresponding to @ia address. */ static void in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags) { struct sockaddr_in addr, mask; struct sockaddr *saddr, *smask; struct ifnet *ifp; saddr = (struct sockaddr *)&addr; bzero(&addr, sizeof(addr)); addr.sin_len = sizeof(addr); addr.sin_family = AF_INET; smask = (struct sockaddr *)&mask; bzero(&mask, sizeof(mask)); mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr.s_addr = ia->ia_subnetmask; ifp = ia->ia_ifp; if (all) { /* * Remove all L2 entries matching given prefix. * Convert address to host representation to avoid * doing this on every callback. ia_subnetmask is already * stored in host representation. */ addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr); lltable_prefix_free(AF_INET, saddr, smask, flags); } else { /* Remove interface address only */ addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr; lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR, saddr); } } /* * If there is no other address in the system that can serve a route to the * same prefix, remove the route. Hand over the route to the new address * otherwise. */ int in_scrubprefix(struct in_ifaddr *target, u_int flags) { struct epoch_tracker et; struct in_ifaddr *ia; struct in_addr prefix, mask, p, m; int error = 0; /* * Remove the loopback route to the interface address. */ if (ia_need_loopback_route(target) && (flags & LLE_STATIC)) { struct in_ifaddr *eia; eia = in_localip_more(target); if (eia != NULL) { error = ifa_switch_loopback_route((struct ifaddr *)eia, (struct sockaddr *)&target->ia_addr); ifa_free(&eia->ia_ifa); } else { error = ifa_del_loopback_route((struct ifaddr *)target, (struct sockaddr *)&target->ia_addr); } } ia_getrtprefix(target, &prefix, &mask); if ((target->ia_flags & IFA_ROUTE) == 0) { rt_addrmsg(RTM_DELETE, &target->ia_ifa, target->ia_ifp->if_fib); /* * Removing address from !IFF_UP interface or * prefix which exists on other interface (along with route). * No entries should exist here except target addr. * Given that, delete this entry only. */ in_scrubprefixlle(target, 0, flags); return (0); } NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { ia_getrtprefix(ia, &p, &m); if (prefix.s_addr != p.s_addr || mask.s_addr != m.s_addr) continue; if ((ia->ia_ifp->if_flags & IFF_UP) == 0) continue; /* * If we got a matching prefix address, move IFA_ROUTE and * the route itself to it. Make sure that routing daemons * get a heads-up. */ if ((ia->ia_flags & IFA_ROUTE) == 0) { ifa_ref(&ia->ia_ifa); NET_EPOCH_EXIT(et); error = in_handle_ifaddr_route(RTM_DELETE, target); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n", error); /* Scrub all entries IFF interface is different */ in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp, flags); error = in_handle_ifaddr_route(RTM_ADD, ia); if (error == 0) ia->ia_flags |= IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n", error); ifa_free(&ia->ia_ifa); return (error); } } NET_EPOCH_EXIT(et); /* * remove all L2 entries on the given prefix */ in_scrubprefixlle(target, 1, flags); /* * As no-one seem to have this prefix, we can remove the route. */ rt_addrmsg(RTM_DELETE, &target->ia_ifa, target->ia_ifp->if_fib); error = in_handle_ifaddr_route(RTM_DELETE, target); if (error == 0) target->ia_flags &= ~IFA_ROUTE; else log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error); return (error); } void in_ifscrub_all(void) { struct ifnet *ifp; struct ifaddr *ifa, *nifa; struct ifreq ifr; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* NET_EPOCH_ENTER(et); */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET) continue; /* * This is ugly but the only way for legacy IP to * cleanly remove addresses and everything attached. */ bzero(&ifr, sizeof(ifr)); ifr.ifr_addr = *ifa->ifa_addr; (void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL); } /* NET_EPOCH_EXIT(et); */ in_purgemaddrs(ifp); igmp_domifdetach(ifp); } IFNET_RUNLOCK(); } bool in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia) { return ((in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || /* * Optionally check for old-style (host 0) broadcast, but * taking into account that RFC 3021 obsoletes it. */ __predict_false(V_broadcast_lowest && ia->ia_subnetmask != IN_RFC3021_MASK && ntohl(in.s_addr) == ia->ia_subnet)) && /* * Check for an all one subnetmask. These * only exist when an interface gets a secondary * address. */ ia->ia_subnetmask != (u_long)0xffffffff); } /* * Return true if the address might be a local broadcast address. */ bool -in_broadcast(struct in_addr in, struct ifnet *ifp) +in_ifnet_broadcast(struct in_addr in, struct ifnet *ifp) { struct ifaddr *ifa; NET_EPOCH_ASSERT(); if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) return (true); if ((ifp->if_flags & IFF_BROADCAST) == 0) return (false); /* * Look through the list of addresses for a match * with a broadcast address. */ CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (ifa->ifa_addr->sa_family == AF_INET && in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) return (true); return (false); } /* * On interface removal, clean up IPv4 data structures hung off of the ifnet. */ void in_ifdetach(struct ifnet *ifp) { IN_MULTI_LOCK(); in_pcbpurgeif0(&V_ripcbinfo, ifp); in_pcbpurgeif0(&V_udbinfo, ifp); in_pcbpurgeif0(&V_ulitecbinfo, ifp); in_purgemaddrs(ifp); IN_MULTI_UNLOCK(); /* * Make sure all multicast deletions invoking if_ioctl() are * completed before returning. Else we risk accessing a freed * ifnet structure pointer. */ inm_release_wait(NULL); } static void in_ifnet_event(void *arg __unused, struct ifnet *ifp, int event) { struct epoch_tracker et; struct ifaddr *ifa; struct in_ifaddr *ia; int error; NET_EPOCH_ENTER(et); switch (event) { case IFNET_EVENT_DOWN: CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if ((ia->ia_flags & IFA_ROUTE) == 0) continue; ifa_ref(ifa); /* * in_scrubprefix() kills the interface route. */ in_scrubprefix(ia, 0); /* * in_ifadown gets rid of all the rest of the * routes. This is not quite the right thing * to do, but at least if we are running a * routing process they will come back. */ in_ifadown(ifa, 0); ifa_free(ifa); } break; case IFNET_EVENT_UP: CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = (struct in_ifaddr *)ifa; if (ia->ia_flags & IFA_ROUTE) continue; ifa_ref(ifa); error = ifa_del_loopback_route(ifa, ifa->ifa_addr); rt_addrmsg(RTM_ADD, ifa, ifa->ifa_ifp->if_fib); error = in_handle_ifaddr_route(RTM_ADD, ia); if (error == 0) ia->ia_flags |= IFA_ROUTE; error = ifa_add_loopback_route(ifa, ifa->ifa_addr); ifa_free(ifa); } break; } NET_EPOCH_EXIT(et); } EVENTHANDLER_DEFINE(ifnet_event, in_ifnet_event, NULL, EVENTHANDLER_PRI_ANY); /* * Delete all IPv4 multicast address records, and associated link-layer * multicast address records, associated with ifp. * XXX It looks like domifdetach runs AFTER the link layer cleanup. * XXX This should not race with ifma_protospec being set during * a new allocation, if it does, we have bigger problems. */ static void in_purgemaddrs(struct ifnet *ifp) { struct epoch_tracker et; struct in_multi_head purgeinms; struct in_multi *inm; struct ifmultiaddr *ifma; SLIST_INIT(&purgeinms); IN_MULTI_LIST_LOCK(); /* * Extract list of in_multi associated with the detaching ifp * which the PF_INET layer is about to release. * We need to do this as IF_ADDR_LOCK() may be re-acquired * by code further down. */ IF_ADDR_WLOCK(ifp); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { inm = inm_ifmultiaddr_get_inm(ifma); if (inm == NULL) continue; inm_rele_locked(&purgeinms, inm); } NET_EPOCH_EXIT(et); IF_ADDR_WUNLOCK(ifp); inm_release_list_deferred(&purgeinms); igmp_ifdetach(ifp); IN_MULTI_LIST_UNLOCK(); } struct in_llentry { struct llentry base; }; #define IN_LLTBL_DEFAULT_HSIZE 32 #define IN_LLTBL_HASH(k, h) \ (((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1)) /* * Do actual deallocation of @lle. */ static void in_lltable_destroy_lle_unlocked(epoch_context_t ctx) { struct llentry *lle; lle = __containerof(ctx, struct llentry, lle_epoch_ctx); LLE_LOCK_DESTROY(lle); LLE_REQ_DESTROY(lle); free(lle, M_LLTABLE); } /* * Called by LLE_FREE_LOCKED when number of references * drops to zero. */ static void in_lltable_destroy_lle(struct llentry *lle) { LLE_WUNLOCK(lle); NET_EPOCH_CALL(in_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx); } static struct llentry * in_lltable_new(struct in_addr addr4, u_int flags) { struct in_llentry *lle; lle = malloc(sizeof(struct in_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; /* * For IPv4 this will trigger "arpresolve" to generate * an ARP request. */ lle->base.la_expire = time_uptime; /* mark expired */ lle->base.r_l3addr.addr4 = addr4; lle->base.lle_refcnt = 1; lle->base.lle_free = in_lltable_destroy_lle; LLE_LOCK_INIT(&lle->base); LLE_REQ_INIT(&lle->base); callout_init(&lle->base.lle_timer, 1); return (&lle->base); } static int in_lltable_match_prefix(const struct sockaddr *saddr, const struct sockaddr *smask, u_int flags, struct llentry *lle) { struct in_addr addr, mask, lle_addr; addr = ((const struct sockaddr_in *)saddr)->sin_addr; mask = ((const struct sockaddr_in *)smask)->sin_addr; lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr); if (IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask) == 0) return (0); if (lle->la_flags & LLE_IFADDR) { /* * Delete LLE_IFADDR records IFF address & flag matches. * Note that addr is the interface address within prefix * being matched. * Note also we should handle 'ifdown' cases without removing * ifaddr macs. */ if (addr.s_addr == lle_addr.s_addr && (flags & LLE_STATIC) != 0) return (1); return (0); } /* flags & LLE_STATIC means deleting both dynamic and static entries */ if ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)) return (1); return (0); } static void in_lltable_free_entry(struct lltable *llt, struct llentry *lle) { size_t pkts_dropped; LLE_WLOCK_ASSERT(lle); KASSERT(llt != NULL, ("lltable is NULL")); /* Unlink entry from table if not already */ if ((lle->la_flags & LLE_LINKED) != 0) { IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp); lltable_unlink_entry(llt, lle); } /* Drop hold queue */ pkts_dropped = llentry_free(lle); ARPSTAT_ADD(dropped, pkts_dropped); } static int in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct nhop_object *nh; struct in_addr addr; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); addr = ((const struct sockaddr_in *)l3addr)->sin_addr; nh = fib4_lookup(ifp->if_fib, addr, 0, NHR_NONE, 0); if (nh == NULL) return (EINVAL); /* * If the gateway for an existing host route matches the target L3 * address, which is a special route inserted by some implementation * such as MANET, and the interface is of the correct type, then * allow for ARP to proceed. */ if (nh->nh_flags & NHF_GATEWAY) { if (!(nh->nh_flags & NHF_HOST) || nh->nh_ifp->if_type != IFT_ETHER || (nh->nh_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 || memcmp(nh->gw_sa.sa_data, l3addr->sa_data, sizeof(in_addr_t)) != 0) { return (EINVAL); } } /* * Make sure that at least the destination address is covered * by the route. This is for handling the case where 2 or more * interfaces have the same prefix. An incoming packet arrives * on one interface and the corresponding outgoing packet leaves * another interface. */ if ((nh->nh_ifp != ifp) && (nh->nh_flags & NHF_HOST) == 0) { struct in_ifaddr *ia = (struct in_ifaddr *)ifaof_ifpforaddr(l3addr, ifp); struct in_addr dst_addr, mask_addr; if (ia == NULL) return (EINVAL); /* * ifaof_ifpforaddr() returns _best matching_ IFA. * It is possible that ifa prefix does not cover our address. * Explicitly verify and fail if that's the case. */ dst_addr = IA_SIN(ia)->sin_addr; mask_addr.s_addr = htonl(ia->ia_subnetmask); if (!IN_ARE_MASKED_ADDR_EQUAL(dst_addr, addr, mask_addr)) return (EINVAL); } return (0); } static inline uint32_t in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize) { return (IN_LLTBL_HASH(dst.s_addr, hsize)); } static uint32_t in_lltable_hash(const struct llentry *lle, uint32_t hsize) { return (in_lltable_hash_dst(lle->r_l3addr.addr4, hsize)); } static void in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)sa; bzero(sin, sizeof(*sin)); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = lle->r_l3addr.addr4; } static inline struct llentry * in_lltable_find_dst(struct lltable *llt, struct in_addr dst) { struct llentry *lle; struct llentries *lleh; u_int hashidx; hashidx = in_lltable_hash_dst(dst, llt->llt_hsize); lleh = &llt->lle_head[hashidx]; CK_LIST_FOREACH(lle, lleh, lle_next) { if (lle->la_flags & LLE_DELETED) continue; if (lle->r_l3addr.addr4.s_addr == dst.s_addr) break; } return (lle); } static void in_lltable_delete_entry(struct lltable *llt, struct llentry *lle) { lle->la_flags |= LLE_DELETED; EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED); #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif llentry_free(lle); } static struct llentry * in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; char linkhdr[LLE_MAX_LINKHDR]; size_t linkhdrsize; int lladdr_off; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in_lltable_rtcheck(ifp, flags, l3addr) != 0) return (NULL); lle = in_lltable_new(sin->sin_addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return (NULL); } lle->la_flags = flags; if (flags & LLE_STATIC) lle->r_flags |= RLLE_VALID; if ((flags & LLE_IFADDR) == LLE_IFADDR) { linkhdrsize = LLE_MAX_LINKHDR; if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), linkhdr, &linkhdrsize, &lladdr_off) != 0) { in_lltable_free_entry(llt, lle); return (NULL); } lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, lladdr_off); lle->la_flags |= LLE_STATIC; lle->r_flags |= (RLLE_VALID | RLLE_IFADDR); lle->la_expire = 0; } return (lle); } /* * Return NULL if not found or marked for deletion. * If found return lle read locked. */ static struct llentry * in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct llentry *lle; IF_AFDATA_LOCK_ASSERT(llt->llt_ifp); KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) != (LLE_UNLOCKED | LLE_EXCLUSIVE), ("wrong lle request flags: %#x", flags)); lle = in_lltable_find_dst(llt, sin->sin_addr); if (lle == NULL) return (NULL); if (flags & LLE_UNLOCKED) return (lle); if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); /* * If the afdata lock is not held, the LLE may have been unlinked while * we were blocked on the LLE lock. Check for this case. */ if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) { if (flags & LLE_EXCLUSIVE) LLE_WUNLOCK(lle); else LLE_RUNLOCK(lle); return (NULL); } return (lle); } static int in_lltable_dump_entry(struct lltable *llt, struct llentry *lle, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in sin; struct sockaddr_dl sdl; } arpc; struct sockaddr_dl *sdl; int error; bzero(&arpc, sizeof(arpc)); /* skip deleted entries */ if ((lle->la_flags & LLE_DELETED) == LLE_DELETED) return (0); /* Skip if jailed and not a valid IP of the prison. */ lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin); if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0) return (0); /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in; (IPv4) * struct sockaddr_dl; */ arpc.rtm.rtm_msglen = sizeof(arpc); arpc.rtm.rtm_version = RTM_VERSION; arpc.rtm.rtm_type = RTM_GET; arpc.rtm.rtm_flags = RTF_UP; arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; /* publish */ if (lle->la_flags & LLE_PUB) arpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &arpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); } arpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) arpc.rtm.rtm_flags |= RTF_STATIC; if (lle->la_flags & LLE_IFADDR) arpc.rtm.rtm_flags |= RTF_PINNED; arpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &arpc, sizeof(arpc)); return (error); } static void in_lltable_post_resolved(struct lltable *llt, struct llentry *lle) { struct ifnet *ifp = llt->llt_ifp; /* gratuitous ARP */ if ((lle->la_flags & LLE_PUB) != 0) arprequest(ifp, &lle->r_l3addr.addr4, &lle->r_l3addr.addr4, lle->ll_addr); } static struct lltable * in_lltattach(struct ifnet *ifp) { struct lltable *llt; llt = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE); llt->llt_af = AF_INET; llt->llt_ifp = ifp; llt->llt_lookup = in_lltable_lookup; llt->llt_alloc_entry = in_lltable_alloc; llt->llt_delete_entry = in_lltable_delete_entry; llt->llt_dump_entry = in_lltable_dump_entry; llt->llt_hash = in_lltable_hash; llt->llt_fill_sa_entry = in_lltable_fill_sa_entry; llt->llt_free_entry = in_lltable_free_entry; llt->llt_match_prefix = in_lltable_match_prefix; llt->llt_mark_used = llentry_mark_used; llt->llt_post_resolved = in_lltable_post_resolved; lltable_link(llt); return (llt); } struct lltable * in_lltable_get(struct ifnet *ifp) { struct lltable *llt = NULL; void *afdata_ptr = ifp->if_afdata[AF_INET]; if (afdata_ptr != NULL) llt = ((struct in_ifinfo *)afdata_ptr)->ii_llt; return (llt); } void * in_domifattach(struct ifnet *ifp) { struct in_ifinfo *ii; ii = malloc(sizeof(struct in_ifinfo), M_IFADDR, M_WAITOK|M_ZERO); ii->ii_llt = in_lltattach(ifp); ii->ii_igmp = igmp_domifattach(ifp); return (ii); } void in_domifdetach(struct ifnet *ifp, void *aux) { struct in_ifinfo *ii = (struct in_ifinfo *)aux; igmp_domifdetach(ifp); lltable_free(ii->ii_llt); free(ii, M_IFADDR); } diff --git a/sys/netinet/in.h b/sys/netinet/in.h index e03fe3391388..0ee4200017b5 100644 --- a/sys/netinet/in.h +++ b/sys/netinet/in.h @@ -1,705 +1,705 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETINET_IN_H_ #define _NETINET_IN_H_ #include #include #include /* Protocols common to RFC 1700, POSIX, and X/Open. */ #define IPPROTO_IP 0 /* dummy for IP */ #define IPPROTO_ICMP 1 /* control message protocol */ #define IPPROTO_TCP 6 /* tcp */ #define IPPROTO_UDP 17 /* user datagram protocol */ #define INADDR_ANY ((in_addr_t)0x00000000) #define INADDR_BROADCAST ((in_addr_t)0xffffffff) /* must be masked */ #ifndef _UINT8_T_DECLARED typedef __uint8_t uint8_t; #define _UINT8_T_DECLARED #endif #ifndef _UINT16_T_DECLARED typedef __uint16_t uint16_t; #define _UINT16_T_DECLARED #endif #ifndef _UINT32_T_DECLARED typedef __uint32_t uint32_t; #define _UINT32_T_DECLARED #endif #ifndef _IN_ADDR_T_DECLARED typedef uint32_t in_addr_t; #define _IN_ADDR_T_DECLARED #endif #ifndef _IN_PORT_T_DECLARED typedef uint16_t in_port_t; #define _IN_PORT_T_DECLARED #endif #ifndef _SA_FAMILY_T_DECLARED typedef __sa_family_t sa_family_t; #define _SA_FAMILY_T_DECLARED #endif /* Internet address (a structure for historical reasons). */ #ifndef _STRUCT_IN_ADDR_DECLARED struct in_addr { in_addr_t s_addr; }; #define _STRUCT_IN_ADDR_DECLARED #endif #ifndef _SOCKLEN_T_DECLARED typedef __socklen_t socklen_t; #define _SOCKLEN_T_DECLARED #endif #include /* Socket address, internet style. */ struct sockaddr_in { uint8_t sin_len; sa_family_t sin_family; in_port_t sin_port; struct in_addr sin_addr; char sin_zero[8]; }; #if !defined(_KERNEL) && __POSIX_VISIBLE >= 200112 #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS uint32_t htonl(uint32_t); uint16_t htons(uint16_t); uint32_t ntohl(uint32_t); uint16_t ntohs(uint16_t); __END_DECLS #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif #endif /* !_KERNEL && __POSIX_VISIBLE >= 200112 */ #if __POSIX_VISIBLE >= 200112 #define IPPROTO_IPV6 41 /* IP6 header */ #define IPPROTO_RAW 255 /* raw IP packet */ #define INET_ADDRSTRLEN 16 #endif #if __BSD_VISIBLE /* * Constants and structures defined by the internet system, * Per RFC 790, September 1981, and numerous additions. */ /* * Protocols (RFC 1700) */ #define IPPROTO_HOPOPTS 0 /* IP6 hop-by-hop options */ #define IPPROTO_IGMP 2 /* group mgmt protocol */ #define IPPROTO_GGP 3 /* gateway^2 (deprecated) */ #define IPPROTO_IPV4 4 /* IPv4 encapsulation */ #define IPPROTO_IPIP IPPROTO_IPV4 /* for compatibility */ #define IPPROTO_ST 7 /* Stream protocol II */ #define IPPROTO_EGP 8 /* exterior gateway protocol */ #define IPPROTO_PIGP 9 /* private interior gateway */ #define IPPROTO_RCCMON 10 /* BBN RCC Monitoring */ #define IPPROTO_NVPII 11 /* network voice protocol*/ #define IPPROTO_PUP 12 /* pup */ #define IPPROTO_ARGUS 13 /* Argus */ #define IPPROTO_EMCON 14 /* EMCON */ #define IPPROTO_XNET 15 /* Cross Net Debugger */ #define IPPROTO_CHAOS 16 /* Chaos*/ #define IPPROTO_MUX 18 /* Multiplexing */ #define IPPROTO_MEAS 19 /* DCN Measurement Subsystems */ #define IPPROTO_HMP 20 /* Host Monitoring */ #define IPPROTO_PRM 21 /* Packet Radio Measurement */ #define IPPROTO_IDP 22 /* xns idp */ #define IPPROTO_TRUNK1 23 /* Trunk-1 */ #define IPPROTO_TRUNK2 24 /* Trunk-2 */ #define IPPROTO_LEAF1 25 /* Leaf-1 */ #define IPPROTO_LEAF2 26 /* Leaf-2 */ #define IPPROTO_RDP 27 /* Reliable Data */ #define IPPROTO_IRTP 28 /* Reliable Transaction */ #define IPPROTO_TP 29 /* tp-4 w/ class negotiation */ #define IPPROTO_BLT 30 /* Bulk Data Transfer */ #define IPPROTO_NSP 31 /* Network Services */ #define IPPROTO_INP 32 /* Merit Internodal */ #define IPPROTO_DCCP 33 /* Datagram Congestion Control Protocol */ #define IPPROTO_3PC 34 /* Third Party Connect */ #define IPPROTO_IDPR 35 /* InterDomain Policy Routing */ #define IPPROTO_XTP 36 /* XTP */ #define IPPROTO_DDP 37 /* Datagram Delivery */ #define IPPROTO_CMTP 38 /* Control Message Transport */ #define IPPROTO_TPXX 39 /* TP++ Transport */ #define IPPROTO_IL 40 /* IL transport protocol */ #define IPPROTO_SDRP 42 /* Source Demand Routing */ #define IPPROTO_ROUTING 43 /* IP6 routing header */ #define IPPROTO_FRAGMENT 44 /* IP6 fragmentation header */ #define IPPROTO_IDRP 45 /* InterDomain Routing*/ #define IPPROTO_RSVP 46 /* resource reservation */ #define IPPROTO_GRE 47 /* General Routing Encap. */ #define IPPROTO_MHRP 48 /* Mobile Host Routing */ #define IPPROTO_BHA 49 /* BHA */ #define IPPROTO_ESP 50 /* IP6 Encap Sec. Payload */ #define IPPROTO_AH 51 /* IP6 Auth Header */ #define IPPROTO_INLSP 52 /* Integ. Net Layer Security */ #define IPPROTO_SWIPE 53 /* IP with encryption */ #define IPPROTO_NHRP 54 /* Next Hop Resolution */ #define IPPROTO_MOBILE 55 /* IP Mobility */ #define IPPROTO_TLSP 56 /* Transport Layer Security */ #define IPPROTO_SKIP 57 /* SKIP */ #define IPPROTO_ICMPV6 58 /* ICMP6 */ #define IPPROTO_NONE 59 /* IP6 no next header */ #define IPPROTO_DSTOPTS 60 /* IP6 destination option */ #define IPPROTO_AHIP 61 /* any host internal protocol */ #define IPPROTO_CFTP 62 /* CFTP */ #define IPPROTO_HELLO 63 /* "hello" routing protocol */ #define IPPROTO_SATEXPAK 64 /* SATNET/Backroom EXPAK */ #define IPPROTO_KRYPTOLAN 65 /* Kryptolan */ #define IPPROTO_RVD 66 /* Remote Virtual Disk */ #define IPPROTO_IPPC 67 /* Pluribus Packet Core */ #define IPPROTO_ADFS 68 /* Any distributed FS */ #define IPPROTO_SATMON 69 /* Satnet Monitoring */ #define IPPROTO_VISA 70 /* VISA Protocol */ #define IPPROTO_IPCV 71 /* Packet Core Utility */ #define IPPROTO_CPNX 72 /* Comp. Prot. Net. Executive */ #define IPPROTO_CPHB 73 /* Comp. Prot. HeartBeat */ #define IPPROTO_WSN 74 /* Wang Span Network */ #define IPPROTO_PVP 75 /* Packet Video Protocol */ #define IPPROTO_BRSATMON 76 /* BackRoom SATNET Monitoring */ #define IPPROTO_ND 77 /* Sun net disk proto (temp.) */ #define IPPROTO_WBMON 78 /* WIDEBAND Monitoring */ #define IPPROTO_WBEXPAK 79 /* WIDEBAND EXPAK */ #define IPPROTO_EON 80 /* ISO cnlp */ #define IPPROTO_VMTP 81 /* VMTP */ #define IPPROTO_SVMTP 82 /* Secure VMTP */ #define IPPROTO_VINES 83 /* Banyon VINES */ #define IPPROTO_TTP 84 /* TTP */ #define IPPROTO_IGP 85 /* NSFNET-IGP */ #define IPPROTO_DGP 86 /* dissimilar gateway prot. */ #define IPPROTO_TCF 87 /* TCF */ #define IPPROTO_IGRP 88 /* Cisco/GXS IGRP */ #define IPPROTO_OSPFIGP 89 /* OSPFIGP */ #define IPPROTO_SRPC 90 /* Strite RPC protocol */ #define IPPROTO_LARP 91 /* Locus Address Resoloution */ #define IPPROTO_MTP 92 /* Multicast Transport */ #define IPPROTO_AX25 93 /* AX.25 Frames */ #define IPPROTO_IPEIP 94 /* IP encapsulated in IP */ #define IPPROTO_MICP 95 /* Mobile Int.ing control */ #define IPPROTO_SCCSP 96 /* Semaphore Comm. security */ #define IPPROTO_ETHERIP 97 /* Ethernet IP encapsulation */ #define IPPROTO_ENCAP 98 /* encapsulation header */ #define IPPROTO_APES 99 /* any private encr. scheme */ #define IPPROTO_GMTP 100 /* GMTP*/ #define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ #define IPPROTO_SCTP 132 /* SCTP */ #define IPPROTO_MH 135 /* IPv6 Mobility Header */ #define IPPROTO_UDPLITE 136 /* UDP-Lite */ #define IPPROTO_HIP 139 /* IP6 Host Identity Protocol */ #define IPPROTO_SHIM6 140 /* IP6 Shim6 Protocol */ /* 101-254: Partly Unassigned */ #define IPPROTO_PIM 103 /* Protocol Independent Mcast */ #define IPPROTO_CARP 112 /* CARP */ #define IPPROTO_PGM 113 /* PGM */ #define IPPROTO_MPLS 137 /* MPLS-in-IP */ #define IPPROTO_PFSYNC 240 /* PFSYNC */ #define IPPROTO_RESERVED_253 253 /* Reserved */ #define IPPROTO_RESERVED_254 254 /* Reserved */ /* 255: Reserved */ /* BSD Private, local use, namespace incursion, no longer used */ #define IPPROTO_OLD_DIVERT 254 /* OLD divert pseudo-proto */ #define IPPROTO_MAX 256 /* last return value of *_input(), meaning "all job for this pkt is done". */ #define IPPROTO_DONE 257 /* Only used internally, so can be outside the range of valid IP protocols. */ #define IPPROTO_DIVERT 258 /* divert pseudo-protocol */ #define IPPROTO_SEND 259 /* SeND pseudo-protocol */ /* * Defined to avoid confusion. The master value is defined by * PROTO_SPACER in sys/protosw.h. */ #define IPPROTO_SPACER 32767 /* spacer for loadable protos */ /* * Local port number conventions: * * When a user does a bind(2) or connect(2) with a port number of zero, * a non-conflicting local port address is chosen. * The default range is IPPORT_HIFIRSTAUTO through * IPPORT_HILASTAUTO, although that is settable by sysctl. * * A user may set the IPPROTO_IP option IP_PORTRANGE to change this * default assignment range. * * The value IP_PORTRANGE_DEFAULT causes the default behavior. * * The value IP_PORTRANGE_HIGH changes the range of candidate port numbers * into the "high" range. These are reserved for client outbound connections * which do not want to be filtered by any firewalls. * * The value IP_PORTRANGE_LOW changes the range to the "low" are * that is (by convention) restricted to privileged processes. This * convention is based on "vouchsafe" principles only. It is only secure * if you trust the remote host to restrict these ports. * * The default range of ports and the high range can be changed by * sysctl(3). (net.inet.ip.portrange.{hi,low,}{first,last}) * * Changing those values has bad security implications if you are * using a stateless firewall that is allowing packets outside of that * range in order to allow transparent outgoing connections. * * Such a firewall configuration will generally depend on the use of these * default values. If you change them, you may find your Security * Administrator looking for you with a heavy object. * * For a slightly more orthodox text view on this: * * ftp://ftp.isi.edu/in-notes/iana/assignments/port-numbers * * port numbers are divided into three ranges: * * 0 - 1023 Well Known Ports * 1024 - 49151 Registered Ports * 49152 - 65535 Dynamic and/or Private Ports * */ /* * Ports < IPPORT_RESERVED are reserved for * privileged processes (e.g. root). (IP_PORTRANGE_LOW) */ #define IPPORT_RESERVED 1024 /* * Default local port range, used by IP_PORTRANGE_DEFAULT */ #define IPPORT_EPHEMERALFIRST 10000 #define IPPORT_EPHEMERALLAST 65535 /* * Dynamic port range, used by IP_PORTRANGE_HIGH. */ #define IPPORT_HIFIRSTAUTO 49152 #define IPPORT_HILASTAUTO 65535 /* * Scanning for a free reserved port return a value below IPPORT_RESERVED, * but higher than IPPORT_RESERVEDSTART. Traditionally the start value was * 512, but that conflicts with some well-known-services that firewalls may * have a fit if we use. */ #define IPPORT_RESERVEDSTART 600 #define IPPORT_MAX 65535 /* * Historical definitions of bits in internet address integers * (pre-CIDR). Class A/B/C are long obsolete, and now deprecated. * Hide these definitions from the kernel unless IN_HISTORICAL_NETS * is defined. Provide the historical definitions to user level for now. */ #ifndef _KERNEL #define IN_HISTORICAL_NETS #endif #ifdef IN_HISTORICAL_NETS #define IN_CLASSA(i) (((in_addr_t)(i) & 0x80000000) == 0) #define IN_CLASSA_NET 0xff000000 #define IN_CLASSA_NSHIFT 24 #define IN_CLASSA_HOST 0x00ffffff #define IN_CLASSA_MAX 128 #define IN_CLASSB(i) (((in_addr_t)(i) & 0xc0000000) == 0x80000000) #define IN_CLASSB_NET 0xffff0000 #define IN_CLASSB_NSHIFT 16 #define IN_CLASSB_HOST 0x0000ffff #define IN_CLASSB_MAX 65536 #define IN_CLASSC(i) (((in_addr_t)(i) & 0xe0000000) == 0xc0000000) #define IN_CLASSC_NET 0xffffff00 #define IN_CLASSC_NSHIFT 8 #define IN_CLASSC_HOST 0x000000ff #endif /* IN_HISTORICAL_NETS */ #define IN_NETMASK_DEFAULT 0xffffff00 /* mask when forced to guess */ #define IN_MULTICAST(i) (((in_addr_t)(i) & 0xf0000000) == 0xe0000000) #ifdef IN_HISTORICAL_NETS #define IN_CLASSD(i) IN_MULTICAST(i) #define IN_CLASSD_NET 0xf0000000 /* These ones aren't really */ #define IN_CLASSD_NSHIFT 28 /* net and host fields, but */ #define IN_CLASSD_HOST 0x0fffffff /* routing needn't know. */ #endif /* IN_HISTORICAL_NETS */ #define IN_EXPERIMENTAL(i) (((in_addr_t)(i) & 0xf0000000) == 0xf0000000) #define IN_BADCLASS(i) (((in_addr_t)(i) & 0xf0000000) == 0xf0000000) #define IN_LINKLOCAL(i) (((in_addr_t)(i) & 0xffff0000) == 0xa9fe0000) #ifdef _KERNEL #define IN_LOOPBACK(i) \ (((in_addr_t)(i) & V_in_loopback_mask) == 0x7f000000) #define IN_LOOPBACK_MASK_DFLT 0xff000000 #else #define IN_LOOPBACK(i) (((in_addr_t)(i) & 0xff000000) == 0x7f000000) #endif #define IN_ZERONET(i) (((in_addr_t)(i) & 0xff000000) == 0) #define IN_PRIVATE(i) ((((in_addr_t)(i) & 0xff000000) == 0x0a000000) || \ (((in_addr_t)(i) & 0xfff00000) == 0xac100000) || \ (((in_addr_t)(i) & 0xffff0000) == 0xc0a80000)) #define IN_LOCAL_GROUP(i) (((in_addr_t)(i) & 0xffffff00) == 0xe0000000) #define IN_ANY_LOCAL(i) (IN_LINKLOCAL(i) || IN_LOCAL_GROUP(i)) #define INADDR_LOOPBACK ((in_addr_t)0x7f000001) #ifndef _KERNEL #define INADDR_NONE ((in_addr_t)0xffffffff) /* -1 return */ #endif #define INADDR_UNSPEC_GROUP ((in_addr_t)0xe0000000) /* 224.0.0.0 */ #define INADDR_ALLHOSTS_GROUP ((in_addr_t)0xe0000001) /* 224.0.0.1 */ #define INADDR_ALLRTRS_GROUP ((in_addr_t)0xe0000002) /* 224.0.0.2 */ #define INADDR_ALLRPTS_GROUP ((in_addr_t)0xe0000016) /* 224.0.0.22, IGMPv3 */ #define INADDR_CARP_GROUP ((in_addr_t)0xe0000012) /* 224.0.0.18 */ #define INADDR_PFSYNC_GROUP ((in_addr_t)0xe00000f0) /* 224.0.0.240 */ #define INADDR_ALLMDNS_GROUP ((in_addr_t)0xe00000fb) /* 224.0.0.251 */ #define INADDR_MAX_LOCAL_GROUP ((in_addr_t)0xe00000ff) /* 224.0.0.255 */ #ifdef IN_HISTORICAL_NETS #define IN_LOOPBACKNET 127 /* official! */ #endif /* IN_HISTORICAL_NETS */ #define IN_RFC3021_MASK ((in_addr_t)0xfffffffe) #ifdef _KERNEL #include VNET_DECLARE(bool, ip_allow_net0); VNET_DECLARE(bool, ip_allow_net240); /* Address space reserved for loopback */ VNET_DECLARE(uint32_t, in_loopback_mask); #define V_ip_allow_net0 VNET(ip_allow_net0) #define V_ip_allow_net240 VNET(ip_allow_net240) #define V_in_loopback_mask VNET(in_loopback_mask) #endif /* * Options for use with [gs]etsockopt at the IP level. * First word of comment is data type; bool is stored in int. */ #define IP_OPTIONS 1 /* buf/ip_opts; set/get IP options */ #define IP_HDRINCL 2 /* int; header is included with data */ #define IP_TOS 3 /* int; IP type of service and preced. */ #define IP_TTL 4 /* int; IP time to live */ #define IP_RECVOPTS 5 /* bool; receive all IP opts w/dgram */ #define IP_RECVRETOPTS 6 /* bool; receive IP opts for response */ #define IP_RECVDSTADDR 7 /* bool; receive IP dst addr w/dgram */ #define IP_SENDSRCADDR IP_RECVDSTADDR /* cmsg_type to set src addr */ #define IP_RETOPTS 8 /* ip_opts; set/get IP options */ #define IP_MULTICAST_IF 9 /* struct in_addr *or* struct ip_mreqn; * set/get IP multicast i/f */ #define IP_MULTICAST_TTL 10 /* u_char; set/get IP multicast ttl */ #define IP_MULTICAST_LOOP 11 /* u_char; set/get IP multicast loopback */ #define IP_ADD_MEMBERSHIP 12 /* ip_mreq; add an IP group membership */ #define IP_DROP_MEMBERSHIP 13 /* ip_mreq; drop an IP group membership */ #define IP_MULTICAST_VIF 14 /* set/get IP mcast virt. iface */ #define IP_RSVP_ON 15 /* enable RSVP in kernel */ #define IP_RSVP_OFF 16 /* disable RSVP in kernel */ #define IP_RSVP_VIF_ON 17 /* set RSVP per-vif socket */ #define IP_RSVP_VIF_OFF 18 /* unset RSVP per-vif socket */ #define IP_PORTRANGE 19 /* int; range to choose for unspec port */ #define IP_RECVIF 20 /* bool; receive reception if w/dgram */ /* for IPSEC */ #define IP_IPSEC_POLICY 21 /* int; set/get security policy */ /* unused; was IP_FAITH */ #define IP_ONESBCAST 23 /* bool: send all-ones broadcast */ #define IP_BINDANY 24 /* bool: allow bind to any address */ /* unused; was IP_BIND_MULTI */ /* unused; was IP_RSS_LISTEN_BUCKET */ #define IP_ORIGDSTADDR 27 /* bool: receive IP dst addr/port w/dgram */ #define IP_RECVORIGDSTADDR IP_ORIGDSTADDR /* * Options for controlling the firewall and dummynet. * Historical options (from 40 to 64) will eventually be * replaced by only two options, IP_FW3 and IP_DUMMYNET3. */ #define IP_FW_TABLE_ADD 40 /* add entry */ #define IP_FW_TABLE_DEL 41 /* delete entry */ #define IP_FW_TABLE_FLUSH 42 /* flush table */ #define IP_FW_TABLE_GETSIZE 43 /* get table size */ #define IP_FW_TABLE_LIST 44 /* list table contents */ #define IP_FW3 48 /* generic ipfw v.3 sockopts */ #define IP_DUMMYNET3 49 /* generic dummynet v.3 sockopts */ #define IP_FW_ADD 50 /* add a firewall rule to chain */ #define IP_FW_DEL 51 /* delete a firewall rule from chain */ #define IP_FW_FLUSH 52 /* flush firewall rule chain */ #define IP_FW_ZERO 53 /* clear single/all firewall counter(s) */ #define IP_FW_GET 54 /* get entire firewall rule chain */ #define IP_FW_RESETLOG 55 /* reset logging counters */ #define IP_FW_NAT_CFG 56 /* add/config a nat rule */ #define IP_FW_NAT_DEL 57 /* delete a nat rule */ #define IP_FW_NAT_GET_CONFIG 58 /* get configuration of a nat rule */ #define IP_FW_NAT_GET_LOG 59 /* get log of a nat rule */ #define IP_DUMMYNET_CONFIGURE 60 /* add/configure a dummynet pipe */ #define IP_DUMMYNET_DEL 61 /* delete a dummynet pipe from chain */ #define IP_DUMMYNET_FLUSH 62 /* flush dummynet */ #define IP_DUMMYNET_GET 64 /* get entire dummynet pipes */ #define IP_RECVTTL 65 /* bool; receive IP TTL w/dgram */ #define IP_MINTTL 66 /* minimum TTL for packet or drop */ #define IP_DONTFRAG 67 /* don't fragment packet */ #define IP_RECVTOS 68 /* bool; receive IP TOS w/dgram */ /* IPv4 Source Filter Multicast API [RFC3678] */ #define IP_ADD_SOURCE_MEMBERSHIP 70 /* join a source-specific group */ #define IP_DROP_SOURCE_MEMBERSHIP 71 /* drop a single source */ #define IP_BLOCK_SOURCE 72 /* block a source */ #define IP_UNBLOCK_SOURCE 73 /* unblock a source */ /* The following option is private; do not use it from user applications. */ #define IP_MSFILTER 74 /* set/get filter list */ /* The following option deals with the 802.1Q Ethernet Priority Code Point */ #define IP_VLAN_PCP 75 /* int; set/get PCP used for packet, */ /* -1 use interface default */ /* Protocol Independent Multicast API [RFC3678] */ #define MCAST_JOIN_GROUP 80 /* join an any-source group */ #define MCAST_LEAVE_GROUP 81 /* leave all sources for group */ #define MCAST_JOIN_SOURCE_GROUP 82 /* join a source-specific group */ #define MCAST_LEAVE_SOURCE_GROUP 83 /* leave a single source */ #define MCAST_BLOCK_SOURCE 84 /* block a source */ #define MCAST_UNBLOCK_SOURCE 85 /* unblock a source */ /* Flow and RSS definitions */ #define IP_FLOWID 90 /* get flow id for the given socket/inp */ #define IP_FLOWTYPE 91 /* get flow type (M_HASHTYPE) */ #define IP_RSSBUCKETID 92 /* get RSS flowid -> bucket mapping */ #define IP_RECVFLOWID 93 /* bool; receive IP flowid/flowtype w/ datagram */ #define IP_RECVRSSBUCKETID 94 /* bool; receive IP RSS bucket id w/ datagram */ /* * Defaults and limits for options */ #define IP_DEFAULT_MULTICAST_TTL 1 /* normally limit m'casts to 1 hop */ #define IP_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */ /* * Limit for IPv4 multicast memberships */ #define IP_MAX_MEMBERSHIPS 4095 /* * Default resource limits for IPv4 multicast source filtering. * These may be modified by sysctl. */ #define IP_MAX_GROUP_SRC_FILTER 512 /* sources per group */ #define IP_MAX_SOCK_SRC_FILTER 128 /* sources per socket/group */ #define IP_MAX_SOCK_MUTE_FILTER 128 /* XXX no longer used */ /* * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP. */ struct ip_mreq { struct in_addr imr_multiaddr; /* IP multicast address of group */ struct in_addr imr_interface; /* local IP address of interface */ }; /* * Modified argument structure for IP_MULTICAST_IF, obtained from Linux. * This is used to specify an interface index for multicast sends, as * the IPv4 legacy APIs do not support this (unless IP_SENDIF is available). */ struct ip_mreqn { struct in_addr imr_multiaddr; /* IP multicast address of group */ struct in_addr imr_address; /* local IP address of interface */ int imr_ifindex; /* Interface index; cast to uint32_t */ }; /* * Argument structure for IPv4 Multicast Source Filter APIs. [RFC3678] */ struct ip_mreq_source { struct in_addr imr_multiaddr; /* IP multicast address of group */ struct in_addr imr_sourceaddr; /* IP address of source */ struct in_addr imr_interface; /* local IP address of interface */ }; /* * Argument structures for Protocol-Independent Multicast Source * Filter APIs. [RFC3678] */ struct group_req { uint32_t gr_interface; /* interface index */ struct sockaddr_storage gr_group; /* group address */ }; struct group_source_req { uint32_t gsr_interface; /* interface index */ struct sockaddr_storage gsr_group; /* group address */ struct sockaddr_storage gsr_source; /* source address */ }; #ifndef __MSFILTERREQ_DEFINED #define __MSFILTERREQ_DEFINED /* * The following structure is private; do not use it from user applications. * It is used to communicate IP_MSFILTER/IPV6_MSFILTER information between * the RFC 3678 libc functions and the kernel. */ struct __msfilterreq { uint32_t msfr_ifindex; /* interface index */ uint32_t msfr_fmode; /* filter mode for group */ uint32_t msfr_nsrcs; /* # of sources in msfr_srcs */ struct sockaddr_storage msfr_group; /* group address */ struct sockaddr_storage *msfr_srcs; /* pointer to the first member * of a contiguous array of * sources to filter in full. */ }; #endif struct sockaddr; /* * Advanced (Full-state) APIs [RFC3678] * The RFC specifies uint_t for the 6th argument to [sg]etsourcefilter(). * We use uint32_t here to be consistent. */ int setipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t, uint32_t, struct in_addr *); int getipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t *, uint32_t *, struct in_addr *); int setsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, uint32_t, uint32_t, struct sockaddr_storage *); int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, uint32_t *, uint32_t *, struct sockaddr_storage *); /* * Filter modes; also used to represent per-socket filter mode internally. */ #define MCAST_UNDEFINED 0 /* fmode: not yet defined */ #define MCAST_INCLUDE 1 /* fmode: include these source(s) */ #define MCAST_EXCLUDE 2 /* fmode: exclude these source(s) */ /* * Argument for IP_PORTRANGE: * - which range to search when port is unspecified at bind() or connect() */ #define IP_PORTRANGE_DEFAULT 0 /* default range */ #define IP_PORTRANGE_HIGH 1 /* "high" - request firewall bypass */ #define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */ /* * Identifiers for IP sysctl nodes */ #define IPCTL_FORWARDING 1 /* act as router */ #define IPCTL_SENDREDIRECTS 2 /* may send redirects when forwarding */ #define IPCTL_DEFTTL 3 /* default TTL */ #ifdef notyet #define IPCTL_DEFMTU 4 /* default MTU */ #endif /* IPCTL_RTEXPIRE 5 deprecated */ /* IPCTL_RTMINEXPIRE 6 deprecated */ /* IPCTL_RTMAXCACHE 7 deprecated */ #define IPCTL_SOURCEROUTE 8 /* may perform source routes */ #define IPCTL_DIRECTEDBROADCAST 9 /* may re-broadcast received packets */ #define IPCTL_INTRQMAXLEN 10 /* max length of netisr queue */ #define IPCTL_INTRQDROPS 11 /* number of netisr q drops */ #define IPCTL_STATS 12 /* ipstat structure */ #define IPCTL_ACCEPTSOURCEROUTE 13 /* may accept source routed packets */ #define IPCTL_FASTFORWARDING 14 /* use fast IP forwarding code */ /* 15, unused, was: IPCTL_KEEPFAITH */ #define IPCTL_GIF_TTL 16 /* default TTL for gif encap packet */ #define IPCTL_INTRDQMAXLEN 17 /* max length of direct netisr queue */ #define IPCTL_INTRDQDROPS 18 /* number of direct netisr q drops */ #endif /* __BSD_VISIBLE */ #ifdef _KERNEL struct ifnet; struct mbuf; /* forward declarations for Standard C */ struct in_ifaddr; -bool in_broadcast(struct in_addr, struct ifnet *); +bool in_ifnet_broadcast(struct in_addr, struct ifnet *); bool in_ifaddr_broadcast(struct in_addr, struct in_ifaddr *); int in_canforward(struct in_addr); int in_localaddr(struct in_addr); bool in_localip(struct in_addr); bool in_localip_fib(struct in_addr, uint16_t); int in_ifhasaddr(struct ifnet *, struct in_addr); struct in_ifaddr *in_findlocal(uint32_t, bool); int inet_aton(const char *, struct in_addr *); /* in libkern */ char *inet_ntoa_r(struct in_addr ina, char *buf); /* in libkern */ char *inet_ntop(int, const void *, char *, socklen_t); /* in libkern */ int inet_pton(int af, const char *, void *); /* in libkern */ void in_ifdetach(struct ifnet *); #define in_hosteq(s, t) ((s).s_addr == (t).s_addr) #define in_nullhost(x) ((x).s_addr == INADDR_ANY) #define in_allhosts(x) ((x).s_addr == htonl(INADDR_ALLHOSTS_GROUP)) #define satosin(sa) ((struct sockaddr_in *)(sa)) #define sintosa(sin) ((struct sockaddr *)(sin)) #define ifatoia(ifa) ((struct in_ifaddr *)(ifa)) #endif /* _KERNEL */ /* INET6 stuff */ #if __POSIX_VISIBLE >= 200112 #define __KAME_NETINET_IN_H_INCLUDED_ #include #undef __KAME_NETINET_IN_H_INCLUDED_ #endif #endif /* !_NETINET_IN_H_*/ diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c index aedfd0bc08c7..b8599143b991 100644 --- a/sys/netinet/in_rmx.c +++ b/sys/netinet/in_rmx.c @@ -1,182 +1,182 @@ /*- * Copyright 1994, 1995 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int rib4_set_nh_pfxflags(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask, struct nhop_object *nh) { const struct sockaddr_in *addr4 = (const struct sockaddr_in *)addr; const struct sockaddr_in *mask4 = (const struct sockaddr_in *)mask; bool is_broadcast = false; if (mask == NULL) { nhop_set_pxtype_flag(nh, NHF_HOST); /* * Backward compatibility: * if the destination is broadcast, * mark route as broadcast. * This behavior was useful when route cloning * was in place, so there was an explicit cloned * route for every broadcasted address. * Currently (2020-04) there is no kernel machinery * to do route cloning, though someone might explicitly * add these routes to support some cases with active-active * load balancing. Given that, retain this support. */ - if (in_broadcast(addr4->sin_addr, nh->nh_ifp)) + if (in_ifnet_broadcast(addr4->sin_addr, nh->nh_ifp)) is_broadcast = true; } else if (mask4->sin_addr.s_addr == 0) nhop_set_pxtype_flag(nh, NHF_DEFAULT); else nhop_set_pxtype_flag(nh, 0); nhop_set_broadcast(nh, is_broadcast); return (0); } static int rib4_augment_nh(u_int fibnum, struct nhop_object *nh) { /* * Check route MTU: * inherit interface MTU if not set or * check if MTU is too large. */ if (nh->nh_mtu == 0) { nh->nh_mtu = nh->nh_ifp->if_mtu; } else if (nh->nh_mtu > nh->nh_ifp->if_mtu) nh->nh_mtu = nh->nh_ifp->if_mtu; /* Set nhop type to basic per-AF nhop */ if (nhop_get_type(nh) == 0) { uint16_t nh_type; if (nh->nh_flags & NHF_GATEWAY) nh_type = NH_TYPE_IPV4_ETHER_NHOP; else nh_type = NH_TYPE_IPV4_ETHER_RSLV; nhop_set_type(nh, nh_type); } return (0); } /* * Initialize our routing tree. */ struct rib_head * in_inithead(uint32_t fibnum) { struct rib_head *rh; rh = rt_table_init(32, AF_INET, fibnum); if (rh == NULL) return (NULL); rh->rnh_set_nh_pfxflags = rib4_set_nh_pfxflags; rh->rnh_augment_nh = rib4_augment_nh; return (rh); } #ifdef VIMAGE void in_detachhead(struct rib_head *rh) { rt_table_destroy(rh); } #endif /* * This zaps old routes when the interface goes down or interface * address is deleted. In the latter case, it deletes static routes * that point to this address. If we don't do this, we may end up * using the old address in the future. The ones we always want to * get rid of are things like ARP entries, since the user might down * the interface, walk over to a completely different network, and * plug back in. */ struct in_ifadown_arg { struct ifaddr *ifa; int del; }; static int in_ifadownkill(const struct rtentry *rt, const struct nhop_object *nh, void *xap) { struct in_ifadown_arg *ap = xap; if (nh->nh_ifa != ap->ifa) return (0); if ((nhop_get_rtflags(nh) & RTF_STATIC) != 0 && ap->del == 0) return (0); return (1); } void in_ifadown(struct ifaddr *ifa, int delete) { struct in_ifadown_arg arg; KASSERT(ifa->ifa_addr->sa_family == AF_INET, ("%s: wrong family", __func__)); arg.ifa = ifa; arg.del = delete; rib_foreach_table_walk_del(AF_INET, in_ifadownkill, &arg); ifa->ifa_flags &= ~IFA_ROUTE; /* XXXlocking? */ } diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c index 6bc76e0be111..5a561814cdb5 100644 --- a/sys/netinet/ip_divert.c +++ b/sys/netinet/ip_divert.c @@ -1,781 +1,781 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #if defined(SCTP) || defined(SCTP_SUPPORT) #include #endif #include /* * Divert sockets */ /* * Allocate enough space to hold a full IP packet */ #define DIVSNDQ (65536 + 100) #define DIVRCVQ (65536 + 100) /* * Usually a system has very few divert ports. Previous implementation * used a linked list. */ #define DIVHASHSIZE (1 << 3) /* 8 entries, one cache line. */ #define DIVHASH(port) (port % DIVHASHSIZE) #define DCBHASH(dcb) ((dcb)->dcb_port % DIVHASHSIZE) /* * Divert sockets work in conjunction with ipfw or other packet filters, * see the divert(4) manpage for features. * Packets are selected by the packet filter and tagged with an * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by * the packet filter) and information on the matching filter rule for * subsequent reinjection. The divert_port is used to put the packet * on the corresponding divert socket, while the rule number is passed * up (at least partially) as the sin_port in the struct sockaddr. * * Packets written to the divert socket carry in sin_addr a * destination address, and in sin_port the number of the filter rule * after which to continue processing. * If the destination address is INADDR_ANY, the packet is treated as * as outgoing and sent to ip_output(); otherwise it is treated as * incoming and sent to ip_input(). * Further, sin_zero carries some information on the interface, * which can be used in the reinject -- see comments in the code. * * On reinjection, processing in ip_input() and ip_output() * will be exactly the same as for the original packet, except that * packet filter processing will start at the rule number after the one * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0 * will apply the entire ruleset to the packet). */ static SYSCTL_NODE(_net_inet, OID_AUTO, divert, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "divert(4)"); VNET_PCPUSTAT_DEFINE_STATIC(struct divstat, divstat); VNET_PCPUSTAT_SYSINIT(divstat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(divstat); #endif SYSCTL_VNET_PCPUSTAT(_net_inet_divert, OID_AUTO, stats, struct divstat, divstat, "divert(4) socket statistics"); #define DIVSTAT_INC(name) \ VNET_PCPUSTAT_ADD(struct divstat, divstat, div_ ## name, 1) static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ static int div_output_inbound(int fmaily, struct socket *so, struct mbuf *m, struct sockaddr_in *sin); static int div_output_outbound(int family, struct socket *so, struct mbuf *m); struct divcb { union { SLIST_ENTRY(divcb) dcb_next; intptr_t dcb_bound; #define DCB_UNBOUND ((intptr_t)-1) }; struct socket *dcb_socket; uint16_t dcb_port; uint64_t dcb_gencnt; struct epoch_context dcb_epochctx; }; SLIST_HEAD(divhashhead, divcb); VNET_DEFINE_STATIC(struct divhashhead, divhash[DIVHASHSIZE]) = {}; #define V_divhash VNET(divhash) VNET_DEFINE_STATIC(uint64_t, dcb_count) = 0; #define V_dcb_count VNET(dcb_count) VNET_DEFINE_STATIC(uint64_t, dcb_gencnt) = 0; #define V_dcb_gencnt VNET(dcb_gencnt) static struct mtx divert_mtx; MTX_SYSINIT(divert, &divert_mtx, "divert(4) socket pcb lists", MTX_DEF); #define DIVERT_LOCK() mtx_lock(&divert_mtx) #define DIVERT_UNLOCK() mtx_unlock(&divert_mtx) /* * Divert a packet by passing it up to the divert socket at port 'port'. */ static void divert_packet(struct mbuf *m, bool incoming) { struct divcb *dcb; u_int16_t nport; struct sockaddr_in divsrc; struct m_tag *mtag; uint16_t cookie; NET_EPOCH_ASSERT(); mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag != NULL) { cookie = ((struct ipfw_rule_ref *)(mtag+1))->rulenum; nport = htons((uint16_t) (((struct ipfw_rule_ref *)(mtag+1))->info)); } else if ((mtag = m_tag_locate(m, MTAG_PF_DIVERT, 0, NULL)) != NULL) { cookie = ((struct pf_divert_mtag *)(mtag+1))->idir; nport = htons(((struct pf_divert_mtag *)(mtag+1))->port); } else { m_freem(m); return; } /* Assure header */ if (m->m_len < sizeof(struct ip) && (m = m_pullup(m, sizeof(struct ip))) == NULL) return; #ifdef INET /* Delayed checksums are currently not compatible with divert. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP) { struct ip *ip; ip = mtod(m, struct ip *); sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif #endif #ifdef INET6 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { in6_delayed_cksum(m, m->m_pkthdr.len - sizeof(struct ip6_hdr), sizeof(struct ip6_hdr)); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #if defined(SCTP) || defined(SCTP_SUPPORT) if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { sctp_delayed_cksum(m, sizeof(struct ip6_hdr)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } #endif #endif /* INET6 */ bzero(&divsrc, sizeof(divsrc)); divsrc.sin_len = sizeof(divsrc); divsrc.sin_family = AF_INET; /* record matching rule, in host format */ divsrc.sin_port = cookie; /* * Record receive interface address, if any. * But only for incoming packets. */ if (incoming) { struct ifaddr *ifa; struct ifnet *ifp; /* Sanity check */ M_ASSERTPKTHDR(m); /* Find IP address for receive interface */ ifp = m->m_pkthdr.rcvif; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; break; } } /* * Record the incoming interface name whenever we have one. */ if (m->m_pkthdr.rcvif) { /* * Hide the actual interface name in there in the * sin_zero array. XXX This needs to be moved to a * different sockaddr type for divert, e.g. * sockaddr_div with multiple fields like * sockaddr_dl. Presently we have only 7 bytes * but that will do for now as most interfaces * are 4 or less + 2 or less bytes for unit. * There is probably a faster way of doing this, * possibly taking it from the sockaddr_dl on the iface. * This solves the problem of a P2P link and a LAN interface * having the same address, which can result in the wrong * interface being assigned to the packet when fed back * into the divert socket. Theoretically if the daemon saves * and re-uses the sockaddr_in as suggested in the man pages, * this iface name will come along for the ride. * (see div_output for the other half of this.) */ strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, sizeof(divsrc.sin_zero)); } /* Put packet on socket queue, if any */ SLIST_FOREACH(dcb, &V_divhash[DIVHASH(nport)], dcb_next) if (dcb->dcb_port == nport) break; if (dcb != NULL) { struct socket *sa = dcb->dcb_socket; SOCKBUF_LOCK(&sa->so_rcv); if (sbappendaddr_locked(&sa->so_rcv, (struct sockaddr *)&divsrc, m, NULL) == 0) { soroverflow_locked(sa); m_freem(m); } else { sorwakeup_locked(sa); DIVSTAT_INC(diverted); } } else { DIVSTAT_INC(noport); m_freem(m); } } /* * Deliver packet back into the IP processing machinery. * * If no address specified, or address is 0.0.0.0, send to ip_output(); * otherwise, send to ip_input() and mark as having been received on * the interface with that address. */ static int div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct epoch_tracker et; struct sockaddr_in *sin = (struct sockaddr_in *)nam; const struct ip *ip; struct m_tag *mtag; struct ipfw_rule_ref *dt; struct pf_divert_mtag *pfdt; int error, family; if (control) m_freem(control); /* Packet must have a header (but that's about it) */ if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { m_freem(m); return (EINVAL); } if (sin != NULL) { if (sin->sin_family != AF_INET) { m_freem(m); return (EAFNOSUPPORT); } if (sin->sin_len != sizeof(*sin)) { m_freem(m); return (EINVAL); } } /* * An mbuf may hasn't come from userland, but we pretend * that it has. */ m->m_pkthdr.rcvif = NULL; m->m_nextpkt = NULL; M_SETFIB(m, so->so_fibnum); mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { /* this should be normal */ mtag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); if (mtag == NULL) { m_freem(m); return (ENOBUFS); } m_tag_prepend(m, mtag); } dt = (struct ipfw_rule_ref *)(mtag+1); /* Loopback avoidance and state recovery */ if (sin) { int i; /* set the starting point. We provide a non-zero slot, * but a non_matching chain_id to skip that info and use * the rulenum/rule_id. */ dt->slot = 1; /* dummy, chain_id is invalid */ dt->chain_id = 0; dt->rulenum = sin->sin_port+1; /* host format ? */ dt->rule_id = 0; /* XXX: broken for IPv6 */ /* * Find receive interface with the given name, stuffed * (if it exists) in the sin_zero[] field. * The name is user supplied data so don't trust its size * or that it is zero terminated. */ for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) ; if ( i > 0 && i < sizeof(sin->sin_zero)) m->m_pkthdr.rcvif = ifunit(sin->sin_zero); } ip = mtod(m, struct ip *); switch (ip->ip_v) { #ifdef INET case IPVERSION: family = AF_INET; break; #endif #ifdef INET6 case IPV6_VERSION >> 4: family = AF_INET6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } mtag = m_tag_locate(m, MTAG_PF_DIVERT, 0, NULL); if (mtag == NULL) { /* this should be normal */ mtag = m_tag_alloc(MTAG_PF_DIVERT, 0, sizeof(struct pf_divert_mtag), M_NOWAIT | M_ZERO); if (mtag == NULL) { m_freem(m); return (ENOBUFS); } m_tag_prepend(m, mtag); } pfdt = (struct pf_divert_mtag *)(mtag+1); if (sin) pfdt->idir = sin->sin_port; /* Reinject packet into the system as incoming or outgoing */ NET_EPOCH_ENTER(et); if (!sin || sin->sin_addr.s_addr == 0) { dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT; pfdt->ndir = PF_DIVERT_MTAG_DIR_OUT; error = div_output_outbound(family, so, m); } else { dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN; pfdt->ndir = PF_DIVERT_MTAG_DIR_IN; error = div_output_inbound(family, so, m, sin); } NET_EPOCH_EXIT(et); return (error); } /* * Sends mbuf @m to the wire via ip[6]_output(). * * Returns 0 on success or an errno value on failure. @m is always consumed. */ static int div_output_outbound(int family, struct socket *so, struct mbuf *m) { int error; switch (family) { #ifdef INET case AF_INET: { struct ip *const ip = mtod(m, struct ip *); /* Don't allow packet length sizes that will crash. */ if (((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { m_freem(m); return (EINVAL); } break; } #endif #ifdef INET6 case AF_INET6: { struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *); /* Don't allow packet length sizes that will crash */ if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) { m_freem(m); return (EINVAL); } break; } #endif } #ifdef MAC mac_socket_create_mbuf(so, m); #endif error = 0; switch (family) { #ifdef INET case AF_INET: error = ip_output(m, NULL, NULL, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL); break; #endif #ifdef INET6 case AF_INET6: error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); break; #endif } if (error == 0) DIVSTAT_INC(outbound); return (error); } /* * Schedules mbuf @m for local processing via IPv4/IPv6 netisr queue. * * Returns 0 on success or an errno value on failure. @m is always consumed. */ static int div_output_inbound(int family, struct socket *so, struct mbuf *m, struct sockaddr_in *sin) { struct ifaddr *ifa; if (m->m_pkthdr.rcvif == NULL) { /* * No luck with the name, check by IP address. * Clear the port and the ifname to make sure * there are no distractions for ifa_ifwithaddr. */ /* XXX: broken for IPv6 */ bzero(sin->sin_zero, sizeof(sin->sin_zero)); sin->sin_port = 0; ifa = ifa_ifwithaddr((struct sockaddr *) sin); if (ifa == NULL) { m_freem(m); return (EADDRNOTAVAIL); } m->m_pkthdr.rcvif = ifa->ifa_ifp; } #ifdef MAC mac_socket_create_mbuf(so, m); #endif /* Send packet to input processing via netisr */ switch (family) { #ifdef INET case AF_INET: { const struct ip *ip; ip = mtod(m, struct ip *); /* * Restore M_BCAST flag when destination address is * broadcast. It is expected by ip_tryforward(). */ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) m->m_flags |= M_MCAST; - else if (in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) + else if (in_ifnet_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) m->m_flags |= M_BCAST; netisr_queue_src(NETISR_IP, (uintptr_t)so, m); DIVSTAT_INC(inbound); break; } #endif #ifdef INET6 case AF_INET6: netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m); DIVSTAT_INC(inbound); break; #endif default: m_freem(m); return (EINVAL); } return (0); } static int div_attach(struct socket *so, int proto, struct thread *td) { struct divcb *dcb; int error; if (td != NULL) { error = priv_check(td, PRIV_NETINET_DIVERT); if (error) return (error); } error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; dcb = malloc(sizeof(*dcb), M_PCB, M_WAITOK); dcb->dcb_bound = DCB_UNBOUND; dcb->dcb_socket = so; DIVERT_LOCK(); V_dcb_count++; dcb->dcb_gencnt = ++V_dcb_gencnt; DIVERT_UNLOCK(); so->so_pcb = dcb; return (0); } static void div_free(epoch_context_t ctx) { struct divcb *dcb = __containerof(ctx, struct divcb, dcb_epochctx); free(dcb, M_PCB); } static void div_detach(struct socket *so) { struct divcb *dcb = so->so_pcb; so->so_pcb = NULL; DIVERT_LOCK(); if (dcb->dcb_bound != DCB_UNBOUND) SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next); V_dcb_count--; V_dcb_gencnt++; DIVERT_UNLOCK(); NET_EPOCH_CALL(div_free, &dcb->dcb_epochctx); } static int div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct divcb *dcb; uint16_t port; if (nam->sa_family != AF_INET) return EAFNOSUPPORT; if (nam->sa_len != sizeof(struct sockaddr_in)) return EINVAL; port = ((struct sockaddr_in *)nam)->sin_port; DIVERT_LOCK(); SLIST_FOREACH(dcb, &V_divhash[DIVHASH(port)], dcb_next) if (dcb->dcb_port == port) { DIVERT_UNLOCK(); return (EADDRINUSE); } dcb = so->so_pcb; if (dcb->dcb_bound != DCB_UNBOUND) SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next); dcb->dcb_port = port; SLIST_INSERT_HEAD(&V_divhash[DIVHASH(port)], dcb, dcb_next); DIVERT_UNLOCK(); return (0); } static int div_pcblist(SYSCTL_HANDLER_ARGS) { struct xinpgen xig; struct divcb *dcb; int error; if (req->newptr != 0) return EPERM; if (req->oldptr == 0) { u_int n; n = V_dcb_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return 0; } if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; xig.xig_count = V_dcb_count; xig.xig_gen = V_dcb_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; DIVERT_LOCK(); for (int i = 0; i < DIVHASHSIZE; i++) SLIST_FOREACH(dcb, &V_divhash[i], dcb_next) { if (dcb->dcb_gencnt <= xig.xig_gen) { struct xinpcb xi; bzero(&xi, sizeof(xi)); xi.xi_len = sizeof(struct xinpcb); sotoxsocket(dcb->dcb_socket, &xi.xi_socket); xi.inp_gencnt = dcb->dcb_gencnt; xi.inp_vflag = INP_IPV4; /* XXX: netstat(1) */ xi.inp_inc.inc_ie.ie_lport = dcb->dcb_port; error = SYSCTL_OUT(req, &xi, sizeof xi); if (error) goto errout; } } /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. */ xig.xig_gen = V_dcb_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_dcb_count; error = SYSCTL_OUT(req, &xig, sizeof xig); errout: DIVERT_UNLOCK(); return (error); } SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, div_pcblist, "S,xinpcb", "List of active divert sockets"); static struct protosw div_protosw = { .pr_type = SOCK_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_attach = div_attach, .pr_bind = div_bind, .pr_detach = div_detach, .pr_send = div_send, }; static struct domain divertdomain = { .dom_family = PF_DIVERT, .dom_name = "divert", .dom_nprotosw = 1, .dom_protosw = { &div_protosw }, }; static int div_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: domain_add(&divertdomain); ip_divert_ptr = divert_packet; break; case MOD_QUIESCE: /* * IPDIVERT may normally not be unloaded because of the * potential race conditions. Tell kldunload we can't be * unloaded unless the unload is forced. */ err = EPERM; break; case MOD_UNLOAD: /* * Forced unload. * * Module ipdivert can only be unloaded if no sockets are * connected. Maybe this can be changed later to forcefully * disconnect any open sockets. * * XXXRW: Note that there is a slight race here, as a new * socket open request could be spinning on the lock and then * we destroy the lock. * * XXXGL: One more reason this code is incorrect is that it * checks only the current vnet. */ DIVERT_LOCK(); if (V_dcb_count != 0) { DIVERT_UNLOCK(); err = EBUSY; break; } DIVERT_UNLOCK(); ip_divert_ptr = NULL; domain_remove(&divertdomain); break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipdivertmod = { "ipdivert", div_modevent, 0 }; DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY); MODULE_VERSION(ipdivert, 1); diff --git a/sys/netinet/sctp_os_bsd.h b/sys/netinet/sctp_os_bsd.h index eb0caec942e9..9cec02aa6a07 100644 --- a/sys/netinet/sctp_os_bsd.h +++ b/sys/netinet/sctp_os_bsd.h @@ -1,484 +1,484 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2006-2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _NETINET_SCTP_OS_BSD_H_ #define _NETINET_SCTP_OS_BSD_H_ #include #include "opt_inet6.h" #include "opt_inet.h" #include "opt_sctp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #endif /* INET6 */ #include #include #include /* Declare all the malloc names for all the various mallocs */ MALLOC_DECLARE(SCTP_M_MAP); MALLOC_DECLARE(SCTP_M_STRMI); MALLOC_DECLARE(SCTP_M_STRMO); MALLOC_DECLARE(SCTP_M_ASC_ADDR); MALLOC_DECLARE(SCTP_M_ASC_IT); MALLOC_DECLARE(SCTP_M_AUTH_CL); MALLOC_DECLARE(SCTP_M_AUTH_KY); MALLOC_DECLARE(SCTP_M_AUTH_HL); MALLOC_DECLARE(SCTP_M_AUTH_IF); MALLOC_DECLARE(SCTP_M_STRESET); MALLOC_DECLARE(SCTP_M_CMSG); MALLOC_DECLARE(SCTP_M_COPYAL); MALLOC_DECLARE(SCTP_M_VRF); MALLOC_DECLARE(SCTP_M_IFA); MALLOC_DECLARE(SCTP_M_IFN); MALLOC_DECLARE(SCTP_M_TIMW); MALLOC_DECLARE(SCTP_M_MVRF); MALLOC_DECLARE(SCTP_M_ITER); MALLOC_DECLARE(SCTP_M_SOCKOPT); MALLOC_DECLARE(SCTP_M_MCORE); #if defined(SCTP_LOCAL_TRACE_BUF) #define SCTP_GET_CYCLECOUNT get_cyclecount() #define SCTP_CTR6 sctp_log_trace #else #define SCTP_CTR6 CTR6 #endif /* * Macros to expand out globals defined by various modules * to either a real global or a virtualized instance of one, * depending on whether VIMAGE is defined. */ /* then define the macro(s) that hook into the vimage macros */ #define MODULE_GLOBAL(__SYMBOL) V_##__SYMBOL #define V_system_base_info VNET(system_base_info) #define SCTP_BASE_INFO(__m) V_system_base_info.sctppcbinfo.__m #define SCTP_BASE_STATS V_system_base_info.sctpstat #define SCTP_BASE_STAT(__m) V_system_base_info.sctpstat.__m #define SCTP_BASE_SYSCTL(__m) V_system_base_info.sctpsysctl.__m #define SCTP_BASE_VAR(__m) V_system_base_info.__m #define SCTP_PRINTF(params...) printf(params) #if defined(SCTP_DEBUG) #define SCTPDBG(level, params...) \ { \ do { \ if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \ SCTP_PRINTF(params); \ } \ } while (0); \ } #define SCTPDBG_ADDR(level, addr) \ { \ do { \ if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) { \ sctp_print_address(addr); \ } \ } while (0); \ } #else #define SCTPDBG(level, params...) #define SCTPDBG_ADDR(level, addr) #endif #ifdef SCTP_LTRACE_CHUNKS #define SCTP_LTRACE_CHK(a, b, c, d) if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_CHUNK_ENABLE) SCTP_CTR6(KTR_SUBSYS, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_CHUNK_PROC, 0, a, b, c, d) #else #define SCTP_LTRACE_CHK(a, b, c, d) #endif #ifdef SCTP_LTRACE_ERRORS #define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err) \ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \ SCTP_PRINTF("mbuf:%p inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \ m, inp, stcb, net, file, __LINE__, err); #define SCTP_LTRACE_ERR_RET(inp, stcb, net, file, err) \ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \ SCTP_PRINTF("inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \ inp, stcb, net, file, __LINE__, err); #else #define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err) #define SCTP_LTRACE_ERR_RET(inp, stcb, net, file, err) #endif /* * Local address and interface list handling */ #define SCTP_MAX_VRF_ID 0 #define SCTP_SIZE_OF_VRF_HASH 3 #define SCTP_IFNAMSIZ IFNAMSIZ #define SCTP_DEFAULT_VRFID 0 #define SCTP_VRF_ADDR_HASH_SIZE 16 #define SCTP_VRF_IFN_HASH_SIZE 3 #define SCTP_INIT_VRF_TABLEID(vrf) #define SCTP_IFN_IS_IFT_LOOP(ifn) ((ifn)->ifn_type == IFT_LOOP) #define SCTP_ROUTE_IS_REAL_LOOP(ro) ((ro)->ro_nh && (ro)->ro_nh->nh_ifa && (ro)->ro_nh->nh_ifa->ifa_ifp && (ro)->ro_nh->nh_ifa->ifa_ifp->if_type == IFT_LOOP) /* * Access to IFN's to help with src-addr-selection */ /* This could return VOID if the index works but for BSD we provide both. */ #define SCTP_GET_IFN_VOID_FROM_ROUTE(ro) (void *)ro->ro_nh->nh_ifp #define SCTP_GET_IF_INDEX_FROM_ROUTE(ro) (ro)->ro_nh->nh_ifp->if_index #define SCTP_ROUTE_HAS_VALID_IFN(ro) ((ro)->ro_nh && (ro)->ro_nh->nh_ifp) /* * general memory allocation */ #define SCTP_MALLOC(var, type, size, name) \ do { \ var = (type)malloc(size, name, M_NOWAIT); \ } while (0) #define SCTP_FREE(var, type) free(var, type) #define SCTP_PROCESS_STRUCT struct proc * /* * zone allocation functions */ #include /* SCTP_ZONE_INIT: initialize the zone */ typedef struct uma_zone *sctp_zone_t; #define SCTP_ZONE_INIT(zone, name, size, number) { \ zone = uma_zcreate(name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,\ 0); \ uma_zone_set_max(zone, number); \ } #define SCTP_ZONE_DESTROY(zone) uma_zdestroy(zone) /* SCTP_ZONE_GET: allocate element from the zone */ #define SCTP_ZONE_GET(zone, type) \ (type *)uma_zalloc(zone, M_NOWAIT); /* SCTP_ZONE_FREE: free element from the zone */ #define SCTP_ZONE_FREE(zone, element) \ uma_zfree(zone, element); #define SCTP_HASH_INIT(size, hashmark) hashinit_flags(size, M_PCB, hashmark, HASH_NOWAIT) #define SCTP_HASH_FREE(table, hashmark) hashdestroy(table, M_PCB, hashmark) #define SCTP_M_COPYM m_copym /* * timers */ #include typedef struct callout sctp_os_timer_t; #define SCTP_OS_TIMER_INIT(tmr) callout_init(tmr, 1) /* * NOTE: The next two shouldn't be called directly outside of sctp_timer_start() * and sctp_timer_stop(), since they don't handle incrementing/decrementing * relevant reference counts. */ #define SCTP_OS_TIMER_START callout_reset #define SCTP_OS_TIMER_STOP callout_stop #define SCTP_OS_TIMER_STOP_DRAIN callout_drain #define SCTP_OS_TIMER_PENDING callout_pending #define SCTP_OS_TIMER_ACTIVE callout_active #define SCTP_OS_TIMER_DEACTIVATE callout_deactivate #define sctp_get_tick_count() (ticks) #define SCTP_UNUSED __attribute__((unused)) /* * Functions */ /* Mbuf manipulation and access macros */ #define SCTP_BUF_LEN(m) (m->m_len) #define SCTP_BUF_NEXT(m) (m->m_next) #define SCTP_BUF_NEXT_PKT(m) (m->m_nextpkt) #define SCTP_BUF_RESV_UF(m, size) m->m_data += size #define SCTP_BUF_AT(m, size) m->m_data + size #define SCTP_BUF_IS_EXTENDED(m) (m->m_flags & M_EXT) #define SCTP_BUF_SIZE M_SIZE #define SCTP_BUF_TYPE(m) (m->m_type) #define SCTP_BUF_RECVIF(m) (m->m_pkthdr.rcvif) #define SCTP_BUF_PREPEND M_PREPEND #define SCTP_ALIGN_TO_END(m, len) M_ALIGN(m, len) #define SCTP_SNPRINTF(...) snprintf(__VA_ARGS__) /* We make it so if you have up to 4 threads * writing based on the default size of * the packet log 65 k, that would be * 4 16k packets before we would hit * a problem. */ #define SCTP_PKTLOG_WRITERS_NEED_LOCK 3 /*************************/ /* MTU */ /*************************/ #define SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index) ((ifn != NULL) ? ((struct ifnet *)ifn)->if_mtu : 0) #define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, nh) ((uint32_t)((nh != NULL) ? nh->nh_mtu : 0)) /*************************/ /* These are for logging */ /*************************/ /* return the base ext data pointer */ #define SCTP_BUF_EXTEND_BASE(m) (m->m_ext.ext_buf) /* return the refcnt of the data pointer */ #define SCTP_BUF_EXTEND_REFCNT(m) (*m->m_ext.ext_cnt) /* return any buffer related flags, this is * used beyond logging for apple only. */ #define SCTP_BUF_GET_FLAGS(m) (m->m_flags) /* For BSD this just accesses the M_PKTHDR length * so it operates on an mbuf with hdr flag. Other * O/S's may have separate packet header and mbuf * chain pointers.. thus the macro. */ #define SCTP_HEADER_TO_CHAIN(m) (m) #define SCTP_DETACH_HEADER_FROM_CHAIN(m) #define SCTP_HEADER_LEN(m) ((m)->m_pkthdr.len) #define SCTP_GET_HEADER_FOR_OUTPUT(o_pak) 0 #define SCTP_RELEASE_HEADER(m) #define SCTP_RELEASE_PKT(m) sctp_m_freem(m) #define SCTP_ENABLE_UDP_CSUM(m) do { \ m->m_pkthdr.csum_flags = CSUM_UDP; \ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); \ } while (0) #define SCTP_GET_PKT_VRFID(m, vrf_id) ((vrf_id = SCTP_DEFAULT_VRFID) != SCTP_DEFAULT_VRFID) /* Attach the chain of data into the sendable packet. */ #define SCTP_ATTACH_CHAIN(pak, m, packet_length) do { \ pak = m; \ pak->m_pkthdr.len = packet_length; \ } while(0) /* Other m_pkthdr type things */ -#define SCTP_IS_IT_BROADCAST(dst, m) ((m->m_flags & M_PKTHDR) ? in_broadcast(dst, m->m_pkthdr.rcvif) : 0) +#define SCTP_IS_IT_BROADCAST(dst, m) ((m->m_flags & M_PKTHDR) ? in_ifnet_broadcast(dst, m->m_pkthdr.rcvif) : 0) #define SCTP_IS_IT_LOOPBACK(m) ((m->m_flags & M_PKTHDR) && ((m->m_pkthdr.rcvif == NULL) || (m->m_pkthdr.rcvif->if_type == IFT_LOOP))) /* This converts any input packet header * into the chain of data holders, for BSD * its a NOP. */ /* get the v6 hop limit */ #define SCTP_GET_HLIM(inp, ro) in6_selecthlim(&inp->ip_inp.inp, (ro ? (ro->ro_nh ? (ro->ro_nh->nh_ifp) : (NULL)) : (NULL))); /* is the endpoint v6only? */ #define SCTP_IPV6_V6ONLY(sctp_inpcb) ((sctp_inpcb)->ip_inp.inp.inp_flags & IN6P_IPV6_V6ONLY) /* is the socket non-blocking? */ #define SCTP_SO_IS_NBIO(so) ((so)->so_state & SS_NBIO) #define SCTP_SET_SO_NBIO(so) ((so)->so_state |= SS_NBIO) #define SCTP_CLEAR_SO_NBIO(so) ((so)->so_state &= ~SS_NBIO) /* get the socket type */ #define SCTP_SO_TYPE(so) ((so)->so_type) /* reserve sb space for a socket */ #define SCTP_SORESERVE(so, send, recv) soreserve(so, send, recv) /* wakeup a socket */ #define SCTP_SOWAKEUP(so) wakeup(&(so)->so_timeo) /* number of bytes ready to read */ #define SCTP_SBAVAIL(sb) sbavail(sb) /* clear the socket buffer state */ #define SCTP_SB_INCR(sb, incr) \ { \ atomic_add_int(&(sb)->sb_acc, incr); \ atomic_add_int(&(sb)->sb_ccc, incr); \ } #define SCTP_SB_DECR(sb, decr) \ { \ SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_acc, decr); \ SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_ccc, decr); \ } #define SCTP_SB_CLEAR(sb) \ (sb).sb_acc = 0; \ (sb).sb_ccc = 0; \ (sb).sb_mb = NULL; \ (sb).sb_mbcnt = 0; #define SCTP_SB_LIMIT_RCV(so) (SOLISTENING(so) ? so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat) #define SCTP_SB_LIMIT_SND(so) (SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat) /* * routes, output, etc. */ typedef struct route sctp_route_t; #define SCTP_RTALLOC(ro, vrf_id, fibnum) \ { \ if ((ro)->ro_nh == NULL) { \ (ro)->ro_nh = rib_lookup(fibnum, &(ro)->ro_dst, NHR_REF, 0); \ } \ } /* * SCTP protocol specific mbuf flags. */ #define M_NOTIFICATION M_PROTO1 /* SCTP notification */ /* * IP output routines */ #define SCTP_IP_OUTPUT(result, o_pak, ro, _inp, vrf_id) \ { \ struct sctp_inpcb *local_inp = _inp; \ int o_flgs = IP_RAWOUTPUT; \ \ m_clrprotoflags(o_pak); \ if ((local_inp != NULL) && (local_inp->sctp_socket != NULL)) { \ o_flgs |= local_inp->sctp_socket->so_options & SO_DONTROUTE; \ } \ result = ip_output(o_pak, NULL, ro, o_flgs, 0, NULL); \ } #define SCTP_IP6_OUTPUT(result, o_pak, ro, ifp, _inp, vrf_id) \ { \ struct sctp_inpcb *local_inp = _inp; \ \ m_clrprotoflags(o_pak); \ if (local_inp != NULL) { \ INP_RLOCK(&local_inp->ip_inp.inp); \ result = ip6_output(o_pak, \ local_inp->ip_inp.inp.in6p_outputopts, \ (ro), 0, 0, ifp, NULL); \ INP_RUNLOCK(&local_inp->ip_inp.inp); \ } else { \ result = ip6_output(o_pak, NULL, (ro), 0, 0, ifp, NULL); \ } \ } struct mbuf * sctp_get_mbuf_for_msg(unsigned int space_needed, int want_header, int how, int allonebuf, int type); /* * SCTP AUTH */ #define SCTP_READ_RANDOM(buf, len) arc4rand(buf, len, 0) /* map standard crypto API names */ #define SCTP_SHA1_CTX SHA1_CTX #define SCTP_SHA1_INIT SHA1Init #define SCTP_SHA1_UPDATE SHA1Update #define SCTP_SHA1_FINAL(x,y) SHA1Final((caddr_t)x, y) #define SCTP_SHA256_CTX SHA256_CTX #define SCTP_SHA256_INIT SHA256_Init #define SCTP_SHA256_UPDATE SHA256_Update #define SCTP_SHA256_FINAL(x,y) SHA256_Final((caddr_t)x, y) #define SCTP_DECREMENT_AND_CHECK_REFCOUNT(addr) (atomic_fetchadd_int(addr, -1) == 1) #if defined(INVARIANTS) #define SCTP_SAVE_ATOMIC_DECREMENT(addr, val) \ { \ int32_t oldval; \ oldval = atomic_fetchadd_int(addr, -val); \ if (oldval < val) { \ panic("Counter goes negative"); \ } \ } #else #define SCTP_SAVE_ATOMIC_DECREMENT(addr, val) \ { \ int32_t oldval; \ oldval = atomic_fetchadd_int(addr, -val); \ if (oldval < val) { \ *addr = 0; \ } \ } #endif #define SCTP_IS_LISTENING(inp) ((inp->sctp_flags & SCTP_PCB_FLAGS_ACCEPTING) != 0) int sctp_syscalls_init(void); int sctp_syscalls_uninit(void); #endif diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 12dc4670f531..e21043fac0cf 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -1,4244 +1,4244 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007-2008,2010 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, * James Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_rss.h" #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #include #ifdef TCP_OFFLOAD #include #endif #include #include #include #include #include const int tcprexmtthresh = 3; VNET_DEFINE(int, tcp_log_in_vain) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_log_in_vain), 0, "Log all incoming TCP segments to closed ports"); VNET_DEFINE(int, tcp_bind_all_fibs) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, bind_all_fibs, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_bind_all_fibs), 0, "Bound sockets receive traffic from all FIBs"); VNET_DEFINE(int, blackhole) = 0; #define V_blackhole VNET(blackhole) SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(blackhole), 0, "Do not send RST on segments to closed ports"); VNET_DEFINE(bool, blackhole_local) = false; #define V_blackhole_local VNET(blackhole_local) SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, blackhole_local, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(blackhole_local), false, "Enforce net.inet.tcp.blackhole for locally originated packets"); VNET_DEFINE(int, tcp_delack_enabled) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_delack_enabled), 0, "Delay ACK to try and piggyback it onto a data packet"); VNET_DEFINE(int, drop_synfin) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(drop_synfin), 0, "Drop TCP packets with SYN+FIN set"); VNET_DEFINE(int, tcp_do_prr) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_prr), 1, "Enable Proportional Rate Reduction per RFC 6937"); VNET_DEFINE(int, tcp_do_newcwv) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_newcwv), 0, "Enable New Congestion Window Validation per RFC7661"); VNET_DEFINE(int, tcp_do_rfc3042) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3042), 0, "Enable RFC 3042 (Limited Transmit)"); VNET_DEFINE(int, tcp_do_rfc3390) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3390), 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); VNET_DEFINE(int, tcp_initcwnd_segments) = 10; SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0, "Slow-start flight size (initial congestion window) in number of segments"); VNET_DEFINE(int, tcp_do_rfc3465) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc3465), 0, "Enable RFC 3465 (Appropriate Byte Counting)"); VNET_DEFINE(int, tcp_abc_l_var) = 2; SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_abc_l_var), 2, "Cap the max cwnd increment during slow-start to this number of segments"); VNET_DEFINE(int, tcp_insecure_syn) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_syn), 0, "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets"); VNET_DEFINE(int, tcp_insecure_rst) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_rst), 0, "Follow RFC793 instead of RFC5961 criteria for accepting RST packets"); VNET_DEFINE(int, tcp_insecure_ack) = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_ack, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_insecure_ack), 0, "Follow RFC793 criteria for validating SEG.ACK"); VNET_DEFINE(int, tcp_recvspace) = 1024*64; #define V_tcp_recvspace VNET(tcp_recvspace) SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size"); VNET_DEFINE(int, tcp_do_autorcvbuf) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_autorcvbuf), 0, "Enable automatic receive buffer sizing"); VNET_DEFINE(int, tcp_autorcvbuf_max) = 2*1024*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); VNET_DEFINE(struct inpcbinfo, tcbinfo); /* * TCP statistics are stored in an array of counter(9)s, which size matches * size of struct tcpstat. TCP running connection count is a regular array. */ VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat, tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]); SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD | CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES, "TCP connection counts by TCP state"); /* * Kernel module interface for updating tcpstat. The first argument is an index * into tcpstat treated as an array. */ void kmod_tcpstat_add(int statnum, int val) { counter_u64_add(VNET(tcpstat)[statnum], val); } /* * Make sure that we only start a SACK loss recovery when * receiving a duplicate ACK with a SACK block, and also * complete SACK loss recovery in case the other end * reneges. */ static bool inline tcp_is_sack_recovery(struct tcpcb *tp, struct tcpopt *to) { return ((tp->t_flags & TF_SACK_PERMIT) && ((to->to_flags & TOF_SACK) || (!TAILQ_EMPTY(&tp->snd_holes)))); } #ifdef TCP_HHOOK /* * Wrapper for the TCP established input helper hook. */ void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) { struct tcp_hhook_data hhook_data; if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) { hhook_data.tp = tp; hhook_data.th = th; hhook_data.to = to; hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data, &tp->t_osd); } } #endif /* * CC wrapper hook functions */ void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type) { #ifdef STATS int32_t gput; #endif INP_WLOCK_ASSERT(tptoinpcb(tp)); tp->t_ccv.nsegs = nsegs; tp->t_ccv.bytes_this_ack = BYTES_THIS_ACK(tp, th); if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) || (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) && (tp->snd_cwnd < (tcp_compute_pipe(tp) * 2)))) tp->t_ccv.flags |= CCF_CWND_LIMITED; else tp->t_ccv.flags &= ~CCF_CWND_LIMITED; if (type == CC_ACK) { #ifdef STATS stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, ((int32_t)tp->snd_cwnd) - tp->snd_wnd); if (!IN_RECOVERY(tp->t_flags)) stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN, tp->t_ccv.bytes_this_ack / (tcp_maxseg(tp) * nsegs)); if ((tp->t_flags & TF_GPUTINPROG) && SEQ_GEQ(th->th_ack, tp->gput_ack)) { /* * Compute goodput in bits per millisecond. */ gput = (((int64_t)SEQ_SUB(th->th_ack, tp->gput_seq)) << 3) / max(1, tcp_ts_getticks() - tp->gput_ts); stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, gput); /* * XXXLAS: This is a temporary hack, and should be * chained off VOI_TCP_GPUT when stats(9) grows an API * to deal with chained VOIs. */ if (tp->t_stats_gput_prev > 0) stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_GPUT_ND, ((gput - tp->t_stats_gput_prev) * 100) / tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = gput; } #endif /* STATS */ if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += tp->t_ccv.bytes_this_ack; if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->t_ccv.flags |= CCF_ABC_SENTAWND; } } else { tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; tp->t_bytes_acked = 0; } } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ tp->t_ccv.curack = th->th_ack; CC_ALGO(tp)->ack_received(&tp->t_ccv, type); } #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); #endif } void cc_conn_init(struct tcpcb *tp) { struct hc_metrics_lite metrics; struct inpcb *inp = tptoinpcb(tp); u_int maxseg; int rtt; INP_WLOCK_ASSERT(inp); tcp_hc_get(&inp->inp_inc, &metrics); maxseg = tcp_maxseg(tp); if (tp->t_srtt == 0 && (rtt = metrics.hc_rtt)) { tp->t_srtt = rtt; TCPSTAT_INC(tcps_usedrtt); if (metrics.hc_rttvar) { tp->t_rttvar = metrics.hc_rttvar; TCPSTAT_INC(tcps_usedrttvar); } else { /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, tp->t_rttmin, TCPTV_REXMTMAX); } if (metrics.hc_ssthresh) { /* * There's some sort of gateway or interface * buffer limit on the path. Use this to set * the slow start threshold, but set the * threshold to no less than 2*mss. */ tp->snd_ssthresh = max(2 * maxseg, metrics.hc_ssthresh); TCPSTAT_INC(tcps_usedssthresh); } /* * Set the initial slow-start flight size. * * If a SYN or SYN/ACK was lost and retransmitted, we have to * reduce the initial CWND to one segment as congestion is likely * requiring us to be cautious. */ if (tp->snd_cwnd == 1) tp->snd_cwnd = maxseg; /* SYN(-ACK) lost */ else tp->snd_cwnd = tcp_compute_initwnd(maxseg); if (CC_ALGO(tp)->conn_init != NULL) CC_ALGO(tp)->conn_init(&tp->t_ccv); } void inline cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { INP_WLOCK_ASSERT(tptoinpcb(tp)); #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); #endif switch(type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(tp->t_flags)) { tp->snd_recover = tp->snd_max; if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; } break; case CC_ECN: if (!IN_CONGRECOVERY(tp->t_flags) || /* * Allow ECN reaction on ACK to CWR, if * that data segment was also CE marked. */ SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_CONGRECOVERY(tp->t_flags); TCPSTAT_INC(tcps_ecn_rcwnd); tp->snd_recover = tp->snd_max + 1; if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; } break; case CC_RTO: tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); /* RTO was unnecessary, so reset everything. */ tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) ENTER_FASTRECOVERY(tp->t_flags); if (tp->t_flags & TF_WASCRECOVERY) ENTER_CONGRECOVERY(tp->t_flags); tp->snd_nxt = tp->snd_max; tp->t_flags &= ~TF_PREVVALID; tp->t_badrxtwin = 0; break; } if (SEQ_LT(tp->snd_fack, tp->snd_una) || SEQ_GT(tp->snd_fack, tp->snd_max)) { tp->snd_fack = tp->snd_una; } if (CC_ALGO(tp)->cong_signal != NULL) { if (th != NULL) tp->t_ccv.curack = th->th_ack; CC_ALGO(tp)->cong_signal(&tp->t_ccv, type); } } void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) { INP_WLOCK_ASSERT(tptoinpcb(tp)); if (CC_ALGO(tp)->post_recovery != NULL) { if (SEQ_LT(tp->snd_fack, th->th_ack) || SEQ_GT(tp->snd_fack, tp->snd_max)) { tp->snd_fack = th->th_ack; } tp->t_ccv.curack = th->th_ack; CC_ALGO(tp)->post_recovery(&tp->t_ccv); } EXIT_RECOVERY(tp->t_flags); tp->t_bytes_acked = 0; tp->sackhint.delivered_data = 0; tp->sackhint.prr_delivered = 0; tp->sackhint.prr_out = 0; } /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. */ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tlen <= tp->t_maxseg) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) void inline cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos) { INP_WLOCK_ASSERT(tptoinpcb(tp)); if (CC_ALGO(tp)->ecnpkt_handler != NULL) { switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_ccv.flags |= CCF_IPHDR_CE; break; case IPTOS_ECN_ECT0: /* FALLTHROUGH */ case IPTOS_ECN_ECT1: /* FALLTHROUGH */ case IPTOS_ECN_NOTECT: tp->t_ccv.flags &= ~CCF_IPHDR_CE; break; } if (flags & TH_CWR) tp->t_ccv.flags |= CCF_TCPHDR_CWR; else tp->t_ccv.flags &= ~CCF_TCPHDR_CWR; CC_ALGO(tp)->ecnpkt_handler(&tp->t_ccv); if (tp->t_ccv.flags & CCF_ACKNOW) { tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); tp->t_flags |= TF_ACKNOW; } } } void inline cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) { cc_ecnpkt_handler_flags(tp, tcp_get_flags(th), iptos); } /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended * ip6_protox[] call format in ip6_input * tcp_input handles primary segment validation, inpcb lookup and * SYN processing on listen sockets * tcp_do_segment processes the ACK and text of the segment for * establishing, established and closing connections */ #ifdef INET6 int tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) { struct mbuf *m; struct in6_ifaddr *ia6; struct ip6_hdr *ip6; m = *mp; if (m->m_len < *offp + sizeof(struct tcphdr)) { m = m_pullup(m, *offp + sizeof(struct tcphdr)); if (m == NULL) { *mp = m; TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } /* * draft-itojun-ipv6-tcp-to-anycast * better place to put this in? */ ip6 = mtod(m, struct ip6_hdr *); ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); *mp = NULL; return (IPPROTO_DONE); } *mp = m; return (tcp_input_with_port(mp, offp, proto, port)); } int tcp6_input(struct mbuf **mp, int *offp, int proto) { return(tcp6_input_with_port(mp, offp, proto, 0)); } #endif /* INET6 */ int tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port) { struct mbuf *m = *mp; struct tcphdr *th = NULL; struct ip *ip = NULL; struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; u_char *optp = NULL; int off0; int optlen = 0; #ifdef INET int len; uint8_t ipttl; #endif int tlen = 0, off; int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ int lookupflag; uint8_t iptos; struct m_tag *fwd_tag = NULL; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; #else const void *ip6 = NULL; #endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ NET_EPOCH_ASSERT(); #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif off0 = *offp; m = *mp; *mp = NULL; to.to_flags = 0; TCPSTAT_INC(tcps_rcvtotal); m->m_pkthdr.tcp_tun_port = port; #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; if (port) goto skip6_csum; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } skip6_csum: /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ KASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst), ("%s: unspecified destination v6 address", __func__)); if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { IP6STAT_INC(ip6s_badscope); /* XXX */ goto drop; } iptos = IPV6_TRAFFIC_CLASS(ip6); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { /* * Get IP and TCP header together in first mbuf. * Note: IP leaves IP header in first mbuf. */ if (off0 > sizeof (struct ip)) { ip_stripoptions(m); off0 = sizeof(struct ip); } if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ntohs(ip->ip_len) - off0; iptos = ip->ip_tos; if (port) goto skip_csum; if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { struct ipovly *ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. */ len = off0 + tlen; ipttl = ip->ip_ttl; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); /* Reset length for SDT probes. */ ip->ip_len = htons(len); /* Reset TOS bits */ ip->ip_tos = iptos; /* Re-initialization for later version check */ ip->ip_ttl = ipttl; ip->ip_v = IPVERSION; ip->ip_hl = off0 >> 2; } skip_csum: if (th->th_sum && (port == 0)) { TCPSTAT_INC(tcps_rcvbadsum); goto drop; } KASSERT(ip->ip_dst.s_addr != INADDR_ANY, ("%s: unspecified destination v4 address", __func__)); if (__predict_false(ip->ip_src.s_addr == INADDR_ANY)) { IPSTAT_INC(ips_badaddr); goto drop; } } #endif /* INET */ /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX */ off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { TCPSTAT_INC(tcps_rcvbadoff); goto drop; } tlen -= off; /* tlen is used instead of ti->ti_len */ if (off > sizeof (struct tcphdr)) { #ifdef INET6 if (isipv6) { if (m->m_len < off0 + off) { m = m_pullup(m, off0 + off); if (m == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } } ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)((caddr_t)ip6 + off0); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (m->m_len < sizeof(struct ip) + off) { if ((m = m_pullup(m, sizeof (struct ip) + off)) == NULL) { TCPSTAT_INC(tcps_rcvshort); return (IPPROTO_DONE); } ip = mtod(m, struct ip *); th = (struct tcphdr *)((caddr_t)ip + off0); } } #endif optlen = off - sizeof (struct tcphdr); optp = (u_char *)(th + 1); } thflags = tcp_get_flags(th); /* * Convert TCP protocol specific fields to host format. */ tcp_fields_to_host(th); /* * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. */ drop_hdrlen = off0 + off; /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ( #ifdef INET6 (isipv6 && (m->m_flags & M_IP6_NEXTHOP)) #ifdef INET || (!isipv6 && (m->m_flags & M_IP_NEXTHOP)) #endif #endif #if defined(INET) && !defined(INET6) (m->m_flags & M_IP_NEXTHOP) #endif ) fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); /* * For initial SYN packets we don't need write lock on matching * PCB, be it a listening one or a synchronized one. The packet * shall not modify its state. */ lookupflag = INPLOOKUP_WILDCARD | ((thflags & (TH_ACK|TH_SYN)) == TH_SYN ? INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB) | (V_tcp_bind_all_fibs ? 0 : INPLOOKUP_FIB); findpcb: tp = NULL; #ifdef INET6 if (isipv6 && fwd_tag != NULL) { struct sockaddr_in6 *next_hop6; next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, lookupflag & ~INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &next_hop6->sin6_addr, next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : th->th_dport, lookupflag, m->m_pkthdr.rcvif); } } else if (isipv6) { inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, lookupflag, m->m_pkthdr.rcvif, m); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET if (fwd_tag != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag+1); /* * Transparently forwarded. Pretend to be the destination. * already got one like this? */ inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, lookupflag & ~INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(&V_tcbinfo, ip->ip_src, th->th_sport, next_hop->sin_addr, next_hop->sin_port ? ntohs(next_hop->sin_port) : th->th_dport, lookupflag, m->m_pkthdr.rcvif); } } else inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, ip->ip_dst, th->th_dport, lookupflag, m->m_pkthdr.rcvif, m); #endif /* INET */ /* * If the INPCB does not exist then all data in the incoming * segment is discarded and an appropriate RST is sent back. * XXX MRT Send RST using which routing table? */ if (inp == NULL) { if (rstreason != 0) { /* We came here after second (safety) lookup. */ MPASS((lookupflag & INPLOOKUP_WILDCARD) == 0); goto dropwithreset; } /* * Log communication attempts to ports that are not * in use. */ if ((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) || V_tcp_log_in_vain == 2) { if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6))) log(LOG_INFO, "%s; %s: Connection attempt " "to closed port\n", s, __func__); } rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } INP_LOCK_ASSERT(inp); if ((inp->inp_flowtype == M_HASHTYPE_NONE) && !SOLISTENING(inp->inp_socket)) { if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); #ifdef RSS } else { /* assign flowid by software RSS hash */ #ifdef INET6 if (isipv6) { rss_proto_software_hash_v6(&inp->in6p_faddr, &inp->in6p_laddr, inp->inp_fport, inp->inp_lport, IPPROTO_TCP, &inp->inp_flowid, &inp->inp_flowtype); } else #endif /* INET6 */ { rss_proto_software_hash_v4(inp->inp_faddr, inp->inp_laddr, inp->inp_fport, inp->inp_lport, IPPROTO_TCP, &inp->inp_flowid, &inp->inp_flowtype); } #endif /* RSS */ } } #if defined(IPSEC) || defined(IPSEC_SUPPORT) #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6) && IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) { goto dropunlock; } #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) { goto dropunlock; } #endif /* INET */ #endif /* IPSEC */ /* * Check the minimum TTL for socket. */ if (inp->inp_ip_minttl != 0) { #ifdef INET6 if (isipv6) { if (inp->inp_ip_minttl > ip6->ip6_hlim) goto dropunlock; } else #endif if (inp->inp_ip_minttl > ip->ip_ttl) goto dropunlock; } tp = intotcpcb(inp); switch (tp->t_state) { case TCPS_TIME_WAIT: /* * A previous connection in TIMEWAIT state is supposed to catch * stray or duplicate segments arriving late. If this segment * was a legitimate new connection attempt, the old INPCB gets * removed and we can try again to find a listening socket. */ tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) ? TO_SYN : 0); /* * tcp_twcheck unlocks the inp always, and frees the m if fails. */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; return (IPPROTO_DONE); case TCPS_CLOSED: /* * The TCPCB may no longer exist if the connection is winding * down or it is in the CLOSED state. Either way we drop the * segment and send an appropriate response. */ rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) { rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_input(tp, m); m = NULL; /* consumed by the TOE driver */ goto dropunlock; } #endif #ifdef MAC if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif so = inp->inp_socket; KASSERT(so != NULL, ("%s: so == NULL", __func__)); /* * When the socket is accepting connections (the INPCB is in LISTEN * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. */ KASSERT(tp->t_state == TCPS_LISTEN || !SOLISTENING(so), ("%s: so accepting but tp %p not listening", __func__, tp)); if (tp->t_state == TCPS_LISTEN && SOLISTENING(so)) { struct in_conninfo inc; bzero(&inc, sizeof(inc)); #ifdef INET6 if (isipv6) { inc.inc_flags |= INC_ISIPV6; if (inp->inp_inc.inc_flags & INC_IPV6MINMTU) inc.inc_flags |= INC_IPV6MINMTU; inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; } else #endif { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; inc.inc_fibnum = so->so_fibnum; /* * Check for an existing connection attempt in syncache if * the flag is only ACK. A successful lookup creates a new * socket appended to the listen queue in SYN_RECEIVED state. */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { /* * Parse the TCP options here because * syncookies need access to the reflected * timestamp. */ tcp_dooptions(&to, optp, optlen, 0); /* * NB: syncache_expand() doesn't unlock inp. */ rstreason = syncache_expand(&inc, &to, th, &so, m, port); if (rstreason < 0) { /* * A failing TCP MD5 signature comparison * must result in the segment being dropped * and must not produce any response back * to the sender. */ goto dropunlock; } else if (rstreason == 0) { /* * No syncache entry, or ACK was not for our * SYN/ACK. Do our protection against double * ACK. If peer sent us 2 ACKs, then for the * first one syncache_expand() successfully * converted syncache entry into a socket, * while we were waiting on the inpcb lock. We * don't want to sent RST for the second ACK, * so we perform second lookup without wildcard * match, hoping to find the new socket. If * the ACK is stray indeed, rstreason would * hint the above code that the lookup was a * second attempt. * * NB: syncache did its own logging * of the failure cause. */ INP_WUNLOCK(inp); rstreason = BANDLIM_RST_OPENPORT; lookupflag &= ~INPLOOKUP_WILDCARD; goto findpcb; } tfo_socket_result: if (so == NULL) { /* * We completed the 3-way handshake * but could not allocate a socket * either due to memory shortage, * listen queue length limits or * global socket limits. Send RST * or wait and have the remote end * retransmit the ACK for another * try. */ if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Socket allocation failed due to " "limits or memory shortage, %s\n", s, __func__, V_tcp_sc_rst_sock_fail ? "sending RST" : "try again"); if (V_tcp_sc_rst_sock_fail) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } else goto dropunlock; } /* * Socket is created in state SYN_RECEIVED. * Unlock the listen socket, lock the newly * created socket and update the tp variable. * If we came here via jump to tfo_socket_result, * then listening socket is read-locked. */ INP_UNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); /* * New connection inpcb is already locked by * syncache_expand(). */ INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); /* * Process the segment and the data it * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ TCP_PROBE5(receive, NULL, tp, m, tp, th); tp->t_fb->tfb_tcp_do_segment(tp, m, th, drop_hdrlen, tlen, iptos); return (IPPROTO_DONE); } /* * Segment flag validation for new connection attempts: * * Our (SYN|ACK) response was rejected. * Check with syncache and remove entry to prevent * retransmits. * * NB: syncache_chkrst does its own logging of failure * causes. */ if (thflags & TH_RST) { syncache_chkrst(&inc, th, m, port); goto dropunlock; } /* * We can't do anything without SYN. */ if ((thflags & TH_SYN) == 0) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN is missing, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * (SYN|ACK) is bogus on a listen socket. */ if (thflags & TH_ACK) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|ACK invalid, segment rejected\n", s, __func__); syncache_badack(&inc, port); /* XXX: Not needed! */ TCPSTAT_INC(tcps_badsyn); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } /* * If the drop_synfin option is enabled, drop all * segments with both the SYN and FIN bits set. * This prevents e.g. nmap from identifying the * TCP/IP stack. * XXX: Poor reasoning. nmap has other methods * and is constantly refining its stack detection * strategies. * XXX: This is a violation of the TCP specification * and was used by RFC1644. */ if ((thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); TCPSTAT_INC(tcps_badsyn); goto dropunlock; } /* * Segment's flags are (SYN) or (SYN|FIN). * * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored * as they do not affect the state of the TCP FSM. * The data pointed to by TH_URG and th_urp is ignored. */ KASSERT((thflags & (TH_RST|TH_ACK)) == 0, ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); KASSERT(thflags & (TH_SYN), ("%s: Listen socket: TH_SYN not set", __func__)); INP_RLOCK_ASSERT(inp); #ifdef INET6 /* * If deprecated address is forbidden, * we do not accept SYN to deprecated interface * address to prevent any new inbound connection from * getting established. * When we do not accept SYN, we send a TCP RST, * with deprecated source address (instead of dropping * it). We compromise it as it is much better for peer * to send a RST, and RST will be the final packet * for the exchange. * * If we do not forbid deprecated addresses, we accept * the SYN packet. RFC2462 does not suggest dropping * SYN in this case. * If we decipher RFC2462 5.5.4, it says like this: * 1. use of deprecated addr with existing * communication is okay - "SHOULD continue to be * used" * 2. use of it with new communication: * (2a) "SHOULD NOT be used if alternate address * with sufficient scope is available" * (2b) nothing mentioned otherwise. * Here we fall into (2b) case as we have no choice in * our source address selection - we must obey the peer. * * The wording in RFC2462 is confusing, and there are * multiple description text for deprecated address * handling - worse, they are not exactly the same. * I believe 5.5.4 is the best one, so we follow 5.5.4. */ if (isipv6 && !V_ip6_use_deprecated) { struct in6_ifaddr *ia6; ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); if (ia6 != NULL && (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to deprecated " "IPv6 address rejected\n", s, __func__); rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } } #endif /* INET6 */ /* * Basic sanity checks on incoming SYN requests: * Don't respond if the destination is a link layer * broadcast according to RFC1122 4.2.3.10, p. 104. * If it is from this socket it must be forged. * Don't respond if the source or destination is a * global or subnet broad- or multicast address. * Note that it is quite possible to receive unicast * link-layer packets with a broadcast IP address. Use - * in_broadcast() to find them. + * in_ifnet_broadcast() to find them. */ if (m->m_flags & (M_BCAST|M_MCAST)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from broad- or multicast " "link layer address ignored\n", s, __func__); goto dropunlock; } #ifdef INET6 if (isipv6) { if (th->th_dport == th->th_sport && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt to/from self " "ignored\n", s, __func__); goto dropunlock; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to multicast " "address ignored\n", s, __func__); goto dropunlock; } } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (th->th_dport == th->th_sport && ip->ip_dst.s_addr == ip->ip_src.s_addr) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to self " "ignored\n", s, __func__); goto dropunlock; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || - in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { + in_ifnet_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Listen socket: " "Connection attempt from/to broad- " "or multicast address ignored\n", s, __func__); goto dropunlock; } } #endif /* * SYN appears to be valid. Create compressed TCP state * for syncache. */ TCP_PROBE3(debug__input, tp, th, m); tcp_dooptions(&to, optp, optlen, TO_SYN); if ((so = syncache_add(&inc, &to, th, inp, so, m, NULL, NULL, iptos, port)) != NULL) goto tfo_socket_result; /* * Entry added to syncache and mbuf consumed. * Only the listen socket is unlocked by syncache_add(). */ return (IPPROTO_DONE); } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) { tcp_dooptions(&to, optp, optlen, thflags); if ((to.to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_nosigopt); goto dropunlock; } if (!TCPMD5_ENABLED() || TCPMD5_INPUT(m, th, to.to_signature) != 0) goto dropunlock; } #endif TCP_PROBE5(receive, NULL, tp, m, tp, th); /* * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. * * XXXGL: in case of a pure SYN arriving on existing connection * TCP stacks won't need to modify the PCB, they would either drop * the segment silently, or send a challenge ACK. However, we try * to upgrade the lock, because calling convention for stacks is * write-lock on PCB. If upgrade fails, drop the SYN. */ if ((lookupflag & INPLOOKUP_RLOCKPCB) && INP_TRY_UPGRADE(inp) == 0) goto dropunlock; tp->t_fb->tfb_tcp_do_segment(tp, m, th, drop_hdrlen, tlen, iptos); return (IPPROTO_DONE); dropwithreset: /* * When blackholing do not respond with a RST but * completely ignore the segment and drop it. */ if (((rstreason == BANDLIM_RST_OPENPORT && V_blackhole == 3) || (rstreason == BANDLIM_RST_CLOSEDPORT && ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) && (V_blackhole_local || ( #ifdef INET6 isipv6 ? !in6_localaddr(&ip6->ip6_src) : #endif #ifdef INET !in_localip(ip->ip_src) #else true #endif ))) goto dropunlock; TCP_PROBE5(receive, NULL, tp, m, tp, th); tcp_dropwithreset(m, th, tp, tlen, rstreason); m = NULL; /* mbuf chain got consumed. */ dropunlock: if (m != NULL) TCP_PROBE5(receive, NULL, tp, m, tp, th); if (inp != NULL) INP_UNLOCK(inp); drop: if (s != NULL) free(s, M_TCPLOG); if (m != NULL) m_freem(m); return (IPPROTO_DONE); } /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network * conditions at hand (delay bandwidth product). Setting the * buffer size too small limits throughput on links with high * bandwidth and high delay (eg. trans-continental/oceanic links). * * On the receive side the socket buffer memory is only rarely * used to any significant extent. This allows us to be much * more aggressive in scaling the receive socket buffer. For * the case that the buffer space is actually used to a large * extent and we run out of kernel memory we can simply drop * the new segments; TCP on the sender will just retransmit it * later. Setting the buffer size too big may only consume too * much kernel memory if the application doesn't read() from * the socket or packet loss or reordering makes use of the * reassembly queue. * * The criteria to step up the receive buffer one notch are: * 1. Application has not set receive buffer size with * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. * 2. the number of bytes received during 1/2 of an sRTT * is at least 3/8 of the current socket buffer size. * 3. receive buffer size has not hit maximal automatic size; * * If all of the criteria are met we increaset the socket buffer * by a 1/2 (bounded by the max). This allows us to keep ahead * of slow-start but also makes it so our peer never gets limited * by our rwnd which we then open up causing a burst. * * This algorithm does two steps per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. * Shrinking the buffer during idle times is not necessary as * it doesn't consume any memory when idle. * * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. */ int tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int tlen) { int newsize = 0; if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) && tp->t_srtt != 0 && tp->rfbuf_ts != 0 && TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) > ((tp->t_srtt >> TCP_RTT_SHIFT)/2)) { if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 2)/ 4 * 3) && so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { newsize = min((so->so_rcv.sb_hiwat + (so->so_rcv.sb_hiwat/2)), V_tcp_autorcvbuf_max); } TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize); /* Start over with next RTT. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; } else { tp->rfbuf_cnt += tlen; /* add up */ } return (newsize); } int tcp_input(struct mbuf **mp, int *offp, int proto) { return(tcp_input_with_port(mp, offp, proto, 0)); } static void tcp_handle_wakeup(struct tcpcb *tp) { INP_WLOCK_ASSERT(tptoinpcb(tp)); if (tp->t_flags & TF_WAKESOR) { struct socket *so = tptosocket(tp); tp->t_flags &= ~TF_WAKESOR; SOCK_RECVBUF_LOCK_ASSERT(so); sorwakeup_locked(so); } } void tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, int drop_hdrlen, int tlen, uint8_t iptos) { uint16_t thflags; int acked, ourfinisacked, needoutput = 0; sackstatus_t sack_changed; int rstreason, todrop, win, incforsyn = 0; uint32_t tiwin; uint16_t nsegs; char *s; struct inpcb *inp = tptoinpcb(tp); struct socket *so = tptosocket(tp); struct in_conninfo *inc = &inp->inp_inc; struct mbuf *mfree; struct tcpopt to; int tfo_syn; u_int maxseg = 0; thflags = tcp_get_flags(th); tp->sackhint.last_sack_ack = 0; sack_changed = SACK_NOCHANGE; nsegs = max(1, m->m_pkthdr.lro_nsegs); NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); #ifdef TCPPCAP /* Save segment, if requested. */ tcp_pcap_add(th, m, &(tp->t_inpkts)); #endif TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, tlen, NULL, true); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: " "SYN|FIN segment ignored (based on " "sysctl setting)\n", s, __func__); free(s, M_TCPLOG); } goto drop; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state * check SEQ.ACK first. */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una) && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) cc_after_idle(tp); tp->t_rcvtime = ticks; if (thflags & TH_FIN) tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); /* * Scale up the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif /* * TCP ECN processing. */ if (tcp_ecn_input_segment(tp, thflags, tlen, tcp_packets_this_ack(tp, th->th_ack), iptos)) cc_cong_signal(tp, th, CC_ECN); /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) { /* * We don't look at sack's from the * peer because the MSS is too small which * can subject us to an attack. */ to.to_flags &= ~TOF_SACK; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if ((tp->t_flags & TF_SIGNATURE) != 0 && (to.to_flags & TOF_SIGNATURE) == 0) { TCPSTAT_INC(tcps_sig_err_sigopt); /* XXX: should drop? */ } #endif /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) { to.to_tsecr = 0; } else if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && tp->t_badrxtwin != 0 && TSTMP_LT(to.to_tsecr, tp->t_badrxtwin)) { cc_cong_signal(tp, th, CC_RTO_ERR); } } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { /* Handle parallel SYN for ECN */ tcp_ecn_input_parallel_syn(tp, thflags, iptos); if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE) && !(tp->t_flags & TF_NOOPT)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } else { tp->t_flags &= ~TF_REQ_SCALE; } /* * Initial send window. It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if ((to.to_flags & TOF_TS) && (tp->t_flags & TF_REQ_TSTMP) && !(tp->t_flags & TF_NOOPT)) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } else { tp->t_flags &= ~TF_REQ_TSTMP; } if (to.to_flags & TOF_MSS) { tcp_mss(tp, to.to_mss); } if ((tp->t_flags & TF_SACK_PERMIT) && (!(to.to_flags & TOF_SACKPERM) || (tp->t_flags & TF_NOOPT))) { tp->t_flags &= ~TF_SACK_PERMIT; } if (tp->t_flags & TF_FASTOPEN) { if ((to.to_flags & TOF_FASTOPEN) && !(tp->t_flags & TF_NOOPT)) { uint16_t mss; if (to.to_flags & TOF_MSS) { mss = to.to_mss; } else { if ((inp->inp_vflag & INP_IPV6) != 0) { mss = TCP6_MSS; } else { mss = TCP_MSS; } } tcp_fastopen_update_cache(tp, mss, to.to_tfo_len, to.to_tfo_cookie); } else { tcp_fastopen_disable_path(tp); } } } /* * If timestamps were negotiated during SYN/ACK and a * segment without a timestamp is received, silently drop * the segment, unless it is a RST segment or missing timestamps are * tolerated. * See section 3.2 of RFC 7323. */ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if (((thflags & TH_RST) != 0) || V_tcp_tolerate_missing_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "segment processed normally\n", s, __func__); free(s, M_TCPLOG); } } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "segment silently dropped\n", s, __func__); free(s, M_TCPLOG); } goto drop; } } /* * If timestamps were not negotiated during SYN/ACK and a * segment with a timestamp is received, ignore the * timestamp and process the packet normally. * See section 3.2 of RFC 7323. */ if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment processed normally\n", s, __func__); free(s, M_TCPLOG); } } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. */ if (tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && SEGQ_EMPTY(tp) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && !IN_RECOVERY(tp->t_flags) && (to.to_flags & TOF_SACK) == 0 && TAILQ_EMPTY(&tp->snd_holes)) { /* * This is a pure ack for outstanding data. */ TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery without timestamps. */ if ((to.to_flags & TOF_TS) == 0 && tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && tp->t_badrxtwin != 0 && TSTMP_LT(ticks, tp->t_badrxtwin)) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); #endif TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, nsegs, CC_ACK); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. * If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ TCP_PROBE3(debug__input, tp, th, m); /* * Clear t_acktime if remote side has ACKd * all data in the socket buffer. * Otherwise, update t_acktime if we received * a sufficiently large ACK. */ if (sbavail(&so->so_snd) == 0) tp->t_acktime = 0; else if (acked > 1) tp->t_acktime = ticks; if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); sowwakeup(so); /* * Only call tcp_output when there * is new data available to be sent * or we need to send an ACK. */ if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) >= SEQ_SUB(tp->snd_max, tp->snd_una))) { (void) tcp_output(tp); } goto check_delack; } } else if (th->th_ack == tp->snd_una && tlen <= sbspace(&so->so_rcv)) { int newsize = 0; /* automatic sockbuf scaling */ /* * This is a pure, in-sequence data packet with * nothing on the reassembly queue and we have enough * buffer space to take it. */ /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; if (tlen && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_in == 0)) { tp->t_fbyte_in = ticks; if (tp->t_fbyte_in == 0) tp->t_fbyte_in = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_ADD(tcps_rcvpack, nsegs); TCPSTAT_ADD(tcps_rcvbyte, tlen); TCP_PROBE3(debug__input, tp, th, m); newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCK_RECVBUF_LOCK(so); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. */ if (newsize) if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp, tlen)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); } goto check_delack; } } /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if (thflags & TH_RST) { /* Handle RST segments later. */ break; } if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } if (tp->t_flags & TF_FASTOPEN) { /* * When a TFO connection is in SYN_RECEIVED, the * only valid packets are the initial SYN, a * retransmit/copy of the initial SYN (possibly with * a subset of the original data), a valid ACK, a * FIN, or a RST. */ if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { rstreason = BANDLIM_RST_OPENPORT; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ if ((tcp_timer_active(tp, TT_DELACK) || tcp_timer_active(tp, TT_REXMT))) goto drop; } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) { goto drop; } } break; /* * If the state is SYN_SENT: * if seg contains a RST with valid ACK (SEQ.ACK has already * been verified), then drop the connection. * if seg contains a RST without an ACK, drop the seg. * if seg does not contain SYN, then drop the seg. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable. * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, m, tp, th); tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); tp = tcp_drop(tp, ECONNREFUSED); } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { int tfo_partial_ack = 0; TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); tp->snd_una++; /* SYN is acked */ if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; /* * If not all the data that was sent in the TFO SYN * has been acked, resend the remainder right away. */ if ((tp->t_flags & TF_FASTOPEN) && (tp->snd_una != tp->snd_max)) { tp->snd_nxt = th->th_ack; tfo_partial_ack = 1; } /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; tcp_ecn_input_syn_sent(tp, thflags, iptos); /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_acktime = ticks; tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, m, tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. * If it succeeds, connection is * half-synchronized. * Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN); tcp_timer_activate(tp, TT_REXMT, 0); tcp_state_change(tp, TCPS_SYN_RECEIVED); } /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall though and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window. If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. */ if (thflags & TH_RST) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. * - If RST is in window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should * test against last_ack_sent instead of rcv_nxt. * Note 2: we handle special case of closed window, not * covered by the RFC. */ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: /* FALLTHROUGH */ default: tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_RST); tp = tcp_close(tp); } } else { TCPSTAT_INC(tcps_badrst); tcp_send_challenge_ack(tp, th, m); m = NULL; } } goto drop; } /* * RFC5961 Section 4.2 * Send challenge ACK for any SYN in synchronized state. */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && tp->t_state != TCPS_SYN_RECEIVED) { TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; } else { tcp_ecn_input_syn_sent(tp, thflags, iptos); tcp_send_challenge_ack(tp, th, m); m = NULL; } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } /* * DSACK - add SACK block for dropped range */ if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) { tcp_update_sack_list(tp, th->th_seq, th->th_seq + todrop); /* * ACK now, as the next in-sequence segment * will clear the DSACK block again */ tp->t_flags |= TF_ACKNOW; } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end if * no FIN has been processed. */ if ((tp->t_flags & TF_CLOSED) && tlen > 0 && TCPS_HAVERCVDFIN(tp->t_state) == 0) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " "after socket was closed, " "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); /* tcp_close will kill the inp pre-log the Reset */ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else goto dropafterack; } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to.to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) { if (tp->t_state == TCPS_SYN_RECEIVED && (tp->t_flags & TF_FASTOPEN)) { tp->snd_wnd = tiwin; cc_conn_init(tp); } goto step6; } else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ if (SEQ_GEQ(tp->snd_una, tp->iss + (TCP_MAXWIN << tp->snd_scale))) { /* Checking SEG.ACK against ISS is definitely redundant. */ tp->t_flags2 |= TF2_NO_ISS_CHECK; } if (!V_tcp_insecure_ack) { tcp_seq seq_min; bool ghost_ack_check; if (tp->t_flags2 & TF2_NO_ISS_CHECK) { /* Check for too old ACKs (RFC 5961, Section 5.2). */ seq_min = tp->snd_una - tp->max_sndwnd; ghost_ack_check = false; } else { if (SEQ_GT(tp->iss + 1, tp->snd_una - tp->max_sndwnd)) { /* Checking for ghost ACKs is stricter. */ seq_min = tp->iss + 1; ghost_ack_check = true; } else { /* * Checking for too old ACKs (RFC 5961, * Section 5.2) is stricter. */ seq_min = tp->snd_una - tp->max_sndwnd; ghost_ack_check = false; } } if (SEQ_LT(th->th_ack, seq_min)) { if (ghost_ack_check) TCPSTAT_INC(tcps_rcvghostack); else TCPSTAT_INC(tcps_rcvacktooold); tcp_send_challenge_ack(tp, th, m); m = NULL; goto drop; } } switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: TCPSTAT_INC(tcps_connects); if (tp->t_flags & TF_SONOTCONN) { /* * Usually SYN_RECEIVED had been created from a LISTEN, * and solisten_enqueue() has already marked the socket * layer as connected. If it didn't, which can happen * only with an accept_filter(9), then the tp is marked * with TF_SONOTCONN. The other reason for this mark * to be set is a simultaneous open, a SYN_RECEIVED * that had been created from SYN_SENT. */ tp->t_flags &= ~TF_SONOTCONN; soisconnected(so); } /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->snd_wnd = tiwin; /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if ((tp->t_flags & TF_FASTOPEN) && tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } if (tp->t_flags & TF_NEEDFIN) { tp->t_acktime = ticks; tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, m, tp, th); /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such * connections is not harmless as it would undo the * snd_cwnd reduction that occurs when a TFO SYN|ACK * is retransmitted. */ if (!(tp->t_flags & TF_FASTOPEN)) cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* * Account for the ACK of our SYN prior to * regular ACK processing below, except for * simultaneous SYN, which is handled later. */ if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) incforsyn = 1; /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) { (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tcp_handle_wakeup(tp); } tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } if (tcp_is_sack_recovery(tp, &to)) { sack_changed = tcp_sack_doack(tp, &to, th->th_ack); if ((sack_changed != SACK_NOCHANGE) && (tp->t_flags & TF_LRD)) { tcp_sack_lost_retransmission(tp, th); } } else /* * Reset the value so that previous (valid) value * from the last ack with SACK doesn't get used. */ tp->sackhint.sacked_bytes = 0; #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); #endif if (SEQ_LEQ(th->th_ack, tp->snd_una)) { maxseg = tcp_maxseg(tp); if (tlen == 0 && (tiwin == tp->snd_wnd || (tp->t_flags & TF_SACK_PERMIT))) { /* * If this is the first time we've seen a * FIN from the remote, this is not a * duplicate and it needs to be processed * normally. This happens during a * simultaneous close. */ if ((thflags & TH_FIN) && (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { tp->t_dupacks = 0; break; } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change and FIN isn't set), * the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ /* * Following 2 kinds of acks should not affect * dupack counting: * 1) Old acks * 2) Acks with SACK but without any new SACK * information in them. These could result from * any anomaly in the network like a switch * duplicating packets or a possible DoS attack. */ if (th->th_ack != tp->snd_una || (tcp_is_sack_recovery(tp, &to) && (sack_changed == SACK_NOCHANGE))) { break; } else if (!tcp_timer_active(tp, TT_REXMT)) { tp->t_dupacks = 0; } else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { cc_ack_received(tp, th, nsegs, CC_DUPACK); if (V_tcp_do_prr && IN_FASTRECOVERY(tp->t_flags) && (tp->t_flags & TF_SACK_PERMIT)) { tcp_do_prr_ack(tp, th, &to, sack_changed, &maxseg); } else if (tcp_is_sack_recovery(tp, &to) && IN_FASTRECOVERY(tp->t_flags) && (tp->snd_nxt == tp->snd_max)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than ssthresh * worth of data in flight. */ if (V_tcp_do_newsack) { awnd = tcp_compute_pipe(tp); } else { awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; } if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += imax(maxseg, imin(2 * maxseg, tp->sackhint.delivered_data)); if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else if (tcp_is_sack_recovery(tp, &to) && IN_FASTRECOVERY(tp->t_flags) && SEQ_LT(tp->snd_nxt, tp->snd_max)) { tp->snd_cwnd += imax(maxseg, imin(2 * maxseg, tp->sackhint.delivered_data)); } else { tp->snd_cwnd += maxseg; } (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh || (tp->t_flags & TF_SACK_PERMIT && V_tcp_do_newsack && tp->sackhint.sacked_bytes > (tcprexmtthresh - 1) * maxseg)) { enter_recovery: /* * Above is the RFC6675 trigger condition of * more than (dupthresh-1)*maxseg sacked data. * If the count of holes in the * scoreboard is >= dupthresh, we could * also enter loss recovery, but don't * have that value readily available. */ tp->t_dupacks = tcprexmtthresh; tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. */ if (tcp_is_sack_recovery(tp, &to)) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } /* Congestion signal before ack. */ cc_cong_signal(tp, th, CC_NDUPACK); cc_ack_received(tp, th, nsegs, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (V_tcp_do_prr) { /* * snd_ssthresh and snd_recover are * already updated by cc_cong_signal. */ if (tcp_is_sack_recovery(tp, &to)) { /* * Include Limited Transmit * segments here */ tp->sackhint.prr_delivered = imin(tp->snd_max - th->th_ack, (tp->snd_limited + 1) * maxseg); } else { tp->sackhint.prr_delivered = maxseg; } tp->sackhint.recover_fs = max(1, tp->snd_nxt - tp->snd_una); } tp->snd_limited = 0; if (tcp_is_sack_recovery(tp, &to)) { TCPSTAT_INC(tcps_sack_recovery_episode); /* * When entering LR after RTO due to * Duplicate ACKs, retransmit existing * holes from the scoreboard. */ tcp_resend_sackholes(tp); /* Avoid inflating cwnd in tcp_output */ tp->snd_nxt = tp->snd_max; tp->snd_cwnd = tcp_compute_pipe(tp) + maxseg; (void) tcp_output(tp); /* Set cwnd to the expected flightsize */ tp->snd_cwnd = tp->snd_ssthresh; if (SEQ_GT(th->th_ack, tp->snd_una)) { goto resume_partialack; } goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = maxseg; (void) tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { /* * Process first and second duplicate * ACKs. Each indicates a segment * leaving the network, creating room * for more. Make sure we can send a * packet on reception of each duplicate * ACK by increasing snd_cwnd by one * segment. Restore the original * snd_cwnd after packet transmission. */ cc_ack_received(tp, th, nsegs, CC_DUPACK); uint32_t oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; int avail; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; if ((tp->snd_nxt == tp->snd_max) && (tp->t_rxtshift == 0)) tp->snd_cwnd = SEQ_SUB(tp->snd_nxt, tp->snd_una) - tcp_sack_adjust(tp); tp->snd_cwnd += (tp->t_dupacks - tp->snd_limited) * maxseg - tcp_sack_adjust(tp); /* * Only call tcp_output when there * is new data available to be sent * or we need to send an ACK. */ SOCK_SENDBUF_LOCK(so); avail = sbavail(&so->so_snd); SOCK_SENDBUF_UNLOCK(so); if (tp->t_flags & TF_ACKNOW || (avail >= SEQ_SUB(tp->snd_nxt, tp->snd_una))) { (void) tcp_output(tp); } sent = SEQ_SUB(tp->snd_max, oldsndmax); if (sent > maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) { ++tp->snd_limited; } tp->snd_cwnd = oldcwnd; goto drop; } } break; } else { /* * This ack is advancing the left edge, reset the * counter. */ tp->t_dupacks = 0; /* * If this ack also has new SACK info, increment the * counter as per rfc6675. The variable * sack_changed tracks all changes to the SACK * scoreboard, including when partial ACKs without * SACK options are received, and clear the scoreboard * from the left side. Such partial ACKs should not be * counted as dupacks here. */ if (tcp_is_sack_recovery(tp, &to) && (((tp->t_rxtshift == 0) && (sack_changed != SACK_NOCHANGE)) || ((tp->t_rxtshift > 0) && (sack_changed == SACK_NEWLOSS))) && (tp->snd_nxt == tp->snd_max)) { tp->t_dupacks++; /* limit overhead by setting maxseg last */ if (!IN_FASTRECOVERY(tp->t_flags) && (tp->sackhint.sacked_bytes > ((tcprexmtthresh - 1) * (maxseg = tcp_maxseg(tp))))) { goto enter_recovery; } } } resume_partialack: KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (IN_FASTRECOVERY(tp->t_flags)) { if (tp->t_flags & TF_SACK_PERMIT) { if (V_tcp_do_prr && (to.to_flags & TOF_SACK)) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tcp_do_prr_ack(tp, th, &to, sack_changed, &maxseg); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); } else { tcp_sack_partialack(tp, th, &maxseg); } } else { tcp_newreno_partial_ack(tp, th); } } else if (IN_CONGRECOVERY(tp->t_flags) && (V_tcp_do_prr)) { tp->sackhint.delivered_data = BYTES_THIS_ACK(tp, th); tp->snd_fack = th->th_ack; /* * During ECN cwnd reduction * always use PRR-SSRB */ tcp_do_prr_ack(tp, th, &to, SACK_CHANGE, &maxseg); (void) tcp_output(tp); } } /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: INP_WLOCK_ASSERT(inp); /* * Adjust for the SYN bit in sequence space, * but don't account for it in cwnd calculations. * This is for the SYN_RECEIVED, non-simultaneous * SYN case. SYN_SENT and simultaneous SYN are * treated elsewhere. */ if (incforsyn) tp->snd_una++; acked = BYTES_THIS_ACK(tp, th); KASSERT(acked >= 0, ("%s: acked unexepectedly negative " "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__, tp->snd_una, th->th_ack, tp, m)); TCPSTAT_ADD(tcps_rcvackpack, nsegs); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && tp->t_badrxtwin != 0 && to.to_flags & TOF_TS && to.to_tsecr != 0 && TSTMP_LT(to.to_tsecr, tp->t_badrxtwin)) cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to.to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } SOCK_SENDBUF_LOCK(so); /* * Clear t_acktime if remote side has ACKd all data in the * socket buffer and FIN (if applicable). * Otherwise, update t_acktime if we received a sufficiently * large ACK. */ if ((tp->t_state <= TCPS_CLOSE_WAIT && acked == sbavail(&so->so_snd)) || acked > sbavail(&so->so_snd)) tp->t_acktime = 0; else if (acked > 1) tp->t_acktime = ticks; /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) { SOCK_SENDBUF_UNLOCK(so); goto step6; } /* * Let the congestion control algorithm update congestion * control related information. This typically means increasing * the congestion window. */ cc_ack_received(tp, th, nsegs, CC_ACK); if (acked > sbavail(&so->so_snd)) { if (tp->snd_wnd >= sbavail(&so->so_snd)) tp->snd_wnd -= sbavail(&so->so_snd); else tp->snd_wnd = 0; mfree = sbcut_locked(&so->so_snd, (int)sbavail(&so->so_snd)); ourfinisacked = 1; } else { mfree = sbcut_locked(&so->so_snd, acked); if (tp->snd_wnd >= (uint32_t) acked) tp->snd_wnd -= acked; else tp->snd_wnd = 0; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); m_freem(mfree); /* Detect una wraparound. */ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; tp->snd_una = th->th_ack; if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { cc_post_recovery(tp, th); } if (SEQ_GT(tp->snd_una, tp->snd_recover)) { tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { tcp_free_sackholes(tp); soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment. */ case TCPS_CLOSING: if (ourfinisacked) { tcp_twstart(tp); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { tp = tcp_close(tp); goto drop; } break; } } step6: INP_WLOCK_ASSERT(inp); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCK_RECVBUF_LOCK(so); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCK_RECVBUF_UNLOCK(so); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCK_RECVBUF_UNLOCK(so); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (uint32_t)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(inp); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_flags & TF_FASTOPEN)); if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_rnxt = tp->rcv_nxt; int save_tlen = tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions. * Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && SEGQ_EMPTY(tp) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { if (DELAY_ACK(tp, tlen) || tfo_syn) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; if (tlen && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_in == 0)) { tp->t_fbyte_in = ticks; if (tp->t_fbyte_in == 0) tp->t_fbyte_in = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } thflags = tcp_get_flags(th) & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCK_RECVBUF_LOCK(so); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m, 0); tp->t_flags |= TF_WAKESOR; } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ tcp_seq temp = save_start; thflags = tcp_reass(tp, th, &temp, &tlen, m); tp->t_flags |= TF_ACKNOW; } if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { /* * DSACK actually handled in the fastpath * above. */ tcp_update_sack_list(tp, save_start, save_start + save_tlen); } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { if ((tp->rcv_numsacks >= 1) && (tp->sackblks[0].end == save_start)) { /* * Partial overlap, recorded at todrop * above. */ tcp_update_sack_list(tp, tp->sackblks[0].start, tp->sackblks[0].end); } else { tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } } else if (tlen >= save_tlen) { /* Update of sackblks. */ tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } else if (tlen > 0) { tcp_update_dsack_list(tp, save_start, save_start + tlen); } } tcp_handle_wakeup(tp); #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); else len = so->so_rcv.sb_hiwat; #endif } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { if (tlen > 0) { if ((thflags & TH_FIN) != 0) { log(LOG_DEBUG, "%s; %s: %s: " "Received %d bytes of data and FIN " "after having received a FIN, " "just dropping both\n", s, __func__, tcpstates[tp->t_state], tlen); } else { log(LOG_DEBUG, "%s; %s: %s: " "Received %d bytes of data " "after having received a FIN, " "just dropping it\n", s, __func__, tcpstates[tp->t_state], tlen); } } else { if ((thflags & TH_FIN) != 0) { log(LOG_DEBUG, "%s; %s: %s: " "Received FIN " "after having received a FIN, " "just dropping it\n", s, __func__, tcpstates[tp->t_state]); } } free(s, M_TCPLOG); } m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* The socket upcall is handled by socantrcvmore. */ socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: tcp_twstart(tp); return; } } TCP_PROBE3(debug__input, tp, th, m); /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) { (void) tcp_output(tp); } check_delack: INP_WLOCK_ASSERT(inp); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(inp); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); goto dropwithreset; } TCP_PROBE3(debug__input, tp, th, m); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); INP_WUNLOCK(inp); m_freem(m); return; dropwithreset: if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(inp); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: /* * Drop space held by incoming segment and return. */ TCP_PROBE3(debug__input, tp, th, m); if (tp != NULL) { INP_WUNLOCK(inp); } m_freem(m); } /* * Issue RST and make ACK acceptable to originator of segment. * The mbuf must still include the original packet header. * tp may be NULL. */ void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { #ifdef INET struct ip *ip; #endif #ifdef INET6 struct ip6_hdr *ip6; #endif if (tp != NULL) { INP_LOCK_ASSERT(tptoinpcb(tp)); } /* Don't bother if destination was broadcast/multicast. */ if ((tcp_get_flags(th) & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) goto drop; #ifdef INET6 if (mtod(m, struct ip *)->ip_v == 6) { ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) goto drop; /* IPv6 anycast check is done at tcp6_input() */ } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip = mtod(m, struct ip *); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || - in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) + in_ifnet_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) goto drop; } #endif /* Perform bandwidth limiting. */ if (badport_bandlim(rstreason) < 0) goto drop; /* tcp_respond consumes the mbuf chain. */ if (tcp_get_flags(th) & TH_ACK) { tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); } else { if (tcp_get_flags(th) & TH_SYN) tlen++; if (tcp_get_flags(th) & TH_FIN) tlen++; tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } return; drop: m_freem(m); } /* * Parse TCP options and place in tcpopt. */ void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { int opt, optlen; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { if (cnt < 2) break; optlen = cp[1]; if (optlen < 2 || optlen > cnt) break; } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_MSS; bcopy((char *)cp + 2, (char *)&to->to_mss, sizeof(to->to_mss)); to->to_mss = ntohs(to->to_mss); break; case TCPOPT_WINDOW: if (optlen != TCPOLEN_WINDOW) continue; if (!(flags & TO_SYN)) continue; to->to_flags |= TOF_SCALE; to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); break; case TCPOPT_TIMESTAMP: if (optlen != TCPOLEN_TIMESTAMP) continue; to->to_flags |= TOF_TS; bcopy((char *)cp + 2, (char *)&to->to_tsval, sizeof(to->to_tsval)); to->to_tsval = ntohl(to->to_tsval); bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); to->to_tsecr = ntohl(to->to_tsecr); break; case TCPOPT_SIGNATURE: /* * In order to reply to a host which has set the * TCP_SIGNATURE option in its initial SYN, we have * to record the fact that the option was observed * here for the syncache code to perform the correct * response. */ if (optlen != TCPOLEN_SIGNATURE) continue; to->to_flags |= TOF_SIGNATURE; to->to_signature = cp + 2; break; case TCPOPT_SACK_PERMITTED: if (optlen != TCPOLEN_SACK_PERMITTED) continue; if (!(flags & TO_SYN)) continue; if (!V_tcp_do_sack) continue; to->to_flags |= TOF_SACKPERM; break; case TCPOPT_SACK: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) continue; if (flags & TO_SYN) continue; to->to_flags |= TOF_SACK; to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; to->to_sacks = cp + 2; TCPSTAT_INC(tcps_sack_rcv_blocks); break; case TCPOPT_FAST_OPEN: /* * Cookie length validation is performed by the * server side cookie checking code or the client * side cookie cache update code. */ if (!(flags & TO_SYN)) continue; if (!V_tcp_fastopen_client_enable && !V_tcp_fastopen_server_enable) continue; to->to_flags |= TOF_FASTOPEN; to->to_tfo_len = optlen - 2; to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL; break; default: continue; } } } /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. * It is still reflected in the segment length for * sequencing purposes. */ void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { int cnt = off + th->th_urp - 1; while (cnt >= 0) { if (m->m_len > cnt) { char *cp = mtod(m, caddr_t) + cnt; struct tcpcb *tp = sototcpcb(so); INP_WLOCK_ASSERT(tptoinpcb(tp)); tp->t_iobc = *cp; tp->t_oobflags |= TCPOOB_HAVEDATA; bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); m->m_len--; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len--; return; } cnt -= m->m_len; m = m->m_next; if (m == NULL) break; } panic("tcp_pulloutofband"); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ void tcp_xmit_timer(struct tcpcb *tp, int rtt) { int delta; INP_WLOCK_ASSERT(tptoinpcb(tp)); TCPSTAT_INC(tcps_rttupdated); if (tp->t_rttupdated < UCHAR_MAX) tp->t_rttupdated++; #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt * 1000 / hz)); #endif if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed * point). Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit * timer to smoothed rtt + 4 times the smoothed variance. * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); if ((tp->t_rttvar += delta) <= 0) tp->t_rttvar = 1; } else { /* * No rtt measurement yet - use the unsmoothed rtt. * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); } tp->t_rtttime = 0; tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. * Because of the way we do the smoothing, srtt and rttvar * will each average +1/2 tick of bias. When we compute * the retransmit timer, we want 1/2 tick of rounding and * 1 extra tick because of +-1/2 tick uncertainty in the * firing of the timer. The bias will give us exactly the * 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); /* * We received an ack for a packet that wasn't retransmitted; * it is probably safe to discard any error indications we've * received recently. This isn't quite right, but close enough * for now (a route might have failed after we sent a segment, * and the return path might not be symmetrical). */ tp->t_softerror = 0; } /* * Determine a reasonable value for maxseg size. * If the route is known, check route for mtu. * If none, use an mss that can be handled on the outgoing interface * without forcing IP to fragment. If no route is found, route has no mtu, * or the destination isn't local, use a default, hopefully conservative * size (usually 512 or the default IP max size, but no more than the mtu * of the interface), as we can't discover anything about intervening * gateways or networks. We also initialize the congestion/slow start * window to be a single segment if the destination isn't local. * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * * NOTE that resulting t_maxseg doesn't include space for TCP options or * IP options, e.g. IPSEC data, since length of this data may vary, and * thus it is calculated for every segment separately in tcp_output(). * * NOTE that this routine is only called when we process an incoming * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS * settings are handled in tcp_mssopt(). */ void tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap) { int mss = 0; uint32_t maxmtu = 0; struct inpcb *inp = tptoinpcb(tp); struct hc_metrics_lite metrics; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; size_t min_protoh = isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else size_t min_protoh = sizeof(struct tcpiphdr); #endif INP_WLOCK_ASSERT(inp); if (tp->t_port) min_protoh += V_tcp_udp_tunneling_overhead; if (mtuoffer != -1) { KASSERT(offer == -1, ("%s: conflict", __func__)); offer = mtuoffer - min_protoh; } /* Initialize. */ #ifdef INET6 if (isipv6) { maxmtu = tcp_maxmtu6(&inp->inp_inc, cap); tp->t_maxseg = V_tcp_v6mssdflt; } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { maxmtu = tcp_maxmtu(&inp->inp_inc, cap); tp->t_maxseg = V_tcp_mssdflt; } #endif /* * No route to sender, stay with default mss and return. */ if (maxmtu == 0) { /* * In case we return early we need to initialize metrics * to a defined state as tcp_hc_get() would do for us * if there was no cache hit. */ if (metricptr != NULL) bzero(metricptr, sizeof(struct hc_metrics_lite)); return; } /* What have we got? */ switch (offer) { case 0: /* * Offer == 0 means that there was no MSS on the SYN * segment, in this case we use tcp_mssdflt as * already assigned to t_maxseg above. */ offer = tp->t_maxseg; break; case -1: /* * Offer == -1 means that we didn't receive SYN yet. */ /* FALLTHROUGH */ default: /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, V_tcp_minmss); } if (metricptr == NULL) metricptr = &metrics; tcp_hc_get(&inp->inp_inc, metricptr); /* * If there's a discovered mtu in tcp hostcache, use it. * Else, use the link mtu. */ if (metricptr->hc_mtu) mss = min(metricptr->hc_mtu, maxmtu) - min_protoh; else { #ifdef INET6 if (isipv6) { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, V_tcp_v6mssdflt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { mss = maxmtu - min_protoh; if (!V_path_mtu_discovery && !in_localaddr(inp->inp_faddr)) mss = min(mss, V_tcp_mssdflt); } #endif /* * XXX - The above conditional (mss = maxmtu - min_protoh) * probably violates the TCP spec. * The problem is that, since we don't know the * other end's MSS, we are supposed to use a conservative * default. But, if we do that, then MTU discovery will * never actually take place, because the conservative * default is much less than the MTUs typically seen * on the Internet today. For the moment, we'll sweep * this under the carpet. * * The conservative default might not actually be a problem * if the only case this occurs is when sending an initial * SYN with options and data to a host we've never talked * to before. Then, they will reply with an MSS value which * will get recorded and the new parameters should get * recomputed. For Further Study. */ } mss = min(mss, offer); /* * Sanity check: make sure that maxseg will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. * * XXXGL: shouldn't we reserve space for IP/IPv6 options? */ mss = max(mss, 64); tp->t_maxseg = mss; if (tp->t_maxseg < V_tcp_mssdflt) { /* * The MSS is so small we should not process incoming * SACK's since we are subject to attack in such a * case. */ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; } else { tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; } } void tcp_mss(struct tcpcb *tp, int offer) { int mss; uint32_t bufsize; struct inpcb *inp = tptoinpcb(tp); struct socket *so; struct hc_metrics_lite metrics; struct tcp_ifcap cap; KASSERT(tp != NULL, ("%s: tp == NULL", __func__)); bzero(&cap, sizeof(cap)); tcp_mss_update(tp, offer, -1, &metrics, &cap); mss = tp->t_maxseg; /* * If there's a pipesize, change the socket buffer to that size, * don't change if sb_hiwat is different than default (then it * has been changed on purpose with setsockopt). * Make the socket buffers an integral number of mss units; * if the mss is larger than the socket buffer, decrease the mss. */ so = inp->inp_socket; SOCK_SENDBUF_LOCK(so); if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.hc_sendpipe) bufsize = metrics.hc_sendpipe; else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; else { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_snd.sb_hiwat) (void)sbreserve_locked(so, SO_SND, bufsize, NULL); } SOCK_SENDBUF_UNLOCK(so); /* * Sanity check: make sure that maxseg will be large * enough to allow some data on segments even if the * all the option space is used (40bytes). Otherwise * funny things may happen in tcp_output. * * XXXGL: shouldn't we reserve space for IP/IPv6 options? */ tp->t_maxseg = max(mss, 64); if (tp->t_maxseg < V_tcp_mssdflt) { /* * The MSS is so small we should not process incoming * SACK's since we are subject to attack in such a * case. */ tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; } else { tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; } SOCK_RECVBUF_LOCK(so); if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.hc_recvpipe) bufsize = metrics.hc_recvpipe; else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); if (bufsize > sb_max) bufsize = sb_max; if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve_locked(so, SO_RCV, bufsize, NULL); } SOCK_RECVBUF_UNLOCK(so); /* Check the interface for TSO capabilities. */ if (cap.ifcap & CSUM_TSO) { tp->t_flags |= TF_TSO; tp->t_tsomax = cap.tsomax; tp->t_tsomaxsegcount = cap.tsomaxsegcount; tp->t_tsomaxsegsize = cap.tsomaxsegsize; if (cap.ipsec_tso) tp->t_flags2 |= TF2_IPSEC_TSO; } } /* * Determine the MSS option to send on an outgoing SYN. */ int tcp_mssopt(struct in_conninfo *inc) { int mss = 0; uint32_t thcmtu = 0; uint32_t maxmtu = 0; size_t min_protoh; KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { mss = V_tcp_v6mssdflt; maxmtu = tcp_maxmtu6(inc, NULL); min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { mss = V_tcp_mssdflt; maxmtu = tcp_maxmtu(inc, NULL); min_protoh = sizeof(struct tcpiphdr); } #endif #if defined(INET6) || defined(INET) thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ #endif if (maxmtu && thcmtu) mss = min(maxmtu, thcmtu) - min_protoh; else if (maxmtu || thcmtu) mss = max(maxmtu, thcmtu) - min_protoh; return (mss); } void tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, sackstatus_t sack_changed, u_int *maxsegp) { int snd_cnt = 0, limit = 0, del_data = 0, pipe = 0; u_int maxseg; INP_WLOCK_ASSERT(tptoinpcb(tp)); if (*maxsegp == 0) { *maxsegp = tcp_maxseg(tp); } maxseg = *maxsegp; /* * Compute the amount of data that this ACK is indicating * (del_data) and an estimate of how many bytes are in the * network. */ if (tcp_is_sack_recovery(tp, to) || (IN_CONGRECOVERY(tp->t_flags) && !IN_FASTRECOVERY(tp->t_flags))) { del_data = tp->sackhint.delivered_data; if (V_tcp_do_newsack) pipe = tcp_compute_pipe(tp); else pipe = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; } else { if (tp->sackhint.prr_delivered < (tcprexmtthresh * maxseg + tp->snd_recover - tp->snd_una)) { del_data = maxseg; } pipe = imax(0, tp->snd_max - tp->snd_una - imin(INT_MAX / 65536, tp->t_dupacks) * maxseg); } tp->sackhint.prr_delivered += del_data; /* * Proportional Rate Reduction */ if (pipe >= tp->snd_ssthresh) { if (tp->sackhint.recover_fs == 0) tp->sackhint.recover_fs = imax(1, tp->snd_nxt - tp->snd_una); snd_cnt = howmany((long)tp->sackhint.prr_delivered * tp->snd_ssthresh, tp->sackhint.recover_fs) - tp->sackhint.prr_out + maxseg - 1; } else { /* * PRR 6937bis heuristic: * - A partial ack without SACK block beneath snd_recover * indicates further loss. * - An SACK scoreboard update adding a new hole indicates * further loss, so be conservative and send at most one * segment. * - Prevent ACK splitting attacks, by being conservative * when no new data is acked. */ if ((sack_changed == SACK_NEWLOSS) || (del_data == 0)) { limit = tp->sackhint.prr_delivered - tp->sackhint.prr_out; } else { limit = imax(tp->sackhint.prr_delivered - tp->sackhint.prr_out, del_data) + maxseg; } snd_cnt = imin((tp->snd_ssthresh - pipe), limit); } snd_cnt = imax(snd_cnt, 0) / maxseg; /* * Send snd_cnt new data into the network in response to this ack. * If there is going to be a SACK retransmission, adjust snd_cwnd * accordingly. */ if (IN_FASTRECOVERY(tp->t_flags)) { if (tcp_is_sack_recovery(tp, to)) { tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg); } else { tp->snd_cwnd = (tp->snd_max - tp->snd_una) + (snd_cnt * maxseg); } } else if (IN_CONGRECOVERY(tp->t_flags)) { tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg); } tp->snd_cwnd = imax(maxseg, tp->snd_cwnd); } /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. */ void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; uint32_t ocwnd = tp->snd_cwnd; u_int maxseg = tcp_maxseg(tp); INP_WLOCK_ASSERT(tptoinpcb(tp)); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (IN_FASTRECOVERY(tp->t_flags)) { tp->snd_nxt = th->th_ack; /* * Set snd_cwnd to one segment beyond acknowledged offset. * (tp->snd_una has not yet been updated when this function is called.) */ tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; } /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. */ if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th)) tp->snd_cwnd -= BYTES_THIS_ACK(tp, th); else tp->snd_cwnd = 0; tp->snd_cwnd += maxseg; } int tcp_compute_pipe(struct tcpcb *tp) { if (tp->t_fb->tfb_compute_pipe == NULL) { return (tp->snd_max - tp->snd_una + tp->sackhint.sack_bytes_rexmit - tp->sackhint.sacked_bytes - tp->sackhint.lost_bytes); } else { return((*tp->t_fb->tfb_compute_pipe)(tp)); } } uint32_t tcp_compute_initwnd(uint32_t maxseg) { /* * Calculate the Initial Window, also used as Restart Window * * RFC5681 Section 3.1 specifies the default conservative values. * RFC3390 specifies slightly more aggressive values. * RFC6928 increases it to ten segments. * Support for user specified value for initial flight size. */ if (V_tcp_initcwnd_segments) return min(V_tcp_initcwnd_segments * maxseg, max(2 * maxseg, V_tcp_initcwnd_segments * 1460)); else if (V_tcp_do_rfc3390) return min(4 * maxseg, max(2 * maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ if (maxseg > 2190) return (2 * maxseg); else if (maxseg > 1095) return (3 * maxseg); else return (4 * maxseg); } } diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 8278efcae60c..6c855e7a756b 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -1,1796 +1,1796 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2008 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2014 Kevin Lo * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include /* * UDP and UDP-Lite protocols implementation. * Per RFC 768, August, 1980. * Per RFC 3828, July, 2004. */ VNET_DEFINE(int, udp_bind_all_fibs) = 1; SYSCTL_INT(_net_inet_udp, OID_AUTO, bind_all_fibs, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(udp_bind_all_fibs), 0, "Bound sockets receive traffic from all FIBs"); /* * BSD 4.2 defaulted the udp checksum to be off. Turning off udp checksums * removes the only data integrity mechanism for packets and malformed * packets that would otherwise be discarded due to bad checksums, and may * cause problems (especially for NFS data blocks). */ VNET_DEFINE(int, udp_cksum) = 1; SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_cksum), 0, "compute udp checksum"); VNET_DEFINE(int, udp_log_in_vain) = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_log_in_vain), 0, "Log all incoming UDP packets"); VNET_DEFINE(int, udp_blackhole) = 0; SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_blackhole), 0, "Do not send port unreachables for refused connects"); VNET_DEFINE(bool, udp_blackhole_local) = false; SYSCTL_BOOL(_net_inet_udp, OID_AUTO, blackhole_local, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(udp_blackhole_local), false, "Enforce net.inet.udp.blackhole for locally originated packets"); u_long udp_sendspace = 9216; /* really max datagram size */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW, &udp_sendspace, 0, "Maximum outgoing UDP datagram size"); u_long udp_recvspace = 40 * (1024 + #ifdef INET6 sizeof(struct sockaddr_in6) #else sizeof(struct sockaddr_in) #endif ); /* 40 1K datagrams */ SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); VNET_DEFINE(struct inpcbinfo, udbinfo); VNET_DEFINE(struct inpcbinfo, ulitecbinfo); #ifndef UDBHASHSIZE #define UDBHASHSIZE 128 #endif VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat); /* from udp_var.h */ VNET_PCPUSTAT_SYSINIT(udpstat); SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(udpstat); #endif /* VIMAGE */ #ifdef INET static void udp_detach(struct socket *so); #endif INPCBSTORAGE_DEFINE(udpcbstor, udpcb, "udpinp", "udp_inpcb", "udp", "udphash"); INPCBSTORAGE_DEFINE(udplitecbstor, udpcb, "udpliteinp", "udplite_inpcb", "udplite", "udplitehash"); static void udp_vnet_init(void *arg __unused) { /* * For now default to 2-tuple UDP hashing - until the fragment * reassembly code can also update the flowid. * * Once we can calculate the flowid that way and re-establish * a 4-tuple, flip this to 4-tuple. */ in_pcbinfo_init(&V_udbinfo, &udpcbstor, UDBHASHSIZE, UDBHASHSIZE); /* Additional pcbinfo for UDP-Lite */ in_pcbinfo_init(&V_ulitecbinfo, &udplitecbstor, UDBHASHSIZE, UDBHASHSIZE); } VNET_SYSINIT(udp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_vnet_init, NULL); /* * Kernel module interface for updating udpstat. The argument is an index * into udpstat treated as an array of u_long. While this encodes the * general layout of udpstat into the caller, it doesn't encode its location, * so that future changes to add, for example, per-CPU stats support won't * cause binary compatibility problems for kernel modules. */ void kmod_udpstat_inc(int statnum) { counter_u64_add(VNET(udpstat)[statnum], 1); } #ifdef VIMAGE static void udp_destroy(void *unused __unused) { in_pcbinfo_destroy(&V_udbinfo); in_pcbinfo_destroy(&V_ulitecbinfo); } VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL); #endif #ifdef INET /* * Subroutine of udp_input(), which appends the provided mbuf chain to the * passed pcb/socket. The caller must provide a sockaddr_in via udp_in that * contains the source address. If the socket ends up being an IPv6 socket, * udp_append() will convert to a sockaddr_in6 before passing the address * into the socket code. * * In the normal case udp_append() will return 0, indicating that you * must unlock the inp. However if a tunneling protocol is in place we increment * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we * then decrement the reference count. If the inp_rele returns 1, indicating the * inp is gone, we return that to the caller to tell them *not* to unlock * the inp. In the case of multi-cast this will cause the distribution * to stop (though most tunneling protocols known currently do *not* use * multicast). */ static int udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off, struct sockaddr_in *udp_in) { struct sockaddr *append_sa; struct socket *so; struct mbuf *tmpopts, *opts = NULL; #ifdef INET6 struct sockaddr_in6 udp_in6; #endif struct udpcb *up; bool filtered; INP_LOCK_ASSERT(inp); /* * Engage the tunneling protocol. */ up = intoudpcb(inp); if (up->u_tun_func != NULL) { in_pcbref(inp); INP_RUNLOCK(inp); filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0], up->u_tun_ctx); INP_RLOCK(inp); if (filtered) return (in_pcbrele_rlocked(inp)); } off += sizeof(struct udphdr); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* Check AH/ESP integrity. */ if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, n, inp) != 0) { m_freem(n); return (0); } if (up->u_flags & UF_ESPINUDP) {/* IPSec UDP encaps. */ if (IPSEC_ENABLED(ipv4) && UDPENCAP_INPUT(ipv4, n, off, AF_INET) != 0) return (0); /* Consumed. */ } #endif /* IPSEC */ #ifdef MAC if (mac_inpcb_check_deliver(inp, n) != 0) { m_freem(n); return (0); } #endif /* MAC */ if (inp->inp_flags & INP_CONTROLOPTS || inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6) (void)ip6_savecontrol_v4(inp, n, &opts, NULL); else #endif /* INET6 */ ip_savecontrol(inp, &opts, ip, n); } if ((inp->inp_vflag & INP_IPV4) && (inp->inp_flags2 & INP_ORIGDSTADDR)) { tmpopts = sbcreatecontrol(&udp_in[1], sizeof(struct sockaddr_in), IP_ORIGDSTADDR, IPPROTO_IP, M_NOWAIT); if (tmpopts) { if (opts) { tmpopts->m_next = opts; opts = tmpopts; } else opts = tmpopts; } } #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { bzero(&udp_in6, sizeof(udp_in6)); udp_in6.sin6_len = sizeof(udp_in6); udp_in6.sin6_family = AF_INET6; in6_sin_2_v4mapsin6(&udp_in[0], &udp_in6); append_sa = (struct sockaddr *)&udp_in6; } else #endif /* INET6 */ append_sa = (struct sockaddr *)&udp_in[0]; m_adj(n, off); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_rcv); if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) { soroverflow_locked(so); m_freem(n); if (opts) m_freem(opts); UDPSTAT_INC(udps_fullsock); } else sorwakeup_locked(so); return (0); } static bool udp_multi_match(const struct inpcb *inp, void *v) { struct ip *ip = v; struct udphdr *uh = (struct udphdr *)(ip + 1); if (inp->inp_lport != uh->uh_dport) return (false); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) return (false); #endif if (inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) return (false); if (inp->inp_faddr.s_addr != INADDR_ANY && inp->inp_faddr.s_addr != ip->ip_src.s_addr) return (false); if (inp->inp_fport != 0 && inp->inp_fport != uh->uh_sport) return (false); return (true); } static int udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in) { struct ip *ip = mtod(m, struct ip *); struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto), INPLOOKUP_RLOCKPCB, udp_multi_match, ip); #ifdef KDTRACE_HOOKS struct udphdr *uh = (struct udphdr *)(ip + 1); #endif struct inpcb *inp; struct mbuf *n; int appends = 0, fib; MPASS(ip->ip_hl == sizeof(struct ip) >> 2); fib = M_GETFIB(m); while ((inp = inp_next(&inpi)) != NULL) { /* * XXXRW: Because we weren't holding either the inpcb * or the hash lock when we checked for a match * before, we should probably recheck now that the * inpcb lock is held. */ if (V_udp_bind_all_fibs == 0 && fib != inp->inp_inc.inc_fibnum) /* * Sockets bound to a specific FIB can only receive * packets from that FIB. */ continue; /* * Handle socket delivery policy for any-source * and source-specific multicast. [RFC3678] */ if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { struct ip_moptions *imo; struct sockaddr_in group; int blocked; imo = inp->inp_moptions; if (imo == NULL) continue; bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; group.sin_addr = ip->ip_dst; blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif, (struct sockaddr *)&group, (struct sockaddr *)&udp_in[0]); if (blocked != MCAST_PASS) { if (blocked == MCAST_NOTGMEMBER) IPSTAT_INC(ips_notmember); if (blocked == MCAST_NOTSMEMBER || blocked == MCAST_MUTED) UDPSTAT_INC(udps_filtermcast); continue; } } if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { if (proto == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh); else UDP_PROBE(receive, NULL, inp, ip, inp, uh); if (udp_append(inp, ip, n, sizeof(struct ip), udp_in)) { break; } else appends++; } /* * Don't look for additional matches if this one does * not have either the SO_REUSEPORT or SO_REUSEADDR * socket options set. This heuristic avoids * searching through all pcbs in the common case of a * non-shared port. It assumes that an application * will never clear these options after setting them. */ if ((inp->inp_socket->so_options & (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) { INP_RUNLOCK(inp); break; } } if (appends == 0) { /* * No matching pcb found; discard datagram. (No need * to send an ICMP Port Unreachable for a broadcast * or multicast datgram.) */ UDPSTAT_INC(udps_noport); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) UDPSTAT_INC(udps_noportmcast); else UDPSTAT_INC(udps_noportbcast); } m_freem(m); return (IPPROTO_DONE); } static int udp_input(struct mbuf **mp, int *offp, int proto) { struct ip *ip; struct udphdr *uh; struct ifnet *ifp; struct inpcb *inp; uint16_t len, ip_len; struct inpcbinfo *pcbinfo; struct sockaddr_in udp_in[2]; struct mbuf *m; struct m_tag *fwd_tag; int cscov_partial, iphlen, lookupflags; m = *mp; iphlen = *offp; ifp = m->m_pkthdr.rcvif; *mp = NULL; UDPSTAT_INC(udps_ipackets); /* * Strip IP options, if any; should skip this, make available to * user, and use on returned packets, but we don't yet have a way to * check the checksum with options still present. */ if (iphlen > sizeof (struct ip)) { ip_stripoptions(m); iphlen = sizeof(struct ip); } /* * Get IP and UDP header together in first mbuf. */ if (m->m_len < iphlen + sizeof(struct udphdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) { UDPSTAT_INC(udps_hdrops); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)ip + iphlen); cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0; /* * Destination port of 0 is illegal, based on RFC768. */ if (uh->uh_dport == 0) goto badunlocked; /* * Construct sockaddr format source address. Stuff source address * and datagram in user buffer. */ bzero(&udp_in[0], sizeof(struct sockaddr_in) * 2); udp_in[0].sin_len = sizeof(struct sockaddr_in); udp_in[0].sin_family = AF_INET; udp_in[0].sin_port = uh->uh_sport; udp_in[0].sin_addr = ip->ip_src; udp_in[1].sin_len = sizeof(struct sockaddr_in); udp_in[1].sin_family = AF_INET; udp_in[1].sin_port = uh->uh_dport; udp_in[1].sin_addr = ip->ip_dst; /* * Make mbuf data length reflect UDP length. If not enough data to * reflect UDP length, drop. */ len = ntohs((u_short)uh->uh_ulen); ip_len = ntohs(ip->ip_len) - iphlen; if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) { /* Zero means checksum over the complete packet. */ if (len == 0) len = ip_len; cscov_partial = 0; } if (ip_len != len) { if (len > ip_len || len < sizeof(struct udphdr)) { UDPSTAT_INC(udps_badlen); goto badunlocked; } if (proto == IPPROTO_UDP) m_adj(m, len - ip_len); } /* * Checksum extended UDP header and data. */ if (uh->uh_sum) { u_short uh_sum; if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) && !cscov_partial) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) uh_sum = m->m_pkthdr.csum_data; else uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + proto)); uh_sum ^= 0xffff; } else { char b[offsetof(struct ipovly, ih_src)]; struct ipovly *ipov = (struct ipovly *)ip; memcpy(b, ipov, sizeof(b)); bzero(ipov, sizeof(ipov->ih_x1)); ipov->ih_len = (proto == IPPROTO_UDP) ? uh->uh_ulen : htons(ip_len); uh_sum = in_cksum(m, len + sizeof (struct ip)); memcpy(ipov, b, sizeof(b)); } if (uh_sum) { UDPSTAT_INC(udps_badsum); m_freem(m); return (IPPROTO_DONE); } } else { if (proto == IPPROTO_UDP) { UDPSTAT_INC(udps_nosum); } else { /* UDPLite requires a checksum */ /* XXX: What is the right UDPLite MIB counter here? */ m_freem(m); return (IPPROTO_DONE); } } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || - in_broadcast(ip->ip_dst, ifp)) + in_ifnet_broadcast(ip->ip_dst, ifp)) return (udp_multi_input(m, proto, udp_in)); pcbinfo = udp_get_inpcbinfo(proto); /* * Locate pcb for datagram. */ lookupflags = INPLOOKUP_RLOCKPCB | (V_udp_bind_all_fibs ? 0 : INPLOOKUP_FIB); /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { struct sockaddr_in *next_hop; next_hop = (struct sockaddr_in *)(fwd_tag + 1); /* * Transparently forwarded. Pretend to be the destination. * Already got one like this? */ inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, lookupflags, ifp, m); if (!inp) { /* * It's new. Try to find the ambushing socket. * Because we've rewritten the destination address, * any hardware-generated hash is ignored. */ inp = in_pcblookup(pcbinfo, ip->ip_src, uh->uh_sport, next_hop->sin_addr, next_hop->sin_port ? htons(next_hop->sin_port) : uh->uh_dport, INPLOOKUP_WILDCARD | lookupflags, ifp); } /* Remove the tag from the packet. We don't need it anymore. */ m_tag_delete(m, fwd_tag); m->m_flags &= ~M_IP_NEXTHOP; } else inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport, ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD | lookupflags, ifp, m); if (inp == NULL) { if (V_udp_log_in_vain) { char src[INET_ADDRSTRLEN]; char dst[INET_ADDRSTRLEN]; log(LOG_INFO, "Connection attempt to UDP %s:%d from %s:%d\n", inet_ntoa_r(ip->ip_dst, dst), ntohs(uh->uh_dport), inet_ntoa_r(ip->ip_src, src), ntohs(uh->uh_sport)); } if (proto == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, NULL, ip, NULL, uh); else UDP_PROBE(receive, NULL, NULL, ip, NULL, uh); UDPSTAT_INC(udps_noport); if (m->m_flags & (M_BCAST | M_MCAST)) { UDPSTAT_INC(udps_noportbcast); goto badunlocked; } if (V_udp_blackhole && (V_udp_blackhole_local || !in_localip(ip->ip_src))) goto badunlocked; if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0) goto badunlocked; icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0); return (IPPROTO_DONE); } /* * Check the minimum TTL for socket. */ INP_RLOCK_ASSERT(inp); if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) { if (proto == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh); else UDP_PROBE(receive, NULL, inp, ip, inp, uh); INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } if (cscov_partial) { struct udpcb *up; up = intoudpcb(inp); if (up->u_rxcslen == 0 || up->u_rxcslen > len) { INP_RUNLOCK(inp); m_freem(m); return (IPPROTO_DONE); } } if (proto == IPPROTO_UDPLITE) UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh); else UDP_PROBE(receive, NULL, inp, ip, inp, uh); if (udp_append(inp, ip, m, iphlen, udp_in) == 0) INP_RUNLOCK(inp); return (IPPROTO_DONE); badunlocked: m_freem(m); return (IPPROTO_DONE); } #endif /* INET */ /* * Notify a udp user of an asynchronous error; just wake up so that they can * collect error status. */ struct inpcb * udp_notify(struct inpcb *inp, int errno) { INP_WLOCK_ASSERT(inp); if ((errno == EHOSTUNREACH || errno == ENETUNREACH || errno == EHOSTDOWN) && inp->inp_route.ro_nh) { NH_FREE(inp->inp_route.ro_nh); inp->inp_route.ro_nh = (struct nhop_object *)NULL; } inp->inp_socket->so_error = errno; sorwakeup(inp->inp_socket); sowwakeup(inp->inp_socket); return (inp); } #ifdef INET static void udp_common_ctlinput(struct icmp *icmp, struct inpcbinfo *pcbinfo) { struct ip *ip = &icmp->icmp_ip; struct udphdr *uh; struct inpcb *inp; if (icmp_errmap(icmp) == 0) return; uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); inp = in_pcblookup(pcbinfo, ip->ip_dst, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { INP_WLOCK_ASSERT(inp); if (inp->inp_socket != NULL) udp_notify(inp, icmp_errmap(icmp)); INP_WUNLOCK(inp); } else { inp = in_pcblookup(pcbinfo, ip->ip_dst, uh->uh_dport, ip->ip_src, uh->uh_sport, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { struct udpcb *up; udp_tun_icmp_t *func; up = intoudpcb(inp); func = up->u_icmp_func; INP_RUNLOCK(inp); if (func != NULL) func(icmp); } } } static void udp_ctlinput(struct icmp *icmp) { return (udp_common_ctlinput(icmp, &V_udbinfo)); } static void udplite_ctlinput(struct icmp *icmp) { return (udp_common_ctlinput(icmp, &V_ulitecbinfo)); } #endif /* INET */ static int udp_pcblist(SYSCTL_HANDLER_ARGS) { struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_udbinfo, INPLOOKUP_RLOCKPCB); struct xinpgen xig; struct inpcb *inp; int error; if (req->newptr != 0) return (EPERM); if (req->oldptr == 0) { int n; n = V_udbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; xig.xig_count = V_udbinfo.ipi_count; xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); error = SYSCTL_OUT(req, &xi, sizeof xi); if (error) { INP_RUNLOCK(inp); break; } } } if (!error) { /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_udbinfo.ipi_count; error = SYSCTL_OUT(req, &xig, sizeof xig); } return (error); } SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); #ifdef INET static int udp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct epoch_tracker et; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); NET_EPOCH_ENTER(et); inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) error = ENOENT; if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE, 0, 0, udp_getcred, "S,xucred", "Get the xucred of a UDP connection"); #endif /* INET */ int udp_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp; struct udpcb *up; int isudplite, error, optval; error = 0; isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); if (sopt->sopt_level != so->so_proto->pr_protocol) { #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) { INP_WUNLOCK(inp); error = ip6_ctloutput(so, sopt); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { INP_WUNLOCK(inp); error = ip_ctloutput(so, sopt); } #endif return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) #if defined(INET) || defined(INET6) case UDP_ENCAP: #ifdef INET if (INP_SOCKAF(so) == AF_INET) { if (!IPSEC_ENABLED(ipv4)) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = UDPENCAP_PCBCTL(ipv4, inp, sopt); break; } #endif /* INET */ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { if (!IPSEC_ENABLED(ipv6)) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = UDPENCAP_PCBCTL(ipv6, inp, sopt); break; } #endif /* INET6 */ INP_WUNLOCK(inp); return (EINVAL); #endif /* INET || INET6 */ #endif /* IPSEC */ case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error != 0) break; inp = sotoinpcb(so); KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if ((optval != 0 && optval < 8) || (optval > 65535)) { INP_WUNLOCK(inp); error = EINVAL; break; } if (sopt->sopt_name == UDPLITE_SEND_CSCOV) up->u_txcslen = optval; else up->u_rxcslen = optval; INP_WUNLOCK(inp); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) #if defined(INET) || defined(INET6) case UDP_ENCAP: #ifdef INET if (INP_SOCKAF(so) == AF_INET) { if (!IPSEC_ENABLED(ipv4)) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = UDPENCAP_PCBCTL(ipv4, inp, sopt); break; } #endif /* INET */ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { if (!IPSEC_ENABLED(ipv6)) { INP_WUNLOCK(inp); return (ENOPROTOOPT); } error = UDPENCAP_PCBCTL(ipv6, inp, sopt); break; } #endif /* INET6 */ INP_WUNLOCK(inp); return (EINVAL); #endif /* INET || INET6 */ #endif /* IPSEC */ case UDPLITE_SEND_CSCOV: case UDPLITE_RECV_CSCOV: if (!isudplite) { INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); if (sopt->sopt_name == UDPLITE_SEND_CSCOV) optval = up->u_txcslen; else optval = up->u_rxcslen; INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: INP_WUNLOCK(inp); error = ENOPROTOOPT; break; } break; } return (error); } #ifdef INET #ifdef INET6 /* The logic here is derived from ip6_setpktopt(). See comments there. */ static int udp_v4mapped_pktinfo(struct cmsghdr *cm, struct sockaddr_in * src, struct inpcb *inp, int flags) { struct ifnet *ifp; struct in6_pktinfo *pktinfo; struct in_addr ia; if ((flags & PRUS_IPV6) == 0) return (0); if (cm->cmsg_level != IPPROTO_IPV6) return (0); if (cm->cmsg_type != IPV6_2292PKTINFO && cm->cmsg_type != IPV6_PKTINFO) return (0); if (cm->cmsg_len != CMSG_LEN(sizeof(struct in6_pktinfo))) return (EINVAL); pktinfo = (struct in6_pktinfo *)CMSG_DATA(cm); if (!IN6_IS_ADDR_V4MAPPED(&pktinfo->ipi6_addr) && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) return (EINVAL); /* Validate the interface index if specified. */ if (pktinfo->ipi6_ifindex) { struct epoch_tracker et; NET_EPOCH_ENTER(et); ifp = ifnet_byindex(pktinfo->ipi6_ifindex); NET_EPOCH_EXIT(et); /* XXXGL: unsafe ifp */ if (ifp == NULL) return (ENXIO); } else ifp = NULL; if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ia.s_addr = pktinfo->ipi6_addr.s6_addr32[3]; if (in_ifhasaddr(ifp, ia) == 0) return (EADDRNOTAVAIL); } bzero(src, sizeof(*src)); src->sin_family = AF_INET; src->sin_len = sizeof(*src); src->sin_port = inp->inp_lport; src->sin_addr.s_addr = pktinfo->ipi6_addr.s6_addr32[3]; return (0); } #endif /* INET6 */ int udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct inpcb *inp; struct udpiphdr *ui; int len, error = 0; struct in_addr faddr, laddr; struct cmsghdr *cm; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin, src; struct epoch_tracker et; int cscov_partial = 0; int ipflags = 0; u_short fport, lport; u_char tos, vflagsav; uint8_t pr; uint16_t cscov = 0; uint32_t flowid = 0; uint8_t flowtype = M_HASHTYPE_NONE; bool use_cached_route; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_send: inp == NULL")); if (addr != NULL) { if (addr->sa_family != AF_INET) error = EAFNOSUPPORT; else if (addr->sa_len != sizeof(struct sockaddr_in)) error = EINVAL; if (__predict_false(error != 0)) { m_freem(control); m_freem(m); return (error); } } len = m->m_pkthdr.len; if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); m_freem(m); return (EMSGSIZE); } src.sin_family = 0; sin = (struct sockaddr_in *)addr; /* * udp_send() may need to temporarily bind or connect the current * inpcb. As such, we don't know up front whether we will need the * pcbinfo lock or not. Do any work to decide what is needed up * front before acquiring any locks. * * We will need network epoch in either case, to safely lookup into * pcb hash. */ use_cached_route = sin == NULL || (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0); if (use_cached_route || (flags & PRUS_IPV6) != 0) INP_WLOCK(inp); else INP_RLOCK(inp); NET_EPOCH_ENTER(et); tos = inp->inp_ip_tos; if (control != NULL) { /* * XXX: Currently, we assume all the optional information is * stored in a single mbuf. */ if (control->m_next) { m_freem(control); error = EINVAL; goto release; } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { cm = mtod(control, struct cmsghdr *); if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) { error = EINVAL; break; } #ifdef INET6 error = udp_v4mapped_pktinfo(cm, &src, inp, flags); if (error != 0) break; #endif if (cm->cmsg_level != IPPROTO_IP) continue; switch (cm->cmsg_type) { case IP_SENDSRCADDR: if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr))) { error = EINVAL; break; } bzero(&src, sizeof(src)); src.sin_family = AF_INET; src.sin_len = sizeof(src); src.sin_port = inp->inp_lport; src.sin_addr = *(struct in_addr *)CMSG_DATA(cm); break; case IP_TOS: if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) { error = EINVAL; break; } tos = *(u_char *)CMSG_DATA(cm); break; case IP_FLOWID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowid = *(uint32_t *) CMSG_DATA(cm); break; case IP_FLOWTYPE: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } flowtype = *(uint32_t *) CMSG_DATA(cm); break; #ifdef RSS case IP_RSSBUCKETID: if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { error = EINVAL; break; } /* This is just a placeholder for now */ break; #endif /* RSS */ default: error = ENOPROTOOPT; break; } if (error) break; } m_freem(control); control = NULL; } if (error) goto release; pr = inp->inp_socket->so_proto->pr_protocol; pcbinfo = udp_get_inpcbinfo(pr); /* * If the IP_SENDSRCADDR control message was specified, override the * source address for this datagram. Its use is invalidated if the * address thus specified is incomplete or clobbers other inpcbs. */ laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } if ((flags & PRUS_IPV6) != 0) { vflagsav = inp->inp_vflag; inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; } INP_HASH_WLOCK(pcbinfo); error = in_pcbbind_setup(inp, &src, &laddr.s_addr, &lport, V_udp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); if ((flags & PRUS_IPV6) != 0) inp->inp_vflag = vflagsav; if (error) goto release; } /* * If a UDP socket has been connected, then a local address/port will * have been selected and bound. * * If a UDP socket has not been connected to, then an explicit * destination address must be used, in which case a local * address/port may not have been selected and bound. */ if (sin != NULL) { INP_LOCK_ASSERT(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { error = EISCONN; goto release; } /* * Jail may rewrite the destination address, so let it do * that before we use it. */ error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error) goto release; /* * If a local address or port hasn't yet been selected, or if * the destination address needs to be rewritten due to using * a special INADDR_ constant, invoke in_pcbconnect_setup() * to do the heavy lifting. Once a port is selected, we * commit the binding back to the socket; we also commit the * binding of the address if in jail. * * If we already have a valid binding and we're not * requesting a destination address rewrite, use a fast path. */ if (inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { if ((flags & PRUS_IPV6) != 0) { vflagsav = inp->inp_vflag; inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; } INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect_setup(inp, sin, &laddr.s_addr, &lport, &faddr.s_addr, &fport, td->td_ucred); if ((flags & PRUS_IPV6) != 0) inp->inp_vflag = vflagsav; if (error) { INP_HASH_WUNLOCK(pcbinfo); goto release; } /* * XXXRW: Why not commit the port if the address is * !INADDR_ANY? */ /* Commit the local port if newly assigned. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_WLOCK_ASSERT(inp); /* * Remember addr if jailed, to prevent * rebinding. */ if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; error = in_pcbinshash(inp); INP_HASH_WUNLOCK(pcbinfo); if (error != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; } inp->inp_flags |= INP_ANONPORT; } else INP_HASH_WUNLOCK(pcbinfo); } else { faddr = sin->sin_addr; fport = sin->sin_port; } } else { INP_LOCK_ASSERT(inp); faddr = inp->inp_faddr; fport = inp->inp_fport; if (faddr.s_addr == INADDR_ANY) { error = ENOTCONN; goto release; } } /* * Calculate data length and get a mbuf for UDP, IP, and possible * link-layer headers. Immediate slide the data pointer back forward * since we won't use that space at this layer. */ M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto release; } m->m_data += max_linkhdr; m->m_len -= max_linkhdr; m->m_pkthdr.len -= max_linkhdr; /* * Fill in mbuf with extended UDP header and addresses and length put * into network format. */ ui = mtod(m, struct udpiphdr *); /* * Filling only those fields of udpiphdr that participate in the * checksum calculation. The rest must be zeroed and will be filled * later. */ bzero(ui->ui_x1, sizeof(ui->ui_x1)); ui->ui_pr = pr; ui->ui_src = laddr; ui->ui_dst = faddr; ui->ui_sport = lport; ui->ui_dport = fport; ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr)); if (pr == IPPROTO_UDPLITE) { struct udpcb *up; uint16_t plen; up = intoudpcb(inp); cscov = up->u_txcslen; plen = (u_short)len + sizeof(struct udphdr); if (cscov >= plen) cscov = 0; ui->ui_len = htons(plen); ui->ui_ulen = htons(cscov); /* * For UDP-Lite, checksum coverage length of zero means * the entire UDPLite packet is covered by the checksum. */ cscov_partial = (cscov == 0) ? 0 : 1; } if (inp->inp_socket->so_options & SO_DONTROUTE) ipflags |= IP_ROUTETOIF; if (inp->inp_socket->so_options & SO_BROADCAST) ipflags |= IP_ALLOWBROADCAST; if (inp->inp_flags & INP_ONESBCAST) ipflags |= IP_SENDONES; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif /* * Set up checksum and output datagram. */ ui->ui_sum = 0; if (pr == IPPROTO_UDPLITE) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; if (cscov_partial) { if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0) ui->ui_sum = 0xffff; } else { if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0) ui->ui_sum = 0xffff; } } else if (V_udp_cksum) { if (inp->inp_flags & INP_ONESBCAST) faddr.s_addr = INADDR_BROADCAST; ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr, htons((u_short)len + sizeof(struct udphdr) + pr)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } /* * After finishing the checksum computation, fill the remaining fields * of udpiphdr. */ ((struct ip *)ui)->ip_v = IPVERSION; ((struct ip *)ui)->ip_tos = tos; ((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len); if (inp->inp_flags & INP_DONTFRAG) ((struct ip *)ui)->ip_off |= htons(IP_DF); ((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl; UDPSTAT_INC(udps_opackets); /* * Setup flowid / RSS information for outbound socket. * * Once the UDP code decides to set a flowid some other way, * this allows the flowid to be overridden by userland. */ if (flowtype != M_HASHTYPE_NONE) { m->m_pkthdr.flowid = flowid; M_HASHTYPE_SET(m, flowtype); } #if defined(ROUTE_MPATH) || defined(RSS) else if (CALC_FLOWID_OUTBOUND_SENDTO) { uint32_t hash_val, hash_type; hash_val = fib4_calc_packet_hash(laddr, faddr, lport, fport, pr, &hash_type); m->m_pkthdr.flowid = hash_val; M_HASHTYPE_SET(m, hash_type); } /* * Don't override with the inp cached flowid value. * * Depending upon the kind of send being done, the inp * flowid/flowtype values may actually not be appropriate * for this particular socket send. * * We should either leave the flowid at zero (which is what is * currently done) or set it to some software generated * hash value based on the packet contents. */ ipflags |= IP_NODEFAULTFLOWID; #endif /* RSS */ if (pr == IPPROTO_UDPLITE) UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); else UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); error = ip_output(m, inp->inp_options, use_cached_route ? &inp->inp_route : NULL, ipflags, inp->inp_moptions, inp); INP_UNLOCK(inp); NET_EPOCH_EXIT(et); return (error); release: INP_UNLOCK(inp); NET_EPOCH_EXIT(et); m_freem(m); return (error); } void udp_abort(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_abort: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_attach(struct socket *so, int proto, struct thread *td) { static uint32_t udp_flowid; struct inpcbinfo *pcbinfo; struct inpcb *inp; struct udpcb *up; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp == NULL, ("udp_attach: inp != NULL")); error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); error = in_pcballoc(so, pcbinfo); if (error) return (error); inp = sotoinpcb(so); inp->inp_ip_ttl = V_ip_defttl; inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1); inp->inp_flowtype = M_HASHTYPE_OPAQUE; up = intoudpcb(inp); bzero(&up->u_start_zero, u_zero_size); INP_WUNLOCK(inp); return (0); } #endif /* INET */ int udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx) { struct inpcb *inp; struct udpcb *up; KASSERT(so->so_type == SOCK_DGRAM, ("udp_set_kernel_tunneling: !dgram")); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL")); INP_WLOCK(inp); up = intoudpcb(inp); if ((f != NULL || i != NULL) && ((up->u_tun_func != NULL) || (up->u_icmp_func != NULL))) { INP_WUNLOCK(inp); return (EBUSY); } up->u_tun_func = f; up->u_icmp_func = i; up->u_tun_ctx = ctx; INP_WUNLOCK(inp); return (0); } #ifdef INET static int udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in *sinp; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_bind: inp == NULL")); sinp = (struct sockaddr_in *)nam; if (nam->sa_family != AF_INET) { /* * Preserve compatibility with old programs. */ if (nam->sa_family != AF_UNSPEC || nam->sa_len < offsetof(struct sockaddr_in, sin_zero) || sinp->sin_addr.s_addr != INADDR_ANY) return (EAFNOSUPPORT); nam->sa_family = AF_INET; } if (nam->sa_len != sizeof(struct sockaddr_in)) return (EINVAL); INP_WLOCK(inp); INP_HASH_WLOCK(pcbinfo); error = in_pcbbind(inp, sinp, V_udp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); INP_WUNLOCK(inp); return (error); } static void udp_close(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_close: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); INP_HASH_WUNLOCK(pcbinfo); soisdisconnected(so); } INP_WUNLOCK(inp); } static int udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct epoch_tracker et; struct inpcb *inp; struct inpcbinfo *pcbinfo; struct sockaddr_in *sin; int error; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_connect: inp == NULL")); sin = (struct sockaddr_in *)nam; if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_len != sizeof(*sin)) return (EINVAL); INP_WLOCK(inp); if (inp->inp_faddr.s_addr != INADDR_ANY) { INP_WUNLOCK(inp); return (EISCONN); } error = prison_remote_ip4(td->td_ucred, &sin->sin_addr); if (error != 0) { INP_WUNLOCK(inp); return (error); } NET_EPOCH_ENTER(et); INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect(inp, sin, td->td_ucred); INP_HASH_WUNLOCK(pcbinfo); NET_EPOCH_EXIT(et); if (error == 0) soisconnected(so); INP_WUNLOCK(inp); return (error); } static void udp_detach(struct socket *so) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); INP_WLOCK(inp); in_pcbfree(inp); } int udp_disconnect(struct socket *so) { struct inpcb *inp; struct inpcbinfo *pcbinfo; pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp_disconnect: inp == NULL")); INP_WLOCK(inp); if (inp->inp_faddr.s_addr == INADDR_ANY) { INP_WUNLOCK(inp); return (ENOTCONN); } INP_HASH_WLOCK(pcbinfo); in_pcbdisconnect(inp); INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; /* XXX */ SOCK_UNLOCK(so); INP_WUNLOCK(inp); return (0); } #endif /* INET */ int udp_shutdown(struct socket *so, enum shutdown_how how) { int error; SOCK_LOCK(so); if (!(so->so_state & SS_ISCONNECTED)) /* * POSIX mandates us to just return ENOTCONN when shutdown(2) is * invoked on a datagram sockets, however historically we would * actually tear socket down. This is known to be leveraged by * some applications to unblock process waiting in recv(2) by * other process that it shares that socket with. Try to meet * both backward-compatibility and POSIX requirements by forcing * ENOTCONN but still flushing buffers and performing wakeup(9). * * XXXGL: it remains unknown what applications expect this * behavior and is this isolated to unix/dgram or inet/dgram or * both. See: D10351, D3039. */ error = ENOTCONN; else error = 0; SOCK_UNLOCK(so); switch (how) { case SHUT_RD: sorflush(so); break; case SHUT_RDWR: sorflush(so); /* FALLTHROUGH */ case SHUT_WR: socantsendmore(so); } return (error); } #ifdef INET #define UDP_PROTOSW \ .pr_type = SOCK_DGRAM, \ .pr_flags = PR_ATOMIC | PR_ADDR | PR_CAPATTACH, \ .pr_ctloutput = udp_ctloutput, \ .pr_abort = udp_abort, \ .pr_attach = udp_attach, \ .pr_bind = udp_bind, \ .pr_connect = udp_connect, \ .pr_control = in_control, \ .pr_detach = udp_detach, \ .pr_disconnect = udp_disconnect, \ .pr_peeraddr = in_getpeeraddr, \ .pr_send = udp_send, \ .pr_soreceive = soreceive_dgram, \ .pr_sosend = sosend_dgram, \ .pr_shutdown = udp_shutdown, \ .pr_sockaddr = in_getsockaddr, \ .pr_sosetlabel = in_pcbsosetlabel, \ .pr_close = udp_close struct protosw udp_protosw = { .pr_protocol = IPPROTO_UDP, UDP_PROTOSW }; struct protosw udplite_protosw = { .pr_protocol = IPPROTO_UDPLITE, UDP_PROTOSW }; static void udp_init(void *arg __unused) { IPPROTO_REGISTER(IPPROTO_UDP, udp_input, udp_ctlinput); IPPROTO_REGISTER(IPPROTO_UDPLITE, udp_input, udplite_ctlinput); } SYSINIT(udp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp_init, NULL); #endif /* INET */