diff --git a/share/man/man4/inet.4 b/share/man/man4/inet.4
index edd14fbc48b9..45da9301f86f 100644
--- a/share/man/man4/inet.4
+++ b/share/man/man4/inet.4
@@ -1,367 +1,369 @@
.\" Copyright (c) 1983, 1991, 1993
.\"	The Regents of the University of California.  All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\"    notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\"    notice, this list of conditions and the following disclaimer in the
.\"    documentation and/or other materials provided with the distribution.
.\" 3. Neither the name of the University nor the names of its contributors
.\"    may be used to endorse or promote products derived from this software
.\"    without specific prior written permission.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\"     From: @(#)inet.4	8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
.Dd September 8, 2022
.Dt INET 4
.Os
.Sh NAME
.Nm inet
.Nd Internet protocol family
.Sh SYNOPSIS
.In sys/types.h
.In netinet/in.h
.Sh DESCRIPTION
The Internet protocol family is a collection of protocols
layered atop the
.Em Internet Protocol
.Pq Tn IP
transport layer, and utilizing the Internet address format.
The Internet family provides protocol support for the
.Dv SOCK_STREAM , SOCK_DGRAM ,
and
.Dv SOCK_RAW
socket types; the
.Dv SOCK_RAW
interface provides access to the
.Tn IP
protocol.
.Sh ADDRESSING
Internet addresses are four byte quantities, stored in network
standard format (on little endian machines, such as the
.Tn alpha ,
.Tn amd64
and
.Tn i386
these are word and byte reversed).
The include file
.In netinet/in.h
defines this address
as a discriminated union.
.Pp
Sockets bound to the Internet protocol family utilize
the following addressing structure,
.Bd -literal -offset indent
struct sockaddr_in {
	uint8_t		sin_len;
	sa_family_t	sin_family;
	in_port_t	sin_port;
	struct in_addr	sin_addr;
	char		sin_zero[8];
};
.Ed
.Pp
Sockets may be created with the local address
.Dv INADDR_ANY
to effect
.Dq wildcard
matching on incoming messages.
The address in a
.Xr connect 2
or
.Xr sendto 2
call may be given as
.Dv INADDR_ANY
to mean
.Dq this host .
The distinguished address
.Dv INADDR_BROADCAST
is allowed as a shorthand for the broadcast address on the primary
network if the first network configured supports broadcast.
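The wildcard matching described above is easiest to see in code. A minimal userland sketch (not part of the patch; port 9999 is an arbitrary stand-in):

    /*
     * Sketch (not part of the patch): binding a UDP socket to the
     * wildcard address.  Port 9999 is an arbitrary choice.
     */
    #include <sys/types.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <string.h>
    #include <err.h>

    int
    main(void)
    {
        struct sockaddr_in sin;
        int s;

        if ((s = socket(AF_INET, SOCK_DGRAM, 0)) == -1)
            err(1, "socket");
        memset(&sin, 0, sizeof(sin));
        sin.sin_len = sizeof(sin);
        sin.sin_family = AF_INET;
        sin.sin_port = htons(9999);
        sin.sin_addr.s_addr = htonl(INADDR_ANY);    /* wildcard match */
        if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1)
            err(1, "bind");
        return (0);
    }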
.Sh PROTOCOLS
The Internet protocol family is comprised of
the
.Tn IP
network protocol, Internet Control Message Protocol
.Pq Tn ICMP ,
Internet Group Management Protocol
.Pq Tn IGMP ,
Transmission Control Protocol
.Pq Tn TCP ,
and User Datagram Protocol
.Pq Tn UDP .
.Tn TCP
is used to support the
.Dv SOCK_STREAM
abstraction while
.Tn UDP
is used to support the
.Dv SOCK_DGRAM
abstraction.
A raw interface to
.Tn IP
is available
by creating an Internet socket of type
.Dv SOCK_RAW .
The
.Tn ICMP
message protocol is accessible from a raw socket.
.Pp
The
.Nm
address on an interface consists of the address itself, the netmask,
and either the broadcast address in case of a broadcast interface or
the peer address in case of a point-to-point interface.
The following
.Xr ioctl 2
commands are provided for a datagram socket in the Internet domain:
.Pp
.Bl -tag -width ".Dv SIOCGIFBRDADDR" -offset indent -compact
.It Dv SIOCAIFADDR
Add address to an interface.
The command requires
.Ft struct in_aliasreq
as argument.
.It Dv SIOCDIFADDR
Delete address from an interface.
The command requires
.Ft struct ifreq
as argument.
.It Dv SIOCGIFADDR
.It Dv SIOCGIFBRDADDR
.It Dv SIOCGIFDSTADDR
.It Dv SIOCGIFNETMASK
Return address information from the interface.
The returned value is in
.Ft struct ifreq .
This way of retrieving address information is obsolete; the preferred
way is the
.Xr getifaddrs 3
API.
.El
.Ss MIB (sysctl) Variables
In addition to the variables supported by the transport protocols in
.Va net.inet
(for which the respective manual pages may be consulted),
there are a number of general variables implemented in the
.Va net.inet.ip
branch of the
.Xr sysctl 3
MIB, which can also be read or modified with
.Xr sysctl 8 .
The following general variables are defined:
.Bl -tag -width ".Va accept_sourceroute"
.It Va accept_sourceroute
Boolean: enable/disable accepting of source-routed IP packets
(default false).
.It Va allow_net0
Boolean: allow experimental use of addresses in 0.0.0.0/8 as endpoints,
and allow forwarding of packets with these addresses.
.It Va allow_net240
Boolean: allow experimental use of addresses in 240.0.0.0/4 as endpoints,
and allow forwarding of packets with these addresses.
.It Va curfrags
Integer: current number of IPv4 fragments across all reassembly queues
in all VNETs (read-only).
.It Va forwarding
Boolean: enable/disable forwarding of IP packets.
Defaults to off.
.It Va fragpackets
Integer: current number of IPv4 fragment reassembly queue entries for
the VNET (read-only).
.It Va fragttl
Integer: time to live for IPv4 packet fragments in the per-VNET
reassembly queue.
.It Va loopback_prefixlen
Integer: prefix length of the address space reserved for loopback
purposes.
The default is 8, meaning that 127.0.0.0/8 is reserved for loopback,
and cannot be sent, received, or forwarded on a non-loopback interface.
Use of other values is experimental.
.It Va maxfragbucketsize
Integer: maximum number of reassembly queues per bucket.
Fragmented packets are hashed to buckets.
Each bucket has a list of reassembly queues.
The system must compare the incoming packets to the existing reassembly
queues in the bucket to find a matching reassembly queue.
To preserve system resources, the system limits the number of reassembly
queues allowed in each bucket.
This limit is recalculated when the number of mbuf clusters is changed or
when the value of
.Va maxfragpackets
changes.
This is a per-VNET limit.
.It Va maxfragpackets
Integer: maximum number of fragmented packets the host will accept and
simultaneously hold in the reassembly queue for a particular VNET.
0 means that the host will not accept any fragmented packets for that
VNET.
\-1 means that the host will not apply this limit for that VNET.
This limit is recalculated when the number of mbuf clusters is changed.
This is a per-VNET limit.
.It Va maxfrags
Integer: maximum number of fragments the host will accept and
simultaneously hold across all reassembly queues in all VNETs.
If set to 0, reassembly is disabled.
If set to -1, this limit is not applied.
This limit is recalculated when the number of mbuf clusters is changed.
This is a global limit.
.It Va maxfragsperpacket
Integer: maximum number of fragments the host will accept and hold in
the reassembly queue for a packet.
0 means that the host will not accept any fragmented packets for the
VNET.
This is a per-VNET limit.
.It Va mcast
Variables under the
.Va net.inet.ip.mcast
node are documented in
.Xr ip 4 .
.It Va no_same_prefix
Boolean: refuse to create same prefixes on different interfaces.
This is a per-VNET value.
.It Va portrange
Variables under the
.Va net.inet.ip.portrange
node control port ranges used by transport protocols; see
.Xr ip 4
for details.
.It Va process_options
Integer: control IP options processing.
By setting this variable to 0, all IP options in the incoming packets
will be ignored, and the packets will be passed unmodified.
By setting to 1, IP options in the incoming packets will be processed
accordingly.
By setting to 2, an
.Tn ICMP
.Dq "prohibited by filter"
message will be sent back in response to incoming packets with IP options.
Default is 1.
This
.Xr sysctl 8
variable affects packets destined for a local host as well as packets
forwarded to some other host.
.It Va random_id
Boolean: control IP ID generation behavior.
Setting this
.Xr sysctl 8
to 1 causes the ID field in
.Em non-atomic
IP datagrams (or all IP datagrams, if
.Va rfc6864
is disabled) to be randomized instead of incremented by 1 with each
packet generated.
This closes a minor information leak which allows remote observers to
determine the rate of packet generation on the machine by watching the
counter.
At the same time, on high-speed links, it can decrease the ID reuse
cycle greatly.
Default is 0 (sequential IP IDs).
IPv6 flow IDs and fragment IDs are always random.
.It Va random_id_collisions
Integer: count of IP ID collisions (read-only, per-VNET).
.It Va random_id_period
Integer: size of the IP ID array, which is the number of previous
packets for which the IDs are recorded.
The number must be between 512 and 32768 inclusive.
This is a per-VNET value.
.It Va random_id_total
Integer: count of IP IDs created (read-only, per-VNET).
+.It Va reass_hashsize
+Number of hash slots in the IPv4 reassembly queue (loader tunable).
.It Va redirect
Boolean: enable/disable sending of ICMP redirects in response to
.Tn IP
packets for which a better, and for the sender directly reachable,
route and next hop is known.
Defaults to on.
.It Va rfc1122_strong_es
Boolean: in non-forwarding mode
.Pq forwarding is disabled
partially implement the Strong End System model per RFC1122.
If a packet whose destination address is local arrives on an interface
other than the interface the address belongs to, it is silently dropped.
Enabling this option may break certain setups, e.g., alias addresses
configured on loopback that are expected to be reachable by outside
traffic.
Enabling some other network features, e.g.,
.Xr carp 4
or destination address rewriting
.Xr pfil 4
filters, may override and bypass this check.
Disabled by default.
.It Va rfc6864
Boolean: control IP ID generation behavior.
A true value enables RFC6864 support, which specifies that the IP ID
field of
.Em atomic
datagrams can be set to any value.
The
.Fx
implementation sets it to zero.
Enabled by default.
.It Va source_address_validation
Boolean: perform source address validation for packets destined for
the local host.
This follows Section 3.2 of RFC3704/BCP84, treating the local host as
our own infrastructure.
Forwarded packets are unaffected, so this should not be considered an
anti-spoofing feature for a router.
Enabled by default.
.It Va sourceroute
Boolean: enable/disable forwarding of source-routed IP packets
(default false).
.It Va ttl
Integer: default time-to-live
.Pq Dq TTL
to use for outgoing
.Tn IP
packets.
.El
.Sh SEE ALSO
.Xr ioctl 2 ,
.Xr socket 2 ,
.Xr getifaddrs 3 ,
.Xr sysctl 3 ,
.Xr icmp 4 ,
.Xr intro 4 ,
.Xr ip 4 ,
.Xr ipfirewall 4 ,
.Xr route 4 ,
.Xr tcp 4 ,
.Xr udp 4 ,
.Xr sysctl 8 ,
.Xr pfil 9
.Rs
.%T "An Introductory 4.3 BSD Interprocess Communication Tutorial"
.%B PS1
.%N 7
.Re
.Rs
.%T "An Advanced 4.3 BSD Interprocess Communication Tutorial"
.%B PS1
.%N 8
.Re
.Sh HISTORY
The
.Nm
protocol interface appeared in
.Bx 4.2 .
The
.Dq protocol cloning
code appeared in
.Fx 2.1 .
.Sh CAVEATS
The Internet protocol support is subject to change as the Internet
protocols develop.
Users should not depend on details of the current implementation,
but rather the services exported.
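Since reass_hashsize is a loader tunable (CTLFLAG_RDTUN in the code below), it is set at boot, e.g. net.inet.ip.reass_hashsize=2048 in loader.conf(5), and can only be read at runtime. A sketch (not part of the patch) of reading it together with the global fragment counter via sysctlbyname(3):

    /*
     * Sketch (not part of the patch): reading the new read-only tunable
     * and the global fragment counter from userland.
     */
    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>
    #include <err.h>

    int
    main(void)
    {
        unsigned int hashsize, curfrags;
        size_t len;

        len = sizeof(hashsize);
        if (sysctlbyname("net.inet.ip.reass_hashsize", &hashsize, &len,
            NULL, 0) == -1)
            err(1, "reass_hashsize");
        len = sizeof(curfrags);
        if (sysctlbyname("net.inet.ip.curfrags", &curfrags, &len,
            NULL, 0) == -1)
            err(1, "curfrags");
        printf("%u buckets, %u fragments in flight\n", hashsize, curfrags);
        return (0);
    }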
diff --git a/sys/netinet/ip_reass.c b/sys/netinet/ip_reass.c
index 257211c0a1a1..88ba74ace4d6 100644
--- a/sys/netinet/ip_reass.c
+++ b/sys/netinet/ip_reass.c
@@ -1,954 +1,967 @@
/*-
 * Copyright (c) 2015 Gleb Smirnoff <glebius@FreeBSD.org>
 * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
 * Copyright (c) 1982, 1986, 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_rss.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/hash.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/socket.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/rss_config.h>
#include <net/netisr.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_rss.h>
#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

SYSCTL_DECL(_net_inet_ip);

/*
 * Reassembly headers are stored in hash buckets.
 */
#define	IPREASS_NHASH_LOG2	10
#define	IPREASS_NHASH		(1 << IPREASS_NHASH_LOG2)
-#define	IPREASS_HMASK		(IPREASS_NHASH - 1)
+#define	IPREASS_HMASK		(V_ipq_hashsize - 1)

struct ipqbucket {
	TAILQ_HEAD(ipqhead, ipq) head;
	struct mtx		 lock;
	struct callout		 timer;
#ifdef VIMAGE
	struct vnet		*vnet;
#endif
	int			 count;
};

-VNET_DEFINE_STATIC(struct ipqbucket, ipq[IPREASS_NHASH]);
+VNET_DEFINE_STATIC(struct ipqbucket *, ipq);
#define	V_ipq		VNET(ipq)
VNET_DEFINE_STATIC(uint32_t, ipq_hashseed);
-#define	V_ipq_hashseed	VNET(ipq_hashseed)
+#define	V_ipq_hashseed	VNET(ipq_hashseed)
+VNET_DEFINE_STATIC(uint32_t, ipq_hashsize);
+#define	V_ipq_hashsize	VNET(ipq_hashsize)

#define	IPQ_LOCK(i)	mtx_lock(&V_ipq[i].lock)
#define	IPQ_TRYLOCK(i)	mtx_trylock(&V_ipq[i].lock)
#define	IPQ_UNLOCK(i)	mtx_unlock(&V_ipq[i].lock)
#define	IPQ_LOCK_ASSERT(i)	mtx_assert(&V_ipq[i].lock, MA_OWNED)
#define	IPQ_BUCKET_LOCK_ASSERT(b)	mtx_assert(&(b)->lock, MA_OWNED)

VNET_DEFINE_STATIC(int, ipreass_maxbucketsize);
#define	V_ipreass_maxbucketsize	VNET(ipreass_maxbucketsize)

void		ipreass_init(void);
void		ipreass_vnet_init(void);
#ifdef VIMAGE
void		ipreass_destroy(void);
#endif
static int	sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
static int	sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
static int	sysctl_fragttl(SYSCTL_HANDLER_ARGS);
static void	ipreass_zone_change(void *);
static void	ipreass_drain_tomax(void);
static void	ipq_free(struct ipqbucket *, struct ipq *);
static struct ipq * ipq_reuse(int);
static void	ipreass_callout(void *);
static void	ipreass_reschedule(struct ipqbucket *);

static inline void
ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
{

	IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
	ipq_free(bucket, fp);
}

static inline void
ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
{

	IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
	ipq_free(bucket, fp);
	ipreass_reschedule(bucket);
}
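For illustration, the bucket selection below boils down to hashing the (source, destination, protocol, ID) tuple and masking with the table size. This stand-alone sketch (not part of the patch) substitutes a trivial stand-in for the kernel's jenkins_hash32(); note that "hash & (size - 1)" only selects buckets uniformly when the size is a power of two, which is why the tunable is validated at initialization:

    /*
     * Illustrative sketch (not part of the patch): mapping a fragment
     * to a bucket.  The kernel uses jenkins_hash32() from <sys/hash.h>;
     * the XOR below is only a stand-in for the illustration.
     */
    #include <stdint.h>

    static uint32_t
    frag_bucket(uint32_t src, uint32_t dst, uint8_t proto, uint16_t id,
        uint32_t seed, uint32_t hashsize)
    {
        uint32_t hashkey[3];

        hashkey[0] = src;                       /* ip_src.s_addr */
        hashkey[1] = dst;                       /* ip_dst.s_addr */
        hashkey[2] = ((uint32_t)proto << 16) + id;
        /* stand-in for jenkins_hash32(hashkey, 3, seed): */
        return ((hashkey[0] ^ hashkey[1] ^ hashkey[2] ^ seed) &
            (hashsize - 1));
    }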
/*
 * By default, limit the number of IP fragments across all reassembly
 * queues to 1/32 of the total number of mbuf clusters.
 *
 * Limit the total number of reassembly queues per VNET to the
 * IP fragment limit, but ensure the limit will not allow any bucket
 * to grow above 100 items. (The bucket limit is
- * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct
+ * IP_MAXFRAGPACKETS / (V_ipq_hashsize / 2), so the 50 is the correct
 * multiplier to reach a 100-item limit.)
 * The 100-item limit was chosen as brief testing seems to show that
 * this produces "reasonable" performance on some subset of systems
 * under DoS attack.
 */
#define	IP_MAXFRAGS		(nmbclusters / 32)
-#define	IP_MAXFRAGPACKETS	(imin(IP_MAXFRAGS, IPREASS_NHASH * 50))
+#define	IP_MAXFRAGPACKETS	(imin(IP_MAXFRAGS, V_ipq_hashsize * 50))

static int		maxfrags;
static u_int __exclusive_cache_line	nfrags;
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
    &maxfrags, 0,
    "Maximum number of IPv4 fragments allowed across all reassembly queues");
SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
    &nfrags, 0,
    "Current number of IPv4 fragments across all reassembly queues");

VNET_DEFINE_STATIC(uma_zone_t, ipq_zone);
#define	V_ipq_zone	VNET(ipq_zone)

+SYSCTL_UINT(_net_inet_ip, OID_AUTO, reass_hashsize,
+    CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(ipq_hashsize), 0,
+    "Size of IP fragment reassembly hashtable");
+
SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    NULL, 0, sysctl_maxfragpackets, "I",
    "Maximum number of IPv4 fragment reassembly queue entries");
SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET,
    &VNET_NAME(ipq_zone),
    "Current number of IPv4 fragment reassembly queue entries");

VNET_DEFINE_STATIC(int, noreass);
#define	V_noreass	VNET(noreass)

VNET_DEFINE_STATIC(int, maxfragsperpacket);
#define	V_maxfragsperpacket	VNET(maxfragsperpacket)
SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(maxfragsperpacket), 0,
    "Maximum number of IPv4 fragments allowed per packet");
SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
    sysctl_maxfragbucketsize, "I",
    "Maximum number of IPv4 fragment reassembly queue entries per bucket");

VNET_DEFINE_STATIC(u_int, ipfragttl) = 30;
#define	V_ipfragttl	VNET(ipfragttl)
SYSCTL_PROC(_net_inet_ip, OID_AUTO, fragttl, CTLTYPE_INT | CTLFLAG_RW |
    CTLFLAG_MPSAFE | CTLFLAG_VNET, NULL, 0, sysctl_fragttl, "IU",
    "IP fragment life time on reassembly queue (seconds)");
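The IPSTAT counters bumped throughout ip_reass() below surface in userland through the net.inet.ip.stats OID, the same one netstat -s reads. A sketch (not part of the patch) of pulling the reassembly-related counters directly:

    /*
     * Sketch (not part of the patch): reading the reassembly statistics
     * exported as net.inet.ip.stats.
     */
    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <netinet/in.h>
    #include <netinet/ip_var.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <err.h>

    int
    main(void)
    {
        struct ipstat st;
        size_t len = sizeof(st);

        if (sysctlbyname("net.inet.ip.stats", &st, &len, NULL, 0) == -1)
            err(1, "net.inet.ip.stats");
        printf("fragments received:  %ju\n", (uintmax_t)st.ips_fragments);
        printf("fragments dropped:   %ju\n", (uintmax_t)st.ips_fragdropped);
        printf("packets reassembled: %ju\n", (uintmax_t)st.ips_reassembled);
        return (0);
    }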
/*
 * Take incoming datagram fragment and try to reassemble it into
 * whole datagram.  If the argument is the first fragment or one
 * in between the function will return NULL and store the mbuf
 * in the fragment chain.  If the argument is the last fragment
 * the packet will be reassembled and the pointer to the new
 * mbuf returned for further processing.  Only m_tags attached
 * to the first packet/fragment are preserved.
 * The IP header is *NOT* adjusted out of iplen.
 */
#define	M_IP_FRAG	M_PROTO9
struct mbuf *
ip_reass(struct mbuf *m)
{
	struct ip *ip;
	struct mbuf *p, *q, *nq, *t;
	struct ipq *fp;
	struct ifnet *srcifp;
	struct ipqhead *head;
	int i, hlen, next, tmpmax;
	u_int8_t ecn, ecn0;
	uint32_t hash, hashkey[3];
#ifdef RSS
	uint32_t rss_hash, rss_type;
#endif

	/*
	 * If reassembly is disabled, or maxfragsperpacket is 0,
	 * never accept fragments.
	 * Also, drop packet if it would exceed the maximum
	 * number of fragments.
	 */
	tmpmax = maxfrags;
	if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
	    (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) {
		IPSTAT_INC(ips_fragments);
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}

	ip = mtod(m, struct ip *);
	hlen = ip->ip_hl << 2;

	/*
	 * Adjust ip_len to not reflect header,
	 * convert offset of this to bytes.
	 */
	ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
	/*
	 * Make sure that fragments have a data length
	 * that's a non-zero multiple of 8 bytes, unless
	 * this is the last fragment.
	 */
	if (ip->ip_len == htons(0) ||
	    ((ip->ip_off & htons(IP_MF)) && (ntohs(ip->ip_len) & 0x7) != 0)) {
		IPSTAT_INC(ips_toosmall); /* XXX */
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}
	if (ip->ip_off & htons(IP_MF))
		m->m_flags |= M_IP_FRAG;
	else
		m->m_flags &= ~M_IP_FRAG;
	ip->ip_off = htons(ntohs(ip->ip_off) << 3);

	/*
	 * Make sure the fragment lies within a packet of valid size.
	 */
	if (ntohs(ip->ip_len) + ntohs(ip->ip_off) > IP_MAXPACKET) {
		IPSTAT_INC(ips_toolong);
		IPSTAT_INC(ips_fragdropped);
		m_freem(m);
		return (NULL);
	}

	/*
	 * Store receive network interface pointer for later.
	 */
	srcifp = m->m_pkthdr.rcvif;

	/*
	 * Attempt reassembly; if it succeeds, proceed.
	 * ip_reass() will return a different mbuf.
	 */
	IPSTAT_INC(ips_fragments);
	m->m_pkthdr.PH_loc.ptr = ip;

	/*
	 * Presence of header sizes in mbufs
	 * would confuse code below.
	 */
	m->m_data += hlen;
	m->m_len -= hlen;

	hashkey[0] = ip->ip_src.s_addr;
	hashkey[1] = ip->ip_dst.s_addr;
	hashkey[2] = (uint32_t)ip->ip_p << 16;
	hashkey[2] += ip->ip_id;
	hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
	hash &= IPREASS_HMASK;
	head = &V_ipq[hash].head;
	IPQ_LOCK(hash);

	/*
	 * Look for queue of fragments
	 * of this datagram.
	 */
	TAILQ_FOREACH(fp, head, ipq_list)
		if (ip->ip_id == fp->ipq_id &&
		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
#ifdef MAC
		    mac_ipq_match(m, fp) &&
#endif
		    ip->ip_p == fp->ipq_p)
			break;
	/*
	 * If first fragment to arrive, create a reassembly queue.
	 */
	if (fp == NULL) {
		if (V_ipq[hash].count < V_ipreass_maxbucketsize)
			fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
		if (fp == NULL)
			fp = ipq_reuse(hash);
		if (fp == NULL)
			goto dropfrag;
#ifdef MAC
		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
			uma_zfree(V_ipq_zone, fp);
			fp = NULL;
			goto dropfrag;
		}
		mac_ipq_create(m, fp);
#endif
		TAILQ_INSERT_HEAD(head, fp, ipq_list);
		V_ipq[hash].count++;
		fp->ipq_nfrags = 1;
		atomic_add_int(&nfrags, 1);
		fp->ipq_expire = time_uptime + V_ipfragttl;
		fp->ipq_p = ip->ip_p;
		fp->ipq_id = ip->ip_id;
		fp->ipq_src = ip->ip_src;
		fp->ipq_dst = ip->ip_dst;
		fp->ipq_frags = m;
		if (m->m_flags & M_IP_FRAG)
			fp->ipq_maxoff = -1;
		else
			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
		m->m_nextpkt = NULL;
		if (fp == TAILQ_LAST(head, ipqhead))
			callout_reset_sbt(&V_ipq[hash].timer,
			    SBT_1S * V_ipfragttl, SBT_1S, ipreass_callout,
			    &V_ipq[hash], 0);
		else
			MPASS(callout_active(&V_ipq[hash].timer));
		goto done;
	} else {
		/*
		 * If we already saw the last fragment, make sure
		 * this fragment's offset looks sane.  Otherwise, if
		 * this is the last fragment, record its endpoint.
		 */
		if (fp->ipq_maxoff > 0) {
			i = ntohs(ip->ip_off) + ntohs(ip->ip_len);
			if (((m->m_flags & M_IP_FRAG) && i >= fp->ipq_maxoff) ||
			    ((m->m_flags & M_IP_FRAG) == 0 &&
			    i != fp->ipq_maxoff)) {
				fp = NULL;
				goto dropfrag;
			}
		} else if ((m->m_flags & M_IP_FRAG) == 0)
			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
		fp->ipq_nfrags++;
		atomic_add_int(&nfrags, 1);
#ifdef MAC
		mac_ipq_update(m, fp);
#endif
	}

#define	GETIP(m)	((struct ip*)((m)->m_pkthdr.PH_loc.ptr))

	/*
	 * Handle ECN by comparing this segment with the first one;
	 * if CE is set, do not lose CE.
	 * Drop if CE and not-ECT are mixed for the same packet.
	 */
	ecn = ip->ip_tos & IPTOS_ECN_MASK;
	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
	if (ecn == IPTOS_ECN_CE) {
		if (ecn0 == IPTOS_ECN_NOTECT)
			goto dropfrag;
		if (ecn0 != IPTOS_ECN_CE)
			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
	}
	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
		goto dropfrag;

	/*
	 * Find a segment which begins after this one does.
	 */
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
		if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
			break;

	/*
	 * If there is a preceding segment, it may provide some of
	 * our data already.  If so, drop the data from the incoming
	 * segment.  If it provides all of our data, drop us, otherwise
	 * stick new segment in the proper place.
	 *
	 * If some of the data is dropped from the preceding
	 * segment, then its checksum is invalidated.
	 */
	if (p) {
		i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
		    ntohs(ip->ip_off);
		if (i > 0) {
			if (i >= ntohs(ip->ip_len))
				goto dropfrag;
			m_adj(m, i);
			m->m_pkthdr.csum_flags = 0;
			ip->ip_off = htons(ntohs(ip->ip_off) + i);
			ip->ip_len = htons(ntohs(ip->ip_len) - i);
		}
		m->m_nextpkt = p->m_nextpkt;
		p->m_nextpkt = m;
	} else {
		m->m_nextpkt = fp->ipq_frags;
		fp->ipq_frags = m;
	}

	/*
	 * While we overlap succeeding segments trim them or,
	 * if they are completely covered, dequeue them.
	 */
	for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
	    ntohs(GETIP(q)->ip_off); q = nq) {
		i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
		    ntohs(GETIP(q)->ip_off);
		if (i < ntohs(GETIP(q)->ip_len)) {
			GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
			GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
			m_adj(q, i);
			q->m_pkthdr.csum_flags = 0;
			break;
		}
		nq = q->m_nextpkt;
		m->m_nextpkt = nq;
		IPSTAT_INC(ips_fragdropped);
		fp->ipq_nfrags--;
		atomic_subtract_int(&nfrags, 1);
		m_freem(q);
	}

	/*
	 * Check for complete reassembly and perform frag per packet
	 * limiting.
	 *
	 * Frag limiting is performed here so that the nth frag has
	 * a chance to complete the packet before we drop the packet.
	 * As a result, n+1 frags are actually allowed per packet, but
	 * only n will ever be stored. (n = maxfragsperpacket.)
	 *
	 */
	next = 0;
	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
		if (ntohs(GETIP(q)->ip_off) != next) {
			if (fp->ipq_nfrags > V_maxfragsperpacket)
				ipq_drop(&V_ipq[hash], fp);
			goto done;
		}
		next += ntohs(GETIP(q)->ip_len);
	}
	/* Make sure the last packet didn't have the IP_MF flag */
	if (p->m_flags & M_IP_FRAG) {
		if (fp->ipq_nfrags > V_maxfragsperpacket)
			ipq_drop(&V_ipq[hash], fp);
		goto done;
	}

	/*
	 * Reassembly is complete.  Make sure the packet is a sane size.
	 */
	q = fp->ipq_frags;
	ip = GETIP(q);
	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
		IPSTAT_INC(ips_toolong);
		ipq_drop(&V_ipq[hash], fp);
		goto done;
	}

	/*
	 * Concatenate fragments.
	 */
	m = q;
	t = m->m_next;
	m->m_next = NULL;
	m_cat(m, t);
	nq = q->m_nextpkt;
	q->m_nextpkt = NULL;
	for (q = nq; q != NULL; q = nq) {
		nq = q->m_nextpkt;
		q->m_nextpkt = NULL;
		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
		m_demote_pkthdr(q);
		m_cat(m, q);
	}

	/*
	 * In order to do checksumming faster we do 'end-around carry' here
	 * (and not in for{} loop), though it implies we are not going to
	 * reassemble more than 64k fragments.
	 */
	while (m->m_pkthdr.csum_data & 0xffff0000)
		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
		    (m->m_pkthdr.csum_data >> 16);
	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
#ifdef MAC
	mac_ipq_reassemble(fp, m);
	mac_ipq_destroy(fp);
#endif

	/*
	 * Create header for new ip packet by modifying header of first
	 * packet; dequeue and discard fragment reassembly header.
	 * Make header visible.
	 */
	ip->ip_len = htons((ip->ip_hl << 2) + next);
	ip->ip_src = fp->ipq_src;
	ip->ip_dst = fp->ipq_dst;
	TAILQ_REMOVE(head, fp, ipq_list);
	V_ipq[hash].count--;
	uma_zfree(V_ipq_zone, fp);
	m->m_len += (ip->ip_hl << 2);
	m->m_data -= (ip->ip_hl << 2);
	/* some debugging cruft by sklower, below, will go away soon */
	if (m->m_flags & M_PKTHDR) {	/* XXX this should be done elsewhere */
		m_fixhdr(m);
		/* set valid receive interface pointer */
		m->m_pkthdr.rcvif = srcifp;
	}
	IPSTAT_INC(ips_reassembled);
	ipreass_reschedule(&V_ipq[hash]);
	IPQ_UNLOCK(hash);

#ifdef RSS
	/*
	 * Query the RSS layer for the flowid / flowtype for the
	 * mbuf payload.
	 *
	 * For now, just assume we have to calculate a new one.
	 * Later on we should check to see if the assigned flowid matches
	 * what RSS wants for the given IP protocol and if so, just keep it.
	 *
	 * We then queue into the relevant netisr so it can be dispatched
	 * to the correct CPU.
	 *
	 * Note - this may return 1, which means the flowid in the mbuf
	 * is correct for the configured RSS hash types and can be used.
	 */
	if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
		m->m_pkthdr.flowid = rss_hash;
		M_HASHTYPE_SET(m, rss_type);
	}

	/*
	 * Queue/dispatch for reprocessing.
	 *
	 * Note: this is much slower than just handling the frame in the
	 * current receive context.  It's likely worth investigating
	 * why this is.
	 */
	netisr_dispatch(NETISR_IP_DIRECT, m);
	return (NULL);
#endif

	/* Handle in-line */
	return (m);

dropfrag:
	IPSTAT_INC(ips_fragdropped);
	if (fp != NULL) {
		fp->ipq_nfrags--;
		atomic_subtract_int(&nfrags, 1);
	}
	m_freem(m);
done:
	IPQ_UNLOCK(hash);
	return (NULL);
#undef	GETIP
}
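The "end-around carry" above is ordinary ones'-complement folding: the 32-bit accumulated partial checksum is folded back into 16 bits. A stand-alone sketch of the same operation (not part of the patch):

    /*
     * Illustrative sketch (not part of the patch): fold a 32-bit
     * accumulated partial checksum into 16 bits, as required by
     * ones'-complement arithmetic.
     */
    #include <stdint.h>

    static uint16_t
    csum_fold(uint32_t sum)
    {
        while (sum & 0xffff0000)
            sum = (sum & 0xffff) + (sum >> 16);
        return ((uint16_t)sum);
    }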
/*
 * Timer expired on a bucket.
 * There should be at least one ipq to be timed out.
 */
static void
ipreass_callout(void *arg)
{
	struct ipqbucket *bucket = arg;
	struct ipq *fp;

	IPQ_BUCKET_LOCK_ASSERT(bucket);
	MPASS(atomic_load_int(&nfrags) > 0);

	CURVNET_SET(bucket->vnet);
	fp = TAILQ_LAST(&bucket->head, ipqhead);
	KASSERT(fp != NULL && fp->ipq_expire <= time_uptime,
	    ("%s: stray callout on bucket %p", __func__, bucket));
	while (fp != NULL && fp->ipq_expire <= time_uptime) {
		ipq_timeout(bucket, fp);
		fp = TAILQ_LAST(&bucket->head, ipqhead);
	}
	ipreass_reschedule(bucket);
	CURVNET_RESTORE();
}

static void
ipreass_reschedule(struct ipqbucket *bucket)
{
	struct ipq *fp;

	IPQ_BUCKET_LOCK_ASSERT(bucket);

	if ((fp = TAILQ_LAST(&bucket->head, ipqhead)) != NULL) {
		time_t t;

		/* Protect against time_uptime tick. */
		t = fp->ipq_expire - time_uptime;
		t = (t > 0) ? t : 1;
		callout_reset_sbt(&bucket->timer, SBT_1S * t, SBT_1S,
		    ipreass_callout, bucket, 0);
	} else
		callout_stop(&bucket->timer);
}

static void
ipreass_drain_vnet(void)
{

-	for (int i = 0; i < IPREASS_NHASH; i++) {
+	for (int i = 0; i < V_ipq_hashsize; i++) {
		IPQ_LOCK(i);
		while (!TAILQ_EMPTY(&V_ipq[i].head))
			ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head));
		KASSERT(V_ipq[i].count == 0,
		    ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
		    V_ipq[i].count, V_ipq));
		IPQ_UNLOCK(i);
	}
}

/*
 * Drain off all datagram fragments.
 */
static void
ipreass_drain(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		ipreass_drain_vnet();
		CURVNET_RESTORE();
	}
}
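The timer logic above relies on the invariant that new reassembly queues are inserted at the head of a bucket, so the tail is always the oldest entry, and a single callout armed for the tail's expiry covers the whole list. A stand-alone sketch of that pattern (not part of the patch):

    /*
     * Illustrative sketch (not part of the patch): one timer per
     * list.  Entries are inserted at the head, so the tail is the
     * oldest; arming a timer for the tail's expiry is sufficient.
     */
    #include <sys/types.h>
    #include <sys/queue.h>

    struct entry {
        TAILQ_ENTRY(entry) link;
        time_t expire;              /* absolute expiry time */
    };
    TAILQ_HEAD(entryhead, entry);

    /* Seconds until the next expiry, or -1 if no timer is needed. */
    static time_t
    next_timeout(struct entryhead *head, time_t now)
    {
        struct entry *e;

        if ((e = TAILQ_LAST(head, entryhead)) == NULL)
            return (-1);            /* list empty: stop the timer */
        return (e->expire > now ? e->expire - now : 1);
    }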
/*
 * Initialize IP reassembly structures.
 */
+MALLOC_DEFINE(M_IPREASS_HASH, "IP reass", "IP packet reassembly hash headers");
void
ipreass_vnet_init(void)
{
	int max;

-	for (int i = 0; i < IPREASS_NHASH; i++) {
+	V_ipq_hashsize = IPREASS_NHASH;
+	TUNABLE_INT_FETCH("net.inet.ip.reass_hashsize", &V_ipq_hashsize);
+	/* Fall back to the default if the tunable is not a power of two. */
+	if (V_ipq_hashsize == 0 || !powerof2(V_ipq_hashsize))
+		V_ipq_hashsize = IPREASS_NHASH;
+	V_ipq = malloc(sizeof(struct ipqbucket) * V_ipq_hashsize,
+	    M_IPREASS_HASH, M_WAITOK);
+
+	for (int i = 0; i < V_ipq_hashsize; i++) {
		TAILQ_INIT(&V_ipq[i].head);
		mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
-		    MTX_DEF | MTX_DUPOK);
+		    MTX_DEF | MTX_DUPOK | MTX_NEW);
		callout_init_mtx(&V_ipq[i].timer, &V_ipq[i].lock, 0);
		V_ipq[i].count = 0;
#ifdef VIMAGE
		V_ipq[i].vnet = curvnet;
#endif
	}
	V_ipq_hashseed = arc4random();
	V_maxfragsperpacket = 16;
	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
	    NULL, UMA_ALIGN_PTR, 0);
	max = IP_MAXFRAGPACKETS;
	max = uma_zone_set_max(V_ipq_zone, max);
-	V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
+	V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
}

void
ipreass_init(void)
{

	maxfrags = IP_MAXFRAGS;
	EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
	    NULL, EVENTHANDLER_PRI_ANY);
	EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain, NULL,
	    LOWMEM_PRI_DEFAULT);
	EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain, NULL,
	    LOWMEM_PRI_DEFAULT);
}

/*
 * Drain off all datagram fragments belonging to
 * the given network interface.
 */
static void
ipreass_cleanup(void *arg __unused, struct ifnet *ifp)
{
	struct ipq *fp, *temp;
	struct mbuf *m;
	int i;

	KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));

	CURVNET_SET_QUIET(ifp->if_vnet);

	/*
	 * Skip processing if IPv4 reassembly is not initialised or
	 * torn down by ipreass_destroy().
	 */
	if (V_ipq_zone == NULL) {
		CURVNET_RESTORE();
		return;
	}

-	for (i = 0; i < IPREASS_NHASH; i++) {
+	for (i = 0; i < V_ipq_hashsize; i++) {
		IPQ_LOCK(i);
		/* Scan fragment list. */
		TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, temp) {
			for (m = fp->ipq_frags; m != NULL; m = m->m_nextpkt) {
				/* clear no longer valid rcvif pointer */
				if (m->m_pkthdr.rcvif == ifp)
					m->m_pkthdr.rcvif = NULL;
			}
		}
		IPQ_UNLOCK(i);
	}
	CURVNET_RESTORE();
}
EVENTHANDLER_DEFINE(ifnet_departure_event, ipreass_cleanup, NULL, 0);

#ifdef VIMAGE
/*
 * Destroy IP reassembly structures.
 */
void
ipreass_destroy(void)
{

	ipreass_drain_vnet();
	uma_zdestroy(V_ipq_zone);
	V_ipq_zone = NULL;
-	for (int i = 0; i < IPREASS_NHASH; i++)
+	for (int i = 0; i < V_ipq_hashsize; i++)
		mtx_destroy(&V_ipq[i].lock);
+	free(V_ipq, M_IPREASS_HASH);
}
#endif

/*
 * After maxnipq has been updated, propagate the change to UMA.  The UMA zone
 * max has slightly different semantics than the sysctl, for historical
 * reasons.
 */
static void
ipreass_drain_tomax(void)
{
	struct ipq *fp;
	int target;

	/*
	 * Make sure each bucket is under the new limit.  If
	 * necessary, drop enough of the oldest elements from
	 * each bucket to get under the new limit.
	 */
-	for (int i = 0; i < IPREASS_NHASH; i++) {
+	for (int i = 0; i < V_ipq_hashsize; i++) {
		IPQ_LOCK(i);
		while (V_ipq[i].count > V_ipreass_maxbucketsize &&
		    (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
			ipq_timeout(&V_ipq[i], fp);
		ipreass_reschedule(&V_ipq[i]);
		IPQ_UNLOCK(i);
	}

	/*
	 * If we are over the maximum number of fragments,
	 * drain off enough to get down to the new limit,
	 * stripping off last elements on queues.  Every
	 * run we strip the oldest element from each bucket.
	 */
	target = uma_zone_get_max(V_ipq_zone);
	while (uma_zone_get_cur(V_ipq_zone) > target) {
-		for (int i = 0; i < IPREASS_NHASH; i++) {
+		for (int i = 0; i < V_ipq_hashsize; i++) {
			IPQ_LOCK(i);
			fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
			if (fp != NULL) {
				ipq_timeout(&V_ipq[i], fp);
				ipreass_reschedule(&V_ipq[i]);
			}
			IPQ_UNLOCK(i);
		}
	}
}

static void
ipreass_zone_change(void *tag)
{
	VNET_ITERATOR_DECL(vnet_iter);
	int max;

	maxfrags = IP_MAXFRAGS;
	max = IP_MAXFRAGPACKETS;
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		max = uma_zone_set_max(V_ipq_zone, max);
-		V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
+		V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
		ipreass_drain_tomax();
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

/*
 * Change the limit on the UMA zone, or disable the fragment allocation
 * at all.  Since 0 and -1 are special values here, we need our own handler,
 * instead of sysctl_handle_uma_zone_max().
 */
static int
sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
{
	int error, max;

	if (V_noreass == 0) {
		max = uma_zone_get_max(V_ipq_zone);
		if (max == 0)
			max = -1;
	} else
		max = 0;
	error = sysctl_handle_int(oidp, &max, 0, req);
	if (error || !req->newptr)
		return (error);
	if (max > 0) {
		/*
		 * XXXRW: Might be a good idea to sanity check the argument
		 * and place an extreme upper bound.
		 */
		max = uma_zone_set_max(V_ipq_zone, max);
-		V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
+		V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
		ipreass_drain_tomax();
		V_noreass = 0;
	} else if (max == 0) {
		V_noreass = 1;
		ipreass_drain();
	} else if (max == -1) {
		V_noreass = 0;
		uma_zone_set_max(V_ipq_zone, 0);
		V_ipreass_maxbucketsize = INT_MAX;
	} else
		return (EINVAL);
	return (0);
}

/*
 * Seek for an old fragment queue header that can be reused.  Try to
 * reuse a header from the currently locked hash bucket first.
 */
static struct ipq *
ipq_reuse(int start)
{
	struct ipq *fp;
	int bucket, i;

	IPQ_LOCK_ASSERT(start);

-	for (i = 0; i < IPREASS_NHASH; i++) {
-		bucket = (start + i) % IPREASS_NHASH;
+	for (i = 0; i < V_ipq_hashsize; i++) {
+		bucket = (start + i) % V_ipq_hashsize;
		if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
			continue;
		fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
		if (fp) {
			struct mbuf *m;

			IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
			atomic_subtract_int(&nfrags, fp->ipq_nfrags);
			while (fp->ipq_frags) {
				m = fp->ipq_frags;
				fp->ipq_frags = m->m_nextpkt;
				m_freem(m);
			}
			TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
			V_ipq[bucket].count--;
			ipreass_reschedule(&V_ipq[bucket]);
			if (bucket != start)
				IPQ_UNLOCK(bucket);
			break;
		}
		if (bucket != start)
			IPQ_UNLOCK(bucket);
	}
	IPQ_LOCK_ASSERT(start);
	return (fp);
}

/*
 * Free a fragment reassembly header and all associated datagrams.
 */
static void
ipq_free(struct ipqbucket *bucket, struct ipq *fp)
{
	struct mbuf *q;

	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
	while (fp->ipq_frags) {
		q = fp->ipq_frags;
		fp->ipq_frags = q->m_nextpkt;
		m_freem(q);
	}
	TAILQ_REMOVE(&bucket->head, fp, ipq_list);
	bucket->count--;
	uma_zfree(V_ipq_zone, fp);
}

/*
 * Get or set the maximum number of reassembly queues per bucket.
 */
static int
sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
{
	int error, max;

	max = V_ipreass_maxbucketsize;
	error = sysctl_handle_int(oidp, &max, 0, req);
	if (error || !req->newptr)
		return (error);
	if (max <= 0)
		return (EINVAL);
	V_ipreass_maxbucketsize = max;
	ipreass_drain_tomax();
	return (0);
}

/*
 * Get or set the IP fragment time to live.
 */
static int
sysctl_fragttl(SYSCTL_HANDLER_ARGS)
{
	u_int ttl;
	int error;

	ttl = V_ipfragttl;
	error = sysctl_handle_int(oidp, &ttl, 0, req);
	if (error || !req->newptr)
		return (error);

	if (ttl < 1 || ttl > MAXTTL)
		return (EINVAL);

	atomic_store_int(&V_ipfragttl, ttl);
	return (0);
}
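A sketch (not part of the patch) of exercising this handler from userland; sysctlbyname(3) can fetch the old value and set a new one in a single call, and values outside [1, MAXTTL] are rejected with EINVAL. Setting the value requires root; 15 is an arbitrary example value:

    /*
     * Sketch (not part of the patch): get and set net.inet.ip.fragttl
     * in one sysctlbyname(3) call.
     */
    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>
    #include <err.h>

    int
    main(void)
    {
        u_int ttl, newttl = 15;
        size_t len = sizeof(ttl);

        if (sysctlbyname("net.inet.ip.fragttl", &ttl, &len,
            &newttl, sizeof(newttl)) == -1)
            err(1, "net.inet.ip.fragttl");
        printf("fragttl: %u -> %u\n", ttl, newttl);
        return (0);
    }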