diff --git a/share/man/man4/inet.4 b/share/man/man4/inet.4
--- a/share/man/man4/inet.4
+++ b/share/man/man4/inet.4
@@ -28,7 +28,7 @@
 .\" From: @(#)inet.4	8.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd August 1, 2022
+.Dd September 8, 2022
 .Dt INET 4
 .Os
 .Sh NAME
@@ -186,6 +186,9 @@
 .It Va fragpackets
 Integer: Current number of IPv4 fragment reassembly queue entries for the
 VNET (read-only).
+.It Va fragttl
+Integer: time to live, in seconds, for IPv4 packet fragments in the
+per-VNET reassembly queue.
 .It Va loopback_prefixlen
 Integer: prefix length of the address space reserved for loopback purposes.
 The default is 8, meaning that 127.0.0.0/8 is reserved for loopback,
diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h
--- a/sys/netinet/ip.h
+++ b/sys/netinet/ip.h
@@ -210,7 +210,6 @@
  */
 #define	MAXTTL		255		/* maximum time to live (seconds) */
 #define	IPDEFTTL	64		/* default ttl, from RFC 1340 */
-#define	IPFRAGTTL	60		/* time to live for frags, slowhz */
 #define	IPTTLDEC	1		/* subtracted when forwarding */
 
 #define	IP_MSS		576		/* default maximum segment size */
diff --git a/sys/netinet/ip_reass.c b/sys/netinet/ip_reass.c
--- a/sys/netinet/ip_reass.c
+++ b/sys/netinet/ip_reass.c
@@ -75,6 +75,10 @@
 struct ipqbucket {
 	TAILQ_HEAD(ipqhead, ipq) head;
 	struct mtx		 lock;
+	struct callout		 timer;
+#ifdef VIMAGE
+	struct vnet		 *vnet;
+#endif
 	int			 count;
 };
 
@@ -87,6 +91,7 @@
 #define	IPQ_TRYLOCK(i)	mtx_trylock(&V_ipq[i].lock)
 #define	IPQ_UNLOCK(i)	mtx_unlock(&V_ipq[i].lock)
 #define	IPQ_LOCK_ASSERT(i)	mtx_assert(&V_ipq[i].lock, MA_OWNED)
+#define	IPQ_BUCKET_LOCK_ASSERT(b)	mtx_assert(&(b)->lock, MA_OWNED)
 
 VNET_DEFINE_STATIC(int, ipreass_maxbucketsize);
 #define	V_ipreass_maxbucketsize	VNET(ipreass_maxbucketsize)
@@ -98,10 +103,13 @@
 #endif
 static int	sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
 static int	sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
+static int	sysctl_fragttl(SYSCTL_HANDLER_ARGS);
 static void	ipreass_zone_change(void *);
 static void	ipreass_drain_tomax(void);
 static void	ipq_free(struct ipqbucket *, struct ipq *);
 static struct ipq * ipq_reuse(int);
+static void	ipreass_callout(void *);
+static void	ipreass_reschedule(struct ipqbucket *);
 
 static inline void
 ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
@@ -117,6 +125,7 @@
 
 	IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
 	ipq_free(bucket, fp);
+	ipreass_reschedule(bucket);
 }
 
 /*
@@ -167,9 +176,11 @@
     sysctl_maxfragbucketsize, "I",
     "Maximum number of IPv4 fragment reassembly queue entries per bucket");
 
-static u_int ipfragttl = IPFRAGTTL / 2;
-SYSCTL_UINT(_net_inet_ip, OID_AUTO, fragttl, CTLFLAG_RD, &ipfragttl,
-    IPFRAGTTL / 2, "IP fragment life time on reassembly queue");
+VNET_DEFINE_STATIC(u_int, ipfragttl) = 30;
+#define	V_ipfragttl	VNET(ipfragttl)
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, fragttl, CTLTYPE_INT | CTLFLAG_RW |
+    CTLFLAG_MPSAFE | CTLFLAG_VNET, NULL, 0, sysctl_fragttl, "IU",
+    "IP fragment life time on reassembly queue (seconds)");
 
 /*
  * Take incoming datagram fragment and try to reassemble it into
@@ -311,7 +322,7 @@
 		V_ipq[hash].count++;
 		fp->ipq_nfrags = 1;
 		atomic_add_int(&nfrags, 1);
-		fp->ipq_ttl = IPFRAGTTL;
+		fp->ipq_expire = time_uptime + V_ipfragttl;
 		fp->ipq_p = ip->ip_p;
 		fp->ipq_id = ip->ip_id;
 		fp->ipq_src = ip->ip_src;
@@ -322,6 +333,12 @@
 		else
 			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
 		m->m_nextpkt = NULL;
+		if (fp == TAILQ_LAST(head, ipqhead))
+			callout_reset_sbt(&V_ipq[hash].timer,
+			    SBT_1S * V_ipfragttl, SBT_1S, ipreass_callout,
+			    &V_ipq[hash], 0);
+		else
+			MPASS(callout_active(&V_ipq[hash].timer));
 		goto done;
 	} else {
 		/*
@@ -509,6 +526,7 @@
 		m->m_pkthdr.rcvif = srcifp;
 	}
 	IPSTAT_INC(ips_reassembled);
+	ipreass_reschedule(&V_ipq[hash]);
 	IPQ_UNLOCK(hash);
 
 #ifdef RSS
@@ -560,44 +578,48 @@
 }
 
 /*
- * If a timer expires on a reassembly queue, discard it.
+ * Timer expired on a bucket.
+ * There should be at least one ipq to be timed out.
  */
-static struct callout ipreass_callout;
 static void
-ipreass_slowtimo(void *arg __unused)
+ipreass_callout(void *arg)
 {
-	VNET_ITERATOR_DECL(vnet_iter);
-	struct ipq *fp, *tmp;
+	struct ipqbucket *bucket = arg;
+	struct ipq *fp;
 
-	if (atomic_load_int(&nfrags) == 0)
-		return;
+	IPQ_BUCKET_LOCK_ASSERT(bucket);
+	MPASS(atomic_load_int(&nfrags) > 0);
 
-	VNET_FOREACH(vnet_iter) {
-		CURVNET_SET(vnet_iter);
-		for (int i = 0; i < IPREASS_NHASH; i++) {
-			if (TAILQ_EMPTY(&V_ipq[i].head))
-				continue;
-			IPQ_LOCK(i);
-			TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp)
-				if (--fp->ipq_ttl == 0)
-					ipq_timeout(&V_ipq[i], fp);
-			IPQ_UNLOCK(i);
-		}
-		CURVNET_RESTORE();
-	}
-	VNET_LIST_RUNLOCK_NOSLEEP();
+	CURVNET_SET(bucket->vnet);
+	fp = TAILQ_LAST(&bucket->head, ipqhead);
+	KASSERT(fp != NULL && fp->ipq_expire <= time_uptime,
+	    ("%s: stray callout on bucket %p", __func__, bucket));
 
-	callout_reset_sbt(&ipreass_callout, SBT_1MS * 500, SBT_1MS * 10,
-	    ipreass_slowtimo, NULL, 0);
+	while (fp != NULL && fp->ipq_expire <= time_uptime) {
+		ipq_timeout(bucket, fp);
+		fp = TAILQ_LAST(&bucket->head, ipqhead);
+	}
+	ipreass_reschedule(bucket);
+	CURVNET_RESTORE();
 }
 
 static void
-ipreass_timer_init(void *arg __unused)
+ipreass_reschedule(struct ipqbucket *bucket)
 {
+	struct ipq *fp;
 
-	callout_init(&ipreass_callout, 1);
-	callout_reset_sbt(&ipreass_callout, SBT_1MS * 500, SBT_1MS * 10,
-	    ipreass_slowtimo, NULL, 0);
+	IPQ_BUCKET_LOCK_ASSERT(bucket);
+
+	if ((fp = TAILQ_LAST(&bucket->head, ipqhead)) != NULL) {
+		time_t t;
+
+		/* Protect against time_uptime tick. */
+		t = fp->ipq_expire - time_uptime;
+		t = (t > 0) ? t : 1;
+		callout_reset_sbt(&bucket->timer, SBT_1S * t, SBT_1S,
+		    ipreass_callout, bucket, 0);
+	} else
+		callout_stop(&bucket->timer);
 }
 
 static void
@@ -614,7 +636,6 @@
 		IPQ_UNLOCK(i);
 	}
 }
-SYSINIT(ipreass, SI_SUB_VNET_DONE, SI_ORDER_ANY, ipreass_timer_init, NULL);
 
 /*
  * Drain off all datagram fragments.
@@ -644,7 +665,11 @@
 		TAILQ_INIT(&V_ipq[i].head);
 		mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
 		    MTX_DEF | MTX_DUPOK);
+		callout_init_mtx(&V_ipq[i].timer, &V_ipq[i].lock, 0);
 		V_ipq[i].count = 0;
+#ifdef VIMAGE
+		V_ipq[i].vnet = curvnet;
+#endif
 	}
 	V_ipq_hashseed = arc4random();
 	V_maxfragsperpacket = 16;
@@ -745,6 +770,7 @@
 		while (V_ipq[i].count > V_ipreass_maxbucketsize &&
 		    (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
 			ipq_timeout(&V_ipq[i], fp);
+		ipreass_reschedule(&V_ipq[i]);
 		IPQ_UNLOCK(i);
 	}
 
@@ -759,8 +785,10 @@
 	for (int i = 0; i < IPREASS_NHASH; i++) {
 		IPQ_LOCK(i);
 		fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
-		if (fp != NULL)
+		if (fp != NULL) {
 			ipq_timeout(&V_ipq[i], fp);
+			ipreass_reschedule(&V_ipq[i]);
+		}
 		IPQ_UNLOCK(i);
 	}
 }
@@ -854,6 +882,7 @@
 		}
 		TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
 		V_ipq[bucket].count--;
+		ipreass_reschedule(&V_ipq[bucket]);
 		if (bucket != start)
 			IPQ_UNLOCK(bucket);
 		break;
@@ -902,3 +931,24 @@
 	ipreass_drain_tomax();
 	return (0);
 }
+
+/*
+ * Get or set the IP fragment time to live.
+ */
+static int
+sysctl_fragttl(SYSCTL_HANDLER_ARGS)
+{
+	u_int ttl;
+	int error;
+
+	ttl = V_ipfragttl;
+	error = sysctl_handle_int(oidp, &ttl, 0, req);
+	if (error || !req->newptr)
+		return (error);
+
+	if (ttl < 1 || ttl > MAXTTL)
+		return (EINVAL);
+
+	atomic_store_int(&V_ipfragttl, ttl);
+	return (0);
+}
diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h
--- a/sys/netinet/ip_var.h
+++ b/sys/netinet/ip_var.h
@@ -56,18 +56,18 @@
 /*
  * Ip reassembly queue structure.  Each fragment
  * being reassembled is attached to one of these structures.
- * They are timed out after ipq_ttl drops to 0, and may also
- * be reclaimed if memory becomes tight.
+ * They are timed out after net.inet.ip.fragttl seconds, and may also be
+ * reclaimed if memory becomes tight.
  */
 struct ipq {
 	TAILQ_ENTRY(ipq) ipq_list;	/* to other reass headers */
-	u_char	ipq_ttl;		/* time for reass q to live */
+	time_t	ipq_expire;		/* time_uptime when ipq expires */
+	u_char	ipq_nfrags;		/* # frags in this packet */
 	u_char	ipq_p;			/* protocol of this fragment */
 	u_short	ipq_id;			/* sequence id for reassembly */
 	int	ipq_maxoff;		/* total length of packet */
 	struct mbuf *ipq_frags;		/* to ip headers of fragments */
 	struct in_addr ipq_src,ipq_dst;
-	u_char	ipq_nfrags;		/* # frags in this packet */
 	struct label *ipq_label;	/* MAC label */
 };
 #endif /* _KERNEL */
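The patch turns net.inet.ip.fragttl from a read-only tunable into a writable, per-VNET sysctl whose handler accepts values in [1, MAXTTL]. A minimal userland sketch, assuming a kernel with this change applied: it reads the knob through sysctlbyname(3) and lowers it to 10 seconds. The program is illustrative only and not part of the patch.

/*
 * Read net.inet.ip.fragttl, then set it to 10 seconds.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	u_int ttl, newttl = 10;
	size_t len = sizeof(ttl);

	if (sysctlbyname("net.inet.ip.fragttl", &ttl, &len, NULL, 0) == -1)
		err(1, "read fragttl");
	printf("fragttl: %u seconds\n", ttl);

	/* Needs privilege; values outside [1, MAXTTL] fail with EINVAL. */
	if (sysctlbyname("net.inet.ip.fragttl", NULL, NULL, &newttl,
	    sizeof(newttl)) == -1)
		err(1, "write fragttl");
	return (0);
}

The same can be done from the shell with "sysctl net.inet.ip.fragttl=10".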
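The per-bucket callout replaces the old global scan that ran every 500 ms. It works because each bucket's TAILQ is kept in expiry order: new ipqs are inserted at the head and all receive the same TTL when created, so TAILQ_LAST() is always the next entry to expire and a single timer armed for it covers the whole bucket. The userland sketch below illustrates that invariant with the same <sys/queue.h> macros; the *_sim names are hypothetical, and only the clamping arithmetic mirrors ipreass_reschedule().

/*
 * Userland illustration (not kernel code) of the per-bucket timer
 * invariant: the tail of the queue is always the entry expiring first.
 */
#include <sys/queue.h>

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct ipq_sim {
	TAILQ_ENTRY(ipq_sim) ipq_list;
	time_t ipq_expire;		/* absolute expiry time */
};

TAILQ_HEAD(ipqhead_sim, ipq_sim);

/*
 * Seconds until the bucket's next expiry, clamped to at least 1 second
 * to protect against the clock ticking, or -1 for an empty bucket.
 */
static time_t
next_timeout(struct ipqhead_sim *head, time_t now)
{
	struct ipq_sim *fp;
	time_t t;

	if ((fp = TAILQ_LAST(head, ipqhead_sim)) == NULL)
		return (-1);	/* empty: the kernel would callout_stop() */
	t = fp->ipq_expire - now;
	return (t > 0 ? t : 1);
}

int
main(void)
{
	struct ipqhead_sim head = TAILQ_HEAD_INITIALIZER(head);
	struct ipq_sim a, b;
	time_t now = time(NULL);

	/* New entries go to the head, so the tail is always the oldest. */
	a.ipq_expire = now + 30;
	TAILQ_INSERT_HEAD(&head, &a, ipq_list);
	b.ipq_expire = now + 35;	/* arrives 5 seconds "later" */
	TAILQ_INSERT_HEAD(&head, &b, ipq_list);

	printf("arm the bucket timer for %jd seconds\n",
	    (intmax_t)next_timeout(&head, now));	/* prints 30 */
	return (0);
}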