Replace the single global IP reassembly mutex with one mutex per
reassembly hash bucket (ipqlock[IPREASS_NHASH]).  Callers now pass the
bucket index to IPQ_LOCK()/IPQ_UNLOCK()/IPQ_LOCK_ASSERT() and ip_freef().

ip_reass() must not lock a second bucket while holding its own, so the
"steal a queue from some other bucket" step is deferred via do_purge and
run by ip_reass_purge_element() after the bucket lock is dropped.  The
deferred purge is honored on both exits of ip_reass(): the reassembled
path and the done: path.

Whitespace in context lines was reconstructed; use "patch -l" if a hunk
fails to match on whitespace alone.

Index: head/sys/netinet/ip_input.c
===================================================================
--- head/sys/netinet/ip_input.c
+++ head/sys/netinet/ip_input.c
@@ -166,15 +166,18 @@
 
 static VNET_DEFINE(uma_zone_t, ipq_zone);
 static VNET_DEFINE(TAILQ_HEAD(ipqhead, ipq), ipq[IPREASS_NHASH]);
-static struct mtx ipqlock;
+static struct mtx_padalign ipqlock[IPREASS_NHASH];
 
 #define	V_ipq_zone		VNET(ipq_zone)
 #define	V_ipq			VNET(ipq)
 
-#define	IPQ_LOCK()	mtx_lock(&ipqlock)
-#define	IPQ_UNLOCK()	mtx_unlock(&ipqlock)
-#define	IPQ_LOCK_INIT()	mtx_init(&ipqlock, "ipqlock", NULL, MTX_DEF)
-#define	IPQ_LOCK_ASSERT()	mtx_assert(&ipqlock, MA_OWNED)
+/*
+ * The ipqlock array is global, /not/ per-VNET.
+ */
+#define	IPQ_LOCK(i)	mtx_lock(&ipqlock[(i)])
+#define	IPQ_UNLOCK(i)	mtx_unlock(&ipqlock[(i)])
+#define	IPQ_LOCK_INIT(i)	mtx_init(&ipqlock[(i)], "ipqlock", NULL, MTX_DEF)
+#define	IPQ_LOCK_ASSERT(i)	mtx_assert(&ipqlock[(i)], MA_OWNED)
 
 static void	maxnipq_update(void);
 static void	ipq_zone_change(void *);
@@ -206,7 +209,7 @@
     "IP stealth mode, no TTL decrementation on forwarding");
 #endif
 
-static void	ip_freef(struct ipqhead *, struct ipq *);
+static void	ip_freef(struct ipqhead *, int, struct ipq *);
 
 /*
  * IP statistics are stored in the "array" of counter(9)s.
@@ -373,7 +376,8 @@
 		NULL, EVENTHANDLER_PRI_ANY);
 
 	/* Initialize various other remaining things. */
-	IPQ_LOCK_INIT();
+	for (i = 0; i < IPREASS_NHASH; i++)
+		IPQ_LOCK_INIT(i);
 	netisr_register(&ip_nh);
 #ifdef	RSS
 	netisr_register(&ip_direct_nh);
@@ -393,9 +397,7 @@
 	/* Cleanup in_ifaddr hash table; should be empty. */
 	hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask);
 
-	IPQ_LOCK();
 	ip_drain_locked();
-	IPQ_UNLOCK();
 
 	uma_zdestroy(V_ipq_zone);
 }
@@ -856,6 +858,41 @@
 #define	M_IP_FRAG	M_PROTO9
 
 /*
+ * Attempt to purge something from the reassembly queue to make
+ * room.
+ *
+ * Must be called without any IPQ locks held, as it will attempt
+ * to lock each in turn.
+ *
+ * 'skip_bucket' is the bucket to skip over, or -1 to
+ * not skip over anything.
+ *
+ * Returns the bucket being freed, or -1 for no action.
+ */
+static int
+ip_reass_purge_element(int skip_bucket)
+{
+	int i;
+	struct ipq *r;
+
+	for (i = 0; i < IPREASS_NHASH; i++) {
+		if (skip_bucket > -1 && i == skip_bucket)
+			continue;
+		IPQ_LOCK(i);
+		r = TAILQ_LAST(&V_ipq[i], ipqhead);
+		if (r) {
+			IPSTAT_ADD(ips_fragtimeout,
+			    r->ipq_nfrags);
+			ip_freef(&V_ipq[i], i, r);
+			IPQ_UNLOCK(i);
+			return (i);
+		}
+		IPQ_UNLOCK(i);
+	}
+	return (-1);
+}
+
+/*
  * Take incoming datagram fragment and try to reassemble it into
  * whole datagram.  If the argument is the first fragment or one
  * in between the function will return NULL and store the mbuf
@@ -878,6 +915,7 @@
 #ifdef	RSS
 	uint32_t rss_hash, rss_type;
 #endif
+	int do_purge = 0;
 
 	/* If maxnipq or maxfragsperpacket are 0, never accept fragments. */
 	if (V_maxnipq == 0 || V_maxfragsperpacket == 0) {
@@ -892,7 +930,7 @@
 
 	hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
 	head = &V_ipq[hash];
-	IPQ_LOCK();
+	IPQ_LOCK(hash);
 
 	/*
 	 * Look for queue of fragments
@@ -921,18 +959,14 @@
 		 */
 		struct ipq *q = TAILQ_LAST(head, ipqhead);
 		if (q == NULL) {	/* gak */
-			for (i = 0; i < IPREASS_NHASH; i++) {
-				struct ipq *r = TAILQ_LAST(&V_ipq[i], ipqhead);
-				if (r) {
-					IPSTAT_ADD(ips_fragtimeout,
-					    r->ipq_nfrags);
-					ip_freef(&V_ipq[i], r);
-					break;
-				}
-			}
+			/*
+			 * Defer doing this until later, when the
+			 * lock is no longer held.
+			 */
+			do_purge = 1;
 		} else {
 			IPSTAT_ADD(ips_fragtimeout, q->ipq_nfrags);
-			ip_freef(head, q);
+			ip_freef(head, hash, q);
 		}
 	}
 
@@ -1093,7 +1127,7 @@
 		if (ntohs(GETIP(q)->ip_off) != next) {
 			if (fp->ipq_nfrags > V_maxfragsperpacket) {
 				IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
-				ip_freef(head, fp);
+				ip_freef(head, hash, fp);
 			}
 			goto done;
 		}
@@ -1103,7 +1137,7 @@
 	if (p->m_flags & M_IP_FRAG) {
 		if (fp->ipq_nfrags > V_maxfragsperpacket) {
 			IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
-			ip_freef(head, fp);
+			ip_freef(head, hash, fp);
 		}
 		goto done;
 	}
@@ -1116,7 +1150,7 @@
 	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
 		IPSTAT_INC(ips_toolong);
 		IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
-		ip_freef(head, fp);
+		ip_freef(head, hash, fp);
 		goto done;
 	}
 
@@ -1166,7 +1200,20 @@
 	if (m->m_flags & M_PKTHDR)	/* XXX this should be done elsewhere */
 		m_fixhdr(m);
 	IPSTAT_INC(ips_reassembled);
-	IPQ_UNLOCK();
+	IPQ_UNLOCK(hash);
+
+	/*
+	 * Do the delayed purge to keep fragment counts under
+	 * the configured maximum.
+	 *
+	 * This is delayed so that it's not done with another IPQ bucket
+	 * lock held.
+	 *
+	 * Note that we pass in the bucket to /skip/ over, not
+	 * the bucket to /purge/.
+	 */
+	if (do_purge)
+		ip_reass_purge_element(hash);
 
 #ifdef	RSS
 	/*
@@ -1208,7 +1255,15 @@
 		fp->ipq_nfrags--;
 	m_freem(m);
 done:
-	IPQ_UNLOCK();
+	IPQ_UNLOCK(hash);
+
+	/*
+	 * Honor a deferred purge on this exit as well; without this a
+	 * purge requested above is dropped whenever reassembly does
+	 * not complete.  The bucket lock has been released by now.
+	 */
+	if (do_purge)
+		ip_reass_purge_element(hash);
 	return (NULL);
 
 #undef GETIP
@@ -1219,11 +1274,11 @@
  * associated datagrams.
  */
 static void
-ip_freef(struct ipqhead *fhp, struct ipq *fp)
+ip_freef(struct ipqhead *fhp, int i, struct ipq *fp)
 {
 	struct mbuf *q;
 
-	IPQ_LOCK_ASSERT();
+	IPQ_LOCK_ASSERT(i);
 
 	while (fp->ipq_frags) {
 		q = fp->ipq_frags;
@@ -1248,10 +1303,10 @@
 	int i;
 
 	VNET_LIST_RLOCK_NOSLEEP();
-	IPQ_LOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		for (i = 0; i < IPREASS_NHASH; i++) {
+			IPQ_LOCK(i);
 			for(fp = TAILQ_FIRST(&V_ipq[i]); fp;) {
 				struct ipq *fpp;
 
@@ -1260,9 +1315,10 @@
 				if(--fpp->ipq_ttl == 0) {
 					IPSTAT_ADD(ips_fragtimeout,
 					    fpp->ipq_nfrags);
-					ip_freef(&V_ipq[i], fpp);
+					ip_freef(&V_ipq[i], i, fpp);
 				}
 			}
+			IPQ_UNLOCK(i);
 		}
 		/*
 		 * If we are over the maximum number of fragments
@@ -1271,37 +1327,41 @@
 		 */
 		if (V_maxnipq >= 0 && V_nipq > V_maxnipq) {
 			for (i = 0; i < IPREASS_NHASH; i++) {
+				IPQ_LOCK(i);
 				while (V_nipq > V_maxnipq &&
 				    !TAILQ_EMPTY(&V_ipq[i])) {
 					IPSTAT_ADD(ips_fragdropped,
 					    TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
 					ip_freef(&V_ipq[i],
+					    i,
 					    TAILQ_FIRST(&V_ipq[i]));
 				}
+				IPQ_UNLOCK(i);
 			}
 		}
 		CURVNET_RESTORE();
 	}
-	IPQ_UNLOCK();
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Drain off all datagram fragments.
+ *
+ * Call without any IPQ locks held.
  */
 static void
 ip_drain_locked(void)
 {
 	int i;
 
-	IPQ_LOCK_ASSERT();
-
 	for (i = 0; i < IPREASS_NHASH; i++) {
+		IPQ_LOCK(i);
 		while(!TAILQ_EMPTY(&V_ipq[i])) {
 			IPSTAT_ADD(ips_fragdropped,
 			    TAILQ_FIRST(&V_ipq[i])->ipq_nfrags);
-			ip_freef(&V_ipq[i], TAILQ_FIRST(&V_ipq[i]));
+			ip_freef(&V_ipq[i], i, TAILQ_FIRST(&V_ipq[i]));
 		}
+		IPQ_UNLOCK(i);
 	}
 }
 
@@ -1311,13 +1371,11 @@
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
-	IPQ_LOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		ip_drain_locked();
 		CURVNET_RESTORE();
 	}
-	IPQ_UNLOCK();
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 