diff --git a/sys/netinet/ip_mroute.h b/sys/netinet/ip_mroute.h --- a/sys/netinet/ip_mroute.h +++ b/sys/netinet/ip_mroute.h @@ -199,7 +199,7 @@ }; /* max. number of upcalls to deliver together */ -#define BW_UPCALLS_MAX 128 +#define BW_UPCALLS_MAX 1024 /* min. threshold time interval for bandwidth measurement */ #define BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC 3 #define BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC 0 @@ -264,6 +264,10 @@ u_long v_pkt_out; /* # pkts out on interface */ u_long v_bytes_in; /* # bytes in on interface */ u_long v_bytes_out; /* # bytes out on interface */ +#ifdef _KERNEL + struct mtx v_spin; /* Spin mutex for pkt stats */ + char v_spin_name[32]; +#endif }; #ifdef _KERNEL @@ -287,8 +291,7 @@ for Lower-or-EQual case */ struct bw_meter *mfc_bw_meter_geq; /* list of bandwidth meters for Greater-or-EQual case */ - u_long mfc_nstall; /* # of packets awaiting mfc */ - TAILQ_HEAD(, rtdetq) mfc_stall; /* q of packets awaiting mfc */ + struct buf_ring *mfc_stall_ring; /* ring of awaiting mfc */ }; #endif /* _KERNEL */ @@ -349,6 +352,8 @@ #ifdef _KERNEL struct callout bm_meter_callout; /* Periodic callout */ void* arg; /* custom argument */ + struct mtx bm_spin; /* meter spin lock */ + char bm_spin_name[32]; #endif }; diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c --- a/sys/netinet/ip_mroute.c +++ b/sys/netinet/ip_mroute.c @@ -82,6 +82,7 @@ #include #include #include +#include #include #include #include @@ -139,13 +140,19 @@ * structures. 
*/ -static struct mtx mrouter_mtx; -#define MROUTER_LOCK() mtx_lock(&mrouter_mtx) -#define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx) -#define MROUTER_LOCK_ASSERT() mtx_assert(&mrouter_mtx, MA_OWNED) -#define MROUTER_LOCK_INIT() \ - mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF) -#define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx) +static struct rwlock mrouter_mtx; +#define MRW_RLOCK() rw_rlock(&mrouter_mtx) +#define MRW_WLOCK() rw_wlock(&mrouter_mtx) +#define MRW_RUNLOCK() rw_runlock(&mrouter_mtx) +#define MRW_WUNLOCK() rw_wunlock(&mrouter_mtx) +#define MRW_UNLOCK() rw_unlock(&mrouter_mtx) +#define MRW_LOCK_ASSERT() rw_assert(&mrouter_mtx, RA_LOCKED) +#define MRW_WLOCK_ASSERT() rw_assert(&mrouter_mtx, RA_WLOCKED) +#define MRW_LOCK_TRY_UPGRADE() rw_try_upgrade(&mrouter_mtx) +#define MRW_WOWNED() rw_wowned(&mrouter_mtx) +#define MRW_LOCK_INIT() \ + rw_init(&mrouter_mtx, "IPv4 multicast forwarding") +#define MRW_LOCK_DESTROY() rw_destroy(&mrouter_mtx) static int ip_mrouter_cnt; /* # of vnets with active mrouters */ static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ @@ -170,32 +177,25 @@ VNET_DEFINE_STATIC(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl); #define V_mfchashtbl VNET(mfchashtbl) -static struct mtx mfc_mtx; -#define MFC_LOCK() mtx_lock(&mfc_mtx) -#define MFC_UNLOCK() mtx_unlock(&mfc_mtx) -#define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) -#define MFC_LOCK_INIT() \ - mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF) -#define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) - VNET_DEFINE_STATIC(vifi_t, numvifs); #define V_numvifs VNET(numvifs) VNET_DEFINE_STATIC(struct vif *, viftable); #define V_viftable VNET(viftable) -static struct mtx vif_mtx; -#define VIF_LOCK() mtx_lock(&vif_mtx) -#define VIF_UNLOCK() mtx_unlock(&vif_mtx) -#define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) -#define VIF_LOCK_INIT() \ - mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF) -#define 
VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) - static eventhandler_tag if_detach_event_tag = NULL; VNET_DEFINE_STATIC(struct callout, expire_upcalls_ch); #define V_expire_upcalls_ch VNET(expire_upcalls_ch) +VNET_DEFINE_STATIC(struct mtx, upcall_thread_mtx); +#define V_upcall_thread_mtx VNET(upcall_thread_mtx) + +VNET_DEFINE_STATIC(struct cv, upcall_thread_cv); +#define V_upcall_thread_cv VNET(upcall_thread_cv) + +VNET_DEFINE_STATIC(struct mtx, buf_ring_mtx); +#define V_buf_ring_mtx VNET(buf_ring_mtx) + #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ @@ -205,15 +205,15 @@ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); /* - * Pending upcalls are stored in a vector which is flushed when + * Pending upcalls are stored in a rung which is flushed when * full, or periodically */ -VNET_DEFINE_STATIC(struct bw_upcall *, bw_upcalls); -#define V_bw_upcalls VNET(bw_upcalls) -VNET_DEFINE_STATIC(u_int, bw_upcalls_n); /* # of pending upcalls */ -#define V_bw_upcalls_n VNET(bw_upcalls_n) VNET_DEFINE_STATIC(struct callout, bw_upcalls_ch); #define V_bw_upcalls_ch VNET(bw_upcalls_ch) +VNET_DEFINE_STATIC(struct buf_ring *, bw_upcalls_ring); +#define V_bw_upcalls_ring VNET(bw_upcalls_ring) +VNET_DEFINE_STATIC(struct mtx, bw_upcalls_ring_mtx); /* # of pending upcalls */ +#define V_bw_upcalls_ring_mtx VNET(bw_upcalls_ring_mtx) #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ @@ -231,6 +231,8 @@ &pim_squelch_wholepkt, 0, "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified"); +static volatile int upcall_thread_shutdown = 0; + static const struct encaptab *pim_encap_cookie; static int pim_encapcheck(const struct mbuf *, int, int, void *); static int pim_input(struct mbuf *, int, int, void *); @@ -373,18 +375,41 @@ { struct mfc *rt; - MFC_LOCK_ASSERT(); + /* + * Might be called both RLOCK and WLOCK. + * Check if any, it's caller responsibility + * to choose correct option. 
+ */
+	MRW_LOCK_ASSERT();
 
 	LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
 		if (in_hosteq(rt->mfc_origin, *o) &&
 		    in_hosteq(rt->mfc_mcastgrp, *g) &&
-		    TAILQ_EMPTY(&rt->mfc_stall))
+		    buf_ring_empty(rt->mfc_stall_ring))
 			break;
 	}
 
 	return (rt);
 }
 
+static __inline struct mfc *
+mfc_alloc(void)
+{
+	struct mfc *rt;
+	rt = (struct mfc*) malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT | M_ZERO);
+	if (rt == NULL)
+		return rt;
+
+	rt->mfc_stall_ring = buf_ring_alloc(MAX_UPQ, M_DEVBUF,
+	    M_NOWAIT, &V_buf_ring_mtx);
+	if (rt->mfc_stall_ring == NULL) {
+		/* rt was allocated with M_MRTABLE; free with matching type */
+		free(rt, M_MRTABLE);
+		return NULL;
+	}
+
+	return rt;
+}
+
 /*
  * Handle MRT setsockopt commands to modify the multicast forwarding tables.
  */
@@ -558,17 +583,17 @@
 {
 	struct mfc *rt;
 
-	MFC_LOCK();
+	MRW_RLOCK();
 	rt = mfc_find(&req->src, &req->grp);
 	if (rt == NULL) {
-		MFC_UNLOCK();
+		MRW_RUNLOCK();
 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
 		return EADDRNOTAVAIL;
 	}
 	req->pktcnt = rt->mfc_pkt_cnt;
 	req->bytecnt = rt->mfc_byte_cnt;
 	req->wrong_if = rt->mfc_wrong_if;
-	MFC_UNLOCK();
+	MRW_RUNLOCK();
 
 	return 0;
 }
@@ -580,17 +605,19 @@
 {
 	vifi_t vifi = req->vifi;
 
-	VIF_LOCK();
+	MRW_RLOCK();
 	if (vifi >= V_numvifs) {
-		VIF_UNLOCK();
+		MRW_RUNLOCK();
 		return EINVAL;
 	}
 
+	mtx_lock_spin(&V_viftable[vifi].v_spin);
 	req->icount = V_viftable[vifi].v_pkt_in;
 	req->ocount = V_viftable[vifi].v_pkt_out;
 	req->ibytes = V_viftable[vifi].v_bytes_in;
 	req->obytes = V_viftable[vifi].v_bytes_out;
-	VIF_UNLOCK();
+	mtx_unlock_spin(&V_viftable[vifi].v_spin);
+	MRW_RUNLOCK();
 
 	return 0;
 }
@@ -601,16 +628,13 @@
 	vifi_t vifi;
 	u_long i;
 
-	MROUTER_LOCK();
+	MRW_WLOCK();
 
 	if (V_ip_mrouter == NULL) {
-		MROUTER_UNLOCK();
+		MRW_WUNLOCK();
 		return;
 	}
 
-	VIF_LOCK();
-	MFC_LOCK();
-
 	/*
 	 * Tear down multicast forwarder state associated with this ifnet.
 	 * 1. Walk the vif list, matching vifs against this ifnet.
@@ -634,10 +658,26 @@
 			del_vif_locked(vifi);
 	}
 
-	MFC_UNLOCK();
-	VIF_UNLOCK();
+	MRW_WUNLOCK();
+}
+
+static void
+ip_mrouter_upcall_thread(void *arg)
+{
+	CURVNET_SET((struct vnet *) arg);
+
+	while (upcall_thread_shutdown == 0) {
+		/* START: Event loop */
-	MROUTER_UNLOCK();
+		/* END: Event loop */
+		mtx_lock(&V_upcall_thread_mtx);
+		cv_timedwait(&V_upcall_thread_cv, &V_upcall_thread_mtx, hz);
+		mtx_unlock(&V_upcall_thread_mtx);
+	}
+
+	upcall_thread_shutdown = 0;
+	CURVNET_RESTORE();
+	kthread_exit();
 }
 
 /*
@@ -656,21 +696,35 @@
 	if (version != 1)
 		return ENOPROTOOPT;
 
-	MROUTER_LOCK();
+	MRW_WLOCK();
 
 	if (ip_mrouter_unloading) {
-		MROUTER_UNLOCK();
+		MRW_WUNLOCK();
 		return ENOPROTOOPT;
 	}
 
 	if (V_ip_mrouter != NULL) {
-		MROUTER_UNLOCK();
+		MRW_WUNLOCK();
 		return EADDRINUSE;
 	}
 
 	V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash,
 	    HASH_NOWAIT);
 
+	/* Create upcall ring */
+	mtx_init(&V_bw_upcalls_ring_mtx, "mroute upcall buf_ring mtx", NULL, MTX_DEF);
+	V_bw_upcalls_ring = buf_ring_alloc(BW_UPCALLS_MAX, M_MRTABLE,
+	    M_NOWAIT, &V_bw_upcalls_ring_mtx);
+	if (!V_bw_upcalls_ring) {
+		MRW_WUNLOCK(); return (ENOMEM); }
+
+	/* Create upcall thread */
+	upcall_thread_shutdown = 0;
+	mtx_init(&V_upcall_thread_mtx, "ip_mroute upcall thread mtx", NULL, MTX_DEF);
+	cv_init(&V_upcall_thread_cv, "ip_mroute upcall cv");
+	kthread_add(ip_mrouter_upcall_thread, curvnet,
+	    NULL, NULL, 0, 0, "ip_mroute upcall thread");
+
 	callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
 	    curvnet);
 	callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
@@ -679,7 +733,10 @@
 	V_ip_mrouter = so;
 	ip_mrouter_cnt++;
 
-	MROUTER_UNLOCK();
+	/* This is a mutex required by buf_ring init, but not used internally */
+	mtx_init(&V_buf_ring_mtx, "mroute buf_ring mtx", NULL, MTX_DEF);
+
+	MRW_WUNLOCK();
 
 	CTR1(KTR_IPMF, "%s: done", __func__);
 
@@ -695,11 +752,12 @@
 	struct ifnet *ifp;
 	u_long i;
 	vifi_t vifi;
+	struct bw_upcall *bu;
 
-	MROUTER_LOCK();
+	MRW_WLOCK();
 
 	if (V_ip_mrouter == NULL) {
-
MROUTER_UNLOCK(); + MRW_WUNLOCK(); return EINVAL; } @@ -712,7 +770,22 @@ MROUTER_WAIT(); - VIF_LOCK(); + upcall_thread_shutdown = 1; + mtx_lock(&V_upcall_thread_mtx); + cv_signal(&V_upcall_thread_cv); + mtx_unlock(&V_upcall_thread_mtx); + + /* Wait for thread shutdown */ + while (upcall_thread_shutdown == 1) {}; + + mtx_destroy(&V_upcall_thread_mtx); + + /* Destroy upcall ring */ + while ((bu = buf_ring_dequeue_mc(V_bw_upcalls_ring)) != NULL) { + free(bu, M_MRTABLE); + } + buf_ring_free(V_bw_upcalls_ring, M_MRTABLE); + mtx_destroy(&V_bw_upcalls_ring_mtx); /* * For each phyint in use, disable promiscuous reception of all IP @@ -729,13 +802,9 @@ V_numvifs = 0; V_pim_assert_enabled = 0; - VIF_UNLOCK(); - callout_stop(&V_expire_upcalls_ch); callout_stop(&V_bw_upcalls_ch); - MFC_LOCK(); - /* * Free all multicast forwarding cache entries. * Do not use hashdestroy(), as we must perform other cleanup. @@ -752,13 +821,11 @@ bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize); - V_bw_upcalls_n = 0; - - MFC_UNLOCK(); - V_reg_vif_num = VIFI_INVALID; - MROUTER_UNLOCK(); + mtx_destroy(&V_buf_ring_mtx); + + MRW_WUNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); @@ -803,17 +870,17 @@ return EPERM; } - MFC_LOCK(); + MRW_RLOCK(); for (i = 0; i < mfchashsize; i++) { if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) { - MFC_UNLOCK(); + MRW_RUNLOCK(); *apival = 0; return EPERM; } } - MFC_UNLOCK(); + MRW_RUNLOCK(); V_mrt_api_config = *apival & mrt_api_support; *apival = V_mrt_api_config; @@ -833,23 +900,23 @@ struct ifnet *ifp; int error; - VIF_LOCK(); + MRW_WLOCK(); if (vifcp->vifc_vifi >= MAXVIFS) { - VIF_UNLOCK(); + MRW_WUNLOCK(); return EINVAL; } /* rate limiting is no longer supported by this code */ if (vifcp->vifc_rate_limit != 0) { log(LOG_ERR, "rate limiting is no longer supported\n"); - VIF_UNLOCK(); + MRW_WUNLOCK(); return EINVAL; } if (!in_nullhost(vifp->v_lcl_addr)) { - VIF_UNLOCK(); + MRW_WUNLOCK(); return EADDRINUSE; } if (in_nullhost(vifcp->vifc_lcl_addr)) { - VIF_UNLOCK(); + 
MRW_WUNLOCK(); return EADDRNOTAVAIL; } @@ -869,7 +936,7 @@ ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { NET_EPOCH_EXIT(et); - VIF_UNLOCK(); + MRW_WUNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; @@ -879,7 +946,7 @@ if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__); - VIF_UNLOCK(); + MRW_WUNLOCK(); return EOPNOTSUPP; } else if (vifcp->vifc_flags & VIFF_REGISTER) { ifp = &V_multicast_register_if; @@ -891,14 +958,14 @@ } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { - VIF_UNLOCK(); + MRW_WUNLOCK(); return EOPNOTSUPP; } /* Enable promiscuous reception of all IP multicasts from the if */ error = if_allmulti(ifp, 1); if (error) { - VIF_UNLOCK(); + MRW_WUNLOCK(); return error; } } @@ -913,12 +980,14 @@ vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; + sprintf(vifp->v_spin_name, "BM[%d] spin", vifcp->vifc_vifi); + mtx_init(&vifp->v_spin, vifp->v_spin_name, NULL, MTX_SPIN); /* Adjust numvifs up if the vifi is higher than numvifs */ if (V_numvifs <= vifcp->vifc_vifi) V_numvifs = vifcp->vifc_vifi + 1; - VIF_UNLOCK(); + MRW_WUNLOCK(); CTR4(KTR_IPMF, "%s: add vif %d laddr 0x%08x thresh %x", __func__, (int)vifcp->vifc_vifi, ntohl(vifcp->vifc_lcl_addr.s_addr), @@ -935,7 +1004,7 @@ { struct vif *vifp; - VIF_LOCK_ASSERT(); + MRW_WLOCK_ASSERT(); if (vifi >= V_numvifs) { return EINVAL; @@ -951,6 +1020,8 @@ if (vifp->v_flags & VIFF_REGISTER) V_reg_vif_num = VIFI_INVALID; + mtx_destroy(&vifp->v_spin); + bzero((caddr_t)vifp, sizeof (*vifp)); CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi); @@ -969,9 +1040,9 @@ { int cc; - VIF_LOCK(); + MRW_WLOCK(); cc = del_vif_locked(vifi); - VIF_UNLOCK(); + MRW_WUNLOCK(); return cc; } @@ -1018,18 +1089,21 @@ static void expire_mfc(struct mfc *rt) { - struct rtdetq *rte, *nrte; + struct rtdetq *rte; - MFC_LOCK_ASSERT(); + MRW_WLOCK_ASSERT(); free_bw_list(rt->mfc_bw_meter_leq); 
free_bw_list(rt->mfc_bw_meter_geq); - TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { + while (!buf_ring_empty(rt->mfc_stall_ring)) { + rte = buf_ring_dequeue_mc(rt->mfc_stall_ring); + if (rte) { m_freem(rte->m); - TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); free(rte, M_MRTABLE); } + } + buf_ring_free(rt->mfc_stall_ring, M_DEVBUF); LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); @@ -1042,13 +1116,11 @@ add_mfc(struct mfcctl2 *mfccp) { struct mfc *rt; - struct rtdetq *rte, *nrte; + struct rtdetq *rte; u_long hash = 0; u_short nstl; - VIF_LOCK(); - MFC_LOCK(); - + MRW_WLOCK(); rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); /* If an entry already exists, just update the fields */ @@ -1058,8 +1130,7 @@ (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); update_mfc_params(rt, mfccp); - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); return (0); } @@ -1071,7 +1142,7 @@ LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && - !TAILQ_EMPTY(&rt->mfc_stall)) { + !buf_ring_empty(rt->mfc_stall_ring)) { CTR5(KTR_IPMF, "%s: add mfc orig 0x%08x group %lx parent %x qh %p", __func__, ntohl(mfccp->mfcc_origin.s_addr), @@ -1086,12 +1157,11 @@ V_nexpire[hash]--; /* Free queued packets, but attempt to forward them first. 
*/ - TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { + while (!buf_ring_empty(rt->mfc_stall_ring)) { + rte = buf_ring_dequeue_mc(rt->mfc_stall_ring); if (rte->ifp != NULL) ip_mdq(rte->m, rte->ifp, rt, -1); m_freem(rte->m); - TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); - rt->mfc_nstall--; free(rte, M_MRTABLE); } } @@ -1114,16 +1184,13 @@ } if (rt == NULL) { /* no upcall, so make a new entry */ - rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); + rt = mfc_alloc(); if (rt == NULL) { - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); return (ENOBUFS); } init_mfc_params(rt, mfccp); - TAILQ_INIT(&rt->mfc_stall); - rt->mfc_nstall = 0; rt->mfc_expire = 0; rt->mfc_bw_meter_leq = NULL; @@ -1134,8 +1201,7 @@ } } - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); return (0); } @@ -1156,11 +1222,11 @@ CTR3(KTR_IPMF, "%s: delete mfc orig 0x%08x group %lx", __func__, ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); - MFC_LOCK(); + MRW_WLOCK(); rt = mfc_find(&origin, &mcastgrp); if (rt == NULL) { - MFC_UNLOCK(); + MRW_WUNLOCK(); return EADDRNOTAVAIL; } @@ -1175,7 +1241,7 @@ LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); - MFC_UNLOCK(); + MRW_WUNLOCK(); return (0); } @@ -1219,6 +1285,10 @@ struct mfc *rt; int error; vifi_t vifi; + struct mbuf *mb0; + struct rtdetq *rte; + u_long hash; + int hlen; CTR3(KTR_IPMF, "ip_mforward: delete mfc orig 0x%08x group %lx ifp %p", ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), ifp); @@ -1237,14 +1307,15 @@ return (1); } - VIF_LOCK(); - MFC_LOCK(); + /* + * BEGIN: MCAST ROUTING HOT PATH + */ + MRW_RLOCK(); if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) { if (ip->ip_ttl < MAXTTL) ip->ip_ttl++; /* compensate for -1 in *_send routines */ error = ip_mdq(m, ifp, NULL, vifi); - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_RUNLOCK(); return error; } @@ -1253,11 +1324,11 @@ * or a packet destined to a local-only group. 
*/ if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) { - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_RUNLOCK(); return 0; } + mfc_find_retry: /* * Determine forwarding vifs from the forwarding cache table */ @@ -1267,19 +1338,27 @@ /* Entry exists, so forward if necessary */ if (rt != NULL) { error = ip_mdq(m, ifp, rt, -1); - MFC_UNLOCK(); - VIF_UNLOCK(); + /* Generic unlock here as we might release R or W lock */ + MRW_UNLOCK(); return error; - } else { + } + + /* + * END: MCAST ROUTING HOT PATH + */ + + /* Further processing must be done with WLOCK taken */ + if ((MRW_WOWNED() == 0) && (MRW_LOCK_TRY_UPGRADE() == 0)) { + MRW_RUNLOCK(); + MRW_WLOCK(); + goto mfc_find_retry; + } + /* * If we don't have a route for packet's origin, * Make a copy of the packet & send message to routing daemon */ - - struct mbuf *mb0; - struct rtdetq *rte; - u_long hash; - int hlen = ip->ip_hl << 2; + hlen = ip->ip_hl << 2; MRTSTAT_INC(mrts_mfc_misses); MRTSTAT_INC(mrts_no_route); @@ -1294,8 +1373,7 @@ rte = (struct rtdetq*) malloc((sizeof *rte), M_MRTABLE, M_NOWAIT|M_ZERO); if (rte == NULL) { - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); return ENOBUFS; } @@ -1304,17 +1382,17 @@ mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { free(rte, M_MRTABLE); - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); return ENOBUFS; } /* is there an upcall waiting for this flow ? 
*/ hash = MFCHASH(ip->ip_src, ip->ip_dst); - LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { + LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) + { if (in_hosteq(ip->ip_src, rt->mfc_origin) && in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && - !TAILQ_EMPTY(&rt->mfc_stall)) + !buf_ring_empty(rt->mfc_stall_ring)) break; } @@ -1335,7 +1413,7 @@ goto non_fatal; /* no upcall, so make a new entry */ - rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); + rt = mfc_alloc(); if (rt == NULL) goto fail; @@ -1360,13 +1438,10 @@ if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { CTR0(KTR_IPMF, "ip_mforward: socket queue full"); MRTSTAT_INC(mrts_upq_sockfull); -fail1: - free(rt, M_MRTABLE); -fail: - free(rte, M_MRTABLE); + fail1: free(rt, M_MRTABLE); + fail: free(rte, M_MRTABLE); m_freem(mb0); - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); return ENOBUFS; } @@ -1392,38 +1467,27 @@ rt->mfc_wrong_if = 0; timevalclear(&rt->mfc_last_assert); - TAILQ_INIT(&rt->mfc_stall); - rt->mfc_nstall = 0; - - /* link into table */ - LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); - TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link); - rt->mfc_nstall++; - + buf_ring_enqueue(rt->mfc_stall_ring, rte); } else { /* determine if queue has overflowed */ - if (rt->mfc_nstall > MAX_UPQ) { + if (buf_ring_full(rt->mfc_stall_ring)) { MRTSTAT_INC(mrts_upq_ovflw); -non_fatal: - free(rte, M_MRTABLE); + non_fatal: free(rte, M_MRTABLE); m_freem(mb0); - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); return (0); } - TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link); - rt->mfc_nstall++; + + buf_ring_enqueue(rt->mfc_stall_ring, rte); } rte->m = mb0; rte->ifp = ifp; - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); return 0; } -} /* * Clean up the cache entry if upcall is not serviced @@ -1435,7 +1499,7 @@ CURVNET_SET((struct vnet *) arg); - MFC_LOCK(); + /*This callout is always run with MRW_WLOCK taken. 
*/ for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; @@ -1444,7 +1508,7 @@ continue; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { - if (TAILQ_EMPTY(&rt->mfc_stall)) + if (buf_ring_empty(rt->mfc_stall_ring)) continue; if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) @@ -1459,8 +1523,6 @@ } } - MFC_UNLOCK(); - callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); @@ -1477,7 +1539,7 @@ vifi_t vifi; int plen = ntohs(ip->ip_len); - VIF_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); /* * If xmt_vif is not -1, send on only the requested vif. @@ -1554,6 +1616,7 @@ } /* If I sourced this packet, it counts as output, else it was input. */ + mtx_lock_spin(&V_viftable[vifi].v_spin); if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; @@ -1561,6 +1624,8 @@ V_viftable[vifi].v_pkt_in++; V_viftable[vifi].v_bytes_in += plen; } + mtx_unlock_spin(&V_viftable[vifi].v_spin); + rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; @@ -1588,16 +1653,22 @@ struct timeval now; microtime(&now); - MFC_LOCK_ASSERT(); /* Process meters for Greater-or-EQual case */ for (x = rt->mfc_bw_meter_geq; x != NULL; x = x->bm_mfc_next) bw_meter_geq_receive_packet(x, plen, &now); /* Process meters for Lower-or-EQual case */ for (x = rt->mfc_bw_meter_leq; x != NULL; x = x->bm_mfc_next) { - /* Record that a packet is received */ + /* + * Record that a packet is received. 
+ * Spin lock has to be taken as callout context + * (expire_bw_meter_leq) might modify these fields + * as well + */ + mtx_lock_spin(&x->bm_spin); x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; + mtx_unlock_spin(&x->bm_spin); } } @@ -1616,10 +1687,10 @@ if (vif < 0) return (ret); - VIF_LOCK(); + MRW_RLOCK(); if (vif < V_numvifs) ret = 1; - VIF_UNLOCK(); + MRW_RUNLOCK(); return (ret); } @@ -1636,10 +1707,10 @@ if (vifi < 0) return (addr); - VIF_LOCK(); + MRW_RLOCK(); if (vifi < V_numvifs) addr = V_viftable[vifi].v_lcl_addr.s_addr; - VIF_UNLOCK(); + MRW_RUNLOCK(); return (addr); } @@ -1650,7 +1721,7 @@ struct mbuf *mb_copy; int hlen = ip->ip_hl << 2; - VIF_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); /* * Make a new reference to the packet; make sure that @@ -1672,7 +1743,7 @@ struct ip_moptions imo; int error __unused; - VIF_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; @@ -1755,7 +1826,7 @@ struct timeval now; /* * INFO: - * callout is always executed with MFC_LOCK taken + * callout is always executed with MRW_WLOCK taken */ CURVNET_SET((struct vnet *)x->arg); @@ -1774,12 +1845,19 @@ } /* Send all upcalls that are pending delivery */ - bw_upcalls_send(); + mtx_lock(&V_upcall_thread_mtx); + cv_signal(&V_upcall_thread_cv); + mtx_unlock(&V_upcall_thread_mtx); /* Reset counters */ x->bm_start_time = now; + /* Spin lock has to be taken as ip_forward context + * might modify these fields as well + */ + mtx_lock_spin(&x->bm_spin); x->bm_measured.b_bytes = 0; x->bm_measured.b_packets = 0; + mtx_unlock_spin(&x->bm_spin); callout_schedule(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time)); @@ -1820,10 +1898,10 @@ /* * Find if we have already same bw_meter entry */ - MFC_LOCK(); + MRW_WLOCK(); mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { - MFC_UNLOCK(); + MRW_WUNLOCK(); return EADDRNOTAVAIL; } @@ -1842,15 +1920,16 @@ == req->bu_threshold.b_bytes) && (x->bm_flags & 
BW_METER_USER_FLAGS) == flags) { - MFC_UNLOCK(); + MRW_WUNLOCK(); return 0; /* XXX Already installed */ } } /* Allocate the new bw_meter entry */ - x = (struct bw_meter*) malloc(sizeof(*x), M_BWMETER, M_NOWAIT); + x = (struct bw_meter*) malloc(sizeof(*x), M_BWMETER, + M_ZERO | M_NOWAIT); if (x == NULL) { - MFC_UNLOCK(); + MRW_WUNLOCK(); return ENOBUFS; } @@ -1866,10 +1945,12 @@ x->bm_time_next = NULL; x->bm_mfc = mfc; x->arg = curvnet; + sprintf(x->bm_spin_name, "BM spin %p", x); + mtx_init(&x->bm_spin, x->bm_spin_name, NULL, MTX_SPIN); /* For LEQ case create periodic callout */ if (req->bu_flags & BW_UPCALL_LEQ) { - callout_init_mtx(&x->bm_meter_callout, &mfc_mtx,0); + callout_init_rw(&x->bm_meter_callout, &mrouter_mtx, CALLOUT_SHAREDLOCK); callout_reset(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time), expire_bw_meter_leq, x); } @@ -1878,7 +1959,7 @@ x->bm_mfc_next = *bwm_ptr; *bwm_ptr = x; - MFC_UNLOCK(); + MRW_WUNLOCK(); return 0; } @@ -1889,9 +1970,11 @@ while (list != NULL) { struct bw_meter *x = list; - /* MFC_LOCK must be held here */ - if (x->bm_flags & BW_METER_LEQ) + /* MRW_WLOCK must be held here */ + if (x->bm_flags & BW_METER_LEQ) { callout_drain(&x->bm_meter_callout); + mtx_destroy(&x->bm_spin); + } list = list->bm_mfc_next; free(x, M_BWMETER); @@ -1910,12 +1993,12 @@ if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; - MFC_LOCK(); + MRW_WLOCK(); /* Find the corresponding MFC entry */ mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { - MFC_UNLOCK(); + MRW_WUNLOCK(); return EADDRNOTAVAIL; } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { /* @@ -1932,7 +2015,7 @@ list = mfc->mfc_bw_meter_geq; mfc->mfc_bw_meter_geq = NULL; free_bw_list(list); - MFC_UNLOCK(); + MRW_WUNLOCK(); return 0; } else { /* Delete a single bw_meter entry */ struct bw_meter *prev; @@ -1965,12 +2048,12 @@ if (req->bu_flags & BW_UPCALL_LEQ) callout_stop(&x->bm_meter_callout); - MFC_UNLOCK(); + MRW_WUNLOCK(); /* Free the bw_meter entry */ free(x, 
M_BWMETER);
 		return 0;
 	} else {
-		MFC_UNLOCK();
+		MRW_WUNLOCK();
 		return EINVAL;
 	}
 }
@@ -1985,13 +2068,15 @@
 {
 	struct timeval delta;
 
-	MFC_LOCK_ASSERT();
+	MRW_LOCK_ASSERT();
 
 	delta = *nowp;
 	BW_TIMEVALDECR(&delta, &x->bm_start_time);
 
 	/*
-	 * Processing for ">=" type of bw_meter entry
+	 * Processing for ">=" type of bw_meter entry.
+	 * bm_spin does not have to be hold here as in GEQ
+	 * case this is the only context accessing bm_measured.
 	 */
 	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 		/* Reset the bw_meter entry */
@@ -2029,7 +2114,7 @@
 	struct timeval delta;
 	struct bw_upcall *u;
 
-	MFC_LOCK_ASSERT();
+	MRW_LOCK_ASSERT();
 
 	/*
 	 * Compute the measured time interval
@@ -2037,16 +2122,14 @@
 	delta = *nowp;
 	BW_TIMEVALDECR(&delta, &x->bm_start_time);
 
-	/*
-	 * If there are too many pending upcalls, deliver them now
-	 */
-	if (V_bw_upcalls_n >= BW_UPCALLS_MAX)
-		bw_upcalls_send();
-
 	/*
 	 * Set the bw_upcall entry
 	 */
-	u = &V_bw_upcalls[V_bw_upcalls_n++];
+	u = malloc(sizeof(struct bw_upcall), M_MRTABLE, M_NOWAIT | M_ZERO);
+	if (!u) {
+		log(LOG_WARNING, "bw_meter_prepare_upcall: cannot allocate entry\n");
+		return;
+	}
 	u->bu_src = x->bm_mfc->mfc_origin;
 	u->bu_dst = x->bm_mfc->mfc_mcastgrp;
 	u->bu_threshold.b_time = x->bm_threshold.b_time;
@@ -2064,8 +2147,15 @@
 		u->bu_flags |= BW_UPCALL_GEQ;
 	if (x->bm_flags & BW_METER_LEQ)
 		u->bu_flags |= BW_UPCALL_LEQ;
-}
+	if (buf_ring_enqueue(V_bw_upcalls_ring, u)) {
+		log(LOG_WARNING, "bw_meter_prepare_upcall: cannot enqueue upcall\n"); free(u, M_MRTABLE); }
+	if (buf_ring_count(V_bw_upcalls_ring) > (BW_UPCALLS_MAX / 2)) {
+		mtx_lock(&V_upcall_thread_mtx);
+		cv_signal(&V_upcall_thread_cv);
+		mtx_unlock(&V_upcall_thread_mtx);
+	}
+}
 /*
  * Send the pending bandwidth-related upcalls
  */
@@ -2073,7 +2163,8 @@
 bw_upcalls_send(void)
 {
 	struct mbuf *m;
-	int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]);
+	int len = 0;
+	struct bw_upcall *bu;
 	struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 	static struct igmpmsg igmpmsg = { 0,		/* unused1 */
 					  0,		/* unused2 */
@@ -2084,12 +2175,10 @@ { 0 }, /* im_src */ { 0 } }; /* im_dst */ - MFC_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); - if (V_bw_upcalls_n == 0) - return; /* No pending upcalls */ - - V_bw_upcalls_n = 0; + if (buf_ring_empty(V_bw_upcalls_ring)) + return; /* * Allocate a new mbuf, initialize it with the header and @@ -2102,7 +2191,11 @@ } m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); - m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&V_bw_upcalls[0]); + len += sizeof(struct igmpmsg); + while ((bu = buf_ring_dequeue_mc(V_bw_upcalls_ring)) != NULL) { + m_copyback(m, len, sizeof(struct bw_upcall), (caddr_t)bu); + len += sizeof(struct bw_upcall); + } /* * Send the upcalls @@ -2123,9 +2216,9 @@ { CURVNET_SET((struct vnet *) arg); - MFC_LOCK(); + /* This callout is run with MRW_RLOCK taken */ + bw_upcalls_send(); - MFC_UNLOCK(); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); @@ -2241,7 +2334,7 @@ struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; - VIF_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); /* * Add a new mbuf with an upcall header @@ -2294,7 +2387,7 @@ int len = ntohs(ip->ip_len); vifi_t vifi = rt->mfc_parent; - VIF_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) { m_freem(mb_copy); @@ -2467,9 +2560,9 @@ u_int32_t *reghdr; struct ifnet *vifp; - VIF_LOCK(); + MRW_RLOCK(); if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) { - VIF_UNLOCK(); + MRW_RUNLOCK(); CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__, (int)V_reg_vif_num); m_freem(m); @@ -2477,7 +2570,7 @@ } /* XXX need refcnt? 
*/ vifp = V_viftable[V_reg_vif_num].v_ifp; - VIF_UNLOCK(); + MRW_RUNLOCK(); /* * Validate length @@ -2603,7 +2696,7 @@ if (error) return (error); - MFC_LOCK(); + MRW_RLOCK(); for (i = 0; i < mfchashsize; i++) { LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) { error = SYSCTL_OUT(req, rt, sizeof(struct mfc)); @@ -2612,7 +2705,7 @@ } } out_locked: - MFC_UNLOCK(); + MRW_RUNLOCK(); return (error); } @@ -2634,9 +2727,9 @@ if (error) return (error); - VIF_LOCK(); + MRW_RLOCK(); error = SYSCTL_OUT(req, V_viftable, sizeof(*V_viftable) * MAXVIFS); - VIF_UNLOCK(); + MRW_RUNLOCK(); return (error); } @@ -2653,11 +2746,9 @@ V_viftable = mallocarray(MAXVIFS, sizeof(*V_viftable), M_MRTABLE, M_WAITOK|M_ZERO); - V_bw_upcalls = mallocarray(BW_UPCALLS_MAX, sizeof(*V_bw_upcalls), - M_MRTABLE, M_WAITOK|M_ZERO); - callout_init(&V_expire_upcalls_ch, 1); - callout_init(&V_bw_upcalls_ch, 1); + callout_init_rw(&V_expire_upcalls_ch, &mrouter_mtx, 0); + callout_init_rw(&V_bw_upcalls_ch, &mrouter_mtx, 0); } VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init, @@ -2667,7 +2758,6 @@ vnet_mroute_uninit(const void *unused __unused) { - free(V_bw_upcalls, M_MRTABLE); free(V_viftable, M_MRTABLE); free(V_nexpire, M_MRTABLE); V_nexpire = NULL; @@ -2682,20 +2772,17 @@ switch (type) { case MOD_LOAD: - MROUTER_LOCK_INIT(); + MRW_LOCK_INIT(); if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, if_detached_event, NULL, EVENTHANDLER_PRI_ANY); if (if_detach_event_tag == NULL) { printf("ip_mroute: unable to register " "ifnet_departure_event handler\n"); - MROUTER_LOCK_DESTROY(); + MRW_LOCK_DESTROY(); return (EINVAL); } - MFC_LOCK_INIT(); - VIF_LOCK_INIT(); - mfchashsize = MFCHASHSIZE; if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) && !powerof2(mfchashsize)) { @@ -2711,9 +2798,7 @@ pim_encap_cookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK); if (pim_encap_cookie == NULL) { printf("ip_mroute: unable to attach pim encap\n"); - VIF_LOCK_DESTROY(); - 
MFC_LOCK_DESTROY(); - MROUTER_LOCK_DESTROY(); + MRW_LOCK_DESTROY(); return (EINVAL); } @@ -2740,13 +2825,13 @@ * just loaded and then unloaded w/o starting up a user * process we still need to cleanup. */ - MROUTER_LOCK(); + MRW_WLOCK(); if (ip_mrouter_cnt != 0) { - MROUTER_UNLOCK(); + MRW_WUNLOCK(); return (EINVAL); } ip_mrouter_unloading = 1; - MROUTER_UNLOCK(); + MRW_WUNLOCK(); EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); @@ -2768,9 +2853,7 @@ mrt_ioctl = NULL; rsvp_input_p = NULL; - VIF_LOCK_DESTROY(); - MFC_LOCK_DESTROY(); - MROUTER_LOCK_DESTROY(); + MRW_LOCK_DESTROY(); break; default: