diff --git a/sys/netinet/ip_mroute.h b/sys/netinet/ip_mroute.h --- a/sys/netinet/ip_mroute.h +++ b/sys/netinet/ip_mroute.h @@ -199,7 +199,7 @@ }; /* max. number of upcalls to deliver together */ -#define BW_UPCALLS_MAX 128 +#define BW_UPCALLS_MAX 1024 /* min. threshold time interval for bandwidth measurement */ #define BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC 3 #define BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC 0 @@ -264,6 +264,10 @@ u_long v_pkt_out; /* # pkts out on interface */ u_long v_bytes_in; /* # bytes in on interface */ u_long v_bytes_out; /* # bytes out on interface */ +#ifdef _KERNEL + struct mtx v_spin; /* Spin mutex for pkt stats */ + char v_spin_name[32]; +#endif }; #if defined(_KERNEL) || defined (_NETSTAT) @@ -287,8 +291,7 @@ for Lower-or-EQual case */ struct bw_meter *mfc_bw_meter_geq; /* list of bandwidth meters for Greater-or-EQual case */ - u_long mfc_nstall; /* # of packets awaiting mfc */ - TAILQ_HEAD(, rtdetq) mfc_stall; /* q of packets awaiting mfc */ + struct buf_ring *mfc_stall_ring; /* ring of awaiting mfc */ }; #endif /* _KERNEL */ @@ -349,6 +352,8 @@ #ifdef _KERNEL struct callout bm_meter_callout; /* Periodic callout */ void* arg; /* custom argument */ + struct mtx bm_spin; /* meter spin lock */ + char bm_spin_name[32]; #endif }; diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c --- a/sys/netinet/ip_mroute.c +++ b/sys/netinet/ip_mroute.c @@ -80,8 +80,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -136,13 +138,19 @@ * structures. 
*/ -static struct mtx mrouter_mtx; -#define MROUTER_LOCK() mtx_lock(&mrouter_mtx) -#define MROUTER_UNLOCK() mtx_unlock(&mrouter_mtx) -#define MROUTER_LOCK_ASSERT() mtx_assert(&mrouter_mtx, MA_OWNED) -#define MROUTER_LOCK_INIT() \ - mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF) -#define MROUTER_LOCK_DESTROY() mtx_destroy(&mrouter_mtx) +static struct rwlock mrouter_mtx; +#define MRW_RLOCK() rw_rlock(&mrouter_mtx) +#define MRW_WLOCK() rw_wlock(&mrouter_mtx) +#define MRW_RUNLOCK() rw_runlock(&mrouter_mtx) +#define MRW_WUNLOCK() rw_wunlock(&mrouter_mtx) +#define MRW_UNLOCK() rw_unlock(&mrouter_mtx) +#define MRW_LOCK_ASSERT() rw_assert(&mrouter_mtx, RA_LOCKED) +#define MRW_WLOCK_ASSERT() rw_assert(&mrouter_mtx, RA_WLOCKED) +#define MRW_LOCK_TRY_UPGRADE() rw_try_upgrade(&mrouter_mtx) +#define MRW_WOWNED() rw_wowned(&mrouter_mtx) +#define MRW_LOCK_INIT() \ + rw_init(&mrouter_mtx, "IPv4 multicast forwarding") +#define MRW_LOCK_DESTROY() rw_destroy(&mrouter_mtx) static int ip_mrouter_cnt; /* # of vnets with active mrouters */ static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */ @@ -167,32 +175,25 @@ VNET_DEFINE_STATIC(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl); #define V_mfchashtbl VNET(mfchashtbl) -static struct mtx mfc_mtx; -#define MFC_LOCK() mtx_lock(&mfc_mtx) -#define MFC_UNLOCK() mtx_unlock(&mfc_mtx) -#define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) -#define MFC_LOCK_INIT() \ - mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF) -#define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) - VNET_DEFINE_STATIC(vifi_t, numvifs); #define V_numvifs VNET(numvifs) VNET_DEFINE_STATIC(struct vif *, viftable); #define V_viftable VNET(viftable) -static struct mtx vif_mtx; -#define VIF_LOCK() mtx_lock(&vif_mtx) -#define VIF_UNLOCK() mtx_unlock(&vif_mtx) -#define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) -#define VIF_LOCK_INIT() \ - mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF) -#define 
VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) - static eventhandler_tag if_detach_event_tag = NULL; VNET_DEFINE_STATIC(struct callout, expire_upcalls_ch); #define V_expire_upcalls_ch VNET(expire_upcalls_ch) +VNET_DEFINE_STATIC(struct mtx, upcall_thread_mtx); +#define V_upcall_thread_mtx VNET(upcall_thread_mtx) + +VNET_DEFINE_STATIC(struct cv, upcall_thread_cv); +#define V_upcall_thread_cv VNET(upcall_thread_cv) + +VNET_DEFINE_STATIC(struct mtx, buf_ring_mtx); +#define V_buf_ring_mtx VNET(buf_ring_mtx) + #define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ #define UPCALL_EXPIRE 6 /* number of timeouts */ @@ -202,15 +203,15 @@ static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); /* - * Pending upcalls are stored in a vector which is flushed when + * Pending upcalls are stored in a ring which is flushed when * full, or periodically */ -VNET_DEFINE_STATIC(struct bw_upcall *, bw_upcalls); -#define V_bw_upcalls VNET(bw_upcalls) -VNET_DEFINE_STATIC(u_int, bw_upcalls_n); /* # of pending upcalls */ -#define V_bw_upcalls_n VNET(bw_upcalls_n) VNET_DEFINE_STATIC(struct callout, bw_upcalls_ch); #define V_bw_upcalls_ch VNET(bw_upcalls_ch) +VNET_DEFINE_STATIC(struct buf_ring *, bw_upcalls_ring); +#define V_bw_upcalls_ring VNET(bw_upcalls_ring) +VNET_DEFINE_STATIC(struct mtx, bw_upcalls_ring_mtx); +#define V_bw_upcalls_ring_mtx VNET(bw_upcalls_ring_mtx) #define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ @@ -228,6 +229,8 @@ &pim_squelch_wholepkt, 0, "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified"); +static volatile int upcall_thread_shutdown = 0; + static const struct encaptab *pim_encap_cookie; static int pim_encapcheck(const struct mbuf *, int, int, void *); static int pim_input(struct mbuf *, int, int, void *); @@ -367,18 +370,41 @@ { struct mfc *rt; - MFC_LOCK_ASSERT(); + /* + * Might be called both RLOCK and WLOCK. + * Check if any, it's caller responsibility + * to choose correct option. 
+ */ + MRW_LOCK_ASSERT(); LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { if (in_hosteq(rt->mfc_origin, *o) && in_hosteq(rt->mfc_mcastgrp, *g) && - TAILQ_EMPTY(&rt->mfc_stall)) + buf_ring_empty(rt->mfc_stall_ring)) break; } return (rt); } +static __inline struct mfc * +mfc_alloc(void) +{ + struct mfc *rt; + rt = (struct mfc*) malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT | M_ZERO); + if (rt == NULL) + return rt; + + rt->mfc_stall_ring = buf_ring_alloc(MAX_UPQ, M_MRTABLE, + M_NOWAIT, &V_buf_ring_mtx); + if (rt->mfc_stall_ring == NULL) { + free(rt, M_MRTABLE); + return NULL; + } + + return rt; +} + /* * Handle MRT setsockopt commands to modify the multicast forwarding tables. */ @@ -552,17 +578,17 @@ { struct mfc *rt; - MFC_LOCK(); + MRW_RLOCK(); rt = mfc_find(&req->src, &req->grp); if (rt == NULL) { - MFC_UNLOCK(); + MRW_RUNLOCK(); req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; return EADDRNOTAVAIL; } req->pktcnt = rt->mfc_pkt_cnt; req->bytecnt = rt->mfc_byte_cnt; req->wrong_if = rt->mfc_wrong_if; - MFC_UNLOCK(); + MRW_RUNLOCK(); return 0; } @@ -574,17 +600,19 @@ { vifi_t vifi = req->vifi; - VIF_LOCK(); + MRW_RLOCK(); if (vifi >= V_numvifs) { - VIF_UNLOCK(); + MRW_RUNLOCK(); return EINVAL; } + mtx_lock_spin(&V_viftable[vifi].v_spin); req->icount = V_viftable[vifi].v_pkt_in; req->ocount = V_viftable[vifi].v_pkt_out; req->ibytes = V_viftable[vifi].v_bytes_in; req->obytes = V_viftable[vifi].v_bytes_out; - VIF_UNLOCK(); + mtx_unlock_spin(&V_viftable[vifi].v_spin); + MRW_RUNLOCK(); return 0; } @@ -595,16 +623,13 @@ vifi_t vifi; u_long i; - MROUTER_LOCK(); + MRW_WLOCK(); if (V_ip_mrouter == NULL) { - MROUTER_UNLOCK(); + MRW_WUNLOCK(); return; } - VIF_LOCK(); - MFC_LOCK(); - /* * Tear down multicast forwarder state associated with this ifnet. * 1. Walk the vif list, matching vifs against this ifnet. 
@@ -628,10 +653,26 @@ del_vif_locked(vifi); } - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); +} + +static void +ip_mrouter_upcall_thread(void *arg) +{ + CURVNET_SET((struct vnet *) arg); + + while (upcall_thread_shutdown == 0) { + /* START: Event loop */ + + /* END: Event loop */ + mtx_lock(&V_upcall_thread_mtx); + cv_timedwait(&V_upcall_thread_cv, &V_upcall_thread_mtx, hz); + mtx_unlock(&V_upcall_thread_mtx); + } - MROUTER_UNLOCK(); + upcall_thread_shutdown = 0; + CURVNET_RESTORE(); + kthread_exit(); } /* @@ -650,21 +691,35 @@ if (version != 1) return ENOPROTOOPT; - MROUTER_LOCK(); + MRW_WLOCK(); if (ip_mrouter_unloading) { - MROUTER_UNLOCK(); + MRW_WUNLOCK(); return ENOPROTOOPT; } if (V_ip_mrouter != NULL) { - MROUTER_UNLOCK(); + MRW_WUNLOCK(); return EADDRINUSE; } V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash, HASH_NOWAIT); + /* Create upcall ring */ + mtx_init(&V_bw_upcalls_ring_mtx, "mroute upcall buf_ring mtx", NULL, MTX_DEF); + V_bw_upcalls_ring = buf_ring_alloc(BW_UPCALLS_MAX, M_MRTABLE, + M_NOWAIT, &V_bw_upcalls_ring_mtx); + if (!V_bw_upcalls_ring) + return (ENOMEM); + + /* Create upcall thread */ + upcall_thread_shutdown = 0; + mtx_init(&V_upcall_thread_mtx, "ip_mroute upcall thread mtx", NULL, MTX_DEF); + cv_init(&V_upcall_thread_cv, "ip_mroute upcall cv"); + kthread_add(ip_mrouter_upcall_thread, curvnet, + NULL, NULL, 0, 0, "ip_mroute upcall thread"); + callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, @@ -673,7 +728,10 @@ V_ip_mrouter = so; ip_mrouter_cnt++; - MROUTER_UNLOCK(); + /* This is a mutex required by buf_ring init, but not used internally */ + mtx_init(&V_buf_ring_mtx, "mroute buf_ring mtx", NULL, MTX_DEF); + + MRW_WUNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); @@ -689,11 +747,12 @@ struct ifnet *ifp; u_long i; vifi_t vifi; + struct bw_upcall *bu; - MROUTER_LOCK(); + MRW_WLOCK(); if (V_ip_mrouter == NULL) { - 
MROUTER_UNLOCK(); + MRW_WUNLOCK(); return EINVAL; } @@ -706,7 +765,22 @@ MROUTER_WAIT(); - VIF_LOCK(); + upcall_thread_shutdown = 1; + mtx_lock(&V_upcall_thread_mtx); + cv_signal(&V_upcall_thread_cv); + mtx_unlock(&V_upcall_thread_mtx); + + /* Wait for thread shutdown */ + while (upcall_thread_shutdown == 1) {}; + + mtx_destroy(&V_upcall_thread_mtx); + + /* Destroy upcall ring */ + while ((bu = buf_ring_dequeue_mc(V_bw_upcalls_ring)) != NULL) { + free(bu, M_MRTABLE); + } + buf_ring_free(V_bw_upcalls_ring, M_MRTABLE); + mtx_destroy(&V_bw_upcalls_ring_mtx); /* * For each phyint in use, disable promiscuous reception of all IP @@ -723,13 +797,9 @@ V_numvifs = 0; V_pim_assert_enabled = 0; - VIF_UNLOCK(); - callout_stop(&V_expire_upcalls_ch); callout_stop(&V_bw_upcalls_ch); - MFC_LOCK(); - /* * Free all multicast forwarding cache entries. * Do not use hashdestroy(), as we must perform other cleanup. @@ -746,13 +816,11 @@ bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize); - V_bw_upcalls_n = 0; - - MFC_UNLOCK(); - V_reg_vif_num = VIFI_INVALID; - MROUTER_UNLOCK(); + mtx_destroy(&V_buf_ring_mtx); + + MRW_WUNLOCK(); CTR1(KTR_IPMF, "%s: done", __func__); @@ -797,17 +865,17 @@ return EPERM; } - MFC_LOCK(); + MRW_RLOCK(); for (i = 0; i < mfchashsize; i++) { if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) { - MFC_UNLOCK(); + MRW_RUNLOCK(); *apival = 0; return EPERM; } } - MFC_UNLOCK(); + MRW_RUNLOCK(); V_mrt_api_config = *apival & mrt_api_support; *apival = V_mrt_api_config; @@ -827,23 +895,23 @@ struct ifnet *ifp; int error; - VIF_LOCK(); + MRW_WLOCK(); if (vifcp->vifc_vifi >= MAXVIFS) { - VIF_UNLOCK(); + MRW_WUNLOCK(); return EINVAL; } /* rate limiting is no longer supported by this code */ if (vifcp->vifc_rate_limit != 0) { log(LOG_ERR, "rate limiting is no longer supported\n"); - VIF_UNLOCK(); + MRW_WUNLOCK(); return EINVAL; } if (!in_nullhost(vifp->v_lcl_addr)) { - VIF_UNLOCK(); + MRW_WUNLOCK(); return EADDRINUSE; } if (in_nullhost(vifcp->vifc_lcl_addr)) { - VIF_UNLOCK(); + 
MRW_WUNLOCK(); return EADDRNOTAVAIL; } @@ -863,7 +931,7 @@ ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == NULL) { NET_EPOCH_EXIT(et); - VIF_UNLOCK(); + MRW_WUNLOCK(); return EADDRNOTAVAIL; } ifp = ifa->ifa_ifp; @@ -873,7 +941,7 @@ if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__); - VIF_UNLOCK(); + MRW_WUNLOCK(); return EOPNOTSUPP; } else if (vifcp->vifc_flags & VIFF_REGISTER) { ifp = &V_multicast_register_if; @@ -885,14 +953,14 @@ } } else { /* Make sure the interface supports multicast */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { - VIF_UNLOCK(); + MRW_WUNLOCK(); return EOPNOTSUPP; } /* Enable promiscuous reception of all IP multicasts from the if */ error = if_allmulti(ifp, 1); if (error) { - VIF_UNLOCK(); + MRW_WUNLOCK(); return error; } } @@ -907,12 +975,14 @@ vifp->v_pkt_out = 0; vifp->v_bytes_in = 0; vifp->v_bytes_out = 0; + sprintf(vifp->v_spin_name, "BM[%d] spin", vifcp->vifc_vifi); + mtx_init(&vifp->v_spin, vifp->v_spin_name, NULL, MTX_SPIN); /* Adjust numvifs up if the vifi is higher than numvifs */ if (V_numvifs <= vifcp->vifc_vifi) V_numvifs = vifcp->vifc_vifi + 1; - VIF_UNLOCK(); + MRW_WUNLOCK(); CTR4(KTR_IPMF, "%s: add vif %d laddr 0x%08x thresh %x", __func__, (int)vifcp->vifc_vifi, ntohl(vifcp->vifc_lcl_addr.s_addr), @@ -929,7 +999,7 @@ { struct vif *vifp; - VIF_LOCK_ASSERT(); + MRW_WLOCK_ASSERT(); if (vifi >= V_numvifs) { return EINVAL; @@ -945,6 +1015,8 @@ if (vifp->v_flags & VIFF_REGISTER) V_reg_vif_num = VIFI_INVALID; + mtx_destroy(&vifp->v_spin); + bzero((caddr_t)vifp, sizeof (*vifp)); CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi); @@ -963,9 +1035,9 @@ { int cc; - VIF_LOCK(); + MRW_WLOCK(); cc = del_vif_locked(vifi); - VIF_UNLOCK(); + MRW_WUNLOCK(); return cc; } @@ -1012,18 +1084,21 @@ static void expire_mfc(struct mfc *rt) { - struct rtdetq *rte, *nrte; + struct rtdetq *rte; - MFC_LOCK_ASSERT(); + MRW_WLOCK_ASSERT(); free_bw_list(rt->mfc_bw_meter_leq); 
free_bw_list(rt->mfc_bw_meter_geq); - TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { - m_freem(rte->m); - TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); - free(rte, M_MRTABLE); + while (!buf_ring_empty(rt->mfc_stall_ring)) { + rte = buf_ring_dequeue_mc(rt->mfc_stall_ring); + if (rte) { + m_freem(rte->m); + free(rte, M_MRTABLE); + } } + buf_ring_free(rt->mfc_stall_ring, M_MRTABLE); LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); @@ -1036,13 +1111,11 @@ add_mfc(struct mfcctl2 *mfccp) { struct mfc *rt; - struct rtdetq *rte, *nrte; + struct rtdetq *rte; u_long hash = 0; u_short nstl; - VIF_LOCK(); - MFC_LOCK(); - + MRW_WLOCK(); rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); /* If an entry already exists, just update the fields */ @@ -1052,8 +1125,7 @@ (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent); update_mfc_params(rt, mfccp); - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); return (0); } @@ -1065,13 +1137,13 @@ LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && - !TAILQ_EMPTY(&rt->mfc_stall)) { + !buf_ring_empty(rt->mfc_stall_ring)) { CTR5(KTR_IPMF, "%s: add mfc orig 0x%08x group %lx parent %x qh %p", __func__, ntohl(mfccp->mfcc_origin.s_addr), (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), mfccp->mfcc_parent, - TAILQ_FIRST(&rt->mfc_stall)); + rt->mfc_stall_ring); if (nstl++) CTR1(KTR_IPMF, "%s: multiple matches", __func__); @@ -1080,12 +1152,11 @@ V_nexpire[hash]--; /* Free queued packets, but attempt to forward them first. 
*/ - TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { + while (!buf_ring_empty(rt->mfc_stall_ring)) { + rte = buf_ring_dequeue_mc(rt->mfc_stall_ring); if (rte->ifp != NULL) ip_mdq(rte->m, rte->ifp, rt, -1); m_freem(rte->m); - TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); - rt->mfc_nstall--; free(rte, M_MRTABLE); } } @@ -1108,16 +1179,13 @@ } if (rt == NULL) { /* no upcall, so make a new entry */ - rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); + rt = mfc_alloc(); if (rt == NULL) { - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); return (ENOBUFS); } init_mfc_params(rt, mfccp); - TAILQ_INIT(&rt->mfc_stall); - rt->mfc_nstall = 0; rt->mfc_expire = 0; rt->mfc_bw_meter_leq = NULL; @@ -1128,8 +1196,7 @@ } } - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); return (0); } @@ -1150,11 +1217,11 @@ CTR3(KTR_IPMF, "%s: delete mfc orig 0x%08x group %lx", __func__, ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); - MFC_LOCK(); + MRW_WLOCK(); rt = mfc_find(&origin, &mcastgrp); if (rt == NULL) { - MFC_UNLOCK(); + MRW_WUNLOCK(); return EADDRNOTAVAIL; } @@ -1169,7 +1236,7 @@ LIST_REMOVE(rt, mfc_hash); free(rt, M_MRTABLE); - MFC_UNLOCK(); + MRW_WUNLOCK(); return (0); } @@ -1210,70 +1277,83 @@ X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, struct ip_moptions *imo) { - struct mfc *rt; - int error; - vifi_t vifi; + struct mfc *rt; + int error; + vifi_t vifi; + struct mbuf *mb0; + struct rtdetq *rte; + u_long hash; + int hlen; - CTR3(KTR_IPMF, "ip_mforward: delete mfc orig 0x%08x group %lx ifp %p", - ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), ifp); + CTR3(KTR_IPMF, "ip_mforward: delete mfc orig 0x%08x group %lx ifp %p", + ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), ifp); + + if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || + ((u_char *)(ip + 1))[1] != IPOPT_LSRR) { + /* + * Packet arrived via a physical interface or + * an encapsulated tunnel or a register_vif. 
+ */ + } else { + /* + * Packet arrived through a source-route tunnel. + * Source-route tunnels are no longer supported. + */ + return (1); + } - if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || - ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { /* - * Packet arrived via a physical interface or - * an encapsulated tunnel or a register_vif. + * BEGIN: MCAST ROUTING HOT PATH */ - } else { + MRW_RLOCK(); + if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) { + if (ip->ip_ttl < MAXTTL) + ip->ip_ttl++; /* compensate for -1 in *_send routines */ + error = ip_mdq(m, ifp, NULL, vifi); + MRW_RUNLOCK(); + return error; + } + /* - * Packet arrived through a source-route tunnel. - * Source-route tunnels are no longer supported. + * Don't forward a packet with time-to-live of zero or one, + * or a packet destined to a local-only group. */ - return (1); - } + if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) { + MRW_RUNLOCK(); + return 0; + } - VIF_LOCK(); - MFC_LOCK(); - if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) { - if (ip->ip_ttl < MAXTTL) - ip->ip_ttl++; /* compensate for -1 in *_send routines */ - error = ip_mdq(m, ifp, NULL, vifi); - MFC_UNLOCK(); - VIF_UNLOCK(); - return error; - } + mfc_find_retry: + /* + * Determine forwarding vifs from the forwarding cache table + */ + MRTSTAT_INC(mrts_mfc_lookups); + rt = mfc_find(&ip->ip_src, &ip->ip_dst); + + /* Entry exists, so forward if necessary */ + if (rt != NULL) { + error = ip_mdq(m, ifp, rt, -1); + /* Generic unlock here as we might release R or W lock */ + MRW_UNLOCK(); + return error; + } - /* - * Don't forward a packet with time-to-live of zero or one, - * or a packet destined to a local-only group. 
- */ - if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) { - MFC_UNLOCK(); - VIF_UNLOCK(); - return 0; - } + /* + * END: MCAST ROUTING HOT PATH + */ + + /* Further processing must be done with WLOCK taken */ + if ((MRW_WOWNED() == 0) && (MRW_LOCK_TRY_UPGRADE() == 0)) { + MRW_RUNLOCK(); + MRW_WLOCK(); + goto mfc_find_retry; + } - /* - * Determine forwarding vifs from the forwarding cache table - */ - MRTSTAT_INC(mrts_mfc_lookups); - rt = mfc_find(&ip->ip_src, &ip->ip_dst); - - /* Entry exists, so forward if necessary */ - if (rt != NULL) { - error = ip_mdq(m, ifp, rt, -1); - MFC_UNLOCK(); - VIF_UNLOCK(); - return error; - } else { /* * If we don't have a route for packet's origin, * Make a copy of the packet & send message to routing daemon */ - - struct mbuf *mb0; - struct rtdetq *rte; - u_long hash; - int hlen = ip->ip_hl << 2; + hlen = ip->ip_hl << 2; MRTSTAT_INC(mrts_mfc_misses); MRTSTAT_INC(mrts_no_route); @@ -1285,138 +1365,123 @@ * just going to fail anyway. Make sure to pullup the header so * that other people can't step on it. */ - rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, + rte = (struct rtdetq*) malloc((sizeof *rte), M_MRTABLE, M_NOWAIT|M_ZERO); if (rte == NULL) { - MFC_UNLOCK(); - VIF_UNLOCK(); - return ENOBUFS; + MRW_WUNLOCK(); + return ENOBUFS; } mb0 = m_copypacket(m, M_NOWAIT); if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen)) - mb0 = m_pullup(mb0, hlen); + mb0 = m_pullup(mb0, hlen); if (mb0 == NULL) { - free(rte, M_MRTABLE); - MFC_UNLOCK(); - VIF_UNLOCK(); - return ENOBUFS; + free(rte, M_MRTABLE); + MRW_WUNLOCK(); + return ENOBUFS; } /* is there an upcall waiting for this flow ? 
*/ hash = MFCHASH(ip->ip_src, ip->ip_dst); - LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) { + LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) + { if (in_hosteq(ip->ip_src, rt->mfc_origin) && in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && - !TAILQ_EMPTY(&rt->mfc_stall)) + !buf_ring_empty(rt->mfc_stall_ring)) break; } if (rt == NULL) { - int i; - struct igmpmsg *im; - struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; - struct mbuf *mm; - - /* - * Locate the vifi for the incoming interface for this packet. - * If none found, drop packet. - */ - for (vifi = 0; vifi < V_numvifs && + int i; + struct igmpmsg *im; + struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; + struct mbuf *mm; + + /* + * Locate the vifi for the incoming interface for this packet. + * If none found, drop packet. + */ + for (vifi = 0; vifi < V_numvifs && V_viftable[vifi].v_ifp != ifp; vifi++) - ; - if (vifi >= V_numvifs) /* vif not found, drop packet */ - goto non_fatal; - - /* no upcall, so make a new entry */ - rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); - if (rt == NULL) - goto fail; - - /* Make a copy of the header to send to the user level process */ - mm = m_copym(mb0, 0, hlen, M_NOWAIT); - if (mm == NULL) - goto fail1; - - /* - * Send message to routing daemon to install - * a route into the kernel table - */ - - im = mtod(mm, struct igmpmsg *); - im->im_msgtype = IGMPMSG_NOCACHE; - im->im_mbz = 0; - im->im_vif = vifi; - - MRTSTAT_INC(mrts_upcalls); - - k_igmpsrc.sin_addr = ip->ip_src; - if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { - CTR0(KTR_IPMF, "ip_mforward: socket queue full"); - MRTSTAT_INC(mrts_upq_sockfull); -fail1: - free(rt, M_MRTABLE); -fail: - free(rte, M_MRTABLE); - m_freem(mb0); - MFC_UNLOCK(); - VIF_UNLOCK(); - return ENOBUFS; - } + ; + if (vifi >= V_numvifs) /* vif not found, drop packet */ + goto non_fatal; - /* insert new entry at head of hash chain */ - rt->mfc_origin.s_addr = ip->ip_src.s_addr; - rt->mfc_mcastgrp.s_addr = 
ip->ip_dst.s_addr; - rt->mfc_expire = UPCALL_EXPIRE; - V_nexpire[hash]++; - for (i = 0; i < V_numvifs; i++) { - rt->mfc_ttls[i] = 0; - rt->mfc_flags[i] = 0; - } - rt->mfc_parent = -1; + /* no upcall, so make a new entry */ + rt = mfc_alloc(); + if (rt == NULL) + goto fail; - /* clear the RP address */ - rt->mfc_rp.s_addr = INADDR_ANY; - rt->mfc_bw_meter_leq = NULL; - rt->mfc_bw_meter_geq = NULL; + /* Make a copy of the header to send to the user level process */ + mm = m_copym(mb0, 0, hlen, M_NOWAIT); + if (mm == NULL) + goto fail1; - /* initialize pkt counters per src-grp */ - rt->mfc_pkt_cnt = 0; - rt->mfc_byte_cnt = 0; - rt->mfc_wrong_if = 0; - timevalclear(&rt->mfc_last_assert); + /* + * Send message to routing daemon to install + * a route into the kernel table + */ - TAILQ_INIT(&rt->mfc_stall); - rt->mfc_nstall = 0; + im = mtod(mm, struct igmpmsg*); + im->im_msgtype = IGMPMSG_NOCACHE; + im->im_mbz = 0; + im->im_vif = vifi; - /* link into table */ - LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash); - TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link); - rt->mfc_nstall++; + MRTSTAT_INC(mrts_upcalls); + + k_igmpsrc.sin_addr = ip->ip_src; + if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { + CTR0(KTR_IPMF, "ip_mforward: socket queue full"); + MRTSTAT_INC(mrts_upq_sockfull); + fail1: free(rt, M_MRTABLE); + fail: free(rte, M_MRTABLE); + m_freem(mb0); + MRW_WUNLOCK(); + return ENOBUFS; + } + + /* insert new entry at head of hash chain */ + rt->mfc_origin.s_addr = ip->ip_src.s_addr; + rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; + rt->mfc_expire = UPCALL_EXPIRE; + V_nexpire[hash]++; + for (i = 0; i < V_numvifs; i++) { + rt->mfc_ttls[i] = 0; + rt->mfc_flags[i] = 0; + } + rt->mfc_parent = -1; + + /* clear the RP address */ + rt->mfc_rp.s_addr = INADDR_ANY; + rt->mfc_bw_meter_leq = NULL; + rt->mfc_bw_meter_geq = NULL; + + /* initialize pkt counters per src-grp */ + rt->mfc_pkt_cnt = 0; + rt->mfc_byte_cnt = 0; + rt->mfc_wrong_if = 0; + 
timevalclear(&rt->mfc_last_assert); + buf_ring_enqueue(rt->mfc_stall_ring, rte); } else { - /* determine if queue has overflowed */ - if (rt->mfc_nstall > MAX_UPQ) { - MRTSTAT_INC(mrts_upq_ovflw); -non_fatal: - free(rte, M_MRTABLE); - m_freem(mb0); - MFC_UNLOCK(); - VIF_UNLOCK(); - return (0); - } - TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link); - rt->mfc_nstall++; + /* determine if queue has overflowed */ + if (buf_ring_full(rt->mfc_stall_ring)) { + MRTSTAT_INC(mrts_upq_ovflw); + non_fatal: free(rte, M_MRTABLE); + m_freem(mb0); + MRW_WUNLOCK(); + return (0); + } + + buf_ring_enqueue(rt->mfc_stall_ring, rte); } - rte->m = mb0; - rte->ifp = ifp; + rte->m = mb0; + rte->ifp = ifp; - MFC_UNLOCK(); - VIF_UNLOCK(); + MRW_WUNLOCK(); return 0; - } } /* @@ -1429,7 +1494,7 @@ CURVNET_SET((struct vnet *) arg); - MFC_LOCK(); + /*This callout is always run with MRW_WLOCK taken. */ for (i = 0; i < mfchashsize; i++) { struct mfc *rt, *nrt; @@ -1438,7 +1503,7 @@ continue; LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) { - if (TAILQ_EMPTY(&rt->mfc_stall)) + if (buf_ring_empty(rt->mfc_stall_ring)) continue; if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) @@ -1453,8 +1518,6 @@ } } - MFC_UNLOCK(); - callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, curvnet); @@ -1471,7 +1534,7 @@ vifi_t vifi; int plen = ntohs(ip->ip_len); - VIF_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); /* * If xmt_vif is not -1, send on only the requested vif. @@ -1548,6 +1611,7 @@ } /* If I sourced this packet, it counts as output, else it was input. 
*/ + mtx_lock_spin(&V_viftable[vifi].v_spin); if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) { V_viftable[vifi].v_pkt_out++; V_viftable[vifi].v_bytes_out += plen; @@ -1555,6 +1619,8 @@ V_viftable[vifi].v_pkt_in++; V_viftable[vifi].v_bytes_in += plen; } + mtx_unlock_spin(&V_viftable[vifi].v_spin); + rt->mfc_pkt_cnt++; rt->mfc_byte_cnt += plen; @@ -1582,16 +1648,22 @@ struct timeval now; microtime(&now); - MFC_LOCK_ASSERT(); /* Process meters for Greater-or-EQual case */ for (x = rt->mfc_bw_meter_geq; x != NULL; x = x->bm_mfc_next) bw_meter_geq_receive_packet(x, plen, &now); /* Process meters for Lower-or-EQual case */ for (x = rt->mfc_bw_meter_leq; x != NULL; x = x->bm_mfc_next) { - /* Record that a packet is received */ + /* + * Record that a packet is received. + * Spin lock has to be taken as callout context + * (expire_bw_meter_leq) might modify these fields + * as well + */ + mtx_lock_spin(&x->bm_spin); x->bm_measured.b_packets++; x->bm_measured.b_bytes += plen; + mtx_unlock_spin(&x->bm_spin); } } @@ -1610,10 +1682,10 @@ if (vif < 0) return (ret); - VIF_LOCK(); + MRW_RLOCK(); if (vif < V_numvifs) ret = 1; - VIF_UNLOCK(); + MRW_RUNLOCK(); return (ret); } @@ -1630,10 +1702,10 @@ if (vifi < 0) return (addr); - VIF_LOCK(); + MRW_RLOCK(); if (vifi < V_numvifs) addr = V_viftable[vifi].v_lcl_addr.s_addr; - VIF_UNLOCK(); + MRW_RUNLOCK(); return (addr); } @@ -1644,7 +1716,7 @@ struct mbuf *mb_copy; int hlen = ip->ip_hl << 2; - VIF_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); /* * Make a new reference to the packet; make sure that @@ -1666,7 +1738,7 @@ struct ip_moptions imo; int error __unused; - VIF_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); imo.imo_multicast_ifp = vifp->v_ifp; imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; @@ -1749,7 +1821,7 @@ struct timeval now; /* * INFO: - * callout is always executed with MFC_LOCK taken + * callout is always executed with MRW_WLOCK taken */ CURVNET_SET((struct vnet *)x->arg); @@ -1768,12 +1840,19 @@ } /* Send all upcalls that 
are pending delivery */ - bw_upcalls_send(); + mtx_lock(&V_upcall_thread_mtx); + cv_signal(&V_upcall_thread_cv); + mtx_unlock(&V_upcall_thread_mtx); /* Reset counters */ x->bm_start_time = now; + /* Spin lock has to be taken as ip_forward context + * might modify these fields as well + */ + mtx_lock_spin(&x->bm_spin); x->bm_measured.b_bytes = 0; x->bm_measured.b_packets = 0; + mtx_unlock_spin(&x->bm_spin); callout_schedule(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time)); @@ -1814,10 +1893,10 @@ /* * Find if we have already same bw_meter entry */ - MFC_LOCK(); + MRW_WLOCK(); mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { - MFC_UNLOCK(); + MRW_WUNLOCK(); return EADDRNOTAVAIL; } @@ -1836,15 +1915,16 @@ == req->bu_threshold.b_bytes) && (x->bm_flags & BW_METER_USER_FLAGS) == flags) { - MFC_UNLOCK(); + MRW_WUNLOCK(); return 0; /* XXX Already installed */ } } /* Allocate the new bw_meter entry */ - x = (struct bw_meter*) malloc(sizeof(*x), M_BWMETER, M_NOWAIT); + x = (struct bw_meter*) malloc(sizeof(*x), M_BWMETER, + M_ZERO | M_NOWAIT); if (x == NULL) { - MFC_UNLOCK(); + MRW_WUNLOCK(); return ENOBUFS; } @@ -1860,10 +1940,12 @@ x->bm_time_next = NULL; x->bm_mfc = mfc; x->arg = curvnet; + sprintf(x->bm_spin_name, "BM spin %p", x); + mtx_init(&x->bm_spin, x->bm_spin_name, NULL, MTX_SPIN); /* For LEQ case create periodic callout */ if (req->bu_flags & BW_UPCALL_LEQ) { - callout_init_mtx(&x->bm_meter_callout, &mfc_mtx,0); + callout_init_rw(&x->bm_meter_callout, &mrouter_mtx, CALLOUT_SHAREDLOCK); callout_reset(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time), expire_bw_meter_leq, x); } @@ -1872,7 +1954,7 @@ x->bm_mfc_next = *bwm_ptr; *bwm_ptr = x; - MFC_UNLOCK(); + MRW_WUNLOCK(); return 0; } @@ -1883,9 +1965,11 @@ while (list != NULL) { struct bw_meter *x = list; - /* MFC_LOCK must be held here */ - if (x->bm_flags & BW_METER_LEQ) + /* MRW_WLOCK must be held here */ + if (x->bm_flags & BW_METER_LEQ) { callout_drain(&x->bm_meter_callout); + 
mtx_destroy(&x->bm_spin); + } list = list->bm_mfc_next; free(x, M_BWMETER); @@ -1904,12 +1988,12 @@ if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL)) return EOPNOTSUPP; - MFC_LOCK(); + MRW_WLOCK(); /* Find the corresponding MFC entry */ mfc = mfc_find(&req->bu_src, &req->bu_dst); if (mfc == NULL) { - MFC_UNLOCK(); + MRW_WUNLOCK(); return EADDRNOTAVAIL; } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { /* @@ -1926,7 +2010,7 @@ list = mfc->mfc_bw_meter_geq; mfc->mfc_bw_meter_geq = NULL; free_bw_list(list); - MFC_UNLOCK(); + MRW_WUNLOCK(); return 0; } else { /* Delete a single bw_meter entry */ struct bw_meter *prev; @@ -1959,12 +2043,12 @@ if (req->bu_flags & BW_UPCALL_LEQ) callout_stop(&x->bm_meter_callout); - MFC_UNLOCK(); + MRW_WUNLOCK(); /* Free the bw_meter entry */ free(x, M_BWMETER); return 0; } else { - MFC_UNLOCK(); + MRW_WUNLOCK(); return EINVAL; } } @@ -1979,13 +2063,15 @@ { struct timeval delta; - MFC_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); /* - * Processing for ">=" type of bw_meter entry + * Processing for ">=" type of bw_meter entry. + * bm_spin does not have to be hold here as in GEQ + * case this is the only context accessing bm_measured. 
*/ if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { /* Reset the bw_meter entry */ @@ -2023,7 +2109,7 @@ struct timeval delta; struct bw_upcall *u; - MFC_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); /* * Compute the measured time interval @@ -2031,16 +2117,14 @@ delta = *nowp; BW_TIMEVALDECR(&delta, &x->bm_start_time); - /* - * If there are too many pending upcalls, deliver them now - */ - if (V_bw_upcalls_n >= BW_UPCALLS_MAX) - bw_upcalls_send(); - /* * Set the bw_upcall entry */ - u = &V_bw_upcalls[V_bw_upcalls_n++]; + u = malloc(sizeof(struct bw_upcall), M_MRTABLE, M_NOWAIT | M_ZERO); + if (!u) { + log(LOG_WARNING, "bw_meter_prepare_upcall: cannot allocate entry\n"); + return; + } u->bu_src = x->bm_mfc->mfc_origin; u->bu_dst = x->bm_mfc->mfc_mcastgrp; u->bu_threshold.b_time = x->bm_threshold.b_time; @@ -2058,8 +2142,15 @@ u->bu_flags |= BW_UPCALL_GEQ; if (x->bm_flags & BW_METER_LEQ) u->bu_flags |= BW_UPCALL_LEQ; -} + if (buf_ring_enqueue(V_bw_upcalls_ring, u)) + log(LOG_WARNING, "bw_meter_prepare_upcall: cannot enqueue upcall\n"); + if (buf_ring_count(V_bw_upcalls_ring) > (BW_UPCALLS_MAX / 2)) { + mtx_lock(&V_upcall_thread_mtx); + cv_signal(&V_upcall_thread_cv); + mtx_unlock(&V_upcall_thread_mtx); + } +} /* * Send the pending bandwidth-related upcalls */ @@ -2067,7 +2158,8 @@ bw_upcalls_send(void) { struct mbuf *m; - int len = V_bw_upcalls_n * sizeof(V_bw_upcalls[0]); + int len = 0; + struct bw_upcall *bu; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; static struct igmpmsg igmpmsg = { 0, /* unused1 */ 0, /* unused2 */ @@ -2078,12 +2170,10 @@ { 0 }, /* im_src */ { 0 } }; /* im_dst */ - MFC_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); - if (V_bw_upcalls_n == 0) - return; /* No pending upcalls */ - - V_bw_upcalls_n = 0; + if (buf_ring_empty(V_bw_upcalls_ring)) + return; /* * Allocate a new mbuf, initialize it with the header and @@ -2096,7 +2186,12 @@ } m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); - m_copyback(m, sizeof(struct igmpmsg), len, 
(caddr_t)&V_bw_upcalls[0]); + len += sizeof(struct igmpmsg); + while ((bu = buf_ring_dequeue_mc(V_bw_upcalls_ring)) != NULL) { + m_copyback(m, len, sizeof(struct bw_upcall), (caddr_t)bu); + len += sizeof(struct bw_upcall); + free(bu, M_MRTABLE); + } /* * Send the upcalls @@ -2117,9 +2212,9 @@ { CURVNET_SET((struct vnet *) arg); - MFC_LOCK(); + /* This callout is run with MRW_WLOCK taken (callout_init_rw() without CALLOUT_SHAREDLOCK) */ + bw_upcalls_send(); - MFC_UNLOCK(); callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send, curvnet); @@ -2235,7 +2330,7 @@ struct igmpmsg *im; struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; - VIF_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); /* * Add a new mbuf with an upcall header @@ -2288,7 +2383,7 @@ int len = ntohs(ip->ip_len); vifi_t vifi = rt->mfc_parent; - VIF_LOCK_ASSERT(); + MRW_LOCK_ASSERT(); if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) { m_freem(mb_copy); @@ -2461,9 +2556,9 @@ u_int32_t *reghdr; struct ifnet *vifp; - VIF_LOCK(); + MRW_RLOCK(); if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) { - VIF_UNLOCK(); + MRW_RUNLOCK(); CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__, (int)V_reg_vif_num); m_freem(m); @@ -2471,7 +2566,7 @@ } /* XXX need refcnt?
*/ vifp = V_viftable[V_reg_vif_num].v_ifp; - VIF_UNLOCK(); + MRW_RUNLOCK(); /* * Validate length @@ -2597,7 +2692,7 @@ if (error) return (error); - MFC_LOCK(); + MRW_RLOCK(); for (i = 0; i < mfchashsize; i++) { LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) { error = SYSCTL_OUT(req, rt, sizeof(struct mfc)); @@ -2606,7 +2701,7 @@ } } out_locked: - MFC_UNLOCK(); + MRW_RUNLOCK(); return (error); } @@ -2628,9 +2723,9 @@ if (error) return (error); - VIF_LOCK(); + MRW_RLOCK(); error = SYSCTL_OUT(req, V_viftable, sizeof(*V_viftable) * MAXVIFS); - VIF_UNLOCK(); + MRW_RUNLOCK(); return (error); } @@ -2647,11 +2742,9 @@ V_viftable = mallocarray(MAXVIFS, sizeof(*V_viftable), M_MRTABLE, M_WAITOK|M_ZERO); - V_bw_upcalls = mallocarray(BW_UPCALLS_MAX, sizeof(*V_bw_upcalls), - M_MRTABLE, M_WAITOK|M_ZERO); - callout_init(&V_expire_upcalls_ch, 1); - callout_init(&V_bw_upcalls_ch, 1); + callout_init_rw(&V_expire_upcalls_ch, &mrouter_mtx, 0); + callout_init_rw(&V_bw_upcalls_ch, &mrouter_mtx, 0); } VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init, @@ -2661,7 +2754,6 @@ vnet_mroute_uninit(const void *unused __unused) { - free(V_bw_upcalls, M_MRTABLE); free(V_viftable, M_MRTABLE); free(V_nexpire, M_MRTABLE); V_nexpire = NULL; @@ -2676,20 +2768,17 @@ switch (type) { case MOD_LOAD: - MROUTER_LOCK_INIT(); + MRW_LOCK_INIT(); if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, if_detached_event, NULL, EVENTHANDLER_PRI_ANY); if (if_detach_event_tag == NULL) { printf("ip_mroute: unable to register " "ifnet_departure_event handler\n"); - MROUTER_LOCK_DESTROY(); + MRW_LOCK_DESTROY(); return (EINVAL); } - MFC_LOCK_INIT(); - VIF_LOCK_INIT(); - mfchashsize = MFCHASHSIZE; if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) && !powerof2(mfchashsize)) { @@ -2705,9 +2794,7 @@ pim_encap_cookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK); if (pim_encap_cookie == NULL) { printf("ip_mroute: unable to attach pim encap\n"); - VIF_LOCK_DESTROY(); - 
MFC_LOCK_DESTROY(); - MROUTER_LOCK_DESTROY(); + MRW_LOCK_DESTROY(); return (EINVAL); } @@ -2734,13 +2821,13 @@ * just loaded and then unloaded w/o starting up a user * process we still need to cleanup. */ - MROUTER_LOCK(); + MRW_WLOCK(); if (ip_mrouter_cnt != 0) { - MROUTER_UNLOCK(); + MRW_WUNLOCK(); return (EINVAL); } ip_mrouter_unloading = 1; - MROUTER_UNLOCK(); + MRW_WUNLOCK(); EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); @@ -2762,9 +2849,7 @@ mrt_ioctl = NULL; rsvp_input_p = NULL; - VIF_LOCK_DESTROY(); - MFC_LOCK_DESTROY(); - MROUTER_LOCK_DESTROY(); + MRW_LOCK_DESTROY(); break; default: