Index: sys/net/if.h
===================================================================
--- sys/net/if.h
+++ sys/net/if.h
@@ -160,9 +160,11 @@
 #define	IFF_PPROMISC	0x20000		/* (n) user-requested promisc mode */
 #define	IFF_MONITOR	0x40000		/* (n) user-requested monitor mode */
 #define	IFF_STATICARP	0x80000		/* (n) static ARP */
+#define	IFF_DETACHING	0x100000	/* (n) interface is detaching */
 #define	IFF_DYING	0x200000	/* (n) interface is winding down */
 #define	IFF_RENAMING	0x400000	/* (n) interface is being renamed */
 #define	IFF_NOGROUP	0x800000	/* (n) interface is not part of any groups */
+#define	IFF_VMOVING	0x1000000	/* (n) interface is changing vnets */
 
 /*
Index: sys/net/if.c
===================================================================
--- sys/net/if.c
+++ sys/net/if.c
@@ -262,6 +262,9 @@
 static void	if_input_default(struct ifnet *, struct mbuf *);
 static int	if_requestencap_default(struct ifnet *, struct if_encap_req *);
 static void	if_route(struct ifnet *, int flag, int fam);
+static bool	if_busy(struct ifnet *ifp, int setflags);
+static void	if_unbusy(struct ifnet *ifp);
+static void	if_busy_wait(struct ifnet *ifp);
 static int	if_setflag(struct ifnet *, int, int, int *, int);
 static int	if_transmit(struct ifnet *ifp, struct mbuf *m);
 static void	if_unroute(struct ifnet *, int flag, int fam);
@@ -274,6 +277,7 @@
 static int	if_detach_internal(struct ifnet *, int, struct if_clone **);
 static void	if_siocaddmulti(void *, int);
 #ifdef VIMAGE
+static int	if_vmove_busied(struct ifnet *, struct vnet *);
 static int	if_vmove(struct ifnet *, struct vnet *);
 #endif
@@ -471,10 +475,60 @@
 {
	struct ifnet *ifp, *nifp;
 
+	/*
+	 * We take a two-pass approach here: first, with the ifnet writelock
+	 * held, we go through and busy every interface that still needs to be
+	 * returned to its parent vnet.  Holding the writelock keeps the list
+	 * stable while we mark them.  Any interface already attempting to
+	 * if_detach() will have been properly marked IFF_DETACHING, and we'll
+	 * skip over it here.
+	 */
+	IFNET_WLOCK();
+	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
+		if (ifp->if_home_vnet == ifp->if_vnet)
+			continue;
+		IF_BUSY_LOCK(ifp);
+		if ((ifp->if_flags & IFF_DETACHING) == 0) {
+			/*
+			 * If the interface needs to be moved back to the
+			 * parent vnet and it's not already in the process of
+			 * detaching, go ahead and busy it and mark it as such.
+			 *
+			 * This guarantees that the interface can't be
+			 * destroyed or moved while we're in the process of
+			 * tearing down the vnet.  If it's already detaching,
+			 * then it's a non-issue: it has to be leaving the
+			 * vnet, and we can assume it will be gone in due time.
+			 */
+			refcount_acquire(&ifp->if_busy_count);
+			ifp->if_flags |= IFF_DETACHING | IFF_VMOVING;
+		}
+		IF_BUSY_UNLOCK(ifp);
+	}
+	IFNET_WUNLOCK();
 
	/* Return all inherited interfaces to their parent vnets. */
	CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) {
-		if (ifp->if_home_vnet != ifp->if_vnet)
-			if_vmove(ifp, ifp->if_home_vnet);
+		if (ifp->if_home_vnet == ifp->if_vnet)
+			continue;
+
+		/*
+		 * At this point, there are two possible scenarios for every
+		 * ifnet still here that needs to be re-homed: either we're
+		 * re-homing it because we marked it as such above, or someone
+		 * else is removing it.  If we're re-homing it, IFF_VMOVING
+		 * should still be set from up above.  If someone else is
+		 * trying to remove it, then they've unset IFF_VMOVING and are
+		 * waiting for us to drop the refcount.  In that scenario, we
+		 * don't need to bother with an if_vmove() because the waiter
+		 * is simply going to drop the interface as soon as we let go
+		 * of the reference.
+		 */
+		IF_BUSY_LOCK(ifp);
+		if ((ifp->if_flags & IFF_VMOVING) != 0) {
+			IF_BUSY_UNLOCK(ifp);
+			if_vmove_busied(ifp, ifp->if_home_vnet);
+		} else {
+			IF_BUSY_UNLOCK(ifp);
+		}
+		if_unbusy(ifp);
	}
 }
 
 /*
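For illustration, the two-pass shape used in vnet_if_return() above can be modeled in userland C: pass one marks and references every target while the list is held stable, pass two does the expensive work with the list lock dropped. This is a sketch only; the pthread mutex and plain counter are stand-ins for IFNET_WLOCK/IF_BUSY_LOCK and refcount(9), and struct item, return_all(), and move() are hypothetical names.

#include <pthread.h>
#include <stdbool.h>
#include <sys/queue.h>

struct item {
	STAILQ_ENTRY(item) link;
	bool		marked;	/* models IFF_VMOVING */
	unsigned	busy;	/* models if_busy_count; atomic in the
				   kernel via refcount(9) */
};

static STAILQ_HEAD(, item) items = STAILQ_HEAD_INITIALIZER(items);
static pthread_mutex_t items_lock = PTHREAD_MUTEX_INITIALIZER;

/* The expensive operation (if_vmove_busied() in the patch); elided here. */
static void
move(struct item *it)
{
	(void)it;
}

static void
return_all(void)
{
	struct item *it, *tmp;

	/* Pass 1: with the list held stable, mark and reference each target. */
	pthread_mutex_lock(&items_lock);
	STAILQ_FOREACH(it, &items, link) {
		it->marked = true;
		it->busy++;
	}
	pthread_mutex_unlock(&items_lock);

	/*
	 * Pass 2: do the expensive work without the list lock; the reference
	 * taken above keeps each item from being torn down underneath us.
	 */
	STAILQ_FOREACH_SAFE(it, &items, link, tmp) {
		if (it->marked)
			move(it);
		it->busy--;	/* models if_unbusy() */
	}
}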
@@ -561,6 +615,7 @@
	}
 
	IF_ADDR_LOCK_INIT(ifp);
+	IF_BUSY_LOCK_INIT(ifp);
	TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
	TASK_INIT(&ifp->if_addmultitask, 0, if_siocaddmulti, ifp);
	ifp->if_afdata_initialized = 0;
@@ -607,7 +662,7 @@
 if_free_internal(struct ifnet *ifp)
 {
 
-	KASSERT((ifp->if_flags & IFF_DYING),
+	KASSERT((ifp->if_flags & IFF_DYING) && (ifp->if_flags & IFF_DETACHING),
	    ("if_free_internal: interface not dying"));
 
	if (if_com_free[ifp->if_alloctype] != NULL)
@@ -619,6 +674,7 @@
 #endif /* MAC */
	IF_AFDATA_DESTROY(ifp);
	IF_ADDR_LOCK_DESTROY(ifp);
+	IF_BUSY_LOCK_DESTROY(ifp);
	ifq_delete(&ifp->if_snd);
 
	for (int i = 0; i < IFCOUNTERS; i++)
@@ -1058,6 +1114,22 @@
 if_detach(struct ifnet *ifp)
 {
 
+	/*
+	 * Grab the ifnet writelock as well as the interface busy lock here to
+	 * serialize parts of vnet shutdown with interface detach.  If we win
+	 * here over vnet_sysuninit_shutdown, then we'll proceed to detach and
+	 * vnet_shutdown will be properly set before we do something we regret.
+	 * If we lose here, the interface will be busied as it gets removed
+	 * from the vnet, and then we'll swoop in and finish the process.
+	 */
+	IFNET_WLOCK();
+	IF_BUSY_LOCK(ifp);
+	ifp->if_flags |= IFF_DETACHING;
+	ifp->if_flags &= ~IFF_VMOVING;
+	IF_BUSY_UNLOCK(ifp);
+	IFNET_WUNLOCK();
+
+	if_busy_wait(ifp);
	CURVNET_SET_QUIET(ifp->if_vnet);
	if_detach_internal(ifp, 0, NULL);
	CURVNET_RESTORE();
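The ordering if_detach() now relies on -- close the gate first, then drain all in-flight management paths -- can be modeled with a condition variable standing in for refcount_wait(9). A minimal sketch; obj, obj_detach(), and obj_unbusy() are hypothetical names.

#include <pthread.h>
#include <stdbool.h>

struct obj {
	pthread_mutex_t	lock;		/* models if_busy_lock */
	pthread_cond_t	drained;
	unsigned	busy;		/* models if_busy_count */
	bool		detaching;	/* models IFF_DETACHING */
};

static void
obj_unbusy(struct obj *o)
{

	pthread_mutex_lock(&o->lock);
	if (--o->busy == 0)
		pthread_cond_broadcast(&o->drained);
	pthread_mutex_unlock(&o->lock);
}

static void
obj_detach(struct obj *o)
{

	pthread_mutex_lock(&o->lock);
	o->detaching = true;		/* no new busy section may begin */
	while (o->busy > 0)		/* models if_busy_wait() */
		pthread_cond_wait(&o->drained, &o->lock);
	pthread_mutex_unlock(&o->lock);
	/* All management paths have exited; teardown is now safe. */
}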
@@ -1082,6 +1154,16 @@
	struct ifnet *iter;
	int found = 0;
 
+	KASSERT((ifp->if_flags & IFF_DETACHING) != 0,
+	    ("%s: detach flag not set", __func__));
+	if (!vmove) {
+		KASSERT(ifp->if_busy_count == 0,
+		    ("%s: busy threads still present", __func__));
+		KASSERT((ifp->if_flags & IFF_VMOVING) == 0,
+		    ("%s: detach while ifnet changing vnets", __func__));
+		KASSERT((ifp->if_flags & IFF_DYING) == 0,
+		    ("%s: out of order detach/free", __func__));
+	}
	IFNET_WLOCK();
	CK_STAILQ_FOREACH(iter, &V_ifnet, if_link)
		if (iter == ifp) {
@@ -1250,15 +1332,8 @@
 }
 
 #ifdef VIMAGE
-/*
- * if_vmove() performs a limited version of if_detach() in current
- * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
- * An attempt is made to shrink if_index in current vnet, find an
- * unused if_index in target vnet and calls if_grow() if necessary,
- * and finally find an unused if_xname for the target vnet.
- */
 static int
-if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
+if_vmove_busied(struct ifnet *ifp, struct vnet *new_vnet)
 {
	struct if_clone *ifc;
 #ifdef DEV_BPF
@@ -1268,7 +1343,7 @@
	int rc;
 
 #ifdef DEV_BPF
-	/* 
+	/*
	 * if_detach_internal() will call the eventhandler to notify
	 * interface departure.  That will detach if_bpf.  We need to
	 * safe the dlt and hdrlen so we can re-attach it later.
@@ -1283,7 +1358,19 @@
	 */
	rc = if_detach_internal(ifp, 1, &ifc);
	if (rc != 0)
-		return (rc);
+		goto out;
+
+	/*
+	 * We could defer clearing IFF_DETACHING until the end, but
+	 * IFF_VMOVING is sufficient for keeping the interface 'busy' until
+	 * death or ioctls are invoked, and this keeps if_flags looking
+	 * somewhat correct for what's happening right now: we've exited the
+	 * detach phase of if_vmove, and we're now attaching to the new vnet.
+	 */
+	IF_BUSY_LOCK(ifp);
+	if ((ifp->if_flags & IFF_VMOVING) != 0)
+		ifp->if_flags &= ~IFF_DETACHING;
+	IF_BUSY_UNLOCK(ifp);
 
	/*
	 * Unlink the ifnet from ifindex_table[] in current vnet, and shrink
@@ -1327,7 +1414,53 @@
 #endif
	CURVNET_RESTORE();
 
-	return (0);
+out:
+	/*
+	 * Either we failed or the interface is no longer visible in the
+	 * current vnet, so we can go ahead and drop IFF_DETACHING and
+	 * IFF_VMOVING.  If IFF_VMOVING is no longer set, then if_detach()
+	 * came along and solidified the detach.  We may have already cleared
+	 * IFF_DETACHING up above, but we're blocked from doing further vnet
+	 * moves and if_detach() would have cleared IFF_VMOVING, so we can go
+	 * ahead and unconditionally clear it -- we'll need to clear it anyway
+	 * if we're in the error path.
+	 */
+	IF_BUSY_LOCK(ifp);
+	if ((ifp->if_flags & IFF_VMOVING) != 0)
+		ifp->if_flags &= ~(IFF_DETACHING | IFF_VMOVING);
+	IF_BUSY_UNLOCK(ifp);
+	return (rc);
+}
+
+/*
+ * if_vmove() performs a limited version of if_detach() in current
+ * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
+ * An attempt is made to shrink if_index in current vnet, find an
+ * unused if_index in target vnet and calls if_grow() if necessary,
+ * and finally find an unused if_xname for the target vnet.
+ *
+ * The caller may indicate that it has already busied the interface prior
+ * to entry.  If it has done so, assume it's OK to proceed with the move.
+ * The caller will handle the unbusy.
+ */
+static int
+if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
+{
+	int error;
+
+	/*
+	 * Busy the ifnet, taking into account whether this is a vnet shutdown
+	 * or not.  We don't let detach block us if we're in the middle of
+	 * shutdown, so that we don't end up with a vnet that still has
+	 * interfaces in it; if_detach will specifically handle the case where
+	 * the ifnet in question is getting re-homed as it's shutting down.
+	 */
+	if (!if_busy(ifp, IFF_DETACHING | IFF_VMOVING))
+		return (EBUSY);
+
+	error = if_vmove_busied(ifp, new_vnet);
+	if_unbusy(ifp);
+	return (error);
 }
 
 /*
@@ -2414,6 +2547,43 @@
	return (ifp);
 }
 
+#define	IFF_BUSY_FLAGS	(IFF_DETACHING | IFF_DYING | IFF_VMOVING)
+
+/*
+ * Busy the interface, if it's not already detaching/dying.  The caller may
+ * request, via setflags, flags to set on the ifp while we're still holding
+ * the busy lock if we've succeeded.  The caller is responsible for clearing
+ * any temporary flags that were set.
+ */
+static bool
+if_busy(struct ifnet *ifp, int setflags)
+{
+
+	IF_BUSY_LOCK(ifp);
+	if ((ifp->if_flags & IFF_BUSY_FLAGS) != 0) {
+		IF_BUSY_UNLOCK(ifp);
+		return (false);
+	}
+	refcount_acquire(&ifp->if_busy_count);
+	ifp->if_flags |= setflags;
+	IF_BUSY_UNLOCK(ifp);
+	return (true);
+}
+
+static void
+if_unbusy(struct ifnet *ifp)
+{
+
+	refcount_release(&ifp->if_busy_count);
+}
+
+static void
+if_busy_wait(struct ifnet *ifp)
+{
+
+	refcount_wait(&ifp->if_busy_count, "ifbusy", 0);
+}
+
 static void *
 ifr_buffer_get_buffer(void *data)
 {
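One design point worth calling out: if_busy() must take the busy lock so the flag check and the acquire are atomic with respect to detach, while if_unbusy() releases without the lock because refcount(9) is atomic and a release only moves the count toward zero (the drain side re-checks via refcount_wait(9)). A hedged C11 sketch of that asymmetry, with stdatomic standing in for refcount(9) and hypothetical names throughout:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	pthread_mutex_t	lock;	/* guards the flags + acquire pairing */
	atomic_uint	busy;
	unsigned	flags;	/* detaching/dying/vmoving bits */
};

static bool
obj_busy(struct obj *o)
{
	bool ok;

	pthread_mutex_lock(&o->lock);
	ok = (o->flags == 0);
	if (ok)
		atomic_fetch_add(&o->busy, 1);	/* refcount_acquire() */
	pthread_mutex_unlock(&o->lock);
	return (ok);
}

static void
obj_unbusy(struct obj *o)
{

	/* No lock needed: models refcount_release() in if_unbusy(). */
	atomic_fetch_sub(&o->busy, 1);
}

If the release also took the lock, nothing would break, but every data-path exit would contend with management operations for no benefit.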
@@ -2486,8 +2656,8 @@
 /*
  * Hardware specific interface ioctls.
  */
-int
-ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
+static int
+ifhwioctl_internal(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
 {
	struct ifreq *ifr;
	int error = 0, do_ifup = 0;
@@ -2499,6 +2669,14 @@
	struct ifaddr *ifa;
	struct sockaddr_dl *sdl;
 
+	/*
+	 * We can't assert that we're not detaching here, because the procedure
+	 * is to mark us as detaching, then wait for the different "management"
+	 * paths to exit before proceeding with the detach.  We should have
+	 * properly increased the refcount, at least.
+	 */
+	KASSERT(ifp->if_busy_count > 0,
+	    ("%s: mgmt refcount not increased", __func__));
	ifr = (struct ifreq *)data;
	switch (cmd) {
	case SIOCGIFINDEX:
@@ -2902,6 +3080,23 @@
	return (error);
 }
 
+/*
+ * Hardware specific interface ioctls.
+ */
+int
+ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
+{
+	int error;
+
+	if (!if_busy(ifp, 0))
+		return (ENXIO);
+
+	error = ifhwioctl_internal(cmd, ifp, data, td);
+
+	if_unbusy(ifp);
+	return (error);
+}
+
 #ifdef COMPAT_FREEBSD32
 struct ifconf32 {
	int32_t	ifc_len;
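The exported ifhwioctl() is now just a gate around the internal handler: once an interface is marked detaching or dying, outside callers get ENXIO instead of racing the teardown. A minimal sketch of the same wrapper shape, assuming hypothetical stand-ins for the real functions:

#include <errno.h>
#include <stdbool.h>

struct obj;

bool obj_busy(struct obj *);
void obj_unbusy(struct obj *);
int handler_internal(struct obj *, unsigned long, void *);

static int
handler(struct obj *o, unsigned long cmd, void *data)
{
	int error;

	if (!obj_busy(o))
		return (ENXIO);		/* already detaching or dying */
	error = handler_internal(o, cmd, data);
	obj_unbusy(o);
	return (error);
}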
@@ -2972,7 +3167,7 @@
	switch (cmd) {
	case SIOCGIFCONF:
		error = ifconf(cmd, data);
-		goto out_noref;
+		goto out_notbusy;
 
 #ifdef COMPAT_FREEBSD32
	case SIOCGIFCONF32:
@@ -2987,7 +3182,7 @@
		error = ifconf(SIOCGIFCONF, (void *)&ifc);
		if (error == 0)
			ifc32->ifc_len = ifc.ifc_len;
-		goto out_noref;
+		goto out_notbusy;
	}
 #endif
	}
@@ -3012,7 +3207,7 @@
		if (error == 0)
			error = if_vmove_reclaim(td, ifr->ifr_name,
			    ifr->ifr_jid);
-		goto out_noref;
+		goto out_notbusy;
 #endif
	case SIOCIFCREATE:
	case SIOCIFCREATE2:
@@ -3021,20 +3216,20 @@
		error = priv_check(td, PRIV_NET_IFCREATE);
		if (error == 0)
			error = if_clone_create(ifr->ifr_name,
			    sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ?
			    ifr_data_get_ptr(ifr) : NULL);
-		goto out_noref;
+		goto out_notbusy;
 
	case SIOCIFDESTROY:
		error = priv_check(td, PRIV_NET_IFDESTROY);
		if (error == 0)
			error = if_clone_destroy(ifr->ifr_name);
-		goto out_noref;
+		goto out_notbusy;
 
	case SIOCIFGCLONERS:
		error = if_clone_list((struct if_clonereq *)data);
-		goto out_noref;
+		goto out_notbusy;
 
	case CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB):
		error = if_getgroupmembers((struct ifgroupreq *)data);
-		goto out_noref;
+		goto out_notbusy;
 
 #if defined(INET) || defined(INET6)
	case SIOCSVH:
@@ -3043,24 +3238,34 @@
			error = EPROTONOSUPPORT;
		else
			error = (*carp_ioctl_p)(ifr, cmd, td);
-		goto out_noref;
+		goto out_notbusy;
 #endif
	}
 
-	ifp = ifunit_ref(ifr->ifr_name);
+	/*
+	 * Don't increment the refcount; we'll do administrative refcounting,
+	 * as we want to ensure that we're no longer busy before
+	 * if_detach/if_free may proceed.
+	 */
+	ifp = ifunit(ifr->ifr_name);
	if (ifp == NULL) {
		error = ENXIO;
-		goto out_noref;
+		goto out_notbusy;
+	}
+
+	if (!if_busy(ifp, 0)) {
+		error = ENXIO;
+		goto out_notbusy;
	}
 
-	error = ifhwioctl(cmd, ifp, data, td);
+	error = ifhwioctl_internal(cmd, ifp, data, td);
	if (error != ENOIOCTL)
-		goto out_ref;
+		goto out_busy;
 
	oif_flags = ifp->if_flags;
	if (so->so_proto == NULL) {
		error = EOPNOTSUPP;
-		goto out_ref;
+		goto out_busy;
	}
 
	/*
@@ -3086,9 +3291,9 @@
 #endif
	}
 
-out_ref:
-	if_rele(ifp);
-out_noref:
+out_busy:
+	if_unbusy(ifp);
+out_notbusy:
 #ifdef COMPAT_FREEBSD32
	if (ifmrp != NULL) {
		KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA),
Index: sys/net/if_tuntap.c
===================================================================
--- sys/net/if_tuntap.c
+++ sys/net/if_tuntap.c
@@ -190,9 +190,6 @@
 static TAILQ_HEAD(,tuntap_softc)	tunhead = TAILQ_HEAD_INITIALIZER(tunhead);
 SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, "");
 
-static struct sx tun_ioctl_sx;
-SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl");
-
 SYSCTL_DECL(_net_link);
 /* tun */
 static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0,
@@ -641,9 +638,6 @@
		bpfdetach(TUN2IFP(tp));
		if_detach(TUN2IFP(tp));
	}
-	sx_xlock(&tun_ioctl_sx);
-	TUN2IFP(tp)->if_softc = NULL;
-	sx_xunlock(&tun_ioctl_sx);
	free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit);
	if_free(TUN2IFP(tp));
	mtx_destroy(&tp->tun_mtx);
@@ -1011,14 +1005,8 @@
	 * from dying until we've created the alias (that will then be
	 * subsequently destroyed).
	 */
-	sx_xlock(&tun_ioctl_sx);
	tp = ifp->if_softc;
-	if (tp == NULL) {
-		sx_xunlock(&tun_ioctl_sx);
-		return;
-	}
	error = tun_busy(tp);
-	sx_xunlock(&tun_ioctl_sx);
	if (error != 0)
		return;
	if (tp->tun_alias != NULL) {
@@ -1324,12 +1312,7 @@
	bool	l2tun;
 
	ifmr = NULL;
-	sx_xlock(&tun_ioctl_sx);
	tp = ifp->if_softc;
-	if (tp == NULL) {
-		error = ENXIO;
-		goto bad;
-	}
	l2tun = (tp->tun_flags & TUN_L2) != 0;
	switch(cmd) {
	case SIOCGIFSTATUS:
@@ -1391,8 +1374,6 @@
			error = EINVAL;
		}
	}
-bad:
-	sx_xunlock(&tun_ioctl_sx);
	return (error);
 }
 
Index: sys/net/if_var.h
===================================================================
--- sys/net/if_var.h
+++ sys/net/if_var.h
@@ -423,6 +423,13 @@
	struct debugnet_methods *if_debugnet_methods;
	struct epoch_context	if_epoch_ctx;
 
+	/*
+	 * Bits for serializing management operations (ioctl/destroy/vmove)
+	 * and ensuring order.
+	 */
+	struct mtx	if_busy_lock;
+	u_int		if_busy_count;
+
	/*
	 * Spare fields to be added before branching a stable branch, so
	 * that structure can be enhanced without changing the kernel
@@ -450,6 +457,12 @@
 #define	NET_EPOCH_WAIT()	epoch_wait_preempt(net_epoch_preempt)
 #define	NET_EPOCH_ASSERT()	MPASS(in_epoch(net_epoch_preempt))
 
+#define	IF_BUSY_LOCK_INIT(if)	mtx_init(&(if)->if_busy_lock, "if_busy_lock", NULL, MTX_DEF)
+#define	IF_BUSY_LOCK_DESTROY(if)	mtx_destroy(&(if)->if_busy_lock)
+#define	IF_BUSY_LOCK(if)	mtx_lock(&(if)->if_busy_lock)
+#define	IF_BUSY_UNLOCK(if)	mtx_unlock(&(if)->if_busy_lock)
+#define	IF_BUSY_LOCK_ASSERT(if)	mtx_assert(&(if)->if_busy_lock, MA_OWNED)
+
 #ifdef _KERNEL
 /* interface link layer address change event */
 typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *);
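Taken together, the lifecycle this patch gives each ifnet can be exercised end to end in a small userland model: management paths enter through a try-busy gate, and the detach side closes the gate and drains before freeing. A sketch under stated assumptions -- pthreads stand in for mtx(9), refcount(9), and refcount_wait(9), and every name is hypothetical. Build: cc -pthread busy_model.c

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static unsigned busy;
static bool detaching;

static bool
try_busy(void)
{
	bool ok;

	pthread_mutex_lock(&lock);
	ok = !detaching;
	if (ok)
		busy++;
	pthread_mutex_unlock(&lock);
	return (ok);
}

static void
unbusy(void)
{

	pthread_mutex_lock(&lock);
	if (--busy == 0)
		pthread_cond_broadcast(&drained);
	pthread_mutex_unlock(&lock);
}

static void *
mgmt_path(void *arg)
{

	(void)arg;
	while (try_busy()) {	/* models a gated ioctl */
		usleep(1000);	/* "management work" */
		unbusy();
	}
	return (NULL);		/* ENXIO-equivalent: the gate is closed */
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, mgmt_path, NULL);
	usleep(10000);

	/* The detach side: close the gate, then drain. */
	pthread_mutex_lock(&lock);
	detaching = true;
	while (busy > 0)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	printf("drained; safe to free\n");
	return (0);
}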