Index: sys/net/if.h =================================================================== --- sys/net/if.h +++ sys/net/if.h @@ -160,6 +160,7 @@ #define IFF_PPROMISC 0x20000 /* (n) user-requested promisc mode */ #define IFF_MONITOR 0x40000 /* (n) user-requested monitor mode */ #define IFF_STATICARP 0x80000 /* (n) static ARP */ +#define IFF_DETACHING 0x100000 /* (n) interface is detaching */ #define IFF_DYING 0x200000 /* (n) interface is winding down */ #define IFF_RENAMING 0x400000 /* (n) interface is being renamed */ #define IFF_NOGROUP 0x800000 /* (n) interface is not part of any groups */ Index: sys/net/if.c =================================================================== --- sys/net/if.c +++ sys/net/if.c @@ -274,7 +274,7 @@ static int if_detach_internal(struct ifnet *, int, struct if_clone **); static void if_siocaddmulti(void *, int); #ifdef VIMAGE -static void if_vmove(struct ifnet *, struct vnet *); +static void if_vmove(struct ifnet *, struct vnet *, bool shutdown); #endif #ifdef INET6 @@ -474,7 +474,7 @@ /* Return all inherited interfaces to their parent vnets. */ CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { if (ifp->if_home_vnet != ifp->if_vnet) - if_vmove(ifp, ifp->if_home_vnet); + if_vmove(ifp, ifp->if_home_vnet, true); } } VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, @@ -557,6 +557,8 @@ } IF_ADDR_LOCK_INIT(ifp); + IF_MGMT_LOCK_INIT(ifp); + IF_MGMT_CV_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); TASK_INIT(&ifp->if_addmultitask, 0, if_siocaddmulti, ifp); ifp->if_afdata_initialized = 0; @@ -603,6 +605,10 @@ if_free_internal(struct ifnet *ifp) { + /* + * XXX Should we also assert that it's been properly detached, a la + * IFF_DETACHED? It seems likely. + */ KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); @@ -615,6 +621,8 @@ #endif /* MAC */ IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); + IF_MGMT_LOCK_DESTROY(ifp); + IF_MGMT_CV_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) @@ -1054,9 +1062,32 @@ if_detach(struct ifnet *ifp) { +#ifdef VIMAGE +#define ifp_waits_for_shutdown(ifp) \ + (((ifp)->if_vnet != (ifp)->if_home_vnet) && (ifp)->if_vnet->vnet_shutdown) +#else +#define ifp_waits_for_shutdown(ifp) false +#endif + + /* + * We've marked ourselves as detaching, now we wait for the various + * management functions (e.g. ioctl handling, vmove) to complete before + * we proceed. If the vnet the interface is currently in is in the + * shutdown process and it's not already in its home vnet, we wait for + * the interface's vnet to stabilize before we proceed with the final + * detach. If the ifnet's home vnet is the one that is getting + * shutdown, it's not going to change vnets and the expectation is + * likely that we're about to detach+free it. + */ + IF_MGMT_LOCK(ifp); + ifp->if_flags |= IFF_DETACHING; + while (ifp->if_mgmt_refcount > 0 || ifp_waits_for_shutdown(ifp)) + IF_MGMT_CV_WAIT(ifp); + IF_MGMT_UNLOCK(ifp); CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0, NULL); CURVNET_RESTORE(); +#undef ifp_waits_for_shutdown } /* @@ -1077,11 +1108,14 @@ struct domain *dp; struct ifnet *iter; int found = 0; -#ifdef VIMAGE - bool shutdown; - shutdown = ifp->if_vnet->vnet_shutdown; -#endif + if (!vmove) { + KASSERT(ifp->if_mgmt_refcount == 0 && + (ifp->if_flags & IFF_DETACHING) != 0, + ("%s: improper detach, dangling mgmt refs", __func__)); + KASSERT((ifp->if_flags & IFF_DYING) == 0, + ("%s: out of order detach/free", __func__)); + } IFNET_WLOCK(); CK_STAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { @@ -1149,9 +1183,11 @@ #ifdef VIMAGE /* * On VNET shutdown abort here as the stack teardown will do all - * the work top-down for us. + * the work top-down for us. XXX This is still racy; nothing is + * stopping a vnet from being shutdown while we're in the middle of + * detaching but somewhere after this point. */ - if (shutdown) { + if (ifp->if_vnet->vnet_shutdown) { /* Give interface users the chance to clean up. */ EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); @@ -1258,7 +1294,7 @@ * and finally find an unused if_xname for the target vnet. */ static void -if_vmove(struct ifnet *ifp, struct vnet *new_vnet) +if_vmove(struct ifnet *ifp, struct vnet *new_vnet, bool shutdown) { struct if_clone *ifc; #ifdef DEV_BPF @@ -1267,6 +1303,19 @@ void *old; int rc; + /* + * We can't hold the lock across attach/detach; bump the refcount and + * drop it. if_detach will stall until we broadcast at the end. We + * don't let detach block us if we're in the middle of shutdown so that + * we don't end up with a vnet that still has interfaces in it. + */ + IF_MGMT_LOCK(ifp); + if (!shutdown && (ifp->if_flags & IFF_DETACHING) != 0) { + IF_MGMT_UNLOCK(ifp); + return; + } + refcount_acquire(&ifp->if_mgmt_refcount); + IF_MGMT_UNLOCK(ifp); #ifdef DEV_BPF /* * if_detach_internal() will call the eventhandler to notify @@ -1283,7 +1332,7 @@ */ rc = if_detach_internal(ifp, 1, &ifc); if (rc != 0) - return; + goto out; /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink @@ -1327,6 +1376,11 @@ #endif CURVNET_RESTORE(); +out: + IF_MGMT_LOCK(ifp); + if (refcount_release(&ifp->if_mgmt_refcount)) + IF_MGMT_CV_BROADCAST(ifp); + IF_MGMT_UNLOCK(ifp); } /* @@ -1372,7 +1426,7 @@ CURVNET_RESTORE(); /* Move the interface into the child jail/vnet. */ - if_vmove(ifp, pr->pr_vnet); + if_vmove(ifp, pr->pr_vnet, false); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); @@ -1422,7 +1476,7 @@ } /* Get interface back from child jail/vnet. */ - if_vmove(ifp, vnet_dst); + if_vmove(ifp, vnet_dst, false); CURVNET_RESTORE(); /* Report the new if_xname back to the userland. */ @@ -2481,8 +2535,8 @@ /* * Hardware specific interface ioctls. */ -int -ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) +static int +ifhwioctl_internal(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0, do_ifup = 0; @@ -2494,6 +2548,14 @@ struct ifaddr *ifa; struct sockaddr_dl *sdl; + /* + * We can't assert that we're not detaching because the procedure is to + * mark us as detaching then wait for the different "management" paths + * to exit before proceeding with the detach. We should have properly + * increased the refcount, at least. + */ + KASSERT(ifp->if_mgmt_refcount > 0, + ("%s: mgmt refcount not increased", __func__)); ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: @@ -2897,6 +2959,35 @@ return (error); } +/* + * Hardware specific interface ioctls. + */ +int +ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) +{ + int error; + + /* + * Don't attempt to do anything on a detaching interface. + */ + IF_MGMT_LOCK(ifp); + if ((ifp->if_flags & IFF_DETACHING) != 0) { + IF_MGMT_UNLOCK(ifp); + return (ENXIO); + } + refcount_acquire(&ifp->if_mgmt_refcount); + IF_MGMT_UNLOCK(ifp); + + error = ifhwioctl_internal(cmd, ifp, data, td); + + IF_MGMT_LOCK(ifp); + if (refcount_release(&ifp->if_mgmt_refcount)) + IF_MGMT_CV_BROADCAST(ifp); + IF_MGMT_UNLOCK(ifp); + return (error); +} + + #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; @@ -3042,13 +3133,30 @@ #endif } - ifp = ifunit_ref(ifr->ifr_name); + /* + * Don't increment the refcount; we'll do administrative refcounting as + * we want to ensure that we're out of these management paths before + * if_detach/if_free may proceed. + */ + ifp = ifunit(ifr->ifr_name); if (ifp == NULL) { error = ENXIO; goto out_noref; } - error = ifhwioctl(cmd, ifp, data, td); + /* + * Don't attempt to do anything on a detaching or dying interface. + */ + IF_MGMT_LOCK(ifp); + if ((ifp->if_flags & (IFF_DETACHING | IFF_DYING)) != 0) { + IF_MGMT_UNLOCK(ifp); + error = ENXIO; + goto out_noref; + } + refcount_acquire(&ifp->if_mgmt_refcount); + IF_MGMT_UNLOCK(ifp); + + error = ifhwioctl_internal(cmd, ifp, data, td); if (error != ENOIOCTL) goto out_ref; @@ -3082,7 +3190,10 @@ } out_ref: - if_rele(ifp); + IF_MGMT_LOCK(ifp); + if (refcount_release(&ifp->if_mgmt_refcount)) + IF_MGMT_CV_BROADCAST(ifp); + IF_MGMT_UNLOCK(ifp); out_noref: #ifdef COMPAT_FREEBSD32 if (ifmrp != NULL) { Index: sys/net/if_tuntap.c =================================================================== --- sys/net/if_tuntap.c +++ sys/net/if_tuntap.c @@ -190,9 +190,6 @@ static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); -static struct sx tun_ioctl_sx; -SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); - SYSCTL_DECL(_net_link); /* tun */ static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, @@ -641,9 +638,6 @@ bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); } - sx_xlock(&tun_ioctl_sx); - TUN2IFP(tp)->if_softc = NULL; - sx_xunlock(&tun_ioctl_sx); free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); if_free(TUN2IFP(tp)); mtx_destroy(&tp->tun_mtx); @@ -1011,14 +1005,8 @@ * from dying until we've created the alias (that will then be * subsequently destroyed). */ - sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; - if (tp == NULL) { - sx_xunlock(&tun_ioctl_sx); - return; - } error = tun_busy(tp); - sx_xunlock(&tun_ioctl_sx); if (error != 0) return; if (tp->tun_alias != NULL) { @@ -1324,12 +1312,7 @@ bool l2tun; ifmr = NULL; - sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; - if (tp == NULL) { - error = ENXIO; - goto bad; - } l2tun = (tp->tun_flags & TUN_L2) != 0; switch(cmd) { case SIOCGIFSTATUS: @@ -1391,8 +1374,6 @@ error = EINVAL; } } -bad: - sx_xunlock(&tun_ioctl_sx); return (error); } Index: sys/net/if_var.h =================================================================== --- sys/net/if_var.h +++ sys/net/if_var.h @@ -423,6 +423,14 @@ struct debugnet_methods *if_debugnet_methods; struct epoch_context if_epoch_ctx; + /* + * Bits for serializing management operations (ioctl/destroy/vmove) + * and ensuring order. + */ + struct mtx if_mgmt_lock; + struct cv if_mgmt_cv; + u_int if_mgmt_refcount; /* count of threads in mgmt functions */ + /* * Spare fields to be added before branching a stable branch, so * that structure can be enhanced without changing the kernel @@ -450,6 +458,17 @@ #define NET_EPOCH_WAIT() epoch_wait_preempt(net_epoch_preempt) #define NET_EPOCH_ASSERT() MPASS(in_epoch(net_epoch_preempt)) +#define IF_MGMT_CV_INIT(if) cv_init(&(if)->if_mgmt_cv, "if_mgmt_cv") +#define IF_MGMT_CV_DESTROY(if) cv_destroy(&(if)->if_mgmt_cv) +#define IF_MGMT_CV_BROADCAST(if) cv_broadcast(&(if)->if_mgmt_cv) +#define IF_MGMT_CV_WAIT(if) cv_wait(&(if)->if_mgmt_cv, &(if)->if_mgmt_lock) + +#define IF_MGMT_LOCK_INIT(if) mtx_init(&(if)->if_mgmt_lock, "if_mgmt_lock", NULL, MTX_DEF) +#define IF_MGMT_LOCK_DESTROY(if) mtx_destroy(&(if)->if_mgmt_lock) +#define IF_MGMT_LOCK(if) mtx_lock(&(if)->if_mgmt_lock) +#define IF_MGMT_UNLOCK(if) mtx_unlock(&(if)->if_mgmt_lock) +#define IF_MGMT_LOCK_ASSERT(if) mtx_assert(&(if)->if_mgmt_lock, MA_OWNED) + #ifdef _KERNEL /* interface link layer address change event */ typedef void (*iflladdr_event_handler_t)(void *, struct ifnet *);