Index: sys/fs/nullfs/null.h =================================================================== --- sys/fs/nullfs/null.h +++ sys/fs/nullfs/null.h @@ -45,6 +45,8 @@ struct mount *nullm_vfs; struct vnode *nullm_lowerrootvp; /* Ref to lower root vnode */ uint64_t nullm_flags; + struct mount_upper_node upper_node; + struct mount_upper_node notify_node; }; #ifdef _KERNEL Index: sys/fs/nullfs/null_vfsops.c =================================================================== --- sys/fs/nullfs/null_vfsops.c +++ sys/fs/nullfs/null_vfsops.c @@ -163,7 +163,8 @@ * Save pointer to underlying FS and the reference to the * lower root vnode. */ - xmp->nullm_vfs = vfs_pin_from_vp(lowerrootvp); + xmp->nullm_vfs = vfs_register_upper_from_vp(lowerrootvp, mp, + &xmp->upper_node); if (xmp->nullm_vfs == NULL) { vput(lowerrootvp); free(xmp, M_NULLFSMNT); @@ -178,7 +179,7 @@ */ error = null_nodeget(mp, lowerrootvp, &nullm_rootvp); if (error != 0) { - vfs_unpin(xmp->nullm_vfs); + vfs_unregister_upper(xmp->nullm_vfs, &xmp->upper_node); vrele(lowerrootvp); free(xmp, M_NULLFSMNT); return (error); @@ -195,6 +196,11 @@ (xmp->nullm_vfs->mnt_kern_flag & MNTK_NULL_NOCACHE) != 0) xmp->nullm_flags &= ~NULLM_CACHE; + if ((xmp->nullm_flags & NULLM_CACHE) != 0) { + vfs_register_for_notification(xmp->nullm_vfs, mp, + &xmp->notify_node); + } + MNT_ILOCK(mp); if ((xmp->nullm_flags & NULLM_CACHE) != 0) { mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag & @@ -206,13 +212,6 @@ (MNTK_USES_BCACHE | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS); MNT_IUNLOCK(mp); vfs_getnewfsid(mp); - if ((xmp->nullm_flags & NULLM_CACHE) != 0) { - MNT_ILOCK(xmp->nullm_vfs); - TAILQ_INSERT_TAIL(&xmp->nullm_vfs->mnt_uppers, mp, - mnt_upper_link); - MNT_IUNLOCK(xmp->nullm_vfs); - } - vfs_mountedfrom(mp, target); vput(nullm_rootvp); @@ -230,7 +229,6 @@ int mntflags; { struct null_mount *mntdata; - struct mount *ump; int error, flags; NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp); @@ -259,17 +257,11 @@ * Finally, throw away the 
null_mount structure */ mntdata = mp->mnt_data; - ump = mntdata->nullm_vfs; if ((mntdata->nullm_flags & NULLM_CACHE) != 0) { - MNT_ILOCK(ump); - while ((ump->mnt_kern_flag & MNTK_VGONE_UPPER) != 0) { - ump->mnt_kern_flag |= MNTK_VGONE_WAITER; - msleep(&ump->mnt_uppers, &ump->mnt_mtx, 0, "vgnupw", 0); - } - TAILQ_REMOVE(&ump->mnt_uppers, mp, mnt_upper_link); - MNT_IUNLOCK(ump); + vfs_unregister_for_notification(mntdata->nullm_vfs, + &mntdata->notify_node); } - vfs_unpin(ump); + vfs_unregister_upper(mntdata->nullm_vfs, &mntdata->upper_node); vrele(mntdata->nullm_lowerrootvp); mp->mnt_data = NULL; free(mntdata, M_NULLFSMNT); Index: sys/fs/unionfs/union.h =================================================================== --- sys/fs/unionfs/union.h +++ sys/fs/unionfs/union.h @@ -57,6 +57,8 @@ struct vnode *um_lowervp; /* VREFed once */ struct vnode *um_uppervp; /* VREFed once */ struct vnode *um_rootvp; /* ROOT vnode */ + struct mount_upper_node um_lower_link; /* node in lower FS list of uppers */ + struct mount_upper_node um_upper_link; /* node in upper FS list of uppers */ unionfs_copymode um_copymode; unionfs_whitemode um_whitemode; uid_t um_uid; Index: sys/fs/unionfs/union_vfsops.c =================================================================== --- sys/fs/unionfs/union_vfsops.c +++ sys/fs/unionfs/union_vfsops.c @@ -292,14 +292,16 @@ return (error); } - lowermp = vfs_pin_from_vp(ump->um_lowervp); - uppermp = vfs_pin_from_vp(ump->um_uppervp); + lowermp = vfs_register_upper_from_vp(ump->um_lowervp, mp, + &ump->um_lower_link); + uppermp = vfs_register_upper_from_vp(ump->um_uppervp, mp, + &ump->um_upper_link); if (lowermp == NULL || uppermp == NULL) { if (lowermp != NULL) - vfs_unpin(lowermp); + vfs_unregister_upper(lowermp, &ump->um_lower_link); if (uppermp != NULL) - vfs_unpin(uppermp); + vfs_unregister_upper(uppermp, &ump->um_upper_link); free(ump, M_UNIONFSMNT); mp->mnt_data = NULL; return (ENOENT); @@ -357,8 +359,8 @@ if (error) return (error); - 
vfs_unpin(ump->um_lowervp->v_mount); - vfs_unpin(ump->um_uppervp->v_mount); + vfs_unregister_upper(ump->um_lowervp->v_mount, &ump->um_lower_link); + vfs_unregister_upper(ump->um_uppervp->v_mount, &ump->um_upper_link); free(ump, M_UNIONFSMNT); mp->mnt_data = NULL; Index: sys/kern/vfs_mount.c =================================================================== --- sys/kern/vfs_mount.c +++ sys/kern/vfs_mount.c @@ -65,6 +65,7 @@ #include #include #include +#include <sys/taskqueue.h> #include #include @@ -89,6 +90,11 @@ SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0, "Retry failed r/w mount as r/o if no explicit ro/rw option is specified"); +static bool recursive_forced_unmount = false; +SYSCTL_BOOL(_vfs, OID_AUTO, recursive_forced_unmount, CTLFLAG_RW, + &recursive_forced_unmount, 0, "Recursively unmount stacked upper mounts" + " when a file system is forcibly unmounted"); + MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure"); static uma_zone_t mount_zone; @@ -103,6 +109,16 @@ EVENTHANDLER_LIST_DEFINE(vfs_mounted); EVENTHANDLER_LIST_DEFINE(vfs_unmounted); +static void vfs_deferred_unmount(void *arg, int pending); +static struct task deferred_unmount_task = + TASK_INITIALIZER(0, vfs_deferred_unmount, NULL); +static struct mtx deferred_unmount_lock; +MTX_SYSINIT(deferred_unmount, &deferred_unmount_lock, "deferred_unmount", + MTX_DEF); +static STAILQ_HEAD(, mount) deferred_unmount_list = + STAILQ_HEAD_INITIALIZER(deferred_unmount_list); +TASKQUEUE_DEFINE_THREAD(deferred_unmount); + static void mount_devctl_event(const char *type, struct mount *mp, bool donew); /* @@ -505,8 +521,21 @@ MNT_IUNLOCK(mp); } +/* + * Register ump as an upper mount of the mount associated with + * vnode vp. This registration will be tracked through + * mount_upper_node upper, which should be allocated by the + * caller and stored in per-mount data associated with mp. 
+ * + * If successful, this function will return the mount associated + * with vp, and will ensure that it cannot be unmounted until + * ump has been unregistered as one of its upper mounts. + * + * Upon failure this function will return NULL. + */ struct mount * -vfs_pin_from_vp(struct vnode *vp) +vfs_register_upper_from_vp(struct vnode *vp, struct mount *ump, + struct mount_upper_node *upper) { struct mount *mp; @@ -514,26 +543,81 @@ if (mp == NULL) return (NULL); MNT_ILOCK(mp); - if (mp != vp->v_mount || (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { + if (mp != vp->v_mount || + ((mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_RECURSE)) != 0)) { MNT_IUNLOCK(mp); return (NULL); } + KASSERT(ump != mp, ("upper and lower mounts are identical")); + upper->mp = ump; MNT_REF(mp); - KASSERT(mp->mnt_pinned_count < INT_MAX, - ("mount pinned count overflow")); - ++mp->mnt_pinned_count; + TAILQ_INSERT_TAIL(&mp->mnt_uppers, upper, mnt_upper_link); MNT_IUNLOCK(mp); return (mp); } +/* + * Register upper mount ump to receive vnode unlink/reclaim + * notifications from lower mount mp. This registration will + * be tracked through mount_upper_node upper, which should be + * allocated by the caller and stored in per-mount data + * associated with mp. + * + * ump must already be registered as an upper mount of mp + * through a call to vfs_register_upper_from_vp(). + */ void -vfs_unpin(struct mount *mp) +vfs_register_for_notification(struct mount *mp, struct mount *ump, + struct mount_upper_node *upper) +{ + upper->mp = ump; + MNT_ILOCK(mp); + TAILQ_INSERT_TAIL(&mp->mnt_notify, upper, mnt_upper_link); + MNT_IUNLOCK(mp); +} + +static void +vfs_drain_upper_locked(struct mount *mp) +{ + mtx_assert(MNT_MTX(mp), MA_OWNED); + while (mp->mnt_upper_pending != 0) { + mp->mnt_kern_flag |= MNTK_UPPER_WAITER; + msleep(&mp->mnt_uppers, MNT_MTX(mp), 0, "mntupw", 0); + } +} + +/* + * Undo a previous call to vfs_register_for_notification(). 
+ * The mount represented by upper must be currently registered + * as an upper mount for mp. + */ +void +vfs_unregister_for_notification(struct mount *mp, + struct mount_upper_node *upper) +{ + MNT_ILOCK(mp); + vfs_drain_upper_locked(mp); + TAILQ_REMOVE(&mp->mnt_notify, upper, mnt_upper_link); + MNT_IUNLOCK(mp); +} + +/* + * Undo a previous call to vfs_register_upper_from_vp(). + * This must be done before mp can be unmounted. + */ +void +vfs_unregister_upper(struct mount *mp, struct mount_upper_node *upper) { MNT_ILOCK(mp); - KASSERT(mp->mnt_pinned_count > 0, ("mount pinned count underflow")); KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0, - ("mount pinned with pending unmount")); - --mp->mnt_pinned_count; + ("registered upper with pending unmount")); + vfs_drain_upper_locked(mp); + TAILQ_REMOVE(&mp->mnt_uppers, upper, mnt_upper_link); + if ((mp->mnt_kern_flag & MNTK_TASKQUEUE_WAITER) != 0 && + TAILQ_EMPTY(&mp->mnt_uppers)) { + mp->mnt_kern_flag &= ~MNTK_TASKQUEUE_WAITER; + wakeup(&mp->taskqueue_link); + } MNT_REL(mp); MNT_IUNLOCK(mp); } @@ -600,8 +684,10 @@ mac_mount_create(cred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); + mp->mnt_upper_pending = 0; TAILQ_INIT(&mp->mnt_uppers); - mp->mnt_pinned_count = 0; + TAILQ_INIT(&mp->mnt_notify); + mp->taskqueue_flags = 0; return (mp); } @@ -640,9 +726,9 @@ vn_printf(vp, "dangling vnode "); panic("unmount: dangling vnode"); } - KASSERT(mp->mnt_pinned_count == 0, - ("mnt_pinned_count = %d", mp->mnt_pinned_count)); + KASSERT(mp->mnt_upper_pending == 0, ("mnt_upper_pending")); KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); + KASSERT(TAILQ_EMPTY(&mp->mnt_notify), ("mnt_notify")); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); if (mp->mnt_lazyvnodelistsize != 0) @@ -1799,17 +1885,165 @@ return (sum); } +static bool +deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue) +{ + bool enqueued; + + enqueued = false; + 
mtx_lock(&deferred_unmount_lock); + if ((mp->taskqueue_flags & MNT_TASKQUEUE) == 0 || requeue) { + mp->taskqueue_flags = flags | MNT_TASKQUEUE; + STAILQ_INSERT_TAIL(&deferred_unmount_list, mp, taskqueue_link); + enqueued = true; + } + mtx_unlock(&deferred_unmount_lock); + + if (enqueued) { + taskqueue_enqueue(taskqueue_deferred_unmount, + &deferred_unmount_task); + } + + return (enqueued); +} + +/* + * Taskqueue handler for processing async/recursive unmounts + */ +static void +vfs_deferred_unmount(void *argi __unused, int pending __unused) +{ + STAILQ_HEAD(, mount) local_unmounts; + uint64_t flags; + struct mount *mp, *tmp; + bool unmounted; + + STAILQ_INIT(&local_unmounts); + mtx_lock(&deferred_unmount_lock); + STAILQ_CONCAT(&local_unmounts, &deferred_unmount_list); + mtx_unlock(&deferred_unmount_lock); + + STAILQ_FOREACH_SAFE(mp, &local_unmounts, taskqueue_link, tmp) { + flags = mp->taskqueue_flags; + KASSERT((flags & MNT_TASKQUEUE) != 0, + ("taskqueue unmount without MNT_TASKQUEUE")); + if (dounmount(mp, flags, curthread) != 0) { + MNT_ILOCK(mp); + unmounted = ((mp->mnt_kern_flag & MNTK_REFEXPIRE) != 0); + MNT_IUNLOCK(mp); + if (!unmounted) + deferred_unmount_enqueue(mp, flags, true); + else + vfs_rel(mp); + } + } +} + /* * Do the actual filesystem unmount. */ int -dounmount(struct mount *mp, int flags, struct thread *td) +dounmount(struct mount *mp, uint64_t flags, struct thread *td) { + struct mount_upper_node *upper; struct vnode *coveredvp, *rootvp; int error; uint64_t async_flag; int mnt_gen_r; + KASSERT((flags & MNT_TASKQUEUE) == 0 || + (flags & (MNT_RECURSE | MNT_FORCE)) == (MNT_RECURSE | MNT_FORCE), + ("MNT_TASKQUEUE requires MNT_RECURSE | MNT_FORCE")); + + /* + * If the caller has explicitly requested the unmount to be handled by + * the taskqueue and we're not already in taskqueue context, queue + * up the unmount request and exit. 
This is done prior to any + * credential checks; MNT_TASKQUEUE should be used only for kernel- + * initiated unmounts and will therefore be processed with the + * (kernel) credentials of the taskqueue thread. Still, callers + * should be sure this is the behavior they want. + */ + if ((flags & MNT_TASKQUEUE) != 0 && + taskqueue_member(taskqueue_deferred_unmount, curthread) == 0) { + if (!deferred_unmount_enqueue(mp, flags, false)) + vfs_rel(mp); + return (EINPROGRESS); + } + + /* + * Only privileged root, or (if MNT_USER is set) the user that did the + * original mount is permitted to unmount this filesystem. + * This check should be made prior to queueing up any recursive + * unmounts of upper filesystems. Those unmounts will be executed + * with kernel thread credentials and are expected to succeed, so + * we must at least ensure the originating context has sufficient + * privilege to unmount the base filesystem before proceeding with + * the uppers. + */ + error = vfs_suser(mp, td); + if (error != 0) { + KASSERT((flags & MNT_TASKQUEUE) == 0, + ("taskqueue unmount with insufficient privilege")); + vfs_rel(mp); + return (error); + } + + if (recursive_forced_unmount && ((flags & MNT_FORCE) != 0)) + flags |= MNT_RECURSE; + + if ((flags & MNT_RECURSE) != 0) { + KASSERT((flags & MNT_FORCE) != 0, + ("MNT_RECURSE requires MNT_FORCE")); + + MNT_ILOCK(mp); + /* + * Set MNTK_RECURSE to prevent new upper mounts from being + * added, and note that an operation on the uppers list is in + * progress. This will ensure that unregistration from the + * uppers list, and therefore any pending unmount of the upper + * FS, can't complete until after we finish walking the list. 
+ */ + mp->mnt_kern_flag |= MNTK_RECURSE; + mp->mnt_upper_pending++; + TAILQ_FOREACH(upper, &mp->mnt_uppers, mnt_upper_link) { + MNT_IUNLOCK(mp); + vfs_ref(upper->mp); + if (!deferred_unmount_enqueue(upper->mp, flags, false)) + vfs_rel(upper->mp); + MNT_ILOCK(mp); + } + mp->mnt_upper_pending--; + if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && + mp->mnt_upper_pending == 0) { + mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; + wakeup(&mp->mnt_uppers); + } + /* + * If we're not on the taskqueue, wait until the uppers list + * is drained before proceeding with unmount. Otherwise, if + * we are on the taskqueue and there are still pending uppers, + * just re-enqueue on the end of the taskqueue. + */ + if ((flags & MNT_TASKQUEUE) == 0) { + while (!TAILQ_EMPTY(&mp->mnt_uppers)) { + mp->mnt_kern_flag |= MNTK_TASKQUEUE_WAITER; + msleep(&mp->taskqueue_link, MNT_MTX(mp), 0, + "umntqw", 0); + } + } else if (!TAILQ_EMPTY(&mp->mnt_uppers)) { + MNT_IUNLOCK(mp); + deferred_unmount_enqueue(mp, flags, true); + return (0); + } + MNT_IUNLOCK(mp); + KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers not empty")); + } + + /* Allow the taskqueue to safely re-enqueue on failure */ + if ((flags & MNT_TASKQUEUE) != 0) + vfs_ref(mp); + if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); @@ -1828,27 +2062,13 @@ } } - /* - * Only privileged root, or (if MNT_USER is set) the user that did the - * original mount is permitted to unmount this filesystem. 
- */ - error = vfs_suser(mp, td); - if (error != 0) { - if (coveredvp != NULL) { - VOP_UNLOCK(coveredvp); - vdrop(coveredvp); - } - vfs_rel(mp); - return (error); - } - vfs_op_enter(mp); vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || (mp->mnt_flag & MNT_UPDATE) != 0 || - mp->mnt_pinned_count != 0) { + !TAILQ_EMPTY(&mp->mnt_uppers)) { dounmount_cleanup(mp, coveredvp, 0); return (EBUSY); } @@ -1952,6 +2172,7 @@ } return (error); } + mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); @@ -1977,6 +2198,8 @@ } if (mp == rootdevmp) rootdevmp = NULL; + if ((flags & MNT_TASKQUEUE) != 0) + vfs_rel(mp); vfs_mount_destroy(mp); return (0); } Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c +++ sys/kern/vfs_subr.c @@ -831,9 +831,9 @@ * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { - KASSERT(mp->mnt_pinned_count == 0, - ("%s: non-zero pinned count %d with pending unmount", - __func__, mp->mnt_pinned_count)); + KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), + ("%s: non-empty upper mount list with pending unmount", + __func__)); if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); @@ -3891,61 +3891,44 @@ VI_UNLOCK(vp); } -static void -notify_lowervp_vfs_dummy(struct mount *mp __unused, - struct vnode *lowervp __unused) -{ -} - /* * Notify upper mounts about reclaimed or unlinked vnode. 
*/ void vfs_notify_upper(struct vnode *vp, int event) { - static struct vfsops vgonel_vfsops = { - .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, - .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, - }; - struct mount *mp, *ump, *mmp; + struct mount *mp; + struct mount_upper_node *ump; - mp = vp->v_mount; + mp = atomic_load_ptr(&vp->v_mount); if (mp == NULL) return; - if (TAILQ_EMPTY(&mp->mnt_uppers)) + if (TAILQ_EMPTY(&mp->mnt_notify)) return; - mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); - mmp->mnt_op = &vgonel_vfsops; - mmp->mnt_kern_flag |= MNTK_MARKER; MNT_ILOCK(mp); - mp->mnt_kern_flag |= MNTK_VGONE_UPPER; - for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { - if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { - ump = TAILQ_NEXT(ump, mnt_upper_link); - continue; - } - TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); + mp->mnt_upper_pending++; + KASSERT(mp->mnt_upper_pending > 0, + ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); + TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) { MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: - VFS_RECLAIM_LOWERVP(ump, vp); + VFS_RECLAIM_LOWERVP(ump->mp, vp); break; case VFS_NOTIFY_UPPER_UNLINK: - VFS_UNLINK_LOWERVP(ump, vp); + VFS_UNLINK_LOWERVP(ump->mp, vp); break; default: KASSERT(0, ("invalid event %d", event)); break; } MNT_ILOCK(mp); - ump = TAILQ_NEXT(mmp, mnt_upper_link); - TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); } - free(mmp, M_TEMP); - mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; - if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { - mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; + mp->mnt_upper_pending--; + if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && + mp->mnt_upper_pending == 0) { + mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; wakeup(&mp->mnt_uppers); } MNT_IUNLOCK(mp); @@ -4376,12 +4359,12 @@ MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); - MNT_KERN_FLAG(MNTK_VGONE_UPPER); - 
MNT_KERN_FLAG(MNTK_VGONE_WAITER); + MNT_KERN_FLAG(MNTK_RECURSE); + MNT_KERN_FLAG(MNTK_UPPER_WAITER); MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); - MNT_KERN_FLAG(MNTK_MARKER); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_FPLOOKUP); + MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); Index: sys/sys/mount.h =================================================================== --- sys/sys/mount.h +++ sys/sys/mount.h @@ -190,6 +190,19 @@ _Static_assert(sizeof(struct mount_pcpu) == 16, "the struct is allocated from pcpu 16 zone"); +/* + * Structure for tracking a stacked filesystem mounted above another + * filesystem. This is expected to be stored in the upper FS' per-mount data. + * + * Lock reference: + * i - lower mount interlock + * c - constant from node initialization + */ +struct mount_upper_node { + struct mount *mp; /* (c) mount object for upper FS */ + TAILQ_ENTRY(mount_upper_node) mnt_upper_link; /* (i) position in uppers list */ +}; + /* * Structure per mounted filesystem. Each mounted filesystem has an * array of operations and an instance record. The filesystems are @@ -199,8 +212,8 @@ * l - mnt_listmtx * m - mountlist_mtx * i - interlock - * i* - interlock of uppers' list head * v - vnode freelist mutex + * d - deferred unmount list mutex * * Unmarked fields are considered stable as long as a ref is held. 
* @@ -242,10 +255,12 @@ struct mtx mnt_listmtx; struct vnodelst mnt_lazyvnodelist; /* (l) list of lazy vnodes */ int mnt_lazyvnodelistsize; /* (l) # of lazy vnodes */ - int mnt_pinned_count; /* (i) unmount prevented */ + int mnt_upper_pending; /* (i) # of pending ops on mnt_uppers */ struct lock mnt_explock; /* vfs_export walkers lock */ - TAILQ_ENTRY(mount) mnt_upper_link; /* (i*) we in the all uppers */ - TAILQ_HEAD(, mount) mnt_uppers; /* (i) upper mounts over us */ + TAILQ_HEAD(, mount_upper_node) mnt_uppers; /* (i) upper mounts over us */ + TAILQ_HEAD(, mount_upper_node) mnt_notify; /* (i) upper mounts for notification */ + STAILQ_ENTRY(mount) taskqueue_link; /* (d) our place in deferred unmount list */ + uint64_t taskqueue_flags; /* (d) unmount flags passed from taskqueue */ }; #endif /* _WANT_MOUNT || _KERNEL */ @@ -438,9 +453,13 @@ #define MNT_BYFSID 0x0000000008000000ULL /* specify filesystem by ID. */ #define MNT_NOCOVER 0x0000001000000000ULL /* Do not cover a mount point */ #define MNT_EMPTYDIR 0x0000002000000000ULL /* Only mount on empty dir */ -#define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ +#define MNT_RECURSE 0x0000100000000000ULL /* recursively unmount uppers */ +#define MNT_TASKQUEUE 0x0000200000000000ULL /* unmount in taskqueue context */ +#define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_NONBUSY | \ - MNT_BYFSID | MNT_NOCOVER | MNT_EMPTYDIR) + MNT_BYFSID | MNT_NOCOVER | MNT_EMPTYDIR | \ + MNT_RECURSE | MNT_TASKQUEUE) + /* * Internal filesystem control flags stored in mnt_kern_flag. * @@ -466,10 +485,9 @@ #define MNTK_NO_IOPF 0x00000100 /* Disallow page faults during reads and writes. Filesystem shall properly handle i/o state on EFAULT. 
*/ -#define MNTK_VGONE_UPPER 0x00000200 -#define MNTK_VGONE_WAITER 0x00000400 +#define MNTK_RECURSE 0x00000200 /* pending recursive unmount */ +#define MNTK_UPPER_WAITER 0x00000400 /* waiting to drain MNTK_UPPER_PENDING */ #define MNTK_LOOKUP_EXCL_DOTDOT 0x00000800 -#define MNTK_MARKER 0x00001000 #define MNTK_UNMAPPED_BUFS 0x00002000 #define MNTK_USES_BCACHE 0x00004000 /* FS uses the buffer cache. */ #define MNTK_TEXT_REFS 0x00008000 /* Keep use ref for text */ @@ -477,8 +495,9 @@ #define MNTK_UNIONFS 0x00020000 /* A hack for F_ISUNIONSTACK */ #define MNTK_FPLOOKUP 0x00040000 /* fast path lookup is supported */ #define MNTK_SUSPEND_ALL 0x00080000 /* Suspended by all-fs suspension */ -#define MNTK_NOASYNC 0x00800000 /* disable async */ -#define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ +#define MNTK_TASKQUEUE_WAITER 0x00100000 /* Waiting on unmount taskqueue */ +#define MNTK_NOASYNC 0x00800000 /* disable async */ +#define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_SUSPEND 0x08000000 /* request write suspension */ #define MNTK_SUSPEND2 0x04000000 /* block secondary writes */ @@ -952,7 +971,7 @@ * exported vnode operations */ -int dounmount(struct mount *, int, struct thread *); +int dounmount(struct mount *, uint64_t, struct thread *); int kernel_mount(struct mntarg *ma, uint64_t flags); int kernel_vmount(int flags, ...); @@ -1012,8 +1031,13 @@ int vfs_suser(struct mount *, struct thread *); void vfs_unbusy(struct mount *); void vfs_unmountall(void); -struct mount *vfs_pin_from_vp(struct vnode *); -void vfs_unpin(struct mount *); +struct mount *vfs_register_upper_from_vp(struct vnode *, + struct mount *ump, struct mount_upper_node *); +void vfs_register_for_notification(struct mount *, struct mount *, + struct mount_upper_node *); +void vfs_unregister_for_notification(struct mount *, + struct mount_upper_node *); +void vfs_unregister_upper(struct mount *, struct 
mount_upper_node *); extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ extern struct mtx_padalign mountlist_mtx; extern struct nfs_public nfs_pub; Index: sys/ufs/ffs/ffs_vfsops.c =================================================================== --- sys/ufs/ffs/ffs_vfsops.c +++ sys/ufs/ffs/ffs_vfsops.c @@ -281,28 +281,8 @@ } /* - * Initiate a forcible unmount. + * On first ENXIO error, initiate an asynchronous forcible unmount. * Used to unmount filesystems whose underlying media has gone away. - */ -static void -ffs_fsfail_unmount(void *v, int pending) -{ - struct fsfail_task *etp; - struct mount *mp; - - etp = v; - - /* - * Find our mount and get a ref on it, then try to unmount. - */ - mp = vfs_getvfs(&etp->fsid); - if (mp != NULL) - dounmount(mp, MNT_FORCE, curthread); - free(etp, M_UFSMNT); -} - -/* - * On first ENXIO error, start a task that forcibly unmounts the filesystem. * * Return true if a cleanup is in progress. */ @@ -320,25 +300,18 @@ int ffs_fsfail_cleanup_locked(struct ufsmount *ump, int error) { - struct fsfail_task *etp; - struct task *tp; - mtx_assert(UFS_MTX(ump), MA_OWNED); if (error == ENXIO && (ump->um_flags & UM_FSFAIL_CLEANUP) == 0) { ump->um_flags |= UM_FSFAIL_CLEANUP; /* * Queue an async forced unmount. 
*/ - etp = ump->um_fsfail_task; - ump->um_fsfail_task = NULL; - if (etp != NULL) { - tp = &etp->task; - TASK_INIT(tp, 0, ffs_fsfail_unmount, etp); - taskqueue_enqueue(taskqueue_thread, tp); - printf("UFS: forcibly unmounting %s from %s\n", - ump->um_mountp->mnt_stat.f_mntfromname, - ump->um_mountp->mnt_stat.f_mntonname); - } + vfs_ref(ump->um_mountp); + dounmount(ump->um_mountp, + MNT_FORCE | MNT_RECURSE | MNT_TASKQUEUE, curthread); + printf("UFS: forcibly unmounting %s from %s\n", + ump->um_mountp->mnt_stat.f_mntfromname, + ump->um_mountp->mnt_stat.f_mntonname); } return ((ump->um_flags & UM_FSFAIL_CLEANUP) != 0); } @@ -1046,7 +1019,6 @@ struct g_consumer *cp; struct mount *nmp; struct vnode *devvp; - struct fsfail_task *etp; int candelete, canspeedup; off_t loc; @@ -1334,9 +1306,6 @@ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ - etp = malloc(sizeof *ump->um_fsfail_task, M_UFSMNT, M_WAITOK | M_ZERO); - etp->fsid = mp->mnt_stat.f_fsid; - ump->um_fsfail_task = etp; return (0); out: if (fs != NULL) { @@ -1583,8 +1552,6 @@ free(fs->fs_csp, M_UFSMNT); free(fs->fs_si, M_UFSMNT); free(fs, M_UFSMNT); - if (ump->um_fsfail_task != NULL) - free(ump->um_fsfail_task, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); Index: sys/ufs/ufs/ufsmount.h =================================================================== --- sys/ufs/ufs/ufsmount.h +++ sys/ufs/ufs/ufsmount.h @@ -67,10 +67,6 @@ TAILQ_HEAD(inodedeplst, inodedep); LIST_HEAD(bmsafemaphd, bmsafemap); LIST_HEAD(trimlist_hashhead, ffs_blkfree_trim_params); -struct fsfail_task { - struct task task; - fsid_t fsid; -}; #include #include @@ -123,7 +119,6 @@ struct taskqueue *um_trim_tq; /* (c) trim request queue */ struct trimlist_hashhead *um_trimhash; /* (i) trimlist hash table */ u_long um_trimlisthashsize; /* (i) trim hash table size-1 */ - struct fsfail_task *um_fsfail_task; /* (i) task for fsfail cleanup*/ /* (c) - below function ptrs */ int 
(*um_balloc)(struct vnode *, off_t, int, struct ucred *, int, struct buf **); Index: tools/test/stress2/misc/gnop11.sh =================================================================== --- /dev/null +++ tools/test/stress2/misc/gnop11.sh @@ -0,0 +1,85 @@ +#!/bin/sh + +# +# SPDX-License-Identifier: BSD-2-Clause-FreeBSD +# +# Copyright (c) 2020 Kirk McKusick +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# + +# 'panic: Lock (lockmgr) ufs not locked @ kern/kern_lock.c:1271' seen: +# https://people.freebsd.org/~pho/stress/log/gnop8.txt + +[ `id -u ` -ne 0 ] && echo "Must be root!" && exit 1 +. 
../default.cfg + +fsck=/sbin/fsck_ffs +exp=/sbin/fsck_ffs.exp # Experimental version +[ -f $exp ] && { echo "Using $exp"; fsck=$exp; } +mdconfig -a -t swap -s 5g -u $mdstart || exit 1 +md=md$mdstart +newfs -j /dev/$md || exit 1 +start=`date +%s` + +nullfs_mounts=15 +: ${nullfs_dstdir:=$mntpoint} + +while [ $((`date +%s` - start)) -lt 120 ]; do + gnop create /dev/$md || exit 1 + mount /dev/$md.nop /mnt || exit 1 + + for i in `jot $nullfs_mounts`; do + [ ! -d ${nullfs_dstdir}$i ] && mkdir ${nullfs_dstdir}$i + [ ! -d ${nullfs_dstdir}$(($i + $nullfs_mounts)) ] && + mkdir ${nullfs_dstdir}$(($i + $nullfs_mounts)) + mount | grep -q " ${nullfs_dstdir}$i " && + umount ${nullfs_dstdir}$i + mount | grep -q " ${nullfs_dstdir}$(($i + $nullfs_mounts)) " && + umount ${nullfs_dstdir}$(($i + $nullfs_mounts)) + mount_nullfs /mnt ${nullfs_dstdir}$i > \ + /dev/null 2>&1 + mount_nullfs ${nullfs_dstdir}$i ${nullfs_dstdir}$(($i + $nullfs_mounts)) > \ + /dev/null 2>&1 + done + # start your favorite I/O test here + cp -rp /[a-l]* /[n-z]* /mnt & + + # after some number of seconds + sleep 1 + gnop destroy -f /dev/$md.nop + kill $! + + # wait until forcible unmount, may be up to about 30 seconds, + # but typically very quick if I/O is in progress + while (a=`mount | egrep /mnt`) do sleep 1; done + + # first fsck will attempt journal recovery + $fsck -d -y /dev/$md + + # second fsck will do traditional fsck to check for any errors + # from journal recovery + $fsck -d -y /dev/$md + wait +done +mdconfig -d -u ${md#md} +exit 0