Index: head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
===================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
@@ -242,6 +242,7 @@
 	if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
 		panic("mount: lost mount");
 	VOP_UNLOCK(vp, 0);
+	vfs_op_exit(mp);
 	vfs_unbusy(mp);
 	*vpp = mvp;
 	return (0);
Index: head/sys/kern/subr_pcpu.c
===================================================================
--- head/sys/kern/subr_pcpu.c
+++ head/sys/kern/subr_pcpu.c
@@ -131,15 +131,19 @@
 
 /*
  * UMA_PCPU_ZONE zones, that are available for all kernel
- * consumers. Right now 64 bit zone is used for counter(9).
+ * consumers. Right now 64 bit zone is used for counter(9)
+ * and int zone is used for mount point counters.
  */
+uma_zone_t pcpu_zone_int;
 uma_zone_t pcpu_zone_64;
 
 static void
 pcpu_zones_startup(void)
 {
 
+	pcpu_zone_int = uma_zcreate("int pcpu", sizeof(int),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU);
 	pcpu_zone_64 = uma_zcreate("64 pcpu", sizeof(uint64_t),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_PCPU);
 }
Index: head/sys/kern/vfs_default.c
===================================================================
--- head/sys/kern/vfs_default.c
+++ head/sys/kern/vfs_default.c
@@ -601,17 +601,24 @@
 	 */
 	vp = ap->a_vp;
 	mp = vp->v_mount;
-	if (mp == NULL)
-		goto out;
-	MNT_ILOCK(mp);
-	if (mp != vp->v_mount) {
+	if (mp == NULL) {
+		*(ap->a_mpp) = NULL;
+		return (0);
+	}
+	if (vfs_op_thread_enter(mp)) {
+		if (mp == vp->v_mount)
+			MNT_REF_UNLOCKED(mp);
+		else
+			mp = NULL;
+		vfs_op_thread_exit(mp);
+	} else {
+		MNT_ILOCK(mp);
+		if (mp == vp->v_mount)
+			MNT_REF(mp);
+		else
+			mp = NULL;
 		MNT_IUNLOCK(mp);
-		mp = NULL;
-		goto out;
 	}
-	MNT_REF(mp);
-	MNT_IUNLOCK(mp);
-out:
 	*(ap->a_mpp) = mp;
 	return (0);
 }
Index: head/sys/kern/vfs_mount.c
===================================================================
--- head/sys/kern/vfs_mount.c
+++ head/sys/kern/vfs_mount.c
@@ -41,6 +41,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -123,6 +124,10 @@
 	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
 	mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
 	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
+	mp->mnt_thread_in_ops_pcpu = uma_zalloc_pcpu(pcpu_zone_int,
+	    M_WAITOK | M_ZERO);
+	mp->mnt_ref = 0;
+	mp->mnt_vfs_ops = 1;
 	return (0);
 }
@@ -132,6 +137,7 @@
 	struct mount *mp;
 
 	mp = (struct mount *)mem;
+	uma_zfree_pcpu(pcpu_zone_int, mp->mnt_thread_in_ops_pcpu);
 	lockdestroy(&mp->mnt_explock);
 	mtx_destroy(&mp->mnt_listmtx);
 	mtx_destroy(&mp->mnt_mtx);
@@ -445,6 +451,12 @@
 {
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+	if (vfs_op_thread_enter(mp)) {
+		MNT_REF_UNLOCKED(mp);
+		vfs_op_thread_exit(mp);
+		return;
+	}
+
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	MNT_IUNLOCK(mp);
@@ -455,6 +467,12 @@
 {
 
 	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+	if (vfs_op_thread_enter(mp)) {
+		MNT_REL_UNLOCKED(mp);
+		vfs_op_thread_exit(mp);
+		return;
+	}
+
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
@@ -478,7 +496,12 @@
 	mp->mnt_activevnodelistsize = 0;
 	TAILQ_INIT(&mp->mnt_tmpfreevnodelist);
 	mp->mnt_tmpfreevnodelistsize = 0;
-	mp->mnt_ref = 0;
+	if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 ||
+	    mp->mnt_writeopcount != 0)
+		panic("%s: non-zero counters on new mp %p\n", __func__, mp);
+	if (mp->mnt_vfs_ops != 1)
+		panic("%s: vfs_ops should be 1 but %d found\n", __func__,
+		    mp->mnt_vfs_ops);
 	(void) vfs_busy(mp, MBF_NOWAIT);
 	atomic_add_acq_int(&vfsp->vfc_refcount, 1);
 	mp->mnt_op = vfsp->vfc_vfsops;
@@ -507,6 +530,9 @@
 vfs_mount_destroy(struct mount *mp)
 {
 
+	if (mp->mnt_vfs_ops == 0)
+		panic("%s: entered with zero vfs_ops\n", __func__);
+
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_REFEXPIRE;
 	if (mp->mnt_kern_flag & MNTK_MWAIT) {
@@ -540,6 +566,11 @@
 	if (mp->mnt_lockref != 0)
 		panic("vfs_mount_destroy: nonzero lock refcount");
 	MNT_IUNLOCK(mp);
+
+	if (mp->mnt_vfs_ops != 1)
+		panic("%s: vfs_ops should be 1 but %d found\n", __func__,
+		    mp->mnt_vfs_ops);
+
 	if (mp->mnt_vnodecovered != NULL)
 		vrele(mp->mnt_vnodecovered);
 #ifdef MAC
@@ -951,6 +982,7 @@
 		vrele(newdp);
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		vfs_allocate_syncvnode(mp);
+	vfs_op_exit(mp);
 	vfs_unbusy(mp);
 	return (0);
 }
@@ -1019,6 +1051,8 @@
 	VI_UNLOCK(vp);
 	VOP_UNLOCK(vp, 0);
 
+	vfs_op_enter(mp);
+
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		MNT_IUNLOCK(mp);
@@ -1100,6 +1134,7 @@
 	else
 		vfs_deallocate_syncvnode(mp);
 end:
+	vfs_op_exit(mp);
 	vfs_unbusy(mp);
 	VI_LOCK(vp);
 	vp->v_iflag &= ~VI_MOUNT;
@@ -1328,6 +1363,7 @@
 		mp->mnt_kern_flag &= ~MNTK_MWAIT;
 		wakeup(mp);
 	}
+	vfs_op_exit_locked(mp);
 	MNT_IUNLOCK(mp);
 	if (coveredvp != NULL) {
 		VOP_UNLOCK(coveredvp, 0);
@@ -1337,6 +1373,69 @@
 }
 
 /*
+ * There are various reference counters associated with the mount point.
+ * Normally it is permitted to modify them without taking the mnt ilock,
+ * but this behavior can be temporarily disabled if stable value is needed
+ * or callers are expected to block (e.g. to not allow new users during
+ * forced unmount).
+ */
+void
+vfs_op_enter(struct mount *mp)
+{
+
+	MNT_ILOCK(mp);
+	mp->mnt_vfs_ops++;
+	if (mp->mnt_vfs_ops > 1) {
+		MNT_IUNLOCK(mp);
+		return;
+	}
+	/*
+	 * Paired with a fence in vfs_op_thread_enter(). See the comment
+	 * above it for details.
+	 */
+	atomic_thread_fence_seq_cst();
+	vfs_op_barrier_wait(mp);
+	MNT_IUNLOCK(mp);
+}
+
+void
+vfs_op_exit_locked(struct mount *mp)
+{
+
+	mtx_assert(MNT_MTX(mp), MA_OWNED);
+
+	if (mp->mnt_vfs_ops <= 0)
+		panic("%s: invalid vfs_ops count %d for mp %p\n",
+		    __func__, mp->mnt_vfs_ops, mp);
+	mp->mnt_vfs_ops--;
+}
+
+void
+vfs_op_exit(struct mount *mp)
+{
+
+	MNT_ILOCK(mp);
+	vfs_op_exit_locked(mp);
+	MNT_IUNLOCK(mp);
+}
+
+/*
+ * It is assumed the caller already posted at least an acquire barrier.
+ */
+void
+vfs_op_barrier_wait(struct mount *mp)
+{
+	int *in_op;
+	int cpu;
+
+	CPU_FOREACH(cpu) {
+		in_op = zpcpu_get_cpu(mp->mnt_thread_in_ops_pcpu, cpu);
+		while (atomic_load_int(in_op))
+			cpu_spinwait();
+	}
+}
+
+/*
  * Do the actual filesystem unmount.
  */
 int
@@ -1379,6 +1478,8 @@
 		return (error);
 	}
 
+	vfs_op_enter(mp);
+
 	vn_start_write(NULL, &mp, V_WAIT | V_MNTREF);
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
@@ -1469,6 +1570,7 @@
 		mp->mnt_kern_flag &= ~MNTK_MWAIT;
 		wakeup(mp);
 	}
+	vfs_op_exit_locked(mp);
 	MNT_IUNLOCK(mp);
 	if (coveredvp)
 		VOP_UNLOCK(coveredvp, 0);
Index: head/sys/kern/vfs_mountroot.c
===================================================================
--- head/sys/kern/vfs_mountroot.c
+++ head/sys/kern/vfs_mountroot.c
@@ -273,6 +273,7 @@
 
 		*mpp = mp;
 		rootdevmp = mp;
+		vfs_op_exit(mp);
 	}
 
 	set_rootvnode();
Index: head/sys/kern/vfs_subr.c
===================================================================
--- head/sys/kern/vfs_subr.c
+++ head/sys/kern/vfs_subr.c
@@ -4032,6 +4032,7 @@
 	    mp->mnt_secondary_accwrites);
 	db_printf("    mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ?
 	    mp->mnt_gjprovider : "NULL");
+	db_printf("    mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
 
 	db_printf("\n\nList of active vnodes\n");
 	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
Index: head/sys/sys/mount.h
===================================================================
--- head/sys/sys/mount.h
+++ head/sys/sys/mount.h
@@ -226,6 +226,8 @@
 	struct lock	mnt_explock;		/* vfs_export walkers lock */
 	TAILQ_ENTRY(mount) mnt_upper_link;	/* (m) we in the all uppers */
 	TAILQ_HEAD(, mount) mnt_uppers;		/* (m) upper mounts over us*/
+	int		mnt_vfs_ops;		/* (i) pending vfs ops */
+	int		*mnt_thread_in_ops_pcpu;
 };
 
 /*
@@ -265,15 +267,26 @@
 #define	MNT_ITRYLOCK(mp)	mtx_trylock(&(mp)->mnt_mtx)
 #define	MNT_IUNLOCK(mp)		mtx_unlock(&(mp)->mnt_mtx)
 #define	MNT_MTX(mp)		(&(mp)->mnt_mtx)
+
+#define	MNT_REF_UNLOCKED(mp)	do {					\
+	atomic_add_int(&(mp)->mnt_ref, 1);				\
+} while (0)
+#define	MNT_REL_UNLOCKED(mp)	do {					\
+	int _c;								\
+	_c = atomic_fetchadd_int(&(mp)->mnt_ref, -1) - 1;		\
+	KASSERT(_c >= 0, ("negative mnt_ref %d", _c));			\
+} while (0)
+
 #define	MNT_REF(mp)	do {						\
 	mtx_assert(MNT_MTX(mp), MA_OWNED);				\
-	(mp)->mnt_ref++;						\
+	atomic_add_int(&(mp)->mnt_ref, 1);				\
 } while (0)
 #define	MNT_REL(mp)	do {						\
+	int _c;								\
 	mtx_assert(MNT_MTX(mp), MA_OWNED);				\
-	KASSERT((mp)->mnt_ref > 0, ("negative mnt_ref"));		\
-	(mp)->mnt_ref--;						\
-	if ((mp)->mnt_ref == 0)						\
+	_c = atomic_fetchadd_int(&(mp)->mnt_ref, -1) - 1;		\
+	KASSERT(_c >= 0, ("negative mnt_ref %d", _c));			\
+	if (_c == 0)							\
 		wakeup((mp));						\
 } while (0)
 
@@ -940,6 +953,48 @@
 void	syncer_suspend(void);
 void	syncer_resume(void);
 
+void	vfs_op_barrier_wait(struct mount *);
+void	vfs_op_enter(struct mount *);
+void	vfs_op_exit_locked(struct mount *);
+void	vfs_op_exit(struct mount *);
+
+/*
+ * We mark ourselves as entering the section and post a sequentially consistent
+ * fence, meaning the store is completed before we get into the section and
+ * mnt_vfs_ops is only read afterwards.
+ *
+ * Any thread transitioning the ops counter 0->1 does things in the opposite
+ * order - first bumps the count, posts a sequentially consistent fence and
+ * observes all CPUs not executing within the section.
+ *
+ * This provides an invariant that by the time the last CPU is observed not
+ * executing, everyone else entering will see the counter > 0 and exit.
+ *
+ * Note there is no barrier between vfs_ops and the rest of the code in the
+ * section. It is not necessary as the writer has to wait for everyone to drain
+ * before making any changes or only make changes safe while the section is
+ * executed.
+ */
+
+#define	vfs_op_thread_enter(mp) ({					\
+	struct mount *_mp = (mp);					\
+	bool _retval = true;						\
+	critical_enter();						\
+	*(int *)zpcpu_get(_mp->mnt_thread_in_ops_pcpu) = 1;		\
+	atomic_thread_fence_seq_cst();					\
+	if (__predict_false(_mp->mnt_vfs_ops > 0)) {			\
+		vfs_op_thread_exit(_mp);				\
+		_retval = false;					\
+	}								\
+	_retval;							\
+})
+
+#define	vfs_op_thread_exit(mp) do {					\
+	atomic_thread_fence_rel();					\
+	*(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 0;		\
+	critical_exit();						\
+} while (0)
 
 #else /* !_KERNEL */
Index: head/sys/vm/uma.h
===================================================================
--- head/sys/vm/uma.h
+++ head/sys/vm/uma.h
@@ -650,6 +650,7 @@
 
 /*
  * Common UMA_ZONE_PCPU zones.
  */
+extern uma_zone_t pcpu_zone_int;
 extern uma_zone_t pcpu_zone_64;
 
 /*
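
A note on the synchronization scheme: the comment added to sys/mount.h above describes a store/fence/load handshake between the per-CPU fast path (vfs_op_thread_enter()) and the drain side (vfs_op_enter() plus vfs_op_barrier_wait()). The standalone C11 program below is only a rough userland model of that handshake, not kernel code: per-CPU slots are approximated by a per-thread array, critical_enter()/critical_exit() are omitted, cpu_spinwait() is replaced with sched_yield(), writers are serialized by running them from a single thread in place of the mount interlock, and all names here (slot, vfs_ops, op_thread_enter(), ...) are invented for the illustration.

/*
 * Userland sketch of the vfs_op_thread_enter()/vfs_op_enter() handshake.
 * Build with: cc -O2 -pthread model.c
 */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define	NTHREADS	4

static atomic_int slot[NTHREADS];	/* models mnt_thread_in_ops_pcpu */
static atomic_int vfs_ops;		/* models mp->mnt_vfs_ops */
static atomic_long nfast, nslow;	/* how often each path was taken */

/* Reader fast path: mark the slot, fence, then check for an active writer. */
static bool
op_thread_enter(int tid)
{

	atomic_store_explicit(&slot[tid], 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&vfs_ops, memory_order_relaxed) > 0) {
		/* Writer active: clear the marker and take the slow path. */
		atomic_store_explicit(&slot[tid], 0, memory_order_release);
		return (false);
	}
	return (true);
}

static void
op_thread_exit(int tid)
{

	atomic_store_explicit(&slot[tid], 0, memory_order_release);
}

/*
 * Writer side: bump the counter, fence, then wait for every slot to drain.
 * The kernel only performs the wait on the 0->1 transition and serializes
 * writers with the mount interlock; here a single thread calls this.
 */
static void
op_enter(void)
{
	int i;

	atomic_fetch_add(&vfs_ops, 1);
	atomic_thread_fence(memory_order_seq_cst);
	for (i = 0; i < NTHREADS; i++)
		while (atomic_load(&slot[i]) != 0)
			sched_yield();	/* cpu_spinwait() stand-in */
}

static void
op_exit(void)
{

	atomic_fetch_sub(&vfs_ops, 1);
}

static void *
reader(void *arg)
{
	int tid, i;

	tid = (int)(long)arg;
	for (i = 0; i < 1000000; i++) {
		if (op_thread_enter(tid)) {
			atomic_fetch_add(&nfast, 1);	/* lockless work */
			op_thread_exit(tid);
		} else {
			atomic_fetch_add(&nslow, 1);	/* would take MNT_ILOCK */
		}
	}
	return (NULL);
}

int
main(void)
{
	pthread_t tids[NTHREADS];
	long i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tids[i], NULL, reader, (void *)i);
	for (i = 0; i < 1000; i++) {
		op_enter();	/* all fast-path users are drained here */
		op_exit();
	}
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tids[i], NULL);
	printf("fast path: %ld, slow path: %ld\n",
	    (long)atomic_load(&nfast), (long)atomic_load(&nslow));
	return (0);
}

Because each side issues a sequentially consistent fence between its own store and its load of the other side's state, at least one of the two must observe the other: either the reader sees the counter above zero and backs off to the slow path, or the writer sees the reader's slot set and spins until it clears. That is the same invariant vfs_op_barrier_wait() relies on in the patch above.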