Index: head/sys/fs/tmpfs/tmpfs_subr.c =================================================================== --- head/sys/fs/tmpfs/tmpfs_subr.c +++ head/sys/fs/tmpfs/tmpfs_subr.c @@ -190,8 +190,6 @@ /* If the root directory of the 'tmp' file system is not yet * allocated, this must be the request to do it. */ MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR)); - KASSERT(tmp->tm_root == NULL || mp->mnt_writeopcount > 0, - ("creating node not under vn_start_write")); MPASS(IFF(type == VLNK, target != NULL)); MPASS(IFF(type == VBLK || type == VCHR, rdev != VNOVAL)); Index: head/sys/kern/vfs_default.c =================================================================== --- head/sys/kern/vfs_default.c +++ head/sys/kern/vfs_default.c @@ -607,7 +607,7 @@ } if (vfs_op_thread_enter(mp)) { if (mp == vp->v_mount) - MNT_REF_UNLOCKED(mp); + vfs_mp_count_add_pcpu(mp, ref, 1); else mp = NULL; vfs_op_thread_exit(mp); Index: head/sys/kern/vfs_mount.c =================================================================== --- head/sys/kern/vfs_mount.c +++ head/sys/kern/vfs_mount.c @@ -126,6 +126,12 @@ lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0); mp->mnt_thread_in_ops_pcpu = uma_zalloc_pcpu(pcpu_zone_int, M_WAITOK | M_ZERO); + mp->mnt_ref_pcpu = uma_zalloc_pcpu(pcpu_zone_int, + M_WAITOK | M_ZERO); + mp->mnt_lockref_pcpu = uma_zalloc_pcpu(pcpu_zone_int, + M_WAITOK | M_ZERO); + mp->mnt_writeopcount_pcpu = uma_zalloc_pcpu(pcpu_zone_int, + M_WAITOK | M_ZERO); mp->mnt_ref = 0; mp->mnt_vfs_ops = 1; return (0); @@ -137,6 +143,9 @@ struct mount *mp; mp = (struct mount *)mem; + uma_zfree_pcpu(pcpu_zone_int, mp->mnt_writeopcount_pcpu); + uma_zfree_pcpu(pcpu_zone_int, mp->mnt_lockref_pcpu); + uma_zfree_pcpu(pcpu_zone_int, mp->mnt_ref_pcpu); uma_zfree_pcpu(pcpu_zone_int, mp->mnt_thread_in_ops_pcpu); lockdestroy(&mp->mnt_explock); mtx_destroy(&mp->mnt_listmtx); @@ -452,7 +461,7 @@ CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { - MNT_REF_UNLOCKED(mp); + vfs_mp_count_add_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } @@ -468,7 +477,7 @@ CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { - MNT_REL_UNLOCKED(mp); + vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } @@ -533,6 +542,8 @@ if (mp->mnt_vfs_ops == 0) panic("%s: entered with zero vfs_ops\n", __func__); + vfs_assert_mount_counters(mp); + MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_REFEXPIRE; if (mp->mnt_kern_flag & MNTK_MWAIT) { @@ -1382,6 +1393,7 @@ void vfs_op_enter(struct mount *mp) { + int cpu; MNT_ILOCK(mp); mp->mnt_vfs_ops++; @@ -1395,7 +1407,20 @@ */ atomic_thread_fence_seq_cst(); vfs_op_barrier_wait(mp); + /* + * Paired with a fence in vfs_op_thread_exit(). + */ + atomic_thread_fence_acq(); + CPU_FOREACH(cpu) { + mp->mnt_ref += + zpcpu_replace_cpu(mp->mnt_ref_pcpu, 0, cpu); + mp->mnt_lockref += + zpcpu_replace_cpu(mp->mnt_lockref_pcpu, 0, cpu); + mp->mnt_writeopcount += + zpcpu_replace_cpu(mp->mnt_writeopcount_pcpu, 0, cpu); + } MNT_IUNLOCK(mp); + vfs_assert_mount_counters(mp); } void @@ -1433,6 +1458,93 @@ while (atomic_load_int(in_op)) cpu_spinwait(); } +} + +#ifdef DIAGNOSTIC +void +vfs_assert_mount_counters(struct mount *mp) +{ + int cpu; + + if (mp->mnt_vfs_ops == 0) + return; + + CPU_FOREACH(cpu) { + if (*(int *)zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu) != 0 || + *(int *)zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu) != 0 || + *(int *)zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu) != 0) + vfs_dump_mount_counters(mp); + } +} + +void +vfs_dump_mount_counters(struct mount *mp) +{ + int cpu, *count; + int ref, lockref, writeopcount; + + printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops); + + printf(" ref : "); + ref = mp->mnt_ref; + CPU_FOREACH(cpu) { + count = zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu); + printf("%d ", *count); + ref += *count; + } + printf("\n"); + printf(" lockref : "); + lockref = mp->mnt_lockref; + CPU_FOREACH(cpu) { + count = zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu); + printf("%d ", *count); + lockref += *count; + } + printf("\n"); + printf("writeopcount: "); + writeopcount = mp->mnt_writeopcount; + CPU_FOREACH(cpu) { + count = zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu); + printf("%d ", *count); + writeopcount += *count; + } + printf("\n"); + + printf("counter struct total\n"); + printf("ref %-5d %-5d\n", mp->mnt_ref, ref); + printf("lockref %-5d %-5d\n", mp->mnt_lockref, lockref); + printf("writeopcount %-5d %-5d\n", mp->mnt_writeopcount, writeopcount); + + panic("invalid counts on struct mount"); +} +#endif + +int +vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which) +{ + int *base, *pcpu; + int cpu, sum; + + switch (which) { + case MNT_COUNT_REF: + base = &mp->mnt_ref; + pcpu = mp->mnt_ref_pcpu; + break; + case MNT_COUNT_LOCKREF: + base = &mp->mnt_lockref; + pcpu = mp->mnt_lockref_pcpu; + break; + case MNT_COUNT_WRITEOPCOUNT: + base = &mp->mnt_writeopcount; + pcpu = mp->mnt_writeopcount_pcpu; + break; + } + + sum = *base; + CPU_FOREACH(cpu) { + sum += *(int *)zpcpu_get_cpu(pcpu, cpu); + } + return (sum); } /* Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c +++ head/sys/kern/vfs_subr.c @@ -645,8 +645,8 @@ MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); - MNT_REF_UNLOCKED(mp); - atomic_add_int(&mp->mnt_lockref, 1); + vfs_mp_count_add_pcpu(mp, ref, 1); + vfs_mp_count_add_pcpu(mp, lockref, 1); vfs_op_thread_exit(mp); if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); @@ -654,6 +654,7 @@ } MNT_ILOCK(mp); + vfs_assert_mount_counters(mp); MNT_REF(mp); /* * If mount point is currently being unmounted, sleep until the @@ -685,7 +686,7 @@ } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); - atomic_add_int(&mp->mnt_lockref, 1); + mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } @@ -702,17 +703,23 @@ if (vfs_op_thread_enter(mp)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); - c = atomic_fetchadd_int(&mp->mnt_lockref, -1) - 1; - KASSERT(c >= 0, ("%s: negative mnt_lockref %d\n", __func__, c)); - MNT_REL_UNLOCKED(mp); + vfs_mp_count_sub_pcpu(mp, lockref, 1); + vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); + vfs_assert_mount_counters(mp); MNT_REL(mp); - c = atomic_fetchadd_int(&mp->mnt_lockref, -1) - 1; - KASSERT(c >= 0, ("%s: negative mnt_lockref %d\n", __func__, c)); + c = --mp->mnt_lockref; + if (mp->mnt_vfs_ops == 0) { + MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); + MNT_IUNLOCK(mp); + return; + } + if (c < 0) + vfs_dump_mount_counters(mp); if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); @@ -4040,16 +4047,19 @@ if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); - db_printf(" mnt_ref = %d\n", mp->mnt_ref); + db_printf(" mnt_ref = %d (with %d in the struct)\n", + vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_activevnodelistsize = %d\n", mp->mnt_activevnodelistsize); - db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); + db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", + vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); - db_printf(" mnt_lockref = %d\n", mp->mnt_lockref); + db_printf(" mnt_lockref = %d (with %d in the struct)\n", + vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); Index: head/sys/kern/vfs_vnops.c =================================================================== --- head/sys/kern/vfs_vnops.c +++ head/sys/kern/vfs_vnops.c @@ -1628,7 +1628,7 @@ if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 && vfs_op_thread_enter(mp)) { MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); - atomic_add_int(&mp->mnt_writeopcount, 1); + vfs_mp_count_add_pcpu(mp, writeopcount, 1); vfs_op_thread_exit(mp); return (0); } @@ -1660,7 +1660,7 @@ } if (flags & V_XSLEEP) goto unlock; - atomic_add_int(&mp->mnt_writeopcount, 1); + mp->mnt_writeopcount++; unlock: if (error != 0 || (flags & V_XSLEEP) != 0) MNT_REL(mp); @@ -1797,19 +1797,23 @@ return; if (vfs_op_thread_enter(mp)) { - c = atomic_fetchadd_int(&mp->mnt_writeopcount, -1) - 1; - if (c < 0) - panic("vn_finished_write: invalid writeopcount %d", c); - MNT_REL_UNLOCKED(mp); + vfs_mp_count_sub_pcpu(mp, writeopcount, 1); + vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); + vfs_assert_mount_counters(mp); MNT_REL(mp); - c = atomic_fetchadd_int(&mp->mnt_writeopcount, -1) - 1; + c = --mp->mnt_writeopcount; + if (mp->mnt_vfs_ops == 0) { + MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0); + MNT_IUNLOCK(mp); + return; + } if (c < 0) - panic("vn_finished_write: invalid writeopcount %d", c); + vfs_dump_mount_counters(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0) wakeup(&mp->mnt_writeopcount); MNT_IUNLOCK(mp); @@ -1852,6 +1856,7 @@ vfs_op_enter(mp); MNT_ILOCK(mp); + vfs_assert_mount_counters(mp); if (mp->mnt_susp_owner == curthread) { vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); @@ -1909,7 +1914,7 @@ curthread->td_pflags &= ~TDP_IGNSUSP; if ((flags & VR_START_WRITE) != 0) { MNT_REF(mp); - atomic_add_int(&mp->mnt_writeopcount, 1); + mp->mnt_writeopcount++; } MNT_IUNLOCK(mp); if ((flags & VR_NO_SUSPCLR) == 0) Index: head/sys/sys/mount.h =================================================================== --- head/sys/sys/mount.h +++ head/sys/sys/mount.h @@ -228,6 +228,9 @@ TAILQ_HEAD(, mount) mnt_uppers; /* (m) upper mounts over us*/ int mnt_vfs_ops; /* (i) pending vfs ops */ int *mnt_thread_in_ops_pcpu; + int *mnt_ref_pcpu; + int *mnt_lockref_pcpu; + int *mnt_writeopcount_pcpu; }; /* @@ -268,25 +271,16 @@ #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) #define MNT_MTX(mp) (&(mp)->mnt_mtx) -#define MNT_REF_UNLOCKED(mp) do { \ - atomic_add_int(&(mp)->mnt_ref, 1); \ -} while (0) -#define MNT_REL_UNLOCKED(mp) do { \ - int _c; \ - _c = atomic_fetchadd_int(&(mp)->mnt_ref, -1) - 1; \ - KASSERT(_c >= 0, ("negative mnt_ref %d", _c)); \ -} while (0) - #define MNT_REF(mp) do { \ mtx_assert(MNT_MTX(mp), MA_OWNED); \ - atomic_add_int(&(mp)->mnt_ref, 1); \ + mp->mnt_ref++; \ } while (0) #define MNT_REL(mp) do { \ - int _c; \ mtx_assert(MNT_MTX(mp), MA_OWNED); \ - _c = atomic_fetchadd_int(&(mp)->mnt_ref, -1) - 1; \ - KASSERT(_c >= 0, ("negative mnt_ref %d", _c)); \ - if (_c == 0) \ + (mp)->mnt_ref--; \ + if ((mp)->mnt_vfs_ops && (mp)->mnt_ref < 0) \ + vfs_dump_mount_counters(mp); \ + if ((mp)->mnt_ref == 0 && (mp)->mnt_vfs_ops) \ wakeup((mp)); \ } while (0) @@ -959,6 +953,17 @@ void vfs_op_exit_locked(struct mount *); void vfs_op_exit(struct mount *); +#ifdef DIAGNOSTIC +void vfs_assert_mount_counters(struct mount *); +void vfs_dump_mount_counters(struct mount *); +#else +#define vfs_assert_mount_counters(mp) do { } while (0) +#define vfs_dump_mount_counters(mp) do { } while (0) +#endif + +enum mount_counter { MNT_COUNT_REF, MNT_COUNT_LOCKREF, MNT_COUNT_WRITEOPCOUNT }; +int vfs_mount_fetch_counter(struct mount *, enum mount_counter); + /* * We mark ourselves as entering the section and post a sequentially consistent * fence, meaning the store is completed before we get into the section and @@ -976,24 +981,39 @@ * before making any changes or only make changes safe while the section is * executed. */ +#define vfs_op_thread_entered(mp) ({ \ + MPASS(curthread->td_critnest > 0); \ + *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) == 1; \ +}) #define vfs_op_thread_enter(mp) ({ \ - struct mount *_mp = (mp); \ bool _retval = true; \ critical_enter(); \ - *(int *)zpcpu_get(_mp->mnt_thread_in_ops_pcpu) = 1; \ + MPASS(!vfs_op_thread_entered(mp)); \ + *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 1; \ atomic_thread_fence_seq_cst(); \ - if (__predict_false(_mp->mnt_vfs_ops > 0)) { \ - vfs_op_thread_exit(_mp); \ + if (__predict_false(mp->mnt_vfs_ops > 0)) { \ + vfs_op_thread_exit(mp); \ _retval = false; \ } \ _retval; \ }) #define vfs_op_thread_exit(mp) do { \ + MPASS(vfs_op_thread_entered(mp)); \ atomic_thread_fence_rel(); \ *(int *)zpcpu_get(mp->mnt_thread_in_ops_pcpu) = 0; \ critical_exit(); \ +} while (0) + +#define vfs_mp_count_add_pcpu(mp, count, val) do { \ + MPASS(vfs_op_thread_entered(mp)); \ + (*(int *)zpcpu_get(mp->mnt_##count##_pcpu)) += val; \ +} while (0) + +#define vfs_mp_count_sub_pcpu(mp, count, val) do { \ + MPASS(vfs_op_thread_entered(mp)); \ + (*(int *)zpcpu_get(mp->mnt_##count##_pcpu)) -= val; \ } while (0) #else /* !_KERNEL */ Index: head/sys/sys/pcpu.h =================================================================== --- head/sys/sys/pcpu.h +++ head/sys/sys/pcpu.h @@ -243,6 +243,18 @@ } /* + * This operation is NOT atomic and does not post any barriers. + * If you use this the assumption is that the target CPU will not + * be modifying this variable. + * If you need atomicity use xchg. + * */ +#define zpcpu_replace_cpu(base, val, cpu) ({ \ + __typeof(val) _old = *(__typeof(val) *)zpcpu_get_cpu(base, cpu);\ + *(__typeof(val) *)zpcpu_get_cpu(base, cpu) = val; \ + _old; \ +}) + +/* * Machine dependent callouts. cpu_pcpu_init() is responsible for * initializing machine dependent fields of struct pcpu, and * db_show_mdpcpu() is responsible for handling machine dependent Index: head/sys/ufs/ffs/ffs_softdep.c =================================================================== --- head/sys/ufs/ffs/ffs_softdep.c +++ head/sys/ufs/ffs/ffs_softdep.c @@ -13403,10 +13403,11 @@ * (fs_minfree). */ if (resource == FLUSH_INODES_WAIT) { - needed = vp->v_mount->mnt_writeopcount + 2; + needed = vfs_mount_fetch_counter(vp->v_mount, + MNT_COUNT_WRITEOPCOUNT) + 2; } else if (resource == FLUSH_BLOCKS_WAIT) { - needed = (vp->v_mount->mnt_writeopcount + 2) * - fs->fs_contigsumsize; + needed = (vfs_mount_fetch_counter(vp->v_mount, + MNT_COUNT_WRITEOPCOUNT) + 2) * fs->fs_contigsumsize; if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE)) needed += fragstoblks(fs, roundup((fs->fs_dsize * fs->fs_minfree / 100) -