Index: sys/fs/tmpfs/tmpfs_subr.c
===================================================================
--- sys/fs/tmpfs/tmpfs_subr.c
+++ sys/fs/tmpfs/tmpfs_subr.c
@@ -190,8 +190,6 @@
 	/* If the root directory of the 'tmp' file system is not yet
 	 * allocated, this must be the request to do it. */
 	MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR));
-	KASSERT(tmp->tm_root == NULL || mp->mnt_writeopcount > 0,
-	    ("creating node not under vn_start_write"));
 
 	MPASS(IFF(type == VLNK, target != NULL));
 	MPASS(IFF(type == VBLK || type == VCHR, rdev != VNOVAL));
Index: sys/kern/vfs_mount.c
===================================================================
--- sys/kern/vfs_mount.c
+++ sys/kern/vfs_mount.c
@@ -126,6 +126,12 @@
 	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
 	mp->mnt_thread_in_ops_pcpu = uma_zalloc_pcpu(pcpu_zone_int,
 	    M_WAITOK | M_ZERO);
+	mp->mnt_ref_pcpu = uma_zalloc_pcpu(pcpu_zone_int,
+	    M_WAITOK | M_ZERO);
+	mp->mnt_lockref_pcpu = uma_zalloc_pcpu(pcpu_zone_int,
+	    M_WAITOK | M_ZERO);
+	mp->mnt_writeopcount_pcpu = uma_zalloc_pcpu(pcpu_zone_int,
+	    M_WAITOK | M_ZERO);
 	mp->mnt_ref = 0;
 	mp->mnt_vfs_ops = 1;
 	return (0);
@@ -137,6 +143,9 @@
 	struct mount *mp;
 
 	mp = (struct mount *)mem;
+	uma_zfree_pcpu(pcpu_zone_int, mp->mnt_writeopcount_pcpu);
+	uma_zfree_pcpu(pcpu_zone_int, mp->mnt_lockref_pcpu);
+	uma_zfree_pcpu(pcpu_zone_int, mp->mnt_ref_pcpu);
 	uma_zfree_pcpu(pcpu_zone_int, mp->mnt_thread_in_ops_pcpu);
 	lockdestroy(&mp->mnt_explock);
 	mtx_destroy(&mp->mnt_listmtx);
@@ -533,6 +542,8 @@
 	if (mp->mnt_vfs_ops == 0)
 		panic("%s: entered with zero vfs_ops\n", __func__);
 
+	vfs_assert_mount_counters(mp);
+
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= MNTK_REFEXPIRE;
 	if (mp->mnt_kern_flag & MNTK_MWAIT) {
@@ -1382,6 +1393,7 @@
 void
 vfs_op_enter(struct mount *mp)
 {
+	int cpu;
 
 	MNT_ILOCK(mp);
 	mp->mnt_vfs_ops++;
@@ -1395,7 +1407,17 @@
 	 */
 	atomic_thread_fence_seq_cst();
 	vfs_op_barrier_wait(mp);
+	atomic_thread_fence_acq();
+	CPU_FOREACH(cpu) {
+		mp->mnt_ref +=
+		    zpcpu_replace_cpu(mp->mnt_ref_pcpu, 0, cpu);
+		mp->mnt_lockref +=
+		    zpcpu_replace_cpu(mp->mnt_lockref_pcpu, 0, cpu);
+		mp->mnt_writeopcount +=
+		    zpcpu_replace_cpu(mp->mnt_writeopcount_pcpu, 0, cpu);
+	}
 	MNT_IUNLOCK(mp);
+	vfs_assert_mount_counters(mp);
 }
 
 void
@@ -1407,6 +1429,7 @@
 	if (mp->mnt_vfs_ops <= 0)
 		panic("%s: invalid vfs_ops count %d for mp %p\n",
 		    __func__, mp->mnt_vfs_ops, mp);
+	vfs_assert_mount_counters(mp);
 	mp->mnt_vfs_ops--;
 }
 
@@ -1435,6 +1458,112 @@
 	}
 }
 
+#ifdef INVARIANTS
+/*
+ * Check that no per-CPU counter holds a pending value while vfs ops
+ * are in progress, i.e. that vfs_op_enter() has folded all of them
+ * into the counters kept in struct mount.  On a mismatch the counters
+ * are dumped and the kernel panics.
+ */
+void
+vfs_assert_mount_counters(struct mount *mp)
+{
+	int cpu;
+
+	if (mp->mnt_vfs_ops == 0)
+		return;
+
+	CPU_FOREACH(cpu) {
+		if (*(int *)zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu) != 0 ||
+		    *(int *)zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu) != 0 ||
+		    *(int *)zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu) != 0)
+			vfs_dump_mount_counters(mp);
+	}
+}
+#endif
+
+/*
+ * Print every mount counter, both the per-CPU parts and the value kept
+ * in struct mount, together with the resulting totals, then panic.
+ * This is compiled unconditionally: MNT_REL(), vfs_unbusy() and
+ * vn_finished_write() rely on it to report a negative counter.
+ */
+void
+vfs_dump_mount_counters(struct mount *mp)
+{
+	int cpu, *count;
+	int ref, lockref, writeopcount;
+
+	printf("%s: mp %p vfs_ops %d\n", __func__, mp, mp->mnt_vfs_ops);
+
+	printf(" ref : ");
+	ref = mp->mnt_ref;
+	CPU_FOREACH(cpu) {
+		count = zpcpu_get_cpu(mp->mnt_ref_pcpu, cpu);
+		printf("%d ", *count);
+		ref += *count;
+	}
+	printf("\n");
+	printf(" lockref : ");
+	lockref = mp->mnt_lockref;
+	CPU_FOREACH(cpu) {
+		count = zpcpu_get_cpu(mp->mnt_lockref_pcpu, cpu);
+		printf("%d ", *count);
+		lockref += *count;
+	}
+	printf("\n");
+	printf("writeopcount: ");
+	writeopcount = mp->mnt_writeopcount;
+	CPU_FOREACH(cpu) {
+		count = zpcpu_get_cpu(mp->mnt_writeopcount_pcpu, cpu);
+		printf("%d ", *count);
+		writeopcount += *count;
+	}
+	printf("\n");
+
+	printf("counter struct total\n");
+	printf("ref %-5d %-5d\n", mp->mnt_ref, ref);
+	printf("lockref %-5d %-5d\n", mp->mnt_lockref, lockref);
+	printf("writeopcount %-5d %-5d\n", mp->mnt_writeopcount, writeopcount);
+
+	panic("invalid counts on struct mount");
+}
+
+/*
+ * Return the total for the requested counter: the value kept in struct
+ * mount plus the parts still sitting in the per-CPU counters.  No lock
+ * is taken here, so the result is only a snapshot.
+ */
+int
+vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which)
+{
+	int cpu, sum;
+	int *base, *pcpu;
+
+	switch (which) {
+	case MNT_COUNT_REF:
+		base = &mp->mnt_ref;
+		pcpu = mp->mnt_ref_pcpu;
+		break;
+	case MNT_COUNT_LOCKREF:
+		base = &mp->mnt_lockref;
+		pcpu = mp->mnt_lockref_pcpu;
+		break;
+	case MNT_COUNT_WRITEOPCOUNT:
+		base = &mp->mnt_writeopcount;
+		pcpu = mp->mnt_writeopcount_pcpu;
+		break;
+	default:
+		panic("%s: invalid mount counter %d", __func__, (int)which);
+	}
+
+	sum = *base;
+	CPU_FOREACH(cpu) {
+		sum += *(int *)zpcpu_get_cpu(pcpu, cpu);
+	}
+	return (sum);
+}
+
 /*
  * Do the actual filesystem unmount.
  */
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -646,7 +646,7 @@
 		MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
 		MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
 		MNT_REF_UNLOCKED(mp);
-		atomic_add_int(&mp->mnt_lockref, 1);
+		(*(int *)zpcpu_get(mp->mnt_lockref_pcpu))++;
 		vfs_op_thread_exit(mp);
 		if (flags & MBF_MNTLSTLOCK)
 			mtx_unlock(&mountlist_mtx);
@@ -654,6 +654,7 @@
 	}
 
 	MNT_ILOCK(mp);
+	vfs_assert_mount_counters(mp);
 	MNT_REF(mp);
 	/*
 	 * If mount point is currently being unmounted, sleep until the
@@ -685,7 +686,7 @@
 	}
 	if (flags & MBF_MNTLSTLOCK)
 		mtx_unlock(&mountlist_mtx);
-	atomic_add_int(&mp->mnt_lockref, 1);
+	mp->mnt_lockref++;
 	MNT_IUNLOCK(mp);
 	return (0);
 }
@@ -702,17 +703,23 @@
 
 	if (vfs_op_thread_enter(mp)) {
 		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
-		c = atomic_fetchadd_int(&mp->mnt_lockref, -1) - 1;
-		KASSERT(c >= 0, ("%s: negative mnt_lockref %d\n", __func__, c));
+		(*(int *)zpcpu_get(mp->mnt_lockref_pcpu))--;
 		MNT_REL_UNLOCKED(mp);
 		vfs_op_thread_exit(mp);
 		return;
 	}
 
 	MNT_ILOCK(mp);
+	vfs_assert_mount_counters(mp);
 	MNT_REL(mp);
-	c = atomic_fetchadd_int(&mp->mnt_lockref, -1) - 1;
-	KASSERT(c >= 0, ("%s: negative mnt_lockref %d\n", __func__, c));
+	c = --mp->mnt_lockref;
+	if (mp->mnt_vfs_ops == 0) {
+		MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
+		MNT_IUNLOCK(mp);
+		return;
+	}
+	if (c < 0)
+		vfs_dump_mount_counters(mp);
 	if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
 		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
 		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
@@ -4040,16 +4047,19 @@
 	if (jailed(mp->mnt_cred))
 		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
 	db_printf(" }\n");
-	db_printf(" mnt_ref = %d\n", mp->mnt_ref);
+	db_printf(" mnt_ref = %d (with %d in the struct)\n",
+	    vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
 	db_printf(" mnt_gen = %d\n", mp->mnt_gen);
 	db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
 	db_printf(" mnt_activevnodelistsize = %d\n",
 	    mp->mnt_activevnodelistsize);
-	db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount);
+	db_printf(" mnt_writeopcount = %d (with %d in the struct)\n",
+	    vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
 	db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
 	db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
 	db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
-	db_printf(" mnt_lockref = %d\n", mp->mnt_lockref);
+	db_printf(" mnt_lockref = %d (with %d in the struct)\n",
+	    vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
 	db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
 	db_printf(" mnt_secondary_accwrites = %d\n",
 	    mp->mnt_secondary_accwrites);
Index: sys/kern/vfs_vnops.c
===================================================================
--- sys/kern/vfs_vnops.c
+++ sys/kern/vfs_vnops.c
@@ -1637,7 +1637,7 @@
 
 	if (vfs_op_thread_enter(mp)) {
 		MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
-		atomic_add_int(&mp->mnt_writeopcount, 1);
+		(*(int *)zpcpu_get(mp->mnt_writeopcount_pcpu))++;
 		vfs_op_thread_exit(mp);
 		return (0);
 	}
@@ -1667,7 +1667,7 @@
 	}
 	if (flags & V_XSLEEP)
 		goto unlock;
-	atomic_add_int(&mp->mnt_writeopcount, 1);
+	mp->mnt_writeopcount++;
 unlock:
 	if (error != 0 || (flags & V_XSLEEP) != 0)
 		MNT_REL(mp);
@@ -1804,19 +1804,25 @@
 		return;
 
 	if (vfs_op_thread_enter(mp)) {
-		c = atomic_fetchadd_int(&mp->mnt_writeopcount, -1) - 1;
-		if (c < 0)
-			panic("vn_finished_write: invalid writeopcount %d", c);
+		(*(int *)zpcpu_get(mp->mnt_writeopcount_pcpu))--;
 		MNT_REL_UNLOCKED(mp);
 		vfs_op_thread_exit(mp);
 		return;
 	}
 
 	MNT_ILOCK(mp);
+	vfs_assert_mount_counters(mp);
 	MNT_REL(mp);
-	c = atomic_fetchadd_int(&mp->mnt_writeopcount, -1) - 1;
-	if (c < 0)
-		panic("vn_finished_write: invalid writeopcount %d", c);
+	c = --mp->mnt_writeopcount;
+	if (mp->mnt_vfs_ops == 0) {
+		MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
+		MNT_IUNLOCK(mp);
+		return;
+	}
+	if (c < 0) {
+		printf("vn_finished_write: invalid writeopcount %d\n", c);
+		vfs_dump_mount_counters(mp);
+	}
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0)
 		wakeup(&mp->mnt_writeopcount);
 	MNT_IUNLOCK(mp);
@@ -1859,6 +1865,7 @@
 	vfs_op_enter(mp);
 
 	MNT_ILOCK(mp);
+	vfs_assert_mount_counters(mp);
 	if (mp->mnt_susp_owner == curthread) {
 		vfs_op_exit_locked(mp);
 		MNT_IUNLOCK(mp);
@@ -1916,7 +1923,7 @@
 		curthread->td_pflags &= ~TDP_IGNSUSP;
 		if ((flags & VR_START_WRITE) != 0) {
 			MNT_REF(mp);
-			atomic_add_int(&mp->mnt_writeopcount, 1);
+			mp->mnt_writeopcount++;
 		}
 		MNT_IUNLOCK(mp);
 		if ((flags & VR_NO_SUSPCLR) == 0)
Index: sys/sys/mount.h
===================================================================
--- sys/sys/mount.h
+++ sys/sys/mount.h
@@ -228,6 +228,9 @@
 	TAILQ_HEAD(, mount) mnt_uppers;		/* (m) upper mounts over us*/
 	int		mnt_vfs_ops;		/* (i) pending vfs ops */
 	int		*mnt_thread_in_ops_pcpu;
+	int		*mnt_ref_pcpu;
+	int		*mnt_lockref_pcpu;
+	int		*mnt_writeopcount_pcpu;
 };
 
 /*
@@ -269,24 +272,22 @@
 #define	MNT_MTX(mp)		(&(mp)->mnt_mtx)
 
 #define	MNT_REF_UNLOCKED(mp)	do {					\
-	atomic_add_int(&(mp)->mnt_ref, 1);				\
+	(*(int *)zpcpu_get((mp)->mnt_ref_pcpu))++;			\
 } while (0)
 #define	MNT_REL_UNLOCKED(mp)	do {					\
-	int _c;								\
-	_c = atomic_fetchadd_int(&(mp)->mnt_ref, -1) - 1;		\
-	KASSERT(_c >= 0, ("negative mnt_ref %d", _c));			\
+	(*(int *)zpcpu_get((mp)->mnt_ref_pcpu))--;			\
 } while (0)
 
 #define	MNT_REF(mp)	do {						\
 	mtx_assert(MNT_MTX(mp), MA_OWNED);				\
-	atomic_add_int(&(mp)->mnt_ref, 1);				\
+	(mp)->mnt_ref++;						\
 } while (0)
 #define	MNT_REL(mp)	do {						\
-	int _c;								\
 	mtx_assert(MNT_MTX(mp), MA_OWNED);				\
-	_c = atomic_fetchadd_int(&(mp)->mnt_ref, -1) - 1;		\
-	KASSERT(_c >= 0, ("negative mnt_ref %d", _c));			\
-	if (_c == 0)							\
+	(mp)->mnt_ref--;						\
+	if ((mp)->mnt_vfs_ops && (mp)->mnt_ref < 0)			\
+		vfs_dump_mount_counters(mp);				\
+	if ((mp)->mnt_ref == 0 && (mp)->mnt_vfs_ops)			\
 		wakeup((mp));						\
 } while (0)
 
@@ -959,6 +960,22 @@
 void	vfs_op_exit_locked(struct mount *);
 void	vfs_op_exit(struct mount *);
 
+/*
+ * vfs_assert_mount_counters() is a debugging check that compiles away
+ * without INVARIANTS.  vfs_dump_mount_counters() is always available
+ * since MNT_REL() and others call it when a counter goes negative.
+ */
+#ifdef INVARIANTS
+void	vfs_assert_mount_counters(struct mount *);
+#else
+#define vfs_assert_mount_counters(mp) do { } while (0)
+#endif
+void	vfs_dump_mount_counters(struct mount *);
+
+/* Selects which counter vfs_mount_fetch_counter() sums up. */
+enum mount_counter { MNT_COUNT_REF, MNT_COUNT_LOCKREF, MNT_COUNT_WRITEOPCOUNT };
+int	vfs_mount_fetch_counter(struct mount *, enum mount_counter);
+
 /*
  * We mark ourselves as entering the section and post a sequentially consistent
  * fence, meaning the store is completed before we get into the section and
Index: sys/sys/pcpu.h
===================================================================
--- sys/sys/pcpu.h
+++ sys/sys/pcpu.h
@@ -242,6 +242,22 @@
 	return ((char *)(base) + UMA_PCPU_ALLOC_SIZE * cpu);
 }
 
+/*
+ * Store 'val' in the slot of 'base' that belongs to 'cpu' and evaluate
+ * to the value the slot held before.
+ *
+ * This operation is NOT atomic and does not post any barriers.
+ * If you use this the assumption is that the target CPU will not
+ * be modifying this variable.
+ * If you need atomicity use xchg.
+ */
+#define zpcpu_replace_cpu(base, val, cpu) ({				\
+	__typeof(val) *_ptr = (__typeof(val) *)zpcpu_get_cpu(base, cpu);\
+	__typeof(val) _old = *_ptr;					\
+	*_ptr = (val);							\
+	_old;								\
+})
+
 /*
  * Machine dependent callouts. cpu_pcpu_init() is responsible for
  * initializing machine dependent fields of struct pcpu, and
Index: sys/ufs/ffs/ffs_softdep.c
===================================================================
--- sys/ufs/ffs/ffs_softdep.c
+++ sys/ufs/ffs/ffs_softdep.c
@@ -13403,10 +13403,11 @@
 	 * (fs_minfree).
 	 */
 	if (resource == FLUSH_INODES_WAIT) {
-		needed = vp->v_mount->mnt_writeopcount + 2;
+		needed = vfs_mount_fetch_counter(vp->v_mount,
+		    MNT_COUNT_WRITEOPCOUNT) + 2;
 	} else if (resource == FLUSH_BLOCKS_WAIT) {
-		needed = (vp->v_mount->mnt_writeopcount + 2) *
-		    fs->fs_contigsumsize;
+		needed = (vfs_mount_fetch_counter(vp->v_mount,
+		    MNT_COUNT_WRITEOPCOUNT) + 2) * fs->fs_contigsumsize;
 		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE))
 			needed += fragstoblks(fs,
 			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -