Index: sys/fs/nullfs/null_vnops.c =================================================================== --- sys/fs/nullfs/null_vnops.c +++ sys/fs/nullfs/null_vnops.c @@ -227,6 +227,7 @@ struct vnode *old_vps[VDESC_MAX_VPS]; struct vnode **vps_p[VDESC_MAX_VPS]; struct vnode ***vppp; + struct vnode *lvp; struct vnodeop_desc *descp = ap->a_desc; int reles, i; @@ -295,6 +296,23 @@ if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) break; /* bail out at end of list */ if (old_vps[i]) { + lvp = *(vps_p[i]); + + /* + * If lowervp was unlocked during VOP + * operation, nullfs upper vnode could have + * been reclaimed, which changes its v_vnlock + * back to private v_lock. In this case we + * must move lock ownership from lower to + * upper (reclaimed) vnode. + */ + if (lvp != NULLVP && + VOP_ISLOCKED(lvp) == LK_EXCLUSIVE && + old_vps[i]->v_vnlock != lvp->v_vnlock) { + VOP_UNLOCK(lvp); + VOP_LOCK(old_vps[i], LK_EXCLUSIVE | LK_RETRY); + } + *(vps_p[i]) = old_vps[i]; #if 0 if (reles & VDESC_VP0_WILLUNLOCK) Index: sys/kern/uipc_usrreq.c =================================================================== --- sys/kern/uipc_usrreq.c +++ sys/kern/uipc_usrreq.c @@ -671,6 +671,8 @@ vput(nd.ni_dvp); if (error) { vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; goto error; } vp = nd.ni_vp; Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c +++ sys/kern/vfs_subr.c @@ -1794,6 +1794,8 @@ VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, ("Dangling rangelock waiters")); + VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp, + ("Leaked inactivation")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); @@ -3803,7 +3805,7 @@ struct thread *td; struct mount *mp; vm_object_t object; - bool active, oweinact; + bool active, doinginact, oweinact; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); @@ -3825,11 +3827,17 @@ 
vp->v_irflag |= VIRF_DOOMED; /* - * Check to see if the vnode is in use. If so, we have to call - * VOP_CLOSE() and VOP_INACTIVE(). + * Check to see if the vnode is in use. If so, we have to + * call VOP_CLOSE() and VOP_INACTIVE(). + * + * It could be that VOP_INACTIVE() requested reclamation, in + * which case we should avoid recursion, so check + * VI_DOINGINACT. This is not precise but good enough. */ active = vp->v_usecount > 0; oweinact = (vp->v_iflag & VI_OWEINACT) != 0; + doinginact = (vp->v_iflag & VI_DOINGINACT) != 0; + /* * If we need to do inactive VI_OWEINACT will be set. */ @@ -3850,7 +3858,7 @@ */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); - if (oweinact || active) { + if ((oweinact || active) && !doinginact) { VI_LOCK(vp); vinactivef(vp); VI_UNLOCK(vp); Index: sys/kern/vfs_syscalls.c =================================================================== --- sys/kern/vfs_syscalls.c +++ sys/kern/vfs_syscalls.c @@ -1384,6 +1384,8 @@ NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; return (error); } @@ -1470,6 +1472,8 @@ vput(nd.ni_dvp); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); + if (error == ERELOOKUP) + goto restart; return (error); } @@ -1568,7 +1572,7 @@ return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag); - } while (error == EAGAIN); + } while (error == EAGAIN || error == ERELOOKUP); return (error); } @@ -1741,6 +1745,8 @@ NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, tmppath); @@ -1791,6 +1797,8 @@ NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; return (error); } @@ -1937,6 +1945,8 @@ vrele(vp); else vput(vp); + if (error == ERELOOKUP) + goto restart; fdout: if (fp != NULL) fdrop(fp, td); @@ -3710,6 +3720,8 @@ vrele(fromnd.ni_startdir); if 
(error == -1) return (0); + if (error == ERELOOKUP) + goto again; return (error); } @@ -3803,6 +3815,8 @@ if (error == 0) vput(nd.ni_vp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; return (error); } @@ -3903,6 +3917,8 @@ vrele(nd.ni_dvp); else vput(nd.ni_dvp); + if (error == ERELOOKUP) + goto restart; fdout: if (fp != NULL) fdrop(fp, td); @@ -4416,7 +4432,8 @@ if (error != 0) return (error); VOP_UNLOCK(vp); - } while ((error = kern_linkat_vp(td, vp, fd, path, pathseg)) == EAGAIN); + error = kern_linkat_vp(td, vp, fd, path, pathseg); + } while (error == EAGAIN || error == ERELOOKUP); return (error); } Index: sys/kern/vfs_vnops.c =================================================================== --- sys/kern/vfs_vnops.c +++ sys/kern/vfs_vnops.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -274,6 +275,8 @@ vn_finished_write(mp); if (error) { NDFREE(ndp, NDF_ONLY_PNBUF); + if (error == ERELOOKUP) + goto restart; return (error); } fmode &= ~O_TRUNC; @@ -3315,3 +3318,66 @@ return (error); } + +void +vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2, + bool vp2_locked) +{ + int error; + + if (vp1 != NULL) { + if (vp1_locked) + ASSERT_VOP_ELOCKED(vp1, "vp1"); + else + ASSERT_VOP_UNLOCKED(vp1, "vp1"); + } else { + vp1_locked = true; + } + if (vp2 != NULL) { + if (vp2_locked) + ASSERT_VOP_ELOCKED(vp2, "vp2"); + else + ASSERT_VOP_UNLOCKED(vp2, "vp2"); + } else { + vp2_locked = true; + } + if (!vp1_locked && !vp2_locked) { + vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY); + vp1_locked = true; + } + + for (;;) { + if (vp1_locked && vp2_locked) + break; + if (vp1_locked && vp2 != NULL) { + if (vp1 != NULL) { + error = VOP_LOCK1(vp2, LK_EXCLUSIVE | LK_NOWAIT, + __FILE__, __LINE__); + if (error == 0) + break; + VOP_UNLOCK(vp1); + vp1_locked = false; + pause("vlp1", prng32_bounded(100)); + } + vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY); + vp2_locked = true; + } + if (vp2_locked && vp1 != NULL) { + if (vp2 != NULL) { 
+ error = VOP_LOCK1(vp1, LK_EXCLUSIVE | LK_NOWAIT, + __FILE__, __LINE__); + if (error == 0) + break; + VOP_UNLOCK(vp2); + vp2_locked = false; + pause("vlp2", prng32_bounded(100)); + } + vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY); + vp1_locked = true; + } + } + if (vp1 != NULL) + ASSERT_VOP_ELOCKED(vp1, "vp1 ret"); + if (vp2 != NULL) + ASSERT_VOP_ELOCKED(vp2, "vp2 ret"); +} Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -761,6 +761,9 @@ int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio); +void vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2, + bool vp2_locked); + void vn_seqc_write_begin_unheld_locked(struct vnode *vp); void vn_seqc_write_begin_unheld(struct vnode *vp); void vn_seqc_write_begin_locked(struct vnode *vp); Index: sys/ufs/ffs/ffs_alloc.c =================================================================== --- sys/ufs/ffs/ffs_alloc.c +++ sys/ufs/ffs/ffs_alloc.c @@ -3468,7 +3468,7 @@ break; } dp = VTOI(dvp); - dp->i_offset = 12; /* XXX mastertemplate.dot_reclen */ + SET_I_OFFSET(dp, 12); /* XXX mastertemplate.dot_reclen */ error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, DT_DIR, 0); cache_purge(fdvp); Index: sys/ufs/ffs/ffs_extern.h =================================================================== --- sys/ufs/ffs/ffs_extern.h +++ sys/ufs/ffs/ffs_extern.h @@ -173,6 +173,9 @@ void softdep_freefile(struct vnode *, ino_t, int); int softdep_request_cleanup(struct fs *, struct vnode *, struct ucred *, int); +int softdep_prerename(struct vnode *, struct vnode *, struct vnode *, + struct vnode *); +int softdep_prelink(struct vnode *, struct vnode *, int); void softdep_setup_freeblocks(struct inode *, off_t, int); void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t, int); void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t, Index: sys/ufs/ffs/ffs_inode.c 
=================================================================== --- sys/ufs/ffs/ffs_inode.c +++ sys/ufs/ffs/ffs_inode.c @@ -67,6 +67,17 @@ static int ffs_indirtrunc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, ufs2_daddr_t, int, ufs2_daddr_t *); +static void +ffs_inode_bwrite(struct vnode *vp, struct buf *bp, int flags) +{ + if ((flags & IO_SYNC) != 0) + bwrite(bp); + else if (DOINGASYNC(vp)) + bdwrite(bp); + else + bawrite(bp); +} + /* * Update the access, modified, and inode change times as specified by the * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. Write the inode @@ -357,12 +368,7 @@ DIP_SET(ip, i_size, length); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; - if (flags & IO_SYNC) - bwrite(bp); - else if (DOINGASYNC(vp)) - bdwrite(bp); - else - bawrite(bp); + ffs_inode_bwrite(vp, bp, flags); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); return (ffs_update(vp, waitforupdate)); } @@ -456,6 +462,8 @@ error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); if (error) return (error); + ffs_inode_bwrite(vp, bp, flags); + /* * When we are doing soft updates and the UFS_BALLOC * above fills in a direct block hole with a full sized @@ -468,6 +476,10 @@ fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); + + error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); + if (error) + return (error); ip->i_size = length; DIP_SET(ip, i_size, length); size = blksize(fs, ip, lbn); @@ -478,12 +490,7 @@ allocbuf(bp, size); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; - if (flags & IO_SYNC) - bwrite(bp); - else if (DOINGASYNC(vp)) - bdwrite(bp); - else - bawrite(bp); + ffs_inode_bwrite(vp, bp, flags); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); } /* Index: sys/ufs/ffs/ffs_snapshot.c =================================================================== --- sys/ufs/ffs/ffs_snapshot.c +++ sys/ufs/ffs/ffs_snapshot.c @@ -301,6 +301,8 @@ 
NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(wrtmp); vrele(nd.ni_dvp); + if (error == ERELOOKUP) + goto restart; return (error); } vp = nd.ni_vp; @@ -368,8 +370,12 @@ if (error) goto out; bawrite(nbp); - if (cg % 10 == 0) - ffs_syncvnode(vp, MNT_WAIT, 0); + if (cg % 10 == 0) { + error = ffs_syncvnode(vp, MNT_WAIT, 0); + /* vp possibly reclaimed if unlocked */ + if (error != 0) + goto out; + } } /* * Copy all the cylinder group maps. Although the @@ -391,8 +397,8 @@ goto out; error = cgaccount(cg, vp, nbp, 1); bawrite(nbp); - if (cg % 10 == 0) - ffs_syncvnode(vp, MNT_WAIT, 0); + if (cg % 10 == 0 && error == 0) + error = ffs_syncvnode(vp, MNT_WAIT, 0); if (error) goto out; } Index: sys/ufs/ffs/ffs_softdep.c =================================================================== --- sys/ufs/ffs/ffs_softdep.c +++ sys/ufs/ffs/ffs_softdep.c @@ -609,6 +609,27 @@ panic("softdep_freework called"); } +int +softdep_prerename(fdvp, fvp, tdvp, tvp) + struct vnode *fdvp; + struct vnode *fvp; + struct vnode *tdvp; + struct vnode *tvp; +{ + + panic("softdep_prerename called"); +} + +int +softdep_prelink(dvp, vp, will_direnter) + struct vnode *dvp; + struct vnode *vp; + int will_direnter; +{ + + panic("softdep_prelink called"); +} + #else FEATURE(softupdates, "FFS soft-updates support"); @@ -748,7 +768,7 @@ static void clear_unlinked_inodedep(struct inodedep *); static struct inodedep *first_unlinked_inodedep(struct ufsmount *); static int flush_pagedep_deps(struct vnode *, struct mount *, - struct diraddhd *); + struct diraddhd *, struct buf *); static int free_pagedep(struct pagedep *); static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); @@ -925,7 +945,6 @@ static int journal_space(struct ufsmount *, int); static void journal_suspend(struct ufsmount *); static int journal_unsuspend(struct ufsmount *ump); -static void softdep_prelink(struct vnode *, struct vnode *); static void add_to_journal(struct worklist *); static void 
remove_from_journal(struct worklist *); static bool softdep_excess_items(struct ufsmount *, int); @@ -1389,6 +1408,112 @@ /* List of all filesystems mounted with soft updates */ static TAILQ_HEAD(, mount_softdeps) softdepmounts; +static int +get_parent_vp(struct vnode *vp, struct mount *mp, ino_t inum, struct buf *bp, + struct diraddhd *diraddhdp, struct diraddhd *unfinishedp, + struct vnode **rvp) +{ + struct vnode *pvp; + struct diradd *dap; + int error; + bool bplocked; + + ASSERT_VOP_ELOCKED(vp, "child vnode must be locked"); + for (bplocked = true, pvp = NULL;;) { + error = ffs_vgetf(mp, inum, LK_EXCLUSIVE | LK_NOWAIT, &pvp, + FFSV_FORCEINSMQ); + if (error == 0) { + /* + * Since we could have unlocked vp, the inode + * number could no longer indicate a + * constructed node. In this case, we must + * restart the syscall. + */ + if (VTOI(pvp)->i_mode == 0 || !bplocked) { + if (VTOI(pvp)->i_mode == 0) + vgone(pvp); + vput(pvp); + error = ERELOOKUP; + goto out; + } + + error = 0; + goto out1; + } + if (bp != NULL && bplocked) { + /* + * Requeue unfinished dependencies before + * unlocking buffer, which could make + * diraddhdp invalid. + */ + ACQUIRE_LOCK(VFSTOUFS(mp)); + while ((dap = LIST_FIRST(unfinishedp)) != NULL) { + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist); + } + FREE_LOCK(VFSTOUFS(mp)); + bp->b_vflags &= ~BV_SCANNED; + BUF_NOREC(bp); + BUF_UNLOCK(bp); + bplocked = false; + } + + /* + * Do not drop vnode lock while inactivating. This + * would result in leaks of the VI flags and + * reclaiming of non-truncated vnode. Instead, + * re-schedule inactivation hoping that we would be + * able to sync inode later. 
+ */ + if ((vp->v_iflag & VI_DOINGINACT) != 0) { + VI_LOCK(vp); + vp->v_iflag |= VI_OWEINACT; + VI_UNLOCK(vp); + return (ERELOOKUP); + } + + VOP_UNLOCK(vp); + error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &pvp, + FFSV_FORCEINSMQ); + if (error != 0) { + MPASS(error != ERELOOKUP); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + break; + } + if (VTOI(pvp)->i_mode == 0) { + vgone(pvp); + vput(pvp); + pvp = NULL; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = ERELOOKUP; + break; + } + error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); + if (error == 0) + break; + vput(pvp); + pvp = NULL; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (vp->v_data == NULL) { + error = ENOENT; + break; + } + } + if (bp != NULL) { + MPASS(!bplocked); + error = ERELOOKUP; + } + if (error != 0 && pvp != NULL) { + vput(pvp); + pvp = NULL; + } +out1: + *rvp = pvp; +out: + ASSERT_VOP_ELOCKED(vp, "child vnode must be locked on return"); + return (error); +} + /* * This function cleans the worklist for a filesystem. * Each filesystem running with soft dependencies gets its own @@ -3095,48 +3220,191 @@ return (0); } +static int +softdep_prehandle_vnode(ump, vp) + struct ufsmount *ump; + struct vnode *vp; +{ + int error; + + ASSERT_VOP_ELOCKED(vp, "prehandle"); + if (vp->v_data == NULL) + return (0); + error = VOP_FSYNC(vp, MNT_WAIT, curthread); + if (error != 0) + return (error); + ACQUIRE_LOCK(ump); + process_removes(vp); + process_truncates(vp); + FREE_LOCK(ump); + return (0); +} + +int +softdep_prerename(fdvp, fvp, tdvp, tvp) + struct vnode *fdvp; + struct vnode *fvp; + struct vnode *tdvp; + struct vnode *tvp; +{ + struct ufsmount *ump; + int error; + + ump = VFSTOUFS(fdvp->v_mount); + + if (journal_space(ump, 0)) + return (0); + + VOP_UNLOCK(tdvp); + VOP_UNLOCK(fvp); + if (tvp != NULL && tvp != tdvp) + VOP_UNLOCK(tvp); + + error = softdep_prehandle_vnode(ump, fdvp); + VOP_UNLOCK(fdvp); + if (error != 0) + return (error); + + VOP_LOCK(fvp, LK_EXCLUSIVE | LK_RETRY); + error = softdep_prehandle_vnode(ump, 
fvp); + VOP_UNLOCK(fvp); + if (error != 0) + return (error); + + if (tdvp != fdvp) { + VOP_LOCK(tdvp, LK_EXCLUSIVE | LK_RETRY); + error = softdep_prehandle_vnode(ump, tdvp); + VOP_UNLOCK(tdvp); + if (error != 0) + return (error); + } + + if (tvp != fvp && tvp != NULL) { + VOP_LOCK(tvp, LK_EXCLUSIVE | LK_RETRY); + error = softdep_prehandle_vnode(ump, tvp); + VOP_UNLOCK(tvp); + if (error != 0) + return (error); + } + + ACQUIRE_LOCK(ump); + softdep_speedup(ump); + process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); + if (journal_space(ump, 0) == 0) { + softdep_speedup(ump); + if (journal_space(ump, 1) == 0) + journal_suspend(ump); + } + FREE_LOCK(ump); + return (ERELOOKUP); +} + /* * Before adjusting a link count on a vnode verify that we have sufficient * journal space. If not, process operations that depend on the currently * locked pair of vnodes to try to flush space as the syncer, buf daemon, * and softdep flush threads can not acquire these locks to reclaim space. */ -static void -softdep_prelink(dvp, vp) +int +softdep_prelink(dvp, vp, will_direnter) struct vnode *dvp; struct vnode *vp; + int will_direnter; { struct ufsmount *ump; + int error, error1; + ASSERT_VOP_ELOCKED(dvp, "prelink dvp"); + if (vp != NULL) + ASSERT_VOP_ELOCKED(vp, "prelink vp"); ump = VFSTOUFS(dvp->v_mount); - LOCK_OWNED(ump); + /* * Nothing to do if we have sufficient journal space. * If we currently hold the snapshot lock, we must avoid * handling other resources that could cause deadlock. + * + * In case allocated a directory block in an + * indirect block, we must prevent holes in the + * directory created if directory entries are + * written out of order. To accomplish this we + * fsync when we extend a directory into indirects. + * During rename it's not safe to drop the tvp lock + * so sync must be delayed until it is. + * + * This synchronous step could be removed if fsck and + * the kernel were taught to fill in sparse + * directories rather than panic. 
*/ - if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp)))) - return; + if (journal_space(ump, 0) || (vp != NULL && IS_SNAPSHOT(VTOI(vp)))) { + error = 0; + if (will_direnter && (vp == NULL || !IS_SNAPSHOT(VTOI(vp)))) { + if (vp != NULL) + VOP_UNLOCK(vp); + error = ffs_syncvnode(dvp, MNT_WAIT, 0); + if (vp != NULL) { + error1 = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); + if (error1 != 0) { + vn_lock_pair(dvp, true, vp, false); + if (error == 0) + error = ERELOOKUP; + } else if (vp->v_data == NULL) { + error = ERELOOKUP; + } + } + } + return (error); + } + stat_journal_low++; - FREE_LOCK(ump); - if (vp) + if (vp != NULL) { + VOP_UNLOCK(dvp); ffs_syncvnode(vp, MNT_NOWAIT, 0); + vn_lock_pair(dvp, false, vp, true); + if (dvp->v_data == NULL) + return (ERELOOKUP); + } + if (vp != NULL) + VOP_UNLOCK(vp); ffs_syncvnode(dvp, MNT_WAIT, 0); - ACQUIRE_LOCK(ump); + VOP_UNLOCK(dvp); + /* Process vp before dvp as it may create .. removes. */ - if (vp) { + if (vp != NULL) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (vp->v_data == NULL) { + vn_lock_pair(dvp, false, vp, true); + return (ERELOOKUP); + } + ACQUIRE_LOCK(ump); process_removes(vp); process_truncates(vp); + FREE_LOCK(ump); + VOP_UNLOCK(vp); + } + + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); + if (dvp->v_data == NULL) { + vn_lock_pair(dvp, true, vp, false); + return (ERELOOKUP); } + + ACQUIRE_LOCK(ump); process_removes(dvp); process_truncates(dvp); + VOP_UNLOCK(dvp); softdep_speedup(ump); + process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); if (journal_space(ump, 0) == 0) { softdep_speedup(ump); if (journal_space(ump, 1) == 0) journal_suspend(ump); } + FREE_LOCK(ump); + + vn_lock_pair(dvp, false, vp, false); + return (ERELOOKUP); } static void @@ -4742,7 +5010,6 @@ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_create: No addref structure present.")); } - softdep_prelink(dvp, NULL); FREE_LOCK(ITOUMP(dp)); } @@ -4777,7 +5044,6 @@ if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, 
&jaddref->ja_ref, if_deps); - softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } @@ -4808,7 +5074,6 @@ if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); - softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } @@ -4858,7 +5123,6 @@ if (DOINGSUJ(dvp)) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &dotdotaddref->ja_ref, if_deps); - softdep_prelink(ITOV(dp), NULL); FREE_LOCK(ITOUMP(dp)); } @@ -4879,7 +5143,6 @@ ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); - softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } @@ -4900,7 +5163,6 @@ ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); - softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } @@ -8764,11 +9026,11 @@ if (MOUNTEDSUJ(mp)) { flags = DEPALLOC; jmvref = newjmvref(dp, de->d_ino, - dp->i_offset + (oldloc - base), - dp->i_offset + (newloc - base)); + I_OFFSET(dp) + (oldloc - base), + I_OFFSET(dp) + (newloc - base)); } - lbn = lblkno(ump->um_fs, dp->i_offset); - offset = blkoff(ump->um_fs, dp->i_offset); + lbn = lblkno(ump->um_fs, I_OFFSET(dp)); + offset = blkoff(ump->um_fs, I_OFFSET(dp)); oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); ACQUIRE_LOCK(ump); @@ -9280,7 +9542,7 @@ jremref = dotremref = dotdotremref = NULL; if (DOINGSUJ(dvp)) { if (isrmdir) { - jremref = newjremref(dirrem, dp, ip, dp->i_offset, + jremref = newjremref(dirrem, dp, ip, I_OFFSET(dp), ip->i_effnlink + 2); dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, ip->i_effnlink + 1); @@ -9288,12 +9550,12 @@ dp->i_effnlink + 1); dotdotremref->jr_state |= MKDIR_PARENT; } else - jremref = newjremref(dirrem, dp, ip, dp->i_offset, + jremref = newjremref(dirrem, dp, ip, I_OFFSET(dp), ip->i_effnlink + 1); } ACQUIRE_LOCK(ump); - lbn = lblkno(ump->um_fs, dp->i_offset); - offset = blkoff(ump->um_fs, dp->i_offset); + lbn = lblkno(ump->um_fs, I_OFFSET(dp)); + offset = blkoff(ump->um_fs, I_OFFSET(dp)); 
pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC, &pagedep); dirrem->dm_pagedep = pagedep; @@ -9304,7 +9566,7 @@ * the jremref is preserved for any potential diradd in this * location. This can not coincide with a rmdir. */ - if (dp->i_offset == DOTDOT_OFFSET) { + if (I_OFFSET(dp) == DOTDOT_OFFSET) { if (isrmdir) panic("newdirrem: .. directory change during remove?"); jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); @@ -9405,7 +9667,7 @@ mp = ITOVFS(dp); ump = VFSTOUFS(mp); - offset = blkoff(ump->um_fs, dp->i_offset); + offset = blkoff(ump->um_fs, I_OFFSET(dp)); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_directory_change called on non-softdep filesystem")); @@ -9508,7 +9770,7 @@ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_directory_change: bad jaddref %p", jaddref)); - jaddref->ja_diroff = dp->i_offset; + jaddref->ja_diroff = I_OFFSET(dp); jaddref->ja_diradd = dap; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); @@ -9527,7 +9789,7 @@ * committed when need to move the dot and dotdot references to * this new name. */ - if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) + if (inodedep->id_mkdiradd && I_OFFSET(dp) != DOTDOT_OFFSET) merge_diradd(inodedep, dap); FREE_LOCK(ump); } @@ -12622,25 +12884,12 @@ * for details on possible races. */ FREE_LOCK(ump); - if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, - FFSV_FORCEINSMQ)) { - /* - * Unmount cannot proceed after unlock because - * caller must have called vn_start_write(). 
- */ - VOP_UNLOCK(vp); - error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, - &pvp, FFSV_FORCEINSMQ); - MPASS(VTOI(pvp)->i_mode != 0); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - if (VN_IS_DOOMED(vp)) { - if (error == 0) - vput(pvp); - error = ENOENT; - } - if (error != 0) - return (error); - } + error = get_parent_vp(vp, mp, parentino, NULL, NULL, NULL, + &pvp); + if (error == ERELOOKUP) + error = 0; + if (error != 0) + return (error); /* * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps * that are contained in direct blocks will be resolved by @@ -12964,9 +13213,11 @@ for (i = 0; i < DAHASHSZ; i++) { if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) continue; - if ((error = flush_pagedep_deps(vp, wk->wk_mp, - &pagedep->pd_diraddhd[i]))) { - BUF_NOREC(bp); + error = flush_pagedep_deps(vp, wk->wk_mp, + &pagedep->pd_diraddhd[i], bp); + if (error != 0) { + if (error != ERELOOKUP) + BUF_NOREC(bp); goto out_unlock; } } @@ -13200,10 +13451,11 @@ * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 
*/ static int -flush_pagedep_deps(pvp, mp, diraddhdp) +flush_pagedep_deps(pvp, mp, diraddhdp, locked_bp) struct vnode *pvp; struct mount *mp; struct diraddhd *diraddhdp; + struct buf *locked_bp; { struct inodedep *inodedep; struct inoref *inoref; @@ -13270,10 +13522,10 @@ } if (dap->da_state & MKDIR_BODY) { FREE_LOCK(ump); - if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, - FFSV_FORCEINSMQ))) + error = get_parent_vp(pvp, mp, inum, locked_bp, + diraddhdp, &unfinished, &vp); + if (error != 0) break; - MPASS(VTOI(vp)->i_mode != 0); error = flush_newblk_dep(vp, mp, 0); /* * If we still have the dependency we might need to @@ -13335,10 +13587,10 @@ */ if (dap == LIST_FIRST(diraddhdp)) { FREE_LOCK(ump); - if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, - FFSV_FORCEINSMQ))) + error = get_parent_vp(pvp, mp, inum, locked_bp, + diraddhdp, &unfinished, &vp); + if (error != 0) break; - MPASS(VTOI(vp)->i_mode != 0); error = ffs_update(vp, 1); vput(vp); if (error) Index: sys/ufs/ffs/ffs_vfsops.c =================================================================== --- sys/ufs/ffs/ffs_vfsops.c +++ sys/ufs/ffs/ffs_vfsops.c @@ -2005,6 +2005,9 @@ ip->i_nextclustercg = -1; ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2; ip->i_mode = 0; /* ensure error cases below throw away vnode */ +#ifdef DIAGNOSTIC + ufs_init_trackers(ip); +#endif #ifdef QUOTA { int i; Index: sys/ufs/ffs/ffs_vnops.c =================================================================== --- sys/ufs/ffs/ffs_vnops.c +++ sys/ufs/ffs/ffs_vnops.c @@ -253,7 +253,7 @@ struct buf *bp, *nbp; ufs_lbn_t lbn; int error, passes; - bool still_dirty, wait; + bool still_dirty, unlocked, wait; ip = VTOI(vp); ip->i_flag &= ~IN_NEEDSYNC; @@ -277,6 +277,7 @@ error = 0; passes = 0; wait = false; /* Always do an async pass first. 
*/ + unlocked = false; lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1)); BO_LOCK(bo); loop: @@ -325,6 +326,26 @@ if (!LIST_EMPTY(&bp->b_dep) && (error = softdep_sync_buf(vp, bp, wait ? MNT_WAIT : MNT_NOWAIT)) != 0) { + /* + * Lock order conflict, buffer was already unlocked, + * and vnode possibly unlocked. + */ + if (error == ERELOOKUP) { + if (vp->v_data == NULL) + return (EBADF); + unlocked = true; + if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT && + (error = softdep_sync_metadata(vp)) != 0) { + if (ffs_fsfail_cleanup(ump, error)) + error = 0; + return (unlocked && error == 0 ? + ERELOOKUP : error); + } + /* Re-evaluate inode size */ + lbn = lblkno(ITOFS(ip), (ip->i_size + + ITOFS(ip)->fs_bsize - 1)); + goto next; + } /* I/O error. */ if (error != EBUSY) { BUF_UNLOCK(bp); @@ -361,9 +382,11 @@ if (waitfor != MNT_WAIT) { BO_UNLOCK(bo); if ((flags & NO_INO_UPDT) != 0) - return (0); - else - return (ffs_update(vp, 0)); + return (unlocked ? ERELOOKUP : 0); + error = ffs_update(vp, 0); + if (error == 0 && unlocked) + error = ERELOOKUP; + return (error); } /* Drain IO to see if we're done. */ bufobj_wwait(bo, 0, 0); @@ -419,6 +442,8 @@ } else if ((ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)) != 0) { error = ffs_update(vp, 1); } + if (error == 0 && unlocked) + error = ERELOOKUP; return (error); } @@ -434,16 +459,18 @@ struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; - struct thread *a_td; char *file; int line; } */ *ap; { + struct vnode *vp = ap->a_vp; +#ifdef DIAGNOSTIC + struct inode *ip; +#endif + int result; #ifndef NO_FFS_SNAPSHOT - struct vnode *vp; int flags; struct lock *lkp; - int result; /* * Adaptive spinning mixed with SU leads to trouble. 
use a giant hammer @@ -456,7 +483,6 @@ case LK_SHARED: case LK_UPGRADE: case LK_EXCLUSIVE: - vp = ap->a_vp; flags = ap->a_flags; for (;;) { #ifdef DEBUG_VFS_LOCKS @@ -483,28 +509,65 @@ flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; flags &= ~LK_INTERLOCK; } + switch (ap->a_flags & LK_TYPE_MASK) { + case LK_UPGRADE: + case LK_EXCLUSIVE: + if (result == 0 && vp->v_vnlock->lk_recurse == 0) { + ip = VTOI(vp); + if (ip != NULL) + ip->i_lock_gen++; + } + } break; default: +#ifdef DIAGNOSTIC + if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) { + ip = VTOI(vp); + if (ip != NULL) + ufs_unlock_tracker(ip); + } +#endif result = VOP_LOCK1_APV(&ufs_vnodeops, ap); + break; } - return (result); #else /* * See above for an explanation. */ if ((ap->a_flags & LK_NODDLKTREAT) != 0) ap->a_flags |= LK_ADAPTIVE; - return (VOP_LOCK1_APV(&ufs_vnodeops, ap)); +#ifdef DIAGNOSTIC + if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) { + ip = VTOI(vp); + if (ip != NULL) + ufs_unlock_tracker(ip); + } #endif + result = VOP_LOCK1_APV(&ufs_vnodeops, ap); +#endif +#ifdef DIAGNOSTIC + switch (ap->a_flags & LK_TYPE_MASK) { + case LK_UPGRADE: + case LK_EXCLUSIVE: + if (result == 0 && vp->v_vnlock->lk_recurse == 0) { + ip = VTOI(vp); + if (ip != NULL) + ip->i_lock_gen++; + } + } +#endif + return (result); } #ifdef INVARIANTS static int ffs_unlock_debug(struct vop_unlock_args *ap) { - struct vnode *vp = ap->a_vp; - struct inode *ip = VTOI(vp); + struct vnode *vp; + struct inode *ip; + vp = ap->a_vp; + ip = VTOI(vp); if (ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) { if ((vp->v_mflag & VMP_LAZYLIST) == 0) { VI_LOCK(vp); @@ -514,6 +577,11 @@ VI_UNLOCK(vp); } } +#ifdef DIAGNOSTIC + if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && ip != NULL && + vp->v_vnlock->lk_recurse == 0) + ufs_unlock_tracker(ip); +#endif return (VOP_UNLOCK_APV(&ufs_vnodeops, ap)); } #endif Index: sys/ufs/ufs/inode.h =================================================================== --- sys/ufs/ufs/inode.h +++ sys/ufs/ufs/inode.h @@ 
-44,12 +44,24 @@ #include #include #include +#ifdef DIAGNOSTIC +#include +#endif /* * This must agree with the definition in . */ #define doff_t int32_t +#ifdef DIAGNOSTIC +struct iown_tracker { + struct thread *tr_owner; + struct stack tr_st; + struct stack tr_unlock; + int tr_gen; +}; +#endif + /* * The inode is used to describe each active (or recently active) file in the * UFS filesystem. It is composed of two types of information. The first part @@ -94,6 +106,12 @@ doff_t i_endoff; /* End of useful stuff in directory. */ doff_t i_diroff; /* Offset in dir, where we found last entry. */ doff_t i_offset; /* Offset of free space in directory. */ +#ifdef DIAGNOSTIC + int i_lock_gen; + struct iown_tracker i_count_tracker; + struct iown_tracker i_endoff_tracker; + struct iown_tracker i_offset_tracker; +#endif int i_nextclustercg; /* last cg searched for cluster */ @@ -254,6 +272,35 @@ uint32_t ufid_ino; /* File number (ino). */ uint32_t ufid_gen; /* Generation number. */ }; + +#ifdef DIAGNOSTIC +void ufs_init_trackers(struct inode *ip); +void ufs_unlock_tracker(struct inode *ip); + +doff_t ufs_get_i_offset(struct inode *ip, const char *file, int line); +void ufs_set_i_offset(struct inode *ip, doff_t off, const char *file, int line); +#define I_OFFSET(ip) ufs_get_i_offset(ip, __FILE__, __LINE__) +#define SET_I_OFFSET(ip, off) ufs_set_i_offset(ip, off, __FILE__, __LINE__) + +int32_t ufs_get_i_count(struct inode *ip, const char *file, int line); +void ufs_set_i_count(struct inode *ip, int32_t cnt, const char *file, int line); +#define I_COUNT(ip) ufs_get_i_count(ip, __FILE__, __LINE__) +#define SET_I_COUNT(ip, cnt) ufs_set_i_count(ip, cnt, __FILE__, __LINE__) + +doff_t ufs_get_i_endoff(struct inode *ip, const char *file, int line); +void ufs_set_i_endoff(struct inode *ip, doff_t off, const char *file, int line); +#define I_ENDOFF(ip) ufs_get_i_endoff(ip, __FILE__, __LINE__) +#define SET_I_ENDOFF(ip, off) ufs_set_i_endoff(ip, off, __FILE__, __LINE__) + +#else +#define 
I_OFFSET(ip) ((ip)->i_offset) +#define SET_I_OFFSET(ip, off) ((ip)->i_offset = (off)) +#define I_COUNT(ip) ((ip)->i_count) +#define SET_I_COUNT(ip, cnt) ((ip)->i_count = cnt) +#define I_ENDOFF(ip) ((ip)->i_endoff) +#define SET_I_ENDOFF(ip, off) ((ip)->i_endoff = off) +#endif + #endif /* _KERNEL */ #endif /* !_UFS_UFS_INODE_H_ */ Index: sys/ufs/ufs/ufs_inode.c =================================================================== --- sys/ufs/ufs/ufs_inode.c +++ sys/ufs/ufs/ufs_inode.c @@ -167,7 +167,8 @@ isize += ip->i_din2->di_extsize; if (ip->i_effnlink <= 0 && isize && !UFS_RDONLY(ip)) error = UFS_TRUNCATE(vp, (off_t)0, IO_EXT | IO_NORMAL, NOCRED); - if (ip->i_nlink <= 0 && ip->i_mode && !UFS_RDONLY(ip)) { + if (ip->i_nlink <= 0 && ip->i_mode != 0 && !UFS_RDONLY(ip) && + (vp->v_iflag & VI_OWEINACT) == 0) { #ifdef QUOTA if (!getinoquota(ip)) (void)chkiq(ip, -1, NOCRED, FORCE); @@ -208,10 +209,12 @@ * If we are done with the inode, reclaim it * so that it can be reused immediately. */ - if (ip->i_mode == 0) + if (ip->i_mode == 0 && (vp->v_iflag & VI_OWEINACT) == 0) vrecycle(vp); if (mp != NULL) vn_finished_secondary_write(mp); + if (error == ERELOOKUP) + error = 0; return (error); } Index: sys/ufs/ufs/ufs_lookup.c =================================================================== --- sys/ufs/ufs/ufs_lookup.c +++ sys/ufs/ufs/ufs_lookup.c @@ -66,6 +66,7 @@ #endif #include #include +#include #ifdef DIAGNOSTIC static int dirchk = 1; @@ -504,22 +505,22 @@ * dp->i_offset + dp->i_count. 
*/ if (slotstatus == NONE) { - dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ); - dp->i_count = 0; - enduseful = dp->i_offset; + SET_I_OFFSET(dp, roundup2(dp->i_size, DIRBLKSIZ)); + SET_I_COUNT(dp, 0); + enduseful = I_OFFSET(dp); } else if (nameiop == DELETE) { - dp->i_offset = slotoffset; - if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) - dp->i_count = 0; + SET_I_OFFSET(dp, slotoffset); + if ((I_OFFSET(dp) & (DIRBLKSIZ - 1)) == 0) + SET_I_COUNT(dp, 0); else - dp->i_count = dp->i_offset - prevoff; + SET_I_COUNT(dp, I_OFFSET(dp) - prevoff); } else { - dp->i_offset = slotoffset; - dp->i_count = slotsize; + SET_I_OFFSET(dp, slotoffset); + SET_I_COUNT(dp, slotsize); if (enduseful < slotoffset + slotsize) enduseful = slotoffset + slotsize; } - dp->i_endoff = roundup2(enduseful, DIRBLKSIZ); + SET_I_ENDOFF(dp, roundup2(enduseful, DIRBLKSIZ)); /* * We return with the directory locked, so that * the parameters we set up above will still be @@ -575,24 +576,32 @@ if (nameiop == DELETE && (flags & ISLASTCN)) { if (flags & LOCKPARENT) ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); - /* - * Return pointer to current entry in dp->i_offset, - * and distance past previous entry (if there - * is a previous entry in this block) in dp->i_count. - * Save directory inode pointer in ndp->ni_dvp for dirremove(). - * - * Technically we shouldn't be setting these in the - * WANTPARENT case (first lookup in rename()), but any - * lookups that will result in directory changes will - * overwrite these. - */ - dp->i_offset = i_offset; - if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) - dp->i_count = 0; - else - dp->i_count = dp->i_offset - prevoff; + + if (VOP_ISLOCKED(vdp) == LK_EXCLUSIVE) { + /* + * Return pointer to current entry in + * dp->i_offset, and distance past previous + * entry (if there is a previous entry in this + * block) in dp->i_count. 
+ * + * We shouldn't be setting these in the + * WANTPARENT case (first lookup in rename()), but any + * lookups that will result in directory changes will + * overwrite these. + */ + SET_I_OFFSET(dp, i_offset); + if ((I_OFFSET(dp) & (DIRBLKSIZ - 1)) == 0) + SET_I_COUNT(dp, 0); + else + SET_I_COUNT(dp, I_OFFSET(dp) - prevoff); + } if (dd_ino != NULL) return (0); + + /* + * Save directory inode pointer in ndp->ni_dvp for + * dirremove(). + */ if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); @@ -629,7 +638,7 @@ * Careful about locking second inode. * This can only occur if the target is ".". */ - dp->i_offset = i_offset; + SET_I_OFFSET(dp, i_offset); if (dp->i_number == ino) return (EISDIR); if (dd_ino != NULL) @@ -887,14 +896,14 @@ dp = VTOI(dvp); newentrysize = DIRSIZ(OFSFMT(dvp), dirp); - if (dp->i_count == 0) { + if (I_COUNT(dp) == 0) { /* * If dp->i_count is 0, then namei could find no * space in the directory. Here, dp->i_offset will * be on a directory block boundary and we will write the * new entry into a fresh block. 
*/ - if (dp->i_offset & (DIRBLKSIZ - 1)) + if (I_OFFSET(dp) & (DIRBLKSIZ - 1)) panic("ufs_direnter: newblk"); flags = BA_CLRBUF; if (!DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)) @@ -907,28 +916,28 @@ } #endif old_isize = dp->i_size; - vnode_pager_setsize(dvp, (u_long)dp->i_offset + DIRBLKSIZ); - if ((error = UFS_BALLOC(dvp, (off_t)dp->i_offset, DIRBLKSIZ, + vnode_pager_setsize(dvp, (u_long)I_OFFSET(dp) + DIRBLKSIZ); + if ((error = UFS_BALLOC(dvp, (off_t)I_OFFSET(dp), DIRBLKSIZ, cr, flags, &bp)) != 0) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); vnode_pager_setsize(dvp, (u_long)old_isize); return (error); } - dp->i_size = dp->i_offset + DIRBLKSIZ; + dp->i_size = I_OFFSET(dp) + DIRBLKSIZ; DIP_SET(dp, i_size, dp->i_size); - dp->i_endoff = dp->i_size; + SET_I_ENDOFF(dp, dp->i_size); UFS_INODE_SET_FLAG(dp, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); dirp->d_reclen = DIRBLKSIZ; - blkoff = dp->i_offset & + blkoff = I_OFFSET(dp) & (VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_iosize - 1); bcopy((caddr_t)dirp, (caddr_t)bp->b_data + blkoff,newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) { - ufsdirhash_newblk(dp, dp->i_offset); - ufsdirhash_add(dp, dirp, dp->i_offset); + ufsdirhash_newblk(dp, I_OFFSET(dp)); + ufsdirhash_add(dp, dirp, I_OFFSET(dp)); ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff, - dp->i_offset); + I_OFFSET(dp)); } #endif if (DOINGSOFTDEP(dvp)) { @@ -944,7 +953,7 @@ (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } - if (softdep_setup_directory_add(bp, dp, dp->i_offset, + if (softdep_setup_directory_add(bp, dp, I_OFFSET(dp), dirp->d_ino, newdirbp, 1)) UFS_INODE_SET_FLAG(dp, IN_NEEDSYNC); if (newdirbp) @@ -952,27 +961,7 @@ bdwrite(bp); if ((dp->i_flag & IN_NEEDSYNC) == 0) return (UFS_UPDATE(dvp, 0)); - /* - * We have just allocated a directory block in an - * indirect block. We must prevent holes in the - * directory created if directory entries are - * written out of order. 
To accomplish this we - * fsync when we extend a directory into indirects. - * During rename it's not safe to drop the tvp lock - * so sync must be delayed until it is. - * - * This synchronous step could be removed if fsck and - * the kernel were taught to fill in sparse - * directories rather than panic. - */ - if (isrename) - return (0); - if (tvp != NULL) - VOP_UNLOCK(tvp); - (void) VOP_FSYNC(dvp, MNT_WAIT, td); - if (tvp != NULL) - vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); - return (error); + return (0); } if (DOINGASYNC(dvp)) { bdwrite(bp); @@ -1001,15 +990,15 @@ * * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. */ - if (dp->i_offset + dp->i_count > dp->i_size) { - dp->i_size = dp->i_offset + dp->i_count; + if (I_OFFSET(dp) + I_COUNT(dp) > dp->i_size) { + dp->i_size = I_OFFSET(dp) + I_COUNT(dp); DIP_SET(dp, i_size, dp->i_size); UFS_INODE_SET_FLAG(dp, IN_SIZEMOD | IN_MODIFIED); } /* * Get the block containing the space for the new directory entry. */ - error = UFS_BLKATOFF(dvp, (off_t)dp->i_offset, &dirbuf, &bp); + error = UFS_BLKATOFF(dvp, (off_t)I_OFFSET(dp), &dirbuf, &bp); if (error) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); @@ -1024,7 +1013,7 @@ ep = (struct direct *)dirbuf; dsize = ep->d_ino ? DIRSIZ(OFSFMT(dvp), ep) : 0; spacefree = ep->d_reclen - dsize; - for (loc = ep->d_reclen; loc < dp->i_count; ) { + for (loc = ep->d_reclen; loc < I_COUNT(dp); ) { nep = (struct direct *)(dirbuf + loc); /* Trim the existing slot (NB: dsize may be zero). 
*/ @@ -1052,8 +1041,8 @@ #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_move(dp, nep, - dp->i_offset + ((char *)nep - dirbuf), - dp->i_offset + ((char *)ep - dirbuf)); + I_OFFSET(dp) + ((char *)nep - dirbuf), + I_OFFSET(dp) + ((char *)ep - dirbuf)); #endif if (DOINGSOFTDEP(dvp)) softdep_change_directoryentry_offset(bp, dp, dirbuf, @@ -1094,19 +1083,19 @@ #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL && (ep->d_ino == 0 || dirp->d_reclen == spacefree)) - ufsdirhash_add(dp, dirp, dp->i_offset + ((char *)ep - dirbuf)); + ufsdirhash_add(dp, dirp, I_OFFSET(dp) + ((char *)ep - dirbuf)); #endif bcopy((caddr_t)dirp, (caddr_t)ep, (u_int)newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, dirbuf - - (dp->i_offset & (DIRBLKSIZ - 1)), - rounddown2(dp->i_offset, DIRBLKSIZ)); + (I_OFFSET(dp) & (DIRBLKSIZ - 1)), + rounddown2(I_OFFSET(dp), DIRBLKSIZ)); #endif if (DOINGSOFTDEP(dvp)) { (void) softdep_setup_directory_add(bp, dp, - dp->i_offset + (caddr_t)ep - dirbuf, + I_OFFSET(dp) + (caddr_t)ep - dirbuf, dirp->d_ino, newdirbp, 0); if (newdirbp != NULL) bdwrite(newdirbp); @@ -1128,10 +1117,10 @@ * lock on the newly entered node. */ if (isrename == 0 && error == 0 && - dp->i_endoff && dp->i_endoff < dp->i_size) { + I_ENDOFF(dp) != 0 && I_ENDOFF(dp) < dp->i_size) { if (tvp != NULL) VOP_UNLOCK(tvp); - error = UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, + error = UFS_TRUNCATE(dvp, (off_t)I_ENDOFF(dp), IO_NORMAL | (DOINGASYNC(dvp) ? 
0 : IO_SYNC), cr); if (error != 0) vn_printf(dvp, @@ -1139,7 +1128,7 @@ error); #ifdef UFS_DIRHASH if (error == 0 && dp->i_dirhash != NULL) - ufsdirhash_dirtrunc(dp, dp->i_endoff); + ufsdirhash_dirtrunc(dp, I_ENDOFF(dp)); #endif error = 0; if (tvp != NULL) @@ -1190,9 +1179,9 @@ } } if (flags & DOWHITEOUT) - offset = dp->i_offset; + offset = I_OFFSET(dp); else - offset = dp->i_offset - dp->i_count; + offset = I_OFFSET(dp) - I_COUNT(dp); if ((error = UFS_BLKATOFF(dvp, offset, (char **)&ep, &bp)) != 0) { if (ip) { ip->i_effnlink++; @@ -1216,7 +1205,7 @@ goto out; } /* Set 'rep' to the entry being removed. */ - if (dp->i_count == 0) + if (I_COUNT(dp) == 0) rep = ep; else rep = (struct direct *)((char *)ep + ep->d_reclen); @@ -1226,7 +1215,7 @@ * that `ep' is the previous entry when dp->i_count != 0. */ if (dp->i_dirhash != NULL) - ufsdirhash_remove(dp, rep, dp->i_offset); + ufsdirhash_remove(dp, rep, I_OFFSET(dp)); #endif if (ip && rep->d_ino != ip->i_number) panic("ufs_dirremove: ip %ju does not match dirent ino %ju\n", @@ -1240,7 +1229,7 @@ rep->d_type = 0; rep->d_ino = 0; - if (dp->i_count != 0) { + if (I_COUNT(dp) != 0) { /* * Collapse new free space into previous entry. */ @@ -1250,8 +1239,8 @@ #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, (char *)ep - - ((dp->i_offset - dp->i_count) & (DIRBLKSIZ - 1)), - rounddown2(dp->i_offset, DIRBLKSIZ)); + ((I_OFFSET(dp) - I_COUNT(dp)) & (DIRBLKSIZ - 1)), + rounddown2(I_OFFSET(dp), DIRBLKSIZ)); #endif out: error = 0; @@ -1313,7 +1302,7 @@ UFS_INODE_SET_FLAG(oip, IN_CHANGE); } - error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp); + error = UFS_BLKATOFF(vdp, (off_t)I_OFFSET(dp), (char **)&ep, &bp); if (error == 0 && ep->d_namlen == 2 && ep->d_name[1] == '.' && ep->d_name[0] == '.' 
&& ep->d_ino != oip->i_number) { brelse(bp); @@ -1522,3 +1511,115 @@ vput(vp); return (error); } + +#ifdef DIAGNOSTIC +static void +ufs_assert_inode_offset_owner(struct inode *ip, struct iown_tracker *tr, + const char *name, const char *file, int line) +{ + char msg[128]; + + snprintf(msg, sizeof(msg), "at %s@%d", file, line); + ASSERT_VOP_ELOCKED(ITOV(ip), msg); + MPASS((ip->i_mode & IFMT) == IFDIR); + if (curthread == tr->tr_owner && ip->i_lock_gen == tr->tr_gen) + return; + printf("locked at\n"); + stack_print(&tr->tr_st); + printf("unlocked at\n"); + stack_print(&tr->tr_unlock); + panic("%s ip %p %jd offset owner %p %d gen %d " + "curthread %p %d gen %d at %s@%d\n", + name, ip, (uintmax_t)ip->i_number, tr->tr_owner, + tr->tr_owner->td_tid, tr->tr_gen, + curthread, curthread->td_tid, ip->i_lock_gen, + file, line); +} + +static void +ufs_set_inode_offset_owner(struct inode *ip, struct iown_tracker *tr, + const char *file, int line) +{ + char msg[128]; + + snprintf(msg, sizeof(msg), "at %s@%d", file, line); + ASSERT_VOP_ELOCKED(ITOV(ip), msg); + MPASS((ip->i_mode & IFMT) == IFDIR); + tr->tr_owner = curthread; + tr->tr_gen = ip->i_lock_gen; + stack_save(&tr->tr_st); +} + +static void +ufs_init_one_tracker(struct iown_tracker *tr) +{ + tr->tr_owner = NULL; + stack_zero(&tr->tr_st); +} + +void +ufs_init_trackers(struct inode *ip) +{ + ufs_init_one_tracker(&ip->i_offset_tracker); + ufs_init_one_tracker(&ip->i_count_tracker); + ufs_init_one_tracker(&ip->i_endoff_tracker); +} + +void +ufs_unlock_tracker(struct inode *ip) +{ + if (ip->i_count_tracker.tr_gen == ip->i_lock_gen) + stack_save(&ip->i_count_tracker.tr_unlock); + if (ip->i_offset_tracker.tr_gen == ip->i_lock_gen) + stack_save(&ip->i_offset_tracker.tr_unlock); + if (ip->i_endoff_tracker.tr_gen == ip->i_lock_gen) + stack_save(&ip->i_endoff_tracker.tr_unlock); + ip->i_lock_gen++; +} + +doff_t +ufs_get_i_offset(struct inode *ip, const char *file, int line) +{ + ufs_assert_inode_offset_owner(ip, 
&ip->i_offset_tracker, "i_offset", + file, line); + return (ip->i_offset); +} + +void +ufs_set_i_offset(struct inode *ip, doff_t off, const char *file, int line) +{ + ufs_set_inode_offset_owner(ip, &ip->i_offset_tracker, file, line); + ip->i_offset = off; +} + +int32_t +ufs_get_i_count(struct inode *ip, const char *file, int line) +{ + ufs_assert_inode_offset_owner(ip, &ip->i_count_tracker, "i_count", + file, line); + return (ip->i_count); +} + +void +ufs_set_i_count(struct inode *ip, int32_t cnt, const char *file, int line) +{ + ufs_set_inode_offset_owner(ip, &ip->i_count_tracker, file, line); + ip->i_count = cnt; +} + +doff_t +ufs_get_i_endoff(struct inode *ip, const char *file, int line) +{ + ufs_assert_inode_offset_owner(ip, &ip->i_endoff_tracker, "i_endoff", + file, line); + return (ip->i_endoff); +} + +void +ufs_set_i_endoff(struct inode *ip, doff_t off, const char *file, int line) +{ + ufs_set_inode_offset_owner(ip, &ip->i_endoff_tracker, file, line); + ip->i_endoff = off; +} + +#endif Index: sys/ufs/ufs/ufs_vnops.c =================================================================== --- sys/ufs/ufs/ufs_vnops.c +++ sys/ufs/ufs/ufs_vnops.c @@ -1067,6 +1067,15 @@ if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_link: no name"); #endif + + if (DOINGSOFTDEP(tdvp)) { + error = softdep_prelink(tdvp, vp, true); + if (error != 0) { + MPASS(error == ERELOOKUP); + return (error); + } + } + if (VTOI(tdvp)->i_effnlink < 2) { print_bad_link_count("ufs_link", tdvp); error = EINVAL; @@ -1089,6 +1098,7 @@ error = EPERM; goto out; } + ip->i_effnlink++; ip->i_nlink++; DIP_SET(ip, i_nlink, ip->i_nlink); @@ -1129,6 +1139,7 @@ struct direct newdir; int error = 0; +//XXXKIB error = VOP_FSYNC(dvp, MNT_WAIT); switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ @@ -1338,6 +1349,18 @@ goto relock; } } + + if (DOINGSOFTDEP(fdvp)) { + error = softdep_prerename(fdvp, fvp, tdvp, tvp); + if (error != 0) { + if (error == ERELOOKUP) { + 
atomic_add_int(&rename_restarts, 1); + goto relock; + } + goto releout; + } + } + fdp = VTOI(fdvp); fip = VTOI(fvp); tdp = VTOI(tdvp); @@ -1481,9 +1504,9 @@ if (error) goto bad; /* Setup tdvp for directory compaction if needed. */ - if (tdp->i_count && tdp->i_endoff && - tdp->i_endoff < tdp->i_size) - endoff = tdp->i_endoff; + if (I_COUNT(tdp) != 0 && I_ENDOFF(tdp) != 0 && + I_ENDOFF(tdp) < tdp->i_size) + endoff = I_ENDOFF(tdp); } else { if (ITODEV(tip) != ITODEV(tdp) || ITODEV(tip) != ITODEV(fip)) panic("ufs_rename: EXDEV"); @@ -1611,7 +1634,7 @@ } else if (DOINGSUJ(tdvp)) /* Journal must account for each new link. */ softdep_setup_dotdot_link(tdp, fip); - fip->i_offset = mastertemplate.dot_reclen; + SET_I_OFFSET(fip, mastertemplate.dot_reclen); ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0); cache_purge(fdvp); } @@ -1649,8 +1672,10 @@ * are no longer needed. */ if (error == 0 && endoff != 0) { - error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | - (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred); + do { + error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | + (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred); + } while (error == ERELOOKUP); if (error != 0 && !ffs_fsfail_cleanup(VFSTOUFS(mp), error)) vn_printf(tdvp, "ufs_rename: failed to truncate, error %d\n", @@ -1668,8 +1693,11 @@ */ error = 0; } - if (error == 0 && tdp->i_flag & IN_NEEDSYNC) - error = VOP_FSYNC(tdvp, MNT_WAIT, td); + if (error == 0 && tdp->i_flag & IN_NEEDSYNC) { + do { + error = VOP_FSYNC(tdvp, MNT_WAIT, td); + } while (error == ERELOOKUP); + } vput(tdvp); return (error); @@ -1918,6 +1946,7 @@ } dmode = vap->va_mode & 0777; dmode |= IFDIR; + /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. 
The entry is @@ -1928,6 +1957,15 @@ error = EINVAL; goto out; } + + if (DOINGSOFTDEP(dvp)) { + error = softdep_prelink(dvp, NULL, true); + if (error != 0) { + MPASS(error == ERELOOKUP); + return (error); + } + } + error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; @@ -2184,6 +2222,14 @@ error = EINVAL; goto out; } + if (DOINGSOFTDEP(dvp)) { + error = softdep_prelink(dvp, vp, false); + if (error != 0) { + MPASS(error == ERELOOKUP); + return (error); + } + } + #ifdef UFS_GJOURNAL ufs_gjournal_orphan(vp); #endif @@ -2704,6 +2750,13 @@ print_bad_link_count(callfunc, dvp); return (EINVAL); } + if (DOINGSOFTDEP(dvp)) { + error = softdep_prelink(dvp, NULL, true); + if (error != 0) { + MPASS(error == ERELOOKUP); + return (error); + } + } error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) return (error);