Index: sys/kern/subr_syscall.c =================================================================== --- sys/kern/subr_syscall.c +++ sys/kern/subr_syscall.c @@ -217,6 +217,8 @@ KASSERT((td->td_pflags & TDP_FORKING) == 0, ("fork() did not clear TDP_FORKING upon completion")); + KASSERT(td->td_errno != ERELOOKUP, + ("ERELOOKUP not consumed syscall %d", td->td_sa.code)); p = td->td_proc; sa = &td->td_sa; Index: sys/kern/uipc_usrreq.c =================================================================== --- sys/kern/uipc_usrreq.c +++ sys/kern/uipc_usrreq.c @@ -671,6 +671,8 @@ vput(nd.ni_dvp); if (error) { vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; goto error; } vp = nd.ni_vp; Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c +++ sys/kern/vfs_subr.c @@ -1937,7 +1937,10 @@ } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); - if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) + do { + error = BO_SYNC(bo, MNT_WAIT); + } while (error == ERELOOKUP); + if (error != 0) return (error); /* * XXX We could save a lock/unlock if this was only @@ -3678,7 +3681,9 @@ vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } - error = VOP_FSYNC(vp, MNT_WAIT, td); + do { + error = VOP_FSYNC(vp, MNT_WAIT, td); + } while (error == ERELOOKUP); if (error != 0) { VOP_UNLOCK(vp); vdrop(vp); Index: sys/kern/vfs_syscalls.c =================================================================== --- sys/kern/vfs_syscalls.c +++ sys/kern/vfs_syscalls.c @@ -1384,6 +1384,8 @@ NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; return (error); } @@ -1470,6 +1472,8 @@ vput(nd.ni_dvp); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); + if (error == ERELOOKUP) + goto restart; return (error); } @@ -1568,7 +1572,7 @@ return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag); - } while (error == EAGAIN); + } while (error == EAGAIN || error == ERELOOKUP); return (error); } @@ -1741,6 +1745,8 @@ NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, tmppath); @@ -1791,6 +1797,8 @@ NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; return (error); } @@ -1937,6 +1945,8 @@ vrele(vp); else vput(vp); + if (error == ERELOOKUP) + goto restart; fdout: if (fp != NULL) fdrop(fp, td); @@ -3395,7 +3405,8 @@ int error; if (length < 0) - return(EINVAL); + return (EINVAL); +retry: NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); @@ -3424,6 +3435,8 @@ vn_finished_write(mp); vn_rangelock_unlock(vp, rl_cookie); vrele(vp); + if (error == ERELOOKUP) + goto retry; return (error); } @@ -3479,6 +3492,7 @@ if (!fullsync) /* XXXKIB: compete outstanding aio writes */; #endif +retry: error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto drop; @@ -3498,6 +3512,8 @@ error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td); VOP_UNLOCK(vp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto retry; drop: fdrop(fp, td); return (error); @@ -3679,7 +3695,7 @@ * are links to the same vnode), then there is nothing to do. 
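+ * (ERESTART here is purely an internal marker replacing the old -1:
+ * it is translated back to a 0 return at the out1 label below and is
+ * never returned to the caller.)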
 */
 	if (fvp == tvp)
-		error = -1;
+		error = ERESTART;
 #ifdef MAC
 	else
 		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
@@ -3708,8 +3724,10 @@
 out1:
 	if (fromnd.ni_startdir)
 		vrele(fromnd.ni_startdir);
-	if (error == -1)
+	if (error == ERESTART)
 		return (0);
+	if (error == ERELOOKUP)
+		goto again;
 	return (error);
 }
@@ -3803,6 +3821,8 @@
 	if (error == 0)
 		vput(nd.ni_vp);
 	vn_finished_write(mp);
+	if (error == ERELOOKUP)
+		goto restart;
 	return (error);
 }
@@ -3903,6 +3923,8 @@
 		vrele(nd.ni_dvp);
 	else
 		vput(nd.ni_dvp);
+	if (error == ERELOOKUP)
+		goto restart;
 fdout:
 	if (fp != NULL)
 		fdrop(fp, td);
@@ -4416,7 +4438,8 @@
 		if (error != 0)
 			return (error);
 		VOP_UNLOCK(vp);
-	} while ((error = kern_linkat_vp(td, vp, fd, path, pathseg)) == EAGAIN);
+		error = kern_linkat_vp(td, vp, fd, path, pathseg);
+	} while (error == EAGAIN || error == ERELOOKUP);
 	return (error);
 }
Index: sys/kern/vfs_vnops.c
===================================================================
--- sys/kern/vfs_vnops.c
+++ sys/kern/vfs_vnops.c
@@ -70,6 +70,7 @@
 #include
 #include
 #include
+#include <sys/prng.h>
 #include
 #include
 #include
@@ -275,6 +276,10 @@
 	vn_finished_write(mp);
 	if (error) {
 		NDFREE(ndp, NDF_ONLY_PNBUF);
+		if (error == ERELOOKUP) {
+			NDREINIT(ndp);
+			goto restart;
+		}
 		return (error);
 	}
 	fmode &= ~O_TRUNC;
@@ -1524,6 +1529,7 @@
 	vp = fp->f_vnode;
 
+retry:
 	/*
 	 * Lock the whole range for truncation.  Otherwise split i/o
 	 * might happen partly before and partly after the truncation.
@@ -1550,6 +1556,8 @@
 	vn_finished_write(mp);
 out1:
 	vn_rangelock_unlock(vp, rl_cookie);
+	if (error == ERELOOKUP)
+		goto retry;
 	return (error);
 }
@@ -3318,3 +3326,91 @@
 	return (error);
 }
+
+static u_long vn_lock_pair_pause_cnt;
+SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
+    &vn_lock_pair_pause_cnt, 0,
+    "Count of vn_lock_pair deadlocks");
+
+static void
+vn_lock_pair_pause(const char *wmesg)
+{
+	atomic_add_long(&vn_lock_pair_pause_cnt, 1);
+	pause(wmesg, prng32_bounded(hz / 10));
+}
+
+/*
+ * Lock pair of vnodes vp1, vp2, avoiding lock order reversal.
+ * vp1_locked indicates whether vp1 is exclusively locked; if not, vp1
+ * must be unlocked.  Same for vp2 and vp2_locked.  One of the vnodes
+ * can be NULL.
+ *
+ * The function returns with both vnodes exclusively locked, and
+ * guarantees that it does not create lock order reversal with other
+ * threads during its execution.  Both vnodes could be unlocked
+ * temporarily (and reclaimed).
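+ *
+ * A typical use, as in softdep_prelink() below (a sketch of the
+ * calling convention, not part of the interface): after a sync had to
+ * drop the directory vnode lock, the caller re-establishes the pair
+ * with
+ *	vn_lock_pair(dvp, false, vp, true);
+ * and must then re-validate both vnodes (e.g. check that v_data is
+ * not NULL) before using them, since either one could have been
+ * reclaimed while unlocked.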
+ */ +void +vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2, + bool vp2_locked) +{ + int error; + + if (vp1 == NULL && vp2 == NULL) + return; + if (vp1 != NULL) { + if (vp1_locked) + ASSERT_VOP_ELOCKED(vp1, "vp1"); + else + ASSERT_VOP_UNLOCKED(vp1, "vp1"); + } else { + vp1_locked = true; + } + if (vp2 != NULL) { + if (vp2_locked) + ASSERT_VOP_ELOCKED(vp2, "vp2"); + else + ASSERT_VOP_UNLOCKED(vp2, "vp2"); + } else { + vp2_locked = true; + } + if (!vp1_locked && !vp2_locked) { + vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY); + vp1_locked = true; + } + + for (;;) { + if (vp1_locked && vp2_locked) + break; + if (vp1_locked && vp2 != NULL) { + if (vp1 != NULL) { + error = VOP_LOCK1(vp2, LK_EXCLUSIVE | LK_NOWAIT, + __FILE__, __LINE__); + if (error == 0) + break; + VOP_UNLOCK(vp1); + vp1_locked = false; + vn_lock_pair_pause("vlp1"); + } + vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY); + vp2_locked = true; + } + if (vp2_locked && vp1 != NULL) { + if (vp2 != NULL) { + error = VOP_LOCK1(vp1, LK_EXCLUSIVE | LK_NOWAIT, + __FILE__, __LINE__); + if (error == 0) + break; + VOP_UNLOCK(vp2); + vp2_locked = false; + vn_lock_pair_pause("vlp2"); + } + vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY); + vp1_locked = true; + } + } + if (vp1 != NULL) + ASSERT_VOP_ELOCKED(vp1, "vp1 ret"); + if (vp2 != NULL) + ASSERT_VOP_ELOCKED(vp2, "vp2 ret"); +} Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -729,6 +729,8 @@ bool vn_isdisk(struct vnode *vp); int _vn_lock(struct vnode *vp, int flags, const char *file, int line); #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__) +void vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2, + bool vp2_locked); int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp); Index: sys/ufs/ffs/ffs_alloc.c =================================================================== --- sys/ufs/ffs/ffs_alloc.c +++ sys/ufs/ffs/ffs_alloc.c @@ -3468,7 +3468,7 @@ break; } dp = VTOI(dvp); - dp->i_offset = 12; /* XXX mastertemplate.dot_reclen */ + SET_I_OFFSET(dp, 12); /* XXX mastertemplate.dot_reclen */ error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, DT_DIR, 0); cache_purge(fdvp); Index: sys/ufs/ffs/ffs_extern.h =================================================================== --- sys/ufs/ffs/ffs_extern.h +++ sys/ufs/ffs/ffs_extern.h @@ -173,6 +173,9 @@ void softdep_freefile(struct vnode *, ino_t, int); int softdep_request_cleanup(struct fs *, struct vnode *, struct ucred *, int); +int softdep_prerename(struct vnode *, struct vnode *, struct vnode *, + struct vnode *); +int softdep_prelink(struct vnode *, struct vnode *, int); void softdep_setup_freeblocks(struct inode *, off_t, int); void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t, int); void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t, Index: sys/ufs/ffs/ffs_inode.c =================================================================== --- sys/ufs/ffs/ffs_inode.c +++ sys/ufs/ffs/ffs_inode.c @@ -67,6 +67,17 @@ static int ffs_indirtrunc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, ufs2_daddr_t, int, ufs2_daddr_t *); +static void +ffs_inode_bwrite(struct vnode *vp, struct buf *bp, int flags) +{ + if ((flags & IO_SYNC) != 0) + bwrite(bp); + else if (DOINGASYNC(vp)) + bdwrite(bp); + else + bawrite(bp); +} + /* * Update the access, modified, and inode change times as 
specified by the * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. Write the inode @@ -357,12 +368,7 @@ DIP_SET(ip, i_size, length); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; - if (flags & IO_SYNC) - bwrite(bp); - else if (DOINGASYNC(vp)) - bdwrite(bp); - else - bawrite(bp); + ffs_inode_bwrite(vp, bp, flags); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); return (ffs_update(vp, waitforupdate)); } @@ -456,6 +462,8 @@ error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); if (error) return (error); + ffs_inode_bwrite(vp, bp, flags); + /* * When we are doing soft updates and the UFS_BALLOC * above fills in a direct block hole with a full sized @@ -468,6 +476,10 @@ fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) return (error); + + error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); + if (error) + return (error); ip->i_size = length; DIP_SET(ip, i_size, length); size = blksize(fs, ip, lbn); @@ -478,12 +490,7 @@ allocbuf(bp, size); if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; - if (flags & IO_SYNC) - bwrite(bp); - else if (DOINGASYNC(vp)) - bdwrite(bp); - else - bawrite(bp); + ffs_inode_bwrite(vp, bp, flags); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); } /* Index: sys/ufs/ffs/ffs_snapshot.c =================================================================== --- sys/ufs/ffs/ffs_snapshot.c +++ sys/ufs/ffs/ffs_snapshot.c @@ -301,6 +301,8 @@ NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(wrtmp); vrele(nd.ni_dvp); + if (error == ERELOOKUP) + goto restart; return (error); } vp = nd.ni_vp; @@ -368,8 +370,12 @@ if (error) goto out; bawrite(nbp); - if (cg % 10 == 0) - ffs_syncvnode(vp, MNT_WAIT, 0); + if (cg % 10 == 0) { + error = ffs_syncvnode(vp, MNT_WAIT, 0); + /* vp possibly reclaimed if unlocked */ + if (error != 0) + goto out; + } } /* * Copy all the cylinder group maps. 
Although the
@@ -391,8 +397,8 @@
 			goto out;
 		error = cgaccount(cg, vp, nbp, 1);
 		bawrite(nbp);
-		if (cg % 10 == 0)
-			ffs_syncvnode(vp, MNT_WAIT, 0);
+		if (cg % 10 == 0 && error == 0)
+			error = ffs_syncvnode(vp, MNT_WAIT, 0);
 		if (error)
 			goto out;
 	}
Index: sys/ufs/ffs/ffs_softdep.c
===================================================================
--- sys/ufs/ffs/ffs_softdep.c
+++ sys/ufs/ffs/ffs_softdep.c
@@ -609,6 +609,27 @@
 	panic("softdep_freework called");
 }
 
+int
+softdep_prerename(fdvp, fvp, tdvp, tvp)
+	struct vnode *fdvp;
+	struct vnode *fvp;
+	struct vnode *tdvp;
+	struct vnode *tvp;
+{
+
+	panic("softdep_prerename called");
+}
+
+int
+softdep_prelink(dvp, vp, will_direnter)
+	struct vnode *dvp;
+	struct vnode *vp;
+	int will_direnter;
+{
+
+	panic("softdep_prelink called");
+}
+
 #else
 
 FEATURE(softupdates, "FFS soft-updates support");
@@ -748,7 +769,7 @@
 static void clear_unlinked_inodedep(struct inodedep *);
 static struct inodedep *first_unlinked_inodedep(struct ufsmount *);
 static int flush_pagedep_deps(struct vnode *, struct mount *,
-	struct diraddhd *);
+	struct diraddhd *, struct buf *);
 static int free_pagedep(struct pagedep *);
 static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
 static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
@@ -925,7 +946,6 @@
 static int journal_space(struct ufsmount *, int);
 static void journal_suspend(struct ufsmount *);
 static int journal_unsuspend(struct ufsmount *ump);
-static void softdep_prelink(struct vnode *, struct vnode *);
 static void add_to_journal(struct worklist *);
 static void remove_from_journal(struct worklist *);
 static bool softdep_excess_items(struct ufsmount *, int);
@@ -1389,6 +1409,136 @@
 /* List of all filesystems mounted with soft updates */
 static TAILQ_HEAD(, mount_softdeps) softdepmounts;
 
+/*
+ * This function fetches inode inum on mount point mp.  We already
+ * hold a locked vnode vp, and might have a locked buffer bp belonging
+ * to vp.
+ *
+ * We must not block on acquiring the new inode lock as we will get
+ * into a lock-order reversal with the buffer lock and possibly get a
+ * deadlock.  Thus if we cannot instantiate the requested vnode
+ * without sleeping on its lock, we must unlock the vnode and the
+ * buffer before blocking on the vnode lock.  We return ERELOOKUP if
+ * we have had to unlock either the vnode or the buffer so that the
+ * caller can reassess its state.
+ *
+ * Top-level VFS code (for syscalls and other consumers, e.g. callers
+ * of VOP_FSYNC() in the syncer) checks for ERELOOKUP and restarts at
+ * a safe point.
+ *
+ * Since callers expect to operate on a fully constructed vnode, we
+ * also recheck v_data after the relock, and return ENOENT if it is
+ * NULL.
+ *
+ * If unlocking bp, we must undo the dequeueing of its unfinished
+ * dependencies and clear the scan flag before unlocking.  If
+ * unlocking vp while it is under deactivation, we re-queue the
+ * deactivation.
+ */
+static int
+get_parent_vp(struct vnode *vp, struct mount *mp, ino_t inum, struct buf *bp,
+    struct diraddhd *diraddhdp, struct diraddhd *unfinishedp,
+    struct vnode **rvp)
+{
+	struct vnode *pvp;
+	struct diradd *dap;
+	int error;
+	bool bplocked;
+
+	ASSERT_VOP_ELOCKED(vp, "child vnode must be locked");
+	for (bplocked = true, pvp = NULL;;) {
+		error = ffs_vgetf(mp, inum, LK_EXCLUSIVE | LK_NOWAIT, &pvp,
+		    FFSV_FORCEINSMQ);
+		if (error == 0) {
+			/*
+			 * Since we could have unlocked vp, the inode
+			 * number could no longer indicate a
+			 * constructed node.  In this case, we must
+			 * restart the syscall.
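+			 * (i_mode == 0 means the inode was freed while
+			 * vp was unlocked; !bplocked means the caller's
+			 * buffer was dropped, so even a valid pvp
+			 * cannot be used without revalidation.)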
+			 */
+			if (VTOI(pvp)->i_mode == 0 || !bplocked) {
+				if (VTOI(pvp)->i_mode == 0)
+					vgone(pvp);
+				vput(pvp);
+				error = ERELOOKUP;
+				goto out;
+			}
+
+			error = 0;
+			goto out1;
+		}
+		if (bp != NULL && bplocked) {
+			/*
+			 * Requeue unfinished dependencies before
+			 * unlocking buffer, which could make
+			 * diraddhdp invalid.
+			 */
+			ACQUIRE_LOCK(VFSTOUFS(mp));
+			while ((dap = LIST_FIRST(unfinishedp)) != NULL) {
+				LIST_REMOVE(dap, da_pdlist);
+				LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
+			}
+			FREE_LOCK(VFSTOUFS(mp));
+			bp->b_vflags &= ~BV_SCANNED;
+			BUF_NOREC(bp);
+			BUF_UNLOCK(bp);
+			bplocked = false;
+		}
+
+		/*
+		 * Do not drop vnode lock while inactivating.  This
+		 * would result in leaks of the VI flags and
+		 * reclaiming of non-truncated vnode.  Instead,
+		 * re-schedule inactivation hoping that we would be
+		 * able to sync inode later.
+		 */
+		if ((vp->v_iflag & VI_DOINGINACT) != 0) {
+			VI_LOCK(vp);
+			vp->v_iflag |= VI_OWEINACT;
+			VI_UNLOCK(vp);
+			return (ERELOOKUP);
+		}
+
+		VOP_UNLOCK(vp);
+		error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &pvp,
+		    FFSV_FORCEINSMQ);
+		if (error != 0) {
+			MPASS(error != ERELOOKUP);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+			break;
+		}
+		if (VTOI(pvp)->i_mode == 0) {
+			vgone(pvp);
+			vput(pvp);
+			pvp = NULL;
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+			error = ERELOOKUP;
+			break;
+		}
+		error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
+		if (error == 0)
+			break;
+		vput(pvp);
+		pvp = NULL;
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		if (vp->v_data == NULL) {
+			error = ENOENT;
+			break;
+		}
+	}
+	if (bp != NULL) {
+		MPASS(!bplocked);
+		error = ERELOOKUP;
+	}
+	if (error != 0 && pvp != NULL) {
+		vput(pvp);
+		pvp = NULL;
+	}
+out1:
+	*rvp = pvp;
+out:
+	ASSERT_VOP_ELOCKED(vp, "child vnode must be locked on return");
+	return (error);
+}
+
 /*
  * This function cleans the worklist for a filesystem.
  * Each filesystem running with soft dependencies gets its own
@@ -3095,48 +3245,207 @@
 	return (0);
 }
 
+/*
+ * Try hard to sync all data and metadata for the vnode, and to flush
+ * workitems that might conflict with the vnode lock.  This is a
+ * helper for softdep_prerename().
+ */
+static int
+softdep_prerename_vnode(ump, vp)
+	struct ufsmount *ump;
+	struct vnode *vp;
+{
+	int error;
+
+	ASSERT_VOP_ELOCKED(vp, "prehandle");
+	if (vp->v_data == NULL)
+		return (0);
+	error = VOP_FSYNC(vp, MNT_WAIT, curthread);
+	if (error != 0)
+		return (error);
+	ACQUIRE_LOCK(ump);
+	process_removes(vp);
+	process_truncates(vp);
+	FREE_LOCK(ump);
+	return (0);
+}
+
+/*
+ * Must be called from VOP_RENAME() after all vnodes are locked.
+ * Ensures that there is enough journal space for the rename.  It
+ * differs from softdep_prelink() in that it has to handle four
+ * vnodes.
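+ *
+ * Callers are expected to restart the whole operation on ERELOOKUP;
+ * for instance, ufs_rename() below counts the event in
+ * rename_restarts and goes back to its relock label to redo the
+ * lookups and locking.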
+ */
+int
+softdep_prerename(fdvp, fvp, tdvp, tvp)
+	struct vnode *fdvp;
+	struct vnode *fvp;
+	struct vnode *tdvp;
+	struct vnode *tvp;
+{
+	struct ufsmount *ump;
+	int error;
+
+	ump = VFSTOUFS(fdvp->v_mount);
+
+	if (journal_space(ump, 0))
+		return (0);
+
+	VOP_UNLOCK(tdvp);
+	VOP_UNLOCK(fvp);
+	if (tvp != NULL && tvp != tdvp)
+		VOP_UNLOCK(tvp);
+
+	error = softdep_prerename_vnode(ump, fdvp);
+	VOP_UNLOCK(fdvp);
+	if (error != 0)
+		return (error);
+
+	VOP_LOCK(fvp, LK_EXCLUSIVE | LK_RETRY);
+	error = softdep_prerename_vnode(ump, fvp);
+	VOP_UNLOCK(fvp);
+	if (error != 0)
+		return (error);
+
+	if (tdvp != fdvp) {
+		VOP_LOCK(tdvp, LK_EXCLUSIVE | LK_RETRY);
+		error = softdep_prerename_vnode(ump, tdvp);
+		VOP_UNLOCK(tdvp);
+		if (error != 0)
+			return (error);
+	}
+
+	if (tvp != fvp && tvp != NULL) {
+		VOP_LOCK(tvp, LK_EXCLUSIVE | LK_RETRY);
+		error = softdep_prerename_vnode(ump, tvp);
+		VOP_UNLOCK(tvp);
+		if (error != 0)
+			return (error);
+	}
+
+	ACQUIRE_LOCK(ump);
+	softdep_speedup(ump);
+	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
+	if (journal_space(ump, 0) == 0) {
+		softdep_speedup(ump);
+		if (journal_space(ump, 1) == 0)
+			journal_suspend(ump);
+	}
+	FREE_LOCK(ump);
+	return (ERELOOKUP);
+}
+
 /*
  * Before adjusting a link count on a vnode verify that we have sufficient
  * journal space.  If not, process operations that depend on the currently
  * locked pair of vnodes to try to flush space as the syncer, buf daemon,
  * and softdep flush threads can not acquire these locks to reclaim space.
+ *
+ * Returns 0 if all owned locks are still valid and were not dropped
+ * in the process; otherwise it returns either an error from the sync,
+ * or ERELOOKUP if any of the locks were re-acquired.  In the latter
+ * case, the state of the vnodes cannot be relied upon and our VFS
+ * syscall must be restarted at top level from the lookup.
  */
-static void
-softdep_prelink(dvp, vp)
+int
+softdep_prelink(dvp, vp, will_direnter)
 	struct vnode *dvp;
 	struct vnode *vp;
+	int will_direnter;
 {
 	struct ufsmount *ump;
+	int error, error1;
 
+	ASSERT_VOP_ELOCKED(dvp, "prelink dvp");
+	if (vp != NULL)
+		ASSERT_VOP_ELOCKED(vp, "prelink vp");
 	ump = VFSTOUFS(dvp->v_mount);
-	LOCK_OWNED(ump);
+
 	/*
 	 * Nothing to do if we have sufficient journal space.
 	 * If we currently hold the snapshot lock, we must avoid
 	 * handling other resources that could cause deadlock.
+	 *
+	 * will_direnter == 1: In case we allocated a directory block in
+	 * an indirect block, we must prevent holes in the directory
+	 * created if directory entries are written out of order.  To
+	 * accomplish this we fsync when we extend a directory into
	 * indirects.  During rename it's not safe to drop the tvp
+	 * lock so the sync must be delayed until it is safe.
+	 *
+	 * This synchronous step could be removed if fsck and the
+	 * kernel were taught to fill in sparse directories rather
+	 * than panic.
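+	 *
+	 * The ufs_vnops.c callers (ufs_remove(), ufs_link(),
+	 * ufs_whiteout(), ufs_mkdir(), ufs_rmdir(), ufs_makeinode())
+	 * assert that any non-zero return is ERELOOKUP and propagate
+	 * it, so the syscall layer redoes the lookup.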
*/ - if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp)))) - return; + if (journal_space(ump, 0) || (vp != NULL && IS_SNAPSHOT(VTOI(vp)))) { + error = 0; + if (will_direnter && (vp == NULL || !IS_SNAPSHOT(VTOI(vp)))) { + if (vp != NULL) + VOP_UNLOCK(vp); + error = ffs_syncvnode(dvp, MNT_WAIT, 0); + if (vp != NULL) { + error1 = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); + if (error1 != 0) { + vn_lock_pair(dvp, true, vp, false); + if (error == 0) + error = ERELOOKUP; + } else if (vp->v_data == NULL) { + error = ERELOOKUP; + } + } + } + return (error); + } + stat_journal_low++; - FREE_LOCK(ump); - if (vp) + if (vp != NULL) { + VOP_UNLOCK(dvp); ffs_syncvnode(vp, MNT_NOWAIT, 0); + vn_lock_pair(dvp, false, vp, true); + if (dvp->v_data == NULL) + return (ERELOOKUP); + } + if (vp != NULL) + VOP_UNLOCK(vp); ffs_syncvnode(dvp, MNT_WAIT, 0); - ACQUIRE_LOCK(ump); + VOP_UNLOCK(dvp); + /* Process vp before dvp as it may create .. removes. */ - if (vp) { + if (vp != NULL) { + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (vp->v_data == NULL) { + vn_lock_pair(dvp, false, vp, true); + return (ERELOOKUP); + } + ACQUIRE_LOCK(ump); process_removes(vp); process_truncates(vp); + FREE_LOCK(ump); + VOP_UNLOCK(vp); + } + + vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); + if (dvp->v_data == NULL) { + vn_lock_pair(dvp, true, vp, false); + return (ERELOOKUP); } + + ACQUIRE_LOCK(ump); process_removes(dvp); process_truncates(dvp); + VOP_UNLOCK(dvp); softdep_speedup(ump); + process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); if (journal_space(ump, 0) == 0) { softdep_speedup(ump); if (journal_space(ump, 1) == 0) journal_suspend(ump); } + FREE_LOCK(ump); + + vn_lock_pair(dvp, false, vp, false); + return (ERELOOKUP); } static void @@ -4742,7 +5051,6 @@ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_create: No addref structure present.")); } - softdep_prelink(dvp, NULL); FREE_LOCK(ITOUMP(dp)); } @@ -4777,7 +5085,6 @@ if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); - softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } @@ -4808,7 +5115,6 @@ if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); - softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } @@ -4858,7 +5164,6 @@ if (DOINGSUJ(dvp)) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &dotdotaddref->ja_ref, if_deps); - softdep_prelink(ITOV(dp), NULL); FREE_LOCK(ITOUMP(dp)); } @@ -4879,7 +5184,6 @@ ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); - softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } @@ -4900,7 +5204,6 @@ ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); - softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } @@ -8764,11 +9067,11 @@ if (MOUNTEDSUJ(mp)) { flags = DEPALLOC; jmvref = newjmvref(dp, de->d_ino, - dp->i_offset + (oldloc - base), - dp->i_offset + (newloc - base)); + I_OFFSET(dp) + (oldloc - base), + I_OFFSET(dp) + (newloc - base)); } - lbn = lblkno(ump->um_fs, dp->i_offset); - offset = blkoff(ump->um_fs, dp->i_offset); + lbn = lblkno(ump->um_fs, I_OFFSET(dp)); + offset = blkoff(ump->um_fs, I_OFFSET(dp)); oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); ACQUIRE_LOCK(ump); @@ -9280,7 +9583,7 @@ jremref = dotremref = dotdotremref = NULL; if (DOINGSUJ(dvp)) { if (isrmdir) { - jremref = newjremref(dirrem, dp, ip, dp->i_offset, + jremref = newjremref(dirrem, dp, ip, I_OFFSET(dp), ip->i_effnlink + 2); dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, 
ip->i_effnlink + 1); @@ -9288,12 +9591,12 @@ dp->i_effnlink + 1); dotdotremref->jr_state |= MKDIR_PARENT; } else - jremref = newjremref(dirrem, dp, ip, dp->i_offset, + jremref = newjremref(dirrem, dp, ip, I_OFFSET(dp), ip->i_effnlink + 1); } ACQUIRE_LOCK(ump); - lbn = lblkno(ump->um_fs, dp->i_offset); - offset = blkoff(ump->um_fs, dp->i_offset); + lbn = lblkno(ump->um_fs, I_OFFSET(dp)); + offset = blkoff(ump->um_fs, I_OFFSET(dp)); pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC, &pagedep); dirrem->dm_pagedep = pagedep; @@ -9304,7 +9607,7 @@ * the jremref is preserved for any potential diradd in this * location. This can not coincide with a rmdir. */ - if (dp->i_offset == DOTDOT_OFFSET) { + if (I_OFFSET(dp) == DOTDOT_OFFSET) { if (isrmdir) panic("newdirrem: .. directory change during remove?"); jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); @@ -9405,7 +9708,7 @@ mp = ITOVFS(dp); ump = VFSTOUFS(mp); - offset = blkoff(ump->um_fs, dp->i_offset); + offset = blkoff(ump->um_fs, I_OFFSET(dp)); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_directory_change called on non-softdep filesystem")); @@ -9508,7 +9811,7 @@ KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_directory_change: bad jaddref %p", jaddref)); - jaddref->ja_diroff = dp->i_offset; + jaddref->ja_diroff = I_OFFSET(dp); jaddref->ja_diradd = dap; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); @@ -9527,7 +9830,7 @@ * committed when need to move the dot and dotdot references to * this new name. */ - if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) + if (inodedep->id_mkdiradd && I_OFFSET(dp) != DOTDOT_OFFSET) merge_diradd(inodedep, dap); FREE_LOCK(ump); } @@ -12622,25 +12925,12 @@ * for details on possible races. */ FREE_LOCK(ump); - if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, - FFSV_FORCEINSMQ)) { - /* - * Unmount cannot proceed after unlock because - * caller must have called vn_start_write(). - */ - VOP_UNLOCK(vp); - error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, - &pvp, FFSV_FORCEINSMQ); - MPASS(VTOI(pvp)->i_mode != 0); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - if (VN_IS_DOOMED(vp)) { - if (error == 0) - vput(pvp); - error = ENOENT; - } - if (error != 0) - return (error); - } + error = get_parent_vp(vp, mp, parentino, NULL, NULL, NULL, + &pvp); + if (error == ERELOOKUP) + error = 0; + if (error != 0) + return (error); /* * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps * that are contained in direct blocks will be resolved by @@ -12964,9 +13254,11 @@ for (i = 0; i < DAHASHSZ; i++) { if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) continue; - if ((error = flush_pagedep_deps(vp, wk->wk_mp, - &pagedep->pd_diraddhd[i]))) { - BUF_NOREC(bp); + error = flush_pagedep_deps(vp, wk->wk_mp, + &pagedep->pd_diraddhd[i], bp); + if (error != 0) { + if (error != ERELOOKUP) + BUF_NOREC(bp); goto out_unlock; } } @@ -13200,10 +13492,11 @@ * Eliminate a pagedep dependency by flushing out all its diradd dependencies. 
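+ * The locked_bp argument added below is the buffer held locked by the
+ * caller; it is handed to get_parent_vp(), which may need to unlock
+ * it (requeueing the unfinished dependencies onto diraddhdp) to avoid
+ * deadlocking against the parent vnode lock, in which case ERELOOKUP
+ * is returned.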
*/ static int -flush_pagedep_deps(pvp, mp, diraddhdp) +flush_pagedep_deps(pvp, mp, diraddhdp, locked_bp) struct vnode *pvp; struct mount *mp; struct diraddhd *diraddhdp; + struct buf *locked_bp; { struct inodedep *inodedep; struct inoref *inoref; @@ -13270,10 +13563,10 @@ } if (dap->da_state & MKDIR_BODY) { FREE_LOCK(ump); - if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, - FFSV_FORCEINSMQ))) + error = get_parent_vp(pvp, mp, inum, locked_bp, + diraddhdp, &unfinished, &vp); + if (error != 0) break; - MPASS(VTOI(vp)->i_mode != 0); error = flush_newblk_dep(vp, mp, 0); /* * If we still have the dependency we might need to @@ -13335,10 +13628,10 @@ */ if (dap == LIST_FIRST(diraddhdp)) { FREE_LOCK(ump); - if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, - FFSV_FORCEINSMQ))) + error = get_parent_vp(pvp, mp, inum, locked_bp, + diraddhdp, &unfinished, &vp); + if (error != 0) break; - MPASS(VTOI(vp)->i_mode != 0); error = ffs_update(vp, 1); vput(vp); if (error) Index: sys/ufs/ffs/ffs_vfsops.c =================================================================== --- sys/ufs/ffs/ffs_vfsops.c +++ sys/ufs/ffs/ffs_vfsops.c @@ -1861,8 +1861,14 @@ #ifdef QUOTA qsyncvp(vp); #endif - if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0) - allerror = error; + for (;;) { + error = ffs_syncvnode(vp, waitfor, 0); + if (error == ERELOOKUP) + continue; + if (error != 0) + allerror = error; + break; + } vput(vp); } /* @@ -2001,6 +2007,9 @@ ip->i_nextclustercg = -1; ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2; ip->i_mode = 0; /* ensure error cases below throw away vnode */ +#ifdef DIAGNOSTIC + ufs_init_trackers(ip); +#endif #ifdef QUOTA { int i; Index: sys/ufs/ffs/ffs_vnops.c =================================================================== --- sys/ufs/ffs/ffs_vnops.c +++ sys/ufs/ffs/ffs_vnops.c @@ -253,7 +253,7 @@ struct buf *bp, *nbp; ufs_lbn_t lbn; int error, passes; - bool still_dirty, wait; + bool still_dirty, unlocked, wait; ip = VTOI(vp); ip->i_flag &= ~IN_NEEDSYNC; @@ -277,6 +277,7 @@ error = 0; passes = 0; wait = false; /* Always do an async pass first. */ + unlocked = false; lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1)); BO_LOCK(bo); loop: @@ -325,6 +326,26 @@ if (!LIST_EMPTY(&bp->b_dep) && (error = softdep_sync_buf(vp, bp, wait ? MNT_WAIT : MNT_NOWAIT)) != 0) { + /* + * Lock order conflict, buffer was already unlocked, + * and vnode possibly unlocked. + */ + if (error == ERELOOKUP) { + if (vp->v_data == NULL) + return (EBADF); + unlocked = true; + if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT && + (error = softdep_sync_metadata(vp)) != 0) { + if (ffs_fsfail_cleanup(ump, error)) + error = 0; + return (unlocked && error == 0 ? + ERELOOKUP : error); + } + /* Re-evaluate inode size */ + lbn = lblkno(ITOFS(ip), (ip->i_size + + ITOFS(ip)->fs_bsize - 1)); + goto next; + } /* I/O error. */ if (error != EBUSY) { BUF_UNLOCK(bp); @@ -361,9 +382,11 @@ if (waitfor != MNT_WAIT) { BO_UNLOCK(bo); if ((flags & NO_INO_UPDT) != 0) - return (0); - else - return (ffs_update(vp, 0)); + return (unlocked ? ERELOOKUP : 0); + error = ffs_update(vp, 0); + if (error == 0 && unlocked) + error = ERELOOKUP; + return (error); } /* Drain IO to see if we're done. 
 */
 	bufobj_wwait(bo, 0, 0);
@@ -419,6 +442,8 @@
 	} else if ((ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)) != 0) {
 		error = ffs_update(vp, 1);
 	}
+	if (error == 0 && unlocked)
+		error = ERELOOKUP;
 	return (error);
 }
@@ -434,16 +459,18 @@
 	struct vop_lock1_args /* {
 		struct vnode *a_vp;
 		int a_flags;
-		struct thread *a_td;
 		char *file;
 		int line;
 	} */ *ap;
 {
+	struct vnode *vp = ap->a_vp;
+#ifdef DIAGNOSTIC
+	struct inode *ip;
+#endif
+	int result;
 #ifndef NO_FFS_SNAPSHOT
-	struct vnode *vp;
 	int flags;
 	struct lock *lkp;
-	int result;
 
 	/*
	 * Adaptive spinning mixed with SU leads to trouble. use a giant hammer
@@ -456,7 +483,6 @@
 	case LK_SHARED:
 	case LK_UPGRADE:
 	case LK_EXCLUSIVE:
-		vp = ap->a_vp;
 		flags = ap->a_flags;
 		for (;;) {
 #ifdef DEBUG_VFS_LOCKS
@@ -483,28 +509,67 @@
 			flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
 			flags &= ~LK_INTERLOCK;
 		}
+#ifdef DIAGNOSTIC
+		switch (ap->a_flags & LK_TYPE_MASK) {
+		case LK_UPGRADE:
+		case LK_EXCLUSIVE:
+			if (result == 0 && vp->v_vnlock->lk_recurse == 0) {
+				ip = VTOI(vp);
+				if (ip != NULL)
+					ip->i_lock_gen++;
+			}
+		}
+#endif
 		break;
 	default:
+#ifdef DIAGNOSTIC
+		if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) {
+			ip = VTOI(vp);
+			if (ip != NULL)
+				ufs_unlock_tracker(ip);
+		}
+#endif
 		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
+		break;
 	}
-	return (result);
 #else
 	/*
 	 * See above for an explanation.
 	 */
 	if ((ap->a_flags & LK_NODDLKTREAT) != 0)
 		ap->a_flags |= LK_ADAPTIVE;
-	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
+#ifdef DIAGNOSTIC
+	if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) {
+		ip = VTOI(vp);
+		if (ip != NULL)
+			ufs_unlock_tracker(ip);
+	}
+#endif
+	result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
+#endif
+#ifdef DIAGNOSTIC
+	switch (ap->a_flags & LK_TYPE_MASK) {
+	case LK_UPGRADE:
+	case LK_EXCLUSIVE:
+		if (result == 0 && vp->v_vnlock->lk_recurse == 0) {
+			ip = VTOI(vp);
+			if (ip != NULL)
+				ip->i_lock_gen++;
+		}
+	}
+#endif
+	return (result);
 }
 
 #ifdef INVARIANTS
 static int
 ffs_unlock_debug(struct vop_unlock_args *ap)
 {
-	struct vnode *vp = ap->a_vp;
-	struct inode *ip = VTOI(vp);
+	struct vnode *vp;
+	struct inode *ip;
 
+	vp = ap->a_vp;
+	ip = VTOI(vp);
 	if (ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) {
 		if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
 			VI_LOCK(vp);
@@ -514,6 +579,11 @@
 			VI_UNLOCK(vp);
 		}
 	}
+#ifdef DIAGNOSTIC
+	if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && ip != NULL &&
+	    vp->v_vnlock->lk_recurse == 0)
+		ufs_unlock_tracker(ip);
+#endif
 	return (VOP_UNLOCK_APV(&ufs_vnodeops, ap));
 }
 #endif
Index: sys/ufs/ufs/inode.h
===================================================================
--- sys/ufs/ufs/inode.h
+++ sys/ufs/ufs/inode.h
@@ -44,12 +44,24 @@
 #include
 #include
 #include
+#ifdef DIAGNOSTIC
+#include <sys/stack.h>
+#endif
 
 /*
  * This must agree with the definition in <ufs/ufs/dir.h>.
  */
 #define	doff_t		int32_t
 
+#ifdef DIAGNOSTIC
+struct iown_tracker {
+	struct thread	*tr_owner;
+	struct stack	tr_st;
+	struct stack	tr_unlock;
+	int		tr_gen;
+};
+#endif
+
 /*
  * The inode is used to describe each active (or recently active) file in the
  * UFS filesystem. It is composed of two types of information. The first part
@@ -94,6 +106,12 @@
 	doff_t	  i_endoff;	/* End of useful stuff in directory. */
 	doff_t	  i_diroff;	/* Offset in dir, where we found last entry. */
 	doff_t	  i_offset;	/* Offset of free space in directory. */
+#ifdef DIAGNOSTIC
+	int	  i_lock_gen;
+	struct iown_tracker i_count_tracker;
+	struct iown_tracker i_endoff_tracker;
+	struct iown_tracker i_offset_tracker;
+#endif
 
 	int	  i_nextclustercg; /* last cg searched for cluster */
 
@@ -254,6 +272,35 @@
 	uint32_t ufid_ino;	/* File number (ino).
*/ uint32_t ufid_gen; /* Generation number. */ }; + +#ifdef DIAGNOSTIC +void ufs_init_trackers(struct inode *ip); +void ufs_unlock_tracker(struct inode *ip); + +doff_t ufs_get_i_offset(struct inode *ip, const char *file, int line); +void ufs_set_i_offset(struct inode *ip, doff_t off, const char *file, int line); +#define I_OFFSET(ip) ufs_get_i_offset(ip, __FILE__, __LINE__) +#define SET_I_OFFSET(ip, off) ufs_set_i_offset(ip, off, __FILE__, __LINE__) + +int32_t ufs_get_i_count(struct inode *ip, const char *file, int line); +void ufs_set_i_count(struct inode *ip, int32_t cnt, const char *file, int line); +#define I_COUNT(ip) ufs_get_i_count(ip, __FILE__, __LINE__) +#define SET_I_COUNT(ip, cnt) ufs_set_i_count(ip, cnt, __FILE__, __LINE__) + +doff_t ufs_get_i_endoff(struct inode *ip, const char *file, int line); +void ufs_set_i_endoff(struct inode *ip, doff_t off, const char *file, int line); +#define I_ENDOFF(ip) ufs_get_i_endoff(ip, __FILE__, __LINE__) +#define SET_I_ENDOFF(ip, off) ufs_set_i_endoff(ip, off, __FILE__, __LINE__) + +#else +#define I_OFFSET(ip) ((ip)->i_offset) +#define SET_I_OFFSET(ip, off) ((ip)->i_offset = (off)) +#define I_COUNT(ip) ((ip)->i_count) +#define SET_I_COUNT(ip, cnt) ((ip)->i_count = cnt) +#define I_ENDOFF(ip) ((ip)->i_endoff) +#define SET_I_ENDOFF(ip, off) ((ip)->i_endoff = off) +#endif + #endif /* _KERNEL */ #endif /* !_UFS_UFS_INODE_H_ */ Index: sys/ufs/ufs/ufs_inode.c =================================================================== --- sys/ufs/ufs/ufs_inode.c +++ sys/ufs/ufs/ufs_inode.c @@ -166,7 +166,8 @@ isize += ip->i_din2->di_extsize; if (ip->i_effnlink <= 0 && isize && !UFS_RDONLY(ip)) error = UFS_TRUNCATE(vp, (off_t)0, IO_EXT | IO_NORMAL, NOCRED); - if (ip->i_nlink <= 0 && ip->i_mode && !UFS_RDONLY(ip)) { + if (ip->i_nlink <= 0 && ip->i_mode != 0 && !UFS_RDONLY(ip) && + (vp->v_iflag & VI_OWEINACT) == 0) { #ifdef QUOTA if (!getinoquota(ip)) (void)chkiq(ip, -1, NOCRED, FORCE); @@ -207,10 +208,12 @@ * If we are done with the inode, reclaim it * so that it can be reused immediately. */ - if (ip->i_mode == 0) + if (ip->i_mode == 0 && (vp->v_iflag & VI_OWEINACT) == 0) vrecycle(vp); if (mp != NULL) vn_finished_secondary_write(mp); + if (error == ERELOOKUP) + error = 0; return (error); } Index: sys/ufs/ufs/ufs_lookup.c =================================================================== --- sys/ufs/ufs/ufs_lookup.c +++ sys/ufs/ufs/ufs_lookup.c @@ -66,6 +66,7 @@ #endif #include #include +#include #ifdef DIAGNOSTIC static int dirchk = 1; @@ -504,22 +505,22 @@ * dp->i_offset + dp->i_count. 
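+ * Under DIAGNOSTIC kernels these directory fields are accessed
+ * through the I_OFFSET()/SET_I_OFFSET() and I_COUNT()/SET_I_COUNT()
+ * macros used below, which record the setting thread and vnode lock
+ * generation and panic when a stale value is consumed.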
*/ if (slotstatus == NONE) { - dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ); - dp->i_count = 0; - enduseful = dp->i_offset; + SET_I_OFFSET(dp, roundup2(dp->i_size, DIRBLKSIZ)); + SET_I_COUNT(dp, 0); + enduseful = I_OFFSET(dp); } else if (nameiop == DELETE) { - dp->i_offset = slotoffset; - if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) - dp->i_count = 0; + SET_I_OFFSET(dp, slotoffset); + if ((I_OFFSET(dp) & (DIRBLKSIZ - 1)) == 0) + SET_I_COUNT(dp, 0); else - dp->i_count = dp->i_offset - prevoff; + SET_I_COUNT(dp, I_OFFSET(dp) - prevoff); } else { - dp->i_offset = slotoffset; - dp->i_count = slotsize; + SET_I_OFFSET(dp, slotoffset); + SET_I_COUNT(dp, slotsize); if (enduseful < slotoffset + slotsize) enduseful = slotoffset + slotsize; } - dp->i_endoff = roundup2(enduseful, DIRBLKSIZ); + SET_I_ENDOFF(dp, roundup2(enduseful, DIRBLKSIZ)); /* * We return with the directory locked, so that * the parameters we set up above will still be @@ -575,24 +576,32 @@ if (nameiop == DELETE && (flags & ISLASTCN)) { if (flags & LOCKPARENT) ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); - /* - * Return pointer to current entry in dp->i_offset, - * and distance past previous entry (if there - * is a previous entry in this block) in dp->i_count. - * Save directory inode pointer in ndp->ni_dvp for dirremove(). - * - * Technically we shouldn't be setting these in the - * WANTPARENT case (first lookup in rename()), but any - * lookups that will result in directory changes will - * overwrite these. - */ - dp->i_offset = i_offset; - if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) - dp->i_count = 0; - else - dp->i_count = dp->i_offset - prevoff; + + if (VOP_ISLOCKED(vdp) == LK_EXCLUSIVE) { + /* + * Return pointer to current entry in + * dp->i_offset, and distance past previous + * entry (if there is a previous entry in this + * block) in dp->i_count. + * + * We shouldn't be setting these in the + * WANTPARENT case (first lookup in rename()), but any + * lookups that will result in directory changes will + * overwrite these. + */ + SET_I_OFFSET(dp, i_offset); + if ((I_OFFSET(dp) & (DIRBLKSIZ - 1)) == 0) + SET_I_COUNT(dp, 0); + else + SET_I_COUNT(dp, I_OFFSET(dp) - prevoff); + } if (dd_ino != NULL) return (0); + + /* + * Save directory inode pointer in ndp->ni_dvp for + * dirremove(). + */ if ((error = VFS_VGET(vdp->v_mount, ino, LK_EXCLUSIVE, &tdp)) != 0) return (error); @@ -629,7 +638,7 @@ * Careful about locking second inode. * This can only occur if the target is ".". */ - dp->i_offset = i_offset; + SET_I_OFFSET(dp, i_offset); if (dp->i_number == ino) return (EISDIR); if (dd_ino != NULL) @@ -887,14 +896,14 @@ dp = VTOI(dvp); newentrysize = DIRSIZ(OFSFMT(dvp), dirp); - if (dp->i_count == 0) { + if (I_COUNT(dp) == 0) { /* * If dp->i_count is 0, then namei could find no * space in the directory. Here, dp->i_offset will * be on a directory block boundary and we will write the * new entry into a fresh block. 
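+	 * Note that the synchronous fsync that used to be issued here
+	 * when the fresh block was allocated in an indirect block has
+	 * moved to softdep_prelink() (its will_direnter argument),
+	 * which runs before the directory is extended.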
*/ - if (dp->i_offset & (DIRBLKSIZ - 1)) + if (I_OFFSET(dp) & (DIRBLKSIZ - 1)) panic("ufs_direnter: newblk"); flags = BA_CLRBUF; if (!DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)) @@ -907,28 +916,28 @@ } #endif old_isize = dp->i_size; - vnode_pager_setsize(dvp, (u_long)dp->i_offset + DIRBLKSIZ); - if ((error = UFS_BALLOC(dvp, (off_t)dp->i_offset, DIRBLKSIZ, + vnode_pager_setsize(dvp, (u_long)I_OFFSET(dp) + DIRBLKSIZ); + if ((error = UFS_BALLOC(dvp, (off_t)I_OFFSET(dp), DIRBLKSIZ, cr, flags, &bp)) != 0) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); vnode_pager_setsize(dvp, (u_long)old_isize); return (error); } - dp->i_size = dp->i_offset + DIRBLKSIZ; + dp->i_size = I_OFFSET(dp) + DIRBLKSIZ; DIP_SET(dp, i_size, dp->i_size); - dp->i_endoff = dp->i_size; + SET_I_ENDOFF(dp, dp->i_size); UFS_INODE_SET_FLAG(dp, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); dirp->d_reclen = DIRBLKSIZ; - blkoff = dp->i_offset & + blkoff = I_OFFSET(dp) & (VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_iosize - 1); bcopy((caddr_t)dirp, (caddr_t)bp->b_data + blkoff,newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) { - ufsdirhash_newblk(dp, dp->i_offset); - ufsdirhash_add(dp, dirp, dp->i_offset); + ufsdirhash_newblk(dp, I_OFFSET(dp)); + ufsdirhash_add(dp, dirp, I_OFFSET(dp)); ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff, - dp->i_offset); + I_OFFSET(dp)); } #endif if (DOINGSOFTDEP(dvp)) { @@ -944,7 +953,7 @@ (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } - if (softdep_setup_directory_add(bp, dp, dp->i_offset, + if (softdep_setup_directory_add(bp, dp, I_OFFSET(dp), dirp->d_ino, newdirbp, 1)) UFS_INODE_SET_FLAG(dp, IN_NEEDSYNC); if (newdirbp) @@ -952,27 +961,7 @@ bdwrite(bp); if ((dp->i_flag & IN_NEEDSYNC) == 0) return (UFS_UPDATE(dvp, 0)); - /* - * We have just allocated a directory block in an - * indirect block. We must prevent holes in the - * directory created if directory entries are - * written out of order. To accomplish this we - * fsync when we extend a directory into indirects. - * During rename it's not safe to drop the tvp lock - * so sync must be delayed until it is. - * - * This synchronous step could be removed if fsck and - * the kernel were taught to fill in sparse - * directories rather than panic. - */ - if (isrename) - return (0); - if (tvp != NULL) - VOP_UNLOCK(tvp); - (void) VOP_FSYNC(dvp, MNT_WAIT, td); - if (tvp != NULL) - vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); - return (error); + return (0); } if (DOINGASYNC(dvp)) { bdwrite(bp); @@ -1001,15 +990,15 @@ * * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. */ - if (dp->i_offset + dp->i_count > dp->i_size) { - dp->i_size = dp->i_offset + dp->i_count; + if (I_OFFSET(dp) + I_COUNT(dp) > dp->i_size) { + dp->i_size = I_OFFSET(dp) + I_COUNT(dp); DIP_SET(dp, i_size, dp->i_size); UFS_INODE_SET_FLAG(dp, IN_SIZEMOD | IN_MODIFIED); } /* * Get the block containing the space for the new directory entry. */ - error = UFS_BLKATOFF(dvp, (off_t)dp->i_offset, &dirbuf, &bp); + error = UFS_BLKATOFF(dvp, (off_t)I_OFFSET(dp), &dirbuf, &bp); if (error) { if (DOINGSOFTDEP(dvp) && newdirbp != NULL) bdwrite(newdirbp); @@ -1024,7 +1013,7 @@ ep = (struct direct *)dirbuf; dsize = ep->d_ino ? DIRSIZ(OFSFMT(dvp), ep) : 0; spacefree = ep->d_reclen - dsize; - for (loc = ep->d_reclen; loc < dp->i_count; ) { + for (loc = ep->d_reclen; loc < I_COUNT(dp); ) { nep = (struct direct *)(dirbuf + loc); /* Trim the existing slot (NB: dsize may be zero). 
*/ @@ -1052,8 +1041,8 @@ #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_move(dp, nep, - dp->i_offset + ((char *)nep - dirbuf), - dp->i_offset + ((char *)ep - dirbuf)); + I_OFFSET(dp) + ((char *)nep - dirbuf), + I_OFFSET(dp) + ((char *)ep - dirbuf)); #endif if (DOINGSOFTDEP(dvp)) softdep_change_directoryentry_offset(bp, dp, dirbuf, @@ -1094,19 +1083,19 @@ #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL && (ep->d_ino == 0 || dirp->d_reclen == spacefree)) - ufsdirhash_add(dp, dirp, dp->i_offset + ((char *)ep - dirbuf)); + ufsdirhash_add(dp, dirp, I_OFFSET(dp) + ((char *)ep - dirbuf)); #endif bcopy((caddr_t)dirp, (caddr_t)ep, (u_int)newentrysize); #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, dirbuf - - (dp->i_offset & (DIRBLKSIZ - 1)), - rounddown2(dp->i_offset, DIRBLKSIZ)); + (I_OFFSET(dp) & (DIRBLKSIZ - 1)), + rounddown2(I_OFFSET(dp), DIRBLKSIZ)); #endif if (DOINGSOFTDEP(dvp)) { (void) softdep_setup_directory_add(bp, dp, - dp->i_offset + (caddr_t)ep - dirbuf, + I_OFFSET(dp) + (caddr_t)ep - dirbuf, dirp->d_ino, newdirbp, 0); if (newdirbp != NULL) bdwrite(newdirbp); @@ -1128,10 +1117,10 @@ * lock on the newly entered node. */ if (isrename == 0 && error == 0 && - dp->i_endoff && dp->i_endoff < dp->i_size) { + I_ENDOFF(dp) != 0 && I_ENDOFF(dp) < dp->i_size) { if (tvp != NULL) VOP_UNLOCK(tvp); - error = UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, + error = UFS_TRUNCATE(dvp, (off_t)I_ENDOFF(dp), IO_NORMAL | (DOINGASYNC(dvp) ? 0 : IO_SYNC), cr); if (error != 0) vn_printf(dvp, @@ -1139,7 +1128,7 @@ error); #ifdef UFS_DIRHASH if (error == 0 && dp->i_dirhash != NULL) - ufsdirhash_dirtrunc(dp, dp->i_endoff); + ufsdirhash_dirtrunc(dp, I_ENDOFF(dp)); #endif error = 0; if (tvp != NULL) @@ -1190,9 +1179,9 @@ } } if (flags & DOWHITEOUT) - offset = dp->i_offset; + offset = I_OFFSET(dp); else - offset = dp->i_offset - dp->i_count; + offset = I_OFFSET(dp) - I_COUNT(dp); if ((error = UFS_BLKATOFF(dvp, offset, (char **)&ep, &bp)) != 0) { if (ip) { ip->i_effnlink++; @@ -1216,7 +1205,7 @@ goto out; } /* Set 'rep' to the entry being removed. */ - if (dp->i_count == 0) + if (I_COUNT(dp) == 0) rep = ep; else rep = (struct direct *)((char *)ep + ep->d_reclen); @@ -1226,7 +1215,7 @@ * that `ep' is the previous entry when dp->i_count != 0. */ if (dp->i_dirhash != NULL) - ufsdirhash_remove(dp, rep, dp->i_offset); + ufsdirhash_remove(dp, rep, I_OFFSET(dp)); #endif if (ip && rep->d_ino != ip->i_number) panic("ufs_dirremove: ip %ju does not match dirent ino %ju\n", @@ -1240,7 +1229,7 @@ rep->d_type = 0; rep->d_ino = 0; - if (dp->i_count != 0) { + if (I_COUNT(dp) != 0) { /* * Collapse new free space into previous entry. */ @@ -1250,8 +1239,8 @@ #ifdef UFS_DIRHASH if (dp->i_dirhash != NULL) ufsdirhash_checkblock(dp, (char *)ep - - ((dp->i_offset - dp->i_count) & (DIRBLKSIZ - 1)), - rounddown2(dp->i_offset, DIRBLKSIZ)); + ((I_OFFSET(dp) - I_COUNT(dp)) & (DIRBLKSIZ - 1)), + rounddown2(I_OFFSET(dp), DIRBLKSIZ)); #endif out: error = 0; @@ -1313,7 +1302,7 @@ UFS_INODE_SET_FLAG(oip, IN_CHANGE); } - error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp); + error = UFS_BLKATOFF(vdp, (off_t)I_OFFSET(dp), (char **)&ep, &bp); if (error == 0 && ep->d_namlen == 2 && ep->d_name[1] == '.' && ep->d_name[0] == '.' 
&& ep->d_ino != oip->i_number) { brelse(bp); @@ -1522,3 +1511,115 @@ vput(vp); return (error); } + +#ifdef DIAGNOSTIC +static void +ufs_assert_inode_offset_owner(struct inode *ip, struct iown_tracker *tr, + const char *name, const char *file, int line) +{ + char msg[128]; + + snprintf(msg, sizeof(msg), "at %s@%d", file, line); + ASSERT_VOP_ELOCKED(ITOV(ip), msg); + MPASS((ip->i_mode & IFMT) == IFDIR); + if (curthread == tr->tr_owner && ip->i_lock_gen == tr->tr_gen) + return; + printf("locked at\n"); + stack_print(&tr->tr_st); + printf("unlocked at\n"); + stack_print(&tr->tr_unlock); + panic("%s ip %p %jd offset owner %p %d gen %d " + "curthread %p %d gen %d at %s@%d\n", + name, ip, (uintmax_t)ip->i_number, tr->tr_owner, + tr->tr_owner->td_tid, tr->tr_gen, + curthread, curthread->td_tid, ip->i_lock_gen, + file, line); +} + +static void +ufs_set_inode_offset_owner(struct inode *ip, struct iown_tracker *tr, + const char *file, int line) +{ + char msg[128]; + + snprintf(msg, sizeof(msg), "at %s@%d", file, line); + ASSERT_VOP_ELOCKED(ITOV(ip), msg); + MPASS((ip->i_mode & IFMT) == IFDIR); + tr->tr_owner = curthread; + tr->tr_gen = ip->i_lock_gen; + stack_save(&tr->tr_st); +} + +static void +ufs_init_one_tracker(struct iown_tracker *tr) +{ + tr->tr_owner = NULL; + stack_zero(&tr->tr_st); +} + +void +ufs_init_trackers(struct inode *ip) +{ + ufs_init_one_tracker(&ip->i_offset_tracker); + ufs_init_one_tracker(&ip->i_count_tracker); + ufs_init_one_tracker(&ip->i_endoff_tracker); +} + +void +ufs_unlock_tracker(struct inode *ip) +{ + if (ip->i_count_tracker.tr_gen == ip->i_lock_gen) + stack_save(&ip->i_count_tracker.tr_unlock); + if (ip->i_offset_tracker.tr_gen == ip->i_lock_gen) + stack_save(&ip->i_offset_tracker.tr_unlock); + if (ip->i_endoff_tracker.tr_gen == ip->i_lock_gen) + stack_save(&ip->i_endoff_tracker.tr_unlock); + ip->i_lock_gen++; +} + +doff_t +ufs_get_i_offset(struct inode *ip, const char *file, int line) +{ + ufs_assert_inode_offset_owner(ip, &ip->i_offset_tracker, "i_offset", + file, line); + return (ip->i_offset); +} + +void +ufs_set_i_offset(struct inode *ip, doff_t off, const char *file, int line) +{ + ufs_set_inode_offset_owner(ip, &ip->i_offset_tracker, file, line); + ip->i_offset = off; +} + +int32_t +ufs_get_i_count(struct inode *ip, const char *file, int line) +{ + ufs_assert_inode_offset_owner(ip, &ip->i_count_tracker, "i_count", + file, line); + return (ip->i_count); +} + +void +ufs_set_i_count(struct inode *ip, int32_t cnt, const char *file, int line) +{ + ufs_set_inode_offset_owner(ip, &ip->i_count_tracker, file, line); + ip->i_count = cnt; +} + +doff_t +ufs_get_i_endoff(struct inode *ip, const char *file, int line) +{ + ufs_assert_inode_offset_owner(ip, &ip->i_endoff_tracker, "i_endoff", + file, line); + return (ip->i_endoff); +} + +void +ufs_set_i_endoff(struct inode *ip, doff_t off, const char *file, int line) +{ + ufs_set_inode_offset_owner(ip, &ip->i_endoff_tracker, file, line); + ip->i_endoff = off; +} + +#endif Index: sys/ufs/ufs/ufs_vnops.c =================================================================== --- sys/ufs/ufs/ufs_vnops.c +++ sys/ufs/ufs/ufs_vnops.c @@ -1006,10 +1006,16 @@ td = curthread; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || - (VTOI(dvp)->i_flags & APPEND)) { - error = EPERM; - goto out; + (VTOI(dvp)->i_flags & APPEND)) + return (EPERM); + if (DOINGSOFTDEP(dvp)) { + error = softdep_prelink(dvp, vp, true); + if (error != 0) { + MPASS(error == ERELOOKUP); + return (error); + } } + #ifdef UFS_GJOURNAL 
ufs_gjournal_orphan(vp); #endif @@ -1030,7 +1036,6 @@ (void) VOP_FSYNC(dvp, MNT_WAIT, td); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } -out: return (error); } @@ -1067,6 +1072,15 @@ if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_link: no name"); #endif + + if (DOINGSOFTDEP(tdvp)) { + error = softdep_prelink(tdvp, vp, true); + if (error != 0) { + MPASS(error == ERELOOKUP); + return (error); + } + } + if (VTOI(tdvp)->i_effnlink < 2) { print_bad_link_count("ufs_link", tdvp); error = EINVAL; @@ -1089,6 +1103,7 @@ error = EPERM; goto out; } + ip->i_effnlink++; ip->i_nlink++; DIP_SET(ip, i_nlink, ip->i_nlink); @@ -1129,6 +1144,15 @@ struct direct newdir; int error = 0; + if (DOINGSOFTDEP(dvp) && (ap->a_flags == CREATE || + ap->a_flags == DELETE)) { + error = softdep_prelink(dvp, NULL, true); + if (error != 0) { + MPASS(error == ERELOOKUP); + return (error); + } + } + switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ @@ -1338,6 +1362,18 @@ goto relock; } } + + if (DOINGSOFTDEP(fdvp)) { + error = softdep_prerename(fdvp, fvp, tdvp, tvp); + if (error != 0) { + if (error == ERELOOKUP) { + atomic_add_int(&rename_restarts, 1); + goto relock; + } + goto releout; + } + } + fdp = VTOI(fdvp); fip = VTOI(fvp); tdp = VTOI(tdvp); @@ -1481,9 +1517,9 @@ if (error) goto bad; /* Setup tdvp for directory compaction if needed. */ - if (tdp->i_count && tdp->i_endoff && - tdp->i_endoff < tdp->i_size) - endoff = tdp->i_endoff; + if (I_COUNT(tdp) != 0 && I_ENDOFF(tdp) != 0 && + I_ENDOFF(tdp) < tdp->i_size) + endoff = I_ENDOFF(tdp); } else { if (ITODEV(tip) != ITODEV(tdp) || ITODEV(tip) != ITODEV(fip)) panic("ufs_rename: EXDEV"); @@ -1611,7 +1647,7 @@ } else if (DOINGSUJ(tdvp)) /* Journal must account for each new link. */ softdep_setup_dotdot_link(tdp, fip); - fip->i_offset = mastertemplate.dot_reclen; + SET_I_OFFSET(fip, mastertemplate.dot_reclen); ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0); cache_purge(fdvp); } @@ -1649,8 +1685,10 @@ * are no longer needed. */ if (error == 0 && endoff != 0) { - error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | - (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred); + do { + error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | + (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred); + } while (error == ERELOOKUP); if (error != 0 && !ffs_fsfail_cleanup(VFSTOUFS(mp), error)) vn_printf(tdvp, "ufs_rename: failed to truncate, error %d\n", @@ -1668,8 +1706,11 @@ */ error = 0; } - if (error == 0 && tdp->i_flag & IN_NEEDSYNC) - error = VOP_FSYNC(tdvp, MNT_WAIT, td); + if (error == 0 && tdp->i_flag & IN_NEEDSYNC) { + do { + error = VOP_FSYNC(tdvp, MNT_WAIT, td); + } while (error == ERELOOKUP); + } vput(tdvp); return (error); @@ -1918,6 +1959,7 @@ } dmode = vap->va_mode & 0777; dmode |= IFDIR; + /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. 
The entry is @@ -1928,6 +1970,15 @@ error = EINVAL; goto out; } + + if (DOINGSOFTDEP(dvp)) { + error = softdep_prelink(dvp, NULL, true); + if (error != 0) { + MPASS(error == ERELOOKUP); + return (error); + } + } + error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; @@ -2184,6 +2235,14 @@ error = EINVAL; goto out; } + if (DOINGSOFTDEP(dvp)) { + error = softdep_prelink(dvp, vp, false); + if (error != 0) { + MPASS(error == ERELOOKUP); + return (error); + } + } + #ifdef UFS_GJOURNAL ufs_gjournal_orphan(vp); #endif @@ -2703,6 +2762,13 @@ print_bad_link_count(callfunc, dvp); return (EINVAL); } + if (DOINGSOFTDEP(dvp)) { + error = softdep_prelink(dvp, NULL, true); + if (error != 0) { + MPASS(error == ERELOOKUP); + return (error); + } + } error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) return (error);
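
For reference, the consumption pattern that the syscall-level hunks above all
follow can be condensed into one sketch. This is not part of the patch;
kern_example() and VOP_EXAMPLE() are hypothetical stand-ins for the real
syscall helpers (kern_mkdirat() and friends) and for any VOP that may now
fail with ERELOOKUP:

int
kern_example(struct thread *td, const char *path)
{
	struct nameidata nd;
	struct mount *mp;
	int error;

restart:
	NDINIT(&nd, CREATE, LOCKPARENT | AUDITVNODE1, UIO_SYSSPACE,
	    path, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	if ((error = vn_start_write(nd.ni_dvp, &mp, V_WAIT | PCATCH)) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		return (error);
	}
	/* Hypothetical VOP; may return ERELOOKUP after dropping locks. */
	error = VOP_EXAMPLE(nd.ni_dvp);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	vn_finished_write(mp);
	if (error == ERELOOKUP) {
		/* The filesystem dropped our locks; redo the lookup. */
		goto restart;
	}
	return (error);
}

The key invariant is visible in every hunk above: ERELOOKUP is acted upon
only after all references and locks from the failed attempt have been
released, and the subr_syscall.c assertion guarantees that no syscall ever
lets ERELOOKUP escape to userspace.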