diff --git a/sbin/fsck_ffs/dir.c b/sbin/fsck_ffs/dir.c
--- a/sbin/fsck_ffs/dir.c
+++ b/sbin/fsck_ffs/dir.c
@@ -88,12 +88,102 @@
 			if (inoinfo(inp->i_parent)->ino_state == DFOUND &&
 			    INO_IS_DUNFOUND(inp->i_number)) {
 				inoinfo(inp->i_number)->ino_state = DFOUND;
+				check_dirdepth(inp);
 				change++;
 			}
 		}
 	} while (change > 0);
 }
 
+/*
+ * Check that the recorded depth of the directory is correct.
+ *
+ * The directory described by inp should be exactly one level deeper
+ * than its parent, inp->i_parent. When it is not, the depth is fixed
+ * in both the inoinfo cache and the on-disk inode unless the operator
+ * declines. Reports a fatal error if the parent has no inoinfo entry.
+ */
+void
+check_dirdepth(struct inoinfo *inp)
+{
+	struct inoinfo *parentinp;
+	struct inode ip;
+	union dinode *dp;
+	int saveresolved;
+	static int updateasked, dirdepthupdate;
+
+	if ((parentinp = getinoinfo(inp->i_parent)) == NULL) {
+		pfatal("check_dirdepth: UNKNOWN PARENT DIR");
+		return;
+	}
+	/*
+	 * If depth is correct, nothing to do.
+	 */
+	if (parentinp->i_depth + 1 == inp->i_depth)
+		return;
+	/*
+	 * Only the root inode should have depth of 0, so if any other
+	 * directory has a depth of 0 then this is an old filesystem
+	 * that has not been tracking directory depth. Ask just once
+	 * whether it should start tracking directory depth.
+	 */
+	if (inp->i_depth == 0 && updateasked == 0) {
+		updateasked = 1;
+		if (preen) {
+			pwarn("UPDATING FILESYSTEM TO TRACK DIRECTORY DEPTH\n");
+			dirdepthupdate = 1;
+		} else {
+			/*
+			 * The file system can be marked clean even if
+			 * a directory does not have the right depth.
+			 * Hence, resolved should not be cleared when
+			 * the filesystem does not update directory depths.
+			 */
+			saveresolved = resolved;
+			dirdepthupdate =
+			    reply("UPDATE FILESYSTEM TO TRACK DIRECTORY DEPTH");
+			resolved = saveresolved;
+		}
+	}
+	/*
+	 * If we are not converting, nothing more to do.
+	 */
+	if (inp->i_depth == 0 && dirdepthupdate == 0)
+		return;
+	/*
+	 * Individual directory at wrong depth. Report it and correct if
+	 * in preen mode or ask if in interactive mode. Note that if a
+	 * directory is renamed to a new location that is at a different
+	 * level in the tree, its depth will be recalculated, but none of
+	 * the directories that it contains will be updated. Thus it is
+	 * not unexpected to find directories with incorrect depths. No
+	 * operational harm will come from this though new directory
+	 * placement in the subtree may not be as optimal until the depths
+	 * of the affected directories are corrected.
+	 *
+	 * To avoid much spurious output on otherwise clean filesystems
+	 * we only generate detailed output when the debug flag is given.
+	 */
+	ginode(inp->i_number, &ip);
+	dp = ip.i_dp;
+	if (inp->i_depth != 0 && debug) {
+		pwarn("DIRECTORY");
+		prtinode(&ip);
+		printf(" DEPTH %u SHOULD BE %u", inp->i_depth,
+		    parentinp->i_depth + 1);
+		if (preen == 0 && reply("ADJUST") == 0) {
+			irelse(&ip);
+			return;
+		}
+		if (preen)
+			printf(" (ADJUSTED)\n");
+	}
+	inp->i_depth = parentinp->i_depth + 1;
+	DIP_SET(dp, di_dirdepth, inp->i_depth);
+	inodirty(&ip);
+	irelse(&ip);
+}
+
 /*
  * Scan each entry in a directory block.
  */
@@ -471,7 +561,7 @@
 {
 	struct inode ip;
 	union dinode *dp;
-	int lostdir;
+	int lostdir, depth;
 	ino_t oldlfdir;
 	struct inoinfo *inp;
 	struct inodesc idesc;
@@ -546,7 +636,7 @@
 			irelse(&ip);
 			return (0);
 		}
-		if ((changeino(UFS_ROOTINO, lfname, lfdir) & ALTERED) == 0) {
+		if ((changeino(UFS_ROOTINO, lfname, lfdir, 1) & ALTERED) == 0) {
 			pfatal("SORRY. CANNOT CREATE lost+found DIRECTORY\n\n");
 			irelse(&ip);
 			return (0);
@@ -575,7 +665,8 @@
 	}
 	inoinfo(orphan)->ino_linkcnt--;
 	if (lostdir) {
-		if ((changeino(orphan, "..", lfdir) & ALTERED) == 0 &&
+		depth = DIP(dp, di_dirdepth) + 1;
+		if ((changeino(orphan, "..", lfdir, depth) & ALTERED) == 0 &&
 		    parentdir != (ino_t)-1)
 			(void)makeentry(orphan, lfdir, "..");
 		DIP_SET(dp, di_nlink, DIP(dp, di_nlink) + 1);
@@ -607,7 +698,7 @@
  * fix an entry in a directory.
  */
 int
-changeino(ino_t dir, const char *name, ino_t newnum)
+changeino(ino_t dir, const char *name, ino_t newnum, int depth)
 {
 	struct inodesc idesc;
 	struct inode ip;
@@ -621,7 +712,10 @@
 	idesc.id_name = strdup(name);
 	idesc.id_parent = newnum;	/* new value for name */
 	ginode(dir, &ip);
-	error = ckinode(ip.i_dp, &idesc);
+	if (((error = ckinode(ip.i_dp, &idesc)) & ALTERED) && newnum != 0) {
+		DIP_SET(ip.i_dp, di_dirdepth, depth);
+		getinoinfo(dir)->i_depth = depth;
+	}
 	free(idesc.id_name);
 	irelse(&ip);
 	return (error);
@@ -815,8 +909,8 @@
 	struct inode ip;
 	union dinode *dp;
 	struct bufarea *bp;
-	struct inoinfo *inp;
 	struct dirtemplate *dirp;
+	struct inoinfo *inp, *parentinp;
 
 	ino = allocino(request, IFDIR|mode);
 	if (ino == 0)
@@ -859,6 +953,12 @@
 	inp->i_parent = parent;
 	inp->i_dotdot = parent;
 	inp->i_flags |= INFO_NEW;
+	if ((parentinp = getinoinfo(inp->i_parent)) == NULL) {
+		pfatal("allocdir: UNKNOWN PARENT DIR");
+	} else {
+		inp->i_depth = parentinp->i_depth + 1;
+		DIP_SET(dp, di_dirdepth, inp->i_depth);
+	}
 	inoinfo(ino)->ino_type = DT_DIR;
 	inoinfo(ino)->ino_state = inoinfo(parent)->ino_state;
 	if (inoinfo(ino)->ino_state == DSTATE) {
diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h
--- a/sbin/fsck_ffs/fsck.h
+++ b/sbin/fsck_ffs/fsck.h
@@ -309,6 +309,7 @@
 	ino_t	i_parent;		/* inode number of parent */
 	ino_t	i_dotdot;		/* inode number of `..' */
 	size_t	i_isize;		/* size of inode */
+	u_int	i_depth;		/* depth of directory from root */
 	u_int	i_flags;		/* flags, see below */
 	u_int	i_numblks;		/* size of block array in bytes */
 	ufs2_daddr_t i_blks[1];		/* actually longer */
@@ -462,9 +463,10 @@
 void		catchquit(int);
 void		cgdirty(struct bufarea *);
 struct bufarea *cglookup(int cg);
-int		changeino(ino_t dir, const char *name, ino_t newnum);
+int		changeino(ino_t dir, const char *name, ino_t newnum, int depth);
 void		check_blkcnt(struct inode *ip);
 int		check_cgmagic(int cg, struct bufarea *cgbp, int requestrebuild);
+void		check_dirdepth(struct inoinfo *inp);
 int		chkrange(ufs2_daddr_t blk, int cnt);
 void		ckfini(int markclean);
 int		ckinode(union dinode *dp, struct inodesc *);
diff --git a/sbin/fsck_ffs/inode.c b/sbin/fsck_ffs/inode.c
--- a/sbin/fsck_ffs/inode.c
+++ b/sbin/fsck_ffs/inode.c
@@ -1135,6 +1135,7 @@
 	inp->i_dotdot = (ino_t)0;
 	inp->i_number = inumber;
 	inp->i_isize = DIP(dp, di_size);
+	inp->i_depth = DIP(dp, di_dirdepth);
 	inp->i_numblks = blks;
 	for (i = 0; i < MIN(blks, UFS_NDADDR); i++)
 		inp->i_blks[i] = DIP(dp, di_db[i]);
diff --git a/sbin/fsck_ffs/pass1.c b/sbin/fsck_ffs/pass1.c
--- a/sbin/fsck_ffs/pass1.c
+++ b/sbin/fsck_ffs/pass1.c
@@ -388,14 +388,15 @@
 	n_files++;
 	inoinfo(inumber)->ino_linkcnt = DIP(dp, di_nlink);
 	if (mode == IFDIR) {
-		if (DIP(dp, di_size) == 0)
+		if (DIP(dp, di_size) == 0) {
 			inoinfo(inumber)->ino_state = DCLEAR;
-		else if (DIP(dp, di_nlink) <= 0)
+		} else if (DIP(dp, di_nlink) <= 0) {
 			inoinfo(inumber)->ino_state = DZLINK;
-		else
+		} else {
 			inoinfo(inumber)->ino_state = DSTATE;
-		cacheino(dp, inumber);
-		countdirs++;
+			cacheino(dp, inumber);
+			countdirs++;
+		}
 	} else if (DIP(dp, di_nlink) <= 0)
 		inoinfo(inumber)->ino_state = FZLINK;
 	else
diff --git a/sbin/fsck_ffs/pass2.c b/sbin/fsck_ffs/pass2.c
--- a/sbin/fsck_ffs/pass2.c
+++ b/sbin/fsck_ffs/pass2.c
@@ -210,8 +210,10 @@
 		if (inp->i_parent == 0 || inp->i_isize == 0)
 			continue;
 		if (inoinfo(inp->i_parent)->ino_state == DFOUND &&
-		    INO_IS_DUNFOUND(inp->i_number))
+		    INO_IS_DUNFOUND(inp->i_number)) {
 			inoinfo(inp->i_number)->ino_state = DFOUND;
+			check_dirdepth(inp);
+		}
 		if (inp->i_dotdot == inp->i_parent ||
 		    inp->i_dotdot == (ino_t)-1)
 			continue;
@@ -271,7 +273,8 @@
 		inoinfo(inp->i_dotdot)->ino_linkcnt++;
 		inoinfo(inp->i_parent)->ino_linkcnt--;
 		inp->i_dotdot = inp->i_parent;
-		(void)changeino(inp->i_number, "..", inp->i_parent);
+		(void)changeino(inp->i_number, "..", inp->i_parent,
+		    getinoinfo(inp->i_parent)->i_depth + 1);
 	}
 	/*
 	 * Mark all the directories that can be found from the root.
@@ -548,10 +551,12 @@
 		case DFOUND:
 			inp = getinoinfo(dirp->d_ino);
 			if (idesc->id_entryno > 2) {
-				if (inp->i_parent == 0)
+				if (inp->i_parent == 0) {
 					inp->i_parent = idesc->id_number;
-				else if ((n = fix_extraneous(inp, idesc)) == 1)
+					check_dirdepth(inp);
+				} else if ((n = fix_extraneous(inp, idesc))) {
 					break;
+				}
 			}
 			/* FALLTHROUGH */
 
diff --git a/sbin/fsck_ffs/pass3.c b/sbin/fsck_ffs/pass3.c
--- a/sbin/fsck_ffs/pass3.c
+++ b/sbin/fsck_ffs/pass3.c
@@ -74,7 +74,7 @@
 		if (inp->i_number == UFS_ROOTINO ||
 		    (inp->i_parent != 0 && !S_IS_DUNFOUND(state)))
 			continue;
-		if (state == DCLEAR)
+		if (state == DCLEAR || state == DZLINK)
 			continue;
 		/*
 		 * If we are running with soft updates and we come
@@ -102,6 +102,7 @@
 				inoinfo(lfdir)->ino_linkcnt--;
 			}
 			inoinfo(orphan)->ino_state = DFOUND;
+			check_dirdepth(inp);
 			propagate();
 			continue;
 		}
@@ -127,6 +128,7 @@
 		}
 		irelse(&ip);
 		inoinfo(orphan)->ino_state = DFOUND;
+		check_dirdepth(inp);
 		propagate();
 	}
 }
diff --git a/sbin/fsdb/fsdb.c b/sbin/fsdb/fsdb.c
--- a/sbin/fsdb/fsdb.c
+++ b/sbin/fsdb/fsdb.c
@@ -781,7 +781,7 @@
 	if (!checkactivedir())
 		return 1;
 
-	rval = changeino(curinum, argv[1], 0);
+	rval = changeino(curinum, argv[1], 0, 0);
 	if (rval & ALTERED) {
 		printf("Name `%s' removed\n", argv[1]);
 		return 0;
diff --git a/sbin/newfs/mkfs.c b/sbin/newfs/mkfs.c
--- a/sbin/newfs/mkfs.c
+++ b/sbin/newfs/mkfs.c
@@ -915,8 +915,9 @@
 			    alloc(sblock.fs_fsize, node.dp1.di_mode);
 			node.dp1.di_blocks =
 			    btodb(fragroundup(&sblock, node.dp1.di_size));
-			wtfs(fsbtodb(&sblock, node.dp1.di_db[0]),
-				sblock.fs_fsize, iobuf);
+			node.dp1.di_dirdepth = 1;
+			wtfs(fsbtodb(&sblock, node.dp1.di_db[0]),
+			    sblock.fs_fsize, iobuf);
 			iput(&node, UFS_ROOTINO + 1);
 		}
 	} else {
@@ -951,8 +952,9 @@
 			    alloc(sblock.fs_fsize, node.dp2.di_mode);
 			node.dp2.di_blocks =
 			    btodb(fragroundup(&sblock, node.dp2.di_size));
-			wtfs(fsbtodb(&sblock, node.dp2.di_db[0]),
-				sblock.fs_fsize, iobuf);
+			node.dp2.di_dirdepth = 1;
+			wtfs(fsbtodb(&sblock, node.dp2.di_db[0]),
+			    sblock.fs_fsize, iobuf);
 			iput(&node, UFS_ROOTINO + 1);
 		}
 	}
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -1179,6 +1179,8 @@
 	}
 	ip->i_flags = 0;
 	DIP_SET(ip, i_flags, 0);
+	if ((mode & IFMT) == IFDIR)
+		DIP_SET(ip, i_dirdepth, DIP(pip, i_dirdepth) + 1);
 	/*
 	 * Set up a new generation number for this inode.
 	 */
@@ -1238,10 +1240,10 @@
 ffs_dirpref(struct inode *pip)
 {
 	struct fs *fs;
-	int cg, prefcg, dirsize, cgsize;
+	int cg, prefcg, curcg, dirsize, cgsize;
+	int depth, range, start, end, numdirs, power, numerator, denominator;
 	u_int avgifree, avgbfree, avgndir, curdirsize;
 	u_int minifree, minbfree, maxndir;
-	u_int mincg, minndir;
 	u_int maxcontigdirs;
 
 	mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED);
@@ -1252,35 +1254,58 @@
 	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
 
 	/*
-	 * Force allocation in another cg if creating a first level dir.
-	 */
-	ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref");
-	if (ITOV(pip)->v_vflag & VV_ROOT) {
-		prefcg = arc4random() % fs->fs_ncg;
-		mincg = prefcg;
-		minndir = fs->fs_ipg;
-		for (cg = prefcg; cg < fs->fs_ncg; cg++)
-			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
-			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
-			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
-				mincg = cg;
-				minndir = fs->fs_cs(fs, cg).cs_ndir;
-			}
-		for (cg = 0; cg < prefcg; cg++)
-			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
-			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
-			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
-				mincg = cg;
-				minndir = fs->fs_cs(fs, cg).cs_ndir;
-			}
-		return ((ino_t)(fs->fs_ipg * mincg));
-	}
+	 * Select a preferred cylinder group to place a new directory.
+	 * If we are near the root of the filesystem we aim to spread
+	 * them out as much as possible. As we descend deeper from the
+	 * root we cluster them closer together around their parent as
+	 * we expect them to be more closely interactive. Higher-level
+	 * directories like usr/src/sys and usr/src/bin should be
+	 * separated while the directories in these areas are more
+	 * likely to be accessed together so should be closer.
+	 *
+	 * We pick a range of cylinder groups around the cylinder group
+	 * of the directory in which we are being created. The size of
+	 * the range for our search is based on our depth from the root
+	 * of our filesystem. We then probe that range based on how many
+	 * directories are already present. The first new directory is at
+	 * 1/2 (middle) of the range; the second is in the first 1/4 of the
+	 * range, then at 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, 3/16, 5/16, etc.
+	 */
+	/*
+	 * The depth comes from the on-disk inode and is not bounded, but
+	 * shifting an int by 31 or more bits is undefined. Cap the shift
+	 * count; a range divided by 2^30 cannot usefully shrink further.
+	 */
+	depth = min(DIP(pip, i_dirdepth), 30);
+	range = fs->fs_ncg / (1 << depth);
+	curcg = ino_to_cg(fs, pip->i_number);
+	start = curcg - (range / 2);
+	if (start < 0)
+		start += fs->fs_ncg;
+	end = curcg + (range / 2);
+	if (end >= fs->fs_ncg)
+		end -= fs->fs_ncg;
+	numdirs = pip->i_effnlink - 1;
+	power = fls(numdirs);
+	numerator = (numdirs & ~(1 << (power - 1))) * 2 + 1;
+	denominator = 1 << power;
+	prefcg = (curcg - (range / 2) + (range * numerator / denominator));
+	if (prefcg < 0)
+		prefcg += fs->fs_ncg;
+	if (prefcg >= fs->fs_ncg)
+		prefcg -= fs->fs_ncg;
+	/*
+	 * If this filesystem is not tracking directory depths,
+	 * revert to the old algorithm.
+	 */
+	if (depth == 0 && pip->i_number != UFS_ROOTINO)
+		prefcg = curcg;
 
 	/*
 	 * Count various limits which used for
 	 * optimal allocation of a directory inode.
 	 */
-	maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
+	maxndir = min(avgndir + (1 << depth), fs->fs_ipg);
 	minifree = avgifree - avgifree / 4;
 	if (minifree < 1)
 		minifree = 1;
@@ -1324,7 +1349,6 @@
 	 * in new cylinder groups so finds every possible block after
 	 * one pass over the filesystem.
 	 */
-	prefcg = ino_to_cg(fs, pip->i_number);
 	for (cg = prefcg; cg < fs->fs_ncg; cg++)
 		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
 		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -12485,17 +12485,6 @@
 	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
 	    ("softdep_update_inodeblock called on non-softdep filesystem"));
 	fs = ump->um_fs;
-	/*
-	 * Preserve the freelink that is on disk. clear_unlinked_inodedep()
-	 * does not have access to the in-core ip so must write directly into
-	 * the inode block buffer when setting freelink.
-	 */
-	if (fs->fs_magic == FS_UFS1_MAGIC)
-		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
-		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
-	else
-		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
-		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
 	/*
 	 * If the effective link count is not equal to the actual link
 	 * count, then we must track the difference in an inodedep while
@@ -12511,6 +12500,21 @@
 		panic("softdep_update_inodeblock: bad link count");
 		return;
 	}
+	/*
+	 * Preserve the freelink that is on disk. clear_unlinked_inodedep()
+	 * does not have access to the in-core ip so must write directly into
+	 * the inode block buffer when setting freelink.
+	 */
+	if ((inodedep->id_state & UNLINKED) != 0) {
+		if (fs->fs_magic == FS_UFS1_MAGIC)
+			DIP_SET(ip, i_freelink,
+			    ((struct ufs1_dinode *)bp->b_data +
+			    ino_to_fsbo(fs, ip->i_number))->di_freelink);
+		else
+			DIP_SET(ip, i_freelink,
+			    ((struct ufs2_dinode *)bp->b_data +
+			    ino_to_fsbo(fs, ip->i_number))->di_freelink);
+	}
 	KASSERT(ip->i_nlink >= inodedep->id_nlinkdelta,
 	    ("softdep_update_inodeblock inconsistent ip %p i_nlink %d "
 	    "inodedep %p id_nlinkdelta %jd",
diff --git a/sys/ufs/ufs/dinode.h b/sys/ufs/ufs/dinode.h
--- a/sys/ufs/ufs/dinode.h
+++ b/sys/ufs/ufs/dinode.h
@@ -156,7 +156,10 @@
 		    [(UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t)];
 	};
 	u_int64_t	di_modrev;	/* 232: i_modrev for NFSv4 */
-	uint32_t	di_freelink;	/* 240: SUJ: Next unlinked inode. */
+	union {
+	uint32_t	di_freelink;	/* 240: SUJ: Next unlinked inode. */
+	uint32_t	di_dirdepth;	/* 240: IFDIR: depth from root dir */
+	};
 	uint32_t	di_ckhash;	/* 244: if CK_INODE, its check-hash */
 	uint32_t	di_spare[2];	/* 248: Reserved; currently unused */
 };
@@ -179,7 +182,10 @@
 struct ufs1_dinode {
 	u_int16_t	di_mode;	/* 0: IFMT, permissions; see below. */
 	int16_t		di_nlink;	/* 2: File link count. */
-	uint32_t	di_freelink;	/* 4: SUJ: Next unlinked inode. */
+	union {
+	uint32_t	di_freelink;	/* 4: SUJ: Next unlinked inode. */
+	uint32_t	di_dirdepth;	/* 4: IFDIR: depth from root dir */
+	};
 	u_int64_t	di_size;	/* 8: File byte count. */
 	int32_t		di_atime;	/* 16: Last access time. */
 	int32_t		di_atimensec;	/* 20: Last access time. */
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -1710,6 +1710,10 @@
 	 * and ".." set to point to the new parent.
 	 */
 	if (doingdirectory && newparent) {
+		/*
+		 * Set the directory depth based on its new parent.
+		 */
+		DIP_SET(fip, i_dirdepth, DIP(tdp, i_dirdepth) + 1);
 		/*
 		 * If tip exists we simply use its link, otherwise we must
 		 * add a new one.
@@ -2121,6 +2125,7 @@
 	ip->i_effnlink = 2;
 	ip->i_nlink = 2;
 	DIP_SET(ip, i_nlink, 2);
+	DIP_SET(ip, i_dirdepth, DIP(dp, i_dirdepth) + 1);
 
 	if (cnp->cn_flags & ISWHITEOUT) {
 		ip->i_flags |= UF_OPAQUE;