diff --git a/lib/libufs/inode.c b/lib/libufs/inode.c index fe34fd45b815..efaaa4e5e3da 100644 --- a/lib/libufs/inode.c +++ b/lib/libufs/inode.c @@ -1,120 +1,132 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002 Juli Mallett. All rights reserved. * * This software was written by Juli Mallett for the * FreeBSD project. Redistribution and use in source and binary forms, with * or without modification, are permitted provided that the following * conditions are met: * * 1. Redistribution of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistribution in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include int getinode(struct uufsd *disk, union dinodep *dp, ino_t inum) { ino_t min, max; caddr_t inoblock; struct fs *fs; + struct timespec now; ERROR(disk, NULL); fs = &disk->d_fs; if (inum >= (ino_t)fs->fs_ipg * fs->fs_ncg) { ERROR(disk, "inode number out of range"); return (-1); } inoblock = disk->d_inoblock; min = disk->d_inomin; max = disk->d_inomax; if (inoblock == NULL) { inoblock = malloc(fs->fs_bsize); if (inoblock == NULL) { ERROR(disk, "unable to allocate inode block"); return (-1); } disk->d_inoblock = inoblock; } + if (clock_gettime(CLOCK_REALTIME_FAST, &now) != 0) { + ERROR(disk, "cannot get current time of day"); + return (-1); + } if (inum >= min && inum < max) goto gotit; bread(disk, fsbtodb(fs, ino_to_fsba(fs, inum)), inoblock, fs->fs_bsize); disk->d_inomin = min = inum - (inum % INOPB(fs)); disk->d_inomax = max = min + INOPB(fs); gotit: switch (disk->d_ufs) { case 1: disk->d_dp.dp1 = &((struct ufs1_dinode *)inoblock)[inum - min]; + if (ffs_oldfscompat_inode_read(fs, disk->d_dp, now.tv_sec)) + putinode(disk); if (dp != NULL) *dp = disk->d_dp; return (0); case 2: disk->d_dp.dp2 = &((struct ufs2_dinode *)inoblock)[inum - min]; if (dp != NULL) *dp = disk->d_dp; - if (ffs_verify_dinode_ckhash(fs, disk->d_dp.dp2) == 0) + if (ffs_verify_dinode_ckhash(fs, disk->d_dp.dp2) == 0) { + if (ffs_oldfscompat_inode_read(fs, disk->d_dp, + now.tv_sec)) + putinode(disk); return (0); + } ERROR(disk, "check-hash failed for inode read from disk"); return (-1); default: break; } ERROR(disk, "unknown UFS filesystem type"); return (-1); } int putinode(struct uufsd *disk) { struct fs *fs; fs = &disk->d_fs; if (disk->d_inoblock == NULL) { ERROR(disk, "No inode block allocated"); return (-1); } if (disk->d_ufs == 2) ffs_update_dinode_ckhash(fs, disk->d_dp.dp2); if (bwrite(disk, fsbtodb(fs, ino_to_fsba(&disk->d_fs, disk->d_inomin)), disk->d_inoblock, disk->d_fs.fs_bsize) <= 0) return (-1); return (0); } diff --git a/lib/libufs/libufs.h b/lib/libufs/libufs.h index 4c6242e9daef..43936c71b8ad 100644 --- a/lib/libufs/libufs.h +++ b/lib/libufs/libufs.h @@ -1,174 +1,168 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2002 Juli Mallett. All rights reserved. * * This software was written by Juli Mallett for the * FreeBSD project. Redistribution and use in source and binary forms, with * or without modification, are permitted provided that the following * conditions are met: * * 1. Redistribution of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistribution in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef __LIBUFS_H__ #define __LIBUFS_H__ - -/* - * libufs structures. - */ -union dinodep { - struct ufs1_dinode *dp1; - struct ufs2_dinode *dp2; -}; +#include /* * userland ufs disk. */ struct uufsd { const char *d_name; /* disk name */ int d_ufs; /* decimal UFS version */ int d_fd; /* raw device file descriptor */ long d_bsize; /* device bsize */ ufs2_daddr_t d_sblock; /* superblock location */ struct fs_summary_info *d_si; /* Superblock summary info */ caddr_t d_inoblock; /* inode block */ uint32_t d_inomin; /* low ino, not ino_t for ABI compat */ uint32_t d_inomax; /* high ino, not ino_t for ABI compat */ union dinodep d_dp; /* pointer to currently active inode */ union { struct fs d_fs; /* filesystem information */ char d_sb[MAXBSIZE]; /* superblock as buffer */ } d_sbunion; union { struct cg d_cg; /* cylinder group */ char d_buf[MAXBSIZE]; /* cylinder group storage */ } d_cgunion; int d_ccg; /* current cylinder group */ int d_lcg; /* last cylinder group (in d_cg) */ const char *d_error; /* human readable disk error */ off_t d_sblockloc; /* where to look for the superblock */ int d_lookupflags; /* flags to superblock lookup */ int d_mine; /* internal flags */ #define d_fs d_sbunion.d_fs #define d_sb d_sbunion.d_sb #define d_cg d_cgunion.d_cg }; /* * libufs macros (internal, non-exported). */ #ifdef _LIBUFS /* * Trace steps through libufs, to be used at entry and erroneous return. */ static inline void ERROR(struct uufsd *u, const char *str) { #ifdef _LIBUFS_DEBUGGING if (str != NULL) { fprintf(stderr, "libufs: %s", str); if (errno != 0) fprintf(stderr, ": %s", strerror(errno)); fprintf(stderr, "\n"); } #endif if (u != NULL) u->d_error = str; } #endif /* _LIBUFS */ __BEGIN_DECLS /* * libufs prototypes. */ /* * ffs_subr.c */ void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int); void ffs_fragacct(struct fs *, int, int32_t [], int); int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t); +bool ffs_oldfscompat_inode_read(struct fs *, union dinodep, time_t); int ffs_sbsearch(void *, struct fs **, int, char *, int (*)(void *, off_t, void **, int)); void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t); int ffs_sbget(void *, struct fs **, off_t, int, char *, int (*)(void *, off_t, void **, int)); int ffs_sbput(void *, struct fs *, off_t, int (*)(void *, off_t, void *, int)); void ffs_update_dinode_ckhash(struct fs *, struct ufs2_dinode *); int ffs_verify_dinode_ckhash(struct fs *, struct ufs2_dinode *); /* * block.c */ ssize_t bread(struct uufsd *, ufs2_daddr_t, void *, size_t); ssize_t bwrite(struct uufsd *, ufs2_daddr_t, const void *, size_t); int berase(struct uufsd *, ufs2_daddr_t, ufs2_daddr_t); /* * cgroup.c */ ufs2_daddr_t cgballoc(struct uufsd *); int cgbfree(struct uufsd *, ufs2_daddr_t, long); ino_t cgialloc(struct uufsd *); int cgget(int, struct fs *, int, struct cg *); int cgput(int, struct fs *, struct cg *); int cgread(struct uufsd *); int cgread1(struct uufsd *, int); int cgwrite(struct uufsd *); int cgwrite1(struct uufsd *, int); /* * inode.c */ int getinode(struct uufsd *, union dinodep *, ino_t); int putinode(struct uufsd *); /* * sblock.c */ int sbread(struct uufsd *); int sbfind(struct uufsd *, int); int sbwrite(struct uufsd *, int); /* low level superblock read/write functions */ int sbget(int, struct fs **, off_t, int); int sbsearch(int, struct fs **, int); int sbput(int, struct fs *, int); /* * type.c */ int ufs_disk_close(struct uufsd *); int ufs_disk_fillout(struct uufsd *, const char *); int ufs_disk_fillout_blank(struct uufsd *, const char *); int ufs_disk_write(struct uufsd *); /* * crc32c.c */ uint32_t calculate_crc32c(uint32_t, const void *, size_t); __END_DECLS #endif /* __LIBUFS_H__ */ diff --git a/sbin/dump/traverse.c b/sbin/dump/traverse.c index 281cffcdf6f2..e18c2ac86bf7 100644 --- a/sbin/dump/traverse.c +++ b/sbin/dump/traverse.c @@ -1,1008 +1,1004 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint #if 0 static char sccsid[] = "@(#)traverse.c 8.7 (Berkeley) 6/15/95"; #endif #endif /* not lint */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "dump.h" -union dinode { - struct ufs1_dinode dp1; - struct ufs2_dinode dp2; -}; #define DIP(dp, field) \ ((sblock->fs_magic == FS_UFS1_MAGIC) ? \ (dp)->dp1.field : (dp)->dp2.field) #define DIP_SET(dp, field, val) do {\ if (sblock->fs_magic == FS_UFS1_MAGIC) \ (dp)->dp1.field = (val); \ else \ (dp)->dp2.field = (val); \ } while (0) #define HASDUMPEDFILE 0x1 #define HASSUBDIRS 0x2 static int dirindir(ino_t ino, ufs2_daddr_t blkno, int level, long *size, long *tapesize, int nodump, ino_t maxino); static void dmpindir(union dinode *dp, ino_t ino, ufs2_daddr_t blk, int level, off_t *size); static void ufs1_blksout(ufs1_daddr_t *blkp, int frags, ino_t ino); static void ufs2_blksout(union dinode *dp, ufs2_daddr_t *blkp, int frags, ino_t ino, int last); static int appendextdata(union dinode *dp); static void writeextdata(union dinode *dp, ino_t ino, int added); static int searchdir(ino_t ino, ufs2_daddr_t blkno, long size, long filesize, long *tapesize, int nodump, ino_t maxino); static long blockest(union dinode *dp); /* * This is an estimation of the number of TP_BSIZE blocks in the file. * It estimates the number of blocks in files with holes by assuming * that all of the blocks accounted for by di_blocks are data blocks * (when some of the blocks are usually used for indirect pointers); * hence the estimate may be high. */ static long blockest(union dinode *dp) { long blkest, sizeest; /* * dp->di_size is the size of the file in bytes. * dp->di_blocks stores the number of sectors actually in the file. * If there are more sectors than the size would indicate, this just * means that there are indirect blocks in the file or unused * sectors in the last file block; we can safely ignore these * (blkest = sizeest below). * If the file is bigger than the number of sectors would indicate, * then the file has holes in it. In this case we must use the * block count to estimate the number of data blocks used, but * we use the actual size for estimating the number of indirect * dump blocks (sizeest vs. blkest in the indirect block * calculation). */ if ((DIP(dp, di_flags) & SF_SNAPSHOT) != 0) return (1); blkest = howmany(dbtob(DIP(dp, di_blocks)), TP_BSIZE); sizeest = howmany(DIP(dp, di_size), TP_BSIZE); if (blkest > sizeest) blkest = sizeest; if (DIP(dp, di_size) > sblock->fs_bsize * UFS_NDADDR) { /* calculate the number of indirect blocks on the dump tape */ blkest += howmany(sizeest - UFS_NDADDR * sblock->fs_bsize / TP_BSIZE, TP_NINDIR); } return (blkest + 1); } /* Auxiliary macro to pick up files changed since previous dump. */ #define CHANGEDSINCE(dp, t) \ (DIP(dp, di_mtime) >= (t) || DIP(dp, di_ctime) >= (t)) /* The WANTTODUMP macro decides whether a file should be dumped. */ #ifdef UF_NODUMP #define WANTTODUMP(dp) \ (CHANGEDSINCE(dp, spcl.c_ddate) && \ (nonodump || (DIP(dp, di_flags) & UF_NODUMP) != UF_NODUMP)) #else #define WANTTODUMP(dp) CHANGEDSINCE(dp, spcl.c_ddate) #endif /* * Dump pass 1. * * Walk the inode list for a file system to find all allocated inodes * that have been modified since the previous dump time. Also, find all * the directories in the file system. */ int mapfiles(ino_t maxino, long *tapesize) { int i, cg, mode, inosused; int anydirskipped = 0; union dinode *dp; struct cg *cgp; ino_t ino; u_char *cp; if ((cgp = malloc(sblock->fs_cgsize)) == NULL) quit("mapfiles: cannot allocate memory.\n"); for (cg = 0; cg < sblock->fs_ncg; cg++) { ino = cg * sblock->fs_ipg; blkread(fsbtodb(sblock, cgtod(sblock, cg)), (char *)cgp, sblock->fs_cgsize); if (sblock->fs_magic == FS_UFS2_MAGIC) inosused = cgp->cg_initediblk; else inosused = sblock->fs_ipg; /* * If we are using soft updates, then we can trust the * cylinder group inode allocation maps to tell us which * inodes are allocated. We will scan the used inode map * to find the inodes that are really in use, and then * read only those inodes in from disk. */ if (sblock->fs_flags & FS_DOSOFTDEP) { if (!cg_chkmagic(cgp)) quit("mapfiles: cg %d: bad magic number\n", cg); cp = &cg_inosused(cgp)[(inosused - 1) / CHAR_BIT]; for ( ; inosused > 0; inosused -= CHAR_BIT, cp--) { if (*cp == 0) continue; for (i = 1 << (CHAR_BIT - 1); i > 0; i >>= 1) { if (*cp & i) break; inosused--; } break; } if (inosused <= 0) continue; } for (i = 0; i < inosused; i++, ino++) { if (ino < UFS_ROOTINO || (dp = getino(ino, &mode)) == NULL || (mode & IFMT) == 0) continue; if (ino >= maxino) { msg("Skipping inode %ju >= maxino %ju\n", (uintmax_t)ino, (uintmax_t)maxino); continue; } /* * Everything must go in usedinomap so that a check * for "in dumpdirmap but not in usedinomap" to detect * dirs with nodump set has a chance of succeeding * (this is used in mapdirs()). */ SETINO(ino, usedinomap); if (mode == IFDIR) SETINO(ino, dumpdirmap); if (WANTTODUMP(dp)) { SETINO(ino, dumpinomap); if (mode != IFREG && mode != IFDIR && mode != IFLNK) *tapesize += 1; else *tapesize += blockest(dp); continue; } if (mode == IFDIR) { if (!nonodump && (DIP(dp, di_flags) & UF_NODUMP)) CLRINO(ino, usedinomap); anydirskipped = 1; } } } /* * Restore gets very upset if the root is not dumped, * so ensure that it always is dumped. */ SETINO(UFS_ROOTINO, dumpinomap); return (anydirskipped); } /* * Dump pass 2. * * Scan each directory on the file system to see if it has any modified * files in it. If it does, and has not already been added to the dump * list (because it was itself modified), then add it. If a directory * has not been modified itself, contains no modified files and has no * subdirectories, then it can be deleted from the dump list and from * the list of directories. By deleting it from the list of directories, * its parent may now qualify for the same treatment on this or a later * pass using this algorithm. */ int mapdirs(ino_t maxino, long *tapesize) { union dinode *dp; int i, isdir, nodump; char *map; ino_t ino; union dinode di; long filesize; int ret, change = 0; isdir = 0; /* XXX just to get gcc to shut up */ for (map = dumpdirmap, ino = 1; ino < maxino; ino++) { if (((ino - 1) % CHAR_BIT) == 0) /* map is offset by 1 */ isdir = *map++; else isdir >>= 1; /* * If a directory has been removed from usedinomap, it * either has the nodump flag set, or has inherited * it. Although a directory can't be in dumpinomap if * it isn't in usedinomap, we have to go through it to * propagate the nodump flag. */ nodump = !nonodump && (TSTINO(ino, usedinomap) == 0); if ((isdir & 1) == 0 || (TSTINO(ino, dumpinomap) && !nodump)) continue; dp = getino(ino, &i); /* * inode buf may change in searchdir(). */ if (sblock->fs_magic == FS_UFS1_MAGIC) di.dp1 = dp->dp1; else di.dp2 = dp->dp2; filesize = DIP(&di, di_size); for (ret = 0, i = 0; filesize > 0 && i < UFS_NDADDR; i++) { if (DIP(&di, di_db[i]) != 0) ret |= searchdir(ino, DIP(&di, di_db[i]), (long)sblksize(sblock, DIP(&di, di_size), i), filesize, tapesize, nodump, maxino); if (ret & HASDUMPEDFILE) filesize = 0; else filesize -= sblock->fs_bsize; } for (i = 0; filesize > 0 && i < UFS_NIADDR; i++) { if (DIP(&di, di_ib[i]) == 0) continue; ret |= dirindir(ino, DIP(&di, di_ib[i]), i, &filesize, tapesize, nodump, maxino); } if (ret & HASDUMPEDFILE) { SETINO(ino, dumpinomap); *tapesize += blockest(&di); change = 1; continue; } if (nodump) { if (ret & HASSUBDIRS) change = 1; /* subdirs inherit nodump */ CLRINO(ino, dumpdirmap); } else if ((ret & HASSUBDIRS) == 0) if (!TSTINO(ino, dumpinomap)) { CLRINO(ino, dumpdirmap); change = 1; } } return (change); } /* * Read indirect blocks, and pass the data blocks to be searched * as directories. Quit as soon as any entry is found that will * require the directory to be dumped. */ static int dirindir( ino_t ino, ufs2_daddr_t blkno, int ind_level, long *filesize, long *tapesize, int nodump, ino_t maxino) { union { ufs1_daddr_t ufs1[MAXBSIZE / sizeof(ufs1_daddr_t)]; ufs2_daddr_t ufs2[MAXBSIZE / sizeof(ufs2_daddr_t)]; } idblk; int ret = 0; int i; blkread(fsbtodb(sblock, blkno), (char *)&idblk, (int)sblock->fs_bsize); if (ind_level <= 0) { for (i = 0; *filesize > 0 && i < NINDIR(sblock); i++) { if (sblock->fs_magic == FS_UFS1_MAGIC) blkno = idblk.ufs1[i]; else blkno = idblk.ufs2[i]; if (blkno != 0) ret |= searchdir(ino, blkno, sblock->fs_bsize, *filesize, tapesize, nodump, maxino); if (ret & HASDUMPEDFILE) *filesize = 0; else *filesize -= sblock->fs_bsize; } return (ret); } ind_level--; for (i = 0; *filesize > 0 && i < NINDIR(sblock); i++) { if (sblock->fs_magic == FS_UFS1_MAGIC) blkno = idblk.ufs1[i]; else blkno = idblk.ufs2[i]; if (blkno != 0) ret |= dirindir(ino, blkno, ind_level, filesize, tapesize, nodump, maxino); } return (ret); } /* * Scan a disk block containing directory information looking to see if * any of the entries are on the dump list and to see if the directory * contains any subdirectories. */ static int searchdir( ino_t ino, ufs2_daddr_t blkno, long size, long filesize, long *tapesize, int nodump, ino_t maxino) { int mode; struct direct *dp; union dinode *ip; long loc, ret = 0; static caddr_t dblk; if (dblk == NULL && (dblk = malloc(sblock->fs_bsize)) == NULL) quit("searchdir: cannot allocate indirect memory.\n"); blkread(fsbtodb(sblock, blkno), dblk, (int)size); if (filesize < size) size = filesize; for (loc = 0; loc < size; ) { dp = (struct direct *)(dblk + loc); if (dp->d_reclen == 0) { msg("corrupted directory, inumber %ju\n", (uintmax_t)ino); break; } loc += dp->d_reclen; if (dp->d_ino == 0) continue; if (dp->d_ino >= maxino) { msg("corrupted directory entry, d_ino %ju >= %ju\n", (uintmax_t)dp->d_ino, (uintmax_t)maxino); break; } if (dp->d_name[0] == '.') { if (dp->d_name[1] == '\0') continue; if (dp->d_name[1] == '.' && dp->d_name[2] == '\0') continue; } if (nodump) { ip = getino(dp->d_ino, &mode); if (TSTINO(dp->d_ino, dumpinomap)) { CLRINO(dp->d_ino, dumpinomap); *tapesize -= blockest(ip); } /* * Add back to dumpdirmap and remove from usedinomap * to propagate nodump. */ if (mode == IFDIR) { SETINO(dp->d_ino, dumpdirmap); CLRINO(dp->d_ino, usedinomap); ret |= HASSUBDIRS; } } else { if (TSTINO(dp->d_ino, dumpinomap)) { ret |= HASDUMPEDFILE; if (ret & HASSUBDIRS) break; } if (TSTINO(dp->d_ino, dumpdirmap)) { ret |= HASSUBDIRS; if (ret & HASDUMPEDFILE) break; } } } return (ret); } /* * Dump passes 3 and 4. * * Dump the contents of an inode to tape. */ void dumpino(union dinode *dp, ino_t ino) { int ind_level, cnt, last, added; off_t size; char buf[TP_BSIZE]; if (newtape) { newtape = 0; dumpmap(dumpinomap, TS_BITS, ino); } CLRINO(ino, dumpinomap); /* * Zero out the size of a snapshot so that it will be dumped * as a zero length file. */ if ((DIP(dp, di_flags) & SF_SNAPSHOT) != 0) { DIP_SET(dp, di_size, 0); DIP_SET(dp, di_flags, DIP(dp, di_flags) & ~SF_SNAPSHOT); } if (sblock->fs_magic == FS_UFS1_MAGIC) { spcl.c_mode = dp->dp1.di_mode; spcl.c_size = dp->dp1.di_size; spcl.c_extsize = 0; spcl.c_atime = _time32_to_time(dp->dp1.di_atime); spcl.c_atimensec = dp->dp1.di_atimensec; spcl.c_mtime = _time32_to_time(dp->dp1.di_mtime); spcl.c_mtimensec = dp->dp1.di_mtimensec; spcl.c_birthtime = 0; spcl.c_birthtimensec = 0; spcl.c_rdev = dp->dp1.di_rdev; spcl.c_file_flags = dp->dp1.di_flags; spcl.c_uid = dp->dp1.di_uid; spcl.c_gid = dp->dp1.di_gid; } else { spcl.c_mode = dp->dp2.di_mode; spcl.c_size = dp->dp2.di_size; spcl.c_extsize = dp->dp2.di_extsize; spcl.c_atime = _time64_to_time(dp->dp2.di_atime); spcl.c_atimensec = dp->dp2.di_atimensec; spcl.c_mtime = _time64_to_time(dp->dp2.di_mtime); spcl.c_mtimensec = dp->dp2.di_mtimensec; spcl.c_birthtime = _time64_to_time(dp->dp2.di_birthtime); spcl.c_birthtimensec = dp->dp2.di_birthnsec; spcl.c_rdev = dp->dp2.di_rdev; spcl.c_file_flags = dp->dp2.di_flags; spcl.c_uid = dp->dp2.di_uid; spcl.c_gid = dp->dp2.di_gid; } spcl.c_type = TS_INODE; spcl.c_count = 0; switch (DIP(dp, di_mode) & S_IFMT) { case 0: /* * Freed inode. */ return; case S_IFLNK: /* * Check for short symbolic link. */ if (DIP(dp, di_size) > 0 && DIP(dp, di_size) < sblock->fs_maxsymlinklen) { spcl.c_addr[0] = 1; spcl.c_count = 1; added = appendextdata(dp); writeheader(ino); memmove(buf, DIP(dp, di_shortlink), (u_long)DIP(dp, di_size)); buf[DIP(dp, di_size)] = '\0'; writerec(buf, 0); writeextdata(dp, ino, added); return; } /* FALLTHROUGH */ case S_IFDIR: case S_IFREG: if (DIP(dp, di_size) > 0) break; /* FALLTHROUGH */ case S_IFIFO: case S_IFSOCK: case S_IFCHR: case S_IFBLK: added = appendextdata(dp); writeheader(ino); writeextdata(dp, ino, added); return; default: msg("Warning: undefined file type 0%o\n", DIP(dp, di_mode) & IFMT); return; } if (DIP(dp, di_size) > UFS_NDADDR * sblock->fs_bsize) { cnt = UFS_NDADDR * sblock->fs_frag; last = 0; } else { cnt = howmany(DIP(dp, di_size), sblock->fs_fsize); last = 1; } if (sblock->fs_magic == FS_UFS1_MAGIC) ufs1_blksout(&dp->dp1.di_db[0], cnt, ino); else ufs2_blksout(dp, &dp->dp2.di_db[0], cnt, ino, last); if ((size = DIP(dp, di_size) - UFS_NDADDR * sblock->fs_bsize) <= 0) return; for (ind_level = 0; ind_level < UFS_NIADDR; ind_level++) { dmpindir(dp, ino, DIP(dp, di_ib[ind_level]), ind_level, &size); if (size <= 0) return; } } /* * Read indirect blocks, and pass the data blocks to be dumped. */ static void dmpindir(union dinode *dp, ino_t ino, ufs2_daddr_t blk, int ind_level, off_t *size) { union { ufs1_daddr_t ufs1[MAXBSIZE / sizeof(ufs1_daddr_t)]; ufs2_daddr_t ufs2[MAXBSIZE / sizeof(ufs2_daddr_t)]; } idblk; int i, cnt, last; if (blk != 0) blkread(fsbtodb(sblock, blk), (char *)&idblk, (int)sblock->fs_bsize); else memset(&idblk, 0, sblock->fs_bsize); if (ind_level <= 0) { if (*size > NINDIR(sblock) * sblock->fs_bsize) { cnt = NINDIR(sblock) * sblock->fs_frag; last = 0; } else { cnt = howmany(*size, sblock->fs_fsize); last = 1; } *size -= NINDIR(sblock) * sblock->fs_bsize; if (sblock->fs_magic == FS_UFS1_MAGIC) ufs1_blksout(idblk.ufs1, cnt, ino); else ufs2_blksout(dp, idblk.ufs2, cnt, ino, last); return; } ind_level--; for (i = 0; i < NINDIR(sblock); i++) { if (sblock->fs_magic == FS_UFS1_MAGIC) dmpindir(dp, ino, idblk.ufs1[i], ind_level, size); else dmpindir(dp, ino, idblk.ufs2[i], ind_level, size); if (*size <= 0) return; } } /* * Collect up the data into tape record sized buffers and output them. */ static void ufs1_blksout(ufs1_daddr_t *blkp, int frags, ino_t ino) { ufs1_daddr_t *bp; int i, j, count, blks, tbperdb; blks = howmany(frags * sblock->fs_fsize, TP_BSIZE); tbperdb = sblock->fs_bsize >> tp_bshift; for (i = 0; i < blks; i += TP_NINDIR) { if (i + TP_NINDIR > blks) count = blks; else count = i + TP_NINDIR; assert(count <= TP_NINDIR + i); for (j = i; j < count; j++) if (blkp[j / tbperdb] != 0) spcl.c_addr[j - i] = 1; else spcl.c_addr[j - i] = 0; spcl.c_count = count - i; writeheader(ino); bp = &blkp[i / tbperdb]; for (j = i; j < count; j += tbperdb, bp++) if (*bp != 0) { if (j + tbperdb <= count) dumpblock(*bp, (int)sblock->fs_bsize); else dumpblock(*bp, (count - j) * TP_BSIZE); } spcl.c_type = TS_ADDR; } } /* * Collect up the data into tape record sized buffers and output them. */ static void ufs2_blksout(union dinode *dp, ufs2_daddr_t *blkp, int frags, ino_t ino, int last) { ufs2_daddr_t *bp; int i, j, count, resid, blks, tbperdb, added; static int writingextdata = 0; /* * Calculate the number of TP_BSIZE blocks to be dumped. * For filesystems with a fragment size bigger than TP_BSIZE, * only part of the final fragment may need to be dumped. */ blks = howmany(frags * sblock->fs_fsize, TP_BSIZE); if (last) { if (writingextdata) resid = howmany(fragoff(sblock, spcl.c_extsize), TP_BSIZE); else resid = howmany(fragoff(sblock, dp->dp2.di_size), TP_BSIZE); if (resid > 0) blks -= howmany(sblock->fs_fsize, TP_BSIZE) - resid; } tbperdb = sblock->fs_bsize >> tp_bshift; for (i = 0; i < blks; i += TP_NINDIR) { if (i + TP_NINDIR > blks) count = blks; else count = i + TP_NINDIR; assert(count <= TP_NINDIR + i); for (j = i; j < count; j++) if (blkp[j / tbperdb] != 0) spcl.c_addr[j - i] = 1; else spcl.c_addr[j - i] = 0; spcl.c_count = count - i; if (last && count == blks && !writingextdata) added = appendextdata(dp); writeheader(ino); bp = &blkp[i / tbperdb]; for (j = i; j < count; j += tbperdb, bp++) if (*bp != 0) { if (j + tbperdb <= count) dumpblock(*bp, (int)sblock->fs_bsize); else dumpblock(*bp, (count - j) * TP_BSIZE); } spcl.c_type = TS_ADDR; spcl.c_count = 0; if (last && count == blks && !writingextdata) { writingextdata = 1; writeextdata(dp, ino, added); writingextdata = 0; } } } /* * If there is room in the current block for the extended attributes * as well as the file data, update the header to reflect the added * attribute data at the end. Attributes are placed at the end so that * old versions of restore will correctly restore the file and simply * discard the extra data at the end that it does not understand. * The attribute data is dumped following the file data by the * writeextdata() function (below). */ static int appendextdata(union dinode *dp) { int i, blks, tbperdb; /* * If no extended attributes, there is nothing to do. */ if (spcl.c_extsize == 0) return (0); /* * If there is not enough room at the end of this block * to add the extended attributes, then rather than putting * part of them here, we simply push them entirely into a * new block rather than putting some here and some later. */ if (spcl.c_extsize > UFS_NXADDR * sblock->fs_bsize) blks = howmany(UFS_NXADDR * sblock->fs_bsize, TP_BSIZE); else blks = howmany(spcl.c_extsize, TP_BSIZE); if (spcl.c_count + blks > TP_NINDIR) return (0); /* * Update the block map in the header to indicate the added * extended attribute. They will be appended after the file * data by the writeextdata() routine. */ tbperdb = sblock->fs_bsize >> tp_bshift; assert(spcl.c_count + blks <= TP_NINDIR); for (i = 0; i < blks; i++) if (&dp->dp2.di_extb[i / tbperdb] != 0) spcl.c_addr[spcl.c_count + i] = 1; else spcl.c_addr[spcl.c_count + i] = 0; spcl.c_count += blks; return (blks); } /* * Dump the extended attribute data. If there was room in the file * header, then all we need to do is output the data blocks. If there * was not room in the file header, then an additional TS_ADDR header * is created to hold the attribute data. */ static void writeextdata(union dinode *dp, ino_t ino, int added) { int i, frags, blks, tbperdb, last; ufs2_daddr_t *bp; off_t size; /* * If no extended attributes, there is nothing to do. */ if (spcl.c_extsize == 0) return; /* * If there was no room in the file block for the attributes, * dump them out in a new block, otherwise just dump the data. */ if (added == 0) { if (spcl.c_extsize > UFS_NXADDR * sblock->fs_bsize) { frags = UFS_NXADDR * sblock->fs_frag; last = 0; } else { frags = howmany(spcl.c_extsize, sblock->fs_fsize); last = 1; } ufs2_blksout(dp, &dp->dp2.di_extb[0], frags, ino, last); } else { if (spcl.c_extsize > UFS_NXADDR * sblock->fs_bsize) blks = howmany(UFS_NXADDR * sblock->fs_bsize, TP_BSIZE); else blks = howmany(spcl.c_extsize, TP_BSIZE); tbperdb = sblock->fs_bsize >> tp_bshift; for (i = 0; i < blks; i += tbperdb) { bp = &dp->dp2.di_extb[i / tbperdb]; if (*bp != 0) { if (i + tbperdb <= blks) dumpblock(*bp, (int)sblock->fs_bsize); else dumpblock(*bp, (blks - i) * TP_BSIZE); } } } /* * If an indirect block is added for extended attributes, then * di_exti below should be changed to the structure element * that references the extended attribute indirect block. This * definition is here only to make it compile without complaint. */ #define di_exti di_spare[0] /* * If the extended attributes fall into an indirect block, * dump it as well. */ if ((size = spcl.c_extsize - UFS_NXADDR * sblock->fs_bsize) > 0) dmpindir(dp, ino, dp->dp2.di_exti, 0, &size); } /* * Dump a map to the tape. */ void dumpmap(char *map, int type, ino_t ino) { int i; char *cp; spcl.c_type = type; spcl.c_count = howmany(mapsize * sizeof(char), TP_BSIZE); writeheader(ino); for (i = 0, cp = map; i < spcl.c_count; i++, cp += TP_BSIZE) writerec(cp, 0); } /* * Write a header record to the dump tape. */ void writeheader(ino_t ino) { int32_t sum, cnt, *lp; if (rsync_friendly >= 2) { /* don't track changes to access time */ spcl.c_atime = spcl.c_mtime; spcl.c_atimensec = spcl.c_mtimensec; } spcl.c_inumber = ino; spcl.c_magic = FS_UFS2_MAGIC; spcl.c_checksum = 0; lp = (int32_t *)&spcl; sum = 0; cnt = sizeof(union u_spcl) / (4 * sizeof(int32_t)); while (--cnt >= 0) { sum += *lp++; sum += *lp++; sum += *lp++; sum += *lp++; } spcl.c_checksum = CHECKSUM - sum; writerec((char *)&spcl, 1); } union dinode * getino(ino_t inum, int *modep) { static ino_t minino, maxino; static caddr_t inoblock; struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; if (inoblock == NULL && (inoblock = malloc(sblock->fs_bsize)) == NULL) quit("cannot allocate inode memory.\n"); curino = inum; if (inum >= minino && inum < maxino) goto gotit; blkread(fsbtodb(sblock, ino_to_fsba(sblock, inum)), inoblock, (int)sblock->fs_bsize); minino = inum - (inum % INOPB(sblock)); maxino = minino + INOPB(sblock); gotit: if (sblock->fs_magic == FS_UFS1_MAGIC) { dp1 = &((struct ufs1_dinode *)inoblock)[inum - minino]; *modep = (dp1->di_mode & IFMT); return ((union dinode *)dp1); } dp2 = &((struct ufs2_dinode *)inoblock)[inum - minino]; *modep = (dp2->di_mode & IFMT); return ((union dinode *)dp2); } /* * Read a chunk of data from the disk. * Try to recover from hard errors by reading in sector sized pieces. * Error recovery is attempted at most BREADEMAX times before seeking * consent from the operator to continue. */ int breaderrors = 0; #define BREADEMAX 32 void blkread(ufs2_daddr_t blkno, char *buf, int size) { int secsize, bytes, resid, xfer, base, cnt, i; static char *tmpbuf; off_t offset; loop: offset = blkno << dev_bshift; secsize = sblock->fs_fsize; base = offset % secsize; resid = size % secsize; /* * If the transfer request starts or ends on a non-sector * boundary, we must read the entire sector and copy out * just the part that we need. */ if (base == 0 && resid == 0) { cnt = cread(diskfd, buf, size, offset); if (cnt == size) return; } else { if (tmpbuf == NULL && (tmpbuf = malloc(secsize)) == NULL) quit("buffer malloc failed\n"); xfer = 0; bytes = size; if (base != 0) { cnt = cread(diskfd, tmpbuf, secsize, offset - base); if (cnt != secsize) goto bad; xfer = MIN(secsize - base, size); offset += xfer; bytes -= xfer; resid = bytes % secsize; memcpy(buf, &tmpbuf[base], xfer); } if (bytes >= secsize) { cnt = cread(diskfd, &buf[xfer], bytes - resid, offset); if (cnt != bytes - resid) goto bad; xfer += cnt; offset += cnt; } if (resid == 0) return; cnt = cread(diskfd, tmpbuf, secsize, offset); if (cnt == secsize) { memcpy(&buf[xfer], tmpbuf, resid); return; } } bad: if (blkno + (size / dev_bsize) > fsbtodb(sblock, sblock->fs_size)) { /* * Trying to read the final fragment. * * NB - dump only works in TP_BSIZE blocks, hence * rounds `dev_bsize' fragments up to TP_BSIZE pieces. * It should be smarter about not actually trying to * read more than it can get, but for the time being * we punt and scale back the read only when it gets * us into trouble. (mkm 9/25/83) */ size -= dev_bsize; goto loop; } if (cnt == -1) msg("read error from %s: %s: [block %jd]: count=%d\n", disk, strerror(errno), (intmax_t)blkno, size); else msg("short read error from %s: [block %jd]: count=%d, got=%d\n", disk, (intmax_t)blkno, size, cnt); if (++breaderrors > BREADEMAX) { msg("More than %d block read errors from %s\n", BREADEMAX, disk); broadcast("DUMP IS AILING!\n"); msg("This is an unrecoverable error.\n"); if (!query("Do you want to attempt to continue?")){ dumpabort(0); /*NOTREACHED*/ } else breaderrors = 0; } /* * Zero buffer, then try to read each sector of buffer separately, * and bypass the cache. */ memset(buf, 0, size); for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) { if ((cnt = pread(diskfd, buf, (int)dev_bsize, ((off_t)blkno << dev_bshift))) == dev_bsize) continue; if (cnt == -1) { msg("read error from %s: %s: [sector %jd]: count=%ld\n", disk, strerror(errno), (intmax_t)blkno, dev_bsize); continue; } msg("short read from %s: [sector %jd]: count=%ld, got=%d\n", disk, (intmax_t)blkno, dev_bsize, cnt); } } diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h index 3b795783f39c..2e15dad00d12 100644 --- a/sbin/fsck_ffs/fsck.h +++ b/sbin/fsck_ffs/fsck.h @@ -1,542 +1,538 @@ /*- * SPDX-License-Identifier: BSD-3-Clause and BSD-2-Clause * * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fsck.h 8.4 (Berkeley) 5/9/95 */ #ifndef _FSCK_H_ #define _FSCK_H_ #include #include #include #include #define MAXDUP 10 /* limit on dup blks (per inode) */ #define MAXBAD 10 /* limit on bad blks (per inode) */ #define MINBUFS 100 /* minimum number of buffers required */ #define INOBUFSIZE 64*1024 /* size of buffer to read inodes in pass1 */ #define ZEROBUFSIZE (dev_bsize * 128) /* size of zero buffer used by -Z */ -union dinode { - struct ufs1_dinode dp1; - struct ufs2_dinode dp2; -}; #define DIP(dp, field) \ ((sblock.fs_magic == FS_UFS1_MAGIC) ? \ (dp)->dp1.field : (dp)->dp2.field) #define DIP_SET(dp, field, val) do { \ if (sblock.fs_magic == FS_UFS1_MAGIC) \ (dp)->dp1.field = (val); \ else \ (dp)->dp2.field = (val); \ } while (0) /* * Each inode on the file system is described by the following structure. * The linkcnt is initially set to the value in the inode. Each time it * is found during the descent in passes 2, 3, and 4 the count is * decremented. Any inodes whose count is non-zero after pass 4 needs to * have its link count adjusted by the value remaining in ino_linkcnt. */ struct inostat { u_char ino_state; /* state of inode, see below */ u_char ino_type:4; /* type of inode */ u_char ino_idtype:4; /* idesc id_type, SNAP or ADDR */ u_short ino_linkcnt; /* number of links not found */ }; /* * Inode states. */ #define USTATE 0x1 /* inode not allocated */ #define FSTATE 0x2 /* inode is file */ #define FZLINK 0x3 /* inode is file with a link count of zero */ #define DSTATE 0x4 /* inode is directory */ #define DZLINK 0x5 /* inode is directory with a zero link count */ #define DFOUND 0x6 /* directory found during descent */ /* 0x7 UNUSED - see S_IS_DVALID() definition */ #define DCLEAR 0x8 /* directory is to be cleared */ #define FCLEAR 0x9 /* file is to be cleared */ /* DUNFOUND === (state == DSTATE || state == DZLINK) */ #define S_IS_DUNFOUND(state) (((state) & ~0x1) == DSTATE) /* DVALID === (state == DSTATE || state == DZLINK || state == DFOUND) */ #define S_IS_DVALID(state) (((state) & ~0x3) == DSTATE) #define INO_IS_DUNFOUND(ino) S_IS_DUNFOUND(inoinfo(ino)->ino_state) #define INO_IS_DVALID(ino) S_IS_DVALID(inoinfo(ino)->ino_state) /* * Inode state information is contained on per cylinder group lists * which are described by the following structure. */ extern struct inostatlist { long il_numalloced; /* number of inodes allocated in this cg */ struct inostat *il_stat;/* inostat info for this cylinder group */ } *inostathead; /* * Structure to reference a dinode. */ struct inode { struct bufarea *i_bp; /* buffer containing the dinode */ union dinode *i_dp; /* pointer to dinode in buffer */ ino_t i_number; /* inode number */ }; /* * Size of hash tables */ #define HASHSIZE 2048 #define HASH(x) ((x * 2654435761) & (HASHSIZE - 1)) /* * buffer cache structure. */ struct bufarea { TAILQ_ENTRY(bufarea) b_list; /* LRU buffer queue */ LIST_ENTRY(bufarea) b_hash; /* hash list */ ufs2_daddr_t b_bno; /* disk block number */ int b_size; /* size of I/O */ int b_errs; /* I/O error */ int b_flags; /* B_ flags below */ int b_type; /* BT_ type below */ int b_refcnt; /* ref count of users */ int b_index; /* for BT_LEVEL, ptr index */ /* for BT_INODES, first inum */ union { char *b_buf; /* buffer space */ ufs1_daddr_t *b_indir1; /* UFS1 indirect block */ ufs2_daddr_t *b_indir2; /* UFS2 indirect block */ struct fs *b_fs; /* super block */ struct cg *b_cg; /* cylinder group */ struct ufs1_dinode *b_dinode1; /* UFS1 inode block */ struct ufs2_dinode *b_dinode2; /* UFS2 inode block */ } b_un; }; #define IBLK(bp, i) \ ((sblock.fs_magic == FS_UFS1_MAGIC) ? \ (bp)->b_un.b_indir1[i] : (bp)->b_un.b_indir2[i]) #define IBLK_SET(bp, i, val) do { \ if (sblock.fs_magic == FS_UFS1_MAGIC) \ (bp)->b_un.b_indir1[i] = (val); \ else \ (bp)->b_un.b_indir2[i] = (val); \ } while (0) /* * Buffer flags */ #define B_DIRTY 0x00000001 /* Buffer is dirty */ /* * Type of data in buffer */ #define BT_UNKNOWN 0 /* Buffer type is unknown */ #define BT_SUPERBLK 1 /* Buffer holds a superblock */ #define BT_CYLGRP 2 /* Buffer holds a cylinder group map */ #define BT_LEVEL1 3 /* Buffer holds single level indirect */ #define BT_LEVEL2 4 /* Buffer holds double level indirect */ #define BT_LEVEL3 5 /* Buffer holds triple level indirect */ #define BT_EXTATTR 6 /* Buffer holds external attribute data */ #define BT_INODES 7 /* Buffer holds inodes */ #define BT_DIRDATA 8 /* Buffer holds directory data */ #define BT_DATA 9 /* Buffer holds user data */ #define BT_NUMBUFTYPES 10 #define BT_NAMES { \ "unknown", \ "Superblock", \ "Cylinder Group", \ "Single Level Indirect", \ "Double Level Indirect", \ "Triple Level Indirect", \ "External Attribute", \ "Inode Block", \ "Directory Contents", \ "User Data" } extern char *buftype[]; #define BT_BUFTYPE(type) \ type < BT_NUMBUFTYPES ? buftype[type] : buftype[BT_UNKNOWN] extern long readcnt[BT_NUMBUFTYPES]; extern long totalreadcnt[BT_NUMBUFTYPES]; extern struct timespec readtime[BT_NUMBUFTYPES]; extern struct timespec totalreadtime[BT_NUMBUFTYPES]; extern struct timespec startprog; extern struct bufarea *icachebp; /* inode cache buffer */ extern struct bufarea sblk; /* file system superblock */ extern struct bufarea *pdirbp; /* current directory contents */ #define dirty(bp) do { \ if (fswritefd < 0) \ pfatal("SETTING DIRTY FLAG IN READ_ONLY MODE\n"); \ else \ (bp)->b_flags |= B_DIRTY; \ } while (0) #define initbarea(bp, type) do { \ (bp)->b_bno = (ufs2_daddr_t)-4; \ (bp)->b_size = 0; \ (bp)->b_errs = 0; \ (bp)->b_flags = 0; \ (bp)->b_type = type; \ (bp)->b_refcnt = 0; \ (bp)->b_index = 0; \ } while (0) #define sbdirty() dirty(&sblk) #define sblock (*sblk.b_un.b_fs) enum fixstate {DONTKNOW, NOFIX, FIX, IGNORE}; extern ino_t cursnapshot; struct inodesc { enum fixstate id_fix; /* policy on fixing errors */ int (*id_func)(struct inodesc *); /* function to be applied to blocks of inode */ struct bufarea *id_bp; /* ckinode: buffer with indirect pointers */ union dinode *id_dp; /* ckinode: dinode being traversed */ ino_t id_number; /* inode number described */ ino_t id_parent; /* for DATA nodes, their parent */ ufs_lbn_t id_lbn; /* logical block number of current block */ ufs2_daddr_t id_blkno; /* current block number being examined */ int id_level; /* level of indirection of this block */ int id_numfrags; /* number of frags contained in block */ ufs_lbn_t id_lballoc; /* pass1: last LBN that is allocated */ off_t id_filesize; /* for DATA nodes, the size of the directory */ ufs2_daddr_t id_entryno;/* for DATA nodes, current entry number */ int id_loc; /* for DATA nodes, current location in dir */ struct direct *id_dirp; /* for DATA nodes, ptr to current entry */ char *id_name; /* for DATA nodes, name to find or enter */ char id_type; /* type of descriptor, DATA, ADDR, or SNAP */ }; /* file types */ #define DATA 1 /* a directory */ #define SNAP 2 /* a snapshot */ #define ADDR 3 /* anything but a directory or a snapshot */ /* * Linked list of duplicate blocks. * * The list is composed of two parts. The first part of the * list (from duplist through the node pointed to by muldup) * contains a single copy of each duplicate block that has been * found. The second part of the list (from muldup to the end) * contains duplicate blocks that have been found more than once. * To check if a block has been found as a duplicate it is only * necessary to search from duplist through muldup. To find the * total number of times that a block has been found as a duplicate * the entire list must be searched for occurrences of the block * in question. The following diagram shows a sample list where * w (found twice), x (found once), y (found three times), and z * (found once) are duplicate block numbers: * * w -> y -> x -> z -> y -> w -> y * ^ ^ * | | * duplist muldup */ struct dups { struct dups *next; ufs2_daddr_t dup; }; extern struct dups *duplist; /* head of dup list */ extern struct dups *muldup; /* end of unique duplicate dup block numbers */ /* * Inode cache data structures. */ struct inoinfo { SLIST_ENTRY(inoinfo) i_hash; /* hash list */ ino_t i_number; /* inode number of this entry */ ino_t i_parent; /* inode number of parent */ ino_t i_dotdot; /* inode number of `..' */ size_t i_isize; /* size of inode */ u_int i_depth; /* depth of directory from root */ u_int i_flags; /* flags, see below */ u_int i_numblks; /* size of block array in bytes */ ufs2_daddr_t i_blks[1]; /* actually longer */ }; extern SLIST_HEAD(inohash, inoinfo) *inphash; extern struct inoinfo **inpsort; /* * flags for struct inoinfo */ #define INFO_NEW 0x0000001 /* replaced broken directory */ extern long dirhash, inplast; extern unsigned long numdirs, listmax; extern long countdirs; /* number of directories we actually found */ #define MIBSIZE 3 /* size of fsck sysctl MIBs */ extern int adjblkcnt[MIBSIZE]; /* MIB cmd to adjust inode block count */ extern int adjrefcnt[MIBSIZE]; /* MIB cmd to adjust inode reference count */ extern int adjndir[MIBSIZE]; /* MIB cmd to adjust number of directories */ extern int adjnbfree[MIBSIZE]; /* MIB cmd to adjust number of free blocks */ extern int adjnifree[MIBSIZE]; /* MIB cmd to adjust number of free inodes */ extern int adjnffree[MIBSIZE]; /* MIB cmd to adjust number of free frags */ extern int adjnumclusters[MIBSIZE]; /* MIB cmd adjust number of free clusters */ extern int adjdepth[MIBSIZE]; /* MIB cmd to adjust directory depth count */ extern int freefiles[MIBSIZE]; /* MIB cmd to free a set of files */ extern int freedirs[MIBSIZE]; /* MIB cmd to free a set of directories */ extern int freeblks[MIBSIZE]; /* MIB cmd to free a set of data blocks */ extern int setsize[MIBSIZE]; /* MIB cmd to set inode size */ extern struct fsck_cmd cmd; /* sysctl file system update commands */ extern int bkgrdcheck; /* determine if background check is possible */ extern int bkgrdsumadj; /* whether the kernel has the ability to adjust the superblock summary fields */ extern off_t bflag; /* location of alternate super block */ extern int bkgrdflag; /* use a snapshot to run on an active system */ extern char *blockmap; /* ptr to primary blk allocation map */ extern char *cdevname; /* name of device being checked */ extern int cgheader_corrupt; /* one or more CG headers are corrupt */ extern char ckclean; /* only do work if not cleanly unmounted */ extern int ckhashadd; /* check hashes to be added */ extern char *copybuf; /* buffer to copy snapshot blocks */ extern int cvtlevel; /* convert to newer file system format */ extern long dev_bsize; /* computed value of DEV_BSIZE */ extern u_int real_dev_bsize; /* actual disk sector size, not overridden */ extern int debug; /* output debugging info */ extern int Eflag; /* delete empty data blocks */ extern int fsmodified; /* 1 => write done to file system */ extern int fsreadfd; /* file descriptor for reading file system */ extern int fswritefd; /* file descriptor for writing file system */ extern char havesb; /* superblock has been read */ extern int inoopt; /* trim out unused inodes */ extern ino_t lfdir; /* lost & found directory inode number */ extern int lfmode; /* lost & found directory creation mode */ extern const char *lfname; /* lost & found directory name */ extern ufs2_daddr_t maxfsblock; /* number of blocks in the file system */ extern ino_t maxino; /* number of inodes in file system */ extern ufs2_daddr_t n_blks; /* number of blocks in use */ extern ino_t n_files; /* number of files in use */ extern char nflag; /* assume a no response */ extern char preen; /* just fix normal inconsistencies */ extern char rerun; /* rerun fsck. Only used in non-preen mode */ extern char resolved; /* cleared if unresolved changes => not clean */ extern int returntosingle; /* 1 => return to single user mode on exit */ extern long secsize; /* actual disk sector size */ extern char skipclean; /* skip clean file systems if preening */ extern int snapcnt; /* number of active snapshots */ extern struct inode snaplist[FSMAXSNAP + 1]; /* list of active snapshots */ extern int sujrecovery; /* 1 => doing check using the journal */ extern int surrender; /* Give up if reads fail */ extern char usedsoftdep; /* just fix soft dependency inconsistencies */ extern int wantrestart; /* Restart fsck on early termination */ extern char yflag; /* assume a yes response */ extern int zflag; /* zero unused directory space */ extern int Zflag; /* zero empty data blocks */ extern volatile sig_atomic_t got_siginfo; /* received a SIGINFO */ extern volatile sig_atomic_t got_sigalarm; /* received a SIGALRM */ #define clearinode(dp) \ if (sblock.fs_magic == FS_UFS1_MAGIC) { \ (dp)->dp1 = zino.dp1; \ } else { \ (dp)->dp2 = zino.dp2; \ } extern union dinode zino; #define setbmap(blkno) setbit(blockmap, blkno) #define testbmap(blkno) isset(blockmap, blkno) #define clrbmap(blkno) clrbit(blockmap, blkno) #define STOP 0x01 #define SKIP 0x02 #define KEEPON 0x04 #define ALTERED 0x08 #define FOUND 0x10 #define EEXIT 8 /* Standard error exit. */ #define ERERUN 16 /* fsck needs to be re-run. */ #define ERESTART -1 int flushentry(void); /* * Wrapper for malloc() that flushes the cylinder group cache to try * to get space. */ static inline void* Malloc(size_t size) { void *retval; while ((retval = malloc(size)) == NULL) if (flushentry() == 0) break; return (retval); } /* * Wrapper for calloc() that flushes the cylinder group cache to try * to get space. */ static inline void* Calloc(size_t cnt, size_t size) { void *retval; while ((retval = calloc(cnt, size)) == NULL) if (flushentry() == 0) break; return (retval); } struct fstab; void adjust(struct inodesc *, int lcnt); void alarmhandler(int sig); ufs2_daddr_t allocblk(long cg, long frags, ufs2_daddr_t (*checkblkavail) (ufs2_daddr_t blkno, long frags)); ino_t allocdir(ino_t parent, ino_t request, int mode); ino_t allocino(ino_t request, int type); void binval(struct bufarea *); void blkerror(ino_t ino, const char *type, ufs2_daddr_t blk); char *blockcheck(char *name); int blread(int fd, char *buf, ufs2_daddr_t blk, long size); void bufinit(void); void blwrite(int fd, char *buf, ufs2_daddr_t blk, ssize_t size); void blerase(int fd, ufs2_daddr_t blk, long size); void blzero(int fd, ufs2_daddr_t blk, long size); void brelse(struct bufarea *); struct inoinfo *cacheino(union dinode *dp, ino_t inumber); void catch(int); void catchquit(int); void cgdirty(struct bufarea *); struct bufarea *cglookup(int cg); int changeino(ino_t dir, const char *name, ino_t newnum, int depth); void check_blkcnt(struct inode *ip); int check_cgmagic(int cg, struct bufarea *cgbp); void rebuild_cg(int cg, struct bufarea *cgbp); void check_dirdepth(struct inoinfo *inp); int chkfilesize(mode_t mode, u_int64_t filesize); int chkrange(ufs2_daddr_t blk, int cnt); void ckfini(int markclean); int ckinode(union dinode *dp, struct inodesc *); void clri(struct inodesc *, const char *type, int flag); int clearentry(struct inodesc *); void copyonwrite(struct fs *, struct bufarea *, ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t, long)); void direrror(ino_t ino, const char *errmesg); int dirscan(struct inodesc *); int dofix(struct inodesc *, const char *msg); int eascan(struct inodesc *, struct ufs2_dinode *dp); void fileerror(ino_t cwd, ino_t ino, const char *errmesg); void finalIOstats(void); int findino(struct inodesc *); int findname(struct inodesc *); void flush(int fd, struct bufarea *bp); int freeblock(struct inodesc *); void freedirino(ino_t ino, ino_t parent); void freeino(ino_t ino); void freeinodebuf(void); void fsckinit(void); void fsutilinit(void); int ftypeok(union dinode *dp); void getblk(struct bufarea *bp, ufs2_daddr_t blk, long size); struct bufarea *getdatablk(ufs2_daddr_t blkno, long size, int type); struct inoinfo *getinoinfo(ino_t inumber); union dinode *getnextinode(ino_t inumber, int rebuiltcg); void getpathname(char *namebuf, ino_t curdir, ino_t ino); void ginode(ino_t, struct inode *); void gjournal_check(const char *filesys); void infohandler(int sig); void irelse(struct inode *); ufs2_daddr_t ino_blkatoff(union dinode *, ino_t, ufs_lbn_t, int *, struct bufarea **); void inocleanup(void); void inodirty(struct inode *); struct inostat *inoinfo(ino_t inum); void IOstats(char *what); int linkup(ino_t orphan, ino_t parentdir, char *name); int makeentry(ino_t parent, ino_t ino, const char *name); int openfilesys(char *dev); void panic(const char *fmt, ...) __printflike(1, 2); void pass1(void); void pass1b(void); int pass1check(struct inodesc *); void pass2(void); void pass3(void); void pass4(void); void pass5(void); void pfatal(const char *fmt, ...) __printflike(1, 2); void propagate(void); void prtbuf(struct bufarea *, const char *, ...) __printflike(2, 3); void prtinode(struct inode *); void pwarn(const char *fmt, ...) __printflike(1, 2); int readsb(void); int removecachedino(ino_t); int reply(const char *question); void rwerror(const char *mesg, ufs2_daddr_t blk); void sblock_init(void); void setinodebuf(int, ino_t); int setup(char *dev); int snapblkfree(struct fs *, ufs2_daddr_t, long, ino_t, ufs2_daddr_t (*)(ufs2_daddr_t, long)); void snapremove(ino_t); void snapflush(ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t, long)); ufs2_daddr_t std_checkblkavail(ufs2_daddr_t blkno, long frags); ufs2_daddr_t suj_checkblkavail(ufs2_daddr_t, long); int suj_check(const char *filesys); void update_maps(struct cg *, struct cg*, int); #endif /* !_FSCK_H_ */ diff --git a/sbin/fsck_ffs/inode.c b/sbin/fsck_ffs/inode.c index 3db8a5e5c23d..f7b9b941fdb4 100644 --- a/sbin/fsck_ffs/inode.c +++ b/sbin/fsck_ffs/inode.c @@ -1,1475 +1,1499 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char sccsid[] = "@(#)inode.c 8.8 (Berkeley) 4/28/95"; #endif /* not lint */ #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fsck.h" struct bufarea *icachebp; /* inode cache buffer */ +static time_t now; /* current time of day */ static int iblock(struct inodesc *, off_t isize, int type); static ufs2_daddr_t indir_blkatoff(ufs2_daddr_t, ino_t, ufs_lbn_t, ufs_lbn_t, struct bufarea **); static int snapclean(struct inodesc *idesc); static void chkcopyonwrite(struct fs *, ufs2_daddr_t, ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t, long)); int ckinode(union dinode *dp, struct inodesc *idesc) { off_t remsize, sizepb; int i, offset, ret; struct inode ip; union dinode dino; ufs2_daddr_t ndb; mode_t mode; char pathbuf[MAXPATHLEN + 1]; if (idesc->id_fix != IGNORE) idesc->id_fix = DONTKNOW; idesc->id_dp = dp; idesc->id_lbn = -1; idesc->id_lballoc = -1; idesc->id_level = 0; idesc->id_entryno = 0; idesc->id_filesize = DIP(dp, di_size); mode = DIP(dp, di_mode) & IFMT; if (mode == IFBLK || mode == IFCHR || (mode == IFLNK && DIP(dp, di_size) < (unsigned)sblock.fs_maxsymlinklen)) return (KEEPON); if (sblock.fs_magic == FS_UFS1_MAGIC) dino.dp1 = dp->dp1; else dino.dp2 = dp->dp2; if (DIP(&dino, di_size) < 0) { pfatal("NEGATIVE INODE SIZE %jd\n", DIP(&dino, di_size)); return (STOP); } ndb = howmany(DIP(&dino, di_size), sblock.fs_bsize); for (i = 0; i < UFS_NDADDR; i++) { idesc->id_lbn++; if (--ndb == 0 && (offset = blkoff(&sblock, DIP(&dino, di_size))) != 0) idesc->id_numfrags = numfrags(&sblock, fragroundup(&sblock, offset)); else idesc->id_numfrags = sblock.fs_frag; if (DIP(&dino, di_db[i]) == 0) { if (idesc->id_type == DATA && ndb >= 0) { /* An empty block in a directory XXX */ getpathname(pathbuf, idesc->id_number, idesc->id_number); pfatal("DIRECTORY %s: CONTAINS EMPTY BLOCKS", pathbuf); if (reply("ADJUST LENGTH") == 1) { ginode(idesc->id_number, &ip); DIP_SET(ip.i_dp, di_size, i * sblock.fs_bsize); printf( "YOU MUST RERUN FSCK AFTERWARDS\n"); rerun = 1; inodirty(&ip); irelse(&ip); } return (STOP); } continue; } idesc->id_blkno = DIP(&dino, di_db[i]); if (idesc->id_type != DATA) ret = (*idesc->id_func)(idesc); else ret = dirscan(idesc); if (ret & STOP) return (ret); } idesc->id_numfrags = sblock.fs_frag; remsize = DIP(&dino, di_size) - sblock.fs_bsize * UFS_NDADDR; sizepb = sblock.fs_bsize; for (i = 0; i < UFS_NIADDR; i++) { sizepb *= NINDIR(&sblock); idesc->id_level = i + 1; if (DIP(&dino, di_ib[i])) { idesc->id_blkno = DIP(&dino, di_ib[i]); ret = iblock(idesc, remsize, BT_LEVEL1 + i); if (ret & STOP) return (ret); } else if (remsize > 0) { idesc->id_lbn += sizepb / sblock.fs_bsize; if (idesc->id_type == DATA) { /* An empty block in a directory XXX */ getpathname(pathbuf, idesc->id_number, idesc->id_number); pfatal("DIRECTORY %s: CONTAINS EMPTY BLOCKS", pathbuf); if (reply("ADJUST LENGTH") == 1) { ginode(idesc->id_number, &ip); DIP_SET(ip.i_dp, di_size, DIP(ip.i_dp, di_size) - remsize); remsize = 0; printf( "YOU MUST RERUN FSCK AFTERWARDS\n"); rerun = 1; inodirty(&ip); irelse(&ip); break; } } } remsize -= sizepb; } return (KEEPON); } static int iblock(struct inodesc *idesc, off_t isize, int type) { struct inode ip; struct bufarea *bp; int i, n, (*func)(struct inodesc *), nif; off_t sizepb; char buf[BUFSIZ]; char pathbuf[MAXPATHLEN + 1]; if (idesc->id_type != DATA) { func = idesc->id_func; if (((n = (*func)(idesc)) & KEEPON) == 0) return (n); } else func = dirscan; bp = getdatablk(idesc->id_blkno, sblock.fs_bsize, type); if (bp->b_errs != 0) { brelse(bp); return (SKIP); } idesc->id_bp = bp; idesc->id_level--; for (sizepb = sblock.fs_bsize, i = 0; i < idesc->id_level; i++) sizepb *= NINDIR(&sblock); if (howmany(isize, sizepb) > NINDIR(&sblock)) nif = NINDIR(&sblock); else nif = howmany(isize, sizepb); if (idesc->id_func == pass1check && nif < NINDIR(&sblock)) { for (i = nif; i < NINDIR(&sblock); i++) { if (IBLK(bp, i) == 0) continue; (void)sprintf(buf, "PARTIALLY TRUNCATED INODE I=%lu", (u_long)idesc->id_number); if (preen) { pfatal("%s", buf); } else if (dofix(idesc, buf)) { IBLK_SET(bp, i, 0); dirty(bp); } } flush(fswritefd, bp); } for (i = 0; i < nif; i++) { if (IBLK(bp, i)) { idesc->id_blkno = IBLK(bp, i); bp->b_index = i; if (idesc->id_level == 0) { idesc->id_lbn++; n = (*func)(idesc); } else { n = iblock(idesc, isize, type - 1); idesc->id_level++; } if (n & STOP) { brelse(bp); return (n); } } else { idesc->id_lbn += sizepb / sblock.fs_bsize; if (idesc->id_type == DATA && isize > 0) { /* An empty block in a directory XXX */ getpathname(pathbuf, idesc->id_number, idesc->id_number); pfatal("DIRECTORY %s: CONTAINS EMPTY BLOCKS", pathbuf); if (reply("ADJUST LENGTH") == 1) { ginode(idesc->id_number, &ip); DIP_SET(ip.i_dp, di_size, DIP(ip.i_dp, di_size) - isize); isize = 0; printf( "YOU MUST RERUN FSCK AFTERWARDS\n"); rerun = 1; inodirty(&ip); brelse(bp); return(STOP); } } } isize -= sizepb; } brelse(bp); return (KEEPON); } /* * Finds the disk block address at the specified lbn within the inode * specified by dp. This follows the whole tree and honors di_size and * di_extsize so it is a true test of reachability. The lbn may be * negative if an extattr or indirect block is requested. */ ufs2_daddr_t ino_blkatoff(union dinode *dp, ino_t ino, ufs_lbn_t lbn, int *frags, struct bufarea **bpp) { ufs_lbn_t tmpval; ufs_lbn_t cur; ufs_lbn_t next; int i; *frags = 0; if (bpp != NULL) *bpp = NULL; /* * Handle extattr blocks first. */ if (lbn < 0 && lbn >= -UFS_NXADDR) { lbn = -1 - lbn; if (lbn > lblkno(&sblock, dp->dp2.di_extsize - 1)) return (0); *frags = numfrags(&sblock, sblksize(&sblock, dp->dp2.di_extsize, lbn)); return (dp->dp2.di_extb[lbn]); } /* * Now direct and indirect. */ if (DIP(dp, di_mode) == IFLNK && DIP(dp, di_size) < sblock.fs_maxsymlinklen) return (0); if (lbn >= 0 && lbn < UFS_NDADDR) { *frags = numfrags(&sblock, sblksize(&sblock, DIP(dp, di_size), lbn)); return (DIP(dp, di_db[lbn])); } *frags = sblock.fs_frag; for (i = 0, tmpval = NINDIR(&sblock), cur = UFS_NDADDR; i < UFS_NIADDR; i++, tmpval *= NINDIR(&sblock), cur = next) { next = cur + tmpval; if (lbn == -cur - i) return (DIP(dp, di_ib[i])); /* * Determine whether the lbn in question is within this tree. */ if (lbn < 0 && -lbn >= next) continue; if (lbn > 0 && lbn >= next) continue; if (DIP(dp, di_ib[i]) == 0) return (0); return (indir_blkatoff(DIP(dp, di_ib[i]), ino, -cur - i, lbn, bpp)); } pfatal("lbn %jd not in ino %ju\n", lbn, (uintmax_t)ino); return (0); } /* * Fetch an indirect block to find the block at a given lbn. The lbn * may be negative to fetch a specific indirect block pointer or positive * to fetch a specific block. */ static ufs2_daddr_t indir_blkatoff(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t cur, ufs_lbn_t lbn, struct bufarea **bpp) { struct bufarea *bp; ufs_lbn_t lbnadd; ufs_lbn_t base; int i, level; level = lbn_level(cur); if (level == -1) pfatal("Invalid indir lbn %jd in ino %ju\n", lbn, (uintmax_t)ino); if (level == 0 && lbn < 0) pfatal("Invalid lbn %jd in ino %ju\n", lbn, (uintmax_t)ino); lbnadd = 1; base = -(cur + level); for (i = level; i > 0; i--) lbnadd *= NINDIR(&sblock); if (lbn > 0) i = (lbn - base) / lbnadd; else i = (-lbn - base) / lbnadd; if (i < 0 || i >= NINDIR(&sblock)) { pfatal("Invalid indirect index %d produced by lbn %jd " "in ino %ju\n", i, lbn, (uintmax_t)ino); return (0); } if (level == 0) cur = base + (i * lbnadd); else cur = -(base + (i * lbnadd)) - (level - 1); bp = getdatablk(blk, sblock.fs_bsize, BT_LEVEL1 + level); if (bp->b_errs != 0) return (0); blk = IBLK(bp, i); bp->b_index = i; if (cur == lbn || blk == 0) { if (bpp != NULL) *bpp = bp; else brelse(bp); return (blk); } brelse(bp); if (level == 0) pfatal("Invalid lbn %jd at level 0 for ino %ju\n", lbn, (uintmax_t)ino); return (indir_blkatoff(blk, ino, cur, lbn, bpp)); } /* * Check that a block in a legal block number. * Return 0 if in range, 1 if out of range. */ int chkrange(ufs2_daddr_t blk, int cnt) { int c; if (cnt <= 0 || blk <= 0 || blk >= maxfsblock || cnt > maxfsblock - blk) { if (debug) printf("out of range: blk %ld, offset %i, size %d\n", (long)blk, (int)fragnum(&sblock, blk), cnt); return (1); } if (cnt > sblock.fs_frag || fragnum(&sblock, blk) + cnt > sblock.fs_frag) { if (debug) printf("bad size: blk %ld, offset %i, size %d\n", (long)blk, (int)fragnum(&sblock, blk), cnt); return (1); } c = dtog(&sblock, blk); if (blk < cgdmin(&sblock, c)) { if ((blk + cnt) > cgsblock(&sblock, c)) { if (debug) { printf("blk %ld < cgdmin %ld;", (long)blk, (long)cgdmin(&sblock, c)); printf(" blk + cnt %ld > cgsbase %ld\n", (long)(blk + cnt), (long)cgsblock(&sblock, c)); } return (1); } } else { if ((blk + cnt) > cgbase(&sblock, c+1)) { if (debug) { printf("blk %ld >= cgdmin %ld;", (long)blk, (long)cgdmin(&sblock, c)); printf(" blk + cnt %ld > sblock.fs_fpg %ld\n", (long)(blk + cnt), (long)sblock.fs_fpg); } return (1); } } return (0); } /* * General purpose interface for reading inodes. * * firstinum and lastinum track contents of getnextino() cache (below). */ static ino_t firstinum, lastinum; static struct bufarea inobuf; void ginode(ino_t inumber, struct inode *ip) { ufs2_daddr_t iblk; + union dinodep dpp; struct ufs2_dinode *dp; if (inumber < UFS_ROOTINO || inumber >= maxino) errx(EEXIT, "bad inode number %ju to ginode", (uintmax_t)inumber); ip->i_number = inumber; if (inumber >= firstinum && inumber < lastinum) { /* contents in getnextino() cache */ ip->i_bp = &inobuf; inobuf.b_refcnt++; inobuf.b_index = firstinum; } else if (icachebp != NULL && inumber >= icachebp->b_index && inumber < icachebp->b_index + INOPB(&sblock)) { /* take an additional reference for the returned inode */ icachebp->b_refcnt++; ip->i_bp = icachebp; } else { iblk = ino_to_fsba(&sblock, inumber); /* release our cache-hold reference on old icachebp */ if (icachebp != NULL) brelse(icachebp); icachebp = getdatablk(iblk, sblock.fs_bsize, BT_INODES); if (icachebp->b_errs != 0) { icachebp = NULL; ip->i_bp = NULL; ip->i_dp = &zino; return; } /* take a cache-hold reference on new icachebp */ icachebp->b_refcnt++; icachebp->b_index = rounddown(inumber, INOPB(&sblock)); ip->i_bp = icachebp; } if (sblock.fs_magic == FS_UFS1_MAGIC) { ip->i_dp = (union dinode *) &ip->i_bp->b_un.b_dinode1[inumber - ip->i_bp->b_index]; + dpp.dp1 = (struct ufs1_dinode *)ip->i_dp; + if (ffs_oldfscompat_inode_read(&sblock, dpp, now)) + inodirty(ip); return; } ip->i_dp = (union dinode *) &ip->i_bp->b_un.b_dinode2[inumber - ip->i_bp->b_index]; - dp = (struct ufs2_dinode *)ip->i_dp; + dpp.dp2 = dp = (struct ufs2_dinode *)ip->i_dp; /* Do not check hash of inodes being created */ if (dp->di_mode != 0 && ffs_verify_dinode_ckhash(&sblock, dp)) { pwarn("INODE CHECK-HASH FAILED"); prtinode(ip); if (preen || reply("FIX") != 0) { if (preen) printf(" (FIXED)\n"); ffs_update_dinode_ckhash(&sblock, dp); inodirty(ip); } } + if (ffs_oldfscompat_inode_read(&sblock, dpp, now)) + inodirty(ip); } /* * Release a held inode. */ void irelse(struct inode *ip) { /* Check for failed inode read */ if (ip->i_bp == NULL) return; if (debug && sblock.fs_magic == FS_UFS2_MAGIC && ffs_verify_dinode_ckhash(&sblock, (struct ufs2_dinode *)ip->i_dp)) { pwarn("irelse: releasing inode with bad check-hash"); prtinode(ip); } if (ip->i_bp->b_refcnt <= 0) pfatal("irelse: releasing unreferenced ino %ju\n", (uintmax_t) ip->i_number); brelse(ip->i_bp); } /* * Special purpose version of ginode used to optimize first pass * over all the inodes in numerical order. */ static ino_t nextinum, lastvalidinum; static long readcount, readpercg, fullcnt, inobufsize, partialcnt, partialsize; union dinode * getnextinode(ino_t inumber, int rebuiltcg) { int j; long size; mode_t mode; ufs2_daddr_t ndb, blk; union dinode *dp; + union dinodep dpp; struct inode ip; static caddr_t nextinop; if (inumber != nextinum++ || inumber > lastvalidinum) errx(EEXIT, "bad inode number %ju to nextinode", (uintmax_t)inumber); if (inumber >= lastinum) { readcount++; firstinum = lastinum; blk = ino_to_fsba(&sblock, lastinum); if (readcount % readpercg == 0) { size = partialsize; lastinum += partialcnt; } else { size = inobufsize; lastinum += fullcnt; } /* * Flush old contents in case they have been updated. * If getblk encounters an error, it will already have zeroed * out the buffer, so we do not need to do so here. */ if (inobuf.b_refcnt != 0) pfatal("Non-zero getnextinode() ref count %d\n", inobuf.b_refcnt); flush(fswritefd, &inobuf); getblk(&inobuf, blk, size); nextinop = inobuf.b_un.b_buf; } dp = (union dinode *)nextinop; - if (sblock.fs_magic == FS_UFS1_MAGIC) + if (sblock.fs_magic == FS_UFS1_MAGIC) { nextinop += sizeof(struct ufs1_dinode); - else + dpp.dp1 = (struct ufs1_dinode *)dp; + } else { nextinop += sizeof(struct ufs2_dinode); + dpp.dp2 = (struct ufs2_dinode *)dp; + } if ((ckhashadd & CK_INODE) != 0) { ffs_update_dinode_ckhash(&sblock, (struct ufs2_dinode *)dp); dirty(&inobuf); } if (ffs_verify_dinode_ckhash(&sblock, (struct ufs2_dinode *)dp) != 0) { pwarn("INODE CHECK-HASH FAILED"); ip.i_bp = NULL; ip.i_dp = dp; ip.i_number = inumber; prtinode(&ip); if (preen || reply("FIX") != 0) { if (preen) printf(" (FIXED)\n"); ffs_update_dinode_ckhash(&sblock, (struct ufs2_dinode *)dp); dirty(&inobuf); } } + if (ffs_oldfscompat_inode_read(&sblock, dpp, now)) + dirty(&inobuf); if (rebuiltcg && (char *)dp == inobuf.b_un.b_buf) { /* * Try to determine if we have reached the end of the * allocated inodes. */ mode = DIP(dp, di_mode) & IFMT; if (mode == 0) { if (memcmp(dp->dp2.di_db, zino.dp2.di_db, UFS_NDADDR * sizeof(ufs2_daddr_t)) || memcmp(dp->dp2.di_ib, zino.dp2.di_ib, UFS_NIADDR * sizeof(ufs2_daddr_t)) || dp->dp2.di_mode || dp->dp2.di_size) return (NULL); return (dp); } if (!ftypeok(dp)) return (NULL); ndb = howmany(DIP(dp, di_size), sblock.fs_bsize); if (ndb < 0) return (NULL); if (mode == IFBLK || mode == IFCHR) ndb++; if (mode == IFLNK) { /* * Fake ndb value so direct/indirect block checks below * will detect any garbage after symlink string. */ if (DIP(dp, di_size) < (off_t)sblock.fs_maxsymlinklen) { ndb = howmany(DIP(dp, di_size), sizeof(ufs2_daddr_t)); if (ndb > UFS_NDADDR) { j = ndb - UFS_NDADDR; for (ndb = 1; j > 1; j--) ndb *= NINDIR(&sblock); ndb += UFS_NDADDR; } } } for (j = ndb; ndb < UFS_NDADDR && j < UFS_NDADDR; j++) if (DIP(dp, di_db[j]) != 0) return (NULL); for (j = 0, ndb -= UFS_NDADDR; ndb > 0; j++) ndb /= NINDIR(&sblock); for (; j < UFS_NIADDR; j++) if (DIP(dp, di_ib[j]) != 0) return (NULL); } return (dp); } void setinodebuf(int cg, ino_t inosused) { + struct timespec time; ino_t inum; + /* + * Get the current value of the present time. + * This will happen before each cylinder group is scanned. + * If for some reason getting the time fails, we will use + * the last time that the superblock was updated. + */ + if (clock_gettime(CLOCK_REALTIME_FAST, &time) == 0) + now = time.tv_sec; + else + now = sblock.fs_time; inum = cg * sblock.fs_ipg; lastvalidinum = inum + inosused - 1; nextinum = inum; lastinum = inum; readcount = 0; /* Flush old contents in case they have been updated */ flush(fswritefd, &inobuf); inobuf.b_bno = 0; if (inobuf.b_un.b_buf == NULL) { inobufsize = blkroundup(&sblock, MAX(INOBUFSIZE, sblock.fs_bsize)); initbarea(&inobuf, BT_INODES); if ((inobuf.b_un.b_buf = Malloc((unsigned)inobufsize)) == NULL) errx(EEXIT, "cannot allocate space for inode buffer"); } fullcnt = inobufsize / ((sblock.fs_magic == FS_UFS1_MAGIC) ? sizeof(struct ufs1_dinode) : sizeof(struct ufs2_dinode)); readpercg = inosused / fullcnt; partialcnt = inosused % fullcnt; partialsize = fragroundup(&sblock, partialcnt * ((sblock.fs_magic == FS_UFS1_MAGIC) ? sizeof(struct ufs1_dinode) : sizeof(struct ufs2_dinode))); if (partialcnt != 0) { readpercg++; } else { partialcnt = fullcnt; partialsize = inobufsize; } } int freeblock(struct inodesc *idesc) { struct dups *dlp; struct bufarea *cgbp; struct cg *cgp; ufs2_daddr_t blkno; long size, nfrags; blkno = idesc->id_blkno; if (idesc->id_type == SNAP) { pfatal("clearing a snapshot dinode\n"); return (STOP); } size = lfragtosize(&sblock, idesc->id_numfrags); if (snapblkfree(&sblock, blkno, size, idesc->id_number, std_checkblkavail)) return (KEEPON); for (nfrags = idesc->id_numfrags; nfrags > 0; blkno++, nfrags--) { if (chkrange(blkno, 1)) { return (SKIP); } else if (testbmap(blkno)) { for (dlp = duplist; dlp; dlp = dlp->next) { if (dlp->dup != blkno) continue; dlp->dup = duplist->dup; dlp = duplist; duplist = duplist->next; free((char *)dlp); break; } if (dlp == NULL) { clrbmap(blkno); n_blks--; } } } /* * If all successfully returned, account for them. */ if (nfrags == 0) { cgbp = cglookup(dtog(&sblock, idesc->id_blkno)); cgp = cgbp->b_un.b_cg; if (idesc->id_numfrags == sblock.fs_frag) cgp->cg_cs.cs_nbfree++; else cgp->cg_cs.cs_nffree += idesc->id_numfrags; cgdirty(cgbp); } return (KEEPON); } /* * Prepare a snapshot file for being removed. */ void snapremove(ino_t inum) { struct inodesc idesc; struct inode ip; int i; for (i = 0; i < snapcnt; i++) if (snaplist[i].i_number == inum) break; if (i == snapcnt) ginode(inum, &ip); else ip = snaplist[i]; if ((DIP(ip.i_dp, di_flags) & SF_SNAPSHOT) == 0) { printf("snapremove: inode %jd is not a snapshot\n", (intmax_t)inum); if (i == snapcnt) irelse(&ip); return; } if (debug) printf("snapremove: remove %sactive snapshot %jd\n", i == snapcnt ? "in" : "", (intmax_t)inum); /* * If on active snapshot list, remove it. */ if (i < snapcnt) { for (i++; i < FSMAXSNAP; i++) { if (sblock.fs_snapinum[i] == 0) break; snaplist[i - 1] = snaplist[i]; sblock.fs_snapinum[i - 1] = sblock.fs_snapinum[i]; } sblock.fs_snapinum[i - 1] = 0; bzero(&snaplist[i - 1], sizeof(struct inode)); snapcnt--; } memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_type = SNAP; idesc.id_func = snapclean; idesc.id_number = inum; (void)ckinode(ip.i_dp, &idesc); DIP_SET(ip.i_dp, di_flags, DIP(ip.i_dp, di_flags) & ~SF_SNAPSHOT); inodirty(&ip); irelse(&ip); } static int snapclean(struct inodesc *idesc) { ufs2_daddr_t blkno; struct bufarea *bp; union dinode *dp; blkno = idesc->id_blkno; if (blkno == 0) return (KEEPON); dp = idesc->id_dp; if (blkno == BLK_NOCOPY || blkno == BLK_SNAP) { if (idesc->id_lbn < UFS_NDADDR) { DIP_SET(dp, di_db[idesc->id_lbn], 0); } else { bp = idesc->id_bp; IBLK_SET(bp, bp->b_index, 0); dirty(bp); } } return (KEEPON); } /* * Notification that a block is being freed. Return zero if the free * should be allowed to proceed. Return non-zero if the snapshot file * wants to claim the block. The block will be claimed if it is an * uncopied part of one of the snapshots. It will be freed if it is * either a BLK_NOCOPY or has already been copied in all of the snapshots. * If a fragment is being freed, then all snapshots that care about * it must make a copy since a snapshot file can only claim full sized * blocks. Note that if more than one snapshot file maps the block, * we can pick one at random to claim it. Since none of the snapshots * can change, we are assurred that they will all see the same unmodified * image. When deleting a snapshot file (see ino_trunc above), we * must push any of these claimed blocks to one of the other snapshots * that maps it. These claimed blocks are easily identified as they will * have a block number equal to their logical block number within the * snapshot. A copied block can never have this property because they * must always have been allocated from a BLK_NOCOPY location. */ int snapblkfree(struct fs *fs, ufs2_daddr_t bno, long size, ino_t inum, ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t blkno, long frags)) { union dinode *dp; struct inode ip; struct bufarea *snapbp; ufs_lbn_t lbn; ufs2_daddr_t blkno, relblkno; int i, frags, claimedblk, copydone; /* If no snapshots, nothing to do */ if (snapcnt == 0) return (0); if (debug) printf("snapblkfree: in ino %jd free blkno %jd, size %jd\n", (intmax_t)inum, (intmax_t)bno, (intmax_t)size); relblkno = blknum(fs, bno); lbn = fragstoblks(fs, relblkno); /* Direct blocks are always pre-copied */ if (lbn < UFS_NDADDR) return (0); copydone = 0; claimedblk = 0; for (i = 0; i < snapcnt; i++) { /* * Lookup block being freed. */ ip = snaplist[i]; dp = ip.i_dp; blkno = ino_blkatoff(dp, inum != 0 ? inum : ip.i_number, lbn, &frags, &snapbp); /* * Check to see if block needs to be copied. */ if (blkno == 0) { /* * A block that we map is being freed. If it has not * been claimed yet, we will claim or copy it (below). */ claimedblk = 1; } else if (blkno == BLK_SNAP) { /* * No previous snapshot claimed the block, * so it will be freed and become a BLK_NOCOPY * (don't care) for us. */ if (claimedblk) pfatal("snapblkfree: inconsistent block type"); IBLK_SET(snapbp, snapbp->b_index, BLK_NOCOPY); dirty(snapbp); brelse(snapbp); continue; } else /* BLK_NOCOPY or default */ { /* * If the snapshot has already copied the block * (default), or does not care about the block, * it is not needed. */ brelse(snapbp); continue; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block. */ if (size == fs->fs_bsize) { if (debug) printf("Grabonremove snapshot %ju lbn %jd " "from inum %ju\n", (intmax_t)ip.i_number, (intmax_t)lbn, (uintmax_t)inum); IBLK_SET(snapbp, snapbp->b_index, relblkno); dirty(snapbp); brelse(snapbp); DIP_SET(dp, di_blocks, DIP(dp, di_blocks) + btodb(size)); inodirty(&ip); return (1); } /* First time through, read the contents of the old block. */ if (copydone == 0) { copydone = 1; if (blread(fsreadfd, copybuf, fsbtodb(fs, relblkno), fs->fs_bsize) != 0) { pfatal("Could not read snapshot %ju block " "%jd\n", (intmax_t)ip.i_number, (intmax_t)relblkno); continue; } } /* * This allocation will never require any additional * allocations for the snapshot inode. */ blkno = allocblk(dtog(fs, relblkno), fs->fs_frag, checkblkavail); if (blkno == 0) { pfatal("Could not allocate block for snapshot %ju\n", (intmax_t)ip.i_number); continue; } if (debug) printf("Copyonremove: snapino %jd lbn %jd for inum %ju " "size %ld new blkno %jd\n", (intmax_t)ip.i_number, (intmax_t)lbn, (uintmax_t)inum, size, (intmax_t)blkno); blwrite(fswritefd, copybuf, fsbtodb(fs, blkno), fs->fs_bsize); IBLK_SET(snapbp, snapbp->b_index, blkno); dirty(snapbp); brelse(snapbp); DIP_SET(dp, di_blocks, DIP(dp, di_blocks) + btodb(fs->fs_bsize)); inodirty(&ip); } return (0); } /* * Notification that a block is being written. Return if the block * is part of a snapshot as snapshots never track other snapshots. * The block will be copied in all of the snapshots that are tracking * it and have not yet copied it. Some buffers may hold more than one * block. Here we need to check each block in the buffer. */ void copyonwrite(struct fs *fs, struct bufarea *bp, ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t blkno, long frags)) { ufs2_daddr_t copyblkno; long i, numblks; /* If no snapshots, nothing to do. */ if (snapcnt == 0) return; numblks = blkroundup(fs, bp->b_size) / fs->fs_bsize; if (debug) prtbuf(bp, "copyonwrite: checking %jd block%s in buffer", (intmax_t)numblks, numblks > 1 ? "s" : ""); copyblkno = blknum(fs, dbtofsb(fs, bp->b_bno)); for (i = 0; i < numblks; i++) { chkcopyonwrite(fs, copyblkno, checkblkavail); copyblkno += fs->fs_frag; } } static void chkcopyonwrite(struct fs *fs, ufs2_daddr_t copyblkno, ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t blkno, long frags)) { struct inode ip; union dinode *dp; struct bufarea *snapbp; ufs2_daddr_t blkno; int i, frags, copydone; ufs_lbn_t lbn; lbn = fragstoblks(fs, copyblkno); /* Direct blocks are always pre-copied */ if (lbn < UFS_NDADDR) return; copydone = 0; for (i = 0; i < snapcnt; i++) { /* * Lookup block being freed. */ ip = snaplist[i]; dp = ip.i_dp; blkno = ino_blkatoff(dp, ip.i_number, lbn, &frags, &snapbp); /* * Check to see if block needs to be copied. */ if (blkno != 0) { /* * A block that we have already copied or don't track. */ brelse(snapbp); continue; } /* First time through, read the contents of the old block. */ if (copydone == 0) { copydone = 1; if (blread(fsreadfd, copybuf, fsbtodb(fs, copyblkno), fs->fs_bsize) != 0) { pfatal("Could not read snapshot %ju block " "%jd\n", (intmax_t)ip.i_number, (intmax_t)copyblkno); continue; } } /* * This allocation will never require any additional * allocations for the snapshot inode. */ if ((blkno = allocblk(dtog(fs, copyblkno), fs->fs_frag, checkblkavail)) == 0) { pfatal("Could not allocate block for snapshot %ju\n", (intmax_t)ip.i_number); continue; } if (debug) prtbuf(snapbp, "Copyonwrite: snapino %jd lbn %jd using " "blkno %ju setting in buffer", (intmax_t)ip.i_number, (intmax_t)lbn, (intmax_t)blkno); blwrite(fswritefd, copybuf, fsbtodb(fs, blkno), fs->fs_bsize); IBLK_SET(snapbp, snapbp->b_index, blkno); dirty(snapbp); brelse(snapbp); DIP_SET(dp, di_blocks, DIP(dp, di_blocks) + btodb(fs->fs_bsize)); inodirty(&ip); } return; } /* * Traverse an inode and check that its block count is correct * fixing it if necessary. */ void check_blkcnt(struct inode *ip) { struct inodesc idesc; union dinode *dp; ufs2_daddr_t ndb; int j, ret, offset; dp = ip->i_dp; memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_func = pass1check; idesc.id_number = ip->i_number; idesc.id_type = (DIP(dp, di_flags) & SF_SNAPSHOT) == 0 ? ADDR : SNAP; (void)ckinode(dp, &idesc); if (sblock.fs_magic == FS_UFS2_MAGIC && dp->dp2.di_extsize > 0) { ndb = howmany(dp->dp2.di_extsize, sblock.fs_bsize); for (j = 0; j < UFS_NXADDR; j++) { if (--ndb == 0 && (offset = blkoff(&sblock, dp->dp2.di_extsize)) != 0) idesc.id_numfrags = numfrags(&sblock, fragroundup(&sblock, offset)); else idesc.id_numfrags = sblock.fs_frag; if (dp->dp2.di_extb[j] == 0) continue; idesc.id_blkno = dp->dp2.di_extb[j]; ret = (*idesc.id_func)(&idesc); if (ret & STOP) break; } } idesc.id_entryno *= btodb(sblock.fs_fsize); if (DIP(dp, di_blocks) != idesc.id_entryno) { if (!(sujrecovery && preen)) { pwarn("INCORRECT BLOCK COUNT I=%lu (%ju should be %ju)", (u_long)idesc.id_number, (uintmax_t)DIP(dp, di_blocks), (uintmax_t)idesc.id_entryno); if (preen) printf(" (CORRECTED)\n"); else if (reply("CORRECT") == 0) return; } if (bkgrdflag == 0) { DIP_SET(dp, di_blocks, idesc.id_entryno); inodirty(ip); } else { cmd.value = idesc.id_number; cmd.size = idesc.id_entryno - DIP(dp, di_blocks); if (debug) printf("adjblkcnt ino %ju amount %lld\n", (uintmax_t)cmd.value, (long long)cmd.size); if (sysctl(adjblkcnt, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST INODE BLOCK COUNT", cmd.value); } } } void freeinodebuf(void) { struct bufarea *bp; int i; /* * Flush old contents in case they have been updated. */ flush(fswritefd, &inobuf); if (inobuf.b_un.b_buf != NULL) free((char *)inobuf.b_un.b_buf); inobuf.b_un.b_buf = NULL; firstinum = lastinum = 0; /* * Reload the snapshot inodes in case any of them changed. */ for (i = 0; i < snapcnt; i++) { bp = snaplist[i].i_bp; bp->b_errs = blread(fsreadfd, bp->b_un.b_buf, bp->b_bno, bp->b_size); } } /* * Routines to maintain information about directory inodes. * This is built during the first pass and used during the * second and third passes. * * Enter inodes into the cache. */ struct inoinfo * cacheino(union dinode *dp, ino_t inumber) { struct inoinfo *inp; int i, blks; if (getinoinfo(inumber) != NULL) pfatal("cacheino: duplicate entry for ino %jd\n", (intmax_t)inumber); if (howmany(DIP(dp, di_size), sblock.fs_bsize) > UFS_NDADDR) blks = UFS_NDADDR + UFS_NIADDR; else if (DIP(dp, di_size) > 0) blks = howmany(DIP(dp, di_size), sblock.fs_bsize); else blks = 1; inp = (struct inoinfo *) Malloc(sizeof(*inp) + (blks - 1) * sizeof(ufs2_daddr_t)); if (inp == NULL) errx(EEXIT, "cannot increase directory list"); SLIST_INSERT_HEAD(&inphash[inumber % dirhash], inp, i_hash); inp->i_flags = 0; inp->i_parent = inumber == UFS_ROOTINO ? UFS_ROOTINO : (ino_t)0; inp->i_dotdot = (ino_t)0; inp->i_number = inumber; inp->i_isize = DIP(dp, di_size); inp->i_depth = DIP(dp, di_dirdepth); inp->i_numblks = blks; for (i = 0; i < MIN(blks, UFS_NDADDR); i++) inp->i_blks[i] = DIP(dp, di_db[i]); if (blks > UFS_NDADDR) for (i = 0; i < UFS_NIADDR; i++) inp->i_blks[UFS_NDADDR + i] = DIP(dp, di_ib[i]); if (inplast == listmax) { listmax += 100; inpsort = (struct inoinfo **)reallocarray((char *)inpsort, listmax, sizeof(struct inoinfo *)); if (inpsort == NULL) errx(EEXIT, "cannot increase directory list"); } inpsort[inplast++] = inp; return (inp); } /* * Look up an inode cache structure. */ struct inoinfo * getinoinfo(ino_t inumber) { struct inoinfo *inp; SLIST_FOREACH(inp, &inphash[inumber % dirhash], i_hash) { if (inp->i_number != inumber) continue; return (inp); } return (NULL); } /* * Remove an entry from the inode cache and disk-order sorted list. * Return 0 on success and 1 on failure. */ int removecachedino(ino_t inumber) { struct inoinfo *inp, **inpp; char *listtype; listtype = "hash"; SLIST_FOREACH(inp, &inphash[inumber % dirhash], i_hash) { if (inp->i_number != inumber) continue; SLIST_REMOVE(&inphash[inumber % dirhash], inp, inoinfo, i_hash); for (inpp = &inpsort[inplast - 1]; inpp >= inpsort; inpp--) { if (*inpp != inp) continue; *inpp = inpsort[inplast - 1]; inplast--; free(inp); return (0); } listtype = "sort"; break; } pfatal("removecachedino: entry for ino %jd not found on %s list\n", (intmax_t)inumber, listtype); return (1); } /* * Clean up all the inode cache structure. */ void inocleanup(void) { struct inoinfo **inpp; if (inphash == NULL) return; for (inpp = &inpsort[inplast - 1]; inpp >= inpsort; inpp--) free((char *)(*inpp)); free((char *)inphash); inphash = NULL; free((char *)inpsort); inpsort = NULL; } void inodirty(struct inode *ip) { if (sblock.fs_magic == FS_UFS2_MAGIC) ffs_update_dinode_ckhash(&sblock, (struct ufs2_dinode *)ip->i_dp); dirty(ip->i_bp); } void clri(struct inodesc *idesc, const char *type, int flag) { union dinode *dp; struct inode ip; ginode(idesc->id_number, &ip); dp = ip.i_dp; if (flag == 1) { pwarn("%s %s", type, (DIP(dp, di_mode) & IFMT) == IFDIR ? "DIR" : "FILE"); prtinode(&ip); printf("\n"); } if (preen || reply("CLEAR") == 1) { if (preen) printf(" (CLEARED)\n"); n_files--; if (bkgrdflag == 0) { if (idesc->id_type == SNAP) { snapremove(idesc->id_number); idesc->id_type = ADDR; } (void)ckinode(dp, idesc); inoinfo(idesc->id_number)->ino_state = USTATE; clearinode(dp); inodirty(&ip); } else { cmd.value = idesc->id_number; cmd.size = -DIP(dp, di_nlink); if (debug) printf("adjrefcnt ino %ld amt %lld\n", (long)cmd.value, (long long)cmd.size); if (sysctl(adjrefcnt, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST INODE", cmd.value); } } irelse(&ip); } int findname(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; if (dirp->d_ino != idesc->id_parent || idesc->id_entryno < 2) { idesc->id_entryno++; return (KEEPON); } memmove(idesc->id_name, dirp->d_name, (size_t)dirp->d_namlen + 1); return (STOP|FOUND); } int findino(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; if (dirp->d_ino == 0) return (KEEPON); if (strcmp(dirp->d_name, idesc->id_name) == 0 && dirp->d_ino >= UFS_ROOTINO && dirp->d_ino < maxino) { idesc->id_parent = dirp->d_ino; return (STOP|FOUND); } return (KEEPON); } int clearentry(struct inodesc *idesc) { struct direct *dirp = idesc->id_dirp; if (dirp->d_ino != idesc->id_parent || idesc->id_entryno < 2) { idesc->id_entryno++; return (KEEPON); } dirp->d_ino = 0; return (STOP|FOUND|ALTERED); } void prtinode(struct inode *ip) { char *p; union dinode *dp; struct passwd *pw; time_t t; dp = ip->i_dp; printf(" I=%lu ", (u_long)ip->i_number); if (ip->i_number < UFS_ROOTINO || ip->i_number >= maxino) return; printf(" OWNER="); if ((pw = getpwuid((int)DIP(dp, di_uid))) != NULL) printf("%s ", pw->pw_name); else printf("%u ", (unsigned)DIP(dp, di_uid)); printf("MODE=%o\n", DIP(dp, di_mode)); if (preen) printf("%s: ", cdevname); printf("SIZE=%ju ", (uintmax_t)DIP(dp, di_size)); t = DIP(dp, di_mtime); if ((p = ctime(&t)) != NULL) printf("MTIME=%12.12s %4.4s ", &p[4], &p[20]); } void blkerror(ino_t ino, const char *type, ufs2_daddr_t blk) { pfatal("%jd %s I=%ju", (intmax_t)blk, type, (uintmax_t)ino); printf("\n"); switch (inoinfo(ino)->ino_state) { case FSTATE: case FZLINK: inoinfo(ino)->ino_state = FCLEAR; return; case DSTATE: case DZLINK: inoinfo(ino)->ino_state = DCLEAR; return; case FCLEAR: case DCLEAR: return; default: errx(EEXIT, "BAD STATE %d TO BLKERR", inoinfo(ino)->ino_state); /* NOTREACHED */ } } /* * allocate an unused inode */ ino_t allocino(ino_t request, int type) { ino_t ino; struct inode ip; union dinode *dp; struct bufarea *cgbp; struct cg *cgp; int cg, anyino; anyino = 0; if (request == 0) { request = UFS_ROOTINO; anyino = 1; } else if (inoinfo(request)->ino_state != USTATE) return (0); retry: for (ino = request; ino < maxino; ino++) if (inoinfo(ino)->ino_state == USTATE) break; if (ino >= maxino) return (0); cg = ino_to_cg(&sblock, ino); cgbp = cglookup(cg); cgp = cgbp->b_un.b_cg; if (!check_cgmagic(cg, cgbp)) { if (anyino == 0) return (0); request = (cg + 1) * sblock.fs_ipg; goto retry; } setbit(cg_inosused(cgp), ino % sblock.fs_ipg); cgp->cg_cs.cs_nifree--; switch (type & IFMT) { case IFDIR: inoinfo(ino)->ino_state = DSTATE; cgp->cg_cs.cs_ndir++; break; case IFREG: case IFLNK: inoinfo(ino)->ino_state = FSTATE; break; default: return (0); } cgdirty(cgbp); ginode(ino, &ip); dp = ip.i_dp; memset(dp, 0, ((sblock.fs_magic == FS_UFS1_MAGIC) ? sizeof(struct ufs1_dinode) : sizeof(struct ufs2_dinode))); DIP_SET(dp, di_db[0], allocblk(ino_to_cg(&sblock, ino), (long)1, std_checkblkavail)); if (DIP(dp, di_db[0]) == 0) { inoinfo(ino)->ino_state = USTATE; inodirty(&ip); irelse(&ip); return (0); } DIP_SET(dp, di_mode, type); DIP_SET(dp, di_atime, time(NULL)); DIP_SET(dp, di_ctime, DIP(dp, di_atime)); DIP_SET(dp, di_mtime, DIP(dp, di_ctime)); DIP_SET(dp, di_size, sblock.fs_fsize); DIP_SET(dp, di_blocks, btodb(sblock.fs_fsize)); n_files++; inodirty(&ip); irelse(&ip); inoinfo(ino)->ino_type = IFTODT(type); return (ino); } /* * deallocate an inode */ void freeino(ino_t ino) { struct inodesc idesc; union dinode *dp; struct inode ip; memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_type = ADDR; idesc.id_func = freeblock; idesc.id_number = ino; ginode(ino, &ip); dp = ip.i_dp; (void)ckinode(dp, &idesc); clearinode(dp); inodirty(&ip); irelse(&ip); inoinfo(ino)->ino_state = USTATE; n_files--; } diff --git a/sbin/newfs/mkfs.c b/sbin/newfs/mkfs.c index b0d178de984e..0101c8bdf893 100644 --- a/sbin/newfs/mkfs.c +++ b/sbin/newfs/mkfs.c @@ -1,1236 +1,1232 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program. * * Copyright (c) 1980, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static char sccsid[] = "@(#)mkfs.c 8.11 (Berkeley) 5/3/95"; #endif /* not lint */ #endif #include #define _WANT_P_OSREL #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "newfs.h" /* * make file system for cylinder-group style file systems */ #define UMASK 0755 #define POWEROF2(num) (((num) & ((num) - 1)) == 0) /* * The definition of "struct cg" used to contain an extra field at the end * to represent the variable-length data that followed the fixed structure. * This had the effect of artificially limiting the number of blocks that * newfs would put in a CG, since newfs thought that the fixed-size header * was bigger than it really was. When we started validating that the CG * header data actually fit into one fs block, the placeholder field caused * a problem because it caused struct cg to be a different size depending on * platform. The placeholder field was later removed, but this caused a * backward compatibility problem with older binaries that still thought * struct cg was larger, and a new file system could fail validation if * viewed by the older binaries. To avoid this compatibility problem, we * now artificially reduce the amount of space that the variable-length data * can use such that new file systems will pass validation by older binaries. */ #define CGSIZEFUDGE 8 static struct csum *fscs; #define sblock disk.d_fs #define acg disk.d_cg -union dinode { - struct ufs1_dinode dp1; - struct ufs2_dinode dp2; -}; #define DIP(dp, field) \ ((sblock.fs_magic == FS_UFS1_MAGIC) ? \ (dp)->dp1.field : (dp)->dp2.field) static caddr_t iobuf; static long iobufsize; static ufs2_daddr_t alloc(int size, int mode); static int charsperline(void); static void clrblock(struct fs *, unsigned char *, int); static void fsinit(time_t); static int ilog2(int); static void initcg(int, time_t); static int isblock(struct fs *, unsigned char *, int); static void iput(union dinode *, ino_t); static int makedir(struct direct *, int); static void setblock(struct fs *, unsigned char *, int); static void wtfs(ufs2_daddr_t, int, char *); static u_int32_t newfs_random(void); void mkfs(struct partition *pp, char *fsys) { int fragsperinode, optimalfpg, origdensity, minfpg, lastminfpg; long i, j, csfrags; uint cg; time_t utime; quad_t sizepb; int width; ino_t maxinum; int minfragsperinode; /* minimum ratio of frags to inodes */ char tmpbuf[100]; /* XXX this will break in about 2,500 years */ struct fsrecovery *fsr; char *fsrbuf; union { struct fs fdummy; char cdummy[SBLOCKSIZE]; } dummy; #define fsdummy dummy.fdummy #define chdummy dummy.cdummy /* * Our blocks == sector size, and the version of UFS we are using is * specified by Oflag. */ disk.d_bsize = sectorsize; disk.d_ufs = Oflag; if (Rflag) utime = 1000000000; else time(&utime); if ((sblock.fs_si = malloc(sizeof(struct fs_summary_info))) == NULL) { printf("Superblock summary info allocation failed.\n"); exit(18); } sblock.fs_old_flags = FS_FLAGS_UPDATED; sblock.fs_flags = 0; if (Uflag) sblock.fs_flags |= FS_DOSOFTDEP; if (Lflag) strlcpy(sblock.fs_volname, volumelabel, MAXVOLLEN); if (Jflag) sblock.fs_flags |= FS_GJOURNAL; if (lflag) sblock.fs_flags |= FS_MULTILABEL; if (tflag) sblock.fs_flags |= FS_TRIM; /* * Validate the given file system size. * Verify that its last block can actually be accessed. * Convert to file system fragment sized units. */ if (fssize <= 0) { printf("preposterous size %jd\n", (intmax_t)fssize); exit(13); } wtfs(fssize - (realsectorsize / DEV_BSIZE), realsectorsize, (char *)&sblock); /* * collect and verify the file system density info */ sblock.fs_avgfilesize = avgfilesize; sblock.fs_avgfpdir = avgfilesperdir; if (sblock.fs_avgfilesize <= 0) printf("illegal expected average file size %d\n", sblock.fs_avgfilesize), exit(14); if (sblock.fs_avgfpdir <= 0) printf("illegal expected number of files per directory %d\n", sblock.fs_avgfpdir), exit(15); restart: /* * collect and verify the block and fragment sizes */ sblock.fs_bsize = bsize; sblock.fs_fsize = fsize; if (!POWEROF2(sblock.fs_bsize)) { printf("block size must be a power of 2, not %d\n", sblock.fs_bsize); exit(16); } if (!POWEROF2(sblock.fs_fsize)) { printf("fragment size must be a power of 2, not %d\n", sblock.fs_fsize); exit(17); } if (sblock.fs_fsize < sectorsize) { printf("increasing fragment size from %d to sector size (%d)\n", sblock.fs_fsize, sectorsize); sblock.fs_fsize = sectorsize; } if (sblock.fs_bsize > MAXBSIZE) { printf("decreasing block size from %d to maximum (%d)\n", sblock.fs_bsize, MAXBSIZE); sblock.fs_bsize = MAXBSIZE; } if (sblock.fs_bsize < MINBSIZE) { printf("increasing block size from %d to minimum (%d)\n", sblock.fs_bsize, MINBSIZE); sblock.fs_bsize = MINBSIZE; } if (sblock.fs_fsize > MAXBSIZE) { printf("decreasing fragment size from %d to maximum (%d)\n", sblock.fs_fsize, MAXBSIZE); sblock.fs_fsize = MAXBSIZE; } if (sblock.fs_bsize < sblock.fs_fsize) { printf("increasing block size from %d to fragment size (%d)\n", sblock.fs_bsize, sblock.fs_fsize); sblock.fs_bsize = sblock.fs_fsize; } if (sblock.fs_fsize * MAXFRAG < sblock.fs_bsize) { printf( "increasing fragment size from %d to block size / %d (%d)\n", sblock.fs_fsize, MAXFRAG, sblock.fs_bsize / MAXFRAG); sblock.fs_fsize = sblock.fs_bsize / MAXFRAG; } if (maxbsize == 0) maxbsize = bsize; if (maxbsize < bsize || !POWEROF2(maxbsize)) { sblock.fs_maxbsize = sblock.fs_bsize; printf("Extent size set to %d\n", sblock.fs_maxbsize); } else if (maxbsize > FS_MAXCONTIG * sblock.fs_bsize) { sblock.fs_maxbsize = FS_MAXCONTIG * sblock.fs_bsize; printf("Extent size reduced to %d\n", sblock.fs_maxbsize); } else { sblock.fs_maxbsize = maxbsize; } /* * Maxcontig sets the default for the maximum number of blocks * that may be allocated sequentially. With file system clustering * it is possible to allocate contiguous blocks up to the maximum * transfer size permitted by the controller or buffering. */ if (maxcontig == 0) maxcontig = MAX(1, MAXPHYS / bsize); sblock.fs_maxcontig = maxcontig; if (sblock.fs_maxcontig < sblock.fs_maxbsize / sblock.fs_bsize) { sblock.fs_maxcontig = sblock.fs_maxbsize / sblock.fs_bsize; printf("Maxcontig raised to %d\n", sblock.fs_maxbsize); } if (sblock.fs_maxcontig > 1) sblock.fs_contigsumsize = MIN(sblock.fs_maxcontig,FS_MAXCONTIG); sblock.fs_bmask = ~(sblock.fs_bsize - 1); sblock.fs_fmask = ~(sblock.fs_fsize - 1); sblock.fs_qbmask = ~sblock.fs_bmask; sblock.fs_qfmask = ~sblock.fs_fmask; sblock.fs_bshift = ilog2(sblock.fs_bsize); sblock.fs_fshift = ilog2(sblock.fs_fsize); sblock.fs_frag = numfrags(&sblock, sblock.fs_bsize); sblock.fs_fragshift = ilog2(sblock.fs_frag); if (sblock.fs_frag > MAXFRAG) { printf("fragment size %d is still too small (can't happen)\n", sblock.fs_bsize / MAXFRAG); exit(21); } sblock.fs_fsbtodb = ilog2(sblock.fs_fsize / sectorsize); sblock.fs_size = fssize = dbtofsb(&sblock, fssize); sblock.fs_providersize = dbtofsb(&sblock, mediasize / sectorsize); /* * Before the filesystem is finally initialized, mark it * as incompletely initialized. */ sblock.fs_magic = FS_BAD_MAGIC; if (Oflag == 1) { sblock.fs_sblockloc = SBLOCK_UFS1; sblock.fs_sblockactualloc = SBLOCK_UFS1; sblock.fs_nindir = sblock.fs_bsize / sizeof(ufs1_daddr_t); sblock.fs_inopb = sblock.fs_bsize / sizeof(struct ufs1_dinode); sblock.fs_maxsymlinklen = ((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t)); sblock.fs_old_inodefmt = FS_44INODEFMT; sblock.fs_old_cgoffset = 0; sblock.fs_old_cgmask = 0xffffffff; sblock.fs_old_size = sblock.fs_size; sblock.fs_old_rotdelay = 0; sblock.fs_old_rps = 60; sblock.fs_old_nspf = sblock.fs_fsize / sectorsize; sblock.fs_old_cpg = 1; sblock.fs_old_interleave = 1; sblock.fs_old_trackskew = 0; sblock.fs_old_cpc = 0; sblock.fs_old_postblformat = 1; sblock.fs_old_nrpos = 1; } else { sblock.fs_sblockloc = SBLOCK_UFS2; sblock.fs_sblockactualloc = SBLOCK_UFS2; sblock.fs_nindir = sblock.fs_bsize / sizeof(ufs2_daddr_t); sblock.fs_inopb = sblock.fs_bsize / sizeof(struct ufs2_dinode); sblock.fs_maxsymlinklen = ((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t)); } sblock.fs_sblkno = roundup(howmany(sblock.fs_sblockloc + SBLOCKSIZE, sblock.fs_fsize), sblock.fs_frag); sblock.fs_cblkno = sblock.fs_sblkno + roundup(howmany(SBLOCKSIZE, sblock.fs_fsize), sblock.fs_frag); sblock.fs_iblkno = sblock.fs_cblkno + sblock.fs_frag; sblock.fs_maxfilesize = sblock.fs_bsize * UFS_NDADDR - 1; for (sizepb = sblock.fs_bsize, i = 0; i < UFS_NIADDR; i++) { sizepb *= NINDIR(&sblock); sblock.fs_maxfilesize += sizepb; } /* * It's impossible to create a snapshot in case that fs_maxfilesize * is smaller than the fssize. */ if (sblock.fs_maxfilesize < (u_quad_t)fssize) { warnx("WARNING: You will be unable to create snapshots on this " "file system. Correct by using a larger blocksize."); } /* * Calculate the number of blocks to put into each cylinder group. * * This algorithm selects the number of blocks per cylinder * group. The first goal is to have at least enough data blocks * in each cylinder group to meet the density requirement. Once * this goal is achieved we try to expand to have at least * MINCYLGRPS cylinder groups. Once this goal is achieved, we * pack as many blocks into each cylinder group map as will fit. * * We start by calculating the smallest number of blocks that we * can put into each cylinder group. If this is too big, we reduce * the density until it fits. */ retry: maxinum = (((int64_t)(1)) << 32) - INOPB(&sblock); minfragsperinode = 1 + fssize / maxinum; if (density == 0) { density = MAX(NFPI, minfragsperinode) * fsize; } else if (density < minfragsperinode * fsize) { origdensity = density; density = minfragsperinode * fsize; fprintf(stderr, "density increased from %d to %d\n", origdensity, density); } origdensity = density; for (;;) { fragsperinode = MAX(numfrags(&sblock, density), 1); if (fragsperinode < minfragsperinode) { bsize <<= 1; fsize <<= 1; printf("Block size too small for a file system %s %d\n", "of this size. Increasing blocksize to", bsize); goto restart; } minfpg = fragsperinode * INOPB(&sblock); if (minfpg > sblock.fs_size) minfpg = sblock.fs_size; sblock.fs_ipg = INOPB(&sblock); sblock.fs_fpg = roundup(sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock), sblock.fs_frag); if (sblock.fs_fpg < minfpg) sblock.fs_fpg = minfpg; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); sblock.fs_fpg = roundup(sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock), sblock.fs_frag); if (sblock.fs_fpg < minfpg) sblock.fs_fpg = minfpg; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); if (CGSIZE(&sblock) < (unsigned long)sblock.fs_bsize - CGSIZEFUDGE) break; density -= sblock.fs_fsize; } if (density != origdensity) printf("density reduced from %d to %d\n", origdensity, density); /* * Start packing more blocks into the cylinder group until * it cannot grow any larger, the number of cylinder groups * drops below MINCYLGRPS, or we reach the size requested. * For UFS1 inodes per cylinder group are stored in an int16_t * so fs_ipg is limited to 2^15 - 1. */ for ( ; sblock.fs_fpg < maxblkspercg; sblock.fs_fpg += sblock.fs_frag) { sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); if (Oflag > 1 || (Oflag == 1 && sblock.fs_ipg <= 0x7fff)) { if (sblock.fs_size / sblock.fs_fpg < MINCYLGRPS) break; if (CGSIZE(&sblock) < (unsigned long)sblock.fs_bsize - CGSIZEFUDGE) continue; if (CGSIZE(&sblock) == (unsigned long)sblock.fs_bsize - CGSIZEFUDGE) break; } sblock.fs_fpg -= sblock.fs_frag; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); break; } /* * Check to be sure that the last cylinder group has enough blocks * to be viable. If it is too small, reduce the number of blocks * per cylinder group which will have the effect of moving more * blocks into the last cylinder group. */ optimalfpg = sblock.fs_fpg; for (;;) { sblock.fs_ncg = howmany(sblock.fs_size, sblock.fs_fpg); lastminfpg = roundup(sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock), sblock.fs_frag); if (sblock.fs_size < lastminfpg) { printf("Filesystem size %jd < minimum size of %d\n", (intmax_t)sblock.fs_size, lastminfpg); exit(28); } if (sblock.fs_size % sblock.fs_fpg >= lastminfpg || sblock.fs_size % sblock.fs_fpg == 0) break; sblock.fs_fpg -= sblock.fs_frag; sblock.fs_ipg = roundup(howmany(sblock.fs_fpg, fragsperinode), INOPB(&sblock)); } if (optimalfpg != sblock.fs_fpg) printf("Reduced frags per cylinder group from %d to %d %s\n", optimalfpg, sblock.fs_fpg, "to enlarge last cyl group"); sblock.fs_cgsize = fragroundup(&sblock, CGSIZE(&sblock)); sblock.fs_dblkno = sblock.fs_iblkno + sblock.fs_ipg / INOPF(&sblock); if (Oflag == 1) { sblock.fs_old_spc = sblock.fs_fpg * sblock.fs_old_nspf; sblock.fs_old_nsect = sblock.fs_old_spc; sblock.fs_old_npsect = sblock.fs_old_spc; sblock.fs_old_ncyl = sblock.fs_ncg; } /* * fill in remaining fields of the super block */ sblock.fs_csaddr = cgdmin(&sblock, 0); sblock.fs_cssize = fragroundup(&sblock, sblock.fs_ncg * sizeof(struct csum)); fscs = (struct csum *)calloc(1, sblock.fs_cssize); if (fscs == NULL) errx(31, "calloc failed"); sblock.fs_sbsize = fragroundup(&sblock, sizeof(struct fs)); if (sblock.fs_sbsize > SBLOCKSIZE) sblock.fs_sbsize = SBLOCKSIZE; if (sblock.fs_sbsize < realsectorsize) sblock.fs_sbsize = realsectorsize; sblock.fs_minfree = minfree; if (metaspace > 0 && metaspace < sblock.fs_fpg / 2) sblock.fs_metaspace = blknum(&sblock, metaspace); else if (metaspace != -1) /* reserve half of minfree for metadata blocks */ sblock.fs_metaspace = blknum(&sblock, (sblock.fs_fpg * minfree) / 200); if (maxbpg == 0) sblock.fs_maxbpg = MAXBLKPG(sblock.fs_bsize); else sblock.fs_maxbpg = maxbpg; sblock.fs_optim = opt; sblock.fs_cgrotor = 0; sblock.fs_pendingblocks = 0; sblock.fs_pendinginodes = 0; sblock.fs_fmod = 0; sblock.fs_ronly = 0; sblock.fs_state = 0; sblock.fs_clean = 1; sblock.fs_id[0] = (long)utime; sblock.fs_id[1] = newfs_random(); sblock.fs_fsmnt[0] = '\0'; csfrags = howmany(sblock.fs_cssize, sblock.fs_fsize); sblock.fs_dsize = sblock.fs_size - sblock.fs_sblkno - sblock.fs_ncg * (sblock.fs_dblkno - sblock.fs_sblkno); sblock.fs_cstotal.cs_nbfree = fragstoblks(&sblock, sblock.fs_dsize) - howmany(csfrags, sblock.fs_frag); sblock.fs_cstotal.cs_nffree = fragnum(&sblock, sblock.fs_size) + (fragnum(&sblock, csfrags) > 0 ? sblock.fs_frag - fragnum(&sblock, csfrags) : 0); sblock.fs_cstotal.cs_nifree = sblock.fs_ncg * sblock.fs_ipg - UFS_ROOTINO; sblock.fs_cstotal.cs_ndir = 0; sblock.fs_dsize -= csfrags; sblock.fs_time = utime; if (Oflag == 1) { sblock.fs_old_time = utime; sblock.fs_old_dsize = sblock.fs_dsize; sblock.fs_old_csaddr = sblock.fs_csaddr; sblock.fs_old_cstotal.cs_ndir = sblock.fs_cstotal.cs_ndir; sblock.fs_old_cstotal.cs_nbfree = sblock.fs_cstotal.cs_nbfree; sblock.fs_old_cstotal.cs_nifree = sblock.fs_cstotal.cs_nifree; sblock.fs_old_cstotal.cs_nffree = sblock.fs_cstotal.cs_nffree; } /* * Set flags for metadata that is being check-hashed. * * Metadata check hashes are not supported in the UFS version 1 * filesystem to keep it as small and simple as possible. */ if (Oflag > 1) { sblock.fs_flags |= FS_METACKHASH; if (getosreldate() >= P_OSREL_CK_CYLGRP) sblock.fs_metackhash |= CK_CYLGRP; if (getosreldate() >= P_OSREL_CK_SUPERBLOCK) sblock.fs_metackhash |= CK_SUPERBLOCK; if (getosreldate() >= P_OSREL_CK_INODE) sblock.fs_metackhash |= CK_INODE; } /* * Dump out summary information about file system. */ # define B2MBFACTOR (1 / (1024.0 * 1024.0)) printf("%s: %.1fMB (%jd sectors) block size %d, fragment size %d\n", fsys, (float)sblock.fs_size * sblock.fs_fsize * B2MBFACTOR, (intmax_t)fsbtodb(&sblock, sblock.fs_size), sblock.fs_bsize, sblock.fs_fsize); printf("\tusing %d cylinder groups of %.2fMB, %d blks, %d inodes.\n", sblock.fs_ncg, (float)sblock.fs_fpg * sblock.fs_fsize * B2MBFACTOR, sblock.fs_fpg / sblock.fs_frag, sblock.fs_ipg); if (sblock.fs_flags & FS_DOSOFTDEP) printf("\twith soft updates\n"); # undef B2MBFACTOR if (Eflag && !Nflag) { printf("Erasing sectors [%jd...%jd]\n", sblock.fs_sblockloc / disk.d_bsize, fsbtodb(&sblock, sblock.fs_size) - 1); berase(&disk, sblock.fs_sblockloc / disk.d_bsize, sblock.fs_size * sblock.fs_fsize - sblock.fs_sblockloc); } /* * Wipe out old UFS1 superblock(s) if necessary. */ if (!Nflag && Oflag != 1 && realsectorsize <= SBLOCK_UFS1) { i = bread(&disk, part_ofs + SBLOCK_UFS1 / disk.d_bsize, chdummy, SBLOCKSIZE); if (i == -1) err(1, "can't read old UFS1 superblock: %s", disk.d_error); if (fsdummy.fs_magic == FS_UFS1_MAGIC) { fsdummy.fs_magic = 0; bwrite(&disk, part_ofs + SBLOCK_UFS1 / disk.d_bsize, chdummy, SBLOCKSIZE); for (cg = 0; cg < fsdummy.fs_ncg; cg++) { if (fsbtodb(&fsdummy, cgsblock(&fsdummy, cg)) > fssize) break; bwrite(&disk, part_ofs + fsbtodb(&fsdummy, cgsblock(&fsdummy, cg)), chdummy, SBLOCKSIZE); } } } /* * Reference the summary information so it will also be written. */ sblock.fs_csp = fscs; if (!Nflag && sbwrite(&disk, 0) != 0) err(1, "sbwrite: %s", disk.d_error); if (Xflag == 1) { printf("** Exiting on Xflag 1\n"); exit(0); } if (Xflag == 2) printf("** Leaving BAD MAGIC on Xflag 2\n"); else sblock.fs_magic = (Oflag != 1) ? FS_UFS2_MAGIC : FS_UFS1_MAGIC; /* * Now build the cylinders group blocks and * then print out indices of cylinder groups. */ printf("super-block backups (for fsck_ffs -b #) at:\n"); i = 0; width = charsperline(); /* * Allocate space for two sets of inode blocks. */ iobufsize = 2 * sblock.fs_bsize; if ((iobuf = calloc(1, iobufsize)) == 0) { printf("Cannot allocate I/O buffer\n"); exit(38); } /* * Write out all the cylinder groups and backup superblocks. */ for (cg = 0; cg < sblock.fs_ncg; cg++) { if (!Nflag) initcg(cg, utime); j = snprintf(tmpbuf, sizeof(tmpbuf), " %jd%s", (intmax_t)fsbtodb(&sblock, cgsblock(&sblock, cg)), cg < (sblock.fs_ncg-1) ? "," : ""); if (j < 0) tmpbuf[j = 0] = '\0'; if (i + j >= width) { printf("\n"); i = 0; } i += j; printf("%s", tmpbuf); fflush(stdout); } printf("\n"); if (Nflag) exit(0); /* * Now construct the initial file system, * then write out the super-block. */ fsinit(utime); if (Oflag == 1) { sblock.fs_old_cstotal.cs_ndir = sblock.fs_cstotal.cs_ndir; sblock.fs_old_cstotal.cs_nbfree = sblock.fs_cstotal.cs_nbfree; sblock.fs_old_cstotal.cs_nifree = sblock.fs_cstotal.cs_nifree; sblock.fs_old_cstotal.cs_nffree = sblock.fs_cstotal.cs_nffree; } if (Xflag == 3) { printf("** Exiting on Xflag 3\n"); exit(0); } if (sbwrite(&disk, 0) != 0) err(1, "sbwrite: %s", disk.d_error); /* * For UFS1 filesystems with a blocksize of 64K, the first * alternate superblock resides at the location used for * the default UFS2 superblock. As there is a valid * superblock at this location, the boot code will use * it as its first choice. Thus we have to ensure that * all of its statistcs on usage are correct. */ if (Oflag == 1 && sblock.fs_bsize == 65536) wtfs(fsbtodb(&sblock, cgsblock(&sblock, 0)), sblock.fs_bsize, (char *)&sblock); /* * Read the last sector of the boot block, replace the last * 20 bytes with the recovery information, then write it back. * The recovery information only works for UFS2 filesystems. * For UFS1, zero out the area to ensure that an old UFS2 * recovery block is not accidentally found. */ if ((fsrbuf = malloc(realsectorsize)) == NULL || bread(&disk, part_ofs + (SBLOCK_UFS2 - realsectorsize) / disk.d_bsize, fsrbuf, realsectorsize) == -1) err(1, "can't read recovery area: %s", disk.d_error); fsr = (struct fsrecovery *)&fsrbuf[realsectorsize - sizeof *fsr]; if (sblock.fs_magic != FS_UFS2_MAGIC) { memset(fsr, 0, sizeof *fsr); } else { fsr->fsr_magic = sblock.fs_magic; fsr->fsr_fpg = sblock.fs_fpg; fsr->fsr_fsbtodb = sblock.fs_fsbtodb; fsr->fsr_sblkno = sblock.fs_sblkno; fsr->fsr_ncg = sblock.fs_ncg; } wtfs((SBLOCK_UFS2 - realsectorsize) / disk.d_bsize, realsectorsize, fsrbuf); free(fsrbuf); /* * Update information about this partition in pack * label, to that it may be updated on disk. */ if (pp != NULL) { pp->p_fstype = FS_BSDFFS; pp->p_fsize = sblock.fs_fsize; pp->p_frag = sblock.fs_frag; pp->p_cpg = sblock.fs_fpg; } /* * This should NOT happen. If it does complain loudly and * take evasive action. */ if ((int32_t)CGSIZE(&sblock) > sblock.fs_bsize) { printf("INTERNAL ERROR: ipg %d, fpg %d, contigsumsize %d, ", sblock.fs_ipg, sblock.fs_fpg, sblock.fs_contigsumsize); printf("old_cpg %d, size_cg %zu, CGSIZE %zu\n", sblock.fs_old_cpg, sizeof(struct cg), CGSIZE(&sblock)); printf("Please file a FreeBSD bug report and include this " "output\n"); maxblkspercg = fragstoblks(&sblock, sblock.fs_fpg) - 1; density = 0; goto retry; } } /* * Initialize a cylinder group. */ void initcg(int cylno, time_t utime) { long blkno, start; off_t savedactualloc; uint i, j, d, dlower, dupper; ufs2_daddr_t cbase, dmax; struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; struct csum *cs; /* * Determine block bounds for cylinder group. * Allow space for super block summary information in first * cylinder group. */ cbase = cgbase(&sblock, cylno); dmax = cbase + sblock.fs_fpg; if (dmax > sblock.fs_size) dmax = sblock.fs_size; dlower = cgsblock(&sblock, cylno) - cbase; dupper = cgdmin(&sblock, cylno) - cbase; if (cylno == 0) dupper += howmany(sblock.fs_cssize, sblock.fs_fsize); cs = &fscs[cylno]; memset(&acg, 0, sblock.fs_cgsize); acg.cg_time = utime; acg.cg_magic = CG_MAGIC; acg.cg_cgx = cylno; acg.cg_niblk = sblock.fs_ipg; acg.cg_initediblk = MIN(sblock.fs_ipg, 2 * INOPB(&sblock)); acg.cg_ndblk = dmax - cbase; if (sblock.fs_contigsumsize > 0) acg.cg_nclusterblks = acg.cg_ndblk / sblock.fs_frag; start = sizeof(acg); if (Oflag == 2) { acg.cg_iusedoff = start; } else { acg.cg_old_ncyl = sblock.fs_old_cpg; acg.cg_old_time = acg.cg_time; acg.cg_time = 0; acg.cg_old_niblk = acg.cg_niblk; acg.cg_niblk = 0; acg.cg_initediblk = 0; acg.cg_old_btotoff = start; acg.cg_old_boff = acg.cg_old_btotoff + sblock.fs_old_cpg * sizeof(int32_t); acg.cg_iusedoff = acg.cg_old_boff + sblock.fs_old_cpg * sizeof(u_int16_t); } acg.cg_freeoff = acg.cg_iusedoff + howmany(sblock.fs_ipg, CHAR_BIT); acg.cg_nextfreeoff = acg.cg_freeoff + howmany(sblock.fs_fpg, CHAR_BIT); if (sblock.fs_contigsumsize > 0) { acg.cg_clustersumoff = roundup(acg.cg_nextfreeoff, sizeof(u_int32_t)); acg.cg_clustersumoff -= sizeof(u_int32_t); acg.cg_clusteroff = acg.cg_clustersumoff + (sblock.fs_contigsumsize + 1) * sizeof(u_int32_t); acg.cg_nextfreeoff = acg.cg_clusteroff + howmany(fragstoblks(&sblock, sblock.fs_fpg), CHAR_BIT); } if (acg.cg_nextfreeoff > (unsigned)sblock.fs_cgsize) { printf("Panic: cylinder group too big by %d bytes\n", acg.cg_nextfreeoff - (unsigned)sblock.fs_cgsize); exit(37); } acg.cg_cs.cs_nifree += sblock.fs_ipg; if (cylno == 0) for (i = 0; i < (long)UFS_ROOTINO; i++) { setbit(cg_inosused(&acg), i); acg.cg_cs.cs_nifree--; } if (cylno > 0) { /* * In cylno 0, beginning space is reserved * for boot and super blocks. */ for (d = 0; d < dlower; d += sblock.fs_frag) { blkno = d / sblock.fs_frag; setblock(&sblock, cg_blksfree(&acg), blkno); if (sblock.fs_contigsumsize > 0) setbit(cg_clustersfree(&acg), blkno); acg.cg_cs.cs_nbfree++; } } if ((i = dupper % sblock.fs_frag)) { acg.cg_frsum[sblock.fs_frag - i]++; for (d = dupper + sblock.fs_frag - i; dupper < d; dupper++) { setbit(cg_blksfree(&acg), dupper); acg.cg_cs.cs_nffree++; } } for (d = dupper; d + sblock.fs_frag <= acg.cg_ndblk; d += sblock.fs_frag) { blkno = d / sblock.fs_frag; setblock(&sblock, cg_blksfree(&acg), blkno); if (sblock.fs_contigsumsize > 0) setbit(cg_clustersfree(&acg), blkno); acg.cg_cs.cs_nbfree++; } if (d < acg.cg_ndblk) { acg.cg_frsum[acg.cg_ndblk - d]++; for (; d < acg.cg_ndblk; d++) { setbit(cg_blksfree(&acg), d); acg.cg_cs.cs_nffree++; } } if (sblock.fs_contigsumsize > 0) { int32_t *sump = cg_clustersum(&acg); u_char *mapp = cg_clustersfree(&acg); int map = *mapp++; int bit = 1; int run = 0; for (i = 0; i < acg.cg_nclusterblks; i++) { if ((map & bit) != 0) run++; else if (run != 0) { if (run > sblock.fs_contigsumsize) run = sblock.fs_contigsumsize; sump[run]++; run = 0; } if ((i & (CHAR_BIT - 1)) != CHAR_BIT - 1) bit <<= 1; else { map = *mapp++; bit = 1; } } if (run != 0) { if (run > sblock.fs_contigsumsize) run = sblock.fs_contigsumsize; sump[run]++; } } *cs = acg.cg_cs; /* * Write out the duplicate super block. Then write the cylinder * group map and two blocks worth of inodes in a single write. */ savedactualloc = sblock.fs_sblockactualloc; sblock.fs_sblockactualloc = dbtob(fsbtodb(&sblock, cgsblock(&sblock, cylno))); if (sbwrite(&disk, 0) != 0) err(1, "sbwrite: %s", disk.d_error); sblock.fs_sblockactualloc = savedactualloc; if (cgwrite(&disk) != 0) err(1, "initcg: cgwrite: %s", disk.d_error); start = 0; dp1 = (struct ufs1_dinode *)(&iobuf[start]); dp2 = (struct ufs2_dinode *)(&iobuf[start]); for (i = 0; i < acg.cg_initediblk; i++) { if (sblock.fs_magic == FS_UFS1_MAGIC) { dp1->di_gen = newfs_random(); dp1++; } else { dp2->di_gen = newfs_random(); dp2++; } } wtfs(fsbtodb(&sblock, cgimin(&sblock, cylno)), iobufsize, iobuf); /* * For the old file system, we have to initialize all the inodes. */ if (Oflag == 1) { for (i = 2 * sblock.fs_frag; i < sblock.fs_ipg / INOPF(&sblock); i += sblock.fs_frag) { dp1 = (struct ufs1_dinode *)(&iobuf[start]); for (j = 0; j < INOPB(&sblock); j++) { dp1->di_gen = newfs_random(); dp1++; } wtfs(fsbtodb(&sblock, cgimin(&sblock, cylno) + i), sblock.fs_bsize, &iobuf[start]); } } } /* * initialize the file system */ #define ROOTLINKCNT 3 static struct direct root_dir[] = { { UFS_ROOTINO, sizeof(struct direct), DT_DIR, 1, "." }, { UFS_ROOTINO, sizeof(struct direct), DT_DIR, 2, ".." }, { UFS_ROOTINO + 1, sizeof(struct direct), DT_DIR, 5, ".snap" }, }; #define SNAPLINKCNT 2 static struct direct snap_dir[] = { { UFS_ROOTINO + 1, sizeof(struct direct), DT_DIR, 1, "." }, { UFS_ROOTINO, sizeof(struct direct), DT_DIR, 2, ".." }, }; void fsinit(time_t utime) { union dinode node; struct group *grp; gid_t gid; int entries; memset(&node, 0, sizeof node); if ((grp = getgrnam("operator")) != NULL) { gid = grp->gr_gid; } else { warnx("Cannot retrieve operator gid, using gid 0."); gid = 0; } entries = (nflag) ? ROOTLINKCNT - 1: ROOTLINKCNT; if (sblock.fs_magic == FS_UFS1_MAGIC) { /* * initialize the node */ node.dp1.di_atime = utime; node.dp1.di_mtime = utime; node.dp1.di_ctime = utime; /* * create the root directory */ node.dp1.di_mode = IFDIR | UMASK; node.dp1.di_nlink = entries; node.dp1.di_size = makedir(root_dir, entries); node.dp1.di_db[0] = alloc(sblock.fs_fsize, node.dp1.di_mode); node.dp1.di_blocks = btodb(fragroundup(&sblock, node.dp1.di_size)); wtfs(fsbtodb(&sblock, node.dp1.di_db[0]), sblock.fs_fsize, iobuf); iput(&node, UFS_ROOTINO); if (!nflag) { /* * create the .snap directory */ node.dp1.di_mode |= 020; node.dp1.di_gid = gid; node.dp1.di_nlink = SNAPLINKCNT; node.dp1.di_size = makedir(snap_dir, SNAPLINKCNT); node.dp1.di_db[0] = alloc(sblock.fs_fsize, node.dp1.di_mode); node.dp1.di_blocks = btodb(fragroundup(&sblock, node.dp1.di_size)); node.dp1.di_dirdepth = 1; wtfs(fsbtodb(&sblock, node.dp1.di_db[0]), sblock.fs_fsize, iobuf); iput(&node, UFS_ROOTINO + 1); } } else { /* * initialize the node */ node.dp2.di_atime = utime; node.dp2.di_mtime = utime; node.dp2.di_ctime = utime; node.dp2.di_birthtime = utime; /* * create the root directory */ node.dp2.di_mode = IFDIR | UMASK; node.dp2.di_nlink = entries; node.dp2.di_size = makedir(root_dir, entries); node.dp2.di_db[0] = alloc(sblock.fs_fsize, node.dp2.di_mode); node.dp2.di_blocks = btodb(fragroundup(&sblock, node.dp2.di_size)); wtfs(fsbtodb(&sblock, node.dp2.di_db[0]), sblock.fs_fsize, iobuf); iput(&node, UFS_ROOTINO); if (!nflag) { /* * create the .snap directory */ node.dp2.di_mode |= 020; node.dp2.di_gid = gid; node.dp2.di_nlink = SNAPLINKCNT; node.dp2.di_size = makedir(snap_dir, SNAPLINKCNT); node.dp2.di_db[0] = alloc(sblock.fs_fsize, node.dp2.di_mode); node.dp2.di_blocks = btodb(fragroundup(&sblock, node.dp2.di_size)); node.dp2.di_dirdepth = 1; wtfs(fsbtodb(&sblock, node.dp2.di_db[0]), sblock.fs_fsize, iobuf); iput(&node, UFS_ROOTINO + 1); } } } /* * construct a set of directory entries in "iobuf". * return size of directory. */ int makedir(struct direct *protodir, int entries) { char *cp; int i, spcleft; spcleft = DIRBLKSIZ; memset(iobuf, 0, DIRBLKSIZ); for (cp = iobuf, i = 0; i < entries - 1; i++) { protodir[i].d_reclen = DIRSIZ(0, &protodir[i]); memmove(cp, &protodir[i], protodir[i].d_reclen); cp += protodir[i].d_reclen; spcleft -= protodir[i].d_reclen; } protodir[i].d_reclen = spcleft; memmove(cp, &protodir[i], DIRSIZ(0, &protodir[i])); return (DIRBLKSIZ); } /* * allocate a block or frag */ ufs2_daddr_t alloc(int size, int mode) { int i, blkno, frag; uint d; bread(&disk, part_ofs + fsbtodb(&sblock, cgtod(&sblock, 0)), (char *)&acg, sblock.fs_cgsize); if (acg.cg_magic != CG_MAGIC) { printf("cg 0: bad magic number\n"); exit(38); } if (acg.cg_cs.cs_nbfree == 0) { printf("first cylinder group ran out of space\n"); exit(39); } for (d = 0; d < acg.cg_ndblk; d += sblock.fs_frag) if (isblock(&sblock, cg_blksfree(&acg), d / sblock.fs_frag)) goto goth; printf("internal error: can't find block in cyl 0\n"); exit(40); goth: blkno = fragstoblks(&sblock, d); clrblock(&sblock, cg_blksfree(&acg), blkno); if (sblock.fs_contigsumsize > 0) clrbit(cg_clustersfree(&acg), blkno); acg.cg_cs.cs_nbfree--; sblock.fs_cstotal.cs_nbfree--; fscs[0].cs_nbfree--; if (mode & IFDIR) { acg.cg_cs.cs_ndir++; sblock.fs_cstotal.cs_ndir++; fscs[0].cs_ndir++; } if (size != sblock.fs_bsize) { frag = howmany(size, sblock.fs_fsize); fscs[0].cs_nffree += sblock.fs_frag - frag; sblock.fs_cstotal.cs_nffree += sblock.fs_frag - frag; acg.cg_cs.cs_nffree += sblock.fs_frag - frag; acg.cg_frsum[sblock.fs_frag - frag]++; for (i = frag; i < sblock.fs_frag; i++) setbit(cg_blksfree(&acg), d + i); } if (cgwrite(&disk) != 0) err(1, "alloc: cgwrite: %s", disk.d_error); return ((ufs2_daddr_t)d); } /* * Allocate an inode on the disk */ void iput(union dinode *ip, ino_t ino) { union dinodep dp; bread(&disk, part_ofs + fsbtodb(&sblock, cgtod(&sblock, 0)), (char *)&acg, sblock.fs_cgsize); if (acg.cg_magic != CG_MAGIC) { printf("cg 0: bad magic number\n"); exit(31); } acg.cg_cs.cs_nifree--; setbit(cg_inosused(&acg), ino); if (cgwrite(&disk) != 0) err(1, "iput: cgwrite: %s", disk.d_error); sblock.fs_cstotal.cs_nifree--; fscs[0].cs_nifree--; if (getinode(&disk, &dp, ino) == -1) { printf("iput: %s\n", disk.d_error); exit(32); } if (sblock.fs_magic == FS_UFS1_MAGIC) *dp.dp1 = ip->dp1; else *dp.dp2 = ip->dp2; putinode(&disk); } /* * possibly write to disk */ static void wtfs(ufs2_daddr_t bno, int size, char *bf) { if (Nflag) return; if (bwrite(&disk, part_ofs + bno, bf, size) < 0) err(36, "wtfs: %d bytes at sector %jd", size, (intmax_t)bno); } /* * check if a block is available */ static int isblock(struct fs *fs, unsigned char *cp, int h) { unsigned char mask; switch (fs->fs_frag) { case 8: return (cp[h] == 0xff); case 4: mask = 0x0f << ((h & 0x1) << 2); return ((cp[h >> 1] & mask) == mask); case 2: mask = 0x03 << ((h & 0x3) << 1); return ((cp[h >> 2] & mask) == mask); case 1: mask = 0x01 << (h & 0x7); return ((cp[h >> 3] & mask) == mask); default: fprintf(stderr, "isblock bad fs_frag %d\n", fs->fs_frag); return (0); } } /* * take a block out of the map */ static void clrblock(struct fs *fs, unsigned char *cp, int h) { switch ((fs)->fs_frag) { case 8: cp[h] = 0; return; case 4: cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); return; case 2: cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); return; case 1: cp[h >> 3] &= ~(0x01 << (h & 0x7)); return; default: fprintf(stderr, "clrblock bad fs_frag %d\n", fs->fs_frag); return; } } /* * put a block into the map */ static void setblock(struct fs *fs, unsigned char *cp, int h) { switch (fs->fs_frag) { case 8: cp[h] = 0xff; return; case 4: cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); return; case 2: cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); return; case 1: cp[h >> 3] |= (0x01 << (h & 0x7)); return; default: fprintf(stderr, "setblock bad fs_frag %d\n", fs->fs_frag); return; } } /* * Determine the number of characters in a * single line. */ static int charsperline(void) { int columns; char *cp; struct winsize ws; columns = 0; if (ioctl(0, TIOCGWINSZ, &ws) != -1) columns = ws.ws_col; if (columns == 0 && (cp = getenv("COLUMNS"))) columns = atoi(cp); if (columns == 0) columns = 80; /* last resort */ return (columns); } static int ilog2(int val) { u_int n; for (n = 0; n < sizeof(n) * CHAR_BIT; n++) if (1 << n == val) return (n); errx(1, "ilog2: %d is not a power of 2\n", val); } /* * For the regression test, return predictable random values. * Otherwise use a true random number generator. */ static u_int32_t newfs_random(void) { static u_int32_t nextnum = 1; if (Rflag) return (nextnum++); return (arc4random()); } diff --git a/sbin/quotacheck/quotacheck.c b/sbin/quotacheck/quotacheck.c index 3f608c103b1d..4837dd4dba04 100644 --- a/sbin/quotacheck/quotacheck.c +++ b/sbin/quotacheck/quotacheck.c @@ -1,722 +1,718 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Robert Elz at The University of Melbourne. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1980, 1990, 1993\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint static char sccsid[] = "@(#)quotacheck.c 8.3 (Berkeley) 1/29/94"; #endif /* not lint */ #endif #include /* * Fix up / report on disk quotas & usage */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "quotacheck.h" const char *qfname = QUOTAFILENAME; const char *qfextension[] = INITQFNAMES; const char *quotagroup = QUOTAGROUP; union { struct fs sblk; char dummy[MAXBSIZE]; } sb_un; #define sblock sb_un.sblk union { struct cg cgblk; char dummy[MAXBSIZE]; } cg_un; #define cgblk cg_un.cgblk long dev_bsize = 1; ino_t maxino; -union dinode { - struct ufs1_dinode dp1; - struct ufs2_dinode dp2; -}; #define DIP(dp, field) \ ((sblock.fs_magic == FS_UFS1_MAGIC) ? \ (dp)->dp1.field : (dp)->dp2.field) #define HASUSR 1 #define HASGRP 2 struct fileusage { struct fileusage *fu_next; u_long fu_curinodes; u_long fu_curblocks; u_long fu_id; char fu_name[1]; /* actually bigger */ }; #define FUHASH 1024 /* must be power of two */ struct fileusage *fuhead[MAXQUOTAS][FUHASH]; int aflag; /* all file systems */ int cflag; /* convert format to 32 or 64 bit size */ int gflag; /* check group quotas */ int uflag; /* check user quotas */ int vflag; /* verbose */ int fi; /* open disk file descriptor */ struct fileusage * addid(u_long, int, char *, const char *); void blkread(ufs2_daddr_t, char *, long); void freeinodebuf(void); union dinode * getnextinode(ino_t); int getquotagid(void); struct fileusage * lookup(u_long, int); int oneof(char *, char*[], int); void printchanges(const char *, int, struct dqblk *, struct fileusage *, u_long); void setinodebuf(ino_t); int update(const char *, struct quotafile *, int); void usage(void); int main(int argc, char *argv[]) { struct fstab *fs; struct passwd *pw; struct group *gr; struct quotafile *qfu, *qfg; int i, argnum, maxrun, errs, ch; long done = 0; char *name; errs = maxrun = 0; while ((ch = getopt(argc, argv, "ac:guvl:")) != -1) { switch(ch) { case 'a': aflag++; break; case 'c': if (cflag) usage(); cflag = atoi(optarg); break; case 'g': gflag++; break; case 'u': uflag++; break; case 'v': vflag++; break; case 'l': maxrun = atoi(optarg); break; default: usage(); } } argc -= optind; argv += optind; if ((argc == 0 && !aflag) || (argc > 0 && aflag)) usage(); if (cflag && cflag != 32 && cflag != 64) usage(); if (!gflag && !uflag) { gflag++; uflag++; } if (gflag) { setgrent(); while ((gr = getgrent()) != NULL) (void) addid((u_long)gr->gr_gid, GRPQUOTA, gr->gr_name, NULL); endgrent(); } if (uflag) { setpwent(); while ((pw = getpwent()) != NULL) (void) addid((u_long)pw->pw_uid, USRQUOTA, pw->pw_name, NULL); endpwent(); } /* * The maxrun (-l) option is now deprecated. */ if (maxrun > 0) warnx("the -l option is now deprecated"); if (aflag) exit(checkfstab(uflag, gflag)); if (setfsent() == 0) errx(1, "%s: can't open", FSTAB); while ((fs = getfsent()) != NULL) { if (((argnum = oneof(fs->fs_file, argv, argc)) >= 0 || (argnum = oneof(fs->fs_spec, argv, argc)) >= 0) && (name = blockcheck(fs->fs_spec))) { done |= 1 << argnum; qfu = NULL; if (uflag) qfu = quota_open(fs, USRQUOTA, O_CREAT|O_RDWR); qfg = NULL; if (gflag) qfg = quota_open(fs, GRPQUOTA, O_CREAT|O_RDWR); if (qfu == NULL && qfg == NULL) continue; errs += chkquota(name, qfu, qfg); if (qfu) quota_close(qfu); if (qfg) quota_close(qfg); } } endfsent(); for (i = 0; i < argc; i++) if ((done & (1 << i)) == 0) fprintf(stderr, "%s not found in %s\n", argv[i], FSTAB); exit(errs); } void usage(void) { (void)fprintf(stderr, "%s\n%s\n", "usage: quotacheck [-guv] [-c 32 | 64] [-l maxrun] -a", " quotacheck [-guv] [-c 32 | 64] filesystem ..."); exit(1); } /* * Scan the specified file system to check quota(s) present on it. */ int chkquota(char *specname, struct quotafile *qfu, struct quotafile *qfg) { struct fileusage *fup; union dinode *dp; struct fs *fs; int i, ret, mode, errs = 0; u_int32_t cg; ino_t curino, ino, inosused, userino = 0, groupino = 0; dev_t dev, userdev = 0, groupdev = 0; struct stat sb; const char *mntpt; char *cp; if (qfu != NULL) mntpt = quota_fsname(qfu); else if (qfg != NULL) mntpt = quota_fsname(qfg); else errx(1, "null quotafile information passed to chkquota()\n"); if (cflag) { if (vflag && qfu != NULL) printf("%s: convert user quota to %d bits\n", mntpt, cflag); if (qfu != NULL && quota_convert(qfu, cflag) < 0) { if (errno == EBADF) errx(1, "%s: cannot convert an active quota file", mntpt); err(1, "user quota conversion to size %d failed", cflag); } if (vflag && qfg != NULL) printf("%s: convert group quota to %d bits\n", mntpt, cflag); if (qfg != NULL && quota_convert(qfg, cflag) < 0) { if (errno == EBADF) errx(1, "%s: cannot convert an active quota file", mntpt); err(1, "group quota conversion to size %d failed", cflag); } } if ((fi = open(specname, O_RDONLY, 0)) < 0) { warn("%s", specname); return (1); } if ((stat(mntpt, &sb)) < 0) { warn("%s", mntpt); return (1); } dev = sb.st_dev; if (vflag) { (void)printf("*** Checking "); if (qfu) (void)printf("user%s", qfg ? " and " : ""); if (qfg) (void)printf("group"); (void)printf(" quotas for %s (%s)\n", specname, mntpt); } if (qfu) { if (stat(quota_qfname(qfu), &sb) == 0) { userino = sb.st_ino; userdev = sb.st_dev; } } if (qfg) { if (stat(quota_qfname(qfg), &sb) == 0) { groupino = sb.st_ino; groupdev = sb.st_dev; } } sync(); if ((ret = sbget(fi, &fs, UFS_STDSB, UFS_NOCSUM)) != 0) { switch (ret) { case ENOENT: warn("Cannot find file system superblock"); return (1); default: warn("Unable to read file system superblock"); return (1); } } bcopy(fs, &sblock, fs->fs_sbsize); free(fs); dev_bsize = sblock.fs_fsize / fsbtodb(&sblock, 1); maxino = sblock.fs_ncg * sblock.fs_ipg; for (cg = 0; cg < sblock.fs_ncg; cg++) { ino = cg * sblock.fs_ipg; setinodebuf(ino); blkread(fsbtodb(&sblock, cgtod(&sblock, cg)), (char *)(&cgblk), sblock.fs_cgsize); if (sblock.fs_magic == FS_UFS2_MAGIC) inosused = cgblk.cg_initediblk; else inosused = sblock.fs_ipg; /* * If we are using soft updates, then we can trust the * cylinder group inode allocation maps to tell us which * inodes are allocated. We will scan the used inode map * to find the inodes that are really in use, and then * read only those inodes in from disk. */ if (sblock.fs_flags & FS_DOSOFTDEP) { if (!cg_chkmagic(&cgblk)) errx(1, "CG %d: BAD MAGIC NUMBER\n", cg); cp = &cg_inosused(&cgblk)[(inosused - 1) / CHAR_BIT]; for ( ; inosused > 0; inosused -= CHAR_BIT, cp--) { if (*cp == 0) continue; for (i = 1 << (CHAR_BIT - 1); i > 0; i >>= 1) { if (*cp & i) break; inosused--; } break; } if (inosused <= 0) continue; } for (curino = 0; curino < inosused; curino++, ino++) { if ((dp = getnextinode(ino)) == NULL || ino < UFS_ROOTINO || (mode = DIP(dp, di_mode) & IFMT) == 0) continue; /* * XXX: Do not account for UIDs or GIDs that appear * to be negative to prevent generating 100GB+ * quota files. */ if ((int)DIP(dp, di_uid) < 0 || (int)DIP(dp, di_gid) < 0) { if (vflag) { if (aflag) (void)printf("%s: ", mntpt); (void)printf("out of range UID/GID (%u/%u) ino=%ju\n", DIP(dp, di_uid), DIP(dp,di_gid), (uintmax_t)ino); } continue; } /* * Do not account for file system snapshot files * or the actual quota data files to be consistent * with how they are handled inside the kernel. */ #ifdef SF_SNAPSHOT if (DIP(dp, di_flags) & SF_SNAPSHOT) continue; #endif if ((ino == userino && dev == userdev) || (ino == groupino && dev == groupdev)) continue; if (qfg) { fup = addid((u_long)DIP(dp, di_gid), GRPQUOTA, NULL, mntpt); fup->fu_curinodes++; if (mode == IFREG || mode == IFDIR || mode == IFLNK) fup->fu_curblocks += DIP(dp, di_blocks); } if (qfu) { fup = addid((u_long)DIP(dp, di_uid), USRQUOTA, NULL, mntpt); fup->fu_curinodes++; if (mode == IFREG || mode == IFDIR || mode == IFLNK) fup->fu_curblocks += DIP(dp, di_blocks); } } } freeinodebuf(); if (qfu) errs += update(mntpt, qfu, USRQUOTA); if (qfg) errs += update(mntpt, qfg, GRPQUOTA); close(fi); (void)fflush(stdout); return (errs); } /* * Update a specified quota file. */ int update(const char *fsname, struct quotafile *qf, int type) { struct fileusage *fup; u_long id, lastid, highid = 0; struct dqblk dqbuf; struct stat sb; static struct dqblk zerodqbuf; static struct fileusage zerofileusage; /* * Scan the on-disk quota file and record any usage changes. */ lastid = quota_maxid(qf); for (id = 0; id <= lastid; id++) { if (quota_read(qf, &dqbuf, id) < 0) dqbuf = zerodqbuf; if ((fup = lookup(id, type)) == NULL) fup = &zerofileusage; if (fup->fu_curinodes || fup->fu_curblocks || dqbuf.dqb_bsoftlimit || dqbuf.dqb_bhardlimit || dqbuf.dqb_isoftlimit || dqbuf.dqb_ihardlimit) highid = id; if (dqbuf.dqb_curinodes == fup->fu_curinodes && dqbuf.dqb_curblocks == fup->fu_curblocks) { fup->fu_curinodes = 0; fup->fu_curblocks = 0; continue; } printchanges(fsname, type, &dqbuf, fup, id); dqbuf.dqb_curinodes = fup->fu_curinodes; dqbuf.dqb_curblocks = fup->fu_curblocks; (void) quota_write_usage(qf, &dqbuf, id); fup->fu_curinodes = 0; fup->fu_curblocks = 0; } /* * Walk the hash table looking for ids with non-zero usage * that are not currently recorded in the quota file. E.g. * ids that are past the end of the current file. */ for (id = 0; id < FUHASH; id++) { for (fup = fuhead[type][id]; fup != NULL; fup = fup->fu_next) { if (fup->fu_id <= lastid) continue; if (fup->fu_curinodes == 0 && fup->fu_curblocks == 0) continue; bzero(&dqbuf, sizeof(struct dqblk)); if (fup->fu_id > highid) highid = fup->fu_id; printchanges(fsname, type, &dqbuf, fup, fup->fu_id); dqbuf.dqb_curinodes = fup->fu_curinodes; dqbuf.dqb_curblocks = fup->fu_curblocks; (void) quota_write_usage(qf, &dqbuf, fup->fu_id); fup->fu_curinodes = 0; fup->fu_curblocks = 0; } } /* * If this is old format file, then size may be smaller, * so ensure that we only truncate when it will make things * smaller, and not if it will grow an old format file. */ if (highid < lastid && stat(quota_qfname(qf), &sb) == 0 && sb.st_size > (off_t)((highid + 2) * sizeof(struct dqblk))) truncate(quota_qfname(qf), (((off_t)highid + 2) * sizeof(struct dqblk))); return (0); } /* * Check to see if target appears in list of size cnt. */ int oneof(char *target, char *list[], int cnt) { int i; for (i = 0; i < cnt; i++) if (strcmp(target, list[i]) == 0) return (i); return (-1); } /* * Determine the group identifier for quota files. */ int getquotagid(void) { struct group *gr; if ((gr = getgrnam(quotagroup)) != NULL) return (gr->gr_gid); return (-1); } /* * Routines to manage the file usage table. * * Lookup an id of a specific type. */ struct fileusage * lookup(u_long id, int type) { struct fileusage *fup; for (fup = fuhead[type][id & (FUHASH-1)]; fup != NULL; fup = fup->fu_next) if (fup->fu_id == id) return (fup); return (NULL); } /* * Add a new file usage id if it does not already exist. */ struct fileusage * addid(u_long id, int type, char *name, const char *fsname) { struct fileusage *fup, **fhp; int len; if ((fup = lookup(id, type)) != NULL) return (fup); if (name) len = strlen(name); else len = 0; if ((fup = calloc(1, sizeof(*fup) + len)) == NULL) errx(1, "calloc failed"); fhp = &fuhead[type][id & (FUHASH - 1)]; fup->fu_next = *fhp; *fhp = fup; fup->fu_id = id; if (name) bcopy(name, fup->fu_name, len + 1); else { (void)sprintf(fup->fu_name, "%lu", id); if (vflag) { if (aflag && fsname != NULL) (void)printf("%s: ", fsname); printf("unknown %cid: %lu\n", type == USRQUOTA ? 'u' : 'g', id); } } return (fup); } /* * Special purpose version of ginode used to optimize pass * over all the inodes in numerical order. */ static ino_t nextino, lastinum, lastvalidinum; static long readcnt, readpercg, fullcnt, inobufsize, partialcnt, partialsize; static caddr_t inodebuf; #define INOBUFSIZE 56*1024 /* size of buffer to read inodes */ union dinode * getnextinode(ino_t inumber) { long size; ufs2_daddr_t dblk; union dinode *dp; static caddr_t nextinop; if (inumber != nextino++ || inumber > lastvalidinum) errx(1, "bad inode number %ju to nextinode", (uintmax_t)inumber); if (inumber >= lastinum) { readcnt++; dblk = fsbtodb(&sblock, ino_to_fsba(&sblock, lastinum)); if (readcnt % readpercg == 0) { size = partialsize; lastinum += partialcnt; } else { size = inobufsize; lastinum += fullcnt; } /* * If blkread returns an error, it will already have zeroed * out the buffer, so we do not need to do so here. */ blkread(dblk, inodebuf, size); nextinop = inodebuf; } dp = (union dinode *)nextinop; if (sblock.fs_magic == FS_UFS1_MAGIC) nextinop += sizeof(struct ufs1_dinode); else nextinop += sizeof(struct ufs2_dinode); return (dp); } /* * Prepare to scan a set of inodes. */ void setinodebuf(ino_t inum) { if (inum % sblock.fs_ipg != 0) errx(1, "bad inode number %ju to setinodebuf", (uintmax_t)inum); lastvalidinum = inum + sblock.fs_ipg - 1; nextino = inum; lastinum = inum; readcnt = 0; if (inodebuf != NULL) return; inobufsize = blkroundup(&sblock, INOBUFSIZE); fullcnt = inobufsize / ((sblock.fs_magic == FS_UFS1_MAGIC) ? sizeof(struct ufs1_dinode) : sizeof(struct ufs2_dinode)); readpercg = sblock.fs_ipg / fullcnt; partialcnt = sblock.fs_ipg % fullcnt; partialsize = partialcnt * ((sblock.fs_magic == FS_UFS1_MAGIC) ? sizeof(struct ufs1_dinode) : sizeof(struct ufs2_dinode)); if (partialcnt != 0) { readpercg++; } else { partialcnt = fullcnt; partialsize = inobufsize; } if ((inodebuf = malloc((unsigned)inobufsize)) == NULL) errx(1, "cannot allocate space for inode buffer"); } /* * Free up data structures used to scan inodes. */ void freeinodebuf(void) { if (inodebuf != NULL) free(inodebuf); inodebuf = NULL; } /* * Read specified disk blocks. */ void blkread(ufs2_daddr_t bno, char *buf, long cnt) { if (lseek(fi, (off_t)bno * dev_bsize, SEEK_SET) < 0 || read(fi, buf, cnt) != cnt) errx(1, "blkread failed on block %ld", (long)bno); } /* * Display updated block and i-node counts. */ void printchanges(const char *fsname, int type, struct dqblk *dp, struct fileusage *fup, u_long id) { if (!vflag) return; if (aflag) (void)printf("%s: ", fsname); if (fup->fu_name[0] == '\0') (void)printf("%-8lu fixed ", id); else (void)printf("%-8s fixed ", fup->fu_name); switch (type) { case GRPQUOTA: (void)printf("(group):"); break; case USRQUOTA: (void)printf("(user): "); break; default: (void)printf("(unknown quota type %d)", type); break; } if (dp->dqb_curinodes != fup->fu_curinodes) (void)printf("\tinodes %lu -> %lu", (u_long)dp->dqb_curinodes, (u_long)fup->fu_curinodes); if (dp->dqb_curblocks != fup->fu_curblocks) (void)printf("\tblocks %lu -> %lu", (u_long)dp->dqb_curblocks, (u_long)fup->fu_curblocks); (void)printf("\n"); } diff --git a/share/man/man7/ffs.7 b/share/man/man7/ffs.7 index 7d003443f32a..a1a319ae47dd 100644 --- a/share/man/man7/ffs.7 +++ b/share/man/man7/ffs.7 @@ -1,328 +1,332 @@ .\" Copyright (c) 2001 Networks Associates Technology, Inc. .\" All rights reserved. .\" .\" This software was developed for the FreeBSD Project by Chris .\" Costello at Safeport Network Services and NAI Labs, the Security .\" Research Division of Network Associates, Inc. under DARPA/SPAWAR .\" contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS .\" research program. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd May 3, 2020 +.Dd January 19, 2025 .Dt FFS 7 .Os .Sh NAME .Nm ffs .Nd Berkeley fast file system .Sh SYNOPSIS In the kernel configuration file: .Cd "options FFS" .Cd "options QUOTA" .Cd "options SOFTUPDATES" .Cd "options SUIDDIR" .Cd "options UFS_ACL" .Cd "options UFS_DIRHASH" .Cd "options UFS_EXTATTR" .Cd "options UFS_EXTATTR_AUTOSTART" .Cd "options UFS_GJOURNAL" .Pp In .Xr fstab 5 : .Bd -literal -compact /dev/disk0a /mnt ufs rw 1 1 .Ed .Sh DESCRIPTION The Berkeley fast file system provides facilities to store file system data onto a disk device. .Nm has been optimized over the years for speed and reliability and is the default .Fx file system. .Ss Quotas .Bl -tag -width 2n .It Cd "options QUOTA" This option allows system administrators to set limits on disk usage on a per-user basis. Quotas can be used only on file systems mounted with the .Cm quota option; see .Xr quota 1 and .Xr edquota 8 . .El .Ss Soft Updates .Bl -tag -width 2n .It Cd "options SOFTUPDATES" The soft updates feature tracks writes to the disk and enforces metadata update dependencies (e.g., updating free block maps) to ensure that the file system remains consistent. .Pp To create a new file system with the soft updates enabled, use .Xr newfs 8 command: .Pp .D1 Nm newfs Fl U Ar fs .Pp .Ar fs can be either a mount point listed in .Xr fstab 5 .Pq e.g. , Pa /usr , or a disk device .Pq e.g., Pa /dev/da0a . .Pp It is possible to enable soft updates on an .Em unmounted file system by using .Xr tunefs 8 command: .Pp .D1 Nm tunefs Fl n Cm enable Ar fs .Pp Soft updates can also add journaling that reduces the time spent by .Xr fsck_ffs 8 cleaning up a filesystem after a crash from several minutes to a few seconds. The journal is placed in an inode named .Pa .sujournal , and is kept as a circular log of segments containing records that describe metadata operations. .Pp To create a new file system with both the soft updates and soft updates journaling enabled, use the following command: .Pp .D1 Nm newfs Fl j Ar fs .Pp This runs .Xr tunefs 8 command after .Xr newfs 8 command with .Fl U flag enabled. It is possible to enable soft updates journaling on an .Em unmounted file system by using .Xr tunefs 8 command: .Pp .D1 Nm tunefs Fl j Cm enable Ar fs .Pp This flag automatically enables the soft updates feature when it is not enabled. Note that this .Xr tunefs 8 command will fail if a file .Pa .sujournal already exists before enabling the soft updates journaling. .El .Ss File Ownership Inheritance .Bl -tag -width 2n .It Cd "options SUIDDIR" For use in file sharing environments on networks including .Tn "Microsoft Windows" and .Tn "Apple Macintosh" computers, this option allows files on file systems mounted with the .Cm suiddir option to inherit the ownership of its directory, i.e., .Dq "if it's my directory, it must be my file." .El .Ss Access Control Lists .Bl -tag -width 2n .It Cd "options UFS_ACL" Access control lists allow the association of fine-grained discretionary access control information with files and directories. This option requires the presence of the .Dv UFS_EXTATTR option, and it is recommended that .Dv UFS_EXTATTR_AUTOSTART is included as well, so that ACLs are enabled atomically upon mounting the file system. .El .Pp In order to enable support for ACLs, two extended attributes must be available in the .Dv EXTATTR_NAMESPACE_SYSTEM namespace: .Pa posix1e.acl_access , which holds the access ACL, and .Pa posix1e.acl_default , which holds the default ACL for directories. If you are using file system extended attributes, the following commands may be used to allocate space for and create the necessary EA backing files for ACLs in the root of each file system. In these examples, the root file system is used; see .Sx "Extended Attributes" for more details. .Bd -literal -offset indent mkdir -p /.attribute/system cd /.attribute/system extattrctl initattr -p / 388 posix1e.acl_access extattrctl initattr -p / 388 posix1e.acl_default .Ed .Pp On the next mount of the root file system, the attributes will be automatically started if .Dv UFS_EXTATTR_AUTOSTART is included in the kernel configuration, and ACLs will be enabled. .Ss Directory Hashing .Bl -tag -width 2n .It Cd "options UFS_DIRHASH" Implements a hash-based lookup scheme for directories in order to speed up accesses to very large directories. .El .Ss Extended Attributes .Bl -tag -width 2n .It Cd "options UFS_EXTATTR" Extended attributes allow the association of additional arbitrary metadata with files and directories, which can be assigned and retrieved from userland as well as from within the kernel; see .Xr extattrctl 8 . .It Cd "options UFS_EXTATTR_AUTOSTART" If this option is defined, .Nm will search for a .Pa .attribute subdirectory of the file system root during the mount operation. If found, extended attribute support will be automatically started for that file system. .El .Ss GEOM-based Journaling .Bl -tag -width 2n .It Cd "options UFS_GJOURNAL" Implements a block level journaling of a UFS file system, which is for both data and metadata. To enable this, create a .Xr gjournal 8 GEOM provider for a block device by using the following command: .Pp .D1 Nm gjournal label Ar da0 .Pp In this example, .Pa /dev/da0 is used as the target block device, and .Pa /dev/da0.journal is created. Then create a new file system by using .Xr newfs 8 with the block level journaling flag and mount it: .Pp .D1 Nm newfs Fl J Ar /dev/da0.journal .D1 Nm mount Fl o Cm async Ar /dev/da0.journal Ar /mnt .Pp .Cm async option is not mandatory but recommended for better performance because the journaling guarantees the consistency of an .Cm async mount. .Pp It is also possible to enable the block level journaling on an existing file system. To do so, use .Xr gjournal 8 utility to label the underlying block device and .Xr tunefs 8 utility to enable the block level journaling flag: .Pp .D1 Nm gjournal label Ar da0 .D1 Nm tunefs Fl J Cm enable Ar /dev/da0.journal .D1 Nm mount Fl o Cm async Ar /dev/da0.journal Ar /mnt .El .Ss Xr sysctl 8 MIBs The following .Xr sysctl 8 MIBs are defined for use with .Nm : .Bl -hang -width ".Va vfs.ffs.doreallocblk" .It Va vfs.ffs.doasyncfree Asynchronously write out modified i-node and indirect blocks upon reallocating file system blocks to be contiguous. .Pq Default: 1 . .It Va vfs.ffs.doreallocblks Enable support for the rearrangement of blocks to be contiguous. .Pq Default: 1 . +.It Va vfs.ffs.prttimechgs +Print a console message when timestamps for UFS1 filesystems are found +to be in the future and are changed to be the present time. +.Pq Default: 0 . .El .Sh HISTORY The .Nm manual page first appeared in .Fx 4.5 . .Sh SEE ALSO .Xr quota 1 , .Xr acl 3 , .Xr extattr 3 , .Xr edquota 8 , .Xr extattrctl 8 , .Xr fsck_ffs 8 , .Xr sysctl 8 , .Xr tunefs 8 .Rs .%A M. McKusick .%A W. Joy .%A S. Leffler .%A R. Fabry .%D August 1984 .%T "A Fast File System for UNIX" .%J "ACM Transactions on Computer Systems" .%N 2 .%V 3 .%P 181-197 .Re .Rs .%A M. McKusick .%D June 2000 .%T "Soft Updates: A Technique for Eliminating Most Synchronous Writes in the Fast Filesystem" .%J "Proceedings of the Freenix Track at the 1999 Usenix Annual Technical Conference" .%P 71-84 .Re .Rs .%A M. McKusick .%A J. Roberson .%D May 2010 .%T "Journaled Soft-updates" .%J "BSD Canada Conference 2010 (BSDCan)" .Re diff --git a/stand/libsa/ufs.c b/stand/libsa/ufs.c index 2c3b3764bd74..e1d540ed2321 100644 --- a/stand/libsa/ufs.c +++ b/stand/libsa/ufs.c @@ -1,958 +1,955 @@ /* $NetBSD: ufs.c,v 1.20 1998/03/01 07:15:39 ross Exp $ */ /*- * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * * Copyright (c) 1990, 1991 Carnegie Mellon University * All Rights Reserved. * * Author: David Golub * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Stand-alone file reading package. */ #include #include #include #include #include #include #include "stand.h" #include "string.h" static int ufs_open(const char *path, struct open_file *f); static int ufs_write(struct open_file *f, const void *buf, size_t size, size_t *resid); static int ufs_close(struct open_file *f); static int ufs_read(struct open_file *f, void *buf, size_t size, size_t *resid); static off_t ufs_seek(struct open_file *f, off_t offset, int where); static int ufs_stat(struct open_file *f, struct stat *sb); static int ufs_readdir(struct open_file *f, struct dirent *d); static int ufs_mount(const char *dev, const char *path, void **data); static int ufs_unmount(const char *dev, void *data); struct fs_ops ufs_fsops = { .fs_name = "ufs", .fo_open = ufs_open, .fo_close = ufs_close, .fo_read = ufs_read, .fo_write = ufs_write, .fo_seek = ufs_seek, .fo_stat = ufs_stat, .fo_readdir = ufs_readdir, .fo_mount = ufs_mount, .fo_unmount = ufs_unmount }; /* * In-core open file. */ struct file { off_t f_seekp; /* seek pointer */ struct fs *f_fs; /* pointer to super-block */ - union dinode { - struct ufs1_dinode di1; - struct ufs2_dinode di2; - } f_di; /* copy of on-disk inode */ + union dinode f_dp; /* copy of on-disk inode */ int f_nindir[UFS_NIADDR]; /* number of blocks mapped by indirect block at level i */ char *f_blk[UFS_NIADDR]; /* buffer for indirect block at level i */ size_t f_blksize[UFS_NIADDR]; /* size of buffer */ ufs2_daddr_t f_blkno[UFS_NIADDR];/* disk address of block in buffer */ ufs2_daddr_t f_buf_blkno; /* block number of data block */ char *f_buf; /* buffer for data block */ size_t f_buf_size; /* size of data block */ int f_inumber; /* inumber */ }; #define DIP(fp, field) \ ((fp)->f_fs->fs_magic == FS_UFS1_MAGIC ? \ - (fp)->f_di.di1.field : (fp)->f_di.di2.field) + (fp)->f_dp.dp1.field : (fp)->f_dp.dp2.field) typedef struct ufs_mnt { char *um_dev; int um_fd; STAILQ_ENTRY(ufs_mnt) um_link; } ufs_mnt_t; typedef STAILQ_HEAD(ufs_mnt_list, ufs_mnt) ufs_mnt_list_t; static ufs_mnt_list_t mnt_list = STAILQ_HEAD_INITIALIZER(mnt_list); static int read_inode(ino_t, struct open_file *); static int block_map(struct open_file *, ufs2_daddr_t, ufs2_daddr_t *); static int buf_read_file(struct open_file *, char **, size_t *); static int buf_write_file(struct open_file *, const char *, size_t *); static int search_directory(char *, struct open_file *, ino_t *); static int ufs_use_sa_read(void *, off_t, void **, int); /* from ffs_subr.c */ int ffs_sbget(void *devfd, struct fs **fsp, off_t sblock, int flags, char *filltype, int (*readfunc)(void *devfd, off_t loc, void **bufp, int size)); int ffs_sbsearch(void *, struct fs **, int, char *, int (*)(void *, off_t, void **, int)); /* * Read a new inode into a file structure. */ static int read_inode(ino_t inumber, struct open_file *f) { struct file *fp = (struct file *)f->f_fsdata; struct fs *fs = fp->f_fs; char *buf; size_t rsize; int rc; if (fs == NULL) panic("fs == NULL"); /* * Read inode and save it. */ buf = malloc(fs->fs_bsize); twiddle(1); rc = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsbtodb(fs, ino_to_fsba(fs, inumber)), fs->fs_bsize, buf, &rsize); if (rc) goto out; if (rsize != fs->fs_bsize) { rc = EIO; goto out; } if (fp->f_fs->fs_magic == FS_UFS1_MAGIC) - fp->f_di.di1 = ((struct ufs1_dinode *)buf) + fp->f_dp.dp1 = ((struct ufs1_dinode *)buf) [ino_to_fsbo(fs, inumber)]; else - fp->f_di.di2 = ((struct ufs2_dinode *)buf) + fp->f_dp.dp2 = ((struct ufs2_dinode *)buf) [ino_to_fsbo(fs, inumber)]; /* * Clear out the old buffers */ { int level; for (level = 0; level < UFS_NIADDR; level++) fp->f_blkno[level] = -1; fp->f_buf_blkno = -1; } fp->f_seekp = 0; fp->f_inumber = inumber; out: free(buf); return (rc); } /* * Given an offset in a file, find the disk block number that * contains that block. */ static int block_map(struct open_file *f, ufs2_daddr_t file_block, ufs2_daddr_t *disk_block_p) { struct file *fp = (struct file *)f->f_fsdata; struct fs *fs = fp->f_fs; int level; int idx; ufs2_daddr_t ind_block_num; int rc; /* * Index structure of an inode: * * di_db[0..UFS_NDADDR-1] hold block numbers for blocks * 0..UFS_NDADDR-1 * * di_ib[0] index block 0 is the single indirect block * holds block numbers for blocks * UFS_NDADDR .. UFS_NDADDR + NINDIR(fs)-1 * * di_ib[1] index block 1 is the double indirect block * holds block numbers for INDEX blocks for blocks * UFS_NDADDR + NINDIR(fs) .. * UFS_NDADDR + NINDIR(fs) + NINDIR(fs)**2 - 1 * * di_ib[2] index block 2 is the triple indirect block * holds block numbers for double-indirect * blocks for blocks * UFS_NDADDR + NINDIR(fs) + NINDIR(fs)**2 .. * UFS_NDADDR + NINDIR(fs) + NINDIR(fs)**2 * + NINDIR(fs)**3 - 1 */ if (file_block < UFS_NDADDR) { /* Direct block. */ *disk_block_p = DIP(fp, di_db[file_block]); return (0); } file_block -= UFS_NDADDR; /* * nindir[0] = NINDIR * nindir[1] = NINDIR**2 * nindir[2] = NINDIR**3 * etc */ for (level = 0; level < UFS_NIADDR; level++) { if (file_block < fp->f_nindir[level]) break; file_block -= fp->f_nindir[level]; } if (level == UFS_NIADDR) { /* Block number too high */ return (EFBIG); } ind_block_num = DIP(fp, di_ib[level]); for (; level >= 0; level--) { if (ind_block_num == 0) { *disk_block_p = 0; /* missing */ return (0); } if (fp->f_blkno[level] != ind_block_num) { if (fp->f_blk[level] == (char *)0) fp->f_blk[level] = malloc(fs->fs_bsize); twiddle(1); rc = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsbtodb(fp->f_fs, ind_block_num), fs->fs_bsize, fp->f_blk[level], &fp->f_blksize[level]); if (rc) return (rc); if (fp->f_blksize[level] != fs->fs_bsize) return (EIO); fp->f_blkno[level] = ind_block_num; } if (level > 0) { idx = file_block / fp->f_nindir[level - 1]; file_block %= fp->f_nindir[level - 1]; } else idx = file_block; if (fp->f_fs->fs_magic == FS_UFS1_MAGIC) ind_block_num = ((ufs1_daddr_t *)fp->f_blk[level])[idx]; else ind_block_num = ((ufs2_daddr_t *)fp->f_blk[level])[idx]; } *disk_block_p = ind_block_num; return (0); } /* * Write a portion of a file from an internal buffer. */ static int buf_write_file(struct open_file *f, const char *buf_p, size_t *size_p) { struct file *fp = (struct file *)f->f_fsdata; struct fs *fs = fp->f_fs; long off; ufs_lbn_t file_block; ufs2_daddr_t disk_block; size_t block_size; int rc; /* * Calculate the starting block address and offset. */ off = blkoff(fs, fp->f_seekp); file_block = lblkno(fs, fp->f_seekp); block_size = sblksize(fs, DIP(fp, di_size), file_block); rc = block_map(f, file_block, &disk_block); if (rc) return (rc); if (disk_block == 0) /* Because we can't allocate space on the drive */ return (EFBIG); /* * Truncate buffer at end of file, and at the end of * this block. */ if (*size_p > DIP(fp, di_size) - fp->f_seekp) *size_p = DIP(fp, di_size) - fp->f_seekp; if (*size_p > block_size - off) *size_p = block_size - off; /* * If we don't entirely occlude the block and it's not * in memory already, read it in first. */ if (((off > 0) || (*size_p + off < block_size)) && (file_block != fp->f_buf_blkno)) { if (fp->f_buf == (char *)0) fp->f_buf = malloc(fs->fs_bsize); twiddle(4); rc = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsbtodb(fs, disk_block), block_size, fp->f_buf, &fp->f_buf_size); if (rc) return (rc); fp->f_buf_blkno = file_block; } /* * Copy the user data into the cached block. */ bcopy(buf_p, fp->f_buf + off, *size_p); /* * Write the block out to storage. */ twiddle(4); rc = (f->f_dev->dv_strategy)(f->f_devdata, F_WRITE, fsbtodb(fs, disk_block), block_size, fp->f_buf, &fp->f_buf_size); return (rc); } /* * Read a portion of a file into an internal buffer. Return * the location in the buffer and the amount in the buffer. */ static int buf_read_file(struct open_file *f, char **buf_p, size_t *size_p) { struct file *fp = (struct file *)f->f_fsdata; struct fs *fs = fp->f_fs; long off; ufs_lbn_t file_block; ufs2_daddr_t disk_block; size_t block_size; int rc; off = blkoff(fs, fp->f_seekp); file_block = lblkno(fs, fp->f_seekp); block_size = sblksize(fs, DIP(fp, di_size), file_block); if (file_block != fp->f_buf_blkno) { if (fp->f_buf == NULL) fp->f_buf = malloc(fs->fs_bsize); rc = block_map(f, file_block, &disk_block); if (rc) return (rc); if (disk_block == 0) { bzero(fp->f_buf, block_size); fp->f_buf_size = block_size; } else { twiddle(4); rc = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsbtodb(fs, disk_block), block_size, fp->f_buf, &fp->f_buf_size); if (rc) return (rc); } fp->f_buf_blkno = file_block; } /* * Return address of byte in buffer corresponding to * offset, and size of remainder of buffer after that * byte. */ *buf_p = fp->f_buf + off; *size_p = block_size - off; /* * But truncate buffer at end of file. */ if (*size_p > DIP(fp, di_size) - fp->f_seekp) *size_p = DIP(fp, di_size) - fp->f_seekp; return (0); } /* * Search a directory for a name and return its * i_number. */ static int search_directory(char *name, struct open_file *f, ino_t *inumber_p) { struct file *fp = (struct file *)f->f_fsdata; struct direct *dp; struct direct *edp; char *buf; size_t buf_size; int namlen, length; int rc; length = strlen(name); fp->f_seekp = 0; while (fp->f_seekp < DIP(fp, di_size)) { rc = buf_read_file(f, &buf, &buf_size); if (rc) return (rc); dp = (struct direct *)buf; edp = (struct direct *)(buf + buf_size); while (dp < edp) { if (dp->d_ino == (ino_t)0) goto next; #if BYTE_ORDER == LITTLE_ENDIAN if (fp->f_fs->fs_maxsymlinklen <= 0) namlen = dp->d_type; else #endif namlen = dp->d_namlen; if (namlen == length && !strcmp(name, dp->d_name)) { /* found entry */ *inumber_p = dp->d_ino; return (0); } next: dp = (struct direct *)((char *)dp + dp->d_reclen); } fp->f_seekp += buf_size; } return (ENOENT); } /* * Open a file. */ static int ufs_open(const char *upath, struct open_file *f) { char *cp, *ncp; int c; ino_t inumber, parent_inumber; struct file *fp; struct fs *fs; int rc; int nlinks = 0; char namebuf[MAXPATHLEN+1]; char *buf = NULL; char *path = NULL; const char *dev; ufs_mnt_t *mnt; /* allocate file system specific data structure */ errno = 0; fp = calloc(1, sizeof(struct file)); if (fp == NULL) return (errno); f->f_fsdata = (void *)fp; dev = devformat((struct devdesc *)f->f_devdata); /* Is this device mounted? */ STAILQ_FOREACH(mnt, &mnt_list, um_link) { if (strcmp(dev, mnt->um_dev) == 0) break; } if (mnt == NULL) { /* read super block */ twiddle(1); if ((rc = ffs_sbget(f, &fs, UFS_STDSB, UFS_NOHASHFAIL, "stand", ufs_use_sa_read)) != 0) { goto out; } } else { struct open_file *sbf; struct file *sfp; /* get superblock from mounted file system */ sbf = fd2open_file(mnt->um_fd); sfp = sbf->f_fsdata; fs = sfp->f_fs; } fp->f_fs = fs; /* * Calculate indirect block levels. */ { ufs2_daddr_t mult; int level; mult = 1; for (level = 0; level < UFS_NIADDR; level++) { mult *= NINDIR(fs); fp->f_nindir[level] = mult; } } inumber = UFS_ROOTINO; if ((rc = read_inode(inumber, f)) != 0) goto out; cp = path = strdup(upath); if (path == NULL) { rc = ENOMEM; goto out; } while (*cp) { /* * Remove extra separators */ while (*cp == '/') cp++; if (*cp == '\0') break; /* * Check that current node is a directory. */ if ((DIP(fp, di_mode) & IFMT) != IFDIR) { rc = ENOTDIR; goto out; } /* * Get next component of path name. */ { int len = 0; ncp = cp; while ((c = *cp) != '\0' && c != '/') { if (++len > UFS_MAXNAMLEN) { rc = ENOENT; goto out; } cp++; } *cp = '\0'; } /* * Look up component in current directory. * Save directory inumber in case we find a * symbolic link. */ parent_inumber = inumber; rc = search_directory(ncp, f, &inumber); *cp = c; if (rc) goto out; /* * Open next component. */ if ((rc = read_inode(inumber, f)) != 0) goto out; /* * Check for symbolic link. */ if ((DIP(fp, di_mode) & IFMT) == IFLNK) { int link_len = DIP(fp, di_size); int len; len = strlen(cp); if (link_len + len > MAXPATHLEN || ++nlinks > MAXSYMLINKS) { rc = ENOENT; goto out; } bcopy(cp, &namebuf[link_len], len + 1); if (link_len < fs->fs_maxsymlinklen) { bcopy(DIP(fp, di_shortlink), namebuf, (unsigned) link_len); } else { /* * Read file for symbolic link */ size_t buf_size; ufs2_daddr_t disk_block; struct fs *fs = fp->f_fs; if (!buf) buf = malloc(fs->fs_bsize); rc = block_map(f, (ufs2_daddr_t)0, &disk_block); if (rc) goto out; twiddle(1); rc = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, fsbtodb(fs, disk_block), fs->fs_bsize, buf, &buf_size); if (rc) goto out; bcopy((char *)buf, namebuf, (unsigned)link_len); } /* * If relative pathname, restart at parent directory. * If absolute pathname, restart at root. */ cp = namebuf; if (*cp != '/') inumber = parent_inumber; else inumber = (ino_t)UFS_ROOTINO; if ((rc = read_inode(inumber, f)) != 0) goto out; } } /* * Found terminal component. */ rc = 0; fp->f_seekp = 0; out: free(buf); free(path); if (rc) { free(fp->f_buf); if (mnt == NULL && fp->f_fs != NULL) { free(fp->f_fs->fs_csp); free(fp->f_fs->fs_si); free(fp->f_fs); } free(fp); } return (rc); } /* * A read function for use by standalone-layer routines. */ static int ufs_use_sa_read(void *devfd, off_t loc, void **bufp, int size) { struct open_file *f; size_t buf_size; int error; f = (struct open_file *)devfd; if ((*bufp = malloc(size)) == NULL) return (ENOSPC); error = (f->f_dev->dv_strategy)(f->f_devdata, F_READ, loc / DEV_BSIZE, size, *bufp, &buf_size); if (error != 0) return (error); if (buf_size != size) return (EIO); return (0); } static int ufs_close(struct open_file *f) { ufs_mnt_t *mnt; struct file *fp = (struct file *)f->f_fsdata; int level; char *dev; f->f_fsdata = NULL; if (fp == NULL) return (0); for (level = 0; level < UFS_NIADDR; level++) { free(fp->f_blk[level]); } free(fp->f_buf); dev = devformat((struct devdesc *)f->f_devdata); STAILQ_FOREACH(mnt, &mnt_list, um_link) { if (strcmp(dev, mnt->um_dev) == 0) break; } if (mnt == NULL && fp->f_fs != NULL) { free(fp->f_fs->fs_csp); free(fp->f_fs->fs_si); free(fp->f_fs); } free(fp); return (0); } /* * Copy a portion of a file into kernel memory. * Cross block boundaries when necessary. */ static int ufs_read(struct open_file *f, void *start, size_t size, size_t *resid) { struct file *fp = (struct file *)f->f_fsdata; size_t csize; char *buf; size_t buf_size; int rc = 0; char *addr = start; while (size != 0) { if (fp->f_seekp >= DIP(fp, di_size)) break; rc = buf_read_file(f, &buf, &buf_size); if (rc) break; csize = size; if (csize > buf_size) csize = buf_size; bcopy(buf, addr, csize); fp->f_seekp += csize; addr += csize; size -= csize; } if (resid) *resid = size; return (rc); } /* * Write to a portion of an already allocated file. * Cross block boundaries when necessary. Can not * extend the file. */ static int ufs_write(struct open_file *f, const void *start, size_t size, size_t *resid) { struct file *fp = (struct file *)f->f_fsdata; size_t csize; int rc = 0; const char *addr = start; csize = size; while ((size != 0) && (csize != 0)) { if (fp->f_seekp >= DIP(fp, di_size)) break; if (csize >= 512) csize = 512; /* XXX */ rc = buf_write_file(f, addr, &csize); if (rc) break; fp->f_seekp += csize; addr += csize; size -= csize; } if (resid) *resid = size; return (rc); } static off_t ufs_seek(struct open_file *f, off_t offset, int where) { struct file *fp = (struct file *)f->f_fsdata; switch (where) { case SEEK_SET: fp->f_seekp = offset; break; case SEEK_CUR: fp->f_seekp += offset; break; case SEEK_END: fp->f_seekp = DIP(fp, di_size) - offset; break; default: errno = EINVAL; return (-1); } return (fp->f_seekp); } static int ufs_stat(struct open_file *f, struct stat *sb) { struct file *fp = (struct file *)f->f_fsdata; /* only important stuff */ sb->st_mode = DIP(fp, di_mode); sb->st_uid = DIP(fp, di_uid); sb->st_gid = DIP(fp, di_gid); sb->st_size = DIP(fp, di_size); sb->st_mtime = DIP(fp, di_mtime); /* * The items below are ufs specific! * Other fs types will need their own solution * if these fields are needed. */ sb->st_ino = fp->f_inumber; /* * We need something to differentiate devs. * fs_id is unique but 64bit, we xor the two * halves to squeeze it into 32bits. */ sb->st_dev = (dev_t)(fp->f_fs->fs_id[0] ^ fp->f_fs->fs_id[1]); return (0); } static int ufs_readdir(struct open_file *f, struct dirent *d) { struct file *fp = (struct file *)f->f_fsdata; struct direct *dp; char *buf; size_t buf_size; int error; /* * assume that a directory entry will not be split across blocks */ do { if (fp->f_seekp >= DIP(fp, di_size)) return (ENOENT); error = buf_read_file(f, &buf, &buf_size); if (error) return (error); dp = (struct direct *)buf; fp->f_seekp += dp->d_reclen; } while (dp->d_ino == (ino_t)0); d->d_type = dp->d_type; strcpy(d->d_name, dp->d_name); return (0); } static int ufs_mount(const char *dev, const char *path, void **data) { char *fs; ufs_mnt_t *mnt; struct open_file *f; errno = 0; mnt = calloc(1, sizeof(*mnt)); if (mnt == NULL) return (errno); mnt->um_fd = -1; mnt->um_dev = strdup(dev); if (mnt->um_dev == NULL) goto done; if (asprintf(&fs, "%s%s", dev, path) < 0) goto done; mnt->um_fd = open(fs, O_RDONLY); free(fs); if (mnt->um_fd == -1) goto done; /* Is it ufs file system? */ f = fd2open_file(mnt->um_fd); if (strcmp(f->f_ops->fs_name, "ufs") == 0) STAILQ_INSERT_TAIL(&mnt_list, mnt, um_link); else errno = ENXIO; done: if (errno != 0) { free(mnt->um_dev); if (mnt->um_fd >= 0) close(mnt->um_fd); free(mnt); } else { *data = mnt; } return (errno); } static int ufs_unmount(const char *dev __unused, void *data) { ufs_mnt_t *mnt = data; STAILQ_REMOVE(&mnt_list, mnt, ufs_mnt, um_link); free(mnt->um_dev); close(mnt->um_fd); free(mnt); return (0); } diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index b586ab8e126a..d4f733553071 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -1,3623 +1,3622 @@ /*- * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause) * * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 */ #include #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef ufs2_daddr_t allocfcn_t(struct inode *ip, uint64_t cg, ufs2_daddr_t bpref, int size, int rsize); static ufs2_daddr_t ffs_alloccg(struct inode *, uint64_t, ufs2_daddr_t, int, int); static ufs2_daddr_t ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); static void ffs_blkfree_cg(struct ufsmount *, struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t, struct workhead *); #ifdef INVARIANTS static int ffs_checkfreeblk(struct inode *, ufs2_daddr_t, long); #endif static void ffs_checkcgintegrity(struct fs *, uint64_t, int); static ufs2_daddr_t ffs_clusteralloc(struct inode *, uint64_t, ufs2_daddr_t, int); static ino_t ffs_dirpref(struct inode *); static ufs2_daddr_t ffs_fragextend(struct inode *, uint64_t, ufs2_daddr_t, int, int); static ufs2_daddr_t ffs_hashalloc(struct inode *, uint64_t, ufs2_daddr_t, int, int, allocfcn_t *); static ufs2_daddr_t ffs_nodealloccg(struct inode *, uint64_t, ufs2_daddr_t, int, int); static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); static void ffs_ckhash_cg(struct buf *); /* * Allocate a block in the filesystem. * * The size of the requested block is given, which must be some * multiple of fs_fsize and <= fs_bsize. * A preference may be optionally specified. If a preference is given * the following hierarchy is used to allocate a block: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate a block in the same cylinder group. * 4) quadratically rehash into other cylinder groups, until an * available block is located. * If no block preference is given the following hierarchy is used * to allocate a block: * 1) allocate a block in the cylinder group that contains the * inode for the file. * 2) quadratically rehash into other cylinder groups, until an * available block is located. */ int ffs_alloc(struct inode *ip, ufs2_daddr_t lbn, ufs2_daddr_t bpref, int size, int flags, struct ucred *cred, ufs2_daddr_t *bnp) { struct fs *fs; struct ufsmount *ump; ufs2_daddr_t bno; uint64_t cg, reclaimed; int64_t delta; #ifdef QUOTA int error; #endif *bnp = 0; ump = ITOUMP(ip); fs = ump->um_fs; mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_alloc: bad size"); } if (cred == NOCRED) panic("ffs_alloc: missing credential"); #endif /* INVARIANTS */ reclaimed = 0; retry: #ifdef QUOTA UFS_UNLOCK(ump); error = chkdq(ip, btodb(size), cred, 0); if (error) return (error); UFS_LOCK(ump); #endif if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) goto nospace; if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) goto nospace; if (bpref >= fs->fs_size) bpref = 0; if (bpref == 0) cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); if (bno > 0) { delta = btodb(size); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) UFS_INODE_SET_FLAG(ip, IN_CHANGE); else UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); *bnp = bno; return (0); } nospace: #ifdef QUOTA UFS_UNLOCK(ump); /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(size), cred, FORCE); UFS_LOCK(ump); #endif if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { reclaimed = 1; softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); goto retry; } if (ffs_fsfail_cleanup_locked(ump, 0)) { UFS_UNLOCK(ump); return (ENXIO); } if (reclaimed > 0 && ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { UFS_UNLOCK(ump); ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); } else { UFS_UNLOCK(ump); } return (ENOSPC); } /* * Reallocate a fragment to a bigger size * * The number and size of the old block is given, and a preference * and new size is also specified. The allocator attempts to extend * the original block. Failing that, the regular block allocator is * invoked to get an appropriate block. */ int ffs_realloccg(struct inode *ip, ufs2_daddr_t lbprev, ufs2_daddr_t bprev, ufs2_daddr_t bpref, int osize, int nsize, int flags, struct ucred *cred, struct buf **bpp) { struct vnode *vp; struct fs *fs; struct buf *bp; struct ufsmount *ump; uint64_t cg, request, reclaimed; int error, gbflags; ufs2_daddr_t bno; int64_t delta; vp = ITOV(ip); ump = ITOUMP(ip); fs = ump->um_fs; bp = NULL; gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; #ifdef WITNESS gbflags |= IS_SNAPSHOT(ip) ? GB_NOWITNESS : 0; #endif mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_realloccg: allocation on suspended filesystem"); if ((uint64_t)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || (uint64_t)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { printf( "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, osize, nsize, fs->fs_fsmnt); panic("ffs_realloccg: bad size"); } if (cred == NOCRED) panic("ffs_realloccg: missing credential"); #endif /* INVARIANTS */ reclaimed = 0; retry: if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) { goto nospace; } if (bprev == 0) { printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev, fs->fs_fsmnt); panic("ffs_realloccg: bad bprev"); } UFS_UNLOCK(ump); /* * Allocate the extra space in the buffer. */ error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); if (error) { return (error); } if (bp->b_blkno == bp->b_lblkno) { if (lbprev >= UFS_NDADDR) panic("ffs_realloccg: lbprev out of range"); bp->b_blkno = fsbtodb(fs, bprev); } #ifdef QUOTA error = chkdq(ip, btodb(nsize - osize), cred, 0); if (error) { brelse(bp); return (error); } #endif /* * Check for extension in the existing location. */ *bpp = NULL; cg = dtog(fs, bprev); UFS_LOCK(ump); bno = ffs_fragextend(ip, cg, bprev, osize, nsize); if (bno) { if (bp->b_blkno != fsbtodb(fs, bno)) panic("ffs_realloccg: bad blockno"); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) UFS_INODE_SET_FLAG(ip, IN_CHANGE); else UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); allocbuf(bp, nsize); bp->b_flags |= B_DONE; vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; return (0); } /* * Allocate a new disk location. */ if (bpref >= fs->fs_size) bpref = 0; switch ((int)fs->fs_optim) { case FS_OPTSPACE: /* * Allocate an exact sized fragment. Although this makes * best use of space, we will waste time relocating it if * the file continues to grow. If the fragmentation is * less than half of the minimum free reserve, we choose * to begin optimizing for time. */ request = nsize; if (fs->fs_minfree <= 5 || fs->fs_cstotal.cs_nffree > (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) break; log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTTIME; break; case FS_OPTTIME: /* * At this point we have discovered a file that is trying to * grow a small fragment to a larger fragment. To save time, * we allocate a full sized block, then free the unused portion. * If the file continues to grow, the `ffs_fragextend' call * above will be able to grow it in place without further * copying. If aberrant programs cause disk fragmentation to * grow within 2% of the free reserve, we choose to begin * optimizing for space. */ request = fs->fs_bsize; if (fs->fs_cstotal.cs_nffree < (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) break; log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTSPACE; break; default: printf("dev = %s, optim = %ld, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt); panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(vp)) /* * The usual case is that a smaller fragment that * was just allocated has been replaced with a bigger * fragment or a full-size block. If it is marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the block was written * earlier, but very uncommon. If the block has never * been written, there is no need to send a BIO_DELETE * for it when it is freed. The gain from avoiding the * TRIMs for the common case of unwritten blocks far * exceeds the cost of the write amplification for the * uncommon case of failing to send a TRIM for a block * that had been written. */ ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, ip->i_number, vp->v_type, NULL, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) UFS_INODE_SET_FLAG(ip, IN_CHANGE); else UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); allocbuf(bp, nsize); bp->b_flags |= B_DONE; vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; return (0); } #ifdef QUOTA UFS_UNLOCK(ump); /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); UFS_LOCK(ump); #endif nospace: /* * no space available */ if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { reclaimed = 1; UFS_UNLOCK(ump); if (bp) { brelse(bp); bp = NULL; } UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); goto retry; } if (bp) brelse(bp); if (ffs_fsfail_cleanup_locked(ump, 0)) { UFS_UNLOCK(ump); return (ENXIO); } if (reclaimed > 0 && ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { UFS_UNLOCK(ump); ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); } else { UFS_UNLOCK(ump); } return (ENOSPC); } /* * Reallocate a sequence of blocks into a contiguous sequence of blocks. * * The vnode and an array of buffer pointers for a range of sequential * logical blocks to be made contiguous is given. The allocator attempts * to find a range of sequential blocks starting as close as possible * from the end of the allocation for the logical block immediately * preceding the current range. If successful, the physical block numbers * in the buffer pointers and in the inode are changed to reflect the new * allocation. If unsuccessful, the allocation is left unchanged. The * success in doing the reallocation is returned. Note that the error * return is not reflected back to the user. Rather the previous block * allocation will be used. */ -SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, - "FFS filesystem"); +SYSCTL_DECL(_vfs_ffs); static int doasyncfree = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, "do not force synchronous writes when blocks are reallocated"); static int doreallocblks = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "enable block reallocation"); static int dotrimcons = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RWTUN, &dotrimcons, 0, "enable BIO_DELETE / TRIM consolidation"); static int maxclustersearch = 10; SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, 0, "max number of cylinder group to search for contigous blocks"); #ifdef DIAGNOSTIC static int prtrealloc = 0; SYSCTL_INT(_debug, OID_AUTO, ffs_prtrealloc, CTLFLAG_RW, &prtrealloc, 0, "print out FFS filesystem block reallocation operations"); #endif int ffs_reallocblks( struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap) { struct ufsmount *ump; int error; /* * We used to skip reallocating the blocks of a file into a * contiguous sequence if the underlying flash device requested * BIO_DELETE notifications, because devices that benefit from * BIO_DELETE also benefit from not moving the data. However, * the destination for the data is usually moved before the data * is written to the initially allocated location, so we rarely * suffer the penalty of extra writes. With the addition of the * consolidation of contiguous blocks into single BIO_DELETE * operations, having fewer but larger contiguous blocks reduces * the number of (slow and expensive) BIO_DELETE operations. So * when doing BIO_DELETE consolidation, we do block reallocation. * * Skip if reallocblks has been disabled globally. */ ump = ap->a_vp->v_mount->mnt_data; if ((((ump->um_flags) & UM_CANDELETE) != 0 && dotrimcons == 0) || doreallocblks == 0) return (ENOSPC); /* * We can't wait in softdep prealloc as it may fsync and recurse * here. Instead we simply fail to reallocate blocks if this * rare condition arises. */ if (DOINGSUJ(ap->a_vp)) if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) return (ENOSPC); vn_seqc_write_begin(ap->a_vp); error = ump->um_fstype == UFS1 ? ffs_reallocblks_ufs1(ap) : ffs_reallocblks_ufs2(ap); vn_seqc_write_end(ap->a_vp); return (error); } static int ffs_reallocblks_ufs1( struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap) { struct fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp, *bp; ufs1_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; ufs_lbn_t start_lbn, end_lbn; ufs1_daddr_t soff, newblk, blkno; ufs2_daddr_t pref; struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; int i, cg, len, start_lvl, end_lvl, ssize; vp = ap->a_vp; ip = VTOI(vp); ump = ITOUMP(ip); fs = ump->um_fs; /* * If we are not tracking block clusters or if we have less than 4% * free blocks left, then do not attempt to cluster. Running with * less than 5% free block reserve is not recommended and those that * choose to do so do not expect to have good file layout. */ if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef INVARIANTS for (i = 0; i < len; i++) if (!ffs_checkfreeblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 1"); for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ffs_reallocblks: non-logical cluster"); blkno = buflist->bs_children[0]->b_blkno; ssize = fsbtodb(fs, fs->fs_frag); for (i = 1; i < len - 1; i++) if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) panic("ffs_reallocblks: non-physical cluster %d", i); #endif /* * If the cluster crosses the boundary for the first indirect * block, leave space for the indirect block. Indirect blocks * are initially laid out in a position after the last direct * block. Block reallocation would usually destroy locality by * moving the indirect block out of the way to make room for * data blocks if we didn't compensate here. We should also do * this for other indirect block boundaries, but it is only * important for the first one. */ if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) return (ENOSPC); /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. */ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. */ if (start_lvl == 0) { sbap = &ip->i_din1->di_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (ufs1_daddr_t *)sbp->b_data; soff = idp->in_off; } /* * If the block range spans two block maps, get the second map. */ ebap = NULL; if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef INVARIANTS if (start_lvl > 0 && start_ap[start_lvl - 1].in_lbn == idp->in_lbn) panic("ffs_reallocblk: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) goto fail; ebap = (ufs1_daddr_t *)ebp->b_data; } /* * Find the preferred location for the cluster. If we have not * previously failed at this endeavor, then follow our standard * preference calculation. If we have failed at it, then pick up * where we last ended our search. */ UFS_LOCK(ump); if (ip->i_nextclustercg == -1) pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); else pref = cgdata(fs, ip->i_nextclustercg); /* * Search the block map looking for an allocation of the desired size. * To avoid wasting too much time, we limit the number of cylinder * groups that we will search. */ cg = dtog(fs, pref); MPASS(cg < fs->fs_ncg); for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) break; cg += 1; if (cg >= fs->fs_ncg) cg = 0; } /* * If we have failed in our search, record where we gave up for * next time. Otherwise, fall back to our usual search citerion. */ if (newblk == 0) { ip->i_nextclustercg = cg; UFS_UNLOCK(ump); goto fail; } ip->i_nextclustercg = -1; /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. */ #ifdef DIAGNOSTIC if (prtrealloc) printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef INVARIANTS if (!ffs_checkfreeblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 2"); if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) panic("ffs_reallocblks: alloc mismatch"); #endif #ifdef DIAGNOSTIC if (prtrealloc) printf(" %d,", *bap); #endif if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_din1->di_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, blkno, *bap, fs->fs_bsize, fs->fs_bsize, buflist->bs_children[i]); else softdep_setup_allocindir_page(ip, start_lbn + i, i < ssize ? sbp : ebp, soff + i, blkno, *bap, buflist->bs_children[i]); } *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practise * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared. */ if (sbap != &ip->i_din1->di_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); if (!doasyncfree) ffs_update(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. */ #ifdef DIAGNOSTIC if (prtrealloc) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { bp = buflist->bs_children[i]; if (!DOINGSOFTDEP(vp)) /* * The usual case is that a set of N-contiguous blocks * that was just allocated has been replaced with a * set of N+1-contiguous blocks. If they are marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the blocks were written * earlier, but very uncommon. If the blocks have never * been written, there is no need to send a BIO_DELETE * for them when they are freed. The gain from avoiding * the TRIMs for the common case of unwritten blocks * far exceeds the cost of the write amplification for * the uncommon case of failing to send a TRIM for the * blocks that had been written. */ ffs_blkfree(ump, fs, ump->um_devvp, dbtofsb(fs, bp->b_blkno), fs->fs_bsize, ip->i_number, vp->v_type, NULL, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); bp->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkfreeblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DIAGNOSTIC if (prtrealloc) printf(" %d,", blkno); #endif } #ifdef DIAGNOSTIC if (prtrealloc) { prtrealloc--; printf("\n"); } #endif return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_din1->di_db[0]) brelse(sbp); return (ENOSPC); } static int ffs_reallocblks_ufs2( struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap) { struct fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp, *bp; ufs2_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; ufs_lbn_t start_lbn, end_lbn; ufs2_daddr_t soff, newblk, blkno, pref; struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; int i, cg, len, start_lvl, end_lvl, ssize; vp = ap->a_vp; ip = VTOI(vp); ump = ITOUMP(ip); fs = ump->um_fs; /* * If we are not tracking block clusters or if we have less than 4% * free blocks left, then do not attempt to cluster. Running with * less than 5% free block reserve is not recommended and those that * choose to do so do not expect to have good file layout. */ if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef INVARIANTS for (i = 0; i < len; i++) if (!ffs_checkfreeblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 1"); for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ffs_reallocblks: non-logical cluster"); blkno = buflist->bs_children[0]->b_blkno; ssize = fsbtodb(fs, fs->fs_frag); for (i = 1; i < len - 1; i++) if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) panic("ffs_reallocblks: non-physical cluster %d", i); #endif /* * If the cluster crosses the boundary for the first indirect * block, do not move anything in it. Indirect blocks are * usually initially laid out in a position between the data * blocks. Block reallocation would usually destroy locality by * moving the indirect block out of the way to make room for * data blocks if we didn't compensate here. We should also do * this for other indirect block boundaries, but it is only * important for the first one. */ if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) return (ENOSPC); /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. */ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. */ if (start_lvl == 0) { sbap = &ip->i_din2->di_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (ufs2_daddr_t *)sbp->b_data; soff = idp->in_off; } /* * If the block range spans two block maps, get the second map. */ ebap = NULL; if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef INVARIANTS if (start_lvl > 0 && start_ap[start_lvl - 1].in_lbn == idp->in_lbn) panic("ffs_reallocblk: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) goto fail; ebap = (ufs2_daddr_t *)ebp->b_data; } /* * Find the preferred location for the cluster. If we have not * previously failed at this endeavor, then follow our standard * preference calculation. If we have failed at it, then pick up * where we last ended our search. */ UFS_LOCK(ump); if (ip->i_nextclustercg == -1) pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); else pref = cgdata(fs, ip->i_nextclustercg); /* * Search the block map looking for an allocation of the desired size. * To avoid wasting too much time, we limit the number of cylinder * groups that we will search. */ cg = dtog(fs, pref); MPASS(cg < fs->fs_ncg); for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) break; cg += 1; if (cg >= fs->fs_ncg) cg = 0; } /* * If we have failed in our search, record where we gave up for * next time. Otherwise, fall back to our usual search citerion. */ if (newblk == 0) { ip->i_nextclustercg = cg; UFS_UNLOCK(ump); goto fail; } ip->i_nextclustercg = -1; /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. */ #ifdef DIAGNOSTIC if (prtrealloc) printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef INVARIANTS if (!ffs_checkfreeblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 2"); if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) panic("ffs_reallocblks: alloc mismatch"); #endif #ifdef DIAGNOSTIC if (prtrealloc) printf(" %jd,", (intmax_t)*bap); #endif if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_din2->di_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, blkno, *bap, fs->fs_bsize, fs->fs_bsize, buflist->bs_children[i]); else softdep_setup_allocindir_page(ip, start_lbn + i, i < ssize ? sbp : ebp, soff + i, blkno, *bap, buflist->bs_children[i]); } *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practise * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared. */ if (sbap != &ip->i_din2->di_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); if (!doasyncfree) ffs_update(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. */ #ifdef DIAGNOSTIC if (prtrealloc) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { bp = buflist->bs_children[i]; if (!DOINGSOFTDEP(vp)) /* * The usual case is that a set of N-contiguous blocks * that was just allocated has been replaced with a * set of N+1-contiguous blocks. If they are marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the blocks were written * earlier, but very uncommon. If the blocks have never * been written, there is no need to send a BIO_DELETE * for them when they are freed. The gain from avoiding * the TRIMs for the common case of unwritten blocks * far exceeds the cost of the write amplification for * the uncommon case of failing to send a TRIM for the * blocks that had been written. */ ffs_blkfree(ump, fs, ump->um_devvp, dbtofsb(fs, bp->b_blkno), fs->fs_bsize, ip->i_number, vp->v_type, NULL, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); bp->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkfreeblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DIAGNOSTIC if (prtrealloc) printf(" %jd,", (intmax_t)blkno); #endif } #ifdef DIAGNOSTIC if (prtrealloc) { prtrealloc--; printf("\n"); } #endif return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_din2->di_db[0]) brelse(sbp); return (ENOSPC); } /* * Allocate an inode in the filesystem. * * If allocating a directory, use ffs_dirpref to select the inode. * If allocating in a directory, the following hierarchy is followed: * 1) allocate the preferred inode. * 2) allocate an inode in the same cylinder group. * 3) quadratically rehash into other cylinder groups, until an * available inode is located. * If no inode preference is given the following hierarchy is used * to allocate an inode: * 1) allocate an inode in cylinder group 0. * 2) quadratically rehash into other cylinder groups, until an * available inode is located. */ int ffs_valloc(struct vnode *pvp, int mode, struct ucred *cred, struct vnode **vpp) { struct inode *pip; struct fs *fs; struct inode *ip; struct timespec ts; struct ufsmount *ump; ino_t ino, ipref; uint64_t cg; int error, reclaimed; *vpp = NULL; pip = VTOI(pvp); ump = ITOUMP(pip); fs = ump->um_fs; UFS_LOCK(ump); reclaimed = 0; retry: if (fs->fs_cstotal.cs_nifree == 0) goto noinodes; if ((mode & IFMT) == IFDIR) ipref = ffs_dirpref(pip); else ipref = pip->i_number; if (ipref >= fs->fs_ncg * fs->fs_ipg) ipref = 0; cg = ino_to_cg(fs, ipref); /* * Track number of dirs created one after another * in a same cg without intervening by files. */ if ((mode & IFMT) == IFDIR) { if (fs->fs_contigdirs[cg] < 255) fs->fs_contigdirs[cg]++; } else { if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, (allocfcn_t *)ffs_nodealloccg); if (ino == 0) goto noinodes; /* * Get rid of the cached old vnode, force allocation of a new vnode * for this inode. If this fails, release the allocated ino and * return the error. */ if ((error = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, FFSV_FORCEINSMQ | FFSV_REPLACE | FFSV_NEWINODE)) != 0) { ffs_vfree(pvp, ino, mode); return (error); } /* * We got an inode, so check mode and panic if it is already allocated. */ ip = VTOI(*vpp); if (ip->i_mode) { printf("mode = 0%o, inum = %ju, fs = %s\n", ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); panic("ffs_valloc: dup alloc"); } if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ printf("free inode %s/%ju had %ld blocks\n", fs->fs_fsmnt, (intmax_t)ino, (long)DIP(ip, i_blocks)); DIP_SET(ip, i_blocks, 0); } ip->i_flags = 0; DIP_SET(ip, i_flags, 0); if ((mode & IFMT) == IFDIR) DIP_SET(ip, i_dirdepth, DIP(pip, i_dirdepth) + 1); /* * Set up a new generation number for this inode. */ while (ip->i_gen == 0 || ++ip->i_gen == 0) ip->i_gen = arc4random(); DIP_SET(ip, i_gen, ip->i_gen); if (fs->fs_magic == FS_UFS2_MAGIC) { vfs_timestamp(&ts); ip->i_din2->di_birthtime = ts.tv_sec; ip->i_din2->di_birthnsec = ts.tv_nsec; } ip->i_flag = 0; (*vpp)->v_vflag = 0; (*vpp)->v_type = VNON; if (fs->fs_magic == FS_UFS2_MAGIC) { (*vpp)->v_op = &ffs_vnodeops2; UFS_INODE_SET_FLAG(ip, IN_UFS2); } else { (*vpp)->v_op = &ffs_vnodeops1; } return (0); noinodes: if (reclaimed == 0) { reclaimed = 1; softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); goto retry; } if (ffs_fsfail_cleanup_locked(ump, 0)) { UFS_UNLOCK(ump); return (ENXIO); } if (ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { UFS_UNLOCK(ump); ffs_fserr(fs, pip->i_number, "out of inodes"); uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); } else { UFS_UNLOCK(ump); } return (ENOSPC); } /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ static ino_t ffs_dirpref(struct inode *pip) { struct fs *fs; int cg, prefcg, curcg, dirsize, cgsize; int depth, range, start, end, numdirs, power, numerator, denominator; uint64_t avgifree, avgbfree, avgndir, curdirsize; uint64_t minifree, minbfree, maxndir; uint64_t maxcontigdirs; mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); fs = ITOFS(pip); avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; /* * Select a preferred cylinder group to place a new directory. * If we are near the root of the filesystem we aim to spread * them out as much as possible. As we descend deeper from the * root we cluster them closer together around their parent as * we expect them to be more closely interactive. Higher-level * directories like usr/src/sys and usr/src/bin should be * separated while the directories in these areas are more * likely to be accessed together so should be closer. * * We pick a range of cylinder groups around the cylinder group * of the directory in which we are being created. The size of * the range for our search is based on our depth from the root * of our filesystem. We then probe that range based on how many * directories are already present. The first new directory is at * 1/2 (middle) of the range; the second is in the first 1/4 of the * range, then at 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, 3/16, 5/16, etc. */ depth = DIP(pip, i_dirdepth); range = fs->fs_ncg / (1 << depth); curcg = ino_to_cg(fs, pip->i_number); start = curcg - (range / 2); if (start < 0) start += fs->fs_ncg; end = curcg + (range / 2); if (end >= fs->fs_ncg) end -= fs->fs_ncg; numdirs = pip->i_effnlink - 1; power = fls(numdirs); numerator = (numdirs & ~(1 << (power - 1))) * 2 + 1; denominator = 1 << power; prefcg = (curcg - (range / 2) + (range * numerator / denominator)); if (prefcg < 0) prefcg += fs->fs_ncg; if (prefcg >= fs->fs_ncg) prefcg -= fs->fs_ncg; /* * If this filesystem is not tracking directory depths, * revert to the old algorithm. */ if (depth == 0 && pip->i_number != UFS_ROOTINO) prefcg = curcg; /* * Count various limits which used for * optimal allocation of a directory inode. */ maxndir = min(avgndir + (1 << depth), fs->fs_ipg); minifree = avgifree - avgifree / 4; if (minifree < 1) minifree = 1; minbfree = avgbfree - avgbfree / 4; if (minbfree < 1) minbfree = 1; cgsize = fs->fs_fsize * fs->fs_fpg; dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; if (dirsize < curdirsize) dirsize = curdirsize; if (dirsize <= 0) maxcontigdirs = 0; /* dirsize overflowed */ else maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); if (fs->fs_avgfpdir > 0) maxcontigdirs = min(maxcontigdirs, fs->fs_ipg / fs->fs_avgfpdir); if (maxcontigdirs == 0) maxcontigdirs = 1; /* * Limit number of dirs in one cg and reserve space for * regular files, but only if we have no deficit in * inodes or space. * * We are trying to find a suitable cylinder group nearby * our preferred cylinder group to place a new directory. * We scan from our preferred cylinder group forward looking * for a cylinder group that meets our criterion. If we get * to the final cylinder group and do not find anything, * we start scanning forwards from the beginning of the * filesystem. While it might seem sensible to start scanning * backwards or even to alternate looking forward and backward, * this approach fails badly when the filesystem is nearly full. * Specifically, we first search all the areas that have no space * and finally try the one preceding that. We repeat this on * every request and in the case of the final block end up * searching the entire filesystem. By jumping to the front * of the filesystem, our future forward searches always look * in new cylinder groups so finds every possible block after * one pass over the filesystem. */ for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } /* * This is a backstop when we have deficit in space. */ for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) return ((ino_t)(fs->fs_ipg * cg)); for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) break; return ((ino_t)(fs->fs_ipg * cg)); } /* * Select the desired position for the next block in a file. The file is * logically divided into sections. The first section is composed of the * direct blocks and the next fs_maxbpg blocks. Each additional section * contains fs_maxbpg blocks. * * If no blocks have been allocated in the first section, the policy is to * request a block in the same cylinder group as the inode that describes * the file. The first indirect is allocated immediately following the last * direct block and the data blocks for the first indirect immediately * follow it. * * If no blocks have been allocated in any other section, the indirect * block(s) are allocated in the same cylinder group as its inode in an * area reserved immediately following the inode blocks. The policy for * the data blocks is to place them in a cylinder group with a greater than * average number of free blocks. An appropriate cylinder group is found * by using a rotor that sweeps the cylinder groups. When a new group of * blocks is needed, the sweep begins in the cylinder group following the * cylinder group from which the previous allocation was made. The sweep * continues until a cylinder group with greater than the average number * of free blocks is found. If the allocation is for the first block in an * indirect block or the previous block is a hole, then the information on * the previous allocation is unavailable; here a best guess is made based * on the logical block number being allocated. * * If a section is already partially allocated, the policy is to * allocate blocks contiguously within the section if possible. */ ufs2_daddr_t ffs_blkpref_ufs1(struct inode *ip, ufs_lbn_t lbn, int indx, ufs1_daddr_t *bap) { struct fs *fs; uint64_t cg, inocg; uint64_t avgbfree, startcg; ufs2_daddr_t pref, prevbn; KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); fs = ITOFS(ip); /* * Allocation of indirect blocks is indicated by passing negative * values in indx: -1 for single indirect, -2 for double indirect, * -3 for triple indirect. As noted below, we attempt to allocate * the first indirect inline with the file data. For all later * indirect blocks, the data is often allocated in other cylinder * groups. However to speed random file access and to speed up * fsck, the filesystem reserves the first fs_metaspace blocks * (typically half of fs_minfree) of the data area of each cylinder * group to hold these later indirect blocks. */ inocg = ino_to_cg(fs, ip->i_number); if (indx < 0) { /* * Our preference for indirect blocks is the zone at the * beginning of the inode's cylinder group data area that * we try to reserve for indirect blocks. */ pref = cgmeta(fs, inocg); /* * If we are allocating the first indirect block, try to * place it immediately following the last direct block. */ if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && ip->i_din1->di_db[UFS_NDADDR - 1] != 0) { pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag; if (dtog(fs, pref) >= fs->fs_ncg) pref = 0; } return (pref); } /* * If we are allocating the first data block in the first indirect * block and the indirect has been allocated in the data block area, * try to place it immediately following the indirect block. */ if (lbn == UFS_NDADDR) { pref = ip->i_din1->di_ib[0]; if (pref != 0 && pref >= cgdata(fs, inocg) && pref < cgbase(fs, inocg + 1)) { if (dtog(fs, pref + fs->fs_frag) >= fs->fs_ncg) return (0); return (pref + fs->fs_frag); } } /* * If we are at the beginning of a file, or we have already allocated * the maximum number of blocks per cylinder group, or we do not * have a block allocated immediately preceding us, then we need * to decide where to start allocating new blocks. */ if (indx == 0) { prevbn = 0; } else { prevbn = bap[indx - 1]; if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, fs->fs_bsize) != 0) prevbn = 0; } if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { /* * If we are allocating a directory data block, we want * to place it in the metadata area. */ if ((ip->i_mode & IFMT) == IFDIR) return (cgmeta(fs, inocg)); /* * Until we fill all the direct and all the first indirect's * blocks, we try to allocate in the data area of the inode's * cylinder group. */ if (lbn < UFS_NDADDR + NINDIR(fs)) return (cgdata(fs, inocg)); /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || prevbn == 0) startcg = inocg + lbn / fs->fs_maxbpg; else startcg = dtog(fs, prevbn) + 1; startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } for (cg = 0; cg < startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } return (0); } /* * Otherwise, we just always try to lay things out contiguously. */ if (dtog(fs, prevbn + fs->fs_frag) >= fs->fs_ncg) return (0); return (prevbn + fs->fs_frag); } /* * Same as above, but for UFS2 */ ufs2_daddr_t ffs_blkpref_ufs2(struct inode *ip, ufs_lbn_t lbn, int indx, ufs2_daddr_t *bap) { struct fs *fs; uint64_t cg, inocg; uint64_t avgbfree, startcg; ufs2_daddr_t pref, prevbn; KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); fs = ITOFS(ip); /* * Allocation of indirect blocks is indicated by passing negative * values in indx: -1 for single indirect, -2 for double indirect, * -3 for triple indirect. As noted below, we attempt to allocate * the first indirect inline with the file data. For all later * indirect blocks, the data is often allocated in other cylinder * groups. However to speed random file access and to speed up * fsck, the filesystem reserves the first fs_metaspace blocks * (typically half of fs_minfree) of the data area of each cylinder * group to hold these later indirect blocks. */ inocg = ino_to_cg(fs, ip->i_number); if (indx < 0) { /* * Our preference for indirect blocks is the zone at the * beginning of the inode's cylinder group data area that * we try to reserve for indirect blocks. */ pref = cgmeta(fs, inocg); /* * If we are allocating the first indirect block, try to * place it immediately following the last direct block. */ if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && ip->i_din2->di_db[UFS_NDADDR - 1] != 0) { pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag; if (dtog(fs, pref) >= fs->fs_ncg) pref = 0; } return (pref); } /* * If we are allocating the first data block in the first indirect * block and the indirect has been allocated in the data block area, * try to place it immediately following the indirect block. */ if (lbn == UFS_NDADDR) { pref = ip->i_din2->di_ib[0]; if (pref != 0 && pref >= cgdata(fs, inocg) && pref < cgbase(fs, inocg + 1)) { if (dtog(fs, pref + fs->fs_frag) >= fs->fs_ncg) return (0); return (pref + fs->fs_frag); } } /* * If we are at the beginning of a file, or we have already allocated * the maximum number of blocks per cylinder group, or we do not * have a block allocated immediately preceding us, then we need * to decide where to start allocating new blocks. */ if (indx == 0) { prevbn = 0; } else { prevbn = bap[indx - 1]; if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, fs->fs_bsize) != 0) prevbn = 0; } if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { /* * If we are allocating a directory data block, we want * to place it in the metadata area. */ if ((ip->i_mode & IFMT) == IFDIR) return (cgmeta(fs, inocg)); /* * Until we fill all the direct and all the first indirect's * blocks, we try to allocate in the data area of the inode's * cylinder group. */ if (lbn < UFS_NDADDR + NINDIR(fs)) return (cgdata(fs, inocg)); /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || prevbn == 0) startcg = inocg + lbn / fs->fs_maxbpg; else startcg = dtog(fs, prevbn) + 1; startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } for (cg = 0; cg < startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } return (0); } /* * Otherwise, we just always try to lay things out contiguously. */ if (dtog(fs, prevbn + fs->fs_frag) >= fs->fs_ncg) return (0); return (prevbn + fs->fs_frag); } /* * Implement the cylinder overflow algorithm. * * The policy implemented by this algorithm is: * 1) allocate the block in its requested cylinder group. * 2) quadratically rehash on the cylinder group number. * 3) brute force search for a free block. * * Must be called with the UFS lock held. Will release the lock on success * and return with it held on failure. */ /*VARARGS5*/ static ufs2_daddr_t ffs_hashalloc(struct inode *ip, uint64_t cg, ufs2_daddr_t pref, int size, /* Search size for data blocks, mode for inodes */ int rsize, /* Real allocated size. */ allocfcn_t *allocator) { struct fs *fs; ufs2_daddr_t result; uint64_t i, icg = cg; mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); #ifdef INVARIANTS if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_hashalloc: allocation on suspended filesystem"); #endif fs = ITOFS(ip); /* * 1: preferred cylinder group */ result = (*allocator)(ip, cg, pref, size, rsize); if (result) return (result); /* * 2: quadratic rehash */ for (i = 1; i < fs->fs_ncg; i *= 2) { cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); } /* * 3: brute force search * Note that we start at i == 2, since 0 was checked initially, * and 1 is always checked in the quadratic rehash. */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); cg++; if (cg == fs->fs_ncg) cg = 0; } return (0); } /* * Determine whether a fragment can be extended. * * Check to see if the necessary fragments are available, and * if they are, allocate them. */ static ufs2_daddr_t ffs_fragextend(struct inode *ip, uint64_t cg, ufs2_daddr_t bprev, int osize, int nsize) { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; int nffree; long bno; int frags, bbase; int i, error; uint8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) return (0); frags = numfrags(fs, nsize); bbase = fragnum(fs, bprev); if (bbase > fragnum(fs, (bprev + frags - 1))) { /* cannot extend across a block boundary */ return (0); } UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { ffs_checkcgintegrity(fs, cg, error); goto fail; } bno = dtogd(fs, bprev); blksfree = cg_blksfree(cgp); for (i = numfrags(fs, osize); i < frags; i++) if (isclr(blksfree, bno + i)) goto fail; /* * the current fragment can be extended * deduct the count on fragment being extended into * increase the count on the remaining fragment (if any) * allocate the extended piece */ for (i = frags; i < fs->fs_frag - bbase; i++) if (isclr(blksfree, bno + i)) break; cgp->cg_frsum[i - numfrags(fs, osize)]--; if (i != frags) cgp->cg_frsum[i - frags]++; for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) { clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree--; nffree++; } UFS_LOCK(ump); fs->fs_cstotal.cs_nffree -= nffree; fs->fs_cs(fs, cg).cs_nffree -= nffree; fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, frags, numfrags(fs, osize)); bdwrite(bp); return (bprev); fail: brelse(bp); UFS_LOCK(ump); return (0); } /* * Determine whether a block can be allocated. * * Check to see if a block of the appropriate size is available, * and if it is, allocate it. */ static ufs2_daddr_t ffs_alloccg(struct inode *ip, uint64_t cg, ufs2_daddr_t bpref, int size, int rsize) { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; ufs1_daddr_t bno; ufs2_daddr_t blkno; int i, allocsiz, error, frags; uint8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) return (0); UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0 || (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { ffs_checkcgintegrity(fs, cg, error); goto fail; } if (size == fs->fs_bsize) { UFS_LOCK(ump); blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } /* * check to see if any fragments are already available * allocsiz is the size which will be allocated, hacking * it down to a smaller size if necessary */ blksfree = cg_blksfree(cgp); frags = numfrags(fs, size); for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) if (cgp->cg_frsum[allocsiz] != 0) break; if (allocsiz == fs->fs_frag) { /* * no fragments were available, so a block will be * allocated, and hacked up */ if (cgp->cg_cs.cs_nbfree == 0) goto fail; UFS_LOCK(ump); blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } KASSERT(size == rsize, ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); if (bno < 0) goto fail; for (i = 0; i < frags; i++) clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree -= frags; cgp->cg_frsum[allocsiz]--; if (frags != allocsiz) cgp->cg_frsum[allocsiz - frags]++; UFS_LOCK(ump); fs->fs_cstotal.cs_nffree -= frags; fs->fs_cs(fs, cg).cs_nffree -= frags; fs->fs_fmod = 1; blkno = cgbase(fs, cg) + bno; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); bdwrite(bp); return (blkno); fail: brelse(bp); UFS_LOCK(ump); return (0); } /* * Allocate a block in a cylinder group. * * This algorithm implements the following policy: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate the next available block on the block rotor for the * specified cylinder group. * Note that this routine only allocates fs_bsize blocks; these * blocks may be fragmented by the routine that allocates them. */ static ufs2_daddr_t ffs_alloccgblk(struct inode *ip, struct buf *bp, ufs2_daddr_t bpref, int size) { struct fs *fs; struct cg *cgp; struct ufsmount *ump; ufs1_daddr_t bno; ufs2_daddr_t blkno; uint8_t *blksfree; int i, cgbpref; ump = ITOUMP(ip); fs = ump->um_fs; mtx_assert(UFS_MTX(ump), MA_OWNED); cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); if (bpref == 0) { bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag; } else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) { /* map bpref to correct zone in this cg */ if (bpref < cgdata(fs, cgbpref)) bpref = cgmeta(fs, cgp->cg_cgx); else bpref = cgdata(fs, cgp->cg_cgx); } /* * if the requested block is available, use it */ bno = dtogd(fs, blknum(fs, bpref)); if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) goto gotit; /* * Take the next available block in this cylinder group. */ bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); if (bno < 0) return (0); /* Update cg_rotor only if allocated from the data zone */ if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx))) cgp->cg_rotor = bno; gotit: blkno = fragstoblks(fs, bno); ffs_clrblock(fs, blksfree, (long)blkno); ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; fs->fs_fmod = 1; blkno = cgbase(fs, cgp->cg_cgx) + bno; /* * If the caller didn't want the whole block free the frags here. */ size = numfrags(fs, size); if (size != fs->fs_frag) { bno = dtogd(fs, blkno); for (i = size; i < fs->fs_frag; i++) setbit(blksfree, bno + i); i = fs->fs_frag - size; cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; fs->fs_fmod = 1; cgp->cg_frsum[i]++; } /* XXX Fixme. */ UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); UFS_LOCK(ump); return (blkno); } /* * Determine whether a cluster can be allocated. * * We do not currently check for optimal rotational layout if there * are multiple choices in the same cylinder group. Instead we just * take the first one that we find following bpref. */ static ufs2_daddr_t ffs_clusteralloc(struct inode *ip, uint64_t cg, ufs2_daddr_t bpref, int len) { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; int i, run, bit, map, got, error; ufs2_daddr_t bno; uint8_t *mapp; int32_t *lp; uint8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; MPASS(cg < fs->fs_ncg); if (fs->fs_maxcluster[cg] < len) return (0); UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { ffs_checkcgintegrity(fs, cg, error); UFS_LOCK(ump); return (0); } /* * Check to see if a cluster of the needed size (or bigger) is * available in this cylinder group. */ lp = &cg_clustersum(cgp)[len]; for (i = len; i <= fs->fs_contigsumsize; i++) if (*lp++ > 0) break; if (i > fs->fs_contigsumsize) { /* * This is the first time looking for a cluster in this * cylinder group. Update the cluster summary information * to reflect the true maximum sized cluster so that * future cluster allocation requests can avoid reading * the cylinder group map only to find no clusters. */ lp = &cg_clustersum(cgp)[len - 1]; for (i = len - 1; i > 0; i--) if (*lp-- > 0) break; UFS_LOCK(ump); fs->fs_maxcluster[cg] = i; brelse(bp); return (0); } /* * Search the cluster map to find a big enough cluster. * We take the first one that we find, even if it is larger * than we need as we prefer to get one close to the previous * block allocation. We do not search before the current * preference point as we do not want to allocate a block * that is allocated before the previous one (as we will * then have to wait for another pass of the elevator * algorithm before it will be read). We prefer to fail and * be recalled to try an allocation in the next cylinder group. */ if (dtog(fs, bpref) != cg) bpref = cgdata(fs, cg); else bpref = blknum(fs, bpref); bpref = fragstoblks(fs, dtogd(fs, bpref)); mapp = &cg_clustersfree(cgp)[bpref / NBBY]; map = *mapp++; bit = 1 << (bpref % NBBY); for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { if ((map & bit) == 0) { run = 0; } else { run++; if (run == len) break; } if ((got & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } if (got >= cgp->cg_nclusterblks) { UFS_LOCK(ump); brelse(bp); return (0); } /* * Allocate the cluster that we have found. */ blksfree = cg_blksfree(cgp); for (i = 1; i <= len; i++) if (!ffs_isblock(fs, blksfree, got - run + i)) panic("ffs_clusteralloc: map mismatch"); bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1); if (dtog(fs, bno) != cg) panic("ffs_clusteralloc: allocated out of group"); len = blkstofrags(fs, len); UFS_LOCK(ump); for (i = 0; i < len; i += fs->fs_frag) if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) panic("ffs_clusteralloc: lost block"); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (bno); } static inline struct buf * getinobuf(struct inode *ip, uint64_t cg, uint32_t cginoblk, int gbflags) { struct fs *fs; fs = ITOFS(ip); return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, gbflags)); } /* * Synchronous inode initialization is needed only when barrier writes do not * work as advertised, and will impose a heavy cost on file creation in a newly * created filesystem. */ static int doasyncinodeinit = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN, &doasyncinodeinit, 0, "Perform inode block initialization using asynchronous writes"); /* * Determine whether an inode can be allocated. * * Check to see if an inode is available, and if it is, * allocate it using the following policy: * 1) allocate the requested inode. * 2) allocate the next available inode after the requested * inode in the specified cylinder group. */ static ufs2_daddr_t ffs_nodealloccg(struct inode *ip, uint64_t cg, ufs2_daddr_t ipref, int mode, int unused) { struct fs *fs; struct cg *cgp; struct buf *bp, *ibp; struct ufsmount *ump; uint8_t *inosused, *loc; struct ufs2_dinode *dp2; int error, start, len, i; uint32_t old_initediblk; ump = ITOUMP(ip); fs = ump->um_fs; check_nifree: if (fs->fs_cs(fs, cg).cs_nifree == 0) return (0); UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { ffs_checkcgintegrity(fs, cg, error); UFS_LOCK(ump); return (0); } restart: if (cgp->cg_cs.cs_nifree == 0) { brelse(bp); UFS_LOCK(ump); return (0); } inosused = cg_inosused(cgp); if (ipref) { ipref %= fs->fs_ipg; if (isclr(inosused, ipref)) goto gotit; } start = cgp->cg_irotor / NBBY; len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); loc = memcchr(&inosused[start], 0xff, len); if (loc == NULL) { len = start + 1; start = 0; loc = memcchr(&inosused[start], 0xff, len); if (loc == NULL) { printf("cg = %ju, irotor = %ld, fs = %s\n", (intmax_t)cg, (long)cgp->cg_irotor, fs->fs_fsmnt); panic("ffs_nodealloccg: map corrupted"); /* NOTREACHED */ } } ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; gotit: /* * Check to see if we need to initialize more inodes. */ if (fs->fs_magic == FS_UFS2_MAGIC && ipref + INOPB(fs) > cgp->cg_initediblk && cgp->cg_initediblk < cgp->cg_niblk) { old_initediblk = cgp->cg_initediblk; /* * Free the cylinder group lock before writing the * initialized inode block. Entering the * babarrierwrite() with the cylinder group lock * causes lock order violation between the lock and * snaplk. * * Another thread can decide to initialize the same * inode block, but whichever thread first gets the * cylinder group lock after writing the newly * allocated inode block will update it and the other * will realize that it has lost and leave the * cylinder group unchanged. */ ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); brelse(bp); if (ibp == NULL) { /* * The inode block buffer is already owned by * another thread, which must initialize it. * Wait on the buffer to allow another thread * to finish the updates, with dropped cg * buffer lock, then retry. */ ibp = getinobuf(ip, cg, old_initediblk, 0); brelse(ibp); UFS_LOCK(ump); goto check_nifree; } bzero(ibp->b_data, (int)fs->fs_bsize); dp2 = (struct ufs2_dinode *)(ibp->b_data); for (i = 0; i < INOPB(fs); i++) { while (dp2->di_gen == 0) dp2->di_gen = arc4random(); dp2++; } /* * Rather than adding a soft updates dependency to ensure * that the new inode block is written before it is claimed * by the cylinder group map, we just do a barrier write * here. The barrier write will ensure that the inode block * gets written before the updated cylinder group map can be * written. The barrier write should only slow down bulk * loading of newly created filesystems. */ if (doasyncinodeinit) babarrierwrite(ibp); else bwrite(ibp); /* * After the inode block is written, try to update the * cg initediblk pointer. If another thread beat us * to it, then leave it unchanged as the other thread * has already set it correctly. */ error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp); UFS_LOCK(ump); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (error != 0) return (error); if (cgp->cg_initediblk == old_initediblk) cgp->cg_initediblk += INOPB(fs); goto restart; } cgp->cg_irotor = ipref; UFS_LOCK(ump); ACTIVECLEAR(fs, cg); setbit(inosused, ipref); cgp->cg_cs.cs_nifree--; fs->fs_cstotal.cs_nifree--; fs->fs_cs(fs, cg).cs_nifree--; fs->fs_fmod = 1; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir++; fs->fs_cstotal.cs_ndir++; fs->fs_cs(fs, cg).cs_ndir++; } UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); bdwrite(bp); return ((ino_t)(cg * fs->fs_ipg + ipref)); } /* * Free a block or fragment. * * The specified block or fragment is placed back in the * free map. If a fragment is deallocated, a possible * block reassembly is checked. */ static void ffs_blkfree_cg(struct ufsmount *ump, struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno, long size, ino_t inum, struct workhead *dephd) { struct mount *mp; struct cg *cgp; struct buf *bp; daddr_t dbn; ufs1_daddr_t fragno, cgbno; int i, blk, frags, bbase, error; uint64_t cg; uint8_t *blksfree; struct cdev *dev; cg = dtog(fs, bno); if (devvp->v_type == VREG) { /* devvp is a snapshot */ MPASS(devvp->v_mount->mnt_data == ump); dev = ump->um_devvp->v_rdev; } else if (devvp->v_type == VCHR) { /* * devvp is a normal disk device * XXXKIB: devvp is not locked there, v_rdev access depends on * busy mount, which prevents mntfs devvp from reclamation. */ dev = devvp->v_rdev; } else return; #ifdef INVARIANTS if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0 || fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n", devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_blkfree_cg: invalid size"); } #endif if ((uint64_t)bno >= fs->fs_size) { printf("bad block %jd, ino %ju\n", (intmax_t)bno, (intmax_t)inum); ffs_fserr(fs, inum, "bad block"); return; } if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { if (!MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) return; /* * Would like to just downgrade to read-only. Until that * capability is available, just toss the cylinder group * update and mark the filesystem as needing to run fsck. */ fs->fs_flags |= FS_NEEDSFSCK; if (devvp->v_type == VREG) dbn = fragstoblks(fs, cgtod(fs, cg)); else dbn = fsbtodb(fs, cgtod(fs, cg)); error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); KASSERT(error == 0, ("getblkx failed")); softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, numfrags(fs, size), dephd, true); bp->b_flags |= B_RELBUF | B_NOCACHE; bp->b_flags &= ~B_CACHE; bawrite(bp); return; } cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp); UFS_LOCK(ump); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); if (!ffs_isfreeblock(fs, blksfree, fragno)) { if (devvp->v_type == VREG) { UFS_UNLOCK(ump); /* devvp is a snapshot */ brelse(bp); return; } printf("dev = %s, block = %jd, fs = %s\n", devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); panic("ffs_blkfree_cg: freeing free block"); } ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } else { bbase = cgbno - fragnum(fs, cgbno); /* * decrement the counts associated with the old frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* * deallocate the fragment */ frags = numfrags(fs, size); for (i = 0; i < frags; i++) { if (isset(blksfree, cgbno + i)) { printf("dev = %s, block = %jd, fs = %s\n", devtoname(dev), (intmax_t)(bno + i), fs->fs_fsmnt); panic("ffs_blkfree_cg: freeing free frag"); } setbit(blksfree, cgbno + i); } cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; /* * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } } fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); mp = UFSTOVFS(ump); if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR) softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, numfrags(fs, size), dephd, false); bdwrite(bp); } /* * Structures and routines associated with trim management. * * The following requests are passed to trim_lookup to indicate * the actions that should be taken. */ #define NEW 1 /* if found, error else allocate and hash it */ #define OLD 2 /* if not found, error, else return it */ #define REPLACE 3 /* if not found, error else unhash and reallocate it */ #define DONE 4 /* if not found, error else unhash and return it */ #define SINGLE 5 /* don't look up, just allocate it and don't hash it */ MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures"); #define TRIMLIST_HASH(ump, key) \ (&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize]) /* * These structures describe each of the block free requests aggregated * together to make up a trim request. */ struct trim_blkreq { TAILQ_ENTRY(trim_blkreq) blkreqlist; ufs2_daddr_t bno; long size; struct workhead *pdephd; struct workhead dephd; }; /* * Description of a trim request. */ struct ffs_blkfree_trim_params { TAILQ_HEAD(, trim_blkreq) blklist; LIST_ENTRY(ffs_blkfree_trim_params) hashlist; struct task task; struct ufsmount *ump; struct vnode *devvp; ino_t inum; ufs2_daddr_t bno; long size; long key; }; static void ffs_blkfree_trim_completed(struct buf *); static void ffs_blkfree_trim_task(void *ctx, int pending __unused); static struct ffs_blkfree_trim_params *trim_lookup(struct ufsmount *, struct vnode *, ufs2_daddr_t, long, ino_t, uint64_t, int); static void ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *); /* * Called on trim completion to start a task to free the associated block(s). */ static void ffs_blkfree_trim_completed(struct buf *bp) { struct ffs_blkfree_trim_params *tp; tp = bp->b_fsprivate1; free(bp, M_TRIM); TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); } /* * Trim completion task that free associated block(s). */ static void ffs_blkfree_trim_task(void *ctx, int pending) { struct ffs_blkfree_trim_params *tp; struct trim_blkreq *blkelm; struct ufsmount *ump; tp = ctx; ump = tp->ump; while ((blkelm = TAILQ_FIRST(&tp->blklist)) != NULL) { ffs_blkfree_cg(ump, ump->um_fs, tp->devvp, blkelm->bno, blkelm->size, tp->inum, blkelm->pdephd); TAILQ_REMOVE(&tp->blklist, blkelm, blkreqlist); free(blkelm, M_TRIM); } vn_finished_secondary_write(UFSTOVFS(ump)); UFS_LOCK(ump); ump->um_trim_inflight -= 1; ump->um_trim_inflight_blks -= numfrags(ump->um_fs, tp->size); UFS_UNLOCK(ump); free(tp, M_TRIM); } /* * Lookup a trim request by inode number. * Allocate if requested (NEW, REPLACE, SINGLE). */ static struct ffs_blkfree_trim_params * trim_lookup(struct ufsmount *ump, struct vnode *devvp, ufs2_daddr_t bno, long size, ino_t inum, uint64_t key, int alloctype) { struct trimlist_hashhead *tphashhead; struct ffs_blkfree_trim_params *tp, *ntp; ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK); if (alloctype != SINGLE) { KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key")); UFS_LOCK(ump); tphashhead = TRIMLIST_HASH(ump, key); LIST_FOREACH(tp, tphashhead, hashlist) if (key == tp->key) break; } switch (alloctype) { case NEW: KASSERT(tp == NULL, ("trim_lookup: found trim")); break; case OLD: KASSERT(tp != NULL, ("trim_lookup: missing call to ffs_blkrelease_start()")); UFS_UNLOCK(ump); free(ntp, M_TRIM); return (tp); case REPLACE: KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim")); LIST_REMOVE(tp, hashlist); /* tp will be freed by caller */ break; case DONE: KASSERT(tp != NULL, ("trim_lookup: missing DONE trim")); LIST_REMOVE(tp, hashlist); UFS_UNLOCK(ump); free(ntp, M_TRIM); return (tp); } TAILQ_INIT(&ntp->blklist); ntp->ump = ump; ntp->devvp = devvp; ntp->bno = bno; ntp->size = size; ntp->inum = inum; ntp->key = key; if (alloctype != SINGLE) { LIST_INSERT_HEAD(tphashhead, ntp, hashlist); UFS_UNLOCK(ump); } return (ntp); } /* * Dispatch a trim request. */ static void ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *tp) { struct ufsmount *ump; struct mount *mp; struct buf *bp; /* * Postpone the set of the free bit in the cg bitmap until the * BIO_DELETE is completed. Otherwise, due to disk queue * reordering, TRIM might be issued after we reuse the block * and write some new data into it. */ ump = tp->ump; bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); bp->b_iocmd = BIO_DELETE; bp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno)); bp->b_iodone = ffs_blkfree_trim_completed; bp->b_bcount = tp->size; bp->b_fsprivate1 = tp; UFS_LOCK(ump); ump->um_trim_total += 1; ump->um_trim_inflight += 1; ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size); ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size); UFS_UNLOCK(ump); mp = UFSTOVFS(ump); vn_start_secondary_write(NULL, &mp, 0); g_vfs_strategy(ump->um_bo, bp); } /* * Allocate a new key to use to identify a range of blocks. */ uint64_t ffs_blkrelease_start(struct ufsmount *ump, struct vnode *devvp, ino_t inum) { static u_long masterkey; uint64_t key; if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) return (SINGLETON_KEY); do { key = atomic_fetchadd_long(&masterkey, 1); } while (key < FIRST_VALID_KEY); (void) trim_lookup(ump, devvp, 0, 0, inum, key, NEW); return (key); } /* * Deallocate a key that has been used to identify a range of blocks. */ void ffs_blkrelease_finish(struct ufsmount *ump, uint64_t key) { struct ffs_blkfree_trim_params *tp; if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) return; /* * If the vfs.ffs.dotrimcons sysctl option is enabled while * a file deletion is active, specifically after a call * to ffs_blkrelease_start() but before the call to * ffs_blkrelease_finish(), ffs_blkrelease_start() will * have handed out SINGLETON_KEY rather than starting a * collection sequence. Thus if we get a SINGLETON_KEY * passed to ffs_blkrelease_finish(), we just return rather * than trying to finish the nonexistent sequence. */ if (key == SINGLETON_KEY) { #ifdef INVARIANTS printf("%s: vfs.ffs.dotrimcons enabled on active filesystem\n", ump->um_mountp->mnt_stat.f_mntonname); #endif return; } /* * We are done with sending blocks using this key. Look up the key * using the DONE alloctype (in tp) to request that it be unhashed * as we will not be adding to it. If the key has never been used, * tp->size will be zero, so we can just free tp. Otherwise the call * to ffs_blkfree_sendtrim(tp) causes the block range described by * tp to be issued (and then tp to be freed). */ tp = trim_lookup(ump, NULL, 0, 0, 0, key, DONE); if (tp->size == 0) free(tp, M_TRIM); else ffs_blkfree_sendtrim(tp); } /* * Setup to free a block or fragment. * * Check for snapshots that might want to claim the block. * If trims are requested, prepare a trim request. Attempt to * aggregate consecutive blocks into a single trim request. */ void ffs_blkfree(struct ufsmount *ump, struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno, long size, ino_t inum, __enum_uint8(vtype) vtype, struct workhead *dephd, uint64_t key) { struct ffs_blkfree_trim_params *tp, *ntp; struct trim_blkreq *blkelm; /* * Check to see if a snapshot wants to claim the block. * Check that devvp is a normal disk device, not a snapshot, * it has a snapshot(s) associated with it, and one of the * snapshots wants to claim the block. */ if (devvp->v_type == VCHR && (devvp->v_vflag & VV_COPYONWRITE) && ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { return; } /* * Nothing to delay if TRIM is not required for this block or TRIM * is disabled or the operation is performed on a snapshot. */ if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) || devvp->v_type == VREG) { ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); return; } blkelm = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK); blkelm->bno = bno; blkelm->size = size; if (dephd == NULL) { blkelm->pdephd = NULL; } else { LIST_INIT(&blkelm->dephd); LIST_SWAP(dephd, &blkelm->dephd, worklist, wk_list); blkelm->pdephd = &blkelm->dephd; } if (key == SINGLETON_KEY) { /* * Just a single non-contiguous piece. Use the SINGLE * alloctype to return a trim request that will not be * hashed for future lookup. */ tp = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE); TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); ffs_blkfree_sendtrim(tp); return; } /* * The callers of this function are not tracking whether or not * the blocks are contiguous. They are just saying that they * are freeing a set of blocks. It is this code that determines * the pieces of that range that are actually contiguous. * * Calling ffs_blkrelease_start() will have created an entry * that we will use. */ tp = trim_lookup(ump, devvp, bno, size, inum, key, OLD); if (tp->size == 0) { /* * First block of a potential range, set block and size * for the trim block. */ tp->bno = bno; tp->size = size; TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); return; } /* * If this block is a continuation of the range (either * follows at the end or preceeds in the front) then we * add it to the front or back of the list and return. * * If it is not a continuation of the trim that we were * building, using the REPLACE alloctype, we request that * the old trim request (still in tp) be unhashed and a * new range started (in ntp). The ffs_blkfree_sendtrim(tp) * call causes the block range described by tp to be issued * (and then tp to be freed). */ if (bno + numfrags(fs, size) == tp->bno) { TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); tp->bno = bno; tp->size += size; return; } else if (bno == tp->bno + numfrags(fs, tp->size)) { TAILQ_INSERT_TAIL(&tp->blklist, blkelm, blkreqlist); tp->size += size; return; } ntp = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE); TAILQ_INSERT_HEAD(&ntp->blklist, blkelm, blkreqlist); ffs_blkfree_sendtrim(tp); } #ifdef INVARIANTS /* * Verify allocation of a block or fragment. * Return 1 if block or fragment is free. */ static int ffs_checkfreeblk(struct inode *ip, ufs2_daddr_t bno, long size) { struct fs *fs; struct cg *cgp; struct buf *bp; ufs1_daddr_t cgbno; int i, frags, blkalloced; uint8_t *blksfree; fs = ITOFS(ip); if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("bsize = %ld, size = %ld, fs = %s\n", (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_checkfreeblk: bad size"); } if ((uint64_t)bno >= fs->fs_size) panic("ffs_checkfreeblk: too big block %jd", (intmax_t)bno); if (ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), 0, &bp, &cgp) != 0) return (0); blksfree = cg_blksfree(cgp); cgbno = dtogd(fs, bno); if (size == fs->fs_bsize) { blkalloced = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); } else { frags = numfrags(fs, size); for (blkalloced = 0, i = 0; i < frags; i++) if (isset(blksfree, cgbno + i)) blkalloced++; if (blkalloced != 0 && blkalloced != frags) panic("ffs_checkfreeblk: partially free fragment"); } brelse(bp); return (blkalloced == 0); } #endif /* INVARIANTS */ /* * Free an inode. */ int ffs_vfree(struct vnode *pvp, ino_t ino, int mode) { struct ufsmount *ump; if (DOINGSOFTDEP(pvp)) { softdep_freefile(pvp, ino, mode); return (0); } ump = VFSTOUFS(pvp->v_mount); return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); } /* * Do the actual free operation. * The specified inode is placed back in the free map. */ int ffs_freefile(struct ufsmount *ump, struct fs *fs, struct vnode *devvp, ino_t ino, int mode, struct workhead *wkhd) { struct cg *cgp; struct buf *bp; daddr_t dbn; int error; uint64_t cg; uint8_t *inosused; struct cdev *dev; ino_t cgino; cg = ino_to_cg(fs, ino); if (devvp->v_type == VREG) { /* devvp is a snapshot */ MPASS(devvp->v_mount->mnt_data == ump); dev = ump->um_devvp->v_rdev; } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ dev = devvp->v_rdev; } else { bp = NULL; return (0); } if (ino >= fs->fs_ipg * fs->fs_ncg) panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s", devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { if (!MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) return (error); /* * Would like to just downgrade to read-only. Until that * capability is available, just toss the cylinder group * update and mark the filesystem as needing to run fsck. */ fs->fs_flags |= FS_NEEDSFSCK; if (devvp->v_type == VREG) dbn = fragstoblks(fs, cgtod(fs, cg)); else dbn = fsbtodb(fs, cgtod(fs, cg)); error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); KASSERT(error == 0, ("getblkx failed")); softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd, true); bp->b_flags |= B_RELBUF | B_NOCACHE; bp->b_flags &= ~B_CACHE; bawrite(bp); return (error); } inosused = cg_inosused(cgp); cgino = ino % fs->fs_ipg; if (isclr(inosused, cgino)) { printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); if (fs->fs_ronly == 0) panic("ffs_freefile: freeing free inode"); } clrbit(inosused, cgino); if (cgino < cgp->cg_irotor) cgp->cg_irotor = cgino; cgp->cg_cs.cs_nifree++; UFS_LOCK(ump); fs->fs_cstotal.cs_nifree++; fs->fs_cs(fs, cg).cs_nifree++; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir--; fs->fs_cstotal.cs_ndir--; fs->fs_cs(fs, cg).cs_ndir--; } fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR) softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd, false); bdwrite(bp); return (0); } /* * Check to see if a file is free. * Used to check for allocated files in snapshots. * Return 1 if file is free. */ int ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino) { struct cg *cgp; struct buf *bp; int ret, error; uint64_t cg; uint8_t *inosused; cg = ino_to_cg(fs, ino); if ((devvp->v_type != VREG) && (devvp->v_type != VCHR)) return (1); if (ino >= fs->fs_ipg * fs->fs_ncg) return (1); if ((error = ffs_getcg(fs, devvp, cg, 0, &bp, &cgp)) != 0) return (1); inosused = cg_inosused(cgp); ino %= fs->fs_ipg; ret = isclr(inosused, ino); brelse(bp); return (ret); } /* * Find a block of the specified size in the specified cylinder group. * * It is a panic if a request is made to find a block if none are * available. */ static ufs1_daddr_t ffs_mapsearch(struct fs *fs, struct cg *cgp, ufs2_daddr_t bpref, int allocsiz) { ufs1_daddr_t bno; int start, len, loc, i; int blk, field, subfield, pos; uint8_t *blksfree; /* * find the fragment by searching through the free block * map for an appropriate bit pattern */ if (bpref) start = dtogd(fs, bpref) / NBBY; else start = cgp->cg_frotor / NBBY; blksfree = cg_blksfree(cgp); len = howmany(fs->fs_fpg, NBBY) - start; loc = scanc((uint64_t)len, (uint8_t *)&blksfree[start], fragtbl[fs->fs_frag], (uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { len = start + 1; start = 0; loc = scanc((uint64_t)len, (uint8_t *)&blksfree[0], fragtbl[fs->fs_frag], (uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { printf("start = %d, len = %d, fs = %s\n", start, len, fs->fs_fsmnt); panic("ffs_alloccg: map corrupted"); /* NOTREACHED */ } } bno = (start + len - loc) * NBBY; cgp->cg_frotor = bno; /* * found the byte in the map * sift through the bits to find the selected frag */ for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { blk = blkmap(fs, blksfree, bno); blk <<= 1; field = around[allocsiz]; subfield = inside[allocsiz]; for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { if ((blk & field) == subfield) return (bno + pos); field <<= 1; subfield <<= 1; } } printf("bno = %ju, fs = %s\n", (intmax_t)bno, fs->fs_fsmnt); panic("ffs_alloccg: block not in map"); return (-1); } /* * Fetch and verify a cylinder group. */ int ffs_getcg(struct fs *fs, struct vnode *devvp, uint64_t cg, int flags, struct buf **bpp, struct cg **cgpp) { struct buf *bp; struct cg *cgp; struct mount *mp; const struct statfs *sfs; daddr_t blkno; int error; *bpp = NULL; *cgpp = NULL; if ((fs->fs_metackhash & CK_CYLGRP) != 0) flags |= GB_CKHASH; if (devvp->v_type == VCHR) { blkno = fsbtodb(fs, cgtod(fs, cg)); mp = devvp->v_rdev->si_mountpt; } else { blkno = fragstoblks(fs, cgtod(fs, cg)); mp = devvp->v_mount; } error = breadn_flags(devvp, blkno, blkno, (int)fs->fs_cgsize, NULL, NULL, 0, NOCRED, flags, ffs_ckhash_cg, &bp); if (error != 0) return (error); cgp = (struct cg *)bp->b_data; if ((fs->fs_metackhash & CK_CYLGRP) != 0 && (bp->b_flags & B_CKHASH) != 0 && cgp->cg_ckhash != bp->b_ckhash) { if (ppsratecheck(&VFSTOUFS(mp)->um_last_integritymsg, &VFSTOUFS(mp)->um_secs_integritymsg, 1)) { sfs = &mp->mnt_stat; printf("UFS %s%s (%s) cylinder checkhash failed: " "cg %ju, cgp: 0x%x != bp: 0x%jx\n", devvp->v_type == VCHR ? "" : "snapshot of ", sfs->f_mntfromname, sfs->f_mntonname, (intmax_t)cg, cgp->cg_ckhash, (uintmax_t)bp->b_ckhash); } bp->b_flags &= ~B_CKHASH; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (EINTEGRITY); } if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) { if (ppsratecheck(&VFSTOUFS(mp)->um_last_integritymsg, &VFSTOUFS(mp)->um_secs_integritymsg, 1)) { sfs = &mp->mnt_stat; printf("UFS %s%s (%s)", devvp->v_type == VCHR ? "" : "snapshot of ", sfs->f_mntfromname, sfs->f_mntonname); if (!cg_chkmagic(cgp)) printf(" cg %ju: bad magic number 0x%x should " "be 0x%x\n", (intmax_t)cg, cgp->cg_magic, CG_MAGIC); else printf(": wrong cylinder group cg %ju != " "cgx %u\n", (intmax_t)cg, cgp->cg_cgx); } bp->b_flags &= ~B_CKHASH; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (EINTEGRITY); } bp->b_flags &= ~B_CKHASH; bp->b_xflags |= BX_BKGRDWRITE; /* * If we are using check hashes on the cylinder group then we want * to limit changing the cylinder group time to when we are actually * going to write it to disk so that its check hash remains correct * in memory. If the CK_CYLGRP flag is set the time is updated in * ffs_bufwrite() as the buffer is queued for writing. Otherwise we * update the time here as we have done historically. */ if ((fs->fs_metackhash & CK_CYLGRP) != 0) bp->b_xflags |= BX_CYLGRP; else cgp->cg_old_time = cgp->cg_time = time_second; *bpp = bp; *cgpp = cgp; return (0); } static void ffs_ckhash_cg(struct buf *bp) { uint32_t ckhash; struct cg *cgp; cgp = (struct cg *)bp->b_data; ckhash = cgp->cg_ckhash; cgp->cg_ckhash = 0; bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); cgp->cg_ckhash = ckhash; } /* * Called when a cylinder group read has failed. If an integrity check * is the cause of failure then the cylinder group will not be usable * until the filesystem has been unmounted and fsck has been run to * repair it. To avoid future attempts to allocate resources from the * cylinder group, its available resources are set to zero in the * superblock summary information. Since it will appear to have no * resources available, no further calls will be made to allocate * resources from it. When resources are freed to the cylinder group * the resource free routines will find the cylinder group unusable so * the resource will simply be discarded and thus will not show up in * the superblock summary information until they are recovered by fsck. */ static void ffs_checkcgintegrity(struct fs *fs, uint64_t cg, int error) { if (error != EINTEGRITY) return; fs->fs_cstotal.cs_nffree -= fs->fs_cs(fs, cg).cs_nffree; fs->fs_cs(fs, cg).cs_nffree = 0; fs->fs_cstotal.cs_nbfree -= fs->fs_cs(fs, cg).cs_nbfree; fs->fs_cs(fs, cg).cs_nbfree = 0; fs->fs_cstotal.cs_nifree -= fs->fs_cs(fs, cg).cs_nifree; fs->fs_cs(fs, cg).cs_nifree = 0; fs->fs_maxcluster[cg] = 0; fs->fs_flags |= FS_NEEDSFSCK; fs->fs_fmod = 1; } /* * Fserr prints the name of a filesystem with an error diagnostic. * * The form of the error message is: * fs: error message */ void ffs_fserr(struct fs *fs, ino_t inum, char *cp) { struct thread *td = curthread; /* XXX */ struct proc *p = td->td_proc; log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n", p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum, fs->fs_fsmnt, cp); } /* * This function provides the capability for the fsck program to * update an active filesystem. Sixteen operations are provided: * * adjrefcnt(inode, amt) - adjusts the reference count on the * specified inode by the specified amount. Under normal * operation the count should always go down. Decrementing * the count to zero will cause the inode to be freed. * adjblkcnt(inode, amt) - adjust the number of blocks used by the * inode by the specified amount. * adjdepth(inode, amt) - adjust the depth of the specified directory * inode by the specified amount. * setsize(inode, size) - set the size of the inode to the * specified size. * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - * adjust the superblock summary. * freedirs(inode, count) - directory inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freefiles(inode, count) - file inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] * are marked as free. Blocks should never have to be marked * as in use. * setflags(flags, set/clear) - the fs_flags field has the specified * flags set (second parameter +1) or cleared (second parameter -1). * setcwd(dirinode) - set the current directory to dirinode in the * filesystem associated with the snapshot. * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".." * in the current directory is oldvalue then change it to newvalue. * unlink(nameptr, oldvalue) - Verify that the inode number associated * with nameptr in the current directory is oldvalue then unlink it. */ static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR | CTLTYPE_STRUCT | CTLFLAG_NEEDGIANT, 0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust Inode Used Blocks Count"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_DEPTH, adjdepth, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust Directory Inode Depth"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Set the inode size"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of directories"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of free blocks"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of free inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of free frags"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of free clusters"); static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Free Range of Directory Inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Free Range of File Inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Free Range of Blocks"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Change Filesystem Flags"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Set Current Working Directory"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Change Value of .. Entry"); static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Unlink a Duplicate Name"); #ifdef DIAGNOSTIC static int fsckcmds = 0; SYSCTL_INT(_debug, OID_AUTO, ffs_fsckcmds, CTLFLAG_RW, &fsckcmds, 0, "print out fsck_ffs-based filesystem update commands"); #endif /* DIAGNOSTIC */ static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct fsck_cmd cmd; struct ufsmount *ump; struct vnode *vp, *dvp, *fdvp; struct inode *ip, *dp; struct mount *mp; struct fs *fs; struct pwd *pwd; ufs2_daddr_t blkno; long blkcnt, blksize; uint64_t key; struct file *fp; cap_rights_t rights; int filetype, error; if (req->newptr == NULL || req->newlen > sizeof(cmd)) return (EBADRPC); if ((error = SYSCTL_IN(req, &cmd, sizeof(cmd))) != 0) return (error); if (cmd.version != FFS_CMD_VERSION) return (ERPCMISMATCH); if ((error = getvnode(td, cmd.handle, cap_rights_init_one(&rights, CAP_FSCK), &fp)) != 0) return (error); vp = fp->f_vnode; if (vp->v_type != VREG && vp->v_type != VDIR) { fdrop(fp, td); return (EINVAL); } vn_start_write(vp, &mp, V_WAIT); if (mp == NULL || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { vn_finished_write(mp); fdrop(fp, td); return (EINVAL); } ump = VFSTOUFS(mp); if (mp->mnt_flag & MNT_RDONLY) { vn_finished_write(mp); fdrop(fp, td); return (EROFS); } fs = ump->um_fs; filetype = IFREG; switch (oidp->oid_number) { case FFS_SET_FLAGS: #ifdef DIAGNOSTIC if (fsckcmds) printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, cmd.size > 0 ? "set" : "clear"); #endif /* DIAGNOSTIC */ if (cmd.size > 0) fs->fs_flags |= (long)cmd.value; else fs->fs_flags &= ~(long)cmd.value; break; case FFS_ADJ_REFCNT: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust inode %jd link count by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); ip->i_nlink += cmd.size; DIP_SET_NLINK(ip, ip->i_nlink); ip->i_effnlink += cmd.size; UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); error = ffs_update(vp, 1); if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); vput(vp); break; case FFS_ADJ_BLKCNT: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust inode %jd block count by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); error = ffs_update(vp, 1); vput(vp); break; case FFS_ADJ_DEPTH: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust directory inode %jd depth by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; if (vp->v_type != VDIR) { vput(vp); error = ENOTDIR; break; } ip = VTOI(vp); DIP_SET(ip, i_dirdepth, DIP(ip, i_dirdepth) + cmd.size); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); error = ffs_update(vp, 1); vput(vp); break; case FFS_SET_SIZE: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: set inode %jd size to %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); DIP_SET(ip, i_size, cmd.size); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_MODIFIED); error = ffs_update(vp, 1); vput(vp); break; case FFS_DIR_FREE: filetype = IFDIR; /* fall through */ case FFS_FILE_FREE: #ifdef DIAGNOSTIC if (fsckcmds) { if (cmd.size == 1) printf("%s: free %s inode %ju\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? "directory" : "file", (uintmax_t)cmd.value); else printf("%s: free %s inodes %ju-%ju\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? "directory" : "file", (uintmax_t)cmd.value, (uintmax_t)(cmd.value + cmd.size - 1)); } #endif /* DIAGNOSTIC */ while (cmd.size > 0) { if ((error = ffs_freefile(ump, fs, ump->um_devvp, cmd.value, filetype, NULL))) break; cmd.size -= 1; cmd.value += 1; } break; case FFS_BLK_FREE: #ifdef DIAGNOSTIC if (fsckcmds) { if (cmd.size == 1) printf("%s: free block %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); else printf("%s: free blocks %jd-%jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.value + cmd.size - 1); } #endif /* DIAGNOSTIC */ blkno = cmd.value; blkcnt = cmd.size; blksize = fs->fs_frag - (blkno % fs->fs_frag); key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO); while (blkcnt > 0) { if (blkcnt < blksize) blksize = blkcnt; ffs_blkfree(ump, fs, ump->um_devvp, blkno, blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL, key); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; } ffs_blkrelease_finish(ump, key); break; /* * Adjust superblock summaries. fsck(8) is expected to * submit deltas when necessary. */ case FFS_ADJ_NDIR: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of directories by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_ndir += cmd.value; break; case FFS_ADJ_NBFREE: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of free blocks by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_nbfree += cmd.value; break; case FFS_ADJ_NIFREE: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of free inodes by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_nifree += cmd.value; break; case FFS_ADJ_NFFREE: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of free frags by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_nffree += cmd.value; break; case FFS_ADJ_NUMCLUSTERS: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of free clusters by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_numclusters += cmd.value; break; case FFS_SET_CWD: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: set current directory to inode %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp))) break; AUDIT_ARG_VNODE1(vp); if ((error = change_dir(vp, td)) != 0) { vput(vp); break; } VOP_UNLOCK(vp); pwd_chdir(td, vp); break; case FFS_SET_DOTDOT: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: change .. in cwd from %jd to %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ /* * First we have to get and lock the parent directory * to which ".." points. */ error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp); if (error) break; /* * Now we get and lock the child directory containing "..". */ pwd = pwd_hold(td); dvp = pwd->pwd_cdir; if ((error = vget(dvp, LK_EXCLUSIVE)) != 0) { vput(fdvp); pwd_drop(pwd); break; } dp = VTOI(dvp); SET_I_OFFSET(dp, 12); /* XXX mastertemplate.dot_reclen */ error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, DT_DIR, 0); cache_purge(fdvp); cache_purge(dvp); vput(dvp); vput(fdvp); pwd_drop(pwd); break; case FFS_UNLINK: #ifdef DIAGNOSTIC if (fsckcmds) { char buf[32]; if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL)) strncpy(buf, "Name_too_long", 32); printf("%s: unlink %s (inode %jd)\n", mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ /* * kern_funlinkat will do its own start/finish writes and * they do not nest, so drop ours here. Setting mp == NULL * indicates that vn_finished_write is not needed down below. */ vn_finished_write(mp); mp = NULL; error = kern_funlinkat(td, AT_FDCWD, (char *)(intptr_t)cmd.value, FD_NONE, UIO_USERSPACE, 0, (ino_t)cmd.size); break; default: #ifdef DIAGNOSTIC if (fsckcmds) { printf("Invalid request %d from fsck\n", oidp->oid_number); } #endif /* DIAGNOSTIC */ error = EINVAL; break; } fdrop(fp, td); vn_finished_write(mp); return (error); } diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h index 56e2fdd6ce51..fc55267eca0c 100644 --- a/sys/ufs/ffs/ffs_extern.h +++ b/sys/ufs/ffs/ffs_extern.h @@ -1,245 +1,246 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_extern.h 8.6 (Berkeley) 3/30/95 */ #ifndef _UFS_FFS_EXTERN_H #define _UFS_FFS_EXTERN_H #ifndef _KERNEL #error "No user-serving parts inside" #else struct buf; struct cg; struct fid; struct fs; struct inode; struct malloc_type; struct mount; struct thread; struct sockaddr; struct statfs; struct ucred; struct vnode; struct vop_fsync_args; struct vop_reallocblks_args; struct workhead; int ffs_alloc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, int, int, struct ucred *, ufs2_daddr_t *); int ffs_balloc_ufs1(struct vnode *a_vp, off_t a_startoffset, int a_size, struct ucred *a_cred, int a_flags, struct buf **a_bpp); int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_startoffset, int a_size, struct ucred *a_cred, int a_flags, struct buf **a_bpp); void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t, __enum_uint8(vtype), struct workhead *, uint64_t); ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *); ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *); void ffs_blkrelease_finish(struct ufsmount *, uint64_t); uint64_t ffs_blkrelease_start(struct ufsmount *, struct vnode *, ino_t); uint32_t ffs_calc_sbhash(struct fs *); int ffs_checkfreefile(struct fs *, struct vnode *, ino_t); void ffs_clrblock(struct fs *, uint8_t *, ufs1_daddr_t); void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int); void ffs_bdflush(struct bufobj *, struct buf *); int ffs_copyonwrite(struct vnode *, struct buf *); int ffs_flushfiles(struct mount *, int, struct thread *); void ffs_fragacct(struct fs *, int, int32_t [], int); int ffs_freefile(struct ufsmount *, struct fs *, struct vnode *, ino_t, int, struct workhead *); void ffs_fserr(struct fs *, ino_t, char *); int ffs_getcg(struct fs *, struct vnode *, uint64_t, int, struct buf **, struct cg **); int ffs_inotovp(struct mount *, ino_t, uint64_t, int, struct vnode **, int); int ffs_isblock(struct fs *, uint8_t *, ufs1_daddr_t); int ffs_isfreeblock(struct fs *, uint8_t *, ufs1_daddr_t); void ffs_oldfscompat_write(struct fs *); +bool ffs_oldfscompat_inode_read(struct fs *, union dinodep, time_t); int ffs_own_mount(const struct mount *mp); int ffs_sbsearch(void *, struct fs **, int, struct malloc_type *, int (*)(void *, off_t, void **, int)); int ffs_reallocblks(struct vop_reallocblks_args *); int ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t, ufs2_daddr_t, int, int, int, struct ucred *, struct buf **); int ffs_reload(struct mount *, int); int ffs_sbget(void *, struct fs **, off_t, int, struct malloc_type *, int (*)(void *, off_t, void **, int)); int ffs_sbput(void *, struct fs *, off_t, int (*)(void *, off_t, void *, int)); int ffs_sbupdate(struct ufsmount *, int, int); void ffs_setblock(struct fs *, uint8_t *, ufs1_daddr_t); int ffs_snapblkfree(struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t, __enum_uint8(vtype), struct workhead *); void ffs_snapremove(struct vnode *vp); int ffs_snapshot(struct mount *mp, char *snapfile); void ffs_snapshot_mount(struct mount *mp); void ffs_snapshot_unmount(struct mount *mp); void ffs_susp_initialize(void); void ffs_susp_uninitialize(void); void ffs_sync_snap(struct mount *, int); int ffs_syncvnode(struct vnode *vp, int waitfor, int flags); int ffs_truncate(struct vnode *, off_t, int, struct ucred *); int ffs_update(struct vnode *, int); void ffs_update_dinode_ckhash(struct fs *, struct ufs2_dinode *); int ffs_verify_dinode_ckhash(struct fs *, struct ufs2_dinode *); int ffs_valloc(struct vnode *, int, struct ucred *, struct vnode **); int ffs_vfree(struct vnode *, ino_t, int); vfs_vget_t ffs_vget; int ffs_vgetf(struct mount *, ino_t, int, struct vnode **, int); void process_deferred_inactive(struct mount *mp); int ffs_fsfail_cleanup(struct ufsmount *, int); int ffs_fsfail_cleanup_locked(struct ufsmount *, int); int ffs_breadz(struct ufsmount *, struct vnode *, daddr_t, daddr_t, int, daddr_t *, int *, int, struct ucred *, int, void (*)(struct buf *), struct buf **); /* * Flags to ffs_vgetf */ #define FFSV_FORCEINSMQ 0x0001 /* Force insertion into mount list */ #define FFSV_REPLACE 0x0002 /* Replace existing vnode */ #define FFSV_REPLACE_DOOMED 0x0004 /* Replace existing vnode if it is doomed */ #define FFSV_FORCEINODEDEP 0x0008 /* Force allocation of inodedep, ignore MNT_SOFTDEP */ #define FFSV_NEWINODE 0x0010 /* Newly allocated inode */ /* * Flags to ffs_reload */ #define FFSR_FORCE 0x0001 #define FFSR_UNSUSPEND 0x0002 /* * Definitions for TRIM interface * * Special keys and recommended hash table size */ #define NOTRIM_KEY 1 /* never written, so don't call trim for it */ #define SINGLETON_KEY 2 /* only block being freed, so trim it now */ #define FIRST_VALID_KEY 3 /* first valid key describing a block range */ #define MAXTRIMIO 1024 /* maximum expected outstanding trim requests */ extern struct vop_vector ffs_vnodeops1; extern struct vop_vector ffs_fifoops1; extern struct vop_vector ffs_vnodeops2; extern struct vop_vector ffs_fifoops2; /* * Soft update function prototypes. */ int softdep_check_suspend(struct mount *, struct vnode *, int, int, int, int); void softdep_get_depcounts(struct mount *, int *, int *); void softdep_initialize(void); void softdep_uninitialize(void); int softdep_mount(struct vnode *, struct mount *, struct fs *, struct ucred *); void softdep_unmount(struct mount *); void softdep_handle_error(struct buf *); int softdep_move_dependencies(struct buf *, struct buf *); int softdep_flushworklist(struct mount *, int *, struct thread *); int softdep_flushfiles(struct mount *, int, struct thread *); void softdep_update_inodeblock(struct inode *, struct buf *, int); void softdep_load_inodeblock(struct inode *); void softdep_freefile(struct vnode *, ino_t, int); int softdep_request_cleanup(struct fs *, struct vnode *, struct ucred *, int); int softdep_prerename(struct vnode *, struct vnode *, struct vnode *, struct vnode *); int softdep_prelink(struct vnode *, struct vnode *, struct componentname *); void softdep_setup_freeblocks(struct inode *, off_t, int); void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t, int); void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t, int, int); void softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t, ufs2_daddr_t, long, long, struct buf *); void softdep_setup_allocext(struct inode *, ufs_lbn_t, ufs2_daddr_t, ufs2_daddr_t, long, long, struct buf *); void softdep_setup_allocindir_meta(struct buf *, struct inode *, struct buf *, int, ufs2_daddr_t); void softdep_setup_allocindir_page(struct inode *, ufs_lbn_t, struct buf *, int, ufs2_daddr_t, ufs2_daddr_t, struct buf *); void softdep_setup_blkfree(struct mount *, struct buf *, ufs2_daddr_t, int, struct workhead *, bool); void softdep_setup_inofree(struct mount *, struct buf *, ino_t, struct workhead *, bool); void softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *); void softdep_fsync_mountdev(struct vnode *); int softdep_sync_metadata(struct vnode *); int softdep_sync_buf(struct vnode *, struct buf *, int); int softdep_fsync(struct vnode *); int softdep_prealloc(struct vnode *, int); int softdep_journal_lookup(struct mount *, struct vnode **); void softdep_journal_freeblocks(struct inode *, struct ucred *, off_t, int); void softdep_journal_fsync(struct inode *); void softdep_buf_append(struct buf *, struct workhead *); void softdep_inode_append(struct inode *, struct ucred *, struct workhead *); void softdep_freework(struct workhead *); /* * Things to request flushing in softdep_request_cleanup() */ #define FLUSH_INODES 1 #define FLUSH_INODES_WAIT 2 #define FLUSH_BLOCKS 3 #define FLUSH_BLOCKS_WAIT 4 /* * Flag to ffs_syncvnode() to request flushing of data only, * but skip the ffs_update() on the inode itself. Used to avoid * deadlock when flushing snapshot inodes while holding snaplk. */ #define NO_INO_UPDT 0x00000001 /* * Request data sync only from ffs_syncvnode(), not touching even more * metadata than NO_INO_UPDT. */ #define DATA_ONLY 0x00000002 int ffs_rdonly(struct inode *); TAILQ_HEAD(snaphead, inode); struct snapdata { LIST_ENTRY(snapdata) sn_link; struct snaphead sn_head; daddr_t sn_listsize; daddr_t *sn_blklist; struct lock sn_lock; }; #endif /* _KERNEL */ #endif /* !_UFS_FFS_EXTERN_H */ diff --git a/sys/ufs/ffs/ffs_subr.c b/sys/ufs/ffs/ffs_subr.c index 1792e1628a48..c0221c8ee2da 100644 --- a/sys/ufs/ffs/ffs_subr.c +++ b/sys/ufs/ffs/ffs_subr.c @@ -1,1184 +1,1245 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_subr.c 8.5 (Berkeley) 3/21/95 */ #include #include #include #include #ifndef _KERNEL +#include #include #include #include #include #include #include #include uint32_t calculate_crc32c(uint32_t, const void *, size_t); uint32_t ffs_calc_sbhash(struct fs *); struct malloc_type; #define UFS_MALLOC(size, type, flags) malloc(size) #define UFS_FREE(ptr, type) free(ptr) #define maxphys MAXPHYS #else /* _KERNEL */ #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #define UFS_MALLOC(size, type, flags) malloc(size, type, flags) #define UFS_FREE(ptr, type) free(ptr, type) #endif /* _KERNEL */ /* * Verify an inode check-hash. */ int ffs_verify_dinode_ckhash(struct fs *fs, struct ufs2_dinode *dip) { uint32_t ckhash, save_ckhash; /* * Return success if unallocated or we are not doing inode check-hash. */ if (dip->di_mode == 0 || (fs->fs_metackhash & CK_INODE) == 0) return (0); /* * Exclude di_ckhash from the crc32 calculation, e.g., always use * a check-hash value of zero when calculating the check-hash. */ save_ckhash = dip->di_ckhash; dip->di_ckhash = 0; ckhash = calculate_crc32c(~0L, (void *)dip, sizeof(*dip)); dip->di_ckhash = save_ckhash; if (save_ckhash == ckhash) return (0); return (EINVAL); } /* * Update an inode check-hash. */ void ffs_update_dinode_ckhash(struct fs *fs, struct ufs2_dinode *dip) { if (dip->di_mode == 0 || (fs->fs_metackhash & CK_INODE) == 0) return; /* * Exclude old di_ckhash from the crc32 calculation, e.g., always use * a check-hash value of zero when calculating the new check-hash. */ dip->di_ckhash = 0; dip->di_ckhash = calculate_crc32c(~0L, (void *)dip, sizeof(*dip)); } /* * These are the low-level functions that actually read and write * the superblock and its associated data. */ static off_t sblock_try[] = SBLOCKSEARCH; static int readsuper(void *, struct fs **, off_t, int, int (*)(void *, off_t, void **, int)); static void ffs_oldfscompat_read(struct fs *, ufs2_daddr_t); static int validate_sblock(struct fs *, int); /* * Read a superblock from the devfd device. * * If an alternate superblock is specified, it is read. Otherwise the * set of locations given in the SBLOCKSEARCH list is searched for a * superblock. Memory is allocated for the superblock by the readfunc and * is returned. If filltype is non-NULL, additional memory is allocated * of type filltype and filled in with the superblock summary information. * All memory is freed when any error is returned. * * If a superblock is found, zero is returned. Otherwise one of the * following error values is returned: * EIO: non-existent or truncated superblock. * EIO: error reading summary information. * ENOENT: no usable known superblock found. * EILSEQ: filesystem with wrong byte order found. * ENOMEM: failed to allocate space for the superblock. * EINVAL: The previous newfs operation on this volume did not complete. * The administrator must complete newfs before using this volume. */ int ffs_sbget(void *devfd, struct fs **fsp, off_t sblock, int flags, struct malloc_type *filltype, int (*readfunc)(void *devfd, off_t loc, void **bufp, int size)) { struct fs *fs; struct fs_summary_info *fs_si; int i, error; uint64_t size, blks; uint8_t *space; int32_t *lp; char *buf; fs = NULL; *fsp = NULL; if (sblock != UFS_STDSB) { if ((error = readsuper(devfd, &fs, sblock, flags | UFS_ALTSBLK, readfunc)) != 0) { if (fs != NULL) UFS_FREE(fs, filltype); return (error); } } else { for (i = 0; sblock_try[i] != -1; i++) { if ((error = readsuper(devfd, &fs, sblock_try[i], flags, readfunc)) == 0) { if ((flags & UFS_NOCSUM) != 0) { *fsp = fs; return (0); } break; } if (fs != NULL) { UFS_FREE(fs, filltype); fs = NULL; } if (error == ENOENT) continue; return (error); } if (sblock_try[i] == -1) return (ENOENT); } /* * Read in the superblock summary information. */ size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(uint8_t); if ((fs_si = UFS_MALLOC(sizeof(*fs_si), filltype, M_NOWAIT)) == NULL) { UFS_FREE(fs, filltype); return (ENOMEM); } bzero(fs_si, sizeof(*fs_si)); fs->fs_si = fs_si; if ((space = UFS_MALLOC(size, filltype, M_NOWAIT)) == NULL) { UFS_FREE(fs->fs_si, filltype); UFS_FREE(fs, filltype); return (ENOMEM); } fs->fs_csp = (struct csum *)space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; buf = NULL; error = (*readfunc)(devfd, dbtob(fsbtodb(fs, fs->fs_csaddr + i)), (void **)&buf, size); if (error) { if (buf != NULL) UFS_FREE(buf, filltype); UFS_FREE(fs->fs_csp, filltype); UFS_FREE(fs->fs_si, filltype); UFS_FREE(fs, filltype); return (error); } memcpy(space, buf, size); UFS_FREE(buf, filltype); space += size; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = (int32_t *)space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = (uint8_t *)lp; } size = fs->fs_ncg * sizeof(uint8_t); fs->fs_contigdirs = (uint8_t *)space; bzero(fs->fs_contigdirs, size); *fsp = fs; return (0); } /* * Try to read a superblock from the location specified by sblockloc. * Return zero on success or an errno on failure. */ static int readsuper(void *devfd, struct fs **fsp, off_t sblockloc, int flags, int (*readfunc)(void *devfd, off_t loc, void **bufp, int size)) { struct fs *fs; int error, res; uint32_t ckhash; error = (*readfunc)(devfd, sblockloc, (void **)fsp, SBLOCKSIZE); if (error != 0) return (error); fs = *fsp; if (fs->fs_magic == FS_BAD_MAGIC) return (EINVAL); /* * For UFS1 with a 65536 block size, the first backup superblock * is at the same location as the UFS2 superblock. Since SBLOCK_UFS2 * is the first location checked, the first backup is the superblock * that will be accessed. Here we fail the lookup so that we can * retry with the correct location for the UFS1 superblock. */ if (fs->fs_magic == FS_UFS1_MAGIC && (flags & UFS_ALTSBLK) == 0 && fs->fs_bsize == SBLOCK_UFS2 && sblockloc == SBLOCK_UFS2) return (ENOENT); ffs_oldfscompat_read(fs, sblockloc); if ((error = validate_sblock(fs, flags)) > 0) return (error); /* * If the filesystem has been run on a kernel without * metadata check hashes, disable them. */ if ((fs->fs_flags & FS_METACKHASH) == 0) fs->fs_metackhash = 0; /* * Clear any check-hashes that are not maintained * by this kernel. Also clear any unsupported flags. */ fs->fs_metackhash &= CK_SUPPORTED; fs->fs_flags &= FS_SUPPORTED; if (fs->fs_ckhash != (ckhash = ffs_calc_sbhash(fs))) { if ((flags & (UFS_NOMSG | UFS_NOHASHFAIL)) == (UFS_NOMSG | UFS_NOHASHFAIL)) return (0); if ((flags & UFS_NOMSG) != 0) return (EINTEGRITY); #ifdef _KERNEL res = uprintf("Superblock check-hash failed: recorded " "check-hash 0x%x != computed check-hash 0x%x%s\n", fs->fs_ckhash, ckhash, (flags & UFS_NOHASHFAIL) != 0 ? " (Ignored)" : ""); #else res = 0; #endif /* * Print check-hash failure if no controlling terminal * in kernel or always if in user-mode (libufs). */ if (res == 0) printf("Superblock check-hash failed: recorded " "check-hash 0x%x != computed check-hash " "0x%x%s\n", fs->fs_ckhash, ckhash, (flags & UFS_NOHASHFAIL) ? " (Ignored)" : ""); if ((flags & UFS_NOHASHFAIL) != 0) return (0); return (EINTEGRITY); } /* Have to set for old filesystems that predate this field */ fs->fs_sblockactualloc = sblockloc; /* Not yet any summary information */ fs->fs_si = NULL; return (0); } /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(struct fs *fs, ufs2_daddr_t sblockloc) { uint64_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. */ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_save_maxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ void ffs_oldfscompat_write(struct fs *fs) { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = fs->fs_save_maxfilesize; } } +/* + * Sanity checks for loading old filesystem inodes. + * + * XXX - Parts get retired eventually. + * Unfortunately new bits get added. + */ +static int prttimechgs = 0; +#ifdef _KERNEL +SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "FFS filesystem"); + +SYSCTL_INT(_vfs_ffs, OID_AUTO, prttimechgs, CTLFLAG_RWTUN, &prttimechgs, 0, + "print UFS1 time changes made to inodes"); +#endif /* _KERNEL */ +bool +ffs_oldfscompat_inode_read(struct fs *fs, union dinodep dp, time_t now) +{ + bool change; + + change = false; + switch (fs->fs_magic) { + case FS_UFS2_MAGIC: + /* No changes for now */ + break; + + case FS_UFS1_MAGIC: + /* + * With the change to unsigned time values in UFS1, times set + * before Jan 1, 1970 will appear to be in the future. Check + * for future times and set them to be the current time. + */ + if (dp.dp1->di_ctime > now) { + if (prttimechgs) + printf("ctime %ud changed to %ld\n", + dp.dp1->di_ctime, (long)now); + dp.dp1->di_ctime = now; + change = true; + } + if (dp.dp1->di_mtime > now) { + if (prttimechgs) + printf("mtime %ud changed to %ld\n", + dp.dp1->di_mtime, (long)now); + dp.dp1->di_mtime = now; + dp.dp1->di_ctime = now; + change = true; + } + if (dp.dp1->di_atime > now) { + if (prttimechgs) + printf("atime %ud changed to %ld\n", + dp.dp1->di_atime, (long)now); + dp.dp1->di_atime = now; + dp.dp1->di_ctime = now; + change = true; + } + break; + } + return (change); +} + /* * Verify the filesystem values. */ #define ILOG2(num) (fls(num) - 1) #ifdef STANDALONE_SMALL #define MPRINT(...) do { } while (0) #else #define MPRINT(...) if (prtmsg) printf(__VA_ARGS__) #endif #define FCHK(lhs, op, rhs, fmt) \ if (lhs op rhs) { \ MPRINT("UFS%d superblock failed: %s (" #fmt ") %s %s (" \ #fmt ")\n", fs->fs_magic == FS_UFS1_MAGIC ? 1 : 2, \ #lhs, (intmax_t)lhs, #op, #rhs, (intmax_t)rhs); \ if (error < 0) \ return (ENOENT); \ if (error == 0) \ error = ENOENT; \ } #define WCHK(lhs, op, rhs, fmt) \ if (lhs op rhs) { \ MPRINT("UFS%d superblock failed: %s (" #fmt ") %s %s (" \ #fmt ")%s\n", fs->fs_magic == FS_UFS1_MAGIC ? 1 : 2,\ #lhs, (intmax_t)lhs, #op, #rhs, (intmax_t)rhs, wmsg);\ if (error == 0) \ error = warnerr; \ if (warnerr == 0) \ lhs = rhs; \ } #define FCHK2(lhs1, op1, rhs1, lhs2, op2, rhs2, fmt) \ if (lhs1 op1 rhs1 && lhs2 op2 rhs2) { \ MPRINT("UFS%d superblock failed: %s (" #fmt ") %s %s (" \ #fmt ") && %s (" #fmt ") %s %s (" #fmt ")\n", \ fs->fs_magic == FS_UFS1_MAGIC ? 1 : 2, #lhs1, \ (intmax_t)lhs1, #op1, #rhs1, (intmax_t)rhs1, #lhs2, \ (intmax_t)lhs2, #op2, #rhs2, (intmax_t)rhs2); \ if (error < 0) \ return (ENOENT); \ if (error == 0) \ error = ENOENT; \ } static int validate_sblock(struct fs *fs, int flags) { uint64_t i, sectorsize; uint64_t maxfilesize, sizepb; int error, prtmsg, warnerr; char *wmsg; error = 0; sectorsize = dbtob(1); prtmsg = ((flags & UFS_NOMSG) == 0); warnerr = (flags & UFS_NOWARNFAIL) == UFS_NOWARNFAIL ? 0 : ENOENT; wmsg = warnerr ? "" : " (Ignored)"; /* * Check for endian mismatch between machine and filesystem. */ if (((fs->fs_magic != FS_UFS2_MAGIC) && (bswap32(fs->fs_magic) == FS_UFS2_MAGIC)) || ((fs->fs_magic != FS_UFS1_MAGIC) && (bswap32(fs->fs_magic) == FS_UFS1_MAGIC))) { MPRINT("UFS superblock failed due to endian mismatch " "between machine and filesystem\n"); return(EILSEQ); } /* * If just validating for recovery, then do just the minimal * checks needed for the superblock fields needed to find * alternate superblocks. */ if ((flags & UFS_FSRONLY) == UFS_FSRONLY && (fs->fs_magic == FS_UFS1_MAGIC || fs->fs_magic == FS_UFS2_MAGIC)) { error = -1; /* fail on first error */ if (fs->fs_magic == FS_UFS2_MAGIC) { FCHK(fs->fs_sblockloc, !=, SBLOCK_UFS2, %#jx); } else if (fs->fs_magic == FS_UFS1_MAGIC) { FCHK(fs->fs_sblockloc, <, 0, %jd); FCHK(fs->fs_sblockloc, >, SBLOCK_UFS1, %jd); } FCHK(fs->fs_frag, <, 1, %jd); FCHK(fs->fs_frag, >, MAXFRAG, %jd); FCHK(fs->fs_bsize, <, MINBSIZE, %jd); FCHK(fs->fs_bsize, >, MAXBSIZE, %jd); FCHK(fs->fs_bsize, <, roundup(sizeof(struct fs), DEV_BSIZE), %jd); FCHK(fs->fs_fsize, <, sectorsize, %jd); FCHK(fs->fs_fsize * fs->fs_frag, !=, fs->fs_bsize, %jd); FCHK(powerof2(fs->fs_fsize), ==, 0, %jd); FCHK(fs->fs_sbsize, >, SBLOCKSIZE, %jd); FCHK(fs->fs_sbsize, <, (signed)sizeof(struct fs), %jd); FCHK(fs->fs_sbsize % sectorsize, !=, 0, %jd); FCHK(fs->fs_fpg, <, 3 * fs->fs_frag, %jd); FCHK(fs->fs_ncg, <, 1, %jd); FCHK(fs->fs_fsbtodb, !=, ILOG2(fs->fs_fsize / sectorsize), %jd); FCHK(fs->fs_old_cgoffset, <, 0, %jd); FCHK2(fs->fs_old_cgoffset, >, 0, ~fs->fs_old_cgmask, <, 0, %jd); FCHK(fs->fs_old_cgoffset * (~fs->fs_old_cgmask), >, fs->fs_fpg, %jd); FCHK(fs->fs_sblkno, !=, roundup( howmany(fs->fs_sblockloc + SBLOCKSIZE, fs->fs_fsize), fs->fs_frag), %jd); FCHK(CGSIZE(fs), >, fs->fs_bsize, %jd); /* Only need to validate these if reading in csum data */ if ((flags & UFS_NOCSUM) != 0) return (error); FCHK((uint64_t)fs->fs_ipg * fs->fs_ncg, >, (((int64_t)(1)) << 32) - INOPB(fs), %jd); FCHK(fs->fs_cstotal.cs_nifree, <, 0, %jd); FCHK(fs->fs_cstotal.cs_nifree, >, (uint64_t)fs->fs_ipg * fs->fs_ncg, %jd); FCHK(fs->fs_cstotal.cs_ndir, >, ((uint64_t)fs->fs_ipg * fs->fs_ncg) - fs->fs_cstotal.cs_nifree, %jd); FCHK(fs->fs_size, <, 8 * fs->fs_frag, %jd); FCHK(fs->fs_size, <=, ((int64_t)fs->fs_ncg - 1) * fs->fs_fpg, %jd); FCHK(fs->fs_size, >, (int64_t)fs->fs_ncg * fs->fs_fpg, %jd); FCHK(fs->fs_csaddr, <, 0, %jd); FCHK(fs->fs_cssize, !=, fragroundup(fs, fs->fs_ncg * sizeof(struct csum)), %jd); FCHK(fs->fs_csaddr + howmany(fs->fs_cssize, fs->fs_fsize), >, fs->fs_size, %jd); FCHK(fs->fs_csaddr, <, cgdmin(fs, dtog(fs, fs->fs_csaddr)), %jd); FCHK(dtog(fs, fs->fs_csaddr + howmany(fs->fs_cssize, fs->fs_fsize)), >, dtog(fs, fs->fs_csaddr), %jd); return (error); } if (fs->fs_magic == FS_UFS2_MAGIC) { if ((flags & UFS_ALTSBLK) == 0) FCHK2(fs->fs_sblockactualloc, !=, SBLOCK_UFS2, fs->fs_sblockactualloc, !=, 0, %jd); FCHK(fs->fs_sblockloc, !=, SBLOCK_UFS2, %#jx); FCHK(fs->fs_maxsymlinklen, !=, ((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t)), %jd); FCHK(fs->fs_nindir, !=, fs->fs_bsize / sizeof(ufs2_daddr_t), %jd); FCHK(fs->fs_inopb, !=, fs->fs_bsize / sizeof(struct ufs2_dinode), %jd); } else if (fs->fs_magic == FS_UFS1_MAGIC) { if ((flags & UFS_ALTSBLK) == 0) FCHK(fs->fs_sblockactualloc, >, SBLOCK_UFS1, %jd); FCHK(fs->fs_sblockloc, <, 0, %jd); FCHK(fs->fs_sblockloc, >, SBLOCK_UFS1, %jd); FCHK(fs->fs_nindir, !=, fs->fs_bsize / sizeof(ufs1_daddr_t), %jd); FCHK(fs->fs_inopb, !=, fs->fs_bsize / sizeof(struct ufs1_dinode), %jd); FCHK(fs->fs_maxsymlinklen, !=, ((UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t)), %jd); WCHK(fs->fs_old_inodefmt, !=, FS_44INODEFMT, %jd); WCHK(fs->fs_old_rotdelay, !=, 0, %jd); WCHK(fs->fs_old_rps, !=, 60, %jd); WCHK(fs->fs_old_nspf, !=, fs->fs_fsize / sectorsize, %jd); WCHK(fs->fs_old_interleave, !=, 1, %jd); WCHK(fs->fs_old_trackskew, !=, 0, %jd); WCHK(fs->fs_old_cpc, !=, 0, %jd); WCHK(fs->fs_old_postblformat, !=, 1, %jd); FCHK(fs->fs_old_nrpos, !=, 1, %jd); WCHK(fs->fs_old_nsect, !=, fs->fs_old_spc, %jd); WCHK(fs->fs_old_npsect, !=, fs->fs_old_spc, %jd); } else { /* Bad magic number, so assume not a superblock */ return (ENOENT); } FCHK(fs->fs_bsize, <, MINBSIZE, %jd); FCHK(fs->fs_bsize, >, MAXBSIZE, %jd); FCHK(fs->fs_bsize, <, roundup(sizeof(struct fs), DEV_BSIZE), %jd); FCHK(powerof2(fs->fs_bsize), ==, 0, %jd); FCHK(fs->fs_frag, <, 1, %jd); FCHK(fs->fs_frag, >, MAXFRAG, %jd); FCHK(fs->fs_frag, !=, numfrags(fs, fs->fs_bsize), %jd); FCHK(fs->fs_fsize, <, sectorsize, %jd); FCHK(fs->fs_fsize * fs->fs_frag, !=, fs->fs_bsize, %jd); FCHK(powerof2(fs->fs_fsize), ==, 0, %jd); FCHK(fs->fs_fpg, <, 3 * fs->fs_frag, %jd); FCHK(fs->fs_ncg, <, 1, %jd); FCHK(fs->fs_ipg, <, fs->fs_inopb, %jd); FCHK((uint64_t)fs->fs_ipg * fs->fs_ncg, >, (((int64_t)(1)) << 32) - INOPB(fs), %jd); FCHK(fs->fs_cstotal.cs_nifree, <, 0, %jd); FCHK(fs->fs_cstotal.cs_nifree, >, (uint64_t)fs->fs_ipg * fs->fs_ncg, %jd); FCHK(fs->fs_cstotal.cs_ndir, <, 0, %jd); FCHK(fs->fs_cstotal.cs_ndir, >, ((uint64_t)fs->fs_ipg * fs->fs_ncg) - fs->fs_cstotal.cs_nifree, %jd); FCHK(fs->fs_sbsize, >, SBLOCKSIZE, %jd); FCHK(fs->fs_sbsize, <, (signed)sizeof(struct fs), %jd); /* fix for misconfigured filesystems */ if (fs->fs_maxbsize == 0) fs->fs_maxbsize = fs->fs_bsize; FCHK(fs->fs_maxbsize, <, fs->fs_bsize, %jd); FCHK(powerof2(fs->fs_maxbsize), ==, 0, %jd); FCHK(fs->fs_maxbsize, >, FS_MAXCONTIG * fs->fs_bsize, %jd); FCHK(fs->fs_bmask, !=, ~(fs->fs_bsize - 1), %#jx); FCHK(fs->fs_fmask, !=, ~(fs->fs_fsize - 1), %#jx); FCHK(fs->fs_qbmask, !=, ~fs->fs_bmask, %#jx); FCHK(fs->fs_qfmask, !=, ~fs->fs_fmask, %#jx); FCHK(fs->fs_bshift, !=, ILOG2(fs->fs_bsize), %jd); FCHK(fs->fs_fshift, !=, ILOG2(fs->fs_fsize), %jd); FCHK(fs->fs_fragshift, !=, ILOG2(fs->fs_frag), %jd); FCHK(fs->fs_fsbtodb, !=, ILOG2(fs->fs_fsize / sectorsize), %jd); FCHK(fs->fs_old_cgoffset, <, 0, %jd); FCHK2(fs->fs_old_cgoffset, >, 0, ~fs->fs_old_cgmask, <, 0, %jd); FCHK(fs->fs_old_cgoffset * (~fs->fs_old_cgmask), >, fs->fs_fpg, %jd); FCHK(CGSIZE(fs), >, fs->fs_bsize, %jd); /* * If anything has failed up to this point, it is usafe to proceed * as checks below may divide by zero or make other fatal calculations. * So if we have any errors at this point, give up. */ if (error) return (error); FCHK(fs->fs_sbsize % sectorsize, !=, 0, %jd); FCHK(fs->fs_ipg % fs->fs_inopb, !=, 0, %jd); FCHK(fs->fs_sblkno, !=, roundup( howmany(fs->fs_sblockloc + SBLOCKSIZE, fs->fs_fsize), fs->fs_frag), %jd); FCHK(fs->fs_cblkno, !=, fs->fs_sblkno + roundup(howmany(SBLOCKSIZE, fs->fs_fsize), fs->fs_frag), %jd); FCHK(fs->fs_iblkno, !=, fs->fs_cblkno + fs->fs_frag, %jd); FCHK(fs->fs_dblkno, !=, fs->fs_iblkno + fs->fs_ipg / INOPF(fs), %jd); FCHK(fs->fs_cgsize, >, fs->fs_bsize, %jd); FCHK(fs->fs_cgsize, <, fs->fs_fsize, %jd); FCHK(fs->fs_cgsize % fs->fs_fsize, !=, 0, %jd); /* * This test is valid, however older versions of growfs failed * to correctly update fs_dsize so will fail this test. Thus we * exclude it from the requirements. */ #ifdef notdef WCHK(fs->fs_dsize, !=, fs->fs_size - fs->fs_sblkno - fs->fs_ncg * (fs->fs_dblkno - fs->fs_sblkno) - howmany(fs->fs_cssize, fs->fs_fsize), %jd); #endif WCHK(fs->fs_metaspace, <, 0, %jd); WCHK(fs->fs_metaspace, >, fs->fs_fpg / 2, %jd); WCHK(fs->fs_minfree, >, 99, %jd%%); maxfilesize = fs->fs_bsize * UFS_NDADDR - 1; for (sizepb = fs->fs_bsize, i = 0; i < UFS_NIADDR; i++) { sizepb *= NINDIR(fs); maxfilesize += sizepb; } WCHK(fs->fs_maxfilesize, >, maxfilesize, %jd); /* * These values have a tight interaction with each other that * makes it hard to tightly bound them. So we can only check * that they are within a broader possible range. * * The size cannot always be accurately determined, but ensure * that it is consistent with the number of cylinder groups (fs_ncg) * and the number of fragments per cylinder group (fs_fpg). Ensure * that the summary information size is correct and that it starts * and ends in the data area of the same cylinder group. */ FCHK(fs->fs_size, <, 8 * fs->fs_frag, %jd); FCHK(fs->fs_size, <=, ((int64_t)fs->fs_ncg - 1) * fs->fs_fpg, %jd); FCHK(fs->fs_size, >, (int64_t)fs->fs_ncg * fs->fs_fpg, %jd); /* * If we are not requested to read in the csum data stop here * as the correctness of the remaining values is only important * to bound the space needed to be allocated to hold the csum data. */ if ((flags & UFS_NOCSUM) != 0) return (error); FCHK(fs->fs_csaddr, <, 0, %jd); FCHK(fs->fs_cssize, !=, fragroundup(fs, fs->fs_ncg * sizeof(struct csum)), %jd); FCHK(fs->fs_csaddr + howmany(fs->fs_cssize, fs->fs_fsize), >, fs->fs_size, %jd); FCHK(fs->fs_csaddr, <, cgdmin(fs, dtog(fs, fs->fs_csaddr)), %jd); FCHK(dtog(fs, fs->fs_csaddr + howmany(fs->fs_cssize, fs->fs_fsize)), >, dtog(fs, fs->fs_csaddr), %jd); /* * With file system clustering it is possible to allocate * many contiguous blocks. The kernel variable maxphys defines * the maximum transfer size permitted by the controller and/or * buffering. The fs_maxcontig parameter controls the maximum * number of blocks that the filesystem will read or write * in a single transfer. It is calculated when the filesystem * is created as maxphys / fs_bsize. The loader uses a maxphys * of 128K even when running on a system that supports larger * values. If the filesystem was built on a system that supports * a larger maxphys (1M is typical) it will have configured * fs_maxcontig for that larger system. So we bound the upper * allowable limit for fs_maxconfig to be able to at least * work with a 1M maxphys on the smallest block size filesystem: * 1M / 4096 == 256. There is no harm in allowing the mounting of * filesystems that make larger than maxphys I/O requests because * those (mostly 32-bit machines) can (very slowly) handle I/O * requests that exceed maxphys. */ WCHK(fs->fs_maxcontig, <, 0, %jd); WCHK(fs->fs_maxcontig, >, MAX(256, maxphys / fs->fs_bsize), %jd); FCHK2(fs->fs_maxcontig, ==, 0, fs->fs_contigsumsize, !=, 0, %jd); FCHK2(fs->fs_maxcontig, >, 1, fs->fs_contigsumsize, !=, MIN(fs->fs_maxcontig, FS_MAXCONTIG), %jd); return (error); } /* * Make an extensive search to find a superblock. If the superblock * in the standard place cannot be used, try looking for one of the * backup superblocks. * * Flags are made up of the following or'ed together options: * * UFS_NOMSG indicates that superblock inconsistency error messages * should not be printed. * * UFS_NOCSUM causes only the superblock itself to be returned, but does * not read in any auxillary data structures like the cylinder group * summary information. */ int ffs_sbsearch(void *devfd, struct fs **fsp, int reqflags, struct malloc_type *filltype, int (*readfunc)(void *devfd, off_t loc, void **bufp, int size)) { struct fsrecovery *fsr; struct fs *protofs; void *fsrbuf; char *cp; long nocsum, flags, msg, cg; off_t sblk, secsize; int error; msg = (reqflags & UFS_NOMSG) == 0; nocsum = reqflags & UFS_NOCSUM; /* * Try normal superblock read and return it if it works. * * Suppress messages if it fails until we find out if * failure can be avoided. */ flags = UFS_NOMSG | nocsum; error = ffs_sbget(devfd, fsp, UFS_STDSB, flags, filltype, readfunc); /* * If successful or endian error, no need to try further. */ if (error == 0 || error == EILSEQ) { if (msg && error == EILSEQ) printf("UFS superblock failed due to endian mismatch " "between machine and filesystem\n"); return (error); } /* * First try: ignoring hash failures. */ flags |= UFS_NOHASHFAIL; if (msg) flags &= ~UFS_NOMSG; if (ffs_sbget(devfd, fsp, UFS_STDSB, flags, filltype, readfunc) == 0) return (0); /* * Next up is to check if fields of the superblock that are * needed to find backup superblocks are usable. */ if (msg) printf("Attempted recovery for standard superblock: failed\n"); flags = UFS_FSRONLY | UFS_NOHASHFAIL | UFS_NOCSUM | UFS_NOMSG; if (ffs_sbget(devfd, &protofs, UFS_STDSB, flags, filltype, readfunc) == 0) { if (msg) printf("Attempt extraction of recovery data from " "standard superblock.\n"); } else { /* * Final desperation is to see if alternate superblock * parameters have been saved in the boot area. */ if (msg) printf("Attempted extraction of recovery data from " "standard superblock: failed\nAttempt to find " "boot zone recovery data.\n"); /* * Look to see if recovery information has been saved. * If so we can generate a prototype superblock based * on that information. * * We need fragments-per-group, number of cylinder groups, * location of the superblock within the cylinder group, and * the conversion from filesystem fragments to disk blocks. * * When building a UFS2 filesystem, newfs(8) stores these * details at the end of the boot block area at the start * of the filesystem partition. If they have been overwritten * by a boot block, we fail. But usually they are there * and we can use them. * * We could ask the underlying device for its sector size, * but some devices lie. So we just try a plausible range. */ error = ENOENT; fsrbuf = NULL; for (secsize = dbtob(1); secsize <= SBLOCKSIZE; secsize *= 2) if ((error = (*readfunc)(devfd, (SBLOCK_UFS2 - secsize), &fsrbuf, secsize)) == 0) break; if (error != 0) goto trynowarn; cp = fsrbuf; /* type change to keep compiler happy */ fsr = (struct fsrecovery *)&cp[secsize - sizeof *fsr]; if (fsr->fsr_magic != FS_UFS2_MAGIC || (protofs = UFS_MALLOC(SBLOCKSIZE, filltype, M_NOWAIT)) == NULL) { UFS_FREE(fsrbuf, filltype); goto trynowarn; } memset(protofs, 0, sizeof(struct fs)); protofs->fs_fpg = fsr->fsr_fpg; protofs->fs_fsbtodb = fsr->fsr_fsbtodb; protofs->fs_sblkno = fsr->fsr_sblkno; protofs->fs_magic = fsr->fsr_magic; protofs->fs_ncg = fsr->fsr_ncg; UFS_FREE(fsrbuf, filltype); } /* * Scan looking for alternative superblocks. */ flags = nocsum; if (!msg) flags |= UFS_NOMSG; for (cg = 0; cg < protofs->fs_ncg; cg++) { sblk = fsbtodb(protofs, cgsblock(protofs, cg)); if (msg) printf("Try cg %ld at sblock loc %jd\n", cg, (intmax_t)sblk); if (ffs_sbget(devfd, fsp, dbtob(sblk), flags, filltype, readfunc) == 0) { if (msg) printf("Succeeded with alternate superblock " "at %jd\n", (intmax_t)sblk); UFS_FREE(protofs, filltype); return (0); } } UFS_FREE(protofs, filltype); /* * Our alternate superblock strategies failed. Our last ditch effort * is to see if the standard superblock has only non-critical errors. */ trynowarn: flags = UFS_NOWARNFAIL | UFS_NOMSG | nocsum; if (msg) { printf("Finding an alternate superblock failed.\nCheck for " "only non-critical errors in standard superblock\n"); flags &= ~UFS_NOMSG; } if (ffs_sbget(devfd, fsp, UFS_STDSB, flags, filltype, readfunc) != 0) { if (msg) printf("Failed, superblock has critical errors\n"); return (ENOENT); } if (msg) printf("Success, using standard superblock with " "non-critical errors.\n"); return (0); } /* * Write a superblock to the devfd device from the memory pointed to by fs. * Write out the superblock summary information if it is present. * * If the write is successful, zero is returned. Otherwise one of the * following error values is returned: * EIO: failed to write superblock. * EIO: failed to write superblock summary information. */ int ffs_sbput(void *devfd, struct fs *fs, off_t loc, int (*writefunc)(void *devfd, off_t loc, void *buf, int size)) { int i, error, blks, size; uint8_t *space; /* * If there is summary information, write it first, so if there * is an error, the superblock will not be marked as clean. */ if (fs->fs_si != NULL && fs->fs_csp != NULL) { blks = howmany(fs->fs_cssize, fs->fs_fsize); space = (uint8_t *)fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if ((error = (*writefunc)(devfd, dbtob(fsbtodb(fs, fs->fs_csaddr + i)), space, size)) != 0) return (error); space += size; } } fs->fs_fmod = 0; #ifndef _KERNEL { struct fs_summary_info *fs_si; fs->fs_time = time(NULL); /* Clear the pointers for the duration of writing. */ fs_si = fs->fs_si; fs->fs_si = NULL; fs->fs_ckhash = ffs_calc_sbhash(fs); error = (*writefunc)(devfd, loc, fs, fs->fs_sbsize); fs->fs_si = fs_si; } #else /* _KERNEL */ fs->fs_time = time_second; fs->fs_ckhash = ffs_calc_sbhash(fs); error = (*writefunc)(devfd, loc, fs, fs->fs_sbsize); #endif /* _KERNEL */ return (error); } /* * Calculate the check-hash for a superblock. */ uint32_t ffs_calc_sbhash(struct fs *fs) { uint32_t ckhash, save_ckhash; /* * A filesystem that was using a superblock ckhash may be moved * to an older kernel that does not support ckhashes. The * older kernel will clear the FS_METACKHASH flag indicating * that it does not update hashes. When the disk is moved back * to a kernel capable of ckhashes it disables them on mount: * * if ((fs->fs_flags & FS_METACKHASH) == 0) * fs->fs_metackhash = 0; * * This leaves (fs->fs_metackhash & CK_SUPERBLOCK) == 0) with an * old stale value in the fs->fs_ckhash field. Thus the need to * just accept what is there. */ if ((fs->fs_metackhash & CK_SUPERBLOCK) == 0) return (fs->fs_ckhash); save_ckhash = fs->fs_ckhash; fs->fs_ckhash = 0; /* * If newly read from disk, the caller is responsible for * verifying that fs->fs_sbsize <= SBLOCKSIZE. */ ckhash = calculate_crc32c(~0L, (void *)fs, fs->fs_sbsize); fs->fs_ckhash = save_ckhash; return (ckhash); } /* * Update the frsum fields to reflect addition or deletion * of some frags. */ void ffs_fragacct(struct fs *fs, int fragmap, int32_t fraglist[], int cnt) { int inblk; int field, subfield; int siz, pos; inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1; fragmap <<= 1; for (siz = 1; siz < fs->fs_frag; siz++) { if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0) continue; field = around[siz]; subfield = inside[siz]; for (pos = siz; pos <= fs->fs_frag; pos++) { if ((fragmap & field) == subfield) { fraglist[siz] += cnt; pos += siz; field <<= siz; subfield <<= siz; } field <<= 1; subfield <<= 1; } } } /* * block operations * * check if a block is available */ int ffs_isblock(struct fs *fs, unsigned char *cp, ufs1_daddr_t h) { unsigned char mask; switch ((int)fs->fs_frag) { case 8: return (cp[h] == 0xff); case 4: mask = 0x0f << ((h & 0x1) << 2); return ((cp[h >> 1] & mask) == mask); case 2: mask = 0x03 << ((h & 0x3) << 1); return ((cp[h >> 2] & mask) == mask); case 1: mask = 0x01 << (h & 0x7); return ((cp[h >> 3] & mask) == mask); default: #ifdef _KERNEL panic("ffs_isblock"); #endif break; } return (0); } /* * check if a block is free */ int ffs_isfreeblock(struct fs *fs, uint8_t *cp, ufs1_daddr_t h) { switch ((int)fs->fs_frag) { case 8: return (cp[h] == 0); case 4: return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); case 2: return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); case 1: return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); default: #ifdef _KERNEL panic("ffs_isfreeblock"); #endif break; } return (0); } /* * take a block out of the map */ void ffs_clrblock(struct fs *fs, uint8_t *cp, ufs1_daddr_t h) { switch ((int)fs->fs_frag) { case 8: cp[h] = 0; return; case 4: cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); return; case 2: cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); return; case 1: cp[h >> 3] &= ~(0x01 << (h & 0x7)); return; default: #ifdef _KERNEL panic("ffs_clrblock"); #endif break; } } /* * put a block into the map */ void ffs_setblock(struct fs *fs, unsigned char *cp, ufs1_daddr_t h) { switch ((int)fs->fs_frag) { case 8: cp[h] = 0xff; return; case 4: cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); return; case 2: cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); return; case 1: cp[h >> 3] |= (0x01 << (h & 0x7)); return; default: #ifdef _KERNEL panic("ffs_setblock"); #endif break; } } /* * Update the cluster map because of an allocation or free. * * Cnt == 1 means free; cnt == -1 means allocating. */ void ffs_clusteracct(struct fs *fs, struct cg *cgp, ufs1_daddr_t blkno, int cnt) { int32_t *sump; int32_t *lp; uint8_t *freemapp, *mapp; int i, start, end, forw, back, map; uint64_t bit; if (fs->fs_contigsumsize <= 0) return; freemapp = cg_clustersfree(cgp); sump = cg_clustersum(cgp); /* * Allocate or clear the actual block. */ if (cnt > 0) setbit(freemapp, blkno); else clrbit(freemapp, blkno); /* * Find the size of the cluster going forward. */ start = blkno + 1; end = start + fs->fs_contigsumsize; if (end >= cgp->cg_nclusterblks) end = cgp->cg_nclusterblks; mapp = &freemapp[start / NBBY]; map = *mapp++; bit = 1U << (start % NBBY); for (i = start; i < end; i++) { if ((map & bit) == 0) break; if ((i & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } forw = i - start; /* * Find the size of the cluster going backward. */ start = blkno - 1; end = start - fs->fs_contigsumsize; if (end < 0) end = -1; mapp = &freemapp[start / NBBY]; map = *mapp--; bit = 1U << (start % NBBY); for (i = start; i > end; i--) { if ((map & bit) == 0) break; if ((i & (NBBY - 1)) != 0) { bit >>= 1; } else { map = *mapp--; bit = 1U << (NBBY - 1); } } back = start - i; /* * Account for old cluster and the possibly new forward and * back clusters. */ i = back + forw + 1; if (i > fs->fs_contigsumsize) i = fs->fs_contigsumsize; sump[i] += cnt; if (back > 0) sump[back] -= cnt; if (forw > 0) sump[forw] -= cnt; /* * Update cluster summary information. */ lp = &sump[fs->fs_contigsumsize]; for (i = fs->fs_contigsumsize; i > 0; i--) if (*lp-- > 0) break; fs->fs_maxcluster[cgp->cg_cgx] = i; } diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index be24ba1e733a..eadd5815eaf7 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1,2474 +1,2480 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; VFS_SMR_DECLARE; static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static int ffs_sync_lazy(struct mount *mp); static int ffs_use_bread(void *devfd, off_t loc, void **bufp, int size); static int ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = vfs_cache_root, .vfs_cachedroot = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_susp_clean = process_deferred_inactive, }; VFS_SET(ufs_vfsops, ufs, VFCF_FILEREVINC); MODULE_VERSION(ufs, 1); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, #ifdef NO_FFS_SNAPSHOT .bop_bdflush = bufbdflush, #else .bop_bdflush = ffs_bdflush, #endif }; /* * Note that userquota and groupquota options are not currently used * by UFS/FFS code and generally mount(8) does not pass those options * from userland, but they can be passed by loader(8) via * vfs.root.mountfrom.options. */ static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "groupquota", "multilabel", "nfsv4acls", "snapshot", "nosuid", "suiddir", "nosymfollow", "sync", "union", "userquota", "untrusted", NULL }; static int ffs_enxio_enable = 1; SYSCTL_DECL(_vfs_ffs); SYSCTL_INT(_vfs_ffs, OID_AUTO, enxio_enable, CTLFLAG_RWTUN, &ffs_enxio_enable, 0, "enable mapping of other disk I/O errors to ENXIO"); /* * Return buffer with the contents of block "offset" from the beginning of * directory "ip". If "res" is non-zero, fill it in with a pointer to the * remaining space in the directory. */ static int ffs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp) { struct inode *ip; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; int bsize, error; ip = VTOI(vp); fs = ITOFS(ip); lbn = lblkno(fs, offset); bsize = blksize(fs, ip, lbn); *bpp = NULL; error = bread(vp, lbn, bsize, NOCRED, &bp); if (error) { return (error); } if (res) *res = (char *)bp->b_data + blkoff(fs, offset); *bpp = bp; return (0); } /* * Load up the contents of an inode and copy the appropriate pieces * to the incore copy. */ static int ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino) { struct ufs1_dinode *dip1; struct ufs2_dinode *dip2; int error; if (I_IS_UFS1(ip)) { dip1 = ip->i_din1; *dip1 = *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); ip->i_mode = dip1->di_mode; ip->i_nlink = dip1->di_nlink; ip->i_effnlink = dip1->di_nlink; ip->i_size = dip1->di_size; ip->i_flags = dip1->di_flags; ip->i_gen = dip1->di_gen; ip->i_uid = dip1->di_uid; ip->i_gid = dip1->di_gid; + if (ffs_oldfscompat_inode_read(fs, ip->i_dp, time_second) && + fs->fs_ronly == 0) + UFS_INODE_SET_FLAG(ip, IN_MODIFIED); return (0); } dip2 = ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); if ((error = ffs_verify_dinode_ckhash(fs, dip2)) != 0 && !ffs_fsfail_cleanup(ITOUMP(ip), error)) { printf("%s: inode %jd: check-hash failed\n", fs->fs_fsmnt, (intmax_t)ino); return (error); } *ip->i_din2 = *dip2; dip2 = ip->i_din2; ip->i_mode = dip2->di_mode; ip->i_nlink = dip2->di_nlink; ip->i_effnlink = dip2->di_nlink; ip->i_size = dip2->di_size; ip->i_flags = dip2->di_flags; ip->i_gen = dip2->di_gen; ip->i_uid = dip2->di_uid; ip->i_gid = dip2->di_gid; + if (ffs_oldfscompat_inode_read(fs, ip->i_dp, time_second) && + fs->fs_ronly == 0) + UFS_INODE_SET_FLAG(ip, IN_MODIFIED); return (0); } /* * Verify that a filesystem block number is a valid data block. * This routine is only called on untrusted filesystems. */ static int ffs_check_blkno(struct mount *mp, ino_t inum, ufs2_daddr_t daddr, int blksize) { struct fs *fs; struct ufsmount *ump; ufs2_daddr_t end_daddr; int cg, havemtx; KASSERT((mp->mnt_flag & MNT_UNTRUSTED) != 0, ("ffs_check_blkno called on a trusted file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; cg = dtog(fs, daddr); end_daddr = daddr + numfrags(fs, blksize); /* * Verify that the block number is a valid data block. Also check * that it does not point to an inode block or a superblock. Accept * blocks that are unalloacted (0) or part of snapshot metadata * (BLK_NOCOPY or BLK_SNAP). * * Thus, the block must be in a valid range for the filesystem and * either in the space before a backup superblock (except the first * cylinder group where that space is used by the bootstrap code) or * after the inode blocks and before the end of the cylinder group. */ if ((uint64_t)daddr <= BLK_SNAP || ((uint64_t)end_daddr <= fs->fs_size && ((cg > 0 && end_daddr <= cgsblock(fs, cg)) || (daddr >= cgdmin(fs, cg) && end_daddr <= cgbase(fs, cg) + fs->fs_fpg)))) return (0); if ((havemtx = mtx_owned(UFS_MTX(ump))) == 0) UFS_LOCK(ump); if (ppsratecheck(&ump->um_last_integritymsg, &ump->um_secs_integritymsg, 1)) { UFS_UNLOCK(ump); uprintf("\n%s: inode %jd, out-of-range indirect block " "number %jd\n", mp->mnt_stat.f_mntonname, inum, daddr); if (havemtx) UFS_LOCK(ump); } else if (!havemtx) UFS_UNLOCK(ump); return (EINTEGRITY); } /* * On first ENXIO error, initiate an asynchronous forcible unmount. * Used to unmount filesystems whose underlying media has gone away. * * Return true if a cleanup is in progress. */ int ffs_fsfail_cleanup(struct ufsmount *ump, int error) { int retval; UFS_LOCK(ump); retval = ffs_fsfail_cleanup_locked(ump, error); UFS_UNLOCK(ump); return (retval); } int ffs_fsfail_cleanup_locked(struct ufsmount *ump, int error) { mtx_assert(UFS_MTX(ump), MA_OWNED); if (error == ENXIO && (ump->um_flags & UM_FSFAIL_CLEANUP) == 0) { ump->um_flags |= UM_FSFAIL_CLEANUP; if (ump->um_mountp == rootvnode->v_mount) panic("UFS: root fs would be forcibly unmounted"); /* * Queue an async forced unmount. */ vfs_ref(ump->um_mountp); dounmount(ump->um_mountp, MNT_FORCE | MNT_RECURSE | MNT_DEFERRED, curthread); printf("UFS: forcibly unmounting %s from %s\n", ump->um_mountp->mnt_stat.f_mntfromname, ump->um_mountp->mnt_stat.f_mntonname); } return ((ump->um_flags & UM_FSFAIL_CLEANUP) != 0); } /* * Wrapper used during ENXIO cleanup to allocate empty buffers when * the kernel is unable to read the real one. They are needed so that * the soft updates code can use them to unwind its dependencies. */ int ffs_breadz(struct ufsmount *ump, struct vnode *vp, daddr_t lblkno, daddr_t dblkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { int error; flags |= GB_CVTENXIO; error = breadn_flags(vp, lblkno, dblkno, size, rablkno, rabsize, cnt, cred, flags, ckhashfunc, bpp); if (error != 0 && ffs_fsfail_cleanup(ump, error)) { error = getblkx(vp, lblkno, dblkno, size, 0, 0, flags, bpp); KASSERT(error == 0, ("getblkx failed")); vfs_bio_bzero_buf(*bpp, 0, size); } return (error); } static int ffs_mount(struct mount *mp) { struct vnode *devvp, *odevvp; struct thread *td; struct ufsmount *ump = NULL; struct fs *fs; int error, flags; int error1 __diagused; uint64_t mntorflags, saved_mnt_flag; accmode_t accmode; struct nameidata ndp; char *fspec; bool mounted_softdep; td = curthread; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); VFS_SMR_ZONE_SET(uma_inode); } vfs_deleteopt(mp->mnt_optnew, "groupquota"); vfs_deleteopt(mp->mnt_optnew, "userquota"); fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); mntorflags = 0; if (vfs_getopt(mp->mnt_optnew, "untrusted", NULL, NULL) == 0) mntorflags |= MNT_UNTRUSTED; if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mntorflags |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) { mntorflags |= MNT_SNAPSHOT; /* * Once we have set the MNT_SNAPSHOT flag, do not * persist "snapshot" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "snapshot"); vfs_deleteopt(mp->mnt_opt, "snapshot"); } if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) { if (mntorflags & MNT_ACLS) { vfs_mount_error(mp, "\"acls\" and \"nfsv4acls\" options " "are mutually exclusive"); return (EINVAL); } mntorflags |= MNT_NFS4ACLS; } MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_FPLOOKUP; mp->mnt_flag |= mntorflags; MNT_IUNLOCK(mp); /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) { if ((mp->mnt_flag & MNT_UPDATE) == 0) return (EINVAL); return (ffs_snapshot(mp, fspec)); } /* * Must not call namei() while owning busy ref. */ if (mp->mnt_flag & MNT_UPDATE) vfs_unbusy(mp); /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec); error = namei(&ndp); if ((mp->mnt_flag & MNT_UPDATE) != 0) { /* * Unmount does not start if MNT_UPDATE is set. Mount * update busies mp before setting MNT_UPDATE. We * must be able to retain our busy ref successfully, * without sleep. */ error1 = vfs_busy(mp, MBF_NOWAIT); MPASS(error1 == 0); } if (error != 0) return (error); NDFREE_PNBUF(&ndp); if (!vn_isdisk_error(ndp.ni_vp, &error)) { vput(ndp.ni_vp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(ndp.ni_vp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(ndp.ni_vp); return (error); } /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount_alloc() populates f_mntonname for us. */ if ((mp->mnt_flag & MNT_UPDATE) == 0) { if ((error = ffs_mountfs(ndp.ni_vp, mp, td)) != 0) { vrele(ndp.ni_vp); return (error); } } else { /* * When updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ ump = VFSTOUFS(mp); fs = ump->um_fs; odevvp = ump->um_odevvp; devvp = ump->um_devvp; /* * If it's not the same vnode, or at least the same device * then it's not correct. */ if (ndp.ni_vp->v_rdev != ump->um_odevvp->v_rdev) error = EINVAL; /* needs translation */ vput(ndp.ni_vp); if (error) return (error); if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * Flush any dirty data and suspend filesystem. */ if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); fs->fs_ronly = 1; if (MOUNTEDSOFTDEP(mp)) { MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_SOFTDEP; MNT_IUNLOCK(mp); mounted_softdep = true; } else mounted_softdep = false; /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (mounted_softdep) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { fs->fs_ronly = 0; if (mounted_softdep) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_SOFTDEP; MNT_IUNLOCK(mp); } vfs_write_resume(mp, 0); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s Update error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; if (mounted_softdep) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_SOFTDEP; MNT_IUNLOCK(mp); } vfs_write_resume(mp, 0); return (error); } if (mounted_softdep) softdep_unmount(mp); g_topology_lock(); /* * Drop our write and exclusive access. */ g_access(ump->um_cp, 0, -1, -1); g_topology_unlock(); MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); /* * Allow the writers to note that filesystem * is ro now. */ vfs_write_resume(mp, 0); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, 0)) != 0) { return (error); } else { /* ffs_reload replaces the superblock structure */ fs = ump->um_fs; } if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ vn_lock(odevvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(odevvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); VOP_UNLOCK(odevvp); if (error) { return (error); } fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly " "dismounted\n", mp->mnt_stat.f_mntonname); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s.%s", mp->mnt_stat.f_mntonname, "Filesystem is not clean - run fsck", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate" " journal contents"); return (EPERM); } } g_topology_lock(); /* * Request exclusive write access. */ error = g_access(ump->um_cp, 0, 1, 1); g_topology_unlock(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); fs->fs_ronly = 0; MNT_ILOCK(mp); saved_mnt_flag = MNT_RDONLY; if (MOUNTEDSOFTDEP(mp) && (mp->mnt_flag & MNT_ASYNC) != 0) saved_mnt_flag |= MNT_ASYNC; mp->mnt_flag &= ~saved_mnt_flag; MNT_IUNLOCK(mp); fs->fs_mtime = time_second; /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= saved_mnt_flag; MNT_IUNLOCK(mp); vfs_write_resume(mp, 0); return (error); } fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 1; if ((fs->fs_flags & FS_DOSOFTDEP) != 0) softdep_unmount(mp); MNT_ILOCK(mp); mp->mnt_flag |= saved_mnt_flag; MNT_IUNLOCK(mp); vfs_write_resume(mp, 0); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vfs_write_resume(mp, 0); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (MOUNTEDSOFTDEP(mp)) { /* XXX: Reset too late ? */ MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_ASYNC; MNT_IUNLOCK(mp); } /* * Keep MNT_ACLS flag if it is stored in superblock. */ if ((fs->fs_flags & FS_ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } } MNT_ILOCK(mp); /* * This is racy versus lookup, see ufs_fplookup_vexec for details. */ if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) != 0) panic("MNTK_FPLOOKUP set on mount %p when it should not be", mp); if ((mp->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS | MNT_UNION)) == 0) mp->mnt_kern_flag |= MNTK_FPLOOKUP; MNT_IUNLOCK(mp); vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct ufs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof(args.export)); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). If the 'force' flag * is 0, the filesystem must be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) If requested, clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags * to allow secondary writers. * 4) invalidate all cached file data. * 5) re-read inode data for all active vnodes. */ int ffs_reload(struct mount *mp, int flags) { struct vnode *vp, *mvp, *devvp; struct inode *ip; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; int error; ump = VFSTOUFS(mp); MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && (flags & FFSR_FORCE) == 0) { MNT_IUNLOCK(mp); return (EINVAL); } MNT_IUNLOCK(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp); /* * Step 2: re-read superblock from disk. */ if ((error = ffs_sbget(devvp, &newfs, UFS_STDSB, 0, M_UFSMNT, ffs_use_bread)) != 0) return (error); /* * Replace our superblock with the new superblock. Preserve * our read-only status. */ fs = VFSTOUFS(mp)->um_fs; newfs->fs_ronly = fs->fs_ronly; free(fs->fs_csp, M_UFSMNT); free(fs->fs_si, M_UFSMNT); free(fs, M_UFSMNT); fs = VFSTOUFS(mp)->um_fs = newfs; ump->um_bsize = fs->fs_bsize; ump->um_maxsymlinklen = fs->fs_maxsymlinklen; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: reload pending error: blocks %jd " "files %d\n", mp->mnt_stat.f_mntonname, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: If requested, clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags * to allow secondary writers. */ if ((flags & FFSR_UNSUSPEND) != 0) { MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Skip syncer vnode. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } /* * Step 4: invalidate all cached file data. */ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { vput(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } if ((error = ffs_load_inode(bp, ip, fs, ip->i_number)) != 0) { brelse(bp); vput(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } ip->i_effnlink = ip->i_nlink; brelse(bp); vput(vp); } return (0); } /* * Common code for mount and mountroot */ static int ffs_mountfs(struct vnode *odevvp, struct mount *mp, struct thread *td) { struct ufsmount *ump; struct fs *fs; struct cdev *dev; int error, i, len, ronly; struct ucred *cred; struct g_consumer *cp; struct mount *nmp; struct vnode *devvp; int candelete, canspeedup; fs = NULL; ump = NULL; cred = td ? td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; devvp = mntfs_allocvp(mp, odevvp); KASSERT(devvp->v_type == VCHR, ("reclaimed devvp")); dev = devvp->v_rdev; KASSERT(dev->si_snapdata == NULL, ("non-NULL snapshot data")); if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { mntfs_freevp(devvp); return (EBUSY); } g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); mntfs_freevp(devvp); return (error); } dev_ref(dev); devvp->v_bufobj.bo_ops = &ffs_ops; BO_LOCK(&odevvp->v_bufobj); odevvp->v_bufobj.bo_flag |= BO_NOBUFS; BO_UNLOCK(&odevvp->v_bufobj); VOP_UNLOCK(devvp); if (dev->si_iosize_max != 0) mp->mnt_iosize_max = dev->si_iosize_max; if (mp->mnt_iosize_max > maxphys) mp->mnt_iosize_max = maxphys; if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { error = EINVAL; vfs_mount_error(mp, "Invalid sectorsize %d for superblock size %d", cp->provider->sectorsize, SBLOCKSIZE); goto out; } /* fetch the superblock and summary information */ if ((mp->mnt_flag & (MNT_ROOTFS | MNT_FORCE)) != 0) error = ffs_sbsearch(devvp, &fs, 0, M_UFSMNT, ffs_use_bread); else error = ffs_sbget(devvp, &fs, UFS_STDSB, 0, M_UFSMNT, ffs_use_bread); if (error != 0) goto out; fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly dismounted\n", mp->mnt_stat.f_mntonname); } else { vfs_mount_error(mp, "R/W mount on %s denied. " "Filesystem is not clean - run fsck.%s", mp->mnt_stat.f_mntonname, (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate journal contents"); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("WARNING: %s: lost blocks %jd files %d\n", mp->mnt_stat.f_mntonname, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: mount pending error: blocks %jd " "files %d\n", mp->mnt_stat.f_mntonname, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & FS_GJOURNAL) != 0) { #ifdef UFS_GJOURNAL /* * Get journal provider name. */ len = 1024; mp->mnt_gjprovider = malloc((uint64_t)len, M_UFSMNT, M_WAITOK); if (g_io_getattr("GJOURNAL::provider", cp, &len, mp->mnt_gjprovider) == 0) { mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, len, M_UFSMNT, M_WAITOK); MNT_ILOCK(mp); mp->mnt_flag |= MNT_GJOURNAL; MNT_IUNLOCK(mp); } else { if ((mp->mnt_flag & MNT_RDONLY) == 0) printf("WARNING: %s: GJOURNAL flag on fs " "but no gjournal provider below\n", mp->mnt_stat.f_mntonname); free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } #else printf("WARNING: %s: GJOURNAL flag on fs but no " "UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); #endif } else { mp->mnt_gjprovider = NULL; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = fs; if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; ump->um_rdonly = ffs_rdonly; ump->um_snapgone = ffs_snapgone; if ((mp->mnt_flag & MNT_UNTRUSTED) != 0) ump->um_check_blkno = ffs_check_blkno; else ump->um_check_blkno = NULL; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); sx_init(&ump->um_checkpath_lock, "uchpth"); fs->fs_ronly = ronly; fs->fs_active = NULL; mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; nmp = NULL; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { if (nmp) vfs_rel(nmp); vfs_getnewfsid(mp); } ump->um_bsize = fs->fs_bsize; ump->um_maxsymlinklen = fs->fs_maxsymlinklen; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if ((fs->fs_flags & FS_MULTILABEL) != 0) { #ifdef MAC MNT_ILOCK(mp); mp->mnt_flag |= MNT_MULTILABEL; MNT_IUNLOCK(mp); #else printf("WARNING: %s: multilabel flag on fs but " "no MAC support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_NFS4ACLS) printf("WARNING: %s: ACLs flag on fs conflicts with " "\"nfsv4acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_NFS4ACLS; mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: ACLs flag on fs but no ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_ACLS) printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts " "with \"acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_ACLS; mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: NFSv4 ACLs flag on fs but no " "ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_TRIM) != 0) { len = sizeof(int); if (g_io_getattr("GEOM::candelete", cp, &len, &candelete) == 0) { if (candelete) ump->um_flags |= UM_CANDELETE; else printf("WARNING: %s: TRIM flag on fs but disk " "does not support TRIM\n", mp->mnt_stat.f_mntonname); } else { printf("WARNING: %s: TRIM flag on fs but disk does " "not confirm that it supports TRIM\n", mp->mnt_stat.f_mntonname); } if (((ump->um_flags) & UM_CANDELETE) != 0) { ump->um_trim_tq = taskqueue_create("trim", M_WAITOK, taskqueue_thread_enqueue, &ump->um_trim_tq); taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS, "%s trim", mp->mnt_stat.f_mntonname); ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM, &ump->um_trimlisthashsize); } } len = sizeof(int); if (g_io_getattr("GEOM::canspeedup", cp, &len, &canspeedup) == 0) { if (canspeedup) ump->um_flags |= UM_CANSPEEDUP; } ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_odevvp = odevvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); mp->mnt_stat.f_iosize = fs->fs_bsize; if (mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { fs->fs_mtime = time_second; if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { ffs_flushfiles(mp, FORCECLOSE, td); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * Initialize filesystem state information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ return (0); out: if (fs != NULL) { free(fs->fs_csp, M_UFSMNT); free(fs->fs_si, M_UFSMNT); free(fs, M_UFSMNT); } if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (ump != NULL) { mtx_destroy(UFS_MTX(ump)); sx_destroy(&ump->um_checkpath_lock); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } MPASS(ump->um_softdep == NULL); free(ump, M_UFSMNT); mp->mnt_data = NULL; } BO_LOCK(&odevvp->v_bufobj); odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS; BO_UNLOCK(&odevvp->v_bufobj); atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); mntfs_freevp(devvp); dev_rel(dev); return (error); } /* * A read function for use by filesystem-layer routines. */ static int ffs_use_bread(void *devfd, off_t loc, void **bufp, int size) { struct buf *bp; int error; KASSERT(*bufp == NULL, ("ffs_use_bread: non-NULL *bufp %p\n", *bufp)); *bufp = malloc(size, M_UFSMNT, M_WAITOK); if ((error = bread((struct vnode *)devfd, btodb(loc), size, NOCRED, &bp)) != 0) return (error); bcopy(bp->b_data, *bufp, size); bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (0); } /* * unmount system call */ static int ffs_unmount(struct mount *mp, int mntflags) { struct thread *td; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags, susp; #ifdef UFS_EXTATTR int e_restart; #endif flags = 0; td = curthread; fs = ump->um_fs; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; susp = fs->fs_ronly == 0; #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("WARNING: unmount %s: ufs_extattr_stop " "returned errno %d\n", mp->mnt_stat.f_mntonname, error); e_restart = 0; } else { ufs_extattr_uepm_destroy(&ump->um_extattr); e_restart = 1; } #endif if (susp) { error = vfs_write_suspend_umnt(mp); if (error != 0) goto fail1; } if (MOUNTEDSOFTDEP(mp)) error = softdep_flushfiles(mp, flags, td); else error = ffs_flushfiles(mp, flags, td); if (error != 0 && !ffs_fsfail_cleanup(ump, error)) goto fail; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: unmount %s: pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); MPASS(ump->um_softdep == NULL); if (fs->fs_ronly == 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (ffs_fsfail_cleanup(ump, error)) error = 0; if (error != 0 && !ffs_fsfail_cleanup(ump, error)) { fs->fs_clean = 0; goto fail; } } if (susp) vfs_write_resume(mp, VR_START_WRITE); if (ump->um_trim_tq != NULL) { MPASS(ump->um_trim_inflight == 0); taskqueue_free(ump->um_trim_tq); free (ump->um_trimhash, M_TRIM); } vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); g_topology_lock(); g_vfs_close(ump->um_cp); g_topology_unlock(); BO_LOCK(&ump->um_odevvp->v_bufobj); ump->um_odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS; BO_UNLOCK(&ump->um_odevvp->v_bufobj); atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0); mntfs_freevp(ump->um_devvp); vrele(ump->um_odevvp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); sx_destroy(&ump->um_checkpath_lock); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(fs->fs_csp, M_UFSMNT); free(fs->fs_si, M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; if (td->td_su == mp) { td->td_su = NULL; vfs_rel(mp); } return (error); fail: if (susp) vfs_write_resume(mp, VR_START_WRITE); fail1: #ifdef UFS_EXTATTR if (e_restart) { ufs_extattr_uepm_init(&ump->um_extattr); #ifdef UFS_EXTATTR_AUTOSTART (void) ufs_extattr_autostart(mp, td); #endif } #endif return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(struct mount *mp, int flags, struct thread *td) { struct ufsmount *ump; int qerror, error; ump = VFSTOUFS(mp); qerror = 0; #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { error = quotaoff(td, mp, i); if (error != 0) { if ((flags & EARLYFLUSH) == 0) return (error); else qerror = error; } } /* * Here we fall through to vflush again to ensure that * we have gotten rid of all the system vnodes, unless * quotas must not be closed. */ } #endif /* devvp is not locked there */ if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); flags |= FORCECLOSE; /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Do not close system files if quotas were not closed, to be * able to sync the remaining dquots. The freeblks softupdate * workitems might hold a reference on a dquot, preventing * quotaoff() from completing. Next round of * softdep_flushworklist() iteration should process the * blockers, allowing the next run of quotaoff() to finally * flush held dquots. * * Otherwise, flush all the files. */ if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0) return (error); /* * If this is a forcible unmount and there were any files that * were unlinked but still open, then vflush() will have * truncated and freed those files, which might have started * some trim work. Wait here for any trims to complete * and process the blkfrees which follow the trims. * This may create more dirty devvp buffers and softdep deps. */ if (ump->um_trim_tq != NULL) { while (ump->um_trim_inflight != 0) pause("ufsutr", hz); taskqueue_drain_all(ump->um_trim_tq); } /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp); return (error); } /* * Get filesystem statistics. */ static int ffs_statfs(struct mount *mp, struct statfs *sbp) { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - UFS_ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = UFS_MAXNAMLEN; return (0); } static bool sync_doupdate(struct inode *ip) { return ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) != 0); } static int ffs_sync_lazy_filter(struct vnode *vp, void *arg __unused) { struct inode *ip; /* * Flags are safe to access because ->v_data invalidation * is held off by listmtx. */ if (vp->v_type == VNON) return (false); ip = VTOI(vp); if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) return (false); return (true); } /* * For a lazy sync, we only care about access times, quotas and the * superblock. Other filesystem changes are already converted to * cylinder group blocks or inode blocks updates and are written to * disk by syncer. */ static int ffs_sync_lazy(struct mount *mp) { struct vnode *mvp, *vp; struct inode *ip; int allerror, error; allerror = 0; if ((mp->mnt_flag & MNT_NOATIME) != 0) { #ifdef QUOTA qsync(mp); #endif goto sbupdate; } MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, ffs_sync_lazy_filter, NULL) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); /* * The IN_ACCESS flag is converted to IN_MODIFIED by * ufs_close() and ufs_getattr() by the calls to * ufs_itimes_locked(), without subsequent UFS_UPDATE(). * Test also all the other timestamp flags too, to pick up * any other cases that could be missed. */ if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK)) != 0) continue; #ifdef QUOTA qsyncvp(vp); #endif if (sync_doupdate(ip)) error = ffs_update(vp, 0); if (error != 0) allerror = error; vput(vp); } sbupdate: if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 && (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0) allerror = error; return (allerror); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked busy using * vfs_busy(). */ static int ffs_sync(struct mount *mp, int waitfor) { struct vnode *mvp, *vp, *devvp; struct thread *td; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, lockreq, allerror = 0; int suspend; int suspended; int secondary_writes; int secondary_accwrites; int softdep_deps; int softdep_accdeps; struct bufobj *bo; suspend = 0; suspended = 0; td = curthread; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0) panic("%s: ffs_sync: modification on read-only filesystem", fs->fs_fsmnt); if (waitfor == MNT_LAZY) { if (!rebooting) return (ffs_sync_lazy(mp)); waitfor = MNT_NOWAIT; } /* * Write back each (modified) inode. */ lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_SUSPEND) { suspend = 1; waitfor = MNT_WAIT; } if (waitfor == MNT_WAIT) lockreq = LK_EXCLUSIVE; lockreq |= LK_INTERLOCK; loop: /* Grab snapshot of secondary write counts */ MNT_ILOCK(mp); secondary_writes = mp->mnt_secondary_writes; secondary_accwrites = mp->mnt_secondary_accwrites; MNT_IUNLOCK(mp); /* Grab snapshot of softdep dependency counts */ softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Depend on the vnode interlock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } if ((error = vget(vp, lockreq)) != 0) { if (error == ENOENT) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } #ifdef QUOTA qsyncvp(vp); #endif for (;;) { error = ffs_syncvnode(vp, waitfor, 0); if (error == ERELOOKUP) continue; if (error != 0) allerror = error; break; } vput(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT || rebooting) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) goto loop; } devvp = ump->um_devvp; bo = &devvp->v_bufobj; BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, waitfor, td); VOP_UNLOCK(devvp); if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN)) error = ffs_sbupdate(ump, waitfor, 0); if (error != 0) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; if (allerror == 0 && waitfor == MNT_WAIT) goto loop; } else if (suspend != 0) { if (softdep_check_suspend(mp, devvp, softdep_deps, softdep_accdeps, secondary_writes, secondary_accwrites) != 0) { MNT_IUNLOCK(mp); goto loop; /* More work needed */ } mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); suspended = 1; } else BO_UNLOCK(bo); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; return (allerror); } int ffs_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) { return (ffs_vgetf(mp, ino, flags, vpp, 0)); } int ffs_vgetf(struct mount *mp, ino_t ino, int flags, struct vnode **vpp, int ffs_flags) { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; daddr_t dbn; int error; MPASS((ffs_flags & (FFSV_REPLACE | FFSV_REPLACE_DOOMED)) == 0 || (flags & LK_EXCLUSIVE) != 0); error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error != 0) return (error); if (*vpp != NULL) { if ((ffs_flags & FFSV_REPLACE) == 0 || ((ffs_flags & FFSV_REPLACE_DOOMED) == 0 || !VN_IS_DOOMED(*vpp))) return (0); vgone(*vpp); vput(*vpp); } /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); fs = ump->um_fs; ip = uma_zalloc_smr(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ? &ffs_vnodeops1 : &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree_smr(uma_inode, ip); return (error); } /* * FFS supports recursive locking. */ lockmgr(vp->v_vnlock, LK_EXCLUSIVE | LK_NOWITNESS, NULL); VN_LOCK_AREC(vp); vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_number = ino; ip->i_ea_refs = 0; ip->i_nextclustercg = -1; ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2; ip->i_mode = 0; /* ensure error cases below throw away vnode */ cluster_init_vn(&ip->i_clusterw); #ifdef DIAGNOSTIC ufs_init_trackers(ip); #endif #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif if (ffs_flags & FFSV_FORCEINSMQ) vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) { uma_zfree_smr(uma_inode, ip); *vpp = NULL; return (error); } vp->v_vflag &= ~VV_FORCEINSMQ; error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error != 0) return (error); if (*vpp != NULL) { /* * Calls from ffs_valloc() (i.e. FFSV_REPLACE set) * operate on empty inode, which must not be found by * other threads until fully filled. Vnode for empty * inode must be not re-inserted on the hash by other * thread, after removal by us at the beginning. */ MPASS((ffs_flags & FFSV_REPLACE) == 0); return (0); } if (I_IS_UFS1(ip)) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); if ((ffs_flags & FFSV_NEWINODE) != 0) { /* New inode, just zero out its contents. */ if (I_IS_UFS1(ip)) memset(ip->i_din1, 0, sizeof(struct ufs1_dinode)); else memset(ip->i_din2, 0, sizeof(struct ufs2_dinode)); } else { /* Read the disk contents for the inode, copy into the inode. */ dbn = fsbtodb(fs, ino_to_fsba(fs, ino)); error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); if (error != 0) { /* * The inode does not contain anything useful, so it * would be misleading to leave it on its hash chain. * With mode still zero, it will be unlinked and * returned to the free list by vput(). */ vgone(vp); vput(vp); *vpp = NULL; return (error); } if ((error = ffs_load_inode(bp, ip, fs, ino)) != 0) { bqrelse(bp); vgone(vp); vput(vp); *vpp = NULL; return (error); } bqrelse(bp); } if (DOINGSOFTDEP(vp) && (!fs->fs_ronly || (ffs_flags & FFSV_FORCEINODEDEP) != 0)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2, &vp); if (error) { vgone(vp); vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ if (vp->v_type != VFIFO) { /* FFS supports shared locking for all files except fifos. */ VN_LOCK_ASHARE(vp); } /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { while (ip->i_gen == 0) ip->i_gen = arc4random(); if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { UFS_INODE_SET_FLAG(ip, IN_MODIFIED); DIP_SET(ip, i_gen, ip->i_gen); } } #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_vnode_associate_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vgone(vp); vput(vp); *vpp = NULL; return (error); } } #endif vn_set_state(vp, VSTATE_CONSTRUCTED); *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - for UFS2 check that the inode number is initialized * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ffs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct ufid *ufhp; ufhp = (struct ufid *)fhp; return (ffs_inotovp(mp, ufhp->ufid_ino, ufhp->ufid_gen, flags, vpp, 0)); } /* * Return a vnode from a mounted filesystem for inode with specified * generation number. Return ESTALE if the inode with given generation * number no longer exists on that filesystem. */ int ffs_inotovp(struct mount *mp, ino_t ino, uint64_t gen, int lflags, struct vnode **vpp, int ffs_flags) { struct ufsmount *ump; struct vnode *nvp; struct inode *ip; struct fs *fs; struct cg *cgp; struct buf *bp; uint64_t cg; ump = VFSTOUFS(mp); fs = ump->um_fs; *vpp = NULL; if (ino < UFS_ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); /* * Need to check if inode is initialized because UFS2 does lazy * initialization and nfs_fhtovp can offer arbitrary inode numbers. */ if (fs->fs_magic == FS_UFS2_MAGIC) { cg = ino_to_cg(fs, ino); if (ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp) != 0) return (ESTALE); if (ino >= cg * fs->fs_ipg + cgp->cg_initediblk) { brelse(bp); return (ESTALE); } brelse(bp); } if (ffs_vgetf(mp, ino, lflags, &nvp, ffs_flags) != 0) return (ESTALE); ip = VTOI(nvp); if (ip->i_mode == 0 || ip->i_gen != gen || ip->i_effnlink <= 0) { if (ip->i_mode == 0) vgone(nvp); vput(nvp); return (ESTALE); } vnode_create_vobject(nvp, DIP(ip, i_size), curthread); *vpp = nvp; return (0); } /* * Initialize the filesystem. */ static int ffs_init(struct vfsconf *vfsp) { ffs_susp_initialize(); softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(struct vfsconf *vfsp) { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); ffs_susp_uninitialize(); taskqueue_drain_all(taskqueue_thread); return (ret); } /* * Structure used to pass information from ffs_sbupdate to its * helper routine ffs_use_bwrite. */ struct devfd { struct ufsmount *ump; struct buf *sbbp; int waitfor; int suspended; int error; }; /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(struct ufsmount *ump, int waitfor, int suspended) { struct fs *fs; struct buf *sbbp; struct devfd devfd; fs = ump->um_fs; if (fs->fs_ronly == 1 && (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE)) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * Initialize info needed for write function. */ devfd.ump = ump; devfd.sbbp = sbbp; devfd.waitfor = waitfor; devfd.suspended = suspended; devfd.error = 0; return (ffs_sbput(&devfd, fs, fs->fs_sblockloc, ffs_use_bwrite)); } /* * Write function for use by filesystem-layer routines. */ static int ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size) { struct devfd *devfdp; struct ufsmount *ump; struct buf *bp; struct fs *fs; int error; devfdp = devfd; ump = devfdp->ump; fs = ump->um_fs; /* * Writing the superblock summary information. */ if (loc != fs->fs_sblockloc) { bp = getblk(ump->um_devvp, btodb(loc), size, 0, 0, 0); bcopy(buf, bp->b_data, (uint64_t)size); if (devfdp->suspended) bp->b_flags |= B_VALIDSUSPWRT; if (devfdp->waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) devfdp->error = error; return (0); } /* * Writing the superblock itself. We need to do special checks for it. */ bp = devfdp->sbbp; if (ffs_fsfail_cleanup(ump, devfdp->error)) devfdp->error = 0; if (devfdp->error != 0) { brelse(bp); return (devfdp->error); } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } if (MOUNTEDSOFTDEP(ump->um_mountp)) softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); UFS_LOCK(ump); bcopy((caddr_t)fs, bp->b_data, (uint64_t)fs->fs_sbsize); UFS_UNLOCK(ump); fs = (struct fs *)bp->b_data; fs->fs_fmod = 0; ffs_oldfscompat_write(fs); fs->fs_si = NULL; /* Recalculate the superblock hash */ fs->fs_ckhash = ffs_calc_sbhash(fs); if (devfdp->suspended) bp->b_flags |= B_VALIDSUSPWRT; if (devfdp->waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) devfdp->error = error; return (devfdp->error); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree_smr(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) != 0) softdep_handle_error(bp); #endif /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* * We should mark the cylinder group buffer origbp as * dirty, to not lose the failed write. */ if ((bp->b_ioflags & BIO_ERROR) != 0) origbp->b_vflags |= BV_BKGRDERR; BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0) buf_complete(bp); #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. Clear B_IOSTARTED in case of error. */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~(B_CACHE | B_IOSTARTED); pbrelvp(bp); /* * Prevent brelse() from trying to keep and re-dirtying bp on * errors. It causes b_bufobj dereference in * bdirty()/reassignbuf(), and b_bufobj was cleared in * pbrelvp() above. */ if ((bp->b_ioflags & BIO_ERROR) != 0) bp->b_flags |= B_INVAL; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { struct buf *newbp; struct cg *cgp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (!BUF_ISLOCKED(bp)) panic("bufwrite: buffer is not busy???"); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD); if (newbp == NULL) goto normal_write; KASSERT(buf_mapped(bp), ("Unmapped cg")); memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; BO_UNLOCK(bp->b_bufobj); newbp->b_xflags |= (bp->b_xflags & BX_FSPRIV) | BX_BKGRDMARKER; newbp->b_lblkno = bp->b_lblkno; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; pbgetvp(bp->b_vp, newbp); #ifdef SOFTUPDATES /* * Move over the dependencies. If there are rollbacks, * leave the parent buffer dirtied as it will need to * be written again. */ if (LIST_EMPTY(&bp->b_dep) || softdep_move_dependencies(bp, newbp) == 0) bundirty(bp); #else bundirty(bp); #endif /* * Initiate write on the copy, release the original. The * BKGRDINPROG flag prevents it from going away until * the background write completes. We have to recalculate * its check hash in case the buffer gets freed and then * reconstituted from the buffer cache during a later read. */ if ((bp->b_xflags & BX_CYLGRP) != 0) { cgp = (struct cg *)bp->b_data; cgp->cg_ckhash = 0; cgp->cg_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); } bqrelse(bp); bp = newbp; } else /* Mark the buffer clean */ bundirty(bp); /* Let the normal bufwrite do the rest for us */ normal_write: /* * If we are writing a cylinder group, update its time. */ if ((bp->b_xflags & BX_CYLGRP) != 0) { cgp = (struct cg *)bp->b_data; cgp->cg_old_time = cgp->cg_time = time_second; } return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; struct buf *tbp; int error, nocopy; /* * This is the bufobj strategy for the private VCHR vnodes * used by FFS to access the underlying storage device. * We override the default bufobj strategy and thus bypass * VOP_STRATEGY() for these vnodes. */ vp = bo2vnode(bo); KASSERT(bp->b_vp == NULL || bp->b_vp->v_type != VCHR || bp->b_vp->v_rdev == NULL || bp->b_vp->v_rdev->si_mountpt == NULL || VFSTOUFS(bp->b_vp->v_rdev->si_mountpt) == NULL || vp == VFSTOUFS(bp->b_vp->v_rdev->si_mountpt)->um_devvp, ("ffs_geom_strategy() with wrong vp")); if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); nocopy = bp->b_flags & B_NOCOPY; bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY); if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 && vp->v_rdev->si_snapdata != NULL) { if ((bp->b_flags & B_CLUSTER) != 0) { runningbufwakeup(bp); TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { error = ffs_copyonwrite(vp, tbp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bp->b_flags &= ~B_BARRIER; bufdone(bp); return; } } (void)runningbufclaim(bp, bp->b_bufsize); } else { error = ffs_copyonwrite(vp, bp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bp->b_flags &= ~B_BARRIER; bufdone(bp); return; } } } #ifdef SOFTUPDATES if ((bp->b_flags & B_CLUSTER) != 0) { TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { if (!LIST_EMPTY(&tbp->b_dep)) buf_start(tbp); } } else { if (!LIST_EMPTY(&bp->b_dep)) buf_start(bp); } #endif /* * Check for metadata that needs check-hashes and update them. */ switch (bp->b_xflags & BX_FSPRIV) { case BX_CYLGRP: ((struct cg *)bp->b_data)->cg_ckhash = 0; ((struct cg *)bp->b_data)->cg_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); break; case BX_SUPERBLOCK: case BX_INODE: case BX_INDIR: case BX_DIR: printf("Check-hash write is unimplemented!!!\n"); break; case 0: break; default: printf("multiple buffer types 0x%b\n", (bp->b_xflags & BX_FSPRIV), PRINT_UFS_BUF_XFLAGS); break; } } if (bp->b_iocmd != BIO_READ && ffs_enxio_enable) bp->b_xflags |= BX_CVTENXIO; g_vfs_strategy(bo, bp); } int ffs_own_mount(const struct mount *mp) { if (mp->mnt_op == &ufs_vfsops) return (1); return (0); } #ifdef DDB #ifdef SOFTUPDATES /* defined in ffs_softdep.c */ extern void db_print_ffs(struct ufsmount *ump); DB_SHOW_COMMAND(ffs, db_show_ffs) { struct mount *mp; struct ufsmount *ump; if (have_addr) { ump = VFSTOUFS((struct mount *)addr); db_print_ffs(ump); return; } TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name)) db_print_ffs(VFSTOUFS(mp)); } } #endif /* SOFTUPDATES */ #endif /* DDB */ diff --git a/sys/ufs/ufs/dinode.h b/sys/ufs/ufs/dinode.h index b4117a99c262..49f7fd91aa26 100644 --- a/sys/ufs/ufs/dinode.h +++ b/sys/ufs/ufs/dinode.h @@ -1,215 +1,229 @@ /*- * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause) * * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the authors may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)dinode.h 8.3 (Berkeley) 1/21/94 */ #ifndef _UFS_UFS_DINODE_H_ #define _UFS_UFS_DINODE_H_ /* * The root inode is the root of the filesystem. Inode 0 can't be used for * normal purposes and historically bad blocks were linked to inode 1, thus * the root inode is 2. (Inode 1 is no longer used for this purpose, however * numerous dump tapes make this assumption, so we are stuck with it). */ #define UFS_ROOTINO ((ino_t)2) /* * The Whiteout inode# is a dummy non-zero inode number which will * never be allocated to a real file. It is used as a place holder * in the directory entry which has been tagged as a DT_WHT entry. * See the comments about UFS_ROOTINO above. */ #define UFS_WINO ((ino_t)1) /* * The size of physical and logical block numbers and time fields in UFS. */ typedef int32_t ufs1_daddr_t; typedef int64_t ufs2_daddr_t; typedef int64_t ufs_lbn_t; typedef int64_t ufs_time_t; /* File permissions. */ #define IEXEC 0000100 /* Executable. */ #define IWRITE 0000200 /* Writeable. */ #define IREAD 0000400 /* Readable. */ #define ISVTX 0001000 /* Sticky bit. */ #define ISGID 0002000 /* Set-gid. */ #define ISUID 0004000 /* Set-uid. */ /* File types. */ #define IFMT 0170000 /* Mask of file type. */ #define IFIFO 0010000 /* Named pipe (fifo). */ #define IFCHR 0020000 /* Character device. */ #define IFDIR 0040000 /* Directory file. */ #define IFBLK 0060000 /* Block device. */ #define IFREG 0100000 /* Regular file. */ #define IFLNK 0120000 /* Symbolic link. */ #define IFSOCK 0140000 /* UNIX domain socket. */ #define IFWHT 0160000 /* Whiteout. */ /* - * A dinode contains all the meta-data associated with a UFS2 file. - * This structure defines the on-disk format of a dinode. Since + * Each UFS filesystem version defines the on-disk format of its dinode. + * + * A UFS2 dinode contains all the meta-data associated with a UFS2 file. + * This structure defines the on-disk format of a UFS2 dinode. Since * this structure describes an on-disk structure, all its fields * are defined by types with precise widths. */ - #define UFS_NXADDR 2 /* External addresses in inode. */ #define UFS_NDADDR 12 /* Direct addresses in inode. */ #define UFS_NIADDR 3 /* Indirect addresses in inode. */ struct ufs2_dinode { uint16_t di_mode; /* 0: IFMT, permissions; see below. */ uint16_t di_nlink; /* 2: File link count. */ uint32_t di_uid; /* 4: File owner. */ uint32_t di_gid; /* 8: File group. */ uint32_t di_blksize; /* 12: Inode blocksize. */ uint64_t di_size; /* 16: File byte count. */ uint64_t di_blocks; /* 24: Blocks actually held. */ ufs_time_t di_atime; /* 32: Last access time. */ ufs_time_t di_mtime; /* 40: Last modified time. */ ufs_time_t di_ctime; /* 48: Last inode change time. */ ufs_time_t di_birthtime; /* 56: Inode creation time. */ int32_t di_mtimensec; /* 64: Last modified time. */ int32_t di_atimensec; /* 68: Last access time. */ int32_t di_ctimensec; /* 72: Last inode change time. */ int32_t di_birthnsec; /* 76: Inode creation time. */ uint32_t di_gen; /* 80: Generation number. */ uint32_t di_kernflags; /* 84: Kernel flags. */ uint32_t di_flags; /* 88: Status flags (chflags). */ uint32_t di_extsize; /* 92: External attributes size. */ ufs2_daddr_t di_extb[UFS_NXADDR];/* 96: External attributes block. */ union { struct { ufs2_daddr_t di_db /* 112: Direct disk blocks. */ [UFS_NDADDR]; ufs2_daddr_t di_ib /* 208: Indirect disk blocks. */ [UFS_NIADDR]; }; char di_shortlink /* 112: Embedded symbolic link. */ [(UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t)]; }; uint64_t di_modrev; /* 232: i_modrev for NFSv4 */ union { uint32_t di_freelink; /* 240: SUJ: Next unlinked inode. */ uint32_t di_dirdepth; /* 240: IFDIR: depth from root dir */ }; uint32_t di_ckhash; /* 244: if CK_INODE, its check-hash */ uint32_t di_spare[2]; /* 248: Reserved; currently unused */ }; /* * The di_db fields may be overlaid with other information for * file types that do not have associated disk storage. Block * and character devices overlay the first data block with their * dev_t value. Short symbolic links place their path in the * di_db area. */ #define di_rdev di_db[0] /* * A UFS1 dinode contains all the meta-data associated with a UFS1 file. * This structure defines the on-disk format of a UFS1 dinode. Since * this structure describes an on-disk structure, all its fields * are defined by types with precise widths. */ struct ufs1_dinode { uint16_t di_mode; /* 0: IFMT, permissions; see below. */ uint16_t di_nlink; /* 2: File link count. */ union { uint32_t di_freelink; /* 4: SUJ: Next unlinked inode. */ uint32_t di_dirdepth; /* 4: IFDIR: depth from root dir */ }; uint64_t di_size; /* 8: File byte count. */ - int32_t di_atime; /* 16: Last access time. */ + uint32_t di_atime; /* 16: Last access time. */ int32_t di_atimensec; /* 20: Last access time. */ - int32_t di_mtime; /* 24: Last modified time. */ + uint32_t di_mtime; /* 24: Last modified time. */ int32_t di_mtimensec; /* 28: Last modified time. */ - int32_t di_ctime; /* 32: Last inode change time. */ + uint32_t di_ctime; /* 32: Last inode change time. */ int32_t di_ctimensec; /* 36: Last inode change time. */ union { struct { ufs1_daddr_t di_db /* 40: Direct disk blocks. */ [UFS_NDADDR]; ufs1_daddr_t di_ib /* 88: Indirect disk blocks. */ [UFS_NIADDR]; }; char di_shortlink /* 40: Embedded symbolic link. */ [(UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t)]; }; uint32_t di_flags; /* 100: Status flags (chflags). */ uint32_t di_blocks; /* 104: Blocks actually held. */ uint32_t di_gen; /* 108: Generation number. */ uint32_t di_uid; /* 112: File owner. */ uint32_t di_gid; /* 116: File group. */ uint64_t di_modrev; /* 120: i_modrev for NFSv4 */ }; #define UFS_LINK_MAX 65500 /* leave a few spare for special values */ +/* + * These structures hold or reference an on-disk dinode. + */ +union dinode { + struct ufs1_dinode dp1; + struct ufs2_dinode dp2; +}; + +union dinodep { + struct ufs1_dinode *dp1; + struct ufs2_dinode *dp2; +}; + #endif /* _UFS_UFS_DINODE_H_ */ diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h index dc7e2560d6e3..607f19c5b946 100644 --- a/sys/ufs/ufs/inode.h +++ b/sys/ufs/ufs/inode.h @@ -1,317 +1,314 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)inode.h 8.9 (Berkeley) 5/14/95 */ #ifndef _UFS_UFS_INODE_H_ #define _UFS_UFS_INODE_H_ #include #include #include #include #ifdef DIAGNOSTIC #include #endif #include /* * This must agree with the definition in . */ #define doff_t int32_t #ifdef DIAGNOSTIC struct iown_tracker { struct thread *tr_owner; struct stack tr_st; struct stack tr_unlock; int tr_gen; }; #endif /* * The inode is used to describe each active (or recently active) file in the * UFS filesystem. It is composed of two types of information. The first part * is the information that is needed only while the file is active (such as * the identity of the file and linkage to speed its lookup). The second part * is the permanent meta-data associated with the file which is read in * from the permanent dinode from long term storage when the file becomes * active, and is put back when the file is no longer being used. * * An inode may only be changed while holding either the exclusive * vnode lock or the shared vnode lock and the vnode interlock. We use * the latter only for "read" and "get" operations that require * changing i_flag, or a timestamp. This locking protocol allows executing * those operations without having to upgrade the vnode lock from shared to * exclusive. */ struct inode { TAILQ_ENTRY(inode) i_nextsnap; /* Snapshot file list. */ struct vnode *i_vnode; /* Vnode associated with this inode. */ struct ufsmount *i_ump; /* Ufsmount point associated with this inode. */ struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ union { struct dirhash *dirhash; /* Hashing for large directories. */ daddr_t *snapblklist; /* Collect expunged snapshot blocks. */ } i_un; /* * The real copy of the on-disk inode. */ - union { - struct ufs1_dinode *din1; /* UFS1 on-disk dinode. */ - struct ufs2_dinode *din2; /* UFS2 on-disk dinode. */ - } dinode_u; + union dinodep i_dp; /* On-disk dinode */ ino_t i_number; /* The identity of the inode. */ uint32_t i_flag; /* flags, see below */ int32_t i_effnlink; /* i_nlink when I/O completes */ /* * Side effects; used during directory lookup. */ int32_t i_count; /* Size of free slot in directory. */ doff_t i_endoff; /* End of useful stuff in directory. */ doff_t i_diroff; /* Offset in dir, where we found last entry. */ doff_t i_offset; /* Offset of free space in directory. */ #ifdef DIAGNOSTIC int i_lock_gen; struct iown_tracker i_count_tracker; struct iown_tracker i_endoff_tracker; struct iown_tracker i_offset_tracker; #endif int i_nextclustercg; /* last cg searched for cluster */ struct vn_clusterw i_clusterw; /* Buffer clustering information */ /* * Data for extended attribute modification. */ uint8_t *i_ea_area; /* Pointer to malloced copy of EA area */ unsigned i_ea_len; /* Length of i_ea_area */ int i_ea_error; /* First errno in transaction */ int i_ea_refs; /* Number of users of EA area */ /* * Copies from the on-disk dinode itself. */ uint64_t i_size; /* File byte count. */ uint64_t i_gen; /* Generation number. */ uint32_t i_flags; /* Status flags (chflags). */ uint32_t i_uid; /* File owner. */ uint32_t i_gid; /* File group. */ int32_t i_nlink; /* File link count. */ uint16_t i_mode; /* IFMT, permissions; see below. */ }; /* * These flags are kept in i_flag. */ #define IN_ACCESS 0x0001 /* Access time update request. */ #define IN_CHANGE 0x0002 /* Inode change time update request. */ #define IN_UPDATE 0x0004 /* Modification time update request. */ #define IN_MODIFIED 0x0008 /* Inode has been modified. */ #define IN_NEEDSYNC 0x0010 /* Inode requires fsync. */ #define IN_LAZYMOD 0x0020 /* Modified, but don't write yet. */ #define IN_LAZYACCESS 0x0040 /* Process IN_ACCESS after the suspension finished */ #define IN_EA_LOCKED 0x0080 /* Extended attributes locked */ #define IN_EA_LOCKWAIT 0x0100 /* Want extended attributes lock */ #define IN_TRUNCATED 0x0200 /* Journaled truncation pending. */ #define IN_UFS2 0x0400 /* UFS2 vs UFS1 */ #define IN_IBLKDATA 0x0800 /* datasync requires inode block update */ #define IN_SIZEMOD 0x1000 /* Inode size has been modified */ #define IN_ENDOFF 0x2000 /* Free space at the end of directory, try to truncate when possible */ #define PRINT_INODE_FLAGS "\20\20b16\17b15\16b14\15sizemod" \ "\14iblkdata\13is_ufs2\12truncated\11ea_lockwait\10ea_locked" \ "\7lazyaccess\6lazymod\5needsync\4modified\3update\2change\1access" #define UFS_INODE_FLAG_LAZY_MASK \ (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE | IN_LAZYMOD | \ IN_LAZYACCESS) /* * Some flags can persist a vnode transitioning to 0 hold count and being tkaen * off the list. */ #define UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE \ (UFS_INODE_FLAG_LAZY_MASK & ~(IN_LAZYMOD | IN_LAZYACCESS)) #define UFS_INODE_SET_MODE(ip, mode) do { \ struct inode *_ip = (ip); \ int _mode = (mode); \ \ ASSERT_VOP_IN_SEQC(ITOV(_ip)); \ atomic_store_short(&(_ip)->i_mode, _mode); \ } while (0) #define UFS_INODE_SET_FLAG(ip, flags) do { \ struct inode *_ip = (ip); \ struct vnode *_vp = ITOV(_ip); \ int _flags = (flags); \ \ _ip->i_flag |= _flags; \ if (_flags & UFS_INODE_FLAG_LAZY_MASK) \ vlazy(_vp); \ } while (0) #define UFS_INODE_SET_FLAG_SHARED(ip, flags) do { \ struct inode *_ip = (ip); \ struct vnode *_vp = ITOV(_ip); \ int _flags = (flags); \ \ ASSERT_VI_UNLOCKED(_vp, __func__); \ if ((_ip->i_flag & (_flags)) != _flags) { \ VI_LOCK(_vp); \ _ip->i_flag |= _flags; \ if (_flags & UFS_INODE_FLAG_LAZY_MASK) \ vlazy(_vp); \ VI_UNLOCK(_vp); \ } \ } while (0) #define i_dirhash i_un.dirhash #define i_snapblklist i_un.snapblklist -#define i_din1 dinode_u.din1 -#define i_din2 dinode_u.din2 +#define i_din1 i_dp.dp1 +#define i_din2 i_dp.dp2 #define ITOUMP(ip) ((ip)->i_ump) #define ITODEV(ip) (ITOUMP(ip)->um_dev) #define ITODEVVP(ip) (ITOUMP(ip)->um_devvp) #define ITOFS(ip) (ITOUMP(ip)->um_fs) #define ITOVFS(ip) ((ip)->i_vnode->v_mount) #ifdef _KERNEL static inline _Bool I_IS_UFS1(const struct inode *ip) { return ((ip->i_flag & IN_UFS2) == 0); } static inline _Bool I_IS_UFS2(const struct inode *ip) { return ((ip->i_flag & IN_UFS2) != 0); } #endif /* _KERNEL */ /* * The DIP macro is used to access fields in the dinode that are * not cached in the inode itself. */ #define DIP(ip, field) (I_IS_UFS1(ip) ? (ip)->i_din1->d##field : \ (ip)->i_din2->d##field) #define DIP_SET(ip, field, val) do { \ if (I_IS_UFS1(ip)) \ (ip)->i_din1->d##field = (val); \ else \ (ip)->i_din2->d##field = (val); \ } while (0) #define DIP_SET_NLINK(ip, val) do { \ KASSERT(ip->i_nlink >= 0, ("%s:%d %s(): setting negative " \ "nlink value %d for inode %jd\n", __FILE__, __LINE__, \ __FUNCTION__, (ip)->i_nlink, (ip)->i_number)); \ DIP_SET(ip, i_nlink, val); \ } while (0) #define IS_SNAPSHOT(ip) ((ip)->i_flags & SF_SNAPSHOT) #define IS_UFS(vp) ((vp)->v_data != NULL) /* * Structure used to pass around logical block paths generated by * ufs_getlbns and used by truncate and bmap code. */ struct indir { ufs2_daddr_t in_lbn; /* Logical block number. */ int in_off; /* Offset in buffer. */ }; /* Convert between inode pointers and vnode pointers. */ #define VTOI(vp) ((struct inode *)(vp)->v_data) #define VTOI_SMR(vp) ((struct inode *)vn_load_v_data_smr(vp)) #define ITOV(ip) ((ip)->i_vnode) /* Determine if soft dependencies are being done */ #define MOUNTEDSOFTDEP(mp) (((mp)->mnt_flag & MNT_SOFTDEP) != 0) #define DOINGSOFTDEP(vp) MOUNTEDSOFTDEP((vp)->v_mount) #define MOUNTEDSUJ(mp) (((mp)->mnt_flag & (MNT_SOFTDEP | MNT_SUJ)) == \ (MNT_SOFTDEP | MNT_SUJ)) #define DOINGSUJ(vp) MOUNTEDSUJ((vp)->v_mount) /* This overlays the fid structure (see mount.h). */ struct ufid { uint16_t ufid_len; /* Length of structure. */ uint16_t ufid_pad; /* Force 32-bit alignment. */ uint32_t ufid_ino; /* File number (ino). */ uint32_t ufid_gen; /* Generation number. */ }; #ifdef _KERNEL #ifdef DIAGNOSTIC void ufs_init_trackers(struct inode *ip); void ufs_unlock_tracker(struct inode *ip); doff_t ufs_get_i_offset(struct inode *ip, const char *file, int line); void ufs_set_i_offset(struct inode *ip, doff_t off, const char *file, int line); #define I_OFFSET(ip) ufs_get_i_offset(ip, __FILE__, __LINE__) #define SET_I_OFFSET(ip, off) ufs_set_i_offset(ip, off, __FILE__, __LINE__) int32_t ufs_get_i_count(struct inode *ip, const char *file, int line); void ufs_set_i_count(struct inode *ip, int32_t cnt, const char *file, int line); #define I_COUNT(ip) ufs_get_i_count(ip, __FILE__, __LINE__) #define SET_I_COUNT(ip, cnt) ufs_set_i_count(ip, cnt, __FILE__, __LINE__) doff_t ufs_get_i_endoff(struct inode *ip, const char *file, int line); void ufs_set_i_endoff(struct inode *ip, doff_t off, const char *file, int line); #define I_ENDOFF(ip) ufs_get_i_endoff(ip, __FILE__, __LINE__) #define SET_I_ENDOFF(ip, off) ufs_set_i_endoff(ip, off, __FILE__, __LINE__) #else #define I_OFFSET(ip) ((ip)->i_offset) #define SET_I_OFFSET(ip, off) ((ip)->i_offset = (off)) #define I_COUNT(ip) ((ip)->i_count) #define SET_I_COUNT(ip, cnt) ((ip)->i_count = cnt) #define I_ENDOFF(ip) ((ip)->i_endoff) #define SET_I_ENDOFF(ip, off) ((ip)->i_endoff = off) #endif #endif /* _KERNEL */ #endif /* !_UFS_UFS_INODE_H_ */ diff --git a/usr.sbin/makefs/ffs.c b/usr.sbin/makefs/ffs.c index 97c1d6ccefda..8c4b790f53b0 100644 --- a/usr.sbin/makefs/ffs.c +++ b/usr.sbin/makefs/ffs.c @@ -1,1205 +1,1205 @@ /* $NetBSD: ffs.c,v 1.45 2011/10/09 22:49:26 christos Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2001 Wasabi Systems, Inc. * All rights reserved. * * Written by Luke Mewburn for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95 */ #include #if HAVE_NBTOOL_CONFIG_H #include "nbtool_config.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #if HAVE_STRUCT_STATVFS_F_IOSIZE && HAVE_FSTATVFS #include #endif #include #include #include #include "ffs/ufs_bswap.h" #include "ffs/ufs_inode.h" #include "ffs/newfs_extern.h" #include "ffs/ffs_extern.h" #undef clrbuf #include "makefs.h" #include "ffs.h" #undef DIP #define DIP(dp, field) \ ((ffs_opts->version == 1) ? \ - (dp)->ffs1_din.di_##field : (dp)->ffs2_din.di_##field) + (dp)->dp1.di_##field : (dp)->dp2.di_##field) /* * Various file system defaults (cribbed from newfs(8)). */ #define DFL_FRAGSIZE 4096 /* fragment size */ #define DFL_BLKSIZE 32768 /* block size */ #define DFL_SECSIZE 512 /* sector size */ #define DFL_CYLSPERGROUP 65536 /* cylinders per group */ #define DFL_FRAGSPERINODE 4 /* fragments per inode */ #define DFL_ROTDELAY 0 /* rotational delay */ #define DFL_NRPOS 1 /* rotational positions */ #define DFL_RPM 3600 /* rpm of disk */ #define DFL_NSECTORS 64 /* # of sectors */ #define DFL_NTRACKS 16 /* # of tracks */ typedef struct { u_char *buf; /* buf for directory */ doff_t size; /* full size of buf */ doff_t cur; /* offset of current entry */ } dirbuf_t; static int ffs_create_image(const char *, fsinfo_t *); static void ffs_dump_fsinfo(fsinfo_t *); static void ffs_dump_dirbuf(dirbuf_t *, const char *, int); static void ffs_make_dirbuf(dirbuf_t *, const char *, fsnode *, int); static int ffs_populate_dir(const char *, fsnode *, fsinfo_t *); static void ffs_size_dir(fsnode *, fsinfo_t *); static void ffs_validate(const char *, fsnode *, fsinfo_t *); static void ffs_write_file(union dinode *, uint32_t, void *, fsinfo_t *); static void ffs_write_inode(union dinode *, uint32_t, const fsinfo_t *); static void *ffs_build_dinode1(struct ufs1_dinode *, dirbuf_t *, fsnode *, fsnode *, fsinfo_t *); static void *ffs_build_dinode2(struct ufs2_dinode *, dirbuf_t *, fsnode *, fsnode *, fsinfo_t *); /* publicly visible functions */ void ffs_prep_opts(fsinfo_t *fsopts) { ffs_opt_t *ffs_opts = ecalloc(1, sizeof(*ffs_opts)); const option_t ffs_options[] = { { 'b', "bsize", &ffs_opts->bsize, OPT_INT32, 1, INT_MAX, "block size" }, { 'f', "fsize", &ffs_opts->fsize, OPT_INT32, 1, INT_MAX, "fragment size" }, { 'd', "density", &ffs_opts->density, OPT_INT32, 1, INT_MAX, "bytes per inode" }, { 'm', "minfree", &ffs_opts->minfree, OPT_INT32, 0, 99, "minfree" }, { 'M', "maxbpg", &ffs_opts->maxbpg, OPT_INT32, 1, INT_MAX, "max blocks per file in a cg" }, { 'a', "avgfilesize", &ffs_opts->avgfilesize, OPT_INT32, 1, INT_MAX, "expected average file size" }, { 'n', "avgfpdir", &ffs_opts->avgfpdir, OPT_INT32, 1, INT_MAX, "expected # of files per directory" }, { 'x', "extent", &ffs_opts->maxbsize, OPT_INT32, 1, INT_MAX, "maximum # extent size" }, { 'g', "maxbpcg", &ffs_opts->maxblkspercg, OPT_INT32, 1, INT_MAX, "max # of blocks per group" }, { 'v', "version", &ffs_opts->version, OPT_INT32, 1, 2, "UFS version" }, { 'o', "optimization", NULL, OPT_STRBUF, 0, 0, "Optimization (time|space)" }, { 'l', "label", ffs_opts->label, OPT_STRARRAY, 1, sizeof(ffs_opts->label), "UFS label" }, { 's', "softupdates", &ffs_opts->softupdates, OPT_INT32, 0, 1, "enable softupdates" }, { .name = NULL } }; ffs_opts->bsize= -1; ffs_opts->fsize= -1; ffs_opts->cpg= -1; ffs_opts->density= -1; ffs_opts->min_inodes= false; ffs_opts->minfree= -1; ffs_opts->optimization= -1; ffs_opts->maxcontig= -1; ffs_opts->maxbpg= -1; ffs_opts->avgfilesize= -1; ffs_opts->avgfpdir= -1; ffs_opts->version = 1; ffs_opts->softupdates = 0; fsopts->fs_specific = ffs_opts; fsopts->fs_options = copy_opts(ffs_options); } void ffs_cleanup_opts(fsinfo_t *fsopts) { free(fsopts->fs_specific); free(fsopts->fs_options); } int ffs_parse_opts(const char *option, fsinfo_t *fsopts) { ffs_opt_t *ffs_opts = fsopts->fs_specific; option_t *ffs_options = fsopts->fs_options; char buf[1024]; int rv; assert(option != NULL); assert(fsopts != NULL); assert(ffs_opts != NULL); if (debug & DEBUG_FS_PARSE_OPTS) printf("ffs_parse_opts: got `%s'\n", option); rv = set_option(ffs_options, option, buf, sizeof(buf)); if (rv == -1) return 0; if (ffs_options[rv].name == NULL) abort(); switch (ffs_options[rv].letter) { case 'o': if (strcmp(buf, "time") == 0) { ffs_opts->optimization = FS_OPTTIME; } else if (strcmp(buf, "space") == 0) { ffs_opts->optimization = FS_OPTSPACE; } else { warnx("Invalid optimization `%s'", buf); return 0; } break; default: break; } return 1; } void ffs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts) { struct fs *superblock; struct timeval start; assert(image != NULL); assert(dir != NULL); assert(root != NULL); assert(fsopts != NULL); if (debug & DEBUG_FS_MAKEFS) printf("ffs_makefs: image %s directory %s root %p\n", image, dir, root); /* if user wants no free space, use minimum number of inodes */ if (fsopts->minsize == 0 && fsopts->freeblockpc == 0 && fsopts->freeblocks == 0) ((ffs_opt_t *)fsopts->fs_specific)->min_inodes = true; /* validate tree and options */ TIMER_START(start); ffs_validate(dir, root, fsopts); TIMER_RESULTS(start, "ffs_validate"); printf("Calculated size of `%s': %lld bytes, %lld inodes\n", image, (long long)fsopts->size, (long long)fsopts->inodes); /* create image */ TIMER_START(start); if (ffs_create_image(image, fsopts) == -1) errx(1, "Image file `%s' not created.", image); TIMER_RESULTS(start, "ffs_create_image"); fsopts->curinode = UFS_ROOTINO; if (debug & DEBUG_FS_MAKEFS) putchar('\n'); /* populate image */ printf("Populating `%s'\n", image); TIMER_START(start); if (! ffs_populate_dir(dir, root, fsopts)) errx(1, "Image file `%s' not populated.", image); TIMER_RESULTS(start, "ffs_populate_dir"); /* ensure no outstanding buffers remain */ if (debug & DEBUG_FS_MAKEFS) bcleanup(); /* update various superblock parameters */ superblock = fsopts->superblock; superblock->fs_fmod = 0; superblock->fs_old_cstotal.cs_ndir = superblock->fs_cstotal.cs_ndir; superblock->fs_old_cstotal.cs_nbfree = superblock->fs_cstotal.cs_nbfree; superblock->fs_old_cstotal.cs_nifree = superblock->fs_cstotal.cs_nifree; superblock->fs_old_cstotal.cs_nffree = superblock->fs_cstotal.cs_nffree; /* write out superblock; image is now complete */ ffs_write_superblock(fsopts->superblock, fsopts); if (close(fsopts->fd) == -1) err(1, "Closing `%s'", image); fsopts->fd = -1; printf("Image `%s' complete\n", image); } /* end of public functions */ static void ffs_validate(const char *dir, fsnode *root, fsinfo_t *fsopts) { #ifdef notyet int32_t spc, nspf, ncyl, fssize; #endif ffs_opt_t *ffs_opts = fsopts->fs_specific; assert(dir != NULL); assert(root != NULL); assert(fsopts != NULL); assert(ffs_opts != NULL); if (debug & DEBUG_FS_VALIDATE) { printf("ffs_validate: before defaults set:\n"); ffs_dump_fsinfo(fsopts); } /* set FFS defaults */ if (fsopts->sectorsize == -1) fsopts->sectorsize = DFL_SECSIZE; if (fsopts->sectorsize != DFL_SECSIZE) warnx("sectorsize %d may produce nonfunctional image", fsopts->sectorsize); if (ffs_opts->fsize == -1) ffs_opts->fsize = MAX(DFL_FRAGSIZE, fsopts->sectorsize); if (ffs_opts->bsize == -1) ffs_opts->bsize = MIN(DFL_BLKSIZE, 8 * ffs_opts->fsize); if (ffs_opts->cpg == -1) ffs_opts->cpg = DFL_CYLSPERGROUP; else ffs_opts->cpgflg = 1; /* fsopts->density is set below */ if (ffs_opts->nsectors == -1) ffs_opts->nsectors = DFL_NSECTORS; if (ffs_opts->minfree == -1) ffs_opts->minfree = MINFREE; if (ffs_opts->optimization == -1) ffs_opts->optimization = DEFAULTOPT; if (ffs_opts->maxcontig == -1) ffs_opts->maxcontig = MAX(1, MIN(MAXPHYS, FFS_MAXBSIZE) / ffs_opts->bsize); /* XXX ondisk32 */ if (ffs_opts->maxbpg == -1) ffs_opts->maxbpg = ffs_opts->bsize / sizeof(int32_t); if (ffs_opts->avgfilesize == -1) ffs_opts->avgfilesize = AVFILESIZ; if (ffs_opts->avgfpdir == -1) ffs_opts->avgfpdir = AFPDIR; if (fsopts->maxsize > 0 && roundup(fsopts->minsize, ffs_opts->bsize) > fsopts->maxsize) errx(1, "`%s' minsize of %lld rounded up to ffs bsize of %d " "exceeds maxsize %lld. Lower bsize, or round the minimum " "and maximum sizes to bsize.", dir, (long long)fsopts->minsize, ffs_opts->bsize, (long long)fsopts->maxsize); /* calculate size of tree */ ffs_size_dir(root, fsopts); fsopts->inodes += UFS_ROOTINO; /* include first two inodes */ if (debug & DEBUG_FS_VALIDATE) printf("ffs_validate: size of tree: %lld bytes, %lld inodes\n", (long long)fsopts->size, (long long)fsopts->inodes); /* add requested slop */ fsopts->size += fsopts->freeblocks; fsopts->inodes += fsopts->freefiles; if (fsopts->freefilepc > 0) fsopts->inodes = fsopts->inodes * (100 + fsopts->freefilepc) / 100; if (fsopts->freeblockpc > 0) fsopts->size = fsopts->size * (100 + fsopts->freeblockpc) / 100; /* * Add space needed for superblock, cylblock and to store inodes. * All of those segments are aligned to the block size. * XXX: This has to match calculations done in ffs_mkfs. */ if (ffs_opts->version == 1) { fsopts->size += roundup(SBLOCK_UFS1 + SBLOCKSIZE, ffs_opts->bsize); fsopts->size += roundup(SBLOCKSIZE, ffs_opts->bsize); fsopts->size += ffs_opts->bsize; fsopts->size += DINODE1_SIZE * roundup(fsopts->inodes, ffs_opts->bsize / DINODE1_SIZE); } else { fsopts->size += roundup(SBLOCK_UFS2 + SBLOCKSIZE, ffs_opts->bsize); fsopts->size += roundup(SBLOCKSIZE, ffs_opts->bsize); fsopts->size += ffs_opts->bsize; fsopts->size += DINODE2_SIZE * roundup(fsopts->inodes, ffs_opts->bsize / DINODE2_SIZE); } /* add minfree */ if (ffs_opts->minfree > 0) fsopts->size = fsopts->size * (100 + ffs_opts->minfree) / 100; /* * XXX any other fs slop to add, such as csum's, bitmaps, etc ?? */ if (fsopts->size < fsopts->minsize) /* ensure meets minimum size */ fsopts->size = fsopts->minsize; /* round up to the next block */ fsopts->size = roundup(fsopts->size, ffs_opts->bsize); /* round up to requested block size, if any */ if (fsopts->roundup > 0) fsopts->size = roundup(fsopts->size, fsopts->roundup); /* calculate density to just fit inodes if no free space */ if (ffs_opts->density == -1) ffs_opts->density = fsopts->size / fsopts->inodes + 1; if (debug & DEBUG_FS_VALIDATE) { printf("ffs_validate: after defaults set:\n"); ffs_dump_fsinfo(fsopts); printf("ffs_validate: dir %s; %lld bytes, %lld inodes\n", dir, (long long)fsopts->size, (long long)fsopts->inodes); } /* now check calculated sizes vs requested sizes */ if (fsopts->maxsize > 0 && fsopts->size > fsopts->maxsize) { errx(1, "`%s' size of %lld is larger than the maxsize of %lld.", dir, (long long)fsopts->size, (long long)fsopts->maxsize); } } static void ffs_dump_fsinfo(fsinfo_t *f) { ffs_opt_t *fs = f->fs_specific; printf("fsopts at %p\n", f); printf("\tsize %lld, inodes %lld, curinode %u\n", (long long)f->size, (long long)f->inodes, f->curinode); printf("\tminsize %lld, maxsize %lld\n", (long long)f->minsize, (long long)f->maxsize); printf("\tfree files %lld, freefile %% %d\n", (long long)f->freefiles, f->freefilepc); printf("\tfree blocks %lld, freeblock %% %d\n", (long long)f->freeblocks, f->freeblockpc); printf("\tneedswap %d, sectorsize %d\n", f->needswap, f->sectorsize); printf("\tbsize %d, fsize %d, cpg %d, density %d\n", fs->bsize, fs->fsize, fs->cpg, fs->density); printf("\tnsectors %d, rpm %d, minfree %d\n", fs->nsectors, fs->rpm, fs->minfree); printf("\tmaxcontig %d, maxbpg %d\n", fs->maxcontig, fs->maxbpg); printf("\toptimization %s\n", fs->optimization == FS_OPTSPACE ? "space" : "time"); } static int ffs_create_image(const char *image, fsinfo_t *fsopts) { #if HAVE_STRUCT_STATVFS_F_IOSIZE && HAVE_FSTATVFS struct statvfs sfs; #endif struct fs *fs; char *buf; int i, bufsize; off_t bufrem; int oflags = O_RDWR | O_CREAT; time_t tstamp; assert (image != NULL); assert (fsopts != NULL); /* create image */ if (fsopts->offset == 0) oflags |= O_TRUNC; if ((fsopts->fd = open(image, oflags, 0666)) == -1) { warn("Can't open `%s' for writing", image); return (-1); } /* zero image */ #if HAVE_STRUCT_STATVFS_F_IOSIZE && HAVE_FSTATVFS if (fstatvfs(fsopts->fd, &sfs) == -1) { #endif bufsize = 8192; #if HAVE_STRUCT_STATVFS_F_IOSIZE && HAVE_FSTATVFS warn("can't fstatvfs `%s', using default %d byte chunk", image, bufsize); } else bufsize = sfs.f_iosize; #endif bufrem = fsopts->size; if (fsopts->sparse) { if (ftruncate(fsopts->fd, bufrem) == -1) { warn("sparse option disabled."); fsopts->sparse = 0; } } if (fsopts->sparse) { /* File truncated at bufrem. Remaining is 0 */ bufrem = 0; buf = NULL; } else { if (debug & DEBUG_FS_CREATE_IMAGE) printf("zero-ing image `%s', %lld sectors, " "using %d byte chunks\n", image, (long long)bufrem, bufsize); buf = ecalloc(1, bufsize); } if (fsopts->offset != 0) if (lseek(fsopts->fd, fsopts->offset, SEEK_SET) == -1) { warn("can't seek"); free(buf); return -1; } while (bufrem > 0) { i = write(fsopts->fd, buf, MIN(bufsize, bufrem)); if (i == -1) { warn("zeroing image, %lld bytes to go", (long long)bufrem); free(buf); return (-1); } bufrem -= i; } if (buf) free(buf); /* make the file system */ if (debug & DEBUG_FS_CREATE_IMAGE) printf("calling mkfs(\"%s\", ...)\n", image); if (stampst.st_ino != 0) tstamp = stampst.st_ctime; else tstamp = start_time.tv_sec; srandom(tstamp); fs = ffs_mkfs(image, fsopts, tstamp); fsopts->superblock = (void *)fs; if (debug & DEBUG_FS_CREATE_IMAGE) { time_t t; t = (time_t)((struct fs *)fsopts->superblock)->fs_time; printf("mkfs returned %p; fs_time %s", fsopts->superblock, ctime(&t)); printf("fs totals: nbfree %lld, nffree %lld, nifree %lld, ndir %lld\n", (long long)fs->fs_cstotal.cs_nbfree, (long long)fs->fs_cstotal.cs_nffree, (long long)fs->fs_cstotal.cs_nifree, (long long)fs->fs_cstotal.cs_ndir); } if (fs->fs_cstotal.cs_nifree + (off_t)UFS_ROOTINO < fsopts->inodes) { warnx( "Image file `%s' has %lld free inodes; %lld are required.", image, (long long)(fs->fs_cstotal.cs_nifree + UFS_ROOTINO), (long long)fsopts->inodes); return (-1); } return (fsopts->fd); } static void ffs_size_dir(fsnode *root, fsinfo_t *fsopts) { struct direct tmpdir; fsnode * node; int curdirsize, this; ffs_opt_t *ffs_opts = fsopts->fs_specific; /* node may be NULL (empty directory) */ assert(fsopts != NULL); assert(ffs_opts != NULL); if (debug & DEBUG_FS_SIZE_DIR) printf("ffs_size_dir: entry: bytes %lld inodes %lld\n", (long long)fsopts->size, (long long)fsopts->inodes); #define ADDDIRENT(e) do { \ tmpdir.d_namlen = strlen((e)); \ this = DIRSIZ_SWAP(0, &tmpdir, 0); \ if (debug & DEBUG_FS_SIZE_DIR_ADD_DIRENT) \ printf("ADDDIRENT: was: %s (%d) this %d cur %d\n", \ e, tmpdir.d_namlen, this, curdirsize); \ if (this + curdirsize > roundup(curdirsize, DIRBLKSIZ)) \ curdirsize = roundup(curdirsize, DIRBLKSIZ); \ curdirsize += this; \ if (debug & DEBUG_FS_SIZE_DIR_ADD_DIRENT) \ printf("ADDDIRENT: now: %s (%d) this %d cur %d\n", \ e, tmpdir.d_namlen, this, curdirsize); \ } while (0); #define ADDSIZE(x) do { \ if ((size_t)(x) < UFS_NDADDR * (size_t)ffs_opts->bsize) { \ fsopts->size += roundup((x), ffs_opts->fsize); \ } else { \ /* Count space consumed by indirecttion blocks. */ \ fsopts->size += ffs_opts->bsize * \ (howmany((x), UFS_NDADDR * ffs_opts->bsize) - 1); \ /* \ * If the file is big enough to use indirect blocks, \ * we allocate bsize block for trailing data. \ */ \ fsopts->size += roundup((x), ffs_opts->bsize); \ } \ } while (0); curdirsize = 0; for (node = root; node != NULL; node = node->next) { ADDDIRENT(node->name); if (node == root) { /* we're at "." */ assert(strcmp(node->name, ".") == 0); ADDDIRENT(".."); } else if ((node->inode->flags & FI_SIZED) == 0) { /* don't count duplicate names */ node->inode->flags |= FI_SIZED; if (debug & DEBUG_FS_SIZE_DIR_NODE) printf("ffs_size_dir: `%s' size %lld\n", node->name, (long long)node->inode->st.st_size); fsopts->inodes++; if (node->type == S_IFREG) ADDSIZE(node->inode->st.st_size); if (node->type == S_IFLNK) { size_t slen; slen = strlen(node->symlink) + 1; if (slen >= (ffs_opts->version == 1 ? UFS1_MAXSYMLINKLEN : UFS2_MAXSYMLINKLEN)) ADDSIZE(slen); } } if (node->type == S_IFDIR) ffs_size_dir(node->child, fsopts); } ADDSIZE(curdirsize); if (debug & DEBUG_FS_SIZE_DIR) printf("ffs_size_dir: exit: size %lld inodes %lld\n", (long long)fsopts->size, (long long)fsopts->inodes); } static void * ffs_build_dinode1(struct ufs1_dinode *dinp, dirbuf_t *dbufp, fsnode *cur, fsnode *root, fsinfo_t *fsopts) { size_t slen; void *membuf; struct stat *st = stampst.st_ino != 0 ? &stampst : &cur->inode->st; memset(dinp, 0, sizeof(*dinp)); dinp->di_mode = cur->inode->st.st_mode; dinp->di_nlink = cur->inode->nlink; dinp->di_size = cur->inode->st.st_size; #if HAVE_STRUCT_STAT_ST_FLAGS dinp->di_flags = cur->inode->st.st_flags; #endif dinp->di_gen = random(); dinp->di_uid = cur->inode->st.st_uid; dinp->di_gid = cur->inode->st.st_gid; dinp->di_atime = st->st_atime; dinp->di_mtime = st->st_mtime; dinp->di_ctime = st->st_ctime; #if HAVE_STRUCT_STAT_ST_MTIMENSEC dinp->di_atimensec = st->st_atimensec; dinp->di_mtimensec = st->st_mtimensec; dinp->di_ctimensec = st->st_ctimensec; #endif /* not set: di_db, di_ib, di_blocks, di_spare */ membuf = NULL; if (cur == root) { /* "."; write dirbuf */ membuf = dbufp->buf; dinp->di_size = dbufp->size; } else if (S_ISBLK(cur->type) || S_ISCHR(cur->type)) { dinp->di_size = 0; /* a device */ dinp->di_rdev = ufs_rw32(cur->inode->st.st_rdev, fsopts->needswap); } else if (S_ISLNK(cur->type)) { /* symlink */ slen = strlen(cur->symlink); if (slen < UFS1_MAXSYMLINKLEN) { /* short link */ memcpy(dinp->di_shortlink, cur->symlink, slen); } else membuf = cur->symlink; dinp->di_size = slen; } return membuf; } static void * ffs_build_dinode2(struct ufs2_dinode *dinp, dirbuf_t *dbufp, fsnode *cur, fsnode *root, fsinfo_t *fsopts) { size_t slen; void *membuf; struct stat *st = stampst.st_ino != 0 ? &stampst : &cur->inode->st; memset(dinp, 0, sizeof(*dinp)); dinp->di_mode = cur->inode->st.st_mode; dinp->di_nlink = cur->inode->nlink; dinp->di_size = cur->inode->st.st_size; #if HAVE_STRUCT_STAT_ST_FLAGS dinp->di_flags = cur->inode->st.st_flags; #endif dinp->di_gen = random(); dinp->di_uid = cur->inode->st.st_uid; dinp->di_gid = cur->inode->st.st_gid; dinp->di_atime = st->st_atime; dinp->di_mtime = st->st_mtime; dinp->di_ctime = st->st_ctime; #if HAVE_STRUCT_STAT_BIRTHTIME dinp->di_birthtime = st->st_birthtime; #else dinp->di_birthtime = st->st_ctime; #endif #if HAVE_STRUCT_STAT_ST_MTIMENSEC dinp->di_atimensec = st->st_atimensec; dinp->di_mtimensec = st->st_mtimensec; dinp->di_ctimensec = st->st_ctimensec; #if HAVE_STRUCT_STAT_BIRTHTIME dinp->di_birthnsec = st->st_birthtimensec; #else dinp->di_birthnsec = st->st_ctimensec; #endif #endif /* not set: di_db, di_ib, di_blocks, di_spare */ membuf = NULL; if (cur == root) { /* "."; write dirbuf */ membuf = dbufp->buf; dinp->di_size = dbufp->size; } else if (S_ISBLK(cur->type) || S_ISCHR(cur->type)) { dinp->di_size = 0; /* a device */ dinp->di_rdev = ufs_rw64(cur->inode->st.st_rdev, fsopts->needswap); } else if (S_ISLNK(cur->type)) { /* symlink */ slen = strlen(cur->symlink); if (slen < UFS2_MAXSYMLINKLEN) { /* short link */ memcpy(dinp->di_shortlink, cur->symlink, slen); } else membuf = cur->symlink; dinp->di_size = slen; } return membuf; } static int ffs_populate_dir(const char *dir, fsnode *root, fsinfo_t *fsopts) { fsnode *cur; dirbuf_t dirbuf; union dinode din; void *membuf; char path[MAXPATHLEN + 1]; ffs_opt_t *ffs_opts = fsopts->fs_specific; assert(dir != NULL); assert(root != NULL); assert(fsopts != NULL); assert(ffs_opts != NULL); (void)memset(&dirbuf, 0, sizeof(dirbuf)); if (debug & DEBUG_FS_POPULATE) printf("ffs_populate_dir: PASS 1 dir %s node %p\n", dir, root); /* * pass 1: allocate inode numbers, build directory `file' */ for (cur = root; cur != NULL; cur = cur->next) { if ((cur->inode->flags & FI_ALLOCATED) == 0) { cur->inode->flags |= FI_ALLOCATED; if (cur == root && cur->parent != NULL) cur->inode->ino = cur->parent->inode->ino; else { cur->inode->ino = fsopts->curinode; fsopts->curinode++; } } ffs_make_dirbuf(&dirbuf, cur->name, cur, fsopts->needswap); if (cur == root) { /* we're at "."; add ".." */ ffs_make_dirbuf(&dirbuf, "..", cur->parent == NULL ? cur : cur->parent->first, fsopts->needswap); root->inode->nlink++; /* count my parent's link */ } else if (cur->child != NULL) root->inode->nlink++; /* count my child's link */ /* * XXX possibly write file and long symlinks here, * ensuring that blocks get written before inodes? * otoh, this isn't a real filesystem, so who * cares about ordering? :-) */ } if (debug & DEBUG_FS_POPULATE_DIRBUF) ffs_dump_dirbuf(&dirbuf, dir, fsopts->needswap); /* * pass 2: write out dirbuf, then non-directories at this level */ if (debug & DEBUG_FS_POPULATE) printf("ffs_populate_dir: PASS 2 dir %s\n", dir); for (cur = root; cur != NULL; cur = cur->next) { if (cur->inode->flags & FI_WRITTEN) continue; /* skip hard-linked entries */ cur->inode->flags |= FI_WRITTEN; if (cur->contents == NULL) { if (snprintf(path, sizeof(path), "%s/%s/%s", cur->root, cur->path, cur->name) >= (int)sizeof(path)) errx(1, "Pathname too long."); } if (cur->child != NULL) continue; /* child creates own inode */ /* build on-disk inode */ if (ffs_opts->version == 1) - membuf = ffs_build_dinode1(&din.ffs1_din, &dirbuf, cur, + membuf = ffs_build_dinode1(&din.dp1, &dirbuf, cur, root, fsopts); else - membuf = ffs_build_dinode2(&din.ffs2_din, &dirbuf, cur, + membuf = ffs_build_dinode2(&din.dp2, &dirbuf, cur, root, fsopts); if (debug & DEBUG_FS_POPULATE_NODE) { printf("ffs_populate_dir: writing ino %d, %s", cur->inode->ino, inode_type(cur->type)); if (cur->inode->nlink > 1) printf(", nlink %d", cur->inode->nlink); putchar('\n'); } if (membuf != NULL) { ffs_write_file(&din, cur->inode->ino, membuf, fsopts); } else if (S_ISREG(cur->type)) { ffs_write_file(&din, cur->inode->ino, (cur->contents) ? cur->contents : path, fsopts); } else { assert (! S_ISDIR(cur->type)); ffs_write_inode(&din, cur->inode->ino, fsopts); } } /* * pass 3: write out sub-directories */ if (debug & DEBUG_FS_POPULATE) printf("ffs_populate_dir: PASS 3 dir %s\n", dir); for (cur = root; cur != NULL; cur = cur->next) { if (cur->child == NULL) continue; if ((size_t)snprintf(path, sizeof(path), "%s/%s", dir, cur->name) >= sizeof(path)) errx(1, "Pathname too long."); if (! ffs_populate_dir(path, cur->child, fsopts)) return (0); } if (debug & DEBUG_FS_POPULATE) printf("ffs_populate_dir: DONE dir %s\n", dir); /* cleanup */ if (dirbuf.buf != NULL) free(dirbuf.buf); return (1); } static void ffs_write_file(union dinode *din, uint32_t ino, void *buf, fsinfo_t *fsopts) { int isfile, ffd; char *fbuf, *p; off_t bufleft, chunk, offset; ssize_t nread; struct inode in; struct m_buf * bp; ffs_opt_t *ffs_opts = fsopts->fs_specific; struct m_vnode vp = { fsopts, NULL }; assert (din != NULL); assert (buf != NULL); assert (fsopts != NULL); assert (ffs_opts != NULL); isfile = S_ISREG(DIP(din, mode)); fbuf = NULL; ffd = -1; p = NULL; in.i_fs = (struct fs *)fsopts->superblock; in.i_devvp = (void *)&vp; if (debug & DEBUG_FS_WRITE_FILE) { printf( "ffs_write_file: ino %u, din %p, isfile %d, %s, size %lld", ino, din, isfile, inode_type(DIP(din, mode) & S_IFMT), (long long)DIP(din, size)); if (isfile) printf(", file '%s'\n", (char *)buf); else printf(", buffer %p\n", buf); } in.i_number = ino; in.i_size = DIP(din, size); if (ffs_opts->version == 1) - memcpy(&in.i_din.ffs1_din, &din->ffs1_din, - sizeof(in.i_din.ffs1_din)); + memcpy(&in.i_din.dp1, &din->dp1, + sizeof(in.i_din.dp1)); else - memcpy(&in.i_din.ffs2_din, &din->ffs2_din, - sizeof(in.i_din.ffs2_din)); + memcpy(&in.i_din.dp2, &din->dp2, + sizeof(in.i_din.dp2)); if (DIP(din, size) == 0) goto write_inode_and_leave; /* mmm, cheating */ if (isfile) { fbuf = emalloc(ffs_opts->bsize); if ((ffd = open((char *)buf, O_RDONLY)) == -1) { err(EXIT_FAILURE, "Can't open `%s' for reading", (char *)buf); } } else { p = buf; } chunk = 0; for (bufleft = DIP(din, size); bufleft > 0; bufleft -= chunk) { chunk = MIN(bufleft, ffs_opts->bsize); if (!isfile) ; else if ((nread = read(ffd, fbuf, chunk)) == -1) err(EXIT_FAILURE, "Reading `%s', %lld bytes to go", (char *)buf, (long long)bufleft); else if (nread != chunk) errx(EXIT_FAILURE, "Reading `%s', %lld bytes to go, " "read %zd bytes, expected %ju bytes, does " "metalog size= attribute mismatch source size?", (char *)buf, (long long)bufleft, nread, (uintmax_t)chunk); else p = fbuf; offset = DIP(din, size) - bufleft; if (debug & DEBUG_FS_WRITE_FILE_BLOCK) printf( "ffs_write_file: write %p offset %lld size %lld left %lld\n", p, (long long)offset, (long long)chunk, (long long)bufleft); /* * XXX if holey support is desired, do the check here * * XXX might need to write out last bit in fragroundup * sized chunk. however, ffs_balloc() handles this for us */ errno = ffs_balloc(&in, offset, chunk, &bp); bad_ffs_write_file: if (errno != 0) err(1, "Writing inode %d (%s), bytes %lld + %lld", ino, isfile ? (char *)buf : inode_type(DIP(din, mode) & S_IFMT), (long long)offset, (long long)chunk); memcpy(bp->b_data, p, chunk); errno = bwrite(bp); if (errno != 0) goto bad_ffs_write_file; if (!isfile) p += chunk; } write_inode_and_leave: ffs_write_inode(&in.i_din, in.i_number, fsopts); if (fbuf) free(fbuf); if (ffd != -1) close(ffd); } static void ffs_dump_dirbuf(dirbuf_t *dbuf, const char *dir, int needswap) { doff_t i; struct direct *de; uint16_t reclen; assert (dbuf != NULL); assert (dir != NULL); printf("ffs_dump_dirbuf: dir %s size %d cur %d\n", dir, dbuf->size, dbuf->cur); for (i = 0; i < dbuf->size; ) { de = (struct direct *)(dbuf->buf + i); reclen = ufs_rw16(de->d_reclen, needswap); printf( " inode %4d %7s offset %4d reclen %3d namlen %3d name %s\n", ufs_rw32(de->d_ino, needswap), inode_type(DTTOIF(de->d_type)), i, reclen, de->d_namlen, de->d_name); i += reclen; assert(reclen > 0); } } static void ffs_make_dirbuf(dirbuf_t *dbuf, const char *name, fsnode *node, int needswap) { struct direct de, *dp; uint16_t llen, reclen; u_char *newbuf; assert (dbuf != NULL); assert (name != NULL); assert (node != NULL); /* create direct entry */ (void)memset(&de, 0, sizeof(de)); de.d_ino = ufs_rw32(node->inode->ino, needswap); de.d_type = IFTODT(node->type); de.d_namlen = (uint8_t)strlen(name); strcpy(de.d_name, name); reclen = DIRSIZ_SWAP(0, &de, needswap); de.d_reclen = ufs_rw16(reclen, needswap); dp = (struct direct *)(dbuf->buf + dbuf->cur); llen = 0; if (dp != NULL) llen = DIRSIZ_SWAP(0, dp, needswap); if (debug & DEBUG_FS_MAKE_DIRBUF) printf( "ffs_make_dirbuf: dbuf siz %d cur %d lastlen %d\n" " ino %d type %d reclen %d namlen %d name %.30s\n", dbuf->size, dbuf->cur, llen, ufs_rw32(de.d_ino, needswap), de.d_type, reclen, de.d_namlen, de.d_name); if (reclen + dbuf->cur + llen > roundup(dbuf->size, DIRBLKSIZ)) { if (debug & DEBUG_FS_MAKE_DIRBUF) printf("ffs_make_dirbuf: growing buf to %d\n", dbuf->size + DIRBLKSIZ); newbuf = erealloc(dbuf->buf, dbuf->size + DIRBLKSIZ); dbuf->buf = newbuf; dbuf->size += DIRBLKSIZ; memset(dbuf->buf + dbuf->size - DIRBLKSIZ, 0, DIRBLKSIZ); dbuf->cur = dbuf->size - DIRBLKSIZ; } else if (dp) { /* shrink end of previous */ dp->d_reclen = ufs_rw16(llen,needswap); dbuf->cur += llen; } dp = (struct direct *)(dbuf->buf + dbuf->cur); memcpy(dp, &de, reclen); dp->d_reclen = ufs_rw16(dbuf->size - dbuf->cur, needswap); } /* * cribbed from sys/ufs/ffs/ffs_alloc.c */ static void ffs_write_inode(union dinode *dp, uint32_t ino, const fsinfo_t *fsopts) { char *buf; struct ufs1_dinode *dp1; struct ufs2_dinode *dp2, *dip; struct cg *cgp; struct fs *fs; int cg, cgino; uint32_t i; daddr_t d; char sbbuf[FFS_MAXBSIZE]; uint32_t initediblk; ffs_opt_t *ffs_opts = fsopts->fs_specific; assert (dp != NULL); assert (ino > 0); assert (fsopts != NULL); assert (ffs_opts != NULL); fs = (struct fs *)fsopts->superblock; cg = ino_to_cg(fs, ino); cgino = ino % fs->fs_ipg; if (debug & DEBUG_FS_WRITE_INODE) printf("ffs_write_inode: din %p ino %u cg %d cgino %d\n", dp, ino, cg, cgino); ffs_rdfs(fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, &sbbuf, fsopts); cgp = (struct cg *)sbbuf; if (!cg_chkmagic_swap(cgp, fsopts->needswap)) errx(1, "ffs_write_inode: cg %d: bad magic number", cg); assert (isclr(cg_inosused_swap(cgp, fsopts->needswap), cgino)); buf = emalloc(fs->fs_bsize); dp1 = (struct ufs1_dinode *)buf; dp2 = (struct ufs2_dinode *)buf; if (fs->fs_cstotal.cs_nifree == 0) errx(1, "ffs_write_inode: fs out of inodes for ino %u", ino); if (fs->fs_cs(fs, cg).cs_nifree == 0) errx(1, "ffs_write_inode: cg %d out of inodes for ino %u", cg, ino); setbit(cg_inosused_swap(cgp, fsopts->needswap), cgino); ufs_add32(cgp->cg_cs.cs_nifree, -1, fsopts->needswap); fs->fs_cstotal.cs_nifree--; fs->fs_cs(fs, cg).cs_nifree--; if (S_ISDIR(DIP(dp, mode))) { ufs_add32(cgp->cg_cs.cs_ndir, 1, fsopts->needswap); fs->fs_cstotal.cs_ndir++; fs->fs_cs(fs, cg).cs_ndir++; } /* * Initialize inode blocks on the fly for UFS2. */ initediblk = ufs_rw32(cgp->cg_initediblk, fsopts->needswap); while (ffs_opts->version == 2 && cgino + INOPB(fs) > initediblk && initediblk < ufs_rw32(cgp->cg_niblk, fsopts->needswap)) { memset(buf, 0, fs->fs_bsize); dip = (struct ufs2_dinode *)buf; for (i = 0; i < INOPB(fs); i++) { dip->di_gen = random(); dip++; } ffs_wtfs(fsbtodb(fs, ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)), fs->fs_bsize, buf, fsopts); initediblk += INOPB(fs); cgp->cg_initediblk = ufs_rw32(initediblk, fsopts->needswap); } ffs_wtfs(fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, &sbbuf, fsopts); /* now write inode */ d = fsbtodb(fs, ino_to_fsba(fs, ino)); ffs_rdfs(d, fs->fs_bsize, buf, fsopts); if (fsopts->needswap) { if (ffs_opts->version == 1) - ffs_dinode1_swap(&dp->ffs1_din, + ffs_dinode1_swap(&dp->dp1, &dp1[ino_to_fsbo(fs, ino)]); else - ffs_dinode2_swap(&dp->ffs2_din, + ffs_dinode2_swap(&dp->dp2, &dp2[ino_to_fsbo(fs, ino)]); } else { if (ffs_opts->version == 1) - dp1[ino_to_fsbo(fs, ino)] = dp->ffs1_din; + dp1[ino_to_fsbo(fs, ino)] = dp->dp1; else - dp2[ino_to_fsbo(fs, ino)] = dp->ffs2_din; + dp2[ino_to_fsbo(fs, ino)] = dp->dp2; } ffs_wtfs(d, fs->fs_bsize, buf, fsopts); free(buf); } void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vwarnx(fmt, ap); va_end(ap); exit(1); } diff --git a/usr.sbin/makefs/ffs/ufs_inode.h b/usr.sbin/makefs/ffs/ufs_inode.h index 67581dda3359..e84fa57f6540 100644 --- a/usr.sbin/makefs/ffs/ufs_inode.h +++ b/usr.sbin/makefs/ffs/ufs_inode.h @@ -1,98 +1,93 @@ /* $NetBSD: ufs_inode.h,v 1.3 2003/08/07 11:25:34 agc Exp $ */ /* From: NetBSD: inode.h,v 1.27 2001/12/18 10:57:23 fvdl Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)inode.h 8.9 (Berkeley) 5/14/95 */ -union dinode { - struct ufs1_dinode ffs1_din; - struct ufs2_dinode ffs2_din; -}; - struct inode { ino_t i_number; /* The identity of the inode. */ struct vnode *i_devvp; /* vnode pointer (contains fsopts) */ struct fs *i_fs; /* File system */ union dinode i_din; uint64_t i_size; }; -#define i_ffs1_atime i_din.ffs1_din.di_atime -#define i_ffs1_atimensec i_din.ffs1_din.di_atimensec -#define i_ffs1_blocks i_din.ffs1_din.di_blocks -#define i_ffs1_ctime i_din.ffs1_din.di_ctime -#define i_ffs1_ctimensec i_din.ffs1_din.di_ctimensec -#define i_ffs1_db i_din.ffs1_din.di_db -#define i_ffs1_flags i_din.ffs1_din.di_flags -#define i_ffs1_gen i_din.ffs1_din.di_gen -#define i_ffs11_gid i_din.ffs1_din.di_gid -#define i_ffs1_ib i_din.ffs1_din.di_ib -#define i_ffs1_mode i_din.ffs1_din.di_mode -#define i_ffs1_mtime i_din.ffs1_din.di_mtime -#define i_ffs1_mtimensec i_din.ffs1_din.di_mtimensec -#define i_ffs1_nlink i_din.ffs1_din.di_nlink -#define i_ffs1_rdev i_din.ffs1_din.di_rdev -#define i_ffs1_shortlink i_din.ffs1_din.di_shortlink -#define i_ffs1_size i_din.ffs1_din.di_size -#define i_ffs1_uid i_din.ffs1_din.di_uid +#define i_ffs1_atime i_din.dp1.di_atime +#define i_ffs1_atimensec i_din.dp1.di_atimensec +#define i_ffs1_blocks i_din.dp1.di_blocks +#define i_ffs1_ctime i_din.dp1.di_ctime +#define i_ffs1_ctimensec i_din.dp1.di_ctimensec +#define i_ffs1_db i_din.dp1.di_db +#define i_ffs1_flags i_din.dp1.di_flags +#define i_ffs1_gen i_din.dp1.di_gen +#define i_ffs11_gid i_din.dp1.di_gid +#define i_ffs1_ib i_din.dp1.di_ib +#define i_ffs1_mode i_din.dp1.di_mode +#define i_ffs1_mtime i_din.dp1.di_mtime +#define i_ffs1_mtimensec i_din.dp1.di_mtimensec +#define i_ffs1_nlink i_din.dp1.di_nlink +#define i_ffs1_rdev i_din.dp1.di_rdev +#define i_ffs1_shortlink i_din.dp1.di_shortlink +#define i_ffs1_size i_din.dp1.di_size +#define i_ffs1_uid i_din.dp1.di_uid -#define i_ffs2_atime i_din.ffs2_din.di_atime -#define i_ffs2_atimensec i_din.ffs2_din.di_atimensec -#define i_ffs2_blocks i_din.ffs2_din.di_blocks -#define i_ffs2_ctime i_din.ffs2_din.di_ctime -#define i_ffs2_ctimensec i_din.ffs2_din.di_ctimensec -#define i_ffs2_birthtime i_din.ffs2_din.di_birthtime -#define i_ffs2_birthnsec i_din.ffs2_din.di_birthnsec -#define i_ffs2_db i_din.ffs2_din.di_db -#define i_ffs2_flags i_din.ffs2_din.di_flags -#define i_ffs2_gen i_din.ffs2_din.di_gen -#define i_ffs21_gid i_din.ffs2_din.di_gid -#define i_ffs2_ib i_din.ffs2_din.di_ib -#define i_ffs2_mode i_din.ffs2_din.di_mode -#define i_ffs2_mtime i_din.ffs2_din.di_mtime -#define i_ffs2_mtimensec i_din.ffs2_din.di_mtimensec -#define i_ffs2_nlink i_din.ffs2_din.di_nlink -#define i_ffs2_rdev i_din.ffs2_din.di_rdev -#define i_ffs2_shortlink i_din.ffs2_din.di_shortlink -#define i_ffs2_size i_din.ffs2_din.di_size -#define i_ffs2_uid i_din.ffs2_din.di_uid +#define i_ffs2_atime i_din.dp2.di_atime +#define i_ffs2_atimensec i_din.dp2.di_atimensec +#define i_ffs2_blocks i_din.dp2.di_blocks +#define i_ffs2_ctime i_din.dp2.di_ctime +#define i_ffs2_ctimensec i_din.dp2.di_ctimensec +#define i_ffs2_birthtime i_din.dp2.di_birthtime +#define i_ffs2_birthnsec i_din.dp2.di_birthnsec +#define i_ffs2_db i_din.dp2.di_db +#define i_ffs2_flags i_din.dp2.di_flags +#define i_ffs2_gen i_din.dp2.di_gen +#define i_ffs21_gid i_din.dp2.di_gid +#define i_ffs2_ib i_din.dp2.di_ib +#define i_ffs2_mode i_din.dp2.di_mode +#define i_ffs2_mtime i_din.dp2.di_mtime +#define i_ffs2_mtimensec i_din.dp2.di_mtimensec +#define i_ffs2_nlink i_din.dp2.di_nlink +#define i_ffs2_rdev i_din.dp2.di_rdev +#define i_ffs2_shortlink i_din.dp2.di_shortlink +#define i_ffs2_size i_din.dp2.di_size +#define i_ffs2_uid i_din.dp2.di_uid #undef DIP #define DIP(ip, field) \ (((ip)->i_fs->fs_magic == FS_UFS1_MAGIC) ? \ (ip)->i_ffs1_##field : (ip)->i_ffs2_##field) diff --git a/usr.sbin/quot/quot.c b/usr.sbin/quot/quot.c index dc189f0b8e42..27f0c1ee0068 100644 --- a/usr.sbin/quot/quot.c +++ b/usr.sbin/quot/quot.c @@ -1,630 +1,626 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1991, 1994 Wolfgang Solfrank. * Copyright (C) 1991, 1994 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* some flags of what to do: */ static char estimate; static char count; static char unused; static void (*func)(int, struct fs *, char *); static long blocksize; static char *header; static int headerlen; static union dinode *get_inode(int, struct fs *, ino_t); static int virtualblocks(struct fs *, union dinode *); static int isfree(struct fs *, union dinode *); static void inituser(void); static void usrrehash(void); static struct user *user(uid_t); static int cmpusers(const void *, const void *); static void uses(uid_t, daddr_t, time_t); static void initfsizes(void); static void dofsizes(int, struct fs *, char *); static void douser(int, struct fs *, char *); static void donames(int, struct fs *, char *); static void usage(void); static void quot(char *, char *); /* * Original BSD quot doesn't round to number of frags/blocks, * doesn't account for indirection blocks and gets it totally * wrong if the size is a multiple of the blocksize. * The new code always counts the number of 512 byte blocks * instead of the number of kilobytes and converts them to * kByte when done (on request). * * Due to the size of modern disks, we must cast intermediate * values to 64 bits to prevent potential overflows. */ #ifdef COMPAT #define SIZE(n) (n) #else #define SIZE(n) ((int)(((quad_t)(n) * 512 + blocksize - 1)/blocksize)) #endif #define INOCNT(fs) ((fs)->fs_ipg) #define INOSZ(fs) \ (((fs)->fs_magic == FS_UFS1_MAGIC ? sizeof(struct ufs1_dinode) : \ sizeof(struct ufs2_dinode)) * INOCNT(fs)) -union dinode { - struct ufs1_dinode dp1; - struct ufs2_dinode dp2; -}; #define DIP(fs, dp, field) \ (((fs)->fs_magic == FS_UFS1_MAGIC) ? \ (dp)->dp1.field : (dp)->dp2.field) static union dinode * get_inode(int fd, struct fs *super, ino_t ino) { static caddr_t ipbuf; static struct cg *cgp; static ino_t last; static int cg; struct ufs2_dinode *di2; if (fd < 0) { /* flush cache */ if (ipbuf) { free(ipbuf); ipbuf = 0; if (super != NULL && super->fs_magic == FS_UFS2_MAGIC) { free(cgp); cgp = 0; } } return 0; } if (!ipbuf || ino < last || ino >= last + INOCNT(super)) { if (super->fs_magic == FS_UFS2_MAGIC && (!cgp || cg != ino_to_cg(super, ino))) { cg = ino_to_cg(super, ino); if (!cgp && !(cgp = malloc(super->fs_cgsize))) errx(1, "allocate cg"); if (lseek(fd, (off_t)cgtod(super, cg) << super->fs_fshift, 0) < 0) err(1, "lseek cg"); if (read(fd, cgp, super->fs_cgsize) != super->fs_cgsize) err(1, "read cg"); if (!cg_chkmagic(cgp)) errx(1, "cg has bad magic"); } if (!ipbuf && !(ipbuf = malloc(INOSZ(super)))) errx(1, "allocate inodes"); last = rounddown(ino, INOCNT(super)); if (lseek(fd, (off_t)ino_to_fsba(super, last) << super->fs_fshift, 0) < (off_t)0 || read(fd, ipbuf, INOSZ(super)) != (ssize_t)INOSZ(super)) err(1, "read inodes"); } if (super->fs_magic == FS_UFS1_MAGIC) return ((union dinode *) &((struct ufs1_dinode *)ipbuf)[ino % INOCNT(super)]); di2 = &((struct ufs2_dinode *)ipbuf)[ino % INOCNT(super)]; /* If the inode is unused, it might be unallocated too, so zero it. */ if (isclr(cg_inosused(cgp), ino % super->fs_ipg)) bzero(di2, sizeof (*di2)); return ((union dinode *)di2); } #ifdef COMPAT #define actualblocks(fs, dp) (DIP(fs, dp, di_blocks) / 2) #else #define actualblocks(fs, dp) DIP(fs, dp, di_blocks) #endif static int virtualblocks(struct fs *super, union dinode *dp) { off_t nblk, sz; sz = DIP(super, dp, di_size); #ifdef COMPAT if (lblkno(super,sz) >= UFS_NDADDR) { nblk = blkroundup(super,sz); if (sz == nblk) nblk += super->fs_bsize; } return sz / 1024; #else /* COMPAT */ if (lblkno(super,sz) >= UFS_NDADDR) { nblk = blkroundup(super,sz); sz = lblkno(super,nblk); sz = (sz - UFS_NDADDR + NINDIR(super) - 1) / NINDIR(super); while (sz > 0) { nblk += sz * super->fs_bsize; /* sz - 1 rounded up */ sz = (sz - 1 + NINDIR(super) - 1) / NINDIR(super); } } else nblk = fragroundup(super,sz); return nblk / 512; #endif /* COMPAT */ } static int isfree(struct fs *super, union dinode *dp) { #ifdef COMPAT return (DIP(super, dp, di_mode) & IFMT) == 0; #else /* COMPAT */ switch (DIP(super, dp, di_mode) & IFMT) { case IFIFO: case IFLNK: /* should check FASTSYMLINK? */ case IFDIR: case IFREG: return 0; case IFCHR: case IFBLK: case IFSOCK: case IFWHT: case 0: return 1; default: errx(1, "unknown IFMT 0%o", DIP(super, dp, di_mode) & IFMT); } #endif } static struct user { uid_t uid; char *name; daddr_t space; long count; daddr_t spc30; daddr_t spc60; daddr_t spc90; } *users; static int nusers; static void inituser(void) { int i; struct user *usr; if (!nusers) { nusers = 8; if (!(users = (struct user *)calloc(nusers,sizeof(struct user)))) errx(1, "allocate users"); } else { for (usr = users, i = nusers; --i >= 0; usr++) { usr->space = usr->spc30 = usr->spc60 = usr->spc90 = 0; usr->count = 0; } } } static void usrrehash(void) { int i; struct user *usr, *usrn; struct user *svusr; svusr = users; nusers <<= 1; if (!(users = (struct user *)calloc(nusers,sizeof(struct user)))) errx(1, "allocate users"); for (usr = svusr, i = nusers >> 1; --i >= 0; usr++) { for (usrn = users + (usr->uid&(nusers - 1)); usrn->name; usrn--) { if (usrn <= users) usrn = users + nusers; } *usrn = *usr; } } static struct user * user(uid_t uid) { struct user *usr; int i; struct passwd *pwd; while (1) { for (usr = users + (uid&(nusers - 1)), i = nusers; --i >= 0; usr--) { if (!usr->name) { usr->uid = uid; if (!(pwd = getpwuid(uid))) { if ((usr->name = (char *)malloc(7))) sprintf(usr->name,"#%d",uid); } else { if ((usr->name = (char *) malloc(strlen(pwd->pw_name) + 1))) strcpy(usr->name,pwd->pw_name); } if (!usr->name) errx(1, "allocate users"); return usr; } else if (usr->uid == uid) return usr; if (usr <= users) usr = users + nusers; } usrrehash(); } } static int cmpusers(const void *v1, const void *v2) { const struct user *u1, *u2; u1 = (const struct user *)v1; u2 = (const struct user *)v2; return u2->space - u1->space; } #define sortusers(users) (qsort((users),nusers,sizeof(struct user), \ cmpusers)) static void uses(uid_t uid, daddr_t blks, time_t act) { static time_t today; struct user *usr; if (!today) time(&today); usr = user(uid); usr->count++; usr->space += blks; if (today - act > 90L * 24L * 60L * 60L) usr->spc90 += blks; if (today - act > 60L * 24L * 60L * 60L) usr->spc60 += blks; if (today - act > 30L * 24L * 60L * 60L) usr->spc30 += blks; } #ifdef COMPAT #define FSZCNT 500 #else #define FSZCNT 512 #endif struct fsizes { struct fsizes *fsz_next; daddr_t fsz_first, fsz_last; ino_t fsz_count[FSZCNT]; daddr_t fsz_sz[FSZCNT]; } *fsizes; static void initfsizes(void) { struct fsizes *fp; int i; for (fp = fsizes; fp; fp = fp->fsz_next) { for (i = FSZCNT; --i >= 0;) { fp->fsz_count[i] = 0; fp->fsz_sz[i] = 0; } } } static void dofsizes(int fd, struct fs *super, char *name) { ino_t inode, maxino; union dinode *dp; daddr_t sz, ksz; struct fsizes *fp, **fsp; int i; maxino = super->fs_ncg * super->fs_ipg - 1; #ifdef COMPAT if (!(fsizes = (struct fsizes *)malloc(sizeof(struct fsizes)))) errx(1, "allocate fsize structure"); #endif /* COMPAT */ for (inode = 0; inode < maxino; inode++) { errno = 0; if ((dp = get_inode(fd,super,inode)) #ifdef COMPAT && ((DIP(super, dp, di_mode) & IFMT) == IFREG || (DIP(super, dp, di_mode) & IFMT) == IFDIR) #else /* COMPAT */ && !isfree(super, dp) #endif /* COMPAT */ ) { sz = estimate ? virtualblocks(super, dp) : actualblocks(super, dp); #ifdef COMPAT if (sz >= FSZCNT) { fsizes->fsz_count[FSZCNT-1]++; fsizes->fsz_sz[FSZCNT-1] += sz; } else { fsizes->fsz_count[sz]++; fsizes->fsz_sz[sz] += sz; } #else /* COMPAT */ ksz = SIZE(sz); for (fsp = &fsizes; (fp = *fsp); fsp = &fp->fsz_next) { if (ksz < fp->fsz_last) break; } if (!fp || ksz < fp->fsz_first) { if (!(fp = (struct fsizes *) malloc(sizeof(struct fsizes)))) errx(1, "allocate fsize structure"); fp->fsz_next = *fsp; *fsp = fp; fp->fsz_first = rounddown(ksz, FSZCNT); fp->fsz_last = fp->fsz_first + FSZCNT; for (i = FSZCNT; --i >= 0;) { fp->fsz_count[i] = 0; fp->fsz_sz[i] = 0; } } fp->fsz_count[ksz % FSZCNT]++; fp->fsz_sz[ksz % FSZCNT] += sz; #endif /* COMPAT */ } else if (errno) { err(1, "%s", name); } } sz = 0; for (fp = fsizes; fp; fp = fp->fsz_next) { for (i = 0; i < FSZCNT; i++) { if (fp->fsz_count[i]) printf("%jd\t%jd\t%d\n", (intmax_t)(fp->fsz_first + i), (intmax_t)fp->fsz_count[i], SIZE(sz += fp->fsz_sz[i])); } } } static void douser(int fd, struct fs *super, char *name) { ino_t inode, maxino; struct user *usr, *usrs; union dinode *dp; int n; maxino = super->fs_ncg * super->fs_ipg - 1; for (inode = 0; inode < maxino; inode++) { errno = 0; if ((dp = get_inode(fd,super,inode)) && !isfree(super, dp)) uses(DIP(super, dp, di_uid), estimate ? virtualblocks(super, dp) : actualblocks(super, dp), DIP(super, dp, di_atime)); else if (errno) { err(1, "%s", name); } } if (!(usrs = (struct user *)malloc(nusers * sizeof(struct user)))) errx(1, "allocate users"); bcopy(users,usrs,nusers * sizeof(struct user)); sortusers(usrs); for (usr = usrs, n = nusers; --n >= 0 && usr->count; usr++) { printf("%5d",SIZE(usr->space)); if (count) printf("\t%5ld",usr->count); printf("\t%-8s",usr->name); if (unused) printf("\t%5d\t%5d\t%5d", SIZE(usr->spc30), SIZE(usr->spc60), SIZE(usr->spc90)); printf("\n"); } free(usrs); } static void donames(int fd, struct fs *super, char *name) { int c; ino_t maxino; uintmax_t inode; union dinode *dp; maxino = super->fs_ncg * super->fs_ipg - 1; /* first skip the name of the filesystem */ while ((c = getchar()) != EOF && (c < '0' || c > '9')) while ((c = getchar()) != EOF && c != '\n'); ungetc(c,stdin); while (scanf("%ju", &inode) == 1) { if (inode > maxino) { warnx("illegal inode %ju", inode); return; } errno = 0; if ((dp = get_inode(fd,super,inode)) && !isfree(super, dp)) { printf("%s\t",user(DIP(super, dp, di_uid))->name); /* now skip whitespace */ while ((c = getchar()) == ' ' || c == '\t'); /* and print out the remainder of the input line */ while (c != EOF && c != '\n') { putchar(c); c = getchar(); } putchar('\n'); } else { if (errno) { err(1, "%s", name); } /* skip this line */ while ((c = getchar()) != EOF && c != '\n'); } if (c == EOF) break; } } static void usage(void) { #ifdef COMPAT fprintf(stderr,"usage: quot [-nfcvha] [filesystem ...]\n"); #else /* COMPAT */ fprintf(stderr,"usage: quot [-acfhknv] [filesystem ...]\n"); #endif /* COMPAT */ exit(1); } void quot(char *name, char *mp) { int fd; struct fs *fs; get_inode(-1, NULL, 0); /* flush cache */ inituser(); initfsizes(); if ((fd = open(name,0)) < 0) { warn("%s", name); close(fd); return; } switch (errno = sbget(fd, &fs, UFS_STDSB, UFS_NOCSUM)) { case 0: break; case ENOENT: warn("Cannot find file system superblock"); close(fd); return; default: warn("Unable to read file system superblock"); close(fd); return; } printf("%s:",name); if (mp) printf(" (%s)",mp); putchar('\n'); (*func)(fd, fs, name); free(fs); close(fd); } int main(int argc, char *argv[]) { char all = 0; struct statfs *mp; struct fstab *fs; int cnt; func = douser; #ifndef COMPAT header = getbsize(&headerlen,&blocksize); #endif while (--argc > 0 && **++argv == '-') { while (*++*argv) { switch (**argv) { case 'n': func = donames; break; case 'c': func = dofsizes; break; case 'a': all = 1; break; case 'f': count = 1; break; case 'h': estimate = 1; break; #ifndef COMPAT case 'k': blocksize = 1024; break; #endif /* COMPAT */ case 'v': unused = 1; break; default: usage(); } } } if (all) { cnt = getmntinfo(&mp,MNT_NOWAIT); for (; --cnt >= 0; mp++) { if (!strncmp(mp->f_fstypename, "ufs", MFSNAMELEN)) quot(mp->f_mntfromname, mp->f_mntonname); } } while (--argc >= 0) { if ((fs = getfsfile(*argv)) != NULL) quot(fs->fs_spec, 0); else quot(*argv,0); argv++; } return 0; }