diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h index ad82c5f80da1..3b80169c1e3c 100644 --- a/sbin/fsck_ffs/fsck.h +++ b/sbin/fsck_ffs/fsck.h @@ -1,543 +1,544 @@ /*- * SPDX-License-Identifier: BSD-3-Clause and BSD-2-Clause * * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)fsck.h 8.4 (Berkeley) 5/9/95 * $FreeBSD$ */ #ifndef _FSCK_H_ #define _FSCK_H_ #include #include #include #include #define MAXDUP 10 /* limit on dup blks (per inode) */ #define MAXBAD 10 /* limit on bad blks (per inode) */ #define MINBUFS 100 /* minimum number of buffers required */ #define INOBUFSIZE 64*1024 /* size of buffer to read inodes in pass1 */ #define ZEROBUFSIZE (dev_bsize * 128) /* size of zero buffer used by -Z */ union dinode { struct ufs1_dinode dp1; struct ufs2_dinode dp2; }; #define DIP(dp, field) \ ((sblock.fs_magic == FS_UFS1_MAGIC) ? \ (dp)->dp1.field : (dp)->dp2.field) #define DIP_SET(dp, field, val) do { \ if (sblock.fs_magic == FS_UFS1_MAGIC) \ (dp)->dp1.field = (val); \ else \ (dp)->dp2.field = (val); \ } while (0) /* * Each inode on the file system is described by the following structure. * The linkcnt is initially set to the value in the inode. Each time it * is found during the descent in passes 2, 3, and 4 the count is * decremented. Any inodes whose count is non-zero after pass 4 needs to * have its link count adjusted by the value remaining in ino_linkcnt. */ struct inostat { u_char ino_state; /* state of inode, see below */ u_char ino_type:4; /* type of inode */ u_char ino_idtype:4; /* idesc id_type, SNAP or ADDR */ u_short ino_linkcnt; /* number of links not found */ }; /* * Inode states. 
*/ #define USTATE 0x1 /* inode not allocated */ #define FSTATE 0x2 /* inode is file */ #define FZLINK 0x3 /* inode is file with a link count of zero */ #define DSTATE 0x4 /* inode is directory */ #define DZLINK 0x5 /* inode is directory with a zero link count */ #define DFOUND 0x6 /* directory found during descent */ /* 0x7 UNUSED - see S_IS_DVALID() definition */ #define DCLEAR 0x8 /* directory is to be cleared */ #define FCLEAR 0x9 /* file is to be cleared */ /* DUNFOUND === (state == DSTATE || state == DZLINK) */ #define S_IS_DUNFOUND(state) (((state) & ~0x1) == DSTATE) /* DVALID === (state == DSTATE || state == DZLINK || state == DFOUND) */ #define S_IS_DVALID(state) (((state) & ~0x3) == DSTATE) #define INO_IS_DUNFOUND(ino) S_IS_DUNFOUND(inoinfo(ino)->ino_state) #define INO_IS_DVALID(ino) S_IS_DVALID(inoinfo(ino)->ino_state) /* * Inode state information is contained on per cylinder group lists * which are described by the following structure. */ extern struct inostatlist { long il_numalloced; /* number of inodes allocated in this cg */ struct inostat *il_stat;/* inostat info for this cylinder group */ } *inostathead; /* * Structure to reference a dinode. */ struct inode { struct bufarea *i_bp; /* buffer containing the dinode */ union dinode *i_dp; /* pointer to dinode in buffer */ ino_t i_number; /* inode number */ }; /* * Size of hash tables */ #define HASHSIZE 2048 #define HASH(x) ((x * 2654435761) & (HASHSIZE - 1)) /* * buffer cache structure. 
*/ struct bufarea { TAILQ_ENTRY(bufarea) b_list; /* LRU buffer queue */ LIST_ENTRY(bufarea) b_hash; /* hash list */ ufs2_daddr_t b_bno; /* disk block number */ int b_size; /* size of I/O */ int b_errs; /* I/O error */ int b_flags; /* B_ flags below */ int b_type; /* BT_ type below */ int b_refcnt; /* ref count of users */ int b_index; /* for BT_LEVEL, ptr index */ /* for BT_INODES, first inum */ union { char *b_buf; /* buffer space */ ufs1_daddr_t *b_indir1; /* UFS1 indirect block */ ufs2_daddr_t *b_indir2; /* UFS2 indirect block */ struct fs *b_fs; /* super block */ struct cg *b_cg; /* cylinder group */ struct ufs1_dinode *b_dinode1; /* UFS1 inode block */ struct ufs2_dinode *b_dinode2; /* UFS2 inode block */ } b_un; }; #define IBLK(bp, i) \ ((sblock.fs_magic == FS_UFS1_MAGIC) ? \ (bp)->b_un.b_indir1[i] : (bp)->b_un.b_indir2[i]) #define IBLK_SET(bp, i, val) do { \ if (sblock.fs_magic == FS_UFS1_MAGIC) \ (bp)->b_un.b_indir1[i] = (val); \ else \ (bp)->b_un.b_indir2[i] = (val); \ } while (0) /* * Buffer flags */ #define B_DIRTY 0x00000001 /* Buffer is dirty */ /* * Type of data in buffer */ #define BT_UNKNOWN 0 /* Buffer type is unknown */ #define BT_SUPERBLK 1 /* Buffer holds a superblock */ #define BT_CYLGRP 2 /* Buffer holds a cylinder group map */ #define BT_LEVEL1 3 /* Buffer holds single level indirect */ #define BT_LEVEL2 4 /* Buffer holds double level indirect */ #define BT_LEVEL3 5 /* Buffer holds triple level indirect */ #define BT_EXTATTR 6 /* Buffer holds external attribute data */ #define BT_INODES 7 /* Buffer holds inodes */ #define BT_DIRDATA 8 /* Buffer holds directory data */ #define BT_DATA 9 /* Buffer holds user data */ #define BT_NUMBUFTYPES 10 #define BT_NAMES { \ "unknown", \ "Superblock", \ "Cylinder Group", \ "Single Level Indirect", \ "Double Level Indirect", \ "Triple Level Indirect", \ "External Attribute", \ "Inode Block", \ "Directory Contents", \ "User Data" } extern char *buftype[]; #define BT_BUFTYPE(type) \ type < BT_NUMBUFTYPES ? 
buftype[type] : buftype[BT_UNKNOWN] extern long readcnt[BT_NUMBUFTYPES]; extern long totalreadcnt[BT_NUMBUFTYPES]; extern struct timespec readtime[BT_NUMBUFTYPES]; extern struct timespec totalreadtime[BT_NUMBUFTYPES]; extern struct timespec startprog; extern struct bufarea *icachebp; /* inode cache buffer */ extern struct bufarea sblk; /* file system superblock */ extern struct bufarea *pdirbp; /* current directory contents */ #define dirty(bp) do { \ if (fswritefd < 0) \ pfatal("SETTING DIRTY FLAG IN READ_ONLY MODE\n"); \ else \ (bp)->b_flags |= B_DIRTY; \ } while (0) #define initbarea(bp, type) do { \ (bp)->b_bno = (ufs2_daddr_t)-4; \ (bp)->b_size = 0; \ (bp)->b_errs = 0; \ (bp)->b_flags = 0; \ (bp)->b_type = type; \ (bp)->b_refcnt = 0; \ (bp)->b_index = 0; \ } while (0) #define sbdirty() dirty(&sblk) #define sblock (*sblk.b_un.b_fs) enum fixstate {DONTKNOW, NOFIX, FIX, IGNORE}; extern ino_t cursnapshot; struct inodesc { enum fixstate id_fix; /* policy on fixing errors */ int (*id_func)(struct inodesc *); /* function to be applied to blocks of inode */ struct bufarea *id_bp; /* ckinode: buffer with indirect pointers */ union dinode *id_dp; /* ckinode: dinode being traversed */ ino_t id_number; /* inode number described */ ino_t id_parent; /* for DATA nodes, their parent */ ufs_lbn_t id_lbn; /* logical block number of current block */ ufs2_daddr_t id_blkno; /* current block number being examined */ int id_level; /* level of indirection of this block */ int id_numfrags; /* number of frags contained in block */ ufs_lbn_t id_lballoc; /* pass1: last LBN that is allocated */ off_t id_filesize; /* for DATA nodes, the size of the directory */ ufs2_daddr_t id_entryno;/* for DATA nodes, current entry number */ int id_loc; /* for DATA nodes, current location in dir */ struct direct *id_dirp; /* for DATA nodes, ptr to current entry */ char *id_name; /* for DATA nodes, name to find or enter */ char id_type; /* type of descriptor, DATA, ADDR, or SNAP */ }; /* file types */ 
#define DATA 1 /* a directory */ #define SNAP 2 /* a snapshot */ #define ADDR 3 /* anything but a directory or a snapshot */ /* * Linked list of duplicate blocks. * * The list is composed of two parts. The first part of the * list (from duplist through the node pointed to by muldup) * contains a single copy of each duplicate block that has been * found. The second part of the list (from muldup to the end) * contains duplicate blocks that have been found more than once. * To check if a block has been found as a duplicate it is only * necessary to search from duplist through muldup. To find the * total number of times that a block has been found as a duplicate * the entire list must be searched for occurrences of the block * in question. The following diagram shows a sample list where * w (found twice), x (found once), y (found three times), and z * (found once) are duplicate block numbers: * * w -> y -> x -> z -> y -> w -> y * ^ ^ * | | * duplist muldup */ struct dups { struct dups *next; ufs2_daddr_t dup; }; extern struct dups *duplist; /* head of dup list */ extern struct dups *muldup; /* end of unique duplicate dup block numbers */ /* * Inode cache data structures. */ struct inoinfo { SLIST_ENTRY(inoinfo) i_hash; /* hash list */ ino_t i_number; /* inode number of this entry */ ino_t i_parent; /* inode number of parent */ ino_t i_dotdot; /* inode number of `..' 
*/ size_t i_isize; /* size of inode */ u_int i_depth; /* depth of directory from root */ u_int i_flags; /* flags, see below */ u_int i_numblks; /* size of block array in bytes */ ufs2_daddr_t i_blks[1]; /* actually longer */ }; extern SLIST_HEAD(inohash, inoinfo) *inphash; extern struct inoinfo **inpsort; /* * flags for struct inoinfo */ #define INFO_NEW 0x0000001 /* replaced broken directory */ extern long dirhash, inplast; extern unsigned long numdirs, listmax; extern long countdirs; /* number of directories we actually found */ #define MIBSIZE 3 /* size of fsck sysctl MIBs */ extern int adjblkcnt[MIBSIZE]; /* MIB cmd to adjust inode block count */ extern int adjrefcnt[MIBSIZE]; /* MIB cmd to adjust inode reference count */ extern int adjndir[MIBSIZE]; /* MIB cmd to adjust number of directories */ extern int adjnbfree[MIBSIZE]; /* MIB cmd to adjust number of free blocks */ extern int adjnifree[MIBSIZE]; /* MIB cmd to adjust number of free inodes */ extern int adjnffree[MIBSIZE]; /* MIB cmd to adjust number of free frags */ extern int adjnumclusters[MIBSIZE]; /* MIB cmd adjust number of free clusters */ extern int adjdepth[MIBSIZE]; /* MIB cmd to adjust directory depth count */ extern int freefiles[MIBSIZE]; /* MIB cmd to free a set of files */ extern int freedirs[MIBSIZE]; /* MIB cmd to free a set of directories */ extern int freeblks[MIBSIZE]; /* MIB cmd to free a set of data blocks */ extern int setsize[MIBSIZE]; /* MIB cmd to set inode size */ extern struct fsck_cmd cmd; /* sysctl file system update commands */ extern int bkgrdcheck; /* determine if background check is possible */ extern int bkgrdsumadj; /* whether the kernel has the ability to adjust the superblock summary fields */ extern off_t bflag; /* location of alternate super block */ extern int bkgrdflag; /* use a snapshot to run on an active system */ extern char *blockmap; /* ptr to primary blk allocation map */ extern char *cdevname; /* name of device being checked */ extern int cgheader_corrupt; 
/* one or more CG headers are corrupt */
extern char ckclean;		/* only do work if not cleanly unmounted */
extern int ckhashadd;		/* check hashes to be added */
extern char *copybuf;		/* buffer to copy snapshot blocks */
extern int cvtlevel;		/* convert to newer file system format */
extern long dev_bsize;		/* computed value of DEV_BSIZE */
extern u_int real_dev_bsize;	/* actual disk sector size, not overridden */
extern int debug;		/* output debugging info */
extern int Eflag;		/* delete empty data blocks */
extern int fsmodified;		/* 1 => write done to file system */
extern int fsreadfd;		/* file descriptor for reading file system */
extern int fswritefd;		/* file descriptor for writing file system */
extern char havesb;		/* superblock has been read */
extern int inoopt;		/* trim out unused inodes */
extern ino_t lfdir;		/* lost & found directory inode number */
extern int lfmode;		/* lost & found directory creation mode */
extern const char *lfname;	/* lost & found directory name */
extern ufs2_daddr_t maxfsblock;	/* number of blocks in the file system */
extern ino_t maxino;		/* number of inodes in file system */
extern ufs2_daddr_t n_blks;	/* number of blocks in use */
extern ino_t n_files;		/* number of files in use */
extern char nflag;		/* assume a no response */
extern char preen;		/* just fix normal inconsistencies */
extern char rerun;		/* rerun fsck. Only used in non-preen mode */
extern char resolved;		/* cleared if unresolved changes => not clean */
extern int returntosingle;	/* 1 => return to single user mode on exit */
extern long secsize;		/* actual disk sector size */
extern char skipclean;		/* skip clean file systems if preening */
extern int snapcnt;		/* number of active snapshots */
extern struct inode snaplist[FSMAXSNAP + 1]; /* list of active snapshots */
extern char snapname[BUFSIZ];	/* when doing snapshots, the name of the file */
extern int sujrecovery;		/* 1 => doing check using the journal */
extern int surrender;		/* Give up if reads fail */
extern char usedsoftdep;	/* just fix soft dependency inconsistencies */
extern int wantrestart;		/* Restart fsck on early termination */
extern char yflag;		/* assume a yes response */
extern int zflag;		/* zero unused directory space */
extern int Zflag;		/* zero empty data blocks */

extern volatile sig_atomic_t	got_siginfo;	/* received a SIGINFO */
extern volatile sig_atomic_t	got_sigalarm;	/* received a SIGALRM */

/*
 * Overwrite a dinode with the matching variant of zino, chosen by the
 * superblock magic.
 *
 * NOTE(review): this expands to a bare if/else statement rather than a
 * do { } while (0) block, so it is only safe as a full statement; using
 * it as the un-braced body of an if that has its own else would misparse.
 */
#define	clearinode(dp) \
	if (sblock.fs_magic == FS_UFS1_MAGIC) { \
		(dp)->dp1 = zino.dp1; \
	} else { \
		(dp)->dp2 = zino.dp2; \
	}
extern union dinode zino;

/* Set, test and clear the bit for block number blkno in blockmap. */
#define	setbmap(blkno)	setbit(blockmap, blkno)
#define	testbmap(blkno)	isset(blockmap, blkno)
#define	clrbmap(blkno)	clrbit(blockmap, blkno)

/* Bit flags; each value is a distinct single bit so they can be OR-ed. */
#define	STOP	0x01
#define	SKIP	0x02
#define	KEEPON	0x04
#define	ALTERED	0x08
#define	FOUND	0x10

#define	EEXIT	8		/* Standard error exit. */
#define	ERERUN	16		/* fsck needs to be re-run. */
#define	ERESTART -1

int flushentry(void);
/*
 * Wrapper for malloc() that flushes the cylinder group cache to try
 * to get space.
 *
 * The allocation is retried for as long as flushentry() reports that it
 * released something; once flushentry() returns 0 the loop gives up, so
 * the result may still be NULL and callers must check it.
 */
static inline void*
Malloc(size_t size)
{
	void *retval;

	while ((retval = malloc(size)) == NULL)
		if (flushentry() == 0)
			break;
	return (retval);
}
/*
 * Wrapper for calloc() that flushes the cylinder group cache to try
 * to get space.
*/
/*
 * Like the malloc() wrapper: retries while flushentry() reports progress
 * and may still return NULL once flushentry() returns 0.
 */
static inline void*
Calloc(size_t cnt, size_t size)
{
	void *retval;

	while ((retval = calloc(cnt, size)) == NULL)
		if (flushentry() == 0)
			break;
	return (retval);
}

struct fstab;

/*
 * Functions shared between the fsck_ffs source files.
 */
void		adjust(struct inodesc *, int lcnt);
void		alarmhandler(int sig);
ufs2_daddr_t	allocblk(long cg, long frags, ufs2_daddr_t (*checkblkavail)
		    (ufs2_daddr_t blkno, long frags));
ino_t		allocdir(ino_t parent, ino_t request, int mode);
ino_t		allocino(ino_t request, int type);
void		binval(struct bufarea *);
void		blkerror(ino_t ino, const char *type, ufs2_daddr_t blk);
char	       *blockcheck(char *name);
int		blread(int fd, char *buf, ufs2_daddr_t blk, long size);
void		bufinit(void);
void		blwrite(int fd, char *buf, ufs2_daddr_t blk, ssize_t size);
void		blerase(int fd, ufs2_daddr_t blk, long size);
void		blzero(int fd, ufs2_daddr_t blk, long size);
void		brelse(struct bufarea *);
struct inoinfo *cacheino(union dinode *dp, ino_t inumber);
void		catch(int);
void		catchquit(int);
void		cgdirty(struct bufarea *);
struct bufarea *cglookup(int cg);
int		changeino(ino_t dir, const char *name, ino_t newnum, int depth);
void		check_blkcnt(struct inode *ip);
int		check_cgmagic(int cg, struct bufarea *cgbp);
void		rebuild_cg(int cg, struct bufarea *cgbp);
void		check_dirdepth(struct inoinfo *inp);
+int		chkfilesize(mode_t mode, u_int64_t filesize);
int		chkrange(ufs2_daddr_t blk, int cnt);
void		ckfini(int markclean);
int		ckinode(union dinode *dp, struct inodesc *);
void		clri(struct inodesc *, const char *type, int flag);
int		clearentry(struct inodesc *);
void		copyonwrite(struct fs *, struct bufarea *,
		    ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t, long));
void		direrror(ino_t ino, const char *errmesg);
int		dirscan(struct inodesc *);
int		dofix(struct inodesc *, const char *msg);
int		eascan(struct inodesc *, struct ufs2_dinode *dp);
void		fileerror(ino_t cwd, ino_t ino, const char *errmesg);
void		finalIOstats(void);
int		findino(struct inodesc *);
int		findname(struct inodesc *);
void		flush(int fd, struct bufarea *bp);
int		freeblock(struct inodesc *);
void		freedirino(ino_t ino, ino_t parent);
void		freeino(ino_t ino);
void		freeinodebuf(void);
void		fsckinit(void);
void		fsutilinit(void);
int		ftypeok(union dinode *dp);
void		getblk(struct bufarea *bp, ufs2_daddr_t blk, long size);
struct bufarea *getdatablk(ufs2_daddr_t blkno, long size, int type);
struct inoinfo *getinoinfo(ino_t inumber);
union dinode   *getnextinode(ino_t inumber, int rebuiltcg);
void		getpathname(char *namebuf, ino_t curdir, ino_t ino);
void		ginode(ino_t, struct inode *);
void		gjournal_check(const char *filesys);
void		infohandler(int sig);
void		irelse(struct inode *);
ufs2_daddr_t	ino_blkatoff(union dinode *, ino_t, ufs_lbn_t, int *,
		    struct bufarea **);
void		inocleanup(void);
void		inodirty(struct inode *);
struct inostat *inoinfo(ino_t inum);
void		IOstats(char *what);
int		linkup(ino_t orphan, ino_t parentdir, char *name);
int		makeentry(ino_t parent, ino_t ino, const char *name);
int		openfilesys(char *dev);
void		panic(const char *fmt, ...) __printflike(1, 2);
void		pass1(void);
void		pass1b(void);
int		pass1check(struct inodesc *);
void		pass2(void);
void		pass3(void);
void		pass4(void);
void		pass5(void);
void		pfatal(const char *fmt, ...) __printflike(1, 2);
void		propagate(void);
void		prtbuf(struct bufarea *, const char *, ...) __printflike(2, 3);
void		prtinode(struct inode *);
void		pwarn(const char *fmt, ...) __printflike(1, 2);
int		readsb(void);
int		removecachedino(ino_t);
int		reply(const char *question);
void		rwerror(const char *mesg, ufs2_daddr_t blk);
void		sblock_init(void);
void		setinodebuf(int, ino_t);
int		setup(char *dev);
int		snapblkfree(struct fs *, ufs2_daddr_t, long, ino_t,
		    ufs2_daddr_t (*)(ufs2_daddr_t, long));
void		snapremove(ino_t);
void		snapflush(ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t, long));
ufs2_daddr_t	std_checkblkavail(ufs2_daddr_t blkno, long frags);
ufs2_daddr_t	suj_checkblkavail(ufs2_daddr_t, long);
int		suj_check(const char *filesys);
void		update_maps(struct cg *, struct cg*, int);

#endif	/* !_FSCK_H_ */
diff --git a/sbin/fsck_ffs/fsutil.c b/sbin/fsck_ffs/fsutil.c
index 7602203e6e90..5edc258d54bf 100644
--- a/sbin/fsck_ffs/fsutil.c
+++ b/sbin/fsck_ffs/fsutil.c
@@ -1,1474 +1,1499 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1980, 1986, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char sccsid[] = "@(#)utilities.c 8.6 (Berkeley) 5/19/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fsck.h" int sujrecovery = 0; static struct bufarea *allocbuf(const char *); static void cg_write(struct bufarea *); static void slowio_start(void); static void slowio_end(void); static void printIOstats(void); static long diskreads, totaldiskreads, totalreads; /* Disk cache statistics */ static struct timespec startpass, finishpass; struct timeval slowio_starttime; int slowio_delay_usec = 10000; /* Initial IO delay for background fsck */ int slowio_pollcnt; static struct bufarea cgblk; /* backup buffer for cylinder group blocks */ static struct bufarea failedbuf; /* returned by failed getdatablk() */ static TAILQ_HEAD(bufqueue, bufarea) bufqueuehd; /* head of buffer cache LRU */ static LIST_HEAD(bufhash, bufarea) bufhashhd[HASHSIZE]; /* buffer hash list */ static struct bufhash freebufs; /* unused buffers */ static int numbufs; /* size of buffer cache */ static int cachelookups; /* number of cache lookups */ static int cachereads; /* number of cache reads */ static int flushtries; /* number of tries to reclaim memory */ char *buftype[BT_NUMBUFTYPES] = BT_NAMES; void fsutilinit(void) { diskreads = 
totaldiskreads = totalreads = 0; bzero(&startpass, sizeof(struct timespec)); bzero(&finishpass, sizeof(struct timespec)); bzero(&slowio_starttime, sizeof(struct timeval)); slowio_delay_usec = 10000; slowio_pollcnt = 0; flushtries = 0; } int ftypeok(union dinode *dp) { switch (DIP(dp, di_mode) & IFMT) { case IFDIR: case IFREG: case IFBLK: case IFCHR: case IFLNK: case IFSOCK: case IFIFO: return (1); default: if (debug) printf("bad file type 0%o\n", DIP(dp, di_mode)); return (0); } } int reply(const char *question) { int persevere; char c; if (preen) pfatal("INTERNAL ERROR: GOT TO reply()"); persevere = strcmp(question, "CONTINUE") == 0 || strcmp(question, "LOOK FOR ALTERNATE SUPERBLOCKS") == 0; printf("\n"); if (!persevere && (nflag || (fswritefd < 0 && bkgrdflag == 0))) { printf("%s? no\n\n", question); resolved = 0; return (0); } if (yflag || (persevere && nflag)) { printf("%s? yes\n\n", question); return (1); } do { printf("%s? [yn] ", question); (void) fflush(stdout); c = getc(stdin); while (c != '\n' && getc(stdin) != '\n') { if (feof(stdin)) { resolved = 0; return (0); } } } while (c != 'y' && c != 'Y' && c != 'n' && c != 'N'); printf("\n"); if (c == 'y' || c == 'Y') return (1); resolved = 0; return (0); } /* * Look up state information for an inode. */ struct inostat * inoinfo(ino_t inum) { static struct inostat unallocated = { USTATE, 0, 0, 0 }; struct inostatlist *ilp; int iloff; if (inum >= maxino) errx(EEXIT, "inoinfo: inumber %ju out of range", (uintmax_t)inum); ilp = &inostathead[inum / sblock.fs_ipg]; iloff = inum % sblock.fs_ipg; if (iloff >= ilp->il_numalloced) return (&unallocated); return (&ilp->il_stat[iloff]); } /* * Malloc buffers and set up cache. 
*/ void bufinit(void) { int i; initbarea(&failedbuf, BT_UNKNOWN); failedbuf.b_errs = -1; failedbuf.b_un.b_buf = NULL; if ((cgblk.b_un.b_buf = Malloc((unsigned int)sblock.fs_bsize)) == NULL) errx(EEXIT, "Initial malloc(%d) failed", sblock.fs_bsize); initbarea(&cgblk, BT_CYLGRP); numbufs = cachelookups = cachereads = 0; TAILQ_INIT(&bufqueuehd); LIST_INIT(&freebufs); for (i = 0; i < HASHSIZE; i++) LIST_INIT(&bufhashhd[i]); for (i = 0; i < BT_NUMBUFTYPES; i++) { readtime[i].tv_sec = totalreadtime[i].tv_sec = 0; readtime[i].tv_nsec = totalreadtime[i].tv_nsec = 0; readcnt[i] = totalreadcnt[i] = 0; } } static struct bufarea * allocbuf(const char *failreason) { struct bufarea *bp; char *bufp; bp = (struct bufarea *)Malloc(sizeof(struct bufarea)); bufp = Malloc((unsigned int)sblock.fs_bsize); if (bp == NULL || bufp == NULL) { errx(EEXIT, "%s", failreason); /* NOTREACHED */ } numbufs++; bp->b_un.b_buf = bufp; TAILQ_INSERT_HEAD(&bufqueuehd, bp, b_list); initbarea(bp, BT_UNKNOWN); return (bp); } /* * Manage cylinder group buffers. * * Use getblk() here rather than cgget() because the cylinder group * may be corrupted but we want it anyway so we can fix it. 
*/
static struct bufarea *cgbufs;	/* header for cylinder group cache */
/* NOTE(review): flushtries is also declared near the top of this file. */
static int flushtries;		/* number of tries to reclaim memory */

/*
 * Return a buffer holding cylinder group cg.
 *
 * Exits if cg is not below sblock.fs_ncg.  The per-group buffer array is
 * allocated on first use.  A group that already has storage attached is
 * returned as is.  Otherwise storage is requested only while flushtries
 * is still zero; if none is obtained the shared cgblk buffer is flushed
 * and reused for this group (during journal recovery the program exits
 * instead).
 */
struct bufarea *
cglookup(int cg)
{
	struct bufarea *cgbp;
	struct cg *cgp;

	if ((unsigned) cg >= sblock.fs_ncg)
		errx(EEXIT, "cglookup: out of range cylinder group %d", cg);
	if (cgbufs == NULL) {
		cgbufs = calloc(sblock.fs_ncg, sizeof(struct bufarea));
		if (cgbufs == NULL)
			errx(EEXIT, "Cannot allocate cylinder group buffers");
	}
	cgbp = &cgbufs[cg];
	if (cgbp->b_un.b_cg != NULL)
		return (cgbp);
	cgp = NULL;
	if (flushtries == 0)
		cgp = Malloc((unsigned int)sblock.fs_cgsize);
	if (cgp == NULL) {
		if (sujrecovery)
			errx(EEXIT,"Ran out of memory during journal recovery");
		/* Fall back to the single shared buffer. */
		flush(fswritefd, &cgblk);
		getblk(&cgblk, cgtod(&sblock, cg), sblock.fs_cgsize);
		return (&cgblk);
	}
	cgbp->b_un.b_cg = cgp;
	initbarea(cgbp, BT_CYLGRP);
	getblk(cgbp, cgtod(&sblock, cg), sblock.fs_cgsize);
	return (cgbp);
}

/*
 * Mark a cylinder group buffer as dirty.
 * Update its check-hash if they are enabled.
 */
void
cgdirty(struct bufarea *cgbp)
{
	struct cg *cg;

	cg = cgbp->b_un.b_cg;
	if ((sblock.fs_metackhash & CK_CYLGRP) != 0) {
		/* The hash field is zeroed before the hash is computed. */
		cg->cg_ckhash = 0;
		cg->cg_ckhash =
		    calculate_crc32c(~0L, (void *)cg, sblock.fs_cgsize);
	}
	dirty(cgbp);
}

/*
 * Attempt to flush a cylinder group cache entry.
 * Return whether the flush was successful.
 *
 * Entries are visited in index order, one per call, with flushtries as
 * the cursor.  Returns 0 without doing anything during journal recovery,
 * once every group has been tried, when the cache was never allocated,
 * or when the entry under the cursor has no storage.
 */
int
flushentry(void)
{
	struct bufarea *cgbp;

	if (sujrecovery || flushtries == sblock.fs_ncg || cgbufs == NULL)
		return (0);
	cgbp = &cgbufs[flushtries++];
	if (cgbp->b_un.b_cg == NULL)
		return (0);
	flush(fswritefd, cgbp);
	free(cgbp->b_un.b_buf);
	cgbp->b_un.b_buf = NULL;
	return (1);
}

/*
 * Manage a cache of filesystem disk blocks.
 *
 * Returns a buffer for file system block blkno of the given size and
 * type.  On a cache hit the existing buffer is moved to the head of the
 * LRU queue.  On a miss a buffer is taken from the free list, newly
 * allocated, or recycled from the tail of the LRU queue, flushed,
 * retyped, hashed and filled via getblk().  The reference count is
 * raised only when the buffer carries no I/O error.
 */
struct bufarea *
getdatablk(ufs2_daddr_t blkno, long size, int type)
{
	struct bufarea *bp;
	struct bufhash *bhdp;

	cachelookups++;
	/*
	 * If out of range, return empty buffer with b_errs == -1
	 *
	 * Skip check for inodes because chkrange() considers
	 * metadata areas invalid to write data.
	 */
	if (type != BT_INODES && chkrange(blkno, size / sblock.fs_fsize)) {
		failedbuf.b_refcnt++;
		return (&failedbuf);
	}
	bhdp = &bufhashhd[HASH(blkno)];
	LIST_FOREACH(bp, bhdp, b_hash)
		if (bp->b_bno == fsbtodb(&sblock, blkno)) {
			if (debug && bp->b_size != size) {
				prtbuf(bp, "getdatablk: size mismatch");
				pfatal("getdatablk: b_size %d != size %ld\n",
				    bp->b_size, size);
			}
			/* Unlinked here, relinked at the head at foundit. */
			TAILQ_REMOVE(&bufqueuehd, bp, b_list);
			goto foundit;
		}
	/*
	 * Move long-term busy buffer back to the front of the LRU so we
	 * do not endlessly inspect them for recycling.
	 */
	bp = TAILQ_LAST(&bufqueuehd, bufqueue);
	if (bp != NULL && bp->b_refcnt != 0) {
		TAILQ_REMOVE(&bufqueuehd, bp, b_list);
		TAILQ_INSERT_HEAD(&bufqueuehd, bp, b_list);
	}
	/*
	 * Allocate up to the minimum number of buffers before
	 * considering recycling any of them.
	 */
	if (size > sblock.fs_bsize)
		errx(EEXIT, "Excessive buffer size %ld > %d\n", size,
		    sblock.fs_bsize);
	if ((bp = LIST_FIRST(&freebufs)) != NULL) {
		LIST_REMOVE(bp, b_hash);
	} else if (numbufs < MINBUFS) {
		bp = allocbuf("cannot create minimal buffer pool");
	} else if (sujrecovery) {
		/*
		 * SUJ recovery does not want anything written until it
		 * has successfully completed (so it can fail back to
		 * full fsck). Thus, we can only recycle clean buffers.
		 */
		TAILQ_FOREACH_REVERSE(bp, &bufqueuehd, bufqueue, b_list)
			if ((bp->b_flags & B_DIRTY) == 0 && bp->b_refcnt == 0)
				break;
		if (bp == NULL)
			bp = allocbuf("Ran out of memory during "
			    "journal recovery");
		else
			LIST_REMOVE(bp, b_hash);
	} else {
		/*
		 * Recycle oldest non-busy buffer.
		 */
		TAILQ_FOREACH_REVERSE(bp, &bufqueuehd, bufqueue, b_list)
			if (bp->b_refcnt == 0)
				break;
		if (bp == NULL)
			bp = allocbuf("Ran out of memory for buffers");
		else
			LIST_REMOVE(bp, b_hash);
	}
	/*
	 * Every path above leaves bp linked on the LRU queue (allocbuf()
	 * inserts new buffers at its head), so it is unlinked here and
	 * relinked at the head below.
	 */
	TAILQ_REMOVE(&bufqueuehd, bp, b_list);
	flush(fswritefd, bp);
	bp->b_type = type;
	LIST_INSERT_HEAD(bhdp, bp, b_hash);
	getblk(bp, blkno, size);
	cachereads++;
	/* fall through */
foundit:
	TAILQ_INSERT_HEAD(&bufqueuehd, bp, b_list);
	if (debug && bp->b_type != type) {
		printf("getdatablk: buffer type changed to %s",
		    BT_BUFTYPE(type));
		prtbuf(bp, "");
	}
	if (bp->b_errs == 0)
		bp->b_refcnt++;
	return (bp);
}

/*
 * Make bp hold file system block blk.
 *
 * If bp already records the corresponding disk block only the
 * totalreads counter is advanced.  Otherwise the block is read with
 * blread(), its result is stored in b_errs, and b_bno and b_size are
 * updated.  When debugging, the read is counted and timed against the
 * buffer's type.
 */
void
getblk(struct bufarea *bp, ufs2_daddr_t blk, long size)
{
	ufs2_daddr_t dblk;
	struct timespec start, finish;

	dblk = fsbtodb(&sblock, blk);
	if (bp->b_bno == dblk) {
		totalreads++;
	} else {
		if (debug) {
			readcnt[bp->b_type]++;
			clock_gettime(CLOCK_REALTIME_PRECISE, &start);
		}
		bp->b_errs = blread(fsreadfd, bp->b_un.b_buf, dblk, size);
		if (debug) {
			clock_gettime(CLOCK_REALTIME_PRECISE, &finish);
			timespecsub(&finish, &start, &finish);
			timespecadd(&readtime[bp->b_type], &finish,
			    &readtime[bp->b_type]);
		}
		bp->b_bno = dblk;
		bp->b_size = size;
	}
}

/*
 * Drop one reference to a buffer.  A count that is already zero or
 * negative is reported through prtbuf(), and is still decremented.
 */
void
brelse(struct bufarea *bp)
{

	if (bp->b_refcnt <= 0)
		prtbuf(bp, "brelse: buffer with negative reference count");
	bp->b_refcnt--;
}

/*
 * Invalidate a buffer: discard any pending write by clearing B_DIRTY,
 * and move it from its hash chain to the free list.
 */
void
binval(struct bufarea *bp)
{

	bp->b_flags &= ~B_DIRTY;
	LIST_REMOVE(bp, b_hash);
	LIST_INSERT_HEAD(&freebufs, bp, b_hash);
}

/*
 * Write a buffer out if it is dirty.
 *
 * Clean buffers are ignored.  The dirty flag is cleared before anything
 * else; if no write descriptor is open this is reported and nothing is
 * written.  A buffer carrying a read error is written anyway after a
 * warning, and its error count is reset.  The write itself depends on
 * the buffer type.
 */
void
flush(int fd, struct bufarea *bp)
{
	struct inode ip;

	if ((bp->b_flags & B_DIRTY) == 0)
		return;
	bp->b_flags &= ~B_DIRTY;
	if (fswritefd < 0) {
		pfatal("WRITING IN READ_ONLY MODE.\n");
		return;
	}
	if (bp->b_errs != 0)
		pfatal("WRITING %sZERO'ED BLOCK %lld TO DISK\n",
		    (bp->b_errs == bp->b_size / dev_bsize) ? "" : "PARTIALLY ",
		    (long long)bp->b_bno);
	bp->b_errs = 0;
	/*
	 * Write using the appropriate function.
	 */
	switch (bp->b_type) {
	case BT_SUPERBLK:
		if (bp != &sblk)
			pfatal("BUFFER %p DOES NOT MATCH SBLK %p\n",
			    bp, &sblk);
		/*
		 * Superblocks are always pre-copied so we do not need
		 * to check them for copy-on-write.
		 */
		if (sbput(fd, bp->b_un.b_fs, 0) == 0)
			fsmodified = 1;
		break;
	case BT_CYLGRP:
		/*
		 * Cylinder groups are always pre-copied so we do not
		 * need to check them for copy-on-write.
		 */
		if (sujrecovery)
			cg_write(bp);
		/* NOTE(review): uses fswritefd rather than the fd argument. */
		if (cgput(fswritefd, &sblock, bp->b_un.b_cg) == 0)
			fsmodified = 1;
		break;
	case BT_INODES:
		/*
		 * Debug-only check of every UFS2 inode in the block: an
		 * inode whose check-hash does not verify is reported and,
		 * when preening or when the user agrees, its hash is
		 * recomputed before the block is written below.
		 */
		if (debug && sblock.fs_magic == FS_UFS2_MAGIC) {
			struct ufs2_dinode *dp = bp->b_un.b_dinode2;
			int i;

			for (i = 0; i < bp->b_size; dp++, i += sizeof(*dp)) {
				if (ffs_verify_dinode_ckhash(&sblock, dp) == 0)
					continue;
				pwarn("flush: INODE CHECK-HASH FAILED");
				ip.i_bp = bp;
				ip.i_dp = (union dinode *)dp;
				ip.i_number = bp->b_index + (i / sizeof(*dp));
				prtinode(&ip);
				if (preen || reply("FIX") != 0) {
					if (preen)
						printf(" (FIXED)\n");
					ffs_update_dinode_ckhash(&sblock, dp);
					inodirty(&ip);
				}
			}
		}
		/* FALLTHROUGH */
	default:
		copyonwrite(&sblock, bp, std_checkblkavail);
		blwrite(fd, bp->b_un.b_buf, bp->b_bno, bp->b_size);
		break;
	}
}

/*
 * If there are any snapshots, ensure that all the blocks that they
 * care about have been copied, then release the snapshot inodes.
 * These operations need to be done before we rebuild the cylinder
 * groups so that any block allocations are properly recorded.
 * Since all the cylinder group maps have already been copied in
 * the snapshots, no further snapshot copies will need to be done.
*/
void
snapflush(ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t, long))
{
	struct bufarea *bp;
	int cnt;

	if (snapcnt > 0) {
		if (debug)
			printf("Check for snapshot copies\n");
		/* Pass every dirty buffer on the cache list to copyonwrite(). */
		TAILQ_FOREACH_REVERSE(bp, &bufqueuehd, bufqueue, b_list)
			if ((bp->b_flags & B_DIRTY) != 0)
				copyonwrite(&sblock, bp, checkblkavail);
		/* Release every snaplist[] entry, then reset the count. */
		for (cnt = 0; cnt < snapcnt; cnt++)
			irelse(&snaplist[cnt]);
		snapcnt = 0;
	}
}

/*
 * Journaled soft updates does not maintain cylinder group summary
 * information during cleanup, so this routine recalculates the summary
 * information and updates the superblock summary in preparation for
 * writing out the cylinder group.
 *
 * bp is the buffer holding the cylinder group; it is modified in place,
 * as are fs_cstotal and the per-group summary entry in sblock.
 */
static void
cg_write(struct bufarea *bp)
{
	ufs1_daddr_t fragno, cgbno, maxbno;
	u_int8_t *blksfree;
	struct csum *csp;
	struct cg *cgp;
	int blk;
	int i;

	/*
	 * Fix the frag and cluster summary.
	 * Start from zeroed counters and rebuild them from the free map.
	 */
	cgp = bp->b_un.b_cg;
	cgp->cg_cs.cs_nbfree = 0;
	cgp->cg_cs.cs_nffree = 0;
	bzero(&cgp->cg_frsum, sizeof(cgp->cg_frsum));
	maxbno = fragstoblks(&sblock, sblock.fs_fpg);
	if (sblock.fs_contigsumsize > 0) {
		/* Clear the cluster summary (entries 1..n) and cluster map. */
		for (i = 1; i <= sblock.fs_contigsumsize; i++)
			cg_clustersum(cgp)[i] = 0;
		bzero(cg_clustersfree(cgp), howmany(maxbno, CHAR_BIT));
	}
	blksfree = cg_blksfree(cgp);
	for (cgbno = 0; cgbno < maxbno; cgbno++) {
		/* Nothing is tallied for blocks ffs_isfreeblock() accepts. */
		if (ffs_isfreeblock(&sblock, blksfree, cgbno))
			continue;
		/* Blocks ffs_isblock() accepts count toward cs_nbfree. */
		if (ffs_isblock(&sblock, blksfree, cgbno)) {
			ffs_clusteracct(&sblock, cgp, cgbno, 1);
			cgp->cg_cs.cs_nbfree++;
			continue;
		}
		/*
		 * Remaining blocks are accounted per fragment: update
		 * cg_frsum and count each bit set in the free map.
		 */
		fragno = blkstofrags(&sblock, cgbno);
		blk = blkmap(&sblock, blksfree, fragno);
		ffs_fragacct(&sblock, blk, cgp->cg_frsum, 1);
		for (i = 0; i < sblock.fs_frag; i++)
			if (isset(blksfree, fragno + i))
				cgp->cg_cs.cs_nffree++;
	}
	/*
	 * Update the superblock cg summary from our now correct values
	 * before writing the block.  Each total is adjusted by the
	 * difference between the new and the previously recorded value.
	 */
	csp = &sblock.fs_cs(&sblock, cgp->cg_cgx);
	sblock.fs_cstotal.cs_ndir += cgp->cg_cs.cs_ndir - csp->cs_ndir;
	sblock.fs_cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree - csp->cs_nbfree;
	sblock.fs_cstotal.cs_nifree += cgp->cg_cs.cs_nifree - csp->cs_nifree;
	sblock.fs_cstotal.cs_nffree += cgp->cg_cs.cs_nffree - csp->cs_nffree;
	sblock.fs_cs(&sblock, cgp->cg_cgx) = cgp->cg_cs;
}

/*
 * Report a failed operation on block blk.  mesg names the operation and
 * is substituted into the "CANNOT %s" message passed to pfatal().
 * Exits at once when bkgrdcheck is set, and exits when reply("CONTINUE")
 * returns 0; otherwise returns to the caller.
 */
void
rwerror(const char *mesg, ufs2_daddr_t blk)
{

	if (bkgrdcheck)
		exit(EEXIT);
	if (preen == 0)
		printf("\n");
	pfatal("CANNOT %s: %ld", mesg, (long)blk);
	if (reply("CONTINUE") == 0)
		exit(EEXIT);
}

/*
 * Finish a check: flush and free the cached buffers, record the clean
 * state requested by markclean in the superblock, free the tracking
 * structures and close the file descriptors.
 *
 * markclean is nonzero to mark the file system clean.  It is forced to 0
 * when the operator declines to update the standard superblock.
 * When fswritefd is negative only fsreadfd is closed and nothing is
 * flushed or freed.
 */
void
ckfini(int markclean)
{
	struct bufarea *bp, *nbp;
	int ofsmodified, cnt, cg;

	if (bkgrdflag) {
		unlink(snapname);
		/*
		 * The FS_UNCLEAN flag disagrees with markclean, so ask for
		 * a change through the vfs.ffs.setflags sysctl.
		 * NOTE(review): presumably size -1 clears and 1 sets the
		 * flag -- confirm against the kernel side.
		 */
		if ((!(sblock.fs_flags & FS_UNCLEAN)) != markclean) {
			cmd.value = FS_UNCLEAN;
			cmd.size = markclean ? -1 : 1;
			if (sysctlbyname("vfs.ffs.setflags", 0, 0,
			    &cmd, sizeof cmd) == -1)
				pwarn("CANNOT SET FILE SYSTEM DIRTY FLAG\n");
			if (!preen) {
				printf("\n***** FILE SYSTEM MARKED %s *****\n",
				    markclean ? "CLEAN" : "DIRTY");
				if (!markclean)
					rerun = 1;
			}
		} else if (!preen && !markclean) {
			printf("\n***** FILE SYSTEM STILL DIRTY *****\n");
			rerun = 1;
		}
		bkgrdflag = 0;
	}
	/* Debug only: report how many cache lookups had to read the disk. */
	if (debug && cachelookups > 0)
		printf("cache with %d buffers missed %d of %d (%d%%)\n",
		    numbufs, cachereads, cachelookups,
		    (int)(cachereads * 100 / cachelookups));
	if (fswritefd < 0) {
		(void)close(fsreadfd);
		return;
	}

	/*
	 * To remain idempotent with partial truncations the buffers
	 * must be flushed in this order:
	 *  1) cylinder groups (bitmaps)
	 *  2) indirect, directory, external attribute, and data blocks
	 *  3) inode blocks
	 *  4) superblock
	 * This ordering preserves access to the modified pointers
	 * until they are freed.
	 */
	/* Step 1: cylinder groups */
	if (debug)
		printf("Flush Cylinder groups\n");
	if (cgbufs != NULL) {
		for (cnt = 0; cnt < sblock.fs_ncg; cnt++) {
			/* Skip slots that never had a buffer attached. */
			if (cgbufs[cnt].b_un.b_cg == NULL)
				continue;
			flush(fswritefd, &cgbufs[cnt]);
			free(cgbufs[cnt].b_un.b_cg);
		}
		free(cgbufs);
		cgbufs = NULL;
	}
	flush(fswritefd, &cgblk);
	free(cgblk.b_un.b_buf);
	cgblk.b_un.b_buf = NULL;
	/* cnt now counts the buffers removed from the cache list. */
	cnt = 0;
	/* Step 2: indirect, directory, external attribute, and data blocks */
	if (debug)
		printf("Flush indirect, directory, external attribute, "
		    "and data blocks\n");
	if (pdirbp != NULL) {
		brelse(pdirbp);
		pdirbp = NULL;
	}
	TAILQ_FOREACH_REVERSE_SAFE(bp, &bufqueuehd, bufqueue, b_list, nbp) {
		switch (bp->b_type) {
		/* These should not be in the buffer cache list */
		case BT_UNKNOWN:
		case BT_SUPERBLK:
		case BT_CYLGRP:
		default:
			prtbuf(bp,"ckfini: improper buffer type on cache list");
			continue;
		/* These are the ones to flush in this step */
		case BT_LEVEL1:
		case BT_LEVEL2:
		case BT_LEVEL3:
		case BT_EXTATTR:
		case BT_DIRDATA:
		case BT_DATA:
			break;
		/* These are the ones to flush in the next step */
		case BT_INODES:
			continue;
		}
		if (debug && bp->b_refcnt != 0)
			prtbuf(bp, "ckfini: clearing in-use buffer");
		TAILQ_REMOVE(&bufqueuehd, bp, b_list);
		LIST_REMOVE(bp, b_hash);
		cnt++;
		flush(fswritefd, bp);
		free(bp->b_un.b_buf);
		free((char *)bp);
	}
	/* Step 3: inode blocks */
	if (debug)
		printf("Flush inode blocks\n");
	if (icachebp != NULL) {
		brelse(icachebp);
		icachebp = NULL;
	}
	/* Everything still on the list is flushed and freed here. */
	TAILQ_FOREACH_REVERSE_SAFE(bp, &bufqueuehd, bufqueue, b_list, nbp) {
		if (debug && bp->b_refcnt != 0)
			prtbuf(bp, "ckfini: clearing in-use buffer");
		TAILQ_REMOVE(&bufqueuehd, bp, b_list);
		LIST_REMOVE(bp, b_hash);
		cnt++;
		flush(fswritefd, bp);
		free(bp->b_un.b_buf);
		free((char *)bp);
	}
	/* Every allocated buffer must have been accounted for above. */
	if (numbufs != cnt)
		errx(EEXIT, "panic: lost %d buffers", numbufs - cnt);
	/* Step 4: superblock */
	if (debug)
		printf("Flush the superblock\n");
	flush(fswritefd, &sblk);
	/*
	 * The superblock in use was not read from the standard location
	 * (fs_sblockloc); offer to write it there as well.
	 */
	if (havesb && cursnapshot == 0 &&
	    sblk.b_bno != sblock.fs_sblockloc / dev_bsize) {
		if (preen || reply("UPDATE STANDARD SUPERBLOCK")) {
			/* Change write destination to standard superblock */
			sblock.fs_sblockactualloc = sblock.fs_sblockloc;
			sblk.b_bno = sblock.fs_sblockloc / dev_bsize;
			sbdirty();
			flush(fswritefd, &sblk);
		} else {
			markclean = 0;
		}
	}
	if (cursnapshot == 0 && sblock.fs_clean != markclean) {
		if ((sblock.fs_clean = markclean) != 0) {
			sblock.fs_flags &= ~(FS_UNCLEAN | FS_NEEDSFSCK);
			sblock.fs_pendingblocks = 0;
			sblock.fs_pendinginodes = 0;
		}
		sbdirty();
		/* Writing the clean flag must not change fsmodified. */
		ofsmodified = fsmodified;
		flush(fswritefd, &sblk);
		fsmodified = ofsmodified;
		if (!preen) {
			printf("\n***** FILE SYSTEM MARKED %s *****\n",
			    markclean ? "CLEAN" : "DIRTY");
			if (!markclean)
				rerun = 1;
		}
	} else if (!preen) {
		if (markclean) {
			printf("\n***** FILE SYSTEM IS CLEAN *****\n");
		} else {
			printf("\n***** FILE SYSTEM STILL DIRTY *****\n");
			rerun = 1;
		}
	}
	/*
	 * Free allocated tracking structures.
	 */
	if (blockmap != NULL)
		free(blockmap);
	blockmap = NULL;
	if (inostathead != NULL) {
		for (cg = 0; cg < sblock.fs_ncg; cg++)
			if (inostathead[cg].il_stat != NULL)
				free((char *)inostathead[cg].il_stat);
		free(inostathead);
	}
	inostathead = NULL;
	inocleanup();
	finalIOstats();
	(void)close(fsreadfd);
	(void)close(fswritefd);
}

/*
 * Print out I/O statistics.
 */
/*
 * Does nothing unless debug is set.  Otherwise prints the statistics
 * gathered since the previous call under the label what, folds them into
 * the running totals, clears the per-pass counters and restarts the
 * per-pass clock.
 */
void
IOstats(char *what)
{
	int i;

	if (debug == 0)
		return;
	if (diskreads == 0) {
		printf("%s: no I/O\n\n", what);
		return;
	}
	/* First call: measure from program start. */
	if (startpass.tv_sec == 0)
		startpass = startprog;
	printf("%s: I/O statistics\n", what);
	printIOstats();
	totaldiskreads += diskreads;
	diskreads = 0;
	for (i = 0; i < BT_NUMBUFTYPES; i++) {
		timespecadd(&totalreadtime[i], &readtime[i], &totalreadtime[i]);
		totalreadcnt[i] += readcnt[i];
		readtime[i].tv_sec = readtime[i].tv_nsec = 0;
		readcnt[i] = 0;
	}
	clock_gettime(CLOCK_REALTIME_PRECISE, &startpass);
}

/*
 * Does nothing unless debug is set.  Otherwise adds the current counters
 * to the totals, copies the totals back into the per-pass variables and
 * prints them, measured from startprog.
 */
void
finalIOstats(void)
{
	int i;

	if (debug == 0)
		return;
	printf("Final I/O statistics\n");
	totaldiskreads += diskreads;
	diskreads = totaldiskreads;
	startpass = startprog;
	for (i = 0; i < BT_NUMBUFTYPES; i++) {
		timespecadd(&totalreadtime[i], &readtime[i], &totalreadtime[i]);
		totalreadcnt[i] += readcnt[i];
		readtime[i] = totalreadtime[i];
		readcnt[i] = totalreadcnt[i];
	}
	printIOstats();
}

/*
 * Print the elapsed time since startpass and, for each buffer type with a
 * nonzero read count, its share of diskreads and of the total read time.
 * Overwrites finishpass with the elapsed interval.
 */
static void printIOstats(void)
{
	long long msec, totalmsec;
	int i;

	clock_gettime(CLOCK_REALTIME_PRECISE, &finishpass);
	timespecsub(&finishpass, &startpass, &finishpass);
	printf("Running time: %jd.%03ld sec\n",
		(intmax_t)finishpass.tv_sec, finishpass.tv_nsec / 1000000);
	printf("buffer reads by type:\n");
	for (totalmsec = 0, i = 0; i < BT_NUMBUFTYPES; i++)
		totalmsec += readtime[i].tv_sec * 1000 +
		    readtime[i].tv_nsec / 1000000;
	/* Avoid dividing by zero in the percentages below. */
	if (totalmsec == 0)
		totalmsec = 1;
	for (i = 0; i < BT_NUMBUFTYPES; i++) {
		if (readcnt[i] == 0)
			continue;
		msec =
		    readtime[i].tv_sec * 1000 + readtime[i].tv_nsec / 1000000;
		/* Percentages are printed with one decimal digit. */
		printf("%21s:%8ld %2ld.%ld%% %4jd.%03ld sec %2lld.%lld%%\n",
		    buftype[i], readcnt[i], readcnt[i] * 100 / diskreads,
		    (readcnt[i] * 1000 / diskreads) % 10,
		    (intmax_t)readtime[i].tv_sec, readtime[i].tv_nsec / 1000000,
		    msec * 100 / totalmsec, (msec * 1000 / totalmsec) % 10);
	}
	printf("\n");
}

/*
 * Read size bytes into buf from fd at byte offset blk * dev_bsize.
 *
 * Returns 0 when the whole read succeeds.  Otherwise the error is
 * reported, buf is zeroed and the range is re-read in secsize pieces;
 * the return value is the number of pieces that could not be read.
 * A nonzero result also clears resolved.
 */
int
blread(int fd, char *buf, ufs2_daddr_t blk, long size)
{
	char *cp;
	int i, errs;
	off_t offset;

	offset = blk;
	offset *= dev_bsize;
	if (bkgrdflag)
		slowio_start();
	totalreads++;
	diskreads++;
	if (pread(fd, buf, (int)size, offset) == size) {
		if (bkgrdflag)
			slowio_end();
		return (0);
	}

	/*
	 * This is handled specially here instead of in rwerror because
	 * rwerror is used for all sorts of errors, not just true read/write
	 * errors.  It should be refactored and fixed.
	 */
	if (surrender) {
		pfatal("CANNOT READ_BLK: %ld", (long)blk);
		errx(EEXIT, "ABORTING DUE TO READ ERRORS");
	} else
		rwerror("READ BLK", blk);

	errs = 0;
	/* Pieces that still fail to read are left zero-filled. */
	memset(buf, 0, (size_t)size);
	printf("THE FOLLOWING DISK SECTORS COULD NOT BE READ:");
	for (cp = buf, i = 0; i < size; i += secsize, cp += secsize) {
		if (pread(fd, cp, (int)secsize, offset + i) != secsize) {
			/* Print both numberings when the unit sizes differ. */
			if (secsize != dev_bsize && dev_bsize != 1)
				printf(" %jd (%jd),",
				    (intmax_t)(blk * dev_bsize + i) / secsize,
				    (intmax_t)blk + i / dev_bsize);
			else
				printf(" %jd,", (intmax_t)blk + i / dev_bsize);
			errs++;
		}
	}
	printf("\n");
	if (errs)
		resolved = 0;
	return (errs);
}

/*
 * Write size bytes from buf to fd at byte offset blk * dev_bsize.
 * Does nothing when fd is negative.  Sets fsmodified when the whole write
 * succeeds; otherwise clears resolved, reports the error and retries in
 * dev_bsize pieces, listing the pieces that fail.
 */
void
blwrite(int fd, char *buf, ufs2_daddr_t blk, ssize_t size)
{
	int i;
	char *cp;
	off_t offset;

	if (fd < 0)
		return;
	offset = blk;
	offset *= dev_bsize;
	if (pwrite(fd, buf, size, offset) == size) {
		fsmodified = 1;
		return;
	}
	resolved = 0;
	rwerror("WRITE BLK", blk);
	printf("THE FOLLOWING SECTORS COULD NOT BE WRITTEN:");
	for (cp = buf, i = 0; i < size; i += dev_bsize, cp += dev_bsize)
		if (pwrite(fd, cp, dev_bsize, offset + i) != dev_bsize)
			printf(" %jd,", (intmax_t)blk + i / dev_bsize);
	printf("\n");
	return;
}

/*
 * Issue a DIOCGDELETE ioctl on fd for size bytes starting at byte offset
 * blk * dev_bsize.  Does nothing when fd is negative; the ioctl result is
 * deliberately ignored.
 */
void
blerase(int fd, ufs2_daddr_t blk, long size)
{
	off_t ioarg[2];

	if (fd < 0)
		return;
	/* ioarg[0] is the byte offset, ioarg[1] the length. */
	ioarg[0] = blk * dev_bsize;
	ioarg[1] = size;
	ioctl(fd, DIOCGDELETE, ioarg);
	/* we don't really care if we succeed or not */
	return;
}

/*
 * Fill a contiguous region with all-zeroes.  Note ZEROBUFSIZE is by
 * definition a multiple of dev_bsize.
 */
void
blzero(int fd, ufs2_daddr_t blk, long size)
{
	/* Zero-filled buffer, allocated on first use and kept afterwards. */
	static char *zero;
	off_t offset, len;

	if (fd < 0)
		return;
	if (zero == NULL) {
		zero = calloc(ZEROBUFSIZE, 1);
		if (zero == NULL)
			errx(EEXIT, "cannot allocate buffer pool");
	}
	offset = blk * dev_bsize;
	if (lseek(fd, offset, 0) < 0)
		rwerror("SEEK BLK", blk);
	/* Write the region in pieces of at most ZEROBUFSIZE bytes. */
	while (size > 0) {
		len = MIN(ZEROBUFSIZE, size);
		if (write(fd, zero, len) != len)
			rwerror("WRITE BLK", blk);
		blk += len / dev_bsize;
		size -= len;
	}
}

/*
 * Verify cylinder group's magic number and other parameters. If the
 * test fails, offer an option to rebuild the whole cylinder group.
 *
 * Return 1 if the cylinder group is good or return 0 if it is bad.
 *
 * Each CHK() below names a condition that indicates corruption; when it
 * holds, a warning with both operands is printed and error is set.
 * The macro relies on the locals cg and error of the enclosing function.
 *
 * NOTE(review): callers pass fmt as a quoted string literal and the macro
 * applies the # operator to it, so the quote characters become part of
 * the printed text -- confirm this is intended.
 */
#undef CHK
#define CHK(lhs, op, rhs, fmt)						\
	if (lhs op rhs) {						\
		pwarn("UFS%d cylinder group %d failed: "		\
		    "%s (" #fmt ") %s %s (" #fmt ")\n",			\
		    sblock.fs_magic == FS_UFS1_MAGIC ? 1 : 2, cg,	\
		    #lhs, (intmax_t)lhs, #op, #rhs, (intmax_t)rhs);	\
		error = 1;						\
	}
int
check_cgmagic(int cg, struct bufarea *cgbp)
{
	struct cg *cgp = cgbp->b_un.b_cg;
	uint32_t cghash, calchash;
	/* Last group reported, so a repeated failure is not printed twice. */
	static int prevfailcg = -1;
	long start;
	int error;

	/*
	 * Extended cylinder group checks.
	 * The hash is recomputed with cg_ckhash temporarily zeroed, and only
	 * when CK_CYLGRP is set in fs_metackhash and clear in ckhashadd;
	 * otherwise calchash equals the stored value and the check passes.
	 */
	calchash = cgp->cg_ckhash;
	if ((sblock.fs_metackhash & CK_CYLGRP) != 0 &&
	    (ckhashadd & CK_CYLGRP) == 0) {
		cghash = cgp->cg_ckhash;
		cgp->cg_ckhash = 0;
		calchash = calculate_crc32c(~0L, (void *)cgp, sblock.fs_cgsize);
		cgp->cg_ckhash = cghash;
	}
	error = 0;
	CHK(cgp->cg_ckhash, !=, calchash, "%jd");
	CHK(cg_chkmagic(cgp), ==, 0, "%jd");
	CHK(cgp->cg_cgx, !=, cg, "%jd");
	CHK(cgp->cg_ndblk, >, sblock.fs_fpg, "%jd");
	if (sblock.fs_magic == FS_UFS1_MAGIC) {
		CHK(cgp->cg_old_niblk, !=, sblock.fs_ipg, "%jd");
		CHK(cgp->cg_old_ncyl, >, sblock.fs_old_cpg, "%jd");
	} else if (sblock.fs_magic == FS_UFS2_MAGIC) {
		CHK(cgp->cg_niblk, !=, sblock.fs_ipg, "%jd");
		CHK(cgp->cg_initediblk, >, sblock.fs_ipg, "%jd");
	}
	/* A group that would extend past fs_size holds only the remainder. */
	if (cgbase(&sblock, cg) + sblock.fs_fpg < sblock.fs_size) {
		CHK(cgp->cg_ndblk, !=, sblock.fs_fpg, "%jd");
	} else {
		CHK(cgp->cg_ndblk, !=, sblock.fs_size - cgbase(&sblock, cg),
		    "%jd");
	}
	/* The offset checks mirror the layout computed in rebuild_cg(). */
	start = sizeof(*cgp);
	if (sblock.fs_magic == FS_UFS2_MAGIC) {
		CHK(cgp->cg_iusedoff, !=, start, "%jd");
	} else if (sblock.fs_magic == FS_UFS1_MAGIC) {
		CHK(cgp->cg_niblk, !=, 0, "%jd");
		CHK(cgp->cg_initediblk, !=, 0, "%jd");
		CHK(cgp->cg_old_ncyl, !=, sblock.fs_old_cpg, "%jd");
		CHK(cgp->cg_old_niblk, !=, sblock.fs_ipg, "%jd");
		CHK(cgp->cg_old_btotoff, !=, start, "%jd");
		CHK(cgp->cg_old_boff, !=, cgp->cg_old_btotoff +
		    sblock.fs_old_cpg * sizeof(int32_t), "%jd");
		CHK(cgp->cg_iusedoff, !=, cgp->cg_old_boff +
		    sblock.fs_old_cpg * sizeof(u_int16_t), "%jd");
	}
	CHK(cgp->cg_freeoff, !=,
	    cgp->cg_iusedoff + howmany(sblock.fs_ipg, CHAR_BIT), "%jd");
	if (sblock.fs_contigsumsize == 0) {
		CHK(cgp->cg_nextfreeoff, !=,
		    cgp->cg_freeoff + howmany(sblock.fs_fpg, CHAR_BIT), "%jd");
	} else {
		CHK(cgp->cg_nclusterblks, !=, cgp->cg_ndblk / sblock.fs_frag,
		    "%jd");
		CHK(cgp->cg_clustersumoff, !=,
		    roundup(cgp->cg_freeoff + howmany(sblock.fs_fpg, CHAR_BIT),
		    sizeof(u_int32_t)) - sizeof(u_int32_t), "%jd");
		CHK(cgp->cg_clusteroff, !=, cgp->cg_clustersumoff +
		    (sblock.fs_contigsumsize + 1) * sizeof(u_int32_t), "%jd");
		CHK(cgp->cg_nextfreeoff, !=, cgp->cg_clusteroff +
		    howmany(fragstoblks(&sblock, sblock.fs_fpg), CHAR_BIT),
		    "%jd");
	}
	if (error == 0)
		return (1);
	/* Report a given group through pfatal() only once in a row. */
	if (prevfailcg == cg)
		return (0);
	prevfailcg = cg;
	pfatal("CYLINDER GROUP %d: INTEGRITY CHECK FAILED", cg);
	printf("\n");
	return (0);
}

/*
 * Reinitialize the header of cylinder group cg in the buffer cgbp and
 * mark the buffer dirty.  The layout written here is the one that
 * check_cgmagic() verifies.
 */
void
rebuild_cg(int cg, struct bufarea *cgbp)
{
	struct cg *cgp = cgbp->b_un.b_cg;
	long start;

	/*
	 * Zero out the cylinder group and then initialize critical fields.
	 * Bit maps and summaries will be recalculated by later passes.
	 */
	memset(cgp, 0, (size_t)sblock.fs_cgsize);
	cgp->cg_magic = CG_MAGIC;
	cgp->cg_cgx = cg;
	cgp->cg_niblk = sblock.fs_ipg;
	cgp->cg_initediblk = MIN(sblock.fs_ipg, 2 * INOPB(&sblock));
	/* A group that would extend past fs_size holds only the remainder. */
	if (cgbase(&sblock, cg) + sblock.fs_fpg < sblock.fs_size)
		cgp->cg_ndblk = sblock.fs_fpg;
	else
		cgp->cg_ndblk = sblock.fs_size - cgbase(&sblock, cg);
	start = sizeof(*cgp);
	if (sblock.fs_magic == FS_UFS2_MAGIC) {
		cgp->cg_iusedoff = start;
	} else if (sblock.fs_magic == FS_UFS1_MAGIC) {
		/* UFS1 uses the cg_old_* fields; the new ones are zeroed. */
		cgp->cg_niblk = 0;
		cgp->cg_initediblk = 0;
		cgp->cg_old_ncyl = sblock.fs_old_cpg;
		cgp->cg_old_niblk = sblock.fs_ipg;
		cgp->cg_old_btotoff = start;
		cgp->cg_old_boff = cgp->cg_old_btotoff +
		    sblock.fs_old_cpg * sizeof(int32_t);
		cgp->cg_iusedoff = cgp->cg_old_boff +
		    sblock.fs_old_cpg * sizeof(u_int16_t);
	}
	cgp->cg_freeoff = cgp->cg_iusedoff + howmany(sblock.fs_ipg, CHAR_BIT);
	cgp->cg_nextfreeoff = cgp->cg_freeoff + howmany(sblock.fs_fpg,CHAR_BIT);
	if (sblock.fs_contigsumsize > 0) {
		cgp->cg_nclusterblks = cgp->cg_ndblk / sblock.fs_frag;
		cgp->cg_clustersumoff =
		    roundup(cgp->cg_nextfreeoff, sizeof(u_int32_t));
		cgp->cg_clustersumoff -= sizeof(u_int32_t);
		cgp->cg_clusteroff = cgp->cg_clustersumoff +
		    (sblock.fs_contigsumsize + 1) * sizeof(u_int32_t);
		cgp->cg_nextfreeoff = cgp->cg_clusteroff +
		    howmany(fragstoblks(&sblock, sblock.fs_fpg), CHAR_BIT);
	}
	/* cg_ckhash is still zero from the memset while the hash is taken. */
	cgp->cg_ckhash = calculate_crc32c(~0L, (void *)cgp, sblock.fs_cgsize);
	cgdirty(cgbp);
}

/*
 * allocate a data block with the specified number of fragments
 *
 * The search starts at the data area of cylinder group startcg and runs
 * to the end of the file system, then wraps to the groups before startcg.
 * checkblkavail is called for each candidate block and returns 0 when the
 * block is not usable, a positive block number when it allocated one, or
 * a negative value whose magnitude is the block to continue from.
 *
 * Returns the allocated block number, or 0 when nothing was allocated.
 */
ufs2_daddr_t
allocblk(long startcg, long frags,
    ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t blkno, long frags))
{
	ufs2_daddr_t blkno, newblk;

	if (sujrecovery && checkblkavail == std_checkblkavail) {
		pfatal("allocblk: std_checkblkavail used for SUJ recovery\n");
		return (0);
	}
	if (frags <= 0 || frags > sblock.fs_frag)
		return (0);
	/* First pass: from startcg to the end of the file system. */
	for (blkno = MAX(cgdata(&sblock, startcg), 0);
	     blkno < maxfsblock - sblock.fs_frag;
	     blkno += sblock.fs_frag) {
		if ((newblk = (*checkblkavail)(blkno, frags)) == 0)
			continue;
		if (newblk > 0)
			return (newblk);
		if (newblk < 0)
			blkno = -newblk;
	}
	/* Second pass: from the first group up to startcg. */
	for (blkno = MAX(cgdata(&sblock, 0), 0);
	     blkno < cgbase(&sblock, startcg) - sblock.fs_frag;
	     blkno += sblock.fs_frag) {
		if ((newblk = (*checkblkavail)(blkno, frags)) == 0)
			continue;
		if (newblk > 0)
			return (newblk);
		if (newblk < 0)
			blkno = -newblk;
	}
	return (0);
}

/*
 * Try to allocate frags consecutive fragments inside the block that
 * starts at blkno, using the in-memory block map (testbmap/setbmap).
 *
 * On success the fragments are marked in the block map, cleared in the
 * cylinder group free map, the group summary and n_blks are updated, the
 * group buffer is marked dirty and the first fragment number is returned.
 * Returns 0 when no run of frags unmarked fragments exists in the block.
 * Returns a negative value when the cylinder group fails check_cgmagic();
 * its magnitude is one block before the start of the next group, so the
 * caller's loop increment resumes the search there.
 */
ufs2_daddr_t
std_checkblkavail(ufs2_daddr_t blkno, long frags)
{
	struct bufarea *cgbp;
	struct cg *cgp;
	ufs2_daddr_t j, k, baseblk;
	long cg;

	if ((u_int64_t)blkno > sblock.fs_size)
		return (0);
	for (j = 0; j <= sblock.fs_frag - frags; j++) {
		if (testbmap(blkno + j))
			continue;
		for (k = 1; k < frags; k++)
			if (testbmap(blkno + j + k))
				break;
		/* Fragment j + k is in use: resume the scan after it. */
		if (k < frags) {
			j += k;
			continue;
		}
		cg = dtog(&sblock, blkno + j);
		cgbp = cglookup(cg);
		cgp = cgbp->b_un.b_cg;
		if (!check_cgmagic(cg, cgbp))
			return (-((cg + 1) * sblock.fs_fpg - sblock.fs_frag));
		baseblk = dtogd(&sblock, blkno + j);
		for (k = 0; k < frags; k++) {
			setbmap(blkno + j + k);
			clrbit(cg_blksfree(cgp), baseblk + k);
		}
		n_blks += frags;
		/* A full block comes out of nbfree, a partial one out of nffree. */
		if (frags == sblock.fs_frag)
			cgp->cg_cs.cs_nbfree--;
		else
			cgp->cg_cs.cs_nffree -= frags;
		cgdirty(cgbp);
		return (blkno + j);
	}
	return (0);
}

+/*
+ * Check whether a file size is within the limits for the filesystem.
+ * Return 1 when valid and 0 when too big.
+ *
+ * This should match the file size limit in ffs_mountfs().
+ */
+int
+chkfilesize(mode_t mode, u_int64_t filesize)
+{
+	u_int64_t kernmaxfilesize;
+
+	if (sblock.fs_magic == FS_UFS1_MAGIC)
+		kernmaxfilesize = (off_t)0x40000000 * sblock.fs_bsize - 1;
+	else
+		kernmaxfilesize = sblock.fs_maxfilesize;
+	if (filesize > kernmaxfilesize ||
+	    filesize > sblock.fs_maxfilesize ||
+	    (mode == IFDIR && filesize > MAXDIRSIZE)) { /* directories have their own cap */
+		if (debug)
+			printf("bad file size %ju:", (uintmax_t)filesize);
+		return (0);
+	}
+	return (1);
+}
+
/*
 * Slow down IO so as to leave some disk bandwidth for other processes
 *
 * slowio_start() and slowio_end() bracket an I/O operation.  Only the
 * operation on which slowio_pollcnt wraps to 0 is timed and followed by
 * a delay.
 */
void
slowio_start()
{

	/* Delay one in every 8 operations */
	slowio_pollcnt = (slowio_pollcnt + 1) & 7;
	if (slowio_pollcnt == 0) {
		gettimeofday(&slowio_starttime, NULL);
	}
}

/*
 * Finish timing the operation started by slowio_start().  Returns at once
 * unless this is the timed operation (slowio_pollcnt == 0).
 */
void
slowio_end()
{
	struct timeval tv;
	int delay_usec;

	if (slowio_pollcnt != 0)
		return;

	/* Update the slowdown interval. */
	gettimeofday(&tv, NULL);
	delay_usec = (tv.tv_sec - slowio_starttime.tv_sec) * 1000000 +
	    (tv.tv_usec - slowio_starttime.tv_usec);
	/* Clamp the sample to the range 64 .. 2500000 microseconds. */
	if (delay_usec < 64)
		delay_usec = 64;
	if (delay_usec > 2500000)
		delay_usec = 2500000;
	/* Running average: the new sample carries a weight of 1/64. */
	slowio_delay_usec = (slowio_delay_usec * 63 + delay_usec) >> 6;
	/* delay by 8 times the average IO delay */
	if (slowio_delay_usec > 64)
		usleep(slowio_delay_usec * 8);
}

/*
 * Find a pathname
 *
 * Store in namebuf a pathname for inode ino, whose containing directory
 * is curdir (curdir == ino when ino is itself the directory to name).
 * namebuf is used up to index MAXPATHLEN - 1.
 *
 * The result is "/" for the root inode and "?" when the routine is
 * already active (busy) or curdir fails INO_IS_DVALID().  Otherwise the
 * name is assembled backwards from the end of namebuf, one component per
 * iteration, and moved to the front at the end.  A leading '?' marks a
 * walk that stopped before reaching UFS_ROOTINO.
 */
void
getpathname(char *namebuf, ino_t curdir, ino_t ino)
{
	int len;
	char *cp;
	struct inode ip;
	struct inodesc idesc;
	/* Guards against re-entry while a lookup is in progress. */
	static int busy = 0;

	if (curdir == ino && ino == UFS_ROOTINO) {
		(void)strcpy(namebuf, "/");
		return;
	}
	if (busy || !INO_IS_DVALID(curdir)) {
		(void)strcpy(namebuf, "?");
		return;
	}
	busy = 1;
	memset(&idesc, 0, sizeof(struct inodesc));
	idesc.id_type = DATA;
	idesc.id_fix = IGNORE;
	cp = &namebuf[MAXPATHLEN - 1];
	*cp = '\0';
	/* The parent is already known: skip the ".." lookup the first time. */
	if (curdir != ino) {
		idesc.id_parent = curdir;
		goto namelookup;
	}
	while (ino != UFS_ROOTINO) {
		/* Search directory ino for its ".." entry using findino. */
		idesc.id_number = ino;
		idesc.id_func = findino;
		idesc.id_name = strdup("..");
		ginode(ino, &ip);
		if ((ckinode(ip.i_dp, &idesc) & FOUND) == 0) {
			irelse(&ip);
			free(idesc.id_name);
			break;
		}
		irelse(&ip);
		free(idesc.id_name);
	namelookup:
		/* Search the parent for the entry of ino; findname fills namebuf. */
		idesc.id_number = idesc.id_parent;
		idesc.id_parent = ino;
		idesc.id_func = findname;
		idesc.id_name = namebuf;
		ginode(idesc.id_number, &ip);
		if ((ckinode(ip.i_dp, &idesc) & FOUND) == 0) {
			irelse(&ip);
			break;
		}
		irelse(&ip);
		/* Prepend "/<name>" to the path being built at cp. */
		len = strlen(namebuf);
		cp -= len;
		memmove(cp, namebuf, (size_t)len);
		*--cp = '/';
		/* Stop once less than UFS_MAXNAMLEN bytes remain in front. */
		if (cp < &namebuf[UFS_MAXNAMLEN])
			break;
		ino = idesc.id_number;
	}
	busy = 0;
	if (ino != UFS_ROOTINO)
		*--cp = '?';
	memmove(namebuf, cp, (size_t)(&namebuf[MAXPATHLEN] - cp));
}

/*
 * Run ckfini(0) and exit with status 12.  The sig argument is unused.
 * NOTE(review): presumably installed as a signal handler -- confirm at
 * the call site.
 */
void
catch(int sig __unused)
{

	ckfini(0);
	exit(12);
}

/*
 * When preening, allow a single quit to signal
 * a special exit after file system checks complete
 * so that reboot sequence may be interrupted.
 *
 * Sets returntosingle and restores the default SIGQUIT action.
 */
void
catchquit(int sig __unused)
{

	printf("returning to single-user after file system check\n");
	returntosingle = 1;
	(void)signal(SIGQUIT, SIG_DFL);
}

/*
 * determine whether an inode should be fixed.
 *
 * Returns ALTERED when the fix is to be applied and 0 when it is not.
 * In the DONTKNOW state msg is printed and the decision is made once
 * (automatically when preening, otherwise by asking "SALVAGE") and stored
 * in idesc->id_fix, so later calls answer without prompting.
 */
int
dofix(struct inodesc *idesc, const char *msg)
{

	switch (idesc->id_fix) {

	case DONTKNOW:
		if (idesc->id_type == DATA)
			direrror(idesc->id_number, msg);
		else
			pwarn("%s", msg);
		if (preen) {
			printf(" (SALVAGED)\n");
			idesc->id_fix = FIX;
			return (ALTERED);
		}
		if (reply("SALVAGE") == 0) {
			idesc->id_fix = NOFIX;
			return (0);
		}
		idesc->id_fix = FIX;
		return (ALTERED);

	case FIX:
		return (ALTERED);

	case NOFIX:
	case IGNORE:
		return (0);

	default:
		errx(EEXIT, "UNKNOWN INODESC FIX MODE %d", idesc->id_fix);
	}
	/* NOTREACHED */
	return (0);
}

#include

/*
 * Print details about a buffer.
 *
 * The printf-style message given by fmt is printed first (preceded by
 * cdevname when preening), followed by the fields of bp.
 *
 * NOTE(review): the header name after the #include directive above is
 * missing from this text -- presumably <stdarg.h>, given the va_list use
 * below; confirm against the upstream file.
 */
void
prtbuf(struct bufarea *bp, const char *fmt, ...)
{
	va_list ap;
	va_start(ap, fmt);
	if (preen)
		(void)fprintf(stdout, "%s: ", cdevname);
	(void)vfprintf(stdout, fmt, ap);
	va_end(ap);
	printf(": bp %p, type %s, bno %jd, size %d, refcnt %d, flags %s, "
	    "index %jd\n", bp, BT_BUFTYPE(bp->b_type), (intmax_t) bp->b_bno,
	    bp->b_size, bp->b_refcnt, bp->b_flags & B_DIRTY ? "dirty" : "clean",
	    (intmax_t) bp->b_index);
}

/*
 * An unexpected inconsistency occurred.
* Die if preening or file system is running with soft dependency protocol,
 * otherwise just print message and continue.
 */
void
pfatal(const char *fmt, ...)
{
	va_list ap;
	va_start(ap, fmt);
	/* Not preening: print the message and normally return. */
	if (!preen) {
		(void)vfprintf(stdout, fmt, ap);
		va_end(ap);
		if (usedsoftdep)
			(void)fprintf(stdout,
			    "\nUNEXPECTED SOFT UPDATE INCONSISTENCY\n");
		/*
		 * Force foreground fsck to clean up inconsistency.
		 * In background mode this path does not return.
		 */
		if (bkgrdflag) {
			cmd.value = FS_NEEDSFSCK;
			cmd.size = 1;
			if (sysctlbyname("vfs.ffs.setflags", 0, 0,
			    &cmd, sizeof cmd) == -1)
				pwarn("CANNOT SET FS_NEEDSFSCK FLAG\n");
			fprintf(stdout, "CANNOT RUN IN BACKGROUND\n");
			ckfini(0);
			exit(EEXIT);
		}
		return;
	}
	/* Preening: print with the device name prefix and exit. */
	if (cdevname == NULL)
		cdevname = strdup("fsck");
	(void)fprintf(stdout, "%s: ", cdevname);
	(void)vfprintf(stdout, fmt, ap);
	(void)fprintf(stdout,
	    "\n%s: UNEXPECTED%sINCONSISTENCY; RUN fsck MANUALLY.\n",
	    cdevname, usedsoftdep ? " SOFT UPDATE " : " ");
	/*
	 * Force foreground fsck to clean up inconsistency.
	 */
	if (bkgrdflag) {
		cmd.value = FS_NEEDSFSCK;
		cmd.size = 1;
		if (sysctlbyname("vfs.ffs.setflags", 0, 0,
		    &cmd, sizeof cmd) == -1)
			pwarn("CANNOT SET FS_NEEDSFSCK FLAG\n");
	}
	/* va_end() is not called on this path; the process exits here. */
	ckfini(0);
	exit(EEXIT);
}

/*
 * Pwarn just prints a message when not preening or running soft dependency
 * protocol, or a warning (preceded by filename) when preening.
 */
void
pwarn(const char *fmt, ...)
{
	va_list ap;
	va_start(ap, fmt);
	if (preen)
		(void)fprintf(stdout, "%s: ", cdevname);
	(void)vfprintf(stdout, fmt, ap);
	va_end(ap);
}

/*
 * Stub for routines from kernel.
 *
 * Reports "INTERNAL INCONSISTENCY:" through pfatal(), prints the
 * formatted message and exits with EEXIT.  When pfatal() itself exits
 * (see above), the message given here is never printed.
 */
void
panic(const char *fmt, ...)
{
	va_list ap;
	va_start(ap, fmt);
	pfatal("INTERNAL INCONSISTENCY:");
	(void)vfprintf(stdout, fmt, ap);
	va_end(ap);
	exit(EEXIT);
}
diff --git a/sbin/fsck_ffs/pass1.c b/sbin/fsck_ffs/pass1.c
index 863bf34ff0fc..d328234220ad 100644
--- a/sbin/fsck_ffs/pass1.c
+++ b/sbin/fsck_ffs/pass1.c
@@ -1,624 +1,614 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1980, 1986, 1993
 *	The Regents of the University of California.  All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #if 0 #ifndef lint static const char sccsid[] = "@(#)pass1.c 8.6 (Berkeley) 4/28/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include "fsck.h" static ufs2_daddr_t badblk; static ufs2_daddr_t dupblk; static ino_t lastino; /* last inode in use */ static int checkinode(ino_t inumber, struct inodesc *, int rebuiltcg); void pass1(void) { struct inostat *info; struct inodesc idesc; struct bufarea *cgbp; struct cg *cgp; ino_t inumber, inosused, mininos; ufs2_daddr_t i, cgd; u_int8_t *cp; int c, rebuiltcg; badblk = dupblk = lastino = 0; /* * Set file system reserved blocks in used block map. */ for (c = 0; c < sblock.fs_ncg; c++) { cgd = cgdmin(&sblock, c); if (c == 0) { i = cgbase(&sblock, c); } else i = cgsblock(&sblock, c); for (; i < cgd; i++) setbmap(i); } i = sblock.fs_csaddr; cgd = i + howmany(sblock.fs_cssize, sblock.fs_fsize); for (; i < cgd; i++) setbmap(i); /* * Find all allocated blocks. 
*/ memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_func = pass1check; n_files = n_blks = 0; for (c = 0; c < sblock.fs_ncg; c++) { inumber = c * sblock.fs_ipg; cgbp = cglookup(c); cgp = cgbp->b_un.b_cg; rebuiltcg = 0; if (!check_cgmagic(c, cgbp)) { if (!reply("REBUILD CYLINDER GROUP")) { cgheader_corrupt = 1; if (!nflag) { printf("YOU WILL NEED TO RERUN FSCK.\n"); rerun = 1; } } else { rebuild_cg(c, cgbp); rebuiltcg = 1; } } if (!rebuiltcg && sblock.fs_magic == FS_UFS2_MAGIC) { inosused = cgp->cg_initediblk; if (inosused > sblock.fs_ipg) { pfatal("Too many initialized inodes (%ju > %d) " "in cylinder group %d\nReset to %d\n", (uintmax_t)inosused, sblock.fs_ipg, c, sblock.fs_ipg); inosused = sblock.fs_ipg; } } else { inosused = sblock.fs_ipg; } if (got_siginfo) { printf("%s: phase 1: cyl group %d of %d (%d%%)\n", cdevname, c, sblock.fs_ncg, c * 100 / sblock.fs_ncg); got_siginfo = 0; } if (got_sigalarm) { setproctitle("%s p1 %d%%", cdevname, c * 100 / sblock.fs_ncg); got_sigalarm = 0; } /* * If we are using soft updates, then we can trust the * cylinder group inode allocation maps to tell us which * inodes are allocated. We will scan the used inode map * to find the inodes that are really in use, and then * read only those inodes in from disk. */ if ((preen || inoopt) && usedsoftdep && !rebuiltcg) { cp = &cg_inosused(cgp)[(inosused - 1) / CHAR_BIT]; for ( ; inosused != 0; cp--) { if (*cp == 0) { if (inosused > CHAR_BIT) inosused -= CHAR_BIT; else inosused = 0; continue; } for (i = 1 << (CHAR_BIT - 1); i > 0; i >>= 1) { if (*cp & i) break; inosused--; } break; } } /* * Allocate inoinfo structures for the allocated inodes. */ inostathead[c].il_numalloced = inosused; if (inosused == 0) { inostathead[c].il_stat = NULL; continue; } info = Calloc((unsigned)inosused, sizeof(struct inostat)); if (info == NULL) errx(EEXIT, "cannot alloc %u bytes for inoinfo", (unsigned)(sizeof(struct inostat) * inosused)); inostathead[c].il_stat = info; /* * Scan the allocated inodes. 
*/ setinodebuf(c, inosused); for (i = 0; i < inosused; i++, inumber++) { if (inumber < UFS_ROOTINO) { (void)getnextinode(inumber, rebuiltcg); continue; } /* * NULL return indicates probable end of allocated * inodes during cylinder group rebuild attempt. * We always keep trying until we get to the minimum * valid number for this cylinder group. */ if (checkinode(inumber, &idesc, rebuiltcg) == 0 && i > cgp->cg_initediblk) break; } /* * This optimization speeds up future runs of fsck * by trimming down the number of inodes in cylinder * groups that formerly had many inodes but now have * fewer in use. */ mininos = roundup(inosused + INOPB(&sblock), INOPB(&sblock)); if (inoopt && !preen && !rebuiltcg && sblock.fs_magic == FS_UFS2_MAGIC && cgp->cg_initediblk > 2 * INOPB(&sblock) && mininos < cgp->cg_initediblk) { i = cgp->cg_initediblk; if (mininos < 2 * INOPB(&sblock)) cgp->cg_initediblk = 2 * INOPB(&sblock); else cgp->cg_initediblk = mininos; pwarn("CYLINDER GROUP %d: RESET FROM %ju TO %d %s\n", c, i, cgp->cg_initediblk, "VALID INODES"); cgdirty(cgbp); } if (inosused < sblock.fs_ipg) continue; lastino += 1; if (lastino < (c * sblock.fs_ipg)) inosused = 0; else inosused = lastino - (c * sblock.fs_ipg); if (rebuiltcg && inosused > cgp->cg_initediblk && sblock.fs_magic == FS_UFS2_MAGIC) { cgp->cg_initediblk = roundup(inosused, INOPB(&sblock)); pwarn("CYLINDER GROUP %d: FOUND %d VALID INODES\n", c, cgp->cg_initediblk); } /* * If we were not able to determine in advance which inodes * were in use, then reduce the size of the inoinfo structure * to the size necessary to describe the inodes that we * really found. Always leave map space in the first cylinder * group in case we need to a root or lost+found directory. 
*/ if (inumber == lastino || c == 0) continue; inostathead[c].il_numalloced = inosused; if (inosused == 0) { free(inostathead[c].il_stat); inostathead[c].il_stat = NULL; continue; } info = Calloc((unsigned)inosused, sizeof(struct inostat)); if (info == NULL) errx(EEXIT, "cannot alloc %u bytes for inoinfo", (unsigned)(sizeof(struct inostat) * inosused)); memmove(info, inostathead[c].il_stat, inosused * sizeof(*info)); free(inostathead[c].il_stat); inostathead[c].il_stat = info; } freeinodebuf(); } static int checkinode(ino_t inumber, struct inodesc *idesc, int rebuiltcg) { struct inode ip; union dinode *dp; - off_t kernmaxfilesize; ufs2_daddr_t ndb; mode_t mode; intmax_t size, fixsize; int j, ret, offset; if ((dp = getnextinode(inumber, rebuiltcg)) == NULL) { pfatal("INVALID INODE"); goto unknown; } mode = DIP(dp, di_mode) & IFMT; if (mode == 0) { if ((sblock.fs_magic == FS_UFS1_MAGIC && (memcmp(dp->dp1.di_db, zino.dp1.di_db, UFS_NDADDR * sizeof(ufs1_daddr_t)) || memcmp(dp->dp1.di_ib, zino.dp1.di_ib, UFS_NIADDR * sizeof(ufs1_daddr_t)) || dp->dp1.di_mode || dp->dp1.di_size)) || (sblock.fs_magic == FS_UFS2_MAGIC && (memcmp(dp->dp2.di_db, zino.dp2.di_db, UFS_NDADDR * sizeof(ufs2_daddr_t)) || memcmp(dp->dp2.di_ib, zino.dp2.di_ib, UFS_NIADDR * sizeof(ufs2_daddr_t)) || dp->dp2.di_mode || dp->dp2.di_size))) { pfatal("PARTIALLY ALLOCATED INODE I=%lu", (u_long)inumber); if (reply("CLEAR") == 1) { ginode(inumber, &ip); clearinode(ip.i_dp); inodirty(&ip); irelse(&ip); } } inoinfo(inumber)->ino_state = USTATE; return (1); } lastino = inumber; - /* This should match the file size limit in ffs_mountfs(). 
*/ - if (sblock.fs_magic == FS_UFS1_MAGIC) - kernmaxfilesize = (off_t)0x40000000 * sblock.fs_bsize - 1; - else - kernmaxfilesize = sblock.fs_maxfilesize; - if (DIP(dp, di_size) > kernmaxfilesize || - DIP(dp, di_size) > sblock.fs_maxfilesize || - (mode == IFDIR && DIP(dp, di_size) > MAXDIRSIZE)) { - if (debug) - printf("bad size %ju:", (uintmax_t)DIP(dp, di_size)); + if (chkfilesize(mode, DIP(dp, di_size)) == 0) { pfatal("BAD FILE SIZE"); goto unknown; } if (!preen && mode == IFMT && reply("HOLD BAD BLOCK") == 1) { ginode(inumber, &ip); dp = ip.i_dp; DIP_SET(dp, di_size, sblock.fs_fsize); DIP_SET(dp, di_mode, IFREG|0600); inodirty(&ip); irelse(&ip); } if ((mode == IFBLK || mode == IFCHR || mode == IFIFO || mode == IFSOCK) && DIP(dp, di_size) != 0) { if (debug) printf("bad special-file size %ju:", (uintmax_t)DIP(dp, di_size)); pfatal("BAD SPECIAL-FILE SIZE"); goto unknown; } if ((mode == IFBLK || mode == IFCHR) && (dev_t)DIP(dp, di_rdev) == NODEV) { if (debug) printf("bad special-file rdev NODEV:"); pfatal("BAD SPECIAL-FILE RDEV"); goto unknown; } ndb = howmany(DIP(dp, di_size), sblock.fs_bsize); if (ndb < 0) { if (debug) printf("negative size %ju ndb %ju:", (uintmax_t)DIP(dp, di_size), (uintmax_t)ndb); pfatal("NEGATIVE FILE SIZE"); goto unknown; } if (mode == IFBLK || mode == IFCHR) ndb++; if (mode == IFLNK) { /* * Fake ndb value so direct/indirect block checks below * will detect any garbage after symlink string. 
*/ if (DIP(dp, di_size) < (off_t)sblock.fs_maxsymlinklen) { if (sblock.fs_magic == FS_UFS1_MAGIC) ndb = howmany(DIP(dp, di_size), sizeof(ufs1_daddr_t)); else ndb = howmany(DIP(dp, di_size), sizeof(ufs2_daddr_t)); if (ndb > UFS_NDADDR) { j = ndb - UFS_NDADDR; for (ndb = 1; j > 1; j--) ndb *= NINDIR(&sblock); ndb += UFS_NDADDR; } } } for (j = ndb; ndb < UFS_NDADDR && j < UFS_NDADDR; j++) { if (DIP(dp, di_db[j]) == 0) continue; if (debug) printf("invalid direct addr[%d]: %ju\n", j, (uintmax_t)DIP(dp, di_db[j])); pfatal("INVALID DIRECT BLOCK"); ginode(inumber, &ip); prtinode(&ip); if (reply("CLEAR") == 1) { DIP_SET(ip.i_dp, di_db[j], 0); inodirty(&ip); } irelse(&ip); } for (j = 0, ndb -= UFS_NDADDR; ndb > 0; j++) ndb /= NINDIR(&sblock); for (; j < UFS_NIADDR; j++) { if (DIP(dp, di_ib[j]) == 0) continue; if (debug) printf("invalid indirect addr: %ju\n", (uintmax_t)DIP(dp, di_ib[j])); pfatal("INVALID INDIRECT BLOCK"); ginode(inumber, &ip); prtinode(&ip); if (reply("CLEAR") == 1) { DIP_SET(ip.i_dp, di_ib[j], 0); inodirty(&ip); } irelse(&ip); } if (ftypeok(dp) == 0) { pfatal("UNKNOWN FILE TYPE"); goto unknown; } n_files++; inoinfo(inumber)->ino_linkcnt = DIP(dp, di_nlink); if (mode == IFDIR) { if (DIP(dp, di_size) == 0) { inoinfo(inumber)->ino_state = DCLEAR; } else if (DIP(dp, di_nlink) == 0) { inoinfo(inumber)->ino_state = DZLINK; } else { inoinfo(inumber)->ino_state = DSTATE; } cacheino(dp, inumber); countdirs++; } else if (DIP(dp, di_nlink) <= 0) inoinfo(inumber)->ino_state = FZLINK; else inoinfo(inumber)->ino_state = FSTATE; inoinfo(inumber)->ino_type = IFTODT(mode); badblk = dupblk = 0; idesc->id_number = inumber; if (DIP(dp, di_flags) & SF_SNAPSHOT) inoinfo(inumber)->ino_idtype = SNAP; else inoinfo(inumber)->ino_idtype = ADDR; idesc->id_type = inoinfo(inumber)->ino_idtype; (void)ckinode(dp, idesc); if (sblock.fs_magic == FS_UFS2_MAGIC && dp->dp2.di_extsize > 0) { ndb = howmany(dp->dp2.di_extsize, sblock.fs_bsize); for (j = 0; j < UFS_NXADDR; j++) { if (--ndb == 0 && 
(offset = blkoff(&sblock, dp->dp2.di_extsize)) != 0) idesc->id_numfrags = numfrags(&sblock, fragroundup(&sblock, offset)); else idesc->id_numfrags = sblock.fs_frag; if (dp->dp2.di_extb[j] == 0) continue; idesc->id_blkno = dp->dp2.di_extb[j]; ret = (*idesc->id_func)(idesc); if (ret & STOP) break; } } if (sblock.fs_magic == FS_UFS2_MAGIC) eascan(idesc, &dp->dp2); idesc->id_entryno *= btodb(sblock.fs_fsize); if (DIP(dp, di_blocks) != idesc->id_entryno) { pwarn("INCORRECT BLOCK COUNT I=%lu (%ju should be %ju)", (u_long)inumber, (uintmax_t)DIP(dp, di_blocks), (uintmax_t)idesc->id_entryno); if (preen) printf(" (CORRECTED)\n"); else if (reply("CORRECT") == 0) return (1); if (bkgrdflag == 0) { ginode(inumber, &ip); DIP_SET(ip.i_dp, di_blocks, idesc->id_entryno); inodirty(&ip); irelse(&ip); } else { cmd.value = idesc->id_number; cmd.size = idesc->id_entryno - DIP(dp, di_blocks); if (debug) printf("adjblkcnt ino %ju amount %lld\n", (uintmax_t)cmd.value, (long long)cmd.size); if (sysctl(adjblkcnt, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("ADJUST INODE BLOCK COUNT", cmd.value); } } /* * UFS does not allow files to end with a hole; it requires that * the last block of a file be allocated. The last allocated block * in a file is tracked in id_lballoc. Here, we check for a size * past the last allocated block of the file and if that is found, * shorten the file to reference the last allocated block to avoid * having it reference a hole at its end. * * Soft updates will always ensure that the file size is correct * for files that contain only direct block pointers. However * soft updates does not roll back sizes for files with indirect * blocks that it has set to unallocated because their contents * have not yet been written to disk. Hence, the file can appear * to have a hole at its end because the block pointer has been * rolled back to zero. 
Thus finding a hole at the end of a file * that is located in an indirect block receives only a warning * while finding a hole at the end of a file in a direct block * receives a fatal error message. */ size = DIP(dp, di_size); if (idesc->id_lballoc < lblkno(&sblock, size - 1) && /* exclude embedded symbolic links */ ((mode != IFLNK) || size >= sblock.fs_maxsymlinklen)) { fixsize = lblktosize(&sblock, idesc->id_lballoc + 1); if (size > UFS_NDADDR * sblock.fs_bsize) pwarn("INODE %lu: FILE SIZE %ju BEYOND END OF " "ALLOCATED FILE, SIZE SHOULD BE %ju", (u_long)inumber, size, fixsize); else pfatal("INODE %lu: FILE SIZE %ju BEYOND END OF " "ALLOCATED FILE, SIZE SHOULD BE %ju", (u_long)inumber, size, fixsize); if (preen) printf(" (ADJUSTED)\n"); else if (reply("ADJUST") == 0) return (1); if (bkgrdflag == 0) { ginode(inumber, &ip); DIP_SET(ip.i_dp, di_size, fixsize); inodirty(&ip); irelse(&ip); } else { cmd.value = idesc->id_number; cmd.size = fixsize; if (debug) printf("setsize ino %ju size set to %ju\n", (uintmax_t)cmd.value, (uintmax_t)cmd.size); if (sysctl(setsize, MIBSIZE, 0, 0, &cmd, sizeof cmd) == -1) rwerror("SET INODE SIZE", cmd.value); } } return (1); unknown: ginode(inumber, &ip); prtinode(&ip); inoinfo(inumber)->ino_state = USTATE; if (reply("CLEAR") == 1) { clearinode(ip.i_dp); inodirty(&ip); } irelse(&ip); return (1); } int pass1check(struct inodesc *idesc) { int res = KEEPON; int anyout, nfrags; ufs2_daddr_t blkno = idesc->id_blkno; struct dups *dlp; struct dups *new; if (idesc->id_type == SNAP) { if (blkno == BLK_NOCOPY) return (KEEPON); if (idesc->id_number == cursnapshot) { if (blkno == blkstofrags(&sblock, idesc->id_lbn)) return (KEEPON); if (blkno == BLK_SNAP) { blkno = blkstofrags(&sblock, idesc->id_lbn); idesc->id_entryno -= idesc->id_numfrags; } } else { if (blkno == BLK_SNAP) return (KEEPON); } } if ((anyout = chkrange(blkno, idesc->id_numfrags)) != 0) { blkerror(idesc->id_number, "BAD", blkno); if (badblk++ >= MAXBAD) { pwarn("EXCESSIVE BAD BLKS 
I=%lu", (u_long)idesc->id_number); if (preen) printf(" (SKIPPING)\n"); else if (reply("CONTINUE") == 0) { ckfini(0); exit(EEXIT); } rerun = 1; return (STOP); } } for (nfrags = idesc->id_numfrags; nfrags > 0; blkno++, nfrags--) { if (anyout && chkrange(blkno, 1)) { res = SKIP; } else if (!testbmap(blkno)) { n_blks++; setbmap(blkno); } else { blkerror(idesc->id_number, "DUP", blkno); if (dupblk++ >= MAXDUP) { pwarn("EXCESSIVE DUP BLKS I=%lu", (u_long)idesc->id_number); if (preen) printf(" (SKIPPING)\n"); else if (reply("CONTINUE") == 0) { ckfini(0); exit(EEXIT); } rerun = 1; return (STOP); } new = (struct dups *)Malloc(sizeof(struct dups)); if (new == NULL) { pfatal("DUP TABLE OVERFLOW."); if (reply("CONTINUE") == 0) { ckfini(0); exit(EEXIT); } rerun = 1; return (STOP); } new->dup = blkno; if (muldup == NULL) { duplist = muldup = new; new->next = NULL; } else { new->next = muldup->next; muldup->next = new; } for (dlp = duplist; dlp != muldup; dlp = dlp->next) if (dlp->dup == blkno) break; if (dlp == muldup && dlp->dup != blkno) muldup = new; } /* * count the number of blocks found in id_entryno */ idesc->id_entryno++; } if (idesc->id_level == 0 && idesc->id_lballoc < idesc->id_lbn) idesc->id_lballoc = idesc->id_lbn; return (res); } diff --git a/sbin/fsck_ffs/suj.c b/sbin/fsck_ffs/suj.c index 8fed3d7723d6..d51e0ff4d83b 100644 --- a/sbin/fsck_ffs/suj.c +++ b/sbin/fsck_ffs/suj.c @@ -1,2522 +1,2525 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright 2009, 2010 Jeffrey W. Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fsck.h" #define DOTDOT_OFFSET DIRECTSIZ(1) struct suj_seg { TAILQ_ENTRY(suj_seg) ss_next; struct jsegrec ss_rec; uint8_t *ss_blk; }; struct suj_rec { TAILQ_ENTRY(suj_rec) sr_next; union jrec *sr_rec; }; TAILQ_HEAD(srechd, suj_rec); struct suj_ino { LIST_ENTRY(suj_ino) si_next; struct srechd si_recs; struct srechd si_newrecs; struct srechd si_movs; struct jtrncrec *si_trunc; ino_t si_ino; char si_skipparent; char si_hasrecs; char si_blkadj; char si_linkadj; int si_mode; nlink_t si_nlinkadj; nlink_t si_nlink; nlink_t si_dotlinks; }; LIST_HEAD(inohd, suj_ino); struct suj_blk { LIST_ENTRY(suj_blk) sb_next; struct srechd sb_recs; ufs2_daddr_t sb_blk; }; LIST_HEAD(blkhd, suj_blk); struct suj_cg { LIST_ENTRY(suj_cg) sc_next; struct blkhd sc_blkhash[HASHSIZE]; struct inohd 
sc_inohash[HASHSIZE]; struct ino_blk *sc_lastiblk; struct suj_ino *sc_lastino; struct suj_blk *sc_lastblk; struct bufarea *sc_cgbp; struct cg *sc_cgp; int sc_cgx; }; static LIST_HEAD(cghd, suj_cg) cghash[HASHSIZE]; static struct suj_cg *lastcg; static TAILQ_HEAD(seghd, suj_seg) allsegs; static uint64_t oldseq; static struct fs *fs = NULL; static ino_t sujino; /* * Summary statistics. */ static uint64_t freefrags; static uint64_t freeblocks; static uint64_t freeinos; static uint64_t freedir; static uint64_t jbytes; static uint64_t jrecs; static jmp_buf jmpbuf; typedef void (*ino_visitor)(ino_t, ufs_lbn_t, ufs2_daddr_t, int); static void err_suj(const char *, ...) __dead2; static void ino_trunc(ino_t, off_t); static void ino_decr(ino_t); static void ino_adjust(struct suj_ino *); static void ino_build(struct suj_ino *); static int blk_isfree(ufs2_daddr_t); static void initsuj(void); static void * errmalloc(size_t n) { void *a; a = Malloc(n); if (a == NULL) err(EX_OSERR, "malloc(%zu)", n); return (a); } /* * When hit a fatal error in journalling check, print out * the error and then offer to fallback to normal fsck. */ static void err_suj(const char * restrict fmt, ...) { va_list ap; if (preen) (void)fprintf(stdout, "%s: ", cdevname); va_start(ap, fmt); (void)vfprintf(stdout, fmt, ap); va_end(ap); longjmp(jmpbuf, -1); } /* * Lookup a cg by number in the hash so we can keep track of which cgs * need stats rebuilt. 
*/
static struct suj_cg *
cg_lookup(int cgx)
{
	struct suj_cg *entry;
	struct cghd *bucket;
	struct bufarea *bp;

	/* Reject group numbers outside the filesystem before doing anything. */
	if (cgx < 0 || cgx >= fs->fs_ncg)
		err_suj("Bad cg number %d\n", cgx);

	/* Fast path: the group found by the previous hash hit is in lastcg. */
	if (lastcg != NULL && lastcg->sc_cgx == cgx)
		return (lastcg);

	bp = cglookup(cgx);
	if (check_cgmagic(cgx, bp) == 0)
		err_suj("UNABLE TO REBUILD CYLINDER GROUP %d", cgx);

	bucket = &cghash[HASH(cgx)];
	LIST_FOREACH(entry, bucket, sc_next) {
		if (entry->sc_cgx == cgx)
			break;
	}
	if (entry != NULL) {
		/* Known group: refresh its buffer pointers and remember it. */
		entry->sc_cgbp = bp;
		entry->sc_cgp = bp->b_un.b_cg;
		lastcg = entry;
		return (entry);
	}

	/* First reference to this group: create a zeroed tracking entry. */
	entry = errmalloc(sizeof(*entry));
	bzero(entry, sizeof(*entry));
	entry->sc_cgx = cgx;
	entry->sc_cgbp = bp;
	entry->sc_cgp = bp->b_un.b_cg;
	LIST_INSERT_HEAD(bucket, entry, sc_next);
	return (entry);
}

/*
 * Find the suj_ino tracking inode 'ino' in its cylinder group's hash.
 * When no entry exists, one is created only if 'creat' is non-zero;
 * otherwise NULL is returned.
 */
static struct suj_ino *
ino_lookup(ino_t ino, int creat)
{
	struct suj_cg *group;
	struct inohd *bucket;
	struct suj_ino *entry;

	group = cg_lookup(ino_to_cg(fs, ino));

	/* Per-group shortcut for a repeated lookup of the same inode. */
	if (group->sc_lastino != NULL && group->sc_lastino->si_ino == ino)
		return (group->sc_lastino);

	bucket = &group->sc_inohash[HASH(ino)];
	LIST_FOREACH(entry, bucket, si_next) {
		if (entry->si_ino == ino)
			break;
	}
	if (entry != NULL)
		return (entry);
	if (creat == 0)
		return (NULL);

	/* Not tracked yet: start with a zeroed entry and empty record lists. */
	entry = errmalloc(sizeof(*entry));
	bzero(entry, sizeof(*entry));
	entry->si_ino = ino;
	TAILQ_INIT(&entry->si_recs);
	TAILQ_INIT(&entry->si_newrecs);
	TAILQ_INIT(&entry->si_movs);
	LIST_INSERT_HEAD(bucket, entry, si_next);
	return (entry);
}

/*
 * Lookup a block number in the hash and allocate a suj_blk if it does
 * not exist.
*/ static struct suj_blk * blk_lookup(ufs2_daddr_t blk, int creat) { struct suj_blk *sblk; struct suj_cg *sc; struct blkhd *hd; sc = cg_lookup(dtog(fs, blk)); if (sc->sc_lastblk && sc->sc_lastblk->sb_blk == blk) return (sc->sc_lastblk); hd = &sc->sc_blkhash[HASH(fragstoblks(fs, blk))]; LIST_FOREACH(sblk, hd, sb_next) if (sblk->sb_blk == blk) return (sblk); if (creat == 0) return (NULL); sblk = errmalloc(sizeof(*sblk)); bzero(sblk, sizeof(*sblk)); sblk->sb_blk = blk; TAILQ_INIT(&sblk->sb_recs); LIST_INSERT_HEAD(hd, sblk, sb_next); return (sblk); } static int blk_overlaps(struct jblkrec *brec, ufs2_daddr_t start, int frags) { ufs2_daddr_t bstart; ufs2_daddr_t bend; ufs2_daddr_t end; end = start + frags; bstart = brec->jb_blkno + brec->jb_oldfrags; bend = bstart + brec->jb_frags; if (start < bend && end > bstart) return (1); return (0); } static int blk_equals(struct jblkrec *brec, ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t start, int frags) { if (brec->jb_ino != ino || brec->jb_lbn != lbn) return (0); if (brec->jb_blkno + brec->jb_oldfrags != start) return (0); if (brec->jb_frags < frags) return (0); return (1); } static void blk_setmask(struct jblkrec *brec, int *mask) { int i; for (i = brec->jb_oldfrags; i < brec->jb_oldfrags + brec->jb_frags; i++) *mask |= 1 << i; } /* * Determine whether a given block has been reallocated to a new location. * Returns a mask of overlapping bits if any frags have been reused or * zero if the block has not been re-used and the contents can be trusted. * * This is used to ensure that an orphaned pointer due to truncate is safe * to be freed. The mask value can be used to free partial blocks. 
*/
/*
 * Arguments:
 *	blk	first fragment of the range being examined
 *	ino	inode the range is expected to belong to
 *	lbn	logical block number the range is expected to sit at
 *	frags	number of fragments in the range
 *
 * The returned mask is shifted right by the offset of 'blk' within its
 * block, so bit 0 corresponds to 'blk' itself.
 */
static int
blk_freemask(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags)
{
	struct suj_blk *sblk;
	struct suj_rec *srec;
	struct jblkrec *brec;
	int mask;
	int off;

	/*
	 * To be certain we're not freeing a reallocated block we lookup
	 * this block in the blk hash and see if there is an allocation
	 * journal record that overlaps with any fragments in the block
	 * we're concerned with. If any fragments have been reallocated
	 * the block has already been freed and re-used for another purpose.
	 */
	mask = 0;
	/* Records are keyed by the block-aligned address; never create one. */
	sblk = blk_lookup(blknum(fs, blk), 0);
	if (sblk == NULL)
		return (0);
	/* Fragment offset of 'blk' inside the block the records describe. */
	off = blk - sblk->sb_blk;
	/*
	 * Records are walked in list order and a later record overrides the
	 * effect of earlier ones: an exact match clears everything gathered
	 * so far, any other overlapping record adds its fragments.
	 */
	TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) {
		brec = (struct jblkrec *)srec->sr_rec;
		/*
		 * If the block overlaps but does not match
		 * exactly this record refers to the current
		 * location.
		 */
		if (blk_overlaps(brec, blk, frags) == 0)
			continue;
		if (blk_equals(brec, ino, lbn, blk, frags) == 1)
			mask = 0;
		else
			blk_setmask(brec, &mask);
	}
	if (debug)
		printf("blk_freemask: blk %jd sblk %jd off %d mask 0x%X\n",
		    blk, sblk->sb_blk, off, mask);
	return (mask >> off);
}

/*
 * Determine whether it is safe to follow an indirect. It is not safe
 * if any part of the indirect has been reallocated or the last journal
 * entry was an allocation. Just allocated indirects may not have valid
 * pointers yet and all of their children will have their own records.
 * It is also not safe to follow an indirect if the cg bitmap has been
 * cleared as a new allocation may write to the block prior to the journal
 * being written.
 *
 * Returns 1 if it's safe to follow the indirect and 0 otherwise.
*/ static int blk_isindir(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn) { struct suj_blk *sblk; struct jblkrec *brec; sblk = blk_lookup(blk, 0); if (sblk == NULL) return (1); if (TAILQ_EMPTY(&sblk->sb_recs)) return (1); brec = (struct jblkrec *)TAILQ_LAST(&sblk->sb_recs, srechd)->sr_rec; if (blk_equals(brec, ino, lbn, blk, fs->fs_frag)) if (brec->jb_op == JOP_FREEBLK) return (!blk_isfree(blk)); return (0); } /* * Check to see if the requested block is available. * We can just check in the cylinder-group maps as * they will only have usable blocks in them. */ ufs2_daddr_t suj_checkblkavail(ufs2_daddr_t blkno, long frags) { struct bufarea *cgbp; struct cg *cgp; ufs2_daddr_t j, k, baseblk; long cg; if ((u_int64_t)blkno > sblock.fs_size) return (0); cg = dtog(&sblock, blkno); cgbp = cglookup(cg); cgp = cgbp->b_un.b_cg; if (!check_cgmagic(cg, cgbp)) return (-((cg + 1) * sblock.fs_fpg - sblock.fs_frag)); baseblk = dtogd(&sblock, blkno); for (j = 0; j <= sblock.fs_frag - frags; j++) { if (!isset(cg_blksfree(cgp), baseblk + j)) continue; for (k = 1; k < frags; k++) if (!isset(cg_blksfree(cgp), baseblk + j + k)) break; if (k < frags) { j += k; continue; } for (k = 0; k < frags; k++) clrbit(cg_blksfree(cgp), baseblk + j + k); n_blks += frags; if (frags == sblock.fs_frag) cgp->cg_cs.cs_nbfree--; else cgp->cg_cs.cs_nffree -= frags; cgdirty(cgbp); return ((cg * sblock.fs_fpg) + baseblk + j); } return (0); } /* * Clear an inode from the cg bitmap. If the inode was already clear return * 0 so the caller knows it does not have to check the inode contents. */ static int ino_free(ino_t ino, int mode) { struct suj_cg *sc; uint8_t *inosused; struct cg *cgp; int cg; cg = ino_to_cg(fs, ino); ino = ino % fs->fs_ipg; sc = cg_lookup(cg); cgp = sc->sc_cgp; inosused = cg_inosused(cgp); /* * The bitmap may never have made it to the disk so we have to * conditionally clear. We can avoid writing the cg in this case. 
*/ if (isclr(inosused, ino)) return (0); freeinos++; clrbit(inosused, ino); if (ino < cgp->cg_irotor) cgp->cg_irotor = ino; cgp->cg_cs.cs_nifree++; if ((mode & IFMT) == IFDIR) { freedir++; cgp->cg_cs.cs_ndir--; } cgdirty(sc->sc_cgbp); return (1); } /* * Free 'frags' frags starting at filesystem block 'bno' skipping any frags * set in the mask. */ static void blk_free(ino_t ino, ufs2_daddr_t bno, int mask, int frags) { ufs1_daddr_t fragno, cgbno; struct suj_cg *sc; struct cg *cgp; int i, cg; uint8_t *blksfree; if (debug) printf("Freeing %d frags at blk %jd mask 0x%x\n", frags, bno, mask); /* * Check to see if the block needs to be claimed by a snapshot. * If wanted, the snapshot references it. Otherwise we free it. */ if (snapblkfree(fs, bno, lfragtosize(fs, frags), ino, suj_checkblkavail)) return; cg = dtog(fs, bno); sc = cg_lookup(cg); cgp = sc->sc_cgp; cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp); /* * If it's not allocated we only wrote the journal entry * and never the bitmaps. Here we unconditionally clear and * resolve the cg summary later. */ if (frags == fs->fs_frag && mask == 0) { fragno = fragstoblks(fs, cgbno); ffs_setblock(fs, blksfree, fragno); freeblocks++; } else { /* * deallocate the fragment */ for (i = 0; i < frags; i++) if ((mask & (1 << i)) == 0 && isclr(blksfree, cgbno +i)) { freefrags++; setbit(blksfree, cgbno + i); } } cgdirty(sc->sc_cgbp); } /* * Returns 1 if the whole block starting at 'bno' is marked free and 0 * otherwise. */ static int blk_isfree(ufs2_daddr_t bno) { struct suj_cg *sc; sc = cg_lookup(dtog(fs, bno)); return ffs_isblock(fs, cg_blksfree(sc->sc_cgp), dtogd(fs, bno)); } /* * Determine whether a block exists at a particular lbn in an inode. * Returns 1 if found, 0 if not. lbn may be negative for indirects * or ext blocks. 
*/ static int blk_isat(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int *frags) { struct inode ip; union dinode *dp; ufs2_daddr_t nblk; ginode(ino, &ip); dp = ip.i_dp; if (DIP(dp, di_nlink) == 0 || DIP(dp, di_mode) == 0) { irelse(&ip); return (0); } nblk = ino_blkatoff(dp, ino, lbn, frags, NULL); irelse(&ip); return (nblk == blk); } /* * Clear the directory entry at diroff that should point to child. Minimal * checking is done and it is assumed that this path was verified with isat. */ static void ino_clrat(ino_t parent, off_t diroff, ino_t child) { union dinode *dip; struct direct *dp; struct inode ip; ufs2_daddr_t blk; struct bufarea *bp; ufs_lbn_t lbn; int blksize; int frags; int doff; if (debug) printf("Clearing inode %ju from parent %ju at offset %jd\n", (uintmax_t)child, (uintmax_t)parent, diroff); lbn = lblkno(fs, diroff); doff = blkoff(fs, diroff); ginode(parent, &ip); dip = ip.i_dp; blk = ino_blkatoff(dip, parent, lbn, &frags, NULL); blksize = sblksize(fs, DIP(dip, di_size), lbn); irelse(&ip); bp = getdatablk(blk, blksize, BT_DIRDATA); if (bp->b_errs != 0) err_suj("ino_clrat: UNRECOVERABLE I/O ERROR"); dp = (struct direct *)&bp->b_un.b_buf[doff]; if (dp->d_ino != child) errx(1, "Inode %ju does not exist in %ju at %jd", (uintmax_t)child, (uintmax_t)parent, diroff); dp->d_ino = 0; dirty(bp); brelse(bp); /* * The actual .. reference count will already have been removed * from the parent by the .. remref record. */ } /* * Determines whether a pointer to an inode exists within a directory * at a specified offset. Returns the mode of the found entry. */ static int ino_isat(ino_t parent, off_t diroff, ino_t child, int *mode, int *isdot) { struct inode ip; union dinode *dip; struct bufarea *bp; struct direct *dp; ufs2_daddr_t blk; ufs_lbn_t lbn; int blksize; int frags; int dpoff; int doff; *isdot = 0; ginode(parent, &ip); dip = ip.i_dp; *mode = DIP(dip, di_mode); if ((*mode & IFMT) != IFDIR) { if (debug) { /* * This can happen if the parent inode * was reallocated. 
*/ if (*mode != 0) printf("Directory %ju has bad mode %o\n", (uintmax_t)parent, *mode); else printf("Directory %ju has zero mode\n", (uintmax_t)parent); } irelse(&ip); return (0); } lbn = lblkno(fs, diroff); doff = blkoff(fs, diroff); blksize = sblksize(fs, DIP(dip, di_size), lbn); if (diroff + DIRECTSIZ(1) > DIP(dip, di_size) || doff >= blksize) { if (debug) printf("ino %ju absent from %ju due to offset %jd" " exceeding size %jd\n", (uintmax_t)child, (uintmax_t)parent, diroff, DIP(dip, di_size)); irelse(&ip); return (0); } blk = ino_blkatoff(dip, parent, lbn, &frags, NULL); irelse(&ip); if (blk <= 0) { if (debug) printf("Sparse directory %ju", (uintmax_t)parent); return (0); } bp = getdatablk(blk, blksize, BT_DIRDATA); if (bp->b_errs != 0) err_suj("ino_isat: UNRECOVERABLE I/O ERROR"); /* * Walk through the records from the start of the block to be * certain we hit a valid record and not some junk in the middle * of a file name. Stop when we reach or pass the expected offset. */ dpoff = rounddown(doff, DIRBLKSIZ); do { dp = (struct direct *)&bp->b_un.b_buf[dpoff]; if (dpoff == doff) break; if (dp->d_reclen == 0) break; dpoff += dp->d_reclen; } while (dpoff <= doff); if (dpoff > fs->fs_bsize) err_suj("Corrupt directory block in dir ino %ju\n", (uintmax_t)parent); /* Not found. */ if (dpoff != doff) { if (debug) printf("ino %ju not found in %ju, lbn %jd, dpoff %d\n", (uintmax_t)child, (uintmax_t)parent, lbn, dpoff); brelse(bp); return (0); } /* * We found the item in question. Record the mode and whether it's * a . or .. link for the caller. */ if (dp->d_ino == child) { if (child == parent) *isdot = 1; else if (dp->d_namlen == 2 && dp->d_name[0] == '.' 
&& dp->d_name[1] == '.') *isdot = 1; *mode = DTTOIF(dp->d_type); brelse(bp); return (1); } if (debug) printf("ino %ju doesn't match dirent ino %ju in parent %ju\n", (uintmax_t)child, (uintmax_t)dp->d_ino, (uintmax_t)parent); brelse(bp); return (0); } #define VISIT_INDIR 0x0001 #define VISIT_EXT 0x0002 #define VISIT_ROOT 0x0004 /* Operation came via root & valid pointers. */ /* * Read an indirect level which may or may not be linked into an inode. */ static void indir_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, uint64_t *frags, ino_visitor visitor, int flags) { struct bufarea *bp; ufs_lbn_t lbnadd; ufs2_daddr_t nblk; ufs_lbn_t nlbn; int level; int i; /* * Don't visit indirect blocks with contents we can't trust. This * should only happen when indir_visit() is called to complete a * truncate that never finished and not when a pointer is found via * an inode. */ if (blk == 0) return; level = lbn_level(lbn); if (level == -1) err_suj("Invalid level for lbn %jd\n", lbn); if ((flags & VISIT_ROOT) == 0 && blk_isindir(blk, ino, lbn) == 0) { if (debug) printf("blk %jd ino %ju lbn %jd(%d) is not indir.\n", blk, (uintmax_t)ino, lbn, level); goto out; } lbnadd = 1; for (i = level; i > 0; i--) lbnadd *= NINDIR(fs); bp = getdatablk(blk, fs->fs_bsize, BT_LEVEL1 + level); if (bp->b_errs != 0) err_suj("indir_visit: UNRECOVERABLE I/O ERROR"); for (i = 0; i < NINDIR(fs); i++) { if ((nblk = IBLK(bp, i)) == 0) continue; if (level == 0) { nlbn = -lbn + i * lbnadd; (*frags) += fs->fs_frag; visitor(ino, nlbn, nblk, fs->fs_frag); } else { nlbn = (lbn + 1) - (i * lbnadd); indir_visit(ino, nlbn, nblk, frags, visitor, flags); } } brelse(bp); out: if (flags & VISIT_INDIR) { (*frags) += fs->fs_frag; visitor(ino, lbn, blk, fs->fs_frag); } } /* * Visit each block in an inode as specified by 'flags' and call a * callback function. The callback may inspect or free blocks. The * count of frags found according to the size in the file is returned. 
* This is not valid for sparse files but may be used to determine
 * the correct di_blocks for a file.
 */
/*
 * Arguments:
 *	dp	on-disk inode to walk
 *	ino	inode number handed through to the visitor
 *	visitor	callback invoked as visitor(ino, lbn, blkno, frags)
 *	flags	VISIT_EXT to include the UFS2 external attribute blocks,
 *		VISIT_INDIR is passed on to indir_visit()
 */
static uint64_t
ino_visit(union dinode *dp, ino_t ino, ino_visitor visitor, int flags)
{
	ufs_lbn_t nextlbn;
	ufs_lbn_t tmpval;
	ufs_lbn_t lbn;
	uint64_t size;
	uint64_t fragcnt;
	int mode;
	int frags;
	int i;

	size = DIP(dp, di_size);
	mode = DIP(dp, di_mode) & IFMT;
	fragcnt = 0;
	/* External attribute blocks are reported with lbns -1, -2, ... */
	if ((flags & VISIT_EXT) &&
	    fs->fs_magic == FS_UFS2_MAGIC && dp->dp2.di_extsize) {
		for (i = 0; i < UFS_NXADDR; i++) {
			if (dp->dp2.di_extb[i] == 0)
				continue;
			frags = sblksize(fs, dp->dp2.di_extsize, i);
			frags = numfrags(fs, frags);
			fragcnt += frags;
			visitor(ino, -1 - i, dp->dp2.di_extb[i], frags);
		}
	}
	/* Skip datablocks for short links and devices. */
	if (mode == IFBLK || mode == IFCHR ||
	    (mode == IFLNK && size < fs->fs_maxsymlinklen))
		return (fragcnt);
	/* Direct blocks: the fragment count of each comes from the file size. */
	for (i = 0; i < UFS_NDADDR; i++) {
		if (DIP(dp, di_db[i]) == 0)
			continue;
		frags = sblksize(fs, size, i);
		frags = numfrags(fs, frags);
		fragcnt += frags;
		visitor(ino, i, DIP(dp, di_db[i]), frags);
	}
	/*
	 * We know the following indirects are real as we're following
	 * real pointers to them.
	 */
	flags |= VISIT_ROOT;
	/*
	 * 'lbn' is the first data lbn mapped by indirect level i and
	 * 'tmpval' the number of lbns that level maps; the indirect block
	 * itself is identified by the negative lbn -lbn - i.
	 */
	for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR; i < UFS_NIADDR; i++,
	    lbn = nextlbn) {
		nextlbn = lbn + tmpval;
		tmpval *= NINDIR(fs);
		if (DIP(dp, di_ib[i]) == 0)
			continue;
		indir_visit(ino, -lbn - i, DIP(dp, di_ib[i]), &fragcnt, visitor,
		    flags);
	}
	return (fragcnt);
}

/*
 * Null visitor function used when we just want to count blocks and
 * record the lbn.
 */
/* Last positive lbn seen by null_visit(); callers reset it before a walk. */
ufs_lbn_t visitlbn;
/* Only 'lbn' is used; the other arguments satisfy the ino_visitor type. */
static void
null_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
{
	if (lbn > 0)
		visitlbn = lbn;
}

/*
 * Recalculate di_blocks when we discover that a block allocation or
 * free was not successfully completed. The kernel does not roll this back
 * because it would be too expensive to compute which indirects were
 * reachable at the time the inode was written.
*/ static void ino_adjblks(struct suj_ino *sino) { struct inode ip; union dinode *dp; uint64_t blocks; uint64_t frags; off_t isize; off_t size; ino_t ino; ino = sino->si_ino; ginode(ino, &ip); dp = ip.i_dp; /* No need to adjust zero'd inodes. */ if (DIP(dp, di_mode) == 0) { irelse(&ip); return; } /* * Visit all blocks and count them as well as recording the last * valid lbn in the file. If the file size doesn't agree with the * last lbn we need to truncate to fix it. Otherwise just adjust * the blocks count. */ visitlbn = 0; frags = ino_visit(dp, ino, null_visit, VISIT_INDIR | VISIT_EXT); blocks = fsbtodb(fs, frags); /* * We assume the size and direct block list is kept coherent by * softdep. For files that have extended into indirects we truncate * to the size in the inode or the maximum size permitted by * populated indirects. */ if (visitlbn >= UFS_NDADDR) { isize = DIP(dp, di_size); size = lblktosize(fs, visitlbn + 1); if (isize > size) isize = size; /* Always truncate to free any unpopulated indirects. */ ino_trunc(ino, isize); irelse(&ip); return; } if (blocks == DIP(dp, di_blocks)) { irelse(&ip); return; } if (debug) printf("ino %ju adjusting block count from %jd to %jd\n", (uintmax_t)ino, DIP(dp, di_blocks), blocks); DIP_SET(dp, di_blocks, blocks); inodirty(&ip); irelse(&ip); } static void blk_free_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) { blk_free(ino, blk, blk_freemask(blk, ino, lbn, frags), frags); } /* * Free a block or tree of blocks that was previously rooted in ino at * the given lbn. If the lbn is an indirect all children are freed * recursively. 
*/ static void blk_free_lbn(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t lbn, int frags, int follow) { uint64_t resid; int mask; mask = blk_freemask(blk, ino, lbn, frags); resid = 0; if (lbn <= -UFS_NDADDR && follow && mask == 0) indir_visit(ino, lbn, blk, &resid, blk_free_visit, VISIT_INDIR); else blk_free(ino, blk, mask, frags); } static void ino_setskip(struct suj_ino *sino, ino_t parent) { int isdot; int mode; if (ino_isat(sino->si_ino, DOTDOT_OFFSET, parent, &mode, &isdot)) sino->si_skipparent = 1; } static void ino_remref(ino_t parent, ino_t child, uint64_t diroff, int isdotdot) { struct suj_ino *sino; struct suj_rec *srec; struct jrefrec *rrec; /* * Lookup this inode to see if we have a record for it. */ sino = ino_lookup(child, 0); /* * Tell any child directories we've already removed their * parent link cnt. Don't try to adjust our link down again. */ if (sino != NULL && isdotdot == 0) ino_setskip(sino, parent); /* * No valid record for this inode. Just drop the on-disk * link by one. */ if (sino == NULL || sino->si_hasrecs == 0) { ino_decr(child); return; } /* * Use ino_adjust() if ino_check() has already processed this * child. If we lose the last non-dot reference to a * directory it will be discarded. */ if (sino->si_linkadj) { if (sino->si_nlink == 0) err_suj("ino_remref: ino %ld mode 0%o about to go " "negative\n", sino->si_ino, sino->si_mode); sino->si_nlink--; if (isdotdot) sino->si_dotlinks--; ino_adjust(sino); return; } /* * If we haven't yet processed this inode we need to make * sure we will successfully discover the lost path. If not * use nlinkadj to remember. */ TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { rrec = (struct jrefrec *)srec->sr_rec; if (rrec->jr_parent == parent && rrec->jr_diroff == diroff) return; } sino->si_nlinkadj++; } /* * Free the children of a directory when the directory is discarded. 
*/ static void ino_free_children(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) { struct suj_ino *sino; struct bufarea *bp; struct direct *dp; off_t diroff; int skipparent; int isdotdot; int dpoff; int size; sino = ino_lookup(ino, 0); if (sino) skipparent = sino->si_skipparent; else skipparent = 0; size = lfragtosize(fs, frags); bp = getdatablk(blk, size, BT_DIRDATA); if (bp->b_errs != 0) err_suj("ino_free_children: UNRECOVERABLE I/O ERROR"); dp = (struct direct *)&bp->b_un.b_buf[0]; for (dpoff = 0; dpoff < size && dp->d_reclen; dpoff += dp->d_reclen) { dp = (struct direct *)&bp->b_un.b_buf[dpoff]; if (dp->d_ino == 0 || dp->d_ino == UFS_WINO) continue; if (dp->d_namlen == 1 && dp->d_name[0] == '.') continue; isdotdot = dp->d_namlen == 2 && dp->d_name[0] == '.' && dp->d_name[1] == '.'; if (isdotdot && skipparent == 1) continue; if (debug) printf("Directory %ju removing ino %ju name %s\n", (uintmax_t)ino, (uintmax_t)dp->d_ino, dp->d_name); diroff = lblktosize(fs, lbn) + dpoff; ino_remref(ino, dp->d_ino, diroff, isdotdot); } brelse(bp); } /* * Reclaim an inode, freeing all blocks and decrementing all children's * link counts. Free the inode back to the cg. */ static void ino_reclaim(struct inode *ip, ino_t ino, int mode) { union dinode *dp; uint32_t gen; dp = ip->i_dp; if (ino == UFS_ROOTINO) err_suj("Attempting to free UFS_ROOTINO\n"); if (debug) printf("Truncating and freeing ino %ju, nlink %d, mode %o\n", (uintmax_t)ino, DIP(dp, di_nlink), DIP(dp, di_mode)); /* We are freeing an inode or directory. */ if ((DIP(dp, di_mode) & IFMT) == IFDIR) ino_visit(dp, ino, ino_free_children, 0); DIP_SET(dp, di_nlink, 0); if ((DIP(dp, di_flags) & SF_SNAPSHOT) != 0) snapremove(ino); ino_visit(dp, ino, blk_free_visit, VISIT_EXT | VISIT_INDIR); /* Here we have to clear the inode and release any blocks it holds. 
*/ gen = DIP(dp, di_gen); if (fs->fs_magic == FS_UFS1_MAGIC) bzero(dp, sizeof(struct ufs1_dinode)); else bzero(dp, sizeof(struct ufs2_dinode)); DIP_SET(dp, di_gen, gen); inodirty(ip); ino_free(ino, mode); return; } /* * Adjust an inode's link count down by one when a directory goes away. */ static void ino_decr(ino_t ino) { struct inode ip; union dinode *dp; int reqlink; int nlink; int mode; ginode(ino, &ip); dp = ip.i_dp; nlink = DIP(dp, di_nlink); mode = DIP(dp, di_mode); if (nlink < 1) err_suj("Inode %d link count %d invalid\n", ino, nlink); if (mode == 0) err_suj("Inode %d has a link of %d with 0 mode\n", ino, nlink); nlink--; if ((mode & IFMT) == IFDIR) reqlink = 2; else reqlink = 1; if (nlink < reqlink) { if (debug) printf("ino %ju not enough links to live %d < %d\n", (uintmax_t)ino, nlink, reqlink); ino_reclaim(&ip, ino, mode); irelse(&ip); return; } DIP_SET(dp, di_nlink, nlink); inodirty(&ip); irelse(&ip); } /* * Adjust the inode link count to 'nlink'. If the count reaches zero * free it. */ static void ino_adjust(struct suj_ino *sino) { struct jrefrec *rrec; struct suj_rec *srec; struct suj_ino *stmp; union dinode *dp; struct inode ip; nlink_t nlink; nlink_t reqlink; int recmode; int isdot; int mode; ino_t ino; nlink = sino->si_nlink; ino = sino->si_ino; mode = sino->si_mode & IFMT; /* * If it's a directory with no dot links, it was truncated before * the name was cleared. We need to clear the dirent that * points at it. */ if (mode == IFDIR && nlink == 1 && sino->si_dotlinks == 0) { sino->si_nlink = nlink = 0; TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { rrec = (struct jrefrec *)srec->sr_rec; if (ino_isat(rrec->jr_parent, rrec->jr_diroff, ino, &recmode, &isdot) == 0) continue; ino_clrat(rrec->jr_parent, rrec->jr_diroff, ino); break; } if (srec == NULL) errx(1, "Directory %ju name not found", (uintmax_t)ino); } /* * If it's a directory with no real names pointing to it go ahead * and truncate it. This will free any children. 
*/ if (mode == IFDIR && nlink - sino->si_dotlinks == 0) { sino->si_nlink = nlink = 0; /* * Mark any .. links so they know not to free this inode * when they are removed. */ TAILQ_FOREACH(srec, &sino->si_recs, sr_next) { rrec = (struct jrefrec *)srec->sr_rec; if (rrec->jr_diroff == DOTDOT_OFFSET) { stmp = ino_lookup(rrec->jr_parent, 0); if (stmp) ino_setskip(stmp, ino); } } } ginode(ino, &ip); dp = ip.i_dp; mode = DIP(dp, di_mode) & IFMT; if (nlink > UFS_LINK_MAX) err_suj("ino %ju nlink manipulation error, new %ju, old %d\n", (uintmax_t)ino, (uintmax_t)nlink, DIP(dp, di_nlink)); if (debug) printf("Adjusting ino %ju, nlink %ju, old link %d lastmode %o\n", (uintmax_t)ino, (uintmax_t)nlink, DIP(dp, di_nlink), sino->si_mode); if (mode == 0) { if (debug) printf("ino %ju, zero inode freeing bitmap\n", (uintmax_t)ino); ino_free(ino, sino->si_mode); irelse(&ip); return; } /* XXX Should be an assert? */ if (mode != sino->si_mode && debug) printf("ino %ju, mode %o != %o\n", (uintmax_t)ino, mode, sino->si_mode); if ((mode & IFMT) == IFDIR) reqlink = 2; else reqlink = 1; /* If the inode doesn't have enough links to live, free it. */ if (nlink < reqlink) { if (debug) printf("ino %ju not enough links to live %ju < %ju\n", (uintmax_t)ino, (uintmax_t)nlink, (uintmax_t)reqlink); ino_reclaim(&ip, ino, mode); irelse(&ip); return; } /* If required write the updated link count. */ if (DIP(dp, di_nlink) == nlink) { if (debug) printf("ino %ju, link matches, skipping.\n", (uintmax_t)ino); irelse(&ip); return; } DIP_SET(dp, di_nlink, nlink); inodirty(&ip); irelse(&ip); } /* * Truncate some or all blocks in an indirect, freeing any that are required * and zeroing the indirect. 
*/
/*
 * 'lbn' is the (negative) logical block number of the indirect stored at
 * 'blk'; 'lastlbn' is the first logical block to be discarded.  'dp' is
 * only passed through to the recursive call.  A zero 'blk' is a no-op.
 */
static void
indir_trunc(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, ufs_lbn_t lastlbn,
    union dinode *dp)
{
	struct bufarea *bp;
	ufs_lbn_t lbnadd;
	ufs2_daddr_t nblk;
	ufs_lbn_t next;
	ufs_lbn_t nlbn;
	int isdirty;
	int level;
	int i;

	if (blk == 0)
		return;
	isdirty = 0;
	level = lbn_level(lbn);
	if (level == -1)
		err_suj("Invalid level for lbn %jd\n", lbn);
	/* lbnadd = NINDIR(fs)^level: blocks covered by one slot at this level. */
	lbnadd = 1;
	for (i = level; i > 0; i--)
		lbnadd *= NINDIR(fs);
	bp = getdatablk(blk, fs->fs_bsize, BT_LEVEL1 + level);
	if (bp->b_errs != 0)
		err_suj("indir_trunc: UNRECOVERABLE I/O ERROR");
	for (i = 0; i < NINDIR(fs); i++) {
		if ((nblk = IBLK(bp, i)) == 0)
			continue;
		if (level != 0) {
			/* Negative lbn naming the child indirect in slot i. */
			nlbn = (lbn + 1) - (i * lbnadd);
			/*
			 * Calculate the lbn of the next indirect to
			 * determine if any of this indirect must be
			 * reclaimed.
			 */
			next = -(lbn + level) + ((i+1) * lbnadd);
			if (next <= lastlbn)
				continue;
			indir_trunc(ino, nlbn, nblk, lastlbn, dp);
			/* If all of this indirect was reclaimed, free it. */
			nlbn = next - lbnadd;
			if (nlbn < lastlbn)
				continue;
		} else {
			/* Level 0: slot i maps data block -lbn + i. */
			nlbn = -lbn + i * lbnadd;
			if (nlbn < lastlbn)
				continue;
		}
		isdirty = 1;
		blk_free(ino, nblk, 0, fs->fs_frag);
		IBLK_SET(bp, i, 0);
	}
	/* Only write the indirect back if at least one pointer was cleared. */
	if (isdirty)
		dirty(bp);
	brelse(bp);
}

/*
 * Truncate an inode to the minimum of the given size or the last populated
 * block after any over size have been discarded.  The kernel would allocate
 * the last block in the file but fsck does not and neither do we.  This
 * code never extends files, only shrinks them.
*/
static void
ino_trunc(ino_t ino, off_t size)
{
	struct inode ip;
	union dinode *dp;
	struct bufarea *bp;
	ufs2_daddr_t bn;
	uint64_t totalfrags;
	ufs_lbn_t nextlbn;
	ufs_lbn_t lastlbn;
	ufs_lbn_t tmpval;
	ufs_lbn_t lbn;
	ufs_lbn_t i;
	int blksize, frags;
	off_t cursize;
	off_t off;
	int mode;

	ginode(ino, &ip);
	dp = ip.i_dp;
	mode = DIP(dp, di_mode) & IFMT;
	cursize = DIP(dp, di_size);
	/* If no size change, nothing to do */
	if (size == cursize) {
		irelse(&ip);
		return;
	}
	if (debug)
		printf("Truncating ino %ju, mode %o to size %jd from size %jd\n",
		    (uintmax_t)ino, mode, size, cursize);

	/* Skip datablocks for short links and devices. */
	if (mode == 0 || mode == IFBLK || mode == IFCHR ||
	    (mode == IFLNK && cursize < fs->fs_maxsymlinklen)) {
		irelse(&ip);
		return;
	}
	/* Don't extend. */
	if (size > cursize) {
		irelse(&ip);
		return;
	}
	/* A snapshot file may only be truncated to zero; anything else aborts. */
	if ((DIP(dp, di_flags) & SF_SNAPSHOT) != 0) {
		if (size > 0)
			err_suj("Partial truncation of ino %ju snapshot file\n",
			    (uintmax_t)ino);
		snapremove(ino);
	}
	/* Free every direct block from the first wholly discarded lbn on. */
	lastlbn = lblkno(fs, blkroundup(fs, size));
	for (i = lastlbn; i < UFS_NDADDR; i++) {
		if ((bn = DIP(dp, di_db[i])) == 0)
			continue;
		blksize = sblksize(fs, cursize, i);
		blk_free(ino, bn, 0, numfrags(fs, blksize));
		DIP_SET(dp, di_db[i], 0);
	}
	/*
	 * Follow indirect blocks, freeing anything required.
	 */
	for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR; i < UFS_NIADDR; i++,
	    lbn = nextlbn) {
		nextlbn = lbn + tmpval;
		tmpval *= NINDIR(fs);
		/* If we're not freeing any in this indirect range skip it. */
		if (lastlbn >= nextlbn)
			continue;
		if ((bn = DIP(dp, di_ib[i])) == 0)
			continue;
		indir_trunc(ino, -lbn - i, bn, lastlbn, dp);
		/* If we freed everything in this indirect free the indir. */
		if (lastlbn > lbn)
			continue;
		blk_free(ino, bn, 0, fs->fs_frag);
		DIP_SET(dp, di_ib[i], 0);
	}
	/*
	 * Now that we've freed any whole blocks that exceed the desired
	 * truncation size, figure out how many blocks remain and what the
	 * last populated lbn is.  We will set the size to this last lbn
	 * rather than worrying about allocating the final lbn as the kernel
	 * would've done.  This is consistent with normal fsck behavior.
	 */
	/* NOTE(review): visitlbn is presumably updated by the visit below. */
	visitlbn = 0;
	totalfrags = ino_visit(dp, ino, null_visit, VISIT_INDIR | VISIT_EXT);
	if (size > lblktosize(fs, visitlbn + 1))
		size = lblktosize(fs, visitlbn + 1);
	/*
	 * If we're truncating direct blocks we have to adjust frags
	 * accordingly.
	 */
	if (visitlbn < UFS_NDADDR && totalfrags) {
		long oldspace, newspace;

		bn = DIP(dp, di_db[visitlbn]);
		if (bn == 0)
			err_suj("Bad blk at ino %ju lbn %jd\n",
			    (uintmax_t)ino, visitlbn);
		oldspace = sblksize(fs, cursize, visitlbn);
		newspace = sblksize(fs, size, visitlbn);
		if (oldspace != newspace) {
			/* Free only the tail fragments past the new size. */
			bn += numfrags(fs, newspace);
			frags = numfrags(fs, oldspace - newspace);
			blk_free(ino, bn, 0, frags);
			totalfrags -= frags;
		}
	}
	DIP_SET(dp, di_blocks, fsbtodb(fs, totalfrags));
	DIP_SET(dp, di_size, size);
	inodirty(&ip);
	/*
	 * If we've truncated into the middle of a block or frag we have
	 * to zero it here.  Otherwise the file could extend into
	 * uninitialized space later.
	 */
	off = blkoff(fs, size);
	/*
	 * NOTE(review): di_mode is compared against IFDIR without the IFMT
	 * mask used for 'mode' above; confirm whether 'mode != IFDIR' was
	 * intended.
	 */
	if (off && DIP(dp, di_mode) != IFDIR) {
		long clrsize;

		bn = ino_blkatoff(dp, ino, visitlbn, &frags, NULL);
		if (bn == 0)
			err_suj("Block missing from ino %ju at lbn %jd\n",
			    (uintmax_t)ino, visitlbn);
		clrsize = frags * fs->fs_fsize;
		bp = getdatablk(bn, clrsize, BT_DATA);
		if (bp->b_errs != 0)
			err_suj("ino_trunc: UNRECOVERABLE I/O ERROR");
		/* Zero from the new end of file to the end of the allocation. */
		clrsize -= off;
		bzero(&bp->b_un.b_buf[off], clrsize);
		dirty(bp);
		brelse(bp);
	}
	irelse(&ip);
	return;
}

/*
 * Process records available for one inode and determine whether the
 * link count is correct or needs adjusting.
*/
static void
ino_check(struct suj_ino *sino)
{
	struct suj_rec *srec;
	struct jrefrec *rrec;
	nlink_t dotlinks;
	nlink_t newlinks;
	nlink_t removes;
	nlink_t nlink;
	ino_t ino;
	int isdot;
	int isat;
	int mode;

	/* Nothing journaled for this inode. */
	if (sino->si_hasrecs == 0)
		return;
	ino = sino->si_ino;
	/* The starting link count comes from the first record on the list. */
	rrec = (struct jrefrec *)TAILQ_FIRST(&sino->si_recs)->sr_rec;
	nlink = rrec->jr_nlink;
	newlinks = 0;
	dotlinks = 0;
	/* Removals already noted by ino_remref() are counted up front. */
	removes = sino->si_nlinkadj;

	TAILQ_FOREACH(srec, &sino->si_recs, sr_next) {
		rrec = (struct jrefrec *)srec->sr_rec;
		isat = ino_isat(rrec->jr_parent, rrec->jr_diroff,
		    rrec->jr_ino, &mode, &isdot);
		if (isat && (mode & IFMT) != (rrec->jr_mode & IFMT))
			err_suj("Inode mode/directory type mismatch %o != %o\n",
			    mode, rrec->jr_mode);
		if (debug)
			printf("jrefrec: op %d ino %ju, nlink %ju, parent %ju, "
			    "diroff %jd, mode %o, isat %d, isdot %d\n",
			    rrec->jr_op, (uintmax_t)rrec->jr_ino,
			    (uintmax_t)rrec->jr_nlink,
			    (uintmax_t)rrec->jr_parent,
			    (uintmax_t)rrec->jr_diroff,
			    rrec->jr_mode, isat, isdot);
		/* The mode stored in si_mode below is that of the last record. */
		mode = rrec->jr_mode & IFMT;
		if (rrec->jr_op == JOP_REMREF)
			removes++;
		newlinks += isat;
		if (isdot)
			dotlinks += isat;
	}
	/*
	 * The number of links that remain are the starting link count
	 * subtracted by the total number of removes with the total
	 * links discovered back in.  An incomplete remove thus
	 * makes no change to the link count but an add increases
	 * by one.
	 */
	if (debug)
		printf(
		    "ino %ju nlink %ju newlinks %ju removes %ju dotlinks %ju\n",
		    (uintmax_t)ino, (uintmax_t)nlink, (uintmax_t)newlinks,
		    (uintmax_t)removes, (uintmax_t)dotlinks);
	nlink += newlinks;
	nlink -= removes;
	/* si_linkadj tells ino_remref() to go through ino_adjust() from now on. */
	sino->si_linkadj = 1;
	sino->si_nlink = nlink;
	sino->si_dotlinks = dotlinks;
	sino->si_mode = mode;
	ino_adjust(sino);
}

/*
 * Process records available for one block and determine whether it is
 * still allocated and whether the owning inode needs to be updated or
 * a free completed.
*/
static void
blk_check(struct suj_blk *sblk)
{
	struct suj_rec *srec;
	struct jblkrec *brec;
	struct suj_ino *sino;
	ufs2_daddr_t blk;
	int mask;
	int frags;
	int isat;

	/*
	 * Each suj_blk actually contains records for any fragments in that
	 * block.  As a result we must evaluate each record individually.
	 */
	sino = NULL;
	TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) {
		brec = (struct jblkrec *)srec->sr_rec;
		frags = brec->jb_frags;
		/* jb_oldfrags is the fragment offset within the block. */
		blk = brec->jb_blkno + brec->jb_oldfrags;
		isat = blk_isat(brec->jb_ino, brec->jb_lbn, blk, &frags);
		/* Flag the owning inode for a block-count fixup (see cg_adj_blk). */
		if (sino == NULL || sino->si_ino != brec->jb_ino) {
			sino = ino_lookup(brec->jb_ino, 1);
			sino->si_blkadj = 1;
		}
		if (debug)
			printf("op %d blk %jd ino %ju lbn %jd frags %d isat %d (%d)\n",
			    brec->jb_op, blk, (uintmax_t)brec->jb_ino,
			    brec->jb_lbn, brec->jb_frags, isat, frags);
		/*
		 * If we found the block at this address we still have to
		 * determine if we need to free the tail end that was
		 * added by adding contiguous fragments from the same block.
		 */
		if (isat == 1) {
			if (frags == brec->jb_frags)
				continue;
			mask = blk_freemask(blk, brec->jb_ino, brec->jb_lbn,
			    brec->jb_frags);
			/* Skip the 'frags' leading fragments that are in use. */
			mask >>= frags;
			blk += frags;
			frags = brec->jb_frags - frags;
			blk_free(brec->jb_ino, blk, mask, frags);
			continue;
		}
		/*
		 * The block wasn't found, attempt to free it.  It won't be
		 * freed if it was actually reallocated.  If this was an
		 * allocation we don't want to follow indirects as they
		 * may not be written yet.  Any children of the indirect will
		 * have their own records.  If it's a free we need to
		 * recursively free children.
		 */
		blk_free_lbn(blk, brec->jb_ino, brec->jb_lbn, brec->jb_frags,
		    brec->jb_op == JOP_FREEBLK);
	}
}

/*
 * Walk the list of inode records for this cg and resolve moved and duplicate
 * inode references now that we have a complete picture.
 */
static void
cg_build(struct suj_cg *sc)
{
	struct suj_ino *sino;
	int i;

	/* Run ino_build() on every inode in every hash bucket of this cg. */
	for (i = 0; i < HASHSIZE; i++)
		LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
			ino_build(sino);
}

/*
 * Handle inodes requiring truncation.
This must be done prior to * looking up any inodes in directories. */ static void cg_trunc(struct suj_cg *sc) { struct suj_ino *sino; int i; for (i = 0; i < HASHSIZE; i++) { LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) { if (sino->si_trunc) { ino_trunc(sino->si_ino, sino->si_trunc->jt_size); sino->si_blkadj = 0; sino->si_trunc = NULL; } if (sino->si_blkadj) ino_adjblks(sino); } } } static void cg_adj_blk(struct suj_cg *sc) { struct suj_ino *sino; int i; for (i = 0; i < HASHSIZE; i++) { LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) { if (sino->si_blkadj) ino_adjblks(sino); } } } /* * Free any partially allocated blocks and then resolve inode block * counts. */ static void cg_check_blk(struct suj_cg *sc) { struct suj_blk *sblk; int i; for (i = 0; i < HASHSIZE; i++) LIST_FOREACH(sblk, &sc->sc_blkhash[i], sb_next) blk_check(sblk); } /* * Walk the list of inode records for this cg, recovering any * changes which were not complete at the time of crash. */ static void cg_check_ino(struct suj_cg *sc) { struct suj_ino *sino; int i; for (i = 0; i < HASHSIZE; i++) LIST_FOREACH(sino, &sc->sc_inohash[i], si_next) ino_check(sino); } static void cg_apply(void (*apply)(struct suj_cg *)) { struct suj_cg *scg; int i; for (i = 0; i < HASHSIZE; i++) LIST_FOREACH(scg, &cghash[i], sc_next) apply(scg); } /* * Process the unlinked but referenced file list. Freeing all inodes. */ static void ino_unlinked(void) { struct inode ip; union dinode *dp; uint16_t mode; ino_t inon; ino_t ino; ino = fs->fs_sujfree; fs->fs_sujfree = 0; while (ino != 0) { ginode(ino, &ip); dp = ip.i_dp; mode = DIP(dp, di_mode) & IFMT; inon = DIP(dp, di_freelink); DIP_SET(dp, di_freelink, 0); inodirty(&ip); /* * XXX Should this be an errx? 
*/ if (DIP(dp, di_nlink) == 0) { if (debug) printf("Freeing unlinked ino %ju mode %o\n", (uintmax_t)ino, mode); ino_reclaim(&ip, ino, mode); } else if (debug) printf("Skipping ino %ju mode %o with link %d\n", (uintmax_t)ino, mode, DIP(dp, di_nlink)); ino = inon; irelse(&ip); } } /* * Append a new record to the list of records requiring processing. */ static void ino_append(union jrec *rec) { struct jrefrec *refrec; struct jmvrec *mvrec; struct suj_ino *sino; struct suj_rec *srec; mvrec = &rec->rec_jmvrec; refrec = &rec->rec_jrefrec; if (debug && mvrec->jm_op == JOP_MVREF) printf("ino move: ino %ju, parent %ju, " "diroff %jd, oldoff %jd\n", (uintmax_t)mvrec->jm_ino, (uintmax_t)mvrec->jm_parent, (uintmax_t)mvrec->jm_newoff, (uintmax_t)mvrec->jm_oldoff); else if (debug && (refrec->jr_op == JOP_ADDREF || refrec->jr_op == JOP_REMREF)) printf("ino ref: op %d, ino %ju, nlink %ju, " "parent %ju, diroff %jd\n", refrec->jr_op, (uintmax_t)refrec->jr_ino, (uintmax_t)refrec->jr_nlink, (uintmax_t)refrec->jr_parent, (uintmax_t)refrec->jr_diroff); sino = ino_lookup(((struct jrefrec *)rec)->jr_ino, 1); sino->si_hasrecs = 1; srec = errmalloc(sizeof(*srec)); srec->sr_rec = rec; TAILQ_INSERT_TAIL(&sino->si_newrecs, srec, sr_next); } /* * Add a reference adjustment to the sino list and eliminate dups. The * primary loop in ino_build_ref() checks for dups but new ones may be * created as a result of offset adjustments. */ static void ino_add_ref(struct suj_ino *sino, struct suj_rec *srec) { struct jrefrec *refrec; struct suj_rec *srn; struct jrefrec *rrn; refrec = (struct jrefrec *)srec->sr_rec; /* * We walk backwards so that the oldest link count is preserved. If * an add record conflicts with a remove keep the remove. Redundant * removes are eliminated in ino_build_ref. Otherwise we keep the * oldest record at a given location. 
*/ for (srn = TAILQ_LAST(&sino->si_recs, srechd); srn; srn = TAILQ_PREV(srn, srechd, sr_next)) { rrn = (struct jrefrec *)srn->sr_rec; if (rrn->jr_parent != refrec->jr_parent || rrn->jr_diroff != refrec->jr_diroff) continue; if (rrn->jr_op == JOP_REMREF || refrec->jr_op == JOP_ADDREF) { rrn->jr_mode = refrec->jr_mode; return; } /* * Adding a remove. * * Replace the record in place with the old nlink in case * we replace the head of the list. Abandon srec as a dup. */ refrec->jr_nlink = rrn->jr_nlink; srn->sr_rec = srec->sr_rec; return; } TAILQ_INSERT_TAIL(&sino->si_recs, srec, sr_next); } /* * Create a duplicate of a reference at a previous location. */ static void ino_dup_ref(struct suj_ino *sino, struct jrefrec *refrec, off_t diroff) { struct jrefrec *rrn; struct suj_rec *srn; rrn = errmalloc(sizeof(*refrec)); *rrn = *refrec; rrn->jr_op = JOP_ADDREF; rrn->jr_diroff = diroff; srn = errmalloc(sizeof(*srn)); srn->sr_rec = (union jrec *)rrn; ino_add_ref(sino, srn); } /* * Add a reference to the list at all known locations. We follow the offset * changes for a single instance and create duplicate add refs at each so * that we can tolerate any version of the directory block. Eliminate * removes which collide with adds that are seen in the journal. They should * not adjust the link count down. */ static void ino_build_ref(struct suj_ino *sino, struct suj_rec *srec) { struct jrefrec *refrec; struct jmvrec *mvrec; struct suj_rec *srp; struct suj_rec *srn; struct jrefrec *rrn; off_t diroff; refrec = (struct jrefrec *)srec->sr_rec; /* * Search for a mvrec that matches this offset. Whether it's an add * or a remove we can delete the mvref after creating a dup record in * the old location. 
*/ if (!TAILQ_EMPTY(&sino->si_movs)) { diroff = refrec->jr_diroff; for (srn = TAILQ_LAST(&sino->si_movs, srechd); srn; srn = srp) { srp = TAILQ_PREV(srn, srechd, sr_next); mvrec = (struct jmvrec *)srn->sr_rec; if (mvrec->jm_parent != refrec->jr_parent || mvrec->jm_newoff != diroff) continue; diroff = mvrec->jm_oldoff; TAILQ_REMOVE(&sino->si_movs, srn, sr_next); free(srn); ino_dup_ref(sino, refrec, diroff); } } /* * If a remove wasn't eliminated by an earlier add just append it to * the list. */ if (refrec->jr_op == JOP_REMREF) { ino_add_ref(sino, srec); return; } /* * Walk the list of records waiting to be added to the list. We * must check for moves that apply to our current offset and remove * them from the list. Remove any duplicates to eliminate removes * with corresponding adds. */ TAILQ_FOREACH_SAFE(srn, &sino->si_newrecs, sr_next, srp) { switch (srn->sr_rec->rec_jrefrec.jr_op) { case JOP_ADDREF: /* * This should actually be an error we should * have a remove for every add journaled. */ rrn = (struct jrefrec *)srn->sr_rec; if (rrn->jr_parent != refrec->jr_parent || rrn->jr_diroff != refrec->jr_diroff) break; TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next); break; case JOP_REMREF: /* * Once we remove the current iteration of the * record at this address we're done. */ rrn = (struct jrefrec *)srn->sr_rec; if (rrn->jr_parent != refrec->jr_parent || rrn->jr_diroff != refrec->jr_diroff) break; TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next); ino_add_ref(sino, srec); return; case JOP_MVREF: /* * Update our diroff based on any moves that match * and remove the move. 
*/ mvrec = (struct jmvrec *)srn->sr_rec; if (mvrec->jm_parent != refrec->jr_parent || mvrec->jm_oldoff != refrec->jr_diroff) break; ino_dup_ref(sino, refrec, mvrec->jm_oldoff); refrec->jr_diroff = mvrec->jm_newoff; TAILQ_REMOVE(&sino->si_newrecs, srn, sr_next); break; default: err_suj("ino_build_ref: Unknown op %d\n", srn->sr_rec->rec_jrefrec.jr_op); } } ino_add_ref(sino, srec); } /* * Walk the list of new records and add them in-order resolving any * dups and adjusted offsets. */ static void ino_build(struct suj_ino *sino) { struct suj_rec *srec; while ((srec = TAILQ_FIRST(&sino->si_newrecs)) != NULL) { TAILQ_REMOVE(&sino->si_newrecs, srec, sr_next); switch (srec->sr_rec->rec_jrefrec.jr_op) { case JOP_ADDREF: case JOP_REMREF: ino_build_ref(sino, srec); break; case JOP_MVREF: /* * Add this mvrec to the queue of pending mvs. */ TAILQ_INSERT_TAIL(&sino->si_movs, srec, sr_next); break; default: err_suj("ino_build: Unknown op %d\n", srec->sr_rec->rec_jrefrec.jr_op); } } if (TAILQ_EMPTY(&sino->si_recs)) sino->si_hasrecs = 0; } /* * Modify journal records so they refer to the base block number * and a start and end frag range. This is to facilitate the discovery * of overlapping fragment allocations. */ static void blk_build(struct jblkrec *blkrec) { struct suj_rec *srec; struct suj_blk *sblk; struct jblkrec *blkrn; ufs2_daddr_t blk; int frag; if (debug) printf("blk_build: op %d blkno %jd frags %d oldfrags %d " "ino %ju lbn %jd\n", blkrec->jb_op, (uintmax_t)blkrec->jb_blkno, blkrec->jb_frags, blkrec->jb_oldfrags, (uintmax_t)blkrec->jb_ino, (uintmax_t)blkrec->jb_lbn); blk = blknum(fs, blkrec->jb_blkno); frag = fragnum(fs, blkrec->jb_blkno); sblk = blk_lookup(blk, 1); /* * Rewrite the record using oldfrags to indicate the offset into * the block. Leave jb_frags as the actual allocated count. 
*/ blkrec->jb_blkno -= frag; blkrec->jb_oldfrags = frag; if (blkrec->jb_oldfrags + blkrec->jb_frags > fs->fs_frag) err_suj("Invalid fragment count %d oldfrags %d\n", blkrec->jb_frags, frag); /* * Detect dups. If we detect a dup we always discard the oldest * record as it is superseded by the new record. This speeds up * later stages but also eliminates free records which are used * to indicate that the contents of indirects can be trusted. */ TAILQ_FOREACH(srec, &sblk->sb_recs, sr_next) { blkrn = (struct jblkrec *)srec->sr_rec; if (blkrn->jb_ino != blkrec->jb_ino || blkrn->jb_lbn != blkrec->jb_lbn || blkrn->jb_blkno != blkrec->jb_blkno || blkrn->jb_frags != blkrec->jb_frags || blkrn->jb_oldfrags != blkrec->jb_oldfrags) continue; if (debug) printf("Removed dup.\n"); /* Discard the free which is a dup with an alloc. */ if (blkrec->jb_op == JOP_FREEBLK) return; TAILQ_REMOVE(&sblk->sb_recs, srec, sr_next); free(srec); break; } srec = errmalloc(sizeof(*srec)); srec->sr_rec = (union jrec *)blkrec; TAILQ_INSERT_TAIL(&sblk->sb_recs, srec, sr_next); } static void ino_build_trunc(struct jtrncrec *rec) { struct suj_ino *sino; if (debug) printf("ino_build_trunc: op %d ino %ju, size %jd\n", rec->jt_op, (uintmax_t)rec->jt_ino, (uintmax_t)rec->jt_size); + if (chkfilesize(IFREG, rec->jt_size) == 0) + err_suj("ino_build: truncation size too large %ju\n", + (intmax_t)rec->jt_size); sino = ino_lookup(rec->jt_ino, 1); if (rec->jt_op == JOP_SYNC) { sino->si_trunc = NULL; return; } if (sino->si_trunc == NULL || sino->si_trunc->jt_size > rec->jt_size) sino->si_trunc = rec; } /* * Build up tables of the operations we need to recover. 
*/ static void suj_build(void) { struct suj_seg *seg; union jrec *rec; int off; int i; TAILQ_FOREACH(seg, &allsegs, ss_next) { if (debug) printf("seg %jd has %d records, oldseq %jd.\n", seg->ss_rec.jsr_seq, seg->ss_rec.jsr_cnt, seg->ss_rec.jsr_oldest); off = 0; rec = (union jrec *)seg->ss_blk; for (i = 0; i < seg->ss_rec.jsr_cnt; off += JREC_SIZE, rec++) { /* skip the segrec. */ if ((off % real_dev_bsize) == 0) continue; switch (rec->rec_jrefrec.jr_op) { case JOP_ADDREF: case JOP_REMREF: case JOP_MVREF: ino_append(rec); break; case JOP_NEWBLK: case JOP_FREEBLK: blk_build((struct jblkrec *)rec); break; case JOP_TRUNC: case JOP_SYNC: ino_build_trunc((struct jtrncrec *)rec); break; default: err_suj("Unknown journal operation %d (%d)\n", rec->rec_jrefrec.jr_op, off); } i++; } } } /* * Prune the journal segments to those we care about based on the * oldest sequence in the newest segment. Order the segment list * based on sequence number. */ static void suj_prune(void) { struct suj_seg *seg; struct suj_seg *segn; uint64_t newseq; int discard; if (debug) printf("Pruning up to %jd\n", oldseq); /* First free the expired segments. */ TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { if (seg->ss_rec.jsr_seq >= oldseq) continue; TAILQ_REMOVE(&allsegs, seg, ss_next); free(seg->ss_blk); free(seg); } /* Next ensure that segments are ordered properly. */ seg = TAILQ_FIRST(&allsegs); if (seg == NULL) { if (debug) printf("Empty journal\n"); return; } newseq = seg->ss_rec.jsr_seq; for (;;) { seg = TAILQ_LAST(&allsegs, seghd); if (seg->ss_rec.jsr_seq >= newseq) break; TAILQ_REMOVE(&allsegs, seg, ss_next); TAILQ_INSERT_HEAD(&allsegs, seg, ss_next); newseq = seg->ss_rec.jsr_seq; } if (newseq != oldseq) { TAILQ_FOREACH(seg, &allsegs, ss_next) { printf("%jd, ", seg->ss_rec.jsr_seq); } printf("\n"); err_suj("Journal file sequence mismatch %jd != %jd\n", newseq, oldseq); } /* * The kernel may asynchronously write segments which can create * gaps in the sequence space. 
Throw away any segments after the * gap as the kernel guarantees only those that are contiguously * reachable are marked as completed. */ discard = 0; TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { if (!discard && newseq++ == seg->ss_rec.jsr_seq) { jrecs += seg->ss_rec.jsr_cnt; jbytes += seg->ss_rec.jsr_blocks * real_dev_bsize; continue; } discard = 1; if (debug) printf("Journal order mismatch %jd != %jd pruning\n", newseq-1, seg->ss_rec.jsr_seq); TAILQ_REMOVE(&allsegs, seg, ss_next); free(seg->ss_blk); free(seg); } if (debug) printf("Processing journal segments from %jd to %jd\n", oldseq, newseq-1); } /* * Verify the journal inode before attempting to read records. */ static int suj_verifyino(union dinode *dp) { if (DIP(dp, di_nlink) != 1) { printf("Invalid link count %d for journal inode %ju\n", DIP(dp, di_nlink), (uintmax_t)sujino); return (-1); } if ((DIP(dp, di_flags) & (SF_IMMUTABLE | SF_NOUNLINK)) != (SF_IMMUTABLE | SF_NOUNLINK)) { printf("Invalid flags 0x%X for journal inode %ju\n", DIP(dp, di_flags), (uintmax_t)sujino); return (-1); } if (DIP(dp, di_mode) != (IFREG | IREAD)) { printf("Invalid mode %o for journal inode %ju\n", DIP(dp, di_mode), (uintmax_t)sujino); return (-1); } if (DIP(dp, di_size) < SUJ_MIN) { printf("Invalid size %jd for journal inode %ju\n", DIP(dp, di_size), (uintmax_t)sujino); return (-1); } if (DIP(dp, di_modrev) != fs->fs_mtime) { printf("Journal timestamp does not match fs mount time\n"); return (-1); } return (0); } struct jblocks { struct jextent *jb_extent; /* Extent array. */ int jb_avail; /* Available extents. */ int jb_used; /* Last used extent. */ int jb_head; /* Allocator head. */ int jb_off; /* Allocator extent offset. */ }; struct jextent { ufs2_daddr_t je_daddr; /* Disk block address. */ int je_blocks; /* Disk block count. 
*/ }; static struct jblocks *suj_jblocks; static struct jblocks * jblocks_create(void) { struct jblocks *jblocks; int size; jblocks = errmalloc(sizeof(*jblocks)); jblocks->jb_avail = 10; jblocks->jb_used = 0; jblocks->jb_head = 0; jblocks->jb_off = 0; size = sizeof(struct jextent) * jblocks->jb_avail; jblocks->jb_extent = errmalloc(size); bzero(jblocks->jb_extent, size); return (jblocks); } /* * Return the next available disk block and the amount of contiguous * free space it contains. */ static ufs2_daddr_t jblocks_next(struct jblocks *jblocks, int bytes, int *actual) { struct jextent *jext; ufs2_daddr_t daddr; int freecnt; int blocks; blocks = btodb(bytes); jext = &jblocks->jb_extent[jblocks->jb_head]; freecnt = jext->je_blocks - jblocks->jb_off; if (freecnt == 0) { jblocks->jb_off = 0; if (++jblocks->jb_head > jblocks->jb_used) return (0); jext = &jblocks->jb_extent[jblocks->jb_head]; freecnt = jext->je_blocks; } if (freecnt > blocks) freecnt = blocks; *actual = dbtob(freecnt); daddr = jext->je_daddr + jblocks->jb_off; return (daddr); } /* * Advance the allocation head by a specified number of bytes, consuming * one journal segment. */ static void jblocks_advance(struct jblocks *jblocks, int bytes) { jblocks->jb_off += btodb(bytes); } static void jblocks_destroy(struct jblocks *jblocks) { free(jblocks->jb_extent); free(jblocks); } static void jblocks_add(struct jblocks *jblocks, ufs2_daddr_t daddr, int blocks) { struct jextent *jext; int size; jext = &jblocks->jb_extent[jblocks->jb_used]; /* Adding the first block. */ if (jext->je_daddr == 0) { jext->je_daddr = daddr; jext->je_blocks = blocks; return; } /* Extending the last extent. */ if (jext->je_daddr + jext->je_blocks == daddr) { jext->je_blocks += blocks; return; } /* Adding a new extent. 
*/ if (++jblocks->jb_used == jblocks->jb_avail) { jblocks->jb_avail *= 2; size = sizeof(struct jextent) * jblocks->jb_avail; jext = errmalloc(size); bzero(jext, size); bcopy(jblocks->jb_extent, jext, sizeof(struct jextent) * jblocks->jb_used); free(jblocks->jb_extent); jblocks->jb_extent = jext; } jext = &jblocks->jb_extent[jblocks->jb_used]; jext->je_daddr = daddr; jext->je_blocks = blocks; return; } /* * Add a file block from the journal to the extent map. We can't read * each file block individually because the kernel treats it as a circular * buffer and segments may span mutliple contiguous blocks. */ static void suj_add_block(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags) { jblocks_add(suj_jblocks, fsbtodb(fs, blk), fsbtodb(fs, frags)); } static void suj_read(void) { uint8_t block[1 * 1024 * 1024]; struct suj_seg *seg; struct jsegrec *recn; struct jsegrec *rec; ufs2_daddr_t blk; int readsize; int blocks; int recsize; int size; int i; /* * Read records until we exhaust the journal space. If we find * an invalid record we start searching for a valid segment header * at the next block. This is because we don't have a head/tail * pointer and must recover the information indirectly. At the gap * between the head and tail we won't necessarily have a valid * segment. */ restart: for (;;) { size = sizeof(block); blk = jblocks_next(suj_jblocks, size, &readsize); if (blk == 0) return; size = readsize; /* * Read 1MB at a time and scan for records within this block. 
*/ if (pread(fsreadfd, &block, size, dbtob(blk)) != size) { err_suj("Error reading journal block %jd\n", (intmax_t)blk); } for (rec = (void *)block; size; size -= recsize, rec = (struct jsegrec *)((uintptr_t)rec + recsize)) { recsize = real_dev_bsize; if (rec->jsr_time != fs->fs_mtime) { #ifdef notdef if (debug) printf("Rec time %jd != fs mtime %jd\n", rec->jsr_time, fs->fs_mtime); #endif jblocks_advance(suj_jblocks, recsize); continue; } if (rec->jsr_cnt == 0) { if (debug) printf("Found illegal count %d\n", rec->jsr_cnt); jblocks_advance(suj_jblocks, recsize); continue; } blocks = rec->jsr_blocks; recsize = blocks * real_dev_bsize; if (recsize > size) { /* * We may just have run out of buffer, restart * the loop to re-read from this spot. */ if (size < fs->fs_bsize && size != readsize && recsize <= fs->fs_bsize) goto restart; if (debug) printf("Found invalid segsize %d > %d\n", recsize, size); recsize = real_dev_bsize; jblocks_advance(suj_jblocks, recsize); continue; } /* * Verify that all blocks in the segment are present. */ for (i = 1; i < blocks; i++) { recn = (void *)((uintptr_t)rec) + i * real_dev_bsize; if (recn->jsr_seq == rec->jsr_seq && recn->jsr_time == rec->jsr_time) continue; if (debug) printf("Incomplete record %jd (%d)\n", rec->jsr_seq, i); recsize = i * real_dev_bsize; jblocks_advance(suj_jblocks, recsize); goto restart; } seg = errmalloc(sizeof(*seg)); seg->ss_blk = errmalloc(recsize); seg->ss_rec = *rec; bcopy((void *)rec, seg->ss_blk, recsize); if (rec->jsr_oldest > oldseq) oldseq = rec->jsr_oldest; TAILQ_INSERT_TAIL(&allsegs, seg, ss_next); jblocks_advance(suj_jblocks, recsize); } } } /* * Orchestrate the verification of a filesystem via the softupdates journal. 
*/ int suj_check(const char *filesys) { struct inodesc idesc; struct csum *cgsum; union dinode *jip; struct inode ip; uint64_t blocks; int i, retval; struct suj_seg *seg; struct suj_seg *segn; initsuj(); fs = &sblock; if (real_dev_bsize == 0 && ioctl(fsreadfd, DIOCGSECTORSIZE, &real_dev_bsize) == -1) real_dev_bsize = secsize; if (debug) printf("dev_bsize %u\n", real_dev_bsize); /* * Set an exit point when SUJ check failed */ retval = setjmp(jmpbuf); if (retval != 0) { pwarn("UNEXPECTED SU+J INCONSISTENCY\n"); TAILQ_FOREACH_SAFE(seg, &allsegs, ss_next, segn) { TAILQ_REMOVE(&allsegs, seg, ss_next); free(seg->ss_blk); free(seg); } if (reply("FALLBACK TO FULL FSCK") == 0) { ckfini(0); exit(EEXIT); } else return (-1); } /* * Search the root directory for the SUJ_FILE. */ idesc.id_type = DATA; idesc.id_fix = IGNORE; idesc.id_number = UFS_ROOTINO; idesc.id_func = findino; idesc.id_name = SUJ_FILE; ginode(UFS_ROOTINO, &ip); if ((ckinode(ip.i_dp, &idesc) & FOUND) == FOUND) { sujino = idesc.id_parent; irelse(&ip); } else { printf("Journal inode removed. Use tunefs to re-create.\n"); sblock.fs_flags &= ~FS_SUJ; sblock.fs_sujfree = 0; irelse(&ip); return (-1); } /* * Fetch the journal inode and verify it. */ ginode(sujino, &ip); jip = ip.i_dp; printf("** SU+J Recovering %s\n", filesys); if (suj_verifyino(jip) != 0 || (!preen && !reply("USE JOURNAL"))) { irelse(&ip); return (-1); } /* * Build a list of journal blocks in jblocks before parsing the * available journal blocks in with suj_read(). 
*/ printf("** Reading %jd byte journal from inode %ju.\n", DIP(jip, di_size), (uintmax_t)sujino); suj_jblocks = jblocks_create(); blocks = ino_visit(jip, sujino, suj_add_block, 0); if (blocks != numfrags(fs, DIP(jip, di_size))) { printf("Sparse journal inode %ju.\n", (uintmax_t)sujino); irelse(&ip); return (-1); } irelse(&ip); suj_read(); jblocks_destroy(suj_jblocks); suj_jblocks = NULL; if (preen || reply("RECOVER")) { printf("** Building recovery table.\n"); suj_prune(); suj_build(); cg_apply(cg_build); printf("** Resolving unreferenced inode list.\n"); ino_unlinked(); printf("** Processing journal entries.\n"); cg_apply(cg_trunc); cg_apply(cg_check_blk); cg_apply(cg_adj_blk); cg_apply(cg_check_ino); } if (preen == 0 && (jrecs > 0 || jbytes > 0) && reply("WRITE CHANGES") == 0) return (0); /* * Check block counts of snapshot inodes and * make copies of any needed snapshot blocks. */ for (i = 0; i < snapcnt; i++) check_blkcnt(&snaplist[i]); snapflush(suj_checkblkavail); /* * Recompute the fs summary info from correct cs summaries. 
*/ bzero(&fs->fs_cstotal, sizeof(struct csum_total)); for (i = 0; i < fs->fs_ncg; i++) { cgsum = &fs->fs_cs(fs, i); fs->fs_cstotal.cs_nffree += cgsum->cs_nffree; fs->fs_cstotal.cs_nbfree += cgsum->cs_nbfree; fs->fs_cstotal.cs_nifree += cgsum->cs_nifree; fs->fs_cstotal.cs_ndir += cgsum->cs_ndir; } fs->fs_pendinginodes = 0; fs->fs_pendingblocks = 0; fs->fs_clean = 1; fs->fs_time = time(NULL); fs->fs_mtime = time(NULL); sbdirty(); ckfini(1); if (jrecs > 0 || jbytes > 0) { printf("** %jd journal records in %jd bytes for %.2f%% utilization\n", jrecs, jbytes, ((float)jrecs / (float)(jbytes / JREC_SIZE)) * 100); printf("** Freed %jd inodes (%jd dirs) %jd blocks, and %jd frags.\n", freeinos, freedir, freeblocks, freefrags); } return (0); } static void initsuj(void) { int i; for (i = 0; i < HASHSIZE; i++) LIST_INIT(&cghash[i]); lastcg = NULL; TAILQ_INIT(&allsegs); oldseq = 0; fs = NULL; sujino = 0; freefrags = 0; freeblocks = 0; freeinos = 0; freedir = 0; jbytes = 0; jrecs = 0; suj_jblocks = NULL; }