diff --git a/sbin/fsck_ffs/dir.c b/sbin/fsck_ffs/dir.c --- a/sbin/fsck_ffs/dir.c +++ b/sbin/fsck_ffs/dir.c @@ -679,7 +679,7 @@ struct bufarea *bp, *nbp; struct inodesc idesc; union dinode *dp; - int indiralloced; + long cg, indiralloced; char *cp; nbp = NULL; @@ -687,6 +687,7 @@ pwarn("NO SPACE LEFT IN %s", name); if (!preen && reply("EXPAND") == 0) return (0); + cg = ino_to_cg(&sblock, ip->i_number); dp = ip->i_dp; filesize = DIP(dp, di_size); lastlbn = lblkno(&sblock, filesize); @@ -705,7 +706,7 @@ bp = getdirblk(oldblk, lastlbnsize); if (bp->b_errs) goto bad; - if ((newblk = allocblk(sblock.fs_frag)) == 0) + if ((newblk = allocblk(cg, sblock.fs_frag, checkblkavail)) == 0) goto bad; nbp = getdatablk(newblk, sblock.fs_bsize, BT_DIRDATA); if (nbp->b_errs) @@ -731,7 +732,7 @@ printf(" (EXPANDED)\n"); return (1); } - if ((newblk = allocblk(sblock.fs_frag)) == 0) + if ((newblk = allocblk(cg, sblock.fs_frag, checkblkavail)) == 0) goto bad; bp = getdirblk(newblk, sblock.fs_bsize); if (bp->b_errs) @@ -749,7 +750,8 @@ * Allocate indirect block if needed. 
*/ if ((indirblk = DIP(dp, di_ib[0])) == 0) { - if ((indirblk = allocblk(sblock.fs_frag)) == 0) + indirblk = allocblk(cg, sblock.fs_frag, checkblkavail); + if (indirblk == 0) goto bad; indiralloced = 1; } diff --git a/sbin/fsck_ffs/ea.c b/sbin/fsck_ffs/ea.c --- a/sbin/fsck_ffs/ea.c +++ b/sbin/fsck_ffs/ea.c @@ -74,8 +74,10 @@ blksiz = sblock.fs_fsize; else blksiz = sblock.fs_bsize; - printf("blksiz = %ju\n", (intmax_t)blksiz); bp = getdatablk(dp->di_extb[0], blksiz, BT_EXTATTR); + if (bp->b_errs) + return (STOP); + printf("blksiz = %ju\n", (intmax_t)blksiz); cp = (u_char *)bp->b_un.b_buf; for (n = 0; n < blksiz; n++) { printf("%02x", cp[n]); diff --git a/sbin/fsck_ffs/fsck.h b/sbin/fsck_ffs/fsck.h --- a/sbin/fsck_ffs/fsck.h +++ b/sbin/fsck_ffs/fsck.h @@ -200,8 +200,7 @@ #define BT_INODES 7 /* Buffer holds inodes */ #define BT_DIRDATA 8 /* Buffer holds directory data */ #define BT_DATA 9 /* Buffer holds user data */ -#define BT_EMPTY 10 /* Buffer allocated but not filled */ -#define BT_NUMBUFTYPES 11 +#define BT_NUMBUFTYPES 10 #define BT_NAMES { \ "unknown", \ "Superblock", \ @@ -212,8 +211,7 @@ "External Attribute", \ "Inode Block", \ "Directory Contents", \ - "User Data", \ - "Allocated but not filled" } + "User Data" } extern char *buftype[]; #define BT_BUFTYPE(type) \ type < BT_NUMBUFTYPES ? 
buftype[type] : buftype[BT_UNKNOWN] @@ -234,7 +232,7 @@ (bp)->b_flags |= B_DIRTY; \ } while (0) #define initbarea(bp, type) do { \ - (bp)->b_bno = (ufs2_daddr_t)-1; \ + (bp)->b_bno = (ufs2_daddr_t)-4; \ (bp)->b_size = 0; \ (bp)->b_errs = 0; \ (bp)->b_flags = 0; \ @@ -347,6 +345,7 @@ extern char *cdevname; /* name of device being checked */ extern char ckclean; /* only do work if not cleanly unmounted */ extern int ckhashadd; /* check hashes to be added */ +extern char *copybuf; /* buffer to copy snapshot blocks */ extern int cvtlevel; /* convert to newer file system format */ extern long dev_bsize; /* computed value of DEV_BSIZE */ extern u_int real_dev_bsize; /* actual disk sector size, not overridden */ @@ -371,6 +370,8 @@ extern int returntosingle; /* 1 => return to single user mode on exit */ extern long secsize; /* actual disk sector size */ extern char skipclean; /* skip clean file systems if preening */ +extern int snapcnt; /* number of active snapshots */ +extern struct inode snaplist[FSMAXSNAP + 1]; /* list of active snapshots */ extern char snapname[BUFSIZ]; /* when doing snapshots, the name of the file */ extern int sujrecovery; /* 1 => doing check using the journal */ extern int surrender; /* Give up if reads fail */ @@ -441,7 +442,8 @@ void adjust(struct inodesc *, int lcnt); void alarmhandler(int sig); -ufs2_daddr_t allocblk(long frags); +ufs2_daddr_t allocblk(long cg, long frags, ufs2_daddr_t (*checkblkavail) + (ufs2_daddr_t blkno, long frags)); ino_t allocdir(ino_t parent, ino_t request, int mode); ino_t allocino(ino_t request, int type); void blkerror(ino_t ino, const char *type, ufs2_daddr_t blk); @@ -459,11 +461,13 @@ struct bufarea *cglookup(int cg); int changeino(ino_t dir, const char *name, ino_t newnum); int check_cgmagic(int cg, struct bufarea *cgbp, int requestrebuild); +ufs2_daddr_t checkblkavail(ufs2_daddr_t blkno, long frags); int chkrange(ufs2_daddr_t blk, int cnt); void ckfini(int markclean); int ckinode(union dinode *dp, struct 
inodesc *); void clri(struct inodesc *, const char *type, int flag); int clearentry(struct inodesc *); +void copyonwrite(struct fs *, struct bufarea *); void direrror(ino_t ino, const char *errmesg); int dirscan(struct inodesc *); int dofix(struct inodesc *, const char *msg); @@ -505,6 +509,7 @@ void pass5(void); void pfatal(const char *fmt, ...) __printflike(1, 2); void propagate(void); +void prtbuf(struct bufarea *, const char *, ...) __printflike(2, 3); void prtinode(struct inode *); void pwarn(const char *fmt, ...) __printflike(1, 2); int readsb(void); @@ -513,6 +518,10 @@ void sblock_init(void); void setinodebuf(int, ino_t); int setup(char *dev); +int snapblkfree(struct fs *, ufs2_daddr_t, long, ino_t, + ufs2_daddr_t (*)(ufs2_daddr_t, long)); +void snapremove(ino_t); +void snapflush(void); void gjournal_check(const char *filesys); int suj_check(const char *filesys); void update_maps(struct cg *, struct cg*, int); diff --git a/sbin/fsck_ffs/fsutil.c b/sbin/fsck_ffs/fsutil.c --- a/sbin/fsck_ffs/fsutil.c +++ b/sbin/fsck_ffs/fsutil.c @@ -71,7 +71,6 @@ static void slowio_start(void); static void slowio_end(void); static void printIOstats(void); -static void prtbuf(const char *, struct bufarea *); static long diskreads, totaldiskreads, totalreads; /* Disk cache statistics */ static struct timespec startpass, finishpass; @@ -79,6 +78,7 @@ int slowio_delay_usec = 10000; /* Initial IO delay for background fsck */ int slowio_pollcnt; static struct bufarea cgblk; /* backup buffer for cylinder group blocks */ +static struct bufarea failedbuf; /* returned by failed getdatablk() */ static TAILQ_HEAD(bufqueue, bufarea) bufqueuehd; /* head of buffer cache LRU */ static LIST_HEAD(bufhash, bufarea) bufhashhd[HASHSIZE]; /* buffer hash list */ static int numbufs; /* size of buffer cache */ @@ -187,6 +187,9 @@ { int i; + initbarea(&failedbuf, BT_UNKNOWN); + failedbuf.b_errs = -1; + failedbuf.b_un.b_buf = NULL; if ((cgblk.b_un.b_buf = Malloc((unsigned int)sblock.fs_bsize)) == NULL) 
errx(EEXIT, "Initial malloc(%d) failed", sblock.fs_bsize); initbarea(&cgblk, BT_CYLGRP); @@ -300,7 +303,7 @@ } /* - * Manage a cache of directory blocks. + * Manage a cache of filesystem disk blocks. */ struct bufarea * getdatablk(ufs2_daddr_t blkno, long size, int type) @@ -309,16 +312,19 @@ struct bufhash *bhdp; cachelookups++; - /* If out of range, return empty buffer with b_err == -1 */ - if (type != BT_INODES && chkrange(blkno, size / sblock.fs_fsize)) { - blkno = -1; - type = BT_EMPTY; - } + /* + * If out of range, return empty buffer with b_err == -1 + * + * Skip check for inodes because chkrange() considers + * metadata areas invalid to write data. + */ + if (type != BT_INODES && chkrange(blkno, size / sblock.fs_fsize)) + return (&failedbuf); bhdp = &bufhashhd[HASH(blkno)]; LIST_FOREACH(bp, bhdp, b_hash) if (bp->b_bno == fsbtodb(&sblock, blkno)) { if (debug && bp->b_size != size) { - prtbuf("getdatablk: size mismatch", bp); + prtbuf(bp, "getdatablk: size mismatch"); pfatal("getdatablk: b_size %d != size %ld\n", bp->b_size, size); } @@ -378,7 +384,7 @@ if (debug && bp->b_type != type) { printf("getdatablk: buffer type changed to %s", BT_BUFTYPE(type)); - prtbuf("", bp); + prtbuf(bp, ""); } TAILQ_REMOVE(&bufqueuehd, bp, b_list); TAILQ_INSERT_HEAD(&bufqueuehd, bp, b_list); @@ -401,11 +407,7 @@ readcnt[bp->b_type]++; clock_gettime(CLOCK_REALTIME_PRECISE, &start); } - if (bp->b_type != BT_EMPTY) - bp->b_errs = - blread(fsreadfd, bp->b_un.b_buf, dblk, size); - else - bp->b_errs = -1; + bp->b_errs = blread(fsreadfd, bp->b_un.b_buf, dblk, size); if (debug) { clock_gettime(CLOCK_REALTIME_PRECISE, &finish); timespecsub(&finish, &start, &finish); @@ -422,7 +424,7 @@ { if (bp->b_refcnt <= 0) - prtbuf("brelse: buffer with negative reference count", bp); + prtbuf(bp, "brelse: buffer with negative reference count"); bp->b_refcnt--; } @@ -451,10 +453,18 @@ if (bp != &sblk) pfatal("BUFFER %p DOES NOT MATCH SBLK %p\n", bp, &sblk); + /* + * Superblocks are always pre-copied 
so we do not need + * to check them for copy-on-write. + */ if (sbput(fd, bp->b_un.b_fs, 0) == 0) fsmodified = 1; break; case BT_CYLGRP: + /* + * Cylinder groups are always pre-copied so we do not + * need to check them for copy-on-write. + */ if (sujrecovery) cg_write(bp); if (cgput(fswritefd, &sblock, bp->b_un.b_cg) == 0) @@ -483,11 +493,38 @@ } /* FALLTHROUGH */ default: + copyonwrite(&sblock, bp); blwrite(fd, bp->b_un.b_buf, bp->b_bno, bp->b_size); break; } } +/* + * If there are any snapshots, ensure that all the blocks that they + * care about have been copied, then release the snapshot inodes. + * These operations need to be done before we rebuild the cylinder + * groups so that any block allocations are properly recorded. + * Since all the cylinder group maps have already been copied in + * the snapshots, no further snapshot copies will need to be done. + */ +void +snapflush(void) +{ + struct bufarea *bp; + int cnt; + + if (snapcnt > 0) { + if (debug) + printf("Check for snapshot copies\n"); + TAILQ_FOREACH_REVERSE(bp, &bufqueuehd, bufqueue, b_list) + if ((bp->b_flags & B_DIRTY) != 0) + copyonwrite(&sblock, bp); + for (cnt = 0; cnt < snapcnt; cnt++) + irelse(&snaplist[cnt]); + snapcnt = 0; + } +} + /* * Journaled soft updates does not maintain cylinder group summary * information during cleanup, so this routine recalculates the summary @@ -503,6 +540,7 @@ int blk; int i; + snapflush(); /* * Fix the frag and cluster summary. 
*/ @@ -587,6 +625,7 @@ (void)close(fsreadfd); return; } + /* * To remain idempotent with partial truncations the buffers * must be flushed in this order: @@ -629,14 +668,9 @@ case BT_SUPERBLK: case BT_CYLGRP: default: - prtbuf("ckfini: improper buffer type on cache list",bp); + prtbuf(bp,"ckfini: improper buffer type on cache list"); continue; /* These are the ones to flush in this step */ - case BT_EMPTY: - if (bp->b_bno >= 0) - pfatal("Unused BT_EMPTY buffer for block %jd\n", - (intmax_t)bp->b_bno); - /* FALLTHROUGH */ case BT_LEVEL1: case BT_LEVEL2: case BT_LEVEL3: @@ -649,10 +683,11 @@ continue; } if (debug && bp->b_refcnt != 0) { - prtbuf("ckfini: clearing in-use buffer", bp); + prtbuf(bp, "ckfini: clearing in-use buffer"); pfatal("ckfini: clearing in-use buffer\n"); } TAILQ_REMOVE(&bufqueuehd, bp, b_list); + LIST_REMOVE(bp, b_hash); cnt++; flush(fswritefd, bp); free(bp->b_un.b_buf); @@ -667,10 +702,11 @@ } TAILQ_FOREACH_REVERSE_SAFE(bp, &bufqueuehd, bufqueue, b_list, nbp) { if (debug && bp->b_refcnt != 0) { - prtbuf("ckfini: clearing in-use buffer", bp); + prtbuf(bp, "ckfini: clearing in-use buffer"); pfatal("ckfini: clearing in-use buffer\n"); } TAILQ_REMOVE(&bufqueuehd, bp, b_list); + LIST_REMOVE(bp, b_hash); cnt++; flush(fswritefd, bp); free(bp->b_un.b_buf); @@ -1050,45 +1086,73 @@ * allocate a data block with the specified number of fragments */ ufs2_daddr_t -allocblk(long frags) +allocblk(long startcg, long frags, + ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t blkno, long frags)) { - int i, j, k, cg, baseblk; - struct bufarea *cgbp; - struct cg *cgp; + ufs2_daddr_t blkno, newblk; if (frags <= 0 || frags > sblock.fs_frag) return (0); - for (i = 0; i < maxfsblock - sblock.fs_frag; i += sblock.fs_frag) { - for (j = 0; j <= sblock.fs_frag - frags; j++) { - if (testbmap(i + j)) - continue; - for (k = 1; k < frags; k++) - if (testbmap(i + j + k)) - break; - if (k < frags) { - j += k; - continue; - } - cg = dtog(&sblock, i + j); - cgbp = cglookup(cg); - cgp = 
cgbp->b_un.b_cg; - if (!check_cgmagic(cg, cgbp, 0)) { - i = (cg + 1) * sblock.fs_fpg - sblock.fs_frag; - continue; - } - baseblk = dtogd(&sblock, i + j); - for (k = 0; k < frags; k++) { - setbmap(i + j + k); - clrbit(cg_blksfree(cgp), baseblk + k); - } - n_blks += frags; - if (frags == sblock.fs_frag) - cgp->cg_cs.cs_nbfree--; - else - cgp->cg_cs.cs_nffree -= frags; - cgdirty(cgbp); - return (i + j); + for (blkno = cgdata(&sblock, startcg); + blkno < maxfsblock - sblock.fs_frag; + blkno += sblock.fs_frag) { + if ((newblk = (*checkblkavail)(blkno, frags)) == 0) + continue; + if (newblk > 0) + return (newblk); + if (newblk < 0) + blkno = -newblk; + } + for (blkno = cgdata(&sblock, 0); + blkno < cgbase(&sblock, startcg) - sblock.fs_frag; + blkno += sblock.fs_frag) { + if ((newblk = (*checkblkavail)(blkno, frags)) == 0) + continue; + if (newblk > 0) + return (newblk); + if (newblk < 0) + blkno = -newblk; + } + return (0); +} + +ufs2_daddr_t +checkblkavail(blkno, frags) + ufs2_daddr_t blkno; + long frags; +{ + struct bufarea *cgbp; + struct cg *cgp; + ufs2_daddr_t j, k, baseblk; + long cg; + + for (j = 0; j <= sblock.fs_frag - frags; j++) { + if (testbmap(blkno + j)) + continue; + for (k = 1; k < frags; k++) + if (testbmap(blkno + j + k)) + break; + if (k < frags) { + j += k; + continue; + } + cg = dtog(&sblock, blkno + j); + cgbp = cglookup(cg); + cgp = cgbp->b_un.b_cg; + if (!check_cgmagic(cg, cgbp, 0)) + return (-((cg + 1) * sblock.fs_fpg - sblock.fs_frag)); + baseblk = dtogd(&sblock, blkno + j); + for (k = 0; k < frags; k++) { + setbmap(blkno + j + k); + clrbit(cg_blksfree(cgp), baseblk + k); } + n_blks += frags; + if (frags == sblock.fs_frag) + cgp->cg_cs.cs_nbfree--; + else + cgp->cg_cs.cs_nffree -= frags; + cgdirty(cgbp); + return (blkno + j); } return (0); } @@ -1261,14 +1325,19 @@ /* * Print details about a buffer. */ -static void -prtbuf(const char *msg, struct bufarea *bp) +void +prtbuf(struct bufarea *bp, const char *fmt, ...) 
{ - - printf("%s: bp %p, type %s, bno %jd, size %d, refcnt %d, flags %s, " - "index %jd\n", msg, bp, BT_BUFTYPE(bp->b_type), - (intmax_t) bp->b_bno, bp->b_size, bp->b_refcnt, - bp->b_flags & B_DIRTY ? "dirty" : "clean", (intmax_t) bp->b_index); + va_list ap; + va_start(ap, fmt); + if (preen) + (void)fprintf(stdout, "%s: ", cdevname); + (void)vfprintf(stdout, fmt, ap); + va_end(ap); + printf(": bp %p, type %s, bno %jd, size %d, refcnt %d, flags %s, " + "index %jd\n", bp, BT_BUFTYPE(bp->b_type), (intmax_t) bp->b_bno, + bp->b_size, bp->b_refcnt, bp->b_flags & B_DIRTY ? "dirty" : "clean", + (intmax_t) bp->b_index); } /* diff --git a/sbin/fsck_ffs/inode.c b/sbin/fsck_ffs/inode.c --- a/sbin/fsck_ffs/inode.c +++ b/sbin/fsck_ffs/inode.c @@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include @@ -58,6 +59,8 @@ static int iblock(struct inodesc *, off_t isize, int type); static ufs2_daddr_t indir_blkatoff(ufs2_daddr_t, ino_t, ufs_lbn_t, ufs_lbn_t, struct bufarea **); +static int snapclean(struct inodesc *idesc); +static void chkcopyonwrite(struct fs *, ufs2_daddr_t); int ckinode(union dinode *dp, struct inodesc *idesc) @@ -378,8 +381,12 @@ int c; if (cnt <= 0 || blk <= 0 || blk > maxfsblock || - cnt - 1 > maxfsblock - blk) + cnt - 1 > maxfsblock - blk) { + if (debug) + printf("out of range: blk %ld, offset %i, size %d\n", + (long)blk, (int)fragnum(&sblock, blk), cnt); return (1); + } if (cnt > sblock.fs_frag || fragnum(&sblock, blk) + cnt > sblock.fs_frag) { if (debug) @@ -651,10 +658,17 @@ { struct dups *dlp; ufs2_daddr_t blkno; - long nfrags, res; + long size, nfrags, res; res = KEEPON; blkno = idesc->id_blkno; + if (idesc->id_type == SNAP) { + pfatal("clearing a snapshot dinode\n"); + return (STOP); + } + size = lfragtosize(&sblock, idesc->id_numfrags); + if (snapblkfree(&sblock, blkno, size, idesc->id_number, checkblkavail)) + return (res); for (nfrags = idesc->id_numfrags; nfrags > 0; blkno++, nfrags--) { if (chkrange(blkno, 1)) { res = SKIP; @@ 
-677,9 +691,323 @@ return (res); } +/* + * Prepare a snapshot file for being removed. + */ +void +snapremove(ino_t inum) +{ + struct inodesc idesc; + struct inode ip; + int i; + + for (i = 0; i < snapcnt; i++) + if (snaplist[i].i_number == inum) + break; + ip = snaplist[i]; + if (i == snapcnt || (DIP(ip.i_dp, di_flags) & SF_SNAPSHOT) == 0) { + printf("snapremove: inode %jd is not a snapshot\n", + (intmax_t)inum); + return; + } + /* + * Remove from active snapshot list. + */ + for (i++; i < FSMAXSNAP; i++) { + if (sblock.fs_snapinum[i] == 0) + break; + snaplist[i - 1] = snaplist[i]; + sblock.fs_snapinum[i - 1] = sblock.fs_snapinum[i]; + } + sblock.fs_snapinum[i - 1] = 0; + bzero(&snaplist[i - 1], sizeof(struct inode)); + snapcnt--; + idesc.id_type = SNAP; + idesc.id_func = snapclean; + idesc.id_number = inum; + (void)ckinode(ip.i_dp, &idesc); + DIP_SET(ip.i_dp, di_flags, DIP(ip.i_dp, di_flags) & ~SF_SNAPSHOT); + inodirty(&ip); + irelse(&ip); +} + +static int +snapclean(struct inodesc *idesc) +{ + ufs2_daddr_t blkno; + struct bufarea *bp; + union dinode *dp; + + blkno = idesc->id_blkno; + if (blkno == 0) + return (KEEPON); + + bp = idesc->id_bp; + dp = idesc->id_dp; + if (blkno == BLK_NOCOPY || blkno == BLK_SNAP || + (blkno == blkstofrags(&sblock, idesc->id_lbn) && + snapblkfree(&sblock, blkno, sblock.fs_bsize, idesc->id_number, + checkblkavail))) { + if (idesc->id_lbn < UFS_NDADDR) + DIP_SET(dp, di_db[idesc->id_lbn], 0); + else + IBLK_SET(bp, bp->b_index, 0); + if (blkno == blkstofrags(&sblock, idesc->id_lbn)) + DIP_SET(dp, di_blocks, DIP(dp, di_blocks) - + btodb(sblock.fs_bsize)); + dirty(bp); + } + return (KEEPON); +} + +/* + * Notification that a block is being freed. Return zero if the free + * should be allowed to proceed. Return non-zero if the snapshot file + * wants to claim the block. The block will be claimed if it is an + * uncopied part of one of the snapshots. 
It will be freed if it is + * either a BLK_NOCOPY or has already been copied in all of the snapshots. + * If a fragment is being freed, then all snapshots that care about + * it must make a copy since a snapshot file can only claim full sized + * blocks. Note that if more than one snapshot file maps the block, + * we can pick one at random to claim it. Since none of the snapshots + * can change, we are assured that they will all see the same unmodified + * image. When deleting a snapshot file (see ino_trunc above), we + * must push any of these claimed blocks to one of the other snapshots + * that maps it. These claimed blocks are easily identified as they will + * have a block number equal to their logical block number within the + * snapshot. A copied block can never have this property because they + * must always have been allocated from a BLK_NOCOPY location. + */ +int +snapblkfree(fs, bno, size, inum, checkblkavail) + struct fs *fs; + ufs2_daddr_t bno; + long size; + ino_t inum; + ufs2_daddr_t (*checkblkavail)(ufs2_daddr_t blkno, long frags); +{ + union dinode *dp; + struct inode ip; + struct bufarea *snapbp; + ufs_lbn_t lbn; + ufs2_daddr_t blkno, relblkno; + int i, frags, claimedblk, copydone; + + /* If no snapshots, nothing to do */ + if (snapcnt == 0) + return (0); + if (debug) + printf("snapblkfree: in ino %ju free blkno %jd, size %ld\n", + (uintmax_t)inum, (intmax_t)bno, size); + relblkno = blknum(fs, bno); + lbn = fragstoblks(fs, relblkno); + /* Direct blocks are always pre-copied */ + if (lbn < UFS_NDADDR) + return (0); + copydone = 0; + claimedblk = 0; + for (i = 0; i < snapcnt; i++) { + /* + * Lookup block being freed. + */ + ip = snaplist[i]; + dp = ip.i_dp; + blkno = ino_blkatoff(dp, inum != 0 ? inum : ip.i_number, + lbn, &frags, &snapbp); + /* + * Check to see if block needs to be copied. + */ + if (blkno == 0) { + /* + * A block that we map is being freed. If it has not + * been claimed yet, we will claim or copy it (below).
+ */ + claimedblk = 1; + } else if (blkno == BLK_SNAP) { + /* + * No previous snapshot claimed the block, + * so it will be freed and become a BLK_NOCOPY + * (don't care) for us. + */ + if (claimedblk) + pfatal("snapblkfree: inconsistent block type"); + IBLK_SET(snapbp, snapbp->b_index, BLK_NOCOPY); + dirty(snapbp); + brelse(snapbp); + continue; + } else /* BLK_NOCOPY or default */ { + /* + * If the snapshot has already copied the block + * (default), or does not care about the block, + * it is not needed. + */ + brelse(snapbp); + continue; + } + /* + * If this is a full size block, we will just grab it + * and assign it to the snapshot inode. Otherwise we + * will proceed to copy it. See explanation for this + * routine as to why only a single snapshot needs to + * claim this block. + */ + if (size == fs->fs_bsize) { + if (debug) + printf("Grabonremove snapshot %ju lbn %jd " + "from inum %ju\n", (intmax_t)ip.i_number, + (intmax_t)lbn, (uintmax_t)inum); + IBLK_SET(snapbp, snapbp->b_index, relblkno); + dirty(snapbp); + brelse(snapbp); + DIP_SET(dp, di_blocks, + DIP(dp, di_blocks) + btodb(size)); + inodirty(&ip); + return (1); + } + + /* First time through, read the contents of the old block. */ + if (copydone == 0) { + copydone = 1; + if (blread(fsreadfd, copybuf, fsbtodb(fs, relblkno), + fs->fs_bsize) != 0) { + pfatal("Could not read snapshot %ju block " + "%jd\n", (intmax_t)ip.i_number, + (intmax_t)relblkno); + continue; + } + } + /* + * This allocation will never require any additional + * allocations for the snapshot inode. 
+ */ + blkno = (*allocblk)(dtog(fs, relblkno), fs->fs_frag, + checkblkavail); + if (blkno == 0) { + pfatal("Could not allocate block for snapshot %ju\n", + (uintmax_t)ip.i_number); + continue; + } + if (debug) + printf("Copyonremove: snapino %jd lbn %jd for inum %ju " + "size %ld new blkno %jd\n", (intmax_t)ip.i_number, + (intmax_t)lbn, (uintmax_t)inum, size, + (intmax_t)blkno); + blwrite(fswritefd, copybuf, fsbtodb(fs, blkno), fs->fs_bsize); + IBLK_SET(snapbp, snapbp->b_index, blkno); + dirty(snapbp); + brelse(snapbp); + DIP_SET(dp, di_blocks, + DIP(dp, di_blocks) + btodb(fs->fs_bsize)); + inodirty(&ip); + } + return (0); +} + +/* + * Notification that a block is being written. Return if the block + * is part of a snapshot as snapshots never track other snapshots. + * The block will be copied in all of the snapshots that are tracking + * it and have not yet copied it. Some buffers may hold more than one + * block. Here we need to check each block in the buffer. + */ +void +copyonwrite(fs, bp) + struct fs *fs; + struct bufarea *bp; +{ + ufs2_daddr_t copyblkno; + long i, numblks; + + /* If no snapshots, nothing to do. */ + if (snapcnt == 0) + return; + numblks = blkroundup(fs, bp->b_size) / fs->fs_bsize; + if (debug) + prtbuf(bp, "copyonwrite: checking %jd block%s in buffer", + (intmax_t)numblks, numblks > 1 ? "s" : ""); + copyblkno = blknum(fs, dbtofsb(fs, bp->b_bno)); + for (i = 0; i < numblks; i++) { + chkcopyonwrite(fs, copyblkno); + copyblkno += fs->fs_frag; + } +} + +static void +chkcopyonwrite(fs, copyblkno) + struct fs *fs; + ufs2_daddr_t copyblkno; +{ + struct inode ip; + union dinode *dp; + struct bufarea *snapbp; + ufs2_daddr_t blkno; + int i, frags, copydone; + ufs_lbn_t lbn; + + lbn = fragstoblks(fs, copyblkno); + /* Direct blocks are always pre-copied */ + if (lbn < UFS_NDADDR) + return; + copydone = 0; + for (i = 0; i < snapcnt; i++) { + /* + * Lookup block being written.
+ */ + ip = snaplist[i]; + dp = ip.i_dp; + blkno = ino_blkatoff(dp, ip.i_number, lbn, &frags, &snapbp); + /* + * Check to see if block needs to be copied. + */ + if (blkno != 0) { + /* + * A block that we have already copied or don't track. + */ + brelse(snapbp); + continue; + } + /* First time through, read the contents of the old block. */ + if (copydone == 0) { + copydone = 1; + if (blread(fsreadfd, copybuf, fsbtodb(fs, copyblkno), + fs->fs_bsize) != 0) { + pfatal("Could not read snapshot %ju block " + "%jd\n", (intmax_t)ip.i_number, + (intmax_t)copyblkno); + continue; + } + } + /* + * This allocation will never require any additional + * allocations for the snapshot inode. + */ + if ((blkno = allocblk(dtog(fs, copyblkno), fs->fs_frag, + checkblkavail)) == 0) { + pfatal("Could not allocate block for snapshot %ju\n", + (intmax_t)ip.i_number); + continue; + } + if (debug) + prtbuf(snapbp, "Copyonwrite: snapino %jd lbn %jd using " + "blkno %ju setting in buffer", + (intmax_t)ip.i_number, (intmax_t)lbn, + (intmax_t)blkno); + blwrite(fswritefd, copybuf, fsbtodb(fs, blkno), fs->fs_bsize); + IBLK_SET(snapbp, snapbp->b_index, blkno); + dirty(snapbp); + brelse(snapbp); + DIP_SET(dp, di_blocks, + DIP(dp, di_blocks) + btodb(fs->fs_bsize)); + inodirty(&ip); + } + return; +} + void freeinodebuf(void) { + struct bufarea *bp; + int i; /* * Flush old contents in case they have been updated. @@ -689,6 +1017,14 @@ free((char *)inobuf.b_un.b_buf); inobuf.b_un.b_buf = NULL; firstinum = lastinum = 0; + /* + * Reload the snapshot inodes in case any of them changed. 
+ */ + for (i = 0; i < snapcnt; i++) { + bp = snaplist[i].i_bp; + bp->b_errs = blread(fsreadfd, bp->b_un.b_buf, bp->b_bno, + bp->b_size); + } } /* @@ -803,6 +1139,10 @@ printf(" (CLEARED)\n"); n_files--; if (bkgrdflag == 0) { + if (idesc->id_type == SNAP) { + snapremove(idesc->id_number); + idesc->id_type = ADDR; + } (void)ckinode(dp, idesc); inoinfo(idesc->id_number)->ino_state = USTATE; clearinode(dp); @@ -967,7 +1307,8 @@ cgdirty(cgbp); ginode(ino, &ip); dp = ip.i_dp; - DIP_SET(dp, di_db[0], allocblk((long)1)); + DIP_SET(dp, di_db[0], allocblk(ino_to_cg(&sblock, ino), (long)1, + checkblkavail)); if (DIP(dp, di_db[0]) == 0) { inoinfo(ino)->ino_state = USTATE; irelse(&ip); diff --git a/sbin/fsck_ffs/main.c b/sbin/fsck_ffs/main.c --- a/sbin/fsck_ffs/main.c +++ b/sbin/fsck_ffs/main.c @@ -491,6 +491,7 @@ */ if (preen == 0) printf("** Phase 5 - Check Cyl groups\n"); + snapflush(); pass5(); IOstats("Pass5"); diff --git a/sbin/fsck_ffs/setup.c b/sbin/fsck_ffs/setup.c --- a/sbin/fsck_ffs/setup.c +++ b/sbin/fsck_ffs/setup.c @@ -59,6 +59,9 @@ #include "fsck.h" struct inoinfo **inphead, **inpsort; /* info about all inodes */ +struct inode snaplist[FSMAXSNAP + 1]; /* list of active snapshots */ +int snapcnt; /* number of active snapshots */ +char *copybuf; /* buffer to copy snapshot blocks */ static int sbhashfailed; #define POWEROF2(num) (((num) & ((num) - 1)) == 0) @@ -66,6 +69,8 @@ static int calcsb(char *dev, int devfd, struct fs *fs); static void saverecovery(int readfd, int writefd); static int chkrecovery(int devfd); +static int getlbnblkno(struct inodesc *); +static int checksnapinfo(struct inode *); /* * Read in a superblock finding an alternate if necessary. @@ -75,7 +80,8 @@ int setup(char *dev) { - long bmapsize; + long i, bmapsize; + struct inode ip; /* * We are expected to have an open file descriptor and a superblock. 
@@ -174,6 +180,39 @@ usedsoftdep = 1; else usedsoftdep = 0; + /* + * Collect any snapshot inodes so that we can allow them to + * claim any blocks that we free. The code for doing this is + * imported here and into inode.c from sys/ufs/ffs/ffs_snapshot.c. + */ + for (snapcnt = 0; snapcnt < FSMAXSNAP; snapcnt++) { + if (sblock.fs_snapinum[snapcnt] == 0) + break; + ginode(sblock.fs_snapinum[snapcnt], &ip); + if ((DIP(ip.i_dp, di_mode) & IFMT) == IFREG && + (DIP(ip.i_dp, di_flags) & SF_SNAPSHOT) != 0 && + checksnapinfo(&ip)) { + snaplist[snapcnt] = ip; + continue; + } + printf("Removing non-snapshot inode %ju from snapshot list\n", + (uintmax_t)sblock.fs_snapinum[snapcnt]); + irelse(&ip); + for (i = snapcnt + 1; i < FSMAXSNAP; i++) { + if (sblock.fs_snapinum[i] == 0) + break; + sblock.fs_snapinum[i - 1] = sblock.fs_snapinum[i]; + } + sblock.fs_snapinum[i - 1] = 0; + snapcnt--; + sbdirty(); + } + if (snapcnt > 0 && copybuf == NULL) { + copybuf = Malloc(sblock.fs_bsize); + if (copybuf == NULL) + errx(EEXIT, "cannot allocate space for snapshot " + "copy buffer"); + } return (1); badsb: @@ -181,6 +220,139 @@ return (0); } +/* + * Check for valid snapshot information. + * + * Each snapshot has a list of blocks that have been copied. This list + * is consulted before checking the snapshot inode. Its purpose is to + * speed checking of commonly checked blocks and to avoid recursive + * checks of the snapshot inode. In particular, the list must contain + * the superblock, the superblock summary information, and all the + * cylinder group blocks. The list may contain other commonly checked + * pointers such as those of the blocks that contain the snapshot inodes. + * The list is sorted into block order to allow binary search lookup. + * + * The twelve direct block pointers of the snapshot are always + * copied, so we test for them first before checking the list itself + * (i.e., they are not in the list).
+ * + * The checksnapinfo() routine needs to ensure that the list contains at + * least the super block, its summary information, and the cylinder groups. + * Here we check the list first for the superblock, zero or more cylinder + * groups up to the location of the superblock summary information, the + * summary group information, and any remaining cylinder group maps that + * follow it. We skip over any other entries in the list. + */ +#define CHKBLKINLIST(chkblk) \ + /* All UFS_NDADDR blocks are copied */ \ + if ((chkblk) >= UFS_NDADDR) { \ + /* Skip over blocks that are not of interest */ \ + while (blkp < lastblkp && *blkp < (chkblk)) \ + blkp++; \ + /* Fail if end of list and not all blocks found */ \ + if (blkp >= lastblkp) { \ + pwarn("UFS%d snapshot inode %jd failed: " \ + "improper block list length (%jd)\n", \ + sblock.fs_magic == FS_UFS1_MAGIC ? 1 : 2, \ + (intmax_t)snapip->i_number, \ + (intmax_t)(lastblkp - &snapblklist[0])); \ + status = 0; \ + goto fail; \ + } \ + /* Fail if block we seek is missing */ \ + if (*blkp++ != (chkblk)) { \ + pwarn("UFS%d snapshot inode %jd failed: " \ + "block list (%jd) != %s (%jd)\n", \ + sblock.fs_magic == FS_UFS1_MAGIC ?
1 : 2, \ + (intmax_t)snapip->i_number, \ + (intmax_t)blkp[-1], #chkblk, \ + (intmax_t)(chkblk)); \ + status = 0; \ + goto fail; \ + } \ + } + +static int +checksnapinfo(struct inode *snapip) +{ + struct fs *fs; + struct bufarea *bp; + struct inodesc idesc; + daddr_t *snapblklist, *blkp, *lastblkp, csblkno; + int cg, loc, len, status; + ufs_lbn_t lbn; + size_t size; + + fs = &sblock; + memset(&idesc, 0, sizeof(struct inodesc)); + idesc.id_type = ADDR; + idesc.id_func = getlbnblkno; + idesc.id_number = snapip->i_number; + lbn = howmany(fs->fs_size, fs->fs_frag); + idesc.id_parent = lbn; /* sought after blkno */ + if ((ckinode(snapip->i_dp, &idesc) & FOUND) == 0) + return (0); + size = fragroundup(fs, + DIP(snapip->i_dp, di_size) - lblktosize(fs, lbn)); + bp = getdatablk(idesc.id_parent, size, BT_DATA); + /* An out-of-range or unreadable block list cannot be validated. */ + if (bp->b_errs != 0) + return (0); + snapblklist = (daddr_t *)bp->b_un.b_buf; + /* + * snapblklist[0] is the size of the list + * snapblklist[1] is the first element of the list + * + * We need to be careful to bound the size of the list and verify + * that we have not run off the end of it if it or its size has + * been corrupted. + */ + blkp = &snapblklist[1]; + lastblkp = &snapblklist[MAX(0, + MIN(snapblklist[0] + 1, size / sizeof(daddr_t)))]; + status = 1; + /* Check that the superblock is listed. */ + CHKBLKINLIST(lblkno(fs, fs->fs_sblockloc)); + /* + * Calculate where the summary information is located. + * Usually it is in the first cylinder group, but growfs + * may move it to the first cylinder group that it adds. + * + * Check all cylinder groups up to the summary information. + */ + csblkno = fragstoblks(fs, fs->fs_csaddr); + for (cg = 0; cg < fs->fs_ncg; cg++) { + if (fragstoblks(fs, cgtod(fs, cg)) > csblkno) + break; + CHKBLKINLIST(fragstoblks(fs, cgtod(fs, cg))); + } + /* Check the summary information block(s). */ + len = howmany(fs->fs_cssize, fs->fs_bsize); + for (loc = 0; loc < len; loc++) + CHKBLKINLIST(csblkno + loc); + /* Check the remaining cylinder groups.
*/ + for (; cg < fs->fs_ncg; cg++) + CHKBLKINLIST(fragstoblks(fs, cgtod(fs, cg))); +fail: + brelse(bp); + return (status); +} + +/* + * Return the block number associated with a specified inode lbn. + * Requested lbn is in id_parent. If found, block is returned in + * id_parent. + */ +static int +getlbnblkno(struct inodesc *idesc) +{ + + if (idesc->id_lbn < idesc->id_parent) + return (KEEPON); + idesc->id_parent = idesc->id_blkno; + return (STOP | FOUND); +} + /* * Open a device or file to be checked by fsck. */ diff --git a/sbin/fsck_ffs/suj.c b/sbin/fsck_ffs/suj.c --- a/sbin/fsck_ffs/suj.c +++ b/sbin/fsck_ffs/suj.c @@ -321,7 +321,7 @@ * To be certain we're not freeing a reallocated block we lookup * this block in the blk hash and see if there is an allocation * journal record that overlaps with any fragments in the block - * we're concerned with. If any fragments have ben reallocated + * we're concerned with. If any fragments have been reallocated * the block has already been freed and re-used for another purpose. */ mask = 0; @@ -378,6 +378,50 @@ return (0); } +/* + * Check to see if the requested block is available. + * We can just check in the cylinder-group maps as + * they will only have usable blocks in them. 
+ */ +static ufs2_daddr_t +suj_checkblkavail(blkno, frags) + ufs2_daddr_t blkno; + long frags; +{ + struct bufarea *cgbp; + struct cg *cgp; + ufs2_daddr_t j, k, baseblk; + long cg; + + cg = dtog(&sblock, blkno); + cgbp = cglookup(cg); + cgp = cgbp->b_un.b_cg; + if (!check_cgmagic(cg, cgbp, 0)) + return (-((cg + 1) * sblock.fs_fpg - sblock.fs_frag)); + baseblk = dtogd(&sblock, blkno); + for (j = 0; j <= sblock.fs_frag - frags; j++) { + if (!isset(cg_blksfree(cgp), baseblk + j)) + continue; + for (k = 1; k < frags; k++) + if (!isset(cg_blksfree(cgp), baseblk + j + k)) + break; + if (k < frags) { + j += k; + continue; + } + for (k = 0; k < frags; k++) + clrbit(cg_blksfree(cgp), baseblk + j + k); + n_blks += frags; + if (frags == sblock.fs_frag) + cgp->cg_cs.cs_nbfree--; + else + cgp->cg_cs.cs_nffree -= frags; + cgdirty(cgbp); + return ((cg * sblock.fs_fpg) + baseblk + j); + } + return (0); +} + /* * Clear an inode from the cg bitmap. If the inode was already clear return * 0 so the caller knows it does not have to check the inode contents. @@ -431,6 +475,12 @@ if (debug) printf("Freeing %d frags at blk %jd mask 0x%x\n", frags, bno, mask); + /* + * Check to see if the block needs to be claimed by a snapshot. + * If wanted, the snapshot references it. Otherwise we free it. + */ + if (snapblkfree(fs, bno, lfragtosize(fs, frags), 0, suj_checkblkavail)) + return; cg = dtog(fs, bno); sc = cg_lookup(cg); cgp = sc->sc_cgp; @@ -1264,6 +1314,7 @@ if (size > 0) err_suj("Partial truncation of ino %ju snapshot file\n", (uintmax_t)ino); + snapremove(ino); } lastlbn = lblkno(fs, blkroundup(fs, size)); for (i = lastlbn; i < UFS_NDADDR; i++) { @@ -1283,13 +1334,13 @@ /* If we're not freeing any in this indirect range skip it. */ if (lastlbn >= nextlbn) continue; - if (DIP(dp, di_ib[i]) == 0) - continue; - indir_trunc(ino, -lbn - i, DIP(dp, di_ib[i]), lastlbn, dp); - /* If we freed everything in this indirect free the indir. 
*/ - if (lastlbn > lbn) - continue; - blk_free(DIP(dp, di_ib[i]), 0, fs->fs_frag); + if ((bn = DIP(dp, di_ib[i])) == 0) + continue; + indir_trunc(ino, -lbn - i, bn, lastlbn, dp); + /* If we freed everything in this indirect free the indir. */ + if (lastlbn > lbn) + continue; + blk_free(bn, 0, fs->fs_frag); DIP_SET(dp, di_ib[i], 0); } /*