diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index d2627647360a..a1da0b05678a 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -1,3605 +1,3606 @@ /*- * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause) * * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef ufs2_daddr_t allocfcn_t(struct inode *ip, uint64_t cg, ufs2_daddr_t bpref, int size, int rsize); static ufs2_daddr_t ffs_alloccg(struct inode *, uint64_t, ufs2_daddr_t, int, int); static ufs2_daddr_t ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); static void ffs_blkfree_cg(struct ufsmount *, struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t, struct workhead *); #ifdef INVARIANTS static int ffs_checkfreeblk(struct inode *, ufs2_daddr_t, long); #endif static void ffs_checkcgintegrity(struct fs *, uint64_t, int); static ufs2_daddr_t ffs_clusteralloc(struct inode *, uint64_t, ufs2_daddr_t, int); static ino_t ffs_dirpref(struct inode *); static ufs2_daddr_t ffs_fragextend(struct inode *, uint64_t, ufs2_daddr_t, int, int); static ufs2_daddr_t ffs_hashalloc(struct inode *, uint64_t, ufs2_daddr_t, int, int, allocfcn_t *); static ufs2_daddr_t ffs_nodealloccg(struct inode *, uint64_t, ufs2_daddr_t, int, int); static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); static void ffs_ckhash_cg(struct buf *); /* * Allocate a block in the filesystem. * * The size of the requested block is given, which must be some * multiple of fs_fsize and <= fs_bsize. * A preference may be optionally specified. If a preference is given * the following hierarchy is used to allocate a block: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate a block in the same cylinder group. * 4) quadratically rehash into other cylinder groups, until an * available block is located. * If no block preference is given the following hierarchy is used * to allocate a block: * 1) allocate a block in the cylinder group that contains the * inode for the file. * 2) quadratically rehash into other cylinder groups, until an * available block is located. */ int ffs_alloc(struct inode *ip, ufs2_daddr_t lbn, ufs2_daddr_t bpref, int size, int flags, struct ucred *cred, ufs2_daddr_t *bnp) { struct fs *fs; struct ufsmount *ump; ufs2_daddr_t bno; uint64_t cg, reclaimed; int64_t delta; #ifdef QUOTA int error; #endif *bnp = 0; ump = ITOUMP(ip); fs = ump->um_fs; mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_alloc: bad size"); } if (cred == NOCRED) panic("ffs_alloc: missing credential"); #endif /* INVARIANTS */ reclaimed = 0; retry: #ifdef QUOTA UFS_UNLOCK(ump); error = chkdq(ip, btodb(size), cred, 0); if (error) return (error); UFS_LOCK(ump); #endif if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) goto nospace; if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) goto nospace; if (bpref >= fs->fs_size) bpref = 0; if (bpref == 0) cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); if (bno > 0) { delta = btodb(size); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) UFS_INODE_SET_FLAG(ip, IN_CHANGE); else UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); *bnp = bno; return (0); } nospace: #ifdef QUOTA UFS_UNLOCK(ump); /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(size), cred, FORCE); UFS_LOCK(ump); #endif if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { reclaimed = 1; softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); goto retry; } if (ffs_fsfail_cleanup_locked(ump, 0)) { UFS_UNLOCK(ump); return (ENXIO); } if (reclaimed > 0 && ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { UFS_UNLOCK(ump); ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); } else { UFS_UNLOCK(ump); } return (ENOSPC); } /* * Reallocate a fragment to a bigger size * * The number and size of the old block is given, and a preference * and new size is also specified. The allocator attempts to extend * the original block. Failing that, the regular block allocator is * invoked to get an appropriate block. */ int ffs_realloccg(struct inode *ip, ufs2_daddr_t lbprev, ufs2_daddr_t bprev, ufs2_daddr_t bpref, int osize, int nsize, int flags, struct ucred *cred, struct buf **bpp) { struct vnode *vp; struct fs *fs; struct buf *bp; struct ufsmount *ump; uint64_t cg, request, reclaimed; int error, gbflags; ufs2_daddr_t bno; int64_t delta; vp = ITOV(ip); ump = ITOUMP(ip); fs = ump->um_fs; bp = NULL; gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; #ifdef WITNESS gbflags |= IS_SNAPSHOT(ip) ? GB_NOWITNESS : 0; #endif mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_realloccg: allocation on suspended filesystem"); if ((uint64_t)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || (uint64_t)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { printf( "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, osize, nsize, fs->fs_fsmnt); panic("ffs_realloccg: bad size"); } if (cred == NOCRED) panic("ffs_realloccg: missing credential"); #endif /* INVARIANTS */ reclaimed = 0; retry: if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) { goto nospace; } if (bprev == 0) { printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev, fs->fs_fsmnt); panic("ffs_realloccg: bad bprev"); } UFS_UNLOCK(ump); /* * Allocate the extra space in the buffer. */ error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); if (error) { return (error); } if (bp->b_blkno == bp->b_lblkno) { if (lbprev >= UFS_NDADDR) panic("ffs_realloccg: lbprev out of range"); bp->b_blkno = fsbtodb(fs, bprev); } #ifdef QUOTA error = chkdq(ip, btodb(nsize - osize), cred, 0); if (error) { brelse(bp); return (error); } #endif /* * Check for extension in the existing location. */ *bpp = NULL; cg = dtog(fs, bprev); UFS_LOCK(ump); bno = ffs_fragextend(ip, cg, bprev, osize, nsize); if (bno) { if (bp->b_blkno != fsbtodb(fs, bno)) panic("ffs_realloccg: bad blockno"); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) UFS_INODE_SET_FLAG(ip, IN_CHANGE); else UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); allocbuf(bp, nsize); bp->b_flags |= B_DONE; vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; return (0); } /* * Allocate a new disk location. */ if (bpref >= fs->fs_size) bpref = 0; switch ((int)fs->fs_optim) { case FS_OPTSPACE: /* * Allocate an exact sized fragment. Although this makes * best use of space, we will waste time relocating it if * the file continues to grow. If the fragmentation is * less than half of the minimum free reserve, we choose * to begin optimizing for time. */ request = nsize; if (fs->fs_minfree <= 5 || fs->fs_cstotal.cs_nffree > (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) break; log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTTIME; break; case FS_OPTTIME: /* * At this point we have discovered a file that is trying to * grow a small fragment to a larger fragment. To save time, * we allocate a full sized block, then free the unused portion. * If the file continues to grow, the `ffs_fragextend' call * above will be able to grow it in place without further * copying. If aberrant programs cause disk fragmentation to * grow within 2% of the free reserve, we choose to begin * optimizing for space. */ request = fs->fs_bsize; if (fs->fs_cstotal.cs_nffree < (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) break; log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTSPACE; break; default: printf("dev = %s, optim = %ld, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt); panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(vp)) /* * The usual case is that a smaller fragment that * was just allocated has been replaced with a bigger * fragment or a full-size block. If it is marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the block was written * earlier, but very uncommon. If the block has never * been written, there is no need to send a BIO_DELETE * for it when it is freed. The gain from avoiding the * TRIMs for the common case of unwritten blocks far * exceeds the cost of the write amplification for the * uncommon case of failing to send a TRIM for a block * that had been written. */ ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, ip->i_number, vp->v_type, NULL, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) UFS_INODE_SET_FLAG(ip, IN_CHANGE); else UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); allocbuf(bp, nsize); bp->b_flags |= B_DONE; vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; return (0); } #ifdef QUOTA UFS_UNLOCK(ump); /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); UFS_LOCK(ump); #endif nospace: /* * no space available */ if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { reclaimed = 1; UFS_UNLOCK(ump); if (bp) { brelse(bp); bp = NULL; } UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); goto retry; } if (bp) brelse(bp); if (ffs_fsfail_cleanup_locked(ump, 0)) { UFS_UNLOCK(ump); return (ENXIO); } if (reclaimed > 0 && ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { UFS_UNLOCK(ump); ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); } else { UFS_UNLOCK(ump); } return (ENOSPC); } /* * Reallocate a sequence of blocks into a contiguous sequence of blocks. * * The vnode and an array of buffer pointers for a range of sequential * logical blocks to be made contiguous is given. The allocator attempts * to find a range of sequential blocks starting as close as possible * from the end of the allocation for the logical block immediately * preceding the current range. If successful, the physical block numbers * in the buffer pointers and in the inode are changed to reflect the new * allocation. If unsuccessful, the allocation is left unchanged. The * success in doing the reallocation is returned. Note that the error * return is not reflected back to the user. Rather the previous block * allocation will be used. */ SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "FFS filesystem"); static int doasyncfree = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, "do not force synchronous writes when blocks are reallocated"); static int doreallocblks = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "enable block reallocation"); static int dotrimcons = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RWTUN, &dotrimcons, 0, "enable BIO_DELETE / TRIM consolidation"); static int maxclustersearch = 10; SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, 0, "max number of cylinder group to search for contigous blocks"); #ifdef DIAGNOSTIC static int prtrealloc = 0; SYSCTL_INT(_debug, OID_AUTO, ffs_prtrealloc, CTLFLAG_RW, &prtrealloc, 0, "print out FFS filesystem block reallocation operations"); #endif int ffs_reallocblks( struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap) { struct ufsmount *ump; int error; /* * We used to skip reallocating the blocks of a file into a * contiguous sequence if the underlying flash device requested * BIO_DELETE notifications, because devices that benefit from * BIO_DELETE also benefit from not moving the data. However, * the destination for the data is usually moved before the data * is written to the initially allocated location, so we rarely * suffer the penalty of extra writes. With the addition of the * consolidation of contiguous blocks into single BIO_DELETE * operations, having fewer but larger contiguous blocks reduces * the number of (slow and expensive) BIO_DELETE operations. So * when doing BIO_DELETE consolidation, we do block reallocation. * * Skip if reallocblks has been disabled globally. */ ump = ap->a_vp->v_mount->mnt_data; if ((((ump->um_flags) & UM_CANDELETE) != 0 && dotrimcons == 0) || doreallocblks == 0) return (ENOSPC); /* * We can't wait in softdep prealloc as it may fsync and recurse * here. Instead we simply fail to reallocate blocks if this * rare condition arises. */ if (DOINGSUJ(ap->a_vp)) if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) return (ENOSPC); vn_seqc_write_begin(ap->a_vp); error = ump->um_fstype == UFS1 ? ffs_reallocblks_ufs1(ap) : ffs_reallocblks_ufs2(ap); vn_seqc_write_end(ap->a_vp); return (error); } static int ffs_reallocblks_ufs1( struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap) { struct fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp, *bp; ufs1_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; ufs_lbn_t start_lbn, end_lbn; ufs1_daddr_t soff, newblk, blkno; ufs2_daddr_t pref; struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; int i, cg, len, start_lvl, end_lvl, ssize; vp = ap->a_vp; ip = VTOI(vp); ump = ITOUMP(ip); fs = ump->um_fs; /* * If we are not tracking block clusters or if we have less than 4% * free blocks left, then do not attempt to cluster. Running with * less than 5% free block reserve is not recommended and those that * choose to do so do not expect to have good file layout. */ if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef INVARIANTS for (i = 0; i < len; i++) if (!ffs_checkfreeblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 1"); for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ffs_reallocblks: non-logical cluster"); blkno = buflist->bs_children[0]->b_blkno; ssize = fsbtodb(fs, fs->fs_frag); for (i = 1; i < len - 1; i++) if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) panic("ffs_reallocblks: non-physical cluster %d", i); #endif /* * If the cluster crosses the boundary for the first indirect * block, leave space for the indirect block. Indirect blocks * are initially laid out in a position after the last direct * block. Block reallocation would usually destroy locality by * moving the indirect block out of the way to make room for * data blocks if we didn't compensate here. We should also do * this for other indirect block boundaries, but it is only * important for the first one. */ if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) return (ENOSPC); /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. */ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. */ if (start_lvl == 0) { sbap = &ip->i_din1->di_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (ufs1_daddr_t *)sbp->b_data; soff = idp->in_off; } /* * If the block range spans two block maps, get the second map. */ ebap = NULL; if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef INVARIANTS if (start_lvl > 0 && start_ap[start_lvl - 1].in_lbn == idp->in_lbn) panic("ffs_reallocblk: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) goto fail; ebap = (ufs1_daddr_t *)ebp->b_data; } /* * Find the preferred location for the cluster. If we have not * previously failed at this endeavor, then follow our standard * preference calculation. If we have failed at it, then pick up * where we last ended our search. */ UFS_LOCK(ump); if (ip->i_nextclustercg == -1) pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); else pref = cgdata(fs, ip->i_nextclustercg); /* * Search the block map looking for an allocation of the desired size. * To avoid wasting too much time, we limit the number of cylinder * groups that we will search. */ cg = dtog(fs, pref); for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) break; cg += 1; if (cg >= fs->fs_ncg) cg = 0; } /* * If we have failed in our search, record where we gave up for * next time. Otherwise, fall back to our usual search citerion. */ if (newblk == 0) { ip->i_nextclustercg = cg; UFS_UNLOCK(ump); goto fail; } ip->i_nextclustercg = -1; /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. */ #ifdef DIAGNOSTIC if (prtrealloc) printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef INVARIANTS if (!ffs_checkfreeblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 2"); if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) panic("ffs_reallocblks: alloc mismatch"); #endif #ifdef DIAGNOSTIC if (prtrealloc) printf(" %d,", *bap); #endif if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_din1->di_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, blkno, *bap, fs->fs_bsize, fs->fs_bsize, buflist->bs_children[i]); else softdep_setup_allocindir_page(ip, start_lbn + i, i < ssize ? sbp : ebp, soff + i, blkno, *bap, buflist->bs_children[i]); } *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practise * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared. */ if (sbap != &ip->i_din1->di_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); if (!doasyncfree) ffs_update(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. */ #ifdef DIAGNOSTIC if (prtrealloc) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { bp = buflist->bs_children[i]; if (!DOINGSOFTDEP(vp)) /* * The usual case is that a set of N-contiguous blocks * that was just allocated has been replaced with a * set of N+1-contiguous blocks. If they are marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the blocks were written * earlier, but very uncommon. If the blocks have never * been written, there is no need to send a BIO_DELETE * for them when they are freed. The gain from avoiding * the TRIMs for the common case of unwritten blocks * far exceeds the cost of the write amplification for * the uncommon case of failing to send a TRIM for the * blocks that had been written. */ ffs_blkfree(ump, fs, ump->um_devvp, dbtofsb(fs, bp->b_blkno), fs->fs_bsize, ip->i_number, vp->v_type, NULL, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); bp->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkfreeblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DIAGNOSTIC if (prtrealloc) printf(" %d,", blkno); #endif } #ifdef DIAGNOSTIC if (prtrealloc) { prtrealloc--; printf("\n"); } #endif return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_din1->di_db[0]) brelse(sbp); return (ENOSPC); } static int ffs_reallocblks_ufs2( struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap) { struct fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp, *bp; ufs2_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; ufs_lbn_t start_lbn, end_lbn; ufs2_daddr_t soff, newblk, blkno, pref; struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; int i, cg, len, start_lvl, end_lvl, ssize; vp = ap->a_vp; ip = VTOI(vp); ump = ITOUMP(ip); fs = ump->um_fs; /* * If we are not tracking block clusters or if we have less than 4% * free blocks left, then do not attempt to cluster. Running with * less than 5% free block reserve is not recommended and those that * choose to do so do not expect to have good file layout. */ if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef INVARIANTS for (i = 0; i < len; i++) if (!ffs_checkfreeblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 1"); for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ffs_reallocblks: non-logical cluster"); blkno = buflist->bs_children[0]->b_blkno; ssize = fsbtodb(fs, fs->fs_frag); for (i = 1; i < len - 1; i++) if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) panic("ffs_reallocblks: non-physical cluster %d", i); #endif /* * If the cluster crosses the boundary for the first indirect * block, do not move anything in it. Indirect blocks are * usually initially laid out in a position between the data * blocks. Block reallocation would usually destroy locality by * moving the indirect block out of the way to make room for * data blocks if we didn't compensate here. We should also do * this for other indirect block boundaries, but it is only * important for the first one. */ if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) return (ENOSPC); /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. */ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. */ if (start_lvl == 0) { sbap = &ip->i_din2->di_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (ufs2_daddr_t *)sbp->b_data; soff = idp->in_off; } /* * If the block range spans two block maps, get the second map. */ ebap = NULL; if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef INVARIANTS if (start_lvl > 0 && start_ap[start_lvl - 1].in_lbn == idp->in_lbn) panic("ffs_reallocblk: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) goto fail; ebap = (ufs2_daddr_t *)ebp->b_data; } /* * Find the preferred location for the cluster. If we have not * previously failed at this endeavor, then follow our standard * preference calculation. If we have failed at it, then pick up * where we last ended our search. */ UFS_LOCK(ump); if (ip->i_nextclustercg == -1) pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); else pref = cgdata(fs, ip->i_nextclustercg); /* * Search the block map looking for an allocation of the desired size. * To avoid wasting too much time, we limit the number of cylinder * groups that we will search. */ cg = dtog(fs, pref); for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) break; cg += 1; if (cg >= fs->fs_ncg) cg = 0; } /* * If we have failed in our search, record where we gave up for * next time. Otherwise, fall back to our usual search citerion. */ if (newblk == 0) { ip->i_nextclustercg = cg; UFS_UNLOCK(ump); goto fail; } ip->i_nextclustercg = -1; /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. */ #ifdef DIAGNOSTIC if (prtrealloc) printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef INVARIANTS if (!ffs_checkfreeblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 2"); if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) panic("ffs_reallocblks: alloc mismatch"); #endif #ifdef DIAGNOSTIC if (prtrealloc) printf(" %jd,", (intmax_t)*bap); #endif if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_din2->di_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, blkno, *bap, fs->fs_bsize, fs->fs_bsize, buflist->bs_children[i]); else softdep_setup_allocindir_page(ip, start_lbn + i, i < ssize ? sbp : ebp, soff + i, blkno, *bap, buflist->bs_children[i]); } *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practise * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared. */ if (sbap != &ip->i_din2->di_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); if (!doasyncfree) ffs_update(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. */ #ifdef DIAGNOSTIC if (prtrealloc) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { bp = buflist->bs_children[i]; if (!DOINGSOFTDEP(vp)) /* * The usual case is that a set of N-contiguous blocks * that was just allocated has been replaced with a * set of N+1-contiguous blocks. If they are marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the blocks were written * earlier, but very uncommon. If the blocks have never * been written, there is no need to send a BIO_DELETE * for them when they are freed. The gain from avoiding * the TRIMs for the common case of unwritten blocks * far exceeds the cost of the write amplification for * the uncommon case of failing to send a TRIM for the * blocks that had been written. */ ffs_blkfree(ump, fs, ump->um_devvp, dbtofsb(fs, bp->b_blkno), fs->fs_bsize, ip->i_number, vp->v_type, NULL, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); bp->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkfreeblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DIAGNOSTIC if (prtrealloc) printf(" %jd,", (intmax_t)blkno); #endif } #ifdef DIAGNOSTIC if (prtrealloc) { prtrealloc--; printf("\n"); } #endif return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_din2->di_db[0]) brelse(sbp); return (ENOSPC); } /* * Allocate an inode in the filesystem. * * If allocating a directory, use ffs_dirpref to select the inode. * If allocating in a directory, the following hierarchy is followed: * 1) allocate the preferred inode. * 2) allocate an inode in the same cylinder group. * 3) quadratically rehash into other cylinder groups, until an * available inode is located. * If no inode preference is given the following hierarchy is used * to allocate an inode: * 1) allocate an inode in cylinder group 0. * 2) quadratically rehash into other cylinder groups, until an * available inode is located. */ int ffs_valloc(struct vnode *pvp, int mode, struct ucred *cred, struct vnode **vpp) { struct inode *pip; struct fs *fs; struct inode *ip; struct timespec ts; struct ufsmount *ump; ino_t ino, ipref; uint64_t cg; int error, reclaimed; *vpp = NULL; pip = VTOI(pvp); ump = ITOUMP(pip); fs = ump->um_fs; UFS_LOCK(ump); reclaimed = 0; retry: if (fs->fs_cstotal.cs_nifree == 0) goto noinodes; if ((mode & IFMT) == IFDIR) ipref = ffs_dirpref(pip); else ipref = pip->i_number; if (ipref >= fs->fs_ncg * fs->fs_ipg) ipref = 0; cg = ino_to_cg(fs, ipref); /* * Track number of dirs created one after another * in a same cg without intervening by files. */ if ((mode & IFMT) == IFDIR) { if (fs->fs_contigdirs[cg] < 255) fs->fs_contigdirs[cg]++; } else { if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, (allocfcn_t *)ffs_nodealloccg); if (ino == 0) goto noinodes; /* * Get rid of the cached old vnode, force allocation of a new vnode * for this inode. If this fails, release the allocated ino and * return the error. */ if ((error = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, FFSV_FORCEINSMQ | FFSV_REPLACE | FFSV_NEWINODE)) != 0) { ffs_vfree(pvp, ino, mode); return (error); } /* * We got an inode, so check mode and panic if it is already allocated. */ ip = VTOI(*vpp); if (ip->i_mode) { printf("mode = 0%o, inum = %ju, fs = %s\n", ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); panic("ffs_valloc: dup alloc"); } if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ printf("free inode %s/%ju had %ld blocks\n", fs->fs_fsmnt, (intmax_t)ino, (long)DIP(ip, i_blocks)); DIP_SET(ip, i_blocks, 0); } ip->i_flags = 0; DIP_SET(ip, i_flags, 0); if ((mode & IFMT) == IFDIR) DIP_SET(ip, i_dirdepth, DIP(pip, i_dirdepth) + 1); /* * Set up a new generation number for this inode. */ while (ip->i_gen == 0 || ++ip->i_gen == 0) ip->i_gen = arc4random(); DIP_SET(ip, i_gen, ip->i_gen); if (fs->fs_magic == FS_UFS2_MAGIC) { vfs_timestamp(&ts); ip->i_din2->di_birthtime = ts.tv_sec; ip->i_din2->di_birthnsec = ts.tv_nsec; } ip->i_flag = 0; (*vpp)->v_vflag = 0; (*vpp)->v_type = VNON; if (fs->fs_magic == FS_UFS2_MAGIC) { (*vpp)->v_op = &ffs_vnodeops2; UFS_INODE_SET_FLAG(ip, IN_UFS2); } else { (*vpp)->v_op = &ffs_vnodeops1; } return (0); noinodes: if (reclaimed == 0) { reclaimed = 1; softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); goto retry; } if (ffs_fsfail_cleanup_locked(ump, 0)) { UFS_UNLOCK(ump); return (ENXIO); } if (ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { UFS_UNLOCK(ump); ffs_fserr(fs, pip->i_number, "out of inodes"); uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); } else { UFS_UNLOCK(ump); } return (ENOSPC); } /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ static ino_t ffs_dirpref(struct inode *pip) { struct fs *fs; int cg, prefcg, curcg, dirsize, cgsize; int depth, range, start, end, numdirs, power, numerator, denominator; uint64_t avgifree, avgbfree, avgndir, curdirsize; uint64_t minifree, minbfree, maxndir; uint64_t maxcontigdirs; mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); fs = ITOFS(pip); avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; /* * Select a preferred cylinder group to place a new directory. * If we are near the root of the filesystem we aim to spread * them out as much as possible. As we descend deeper from the * root we cluster them closer together around their parent as * we expect them to be more closely interactive. Higher-level * directories like usr/src/sys and usr/src/bin should be * separated while the directories in these areas are more * likely to be accessed together so should be closer. * * We pick a range of cylinder groups around the cylinder group * of the directory in which we are being created. The size of * the range for our search is based on our depth from the root * of our filesystem. We then probe that range based on how many * directories are already present. The first new directory is at * 1/2 (middle) of the range; the second is in the first 1/4 of the * range, then at 3/4, 1/8, 3/8, 5/8, 7/8, 1/16, 3/16, 5/16, etc. */ depth = DIP(pip, i_dirdepth); range = fs->fs_ncg / (1 << depth); curcg = ino_to_cg(fs, pip->i_number); start = curcg - (range / 2); if (start < 0) start += fs->fs_ncg; end = curcg + (range / 2); if (end >= fs->fs_ncg) end -= fs->fs_ncg; numdirs = pip->i_effnlink - 1; power = fls(numdirs); numerator = (numdirs & ~(1 << (power - 1))) * 2 + 1; denominator = 1 << power; prefcg = (curcg - (range / 2) + (range * numerator / denominator)); if (prefcg < 0) prefcg += fs->fs_ncg; if (prefcg >= fs->fs_ncg) prefcg -= fs->fs_ncg; /* * If this filesystem is not tracking directory depths, * revert to the old algorithm. */ if (depth == 0 && pip->i_number != UFS_ROOTINO) prefcg = curcg; /* * Count various limits which used for * optimal allocation of a directory inode. */ maxndir = min(avgndir + (1 << depth), fs->fs_ipg); minifree = avgifree - avgifree / 4; if (minifree < 1) minifree = 1; minbfree = avgbfree - avgbfree / 4; if (minbfree < 1) minbfree = 1; cgsize = fs->fs_fsize * fs->fs_fpg; dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; if (dirsize < curdirsize) dirsize = curdirsize; if (dirsize <= 0) maxcontigdirs = 0; /* dirsize overflowed */ else maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); if (fs->fs_avgfpdir > 0) maxcontigdirs = min(maxcontigdirs, fs->fs_ipg / fs->fs_avgfpdir); if (maxcontigdirs == 0) maxcontigdirs = 1; /* * Limit number of dirs in one cg and reserve space for * regular files, but only if we have no deficit in * inodes or space. * * We are trying to find a suitable cylinder group nearby * our preferred cylinder group to place a new directory. * We scan from our preferred cylinder group forward looking * for a cylinder group that meets our criterion. If we get * to the final cylinder group and do not find anything, * we start scanning forwards from the beginning of the * filesystem. While it might seem sensible to start scanning * backwards or even to alternate looking forward and backward, * this approach fails badly when the filesystem is nearly full. * Specifically, we first search all the areas that have no space * and finally try the one preceding that. We repeat this on * every request and in the case of the final block end up * searching the entire filesystem. By jumping to the front * of the filesystem, our future forward searches always look * in new cylinder groups so finds every possible block after * one pass over the filesystem. */ for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } /* * This is a backstop when we have deficit in space. */ for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) return ((ino_t)(fs->fs_ipg * cg)); for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) break; return ((ino_t)(fs->fs_ipg * cg)); } /* * Select the desired position for the next block in a file. The file is * logically divided into sections. The first section is composed of the * direct blocks and the next fs_maxbpg blocks. Each additional section * contains fs_maxbpg blocks. * * If no blocks have been allocated in the first section, the policy is to * request a block in the same cylinder group as the inode that describes * the file. The first indirect is allocated immediately following the last * direct block and the data blocks for the first indirect immediately * follow it. * * If no blocks have been allocated in any other section, the indirect * block(s) are allocated in the same cylinder group as its inode in an * area reserved immediately following the inode blocks. The policy for * the data blocks is to place them in a cylinder group with a greater than * average number of free blocks. An appropriate cylinder group is found * by using a rotor that sweeps the cylinder groups. When a new group of * blocks is needed, the sweep begins in the cylinder group following the * cylinder group from which the previous allocation was made. The sweep * continues until a cylinder group with greater than the average number * of free blocks is found. If the allocation is for the first block in an * indirect block or the previous block is a hole, then the information on * the previous allocation is unavailable; here a best guess is made based * on the logical block number being allocated. * * If a section is already partially allocated, the policy is to * allocate blocks contiguously within the section if possible. */ ufs2_daddr_t ffs_blkpref_ufs1(struct inode *ip, ufs_lbn_t lbn, int indx, ufs1_daddr_t *bap) { struct fs *fs; uint64_t cg, inocg; uint64_t avgbfree, startcg; ufs2_daddr_t pref, prevbn; KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); fs = ITOFS(ip); /* * Allocation of indirect blocks is indicated by passing negative * values in indx: -1 for single indirect, -2 for double indirect, * -3 for triple indirect. As noted below, we attempt to allocate * the first indirect inline with the file data. For all later * indirect blocks, the data is often allocated in other cylinder * groups. However to speed random file access and to speed up * fsck, the filesystem reserves the first fs_metaspace blocks * (typically half of fs_minfree) of the data area of each cylinder * group to hold these later indirect blocks. */ inocg = ino_to_cg(fs, ip->i_number); if (indx < 0) { /* * Our preference for indirect blocks is the zone at the * beginning of the inode's cylinder group data area that * we try to reserve for indirect blocks. */ pref = cgmeta(fs, inocg); /* * If we are allocating the first indirect block, try to * place it immediately following the last direct block. */ if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && ip->i_din1->di_db[UFS_NDADDR - 1] != 0) pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag; return (pref); } /* * If we are allocating the first data block in the first indirect * block and the indirect has been allocated in the data block area, * try to place it immediately following the indirect block. */ if (lbn == UFS_NDADDR) { pref = ip->i_din1->di_ib[0]; if (pref != 0 && pref >= cgdata(fs, inocg) && pref < cgbase(fs, inocg + 1)) return (pref + fs->fs_frag); } /* * If we are at the beginning of a file, or we have already allocated * the maximum number of blocks per cylinder group, or we do not * have a block allocated immediately preceding us, then we need * to decide where to start allocating new blocks. */ if (indx == 0) { prevbn = 0; } else { prevbn = bap[indx - 1]; if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, fs->fs_bsize) != 0) prevbn = 0; } if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { /* * If we are allocating a directory data block, we want * to place it in the metadata area. */ if ((ip->i_mode & IFMT) == IFDIR) return (cgmeta(fs, inocg)); /* * Until we fill all the direct and all the first indirect's * blocks, we try to allocate in the data area of the inode's * cylinder group. */ if (lbn < UFS_NDADDR + NINDIR(fs)) return (cgdata(fs, inocg)); /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || prevbn == 0) startcg = inocg + lbn / fs->fs_maxbpg; else startcg = dtog(fs, prevbn) + 1; startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } for (cg = 0; cg <= startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } return (0); } /* * Otherwise, we just always try to lay things out contiguously. */ return (prevbn + fs->fs_frag); } /* * Same as above, but for UFS2 */ ufs2_daddr_t ffs_blkpref_ufs2(struct inode *ip, ufs_lbn_t lbn, int indx, ufs2_daddr_t *bap) { struct fs *fs; uint64_t cg, inocg; uint64_t avgbfree, startcg; ufs2_daddr_t pref, prevbn; KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); fs = ITOFS(ip); /* * Allocation of indirect blocks is indicated by passing negative * values in indx: -1 for single indirect, -2 for double indirect, * -3 for triple indirect. As noted below, we attempt to allocate * the first indirect inline with the file data. For all later * indirect blocks, the data is often allocated in other cylinder * groups. However to speed random file access and to speed up * fsck, the filesystem reserves the first fs_metaspace blocks * (typically half of fs_minfree) of the data area of each cylinder * group to hold these later indirect blocks. */ inocg = ino_to_cg(fs, ip->i_number); if (indx < 0) { /* * Our preference for indirect blocks is the zone at the * beginning of the inode's cylinder group data area that * we try to reserve for indirect blocks. */ pref = cgmeta(fs, inocg); /* * If we are allocating the first indirect block, try to * place it immediately following the last direct block. */ if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && ip->i_din2->di_db[UFS_NDADDR - 1] != 0) pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag; return (pref); } /* * If we are allocating the first data block in the first indirect * block and the indirect has been allocated in the data block area, * try to place it immediately following the indirect block. */ if (lbn == UFS_NDADDR) { pref = ip->i_din2->di_ib[0]; if (pref != 0 && pref >= cgdata(fs, inocg) && pref < cgbase(fs, inocg + 1)) return (pref + fs->fs_frag); } /* * If we are at the beginning of a file, or we have already allocated * the maximum number of blocks per cylinder group, or we do not * have a block allocated immediately preceding us, then we need * to decide where to start allocating new blocks. */ if (indx == 0) { prevbn = 0; } else { prevbn = bap[indx - 1]; if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, fs->fs_bsize) != 0) prevbn = 0; } if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { /* * If we are allocating a directory data block, we want * to place it in the metadata area. */ if ((ip->i_mode & IFMT) == IFDIR) return (cgmeta(fs, inocg)); /* * Until we fill all the direct and all the first indirect's * blocks, we try to allocate in the data area of the inode's * cylinder group. */ if (lbn < UFS_NDADDR + NINDIR(fs)) return (cgdata(fs, inocg)); /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || prevbn == 0) startcg = inocg + lbn / fs->fs_maxbpg; else startcg = dtog(fs, prevbn) + 1; startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } for (cg = 0; cg <= startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } return (0); } /* * Otherwise, we just always try to lay things out contiguously. */ return (prevbn + fs->fs_frag); } /* * Implement the cylinder overflow algorithm. * * The policy implemented by this algorithm is: * 1) allocate the block in its requested cylinder group. * 2) quadratically rehash on the cylinder group number. * 3) brute force search for a free block. * * Must be called with the UFS lock held. Will release the lock on success * and return with it held on failure. */ /*VARARGS5*/ static ufs2_daddr_t ffs_hashalloc(struct inode *ip, uint64_t cg, ufs2_daddr_t pref, int size, /* Search size for data blocks, mode for inodes */ int rsize, /* Real allocated size. */ allocfcn_t *allocator) { struct fs *fs; ufs2_daddr_t result; uint64_t i, icg = cg; mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); #ifdef INVARIANTS if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_hashalloc: allocation on suspended filesystem"); #endif fs = ITOFS(ip); /* * 1: preferred cylinder group */ result = (*allocator)(ip, cg, pref, size, rsize); if (result) return (result); /* * 2: quadratic rehash */ for (i = 1; i < fs->fs_ncg; i *= 2) { cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); } /* * 3: brute force search * Note that we start at i == 2, since 0 was checked initially, * and 1 is always checked in the quadratic rehash. */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); cg++; if (cg == fs->fs_ncg) cg = 0; } return (0); } /* * Determine whether a fragment can be extended. * * Check to see if the necessary fragments are available, and * if they are, allocate them. */ static ufs2_daddr_t ffs_fragextend(struct inode *ip, uint64_t cg, ufs2_daddr_t bprev, int osize, int nsize) { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; int nffree; long bno; int frags, bbase; int i, error; uint8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) return (0); frags = numfrags(fs, nsize); bbase = fragnum(fs, bprev); if (bbase > fragnum(fs, (bprev + frags - 1))) { /* cannot extend across a block boundary */ return (0); } UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { ffs_checkcgintegrity(fs, cg, error); goto fail; } bno = dtogd(fs, bprev); blksfree = cg_blksfree(cgp); for (i = numfrags(fs, osize); i < frags; i++) if (isclr(blksfree, bno + i)) goto fail; /* * the current fragment can be extended * deduct the count on fragment being extended into * increase the count on the remaining fragment (if any) * allocate the extended piece */ for (i = frags; i < fs->fs_frag - bbase; i++) if (isclr(blksfree, bno + i)) break; cgp->cg_frsum[i - numfrags(fs, osize)]--; if (i != frags) cgp->cg_frsum[i - frags]++; for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) { clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree--; nffree++; } UFS_LOCK(ump); fs->fs_cstotal.cs_nffree -= nffree; fs->fs_cs(fs, cg).cs_nffree -= nffree; fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, frags, numfrags(fs, osize)); bdwrite(bp); return (bprev); fail: brelse(bp); UFS_LOCK(ump); return (0); } /* * Determine whether a block can be allocated. * * Check to see if a block of the appropriate size is available, * and if it is, allocate it. */ static ufs2_daddr_t ffs_alloccg(struct inode *ip, uint64_t cg, ufs2_daddr_t bpref, int size, int rsize) { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; ufs1_daddr_t bno; ufs2_daddr_t blkno; int i, allocsiz, error, frags; uint8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) return (0); UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0 || (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { ffs_checkcgintegrity(fs, cg, error); goto fail; } if (size == fs->fs_bsize) { UFS_LOCK(ump); blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } /* * check to see if any fragments are already available * allocsiz is the size which will be allocated, hacking * it down to a smaller size if necessary */ blksfree = cg_blksfree(cgp); frags = numfrags(fs, size); for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) if (cgp->cg_frsum[allocsiz] != 0) break; if (allocsiz == fs->fs_frag) { /* * no fragments were available, so a block will be * allocated, and hacked up */ if (cgp->cg_cs.cs_nbfree == 0) goto fail; UFS_LOCK(ump); blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } KASSERT(size == rsize, ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); if (bno < 0) goto fail; for (i = 0; i < frags; i++) clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree -= frags; cgp->cg_frsum[allocsiz]--; if (frags != allocsiz) cgp->cg_frsum[allocsiz - frags]++; UFS_LOCK(ump); fs->fs_cstotal.cs_nffree -= frags; fs->fs_cs(fs, cg).cs_nffree -= frags; fs->fs_fmod = 1; blkno = cgbase(fs, cg) + bno; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); bdwrite(bp); return (blkno); fail: brelse(bp); UFS_LOCK(ump); return (0); } /* * Allocate a block in a cylinder group. * * This algorithm implements the following policy: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate the next available block on the block rotor for the * specified cylinder group. * Note that this routine only allocates fs_bsize blocks; these * blocks may be fragmented by the routine that allocates them. */ static ufs2_daddr_t ffs_alloccgblk(struct inode *ip, struct buf *bp, ufs2_daddr_t bpref, int size) { struct fs *fs; struct cg *cgp; struct ufsmount *ump; ufs1_daddr_t bno; ufs2_daddr_t blkno; uint8_t *blksfree; int i, cgbpref; ump = ITOUMP(ip); fs = ump->um_fs; mtx_assert(UFS_MTX(ump), MA_OWNED); cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); if (bpref == 0) { bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag; } else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) { /* map bpref to correct zone in this cg */ if (bpref < cgdata(fs, cgbpref)) bpref = cgmeta(fs, cgp->cg_cgx); else bpref = cgdata(fs, cgp->cg_cgx); } /* * if the requested block is available, use it */ bno = dtogd(fs, blknum(fs, bpref)); if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) goto gotit; /* * Take the next available block in this cylinder group. */ bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); if (bno < 0) return (0); /* Update cg_rotor only if allocated from the data zone */ if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx))) cgp->cg_rotor = bno; gotit: blkno = fragstoblks(fs, bno); ffs_clrblock(fs, blksfree, (long)blkno); ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; fs->fs_fmod = 1; blkno = cgbase(fs, cgp->cg_cgx) + bno; /* * If the caller didn't want the whole block free the frags here. */ size = numfrags(fs, size); if (size != fs->fs_frag) { bno = dtogd(fs, blkno); for (i = size; i < fs->fs_frag; i++) setbit(blksfree, bno + i); i = fs->fs_frag - size; cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; fs->fs_fmod = 1; cgp->cg_frsum[i]++; } /* XXX Fixme. */ UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); UFS_LOCK(ump); return (blkno); } /* * Determine whether a cluster can be allocated. * * We do not currently check for optimal rotational layout if there * are multiple choices in the same cylinder group. Instead we just * take the first one that we find following bpref. */ static ufs2_daddr_t ffs_clusteralloc(struct inode *ip, uint64_t cg, ufs2_daddr_t bpref, int len) { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; int i, run, bit, map, got, error; ufs2_daddr_t bno; uint8_t *mapp; int32_t *lp; uint8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_maxcluster[cg] < len) return (0); UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { ffs_checkcgintegrity(fs, cg, error); UFS_LOCK(ump); return (0); } /* * Check to see if a cluster of the needed size (or bigger) is * available in this cylinder group. */ lp = &cg_clustersum(cgp)[len]; for (i = len; i <= fs->fs_contigsumsize; i++) if (*lp++ > 0) break; if (i > fs->fs_contigsumsize) { /* * This is the first time looking for a cluster in this * cylinder group. Update the cluster summary information * to reflect the true maximum sized cluster so that * future cluster allocation requests can avoid reading * the cylinder group map only to find no clusters. */ lp = &cg_clustersum(cgp)[len - 1]; for (i = len - 1; i > 0; i--) if (*lp-- > 0) break; UFS_LOCK(ump); fs->fs_maxcluster[cg] = i; brelse(bp); return (0); } /* * Search the cluster map to find a big enough cluster. * We take the first one that we find, even if it is larger * than we need as we prefer to get one close to the previous * block allocation. We do not search before the current * preference point as we do not want to allocate a block * that is allocated before the previous one (as we will * then have to wait for another pass of the elevator * algorithm before it will be read). We prefer to fail and * be recalled to try an allocation in the next cylinder group. */ if (dtog(fs, bpref) != cg) bpref = cgdata(fs, cg); else bpref = blknum(fs, bpref); bpref = fragstoblks(fs, dtogd(fs, bpref)); mapp = &cg_clustersfree(cgp)[bpref / NBBY]; map = *mapp++; bit = 1 << (bpref % NBBY); for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { if ((map & bit) == 0) { run = 0; } else { run++; if (run == len) break; } if ((got & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } if (got >= cgp->cg_nclusterblks) { UFS_LOCK(ump); brelse(bp); return (0); } /* * Allocate the cluster that we have found. */ blksfree = cg_blksfree(cgp); for (i = 1; i <= len; i++) if (!ffs_isblock(fs, blksfree, got - run + i)) panic("ffs_clusteralloc: map mismatch"); bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1); if (dtog(fs, bno) != cg) panic("ffs_clusteralloc: allocated out of group"); len = blkstofrags(fs, len); UFS_LOCK(ump); for (i = 0; i < len; i += fs->fs_frag) if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) panic("ffs_clusteralloc: lost block"); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (bno); } static inline struct buf * getinobuf(struct inode *ip, uint64_t cg, uint32_t cginoblk, int gbflags) { struct fs *fs; fs = ITOFS(ip); return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, gbflags)); } /* * Synchronous inode initialization is needed only when barrier writes do not * work as advertised, and will impose a heavy cost on file creation in a newly * created filesystem. */ static int doasyncinodeinit = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN, &doasyncinodeinit, 0, "Perform inode block initialization using asynchronous writes"); /* * Determine whether an inode can be allocated. * * Check to see if an inode is available, and if it is, * allocate it using the following policy: * 1) allocate the requested inode. * 2) allocate the next available inode after the requested * inode in the specified cylinder group. */ static ufs2_daddr_t ffs_nodealloccg(struct inode *ip, uint64_t cg, ufs2_daddr_t ipref, int mode, int unused) { struct fs *fs; struct cg *cgp; struct buf *bp, *ibp; struct ufsmount *ump; uint8_t *inosused, *loc; struct ufs2_dinode *dp2; int error, start, len, i; uint32_t old_initediblk; ump = ITOUMP(ip); fs = ump->um_fs; check_nifree: if (fs->fs_cs(fs, cg).cs_nifree == 0) return (0); UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { ffs_checkcgintegrity(fs, cg, error); UFS_LOCK(ump); return (0); } restart: if (cgp->cg_cs.cs_nifree == 0) { brelse(bp); UFS_LOCK(ump); return (0); } inosused = cg_inosused(cgp); if (ipref) { ipref %= fs->fs_ipg; if (isclr(inosused, ipref)) goto gotit; } start = cgp->cg_irotor / NBBY; len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); loc = memcchr(&inosused[start], 0xff, len); if (loc == NULL) { len = start + 1; start = 0; loc = memcchr(&inosused[start], 0xff, len); if (loc == NULL) { printf("cg = %ju, irotor = %ld, fs = %s\n", (intmax_t)cg, (long)cgp->cg_irotor, fs->fs_fsmnt); panic("ffs_nodealloccg: map corrupted"); /* NOTREACHED */ } } ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; gotit: /* * Check to see if we need to initialize more inodes. */ if (fs->fs_magic == FS_UFS2_MAGIC && ipref + INOPB(fs) > cgp->cg_initediblk && cgp->cg_initediblk < cgp->cg_niblk) { old_initediblk = cgp->cg_initediblk; /* * Free the cylinder group lock before writing the * initialized inode block. Entering the * babarrierwrite() with the cylinder group lock * causes lock order violation between the lock and * snaplk. * * Another thread can decide to initialize the same * inode block, but whichever thread first gets the * cylinder group lock after writing the newly * allocated inode block will update it and the other * will realize that it has lost and leave the * cylinder group unchanged. */ ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); brelse(bp); if (ibp == NULL) { /* * The inode block buffer is already owned by * another thread, which must initialize it. * Wait on the buffer to allow another thread * to finish the updates, with dropped cg * buffer lock, then retry. */ ibp = getinobuf(ip, cg, old_initediblk, 0); brelse(ibp); UFS_LOCK(ump); goto check_nifree; } bzero(ibp->b_data, (int)fs->fs_bsize); dp2 = (struct ufs2_dinode *)(ibp->b_data); for (i = 0; i < INOPB(fs); i++) { while (dp2->di_gen == 0) dp2->di_gen = arc4random(); dp2++; } /* * Rather than adding a soft updates dependency to ensure * that the new inode block is written before it is claimed * by the cylinder group map, we just do a barrier write * here. The barrier write will ensure that the inode block * gets written before the updated cylinder group map can be * written. The barrier write should only slow down bulk * loading of newly created filesystems. */ if (doasyncinodeinit) babarrierwrite(ibp); else bwrite(ibp); /* * After the inode block is written, try to update the * cg initediblk pointer. If another thread beat us * to it, then leave it unchanged as the other thread * has already set it correctly. */ error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp); UFS_LOCK(ump); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (error != 0) return (error); if (cgp->cg_initediblk == old_initediblk) cgp->cg_initediblk += INOPB(fs); goto restart; } cgp->cg_irotor = ipref; UFS_LOCK(ump); ACTIVECLEAR(fs, cg); setbit(inosused, ipref); cgp->cg_cs.cs_nifree--; fs->fs_cstotal.cs_nifree--; fs->fs_cs(fs, cg).cs_nifree--; fs->fs_fmod = 1; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir++; fs->fs_cstotal.cs_ndir++; fs->fs_cs(fs, cg).cs_ndir++; } UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); bdwrite(bp); return ((ino_t)(cg * fs->fs_ipg + ipref)); } /* * Free a block or fragment. * * The specified block or fragment is placed back in the * free map. If a fragment is deallocated, a possible * block reassembly is checked. */ static void ffs_blkfree_cg(struct ufsmount *ump, struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno, long size, ino_t inum, struct workhead *dephd) { struct mount *mp; struct cg *cgp; struct buf *bp; daddr_t dbn; ufs1_daddr_t fragno, cgbno; int i, blk, frags, bbase, error; uint64_t cg; uint8_t *blksfree; struct cdev *dev; cg = dtog(fs, bno); if (devvp->v_type == VREG) { /* devvp is a snapshot */ MPASS(devvp->v_mount->mnt_data == ump); dev = ump->um_devvp->v_rdev; } else if (devvp->v_type == VCHR) { /* * devvp is a normal disk device * XXXKIB: devvp is not locked there, v_rdev access depends on * busy mount, which prevents mntfs devvp from reclamation. */ dev = devvp->v_rdev; } else return; #ifdef INVARIANTS if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0 || fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n", devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_blkfree_cg: invalid size"); } #endif if ((uint64_t)bno >= fs->fs_size) { printf("bad block %jd, ino %ju\n", (intmax_t)bno, (intmax_t)inum); ffs_fserr(fs, inum, "bad block"); return; } if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { if (!MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) return; /* * Would like to just downgrade to read-only. Until that * capability is available, just toss the cylinder group * update and mark the filesystem as needing to run fsck. */ fs->fs_flags |= FS_NEEDSFSCK; if (devvp->v_type == VREG) dbn = fragstoblks(fs, cgtod(fs, cg)); else dbn = fsbtodb(fs, cgtod(fs, cg)); error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); KASSERT(error == 0, ("getblkx failed")); softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, numfrags(fs, size), dephd, true); bp->b_flags |= B_RELBUF | B_NOCACHE; bp->b_flags &= ~B_CACHE; bawrite(bp); return; } cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp); UFS_LOCK(ump); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); if (!ffs_isfreeblock(fs, blksfree, fragno)) { if (devvp->v_type == VREG) { UFS_UNLOCK(ump); /* devvp is a snapshot */ brelse(bp); return; } printf("dev = %s, block = %jd, fs = %s\n", devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); panic("ffs_blkfree_cg: freeing free block"); } ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } else { bbase = cgbno - fragnum(fs, cgbno); /* * decrement the counts associated with the old frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* * deallocate the fragment */ frags = numfrags(fs, size); for (i = 0; i < frags; i++) { if (isset(blksfree, cgbno + i)) { printf("dev = %s, block = %jd, fs = %s\n", devtoname(dev), (intmax_t)(bno + i), fs->fs_fsmnt); panic("ffs_blkfree_cg: freeing free frag"); } setbit(blksfree, cgbno + i); } cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; /* * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } } fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); mp = UFSTOVFS(ump); if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR) softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, numfrags(fs, size), dephd, false); bdwrite(bp); } /* * Structures and routines associated with trim management. * * The following requests are passed to trim_lookup to indicate * the actions that should be taken. */ #define NEW 1 /* if found, error else allocate and hash it */ #define OLD 2 /* if not found, error, else return it */ #define REPLACE 3 /* if not found, error else unhash and reallocate it */ #define DONE 4 /* if not found, error else unhash and return it */ #define SINGLE 5 /* don't look up, just allocate it and don't hash it */ MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures"); #define TRIMLIST_HASH(ump, key) \ (&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize]) /* * These structures describe each of the block free requests aggregated * together to make up a trim request. */ struct trim_blkreq { TAILQ_ENTRY(trim_blkreq) blkreqlist; ufs2_daddr_t bno; long size; struct workhead *pdephd; struct workhead dephd; }; /* * Description of a trim request. */ struct ffs_blkfree_trim_params { TAILQ_HEAD(, trim_blkreq) blklist; LIST_ENTRY(ffs_blkfree_trim_params) hashlist; struct task task; struct ufsmount *ump; struct vnode *devvp; ino_t inum; ufs2_daddr_t bno; long size; long key; }; static void ffs_blkfree_trim_completed(struct buf *); static void ffs_blkfree_trim_task(void *ctx, int pending __unused); static struct ffs_blkfree_trim_params *trim_lookup(struct ufsmount *, struct vnode *, ufs2_daddr_t, long, ino_t, uint64_t, int); static void ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *); /* * Called on trim completion to start a task to free the associated block(s). */ static void ffs_blkfree_trim_completed(struct buf *bp) { struct ffs_blkfree_trim_params *tp; tp = bp->b_fsprivate1; free(bp, M_TRIM); TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); } /* * Trim completion task that free associated block(s). */ static void ffs_blkfree_trim_task(void *ctx, int pending) { struct ffs_blkfree_trim_params *tp; struct trim_blkreq *blkelm; struct ufsmount *ump; tp = ctx; ump = tp->ump; while ((blkelm = TAILQ_FIRST(&tp->blklist)) != NULL) { ffs_blkfree_cg(ump, ump->um_fs, tp->devvp, blkelm->bno, blkelm->size, tp->inum, blkelm->pdephd); TAILQ_REMOVE(&tp->blklist, blkelm, blkreqlist); free(blkelm, M_TRIM); } vn_finished_secondary_write(UFSTOVFS(ump)); UFS_LOCK(ump); ump->um_trim_inflight -= 1; ump->um_trim_inflight_blks -= numfrags(ump->um_fs, tp->size); UFS_UNLOCK(ump); free(tp, M_TRIM); } /* * Lookup a trim request by inode number. * Allocate if requested (NEW, REPLACE, SINGLE). */ static struct ffs_blkfree_trim_params * trim_lookup(struct ufsmount *ump, struct vnode *devvp, ufs2_daddr_t bno, long size, ino_t inum, uint64_t key, int alloctype) { struct trimlist_hashhead *tphashhead; struct ffs_blkfree_trim_params *tp, *ntp; ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK); if (alloctype != SINGLE) { KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key")); UFS_LOCK(ump); tphashhead = TRIMLIST_HASH(ump, key); LIST_FOREACH(tp, tphashhead, hashlist) if (key == tp->key) break; } switch (alloctype) { case NEW: KASSERT(tp == NULL, ("trim_lookup: found trim")); break; case OLD: KASSERT(tp != NULL, ("trim_lookup: missing call to ffs_blkrelease_start()")); UFS_UNLOCK(ump); free(ntp, M_TRIM); return (tp); case REPLACE: KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim")); LIST_REMOVE(tp, hashlist); /* tp will be freed by caller */ break; case DONE: KASSERT(tp != NULL, ("trim_lookup: missing DONE trim")); LIST_REMOVE(tp, hashlist); UFS_UNLOCK(ump); free(ntp, M_TRIM); return (tp); } TAILQ_INIT(&ntp->blklist); ntp->ump = ump; ntp->devvp = devvp; ntp->bno = bno; ntp->size = size; ntp->inum = inum; ntp->key = key; if (alloctype != SINGLE) { LIST_INSERT_HEAD(tphashhead, ntp, hashlist); UFS_UNLOCK(ump); } return (ntp); } /* * Dispatch a trim request. */ static void ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *tp) { struct ufsmount *ump; struct mount *mp; struct buf *bp; /* * Postpone the set of the free bit in the cg bitmap until the * BIO_DELETE is completed. Otherwise, due to disk queue * reordering, TRIM might be issued after we reuse the block * and write some new data into it. */ ump = tp->ump; bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); bp->b_iocmd = BIO_DELETE; bp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno)); bp->b_iodone = ffs_blkfree_trim_completed; bp->b_bcount = tp->size; bp->b_fsprivate1 = tp; UFS_LOCK(ump); ump->um_trim_total += 1; ump->um_trim_inflight += 1; ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size); ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size); UFS_UNLOCK(ump); mp = UFSTOVFS(ump); vn_start_secondary_write(NULL, &mp, 0); g_vfs_strategy(ump->um_bo, bp); } /* * Allocate a new key to use to identify a range of blocks. */ uint64_t ffs_blkrelease_start(struct ufsmount *ump, struct vnode *devvp, ino_t inum) { static u_long masterkey; uint64_t key; if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) return (SINGLETON_KEY); do { key = atomic_fetchadd_long(&masterkey, 1); } while (key < FIRST_VALID_KEY); (void) trim_lookup(ump, devvp, 0, 0, inum, key, NEW); return (key); } /* * Deallocate a key that has been used to identify a range of blocks. */ void ffs_blkrelease_finish(struct ufsmount *ump, uint64_t key) { struct ffs_blkfree_trim_params *tp; if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) return; /* * If the vfs.ffs.dotrimcons sysctl option is enabled while * a file deletion is active, specifically after a call * to ffs_blkrelease_start() but before the call to * ffs_blkrelease_finish(), ffs_blkrelease_start() will * have handed out SINGLETON_KEY rather than starting a * collection sequence. Thus if we get a SINGLETON_KEY * passed to ffs_blkrelease_finish(), we just return rather * than trying to finish the nonexistent sequence. */ if (key == SINGLETON_KEY) { #ifdef INVARIANTS printf("%s: vfs.ffs.dotrimcons enabled on active filesystem\n", ump->um_mountp->mnt_stat.f_mntonname); #endif return; } /* * We are done with sending blocks using this key. Look up the key * using the DONE alloctype (in tp) to request that it be unhashed * as we will not be adding to it. If the key has never been used, * tp->size will be zero, so we can just free tp. Otherwise the call * to ffs_blkfree_sendtrim(tp) causes the block range described by * tp to be issued (and then tp to be freed). */ tp = trim_lookup(ump, NULL, 0, 0, 0, key, DONE); if (tp->size == 0) free(tp, M_TRIM); else ffs_blkfree_sendtrim(tp); } /* * Setup to free a block or fragment. * * Check for snapshots that might want to claim the block. * If trims are requested, prepare a trim request. Attempt to * aggregate consecutive blocks into a single trim request. */ void ffs_blkfree(struct ufsmount *ump, struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno, long size, ino_t inum, __enum_uint8(vtype) vtype, struct workhead *dephd, uint64_t key) { struct ffs_blkfree_trim_params *tp, *ntp; struct trim_blkreq *blkelm; /* * Check to see if a snapshot wants to claim the block. * Check that devvp is a normal disk device, not a snapshot, * it has a snapshot(s) associated with it, and one of the * snapshots wants to claim the block. */ if (devvp->v_type == VCHR && (devvp->v_vflag & VV_COPYONWRITE) && ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { return; } /* * Nothing to delay if TRIM is not required for this block or TRIM * is disabled or the operation is performed on a snapshot. */ if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) || devvp->v_type == VREG) { ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); return; } blkelm = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK); blkelm->bno = bno; blkelm->size = size; if (dephd == NULL) { blkelm->pdephd = NULL; } else { LIST_INIT(&blkelm->dephd); LIST_SWAP(dephd, &blkelm->dephd, worklist, wk_list); blkelm->pdephd = &blkelm->dephd; } if (key == SINGLETON_KEY) { /* * Just a single non-contiguous piece. Use the SINGLE * alloctype to return a trim request that will not be * hashed for future lookup. */ tp = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE); TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); ffs_blkfree_sendtrim(tp); return; } /* * The callers of this function are not tracking whether or not * the blocks are contiguous. They are just saying that they * are freeing a set of blocks. It is this code that determines * the pieces of that range that are actually contiguous. * * Calling ffs_blkrelease_start() will have created an entry * that we will use. */ tp = trim_lookup(ump, devvp, bno, size, inum, key, OLD); if (tp->size == 0) { /* * First block of a potential range, set block and size * for the trim block. */ tp->bno = bno; tp->size = size; TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); return; } /* * If this block is a continuation of the range (either * follows at the end or preceeds in the front) then we * add it to the front or back of the list and return. * * If it is not a continuation of the trim that we were * building, using the REPLACE alloctype, we request that * the old trim request (still in tp) be unhashed and a * new range started (in ntp). The ffs_blkfree_sendtrim(tp) * call causes the block range described by tp to be issued * (and then tp to be freed). */ if (bno + numfrags(fs, size) == tp->bno) { TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); tp->bno = bno; tp->size += size; return; } else if (bno == tp->bno + numfrags(fs, tp->size)) { TAILQ_INSERT_TAIL(&tp->blklist, blkelm, blkreqlist); tp->size += size; return; } ntp = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE); TAILQ_INSERT_HEAD(&ntp->blklist, blkelm, blkreqlist); ffs_blkfree_sendtrim(tp); } #ifdef INVARIANTS /* * Verify allocation of a block or fragment. * Return 1 if block or fragment is free. */ static int ffs_checkfreeblk(struct inode *ip, ufs2_daddr_t bno, long size) { struct fs *fs; struct cg *cgp; struct buf *bp; ufs1_daddr_t cgbno; int i, frags, blkalloced; uint8_t *blksfree; fs = ITOFS(ip); if ((uint64_t)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("bsize = %ld, size = %ld, fs = %s\n", (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_checkfreeblk: bad size"); } if ((uint64_t)bno >= fs->fs_size) panic("ffs_checkfreeblk: too big block %jd", (intmax_t)bno); if (ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), 0, &bp, &cgp) != 0) return (0); blksfree = cg_blksfree(cgp); cgbno = dtogd(fs, bno); if (size == fs->fs_bsize) { blkalloced = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); } else { frags = numfrags(fs, size); for (blkalloced = 0, i = 0; i < frags; i++) if (isset(blksfree, cgbno + i)) blkalloced++; if (blkalloced != 0 && blkalloced != frags) panic("ffs_checkfreeblk: partially free fragment"); } brelse(bp); return (blkalloced == 0); } #endif /* INVARIANTS */ /* * Free an inode. */ int ffs_vfree(struct vnode *pvp, ino_t ino, int mode) { struct ufsmount *ump; if (DOINGSOFTDEP(pvp)) { softdep_freefile(pvp, ino, mode); return (0); } ump = VFSTOUFS(pvp->v_mount); return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); } /* * Do the actual free operation. * The specified inode is placed back in the free map. */ int ffs_freefile(struct ufsmount *ump, struct fs *fs, struct vnode *devvp, ino_t ino, int mode, struct workhead *wkhd) { struct cg *cgp; struct buf *bp; daddr_t dbn; int error; uint64_t cg; uint8_t *inosused; struct cdev *dev; ino_t cgino; cg = ino_to_cg(fs, ino); if (devvp->v_type == VREG) { /* devvp is a snapshot */ MPASS(devvp->v_mount->mnt_data == ump); dev = ump->um_devvp->v_rdev; } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ dev = devvp->v_rdev; } else { bp = NULL; return (0); } if (ino >= fs->fs_ipg * fs->fs_ncg) panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s", devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { if (!MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) return (error); /* * Would like to just downgrade to read-only. Until that * capability is available, just toss the cylinder group * update and mark the filesystem as needing to run fsck. */ fs->fs_flags |= FS_NEEDSFSCK; if (devvp->v_type == VREG) dbn = fragstoblks(fs, cgtod(fs, cg)); else dbn = fsbtodb(fs, cgtod(fs, cg)); error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); KASSERT(error == 0, ("getblkx failed")); softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd, true); bp->b_flags |= B_RELBUF | B_NOCACHE; bp->b_flags &= ~B_CACHE; bawrite(bp); return (error); } inosused = cg_inosused(cgp); cgino = ino % fs->fs_ipg; if (isclr(inosused, cgino)) { printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); if (fs->fs_ronly == 0) panic("ffs_freefile: freeing free inode"); } clrbit(inosused, cgino); if (cgino < cgp->cg_irotor) cgp->cg_irotor = cgino; cgp->cg_cs.cs_nifree++; UFS_LOCK(ump); fs->fs_cstotal.cs_nifree++; fs->fs_cs(fs, cg).cs_nifree++; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir--; fs->fs_cstotal.cs_ndir--; fs->fs_cs(fs, cg).cs_ndir--; } fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR) softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd, false); bdwrite(bp); return (0); } /* * Check to see if a file is free. * Used to check for allocated files in snapshots. * Return 1 if file is free. */ int ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino) { struct cg *cgp; struct buf *bp; int ret, error; uint64_t cg; uint8_t *inosused; cg = ino_to_cg(fs, ino); if ((devvp->v_type != VREG) && (devvp->v_type != VCHR)) return (1); if (ino >= fs->fs_ipg * fs->fs_ncg) return (1); if ((error = ffs_getcg(fs, devvp, cg, 0, &bp, &cgp)) != 0) return (1); inosused = cg_inosused(cgp); ino %= fs->fs_ipg; ret = isclr(inosused, ino); brelse(bp); return (ret); } /* * Find a block of the specified size in the specified cylinder group. * * It is a panic if a request is made to find a block if none are * available. */ static ufs1_daddr_t ffs_mapsearch(struct fs *fs, struct cg *cgp, ufs2_daddr_t bpref, int allocsiz) { ufs1_daddr_t bno; int start, len, loc, i; int blk, field, subfield, pos; uint8_t *blksfree; /* * find the fragment by searching through the free block * map for an appropriate bit pattern */ if (bpref) start = dtogd(fs, bpref) / NBBY; else start = cgp->cg_frotor / NBBY; blksfree = cg_blksfree(cgp); len = howmany(fs->fs_fpg, NBBY) - start; loc = scanc((uint64_t)len, (uint8_t *)&blksfree[start], fragtbl[fs->fs_frag], (uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { len = start + 1; start = 0; loc = scanc((uint64_t)len, (uint8_t *)&blksfree[0], fragtbl[fs->fs_frag], (uint8_t)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { printf("start = %d, len = %d, fs = %s\n", start, len, fs->fs_fsmnt); panic("ffs_alloccg: map corrupted"); /* NOTREACHED */ } } bno = (start + len - loc) * NBBY; cgp->cg_frotor = bno; /* * found the byte in the map * sift through the bits to find the selected frag */ for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { blk = blkmap(fs, blksfree, bno); blk <<= 1; field = around[allocsiz]; subfield = inside[allocsiz]; for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { if ((blk & field) == subfield) return (bno + pos); field <<= 1; subfield <<= 1; } } printf("bno = %ju, fs = %s\n", (intmax_t)bno, fs->fs_fsmnt); panic("ffs_alloccg: block not in map"); return (-1); } /* * Fetch and verify a cylinder group. */ int ffs_getcg(struct fs *fs, struct vnode *devvp, uint64_t cg, int flags, struct buf **bpp, struct cg **cgpp) { struct buf *bp; struct cg *cgp; struct mount *mp; const struct statfs *sfs; daddr_t blkno; int error; *bpp = NULL; *cgpp = NULL; if ((fs->fs_metackhash & CK_CYLGRP) != 0) flags |= GB_CKHASH; if (devvp->v_type == VCHR) { blkno = fsbtodb(fs, cgtod(fs, cg)); mp = devvp->v_rdev->si_mountpt; } else { blkno = fragstoblks(fs, cgtod(fs, cg)); mp = devvp->v_mount; } error = breadn_flags(devvp, blkno, blkno, (int)fs->fs_cgsize, NULL, NULL, 0, NOCRED, flags, ffs_ckhash_cg, &bp); if (error != 0) return (error); cgp = (struct cg *)bp->b_data; if ((fs->fs_metackhash & CK_CYLGRP) != 0 && (bp->b_flags & B_CKHASH) != 0 && cgp->cg_ckhash != bp->b_ckhash) { if (ppsratecheck(&VFSTOUFS(mp)->um_last_integritymsg, &VFSTOUFS(mp)->um_secs_integritymsg, 1)) { sfs = &mp->mnt_stat; printf("UFS %s%s (%s) cylinder checkhash failed: " "cg %ju, cgp: 0x%x != bp: 0x%jx\n", devvp->v_type == VCHR ? "" : "snapshot of ", sfs->f_mntfromname, sfs->f_mntonname, (intmax_t)cg, cgp->cg_ckhash, (uintmax_t)bp->b_ckhash); } bp->b_flags &= ~B_CKHASH; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (EINTEGRITY); } if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) { if (ppsratecheck(&VFSTOUFS(mp)->um_last_integritymsg, &VFSTOUFS(mp)->um_secs_integritymsg, 1)) { sfs = &mp->mnt_stat; printf("UFS %s%s (%s)", devvp->v_type == VCHR ? "" : "snapshot of ", sfs->f_mntfromname, sfs->f_mntonname); if (!cg_chkmagic(cgp)) printf(" cg %ju: bad magic number 0x%x should " "be 0x%x\n", (intmax_t)cg, cgp->cg_magic, CG_MAGIC); else printf(": wrong cylinder group cg %ju != " "cgx %u\n", (intmax_t)cg, cgp->cg_cgx); } bp->b_flags &= ~B_CKHASH; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (EINTEGRITY); } bp->b_flags &= ~B_CKHASH; bp->b_xflags |= BX_BKGRDWRITE; /* * If we are using check hashes on the cylinder group then we want * to limit changing the cylinder group time to when we are actually * going to write it to disk so that its check hash remains correct * in memory. If the CK_CYLGRP flag is set the time is updated in * ffs_bufwrite() as the buffer is queued for writing. Otherwise we * update the time here as we have done historically. */ if ((fs->fs_metackhash & CK_CYLGRP) != 0) bp->b_xflags |= BX_CYLGRP; else cgp->cg_old_time = cgp->cg_time = time_second; *bpp = bp; *cgpp = cgp; return (0); } static void ffs_ckhash_cg(struct buf *bp) { uint32_t ckhash; struct cg *cgp; cgp = (struct cg *)bp->b_data; ckhash = cgp->cg_ckhash; cgp->cg_ckhash = 0; bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); cgp->cg_ckhash = ckhash; } /* * Called when a cylinder group read has failed. If an integrity check * is the cause of failure then the cylinder group will not be usable * until the filesystem has been unmounted and fsck has been run to * repair it. To avoid future attempts to allocate resources from the * cylinder group, its available resources are set to zero in the * superblock summary information. Since it will appear to have no * resources available, no further calls will be made to allocate * resources from it. When resources are freed to the cylinder group * the resource free routines will find the cylinder group unusable so * the resource will simply be discarded and thus will not show up in * the superblock summary information until they are recovered by fsck. */ static void ffs_checkcgintegrity(struct fs *fs, uint64_t cg, int error) { if (error != EINTEGRITY) return; fs->fs_cstotal.cs_nffree -= fs->fs_cs(fs, cg).cs_nffree; fs->fs_cs(fs, cg).cs_nffree = 0; fs->fs_cstotal.cs_nbfree -= fs->fs_cs(fs, cg).cs_nbfree; fs->fs_cs(fs, cg).cs_nbfree = 0; fs->fs_cstotal.cs_nifree -= fs->fs_cs(fs, cg).cs_nifree; fs->fs_cs(fs, cg).cs_nifree = 0; fs->fs_maxcluster[cg] = 0; + fs->fs_flags |= FS_NEEDSFSCK; fs->fs_fmod = 1; } /* * Fserr prints the name of a filesystem with an error diagnostic. * * The form of the error message is: * fs: error message */ void ffs_fserr(struct fs *fs, ino_t inum, char *cp) { struct thread *td = curthread; /* XXX */ struct proc *p = td->td_proc; log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n", p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum, fs->fs_fsmnt, cp); } /* * This function provides the capability for the fsck program to * update an active filesystem. Sixteen operations are provided: * * adjrefcnt(inode, amt) - adjusts the reference count on the * specified inode by the specified amount. Under normal * operation the count should always go down. Decrementing * the count to zero will cause the inode to be freed. * adjblkcnt(inode, amt) - adjust the number of blocks used by the * inode by the specified amount. * adjdepth(inode, amt) - adjust the depth of the specified directory * inode by the specified amount. * setsize(inode, size) - set the size of the inode to the * specified size. * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - * adjust the superblock summary. * freedirs(inode, count) - directory inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freefiles(inode, count) - file inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] * are marked as free. Blocks should never have to be marked * as in use. * setflags(flags, set/clear) - the fs_flags field has the specified * flags set (second parameter +1) or cleared (second parameter -1). * setcwd(dirinode) - set the current directory to dirinode in the * filesystem associated with the snapshot. * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".." * in the current directory is oldvalue then change it to newvalue. * unlink(nameptr, oldvalue) - Verify that the inode number associated * with nameptr in the current directory is oldvalue then unlink it. */ static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR | CTLTYPE_STRUCT | CTLFLAG_NEEDGIANT, 0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust Inode Used Blocks Count"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_DEPTH, adjdepth, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust Directory Inode Depth"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Set the inode size"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of directories"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of free blocks"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of free inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of free frags"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of free clusters"); static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Free Range of Directory Inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Free Range of File Inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Free Range of Blocks"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Change Filesystem Flags"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Set Current Working Directory"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Change Value of .. Entry"); static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Unlink a Duplicate Name"); #ifdef DIAGNOSTIC static int fsckcmds = 0; SYSCTL_INT(_debug, OID_AUTO, ffs_fsckcmds, CTLFLAG_RW, &fsckcmds, 0, "print out fsck_ffs-based filesystem update commands"); #endif /* DIAGNOSTIC */ static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct fsck_cmd cmd; struct ufsmount *ump; struct vnode *vp, *dvp, *fdvp; struct inode *ip, *dp; struct mount *mp; struct fs *fs; struct pwd *pwd; ufs2_daddr_t blkno; long blkcnt, blksize; uint64_t key; struct file *fp; cap_rights_t rights; int filetype, error; if (req->newptr == NULL || req->newlen > sizeof(cmd)) return (EBADRPC); if ((error = SYSCTL_IN(req, &cmd, sizeof(cmd))) != 0) return (error); if (cmd.version != FFS_CMD_VERSION) return (ERPCMISMATCH); if ((error = getvnode(td, cmd.handle, cap_rights_init_one(&rights, CAP_FSCK), &fp)) != 0) return (error); vp = fp->f_vnode; if (vp->v_type != VREG && vp->v_type != VDIR) { fdrop(fp, td); return (EINVAL); } vn_start_write(vp, &mp, V_WAIT); if (mp == NULL || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { vn_finished_write(mp); fdrop(fp, td); return (EINVAL); } ump = VFSTOUFS(mp); if (mp->mnt_flag & MNT_RDONLY) { vn_finished_write(mp); fdrop(fp, td); return (EROFS); } fs = ump->um_fs; filetype = IFREG; switch (oidp->oid_number) { case FFS_SET_FLAGS: #ifdef DIAGNOSTIC if (fsckcmds) printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, cmd.size > 0 ? "set" : "clear"); #endif /* DIAGNOSTIC */ if (cmd.size > 0) fs->fs_flags |= (long)cmd.value; else fs->fs_flags &= ~(long)cmd.value; break; case FFS_ADJ_REFCNT: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust inode %jd link count by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); ip->i_nlink += cmd.size; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_effnlink += cmd.size; UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); error = ffs_update(vp, 1); if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); vput(vp); break; case FFS_ADJ_BLKCNT: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust inode %jd block count by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); error = ffs_update(vp, 1); vput(vp); break; case FFS_ADJ_DEPTH: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust directory inode %jd depth by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; if (vp->v_type != VDIR) { vput(vp); error = ENOTDIR; break; } ip = VTOI(vp); DIP_SET(ip, i_dirdepth, DIP(ip, i_dirdepth) + cmd.size); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); error = ffs_update(vp, 1); vput(vp); break; case FFS_SET_SIZE: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: set inode %jd size to %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); DIP_SET(ip, i_size, cmd.size); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_MODIFIED); error = ffs_update(vp, 1); vput(vp); break; case FFS_DIR_FREE: filetype = IFDIR; /* fall through */ case FFS_FILE_FREE: #ifdef DIAGNOSTIC if (fsckcmds) { if (cmd.size == 1) printf("%s: free %s inode %ju\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? "directory" : "file", (uintmax_t)cmd.value); else printf("%s: free %s inodes %ju-%ju\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? "directory" : "file", (uintmax_t)cmd.value, (uintmax_t)(cmd.value + cmd.size - 1)); } #endif /* DIAGNOSTIC */ while (cmd.size > 0) { if ((error = ffs_freefile(ump, fs, ump->um_devvp, cmd.value, filetype, NULL))) break; cmd.size -= 1; cmd.value += 1; } break; case FFS_BLK_FREE: #ifdef DIAGNOSTIC if (fsckcmds) { if (cmd.size == 1) printf("%s: free block %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); else printf("%s: free blocks %jd-%jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.value + cmd.size - 1); } #endif /* DIAGNOSTIC */ blkno = cmd.value; blkcnt = cmd.size; blksize = fs->fs_frag - (blkno % fs->fs_frag); key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO); while (blkcnt > 0) { if (blkcnt < blksize) blksize = blkcnt; ffs_blkfree(ump, fs, ump->um_devvp, blkno, blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL, key); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; } ffs_blkrelease_finish(ump, key); break; /* * Adjust superblock summaries. fsck(8) is expected to * submit deltas when necessary. */ case FFS_ADJ_NDIR: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of directories by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_ndir += cmd.value; break; case FFS_ADJ_NBFREE: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of free blocks by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_nbfree += cmd.value; break; case FFS_ADJ_NIFREE: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of free inodes by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_nifree += cmd.value; break; case FFS_ADJ_NFFREE: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of free frags by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_nffree += cmd.value; break; case FFS_ADJ_NUMCLUSTERS: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of free clusters by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_numclusters += cmd.value; break; case FFS_SET_CWD: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: set current directory to inode %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp))) break; AUDIT_ARG_VNODE1(vp); if ((error = change_dir(vp, td)) != 0) { vput(vp); break; } VOP_UNLOCK(vp); pwd_chdir(td, vp); break; case FFS_SET_DOTDOT: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: change .. in cwd from %jd to %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ /* * First we have to get and lock the parent directory * to which ".." points. */ error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp); if (error) break; /* * Now we get and lock the child directory containing "..". */ pwd = pwd_hold(td); dvp = pwd->pwd_cdir; if ((error = vget(dvp, LK_EXCLUSIVE)) != 0) { vput(fdvp); pwd_drop(pwd); break; } dp = VTOI(dvp); SET_I_OFFSET(dp, 12); /* XXX mastertemplate.dot_reclen */ error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, DT_DIR, 0); cache_purge(fdvp); cache_purge(dvp); vput(dvp); vput(fdvp); pwd_drop(pwd); break; case FFS_UNLINK: #ifdef DIAGNOSTIC if (fsckcmds) { char buf[32]; if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL)) strncpy(buf, "Name_too_long", 32); printf("%s: unlink %s (inode %jd)\n", mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ /* * kern_funlinkat will do its own start/finish writes and * they do not nest, so drop ours here. Setting mp == NULL * indicates that vn_finished_write is not needed down below. */ vn_finished_write(mp); mp = NULL; error = kern_funlinkat(td, AT_FDCWD, (char *)(intptr_t)cmd.value, FD_NONE, UIO_USERSPACE, 0, (ino_t)cmd.size); break; default: #ifdef DIAGNOSTIC if (fsckcmds) { printf("Invalid request %d from fsck\n", oidp->oid_number); } #endif /* DIAGNOSTIC */ error = EINVAL; break; } fdrop(fp, td); vn_finished_write(mp); return (error); } diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c index 6a3329078817..9cfd56472a78 100644 --- a/sys/ufs/ffs/ffs_snapshot.c +++ b/sys/ufs/ffs/ffs_snapshot.c @@ -1,2752 +1,2755 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. * * Further information about snapshots can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNCRED thread0.td_ucred #include "opt_ffs.h" #ifdef NO_FFS_SNAPSHOT int ffs_snapshot(struct mount *mp, char *snapfile) { return (EINVAL); } int ffs_snapblkfree(struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno, long size, ino_t inum, __enum_uint8(vtype) vtype, struct workhead *wkhd) { return (EINVAL); } void ffs_snapremove(struct vnode *vp) { } void ffs_snapshot_mount(struct mount *mp) { } void ffs_snapshot_unmount(struct mount *mp) { } void ffs_snapgone(struct inode *ip) { } int ffs_copyonwrite(struct vnode *devvp, struct buf *bp) { return (EINVAL); } void ffs_sync_snap(struct mount *mp, int waitfor) { } #else FEATURE(ffs_snapshot, "FFS snapshot support"); LIST_HEAD(, snapdata) snapfree; static struct mtx snapfree_lock; MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF); static int cgaccount(int, struct vnode *, struct buf *, int); static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int, int); static int indiracct_ufs1(struct vnode *, struct vnode *, int, ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int, int); static int indiracct_ufs2(struct vnode *, struct vnode *, int, ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); static void try_free_snapdata(struct vnode *devvp); static void revert_snaplock(struct vnode *, struct vnode *, struct snapdata *); static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp); static int ffs_bp_snapblk(struct vnode *, struct buf *); /* * To ensure the consistency of snapshots across crashes, we must * synchronously write out copied blocks before allowing the * originals to be modified. Because of the rather severe speed * penalty that this imposes, the code normally only ensures * persistence for the filesystem metadata contained within a * snapshot. Setting the following flag allows this crash * persistence to be enabled for file contents. */ int dopersistence = 0; #ifdef DIAGNOSTIC #include SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); static int snapdebug = 0; SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); int collectsnapstats = 0; SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 0, ""); #endif /* DIAGNOSTIC */ /* * Create a snapshot file and initialize it for the filesystem. */ int ffs_snapshot(struct mount *mp, char *snapfile) { ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; int error, cg, snaploc; int i, size, len, loc; ufs2_daddr_t blockno; uint64_t flag; char saved_nice = 0; #ifdef DIAGNOSTIC long redo = 0; #endif long snaplistsize = 0; int32_t *lp; void *space; struct fs *copy_fs = NULL, *fs, *bpfs; struct thread *td = curthread; struct inode *ip, *xp; struct buf *bp, *nbp, *ibp; struct nameidata nd; struct mount *wrtmp; struct vattr vat; struct vnode *vp, *xvp, *mvp, *devvp; struct uio auio; struct iovec aiov; struct snapdata *sn; struct ufsmount *ump; #ifdef DIAGNOSTIC struct timespec starttime = {0, 0}, endtime; #endif ump = VFSTOUFS(mp); fs = ump->um_fs; sn = NULL; MNT_ILOCK(mp); flag = mp->mnt_flag; MNT_IUNLOCK(mp); /* * Need to serialize access to snapshot code per filesystem. */ /* * Assign a snapshot slot in the superblock. */ UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == 0) break; UFS_UNLOCK(ump); if (snaploc == FSMAXSNAP) return (ENOSPC); /* * Create the snapshot file. */ restart: NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE, UIO_SYSSPACE, snapfile); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { vput(nd.ni_vp); error = EEXIST; } if (nd.ni_dvp->v_mount != mp) error = EXDEV; if (error) { NDFREE_PNBUF(&nd); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } VATTR_NULL(&vat); vat.va_type = VREG; vat.va_mode = S_IRUSR; vat.va_vaflags |= VA_EXCLUSIVE; if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) wrtmp = NULL; if (wrtmp != mp) panic("ffs_snapshot: mount mismatch"); vfs_rel(wrtmp); if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { NDFREE_PNBUF(&nd); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &wrtmp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); if (error) { VOP_VPUT_PAIR(nd.ni_dvp, NULL, true); NDFREE_PNBUF(&nd); vn_finished_write(wrtmp); if (error == ERELOOKUP) goto restart; return (error); } vp = nd.ni_vp; vref(nd.ni_dvp); VOP_VPUT_PAIR(nd.ni_dvp, &vp, false); if (VN_IS_DOOMED(vp)) { error = EBADF; goto out; } vnode_create_vobject(nd.ni_vp, fs->fs_size, td); vp->v_vflag |= VV_SYSTEM; ip = VTOI(vp); devvp = ITODEVVP(ip); /* * Calculate the size of the filesystem then allocate the block * immediately following the last block of the filesystem that * will contain the snapshot list. This operation allows us to * set the size of the snapshot. */ numblks = howmany(fs->fs_size, fs->fs_frag); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)numblks), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) goto out; bawrite(bp); ip->i_size = lblktosize(fs, (off_t)(numblks + 1)); vnode_pager_setsize(vp, ip->i_size); DIP_SET(ip, i_size, ip->i_size); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); /* * Preallocate critical data structures so that we can copy * them in without further allocation after we suspend all * operations on the filesystem. We would like to just release * the allocated buffers without writing them since they will * be filled in below once we are ready to go, but this upsets * the soft update code, so we go ahead and write the new buffers. * * Allocate all indirect blocks and mark all of them as not * needing to be copied. */ for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); if (error) goto out; bawrite(ibp); } /* * Allocate copies for the superblock and its summary information. */ error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate all cylinder group blocks. */ for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); if (cg % 10 == 0) { error = ffs_syncvnode(vp, MNT_WAIT, 0); /* vp possibly reclaimed if unlocked */ if (error != 0) goto out; } } + /* + * Change inode to snapshot type file. Before setting its block + * pointers to BLK_SNAP and BLK_NOCOPY in cgaccount, we have to + * set its type to SF_SNAPSHOT so that VOP_REMOVE will know that + * they need to be rolled back before attempting deletion. + */ + ip->i_flags |= SF_SNAPSHOT; + DIP_SET(ip, i_flags, ip->i_flags); + UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); /* * Copy all the cylinder group maps. Although the * filesystem is still active, we hope that only a few * cylinder groups will change between now and when we * suspend operations. Thus, we will be able to quickly * touch up the few cylinder groups that changed during * the suspension period. */ len = roundup2(howmany(fs->fs_ncg, NBBY), sizeof(uint64_t)); space = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO); UFS_LOCK(ump); fs->fs_active = space; UFS_UNLOCK(ump); for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; error = cgaccount(cg, vp, nbp, 1); bawrite(nbp); if (cg % 10 == 0 && error == 0) error = ffs_syncvnode(vp, MNT_WAIT, 0); if (error) goto out; } - /* - * Change inode to snapshot type file. - */ - ip->i_flags |= SF_SNAPSHOT; - DIP_SET(ip, i_flags, ip->i_flags); - UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); /* * Ensure that the snapshot is completely on disk. * Since we have marked it as a snapshot it is safe to * unlock it as no process will be allowed to write to it. */ if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) goto out; VOP_UNLOCK(vp); /* * All allocations are done, so we can now snapshot the system. * * Recind nice scheduling while running with the filesystem suspended. */ if (td->td_proc->p_nice > 0) { struct proc *p; p = td->td_proc; PROC_LOCK(p); saved_nice = p->p_nice; sched_nice(p, 0); PROC_UNLOCK(p); } /* * Suspend operation on filesystem. */ for (;;) { vn_finished_write(wrtmp); if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) { vn_start_write(NULL, &wrtmp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); goto out; } if (mp->mnt_kern_flag & MNTK_SUSPENDED) break; vn_start_write(NULL, &wrtmp, V_WAIT); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (ip->i_effnlink == 0) { error = ENOENT; /* Snapshot file unlinked */ goto resumefs; } #ifdef DIAGNOSTIC if (collectsnapstats) nanotime(&starttime); #endif /* * First, copy all the cylinder group maps that have changed. */ for (cg = 0; cg < fs->fs_ncg; cg++) { if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) continue; #ifdef DIAGNOSTIC redo++; #endif error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto resumefs; error = cgaccount(cg, vp, nbp, 2); bawrite(nbp); if (error) goto resumefs; } /* * Grab a copy of the superblock and its summary information. * We delay writing it until the suspension is released below. */ copy_fs = malloc((uint64_t)fs->fs_bsize, M_UFSMNT, M_WAITOK); bcopy(fs, copy_fs, fs->fs_sbsize); copy_fs->fs_si = malloc(sizeof(struct fs_summary_info), M_UFSMNT, M_ZERO | M_WAITOK); if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) copy_fs->fs_clean = 1; size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; if (fs->fs_sbsize < size) bzero(&((char *)copy_fs)[fs->fs_sbsize], size - fs->fs_sbsize); size = blkroundup(fs, fs->fs_cssize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); space = malloc((uint64_t)size, M_UFSMNT, M_WAITOK); copy_fs->fs_csp = space; bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); space = (char *)space + fs->fs_cssize; loc = howmany(fs->fs_cssize, fs->fs_fsize); i = fs->fs_frag - loc % fs->fs_frag; len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; if (len > 0) { if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), len, KERNCRED, &bp)) != 0) { brelse(bp); goto resumefs; } bcopy(bp->b_data, space, (uint64_t)len); space = (char *)space + len; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } if (fs->fs_contigsumsize > 0) { copy_fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } /* * We must check for active files that have been unlinked * (e.g., with a zero link count). We have to expunge all * trace of these files from the snapshot so that they are * not reclaimed prematurely by fsck or unnecessarily dumped. * We turn off the MNTK_SUSPENDED flag to avoid a panic from * spec_strategy about writing on a suspended filesystem. * Note that we skip unlinked snapshot files as they will * be handled separately below. * * We also calculate the size needed for the snapshot list. * Initial number of entries is composed of: * - one for each cylinder group map * - one for each block used by superblock summary table * - one for each snapshot inode block * - one for the superblock * - one for the snapshot list * The direct block entries in the snapshot are always * copied (see reason below). Note that the superblock and * the first cylinder group will almost always be allocated * in the direct blocks, but we add the slop for them in case * they do not end up there. The snapshot list size may get * expanded by one because of an update of an inode block for * an unlinked but still open file when it is expunged. * * Because the direct block pointers are always copied, they * are not added to the list. Instead ffs_copyonwrite() * explicitly checks for them before checking the snapshot list. */ snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + FSMAXSNAP + /* superblock */ 1 + /* snaplist */ 1; MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_SUSPENDED; MNT_IUNLOCK(mp); loop: MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) { if ((xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) || xvp->v_type == VNON || IS_SNAPSHOT(VTOI(xvp))) { VI_UNLOCK(xvp); continue; } /* * We can skip parent directory vnode because it must have * this snapshot file in it. */ if (xvp == nd.ni_dvp) { VI_UNLOCK(xvp); continue; } vholdl(xvp); if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); vdrop(xvp); goto loop; } VI_LOCK(xvp); if (xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) { VI_UNLOCK(xvp); VOP_UNLOCK(xvp); vdrop(xvp); continue; } VI_UNLOCK(xvp); #ifdef DIAGNOSTIC if (snapdebug) vn_printf(xvp, "ffs_snapshot: busy vnode "); #endif if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 && vat.va_nlink > 0) { VOP_UNLOCK(xvp); vdrop(xvp); continue; } xp = VTOI(xvp); if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { VOP_UNLOCK(xvp); vdrop(xvp); continue; } /* * If there is a fragment, clear it here. */ blkno = 0; loc = howmany(xp->i_size, fs->fs_bsize) - 1; if (loc < UFS_NDADDR) { len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number, xvp->v_type, NULL, SINGLETON_KEY); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } } snaplistsize += 1; if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY, 1); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 1); if (blkno) DIP_SET(xp, i_db[loc], blkno); if (!error) error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode, NULL); VOP_UNLOCK(xvp); vdrop(xvp); if (error) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto resumefs; } } /* * Erase the journal file from the snapshot. */ if (fs->fs_flags & FS_SUJ) { error = softdep_journal_lookup(mp, &xvp); if (error) goto resumefs; xp = VTOI(xvp); if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY, 0); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 0); vput(xvp); } /* * Preallocate all the direct blocks in the snapshot inode so * that we never have to write the inode itself to commit an * update to the contents of the snapshot. Note that once * created, the size of the snapshot will never change, so * there will never be a need to write the inode except to * update the non-integrity-critical time fields and * allocated-block count. */ for (blockno = 0; blockno < UFS_NDADDR; blockno++) { if (DIP(ip, i_db[blockno]) != 0) continue; error = UFS_BALLOC(vp, lblktosize(fs, blockno), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) goto resumefs; error = readblock(vp, bp, blockno); bawrite(bp); if (error != 0) goto resumefs; } /* * Acquire a lock on the snapdata structure, creating it if necessary. */ sn = ffs_snapdata_acquire(devvp); /* * Change vnode to use shared snapshot lock instead of the original * private lock. */ vp->v_vnlock = &sn->sn_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); xp = TAILQ_FIRST(&sn->sn_head); /* * If this is the first snapshot on this filesystem, then we need * to allocate the space for the list of preallocated snapshot blocks. * This list will be refined below, but this preliminary one will * keep us out of deadlock until the full one is ready. */ if (xp == NULL) { snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); blkp = &snapblklist[1]; *blkp++ = lblkno(fs, fs->fs_sblockloc); blkno = fragstoblks(fs, fs->fs_csaddr); for (cg = 0; cg < fs->fs_ncg; cg++) { if (fragstoblks(fs, cgtod(fs, cg)) > blkno) break; *blkp++ = fragstoblks(fs, cgtod(fs, cg)); } len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) *blkp++ = blkno + loc; for (; cg < fs->fs_ncg; cg++) *blkp++ = fragstoblks(fs, cgtod(fs, cg)); snapblklist[0] = blkp - snapblklist; VI_LOCK(devvp); if (sn->sn_blklist != NULL) panic("ffs_snapshot: non-empty list"); sn->sn_blklist = snapblklist; sn->sn_listsize = blkp - snapblklist; VI_UNLOCK(devvp); } /* * Record snapshot inode. Since this is the newest snapshot, * it must be placed at the end of the list. */ VI_LOCK(devvp); fs->fs_snapinum[snaploc] = ip->i_number; if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot: %ju already on list", (uintmax_t)ip->i_number); TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); resumefs: ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); if (error != 0 && copy_fs != NULL) { free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs->fs_si, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; } KASSERT(error != 0 || (sn != NULL && copy_fs != NULL), ("missing snapshot setup parameters")); /* * Resume operation on filesystem. */ vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR); #ifdef DIAGNOSTIC if (collectsnapstats && starttime.tv_sec > 0) { nanotime(&endtime); timespecsub(&endtime, &starttime, &endtime); printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, endtime.tv_nsec / 1000000, redo, fs->fs_ncg); } #endif if (copy_fs == NULL) goto out; /* * Copy allocation information from all the snapshots in * this snapshot and then expunge them from its view. */ TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { if (xp == ip) break; if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, BLK_SNAP, 0); else error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, BLK_SNAP, 0); if (error == 0 && xp->i_effnlink == 0) { error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode, NULL); } if (error) { fs->fs_snapinum[snaploc] = 0; goto done; } } /* * Allocate space for the full list of preallocated snapshot blocks. */ snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); ip->i_snapblklist = &snapblklist[1]; /* * Expunge the blocks used by the snapshots from the set of * blocks marked as used in the snapshot bitmaps. Also, collect * the list of allocated blocks in i_snapblklist. */ if (I_IS_UFS1(ip)) error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP, 0); else error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP, 0); if (error) { fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } if (snaplistsize < ip->i_snapblklist - snapblklist) panic("ffs_snapshot: list too small"); snaplistsize = ip->i_snapblklist - snapblklist; snapblklist[0] = snaplistsize; ip->i_snapblklist = 0; /* * Write out the list of allocated blocks to the end of the snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)snapblklist; aiov.iov_len = snaplistsize * sizeof(daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset = lblktosize(fs, (off_t)numblks); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } /* * Write the superblock and its summary information * to the snapshot. */ blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); space = copy_fs->fs_csp; for (loc = 0; loc < len; loc++) { error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); if (error) { fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } bcopy(space, nbp->b_data, fs->fs_bsize); space = (char *)space + fs->fs_bsize; bawrite(nbp); } error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(nbp); } else { loc = blkoff(fs, fs->fs_sblockloc); copy_fs->fs_fmod = 0; bpfs = (struct fs *)&nbp->b_data[loc]; bcopy((caddr_t)copy_fs, (caddr_t)bpfs, (uint64_t)fs->fs_sbsize); ffs_oldfscompat_write(bpfs, ump); bpfs->fs_ckhash = ffs_calc_sbhash(bpfs); bawrite(nbp); } /* * As this is the newest list, it is the most inclusive, so * should replace the previous list. */ VI_LOCK(devvp); space = sn->sn_blklist; sn->sn_blklist = snapblklist; sn->sn_listsize = snaplistsize; VI_UNLOCK(devvp); if (space != NULL) free(space, M_UFSMNT); done: free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs->fs_si, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; out: if (saved_nice > 0) { struct proc *p; p = td->td_proc; PROC_LOCK(p); sched_nice(td->td_proc, saved_nice); PROC_UNLOCK(td->td_proc); } UFS_LOCK(ump); if (fs->fs_active != 0) { free(fs->fs_active, M_DEVBUF); fs->fs_active = 0; } UFS_UNLOCK(ump); MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); MNT_IUNLOCK(mp); NDFREE_PNBUF(&nd); vrele(nd.ni_dvp); if (error == 0) { (void) ffs_syncvnode(vp, MNT_WAIT, 0); VOP_UNLOCK(vp); } else if (VN_IS_DOOMED(vp)) { vput(vp); } else { int rmerr; /* Remove snapshot as its creation has failed. */ vput(vp); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile); if ((rmerr = namei(&nd)) != 0 || (rmerr = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd)) != 0) printf("Delete of %s failed with error %d\n", nd.ni_dirp, rmerr); NDFREE_PNBUF(&nd); if (nd.ni_dvp != NULL) vput(nd.ni_dvp); if (nd.ni_vp != NULL) vput(nd.ni_vp); } vn_finished_write(wrtmp); process_deferred_inactive(mp); return (error); } /* * Copy a cylinder group map. All the unallocated blocks are marked * BLK_NOCOPY so that the snapshot knows that it need not copy them * if they are later written. If passno is one, then this is a first * pass, so only setting needs to be done. If passno is 2, then this * is a revision to a previous pass which must be undone as the * replacement pass is done. */ static int cgaccount(int cg, struct vnode *vp, struct buf *nbp, int passno) { struct buf *bp, *ibp; struct inode *ip; struct cg *cgp; struct fs *fs; ufs2_daddr_t base, numblks; int error, len, loc, indiroff; ip = VTOI(vp); fs = ITOFS(ip); if ((error = ffs_getcg(fs, ITODEVVP(ip), cg, 0, &bp, &cgp)) != 0) return (error); UFS_LOCK(ITOUMP(ip)); ACTIVESET(fs, cg); /* * Recomputation of summary information might not have been performed * at mount time. Sync up summary information for current cylinder * group while data is in memory to ensure that result of background * fsck is slightly more consistent. */ fs->fs_cs(fs, cg) = cgp->cg_cs; UFS_UNLOCK(ITOUMP(ip)); bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], fs->fs_bsize - fs->fs_cgsize); cgp = (struct cg *)nbp->b_data; bqrelse(bp); if (passno == 2) nbp->b_flags |= B_VALIDSUSPWRT; numblks = howmany(fs->fs_size, fs->fs_frag); len = howmany(fs->fs_fpg, fs->fs_frag); base = cgbase(fs, cg) / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; if (base < UFS_NDADDR) { for ( ; loc < UFS_NDADDR; loc++) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) DIP_SET(ip, i_db[loc], BLK_NOCOPY); else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) DIP_SET(ip, i_db[loc], 0); else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) panic("ffs_snapshot: lost direct block"); } } error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { goto out; } indiroff = (base + loc - UFS_NDADDR) % NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= NINDIR(fs)) { if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bawrite(ibp); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { goto out; } indiroff = 0; } if (I_IS_UFS1(ip)) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); continue; } if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); } if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); out: /* * We have to calculate the crc32c here rather than just setting the * BX_CYLGRP b_xflags because the allocation of the block for the * the cylinder group map will always be a full size block (fs_bsize) * even though the cylinder group may be smaller (fs_cgsize). The * crc32c must be computed only over fs_cgsize whereas the BX_CYLGRP * flag causes it to be computed over the size of the buffer. */ if ((fs->fs_metackhash & CK_CYLGRP) != 0) { ((struct cg *)nbp->b_data)->cg_ckhash = 0; ((struct cg *)nbp->b_data)->cg_ckhash = calculate_crc32c(~0L, nbp->b_data, fs->fs_cgsize); } return (error); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs1(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int expungetype, int clearmode) { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs1_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < UFS_NDADDR) { blkno = VTOI(snapvp)->i_din1->di_db[lbn]; } else { if (DOINGSOFTDEP(snapvp)) softdep_prealloc(snapvp, MNT_WAIT); td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * or unlinked snapshots to be completely unallocated. */ dip = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (clearmode || cancelip->i_effnlink == 0) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(dip->di_db, UFS_NDADDR * sizeof(ufs1_daddr_t)); bzero(dip->di_ib, UFS_NIADDR * sizeof(ufs1_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], &cancelip->i_din1->di_db[UFS_NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], &cancelip->i_din1->di_ib[UFS_NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -UFS_NDADDR; len = numblks - UFS_NDADDR; rlbn = UFS_NDADDR; for (i = 0; len > 0 && i < UFS_NIADDR; i++) { error = indiracct_ufs1(snapvp, ITOV(cancelip), i, cancelip->i_din1->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs1(struct vnode *snapvp, struct vnode *cancelvp, int level, ufs1_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks, ufs_lbn_t blksperindir, struct fs *fs, int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int expungetype) { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[UFS_NIADDR + 2]; ufs1_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs1: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs1: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: free(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, struct fs *fs, ufs_lbn_t lblkno, int exptype) /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, struct fs *fs, ufs_lbn_t lblkno, int expungetype) /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs1_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < UFS_NDADDR) { blkp = &ip->i_din1->di_db[lbn]; UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); } else { error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs1_daddr_t *)(ibp->b_data)) [(lbn - UFS_NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= UFS_NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs1: bad block"); *blkp = expungetype; if (lbn >= UFS_NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, struct fs *fs, ufs_lbn_t lblkno, int expungetype) { ufs1_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, vp->v_type, NULL, SINGLETON_KEY); } return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs2(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int expungetype, int clearmode) { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs2_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < UFS_NDADDR) { blkno = VTOI(snapvp)->i_din2->di_db[lbn]; } else { if (DOINGSOFTDEP(snapvp)) softdep_prealloc(snapvp, MNT_WAIT); td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(dip->di_db, UFS_NDADDR * sizeof(ufs2_daddr_t)); bzero(dip->di_ib, UFS_NIADDR * sizeof(ufs2_daddr_t)); if (clearmode || cancelip->i_effnlink == 0) dip->di_mode = 0; else ffs_update_dinode_ckhash(fs, dip); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], &cancelip->i_din2->di_db[UFS_NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], &cancelip->i_din2->di_ib[UFS_NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -UFS_NDADDR; len = numblks - UFS_NDADDR; rlbn = UFS_NDADDR; for (i = 0; len > 0 && i < UFS_NIADDR; i++) { error = indiracct_ufs2(snapvp, ITOV(cancelip), i, cancelip->i_din2->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs2(struct vnode *snapvp, struct vnode *cancelvp, int level, ufs2_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks, ufs_lbn_t blksperindir, struct fs *fs, int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int expungetype) { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[UFS_NIADDR + 2]; ufs2_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs2: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs2: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & B_CACHE) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: free(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, struct fs *fs, ufs_lbn_t lblkno, int exptype) /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, struct fs *fs, ufs_lbn_t lblkno, int expungetype) /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs2_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < UFS_NDADDR) { blkp = &ip->i_din2->di_db[lbn]; UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); } else { error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs2_daddr_t *)(ibp->b_data)) [(lbn - UFS_NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= UFS_NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs2: bad block"); *blkp = expungetype; if (lbn >= UFS_NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, struct fs *fs, ufs_lbn_t lblkno, int expungetype) { ufs2_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP && lblkno >= UFS_NDADDR) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, vp->v_type, NULL, SINGLETON_KEY); } return (0); } /* * Decrement extra reference on snapshot when last name is removed. * It will not be freed until the last open reference goes away. */ void ffs_snapgone(struct inode *ip) { struct inode *xp; struct fs *fs; int snaploc; struct snapdata *sn; struct ufsmount *ump; /* * Find snapshot in incore list. */ xp = NULL; sn = ITODEVVP(ip)->v_rdev->si_snapdata; if (sn != NULL) TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) if (xp == ip) break; if (xp != NULL) vrele(ITOV(ip)); #ifdef DIAGNOSTIC else if (snapdebug) printf("ffs_snapgone: lost snapshot vnode %ju\n", (uintmax_t)ip->i_number); #endif /* * Delete snapshot inode from superblock. Keep list dense. */ ump = ITOUMP(ip); fs = ump->um_fs; UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) break; if (snaploc < FSMAXSNAP) { for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; } fs->fs_snapinum[snaploc - 1] = 0; } UFS_UNLOCK(ump); } /* * Prepare a snapshot file for being removed. */ void ffs_snapremove(struct vnode *vp) { struct inode *ip; struct vnode *devvp; struct buf *ibp; struct fs *fs; ufs2_daddr_t numblks, blkno, dblk; int error, last, loc; struct snapdata *sn; ip = VTOI(vp); fs = ITOFS(ip); devvp = ITODEVVP(ip); /* * If active, delete from incore list (this snapshot may * already have been in the process of being deleted, so * would not have been active). * * Clear copy-on-write flag if last snapshot. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) { sn = devvp->v_rdev->si_snapdata; TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap); ip->i_nextsnap.tqe_prev = 0; revert_snaplock(vp, devvp, sn); try_free_snapdata(devvp); } VI_UNLOCK(devvp); /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). */ for (blkno = 1; blkno < UFS_NDADDR; blkno++) { dblk = DIP(ip, i_db[blkno]); if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) DIP_SET(ip, i_db[blkno], 0); else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - btodb(fs->fs_bsize)); DIP_SET(ip, i_db[blkno], 0); } } numblks = howmany(ip->i_size, fs->fs_bsize); for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) continue; if (fs->fs_size - blkno > NINDIR(fs)) last = NINDIR(fs); else last = fs->fs_size - blkno; for (loc = 0; loc < last; loc++) { if (I_IS_UFS1(ip)) { dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { ip->i_din1->di_blocks -= btodb(fs->fs_bsize); ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; } continue; } dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { ip->i_din2->di_blocks -= btodb(fs->fs_bsize); ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; } } bawrite(ibp); } /* * Clear snapshot flag and drop reference. */ ip->i_flags &= ~SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); /* * The dirtied indirects must be written out before * softdep_setup_freeblocks() is called. Otherwise indir_trunc() * may find indirect pointers using the magic BLK_* values. */ if (DOINGSOFTDEP(vp)) ffs_syncvnode(vp, MNT_WAIT, 0); #ifdef QUOTA /* * Reenable disk quotas for ex-snapshot file. */ if (!getinoquota(ip)) (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE); #endif } /* * Notification that a block is being freed. Return zero if the free * should be allowed to proceed. Return non-zero if the snapshot file * wants to claim the block. The block will be claimed if it is an * uncopied part of one of the snapshots. It will be freed if it is * either a BLK_NOCOPY or has already been copied in all of the snapshots. * If a fragment is being freed, then all snapshots that care about * it must make a copy since a snapshot file can only claim full sized * blocks. Note that if more than one snapshot file maps the block, * we can pick one at random to claim it. Since none of the snapshots * can change, we are assurred that they will all see the same unmodified * image. When deleting a snapshot file (see ffs_snapremove above), we * must push any of these claimed blocks to one of the other snapshots * that maps it. These claimed blocks are easily identified as they will * have a block number equal to their logical block number within the * snapshot. A copied block can never have this property because they * must always have been allocated from a BLK_NOCOPY location. */ int ffs_snapblkfree(struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno, long size, ino_t inum, __enum_uint8(vtype) vtype, struct workhead *wkhd) { struct buf *ibp, *cbp, *savedcbp = NULL; struct thread *td = curthread; struct inode *ip; struct vnode *vp = NULL; ufs_lbn_t lbn; ufs2_daddr_t blkno; int indiroff = 0, error = 0, claimedblk = 0; struct snapdata *sn; lbn = fragstoblks(fs, bno); retry: VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return (0); } /* * Use LK_SLEEPFAIL because sn might be freed under us while * both devvp interlock and snaplk are not owned. */ if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) != 0) goto retry; TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * Lookup block being written. */ if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; } /* * Check to see if block needs to be copied. */ if (blkno == 0) { /* * A block that we map is being freed. If it has not * been claimed yet, we will claim or copy it (below). */ claimedblk = 1; } else if (blkno == BLK_SNAP) { /* * No previous snapshot claimed the block, * so it will be freed and become a BLK_NOCOPY * (don't care) for us. */ if (claimedblk) panic("snapblkfree: inconsistent block type"); if (lbn < UFS_NDADDR) { DIP_SET(ip, i_db[lbn], BLK_NOCOPY); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); } else if (I_IS_UFS1(ip)) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } continue; } else /* BLK_NOCOPY or default */ { /* * If the snapshot has already copied the block * (default), or does not care about the block, * it is not needed. */ if (lbn >= UFS_NDADDR) bqrelse(ibp); continue; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block. */ if (size == fs->fs_bsize) { #ifdef DIAGNOSTIC if (snapdebug) printf("%s %ju lbn %jd from inum %ju\n", "Grabonremove: snapino", (uintmax_t)ip->i_number, (intmax_t)lbn, (uintmax_t)inum); #endif /* * If journaling is tracking this write we must add * the work to the inode or indirect being written. */ if (wkhd != NULL) { if (lbn < UFS_NDADDR) softdep_inode_append(ip, curthread->td_ucred, wkhd); else softdep_buf_append(ibp, wkhd); } if (lbn < UFS_NDADDR) { DIP_SET(ip, i_db[lbn], bno); } else if (I_IS_UFS1(ip)) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size)); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); lockmgr(vp->v_vnlock, LK_RELEASE, NULL); return (1); } if (lbn >= UFS_NDADDR) bqrelse(ibp); /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. */ td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DIAGNOSTIC if (snapdebug) printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n", "Copyonremove: snapino ", (uintmax_t)ip->i_number, (intmax_t)lbn, "for inum", (uintmax_t)inum, size, (intmax_t)cbp->b_blkno); #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. At a minimum we ensure the * integrity of the filesystem metadata, but use the * dopersistence sysctl-setable flag to decide on the * persistence needed for file content data. */ if (savedcbp != NULL) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((vtype == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((vtype == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. At a minimum we * ensure the integrity of the filesystem metadata, but * use the dopersistence sysctl-setable flag to decide on * the persistence needed for file content data. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if ((vtype == VDIR || dopersistence) && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); } /* * If we have been unable to allocate a block in which to do * the copy, then return non-zero so that the fragment will * not be freed. Although space will be lost, the snapshot * will stay consistent. */ if (error != 0 && wkhd != NULL) softdep_freework(wkhd); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); return (error); } /* * Associate snapshot files when mounting. */ void ffs_snapshot_mount(struct mount *mp) { struct ufsmount *ump = VFSTOUFS(mp); struct vnode *devvp = ump->um_devvp; struct fs *fs = ump->um_fs; struct thread *td = curthread; struct snapdata *sn; struct vnode *vp; struct vnode *lastvp; struct inode *ip; struct uio auio; struct iovec aiov; void *snapblklist; char *reason; daddr_t snaplistsize; int error, snaploc, loc; /* * XXX The following needs to be set before ffs_truncate or * VOP_READ can be called. */ mp->mnt_stat.f_iosize = fs->fs_bsize; /* * Process each snapshot listed in the superblock. */ vp = NULL; lastvp = NULL; sn = NULL; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc], LK_EXCLUSIVE, &vp)) != 0){ printf("ffs_snapshot_mount: vget failed %d\n", error); continue; } ip = VTOI(vp); if (vp->v_type != VREG) { reason = "non-file snapshot"; } else if (!IS_SNAPSHOT(ip)) { reason = "non-snapshot"; } else if (ip->i_size == lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { reason = "old format snapshot"; (void)ffs_truncate(vp, (off_t)0, 0, NOCRED); (void)ffs_syncvnode(vp, MNT_WAIT, 0); } else { reason = NULL; } if (reason != NULL) { printf("ffs_snapshot_mount: %s inode %d\n", reason, fs->fs_snapinum[snaploc]); vput(vp); vp = NULL; for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { if (fs->fs_snapinum[loc] == 0) break; fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; } fs->fs_snapinum[loc - 1] = 0; snaploc--; continue; } /* * Acquire a lock on the snapdata structure, creating it if * necessary. */ sn = ffs_snapdata_acquire(devvp); /* * Change vnode to use shared snapshot lock instead of the * original private lock. */ vp->v_vnlock = &sn->sn_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); /* * Link it onto the active snapshot list. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot_mount: %ju already on list", (uintmax_t)ip->i_number); else TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); vp->v_vflag |= VV_SYSTEM; VI_UNLOCK(devvp); VOP_UNLOCK(vp); lastvp = vp; } vp = lastvp; /* * No usable snapshots found. */ if (sn == NULL || vp == NULL) return; /* * Allocate the space for the block hints list. We always want to * use the list from the newest snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)&snaplistsize; aiov.iov_len = sizeof(snaplistsize); auio.uio_resid = aiov.iov_len; auio.uio_offset = lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_1 failed %d\n", error); VOP_UNLOCK(vp); return; } snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); auio.uio_iovcnt = 1; aiov.iov_base = snapblklist; aiov.iov_len = snaplistsize * sizeof (daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset -= sizeof(snaplistsize); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_2 failed %d\n", error); VOP_UNLOCK(vp); free(snapblklist, M_UFSMNT); return; } VOP_UNLOCK(vp); VI_LOCK(devvp); sn->sn_listsize = snaplistsize; sn->sn_blklist = (daddr_t *)snapblklist; devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); } /* * Disassociate snapshot files when unmounting. */ void ffs_snapshot_unmount(struct mount *mp) { struct vnode *devvp = VFSTOUFS(mp)->um_devvp; struct snapdata *sn; struct inode *xp; struct vnode *vp; VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) { vp = ITOV(xp); TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap); xp->i_nextsnap.tqe_prev = 0; lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE, VI_MTX(devvp)); VI_LOCK(devvp); revert_snaplock(vp, devvp, sn); lockmgr(&vp->v_lock, LK_RELEASE, NULL); if (xp->i_effnlink > 0) { VI_UNLOCK(devvp); vrele(vp); VI_LOCK(devvp); } sn = devvp->v_rdev->si_snapdata; } try_free_snapdata(devvp); VI_UNLOCK(devvp); } /* * Check the buffer block to be belong to device buffer that shall be * locked after snaplk. devvp shall be locked on entry, and will be * leaved locked upon exit. */ static int ffs_bp_snapblk(struct vnode *devvp, struct buf *bp) { struct snapdata *sn; struct fs *fs; ufs2_daddr_t lbn, *snapblklist; int lower, upper, mid; ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk"); KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp)); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) return (0); fs = ITOFS(TAILQ_FIRST(&sn->sn_head)); lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) return (1); return (0); } void ffs_bdflush(struct bufobj *bo, struct buf *bp) { struct thread *td; struct vnode *vp, *devvp; struct buf *nbp; int bp_bdskip; if (bo->bo_dirty.bv_cnt <= dirtybufthresh) return; td = curthread; vp = bp->b_vp; devvp = bo2vnode(bo); KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp)); VI_LOCK(devvp); bp_bdskip = ffs_bp_snapblk(devvp, bp); if (bp_bdskip) bdwriteskip++; VI_UNLOCK(devvp); if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) { (void) VOP_FSYNC(vp, MNT_NOWAIT, td); altbufferflushes++; } else { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* * Don't countdeps with the bo lock * held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (bp_bdskip) { VI_LOCK(devvp); if (!ffs_bp_snapblk(vp, nbp)) { VI_UNLOCK(devvp); BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } VI_UNLOCK(devvp); } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Check for need to copy block that is about to be written, * copying the block if necessary. */ int ffs_copyonwrite(struct vnode *devvp, struct buf *bp) { struct snapdata *sn; struct buf *ibp, *cbp, *savedcbp = NULL; struct thread *td = curthread; struct fs *fs; struct inode *ip; struct vnode *vp = NULL; ufs2_daddr_t lbn, blkno, *snapblklist; int lower, upper, mid, indiroff, error = 0; int launched_async_io, prev_norunningbuf; long saved_runningbufspace; if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp))) return (0); /* Update on a snapshot file */ if (td->td_pflags & TDP_COWINPROGRESS) panic("ffs_copyonwrite: recursive call"); /* * First check to see if it is in the preallocated list. * By doing this check we avoid several potential deadlocks. */ VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); return (0); /* No snapshot */ } ip = TAILQ_FIRST(&sn->sn_head); fs = ITOFS(ip); lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); if (lbn < UFS_NDADDR) { VI_UNLOCK(devvp); return (0); /* Direct blocks are always copied */ } snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) { VI_UNLOCK(devvp); return (0); } launched_async_io = 0; prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; /* * Since I/O on bp isn't yet in progress and it may be blocked * for a long time waiting on snaplk, back it out of * runningbufspace, possibly waking other threads waiting for space. */ saved_runningbufspace = bp->b_runningbufspace; if (saved_runningbufspace != 0) runningbufwakeup(bp); /* * Not in the precomputed list, so check the snapshots. */ while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) != 0) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); if (saved_runningbufspace != 0) { bp->b_runningbufspace = saved_runningbufspace; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } return (0); /* Snapshot gone */ } } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * We ensure that everything of our own that needs to be * copied will be done at the time that ffs_snapshot is * called. Thus we can skip the check here which can * deadlock in doing the lookup in UFS_BALLOC. */ if (bp->b_vp == vp) continue; /* * Check to see if block needs to be copied. We do not have * to hold the snapshot lock while doing this lookup as it * will never require any additional allocations for the * snapshot inode. */ if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; bqrelse(ibp); } #ifdef INVARIANTS if (blkno == BLK_SNAP && bp->b_lblkno >= 0) panic("ffs_copyonwrite: bad copy block"); #endif if (blkno != 0) continue; /* * Allocate the block into which to do the copy. Since * multiple processes may all try to copy the same block, * we have to recheck our need to do a copy if we sleep * waiting for the lock. * * Because all snapshots on a filesystem share a single * lock, we ensure that we will never be in competition * with another process to allocate a block. */ td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DIAGNOSTIC if (snapdebug) { printf("Copyonwrite: snapino %ju lbn %jd for ", (uintmax_t)ip->i_number, (intmax_t)lbn); if (bp->b_vp == devvp) printf("fs metadata"); else printf("inum %ju", (uintmax_t)VTOI(bp->b_vp)->i_number); printf(" lblkno %jd to blkno %jd\n", (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); } #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. At a minimum we ensure the * integrity of the filesystem metadata, but use the * dopersistence sysctl-setable flag to decide on the * persistence needed for file content data. */ if (savedcbp != NULL) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. At a minimum we * ensure the integrity of the filesystem metadata, but * use the dopersistence sysctl-setable flag to decide on * the persistence needed for file content data. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; } lockmgr(vp->v_vnlock, LK_RELEASE, NULL); td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | prev_norunningbuf; if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) waitrunningbufspace(); /* * I/O on bp will now be started, so count it in runningbufspace. */ if (saved_runningbufspace != 0) { bp->b_runningbufspace = saved_runningbufspace; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } return (error); } /* * sync snapshots to force freework records waiting on snapshots to claim * blocks to free. */ void ffs_sync_snap(struct mount *mp, int waitfor) { struct snapdata *sn; struct vnode *devvp; struct vnode *vp; struct inode *ip; devvp = VFSTOUFS(mp)->um_devvp; if ((devvp->v_vflag & VV_COPYONWRITE) == 0) return; for (;;) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return; } if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) == 0) break; } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); ffs_syncvnode(vp, waitfor, NO_INO_UPDT); } lockmgr(&sn->sn_lock, LK_RELEASE, NULL); } /* * Read the specified block into the given buffer. * Much of this boiler-plate comes from bwrite(). */ static int readblock(struct vnode *vp, struct buf *bp, ufs2_daddr_t lbn) { struct inode *ip; struct fs *fs; ip = VTOI(vp); fs = ITOFS(ip); bp->b_iocmd = BIO_READ; bp->b_iooffset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn))); bp->b_iodone = bdone; g_vfs_strategy(&ITODEVVP(ip)->v_bufobj, bp); bufwait(bp); return (bp->b_error); } #endif /* * Process file deletes that were deferred by ufs_inactive() due to * the file system being suspended. Transfer IN_LAZYACCESS into * IN_MODIFIED for vnodes that were accessed during suspension. */ void process_deferred_inactive(struct mount *mp) { struct vnode *vp, *mvp; struct inode *ip; int error; (void) vn_start_secondary_write(NULL, &mp, V_WAIT); loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * IN_LAZYACCESS is checked here without holding any * vnode lock, but this flag is set only while holding * vnode interlock. */ if (vp->v_type == VNON || ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 && ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) { VI_UNLOCK(vp); continue; } vholdl(vp); retry_vnode: error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); if (error != 0) { vdrop(vp); if (error == ENOENT) continue; /* vnode recycled */ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } ip = VTOI(vp); if ((ip->i_flag & IN_LAZYACCESS) != 0) { ip->i_flag &= ~IN_LAZYACCESS; UFS_INODE_SET_FLAG(ip, IN_MODIFIED); } VI_LOCK(vp); error = vinactive(vp); if (error == ERELOOKUP && vp->v_usecount == 0) { VI_UNLOCK(vp); VOP_UNLOCK(vp); goto retry_vnode; } VI_UNLOCK(vp); VOP_UNLOCK(vp); vdrop(vp); } vn_finished_secondary_write(mp); } #ifndef NO_FFS_SNAPSHOT static struct snapdata * ffs_snapdata_alloc(void) { struct snapdata *sn; /* * Fetch a snapdata from the free list if there is one available. */ mtx_lock(&snapfree_lock); sn = LIST_FIRST(&snapfree); if (sn != NULL) LIST_REMOVE(sn, sn_link); mtx_unlock(&snapfree_lock); if (sn != NULL) return (sn); /* * If there were no free snapdatas allocate one. */ sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&sn->sn_head); lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOSHARE); return (sn); } /* * The snapdata is never freed because we can not be certain that * there are no threads sleeping on the snap lock. Persisting * them permanently avoids costly synchronization in ffs_lock(). */ static void ffs_snapdata_free(struct snapdata *sn) { mtx_lock(&snapfree_lock); LIST_INSERT_HEAD(&snapfree, sn, sn_link); mtx_unlock(&snapfree_lock); } /* Try to free snapdata associated with devvp */ static void try_free_snapdata(struct vnode *devvp) { struct snapdata *sn; ufs2_daddr_t *snapblklist; ASSERT_VI_LOCKED(devvp, "try_free_snapdata"); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL || (devvp->v_vflag & VV_COPYONWRITE) == 0) return; devvp->v_rdev->si_snapdata = NULL; devvp->v_vflag &= ~VV_COPYONWRITE; lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); snapblklist = sn->sn_blklist; sn->sn_blklist = NULL; sn->sn_listsize = 0; lockmgr(&sn->sn_lock, LK_RELEASE, NULL); if (snapblklist != NULL) free(snapblklist, M_UFSMNT); ffs_snapdata_free(sn); VI_LOCK(devvp); } /* * Revert a vnode lock from using the snapshot lock back to its own lock. * * Aquire a lock on the vnode's own lock and release the lock on the * snapshot lock. If there are any recursions on the snapshot lock * get the same number of recursions on the vnode's own lock. */ static void revert_snaplock(struct vnode *vp, struct vnode *devvp, struct snapdata *sn) { int i; ASSERT_VI_LOCKED(devvp, "revert_snaplock"); /* * Avoid LOR with snapshot lock. The LK_NOWAIT should * never fail as the lock is currently unused. Rather than * panic, we recover by doing the blocking lock. */ for (i = 0; i <= sn->sn_lock.lk_recurse; i++) { if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, VI_MTX(devvp)) != 0) { printf("revert_snaplock: Unexpected LK_NOWAIT " "failure\n"); lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_INTERLOCK, VI_MTX(devvp)); } VI_LOCK(devvp); } KASSERT(vp->v_vnlock == &sn->sn_lock, ("revert_snaplock: lost lock mutation")); vp->v_vnlock = &vp->v_lock; while (sn->sn_lock.lk_recurse > 0) lockmgr(&sn->sn_lock, LK_RELEASE, NULL); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); } static struct snapdata * ffs_snapdata_acquire(struct vnode *devvp) { struct snapdata *nsn, *sn; int error; /* * Allocate a free snapdata. This is done before acquiring the * devvp lock to avoid allocation while the devvp interlock is * held. */ nsn = ffs_snapdata_alloc(); for (;;) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { /* * This is the first snapshot on this * filesystem and we use our pre-allocated * snapdata. Publish sn with the sn_lock * owned by us, to avoid the race. */ error = lockmgr(&nsn->sn_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL); if (error != 0) panic("leaked sn, lockmgr error %d", error); sn = devvp->v_rdev->si_snapdata = nsn; VI_UNLOCK(devvp); nsn = NULL; break; } /* * There is a snapshots which already exists on this * filesystem, grab a reference to the common lock. */ error = lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)); if (error == 0) break; } /* * Free any unused snapdata. */ if (nsn != NULL) ffs_snapdata_free(nsn); return (sn); } #endif