Index: head/sys/ufs/ffs/ffs_snapshot.c
===================================================================
--- head/sys/ufs/ffs/ffs_snapshot.c	(revision 168352)
+++ head/sys/ufs/ffs/ffs_snapshot.c	(revision 168353)
@@ -1,2520 +1,2520 @@
/*-
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_quota.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <geom/geom.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#define KERNCRED thread0.td_ucred
#define DEBUG 1

#include "opt_ffs.h"

#ifdef NO_FFS_SNAPSHOT
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	return (EINVAL);
}

int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;
	long size;
	ino_t inum;
{
	return (EINVAL);
}

void
ffs_snapremove(vp)
	struct vnode *vp;
{
}

void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
}

void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
}

void
ffs_snapgone(ip)
	struct inode *ip;
{
}

int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	return (EINVAL);
}
#else

TAILQ_HEAD(snaphead, inode);

struct snapdata {
	struct snaphead sn_head;
	daddr_t sn_listsize;
	daddr_t *sn_blklist;
	struct lock sn_lock;
};

static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
static void process_deferred_inactive(struct mount *);
static void try_free_snapdata(struct vnode *devvp, struct thread *td);
static int ffs_bp_snapblk(struct vnode *, struct buf *);

/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the following flag allows this
 * crash persistence to be disabled.
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
static int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */

/*
 * Create a snapshot file and initialize it for the filesystem.
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
	int error, cg, snaploc;
	int i, size, len, loc;
	int flag;
	struct timespec starttime = {0, 0}, endtime;
	char saved_nice = 0;
	long redo = 0, snaplistsize = 0;
	int32_t *lp;
	void *space;
	struct fs *copy_fs = NULL, *fs;
	struct thread *td = curthread;
	struct inode *ip, *xp;
	struct buf *bp, *nbp, *ibp, *sbp = NULL;
	struct nameidata nd;
	struct mount *wrtmp;
	struct vattr vat;
	struct vnode *vp, *xvp, *mvp, *devvp;
	struct uio auio;
	struct iovec aiov;
	struct snapdata *sn;
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	fs = ump->um_fs;
	sn = NULL;
	MNT_ILOCK(mp);
	flag = mp->mnt_flag;
	MNT_IUNLOCK(mp);

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	UFS_LOCK(ump);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	UFS_UNLOCK(ump);
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
*/ restart: NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { vput(nd.ni_vp); error = EEXIST; } if (nd.ni_dvp->v_mount != mp) error = EXDEV; if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } VATTR_NULL(&vat); vat.va_type = VREG; vat.va_mode = S_IRUSR; vat.va_vaflags |= VA_EXCLUSIVE; if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) wrtmp = NULL; if (wrtmp != mp) panic("ffs_snapshot: mount mismatch"); vfs_rel(wrtmp); if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &wrtmp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); VOP_UNLOCK(nd.ni_dvp, 0, td); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(wrtmp); vrele(nd.ni_dvp); return (error); } vp = nd.ni_vp; vp->v_vflag |= VV_SYSTEM; ip = VTOI(vp); devvp = ip->i_devvp; /* * Allocate and copy the last block contents so as to be able * to set size to that of the filesystem. */ numblks = howmany(fs->fs_size, fs->fs_frag); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) goto out; ip->i_size = lblktosize(fs, (off_t)numblks); DIP_SET(ip, i_size, ip->i_size); ip->i_flag |= IN_CHANGE | IN_UPDATE; error = readblock(vp, bp, numblks - 1); bawrite(bp); if (error != 0) goto out; /* * Preallocate critical data structures so that we can copy * them in without further allocation after we suspend all * operations on the filesystem. We would like to just release * the allocated buffers without writing them since they will * be filled in below once we are ready to go, but this upsets * the soft update code, so we go ahead and write the new buffers. * * Allocate all indirect blocks and mark all of them as not * needing to be copied. */ for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); if (error) goto out; bawrite(ibp); } /* * Allocate copies for the superblock and its summary information. */ error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate all cylinder group blocks. */ for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Copy all the cylinder group maps. Although the * filesystem is still active, we hope that only a few * cylinder groups will change between now and when we * suspend operations. Thus, we will be able to quickly * touch up the few cylinder groups that changed during * the suspension period. 
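	 *
	 * The bookkeeping behind this is the fs_active bitmap allocated
	 * just below, one bit per cylinder group: cgaccount() sets a
	 * group's bit once the group has been copied, the bit is cleared
	 * again when the group is modified, and the second pass made
	 * after suspension re-copies only groups whose bit reads clear.
	 * Schematically (an illustrative sketch of the test used below,
	 * not new code):
	 *
	 *	if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
	 *		continue;	// copy still valid, skip this cg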
 */
	len = howmany(fs->fs_ncg, NBBY);
	MALLOC(space, void *, len, M_DEVBUF, M_WAITOK|M_ZERO);
	UFS_LOCK(ump);
	fs->fs_active = space;
	UFS_UNLOCK(ump);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		error = cgaccount(cg, vp, nbp, 1);
		bawrite(nbp);
		if (error)
			goto out;
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP_SET(ip, i_flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
		goto out;
	VOP_UNLOCK(vp, 0, td);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Rescind nice scheduling while running with the filesystem suspended.
	 */
	if (td->td_proc->p_nice > 0) {
		PROC_LOCK(td->td_proc);
		mtx_lock_spin(&sched_lock);
		saved_nice = td->td_proc->p_nice;
		sched_nice(td->td_proc, 0);
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(td->td_proc);
	}
	/*
	 * Suspend operation on filesystem.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
			vn_start_write(NULL, &wrtmp, V_WAIT);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			goto out;
		}
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (ip->i_effnlink == 0) {
		error = ENOENT;		/* Snapshot file unlinked */
		goto out1;
	}
	if (collectsnapstats)
		nanotime(&starttime);
	/* The last block might have changed. Copy it again to be sure. */
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
	if (error != 0)
		goto out1;
	error = readblock(vp, bp, numblks - 1);
	bp->b_flags |= B_VALIDSUSPWRT;
	bawrite(bp);
	if (error != 0)
		goto out1;
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
			continue;
		redo++;
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		error = cgaccount(cg, vp, nbp, 2);
		bawrite(nbp);
		if (error)
			goto out1;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
	    KERNCRED, &sbp);
	if (error) {
		brelse(sbp);
		sbp = NULL;
		goto out1;
	}
	loc = blkoff(fs, fs->fs_sblockloc);
	copy_fs = (struct fs *)(sbp->b_data + loc);
	bcopy(fs, copy_fs, fs->fs_sbsize);
	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
		copy_fs->fs_clean = 1;
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ?
0 : i * fs->fs_fsize; if (len > 0) { if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), len, KERNCRED, &bp)) != 0) { brelse(bp); free(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); sbp = NULL; goto out1; } bcopy(bp->b_data, space, (u_int)len); space = (char *)space + len; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } if (fs->fs_contigsumsize > 0) { copy_fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } /* * We must check for active files that have been unlinked * (e.g., with a zero link count). We have to expunge all * trace of these files from the snapshot so that they are * not reclaimed prematurely by fsck or unnecessarily dumped. * We turn off the MNTK_SUSPENDED flag to avoid a panic from * spec_strategy about writing on a suspended filesystem. * Note that we skip unlinked snapshot files as they will * be handled separately below. * * We also calculate the needed size for the snapshot list. */ snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_SUSPENDED; loop: MNT_VNODE_FOREACH(xvp, mp, mvp) { VI_LOCK(xvp); MNT_IUNLOCK(mp); if ((xvp->v_iflag & VI_DOOMED) || (xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) || xvp->v_type == VNON || (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { VI_UNLOCK(xvp); MNT_ILOCK(mp); continue; } /* * We can skip parent directory vnode because it must have * this snapshot file in it. */ if (xvp == nd.ni_dvp) { VI_UNLOCK(xvp); MNT_ILOCK(mp); continue; } vholdl(xvp); if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) { MNT_ILOCK(mp); MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); vdrop(xvp); goto loop; } VI_LOCK(xvp); if (xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) { VI_UNLOCK(xvp); VOP_UNLOCK(xvp, 0, td); vdrop(xvp); MNT_ILOCK(mp); continue; } VI_UNLOCK(xvp); if (snapdebug) vprint("ffs_snapshot: busy vnode", xvp); if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 && vat.va_nlink > 0) { VOP_UNLOCK(xvp, 0, td); vdrop(xvp); MNT_ILOCK(mp); continue; } xp = VTOI(xvp); if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { VOP_UNLOCK(xvp, 0, td); vdrop(xvp); MNT_ILOCK(mp); continue; } /* * If there is a fragment, clear it here. */ blkno = 0; loc = howmany(xp->i_size, fs->fs_bsize) - 1; if (loc < NDADDR) { len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } } snaplistsize += 1; if (xp->i_ump->um_fstype == UFS1) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY); if (blkno) DIP_SET(xp, i_db[loc], blkno); if (!error) error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode); VOP_UNLOCK(xvp, 0, td); vdrop(xvp); if (error) { free(copy_fs->fs_csp, M_UFSMNT); bawrite(sbp); sbp = NULL; MNT_VNODE_FOREACH_ABORT(mp, mvp); goto out1; } MNT_ILOCK(mp); } MNT_IUNLOCK(mp); /* * If there already exist snapshots on this filesystem, grab a * reference to their shared lock. If this is the first snapshot * on this filesystem, we need to allocate a lock for the snapshots * to share. In either case, acquire the snapshot lock and give * up our original private lock. 
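	 *
	 * The swap itself is the three-step sequence excerpted from the
	 * code below: repoint v_vnlock at the shared lock, acquire it,
	 * then release the now-idle private lock:
	 *
	 *	vp->v_vnlock = &sn->sn_lock;
	 *	lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
	 *	    VI_MTX(vp), td);
	 *	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);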
 */
	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	if (sn != NULL) {
		xp = TAILQ_FIRST(&sn->sn_head);
		VI_UNLOCK(devvp);
		VI_LOCK(vp);
		vp->v_vnlock = &sn->sn_lock;
	} else {
		VI_UNLOCK(devvp);
		sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
		TAILQ_INIT(&sn->sn_head);
		lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
		    LK_CANRECURSE | LK_NOSHARE);
		VI_LOCK(vp);
		vp->v_vnlock = &sn->sn_lock;
		mp_fixme("si_snapdata setting is racey.");
		devvp->v_rdev->si_snapdata = sn;
		xp = NULL;
	}
	lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
	    VI_MTX(vp), td);
	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
	/*
	 * If this is the first snapshot on this filesystem, then we need
	 * to allocate the space for the list of preallocated snapshot blocks.
	 * This list will be refined below, but this preliminary one will
	 * keep us out of deadlock until the full one is ready.
	 */
	if (xp == NULL) {
		MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
		    M_UFSMNT, M_WAITOK);
		blkp = &snapblklist[1];
		*blkp++ = lblkno(fs, fs->fs_sblockloc);
		blkno = fragstoblks(fs, fs->fs_csaddr);
		for (cg = 0; cg < fs->fs_ncg; cg++) {
			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
				break;
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		}
		len = howmany(fs->fs_cssize, fs->fs_bsize);
		for (loc = 0; loc < len; loc++)
			*blkp++ = blkno + loc;
		for (; cg < fs->fs_ncg; cg++)
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		snapblklist[0] = blkp - snapblklist;
		VI_LOCK(devvp);
		if (sn->sn_blklist != NULL)
			panic("ffs_snapshot: non-empty list");
		sn->sn_blklist = snapblklist;
		sn->sn_listsize = blkp - snapblklist;
		VI_UNLOCK(devvp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	VI_LOCK(devvp);
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
out1:
	KASSERT((sn != NULL && sbp != NULL && error == 0) ||
		(sn == NULL && sbp == NULL && error != 0),
		("email phk@ and mckusick@"));
	/*
	 * Resume operation on filesystem.
	 */
	vfs_write_resume(vp->v_mount);
	vn_start_write(NULL, &wrtmp, V_WAIT);
	if (collectsnapstats && starttime.tv_sec > 0) {
		nanotime(&endtime);
		timespecsub(&endtime, &starttime);
		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
	}
	if (sbp == NULL)
		goto out;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
		if (xp == ip)
			break;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
			    BLK_SNAP);
		else
			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
			    BLK_SNAP);
		if (error == 0 && xp->i_effnlink == 0) {
			error = ffs_freefile(ump, copy_fs, vp,
			    xp->i_number, xp->i_mode);
		}
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snapblklist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	if (ip->i_ump->um_fstype == UFS1)
		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
	else
		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	if (snaplistsize < ip->i_snapblklist - snapblklist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snapblklist;
	snapblklist[0] = snaplistsize;
	ip->i_snapblklist = 0;
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)snapblklist;
	aiov.iov_len = snaplistsize * sizeof(daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset = ip->i_size;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
	/*
	 * As this is the newest list, it is the most inclusive, so
	 * should replace the previous list.
	 */
	VI_LOCK(devvp);
	space = sn->sn_blklist;
	sn->sn_blklist = snapblklist;
	sn->sn_listsize = snaplistsize;
	VI_UNLOCK(devvp);
	if (space != NULL)
		FREE(space, M_UFSMNT);
	/*
	 * If another process is currently writing the buffer containing
	 * the inode for this snapshot then a deadlock can occur. Drop
	 * the snapshot lock until the buffer has been written.
	 */
	VREF(vp);	/* Protect against ffs_snapgone() */
	VOP_UNLOCK(vp, 0, td);
	(void) bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	    (int) fs->fs_bsize, NOCRED, &nbp);
	brelse(nbp);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (ip->i_effnlink == 0)
		error = ENOENT;		/* Snapshot file unlinked */
	else
		vrele(vp);		/* Drop extra reference */
done:
	FREE(copy_fs->fs_csp, M_UFSMNT);
	bawrite(sbp);
out:
	if (saved_nice > 0) {
		PROC_LOCK(td->td_proc);
		mtx_lock_spin(&sched_lock);
		sched_nice(td->td_proc, saved_nice);
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(td->td_proc);
	}
	UFS_LOCK(ump);
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	UFS_UNLOCK(ump);
	MNT_ILOCK(mp);
	mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
	MNT_IUNLOCK(mp);
	if (error)
		(void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
	(void) ffs_syncvnode(vp, MNT_WAIT);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, td);
	vrele(nd.ni_dvp);
	vn_finished_write(wrtmp);
	process_deferred_inactive(mp);
	return (error);
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is 1, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
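 *
 * Both call sites are in ffs_snapshot() above; schematically:
 *
 *	cgaccount(cg, vp, nbp, 1);	// every cg, filesystem still live
 *	...				// writes suspended
 *	cgaccount(cg, vp, nbp, 2);	// only cgs dirtied in the interim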
*/ static int cgaccount(cg, vp, nbp, passno) int cg; struct vnode *vp; struct buf *nbp; int passno; { struct buf *bp, *ibp; struct inode *ip; struct cg *cgp; struct fs *fs; ufs2_daddr_t base, numblks; int error, len, loc, indiroff; ip = VTOI(vp); fs = ip->i_fs; error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, KERNCRED, &bp); if (error) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return (EIO); } UFS_LOCK(ip->i_ump); ACTIVESET(fs, cg); UFS_UNLOCK(ip->i_ump); bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], fs->fs_bsize - fs->fs_cgsize); cgp = (struct cg *)nbp->b_data; bqrelse(bp); if (passno == 2) nbp->b_flags |= B_VALIDSUSPWRT; numblks = howmany(fs->fs_size, fs->fs_frag); len = howmany(fs->fs_fpg, fs->fs_frag); base = cgbase(fs, cg) / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; if (base < NDADDR) { for ( ; loc < NDADDR; loc++) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) DIP_SET(ip, i_db[loc], BLK_NOCOPY); else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) DIP_SET(ip, i_db[loc], 0); else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) panic("ffs_snapshot: lost direct block"); } } error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { return (error); } indiroff = (base + loc - NDADDR) % NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= NINDIR(fs)) { if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bawrite(ibp); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { return (error); } indiroff = 0; } if (ip->i_ump->um_fstype == UFS1) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); continue; } if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); } if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs1_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. 
*/ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < NDADDR) { blkno = VTOI(snapvp)->i_din1->di_db[lbn]; } else { td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * or unlinked snapshots to be completely unallocated. */ dip = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -NDADDR; len = numblks - NDADDR; rlbn = NDADDR; for (i = 0; len > 0 && i < NIADDR; i++) { error = indiracct_ufs1(snapvp, ITOV(cancelip), i, cancelip->i_din1->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs1_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[NIADDR + 2]; ufs1_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs1: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs1: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. 
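	 *
	 * Only pointers still backed by file blocks are live. A worked
	 * example (assuming an 8K-block UFS1 volume, where NINDIR(fs) =
	 * 8192 / sizeof(ufs1_daddr_t) = 2048): at the second indirect
	 * level each pointer spans blksperindir = 2048 data blocks, so
	 * with remblks = 5000 blocks left only
	 *
	 *	last = howmany(5000, 2048) = 3
	 *
	 * pointers need accounting; last is then clamped to NINDIR(fs).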
*/ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: FREE(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs1_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < NDADDR) { blkp = &ip->i_din1->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs1_daddr_t *)(ibp->b_data)) [(lbn - NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs1: bad block"); *blkp = expungetype; if (lbn >= NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs1_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); } return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. 
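 *
 * Within a snapshot's block map the pointer values act as a small
 * state machine; summarizing the conventions used throughout this
 * file:
 *
 *	0          - not yet copied: copy-on-write must intervene
 *	BLK_NOCOPY - free at snapshot time, never needs to be copied
 *	BLK_SNAP   - claimed in place by a snapshot; freeing it passes
 *	             the claim along (see ffs_snapblkfree below)
 *	other      - disk address of an already-copied block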
*/ static int expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs2_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < NDADDR) { blkno = VTOI(snapvp)->i_din2->di_db[lbn]; } else { td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (expungetype == BLK_NOCOPY) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -NDADDR; len = numblks - NDADDR; rlbn = NDADDR; for (i = 0; len > 0 && i < NIADDR; i++) { error = indiracct_ufs2(snapvp, ITOV(cancelip), i, cancelip->i_din2->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs2_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[NIADDR + 2]; ufs2_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs2: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs2: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. 
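	 *
	 * "Expanding" bread amounts to the getblk/readblock pair just
	 * below: the buffer is obtained with its disk address filled in
	 * by hand, so no block-map translation (and thus no snapshot
	 * lock) is needed:
	 *
	 *	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	 *	bp->b_blkno = fsbtodb(fs, blkno);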
*/ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: FREE(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs2_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < NDADDR) { blkp = &ip->i_din2->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs2_daddr_t *)(ibp->b_data)) [(lbn - NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs2: bad block"); *blkp = expungetype; if (lbn >= NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs2_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); } return (0); } /* * Decrement extra reference on snapshot when last name is removed. 
* It will not be freed until the last open reference goes away. */ void ffs_snapgone(ip) struct inode *ip; { struct inode *xp; struct fs *fs; int snaploc; struct snapdata *sn; struct ufsmount *ump; /* * Find snapshot in incore list. */ xp = NULL; sn = ip->i_devvp->v_rdev->si_snapdata; if (sn != NULL) TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) if (xp == ip) break; if (xp != NULL) vrele(ITOV(ip)); else if (snapdebug) printf("ffs_snapgone: lost snapshot vnode %d\n", ip->i_number); /* * Delete snapshot inode from superblock. Keep list dense. */ fs = ip->i_fs; ump = ip->i_ump; UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) break; if (snaploc < FSMAXSNAP) { for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; } fs->fs_snapinum[snaploc - 1] = 0; } UFS_UNLOCK(ump); } /* * Prepare a snapshot file for being removed. */ void ffs_snapremove(vp) struct vnode *vp; { struct inode *ip; struct vnode *devvp; struct buf *ibp; struct fs *fs; struct thread *td = curthread; ufs2_daddr_t numblks, blkno, dblk; int error, loc, last; struct snapdata *sn; ip = VTOI(vp); fs = ip->i_fs; devvp = ip->i_devvp; /* * If active, delete from incore list (this snapshot may * already have been in the process of being deleted, so * would not have been active). * * Clear copy-on-write flag if last snapshot. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) { sn = devvp->v_rdev->si_snapdata; TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap); ip->i_nextsnap.tqe_prev = 0; VI_UNLOCK(devvp); lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td); VI_LOCK(vp); KASSERT(vp->v_vnlock == &sn->sn_lock, ("ffs_snapremove: lost lock mutation")); vp->v_vnlock = &vp->v_lock; VI_UNLOCK(vp); VI_LOCK(devvp); lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td); try_free_snapdata(devvp, td); } else VI_UNLOCK(devvp); /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). 
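	 *
	 * A block claimed by this snapshot is recognizable because its
	 * disk address equals its own logical block number, which is the
	 * test the loops below apply to every pointer:
	 *
	 *	if (dblk == blkstofrags(fs, blkno) &&
	 *	    ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
	 *	    ip->i_number))
	 *		...	// another snapshot took it; zero our pointer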
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == 0)
			continue;
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP_SET(ip, i_db[blkno], 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
			    btodb(fs->fs_bsize));
			DIP_SET(ip, i_db[blkno], 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == 0)
					continue;
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
				     fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == 0)
				continue;
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_SET(ip, i_flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
#ifdef QUOTA
	/*
	 * Reenable disk quotas for ex-snapshot file.
	 */
	if (!getinoquota(ip))
		(void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
#endif
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
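 *
 * For a caller such as ffs_blkfree(), the resulting contract is
 * (a sketch, not a verbatim excerpt):
 *
 *	if (ffs_snapblkfree(fs, devvp, bno, size, inum))
 *		return;		// a snapshot claimed bno; do not free it
 *	...			// otherwise release bno to the bitmaps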
*/ int ffs_snapblkfree(fs, devvp, bno, size, inum) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; { struct buf *ibp, *cbp, *savedcbp = 0; struct thread *td = curthread; struct inode *ip; struct vnode *vp = NULL; ufs_lbn_t lbn; ufs2_daddr_t blkno; int indiroff = 0, error = 0, claimedblk = 0; struct snapdata *sn; lbn = fragstoblks(fs, bno); retry: VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return (0); } if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp), td) != 0) goto retry; TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); /* * Lookup block being written. */ if (lbn < NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - NDADDR) % NINDIR(fs); if (ip->i_ump->um_fstype == UFS1) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; } /* * Check to see if block needs to be copied. */ if (blkno == 0) { /* * A block that we map is being freed. If it has not * been claimed yet, we will claim or copy it (below). */ claimedblk = 1; } else if (blkno == BLK_SNAP) { /* * No previous snapshot claimed the block, * so it will be freed and become a BLK_NOCOPY * (don't care) for us. */ if (claimedblk) panic("snapblkfree: inconsistent block type"); if (lbn < NDADDR) { DIP_SET(ip, i_db[lbn], BLK_NOCOPY); ip->i_flag |= IN_CHANGE | IN_UPDATE; } else if (ip->i_ump->um_fstype == UFS1) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } continue; } else /* BLK_NOCOPY or default */ { /* * If the snapshot has already copied the block * (default), or does not care about the block, * it is not needed. */ if (lbn >= NDADDR) bqrelse(ibp); continue; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block. */ if (size == fs->fs_bsize) { #ifdef DEBUG if (snapdebug) printf("%s %d lbn %jd from inum %d\n", "Grabonremove: snapino", ip->i_number, (intmax_t)lbn, inum); #endif if (lbn < NDADDR) { DIP_SET(ip, i_db[lbn], bno); } else if (ip->i_ump->um_fstype == UFS1) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size)); ip->i_flag |= IN_CHANGE | IN_UPDATE; lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td); return (1); } if (lbn >= NDADDR) bqrelse(ibp); /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. */ td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n", "Copyonremove: snapino ", ip->i_number, (intmax_t)lbn, "for inum", inum, size, (intmax_t)cbp->b_blkno); #endif /* * If we have already read the old block contents, then * simply copy them to the new block. 
Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if (dopersistence && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); } /* * If we have been unable to allocate a block in which to do * the copy, then return non-zero so that the fragment will * not be freed. Although space will be lost, the snapshot * will stay consistent. */ lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td); return (error); } /* * Associate snapshot files when mounting. */ void ffs_snapshot_mount(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct vnode *devvp = ump->um_devvp; struct fs *fs = ump->um_fs; struct thread *td = curthread; struct snapdata *sn; struct vnode *vp; struct vnode *lastvp; struct inode *ip; struct uio auio; struct iovec aiov; void *snapblklist; char *reason; daddr_t snaplistsize; int error, snaploc, loc; /* * XXX The following needs to be set before ffs_truncate or * VOP_READ can be called. */ mp->mnt_stat.f_iosize = fs->fs_bsize; /* * Process each snapshot listed in the superblock. */ vp = NULL; lastvp = NULL; sn = devvp->v_rdev->si_snapdata; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc], LK_EXCLUSIVE, &vp)) != 0){ printf("ffs_snapshot_mount: vget failed %d\n", error); continue; } ip = VTOI(vp); if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size == lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { if ((ip->i_flags & SF_SNAPSHOT) == 0) { reason = "non-snapshot"; } else { reason = "old format snapshot"; (void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td); (void)ffs_syncvnode(vp, MNT_WAIT); } printf("ffs_snapshot_mount: %s inode %d\n", reason, fs->fs_snapinum[snaploc]); vput(vp); vp = NULL; for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { if (fs->fs_snapinum[loc] == 0) break; fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; } fs->fs_snapinum[loc - 1] = 0; snaploc--; continue; } /* * If there already exist snapshots on this filesystem, grab a * reference to their shared lock. If this is the first snapshot * on this filesystem, we need to allocate a lock for the * snapshots to share. In either case, acquire the snapshot * lock and give up our original private lock. */ VI_LOCK(devvp); if (sn != NULL) { VI_UNLOCK(devvp); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; } else { VI_UNLOCK(devvp); sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&sn->sn_head); lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOSHARE); VI_LOCK(vp); vp->v_vnlock = &sn->sn_lock; devvp->v_rdev->si_snapdata = sn; } lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(vp), td); lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); /* * Link it onto the active snapshot list. 
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0, td);
		lastvp = vp;
	}
	vp = lastvp;
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset = lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		return;
	}
	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof (daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0, td);
	VI_LOCK(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
	sn->sn_listsize = snaplistsize;
	sn->sn_blklist = (daddr_t *)snapblklist;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snapdata *sn;
	struct inode *xp;
	struct vnode *vp;
	struct thread *td = curthread;

	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
		vp = ITOV(xp);
		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp), td);
		VI_LOCK(vp);
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(vp), td);
		VI_LOCK(vp);
		KASSERT(vp->v_vnlock == &sn->sn_lock,
		    ("ffs_snapshot_unmount: lost lock mutation"));
		vp->v_vnlock = &vp->v_lock;
		VI_UNLOCK(vp);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
		lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
		if (xp->i_effnlink > 0)
			vrele(vp);
		VI_LOCK(devvp);
		sn = devvp->v_rdev->si_snapdata;
	}
	try_free_snapdata(devvp, td);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
}

/*
 * Check whether the buffer's block belongs to a device buffer that must
 * be locked after snaplk. devvp must be locked on entry and is left
 * locked on exit.
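 *
 * The lookup is a binary search over the sorted preallocated block
 * list; element 0 of sn_blklist holds the list length, which is why
 * the search below runs over indices [1, sn_listsize - 1]:
 *
 *	lower = 1;
 *	upper = sn->sn_listsize - 1;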
*/ static int ffs_bp_snapblk(devvp, bp) struct vnode *devvp; struct buf *bp; { struct snapdata *sn; struct fs *fs; ufs2_daddr_t lbn, *snapblklist; int lower, upper, mid; ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk"); KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp)); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) return (0); fs = TAILQ_FIRST(&sn->sn_head)->i_fs; lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) return (1); return (0); } void ffs_bdflush(bo, bp) struct bufobj *bo; struct buf *bp; { struct thread *td; struct vnode *vp, *devvp; struct buf *nbp; int bp_bdskip; if (bo->bo_dirty.bv_cnt <= dirtybufthresh) return; td = curthread; vp = bp->b_vp; devvp = bo->__bo_vnode; KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp)); VI_LOCK(devvp); bp_bdskip = ffs_bp_snapblk(devvp, bp); if (bp_bdskip) bdwriteskip++; VI_UNLOCK(devvp); if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) { (void) VOP_FSYNC(vp, MNT_NOWAIT, td); altbufferflushes++; } else { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* * Don't countdeps with the bo lock * held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (bp_bdskip) { VI_LOCK(devvp); if (!ffs_bp_snapblk(vp, nbp)) { if (BO_MTX(bo) != VI_MTX(vp)) { VI_UNLOCK(devvp); BO_LOCK(bo); } BUF_UNLOCK(nbp); continue; } VI_UNLOCK(devvp); } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Check for need to copy block that is about to be written, * copying the block if necessary. */ int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { struct snapdata *sn; struct buf *ibp, *cbp, *savedcbp = 0; struct thread *td = curthread; struct fs *fs; struct inode *ip; struct vnode *vp = 0; ufs2_daddr_t lbn, blkno, *snapblklist; int lower, upper, mid, indiroff, error = 0; int launched_async_io, prev_norunningbuf; long saved_runningbufspace; if (devvp != bp->b_vp && (VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0) return (0); /* Update on a snapshot file */ if (td->td_pflags & TDP_COWINPROGRESS) panic("ffs_copyonwrite: recursive call"); /* * First check to see if it is in the preallocated list. * By doing this check we avoid several potential deadlocks. 
*/ VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || - TAILQ_FIRST(&sn->sn_head) == NULL) { + TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); return (0); /* No snapshot */ } ip = TAILQ_FIRST(&sn->sn_head); fs = ip->i_fs; lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) { VI_UNLOCK(devvp); return (0); } launched_async_io = 0; prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; /* * Since I/O on bp isn't yet in progress and it may be blocked * for a long time waiting on snaplk, back it out of * runningbufspace, possibly waking other threads waiting for space. */ saved_runningbufspace = bp->b_runningbufspace; if (saved_runningbufspace != 0) runningbufwakeup(bp); /* * Not in the precomputed list, so check the snapshots. */ while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp), td) != 0) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || - TAILQ_FIRST(&sn->sn_head) == NULL) { + TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); if (saved_runningbufspace != 0) { bp->b_runningbufspace = saved_runningbufspace; atomic_add_int(&runningbufspace, bp->b_runningbufspace); } return (0); /* Snapshot gone */ } } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); /* * We ensure that everything of our own that needs to be * copied will be done at the time that ffs_snapshot is * called. Thus we can skip the check here which can * deadlock in doing the lookup in UFS_BALLOC. */ if (bp->b_vp == vp) continue; /* * Check to see if block needs to be copied. We do not have * to hold the snapshot lock while doing this lookup as it * will never require any additional allocations for the * snapshot inode. */ if (lbn < NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - NDADDR) % NINDIR(fs); if (ip->i_ump->um_fstype == UFS1) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; bqrelse(ibp); } #ifdef DIAGNOSTIC if (blkno == BLK_SNAP && bp->b_lblkno >= 0) panic("ffs_copyonwrite: bad copy block"); #endif if (blkno != 0) continue; /* * Allocate the block into which to do the copy. Since * multiple processes may all try to copy the same block, * we have to recheck our need to do a copy if we sleep * waiting for the lock. * * Because all snapshots on a filesystem share a single * lock, we ensure that we will never be in competition * with another process to allocate a block. */ td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) { printf("Copyonwrite: snapino %d lbn %jd for ", ip->i_number, (intmax_t)lbn); if (bp->b_vp == devvp) printf("fs metadata"); else printf("inum %d", VTOI(bp->b_vp)->i_number); printf(" lblkno %jd to blkno %jd\n", (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); } #endif /* * If we have already read the old block contents, then * simply copy them to the new block. 
Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if (dopersistence && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if (dopersistence && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT); else launched_async_io = 1; } lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td); td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | prev_norunningbuf; if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) waitrunningbufspace(); /* * I/O on bp will now be started, so count it in runningbufspace. */ if (saved_runningbufspace != 0) { bp->b_runningbufspace = saved_runningbufspace; atomic_add_int(&runningbufspace, bp->b_runningbufspace); } return (error); } /* * Read the specified block into the given buffer. * Much of this boiler-plate comes from bwrite(). */ static int readblock(vp, bp, lbn) struct vnode *vp; struct buf *bp; ufs2_daddr_t lbn; { struct inode *ip = VTOI(vp); struct bio *bip; bip = g_alloc_bio(); bip->bio_cmd = BIO_READ; bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); bip->bio_data = bp->b_data; bip->bio_length = bp->b_bcount; bip->bio_done = NULL; g_io_request(bip, ip->i_devvp->v_bufobj.bo_private); bp->b_error = biowait(bip, "snaprdb"); g_destroy_bio(bip); return (bp->b_error); } /* * Process file deletes that were deferred by ufs_inactive() due to * the file system being suspended. Transfer IN_LAZYACCESS into * IN_MODIFIED for vnodes that were accessed during suspension. */ static void process_deferred_inactive(struct mount *mp) { struct vnode *vp, *mvp; struct inode *ip; struct thread *td; int error; td = curthread; (void) vn_start_secondary_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); loop: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); /* * IN_LAZYACCESS is checked here without holding any * vnode lock, but this flag is set only while holding * vnode interlock. 
*/ if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED) != 0 || ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 && ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); vholdl(vp); error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td); if (error != 0) { vdrop(vp); MNT_ILOCK(mp); if (error == ENOENT) continue; /* vnode recycled */ MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } ip = VTOI(vp); if ((ip->i_flag & IN_LAZYACCESS) != 0) { ip->i_flag &= ~IN_LAZYACCESS; ip->i_flag |= IN_MODIFIED; } VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) { VI_UNLOCK(vp); VOP_UNLOCK(vp, 0, td); vdrop(vp); MNT_ILOCK(mp); continue; } VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("process_deferred_inactive: " "recursed on VI_DOINGINACT")); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); (void) VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("process_deferred_inactive: lost VI_DOINGINACT")); VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, ("process_deferred_inactive: got VI_OWEINACT")); vp->v_iflag &= ~VI_DOINGINACT; VI_UNLOCK(vp); VOP_UNLOCK(vp, 0, td); vdrop(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_secondary_write(mp); } /* Try to free snapdata associated with devvp */ static void try_free_snapdata(struct vnode *devvp, struct thread *td) { struct snapdata *sn; ufs2_daddr_t *snapblklist; sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL || (devvp->v_vflag & VV_COPYONWRITE) == 0) { VI_UNLOCK(devvp); return; } devvp->v_rdev->si_snapdata = NULL; devvp->v_vflag &= ~VV_COPYONWRITE; snapblklist = sn->sn_blklist; sn->sn_blklist = NULL; sn->sn_listsize = 0; lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td); lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td); lockdestroy(&sn->sn_lock); free(sn, M_UFSMNT); if (snapblklist != NULL) FREE(snapblklist, M_UFSMNT); } #endif Index: head/sys/ufs/ffs/ffs_softdep.c =================================================================== --- head/sys/ufs/ffs/ffs_softdep.c (revision 168352) +++ head/sys/ufs/ffs/ffs_softdep.c (revision 168353) @@ -1,6274 +1,6274 @@ /*- * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved. * * The soft updates code is derived from the appendix of a University * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, * "Soft Updates: A Solution to the Metadata Update Problem in File * Systems", CSE-TR-254-95, August 1995). * * Further information about soft updates can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 */ #include __FBSDID("$FreeBSD$"); /* * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide. */ #ifndef DIAGNOSTIC #define DIAGNOSTIC #endif #ifndef DEBUG #define DEBUG #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_ffs.h" #include "opt_quota.h" #ifndef SOFTUPDATES int softdep_flushfiles(oldmnt, flags, td) struct mount *oldmnt; int flags; struct thread *td; { panic("softdep_flushfiles called"); } int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { return (0); } void softdep_initialize() { return; } void softdep_uninitialize() { return; } void softdep_setup_inomapdep(bp, ip, newinum) struct buf *bp; struct inode *ip; ino_t newinum; { panic("softdep_setup_inomapdep called"); } void softdep_setup_blkmapdep(bp, mp, newblkno) struct buf *bp; struct mount *mp; ufs2_daddr_t newblkno; { panic("softdep_setup_blkmapdep called"); } void softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { panic("softdep_setup_allocdirect called"); } void softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { panic("softdep_setup_allocext called"); } void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; ufs_lbn_t lbn; struct buf *bp; int ptrno; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; struct buf *nbp; { panic("softdep_setup_allocindir_page called"); } void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; struct inode *ip; struct buf *bp; int ptrno; ufs2_daddr_t newblkno; { panic("softdep_setup_allocindir_meta called"); } void softdep_setup_freeblocks(ip, length, flags) struct inode *ip; off_t length; int flags; { panic("softdep_setup_freeblocks called"); } void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { panic("softdep_freefile called"); } int softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; struct inode *dp; off_t diroffset; ino_t newinum; struct buf *newdirbp; int isnewblk; { panic("softdep_setup_directory_add called"); } void softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) struct inode *dp; caddr_t base; caddr_t oldloc; caddr_t newloc; int entrysize; { panic("softdep_change_directoryentry_offset called"); } void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; struct inode *dp; struct inode *ip; int isrmdir; { panic("softdep_setup_remove called"); } void softdep_setup_directory_change(bp, dp, 
ip, newinum, isrmdir) struct buf *bp; struct inode *dp; struct inode *ip; ino_t newinum; int isrmdir; { panic("softdep_setup_directory_change called"); } void softdep_change_linkcnt(ip) struct inode *ip; { panic("softdep_change_linkcnt called"); } void softdep_load_inodeblock(ip) struct inode *ip; { panic("softdep_load_inodeblock called"); } void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; struct buf *bp; int waitfor; { panic("softdep_update_inodeblock called"); } int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { return (0); } void softdep_fsync_mountdev(vp) struct vnode *vp; { return; } int softdep_flushworklist(oldmnt, countp, td) struct mount *oldmnt; int *countp; struct thread *td; { *countp = 0; return (0); } int softdep_sync_metadata(struct vnode *vp) { return (0); } int softdep_slowdown(vp) struct vnode *vp; { panic("softdep_slowdown called"); } void softdep_releasefile(ip) struct inode *ip; /* inode with the zero effective link count */ { panic("softdep_releasefile called"); } int softdep_request_cleanup(fs, vp) struct fs *fs; struct vnode *vp; { return (0); } int softdep_check_suspend(struct mount *mp, struct vnode *devvp, int softdep_deps, int softdep_accdeps, int secondary_writes, int secondary_accwrites) { struct bufobj *bo; int error; (void) softdep_deps, (void) softdep_accdeps; ASSERT_VI_LOCKED(devvp, "softdep_check_suspend"); bo = &devvp->v_bufobj; for (;;) { if (!MNT_ITRYLOCK(mp)) { VI_UNLOCK(devvp); MNT_ILOCK(mp); MNT_IUNLOCK(mp); VI_LOCK(devvp); continue; } if (mp->mnt_secondary_writes != 0) { VI_UNLOCK(devvp); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); VI_LOCK(devvp); continue; } break; } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; VI_UNLOCK(devvp); return (error); } void softdep_get_depcounts(struct mount *mp, int *softdepactivep, int *softdepactiveaccp) { (void) mp; *softdepactivep = 0; *softdepactiveaccp = 0; } #else /* * These definitions need to be adapted to the system to which * this file is being ported. */ /* * malloc types defined for the softdep system. 
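 */

/*
 * The malloc types below feed the kernel MALLOC()/FREE() macros used
 * throughout this file: MALLOC(space, cast, size, type, flags) is a typed
 * wrapper around malloc(9) and FREE(addr, type) around free(9).  A sketch of
 * the idiom; M_FOODEP and struct foodep are made up for illustration.
 */
static MALLOC_DEFINE(M_FOODEP, "foodep", "Example dependency structure");

struct foodep {
	int	fd_state;
};

static struct foodep *
foodep_alloc(void)
{
	struct foodep *fdp;

	MALLOC(fdp, struct foodep *, sizeof(struct foodep), M_FOODEP,
	    M_WAITOK | M_ZERO);
	return (fdp);
}

static void
foodep_free(struct foodep *fdp)
{
	FREE(fdp, M_FOODEP);
}

/*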
*/ static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies"); static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies"); static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation"); static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map"); static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode"); static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies"); static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block"); static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode"); static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode"); static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated"); static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry"); static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory"); static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted"); static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block"); static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes"); #define M_SOFTDEP_FLAGS (M_WAITOK | M_USE_RESERVE) #define D_PAGEDEP 0 #define D_INODEDEP 1 #define D_NEWBLK 2 #define D_BMSAFEMAP 3 #define D_ALLOCDIRECT 4 #define D_INDIRDEP 5 #define D_ALLOCINDIR 6 #define D_FREEFRAG 7 #define D_FREEBLKS 8 #define D_FREEFILE 9 #define D_DIRADD 10 #define D_MKDIR 11 #define D_DIRREM 12 #define D_NEWDIRBLK 13 #define D_LAST D_NEWDIRBLK /* * translate from workitem type to memory type * MUST match the defines above, such that memtype[D_XXX] == M_XXX */ static struct malloc_type *memtype[] = { M_PAGEDEP, M_INODEDEP, M_NEWBLK, M_BMSAFEMAP, M_ALLOCDIRECT, M_INDIRDEP, M_ALLOCINDIR, M_FREEFRAG, M_FREEBLKS, M_FREEFILE, M_DIRADD, M_MKDIR, M_DIRREM, M_NEWDIRBLK }; #define DtoM(type) (memtype[type]) /* * Names of malloc types. */ #define TYPENAME(type) \ ((unsigned)(type) < D_LAST ? memtype[type]->ks_shortdesc : "???") /* * End system adaptation definitions. */ /* * Forward declarations. */ struct inodedep_hashhead; struct newblk_hashhead; struct pagedep_hashhead; /* * Internal function prototypes. 
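 */

/*
 * A note on the memtype[] table above before the prototypes: the "MUST
 * match" requirement on the D_* indices is easy to break when a dependency
 * type is added.  The file does not currently guard it, but a compile-time
 * check in the style below (CTASSERT(9)) would at least catch a mismatch in
 * the entry count, though not a reordering.
 */
CTASSERT(sizeof(memtype) / sizeof(memtype[0]) == D_LAST + 1);

/*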
*/ static void softdep_error(char *, int); static void drain_output(struct vnode *); static struct buf *getdirtybuf(struct buf *, struct mtx *, int); static void clear_remove(struct thread *); static void clear_inodedeps(struct thread *); static int flush_pagedep_deps(struct vnode *, struct mount *, struct diraddhd *); static int flush_inodedep_deps(struct mount *, ino_t); static int flush_deplist(struct allocdirectlst *, int, int *); static int handle_written_filepage(struct pagedep *, struct buf *); static void diradd_inode_written(struct diradd *, struct inodedep *); static int handle_written_inodeblock(struct inodedep *, struct buf *); static void handle_allocdirect_partdone(struct allocdirect *); static void handle_allocindir_partdone(struct allocindir *); static void initiate_write_filepage(struct pagedep *, struct buf *); static void handle_written_mkdir(struct mkdir *, int); static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); static void handle_workitem_freefile(struct freefile *); static void handle_workitem_remove(struct dirrem *, struct vnode *); static struct dirrem *newdirrem(struct buf *, struct inode *, struct inode *, int, struct dirrem **); static void free_diradd(struct diradd *); static void free_allocindir(struct allocindir *, struct inodedep *); static void free_newdirblk(struct newdirblk *); static int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t, ufs2_daddr_t *); static void deallocate_dependencies(struct buf *, struct inodedep *); static void free_allocdirect(struct allocdirectlst *, struct allocdirect *, int); static int check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); static void handle_workitem_freeblocks(struct freeblks *, int); static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); static void setup_allocindir_phase2(struct buf *, struct inode *, struct allocindir *); static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, ufs2_daddr_t); static void handle_workitem_freefrag(struct freefrag *); static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long); static void allocdirect_merge(struct allocdirectlst *, struct allocdirect *, struct allocdirect *); static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *); static int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t, struct newblk **); static int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **); static int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t, struct inodedep **); static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); static int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **); static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, struct mount *mp, int, struct pagedep **); static void pause_timer(void *); static int request_cleanup(struct mount *, int); static int process_worklist_item(struct mount *, int); static void add_to_worklist(struct worklist *); static void softdep_flush(void); static int softdep_speedup(void); /* * Exported softdep operations. 
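 */

/*
 * Among the definitions that follow are the DEBUG-only worklist_insert()/
 * worklist_remove() wrappers, which catch double insertion and stray removal
 * by keeping an on-list bit in each work item.  The same protocol in a
 * self-contained userland form, with assert(3) standing in for panic(9);
 * the _sk-suffixed names are stand-ins.
 */
#include <assert.h>
#include <sys/queue.h>

#define	ONLIST_SK	0x0001

struct wk_sk {
	int	state;
	LIST_ENTRY(wk_sk) link;
};
LIST_HEAD(wkhead_sk, wk_sk);

static void
wk_insert(struct wkhead_sk *head, struct wk_sk *item)
{
	assert((item->state & ONLIST_SK) == 0);	/* "already on list" */
	item->state |= ONLIST_SK;
	LIST_INSERT_HEAD(head, item, link);
}

static void
wk_remove(struct wk_sk *item)
{
	assert((item->state & ONLIST_SK) != 0);	/* "not on list" */
	item->state &= ~ONLIST_SK;
	LIST_REMOVE(item, link);
}

/*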
*/ static void softdep_disk_io_initiation(struct buf *); static void softdep_disk_write_complete(struct buf *); static void softdep_deallocate_dependencies(struct buf *); static int softdep_count_dependencies(struct buf *bp, int); static struct mtx lk; MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF); #define TRY_ACQUIRE_LOCK(lk) mtx_trylock(lk) #define ACQUIRE_LOCK(lk) mtx_lock(lk) #define FREE_LOCK(lk) mtx_unlock(lk) /* * Worklist queue management. * These routines require that the lock be held. */ #ifndef /* NOT */ DEBUG #define WORKLIST_INSERT(head, item) do { \ (item)->wk_state |= ONWORKLIST; \ LIST_INSERT_HEAD(head, item, wk_list); \ } while (0) #define WORKLIST_REMOVE(item) do { \ (item)->wk_state &= ~ONWORKLIST; \ LIST_REMOVE(item, wk_list); \ } while (0) #else /* DEBUG */ static void worklist_insert(struct workhead *, struct worklist *); static void worklist_remove(struct worklist *); #define WORKLIST_INSERT(head, item) worklist_insert(head, item) #define WORKLIST_REMOVE(item) worklist_remove(item) static void worklist_insert(head, item) struct workhead *head; struct worklist *item; { mtx_assert(&lk, MA_OWNED); if (item->wk_state & ONWORKLIST) panic("worklist_insert: already on list"); item->wk_state |= ONWORKLIST; LIST_INSERT_HEAD(head, item, wk_list); } static void worklist_remove(item) struct worklist *item; { mtx_assert(&lk, MA_OWNED); if ((item->wk_state & ONWORKLIST) == 0) panic("worklist_remove: not on list"); item->wk_state &= ~ONWORKLIST; LIST_REMOVE(item, wk_list); } #endif /* DEBUG */ /* * Routines for tracking and managing workitems. */ static void workitem_free(struct worklist *, int); static void workitem_alloc(struct worklist *, int, struct mount *); #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type)) static void workitem_free(item, type) struct worklist *item; int type; { struct ufsmount *ump; mtx_assert(&lk, MA_OWNED); #ifdef DEBUG if (item->wk_state & ONWORKLIST) panic("workitem_free: still on list"); if (item->wk_type != type) panic("workitem_free: type mismatch"); #endif ump = VFSTOUFS(item->wk_mp); if (--ump->softdep_deps == 0 && ump->softdep_req) wakeup(&ump->softdep_deps); FREE(item, DtoM(type)); } static void workitem_alloc(item, type, mp) struct worklist *item; int type; struct mount *mp; { item->wk_type = type; item->wk_mp = mp; item->wk_state = 0; ACQUIRE_LOCK(&lk); VFSTOUFS(mp)->softdep_deps++; VFSTOUFS(mp)->softdep_accdeps++; FREE_LOCK(&lk); } /* * Workitem queue management */ static int max_softdeps; /* maximum number of structs before slowdown */ static int maxindirdeps = 50; /* max number of indirdeps before slowdown */ static int tickdelay = 2; /* number of ticks to pause during slowdown */ static int proc_waiting; /* tracks whether we have a timeout posted */ static int *stat_countp; /* statistic to count in proc_waiting timeout */ static struct callout_handle handle; /* handle on posted proc_waiting timeout */ static int req_pending; static int req_clear_inodedeps; /* syncer process flush some inodedeps */ #define FLUSH_INODES 1 static int req_clear_remove; /* syncer process flush some freeblks */ #define FLUSH_REMOVE 2 #define FLUSH_REMOVE_WAIT 3 /* * runtime statistics */ static int stat_worklist_push; /* number of worklist cleanups */ static int stat_blk_limit_push; /* number of times block limit neared */ static int stat_ino_limit_push; /* number of times inode limit neared */ static int stat_blk_limit_hit; /* number of times block slowdown imposed */ static int stat_ino_limit_hit; /* number of times 
inode slowdown imposed */ static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, ""); SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,""); SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,""); SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,""); SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, ""); SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, ""); SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, ""); SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, ""); SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, ""); SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, ""); SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, ""); /* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */ SYSCTL_DECL(_vfs_ffs); static int compute_summary_at_mount = 0; /* Whether to recompute the summary at mount time */ SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, &compute_summary_at_mount, 0, "Recompute summary at mount"); static struct proc *softdepproc; static struct kproc_desc softdep_kp = { "softdepflush", softdep_flush, &softdepproc }; SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, &softdep_kp) static void softdep_flush(void) { struct mount *nmp; struct mount *mp; struct ufsmount *ump; struct thread *td; int remaining; int vfslocked; td = curthread; td->td_pflags |= TDP_NORUNNINGBUF; for (;;) { kthread_suspend_check(softdepproc); vfslocked = VFS_LOCK_GIANT((struct mount *)NULL); ACQUIRE_LOCK(&lk); /* * If requested, try removing inode or removal dependencies. */ if (req_clear_inodedeps) { clear_inodedeps(td); req_clear_inodedeps -= 1; wakeup_one(&proc_waiting); } if (req_clear_remove) { clear_remove(td); req_clear_remove -= 1; wakeup_one(&proc_waiting); } FREE_LOCK(&lk); VFS_UNLOCK_GIANT(vfslocked); remaining = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { nmp = TAILQ_NEXT(mp, mnt_list); if ((mp->mnt_flag & MNT_SOFTDEP) == 0) continue; if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) continue; vfslocked = VFS_LOCK_GIANT(mp); softdep_process_worklist(mp, 0); ump = VFSTOUFS(mp); remaining += ump->softdep_on_worklist - ump->softdep_on_worklist_inprogress; VFS_UNLOCK_GIANT(vfslocked); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (remaining) continue; ACQUIRE_LOCK(&lk); if (!req_pending) msleep(&req_pending, &lk, PVM, "sdflush", hz); req_pending = 0; FREE_LOCK(&lk); } } static int softdep_speedup(void) { mtx_assert(&lk, MA_OWNED); if (req_pending == 0) { req_pending = 1; wakeup(&req_pending); } return speedup_syncer(); } /* * Add an item to the end of the work queue. 
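 */

/*
 * add_to_worklist() below is one of the spots this revision touches:
 * LIST_FIRST(&ump->softdep_workitem_pending) == NULL becomes
 * LIST_EMPTY(&ump->softdep_workitem_pending), the same test spelled with the
 * queue(3) emptiness macro.  The routine keeps an explicit tail pointer
 * because a LIST offers O(1) insertion only at the head; distilled to a
 * userland sketch (the workq names are ours):
 */
#include <sys/queue.h>

struct wkitem {
	LIST_ENTRY(wkitem) wk_list;
};
LIST_HEAD(wkitemhead, wkitem);

struct workq {
	struct wkitemhead head;
	struct wkitem	*tail;	/* last element; meaningless when empty */
	int		count;
};

/* Append in O(1) even though LIST has no tail-insertion primitive. */
static void
workq_append(struct workq *q, struct wkitem *item)
{
	if (LIST_EMPTY(&q->head))
		LIST_INSERT_HEAD(&q->head, item, wk_list);
	else
		LIST_INSERT_AFTER(q->tail, item, wk_list);
	q->tail = item;
	q->count++;
}

/*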
* This routine requires that the lock be held. * This is the only routine that adds items to the list. * The following routine is the only one that removes items * and does so in order from first to last. */ static void add_to_worklist(wk) struct worklist *wk; { struct ufsmount *ump; mtx_assert(&lk, MA_OWNED); ump = VFSTOUFS(wk->wk_mp); if (wk->wk_state & ONWORKLIST) panic("add_to_worklist: already on list"); wk->wk_state |= ONWORKLIST; - if (LIST_FIRST(&ump->softdep_workitem_pending) == NULL) + if (LIST_EMPTY(&ump->softdep_workitem_pending)) LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); else LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); ump->softdep_worklist_tail = wk; ump->softdep_on_worklist += 1; } /* * Process that runs once per second to handle items in the background queue. * * Note that we ensure that everything is done in the order in which items * appear in the queue. The code below depends on this property to ensure * that blocks of a file are freed before the inode itself is freed. This * ordering ensures that no new <vfsid, inum, lbn> triples will be generated * until all the old ones have been purged from the dependency lists. */ int softdep_process_worklist(mp, full) struct mount *mp; int full; { struct thread *td = curthread; int cnt, matchcnt, loopcount; struct ufsmount *ump; long starttime; KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); /* * Record the process identifier of our caller so that we can give * this process preferential treatment in request_cleanup below. */ matchcnt = 0; ump = VFSTOUFS(mp); ACQUIRE_LOCK(&lk); loopcount = 1; starttime = time_second; while (ump->softdep_on_worklist > 0) { if ((cnt = process_worklist_item(mp, 0)) == -1) break; else matchcnt += cnt; /* * If requested, try removing inode or removal dependencies. */ if (req_clear_inodedeps) { clear_inodedeps(td); req_clear_inodedeps -= 1; wakeup_one(&proc_waiting); } if (req_clear_remove) { clear_remove(td); req_clear_remove -= 1; wakeup_one(&proc_waiting); } /* * We do not generally want to stop for buffer space, but if * we are really being a buffer hog, we will stop and wait. */ if (loopcount++ % 128 == 0) { FREE_LOCK(&lk); bwillwrite(); ACQUIRE_LOCK(&lk); } /* * Never allow processing to run for more than one * second. Otherwise the other mountpoints may get * excessively backlogged. */ if (!full && starttime != time_second) { matchcnt = -1; break; } } FREE_LOCK(&lk); return (matchcnt); } /* * Process one item on the worklist. */ static int process_worklist_item(mp, flags) struct mount *mp; int flags; { struct worklist *wk, *wkend; struct ufsmount *ump; struct vnode *vp; int matchcnt = 0; mtx_assert(&lk, MA_OWNED); KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); /* * If we are being called because of a process doing a * copy-on-write, then it is not safe to write as we may * recurse into the copy-on-write routine. */ if (curthread->td_pflags & TDP_COWINPROGRESS) return (-1); /* * Normally we just process each item on the worklist in order. * However, if we are in a situation where we cannot lock any * inodes, we have to skip over any dirrem requests whose * vnodes are resident and locked.
*/ ump = VFSTOUFS(mp); vp = NULL; LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) { if (wk->wk_state & INPROGRESS) continue; if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM) break; wk->wk_state |= INPROGRESS; ump->softdep_on_worklist_inprogress++; FREE_LOCK(&lk); ffs_vget(mp, WK_DIRREM(wk)->dm_oldinum, LK_NOWAIT | LK_EXCLUSIVE, &vp); ACQUIRE_LOCK(&lk); wk->wk_state &= ~INPROGRESS; ump->softdep_on_worklist_inprogress--; if (vp != NULL) break; } if (wk == 0) return (-1); /* * Remove the item to be processed. If we are removing the last * item on the list, we need to recalculate the tail pointer. * As this happens rarely and usually when the list is short, * we just run down the list to find it rather than tracking it * in the above loop. */ WORKLIST_REMOVE(wk); if (wk == ump->softdep_worklist_tail) { LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list) if (LIST_NEXT(wkend, wk_list) == NULL) break; ump->softdep_worklist_tail = wkend; } ump->softdep_on_worklist -= 1; FREE_LOCK(&lk); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_worklist_item: suspended filesystem"); matchcnt++; switch (wk->wk_type) { case D_DIRREM: /* removal of a directory entry */ handle_workitem_remove(WK_DIRREM(wk), vp); break; case D_FREEBLKS: /* releasing blocks and/or fragments from a file */ handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT); break; case D_FREEFRAG: /* releasing a fragment when replaced as a file grows */ handle_workitem_freefrag(WK_FREEFRAG(wk)); break; case D_FREEFILE: /* releasing an inode when its link count drops to 0 */ handle_workitem_freefile(WK_FREEFILE(wk)); break; default: panic("%s_process_worklist: Unknown type %s", "softdep", TYPENAME(wk->wk_type)); /* NOTREACHED */ } vn_finished_secondary_write(mp); ACQUIRE_LOCK(&lk); return (matchcnt); } /* * Move dependencies from one buffer to another. */ void softdep_move_dependencies(oldbp, newbp) struct buf *oldbp; struct buf *newbp; { struct worklist *wk, *wktail; - if (LIST_FIRST(&newbp->b_dep) != NULL) + if (!LIST_EMPTY(&newbp->b_dep)) panic("softdep_move_dependencies: need merge code"); wktail = 0; ACQUIRE_LOCK(&lk); while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { LIST_REMOVE(wk, wk_list); if (wktail == 0) LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); else LIST_INSERT_AFTER(wktail, wk, wk_list); wktail = wk; } FREE_LOCK(&lk); } /* * Purge the work list of all items associated with a particular mount point. */ int softdep_flushworklist(oldmnt, countp, td) struct mount *oldmnt; int *countp; struct thread *td; { struct vnode *devvp; int count, error = 0; struct ufsmount *ump; /* * Alternately flush the block device associated with the mount * point and process any dependencies that the flushing * creates. We continue until no more worklist dependencies * are found. 
*/ *countp = 0; ump = VFSTOUFS(oldmnt); devvp = ump->um_devvp; while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { *countp += count; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_FSYNC(devvp, MNT_WAIT, td); VOP_UNLOCK(devvp, 0, td); if (error) break; } return (error); } int softdep_waitidle(struct mount *mp) { struct ufsmount *ump; int error; int i; ump = VFSTOUFS(mp); ACQUIRE_LOCK(&lk); for (i = 0; i < 10 && ump->softdep_deps; i++) { ump->softdep_req = 1; if (ump->softdep_on_worklist) panic("softdep_waitidle: work added after flush."); msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1); } ump->softdep_req = 0; FREE_LOCK(&lk); error = 0; if (i == 10) { error = EBUSY; printf("softdep_waitidle: Failed to flush worklist for %p", mp); } return (error); } /* * Flush all vnodes and worklist items associated with a specified mount point. */ int softdep_flushfiles(oldmnt, flags, td) struct mount *oldmnt; int flags; struct thread *td; { int error, count, loopcnt; error = 0; /* * Alternately flush the vnodes associated with the mount * point and process any dependencies that the flushing * creates. In theory, this loop can happen at most twice, * but we give it a few extra just to be sure. */ for (loopcnt = 10; loopcnt > 0; loopcnt--) { /* * Do another flush in case any vnodes were brought in * as part of the cleanup operations. */ if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0) break; if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 || count == 0) break; } /* * If we are unmounting then it is an error to fail. If we * are simply trying to downgrade to read-only, then filesystem * activity can keep us busy forever, so we just fail with EBUSY. */ if (loopcnt == 0) { if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) panic("softdep_flushfiles: looping"); error = EBUSY; } if (!error) error = softdep_waitidle(oldmnt); return (error); } /* * Structure hashing. * * There are three types of structures that can be looked up: * 1) pagedep structures identified by mount point, inode number, * and logical block. * 2) inodedep structures identified by mount point and inode number. * 3) newblk structures identified by mount point and * physical block number. * * The "pagedep" and "inodedep" dependency structures are hashed * separately from the file blocks and inodes to which they correspond. * This separation helps when the in-memory copy of an inode or * file block must be replaced. It also obviates the need to access * an inode or file page when simply updating (or de-allocating) * dependency structures. Lookup of newblk structures is needed to * find newly allocated blocks when trying to associate them with * their allocdirect or allocindir structure. * * The lookup routines optionally create and hash a new instance when * an existing entry is not found. */ #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ #define NODELAY 0x0002 /* cannot do background work */ /* * Structures and routines associated with pagedep caching. 
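 */

/*
 * pagedep_lookup(), inodedep_lookup() and newblk_lookup() in this section
 * all share one shape: search under the softdep mutex and, if the entry is
 * missing, drop the mutex (M_SOFTDEP_FLAGS includes M_WAITOK, so allocation
 * may sleep), allocate, retake the mutex, and search again, discarding the
 * new structure if another thread won the race.  A self-contained userland
 * rendering with pthreads; every name here is a stand-in.
 */
#include <pthread.h>
#include <stdlib.h>

struct node {
	int		key;
	struct node	*next;
};

static struct node *table;	/* a single hash chain, for brevity */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

static struct node *
find_locked(int key)
{
	struct node *n;

	for (n = table; n != NULL; n = n->next)
		if (n->key == key)
			return (n);
	return (NULL);
}

static struct node *
find_or_create(int key)
{
	struct node *n, *nitem;

	pthread_mutex_lock(&table_lock);
	n = find_locked(key);
	pthread_mutex_unlock(&table_lock);
	if (n != NULL)
		return (n);
	/* Allocate with the lock dropped, since allocation may block. */
	if ((nitem = calloc(1, sizeof(*nitem))) == NULL)
		return (NULL);
	nitem->key = key;
	pthread_mutex_lock(&table_lock);
	if ((n = find_locked(key)) != NULL) {
		/* Lost the race: discard ours and use the winner's. */
		pthread_mutex_unlock(&table_lock);
		free(nitem);
		return (n);
	}
	nitem->next = table;
	table = nitem;
	pthread_mutex_unlock(&table_lock);
	return (nitem);
}

/*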
*/ LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl; u_long pagedep_hash; /* size of hash table - 1 */ #define PAGEDEP_HASH(mp, inum, lbn) \ (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ pagedep_hash]) static int pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp) struct pagedep_hashhead *pagedephd; ino_t ino; ufs_lbn_t lbn; struct mount *mp; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; LIST_FOREACH(pagedep, pagedephd, pd_hash) if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn && mp == pagedep->pd_list.wk_mp) break; if (pagedep) { *pagedeppp = pagedep; if ((flags & DEPALLOC) != 0 && (pagedep->pd_state & ONWORKLIST) == 0) return (0); return (1); } *pagedeppp = NULL; return (0); } /* * Look up a pagedep. Return 1 if found, 0 if not found or found * when asked to allocate but not associated with any buffer. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in pagedeppp. * This routine must be called with splbio interrupts blocked. */ static int pagedep_lookup(ip, lbn, flags, pagedeppp) struct inode *ip; ufs_lbn_t lbn; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; struct pagedep_hashhead *pagedephd; struct mount *mp; int ret; int i; mtx_assert(&lk, MA_OWNED); mp = ITOV(ip)->v_mount; pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); if (*pagedeppp || (flags & DEPALLOC) == 0) return (ret); FREE_LOCK(&lk); MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); ACQUIRE_LOCK(&lk); ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp); if (*pagedeppp) { WORKITEM_FREE(pagedep, D_PAGEDEP); return (ret); } pagedep->pd_ino = ip->i_number; pagedep->pd_lbn = lbn; LIST_INIT(&pagedep->pd_dirremhd); LIST_INIT(&pagedep->pd_pendinghd); for (i = 0; i < DAHASHSZ; i++) LIST_INIT(&pagedep->pd_diraddhd[i]); LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); *pagedeppp = pagedep; return (0); } /* * Structures and routines associated with inodedep caching. */ LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; static u_long inodedep_hash; /* size of hash table - 1 */ static long num_inodedep; /* number of inodedep allocated */ #define INODEDEP_HASH(fs, inum) \ (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) static int inodedep_find(inodedephd, fs, inum, inodedeppp) struct inodedep_hashhead *inodedephd; struct fs *fs; ino_t inum; struct inodedep **inodedeppp; { struct inodedep *inodedep; LIST_FOREACH(inodedep, inodedephd, id_hash) if (inum == inodedep->id_ino && fs == inodedep->id_fs) break; if (inodedep) { *inodedeppp = inodedep; return (1); } *inodedeppp = NULL; return (0); } /* * Look up an inodedep. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in inodedeppp. * This routine must be called with splbio interrupts blocked. */ static int inodedep_lookup(mp, inum, flags, inodedeppp) struct mount *mp; ino_t inum; int flags; struct inodedep **inodedeppp; { struct inodedep *inodedep; struct inodedep_hashhead *inodedephd; struct fs *fs; mtx_assert(&lk, MA_OWNED); fs = VFSTOUFS(mp)->um_fs; inodedephd = INODEDEP_HASH(fs, inum); if (inodedep_find(inodedephd, fs, inum, inodedeppp)) return (1); if ((flags & DEPALLOC) == 0) return (0); /* * If we are over our limit, try to improve the situation. 
*/ if (num_inodedep > max_softdeps && (flags & NODELAY) == 0) request_cleanup(mp, FLUSH_INODES); FREE_LOCK(&lk); MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), M_INODEDEP, M_SOFTDEP_FLAGS); workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); ACQUIRE_LOCK(&lk); if (inodedep_find(inodedephd, fs, inum, inodedeppp)) { WORKITEM_FREE(inodedep, D_INODEDEP); return (1); } num_inodedep += 1; inodedep->id_fs = fs; inodedep->id_ino = inum; inodedep->id_state = ALLCOMPLETE; inodedep->id_nlinkdelta = 0; inodedep->id_savedino1 = NULL; inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; inodedep->id_buf = NULL; LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); TAILQ_INIT(&inodedep->id_extupdt); TAILQ_INIT(&inodedep->id_newextupdt); LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); *inodedeppp = inodedep; return (0); } /* * Structures and routines associated with newblk caching. */ LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; u_long newblk_hash; /* size of hash table - 1 */ #define NEWBLK_HASH(fs, inum) \ (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) static int newblk_find(newblkhd, fs, newblkno, newblkpp) struct newblk_hashhead *newblkhd; struct fs *fs; ufs2_daddr_t newblkno; struct newblk **newblkpp; { struct newblk *newblk; LIST_FOREACH(newblk, newblkhd, nb_hash) if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) break; if (newblk) { *newblkpp = newblk; return (1); } *newblkpp = NULL; return (0); } /* * Look up a newblk. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in newblkpp. */ static int newblk_lookup(fs, newblkno, flags, newblkpp) struct fs *fs; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; { struct newblk *newblk; struct newblk_hashhead *newblkhd; newblkhd = NEWBLK_HASH(fs, newblkno); if (newblk_find(newblkhd, fs, newblkno, newblkpp)) return (1); if ((flags & DEPALLOC) == 0) return (0); FREE_LOCK(&lk); MALLOC(newblk, struct newblk *, sizeof(struct newblk), M_NEWBLK, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(&lk); if (newblk_find(newblkhd, fs, newblkno, newblkpp)) { FREE(newblk, M_NEWBLK); return (1); } newblk->nb_state = 0; newblk->nb_fs = fs; newblk->nb_newblkno = newblkno; LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); *newblkpp = newblk; return (0); } /* * Executed during filesystem system initialization before * mounting any filesystems. */ void softdep_initialize() { LIST_INIT(&mkdirlisthd); max_softdeps = desiredvnodes * 4; pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); /* initialise bioops hack */ bioops.io_start = softdep_disk_io_initiation; bioops.io_complete = softdep_disk_write_complete; bioops.io_deallocate = softdep_deallocate_dependencies; bioops.io_countdeps = softdep_count_dependencies; } /* * Executed after all filesystems have been unmounted during * filesystem module unload. */ void softdep_uninitialize() { hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash); hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash); hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash); } /* * Called at mount time to notify the dependency code that a * filesystem wishes to use it. 
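 */

/*
 * Before softdep_mount() below, a note on the tables set up in
 * softdep_initialize() above: hashinit(9) returns a power-of-two-sized array
 * of list heads and reports size - 1 through its mask pointer, and the
 * PAGEDEP_HASH/INODEDEP_HASH/NEWBLK_HASH macros fold a kernel pointer and an
 * inode or block number into an index, shifting the pointer right first so
 * allocator alignment does not bias the low bits.  The arithmetic extracted
 * into a userland sketch (names are ours):
 */
#include <stdio.h>
#include <stdint.h>

/* Mask for the largest power-of-two table size not exceeding elements. */
static uintptr_t
hashmask_for(int elements)
{
	uintptr_t size;

	for (size = 1; (int)(size << 1) <= elements; size <<= 1)
		continue;
	return (size - 1);
}

/* The same mixing as INODEDEP_HASH(): shifted pointer plus inode number. */
static uintptr_t
dep_hash(const void *fs, uintptr_t inum, uintptr_t mask)
{
	return ((((uintptr_t)fs >> 13) + inum) & mask);
}

int
main(void)
{
	static int fs_standin;
	uintptr_t mask = hashmask_for(1024);

	printf("mask %#lx index %lu\n", (unsigned long)mask,
	    (unsigned long)dep_hash(&fs_standin, 1234, mask));
	return (0);
}

/*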
*/ int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { struct csum_total cstotal; struct ufsmount *ump; struct cg *cgp; struct buf *bp; int error, cyl; MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | MNTK_SOFTDEP; mp->mnt_noasync++; } MNT_IUNLOCK(mp); ump = VFSTOUFS(mp); LIST_INIT(&ump->softdep_workitem_pending); ump->softdep_worklist_tail = NULL; ump->softdep_on_worklist = 0; ump->softdep_deps = 0; /* * When doing soft updates, the counters in the * superblock may have gotten out of sync. Recomputation * can take a long time and can be deferred for background * fsck. However, the old behavior of scanning the cylinder * groups and recalculating them at mount time is available * by setting vfs.ffs.compute_summary_at_mount to one. */ if (compute_summary_at_mount == 0 || fs->fs_clean != 0) return (0); bzero(&cstotal, sizeof cstotal); for (cyl = 0; cyl < fs->fs_ncg; cyl++) { if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), fs->fs_cgsize, cred, &bp)) != 0) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; cstotal.cs_nffree += cgp->cg_cs.cs_nffree; cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; cstotal.cs_nifree += cgp->cg_cs.cs_nifree; cstotal.cs_ndir += cgp->cg_cs.cs_ndir; fs->fs_cs(fs, cyl) = cgp->cg_cs; brelse(bp); } #ifdef DEBUG if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); #endif bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); return (0); } /* * Protecting the freemaps (or bitmaps). * * To eliminate the need to execute fsck before mounting a filesystem * after a power failure, one must (conservatively) guarantee that the * on-disk copy of the bitmaps never indicates that a live inode or block is * free. So, when a block or inode is allocated, the bitmap should be * updated (on disk) before any new pointers. When a block or inode is * freed, the bitmap should not be updated until all pointers have been * reset. The latter dependency is handled by the delayed de-allocation * approach described below for block and inode de-allocation. The former * dependency is handled by calling the following procedure when a block or * inode is allocated. When an inode is allocated an "inodedep" is created * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. * Each "inodedep" is also inserted into the hash indexing structure so * that any additional link additions can be made dependent on the inode * allocation. * * The ufs filesystem maintains a number of free block counts (e.g., per * cylinder group, per cylinder and per <cylinder, rotational position> pair) * in addition to the bitmaps. These counts are used to improve efficiency * during allocation and therefore must be consistent with the bitmaps. * There is no convenient way to guarantee post-crash consistency of these * counts with simple update ordering, for two main reasons: (1) The counts * and bitmaps for a single cylinder group block are not in the same disk * sector. If a disk write is interrupted (e.g., by power failure), one may * be written and the other not. (2) Some of the counts are located in the * superblock rather than the cylinder group block. So, we focus our soft * updates implementation on protecting the bitmaps. When mounting a * filesystem, we recompute the auxiliary counts from the bitmaps. */ /* * Called just after updating the cylinder group block to allocate an inode.
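 */

/*
 * softdep_setup_inomapdep() below implements the "former dependency" from
 * the discussion above: a freshly allocated inode's inodedep starts with
 * DEPCOMPLETE cleared and hangs off the cylinder-group buffer, so nothing
 * that points at the inode may be committed until the bitmap has reached
 * the disk.  The state protocol reduced to a sketch; the _sk names are
 * stand-ins, not the kernel's.
 */
#define	DEPCOMPLETE_SK	0x0001	/* bitmap write has completed */

struct inodedep_sk {
	int	id_state;
};

/* At allocation time the on-disk bitmap does not yet show the inode. */
static void
inode_allocated(struct inodedep_sk *dep)
{
	dep->id_state &= ~DEPCOMPLETE_SK;
}

/* Called when the cylinder-group (bitmap) buffer write completes. */
static void
bitmap_written(struct inodedep_sk *dep)
{
	dep->id_state |= DEPCOMPLETE_SK;
}

/* Writes that publish pointers to the inode must wait for this. */
static int
may_publish_pointer(const struct inodedep_sk *dep)
{
	return ((dep->id_state & DEPCOMPLETE_SK) != 0);
}

/*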
*/ void softdep_setup_inomapdep(bp, ip, newinum) struct buf *bp; /* buffer for cylgroup block with inode map */ struct inode *ip; /* inode related to allocation */ ino_t newinum; /* new inode number being allocated */ { struct inodedep *inodedep; struct bmsafemap *bmsafemap; /* * Create a dependency for the newly allocated inode. * Panic if it already exists as something is seriously wrong. * Otherwise add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. */ ACQUIRE_LOCK(&lk); if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY, &inodedep))) panic("softdep_setup_inomapdep: dependency for new inode " "already exists"); inodedep->id_buf = bp; inodedep->id_state &= ~DEPCOMPLETE; bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp); LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); FREE_LOCK(&lk); } /* * Called just after updating the cylinder group block to * allocate block or fragment. */ void softdep_setup_blkmapdep(bp, mp, newblkno) struct buf *bp; /* buffer for cylgroup block with block map */ struct mount *mp; /* filesystem doing allocation */ ufs2_daddr_t newblkno; /* number of newly allocated block */ { struct newblk *newblk; struct bmsafemap *bmsafemap; struct fs *fs; fs = VFSTOUFS(mp)->um_fs; /* * Create a dependency for the newly allocated block. * Add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. */ ACQUIRE_LOCK(&lk); if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0) panic("softdep_setup_blkmapdep: found block"); newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp); LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); FREE_LOCK(&lk); } /* * Find the bmsafemap associated with a cylinder group buffer. * If none exists, create one. The buffer must be locked when * this routine is called and this routine must be called with * splbio interrupts blocked. */ static struct bmsafemap * bmsafemap_lookup(mp, bp) struct mount *mp; struct buf *bp; { struct bmsafemap *bmsafemap; struct worklist *wk; mtx_assert(&lk, MA_OWNED); LIST_FOREACH(wk, &bp->b_dep, wk_list) if (wk->wk_type == D_BMSAFEMAP) return (WK_BMSAFEMAP(wk)); FREE_LOCK(&lk); MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); bmsafemap->sm_buf = bp; LIST_INIT(&bmsafemap->sm_allocdirecthd); LIST_INIT(&bmsafemap->sm_allocindirhd); LIST_INIT(&bmsafemap->sm_inodedephd); LIST_INIT(&bmsafemap->sm_newblkhd); ACQUIRE_LOCK(&lk); WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); return (bmsafemap); } /* * Direct block allocation dependencies. * * When a new block is allocated, the corresponding disk locations must be * initialized (with zeros or new data) before the on-disk inode points to * them. Also, the freemap from which the block was allocated must be * updated (on disk) before the inode's pointer. These two dependencies are * independent of each other and are needed for all file blocks and indirect * blocks that are pointed to directly by the inode. Just before the * "in-core" version of the inode is updated with a newly allocated block * number, a procedure (below) is called to setup allocation dependency * structures. These structures are removed when the corresponding * dependencies are satisfied or when the block allocation becomes obsolete * (i.e., the file is deleted, the block is de-allocated, or the block is a * fragment that gets upgraded). 
All of these cases are handled in * procedures described later. * * When a file extension causes a fragment to be upgraded, either to a larger * fragment or to a full block, the on-disk location may change (if the * previous fragment could not simply be extended). In this case, the old * fragment must be de-allocated, but not until after the inode's pointer has * been updated. In most cases, this is handled by later procedures, which * will construct a "freefrag" structure to be added to the workitem queue * when the inode update is complete (or obsolete). The main exception to * this is when an allocation occurs while a pending allocation dependency * (for the same block pointer) remains. This case is handled in the main * allocation dependency setup procedure by immediately freeing the * unreferenced fragments. */ void softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; /* inode to which block is being added */ ufs_lbn_t lbn; /* block pointer within inode */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ long oldsize; /* size of new block */ struct buf *bp; /* bp for allocated block */ { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct bmsafemap *bmsafemap; struct inodedep *inodedep; struct pagedep *pagedep; struct newblk *newblk; struct mount *mp; mp = UFSTOVFS(ip->i_ump); MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp); adp->ad_lbn = lbn; adp->ad_newblkno = newblkno; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state = ATTACHED; LIST_INIT(&adp->ad_newdirblk); if (newblkno == oldblkno) adp->ad_freefrag = NULL; else adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); ACQUIRE_LOCK(&lk); if (lbn >= NDADDR) { /* allocating an indirect block */ if (oldblkno != 0) panic("softdep_setup_allocdirect: non-zero indir"); } else { /* * Allocating a direct block. * * If we are allocating a directory block, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR && pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); } if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocdirect: lost block"); if (newblk->nb_state == DEPCOMPLETE) { adp->ad_state |= DEPCOMPLETE; adp->ad_buf = NULL; } else { bmsafemap = newblk->nb_bmsafemap; adp->ad_buf = bmsafemap->sm_buf; LIST_REMOVE(newblk, nb_deps); LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); } LIST_REMOVE(newblk, nb_hash); FREE(newblk, M_NEWBLK); inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. 
*/ adphead = &inodedep->id_newinoupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_lbn <= lbn) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_lbn == lbn) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_lbn >= lbn) break; } if (oldadp == NULL) panic("softdep_setup_allocdirect: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_lbn == lbn) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); } /* * Replace an old allocdirect dependency with a newer one. * This routine must be called with splbio interrupts blocked. */ static void allocdirect_merge(adphead, newadp, oldadp) struct allocdirectlst *adphead; /* head of list holding allocdirects */ struct allocdirect *newadp; /* allocdirect being added */ struct allocdirect *oldadp; /* existing allocdirect being checked */ { struct worklist *wk; struct freefrag *freefrag; struct newdirblk *newdirblk; mtx_assert(&lk, MA_OWNED); if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || newadp->ad_lbn >= NDADDR) panic("%s %jd != new %jd || old size %ld != new %ld", "allocdirect_merge: old blkno", (intmax_t)newadp->ad_oldblkno, (intmax_t)oldadp->ad_newblkno, newadp->ad_oldsize, oldadp->ad_newsize); newadp->ad_oldblkno = oldadp->ad_oldblkno; newadp->ad_oldsize = oldadp->ad_oldsize; /* * If the old dependency had a fragment to free or had never * previously had a block allocated, then the new dependency * can immediately post its freefrag and adopt the old freefrag. * This action is done by swapping the freefrag dependencies. * The new dependency gains the old one's freefrag, and the * old one gets the new one and then immediately puts it on * the worklist when it is freed by free_allocdirect. It is * not possible to do this swap when the old dependency had a * non-zero size but no previous fragment to free. This condition * arises when the new block is an extension of the old block. * Here, the first part of the fragment allocated to the new * dependency is part of the block currently claimed on disk by * the old dependency, so cannot legitimately be freed until the * conditions for the new dependency are fulfilled. */ if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { freefrag = newadp->ad_freefrag; newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } /* * If we are tracking a new directory-block allocation, * move it from the old allocdirect to the new allocdirect. */ if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { newdirblk = WK_NEWDIRBLK(wk); WORKLIST_REMOVE(&newdirblk->db_list); - if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL) + if (!LIST_EMPTY(&oldadp->ad_newdirblk)) panic("allocdirect_merge: extra newdirblk"); WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list); } free_allocdirect(adphead, oldadp, 0); } /* * Allocate a new freefrag structure if needed. 
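 */

/*
 * newfreefrag() below sanity-checks its arguments with
 * fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag: the run of
 * fragments being freed, starting at blkno's offset within its block, must
 * fit inside a single block.  The same test as plain arithmetic; the names
 * are stand-ins for the fs.h macros.
 */
#include <assert.h>

/*
 * fragoff: starting fragment offset within the block (fragnum).
 * nfrags:  fragments covered by the size being freed (numfrags).
 * fpb:     fragments per full block (fs_frag).
 */
static int
frag_run_fits(long fragoff, long nfrags, long fpb)
{
	return (fragoff + nfrags <= fpb);
}

int
main(void)
{
	/* In an 8-frag block, freeing 2 frags at offset 6 fits ... */
	assert(frag_run_fits(6, 2, 8));
	/* ... while 3 frags at offset 6 would cross the block boundary. */
	assert(!frag_run_fits(6, 3, 8));
	return (0);
}

/*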
*/ static struct freefrag * newfreefrag(ip, blkno, size) struct inode *ip; ufs2_daddr_t blkno; long size; { struct freefrag *freefrag; struct fs *fs; if (blkno == 0) return (NULL); fs = ip->i_fs; if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) panic("newfreefrag: frag size"); MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag), M_FREEFRAG, M_SOFTDEP_FLAGS); workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); freefrag->ff_inum = ip->i_number; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; return (freefrag); } /* * This workitem de-allocates fragments that were replaced during * file block allocation. */ static void handle_workitem_freefrag(freefrag) struct freefrag *freefrag; { struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, freefrag->ff_fragsize, freefrag->ff_inum); ACQUIRE_LOCK(&lk); WORKITEM_FREE(freefrag, D_FREEFRAG); FREE_LOCK(&lk); } /* * Set up a dependency structure for an external attributes data block. * This routine follows much of the structure of softdep_setup_allocdirect. * See the description of softdep_setup_allocdirect above for details. */ void softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct bmsafemap *bmsafemap; struct inodedep *inodedep; struct newblk *newblk; struct mount *mp; mp = UFSTOVFS(ip->i_ump); MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp); adp->ad_lbn = lbn; adp->ad_newblkno = newblkno; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state = ATTACHED | EXTDATA; LIST_INIT(&adp->ad_newdirblk); if (newblkno == oldblkno) adp->ad_freefrag = NULL; else adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize); ACQUIRE_LOCK(&lk); if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocext: lost block"); inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; if (newblk->nb_state == DEPCOMPLETE) { adp->ad_state |= DEPCOMPLETE; adp->ad_buf = NULL; } else { bmsafemap = newblk->nb_bmsafemap; adp->ad_buf = bmsafemap->sm_buf; LIST_REMOVE(newblk, nb_deps); LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps); } LIST_REMOVE(newblk, nb_hash); FREE(newblk, M_NEWBLK); WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); if (lbn >= NXADDR) panic("softdep_setup_allocext: lbn %lld > NXADDR", (long long)lbn); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. 
*/ adphead = &inodedep->id_newextupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_lbn <= lbn) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_lbn == lbn) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_lbn >= lbn) break; } if (oldadp == NULL) panic("softdep_setup_allocext: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_lbn == lbn) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(&lk); } /* * Indirect block allocation dependencies. * * The same dependencies that exist for a direct block also exist when * a new block is allocated and pointed to by an entry in a block of * indirect pointers. The undo/redo states described above are also * used here. Because an indirect block contains many pointers that * may have dependencies, a second copy of the entire in-memory indirect * block is kept. The buffer cache copy is always completely up-to-date. * The second copy, which is used only as a source for disk writes, * contains only the safe pointers (i.e., those that have no remaining * update dependencies). The second copy is freed when all pointers * are safe. The cache is not allowed to replace indirect blocks with * pending update dependencies. If a buffer containing an indirect * block with dependencies is written, these routines will mark it * dirty again. It can only be successfully written once all the * dependencies are removed. The ffs_fsync routine in conjunction with * softdep_sync_metadata work together to get all the dependencies * removed so that a file can be successfully written to disk. Three * procedures are used when setting up indirect block pointer * dependencies. The division is necessary because of the organization * of the "balloc" routine and because of the distinction between file * pages and file metadata blocks. */ /* * Allocate a new allocindir structure. */ static struct allocindir * newallocindir(ip, ptrno, newblkno, oldblkno) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ { struct allocindir *aip; MALLOC(aip, struct allocindir *, sizeof(struct allocindir), M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump)); aip->ai_state = ATTACHED; aip->ai_offset = ptrno; aip->ai_newblkno = newblkno; aip->ai_oldblkno = oldblkno; aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize); return (aip); } /* * Called just before setting an indirect block pointer * to a newly allocated file page. 
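 *
 * A sketch of the expected call order, with hypothetical variable
 * names (this is not the literal ffs_balloc code):
 *
 *	nb = <newly allocated disk block for the page>;
 *	softdep_setup_allocindir_page(ip, lbn, ibp, ptrno, nb, obn, nbp);
 *	bap[ptrno] = nb;	<in-core indirect updated afterwards>
 *	bdwrite(ibp);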
*/ void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; /* inode for file being extended */ ufs_lbn_t lbn; /* allocated block number within file */ struct buf *bp; /* buffer with indirect blk referencing page */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ struct buf *nbp; /* buffer holding allocated page */ { struct allocindir *aip; struct pagedep *pagedep; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); aip = newallocindir(ip, ptrno, newblkno, oldblkno); ACQUIRE_LOCK(&lk); /* * If we are allocating a directory page, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR && pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); setup_allocindir_phase2(bp, ip, aip); FREE_LOCK(&lk); } /* * Called just before setting an indirect block pointer to a * newly allocated indirect block. */ void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; /* newly allocated indirect block */ struct inode *ip; /* inode for file being extended */ struct buf *bp; /* indirect block referencing allocated block */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ { struct allocindir *aip; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); aip = newallocindir(ip, ptrno, newblkno, 0); ACQUIRE_LOCK(&lk); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); setup_allocindir_phase2(bp, ip, aip); FREE_LOCK(&lk); } /* * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ static void setup_allocindir_phase2(bp, ip, aip) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ struct allocindir *aip; /* allocindir allocated by the above routines */ { struct worklist *wk; struct indirdep *indirdep, *newindirdep; struct bmsafemap *bmsafemap; struct allocindir *oldaip; struct freefrag *freefrag; struct newblk *newblk; ufs2_daddr_t blkno; mtx_assert(&lk, MA_OWNED); if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); for (indirdep = NULL, newindirdep = NULL; ; ) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type != D_INDIRDEP) continue; indirdep = WK_INDIRDEP(wk); break; } if (indirdep == NULL && newindirdep) { indirdep = newindirdep; WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); newindirdep = NULL; } if (indirdep) { if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0, &newblk) == 0) panic("setup_allocindir: lost block"); if (newblk->nb_state == DEPCOMPLETE) { aip->ai_state |= DEPCOMPLETE; aip->ai_buf = NULL; } else { bmsafemap = newblk->nb_bmsafemap; aip->ai_buf = bmsafemap->sm_buf; LIST_REMOVE(newblk, nb_deps); LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd, aip, ai_deps); } LIST_REMOVE(newblk, nb_hash); FREE(newblk, M_NEWBLK); aip->ai_indirdep = indirdep; /* * Check to see if there is an existing dependency * for this block. If there is, merge the old * dependency into the new one. 
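 *
 * A worked example of the merge below, with illustrative block
 * numbers: a still-pending allocation recorded old -> A (oldaip has
 * ai_oldblkno == old, ai_newblkno == A) and a new allocation now
 * replaces A with B (aip has ai_oldblkno == A).  After the merge, aip
 * describes old -> B; block A, which never became visible on disk,
 * is released immediately through aip's original freefrag, while
 * oldaip's freefrag for "old" is adopted by aip and held until the
 * new dependency completes.  The rollback value left in the save
 * buffer is once again "old".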
*/ if (aip->ai_oldblkno == 0) oldaip = NULL; else LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) if (oldaip->ai_offset == aip->ai_offset) break; freefrag = NULL; if (oldaip != NULL) { if (oldaip->ai_newblkno != aip->ai_oldblkno) panic("setup_allocindir_phase2: blkno"); aip->ai_oldblkno = oldaip->ai_oldblkno; freefrag = aip->ai_freefrag; aip->ai_freefrag = oldaip->ai_freefrag; oldaip->ai_freefrag = NULL; free_allocindir(oldaip, NULL); } LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); if (ip->i_ump->um_fstype == UFS1) ((ufs1_daddr_t *)indirdep->ir_savebp->b_data) [aip->ai_offset] = aip->ai_oldblkno; else ((ufs2_daddr_t *)indirdep->ir_savebp->b_data) [aip->ai_offset] = aip->ai_oldblkno; FREE_LOCK(&lk); if (freefrag != NULL) handle_workitem_freefrag(freefrag); } else FREE_LOCK(&lk); if (newindirdep) { newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE; brelse(newindirdep->ir_savebp); ACQUIRE_LOCK(&lk); WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP); if (indirdep) break; FREE_LOCK(&lk); } if (indirdep) { ACQUIRE_LOCK(&lk); break; } MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep), M_INDIRDEP, M_SOFTDEP_FLAGS); workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, UFSTOVFS(ip->i_ump)); newindirdep->ir_state = ATTACHED; if (ip->i_ump->um_fstype == UFS1) newindirdep->ir_state |= UFS1FMT; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); if (bp->b_blkno == bp->b_lblkno) { ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; } newindirdep->ir_savebp = getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); BUF_KERNPROC(newindirdep->ir_savebp); bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); ACQUIRE_LOCK(&lk); } } /* * Block de-allocation dependencies. * * When blocks are de-allocated, the on-disk pointers must be nullified before * the blocks are made available for use by other files. (The true * requirement is that old pointers must be nullified before new on-disk * pointers are set. We chose this slightly more stringent requirement to * reduce complexity.) Our implementation handles this dependency by updating * the inode (or indirect block) appropriately but delaying the actual block * de-allocation (i.e., freemap and free space count manipulation) until * after the updated versions reach stable storage. After the disk is * updated, the blocks can be safely de-allocated whenever it is convenient. * This implementation handles only the common case of reducing a file's * length to zero. Other cases are handled by the conventional synchronous * write approach. * * The ffs implementation with which we worked double-checks * the state of the block pointers and file size as it reduces * a file's length. Some of this code is replicated here in our * soft updates implementation. The freeblks->fb_chkcnt field is * used to transfer a part of this information to the procedure * that eventually de-allocates the blocks. * * This routine should be called from the routine that shortens * a file's length, before the inode's size or block pointers * are modified. It will save the block pointer information for * later release and zero the inode so that the calling routine * can release it. 
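 *
 * A sketch of the expected call site, assuming the usual
 * truncate-to-zero path (hypothetical caller, not the literal
 * ffs_truncate code):
 *
 *	if (DOINGSOFTDEP(vp) && length == 0) {
 *		softdep_setup_freeblocks(ip, (off_t)0,
 *		    IO_NORMAL | IO_EXT);
 *		<inode is now zero'ed; caller updates and releases it>
 *	}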
*/ void softdep_setup_freeblocks(ip, length, flags) struct inode *ip; /* The inode whose length is to be reduced */ off_t length; /* The new length for the file */ int flags; /* IO_EXT and/or IO_NORMAL */ { struct freeblks *freeblks; struct inodedep *inodedep; struct allocdirect *adp; struct vnode *vp; struct buf *bp; struct fs *fs; ufs2_daddr_t extblocks, datablocks; struct mount *mp; int i, delay, error; fs = ip->i_fs; mp = UFSTOVFS(ip->i_ump); if (length != 0) panic("softdep_setup_freeblocks: non-zero length"); MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); freeblks->fb_state = ATTACHED; freeblks->fb_uid = ip->i_uid; freeblks->fb_previousinum = ip->i_number; freeblks->fb_devvp = ip->i_devvp; extblocks = 0; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); datablocks = DIP(ip, i_blocks) - extblocks; if ((flags & IO_NORMAL) == 0) { freeblks->fb_oldsize = 0; freeblks->fb_chkcnt = 0; } else { freeblks->fb_oldsize = ip->i_size; ip->i_size = 0; DIP_SET(ip, i_size, 0); freeblks->fb_chkcnt = datablocks; for (i = 0; i < NDADDR; i++) { freeblks->fb_dblks[i] = DIP(ip, i_db[i]); DIP_SET(ip, i_db[i], 0); } for (i = 0; i < NIADDR; i++) { freeblks->fb_iblks[i] = DIP(ip, i_ib[i]); DIP_SET(ip, i_ib[i], 0); } /* * If the file was removed, then the space being freed was * accounted for then (see softdep_releasefile()). If the * file is merely being truncated, then we account for it now. */ if ((ip->i_flag & IN_SPACECOUNTED) == 0) { UFS_LOCK(ip->i_ump); fs->fs_pendingblocks += datablocks; UFS_UNLOCK(ip->i_ump); } } if ((flags & IO_EXT) == 0) { freeblks->fb_oldextsize = 0; } else { freeblks->fb_oldextsize = ip->i_din2->di_extsize; ip->i_din2->di_extsize = 0; freeblks->fb_chkcnt += extblocks; for (i = 0; i < NXADDR; i++) { freeblks->fb_eblks[i] = ip->i_din2->di_extb[i]; ip->i_din2->di_extb[i] = 0; } } DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt); /* * Push the zero'ed inode to its disk buffer so that we are free * to delete its dependencies below. Once the dependencies are gone * the buffer can be safely released. */ if ((error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp)) != 0) { brelse(bp); softdep_error("softdep_setup_freeblocks", error); } if (ip->i_ump->um_fstype == UFS1) *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; else *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; /* * Find and eliminate any inode dependencies. */ ACQUIRE_LOCK(&lk); (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); if ((inodedep->id_state & IOSTARTED) != 0) panic("softdep_setup_freeblocks: inode busy"); /* * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. If we * still have a bitmap dependency (delay == 0), then the inode * has never been written to disk, so we can process the * freeblks below once we have deleted the dependencies. */ delay = (inodedep->id_state & DEPCOMPLETE); if (delay) WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); /* * Because the file length has been truncated to zero, any * pending block allocation dependency structures associated * with this inode are obsolete and can simply be de-allocated. * We must first merge the two dependency lists to get rid of * any duplicate freefrag structures, then purge the merged list.
* If we still have a bitmap dependency, then the inode has never * been written to disk, so we can free any fragments without delay. */ if (flags & IO_NORMAL) { merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) free_allocdirect(&inodedep->id_inoupdt, adp, delay); } if (flags & IO_EXT) { merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) free_allocdirect(&inodedep->id_extupdt, adp, delay); } FREE_LOCK(&lk); bdwrite(bp); /* * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. * Once they are all there, walk the list and get rid of * any dependencies. */ vp = ITOV(ip); VI_LOCK(vp); drain_output(vp); restart: TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0)) continue; if ((bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT)) == NULL) goto restart; VI_UNLOCK(vp); ACQUIRE_LOCK(&lk); (void) inodedep_lookup(mp, ip->i_number, 0, &inodedep); deallocate_dependencies(bp, inodedep); FREE_LOCK(&lk); bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); VI_LOCK(vp); goto restart; } VI_UNLOCK(vp); ACQUIRE_LOCK(&lk); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) (void) free_inodedep(inodedep); if(delay) { freeblks->fb_state |= DEPCOMPLETE; /* * If the inode with zeroed block pointers is now on disk * we can start freeing blocks. Add freeblks to the worklist * instead of calling handle_workitem_freeblocks directly as * it is more likely that additional IO is needed to complete * the request here than in the !delay case. */ if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freeblks->fb_list); } FREE_LOCK(&lk); /* * If the inode has never been written to disk (delay == 0), * then we can process the freeblks now that we have deleted * the dependencies. */ if (!delay) handle_workitem_freeblocks(freeblks, 0); } /* * Reclaim any dependency structures from a buffer that is about to * be reallocated to a new vnode. The buffer must be locked, thus, * no I/O completion operations can occur while we are manipulating * its associated dependencies. The mutex is held so that other I/O's * associated with related dependencies do not occur. */ static void deallocate_dependencies(bp, inodedep) struct buf *bp; struct inodedep *inodedep; { struct worklist *wk; struct indirdep *indirdep; struct allocindir *aip; struct pagedep *pagedep; struct dirrem *dirrem; struct diradd *dap; int i; mtx_assert(&lk, MA_OWNED); while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { switch (wk->wk_type) { case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); /* * None of the indirect pointers will ever be visible, * so they can simply be tossed. GOINGAWAY ensures * that allocated pointers will be saved in the buffer * cache until they are freed. Note that they will * only be able to be found by their physical address * since the inode mapping the logical address will * be gone. The save buffer used for the safe copy * was allocated in setup_allocindir_phase2 using * the physical address so it could be used for this * purpose. Hence we swap the safe copy with the real * copy, allowing the safe copy to be freed and holding * on to the real copy for later use in indir_trunc. 
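 *
 * Schematically, before the swap below the buffer cache copy (bp)
 * holds the up-to-date pointers and ir_savebp holds the safe copy;
 * after the bcopy and the worklist move, ir_savebp holds the real
 * pointers, findable only by physical address, and the caller may
 * invalidate bp.  indir_trunc() later locates the saved copy through
 * that physical address.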
*/ if (indirdep->ir_state & GOINGAWAY) panic("deallocate_dependencies: already gone"); indirdep->ir_state |= GOINGAWAY; VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1; while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) free_allocindir(aip, inodedep); if (bp->b_lblkno >= 0 || bp->b_blkno != indirdep->ir_savebp->b_lblkno) panic("deallocate_dependencies: not indir"); bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); WORKLIST_REMOVE(wk); WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk); continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); /* * None of the directory additions will ever be * visible, so they can simply be tossed. */ for (i = 0; i < DAHASHSZ; i++) while ((dap = LIST_FIRST(&pagedep->pd_diraddhd[i]))) free_diradd(dap); while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0) free_diradd(dap); /* * Copy any directory remove dependencies to the list * to be processed after the zero'ed inode is written. * If the inode has already been written, then they * can be dumped directly onto the work list. */ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; if (inodedep == NULL || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&dirrem->dm_list); else WORKLIST_INSERT(&inodedep->id_bufwait, &dirrem->dm_list); } if ((pagedep->pd_state & NEWBLOCK) != 0) { LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list) if (wk->wk_type == D_NEWDIRBLK && WK_NEWDIRBLK(wk)->db_pagedep == pagedep) break; if (wk != NULL) { WORKLIST_REMOVE(wk); free_newdirblk(WK_NEWDIRBLK(wk)); } else panic("deallocate_dependencies: " "lost pagedep"); } WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); continue; case D_ALLOCINDIR: free_allocindir(WK_ALLOCINDIR(wk), inodedep); continue; case D_ALLOCDIRECT: case D_INODEDEP: panic("deallocate_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ default: panic("deallocate_dependencies: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } } /* * Free an allocdirect. Generate a new freefrag work request if appropriate. * This routine must be called with splbio interrupts blocked. */ static void free_allocdirect(adphead, adp, delay) struct allocdirectlst *adphead; struct allocdirect *adp; int delay; { struct newdirblk *newdirblk; struct worklist *wk; mtx_assert(&lk, MA_OWNED); if ((adp->ad_state & DEPCOMPLETE) == 0) LIST_REMOVE(adp, ad_deps); TAILQ_REMOVE(adphead, adp, ad_next); if ((adp->ad_state & COMPLETE) == 0) WORKLIST_REMOVE(&adp->ad_list); if (adp->ad_freefrag != NULL) { if (delay) WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, &adp->ad_freefrag->ff_list); else add_to_worklist(&adp->ad_freefrag->ff_list); } if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) { newdirblk = WK_NEWDIRBLK(wk); WORKLIST_REMOVE(&newdirblk->db_list); - if (LIST_FIRST(&adp->ad_newdirblk) != NULL) + if (!LIST_EMPTY(&adp->ad_newdirblk)) panic("free_allocdirect: extra newdirblk"); if (delay) WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, &newdirblk->db_list); else free_newdirblk(newdirblk); } WORKITEM_FREE(adp, D_ALLOCDIRECT); } /* * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. * This routine must be called with splbio interrupts blocked. 
*/ static void free_newdirblk(newdirblk) struct newdirblk *newdirblk; { struct pagedep *pagedep; struct diradd *dap; int i; mtx_assert(&lk, MA_OWNED); /* * If the pagedep is still linked onto the directory buffer * dependency chain, then some of the entries on the * pd_pendinghd list may not be committed to disk yet. In * this case, we will simply clear the NEWBLOCK flag and * let the pd_pendinghd list be processed when the pagedep * is next written. If the pagedep is no longer on the buffer * dependency chain, then all the entries on the pd_pending * list are committed to disk and we can free them here. */ pagedep = newdirblk->db_pagedep; pagedep->pd_state &= ~NEWBLOCK; if ((pagedep->pd_state & ONWORKLIST) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap); /* * If no dependencies remain, the pagedep will be freed. */ for (i = 0; i < DAHASHSZ; i++) - if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL) + if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) break; if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) { LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); } WORKITEM_FREE(newdirblk, D_NEWDIRBLK); } /* * Prepare an inode to be freed. The actual free operation is not * done until the zero'ed inode has been written to disk. */ void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { struct inode *ip = VTOI(pvp); struct inodedep *inodedep; struct freefile *freefile; /* * This sets up the inode de-allocation dependency. */ MALLOC(freefile, struct freefile *, sizeof(struct freefile), M_FREEFILE, M_SOFTDEP_FLAGS); workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); freefile->fx_mode = mode; freefile->fx_oldinum = ino; freefile->fx_devvp = ip->i_devvp; if ((ip->i_flag & IN_SPACECOUNTED) == 0) { UFS_LOCK(ip->i_ump); ip->i_fs->fs_pendinginodes += 1; UFS_UNLOCK(ip->i_ump); } /* * If the inodedep does not exist, then the zero'ed inode has * been written to disk. If the allocated inode has never been * written to disk, then the on-disk inode is zero'ed. In either * case we can free the file immediately. */ ACQUIRE_LOCK(&lk); if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 || check_inode_unwritten(inodedep)) { FREE_LOCK(&lk); handle_workitem_freefile(freefile); return; } WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); FREE_LOCK(&lk); } /* * Check to see if an inode has never been written to disk. If * so free the inodedep and return success, otherwise return failure. * This routine must be called with splbio interrupts blocked. * * If we still have a bitmap dependency, then the inode has never * been written to disk. Drop the dependency as it is no longer * necessary since the inode is being deallocated. We set the * ALLCOMPLETE flags since the bitmap now properly shows that the * inode is not allocated. Even if the inode is actively being * written, it has been rolled back to its zero'ed state, so we * are ensured that a zero inode is what is on the disk. For short * lived files, this change will usually result in removing all the * dependencies from the inode so that it can be freed immediately. 
*/ static int check_inode_unwritten(inodedep) struct inodedep *inodedep; { mtx_assert(&lk, MA_OWNED); if ((inodedep->id_state & DEPCOMPLETE) != 0 || - LIST_FIRST(&inodedep->id_pendinghd) != NULL || - LIST_FIRST(&inodedep->id_bufwait) != NULL || - LIST_FIRST(&inodedep->id_inowait) != NULL || - TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || - TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || - TAILQ_FIRST(&inodedep->id_extupdt) != NULL || - TAILQ_FIRST(&inodedep->id_newextupdt) != NULL || + !LIST_EMPTY(&inodedep->id_pendinghd) || + !LIST_EMPTY(&inodedep->id_bufwait) || + !LIST_EMPTY(&inodedep->id_inowait) || + !TAILQ_EMPTY(&inodedep->id_inoupdt) || + !TAILQ_EMPTY(&inodedep->id_newinoupdt) || + !TAILQ_EMPTY(&inodedep->id_extupdt) || + !TAILQ_EMPTY(&inodedep->id_newextupdt) || inodedep->id_nlinkdelta != 0) return (0); /* * Another process might be in initiate_write_inodeblock_ufs[12] * trying to allocate memory without holding "Softdep Lock". */ if ((inodedep->id_state & IOSTARTED) != 0 && inodedep->id_savedino1 == NULL) return (0); inodedep->id_state |= ALLCOMPLETE; LIST_REMOVE(inodedep, id_deps); inodedep->id_buf = NULL; if (inodedep->id_state & ONWORKLIST) WORKLIST_REMOVE(&inodedep->id_list); if (inodedep->id_savedino1 != NULL) { FREE(inodedep->id_savedino1, M_SAVEDINO); inodedep->id_savedino1 = NULL; } if (free_inodedep(inodedep) == 0) panic("check_inode_unwritten: busy inode"); return (1); } /* * Try to free an inodedep structure. Return 1 if it could be freed. */ static int free_inodedep(inodedep) struct inodedep *inodedep; { mtx_assert(&lk, MA_OWNED); if ((inodedep->id_state & ONWORKLIST) != 0 || (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || - LIST_FIRST(&inodedep->id_pendinghd) != NULL || - LIST_FIRST(&inodedep->id_bufwait) != NULL || - LIST_FIRST(&inodedep->id_inowait) != NULL || - TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || - TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || - TAILQ_FIRST(&inodedep->id_extupdt) != NULL || - TAILQ_FIRST(&inodedep->id_newextupdt) != NULL || + !LIST_EMPTY(&inodedep->id_pendinghd) || + !LIST_EMPTY(&inodedep->id_bufwait) || + !LIST_EMPTY(&inodedep->id_inowait) || + !TAILQ_EMPTY(&inodedep->id_inoupdt) || + !TAILQ_EMPTY(&inodedep->id_newinoupdt) || + !TAILQ_EMPTY(&inodedep->id_extupdt) || + !TAILQ_EMPTY(&inodedep->id_newextupdt) || inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL) return (0); LIST_REMOVE(inodedep, id_hash); WORKITEM_FREE(inodedep, D_INODEDEP); num_inodedep -= 1; return (1); } /* * This workitem routine performs the block de-allocation. * The workitem is added to the pending list after the updated * inode block has been written to disk. As mentioned above, * checks regarding the number of blocks de-allocated (compared * to the number of blocks allocated for the file) are also * performed in this function. */ static void handle_workitem_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { struct inode *ip; struct vnode *vp; struct fs *fs; struct ufsmount *ump; int i, nblocks, level, bsize; ufs2_daddr_t bn, blocksreleased = 0; int error, allerror = 0; ufs_lbn_t baselbns[NIADDR], tmpval; int fs_pendingblocks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; fs_pendingblocks = 0; tmpval = 1; baselbns[0] = NDADDR; for (i = 1; i < NIADDR; i++) { tmpval *= NINDIR(fs); baselbns[i] = baselbns[i - 1] + tmpval; } nblocks = btodb(fs->fs_bsize); blocksreleased = 0; /* * Release all extended attribute blocks or frags. 
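 *
 * The loop below frees from the highest-numbered pointer downward;
 * sblksize() yields a full fs_bsize for every slot except possibly
 * the one holding the old end of the data, which may be a fragment.
 * For example, with 8K blocks and a 12K extended attribute area,
 * slot 1 releases a 4K fragment and slot 0 a full 8K block.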
*/ if (freeblks->fb_oldextsize > 0) { for (i = (NXADDR - 1); i >= 0; i--) { if ((bn = freeblks->fb_eblks[i]) == 0) continue; bsize = sblksize(fs, freeblks->fb_oldextsize, i); ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, freeblks->fb_previousinum); blocksreleased += btodb(bsize); } } /* * Release all data blocks or frags. */ if (freeblks->fb_oldsize > 0) { /* * Indirect blocks first. */ for (level = (NIADDR - 1); level >= 0; level--) { if ((bn = freeblks->fb_iblks[level]) == 0) continue; if ((error = indir_trunc(freeblks, fsbtodb(fs, bn), level, baselbns[level], &blocksreleased)) != 0) allerror = error; ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, fs->fs_bsize, freeblks->fb_previousinum); fs_pendingblocks += nblocks; blocksreleased += nblocks; } /* * All direct blocks or frags. */ for (i = (NDADDR - 1); i >= 0; i--) { if ((bn = freeblks->fb_dblks[i]) == 0) continue; bsize = sblksize(fs, freeblks->fb_oldsize, i); ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize, freeblks->fb_previousinum); fs_pendingblocks += btodb(bsize); blocksreleased += btodb(bsize); } } UFS_LOCK(ump); fs->fs_pendingblocks -= fs_pendingblocks; UFS_UNLOCK(ump); /* * If we still have not finished background cleanup, then check * to see if the block count needs to be adjusted. */ if (freeblks->fb_chkcnt != blocksreleased && (fs->fs_flags & FS_UNCLEAN) != 0 && ffs_vget(freeblks->fb_list.wk_mp, freeblks->fb_previousinum, (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) { ip = VTOI(vp); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \ freeblks->fb_chkcnt - blocksreleased); ip->i_flag |= IN_CHANGE; vput(vp); } #ifdef DIAGNOSTIC if (freeblks->fb_chkcnt != blocksreleased && ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0)) printf("handle_workitem_freeblocks: block count\n"); if (allerror) softdep_error("handle_workitem_freeblks", allerror); #endif /* DIAGNOSTIC */ ACQUIRE_LOCK(&lk); WORKITEM_FREE(freeblks, D_FREEBLKS); FREE_LOCK(&lk); } /* * Release blocks associated with the inode ip and stored in the indirect * block dbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. */ static int indir_trunc(freeblks, dbn, level, lbn, countp) struct freeblks *freeblks; ufs2_daddr_t dbn; int level; ufs_lbn_t lbn; ufs2_daddr_t *countp; { struct buf *bp; struct fs *fs; struct worklist *wk; struct indirdep *indirdep; struct ufsmount *ump; ufs1_daddr_t *bap1 = 0; ufs2_daddr_t nb, *bap2 = 0; ufs_lbn_t lbnadd; int i, nblocks, ufs1fmt; int error, allerror = 0; int fs_pendingblocks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; fs_pendingblocks = 0; lbnadd = 1; for (i = level; i > 0; i--) lbnadd *= NINDIR(fs); /* * Get buffer of block pointers to be freed. This routine is not * called until the zero'ed inode has been written, so it is safe * to free blocks as they are encountered. Because the inode has * been zero'ed, calls to bmap on these blocks will fail. So, we * have to use the on-disk address and the block device for the * filesystem to look them up. If the file was deleted before its * indirect blocks were all written to disk, the routine that set * us up (deallocate_dependencies) will have arranged to leave * a complete copy of the indirect block in memory for our use. * Otherwise we have to read the blocks in from the disk. 
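 *
 * For the lbn bookkeeping: lbnadd, computed above, is NINDIR(fs)
 * raised to this level, i.e. the span of logical blocks beneath one
 * pointer slot, so the subtree under slot i begins at lbn + i * lbnadd
 * and that value is passed down the recursion.  As a concrete example,
 * for UFS1 with 8K blocks NINDIR(fs) is 2048, and the double indirect
 * (level 1) is entered at baselbns[1] == NDADDR + 2048.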
*/ #ifdef notyet bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0, GB_NOCREAT); #else bp = incore(&freeblks->fb_devvp->v_bufobj, dbn); #endif ACQUIRE_LOCK(&lk); if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) { if (wk->wk_type != D_INDIRDEP || (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp || (indirdep->ir_state & GOINGAWAY) == 0) panic("indir_trunc: lost indirdep"); WORKLIST_REMOVE(wk); WORKITEM_FREE(indirdep, D_INDIRDEP); - if (LIST_FIRST(&bp->b_dep) != NULL) + if (!LIST_EMPTY(&bp->b_dep)) panic("indir_trunc: dangling dep"); ump->um_numindirdeps -= 1; FREE_LOCK(&lk); } else { #ifdef notyet if (bp) brelse(bp); #endif FREE_LOCK(&lk); error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } } /* * Recursively free indirect blocks. */ if (ump->um_fstype == UFS1) { ufs1fmt = 1; bap1 = (ufs1_daddr_t *)bp->b_data; } else { ufs1fmt = 0; bap2 = (ufs2_daddr_t *)bp->b_data; } nblocks = btodb(fs->fs_bsize); for (i = NINDIR(fs) - 1; i >= 0; i--) { if (ufs1fmt) nb = bap1[i]; else nb = bap2[i]; if (nb == 0) continue; if (level != 0) { if ((error = indir_trunc(freeblks, fsbtodb(fs, nb), level - 1, lbn + (i * lbnadd), countp)) != 0) allerror = error; } ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, freeblks->fb_previousinum); fs_pendingblocks += nblocks; *countp += nblocks; } UFS_LOCK(ump); fs->fs_pendingblocks -= fs_pendingblocks; UFS_UNLOCK(ump); bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (allerror); } /* * Free an allocindir. * This routine must be called with splbio interrupts blocked. */ static void free_allocindir(aip, inodedep) struct allocindir *aip; struct inodedep *inodedep; { struct freefrag *freefrag; mtx_assert(&lk, MA_OWNED); if ((aip->ai_state & DEPCOMPLETE) == 0) LIST_REMOVE(aip, ai_deps); if (aip->ai_state & ONWORKLIST) WORKLIST_REMOVE(&aip->ai_list); LIST_REMOVE(aip, ai_next); if ((freefrag = aip->ai_freefrag) != NULL) { if (inodedep == NULL) add_to_worklist(&freefrag->ff_list); else WORKLIST_INSERT(&inodedep->id_bufwait, &freefrag->ff_list); } WORKITEM_FREE(aip, D_ALLOCINDIR); } /* * Directory entry addition dependencies. * * When adding a new directory entry, the inode (with its incremented link * count) must be written to disk before the directory entry's pointer to it. * Also, if the inode is newly allocated, the corresponding freemap must be * updated (on disk) before the directory entry's pointer. These requirements * are met via undo/redo on the directory entry's pointer, which consists * simply of the inode number. * * As directory entries are added and deleted, the free space within a * directory block can become fragmented. The ufs filesystem will compact * a fragmented directory block to make space for a new entry. When this * occurs, the offsets of previously added entries change. Any "diradd" * dependency structures corresponding to these entries must be updated with * the new offsets. */ /* * This routine is called after the in-memory inode's link * count has been incremented, but before the directory entry's * pointer to the inode has been set. 
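 *
 * A sketch of the expected call pattern, with hypothetical names
 * (this is not the literal ufs_direnter code):
 *
 *	<link count of inode ip already raised; entry written into
 *	 the directory block held in bp at offset dp->i_offset>
 *	if (softdep_setup_directory_add(bp, dp, dp->i_offset,
 *	    ip->i_number, newdirbp, isnewblk))
 *		<non-zero return: the caller must commit the entry
 *		 to disk itself; see the indirect-block case below>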
*/ int softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for directory */ off_t diroffset; /* offset of new entry in directory */ ino_t newinum; /* inode referenced by new directory entry */ struct buf *newdirbp; /* non-NULL => contents of new mkdir */ int isnewblk; /* entry is in a newly allocated block */ { int offset; /* offset of new entry within directory block */ ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; struct allocdirect *adp; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk = 0; struct mkdir *mkdir1, *mkdir2; struct mount *mp; /* * Whiteouts have no dependencies. */ if (newinum == WINO) { if (newdirbp != NULL) bdwrite(newdirbp); return (0); } mp = UFSTOVFS(dp->i_ump); fs = dp->i_fs; lbn = lblkno(fs, diroffset); offset = blkoff(fs, diroffset); MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dap->da_list, D_DIRADD, mp); dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) { MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); } if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(&lk); } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); mkdir1->md_state = MKDIR_BODY; mkdir1->md_diradd = dap; MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); mkdir2->md_state = MKDIR_PARENT; mkdir2->md_diradd = dap; /* * Dependency on "." and ".." being written to disk. */ mkdir1->md_buf = newdirbp; ACQUIRE_LOCK(&lk); LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs); WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list); FREE_LOCK(&lk); bdwrite(newdirbp); /* * Dependency on link count increase for parent directory */ ACQUIRE_LOCK(&lk); if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state &= ~MKDIR_PARENT; WORKITEM_FREE(mkdir2, D_MKDIR); } else { LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); } } /* * Link into parent directory pagedep to await its being written. */ if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); /* * Link into its inodedep. Put it on the id_bufwait list if the inode * is not yet written. If it is written, do the post-inode write * processing to put it on the id_pendinghd list. */ (void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) diradd_inode_written(dap, inodedep); else WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); if (isnewblk) { /* * Directories growing into indirect blocks are rare * enough and the frequency of new block allocation * in those cases even more rare, that we choose not * to bother tracking them. Rather we simply force the * new directory entry to disk. 
*/ if (lbn >= NDADDR) { FREE_LOCK(&lk); /* * We only have a new allocation when at the * beginning of a new block, not when we are * expanding into an existing block. */ if (blkoff(fs, diroffset) == 0) return (1); return (0); } /* * We only have a new allocation when at the beginning * of a new fragment, not when we are expanding into an * existing fragment. Also, there is nothing to do if we * are already tracking this block. */ if (fragoff(fs, diroffset) != 0) { FREE_LOCK(&lk); return (0); } if ((pagedep->pd_state & NEWBLOCK) != 0) { WORKITEM_FREE(newdirblk, D_NEWDIRBLK); FREE_LOCK(&lk); return (0); } /* * Find our associated allocdirect and have it track us. */ if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0) panic("softdep_setup_directory_add: lost inodedep"); adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst); if (adp == NULL || adp->ad_lbn != lbn) panic("softdep_setup_directory_add: lost entry"); pagedep->pd_state |= NEWBLOCK; newdirblk->db_pagedep = pagedep; WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list); } FREE_LOCK(&lk); return (0); } /* * This procedure is called to change the offset of a directory * entry when compacting a directory block which must be owned * exclusively by the caller. Note that the actual entry movement * must be done in this procedure to ensure that no I/O completions * occur while the move is in progress. */ void softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) struct inode *dp; /* inode for directory */ caddr_t base; /* address of dp->i_offset */ caddr_t oldloc; /* address of old directory location */ caddr_t newloc; /* address of new directory location */ int entrysize; /* size of directory entry */ { int offset, oldoffset, newoffset; struct pagedep *pagedep; struct diradd *dap; ufs_lbn_t lbn; ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0) goto done; oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) { if (dap->da_offset != oldoffset) continue; dap->da_offset = newoffset; if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset)) break; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)], dap, da_pdlist); break; } if (dap == NULL) { LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) { if (dap->da_offset == oldoffset) { dap->da_offset = newoffset; break; } } } done: bcopy(oldloc, newloc, entrysize); FREE_LOCK(&lk); } /* * Free a diradd dependency structure. This routine must be called * with splbio interrupts blocked. 
*/ static void free_diradd(dap) struct diradd *dap; { struct dirrem *dirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct mkdir *mkdir, *nextmd; mtx_assert(&lk, MA_OWNED); WORKLIST_REMOVE(&dap->da_list); LIST_REMOVE(dap, da_pdlist); if ((dap->da_state & DIRCHG) == 0) { pagedep = dap->da_pagedep; } else { dirrem = dap->da_previous; pagedep = dirrem->dm_pagedep; dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) (void) free_inodedep(inodedep); if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != dap) continue; dap->da_state &= ~mkdir->md_state; WORKLIST_REMOVE(&mkdir->md_list); LIST_REMOVE(mkdir, md_mkdirs); WORKITEM_FREE(mkdir, D_MKDIR); } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("free_diradd: unfound ref"); } WORKITEM_FREE(dap, D_DIRADD); } /* * Directory entry removal dependencies. * * When removing a directory entry, the entry's inode pointer must be * zero'ed on disk before the corresponding inode's link count is decremented * (possibly freeing the inode for re-use). This dependency is handled by * updating the directory entry but delaying the inode count reduction until * after the directory block has been written to disk. After this point, the * inode count can be decremented whenever it is convenient. */ /* * This routine should be called immediately after removing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will do this task when it is safe. */ void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ { struct dirrem *dirrem, *prevdirrem; /* * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. */ dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); /* * If the COMPLETE flag is clear, then there were no active * entries and we want to roll back to a zeroed entry until * the new inode is committed to disk. If the COMPLETE flag is * set then we have deleted an entry that never made it to * disk. If the entry we deleted resulted from a name change, * then the old name still resides on disk. We cannot delete * its inode (returned to us in prevdirrem) until the zeroed * directory entry gets to disk. The new inode has never been * referenced on the disk, so can be deleted immediately. */ if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, dm_next); FREE_LOCK(&lk); } else { if (prevdirrem != NULL) LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, prevdirrem, dm_next); dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; FREE_LOCK(&lk); handle_workitem_remove(dirrem, NULL); } } /* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. 
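 *
 * Note the asymmetric locking contract of newdirrem() below: it is
 * entered without the softdep mutex and every return path leaves lk
 * held, so callers such as softdep_setup_remove() above pair each
 * call with a later FREE_LOCK(&lk).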
*/ static long num_dirrem; /* number of dirrem allocated */ static struct dirrem * newdirrem(bp, dp, ip, isrmdir, prevdirremp) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ struct dirrem **prevdirremp; /* previously referenced inode, if any */ { int offset; ufs_lbn_t lbn; struct diradd *dap; struct dirrem *dirrem; struct pagedep *pagedep; /* * Whiteouts have no deletion dependencies. */ if (ip == NULL) panic("newdirrem: whiteout"); /* * If we are over our limit, try to improve the situation. * Limiting the number of dirrem structures will also limit * the number of freefile and freeblks structures. */ ACQUIRE_LOCK(&lk); if (num_dirrem > max_softdeps / 2) (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE); num_dirrem += 1; FREE_LOCK(&lk); MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount); dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_oldinum = ip->i_number; *prevdirremp = NULL; ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dirrem->dm_pagedep = pagedep; /* * Check for a diradd dependency for the same directory entry. * If present, then both dependencies become obsolete and can * be de-allocated. Check for an entry on both the pd_dirraddhd * list and the pd_pendinghd list. */ LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) if (dap->da_offset == offset) break; if (dap == NULL) { LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) if (dap->da_offset == offset) break; if (dap == NULL) return (dirrem); } /* * Must be ATTACHED at this point. */ if ((dap->da_state & ATTACHED) == 0) panic("newdirrem: not ATTACHED"); if (dap->da_newinum != ip->i_number) panic("newdirrem: inum %d should be %d", ip->i_number, dap->da_newinum); /* * If we are deleting a changed name that never made it to disk, * then return the dirrem describing the previous inode (which * represents the inode currently referenced from this entry on disk). */ if ((dap->da_state & DIRCHG) != 0) { *prevdirremp = dap->da_previous; dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; } /* * We are deleting an entry that never made it to disk. * Mark it COMPLETE so we can delete its inode immediately. */ dirrem->dm_state |= COMPLETE; free_diradd(dap); return (dirrem); } /* * Directory entry change dependencies. * * Changing an existing directory entry requires that an add operation * be completed first followed by a deletion. The semantics for the addition * are identical to the description of adding a new entry above except * that the rollback is to the old inode number rather than zero. Once * the addition dependency is completed, the removal is done as described * in the removal routine above. */ /* * This routine should be called immediately after changing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will perform this task when it is safe. 
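 *
 * A sketch of the rename case this serves (hypothetical caller, not
 * the literal ufs_rename code): overwriting the target entry's inode
 * number replaces a separate remove-then-add pair with one call,
 *
 *	softdep_setup_directory_change(bp, dp, oldip, ip->i_number,
 *	    isrmdir);
 *
 * where isrmdir is 0, 1, or the new parent's inode number, as
 * enumerated in the comment within the routine.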
*/ void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ ino_t newinum; /* new inode number for changed entry */ int isrmdir; /* indicates if doing RMDIR */ { int offset; struct diradd *dap = NULL; struct dirrem *dirrem, *prevdirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct mount *mp; offset = blkoff(dp->i_fs, dp->i_offset); mp = UFSTOVFS(dp->i_ump); /* * Whiteouts do not need diradd dependencies. */ if (newinum != WINO) { MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dap->da_list, D_DIRADD, mp); dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; dap->da_newinum = newinum; } /* * Allocate a new dirrem and ACQUIRE_LOCK. */ dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); pagedep = dirrem->dm_pagedep; /* * The possible values for isrmdir: * 0 - non-directory file rename * 1 - directory rename within same directory * inum - directory rename to new directory of given inode number * When renaming to a new directory, we are both deleting and * creating a new directory entry, so the link count on the new * directory should not change. Thus we do not need the followup * dirrem which is usually done in handle_workitem_remove. We set * the DIRCHG flag to tell handle_workitem_remove to skip the * followup dirrem. */ if (isrmdir > 1) dirrem->dm_state |= DIRCHG; /* * Whiteouts have no additional dependencies, * so just put the dirrem on the correct list. */ if (newinum == WINO) { if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, dm_next); } else { dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } FREE_LOCK(&lk); return; } /* * If the COMPLETE flag is clear, then there were no active * entries and we want to roll back to the previous inode until * the new inode is committed to disk. If the COMPLETE flag is * set, then we have deleted an entry that never made it to disk. * If the entry we deleted resulted from a name change, then the old * inode reference still resides on disk. Any rollback that we do * needs to be to that old inode (returned to us in prevdirrem). If * the entry we deleted resulted from a create, then there is * no entry on the disk, so we want to roll back to zero rather * than the uncommitted inode. In either of the COMPLETE cases we * want to immediately free the unwritten and unreferenced inode. */ if ((dirrem->dm_state & COMPLETE) == 0) { dap->da_previous = dirrem; } else { if (prevdirrem != NULL) { dap->da_previous = prevdirrem; } else { dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; } dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } /* * Link into its inodedep. Put it on the id_bufwait list if the inode * is not yet written. If it is written, do the post-inode write * processing to put it on the id_pendinghd list. */ if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state |= COMPLETE; LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } else { LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } FREE_LOCK(&lk); } /* * Called whenever the link count on an inode is changed. 
* It creates an inode dependency so that the new reference(s) * to the inode cannot be committed to disk until the updated * inode has been written. */ void softdep_change_linkcnt(ip) struct inode *ip; /* the inode with the increased link count */ { struct inodedep *inodedep; ACQUIRE_LOCK(&lk); (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep); if (ip->i_nlink < ip->i_effnlink) panic("softdep_change_linkcnt: bad delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; FREE_LOCK(&lk); } /* * Called when the effective link count and the reference count * on an inode drops to zero. At this point there are no names * referencing the file in the filesystem and no active file * references. The space associated with the file will be freed * as soon as the necessary soft dependencies are cleared. */ void softdep_releasefile(ip) struct inode *ip; /* inode with the zero effective link count */ { struct inodedep *inodedep; struct fs *fs; int extblocks; if (ip->i_effnlink > 0) panic("softdep_releasefile: file still referenced"); /* * We may be called several times as the on-disk link count * drops to zero. We only want to account for the space once. */ if (ip->i_flag & IN_SPACECOUNTED) return; /* * We have to deactivate a snapshot otherwise copyonwrites may * add blocks and the cleanup may remove blocks after we have * tried to account for them. */ if ((ip->i_flags & SF_SNAPSHOT) != 0) ffs_snapremove(ITOV(ip)); /* * If we are tracking an nlinkdelta, we have to also remember * whether we accounted for the freed space yet. */ ACQUIRE_LOCK(&lk); if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep))) inodedep->id_state |= SPACECOUNTED; FREE_LOCK(&lk); fs = ip->i_fs; extblocks = 0; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); UFS_LOCK(ip->i_ump); ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks; ip->i_fs->fs_pendinginodes += 1; UFS_UNLOCK(ip->i_ump); ip->i_flag |= IN_SPACECOUNTED; } /* * This workitem decrements the inode's link count. * If the link count reaches zero, the file is removed. */ static void handle_workitem_remove(dirrem, xp) struct dirrem *dirrem; struct vnode *xp; { struct thread *td = curthread; struct inodedep *inodedep; struct vnode *vp; struct inode *ip; ino_t oldinum; int error; if ((vp = xp) == NULL && (error = ffs_vget(dirrem->dm_list.wk_mp, dirrem->dm_oldinum, LK_EXCLUSIVE, &vp)) != 0) { softdep_error("handle_workitem_remove: vget", error); return; } ip = VTOI(vp); ACQUIRE_LOCK(&lk); if ((inodedep_lookup(dirrem->dm_list.wk_mp, dirrem->dm_oldinum, 0, &inodedep)) == 0) panic("handle_workitem_remove: lost inodedep"); /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad file delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; num_dirrem -= 1; WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(&lk); vput(vp); return; } /* * Directory deletion. Decrement reference count for both the * just deleted parent directory entry and the reference for ".". * Next truncate the directory to length zero. When the * truncation completes, arrange to have the reference count on * the parent decremented to account for the loss of "..". 
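 *
 * A worked example of the accounting below: removing directory D from
 * parent P drops D's link count by 2 here (P's entry for D plus D's
 * own "."), truncates D to length zero, and then recycles this dirrem
 * with dm_oldinum set to the parent's inode number so that a second
 * pass through handle_workitem_remove() decrements P for the lost
 * "..".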
*/ ip->i_nlink -= 2; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad dir delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; FREE_LOCK(&lk); if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0) softdep_error("handle_workitem_remove: truncate", error); ACQUIRE_LOCK(&lk); /* * Rename a directory to a new parent. Since we are both deleting * and creating a new directory entry, the link count on the new * directory should not change. Thus we skip the followup dirrem. */ if (dirrem->dm_state & DIRCHG) { num_dirrem -= 1; WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(&lk); vput(vp); return; } /* * If the inodedep does not exist, then the zero'ed inode has * been written to disk. If the allocated inode has never been * written to disk, then the on-disk inode is zero'ed. In either * case we can remove the file immediately. */ dirrem->dm_state = 0; oldinum = dirrem->dm_oldinum; dirrem->dm_oldinum = dirrem->dm_dirinum; if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum, 0, &inodedep) == 0 || check_inode_unwritten(inodedep)) { FREE_LOCK(&lk); vput(vp); handle_workitem_remove(dirrem, NULL); return; } WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); FREE_LOCK(&lk); ip->i_flag |= IN_CHANGE; ffs_update(vp, 0); vput(vp); } /* * Inode de-allocation dependencies. * * When an inode's link count is reduced to zero, it can be de-allocated. We * found it convenient to postpone de-allocation until after the inode is * written to disk with its new link count (zero). At this point, all of the * on-disk inode's block pointers are nullified and, with careful dependency * list ordering, all dependencies related to the inode will be satisfied and * the corresponding dependency structures de-allocated. So, if/when the * inode is reused, there will be no mixing of old dependencies with new * ones. This artificial dependency is set up by the block de-allocation * procedure above (softdep_setup_freeblocks) and completed by the * following procedure. */ static void handle_workitem_freefile(freefile) struct freefile *freefile; { struct fs *fs; struct inodedep *idp; struct ufsmount *ump; int error; ump = VFSTOUFS(freefile->fx_list.wk_mp); fs = ump->um_fs; #ifdef DEBUG ACQUIRE_LOCK(&lk); error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); FREE_LOCK(&lk); if (error) panic("handle_workitem_freefile: inodedep survived"); #endif UFS_LOCK(ump); fs->fs_pendinginodes -= 1; UFS_UNLOCK(ump); if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, freefile->fx_oldinum, freefile->fx_mode)) != 0) softdep_error("handle_workitem_freefile", error); ACQUIRE_LOCK(&lk); WORKITEM_FREE(freefile, D_FREEFILE); FREE_LOCK(&lk); } /* * Helper function which unlinks marker element from work list and returns * the next element on the list. */ static __inline struct worklist * markernext(struct worklist *marker) { struct worklist *next; next = LIST_NEXT(marker, wk_list); LIST_REMOVE(marker, wk_list); return next; } /* * Disk writes. * * The dependency structures constructed above are most actively used when file * system blocks are written to disk. No constraints are placed on when a * block can be written, but unsatisfied update dependencies are made safe by * modifying (or replacing) the source memory for the duration of the disk * write. When the disk write completes, the memory block is again brought * up-to-date. * * In-core inode structure reclamation.
* * Because there are a finite number of "in-core" inode structures, they are * reused regularly. By transferring all inode-related dependencies to the * in-memory inode block and indexing them separately (via "inodedep"s), we * can allow "in-core" inode structures to be reused at any time and avoid * any increase in contention. * * Called just before entering the device driver to initiate a new disk I/O. * The buffer must be locked, thus, no I/O completion operations can occur * while we are manipulating its associated dependencies. */ static void softdep_disk_io_initiation(bp) struct buf *bp; /* structure describing disk write to occur */ { struct worklist *wk; struct worklist marker; struct indirdep *indirdep; struct inodedep *inodedep; /* * We only care about write operations. There should never * be dependencies for reads. */ if (bp->b_iocmd != BIO_WRITE) panic("softdep_disk_io_initiation: not write"); marker.wk_type = D_LAST + 1; /* Not a normal workitem */ PHOLD(curproc); /* Don't swap out kernel stack */ ACQUIRE_LOCK(&lk); /* * Do any necessary pre-I/O processing. */ for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; wk = markernext(&marker)) { LIST_INSERT_AFTER(wk, &marker, wk_list); switch (wk->wk_type) { case D_PAGEDEP: initiate_write_filepage(WK_PAGEDEP(wk), bp); continue; case D_INODEDEP: inodedep = WK_INODEDEP(wk); if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) initiate_write_inodeblock_ufs1(inodedep, bp); else initiate_write_inodeblock_ufs2(inodedep, bp); continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (indirdep->ir_state & GOINGAWAY) panic("disk_io_initiation: indirdep gone"); /* * If there are no remaining dependencies, this * will be writing the real pointers, so the * dependency can be freed. */ - if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) { + if (LIST_EMPTY(&indirdep->ir_deplisthd)) { struct buf *bp; bp = indirdep->ir_savebp; bp->b_flags |= B_INVAL | B_NOCACHE; /* inline expand WORKLIST_REMOVE(wk); */ wk->wk_state &= ~ONWORKLIST; LIST_REMOVE(wk, wk_list); WORKITEM_FREE(indirdep, D_INDIRDEP); FREE_LOCK(&lk); brelse(bp); ACQUIRE_LOCK(&lk); continue; } /* * Replace up-to-date version with safe version. */ FREE_LOCK(&lk); MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount, M_INDIRDEP, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(&lk); indirdep->ir_state &= ~ATTACHED; indirdep->ir_state |= UNDONE; bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); bcopy(indirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount); continue; case D_MKDIR: case D_BMSAFEMAP: case D_ALLOCDIRECT: case D_ALLOCINDIR: continue; default: panic("handle_disk_io_initiation: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } FREE_LOCK(&lk); PRELE(curproc); /* Allow swapout of kernel stack */ } /* * Called from within the procedure above to deal with unsatisfied * allocation dependencies in a directory. The buffer must be locked, * thus, no I/O completion operations can occur while we are * manipulating its associated dependencies. */ static void initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; { struct diradd *dap; struct direct *ep; int i; if (pagedep->pd_state & IOSTARTED) { /* * This can only happen if there is a driver that does not * understand chaining. Here biodone will reissue the call * to strategy for the incomplete buffers. 
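The hunk above is the pattern this whole revision applies: LIST_EMPTY(head) is defined in <sys/queue.h> as exactly LIST_FIRST(head) == NULL (and TAILQ_EMPTY likewise), so the rewrite changes readability only, not behavior. A tiny compilable check on systems providing <sys/queue.h>:

    #include <sys/queue.h>
    #include <assert.h>

    struct node { LIST_ENTRY(node) link; };
    LIST_HEAD(, node) h = LIST_HEAD_INITIALIZER(h);

    int
    main(void)
    {
        /* LIST_EMPTY(&h) expands to (LIST_FIRST(&h) == NULL). */
        assert(LIST_EMPTY(&h) == (LIST_FIRST(&h) == NULL));
        struct node n;
        LIST_INSERT_HEAD(&h, &n, link);
        assert(!LIST_EMPTY(&h));
        return (0);
    }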
*/ printf("initiate_write_filepage: already started\n"); return; } pagedep->pd_state |= IOSTARTED; for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); if (ep->d_ino != dap->da_newinum) panic("%s: dir inum %d != new %d", "initiate_write_filepage", ep->d_ino, dap->da_newinum); if (dap->da_state & DIRCHG) ep->d_ino = dap->da_previous->dm_oldinum; else ep->d_ino = 0; dap->da_state &= ~ATTACHED; dap->da_state |= UNDONE; } } } /* * Version of initiate_write_inodeblock that handles UFS1 dinodes. * Note that any bug fixes made to this routine must be done in the * version found below. * * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock_ufs1(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct ufs1_dinode *dp; struct ufs1_dinode *sip; struct fs *fs; ufs_lbn_t i, prevlbn = 0; int deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock_ufs1: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; dp = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino1 != NULL) panic("initiate_write_inodeblock_ufs1: I/O underway"); FREE_LOCK(&lk); MALLOC(sip, struct ufs1_dinode *, sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(&lk); inodedep->id_savedino1 = sip; *inodedep->id_savedino1 = *dp; bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); dp->di_gen = inodedep->id_savedino1->di_gen; return; } /* * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = 0; - if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL) + if (TAILQ_EMPTY(&inodedep->id_inoupdt)) return; /* * Set the dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef DIAGNOSTIC if (deplist != 0 && prevlbn >= adp->ad_lbn) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_lbn; if (adp->ad_lbn < NDADDR && dp->di_db[adp->ad_lbn] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_lbn, dp->di_db[adp->ad_lbn], (intmax_t)adp->ad_newblkno); if (adp->ad_lbn >= NDADDR && dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) panic("%s: indirect pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_lbn - NDADDR, dp->di_ib[adp->ad_lbn - NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_lbn; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* DIAGNOSTIC */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. 
*/ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_lbn >= NDADDR) break; dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; for (i = adp->ad_lbn + 1; i < NDADDR; i++) { #ifdef DIAGNOSTIC if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); #endif /* DIAGNOSTIC */ dp->di_db[i] = 0; } for (i = 0; i < NIADDR; i++) { #ifdef DIAGNOSTIC if (dp->di_ib[i] != 0 && (deplist & ((1 << NDADDR) << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); #endif /* DIAGNOSTIC */ dp->di_ib[i] = 0; } return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is full-sized, as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { for (i = lastadp->ad_lbn; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) dp->di_ib[adp->ad_lbn - NDADDR] = 0; } /* * Version of initiate_write_inodeblock that handles UFS2 dinodes. * Note that any bug fixes made to this routine must be done in the * version found above. * * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock_ufs2(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct ufs2_dinode *dp; struct ufs2_dinode *sip; struct fs *fs; ufs_lbn_t i, prevlbn = 0; int deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock_ufs2: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; dp = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino2 != NULL) panic("initiate_write_inodeblock_ufs2: I/O underway"); FREE_LOCK(&lk); MALLOC(sip, struct ufs2_dinode *, sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(&lk); inodedep->id_savedino2 = sip; *inodedep->id_savedino2 = *dp; bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); dp->di_gen = inodedep->id_savedino2->di_gen; return; } /* * If no dependencies, then there is nothing to roll back.
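The size clamp above is plain arithmetic: when logical block lbn is rolled back to an old fragment, the rolled-back on-disk size becomes fs_bsize * lbn + oldsize, i.e. lbn full blocks plus the fragment. A worked example with hypothetical values:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        int64_t bsize = 16384;   /* hypothetical fs_bsize */
        int64_t lbn = 3;         /* rolled-back direct block number */
        int64_t oldsize = 4096;  /* old fragment size at that lbn */

        /* dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize */
        int64_t disksize = bsize * lbn + oldsize;
        printf("%jd\n", (intmax_t)disksize);  /* 53248: 3 full blocks + frag */
        return (0);
    }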
*/ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = dp->di_extsize; - if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL && - TAILQ_FIRST(&inodedep->id_extupdt) == NULL) + if (TAILQ_EMPTY(&inodedep->id_inoupdt) && + TAILQ_EMPTY(&inodedep->id_extupdt)) return; /* * Set the ext data dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef DIAGNOSTIC if (deplist != 0 && prevlbn >= adp->ad_lbn) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_lbn; if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_lbn, (intmax_t)dp->di_extb[adp->ad_lbn], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_lbn; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* DIAGNOSTIC */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the ext * data which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; for (i = adp->ad_lbn + 1; i < NXADDR; i++) { #ifdef DIAGNOSTIC if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); #endif /* DIAGNOSTIC */ dp->di_extb[i] = 0; } lastadp = NULL; break; } /* * If we have zero'ed out the last allocated block of the ext * data, roll back the size to the last currently allocated block. * We know that this last allocated block is full-sized, as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { for (i = lastadp->ad_lbn; i >= 0; i--) if (dp->di_extb[i] != 0) break; dp->di_extsize = (i + 1) * fs->fs_bsize; } /* * Set the file data dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef DIAGNOSTIC if (deplist != 0 && prevlbn >= adp->ad_lbn) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_lbn; if (adp->ad_lbn < NDADDR && dp->di_db[adp->ad_lbn] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_lbn, (intmax_t)dp->di_db[adp->ad_lbn], (intmax_t)adp->ad_newblkno); if (adp->ad_lbn >= NDADDR && dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno) panic("%s indirect pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock:", (intmax_t)adp->ad_lbn - NDADDR, (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_lbn; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* DIAGNOSTIC */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem.
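The DIAGNOSTIC checks above track which logical blocks carry dependencies in a single int bitmask: bit lbn for direct blocks and bit NDADDR + i for the i-th indirect block, which fits because NDADDR + NIADDR is well under the width of an int. A sketch (SK_NDADDR is an illustrative stand-in, not the kernel constant):

    #include <assert.h>

    #define SK_NDADDR 12    /* direct pointers, as in UFS */

    int
    main(void)
    {
        int deplist = 0;

        deplist |= 1 << 4;                  /* direct block 4 has a dep */
        deplist |= (1 << SK_NDADDR) << 1;   /* 2nd indirect block has a dep */

        assert(deplist & (1 << 4));
        assert((deplist & (1 << 5)) == 0);  /* a "lost dep" would panic */
        assert(deplist & (1 << (SK_NDADDR + 1)));
        return (0);
    }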
*/ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_lbn >= NDADDR) break; dp->di_db[adp->ad_lbn] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize; for (i = adp->ad_lbn + 1; i < NDADDR; i++) { #ifdef DIAGNOSTIC if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); #endif /* DIAGNOSTIC */ dp->di_db[i] = 0; } for (i = 0; i < NIADDR; i++) { #ifdef DIAGNOSTIC if (dp->di_ib[i] != 0 && (deplist & ((1 << NDADDR) << i)) == 0) panic("softdep_write_inodeblock: lost dep3"); #endif /* DIAGNOSTIC */ dp->di_ib[i] = 0; } return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is full-sized, as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) { for (i = lastadp->ad_lbn; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) dp->di_ib[adp->ad_lbn - NDADDR] = 0; } /* * This routine is called during the completion interrupt * service routine for a disk write (from the procedure called * by the device driver to inform the filesystem caches of * a request completion). It should be called early in this * procedure, before the block is made available to other * processes or other routines are called. */ static void softdep_disk_write_complete(bp) struct buf *bp; /* describes the completed disk write */ { struct worklist *wk; struct worklist *owk; struct workhead reattach; struct newblk *newblk; struct allocindir *aip; struct allocdirect *adp; struct indirdep *indirdep; struct inodedep *inodedep; struct bmsafemap *bmsafemap; /* * If an error occurred while doing the write, then the data * has not hit the disk and the dependencies cannot be unrolled. */ if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) return; LIST_INIT(&reattach); /* * This lock must not be released anywhere in this code segment.
*/ ACQUIRE_LOCK(&lk); owk = NULL; while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { WORKLIST_REMOVE(wk); if (wk == owk) panic("duplicate worklist: %p\n", wk); owk = wk; switch (wk->wk_type) { case D_PAGEDEP: if (handle_written_filepage(WK_PAGEDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_INODEDEP: if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_BMSAFEMAP: bmsafemap = WK_BMSAFEMAP(wk); while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) { newblk->nb_state |= DEPCOMPLETE; newblk->nb_bmsafemap = NULL; LIST_REMOVE(newblk, nb_deps); } while ((adp = LIST_FIRST(&bmsafemap->sm_allocdirecthd))) { adp->ad_state |= DEPCOMPLETE; adp->ad_buf = NULL; LIST_REMOVE(adp, ad_deps); handle_allocdirect_partdone(adp); } while ((aip = LIST_FIRST(&bmsafemap->sm_allocindirhd))) { aip->ai_state |= DEPCOMPLETE; aip->ai_buf = NULL; LIST_REMOVE(aip, ai_deps); handle_allocindir_partdone(aip); } while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) { inodedep->id_state |= DEPCOMPLETE; LIST_REMOVE(inodedep, id_deps); inodedep->id_buf = NULL; } WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); continue; case D_ALLOCDIRECT: adp = WK_ALLOCDIRECT(wk); adp->ad_state |= COMPLETE; handle_allocdirect_partdone(adp); continue; case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); aip->ai_state |= COMPLETE; handle_allocindir_partdone(aip); continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (indirdep->ir_state & GOINGAWAY) panic("disk_write_complete: indirdep gone"); bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); FREE(indirdep->ir_saveddata, M_INDIRDEP); indirdep->ir_saveddata = 0; indirdep->ir_state &= ~UNDONE; indirdep->ir_state |= ATTACHED; while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { handle_allocindir_partdone(aip); if (aip == LIST_FIRST(&indirdep->ir_donehd)) panic("disk_write_complete: not gone"); } WORKLIST_INSERT(&reattach, wk); if ((bp->b_flags & B_DELWRI) == 0) stat_indir_blk_ptrs++; bdirty(bp); continue; default: panic("handle_disk_write_complete: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } /* * Reattach any requests that must be redone. */ while ((wk = LIST_FIRST(&reattach)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(&lk); } /* * Called from within softdep_disk_write_complete above. Note that * this routine is always called from interrupt level with further * splbio interrupts blocked. */ static void handle_allocdirect_partdone(adp) struct allocdirect *adp; /* the completed allocdirect */ { struct allocdirectlst *listhead; struct allocdirect *listadp; struct inodedep *inodedep; long bsize, delay; if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; if (adp->ad_buf != NULL) panic("handle_allocdirect_partdone: dangling dep"); /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. Thus, we cannot free any * allocdirects after one whose ad_oldblkno claims a fragment as * these blocks must be rolled back to zero before writing the inode. * We check the currently active set of allocdirects in id_inoupdt * or id_extupdt as appropriate. 
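The completion handler above drains b_dep one element at a time, parking anything that must be redone on a local reattach list and splicing the survivors back at the end. A sketch of the drain-and-reattach shape with <sys/queue.h> (names illustrative):

    #include <sys/queue.h>
    #include <stdio.h>

    struct wk {
        int redo;               /* nonzero: must be processed again */
        LIST_ENTRY(wk) link;
    };
    LIST_HEAD(wkhead, wk);

    int
    main(void)
    {
        struct wkhead dep = LIST_HEAD_INITIALIZER(dep);
        struct wkhead reattach = LIST_HEAD_INITIALIZER(reattach);
        struct wk items[3] = {{0}, {1}, {0}}, *w;

        for (int i = 2; i >= 0; i--)
            LIST_INSERT_HEAD(&dep, &items[i], link);
        /* Drain the list; park unfinished work on "reattach". */
        while ((w = LIST_FIRST(&dep)) != NULL) {
            LIST_REMOVE(w, link);
            if (w->redo)
                LIST_INSERT_HEAD(&reattach, w, link);
        }
        /* Splice the survivors back, as the completion handler does. */
        while ((w = LIST_FIRST(&reattach)) != NULL) {
            LIST_REMOVE(w, link);
            LIST_INSERT_HEAD(&dep, w, link);
        }
        LIST_FOREACH(w, &dep, link)
            printf("redo=%d\n", w->redo);   /* one surviving item */
        return (0);
    }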
*/ inodedep = adp->ad_inodedep; bsize = inodedep->id_fs->fs_bsize; if (adp->ad_state & EXTDATA) listhead = &inodedep->id_extupdt; else listhead = &inodedep->id_inoupdt; TAILQ_FOREACH(listadp, listhead, ad_next) { /* found our block */ if (listadp == adp) break; /* continue if ad_oldlbn is not a fragment */ if (listadp->ad_oldsize == 0 || listadp->ad_oldsize == bsize) continue; /* hit a fragment */ return; } /* * If we have reached the end of the current list without * finding the just finished dependency, then it must be * on the future dependency list. Future dependencies cannot * be freed until they are moved to the current list. */ if (listadp == NULL) { #ifdef DEBUG if (adp->ad_state & EXTDATA) listhead = &inodedep->id_newextupdt; else listhead = &inodedep->id_newinoupdt; TAILQ_FOREACH(listadp, listhead, ad_next) /* found our block */ if (listadp == adp) break; if (listadp == NULL) panic("handle_allocdirect_partdone: lost dep"); #endif /* DEBUG */ return; } /* * If we have found the just finished dependency, then free * it along with anything that follows it that is complete. * If the inode still has a bitmap dependency, then it has * never been written to disk, hence the on-disk inode cannot * reference the old fragment so we can free it without delay. */ delay = (inodedep->id_state & DEPCOMPLETE); for (; adp; adp = listadp) { listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; free_allocdirect(listhead, adp, delay); } } /* * Called from within softdep_disk_write_complete above. Note that * this routine is always called from interrupt level with further * splbio interrupts blocked. */ static void handle_allocindir_partdone(aip) struct allocindir *aip; /* the completed allocindir */ { struct indirdep *indirdep; if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) return; if (aip->ai_buf != NULL) panic("handle_allocindir_partdone: dangling dependency"); indirdep = aip->ai_indirdep; if (indirdep->ir_state & UNDONE) { LIST_REMOVE(aip, ai_next); LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); return; } if (indirdep->ir_state & UFS1FMT) ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; else ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; LIST_REMOVE(aip, ai_next); if (aip->ai_freefrag != NULL) add_to_worklist(&aip->ai_freefrag->ff_list); WORKITEM_FREE(aip, D_ALLOCINDIR); } /* * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note * that this routine is always called from interrupt level with further * splbio interrupts blocked. */ static int handle_written_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ { struct worklist *wk, *filefree; struct allocdirect *adp, *nextadp; struct ufs1_dinode *dp1 = NULL; struct ufs2_dinode *dp2 = NULL; int hadchanges, fstype; if ((inodedep->id_state & IOSTARTED) == 0) panic("handle_written_inodeblock: not started"); inodedep->id_state &= ~IOSTARTED; if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { fstype = UFS1; dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); } else { fstype = UFS2; dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); } /* * If we had to rollback the inode allocation because of * bitmaps being incomplete, then simply restore it. 
* Keep the block dirty so that it will not be reclaimed until * all associated dependencies have been cleared and the * corresponding updates written to disk. */ if (inodedep->id_savedino1 != NULL) { if (fstype == UFS1) *dp1 = *inodedep->id_savedino1; else *dp2 = *inodedep->id_savedino2; FREE(inodedep->id_savedino1, M_SAVEDINO); inodedep->id_savedino1 = NULL; if ((bp->b_flags & B_DELWRI) == 0) stat_inode_bitmap++; bdirty(bp); return (1); } inodedep->id_state |= COMPLETE; /* * Roll forward anything that had to be rolled back before * the inode could be updated. */ hadchanges = 0; for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (fstype == UFS1) { if (adp->ad_lbn < NDADDR) { if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno) panic("%s %s #%jd mismatch %d != %jd", "handle_written_inodeblock:", "direct pointer", (intmax_t)adp->ad_lbn, dp1->di_db[adp->ad_lbn], (intmax_t)adp->ad_oldblkno); dp1->di_db[adp->ad_lbn] = adp->ad_newblkno; } else { if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0) panic("%s: %s #%jd allocated as %d", "handle_written_inodeblock", "indirect pointer", (intmax_t)adp->ad_lbn - NDADDR, dp1->di_ib[adp->ad_lbn - NDADDR]); dp1->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno; } } else { if (adp->ad_lbn < NDADDR) { if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno) panic("%s: %s #%jd %s %jd != %jd", "handle_written_inodeblock", "direct pointer", (intmax_t)adp->ad_lbn, "mismatch", (intmax_t)dp2->di_db[adp->ad_lbn], (intmax_t)adp->ad_oldblkno); dp2->di_db[adp->ad_lbn] = adp->ad_newblkno; } else { if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0) panic("%s: %s #%jd allocated as %jd", "handle_written_inodeblock", "indirect pointer", (intmax_t)adp->ad_lbn - NDADDR, (intmax_t) dp2->di_ib[adp->ad_lbn - NDADDR]); dp2->di_ib[adp->ad_lbn - NDADDR] = adp->ad_newblkno; } } adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno) panic("%s: direct pointers #%jd %s %jd != %jd", "handle_written_inodeblock", (intmax_t)adp->ad_lbn, "mismatch", (intmax_t)dp2->di_extb[adp->ad_lbn], (intmax_t)adp->ad_oldblkno); dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno; adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } if (hadchanges && (bp->b_flags & B_DELWRI) == 0) stat_direct_blk_ptrs++; /* * Reset the file size to its most up-to-date value. */ if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) panic("handle_written_inodeblock: bad size"); if (fstype == UFS1) { if (dp1->di_size != inodedep->id_savedsize) { dp1->di_size = inodedep->id_savedsize; hadchanges = 1; } } else { if (dp2->di_size != inodedep->id_savedsize) { dp2->di_size = inodedep->id_savedsize; hadchanges = 1; } if (dp2->di_extsize != inodedep->id_savedextsize) { dp2->di_extsize = inodedep->id_savedextsize; hadchanges = 1; } } inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; /* * If there were any rollbacks in the inode block, then it must be * marked dirty so that it will eventually get written back in * its correct form. */ if (hadchanges) bdirty(bp); /* * Process any allocdirects that completed during the update.
*/ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) handle_allocdirect_partdone(adp); if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) handle_allocdirect_partdone(adp); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode * is delayed until after all blocks have been freed to * avoid creation of new <vnode,inode> triples * before the old ones have been deleted. */ filefree = NULL; while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_FREEFILE: /* * We defer adding filefree to the worklist until * all other additions have been made to ensure * that it will be done after all the old blocks * have been freed. */ if (filefree != NULL) panic("handle_written_inodeblock: filefree"); filefree = wk; continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); continue; case D_DIRADD: diradd_inode_written(WK_DIRADD(wk), inodedep); continue; case D_FREEBLKS: wk->wk_state |= COMPLETE; if ((wk->wk_state & ALLCOMPLETE) != ALLCOMPLETE) continue; /* -- fall through -- */ case D_FREEFRAG: case D_DIRREM: add_to_worklist(wk); continue; case D_NEWDIRBLK: free_newdirblk(WK_NEWDIRBLK(wk)); continue; default: panic("handle_written_inodeblock: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } if (filefree != NULL) { if (free_inodedep(inodedep) == 0) panic("handle_written_inodeblock: live inodedep"); add_to_worklist(filefree); return (0); } /* * If no outstanding dependencies, free it. */ if (free_inodedep(inodedep) || (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && TAILQ_FIRST(&inodedep->id_extupdt) == 0)) return (0); return (hadchanges); } /* * Process a diradd entry after its dependent inode has been written. * This routine must be called with splbio interrupts blocked. */ static void diradd_inode_written(dap, inodedep) struct diradd *dap; struct inodedep *inodedep; { struct pagedep *pagedep; dap->da_state |= COMPLETE; if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } /* * Handle the completion of a mkdir dependency. */ static void handle_written_mkdir(mkdir, type) struct mkdir *mkdir; int type; { struct diradd *dap; struct pagedep *pagedep; if (mkdir->md_state != type) panic("handle_written_mkdir: bad type"); dap = mkdir->md_diradd; dap->da_state &= ~type; if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) dap->da_state |= DEPCOMPLETE; if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } LIST_REMOVE(mkdir, md_mkdirs); WORKITEM_FREE(mkdir, D_MKDIR); } /* * Called from within softdep_disk_write_complete above. * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. * Note that this routine is always called from interrupt level * with further splbio interrupts blocked.
*/ static int handle_written_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; /* buffer containing the written page */ { struct dirrem *dirrem; struct diradd *dap, *nextdap; struct direct *ep; int i, chgs; if ((pagedep->pd_state & IOSTARTED) == 0) panic("handle_written_filepage: not started"); pagedep->pd_state &= ~IOSTARTED; /* * Process any directory removals that have been committed. */ while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; add_to_worklist(&dirrem->dm_list); } /* * Free any directory additions that have been committed. * If it is a newly allocated block, we have to wait until * the on-disk directory inode claims the new block. */ if ((pagedep->pd_state & NEWBLOCK) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap); /* * Uncommitted directory entries must be restored. */ for (chgs = 0, i = 0; i < DAHASHSZ; i++) { for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; dap = nextdap) { nextdap = LIST_NEXT(dap, da_pdlist); if (dap->da_state & ATTACHED) panic("handle_written_filepage: attached"); ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); ep->d_ino = dap->da_newinum; dap->da_state &= ~UNDONE; dap->da_state |= ATTACHED; chgs = 1; /* * If the inode referenced by the directory has * been written out, then the dependency can be * moved to the pending list. */ if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } } } /* * If there were any rollbacks in the directory, then it must be * marked dirty so that it will eventually get written back in * its correct form. */ if (chgs) { if ((bp->b_flags & B_DELWRI) == 0) stat_dir_entry++; bdirty(bp); return (1); } /* * If we are not waiting for a new directory block to be * claimed by its inode, then the pagedep will be freed. * Otherwise it will remain to track any new entries on * the page in case they are fsync'ed. */ if ((pagedep->pd_state & NEWBLOCK) == 0) { LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); } return (0); } /* * Writing back in-core inode structures. * * The filesystem only accesses an inode's contents when it occupies an * "in-core" inode structure. These "in-core" structures are separate from * the page frames used to cache inode blocks. Only the latter are * transferred to/from the disk. So, when the updated contents of the * "in-core" inode structure are copied to the corresponding in-memory inode * block, the dependencies are also transferred. The following procedure is * called when copying a dirty "in-core" inode to a cached inode block. */ /* * Called when an inode is loaded from disk. If the effective link count * differed from the actual link count when it was last flushed, then we * need to ensure that the correct effective link count is put back. */ void softdep_load_inodeblock(ip) struct inode *ip; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; /* * Check for alternate nlink count. */ ip->i_effnlink = ip->i_nlink; ACQUIRE_LOCK(&lk); if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); return; } ip->i_effnlink -= inodedep->id_nlinkdelta; if (inodedep->id_state & SPACECOUNTED) ip->i_flag |= IN_SPACECOUNTED; FREE_LOCK(&lk); } /* * This routine is called just before the "in-core" inode * information is to be copied to the in-memory inode block. * Recall that an inode block contains several inodes.
If * the force flag is set, then the dependencies will be * cleared so that the update can always be made. Note that * the buffer is locked when this routine is called, so we * will never be in the middle of writing the inode block * to disk. */ void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; /* the "in_core" copy of the inode */ struct buf *bp; /* the buffer containing the inode block */ int waitfor; /* nonzero => update must be allowed */ { struct inodedep *inodedep; struct worklist *wk; struct mount *mp; struct buf *ibp; int error; /* * If the effective link count is not equal to the actual link * count, then we must track the difference in an inodedep while * the inode is (potentially) tossed out of the cache. Otherwise, * if there is no existing inodedep, then there are no dependencies * to track. */ mp = UFSTOVFS(ip->i_ump); ACQUIRE_LOCK(&lk); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); if (ip->i_effnlink != ip->i_nlink) panic("softdep_update_inodeblock: bad link count"); return; } if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) panic("softdep_update_inodeblock: bad delta"); /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. */ inodedep->id_state &= ~COMPLETE; if ((inodedep->id_state & ONWORKLIST) == 0) WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); /* * Any new dependencies associated with the incore inode must * now be moved to the list associated with the buffer holding * the in-memory copy of the inode. Once merged, process any * allocdirects that are completed by the merger. */ merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); - if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL) + if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); - if (TAILQ_FIRST(&inodedep->id_extupdt) != NULL) + if (!TAILQ_EMPTY(&inodedep->id_extupdt)) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt)); /* * Now that the inode has been pushed into the buffer, the * operations dependent on the inode being written to disk * can be moved to the id_bufwait so that they will be * processed when the buffer I/O completes. */ while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&inodedep->id_bufwait, wk); } /* * Newly allocated inodes cannot be written until the bitmap * that allocates them has been written (indicated by * DEPCOMPLETE being set in id_state). If we are doing a * forced sync (e.g., an fsync on a file), we force the bitmap * to be written so that the update can be done. */ if (waitfor == 0) { FREE_LOCK(&lk); return; } retry: if ((inodedep->id_state & DEPCOMPLETE) != 0) { FREE_LOCK(&lk); return; } ibp = inodedep->id_buf; ibp = getdirtybuf(ibp, &lk, MNT_WAIT); if (ibp == NULL) { /* * If ibp came back as NULL, the dependency could have been * freed while we slept. Look it up again, and check to see * that it has completed. */ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) goto retry; FREE_LOCK(&lk); return; } FREE_LOCK(&lk); if ((error = bwrite(ibp)) != 0) softdep_error("softdep_update_inodeblock: bwrite", error); } /* * Merge a new inode dependency list (such as id_newinoupdt) into an * old inode dependency list (such as id_inoupdt). This routine must be * called with splbio interrupts blocked.
*/ static void merge_inode_lists(newlisthead, oldlisthead) struct allocdirectlst *newlisthead; struct allocdirectlst *oldlisthead; { struct allocdirect *listadp, *newadp; newadp = TAILQ_FIRST(newlisthead); for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { if (listadp->ad_lbn < newadp->ad_lbn) { listadp = TAILQ_NEXT(listadp, ad_next); continue; } TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); if (listadp->ad_lbn == newadp->ad_lbn) { allocdirect_merge(oldlisthead, newadp, listadp); listadp = newadp; } newadp = TAILQ_FIRST(newlisthead); } while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); } } /* * If we are doing an fsync, then we must ensure that any directory * entries for the inode have been written after the inode gets to disk. */ int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; struct pagedep *pagedep; struct worklist *wk; struct diradd *dap; struct mount *mp; struct vnode *pvp; struct inode *ip; struct buf *bp; struct fs *fs; struct thread *td = curthread; int error, flushparent; ino_t parentino; ufs_lbn_t lbn; ip = VTOI(vp); fs = ip->i_fs; mp = vp->v_mount; ACQUIRE_LOCK(&lk); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(&lk); return (0); } - if (LIST_FIRST(&inodedep->id_inowait) != NULL || - LIST_FIRST(&inodedep->id_bufwait) != NULL || - TAILQ_FIRST(&inodedep->id_extupdt) != NULL || - TAILQ_FIRST(&inodedep->id_newextupdt) != NULL || - TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || - TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) + if (!LIST_EMPTY(&inodedep->id_inowait) || + !LIST_EMPTY(&inodedep->id_bufwait) || + !TAILQ_EMPTY(&inodedep->id_extupdt) || + !TAILQ_EMPTY(&inodedep->id_newextupdt) || + !TAILQ_EMPTY(&inodedep->id_inoupdt) || + !TAILQ_EMPTY(&inodedep->id_newinoupdt)) panic("softdep_fsync: pending ops"); for (error = 0, flushparent = 0; ; ) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); /* * Flush our parent if this directory entry has a MKDIR_PARENT * dependency or is contained in a newly allocated block. */ if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; parentino = pagedep->pd_ino; lbn = pagedep->pd_lbn; if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) panic("softdep_fsync: dirty"); if ((dap->da_state & MKDIR_PARENT) || (pagedep->pd_state & NEWBLOCK)) flushparent = 1; else flushparent = 0; /* * If we are being fsync'ed as part of vgone'ing this vnode, * then we will not be able to release and recover the * vnode below, so we just have to give up on writing its * directory entry out. It will eventually be written, just * not now, but then the user was not asking to have it * written, so we are not breaking any promises. */ if (vp->v_iflag & VI_DOOMED) break; /* * We prevent deadlock by always fetching inodes from the * root, moving down the directory tree. Thus, when fetching * our parent directory, we first try to get the lock. If * that fails, we must unlock ourselves before requesting * the lock on our parent. See the comment in ufs_lookup * for details on possible races. 
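merge_inode_lists above is a merge of two lbn-sorted TAILQs: entries from the new list are inserted before the first old entry with an equal or greater lbn, equal keys are collapsed by allocdirect_merge, and any leftover new entries are appended. A simplified sketch that skips the duplicate-merge step (types and names here are hypothetical):

    #include <sys/queue.h>
    #include <stdio.h>

    struct ad {
        int lbn;
        TAILQ_ENTRY(ad) next;
    };
    TAILQ_HEAD(adlist, ad);

    /* Simplified merge_inode_lists: drain "new" into lbn-sorted "old". */
    static void
    merge_lists(struct adlist *newl, struct adlist *oldl)
    {
        struct ad *lad, *nad;

        nad = TAILQ_FIRST(newl);
        for (lad = TAILQ_FIRST(oldl); lad && nad;) {
            if (lad->lbn < nad->lbn) {
                lad = TAILQ_NEXT(lad, next);
                continue;
            }
            TAILQ_REMOVE(newl, nad, next);
            TAILQ_INSERT_BEFORE(lad, nad, next);
            /* The real code merges duplicates here when lbn matches. */
            nad = TAILQ_FIRST(newl);
        }
        while ((nad = TAILQ_FIRST(newl)) != NULL) {
            TAILQ_REMOVE(newl, nad, next);
            TAILQ_INSERT_TAIL(oldl, nad, next);
        }
    }

    int
    main(void)
    {
        struct adlist oldl = TAILQ_HEAD_INITIALIZER(oldl);
        struct adlist newl = TAILQ_HEAD_INITIALIZER(newl);
        struct ad a = {1}, b = {5}, c = {3}, d = {9}, *p;

        TAILQ_INSERT_TAIL(&oldl, &a, next);
        TAILQ_INSERT_TAIL(&oldl, &b, next);
        TAILQ_INSERT_TAIL(&newl, &c, next);
        TAILQ_INSERT_TAIL(&newl, &d, next);
        merge_lists(&newl, &oldl);
        TAILQ_FOREACH(p, &oldl, next)
            printf("%d ", p->lbn);      /* 1 3 5 9 */
        return (0);
    }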
*/ FREE_LOCK(&lk); if (ffs_vget(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) { VOP_UNLOCK(vp, 0, td); error = ffs_vget(mp, parentino, LK_EXCLUSIVE, &pvp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (error != 0) return (error); } /* * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps * that are contained in direct blocks will be resolved by * doing a ffs_update. Pagedeps contained in indirect blocks * may require a complete sync'ing of the directory. So, we * try the cheap and fast ffs_update first, and if that fails, * then we do the slower ffs_syncvnode of the directory. */ if (flushparent) { if ((error = ffs_update(pvp, 1)) != 0) { vput(pvp); return (error); } if ((pagedep->pd_state & NEWBLOCK) && (error = ffs_syncvnode(pvp, MNT_WAIT))) { vput(pvp); return (error); } } /* * Flush directory page containing the inode's name. */ error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, &bp); if (error == 0) error = bwrite(bp); else brelse(bp); vput(pvp); if (error != 0) return (error); ACQUIRE_LOCK(&lk); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) break; } FREE_LOCK(&lk); return (0); } /* * Flush all the dirty bitmaps associated with the block device * before flushing the rest of the dirty blocks so as to reduce * the number of dependencies that will have to be rolled back. */ void softdep_fsync_mountdev(vp) struct vnode *vp; { struct buf *bp, *nbp; struct worklist *wk; if (!vn_isdisk(vp, NULL)) panic("softdep_fsync_mountdev: vnode not a disk"); restart: ACQUIRE_LOCK(&lk); VI_LOCK(vp); TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { /* * If it is already scheduled, skip to the next buffer. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("softdep_fsync_mountdev: not dirty"); /* * We are only interested in bitmaps with outstanding * dependencies. */ if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || wk->wk_type != D_BMSAFEMAP || (bp->b_vflags & BV_BKGRDINPROG)) { BUF_UNLOCK(bp); continue; } VI_UNLOCK(vp); FREE_LOCK(&lk); bremfree(bp); (void) bawrite(bp); goto restart; } FREE_LOCK(&lk); drain_output(vp); VI_UNLOCK(vp); } /* * This routine is called when we are trying to synchronously flush a * file. This routine must eliminate any filesystem metadata dependencies * so that the syncing routine can succeed by pushing the dirty blocks * associated with the file. If any I/O errors occur, they are returned. */ int softdep_sync_metadata(struct vnode *vp) { struct pagedep *pagedep; struct allocdirect *adp; struct allocindir *aip; struct buf *bp, *nbp; struct worklist *wk; int i, error, waitfor; if (!DOINGSOFTDEP(vp)) return (0); /* * Ensure that any direct block dependencies have been cleared. */ ACQUIRE_LOCK(&lk); if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) { FREE_LOCK(&lk); return (error); } FREE_LOCK(&lk); /* * For most files, the only metadata dependencies are the * cylinder group maps that allocate their inode or blocks. * The block allocation dependencies can be found by traversing * the dependency lists for any buffers that remain on their * dirty buffer list. The inode allocation dependency will * be resolved when the inode is updated with MNT_WAIT. * This work is done in two passes. The first pass grabs most * of the buffers and begins asynchronously writing them. The * only way to wait for these asynchronous writes is to sleep * on the filesystem vnode which may stay busy for a long time * if the filesystem is active. 
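The locking dance above is the classic try-lock-or-back-off discipline: child-to-parent acquisition risks deadlock against top-down lookups, so the code trylocks the parent (LK_NOWAIT) and, failing that, drops the child lock before blocking. A pthread sketch of the same discipline (the locks here are hypothetical stand-ins, not vnode locks):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t child = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t parent = PTHREAD_MUTEX_INITIALIZER;

    /* Called with "child" held; returns with both held, never deadlocks. */
    static void
    lock_parent(void)
    {
        if (pthread_mutex_trylock(&parent) == 0)
            return;                     /* fast path: the LK_NOWAIT analogue */
        /* Drop the child so a top-down locker can make progress. */
        pthread_mutex_unlock(&child);
        pthread_mutex_lock(&parent);    /* block in the safe parent-first order */
        pthread_mutex_lock(&child);     /* re-take; caller must revalidate state */
    }

    int
    main(void)
    {
        pthread_mutex_lock(&child);
        lock_parent();
        puts("both held");
        pthread_mutex_unlock(&child);
        pthread_mutex_unlock(&parent);
        return (0);
    }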
So, instead, we make a second * pass over the dependencies blocking on each write. In the * usual case we will be blocking against a write that we * initiated, so when it is done the dependency will have been * resolved. Thus the second pass is expected to end quickly. */ waitfor = MNT_NOWAIT; top: /* * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. */ VI_LOCK(vp); drain_output(vp); while ((bp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd)) != NULL) { bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT); if (bp) break; } VI_UNLOCK(vp); if (bp == NULL) return (0); loop: /* While syncing snapshots, we must allow recursive lookups */ bp->b_lock.lk_flags |= LK_CANRECURSE; ACQUIRE_LOCK(&lk); /* * As we hold the buffer locked, none of its dependencies * will disappear. */ LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_ALLOCDIRECT: adp = WK_ALLOCDIRECT(wk); if (adp->ad_state & DEPCOMPLETE) continue; nbp = adp->ad_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = bwrite(nbp)) != 0) { break; } ACQUIRE_LOCK(&lk); continue; case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); if (aip->ai_state & DEPCOMPLETE) continue; nbp = aip->ai_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = bwrite(nbp)) != 0) { break; } ACQUIRE_LOCK(&lk); continue; case D_INDIRDEP: restart: LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { if (aip->ai_state & DEPCOMPLETE) continue; nbp = aip->ai_buf; nbp = getdirtybuf(nbp, &lk, MNT_WAIT); if (nbp == NULL) goto restart; FREE_LOCK(&lk); if ((error = bwrite(nbp)) != 0) { goto loop_end; } ACQUIRE_LOCK(&lk); goto restart; } continue; case D_INODEDEP: if ((error = flush_inodedep_deps(wk->wk_mp, WK_INODEDEP(wk)->id_ino)) != 0) { FREE_LOCK(&lk); break; } continue; case D_PAGEDEP: /* * We are trying to sync a directory that may * have dependencies on both its own metadata * and/or dependencies on the inodes of any * recently allocated files. We walk its diradd * lists pushing out the associated inode. */ pagedep = WK_PAGEDEP(wk); for (i = 0; i < DAHASHSZ; i++) { if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) continue; if ((error = flush_pagedep_deps(vp, wk->wk_mp, &pagedep->pd_diraddhd[i]))) { FREE_LOCK(&lk); goto loop_end; } } continue; case D_MKDIR: /* * This case should never happen if the vnode has * been properly sync'ed. However, if this function * is used at a place where the vnode has not yet * been sync'ed, this dependency can show up. So, * rather than panic, just flush it. */ nbp = WK_MKDIR(wk)->md_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = bwrite(nbp)) != 0) { break; } ACQUIRE_LOCK(&lk); continue; case D_BMSAFEMAP: /* * This case should never happen if the vnode has * been properly sync'ed. However, if this function * is used at a place where the vnode has not yet * been sync'ed, this dependency can show up. So, * rather than panic, just flush it. 
*/ nbp = WK_BMSAFEMAP(wk)->sm_buf; nbp = getdirtybuf(nbp, &lk, waitfor); if (nbp == NULL) continue; FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); } else if ((error = bwrite(nbp)) != 0) { break; } ACQUIRE_LOCK(&lk); continue; default: panic("softdep_sync_metadata: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } loop_end: /* We reach here only in error and unlocked */ if (error == 0) panic("softdep_sync_metadata: zero error"); bp->b_lock.lk_flags &= ~LK_CANRECURSE; bawrite(bp); return (error); } FREE_LOCK(&lk); VI_LOCK(vp); while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) { nbp = getdirtybuf(nbp, VI_MTX(vp), MNT_WAIT); if (nbp) break; } VI_UNLOCK(vp); bp->b_lock.lk_flags &= ~LK_CANRECURSE; bawrite(bp); if (nbp != NULL) { bp = nbp; goto loop; } /* * The brief unlock is to allow any pent-up dependency * processing to be done. Then proceed with the second pass. */ if (waitfor == MNT_NOWAIT) { waitfor = MNT_WAIT; goto top; } /* * If we have managed to get rid of all the dirty buffers, * then we are done. For certain directories and block * devices, we may need to do further work. * * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. */ VI_LOCK(vp); drain_output(vp); VI_UNLOCK(vp); return (0); } /* * Flush the dependencies associated with an inodedep. * Called with splbio blocked. */ static int flush_inodedep_deps(mp, ino) struct mount *mp; ino_t ino; { struct inodedep *inodedep; int error, waitfor; /* * This work is done in two passes. The first pass grabs most * of the buffers and begins asynchronously writing them. The * only way to wait for these asynchronous writes is to sleep * on the filesystem vnode which may stay busy for a long time * if the filesystem is active. So, instead, we make a second * pass over the dependencies blocking on each write. In the * usual case we will be blocking against a write that we * initiated, so when it is done the dependency will have been * resolved. Thus the second pass is expected to end quickly. * We give a brief window at the top of the loop to allow * any pending I/O to complete. */ for (error = 0, waitfor = MNT_NOWAIT; ; ) { if (error) return (error); FREE_LOCK(&lk); ACQUIRE_LOCK(&lk); if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) return (0); if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || flush_deplist(&inodedep->id_extupdt, waitfor, &error) || flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) continue; /* * If this was pass 2, we are done; otherwise, do pass 2. */ if (waitfor == MNT_WAIT) break; waitfor = MNT_WAIT; } /* * Try freeing inodedep in case all dependencies have been removed. */ if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) (void) free_inodedep(inodedep); return (0); } /* * Flush an inode dependency list. * Called with splbio blocked. */ static int flush_deplist(listhead, waitfor, errorp) struct allocdirectlst *listhead; int waitfor; int *errorp; { struct allocdirect *adp; struct buf *bp; mtx_assert(&lk, MA_OWNED); TAILQ_FOREACH(adp, listhead, ad_next) { if (adp->ad_state & DEPCOMPLETE) continue; bp = adp->ad_buf; bp = getdirtybuf(bp, &lk, waitfor); if (bp == NULL) { if (waitfor == MNT_NOWAIT) continue; return (1); } FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(bp); } else if ((*errorp = bwrite(bp)) != 0) { ACQUIRE_LOCK(&lk); return (1); } ACQUIRE_LOCK(&lk); return (1); } return (0); } /* * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
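Both softdep_sync_metadata and flush_inodedep_deps above use the same two-pass shape: an MNT_NOWAIT pass that starts writes asynchronously, then an MNT_WAIT pass that blocks on writes that are mostly already in flight. A bare sketch of just the control flow, with a hypothetical flush_one():

    #include <stdio.h>

    #define MNT_NOWAIT 0
    #define MNT_WAIT 1

    /* Hypothetical per-item flush; a real one would start or await I/O. */
    static void
    flush_one(int item, int waitfor)
    {
        printf("flush item %d (%s)\n", item,
            waitfor == MNT_WAIT ? "blocking" : "async");
    }

    int
    main(void)
    {
        int waitfor = MNT_NOWAIT;
    top:
        for (int i = 0; i < 3; i++)
            flush_one(i, waitfor);
        /*
         * Pass 1 started the writes asynchronously; pass 2 blocks on
         * each one, which is cheap because the I/O is already underway.
         */
        if (waitfor == MNT_NOWAIT) {
            waitfor = MNT_WAIT;
            goto top;
        }
        return (0);
    }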
* Called with splbio blocked. */ static int flush_pagedep_deps(pvp, mp, diraddhdp) struct vnode *pvp; struct mount *mp; struct diraddhd *diraddhdp; { struct inodedep *inodedep; struct ufsmount *ump; struct diradd *dap; struct vnode *vp; int error = 0; struct buf *bp; ino_t inum; struct worklist *wk; ump = VFSTOUFS(mp); while ((dap = LIST_FIRST(diraddhdp)) != NULL) { /* * Flush ourselves if this directory entry * has a MKDIR_PARENT dependency. */ if (dap->da_state & MKDIR_PARENT) { FREE_LOCK(&lk); if ((error = ffs_update(pvp, 1)) != 0) break; ACQUIRE_LOCK(&lk); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; if (dap->da_state & MKDIR_PARENT) panic("flush_pagedep_deps: MKDIR_PARENT"); } /* * A newly allocated directory must have its "." and * ".." entries written out before its name can be * committed in its parent. We do not want or need * the full semantics of a synchronous ffs_syncvnode as * that may end up here again, once for each directory * level in the filesystem. Instead, we push the blocks * and wait for them to clear. We have to fsync twice * because the first call may choose to defer blocks * that still have dependencies, but deferral will * happen at most once. */ inum = dap->da_newinum; if (dap->da_state & MKDIR_BODY) { FREE_LOCK(&lk); if ((error = ffs_vget(mp, inum, LK_EXCLUSIVE, &vp))) break; if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) || (error=ffs_syncvnode(vp, MNT_NOWAIT))) { vput(vp); break; } VI_LOCK(vp); drain_output(vp); /* * If first block is still dirty with a D_MKDIR * dependency then it needs to be written now. */ for (;;) { error = 0; bp = gbincore(&vp->v_bufobj, 0); if (bp == NULL) break; /* First block not present */ error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)); VI_LOCK(vp); if (error == ENOLCK) continue; /* Slept, retry */ if (error != 0) break; /* Failed */ if ((bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); break; /* Buffer not dirty */ } for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; wk = LIST_NEXT(wk, wk_list)) if (wk->wk_type == D_MKDIR) break; if (wk == NULL) BUF_UNLOCK(bp); /* Dependency gone */ else { /* * D_MKDIR dependency remains, * must write buffer to stable * storage. */ VI_UNLOCK(vp); bremfree(bp); error = bwrite(bp); VI_LOCK(vp); } break; } VI_UNLOCK(vp); vput(vp); if (error != 0) break; /* Flushing of first block failed */ ACQUIRE_LOCK(&lk); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; if (dap->da_state & MKDIR_BODY) panic("flush_pagedep_deps: MKDIR_BODY"); } /* * Flush the inode on which the directory entry depends. * Having accounted for MKDIR_PARENT and MKDIR_BODY above, * the only remaining dependency is that the updated inode * count must get pushed to disk. The inode has already * been pushed into its inode buffer (via VOP_UPDATE) at * the time of the reference count change. So we need only * locate that buffer, ensure that there will be no rollback * caused by a bitmap dependency, then write the inode buffer. */ retry: if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) panic("flush_pagedep_deps: lost inode"); /* * If the inode still has bitmap dependencies, * push them to disk. 
*/ if ((inodedep->id_state & DEPCOMPLETE) == 0) { bp = inodedep->id_buf; bp = getdirtybuf(bp, &lk, MNT_WAIT); if (bp == NULL) goto retry; FREE_LOCK(&lk); if ((error = bwrite(bp)) != 0) break; ACQUIRE_LOCK(&lk); if (dap != LIST_FIRST(diraddhdp)) continue; } /* * If the inode is still sitting in a buffer waiting * to be written, push it to disk. */ FREE_LOCK(&lk); if ((error = bread(ump->um_devvp, fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) { brelse(bp); break; } if ((error = bwrite(bp)) != 0) break; ACQUIRE_LOCK(&lk); /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ if (dap == LIST_FIRST(diraddhdp)) panic("flush_pagedep_deps: flush failed"); } if (error) ACQUIRE_LOCK(&lk); return (error); } /* * A large burst of file addition or deletion activity can drive the * memory load excessively high. First attempt to slow things down * using the techniques below. If that fails, this routine requests * the offending operations to fall back to running synchronously * until the memory load returns to a reasonable level. */ int softdep_slowdown(vp) struct vnode *vp; { int max_softdeps_hard; ACQUIRE_LOCK(&lk); max_softdeps_hard = max_softdeps * 11 / 10; if (num_dirrem < max_softdeps_hard / 2 && num_inodedep < max_softdeps_hard && VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps) { FREE_LOCK(&lk); return (0); } if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps) softdep_speedup(); stat_sync_limit_hit += 1; FREE_LOCK(&lk); return (1); } /* * Called by the allocation routines when they are about to fail * in the hope that we can free up some disk space. * * First check to see if the work list has anything on it. If it has, * clean up entries until we successfully free some space. Because this * process holds inodes locked, we cannot handle any remove requests * that might block on a locked inode as that could lead to deadlock. * If the worklist yields no free space, encourage the syncer daemon * to help us. In no event will we try for longer than tickdelay seconds. */ int softdep_request_cleanup(fs, vp) struct fs *fs; struct vnode *vp; { struct ufsmount *ump; long starttime; ufs2_daddr_t needed; int error; ump = VTOI(vp)->i_ump; mtx_assert(UFS_MTX(ump), MA_OWNED); needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize; starttime = time_second + tickdelay; /* * If we are being called because of a process doing a * copy-on-write, then it is not safe to update the vnode * as we may recurse into the copy-on-write routine. */ if (!(curthread->td_pflags & TDP_COWINPROGRESS)) { UFS_UNLOCK(ump); error = ffs_update(vp, 1); UFS_LOCK(ump); if (error != 0) return (0); } while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) { if (time_second > starttime) return (0); UFS_UNLOCK(ump); ACQUIRE_LOCK(&lk); if (ump->softdep_on_worklist > 0 && process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) { stat_worklist_push += 1; FREE_LOCK(&lk); UFS_LOCK(ump); continue; } request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT); FREE_LOCK(&lk); UFS_LOCK(ump); } return (1); } /* * If memory utilization has gotten too high, deliberately slow things * down and speed up the I/O processing. */ extern struct thread *syncertd; static int request_cleanup(mp, resource) struct mount *mp; int resource; { struct thread *td = curthread; struct ufsmount *ump; mtx_assert(&lk, MA_OWNED); /* * We never hold up the filesystem syncer or buf daemon. 
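softdep_slowdown above compares the dependency counts against a hard limit set 10% above max_softdeps: dirrem structures throttle at half the hard limit, inodedeps at the full limit. Worked numbers under assumed values:

    #include <stdio.h>

    int
    main(void)
    {
        int max_softdeps = 1000;                /* hypothetical tunable */
        int max_hard = max_softdeps * 11 / 10;  /* 1100 */

        int num_dirrem = 560, num_inodedep = 900;
        /* Throttle unless both counts are comfortably below the limits. */
        int slowdown = !(num_dirrem < max_hard / 2 &&   /* 560 >= 550 */
            num_inodedep < max_hard);
        printf("hard limit %d, slowdown %d\n", max_hard, slowdown); /* 1100, 1 */
        return (0);
    }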
*/ if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF)) return (0); ump = VFSTOUFS(mp); /* * First check to see if the work list has gotten backlogged. * If it has, co-opt this process to help clean up two entries. * Because this process may hold inodes locked, we cannot * handle any remove requests that might block on a locked * inode as that could lead to deadlock. We set TDP_SOFTDEP * to avoid recursively processing the worklist. */ if (ump->softdep_on_worklist > max_softdeps / 10) { td->td_pflags |= TDP_SOFTDEP; process_worklist_item(mp, LK_NOWAIT); process_worklist_item(mp, LK_NOWAIT); td->td_pflags &= ~TDP_SOFTDEP; stat_worklist_push += 2; return(1); } /* * Next, we attempt to speed up the syncer process. If that * is successful, then we allow the process to continue. */ if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT) return(0); /* * If we are resource constrained on inode dependencies, try * flushing some dirty inodes. Otherwise, we are constrained * by file deletions, so try accelerating flushes of directories * with removal dependencies. We would like to do the cleanup * here, but we probably hold an inode locked at this point and * that might deadlock against one that we try to clean. So, * the best that we can do is request the syncer daemon to do * the cleanup for us. */ switch (resource) { case FLUSH_INODES: stat_ino_limit_push += 1; req_clear_inodedeps += 1; stat_countp = &stat_ino_limit_hit; break; case FLUSH_REMOVE: case FLUSH_REMOVE_WAIT: stat_blk_limit_push += 1; req_clear_remove += 1; stat_countp = &stat_blk_limit_hit; break; default: panic("request_cleanup: unknown type"); } /* * Hopefully the syncer daemon will catch up and awaken us. * We wait at most tickdelay before proceeding in any case. */ proc_waiting += 1; if (handle.callout == NULL) handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2); msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); proc_waiting -= 1; return (1); } /* * Awaken processes pausing in request_cleanup and clear proc_waiting * to indicate that there is no longer a timer running. */ static void pause_timer(arg) void *arg; { ACQUIRE_LOCK(&lk); *stat_countp += 1; wakeup_one(&proc_waiting); if (proc_waiting > 0) handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2); else handle.callout = NULL; FREE_LOCK(&lk); } /* * Flush out a directory with at least one removal dependency in an effort to * reduce the number of dirrem, freefile, and freeblks dependency structures. 
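 * A static cursor rotates the starting hash bucket between calls so
 * that successive invocations spread their work over the whole
 * pagedep table instead of repeatedly flushing bucket 0; in outline
 * (summarizing the loop below):
 *
 *	pagedephd = &pagedep_hashtbl[next++];
 *	if (next >= pagedep_hash)
 *		next = 0;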
*/ static void clear_remove(td) struct thread *td; { struct pagedep_hashhead *pagedephd; struct pagedep *pagedep; static int next = 0; struct mount *mp; struct vnode *vp; int error, cnt; ino_t ino; mtx_assert(&lk, MA_OWNED); for (cnt = 0; cnt < pagedep_hash; cnt++) { pagedephd = &pagedep_hashtbl[next++]; if (next >= pagedep_hash) next = 0; LIST_FOREACH(pagedep, pagedephd, pd_hash) { - if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL) + if (LIST_EMPTY(&pagedep->pd_dirremhd)) continue; mp = pagedep->pd_list.wk_mp; ino = pagedep->pd_ino; if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) continue; FREE_LOCK(&lk); if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp))) { softdep_error("clear_remove: vget", error); vn_finished_write(mp); ACQUIRE_LOCK(&lk); return; } if ((error = ffs_syncvnode(vp, MNT_NOWAIT))) softdep_error("clear_remove: fsync", error); VI_LOCK(vp); drain_output(vp); VI_UNLOCK(vp); vput(vp); vn_finished_write(mp); ACQUIRE_LOCK(&lk); return; } } } /* * Clear out a block of dirty inodes in an effort to reduce * the number of inodedep dependency structures. */ static void clear_inodedeps(td) struct thread *td; { struct inodedep_hashhead *inodedephd; struct inodedep *inodedep; static int next = 0; struct mount *mp; struct vnode *vp; struct fs *fs; int error, cnt; ino_t firstino, lastino, ino; mtx_assert(&lk, MA_OWNED); /* * Pick a random inode dependency to be cleared. * We will then gather up all the inodes in its block * that have dependencies and flush them out. */ for (cnt = 0; cnt < inodedep_hash; cnt++) { inodedephd = &inodedep_hashtbl[next++]; if (next >= inodedep_hash) next = 0; if ((inodedep = LIST_FIRST(inodedephd)) != NULL) break; } if (inodedep == NULL) return; fs = inodedep->id_fs; mp = inodedep->id_list.wk_mp; /* * Find the last inode in the block with dependencies. */ firstino = inodedep->id_ino & ~(INOPB(fs) - 1); for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0) break; /* * Asynchronously push all but the last inode with dependencies. * Synchronously push the last inode with dependencies to ensure * that the inode block gets written to free up the inodedeps. */ for (ino = firstino; ino <= lastino; ino++) { if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) continue; if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) continue; FREE_LOCK(&lk); if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp)) != 0) { softdep_error("clear_inodedeps: vget", error); vn_finished_write(mp); ACQUIRE_LOCK(&lk); return; } if (ino == lastino) { if ((error = ffs_syncvnode(vp, MNT_WAIT))) softdep_error("clear_inodedeps: fsync1", error); } else { if ((error = ffs_syncvnode(vp, MNT_NOWAIT))) softdep_error("clear_inodedeps: fsync2", error); VI_LOCK(vp); drain_output(vp); VI_UNLOCK(vp); } vput(vp); vn_finished_write(mp); ACQUIRE_LOCK(&lk); } } /* * Function to determine if the buffer has outstanding dependencies * that will cause a roll-back if the buffer is written. If wantcount * is set, return number of dependencies, otherwise just yes or no. 
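 * Every dependency found either bumps the running count or, when the
 * caller only wants a yes/no answer, short-circuits out of the scan
 * (summarizing the idiom repeated below):
 *
 *	retval += 1;
 *	if (!wantcount)
 *		goto out;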
*/ static int softdep_count_dependencies(bp, wantcount) struct buf *bp; int wantcount; { struct worklist *wk; struct inodedep *inodedep; struct indirdep *indirdep; struct allocindir *aip; struct pagedep *pagedep; struct diradd *dap; int i, retval; retval = 0; ACQUIRE_LOCK(&lk); LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_INODEDEP: inodedep = WK_INODEDEP(wk); if ((inodedep->id_state & DEPCOMPLETE) == 0) { /* bitmap allocation dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_inoupdt)) { /* direct block pointer dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_extupdt)) { /* direct block pointer dependency */ retval += 1; if (!wantcount) goto out; } continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { /* indirect block pointer dependency */ retval += 1; if (!wantcount) goto out; } continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { /* directory entry dependency */ retval += 1; if (!wantcount) goto out; } } continue; case D_BMSAFEMAP: case D_ALLOCDIRECT: case D_ALLOCINDIR: case D_MKDIR: /* never a dependency on these blocks */ continue; default: panic("softdep_count_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } out: FREE_LOCK(&lk); return retval; } /* * Acquire exclusive access to a buffer. * Must be called with a locked mtx parameter. * Return acquired buffer or NULL on failure. */ static struct buf * getdirtybuf(bp, mtx, waitfor) struct buf *bp; struct mtx *mtx; int waitfor; { int error; mtx_assert(mtx, MA_OWNED); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) { if (waitfor != MNT_WAIT) return (NULL); error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx); /* * Even if we successfully acquire bp here, we have dropped * mtx, which may violate our guarantee. */ if (error == 0) BUF_UNLOCK(bp); else if (error != ENOLCK) panic("getdirtybuf: inconsistent lock: %d", error); mtx_lock(mtx); return (NULL); } if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { if (mtx == &lk && waitfor == MNT_WAIT) { mtx_unlock(mtx); BO_LOCK(bp->b_bufobj); BUF_UNLOCK(bp); if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO | PDROP, "getbuf", 0); } else BO_UNLOCK(bp->b_bufobj); mtx_lock(mtx); return (NULL); } BUF_UNLOCK(bp); if (waitfor != MNT_WAIT) return (NULL); /* * The mtx argument must be bp->b_vp's mutex in * this case. */ #ifdef DEBUG_VFS_LOCKS if (bp->b_vp->v_type != VCHR) ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf"); #endif bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0); return (NULL); } if ((bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); return (NULL); } bremfree(bp); return (bp); } /* * Check if it is safe to suspend the file system now. On entry, * the vnode interlock for devvp should be held. Return 0 with * the mount interlock held if the file system can be suspended now, * otherwise return EAGAIN with the mount interlock held.
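 * Three locks are in play (the devvp interlock on entry, the softdep
 * lock, and the mount interlock) and they are taken with trylocks,
 * backing off and blocking on whichever one misses before retrying;
 * in outline (summarizing the loop below):
 *
 *	if (!TRY_ACQUIRE_LOCK(&lk)) {
 *		VI_UNLOCK(devvp);
 *		ACQUIRE_LOCK(&lk);	(block until it is free)
 *		FREE_LOCK(&lk);
 *		VI_LOCK(devvp);
 *		continue;		(retry in the right order)
 *	}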
*/ int softdep_check_suspend(struct mount *mp, struct vnode *devvp, int softdep_deps, int softdep_accdeps, int secondary_writes, int secondary_accwrites) { struct bufobj *bo; struct ufsmount *ump; int error; ASSERT_VI_LOCKED(devvp, "softdep_check_suspend"); ump = VFSTOUFS(mp); bo = &devvp->v_bufobj; for (;;) { if (!TRY_ACQUIRE_LOCK(&lk)) { VI_UNLOCK(devvp); ACQUIRE_LOCK(&lk); FREE_LOCK(&lk); VI_LOCK(devvp); continue; } if (!MNT_ITRYLOCK(mp)) { FREE_LOCK(&lk); VI_UNLOCK(devvp); MNT_ILOCK(mp); MNT_IUNLOCK(mp); VI_LOCK(devvp); continue; } if (mp->mnt_secondary_writes != 0) { FREE_LOCK(&lk); VI_UNLOCK(devvp); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); VI_LOCK(devvp); continue; } break; } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Softdep activity occurred after start of vnode sync loop * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || softdep_deps != 0 || ump->softdep_deps != 0 || softdep_accdeps != ump->softdep_accdeps || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; FREE_LOCK(&lk); VI_UNLOCK(devvp); return (error); } /* * Get the number of dependency structures for the file system, both * the current number and the total number allocated. These will * later be used to detect that softdep processing has occurred. */ void softdep_get_depcounts(struct mount *mp, int *softdep_depsp, int *softdep_accdepsp) { struct ufsmount *ump; ump = VFSTOUFS(mp); ACQUIRE_LOCK(&lk); *softdep_depsp = ump->softdep_deps; *softdep_accdepsp = ump->softdep_accdeps; FREE_LOCK(&lk); } /* * Wait for pending output on a vnode to complete. * Must be called with vnode lock and interlock locked. * * XXX: Should just be a call to bufobj_wwait(). */ static void drain_output(vp) struct vnode *vp; { ASSERT_VOP_LOCKED(vp, "drain_output"); ASSERT_VI_LOCKED(vp, "drain_output"); while (vp->v_bufobj.bo_numoutput) { vp->v_bufobj.bo_flag |= BO_WWAIT; msleep((caddr_t)&vp->v_bufobj.bo_numoutput, VI_MTX(vp), PRIBIO + 1, "drainvp", 0); } } /* * Called whenever a buffer that is being invalidated or reallocated * contains dependencies. This should only happen if an I/O error has * occurred. The routine is called with the buffer locked. */ static void softdep_deallocate_dependencies(bp) struct buf *bp; { if ((bp->b_ioflags & BIO_ERROR) == 0) panic("softdep_deallocate_dependencies: dangling deps"); softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); panic("softdep_deallocate_dependencies: unrecovered I/O error"); } /* * Function to handle asynchronous write errors in the filesystem. */ static void softdep_error(func, error) char *func; int error; { /* XXX should do something better! */ printf("%s: got error %d while accessing filesystem\n", func, error); } #endif /* SOFTUPDATES */ Index: head/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c (revision 168352) +++ head/sys/ufs/ffs/ffs_vfsops.c (revision 168353) @@ -1,1864 +1,1864 @@ /*- * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; static int ffs_reload(struct mount *, struct thread *); static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_oldfscompat_write(struct fs *, struct ufsmount *); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, }; VFS_SET(ufs_vfsops, ufs, 0); MODULE_VERSION(ufs, 1); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, #ifdef NO_FFS_SNAPSHOT .bop_bdflush = bufbdflush, #else .bop_bdflush = ffs_bdflush, #endif }; static const char *ffs_opts[] = { "acls", "async", "atime", "clusterr", "clusterw", "exec", "export", "force", "from", "multilabel", "snapshot", "suid", "suiddir", "symfollow", "sync", "union", NULL }; static int ffs_mount(struct mount *mp, struct thread *td) { struct vnode *devvp; struct ufsmount *ump = 0; struct fs *fs; int error, flags; u_int mntorflags, mntandnotflags; mode_t accessmode; struct nameidata ndp; char *fspec; if (vfs_filteropt(mp->mnt_optnew, 
ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); mntorflags = 0; mntandnotflags = 0; if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mntorflags |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "async", NULL, NULL) == 0) mntorflags |= MNT_ASYNC; if (vfs_getopt(mp->mnt_optnew, "force", NULL, NULL) == 0) mntorflags |= MNT_FORCE; if (vfs_getopt(mp->mnt_optnew, "multilabel", NULL, NULL) == 0) mntorflags |= MNT_MULTILABEL; if (vfs_getopt(mp->mnt_optnew, "noasync", NULL, NULL) == 0) mntandnotflags |= MNT_ASYNC; if (vfs_getopt(mp->mnt_optnew, "noatime", NULL, NULL) == 0) mntorflags |= MNT_NOATIME; if (vfs_getopt(mp->mnt_optnew, "noclusterr", NULL, NULL) == 0) mntorflags |= MNT_NOCLUSTERR; if (vfs_getopt(mp->mnt_optnew, "noclusterw", NULL, NULL) == 0) mntorflags |= MNT_NOCLUSTERW; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) mntorflags |= MNT_SNAPSHOT; MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag | mntorflags) & ~mntandnotflags; MNT_IUNLOCK(mp); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); /* * Flush any dirty data. */ if ((error = ffs_sync(mp, MNT_WAIT, td)) != 0) { vn_finished_write(mp); return (error); } /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (mp->mnt_flag & MNT_SOFTDEP) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vn_finished_write(mp); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: %s: blocks %jd files %d\n", fs->fs_fsmnt, "update error", (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vn_finished_write(mp); return (error); } vn_finished_write(mp); DROP_GIANT(); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); PICKUP_GIANT(); fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, td)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. 
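 * Concretely (as the code below does): ask for read/write access on
 * the device vnode and let PRIV_VFS_MOUNT_PERM override a denial:
 *
 *	error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td);
 *	if (error)
 *		error = priv_check(td, PRIV_VFS_MOUNT_PERM);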
*/ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp, 0, td); return (error); } VOP_UNLOCK(devvp, 0, td); fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & FS_NEEDSFSCK) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not %s\n", fs->fs_fsmnt, "properly dismounted"); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->fs_fsmnt); return (EPERM); } } DROP_GIANT(); g_topology_lock(); /* * If we're the root device, we may not have an E count * yet, get it now. */ if (ump->um_cp->ace == 0) error = g_access(ump->um_cp, 0, 1, 1); else error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); PICKUP_GIANT(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); fs->fs_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { vn_finished_write(mp); return (error); } /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ vn_finished_write(mp); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vn_finished_write(mp); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (mp->mnt_flag & MNT_SOFTDEP) { /* XXX: Reset too late ? */ MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_ASYNC; MNT_IUNLOCK(mp); } /* * Keep MNT_ACLS flag if it is stored in superblock. */ if ((fs->fs_flags & FS_ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); } /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) return (ffs_snapshot(mp, fspec)); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(&ndp)) != 0) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accessmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= VWRITE; error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vput(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } } vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. 
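 * The translation is mechanical: copy in the caller's struct ufs_args
 * and re-express it as nmount(2)-style name/value options, e.g. (as
 * done below):
 *
 *	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
 *	error = kernel_mount(ma, flags);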
*/ static int ffs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td) { struct ufs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof args.export); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ static int ffs_reload(struct mount *mp, struct thread *td) { struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, size, error; int32_t *lp; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); ump = VFSTOUFS(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td); if (vinvalbuf(devvp, 0, td, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp, 0, td); /* * Step 2: re-read superblock from disk. */ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Copy pointer fields back into superblock before copying in XXX * new superblock. These should really be in the ufsmount. XXX * Note that important parameters (eg fs_ncg) are unchanged. */ newfs->fs_csp = fs->fs_csp; newfs->fs_maxcluster = fs->fs_maxcluster; newfs->fs_contigdirs = fs->fs_contigdirs; newfs->fs_active = fs->fs_active; /* The file system is still read-only. */ newfs->fs_ronly = 1; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: reload pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { lp = fs->fs_maxcluster; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } loop: MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); /* * Step 4: invalidate all cached file data. 
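 * A failed vget() restarts the whole scan (goto loop) rather than
 * continuing, because the mount interlock was dropped and the vnode
 * list may have changed underneath us.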
*/ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { MNT_VNODE_FOREACH_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, td, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_VNODE_FOREACH_ABORT(mp, mvp); return (error); } ffs_load_inode(bp, ip, fs, ip->i_number); ip->i_effnlink = ip->i_nlink; brelse(bp); VOP_UNLOCK(vp, 0, td); vrele(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); return (0); } /* * Possible superblock locations ordered from most to least likely. */ static int sblock_try[] = SBLOCKSEARCH; /* * Common code for mount and mountroot */ static int ffs_mountfs(devvp, mp, td) struct vnode *devvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct buf *bp; struct fs *fs; struct cdev *dev; void *space; ufs2_daddr_t sblockloc; int error, i, blks, size, ronly; int32_t *lp; struct ucred *cred; struct g_consumer *cp; struct mount *nmp; dev = devvp->v_rdev; cred = td ? td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); /* * If we are a root mount, drop the E flag so fsck can do its magic. * We will pick it up again when we remount R/W. */ if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS)) error = g_access(cp, 0, 0, -1); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0, td); if (error) return (error); if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; devvp->v_bufobj.bo_private = cp; devvp->v_bufobj.bo_ops = &ffs_ops; bp = NULL; ump = NULL; fs = NULL; sblockloc = 0; /* * Try reading the superblock in each of its possible locations. */ for (i = 0; sblock_try[i] != -1; i++) { if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { error = EINVAL; vfs_mount_error(mp, "Invalid sectorsize %d for superblock size %d", cp->provider->sectorsize, SBLOCKSIZE); goto out; } if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE, cred, &bp)) != 0) goto out; fs = (struct fs *)bp->b_data; sblockloc = sblock_try[i]; if ((fs->fs_magic == FS_UFS1_MAGIC || (fs->fs_magic == FS_UFS2_MAGIC && (fs->fs_sblockloc == sblockloc || (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) && fs->fs_bsize <= MAXBSIZE && fs->fs_bsize >= sizeof(struct fs)) break; brelse(bp); bp = NULL; } if (sblock_try[i] == -1) { error = EINVAL; /* XXX needs translation */ goto out; } fs->fs_fmod = 0; fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indices */ fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & FS_NEEDSFSCK) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf( "WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied.
Filesystem is not clean - run fsck\n", fs->fs_fsmnt); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("%s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: mount pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & FS_GJOURNAL) != 0) { #ifdef UFS_GJOURNAL /* * Get journal provider name. */ size = 1024; mp->mnt_gjprovider = malloc(size, M_UFSMNT, M_WAITOK); if (g_io_getattr("GJOURNAL::provider", cp, &size, mp->mnt_gjprovider) == 0) { mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, size, M_UFSMNT, M_WAITOK); MNT_ILOCK(mp); mp->mnt_flag |= MNT_GJOURNAL; MNT_IUNLOCK(mp); } else { printf( "WARNING: %s: GJOURNAL flag on fs but no gjournal provider below\n", mp->mnt_stat.f_mntonname); free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } #else printf( "WARNING: %s: GJOURNAL flag on fs but no UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); #endif } else { mp->mnt_gjprovider = NULL; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBLOCKSIZE) bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); bp = NULL; fs = ump->um_fs; ffs_oldfscompat_read(fs, ump, sblockloc); fs->fs_ronly = ronly; size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, cred, &bp)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); bp = NULL; } if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); fs->fs_active = NULL; mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; nmp = NULL; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { if (nmp) vfs_rel(nmp); vfs_getnewfsid(mp); } mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if ((fs->fs_flags & FS_MULTILABEL) != 0) { #ifdef MAC MNT_ILOCK(mp); mp->mnt_flag |= MNT_MULTILABEL; MNT_IUNLOCK(mp); #else printf( "WARNING: %s: multilabel flag on fs but no MAC support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_ACLS) != 
0) { #ifdef UFS_ACL MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); #else printf( "WARNING: %s: ACLs flag on fs but no ACLs support\n", mp->mnt_stat.f_mntonname); #endif } ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); if( mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { free(fs->fs_csp, M_UFSMNT); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * Initialize filesystem stat information in mount struct. */ #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_MPSAFE; MNT_IUNLOCK(mp); return (0); out: if (bp) brelse(bp); if (cp != NULL) { DROP_GIANT(); g_topology_lock(); g_vfs_close(cp, td); g_topology_unlock(); PICKUP_GIANT(); } if (ump) { mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; } return (error); } #include static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. */ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. 
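 * UFS1 kept these values in narrower fs_old_* fields; copying them
 * into the wide UFS2-era fields lets the rest of the kernel work
 * with a single layout, e.g. (as below):
 *
 *	fs->fs_time = fs->fs_old_time;
 *	fs->fs_size = fs->fs_old_size;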
*/ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags; flags = 0; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; } #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("ffs_unmount: ufs_extattr_stop returned %d\n", error); } else { ufs_extattr_uepm_destroy(&ump->um_extattr); } #endif if (mp->mnt_flag & MNT_SOFTDEP) { if ((error = softdep_flushfiles(mp, flags, td)) != 0) return (error); } else { if ((error = ffs_flushfiles(mp, flags, td)) != 0) return (error); } fs = ump->um_fs; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("%s: unmount pending error: blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (fs->fs_ronly == 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (error) { fs->fs_clean = 0; return (error); } } DROP_GIANT(); g_topology_lock(); g_vfs_close(ump->um_cp, td); g_topology_unlock(); PICKUP_GIANT(); vrele(ump->um_devvp); mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(fs->fs_csp, M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } /* * Flush out all the files in a filesystem. 
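 * System vnodes (quota files, snapshot files) are skipped on the
 * first vflush() pass, shut down explicitly, and then vflush() is
 * run again so that nothing is left behind.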
*/ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int error; ump = VFSTOUFS(mp); #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { quotaoff(td, mp, i); } /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); flags |= FORCECLOSE; /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Flush all the files. */ if ((error = vflush(mp, 0, flags, td)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp, 0, td); return (error); } /* * Get filesystem statistics. */ static int ffs_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = NAME_MAX; return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ static int ffs_sync(mp, waitfor, td) struct mount *mp; int waitfor; struct thread *td; { struct vnode *mvp, *vp, *devvp; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, wait, lockreq, allerror = 0; int suspend; int suspended; int secondary_writes; int secondary_accwrites; int softdep_deps; int softdep_accdeps; struct bufobj *bo; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */ printf("fs = %s\n", fs->fs_fsmnt); panic("ffs_sync: rofs mod"); } /* * Write back each (modified) inode. */ wait = 0; suspend = 0; suspended = 0; lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_SUSPEND) { suspend = 1; waitfor = MNT_WAIT; } if (waitfor == MNT_WAIT) { wait = 1; lockreq = LK_EXCLUSIVE; } lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; MNT_ILOCK(mp); loop: /* Grab snapshot of secondary write counts */ secondary_writes = mp->mnt_secondary_writes; secondary_accwrites = mp->mnt_secondary_accwrites; /* Grab snapshot of softdep dependency counts */ MNT_IUNLOCK(mp); softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { /* * Depend on the mntvnode_slock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. 
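 * The cheap test is the inode flag word plus the dirty buffer count,
 * both readable under the vnode interlock (summarizing the check
 * below):
 *
 *	if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED |
 *	    IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0)
 *		continue;		(nothing to write here)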
*/ VI_LOCK(vp); if (vp->v_iflag & VI_DOOMED) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if (vp->v_type == VNON || ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0)) { VI_UNLOCK(vp); continue; } MNT_IUNLOCK(mp); if ((error = vget(vp, lockreq, td)) != 0) { MNT_ILOCK(mp); if (error == ENOENT || error == ENOLCK) { MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } continue; } if ((error = ffs_syncvnode(vp, waitfor)) != 0) allerror = error; vput(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) { MNT_ILOCK(mp); goto loop; } } #ifdef QUOTA qsync(mp); #endif devvp = ump->um_devvp; VI_LOCK(devvp); bo = &devvp->v_bufobj; if (waitfor != MNT_LAZY && (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td); if ((error = VOP_FSYNC(devvp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(devvp, 0, td); if (allerror == 0 && waitfor == MNT_WAIT) { MNT_ILOCK(mp); goto loop; } } else if (suspend != 0) { if (softdep_check_suspend(mp, devvp, softdep_deps, softdep_accdeps, secondary_writes, secondary_accwrites) != 0) goto loop; /* More work needed */ mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); suspended = 1; } else VI_UNLOCK(devvp); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) allerror = error; return (allerror); } int ffs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; struct cdev *dev; int error; struct thread *td; error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such a rare case as simultaneous creation of a * vnode for the same ino by different processes. We just allow * them to race and check later to decide who wins. Let the race * begin! */ ump = VFSTOUFS(mp); dev = ump->um_dev; fs = ump->um_fs; /* * If this MALLOC() is performed after the getnewvnode() * it might block, leaving a vnode with a NULL v_data to be * found by ffs_sync() if a sync happens to fire right then, * which will cause a panic because ffs_sync() blindly * dereferences vp->v_data (as well it should). */ ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if (fs->fs_magic == FS_UFS1_MAGIC) error = getnewvnode("ufs", mp, &ffs_vnodeops1, &vp); else error = getnewvnode("ufs", mp, &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree(uma_inode, ip); return (error); } /* * FFS supports recursive and shared locking.
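 * The default vnode lock allows neither, so both flags are adjusted
 * on the freshly created vnode (LK_CANRECURSE on, LK_NOSHARE off).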
*/ vp->v_vnlock->lk_flags |= LK_CANRECURSE; vp->v_vnlock->lk_flags &= ~LK_NOSHARE; vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_fs = fs; ip->i_dev = dev; ip->i_number = ino; #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif td = curthread; lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL, td); error = insmntque(vp, mp); if (error != 0) { uma_zfree(uma_inode, ip); *vpp = NULL; return (error); } error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } if (ip->i_ump->um_fstype == UFS1) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); ffs_load_inode(bp, ip, fs, ino); if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if (ip->i_ump->um_fstype == UFS1) error = ufs_vinit(mp, &ffs_fifoops1, &vp); else error = ufs_vinit(mp, &ffs_fifoops2, &vp); if (error) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { ip->i_gen = arc4random() / 2 + 1; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { ip->i_flag |= IN_MODIFIED; DIP_SET(ip, i_gen, ip->i_gen); } } /* * Ensure that uid and gid are correct. This is a temporary * fix until fsck has been changed to do the update. */ if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */ fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */ ip->i_uid = ip->i_din1->di_ouid; /* XXX */ ip->i_gid = ip->i_din1->di_ogid; /* XXX */ } /* XXX */ #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_associate_vnode_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vput(vp); *vpp = NULL; return (error); } } #endif *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via exflagsp and credanonp */ static int ffs_fhtovp(mp, fhp, vpp) struct mount *mp; struct fid *fhp; struct vnode **vpp; { struct ufid *ufhp; struct fs *fs; ufhp = (struct ufid *)fhp; fs = VFSTOUFS(mp)->um_fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); return (ufs_fhtovp(mp, ufhp, vpp)); } /* * Initialize the filesystem. */ static int ffs_init(vfsp) struct vfsconf *vfsp; { softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init().
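 * Teardown runs in the reverse order of ffs_init(): the UFS layer is
 * taken down first, then the soft updates machinery.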
*/ static int ffs_uninit(vfsp) struct vfsconf *vfsp; { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); return (ret); } /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(mp, waitfor, suspended) struct ufsmount *mp; int waitfor; int suspended; { struct fs *fs = mp->um_fs; struct buf *sbbp; struct buf *bp; int blks; void *space; int i, size, error, allerror = 0; if (fs->fs_ronly == 1 && (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE)) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * First write back the summary information. */ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0, 0); bcopy(space, bp->b_data, (u_int)size); space = (char *)space + size; if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; } /* * Now write back the superblock itself. If any errors occurred * up to this point, then fail so that the superblock avoids * being written out as clean. */ if (allerror) { brelse(sbbp); return (allerror); } bp = sbbp; if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { printf("%s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_flags & FS_FLAGS_UPDATED) == 0) { printf("%s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } fs->fs_fmod = 0; fs->fs_time = time_second; bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, mp); if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) allerror = error; return (allerror); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname, struct thread *td) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname, td)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* Grab an extra reference to be dropped by the bufdone() below. */ bufobj_wrefl(bufobj); BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. 
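 * Dependencies satisfied by this write are processed here; any that
 * remain are moved back to the original buffer (the LIST_EMPTY()
 * checks below) so they are not lost when this throwaway copy is
 * released.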
*/ - if (LIST_FIRST(&bp->b_dep) != NULL) + if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); #ifdef SOFTUPDATES - if (LIST_FIRST(&bp->b_dep) != NULL) + if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~B_CACHE; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { int oldflags, s; struct buf *newbp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; if (BUF_REFCNT(bp) == 0) panic("bufwrite: buffer is not busy???"); s = splbio(); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); splx(s); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } BO_UNLOCK(bp->b_bufobj); /* Mark the buffer clean */ bundirty(bp); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize); /* * set it to be identical to the old block. We have to * set b_lblkno and BKGRDMARKER before calling bgetvp() * to avoid confusing the splay tree and gbincore(). 
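 * gbincore() looks buffers up by logical block number in the
 * per-bufobj tree, and the copy briefly carries the same b_lblkno as
 * the original; BX_BKGRDMARKER tells the buffer-cache lookup code to
 * ignore the copy until it is fully wired up.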
*/ memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); newbp->b_lblkno = bp->b_lblkno; newbp->b_xflags |= BX_BKGRDMARKER; BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; bgetvp(bp->b_vp, newbp); BO_UNLOCK(bp->b_bufobj); newbp->b_bufobj = &bp->b_vp->v_bufobj; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; #ifdef SOFTUPDATES /* move over the dependencies */ - if (LIST_FIRST(&bp->b_dep) != NULL) + if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, newbp); #endif /* * Initiate write on the copy, release the original to * the B_LOCKED queue so that it cannot go away until * the background write completes. If not locked it could go * away and then be reconstituted while it was being written. * If the reconstituted buffer were written, we could end up * with two background copies being written at the same time. */ bqrelse(bp); bp = newbp; } /* Let the normal bufwrite do the rest for us */ return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; int error; struct buf *tbp; vp = bo->__bo_vnode; if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); bp->b_flags &= ~B_VALIDSUSPWRT; if ((vp->v_vflag & VV_COPYONWRITE) && vp->v_rdev->si_snapdata != NULL) { if ((bp->b_flags & B_CLUSTER) != 0) { runningbufwakeup(bp); TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { error = ffs_copyonwrite(vp, tbp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } bp->b_runningbufspace = bp->b_bufsize; atomic_add_int(&runningbufspace, bp->b_runningbufspace); } else { error = ffs_copyonwrite(vp, bp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } } #ifdef SOFTUPDATES if ((bp->b_flags & B_CLUSTER) != 0) { TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { - if (LIST_FIRST(&tbp->b_dep) != NULL) + if (!LIST_EMPTY(&tbp->b_dep)) buf_start(tbp); } } else { - if (LIST_FIRST(&bp->b_dep) != NULL) + if (!LIST_EMPTY(&bp->b_dep)) buf_start(bp); } #endif } g_vfs_strategy(bo, bp); } Index: head/sys/ufs/ffs/ffs_vnops.c =================================================================== --- head/sys/ufs/ffs/ffs_vnops.c (revision 168352) +++ head/sys/ufs/ffs/ffs_vnops.c (revision 168353) @@ -1,1736 +1,1736 @@ /*- * Copyright (c) 2002, 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ... 
* @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include "opt_ffs.h" #ifdef DIRECTIO extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); #endif static vop_fsync_t ffs_fsync; static _vop_lock_t ffs_lock; static vop_getpages_t ffs_getpages; static vop_read_t ffs_read; static vop_write_t ffs_write; static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred); static vop_strategy_t ffsext_strategy; static vop_closeextattr_t ffs_closeextattr; static vop_deleteextattr_t ffs_deleteextattr; static vop_getextattr_t ffs_getextattr; static vop_listextattr_t ffs_listextattr; static vop_openextattr_t ffs_openextattr; static vop_setextattr_t ffs_setextattr; static vop_vptofh_t ffs_vptofh; /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops1 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, .vop_getpages = ffs_getpages, ._vop_lock = ffs_lock, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_vptofh = ffs_vptofh, }; struct vop_vector ffs_fifoops1 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, .vop_reallocblks = ffs_reallocblks, /* XXX: really ??? */ .vop_vptofh = ffs_vptofh, }; /* Global vfs data structures for ufs. */ struct vop_vector ffs_vnodeops2 = { .vop_default = &ufs_vnodeops, .vop_fsync = ffs_fsync, .vop_getpages = ffs_getpages, ._vop_lock = ffs_lock, .vop_read = ffs_read, .vop_reallocblks = ffs_reallocblks, .vop_write = ffs_write, .vop_closeextattr = ffs_closeextattr, .vop_deleteextattr = ffs_deleteextattr, .vop_getextattr = ffs_getextattr, .vop_listextattr = ffs_listextattr, .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, }; struct vop_vector ffs_fifoops2 = { .vop_default = &ufs_fifoops, .vop_fsync = ffs_fsync, ._vop_lock = ffs_lock, .vop_reallocblks = ffs_reallocblks, .vop_strategy = ffsext_strategy, .vop_closeextattr = ffs_closeextattr, .vop_deleteextattr = ffs_deleteextattr, .vop_getextattr = ffs_getextattr, .vop_listextattr = ffs_listextattr, .vop_openextattr = ffs_openextattr, .vop_setextattr = ffs_setextattr, .vop_vptofh = ffs_vptofh, }; /* * Synch an open file. */ /* ARGSUSED */ static int ffs_fsync(struct vop_fsync_args *ap) { int error; error = ffs_syncvnode(ap->a_vp, ap->a_waitfor); if (error) return (error); if (ap->a_waitfor == MNT_WAIT && (ap->a_vp->v_mount->mnt_flag & MNT_SOFTDEP)) error = softdep_fsync(ap->a_vp); return (error); } int ffs_syncvnode(struct vnode *vp, int waitfor) { struct inode *ip = VTOI(vp); struct buf *bp; struct buf *nbp; int s, error, wait, passes, skipmeta; ufs_lbn_t lbn; wait = (waitfor == MNT_WAIT); lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1)); /* * Flush all dirty buffers associated with a vnode. 
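 *
 * [Editor's aside] The lbn computed just above, lblkno(fs, i_size +
 * bsize - 1), names the first logical block wholly past end of file;
 * the loop below treats VREG buffers at or beyond it as truncated-off
 * data and discards them.  A userspace sketch of the arithmetic,
 * assuming the usual power-of-two block size (BSIZE and BSHIFT are
 * stand-in values):
 */
#if 0	/* standalone illustration only */
#include <stdio.h>
#include <stdint.h>

#define BSIZE	16384			/* hypothetical fs_bsize */
#define BSHIFT	14			/* log2(BSIZE) */
#define LBLKNO(off)	((off) >> BSHIFT)	/* lblkno() analogue */

int
main(void)
{
	int64_t size = 100000;		/* hypothetical i_size */
	/* Round up before shifting: blocks 0..6 still back this file,
	 * so block 7 is the first one past EOF. */
	int64_t lbn = LBLKNO(size + BSIZE - 1);

	printf("size %jd -> first lbn past EOF = %jd\n",
	    (intmax_t)size, (intmax_t)lbn);
	return (0);
}
#endif
/*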
*/ passes = NIADDR + 1; skipmeta = 0; if (wait) skipmeta = 1; s = splbio(); VI_LOCK(vp); loop: TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) bp->b_vflags &= ~BV_SCANNED; TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) { /* * Reasons to skip this buffer: it has already been considered * on this pass, this pass is the first time through on a * synchronous flush request and the buffer being considered * is metadata, the buffer has dependencies that will cause * it to be redirtied and it has not already been deferred, * or it is already being written. */ if ((bp->b_vflags & BV_SCANNED) != 0) continue; bp->b_vflags |= BV_SCANNED; if ((skipmeta == 1 && bp->b_lblkno < 0)) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; VI_UNLOCK(vp); - if (!wait && LIST_FIRST(&bp->b_dep) != NULL && + if (!wait && !LIST_EMPTY(&bp->b_dep) && (bp->b_flags & B_DEFERRED) == 0 && buf_countdeps(bp, 0)) { bp->b_flags |= B_DEFERRED; BUF_UNLOCK(bp); VI_LOCK(vp); continue; } if ((bp->b_flags & B_DELWRI) == 0) panic("ffs_fsync: not dirty"); /* * If this is a synchronous flush request, or it is not a * file or device, start the write on this buffer immediately. */ if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) { /* * On our final pass through, do all I/O synchronously * so that we can find out if our flush is failing * because of write errors. */ if (passes > 0 || !wait) { if ((bp->b_flags & B_CLUSTEROK) && !wait) { (void) vfs_bio_awrite(bp); } else { bremfree(bp); splx(s); (void) bawrite(bp); s = splbio(); } } else { bremfree(bp); splx(s); if ((error = bwrite(bp)) != 0) return (error); s = splbio(); } } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) { /* * If the buffer is for data that has been truncated * off the file, then throw it away. */ bremfree(bp); bp->b_flags |= B_INVAL | B_NOCACHE; splx(s); brelse(bp); s = splbio(); } else vfs_bio_awrite(bp); /* * Since we may have slept during the I/O, we need * to start from a known point. */ VI_LOCK(vp); nbp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd); } /* * If we were asked to do this synchronously, then go back for * another pass, this time doing the metadata. */ if (skipmeta) { skipmeta = 0; goto loop; } if (wait) { bufobj_wwait(&vp->v_bufobj, 3, 0); VI_UNLOCK(vp); /* * Ensure that any filesystem metadata associated * with the vnode has been written. */ splx(s); if ((error = softdep_sync_metadata(vp)) != 0) return (error); s = splbio(); VI_LOCK(vp); if (vp->v_bufobj.bo_dirty.bv_cnt > 0) { /* * Block devices associated with filesystems may * have new I/O requests posted for them even if * the vnode is locked, so no amount of trying will * get them clean. Thus we give block devices a * good effort, then just give up. For all other file * types, go around and try again until it is clean. */ if (passes > 0) { passes -= 1; goto loop; } #ifdef DIAGNOSTIC if (!vn_isdisk(vp, NULL)) vprint("ffs_fsync: dirty", vp); #endif } } VI_UNLOCK(vp); splx(s); return (ffs_update(vp, wait)); } static int ffs_lock(ap) struct _vop_lock_args /* { struct vnode *a_vp; int a_flags; struct thread *a_td; char *file; int line; } */ *ap; { #ifndef NO_FFS_SNAPSHOT struct vnode *vp; int flags; struct lock *lkp; int result; switch (ap->a_flags & LK_TYPE_MASK) { case LK_SHARED: case LK_UPGRADE: case LK_EXCLUSIVE: vp = ap->a_vp; flags = ap->a_flags; for (;;) { /* * vnode interlock must be held to ensure that * the possibly external lock isn't freed, * e.g. when mutating from snapshot file vnode * to regular file vnode.
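 *
 * [Editor's aside] The retry loop below re-checks vp->v_vnlock after
 * each acquisition because the vnode can switch between its private
 * lock and a snapshot lock while the locker sleeps.  A pthread sketch
 * of the same pattern -- illustrative only, since the real code reads
 * the pointer under the vnode interlock, which this sketch elides:
 */
#if 0	/* standalone illustration only */
#include <pthread.h>
#include <stdio.h>

struct obj {				/* hypothetical lockable object */
	pthread_mutex_t *lockp;		/* may be swapped while we sleep */
};

static pthread_mutex_t *
obj_lock(struct obj *o)
{
	pthread_mutex_t *lkp;

	for (;;) {
		lkp = o->lockp;		/* snapshot the current lock */
		pthread_mutex_lock(lkp);
		if (lkp == o->lockp)	/* still the right lock? */
			return (lkp);	/* caller unlocks lkp */
		/* Object mutated while we slept; drop and retry. */
		pthread_mutex_unlock(lkp);
	}
}

int
main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	struct obj o = { .lockp = &m };
	pthread_mutex_t *lkp = obj_lock(&o);

	pthread_mutex_unlock(lkp);
	printf("locked via the object's current lock\n");
	return (0);
}
#endif
/*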
*/ if ((flags & LK_INTERLOCK) == 0) { VI_LOCK(vp); flags |= LK_INTERLOCK; } lkp = vp->v_vnlock; result = _lockmgr(lkp, flags, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line); if (lkp == vp->v_vnlock || result != 0) break; /* * Apparent success, except that the vnode * mutated between snapshot file vnode and * regular file vnode while this process * slept. The lock currently held is not the * right lock. Release it, and try to get the * new lock. */ (void) _lockmgr(lkp, LK_RELEASE, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line); if ((flags & LK_TYPE_MASK) == LK_UPGRADE) flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; flags &= ~LK_INTERLOCK; } break; default: result = _VOP_LOCK_APV(&ufs_vnodeops, ap); } return (result); #else return (_VOP_LOCK_APV(&ufs_vnodeops, ap)); #endif } /* * Vnode op for reading. */ /* ARGSUSED */ static int ffs_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct inode *ip; struct uio *uio; struct fs *fs; struct buf *bp; ufs_lbn_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error, orig_resid; int seqcount; int ioflag; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; if (ap->a_ioflag & IO_EXT) #ifdef notyet return (ffs_extread(vp, uio, ioflag)); #else panic("ffs_read+IO_EXT"); #endif #ifdef DIRECTIO if ((ioflag & IO_DIRECT) != 0) { int workdone; error = ffs_rawread(vp, uio, &workdone); if (error != 0 || workdone != 0) return error; } #endif seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("ffs_read: mode"); if (vp->v_type == VLNK) { if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) panic("ffs_read: short symlink"); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("ffs_read: type %d", vp->v_type); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0")); fs = ip->i_fs; if (uio->uio_offset < ip->i_size && uio->uio_offset >= fs->fs_maxfilesize) return (EOVERFLOW); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). */ size = blksize(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!) */ xfersize = fs->fs_bsize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= ip->i_size) { /* * Don't do readahead if this is the end of the file. */ error = bread(vp, lbn, size, NOCRED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { /* * Otherwise if we are allowed to cluster, * grab as much as we can. * * XXX This may not be a win if we are not * doing sequential access. 
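 *
 * [Editor's aside] Each pass of the read loop above moves the rest of
 * the current block, clamped by what the caller still wants and by how
 * much file is left -- the three-way minimum the comments describe.
 * A userspace sketch (BSIZE is a stand-in for fs_bsize):
 */
#if 0	/* standalone illustration only */
#include <stdio.h>
#include <stdint.h>

#define BSIZE	16384
#define MIN(a, b)	((a) < (b) ? (a) : (b))

static int64_t
xfersize(int64_t off, int64_t resid, int64_t filesize)
{
	int64_t blkoffset = off % BSIZE;	/* blkoff() analogue */
	int64_t n = BSIZE - blkoffset;		/* rest of this block */

	n = MIN(n, resid);			/* caller wants less? */
	n = MIN(n, filesize - off);		/* file ends sooner? */
	return (n);
}

int
main(void)
{
	/* First step stops at the block boundary (16284 bytes),
	 * the next one moves a whole block (16384 bytes). */
	printf("%jd\n", (intmax_t)xfersize(100, 40000, 1 << 20));
	printf("%jd\n", (intmax_t)xfersize(16384, 23716, 1 << 20));
	return (0);
}
#endif
/*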
*/ error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, blkoffset + uio->uio_resid, seqcount, &bp); } else if (seqcount > 1) { /* * If we are NOT allowed to cluster, then * if we appear to be acting sequentially, * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. */ int nextsize = blksize(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } else { /* * Failing all of the above, just read what the * user asked for. Interestingly, the same as * the first option above. */ error = bread(vp, lbn, size, NOCRED, &bp); } if (error) { brelse(bp); bp = NULL; break; } /* * If IO_DIRECT then set B_DIRECT for the buffer. This * will cause us to attempt to release the buffer later on * and will cause the buffer cache to attempt to free the * underlying pages. */ if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_FIRST(&bp->b_dep) == NULL)) { + (LIST_EMPTY(&bp->b_dep))) { /* * If there are no dependencies, and it's VMIO, * then we don't need the buf, mark it available * for freeing. The VM has the data. */ bp->b_flags |= B_RELBUF; brelse(bp); } else { /* * Otherwise let whoever * made the request take care of * freeing it. We just queue * it onto another list. */ bqrelse(bp); } } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) { if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_FIRST(&bp->b_dep) == NULL)) { + (LIST_EMPTY(&bp->b_dep))) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bqrelse(bp); } } if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) { VI_LOCK(vp); ip->i_flag |= IN_ACCESS; VI_UNLOCK(vp); } return (error); } /* * Vnode op for writing. 
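 *
 * [Editor's aside, on the read path just above] The brelse-vs-bqrelse
 * choice -- here and at the other LIST_EMPTY(&bp->b_dep) sites in this
 * file -- is a two-input decision: throw the buffer away only when the
 * VM pages already hold the data and nothing depends on the buf.  A
 * sketch with hypothetical flag values:
 */
#if 0	/* standalone illustration only */
#include <stdbool.h>
#include <stdio.h>

#define IO_VMIO		0x01		/* hypothetical encodings */
#define IO_DIRECT	0x02

static const char *
release_policy(int ioflag, bool has_deps)
{
	/* Data lives in VM pages and no dependencies: toss the buf. */
	if ((ioflag & (IO_VMIO | IO_DIRECT)) != 0 && !has_deps)
		return ("brelse with B_RELBUF");
	/* Otherwise requeue it and let it age out normally. */
	return ("bqrelse");
}

int
main(void)
{
	printf("%s\n", release_policy(IO_VMIO, false));
	printf("%s\n", release_policy(IO_VMIO, true));
	printf("%s\n", release_policy(0, false));
	return (0);
}
#endif
/*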
*/ static int ffs_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct vnode *vp; struct uio *uio; struct inode *ip; struct fs *fs; struct buf *bp; struct thread *td; ufs_lbn_t lbn; off_t osize; int seqcount; int blkoffset, error, flags, ioflag, resid, size, xfersize; vp = ap->a_vp; uio = ap->a_uio; ioflag = ap->a_ioflag; if (ap->a_ioflag & IO_EXT) #ifdef notyet return (ffs_extwrite(vp, uio, ioflag, ap->a_cred)); #else panic("ffs_write+IO_EXT"); #endif seqcount = ap->a_ioflag >> IO_SEQSHIFT; ip = VTOI(vp); #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("ffs_write: mode"); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: panic("ffs_write: dir write"); break; default: panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type, (int)uio->uio_offset, (int)uio->uio_resid ); } KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0")); KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0")); fs = ip->i_fs; if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) return (EFBIG); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ td = uio->uio_td; if (vp->v_type == VREG && td != NULL) { PROC_LOCK(td->td_proc); if (uio->uio_offset + uio->uio_resid > lim_cur(td->td_proc, RLIMIT_FSIZE)) { psignal(td->td_proc, SIGXFSZ); PROC_UNLOCK(td->td_proc); return (EFBIG); } PROC_UNLOCK(td->td_proc); } resid = uio->uio_resid; osize = ip->i_size; if (seqcount > BA_SEQMAX) flags = BA_SEQMAX << BA_SEQSHIFT; else flags = seqcount << BA_SEQSHIFT; if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; /* XXX is uio->uio_offset the right thing here? */ error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ap->a_cred, flags, &bp); if (error != 0) break; /* * If the buffer is not valid we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland * mmap(). XXX deal with uiomove() errors a better way. */ if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) vfs_bio_clrbuf(bp); if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) bp->b_flags |= B_NOCACHE; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; DIP_SET(ip, i_size, ip->i_size); } size = blksize(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_FIRST(&bp->b_dep) == NULL)) { + (LIST_EMPTY(&bp->b_dep))) { bp->b_flags |= B_RELBUF; } /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. 
Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else if (xfersize + blkoffset == fs->fs_bsize) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; cluster_write(vp, bp, ip->i_size, seqcount); } else { bawrite(bp); } } else if (ioflag & IO_DIRECT) { bp->b_flags |= B_CLUSTEROK; bawrite(bp); } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ap->a_cred) { if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, SUSER_ALLOWJAIL)) { ip->i_mode &= ~(ISUID | ISGID); DIP_SET(ip, i_mode, ip->i_mode); } } if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred, uio->uio_td); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); return (error); } /* * get page routine */ static int ffs_getpages(ap) struct vop_getpages_args *ap; { int i; vm_page_t mreq; int pcount; pcount = round_page(ap->a_count) / PAGE_SIZE; mreq = ap->a_m[ap->a_reqpage]; /* * if ANY DEV_BSIZE blocks are valid on a large filesystem block, * then the entire page is valid. Since the page may be mapped, * user programs might reference data beyond the actual end of file * occurring within the page. We have to zero that data. */ VM_OBJECT_LOCK(mreq->object); if (mreq->valid) { if (mreq->valid != VM_PAGE_BITS_ALL) vm_page_zero_invalid(mreq, TRUE); vm_page_lock_queues(); for (i = 0; i < pcount; i++) { if (i != ap->a_reqpage) { vm_page_free(ap->a_m[i]); } } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(mreq->object); return VM_PAGER_OK; } VM_OBJECT_UNLOCK(mreq->object); return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage); } /* * Extended attribute area reading. */ static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct buf *bp; ufs_lbn_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error, orig_resid; ip = VTOI(vp); fs = ip->i_fs; dp = ip->i_din2; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_extread: mode"); #endif orig_resid = uio->uio_resid; KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0")); if (orig_resid == 0) return (0); KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0")); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; /* * size of buffer. The buffer representing the * end of the file is rounded up to the size of * the block type ( fragment or full block, * depending ). */ size = sblksize(fs, dp->di_extsize, lbn); blkoffset = blkoff(fs, uio->uio_offset); /* * The amount we want to transfer in this iteration is * one FS block less the amount of the data before * our startpoint (duh!)
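 *
 * [Editor's aside] The if/else ladder in ffs_write above implements
 * the policy its comment states: sync write, else async under memory
 * pressure, else cluster full blocks, else delay.  Restated as a
 * sketch mapping hypothetical condition bits to the routine chosen:
 */
#if 0	/* standalone illustration only */
#include <stdbool.h>
#include <stdio.h>

struct wctx {				/* hypothetical condition bits */
	bool io_sync;			/* IO_SYNC requested */
	bool mem_pressure;		/* severe page/dirty-buf shortage */
	bool io_async;			/* IO_ASYNC requested */
	bool full_block;		/* xfersize + blkoffset == bsize */
	bool may_cluster;		/* MNT_NOCLUSTERW not set */
	bool io_direct;			/* IO_DIRECT requested */
};

static const char *
write_dispatch(const struct wctx *w)
{
	if (w->io_sync)
		return ("bwrite");	/* synchronous */
	if (w->mem_pressure || w->io_async)
		return ("bawrite");	/* async, relieves pressure */
	if (w->full_block)
		return (w->may_cluster ? "cluster_write" : "bawrite");
	if (w->io_direct)
		return ("bawrite");
	return ("bdwrite");		/* delayed write */
}

int
main(void)
{
	struct wctx w = { .full_block = true, .may_cluster = true };

	printf("%s\n", write_dispatch(&w));	/* cluster_write */
	return (0);
}
#endif
/*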
*/ xfersize = fs->fs_bsize - blkoffset; /* * But if we actually want less than the block, * or the file doesn't have a whole block more of data, * then use the lesser number. */ if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) >= dp->di_extsize) { /* * Don't do readahead if this is the end of the info. */ error = bread(vp, -1 - lbn, size, NOCRED, &bp); } else { /* * If we have a second block, then * fire off a request for a readahead * as well as a read. Note that the 4th and 5th * arguments point to arrays of the size specified in * the 6th argument. */ int nextsize = sblksize(fs, dp->di_extsize, nextlbn); nextlbn = -1 - nextlbn; error = breadn(vp, -1 - lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } if (error) { brelse(bp); bp = NULL; break; } /* * If IO_DIRECT then set B_DIRECT for the buffer. This * will cause us to attempt to release the buffer later on * and will cause the buffer cache to attempt to free the * underlying pages. */ if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (error) break; if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_FIRST(&bp->b_dep) == NULL)) { + (LIST_EMPTY(&bp->b_dep))) { /* * If there are no dependencies, and it's VMIO, * then we don't need the buf, mark it available * for freeing. The VM has the data. */ bp->b_flags |= B_RELBUF; brelse(bp); } else { /* * Otherwise let whoever * made the request take care of * freeing it. We just queue * it onto another list. */ bqrelse(bp); } } /* * This can only happen in the case of an error * because the loop above resets bp to NULL on each iteration * and on normal completion has not set a new value into it. * so it must have come from a 'break' statement */ if (bp != NULL) { if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_FIRST(&bp->b_dep) == NULL)) { + (LIST_EMPTY(&bp->b_dep))) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bqrelse(bp); } } if ((error == 0 || uio->uio_resid != orig_resid) && (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) { VI_LOCK(vp); ip->i_flag |= IN_ACCESS; VI_UNLOCK(vp); } return (error); } /* * Extended attribute area writing. 
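 *
 * [Editor's aside] ffs_extread above (and ffs_extwrite below) reach
 * the extended-attribute area through negative logical block numbers:
 * ext block n is read as lbn -1 - n, which can never collide with a
 * data block and is its own inverse.  A sketch of the encoding:
 */
#if 0	/* standalone illustration only */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

static int64_t
ext_lbn(int64_t n)			/* -1 - n, an involution */
{
	return (-1 - n);
}

int
main(void)
{
	for (int64_t n = 0; n < 4; n++) {
		int64_t enc = ext_lbn(n);

		assert(enc < 0);		/* never shadows data lbns */
		assert(ext_lbn(enc) == n);	/* self-inverse */
		printf("ext block %jd -> lbn %jd\n",
		    (intmax_t)n, (intmax_t)enc);
	}
	return (0);
}
#endif
/*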
*/ static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) { struct inode *ip; struct ufs2_dinode *dp; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; off_t osize; int blkoffset, error, flags, resid, size, xfersize; ip = VTOI(vp); fs = ip->i_fs; dp = ip->i_din2; KASSERT(!(ip->i_flag & IN_SPACECOUNTED), ("inode %u: inode is dead", ip->i_number)); #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_extwrite: mode"); #endif if (ioflag & IO_APPEND) uio->uio_offset = dp->di_extsize; KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0")); KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0")); if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize) return (EFBIG); resid = uio->uio_resid; osize = dp->di_extsize; flags = IO_EXT; if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) flags |= IO_SYNC; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; /* * We must perform a read-before-write if the transfer size * does not cover the entire buffer. */ if (fs->fs_bsize > xfersize) flags |= BA_CLRBUF; else flags &= ~BA_CLRBUF; error = UFS_BALLOC(vp, uio->uio_offset, xfersize, ucred, flags, &bp); if (error != 0) break; /* * If the buffer is not valid we have to clear out any * garbage data from the pages instantiated for the buffer. * If we do not, a failed uiomove() during a write can leave * the prior contents of the pages exposed to a userland * mmap(). XXX deal with uiomove() errors a better way. */ if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) vfs_bio_clrbuf(bp); if (ioflag & IO_DIRECT) bp->b_flags |= B_DIRECT; if (uio->uio_offset + xfersize > dp->di_extsize) dp->di_extsize = uio->uio_offset + xfersize; size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if ((ioflag & (IO_VMIO|IO_DIRECT)) && - (LIST_FIRST(&bp->b_dep) == NULL)) { + (LIST_EMPTY(&bp->b_dep))) { bp->b_flags |= B_RELBUF; } /* * If IO_SYNC each buffer is written synchronously. Otherwise * if we have a severe page deficiency write the buffer * asynchronously. Otherwise try to cluster, and if that * doesn't do it then either do an async write (if O_DIRECT), * or a delayed write (if not). */ if (ioflag & IO_SYNC) { (void)bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || xfersize + blkoffset == fs->fs_bsize || (ioflag & (IO_ASYNC | IO_DIRECT))) bawrite(bp); else bdwrite(bp); if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) { if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, SUSER_ALLOWJAIL)) { ip->i_mode &= ~(ISUID | ISGID); dp->di_mode = ip->i_mode; } } if (error) { if (ioflag & IO_UNIT) { (void)ffs_truncate(vp, osize, IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) error = ffs_update(vp, 1); return (error); } /* * Vnode operation to retrieve a named extended attribute.
* * Locate a particular EA (nspace:name) in the area (ptr:length), and return * the length of the EA, and possibly the pointer to the entry and to the data. */ static int ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, u_char **eap, u_char **eac) { u_char *p, *pe, *pn, *p0; int eapad1, eapad2, ealength, ealen, nlen; uint32_t ul; pe = ptr + length; nlen = strlen(name); for (p = ptr; p < pe; p = pn) { p0 = p; bcopy(p, &ul, sizeof(ul)); pn = p + ul; /* make sure this entry is complete */ if (pn > pe) break; p += sizeof(uint32_t); if (*p != nspace) continue; p++; eapad2 = *p++; if (*p != nlen) continue; p++; if (bcmp(p, name, nlen)) continue; ealength = sizeof(uint32_t) + 3 + nlen; eapad1 = 8 - (ealength % 8); if (eapad1 == 8) eapad1 = 0; ealength += eapad1; ealen = ul - ealength - eapad2; p += nlen + eapad1; if (eap != NULL) *eap = p0; if (eac != NULL) *eac = p; return (ealen); } return(-1); } static int ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra) { struct inode *ip; struct ufs2_dinode *dp; struct uio luio; struct iovec liovec; int easize, error; u_char *eae; ip = VTOI(vp); dp = ip->i_din2; easize = dp->di_extsize; eae = malloc(easize + extra, M_TEMP, M_WAITOK); liovec.iov_base = eae; liovec.iov_len = easize; luio.uio_iov = &liovec; luio.uio_iovcnt = 1; luio.uio_offset = 0; luio.uio_resid = easize; luio.uio_segflg = UIO_SYSSPACE; luio.uio_rw = UIO_READ; luio.uio_td = td; error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC); if (error) { free(eae, M_TEMP); return(error); } *p = eae; return (0); } static int ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td) { struct inode *ip; struct ufs2_dinode *dp; int error; ip = VTOI(vp); if (ip->i_ea_area != NULL) return (EBUSY); dp = ip->i_din2; error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0); if (error) return (error); ip->i_ea_len = dp->di_extsize; ip->i_ea_error = 0; return (0); } /* * Vnode extattr transaction commit/abort */ static int ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td) { struct inode *ip; struct uio luio; struct iovec liovec; int error; struct ufs2_dinode *dp; ip = VTOI(vp); if (ip->i_ea_area == NULL) return (EINVAL); dp = ip->i_din2; error = ip->i_ea_error; if (commit && error == 0) { if (cred == NOCRED) cred = vp->v_mount->mnt_cred; liovec.iov_base = ip->i_ea_area; liovec.iov_len = ip->i_ea_len; luio.uio_iov = &liovec; luio.uio_iovcnt = 1; luio.uio_offset = 0; luio.uio_resid = ip->i_ea_len; luio.uio_segflg = UIO_SYSSPACE; luio.uio_rw = UIO_WRITE; luio.uio_td = td; /* XXX: I'm not happy about truncating to zero size */ if (ip->i_ea_len < dp->di_extsize) error = ffs_truncate(vp, 0, IO_EXT, cred, td); error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred); } free(ip->i_ea_area, M_TEMP); ip->i_ea_area = NULL; ip->i_ea_len = 0; ip->i_ea_error = 0; return (error); } /* * Vnode extattr strategy routine for fifos. * * We need to check for a read or write of the external attributes. * Otherwise we just fall through and do the usual thing. 
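 *
 * [Editor's aside] ffs_findextattr above walks a packed array of
 * length-prefixed records: a 32-bit total length, a namespace byte, a
 * trailing-pad byte, a name-length byte, the name, pad to an 8-byte
 * boundary, then the padded data.  A userspace sketch that hand-builds
 * one such record (values computed for a 3-byte name and 5 data bytes)
 * and walks it:
 */
#if 0	/* standalone illustration only */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

static void
walk(const unsigned char *ptr, size_t length)
{
	const unsigned char *p = ptr, *pe = ptr + length;

	while (p < pe) {
		uint32_t ul;

		memcpy(&ul, p, sizeof(ul));	/* record length, incl. self */
		if (ul == 0 || p + ul > pe)	/* incomplete record: stop */
			break;
		/* layout: len(4) nspace(1) pad2(1) namelen(1) name... */
		printf("nspace %u name %.*s reclen %u\n",
		    p[4], p[6], (const char *)p + 7, (unsigned)ul);
		p += ul;
	}
}

int
main(void)
{
	/* header 4+3 bytes + name 3 -> pad 6; data 5 -> pad 3; total 24 */
	unsigned char buf[24] = { 0 };
	uint32_t ul = sizeof(buf);

	memcpy(buf, &ul, sizeof(ul));
	buf[4] = 1;			/* hypothetical namespace id */
	buf[5] = 3;			/* pad after the data */
	buf[6] = 3;			/* strlen("foo") */
	memcpy(buf + 7, "foo", 3);
	walk(buf, sizeof(buf));
	return (0);
}
#endif
/*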
*/ static int ffsext_strategy(struct vop_strategy_args *ap) /* struct vop_strategy_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; struct buf *a_bp; }; */ { struct vnode *vp; daddr_t lbn; vp = ap->a_vp; lbn = ap->a_bp->b_lblkno; if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC && lbn < 0 && lbn >= -NXADDR) return (VOP_STRATEGY_APV(&ufs_vnodeops, ap)); if (vp->v_type == VFIFO) return (VOP_STRATEGY_APV(&ufs_fifoops, ap)); panic("spec nodes went here"); } /* * Vnode extattr transaction commit/abort */ static int ffs_openextattr(struct vop_openextattr_args *ap) /* struct vop_openextattr_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; ip = VTOI(ap->a_vp); fs = ip->i_fs; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td)); } /* * Vnode extattr transaction commit/abort */ static int ffs_closeextattr(struct vop_closeextattr_args *ap) /* struct vop_closeextattr_args { struct vnodeop_desc *a_desc; struct vnode *a_vp; int a_commit; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; ip = VTOI(ap->a_vp); fs = ip->i_fs; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td)); } /* * Vnode operation to remove a named attribute. */ static int ffs_deleteextattr(struct vop_deleteextattr_args *ap) /* vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; uint32_t ealength, ul; int ealen, olen, eapad1, eapad2, error, i, easize; u_char *eae, *p; int stand_alone; ip = VTOI(ap->a_vp); fs = ip->i_fs; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); if (strlen(ap->a_name) == 0) return (EINVAL); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, IWRITE); if (error) { if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } if (ip->i_ea_area == NULL) { error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); stand_alone = 1; } else { stand_alone = 0; } ealength = eapad1 = ealen = eapad2 = 0; eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK); bcopy(ip->i_ea_area, eae, ip->i_ea_len); easize = ip->i_ea_len; olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, &p, NULL); if (olen == -1) { /* delete but nonexistent */ free(eae, M_TEMP); if (stand_alone) ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return(ENOATTR); } bcopy(p, &ul, sizeof ul); i = p - eae + ul; if (ul != ealength) { bcopy(p + ul, p + ealength, easize - i); easize += (ealength - ul); } if (easize > NXADDR * fs->fs_bsize) { free(eae, M_TEMP); if (stand_alone) ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); else if (ip->i_ea_error == 0) ip->i_ea_error = ENOSPC; return(ENOSPC); } p = ip->i_ea_area; ip->i_ea_area = eae; ip->i_ea_len = easize; free(p, M_TEMP); if (stand_alone) error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); return(error); } /* * Vnode operation to retrieve a named extended attribute. 
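 *
 * [Editor's aside] Deletion in ffs_deleteextattr above removes a
 * record by sliding the tail of the area down over it -- the
 * bcopy(p + ul, p + ealength, ...) with ealength == 0 -- and then
 * shrinking the recorded size.  A sketch of that shift:
 */
#if 0	/* standalone illustration only */
#include <stdio.h>
#include <string.h>

static void
delete_rec(unsigned char *area, size_t *sizep, size_t off, size_t ul)
{
	/* Slide everything after the record down by its length. */
	memmove(area + off, area + off + ul, *sizep - (off + ul));
	*sizep -= ul;
}

int
main(void)
{
	unsigned char area[] = "AAAAAAAABBBBBBBBCCCCCCCC";
	size_t size = 24;

	delete_rec(area, &size, 8, 8);		/* drop the B record */
	printf("%.*s (size %zu)\n", (int)size, area, size);
	return (0);
}
#endif
/*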
*/ static int ffs_getextattr(struct vop_getextattr_args *ap) /* vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; u_char *eae, *p; unsigned easize; int error, ealen, stand_alone; ip = VTOI(ap->a_vp); fs = ip->i_fs; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, IREAD); if (error) return (error); if (ip->i_ea_area == NULL) { error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); stand_alone = 1; } else { stand_alone = 0; } eae = ip->i_ea_area; easize = ip->i_ea_len; ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, NULL, &p); if (ealen >= 0) { error = 0; if (ap->a_size != NULL) *ap->a_size = ealen; else if (ap->a_uio != NULL) error = uiomove(p, ealen, ap->a_uio); } else error = ENOATTR; if (stand_alone) ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return(error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int ffs_listextattr(struct vop_listextattr_args *ap) /* vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; u_char *eae, *p, *pe, *pn; unsigned easize; uint32_t ul; int error, ealen, stand_alone; ip = VTOI(ap->a_vp); fs = ip->i_fs; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, IREAD); if (error) return (error); if (ip->i_ea_area == NULL) { error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); stand_alone = 1; } else { stand_alone = 0; } eae = ip->i_ea_area; easize = ip->i_ea_len; error = 0; if (ap->a_size != NULL) *ap->a_size = 0; pe = eae + easize; for(p = eae; error == 0 && p < pe; p = pn) { bcopy(p, &ul, sizeof(ul)); pn = p + ul; if (pn > pe) break; p += sizeof(ul); if (*p++ != ap->a_attrnamespace) continue; p++; /* pad2 */ ealen = *p; if (ap->a_size != NULL) { *ap->a_size += ealen + 1; } else if (ap->a_uio != NULL) { error = uiomove(p, ealen + 1, ap->a_uio); } } if (stand_alone) ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); return(error); } /* * Vnode operation to set a named attribute. */ static int ffs_setextattr(struct vop_setextattr_args *ap) /* vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; */ { struct inode *ip; struct fs *fs; uint32_t ealength, ul; int ealen, olen, eapad1, eapad2, error, i, easize; u_char *eae, *p; int stand_alone; ip = VTOI(ap->a_vp); fs = ip->i_fs; if (ap->a_vp->v_type == VCHR) return (EOPNOTSUPP); if (strlen(ap->a_name) == 0) return (EINVAL); /* XXX Now unsupported API to delete EAs using NULL uio. 
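 *
 * [Editor's aside] ffs_setextattr, continuing below, sizes the new
 * record with the recurring pad idiom "pad = 8 - (len % 8); if (pad ==
 * 8) pad = 0;", i.e. round len up to the next multiple of 8.  A sketch
 * checking that reading:
 */
#if 0	/* standalone illustration only */
#include <assert.h>
#include <stdio.h>

static unsigned
eapad(unsigned len)
{
	unsigned pad = 8 - (len % 8);

	return (pad == 8 ? 0 : pad);
}

int
main(void)
{
	for (unsigned len = 0; len <= 16; len++) {
		/* Same as rounding up to a multiple of 8. */
		assert(len + eapad(len) == ((len + 7) & ~7u));
		printf("len %2u -> pad %u\n", len, eapad(len));
	}
	return (0);
}
#endif
/*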
*/ if (ap->a_uio == NULL) return (EOPNOTSUPP); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, IWRITE); if (error) { if (ip->i_ea_area != NULL && ip->i_ea_error == 0) ip->i_ea_error = error; return (error); } if (ip->i_ea_area == NULL) { error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); if (error) return (error); stand_alone = 1; } else { stand_alone = 0; } ealen = ap->a_uio->uio_resid; ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name); eapad1 = 8 - (ealength % 8); if (eapad1 == 8) eapad1 = 0; eapad2 = 8 - (ealen % 8); if (eapad2 == 8) eapad2 = 0; ealength += eapad1 + ealen + eapad2; eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK); bcopy(ip->i_ea_area, eae, ip->i_ea_len); easize = ip->i_ea_len; olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, &p, NULL); if (olen == -1) { /* new, append at end */ p = eae + easize; easize += ealength; } else { bcopy(p, &ul, sizeof ul); i = p - eae + ul; if (ul != ealength) { bcopy(p + ul, p + ealength, easize - i); easize += (ealength - ul); } } if (easize > NXADDR * fs->fs_bsize) { free(eae, M_TEMP); if (stand_alone) ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); else if (ip->i_ea_error == 0) ip->i_ea_error = ENOSPC; return(ENOSPC); } bcopy(&ealength, p, sizeof(ealength)); p += sizeof(ealength); *p++ = ap->a_attrnamespace; *p++ = eapad2; *p++ = strlen(ap->a_name); strcpy(p, ap->a_name); p += strlen(ap->a_name); bzero(p, eapad1); p += eapad1; error = uiomove(p, ealen, ap->a_uio); if (error) { free(eae, M_TEMP); if (stand_alone) ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); else if (ip->i_ea_error == 0) ip->i_ea_error = error; return(error); } p += ealen; bzero(p, eapad2); p = ip->i_ea_area; ip->i_ea_area = eae; ip->i_ea_len = easize; free(p, M_TEMP); if (stand_alone) error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); return(error); } /* * Vnode pointer to File handle */ static int ffs_vptofh(struct vop_vptofh_args *ap) /* vop_vptofh { IN struct vnode *a_vp; IN struct fid *a_fhp; }; */ { struct inode *ip; struct ufid *ufhp; ip = VTOI(ap->a_vp); ufhp = (struct ufid *)ap->a_fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); }
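/*
 * [Editor's aside] ffs_vptofh above packs just enough identity into the
 * caller-supplied fid -- inode number plus generation -- for a later
 * fhtovp to refind the file and reject stale handles.  A userspace
 * stand-in (struct myfid and its field values are hypothetical):
 */
#if 0	/* standalone illustration only */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct myfid {
	uint16_t len;			/* sizeof(struct myfid) */
	uint32_t ino;			/* inode number */
	uint32_t gen;			/* generation, guards vs. reuse */
};

int
main(void)
{
	struct myfid fh;

	memset(&fh, 0, sizeof(fh));
	fh.len = sizeof(fh);
	fh.ino = 1234;			/* hypothetical i_number */
	fh.gen = 0xdeadbeefU;		/* hypothetical i_gen */
	printf("handle: len %u ino %u gen %#x\n",
	    (unsigned)fh.len, (unsigned)fh.ino, (unsigned)fh.gen);
	return (0);
}
#endif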