Index: head/sys/kern/vfs_subr.c
===================================================================
--- head/sys/kern/vfs_subr.c	(revision 156202)
+++ head/sys/kern/vfs_subr.c	(revision 156203)
@@ -1,3875 +1,3865 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
  */
 
 /*
  * External virtual filesystem routines
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/extattr.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/reboot.h>
 #include <sys/sleepqueue.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <machine/stdarg.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_kern.h>
 #include <vm/uma.h>
 
 static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure");
 
 static void	delmntque(struct vnode *vp);
 static void	insmntque(struct vnode *vp, struct mount *mp);
 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 		    int slpflag, int slptimeo);
 static void	syncer_shutdown(void *arg, int howto);
 static int	vtryrecycle(struct vnode *vp);
 static void	vbusy(struct vnode *vp);
 static void	vdropl(struct vnode *vp);
 static void	vinactive(struct vnode *, struct thread *);
 static void	v_incr_usecount(struct vnode *);
 static void	v_decr_usecount(struct vnode *);
 static void	v_decr_useonly(struct vnode *);
 static void	v_upgrade_usecount(struct vnode *);
 static void	vfree(struct vnode *);
 static void	vnlru_free(int);
 static void	vdestroy(struct vnode *);
 static void	vgonel(struct vnode *);
 static void	vfs_knllock(void *arg);
 static void	vfs_knlunlock(void *arg);
 static int	vfs_knllocked(void *arg);
 
 
 /*
  * Enable Giant pushdown based on whether or not the vm is mpsafe in this
  * build.  Without mpsafevm the buffer cache can not run Giant free.
  */
 #if defined(__alpha__) || defined(__amd64__) || defined(__i386__) || \
 	defined(__ia64__) || defined(__sparc64__)
 int mpsafe_vfs = 1;
 #else
 int mpsafe_vfs;
 #endif
 TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
 SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
     "MPSAFE VFS");
 
 /*
  * Number of vnodes in existence.  Increased whenever getnewvnode()
  * allocates a new vnode, decreased when vdestroy() frees one.
  */
 static unsigned long	numvnodes;
 
 SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
 
 /*
  * Conversion tables for conversion from vnode types to inode formats
  * and back.
  */
 enum vtype iftovt_tab[16] = {
 	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
 };
 int vttoif_tab[10] = {
 	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
 	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
 };
 
 /*
  * List of vnodes that are ready for recycling.
  */
 static TAILQ_HEAD(freelst, vnode) vnode_free_list;
 
 /*
  * Free vnode target.  Free vnodes may simply be files which have been stat'd
  * but not read.  This is somewhat common, and a small cache of such files
  * should be kept to avoid recreation costs.
  */
 static u_long wantfreevnodes;
 SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
 /* Number of vnodes in the free list. */
 static u_long freevnodes;
 SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
 
 /*
  * Various variables used for debugging the new implementation of
  * reassignbuf().
  * XXX these are probably of (very) limited utility now.
  */
 static int reassignbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
 
 /*
  * Cache for the mount type id assigned to NFS.  This is used for
  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
  */
 int	nfs_mount_type = -1;
 
 /* To keep more than one thread at a time from running vfs_getnewfsid */
 static struct mtx mntid_mtx;
 
 /*
  * Lock for any access to the following:
  *	vnode_free_list
  *	numvnodes
  *	freevnodes
  */
 static struct mtx vnode_free_list_mtx;
 
 /* Publicly exported FS */
 struct nfs_public nfs_pub;
 
 /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;
 
 /* Set to 1 to print out reclaim of active vnodes */
 int	prtactive;
 
 /*
  * The workitem queue.
  *
  * It is useful to delay writes of file data and filesystem metadata
  * for tens of seconds so that quickly created and deleted files need
  * not waste disk bandwidth being created and removed. To realize this,
  * we append vnodes to a "workitem" queue. When running with a soft
  * updates implementation, most pending metadata dependencies should
  * not wait for more than a few seconds. Thus, mounted on block devices
  * are delayed only about a half the time that file data is delayed.
  * Similarly, directory updates are more critical, so are only delayed
  * about a third the time that file data is delayed. Thus, there are
  * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
  * one each second (driven off the filesystem syncer process). The
  * syncer_delayno variable indicates the next queue that is to be processed.
  * Items that need to be processed soon are placed in this queue:
  *
  *	syncer_workitem_pending[syncer_delayno]
  *
  * A delay of fifteen seconds is done by placing the request fifteen
  * entries later in the queue:
  *
  *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
  *
  */
 static int syncer_delayno;
 static long syncer_mask;
 LIST_HEAD(synclist, bufobj);
 static struct synclist *syncer_workitem_pending;
 /*
  * The sync_mtx protects:
  *	bo->bo_synclist
  *	sync_vnode_count
  *	syncer_delayno
  *	syncer_state
  *	syncer_workitem_pending
  *	syncer_worklist_len
  *	rushjob
  */
 static struct mtx sync_mtx;
 
 #define SYNCER_MAXDELAY		32
 static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
 static int syncdelay = 30;		/* max time to delay syncing data */
 static int filedelay = 30;		/* time to delay syncing files */
 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
 static int dirdelay = 29;		/* time to delay syncing directories */
 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
 static int metadelay = 28;		/* time to delay syncing metadata */
 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
 static int rushjob;		/* number of slots to run ASAP */
 static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
 
 /*
  * When shutting down the syncer, run it at four times normal speed.
  */
 #define SYNCER_SHUTDOWN_SPEEDUP		4
 static int sync_vnode_count;
 static int syncer_worklist_len;
 static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
     syncer_state;
 
 /*
  * Number of vnodes we want to exist at any one time.  This is mostly used
  * to size hash tables in vnode-related code.  In this file getnewvnode()
  * throttles allocation once numvnodes exceeds it, and vnlru_proc() keeps
  * reclaiming while numvnodes is above nine tenths of it.
  *
  * XXX desiredvnodes is historical cruft and should not exist.
  */
 int desiredvnodes;
 SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
     &desiredvnodes, 0, "Maximum number of vnodes");
 /*
  * Legacy alias for vfs.wantfreevnodes.
  * NOTE(review): wantfreevnodes is declared u_long but is exported here
  * through SYSCTL_INT; on LP64 platforms the widths differ -- confirm this
  * is intended for the legacy knob.
  */
 SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
     &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
 /* Incremented by vnlru_proc() after each pass that reclaimed nothing. */
 static int vnlru_nowhere;
 SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
     &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
 
-/* Hook for calling soft updates. */
-int (*softdep_process_worklist_hook)(struct mount *);
-
 /*
  * Macros to control when a vnode is freed and recycled.  All require
  * the vnode interlock.
  */
 #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
 #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
 #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
 
 
 /*
  * Initialize the vnode management data structures: size desiredvnodes
  * and wantfreevnodes, set up the free list and its mutex, create the
  * UMA zones for vnodes and vpollinfo, and allocate the syncer work
  * queues.  Run once at boot through the SYSINIT below.
  */
 #ifndef	MAXVNODES_MAX
 /* Upper bound applied to the computed desiredvnodes value. */
 #define	MAXVNODES_MAX	100000
 #endif
 static void
 vntblinit(void *dummy __unused)
 {
 
 	/*
 	 * Desiredvnodes is a function of the physical memory size and
 	 * the kernel's heap size.  Specifically, desiredvnodes scales
 	 * in proportion to the physical memory size until two fifths
 	 * of the kernel's heap size is consumed by vnodes and vm
 	 * objects.
 	 */
 	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
 	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
 	if (desiredvnodes > MAXVNODES_MAX) {
 		if (bootverbose)
 			printf("Reducing kern.maxvnodes %d -> %d\n",
 			    desiredvnodes, MAXVNODES_MAX);
 		desiredvnodes = MAXVNODES_MAX;
 	}
 	/* Aim to keep a quarter of the target population on the free list. */
 	wantfreevnodes = desiredvnodes / 4; 
 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 	TAILQ_INIT(&vnode_free_list);
 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	      NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	/*
 	 * Initialize the filesystem syncer.
 	 */
 	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
 		&syncer_mask);
 	/* Recompute the queue count from the mask hashinit() handed back. */
 	syncer_maxdelay = syncer_mask + 1;
 	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
 
 
 /*
  * Mark a mount point as busy. Used to synchronize access and to delay
  * unmounting.
  *
  * On success a shared lock on mp->mnt_lock is held, the caller's
  * interlock (if any) has been released, and 0 is returned.
  *
  * If an unmount is in progress (MNTK_UNMOUNT) the call fails with ENOENT:
  * immediately when LK_NOWAIT is set, otherwise after sleeping on mp.
  * On failure the caller's interlock is held on return; on the sleeping
  * path it is dropped around the sleep and reacquired afterwards.
  */
 int
 vfs_busy(struct mount *mp, int flags, struct mtx *interlkp,
     struct thread *td)
 {
 	int lkflags;
 
 	MNT_ILOCK(mp);
 	/* Keep a reference on mp while we may sleep or block on its lock. */
 	MNT_REF(mp);
 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 		if (flags & LK_NOWAIT) {
 			MNT_REL(mp);
 			MNT_IUNLOCK(mp);
 			return (ENOENT);
 		}
 		if (interlkp)
 			mtx_unlock(interlkp);
 		mp->mnt_kern_flag |= MNTK_MWAIT;
 		/*
 		 * Since all busy locks are shared except the exclusive
 		 * lock granted when unmounting, the only place that a
 		 * wakeup needs to be done is at the release of the
 		 * exclusive lock at the end of dounmount.
 		 */
 		msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0);
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		if (interlkp)
 			mtx_lock(interlkp);
 		return (ENOENT);
 	}
 	if (interlkp)
 		mtx_unlock(interlkp);
 	/* LK_INTERLOCK: lockmgr() is handed the mount mutex taken above. */
 	lkflags = LK_SHARED | LK_INTERLOCK;
 	if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td))
 		panic("vfs_busy: unexpected lock failure");
 	/* Presumably drops the reference taken by MNT_REF() above. */
 	vfs_rel(mp);
 	return (0);
 }
 
 /*
  * Free a busy filesystem: release the caller's hold on mp->mnt_lock,
  * the lock that a successful vfs_busy() acquires.
  */
 void
 vfs_unbusy(struct mount *mp, struct thread *td)
 {
 
 	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
 }
 
 /*
  * Find the mounted filesystem whose fsid equals *fsid.
  *
  * The mount list is walked under mountlist_mtx.  Returns the matching
  * mount, or NULL when no entry has both fsid words equal.  The mutex is
  * not held on return.
  */
 struct mount *
 vfs_getvfs(fsid_t *fsid)
 {
 	struct mount *found;
 
 	mtx_lock(&mountlist_mtx);
 	/* TAILQ_FOREACH leaves the cursor NULL when the list is exhausted. */
 	TAILQ_FOREACH(found, &mountlist, mnt_list) {
 		if (found->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 		    found->mnt_stat.f_fsid.val[1] == fsid->val[1])
 			break;
 	}
 	mtx_unlock(&mountlist_mtx);
 	return (found);
 }
 
 /*
  * Check if a user can access privileged mount options.
  *
  * The owner of a user mount (MNT_USER set and matching credential uid)
  * is always allowed; everybody else is subject to suser().  Returns 0
  * when access is permitted, otherwise the error from suser().
  */
 int
 vfs_suser(struct mount *mp, struct thread *td)
 {
 
 	if ((mp->mnt_flag & MNT_USER) != 0 &&
 	    mp->mnt_cred->cr_uid == td->td_ucred->cr_uid)
 		return (0);
 	return (suser(td));
 }
 
 /*
  * Assign a fresh, currently unused fsid to mp.
  *
  * val[1] is the filesystem type number.  val[0] is a fake device number
  * built from the low byte of the type number and a 16-bit running
  * counter; it is used to create fake device numbers for stat(), so we
  * try to keep it unique, and (less strenuously) unique mod 2^16 because
  * some emulators only support 16-bit device numbers.  We end up with
  * unique val[0]'s for the first 2^16 calls and unique val[0]'s mod 2^16
  * for the first 2^8 calls.
  *
  * Several mounts may run in parallel; mntid_mtx serializes them, and
  * resuming the search one past the previous result is both a
  * micro-optimization and a defense against handing the same fsid to
  * different mounts.
  */
 void
 vfs_getnewfsid(struct mount *mp)
 {
 	static u_int16_t mntid_base;
 	fsid_t candidate;
 	int typenum, unit;
 
 	mtx_lock(&mntid_mtx);
 	typenum = mp->mnt_vfc->vfc_typenum;
 	candidate.val[1] = typenum;
 	do {
 		/* Type in bits 24-31, counter split across bits 16-23 and 0-7. */
 		unit = ((typenum & 0xFF) << 24) |
 		    ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF);
 		candidate.val[0] = makedev(255, unit);
 		mntid_base++;
 	} while (vfs_getvfs(&candidate) != NULL);
 	mp->mnt_stat.f_fsid.val[0] = candidate.val[0];
 	mp->mnt_stat.f_fsid.val[1] = candidate.val[1];
 	mtx_unlock(&mntid_mtx);
 }
 
 /*
  * Knob to control the precision of file timestamps:
  *
  *   0 = seconds only; nanoseconds zeroed.
  *   1 = seconds and nanoseconds, accurate within 1/HZ.
  *   2 = seconds and nanoseconds, truncated to microseconds.
  * >=3 = seconds and nanoseconds, maximum precision.
  */
 enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
 
 static int timestamp_precision = TSP_SEC;
 SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
     &timestamp_precision, 0, "");
 
 /*
  * Get a current timestamp.
  */
 void
 vfs_timestamp(struct timespec *tsp)
 {
 	struct timeval tv;
 
 	switch (timestamp_precision) {
 	case TSP_SEC:
 		tsp->tv_sec = time_second;
 		tsp->tv_nsec = 0;
 		break;
 	case TSP_HZ:
 		getnanotime(tsp);
 		break;
 	case TSP_USEC:
 		microtime(&tv);
 		TIMEVAL_TO_TIMESPEC(&tv, tsp);
 		break;
 	case TSP_NSEC:
 	default:
 		nanotime(tsp);
 		break;
 	}
 }
 
 /*
  * Reset a vattr to the "nothing specified" state: the type becomes VNON,
  * va_vaflags is cleared, and every other field is set to VNOVAL.
  */
 void
 vattr_null(struct vattr *vap)
 {
 
 	/* Identity, ownership and link information. */
 	vap->va_type = VNON;
 	vap->va_mode = VNOVAL;
 	vap->va_nlink = VNOVAL;
 	vap->va_uid = VNOVAL;
 	vap->va_gid = VNOVAL;
 	vap->va_fsid = VNOVAL;
 	vap->va_fileid = VNOVAL;
 	vap->va_rdev = VNOVAL;
 	vap->va_gen = VNOVAL;
 
 	/* Sizes. */
 	vap->va_size = VNOVAL;
 	vap->va_bytes = VNOVAL;
 	vap->va_blocksize = VNOVAL;
 
 	/* Timestamps. */
 	vap->va_atime.tv_sec = VNOVAL;
 	vap->va_atime.tv_nsec = VNOVAL;
 	vap->va_mtime.tv_sec = VNOVAL;
 	vap->va_mtime.tv_nsec = VNOVAL;
 	vap->va_ctime.tv_sec = VNOVAL;
 	vap->va_ctime.tv_nsec = VNOVAL;
 	vap->va_birthtime.tv_sec = VNOVAL;
 	vap->va_birthtime.tv_nsec = VNOVAL;
 
 	/* Flags. */
 	vap->va_flags = VNOVAL;
 	vap->va_vaflags = 0;
 }
 
 /*
  * This routine is called when we have too many vnodes.  It attempts
  * to free <count> vnodes and will potentially free vnodes that still
  * have VM backing store (VM backing store is typically the cause
  * of a vnode blowout so we want to do this).  Therefore, this operation
  * is not considered cheap.
  *
  * A number of conditions may prevent a vnode from being reclaimed.
  * the buffer cache may have references on the vnode, a directory
  * vnode may still have references due to the namei cache representing
  * underlying files, or the vnode may be in active use.   It is not
  * desirable to reuse such vnodes.  These conditions may cause the
  * number of vnodes to reach some minimum value regardless of what
  * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
  *
  * At most a tenth of the mount's vnode list (plus one) is examined per
  * call.  Returns the number of vnodes that were reclaimed.
  */
 static int
 vlrureclaim(struct mount *mp)
 {
 	struct thread *td;
 	struct vnode *vp;
 	int done;
 	int trigger;
 	int usevnodes;
 	int count;
 
 	/*
 	 * Calculate the trigger point, don't allow user
 	 * screwups to blow us up.   This prevents us from
 	 * recycling vnodes with lots of resident pages.  We
 	 * aren't trying to free memory, we are trying to
 	 * free vnodes.
 	 */
 	usevnodes = desiredvnodes;
 	if (usevnodes <= 0)
 		usevnodes = 1;
 	trigger = cnt.v_page_count * 2 / usevnodes;
 	done = 0;
 	td = curthread;
 	vn_start_write(NULL, &mp, V_WAIT);
 	MNT_ILOCK(mp);
 	count = mp->mnt_nvnodelistsize / 10 + 1;
 	while (count != 0) {
 		/* Take the first real vnode, stepping over list markers. */
 		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
 		while (vp != NULL && vp->v_type == VMARKER)
 			vp = TAILQ_NEXT(vp, v_nmntvnodes);
 		if (vp == NULL)
 			break;
 		/* Rotate it to the tail so the next pass sees a new head. */
 		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 		--count;
 		if (!VI_TRYLOCK(vp))
 			goto next_iter;
 		/*
 		 * If it's been deconstructed already, it's still
 		 * referenced, or it exceeds the trigger, skip it.
 		 */
 		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
 		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VI_UNLOCK(vp);
 			goto next_iter;
 		}
 		MNT_IUNLOCK(mp);
 		/* Hold the vnode so it stays allocated while we lock it. */
 		vholdl(vp);
 		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT, td)) {
 			vdrop(vp);
 			goto next_iter_mntunlocked;
 		}
 		VI_LOCK(vp);
 		/*
 		 * v_usecount may have been bumped after VOP_LOCK() dropped
 		 * the vnode interlock and before it was locked again.
 		 *
 		 * It is not necessary to recheck VI_DOOMED because it can
 		 * only be set by another thread that holds both the vnode
 		 * lock and vnode interlock.  If another thread has the
 		 * vnode lock before we get to VOP_LOCK() and obtains the
 		 * vnode interlock after VOP_LOCK() drops the vnode
 		 * interlock, the other thread will be unable to drop the
 		 * vnode lock before our VOP_LOCK() call fails.
 		 */
 		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
 		    (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VOP_UNLOCK(vp, LK_INTERLOCK, td);
 			/*
 			 * Balance the vholdl() above; without this the
 			 * hold count leaks every time the recheck fails.
 			 */
 			vdrop(vp);
 			goto next_iter_mntunlocked;
 		}
 		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
 		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
 		vgonel(vp);
 		VOP_UNLOCK(vp, 0, td);
 		vdropl(vp);
 		done++;
 		/*
 		 * Both tails below yield the CPU every 256th iteration and
 		 * make sure the mount interlock is held again before the
 		 * loop condition is re-evaluated.
 		 */
 next_iter_mntunlocked:
 		if ((count % 256) != 0)
 			goto relock_mnt;
 		goto yield;
 next_iter:
 		if ((count % 256) != 0)
 			continue;
 		MNT_IUNLOCK(mp);
 yield:
 		uio_yield();
 relock_mnt:
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	vn_finished_write(mp);
 	return (done);
 }
 
 /*
  * Attempt to keep the free list at wantfreevnodes length.
  *
  * Tries to recycle up to <count> vnodes taken from the head of the free
  * list.  Must be called with vnode_free_list_mtx held; the mutex is
  * dropped around each vtryrecycle() attempt and is held again on return.
  */
 static void
 vnlru_free(int count)
 {
 	struct vnode *vp;
 
 	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
 	for (; count > 0; count--) {
 		vp = TAILQ_FIRST(&vnode_free_list);
 		/*
 		 * The list can be modified while the free_list_mtx
 		 * has been dropped and vp could be NULL here.
 		 */
 		if (!vp)
 			break;
 		VNASSERT(vp->v_op != NULL, vp,
 		    ("vnlru_free: vnode already reclaimed."));
 		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 		/*
 		 * Don't recycle if we can't get the interlock.
 		 */
 		if (!VI_TRYLOCK(vp)) {
 			/* Requeue at the tail; this still consumes one count. */
 			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 			continue;
 		}
 		VNASSERT(VCANRECYCLE(vp), vp,
 		    ("vp inconsistent on freelist"));
 		freevnodes--;
 		vp->v_iflag &= ~VI_FREE;
 		/* vtryrecycle() requires the vnode to be held. */
 		vholdl(vp);
 		mtx_unlock(&vnode_free_list_mtx);
 		VI_UNLOCK(vp);
 		vtryrecycle(vp);
 		/*
 		 * If the recycle succeeded this vdrop will actually free
 		 * the vnode.  If not it will simply place it back on
 		 * the free list.
 		 */
 		vdrop(vp);
 		mtx_lock(&vnode_free_list_mtx);
 	}
 }
 /*
  * Attempt to recycle vnodes in a context that is always safe to block.
  * Calling vlrureclaim() from the bowels of filesystem code has some
  * interesting deadlock problems.
  */
 static struct proc *vnlruproc;
 static int vnlruproc_sig;
 
 /*
  * Body of the vnlru kernel process.  Each pass trims the free list down
  * to wantfreevnodes and, while numvnodes is above nine tenths of
  * desiredvnodes, runs vlrureclaim() over every mount that can be busied
  * without waiting.  Never returns.
  */
 static void
 vnlru_proc(void)
 {
 	struct mount *mp, *nmp;
 	int done;
 	struct proc *p = vnlruproc;
 	struct thread *td = FIRST_THREAD_IN_PROC(p);
 
 	mtx_lock(&Giant);
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
 	    SHUTDOWN_PRI_FIRST);
 
 	for (;;) {
 		kthread_suspend_check(p);
 		mtx_lock(&vnode_free_list_mtx);
 		if (freevnodes > wantfreevnodes)
 			vnlru_free(freevnodes - wantfreevnodes);
 		if (numvnodes <= desiredvnodes * 9 / 10) {
 			/*
 			 * Below the high-water mark: release any threads
 			 * sleeping in getnewvnode() and nap.  The mutex is
 			 * passed to msleep() with PDROP and is taken again
 			 * at the top of the loop.
 			 */
 			vnlruproc_sig = 0;
 			wakeup(&vnlruproc_sig);
 			msleep(vnlruproc, &vnode_free_list_mtx,
 			    PVFS|PDROP, "vlruwt", hz);
 			continue;
 		}
 		mtx_unlock(&vnode_free_list_mtx);
 		done = 0;
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			int vfsunlocked;
 			/*
 			 * A successful vfs_busy() releases mountlist_mtx;
 			 * it is retaken below before stepping to the next
 			 * mount.
 			 */
 			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
 			/* Run without Giant when the filesystem allows it. */
 			if (!VFS_NEEDSGIANT(mp)) {
 				mtx_unlock(&Giant);
 				vfsunlocked = 1;
 			} else
 				vfsunlocked = 0;
 			done += vlrureclaim(mp);
 			if (vfsunlocked)
 				mtx_lock(&Giant);
 			mtx_lock(&mountlist_mtx);
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			vfs_unbusy(mp, td);
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (done == 0) {
 #if 0
 			/* These messages are temporary debugging aids */
 			if (vnlru_nowhere < 5)
 				printf("vnlru process getting nowhere..\n");
 			else if (vnlru_nowhere == 5)
 				printf("vnlru process messages stopped.\n");
 #endif
 			/* Nothing reclaimed: count it and back off for a while. */
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
 		} else 
 			uio_yield();
 	}
 }
 
 /*
  * Descriptor for the "vnlru" kernel process: its name, its entry point
  * vnlru_proc(), and the variable (vnlruproc) that holds its proc
  * pointer.  Passed to kproc_start() by the SYSINIT below.
  */
 static struct kproc_desc vnlru_kp = {
 	"vnlru",
 	vnlru_proc,
 	&vnlruproc
 };
 SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
 
 /*
  * Routines having to do with the management of the vnode table.
  */
 
 /*
  * Tear down a fully cleaned vnode and return its memory to vnode_zone.
  * The vnode must have no data, holds, uses, writers, buffers or name
  * cache entries left (asserted below).  The vnode interlock is released
  * here, so the caller is expected to hold it on entry.
  */
 static void
 vdestroy(struct vnode *vp)
 {
 	struct bufobj *bo;
 
 	CTR1(KTR_VFS, "vdestroy vp %p", vp);
 	/* numvnodes is protected by the free list mutex. */
 	mtx_lock(&vnode_free_list_mtx);
 	numvnodes--;
 	mtx_unlock(&vnode_free_list_mtx);
 	bo = &vp->v_bufobj;
 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
 	    ("cleaned vnode still on the free list."));
 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
 	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
 	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
 	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
 	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
 	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
 	VI_UNLOCK(vp);
 #ifdef MAC
 	mac_destroy_vnode(vp);
 #endif
 	/* Release the poll/kqueue state if it was ever allocated. */
 	if (vp->v_pollinfo != NULL) {
 		knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note);
 		mtx_destroy(&vp->v_pollinfo->vpi_lock);
 		uma_zfree(vnodepoll_zone, vp->v_pollinfo);
 	}
 #ifdef INVARIANTS
 	/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
 	lockdestroy(vp->v_vnlock);
 	mtx_destroy(&vp->v_interlock);
 	uma_zfree(vnode_zone, vp);
 }
 
 /*
  * Try to recycle a freed vnode.  We abort if anyone picks up a reference
  * before we actually vgone().  This function must be called with the vnode
  * held to prevent the vnode from being returned to the free list midway
  * through vgone().
  *
  * Returns 0 if the vnode was reclaimed (or was already doomed),
  * EWOULDBLOCK if the vnode lock could not be taken without waiting, and
  * EBUSY if vn_start_write() fails or the vnode has gained a use
  * reference.
  */
 static int
 vtryrecycle(struct vnode *vp)
 {
 	struct thread *td = curthread;
 	struct mount *vnmp;
 
 	CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp);
 	VNASSERT(vp->v_holdcnt, vp,
 	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
 	/*
 	 * This vnode may be found and locked via some other list, if so we
 	 * can't recycle it yet.
 	 */
 	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
 		return (EWOULDBLOCK);
 	/*
 	 * Don't recycle if its filesystem is being suspended.
 	 */
 	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
 		VOP_UNLOCK(vp, 0, td);
 		return (EBUSY);
 	}
 	/*
 	 * If we got this far, we need to acquire the interlock and see if
 	 * anyone picked up this vnode from another list.  If not, we will
 	 * mark it with DOOMED via vgonel() so that anyone who does find it
 	 * will skip over it.
 	 */
 	VI_LOCK(vp);
 	if (vp->v_usecount) {
 		VOP_UNLOCK(vp, LK_INTERLOCK, td);
 		vn_finished_write(vnmp);
 		return (EBUSY);
 	}
 	if ((vp->v_iflag & VI_DOOMED) == 0)
 		vgonel(vp);
 	VOP_UNLOCK(vp, LK_INTERLOCK, td);
 	vn_finished_write(vnmp);
 	CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp);
 	return (0);
 }
 
 /*
  * Allocate and initialize a new vnode.
  *
  * The vnode always comes from vnode_zone (zeroed); the free list is only
  * consulted to recycle one surplus entry first.  When numvnodes exceeds
  * desiredvnodes the caller wakes the vnlru process and sleeps once, with
  * a timeout of hz, before going ahead regardless.
  *
  * tag	is stored in v_tag and also given to lockinit() for the vnode lock.
  * mp	mount to attach the vnode to, or NULL to leave it unattached.
  * vops	operations vector stored in v_op.
  * vpp	receives the new vnode, which has had v_incr_usecount() applied.
  *
  * Always returns 0.
  */
 int
 getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
     struct vnode **vpp)
 {
 	struct vnode *vp = NULL;
 	struct bufobj *bo;
 
 	mtx_lock(&vnode_free_list_mtx);
 	/*
 	 * Lend our context to reclaim vnodes if they've exceeded the max.
 	 */
 	if (freevnodes > wantfreevnodes)
 		vnlru_free(1);
 	/*
 	 * Wait for available vnodes.
 	 */
 	if (numvnodes > desiredvnodes) {
 		if (vnlruproc_sig == 0) {
 			vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
 			wakeup(vnlruproc);
 		}
 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
 		    "vlruwk", hz);
 #if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
 		if (numvnodes > desiredvnodes) {
 			mtx_unlock(&vnode_free_list_mtx);
 			return (ENFILE);
 		}
 #endif
 	}
 	/* Account for the vnode before the lock is dropped to allocate it. */
 	numvnodes++;
 	mtx_unlock(&vnode_free_list_mtx);
 	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
 	/*
 	 * Setup locks.
 	 */
 	vp->v_vnlock = &vp->v_lock;
 	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
 	/*
 	 * By default, don't allow shared locks unless filesystems
 	 * opt-in.
 	 */
 	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
 	/*
 	 * Initialize bufobj.
 	 */
 	bo = &vp->v_bufobj;
 	bo->__bo_vnode = vp;
 	/* The bufobj shares the vnode interlock as its mutex. */
 	bo->bo_mtx = &vp->v_interlock;
 	bo->bo_ops = &buf_ops_bio;
 	bo->bo_private = vp;
 	TAILQ_INIT(&bo->bo_clean.bv_hd);
 	TAILQ_INIT(&bo->bo_dirty.bv_hd);
 	/*
 	 * Initialize namecache.
 	 */
 	LIST_INIT(&vp->v_cache_src);
 	TAILQ_INIT(&vp->v_cache_dst);
 	/*
 	 * Finalize various vnode identity bits.
 	 */
 	vp->v_type = VNON;
 	vp->v_tag = tag;
 	vp->v_op = vops;
 	v_incr_usecount(vp);
 	vp->v_data = 0;
 #ifdef MAC
 	mac_init_vnode(vp);
 	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
 		mac_associate_vnode_singlelabel(mp, vp);
 	else if (mp == NULL)
 		printf("NULL mp in getnewvnode()\n");
 #endif
 	if (mp != NULL) {
 		/* Attach to the mount and inherit its I/O size and knote policy. */
 		insmntque(vp, mp);
 		bo->bo_bsize = mp->mnt_stat.f_iosize;
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
 
 	CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp);
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Detach a vnode from its mount point, if it has one: clear v_mount,
  * unlink the vnode from the mount's vnode list, adjust the list size and
  * release the mount reference, all under the mount interlock.
  */
 static void
 delmntque(struct vnode *vp)
 {
 	struct mount *oldmp = vp->v_mount;
 
 	if (oldmp != NULL) {
 		MNT_ILOCK(oldmp);
 		vp->v_mount = NULL;
 		VNASSERT(oldmp->mnt_nvnodelistsize > 0, vp,
 		    ("bad mount point vnode list size"));
 		TAILQ_REMOVE(&oldmp->mnt_nvnodelist, vp, v_nmntvnodes);
 		oldmp->mnt_nvnodelistsize--;
 		MNT_REL(oldmp);
 		MNT_IUNLOCK(oldmp);
 	}
 }
 
 /*
  * Attach a vnode to a mount point: record mp in v_mount, take a mount
  * reference and append the vnode to the tail of the mount's vnode list.
  * mp must not be NULL (asserted).
  */
 static void
 insmntque(struct vnode *vp, struct mount *mp)
 {
 
 	vp->v_mount = mp;
 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
 	MNT_ILOCK(mp);
 	/* This reference is released again by MNT_REL() in delmntque(). */
 	MNT_REF(mp);
 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
 		("neg mount point vnode list size"));
 	mp->mnt_nvnodelistsize++;
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Flush out and invalidate all buffers associated with a bufobj
  * Called with the underlying object locked.
  *
  * With V_SAVE, pending writes are waited for and dirty buffers are
  * written out through BO_SYNC() before anything is discarded.  The clean
  * and dirty lists are then flushed repeatedly until flushbuflist()
  * reports nothing left to do, outstanding I/O is drained, and the pages
  * of the backing VM object (if any) are removed.
  *
  * slpflag and slptimeo are passed through to bufobj_wwait() and
  * flushbuflist().  Returns 0 on success or the first error reported by
  * bufobj_wwait(), BO_SYNC() or flushbuflist() (other than EAGAIN, which
  * is retried).
  */
 int
 bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag,
     int slptimeo)
 {
 	int error;
 
 	BO_LOCK(bo);
 	if (flags & V_SAVE) {
 		error = bufobj_wwait(bo, slpflag, slptimeo);
 		if (error) {
 			BO_UNLOCK(bo);
 			return (error);
 		}
 		if (bo->bo_dirty.bv_cnt > 0) {
 			BO_UNLOCK(bo);
 			if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0)
 				return (error);
 			/*
 			 * XXX We could save a lock/unlock if this was only
 			 * enabled under INVARIANTS
 			 */
 			BO_LOCK(bo);
 			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
 				panic("vinvalbuf: dirty bufs");
 		}
 	}
 	/*
 	 * If you alter this loop please notice that interlock is dropped and
 	 * reacquired in flushbuflist.  Special care is needed to ensure that
 	 * no race conditions occur from this.
 	 */
 	do {
 		error = flushbuflist(&bo->bo_clean,
 		    flags, bo, slpflag, slptimeo);
 		if (error == 0)
 			error = flushbuflist(&bo->bo_dirty,
 			    flags, bo, slpflag, slptimeo);
 		/* EAGAIN means "rescan"; anything else is a real failure. */
 		if (error != 0 && error != EAGAIN) {
 			BO_UNLOCK(bo);
 			return (error);
 		}
 	} while (error != 0);
 
 	/*
 	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
 	 * have write I/O in-progress but if there is a VM object then the
 	 * VM object can also have read-I/O in-progress.
 	 */
 	do {
 		bufobj_wwait(bo, 0, 0);
 		BO_UNLOCK(bo);
 		if (bo->bo_object != NULL) {
 			VM_OBJECT_LOCK(bo->bo_object);
 			vm_object_pip_wait(bo->bo_object, "bovlbx");
 			VM_OBJECT_UNLOCK(bo->bo_object);
 		}
 		BO_LOCK(bo);
 	} while (bo->bo_numoutput > 0);
 	BO_UNLOCK(bo);
 
 	/*
 	 * Destroy the copy in the VM cache, too.
 	 */
 	if (bo->bo_object != NULL) {
 		VM_OBJECT_LOCK(bo->bo_object);
 		vm_object_page_remove(bo->bo_object, 0, 0,
 			(flags & V_SAVE) ? TRUE : FALSE);
 		VM_OBJECT_UNLOCK(bo->bo_object);
 	}
 
 #ifdef INVARIANTS
 	/* Unless only one data class was targeted, both lists must be empty. */
 	BO_LOCK(bo);
 	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
 	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
 		panic("vinvalbuf: flush failed");
 	BO_UNLOCK(bo);
 #endif
 	return (0);
 }
 
 /*
  * Flush out and invalidate all buffers associated with a vnode.
  * Called with the underlying object locked.
  *
  * Thin wrapper: asserts that the vnode lock is held and hands the
  * vnode's embedded bufobj to bufobj_invalbuf(), returning its result.
  */
 int
 vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag,
     int slptimeo)
 {
 
 	CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags);
 	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
 	return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo));
 }
 
 /*
  * Flush out buffers on the specified list.
  *
  * Called with the bufobj locked; the lock is dropped while a buffer is
  * being processed and is held again on every return path.
  *
  * V_NORMAL skips buffers marked BX_ALTDATA and V_ALT skips those that
  * are not.  With V_SAVE a delayed-write buffer is written out
  * asynchronously instead of being discarded.
  *
  * Returns 0 when no buffer was touched (empty list or everything
  * skipped), EAGAIN when at least one buffer was processed and the caller
  * should scan again, or the error from BUF_TIMELOCK() (ENOLCK is
  * reported as EAGAIN).
  */
 static int
 flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
     int slptimeo)
 {
 	struct buf *bp, *nbp;
 	int retval, error;
 	daddr_t lblkno;
 	b_xflags_t xflags;
 
 	ASSERT_BO_LOCKED(bo);
 
 	retval = 0;
 	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
 		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
 		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
 			continue;
 		}
 		/*
 		 * Remember the identity of the next buffer so we can tell,
 		 * once the lock is retaken, whether it is still the same.
 		 */
 		lblkno = 0;
 		xflags = 0;
 		if (nbp != NULL) {
 			lblkno = nbp->b_lblkno;
 			xflags = nbp->b_xflags &
 				(BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
 		}
 		retval = EAGAIN;
 		/* LK_INTERLOCK: the bufobj mutex is handed to the buffer lock. */
 		error = BUF_TIMELOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
 		    "flushbuf", slpflag, slptimeo);
 		if (error) {
 			BO_LOCK(bo);
 			return (error != ENOLCK ? error : EAGAIN);
 		}
 		KASSERT(bp->b_bufobj == bo,
 	            ("bp %p wrong b_bufobj %p should be %p",
 		    bp, bp->b_bufobj, bo));
 		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
 			BUF_UNLOCK(bp);
 			BO_LOCK(bo);
 			return (EAGAIN);
 		}
 		/*
 		 * XXX Since there are no node locks for NFS, I
 		 * believe there is a slight chance that a delayed
 		 * write will occur while sleeping just above, so
 		 * check for it.
 		 */
 		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
 		    (flags & V_SAVE)) {
 			bremfree(bp);
 			bp->b_flags |= B_ASYNC;
 			bwrite(bp);
 			BO_LOCK(bo);
 			return (EAGAIN);	/* XXX: why not loop ? */
 		}
 		/* Discard the buffer: mark it invalid and release it. */
 		bremfree(bp);
 		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
 		bp->b_flags &= ~B_ASYNC;
 		brelse(bp);
 		BO_LOCK(bo);
 		if (nbp != NULL &&
 		    (nbp->b_bufobj != bo || 
 		     nbp->b_lblkno != lblkno ||
 		     (nbp->b_xflags &
 		      (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
 			break;			/* nbp invalid */
 	}
 	return (retval);
 }
 
 /*
  * Truncate a file's buffer and pages to a specified length.  This
  * is in lieu of the old vinvalbuf mechanism, which performed unneeded
  * sync activity.
  *
  * vp		vnode being truncated; its vnode lock must be held.
  * cred, td	not used by this function.
  * length	new length of the file in bytes.
  * blksize	logical block size used to convert length to a block number.
  *
  * Every buffer on the clean and dirty lists whose logical block number
  * is at or beyond the truncation point is invalidated and released.
  * When length is non-zero, dirty buffers with b_lblkno <= 0 are then
  * written asynchronously.  Finally the function waits for outstanding
  * writes on the bufobj and informs the pager of the new size.
  *
  * Always returns 0.
  */
 int
 vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
     off_t length, int blksize)
 {
 	struct buf *bp, *nbp;
 	int anyfreed;
 	/*
 	 * Must be as wide as b_lblkno: an int would truncate the block
 	 * number computed from a large off_t length and the comparisons
 	 * below would then select the wrong buffers.
 	 */
 	daddr_t trunclbn;
 	struct bufobj *bo;
 
 	CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length);
 	/*
 	 * Round up to the *next* lbn.
 	 */
 	trunclbn = (length + blksize - 1) / blksize;
 
 	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
 restart:
 	VI_LOCK(vp);
 	bo = &vp->v_bufobj;
 	anyfreed = 1;
 	/* Rescan both lists until a full pass frees nothing. */
 	for (;anyfreed;) {
 		anyfreed = 0;
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno < trunclbn)
 				continue;
 			/*
 			 * The interlock is passed to BUF_LOCK; on ENOLCK
 			 * start over from the top, which retakes it.
 			 */
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp)) == ENOLCK)
 				goto restart;
 
 			bremfree(bp);
 			bp->b_flags |= (B_INVAL | B_RELBUF);
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 			anyfreed = 1;
 
 			/*
 			 * Restart if the saved successor is no longer a
 			 * clean buffer of this vnode.
 			 */
 			if (nbp != NULL &&
 			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
 			    (nbp->b_vp != vp) ||
 			    (nbp->b_flags & B_DELWRI))) {
 				goto restart;
 			}
 			VI_LOCK(vp);
 		}
 
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			if (bp->b_lblkno < trunclbn)
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp)) == ENOLCK)
 				goto restart;
 			bremfree(bp);
 			bp->b_flags |= (B_INVAL | B_RELBUF);
 			bp->b_flags &= ~B_ASYNC;
 			brelse(bp);
 			anyfreed = 1;
 			/*
 			 * Restart if the saved successor is no longer a
 			 * dirty buffer of this vnode.
 			 */
 			if (nbp != NULL &&
 			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
 			    (nbp->b_vp != vp) ||
 			    (nbp->b_flags & B_DELWRI) == 0)) {
 				goto restart;
 			}
 			VI_LOCK(vp);
 		}
 	}
 
 	if (length > 0) {
 restartsync:
 		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 			/*
 			 * Only buffers with b_lblkno <= 0 are pushed here.
 			 * NOTE(review): presumably these are metadata
 			 * blocks kept at non-positive block numbers --
 			 * confirm against the filesystems using this.
 			 */
 			if (bp->b_lblkno > 0)
 				continue;
 			/*
 			 * Since we hold the vnode lock this should only
 			 * fail if we're racing with the buf daemon.
 			 */
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
 			    VI_MTX(vp)) == ENOLCK) {
 				goto restart;
 			}
 			VNASSERT((bp->b_flags & B_DELWRI), vp,
 			    ("buf(%p) on dirty queue without DELWRI", bp));
 
 			bremfree(bp);
 			bawrite(bp);
 			VI_LOCK(vp);
 			goto restartsync;
 		}
 	}
 
 	/* Wait for writes in progress, then tell the pager the new size. */
 	bufobj_wwait(bo, 0, 0);
 	VI_UNLOCK(vp);
 	vnode_pager_setsize(vp, length);
 
 	return (0);
 }
 
 /*
  * buf_splay() - splay tree core for the clean/dirty list of buffers in
  * 		 a vnode.
  *
  *	NOTE: We have to deal with the special case of a background bitmap
  *	buffer, a situation where two buffers will have the same logical
  *	block offset.  We want (1) only the foreground buffer to be accessed
  *	in a lookup and (2) must differentiate between the foreground and
  *	background buffer in the splay tree algorithm because the splay
  *	tree cannot normally handle multiple entities with the same 'index'.
  *	We accomplish this by adding differentiating flags to the splay tree's
  *	numerical domain.
  *
  *	The search key is the pair (lblkno, xflags & BX_BKGRDMARKER).  The
  *	tree rooted at 'root' is restructured through its b_left/b_right
  *	links and the new root is returned: the node with that key if one
  *	exists, otherwise the last node visited during the descent.
  *	Returns NULL only when 'root' is NULL.
  */
 static
 struct buf *
 buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
 {
 	/*
 	 * 'dummy' is a scratch header: only its b_left and b_right fields
 	 * are used, to collect the right and left subtrees respectively.
 	 */
 	struct buf dummy;
 	struct buf *lefttreemax, *righttreemin, *y;
 
 	if (root == NULL)
 		return (NULL);
 	lefttreemax = righttreemin = &dummy;
 	for (;;) {
 		/* Key sorts before root: descend to the left. */
 		if (lblkno < root->b_lblkno ||
 		    (lblkno == root->b_lblkno &&
 		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
 			if ((y = root->b_left) == NULL)
 				break;
 			if (lblkno < y->b_lblkno) {
 				/* Rotate right. */
 				root->b_left = y->b_right;
 				y->b_right = root;
 				root = y;
 				if ((y = root->b_left) == NULL)
 					break;
 			}
 			/* Link into the new root's right tree. */
 			righttreemin->b_left = root;
 			righttreemin = root;
 		/* Key sorts after root: descend to the right. */
 		} else if (lblkno > root->b_lblkno ||
 		    (lblkno == root->b_lblkno &&
 		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
 			if ((y = root->b_right) == NULL)
 				break;
 			if (lblkno > y->b_lblkno) {
 				/* Rotate left. */
 				root->b_right = y->b_left;
 				y->b_left = root;
 				root = y;
 				if ((y = root->b_right) == NULL)
 					break;
 			}
 			/* Link into the new root's left tree. */
 			lefttreemax->b_right = root;
 			lefttreemax = root;
 		} else {
 			/* Exact match on both block number and marker. */
 			break;
 		}
 		root = y;
 	}
 	/* Assemble the new root. */
 	lefttreemax->b_right = root->b_left;
 	righttreemin->b_left = root->b_right;
 	root->b_left = dummy.b_right;
 	root->b_right = dummy.b_left;
 	return (root);
 }
 
 /*
  * Remove a buffer from whichever of its bufobj's lists (dirty or clean)
  * it is currently on, as indicated by BX_VNDIRTY/BX_VNCLEAN in b_xflags.
  * The buffer is unlinked from both the splay tree and the tail queue,
  * the list's count is decremented and both list flags are cleared.
  * The bufobj lock must be held.
  */
 static void
 buf_vlist_remove(struct buf *bp)
 {
 	struct buf *root;
 	struct bufv *bv;
 
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	ASSERT_BO_LOCKED(bp->b_bufobj);
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
 	    (BX_VNDIRTY|BX_VNCLEAN),
 	    ("buf_vlist_remove: Buf %p is on two lists", bp));
 	if (bp->b_xflags & BX_VNDIRTY) 
 		bv = &bp->b_bufobj->bo_dirty;
 	else
 		bv = &bp->b_bufobj->bo_clean;
 	/* Bring bp to the root of the tree so it can be cut out. */
 	if (bp != bv->bv_root) {
 		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
 		KASSERT(root == bp, ("splay lookup failed in remove"));
 	}
 	if (bp->b_left == NULL) {
 		root = bp->b_right;
 	} else {
 		/*
 		 * Splay the left subtree on bp's own key and hang bp's
 		 * right subtree off the resulting root.
 		 */
 		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
 		root->b_right = bp->b_right;
 	}
 	bv->bv_root = root;
 	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
 	bv->bv_cnt--;
 	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
 }
 
 /*
  * Add the buffer to the sorted clean or dirty block list using a
  * splay tree algorithm.
  *
  * xflags selects the list: BX_VNDIRTY puts the buffer on bo_dirty,
  * anything else on bo_clean.  The buffer must not already carry either
  * list flag.  The bufobj lock must be held.  On return the buffer is
  * the root of the chosen splay tree and is linked into the tail queue
  * next to the node the splay brought to the root.
  *
  * NOTE: xflags is passed as a constant, optimizing this inline function!
  */
 static void
 buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
 {
 	struct buf *root;
 	struct bufv *bv;
 
 	ASSERT_BO_LOCKED(bo);
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
 	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
 	bp->b_xflags |= xflags;
 	if (xflags & BX_VNDIRTY)
 		bv = &bo->bo_dirty;
 	else
 		bv = &bo->bo_clean;
 
 	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
 	if (root == NULL) {
 		/* Empty tree: bp becomes the only node. */
 		bp->b_left = NULL;
 		bp->b_right = NULL;
 		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
 	} else if (bp->b_lblkno < root->b_lblkno ||
 	    (bp->b_lblkno == root->b_lblkno &&
 	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
 		/* bp sorts before the old root. */
 		bp->b_left = root->b_left;
 		bp->b_right = root;
 		root->b_left = NULL;
 		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
 	} else {
 		/* bp sorts after the old root. */
 		bp->b_right = root->b_right;
 		bp->b_left = root;
 		root->b_right = NULL;
 		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
 	}
 	bv->bv_cnt++;
 	bv->bv_root = bp;
 }
 
 /*
  * Find the buffer for logical block 'lblkno' on a bufobj, or return
  * NULL if there is none.  Buffers flagged BX_BKGRDMARKER (the shadow
  * copies used for background bitmap writes) are never returned.
  *
  * The block may be on either the clean or the dirty list and we do not
  * know which, so both trees may have to be searched.  During a
  * "make buildworld" the wanted buffer is already at one of the two
  * roots more than 60% of the time, so both roots are examined before
  * any splay is done; that avoids a needless splay of the first tree
  * when the answer is sitting at the root of the second.
  *
  * The bufobj lock must be held.
  */
 struct buf *
 gbincore(struct bufobj *bo, daddr_t lblkno)
 {
 	struct buf *cand;
 
 	ASSERT_BO_LOCKED(bo);
 
 	/* Fast path: check the two roots without restructuring. */
 	cand = bo->bo_clean.bv_root;
 	if (cand != NULL && cand->b_lblkno == lblkno &&
 	    (cand->b_xflags & BX_BKGRDMARKER) == 0)
 		return (cand);
 	cand = bo->bo_dirty.bv_root;
 	if (cand != NULL && cand->b_lblkno == lblkno &&
 	    (cand->b_xflags & BX_BKGRDMARKER) == 0)
 		return (cand);
 
 	/* Slow path: splay the clean tree, then the dirty tree. */
 	if (bo->bo_clean.bv_root != NULL) {
 		cand = buf_splay(lblkno, 0, bo->bo_clean.bv_root);
 		bo->bo_clean.bv_root = cand;
 		if (cand->b_lblkno == lblkno &&
 		    (cand->b_xflags & BX_BKGRDMARKER) == 0)
 			return (cand);
 	}
 	if (bo->bo_dirty.bv_root != NULL) {
 		cand = buf_splay(lblkno, 0, bo->bo_dirty.bv_root);
 		bo->bo_dirty.bv_root = cand;
 		if (cand->b_lblkno == lblkno &&
 		    (cand->b_xflags & BX_BKGRDMARKER) == 0)
 			return (cand);
 	}
 	return (NULL);
 }
 
 /*
  * Attach a buffer to a vnode.
  *
  * The buffer must not currently belong to any vnode or sit on a clean
  * or dirty list.  The vnode interlock must be held.  A hold reference
  * is taken on the vnode, the buffer's b_vp/b_bufobj are pointed at it,
  * and the buffer is placed on the vnode's clean list.
  */
 void
 bgetvp(struct vnode *vp, struct buf *bp)
 {
 	struct bufobj *vbo;
 
 	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
 
 	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
 	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
 	    ("bgetvp: bp already attached! %p", bp));
 
 	ASSERT_VI_LOCKED(vp, "bgetvp");
 	vholdl(vp);
 	vbo = &vp->v_bufobj;
 	bp->b_bufobj = vbo;
 	bp->b_vp = vp;
 	/* New buffers always start out on the clean list. */
 	buf_vlist_add(bp, vbo, BX_VNCLEAN);
 }
 
 /*
  * Detach a buffer from its vnode.
  *
  * The buffer is taken off the clean or dirty list it is on (it is a
  * panic for it to be on neither).  If that leaves the bufobj with no
  * dirty buffers while it is still on the syncer worklist, the bufobj
  * is removed from the worklist as well.  Finally the buffer's vnode
  * and bufobj pointers are cleared and the hold reference on the vnode
  * is dropped with vdropl().
  */
 void
 brelvp(struct buf *bp)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 
 	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
 
 	vp = bp->b_vp;		/* XXX */
 	bo = bp->b_bufobj;
 	BO_LOCK(bo);
 
 	/* The buffer has to be on one of the two lists. */
 	if ((bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) == 0)
 		panic("brelvp: Buffer %p not on queue.", bp);
 	buf_vlist_remove(bp);
 
 	/* Nothing dirty left: the syncer need not visit this bufobj. */
 	if (bo->bo_dirty.bv_cnt == 0 && (bo->bo_flag & BO_ONWORKLST) != 0) {
 		bo->bo_flag &= ~BO_ONWORKLST;
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
 		syncer_worklist_len--;
 		mtx_unlock(&sync_mtx);
 	}
 
 	bp->b_bufobj = NULL;
 	bp->b_vp = NULL;
 	vdropl(vp);
 }
 
 /*
  * Queue a bufobj for the syncer, 'delay' slots from the current one.
  *
  * If the bufobj is already on the worklist it is first unlinked from
  * its present slot; otherwise it is flagged BO_ONWORKLST and the
  * worklist length is bumped.  The delay is capped at
  * syncer_maxdelay - 2.  The bufobj lock must be held; sync_mtx is
  * taken and released internally.
  */
 static void
 vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
 {
 	int slot;
 
 	ASSERT_BO_LOCKED(bo);
 
 	mtx_lock(&sync_mtx);
 	if ((bo->bo_flag & BO_ONWORKLST) == 0) {
 		/* First time on the list: account for it. */
 		bo->bo_flag |= BO_ONWORKLST;
 		syncer_worklist_len++;
 	} else {
 		/* Already queued: pull it out so it can be re-slotted. */
 		LIST_REMOVE(bo, bo_synclist);
 	}
 
 	if (delay > syncer_maxdelay - 2)
 		delay = syncer_maxdelay - 2;
 	slot = (syncer_delayno + delay) & syncer_mask;
 	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
 	mtx_unlock(&sync_mtx);
 }
 
 /*
  * Sysctl handler for vfs.worklist_len: reports the syncer worklist
  * length minus sync_vnode_count, sampled under sync_mtx.
  */
 static int
 sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
 {
 	int len;
 
 	mtx_lock(&sync_mtx);
 	len = syncer_worklist_len - sync_vnode_count;
 	mtx_unlock(&sync_mtx);
 	return (SYSCTL_OUT(req, &len, sizeof(len)));
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
     sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
 
 /*
  * The syncer kernel process.  up_kp describes it: the name "syncer",
  * sched_sync() as the entry point, and &updateproc as the third field
  * (presumably where kproc_start() stores the new process pointer --
  * confirm against struct kproc_desc).  SYSINIT arranges for
  * kproc_start() to be called with this descriptor at
  * SI_SUB_KTHREAD_UPDATE.
  */
 static struct proc *updateproc;
 static void sched_sync(void);
 static struct kproc_desc up_kp = {
 	"syncer",
 	sched_sync,
 	&updateproc
 };
 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 
 /*
  * Try to lazily fsync the vnode behind a syncer worklist entry.
  *
  * Called from sched_sync() with sync_mtx held; sync_mtx is held again
  * on every return path, although it is dropped while the vnode is
  * being worked on.
  *
  * Returns 1 if the vnode could not be processed (its vnode lock is
  * held by someone, its interlock could not be taken without blocking,
  * or vn_start_write() refused); the caller then moves the entry to the
  * next slot.  Returns 0 after VOP_FSYNC() has been attempted; the
  * result of VOP_FSYNC() itself is ignored.
  */
 static int
 sync_vnode(struct bufobj *bo, struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
 
 	vp = bo->__bo_vnode; 	/* XXX */
 	/* Never block here: sync_mtx is still held. */
 	if (VOP_ISLOCKED(vp, NULL) != 0)
 		return (1);
 	if (VI_TRYLOCK(vp) == 0)
 		return (1);
 	/*
 	 * We use vhold in case the vnode does not
 	 * successfully sync.  vhold prevents the vnode from
 	 * going away when we unlock the sync_mtx so that
 	 * we can acquire the vnode interlock.
 	 */
 	vholdl(vp);
 	mtx_unlock(&sync_mtx);
 	VI_UNLOCK(vp);
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 		vdrop(vp);
 		mtx_lock(&sync_mtx);
 		return (1);
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	(void) VOP_FSYNC(vp, MNT_LAZY, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	VI_LOCK(vp);
 	if ((bo->bo_flag & BO_ONWORKLST) != 0) {
 		/*
 		 * Put us back on the worklist.  The worklist
 		 * routine will remove us from our current
 		 * position and then add us back in at a later
 		 * position.
 		 */
 		vn_syncer_add_to_worklist(bo, syncdelay);
 	}
 	/* vdropl() consumes the interlock taken by VI_LOCK() above. */
 	vdropl(vp);
 	mtx_lock(&sync_mtx);
 	return (0);
 }
 
 /*
  * System filesystem synchronizer daemon.
  *
  * Body of the "syncer" kernel process; never returns.  Each pass of
  * the outer loop advances syncer_delayno by at least one slot of
  * syncer_workitem_pending[], calls sync_vnode() on every bufobj queued
  * in the slot just left, and then either continues immediately (when
  * rushjob is positive) or sleeps.  syncer_state drives the shutdown
  * sequence: SYNCER_RUNNING -> SYNCER_SHUTTING_DOWN (set by
  * syncer_shutdown()) -> SYNCER_FINAL_DELAY, after which the process
  * parks itself in kthread_suspend_check().
  */
 static void
 sched_sync(void)
 {
 	struct synclist *next;
 	struct synclist *slp;
 	struct bufobj *bo;
 	long starttime;
 	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
 	static int dummychan;
 	int last_work_seen;
 	int net_worklist_len;
 	int syncer_final_iter;
 	int first_printf;
 	int error;
 
 	mtx_lock(&Giant);
 	last_work_seen = 0;
 	syncer_final_iter = 0;
 	first_printf = 1;
 	syncer_state = SYNCER_RUNNING;
 	starttime = time_uptime;
 	td->td_pflags |= TDP_NORUNNINGBUF;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 	    SHUTDOWN_PRI_LAST);
 
 	for (;;) {
 		mtx_lock(&sync_mtx);
 		/* Final delay has fully elapsed: allow suspension. */
 		if (syncer_state == SYNCER_FINAL_DELAY &&
 		    syncer_final_iter == 0) {
 			mtx_unlock(&sync_mtx);
 			kthread_suspend_check(td->td_proc);
 			mtx_lock(&sync_mtx);
 		}
 		/* Worklist length not counting sync_vnode_count entries. */
 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
 		/* While shutting down, report progress once per second. */
 		if (syncer_state != SYNCER_RUNNING &&
 		    starttime != time_uptime) {
 			if (first_printf) {
 				printf("\nSyncing disks, vnodes remaining...");
 				first_printf = 0;
 			}
 			printf("%d ", net_worklist_len);
 		}
 		starttime = time_uptime;
 
 		/*
 		 * Push files whose dirty time has expired.  Be careful
 		 * of interrupt race on slp queue.
 		 *
 		 * Skip over empty worklist slots when shutting down.
 		 */
 		do {
 			slp = &syncer_workitem_pending[syncer_delayno];
 			syncer_delayno += 1;
 			if (syncer_delayno == syncer_maxdelay)
 				syncer_delayno = 0;
 			next = &syncer_workitem_pending[syncer_delayno];
 			/*
 			 * If the worklist has wrapped since it was
 			 * emptied of all but syncer vnodes, 
 			 * switch to the FINAL_DELAY state and run
 			 * for one more second.
 			 */
 			if (syncer_state == SYNCER_SHUTTING_DOWN &&
 			    net_worklist_len == 0 &&
 			    last_work_seen == syncer_delayno) {
 				syncer_state = SYNCER_FINAL_DELAY;
 				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
 			}
 		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
 		    syncer_worklist_len > 0);
 
 		/*
 		 * Keep track of the last time there was anything
 		 * on the worklist other than syncer vnodes.
 		 * Return to the SHUTTING_DOWN state if any
 		 * new work appears.
 		 */
 		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
 			last_work_seen = syncer_delayno;
 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
 			syncer_state = SYNCER_SHUTTING_DOWN;
 		/*
 		 * Drain the slot.  An entry that sync_vnode() could not
 		 * process (return value 1) is moved to the next slot so
 		 * that it is retried on the following pass.
 		 */
 		while ((bo = LIST_FIRST(slp)) != NULL) {
 			error = sync_vnode(bo, td);
 			if (error == 1) {
 				LIST_REMOVE(bo, bo_synclist);
 				LIST_INSERT_HEAD(next, bo, bo_synclist);
 				continue;
 			}
 		}
 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 			syncer_final_iter--;
 		mtx_unlock(&sync_mtx);
-
-		/*
-		 * Do soft update processing.
-		 */
-		if (softdep_process_worklist_hook != NULL)
-			(*softdep_process_worklist_hook)(NULL);
-
 		/*
 		 * The variable rushjob allows the kernel to speed up the
 		 * processing of the filesystem syncer process. A rushjob
 		 * value of N tells the filesystem syncer to process the next
 		 * N seconds worth of work on its queue ASAP. Currently rushjob
 		 * is used by the soft update code to speed up the filesystem
 		 * syncer process when the incore state is getting so far
 		 * ahead of the disk that the kernel memory pool is being
 		 * threatened with exhaustion.
 		 */
 		mtx_lock(&sync_mtx);
 		if (rushjob > 0) {
 			rushjob -= 1;
 			mtx_unlock(&sync_mtx);
 			continue;
 		}
 		mtx_unlock(&sync_mtx);
 		/*
 		 * Just sleep for a short period of time between
 		 * iterations when shutting down to allow some I/O
 		 * to happen.
 		 *
 		 * If it has taken us less than a second to process the
 		 * current work, then wait. Otherwise start right over
 		 * again. We can still lose time if any single round
 		 * takes more than two seconds, but it does not really
 		 * matter as we are just trying to generally pace the
 		 * filesystem activity.
 		 */
 		if (syncer_state != SYNCER_RUNNING)
 			tsleep(&dummychan, PPAUSE, "syncfnl",
 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
 		else if (time_uptime == starttime)
 			tsleep(&lbolt, PPAUSE, "syncer", 0);
 	}
 }
 
 /*
  * Ask the syncer daemon to work faster.
  *
  * The syncer thread is removed from its sleep on lbolt and, provided
  * rushjob is still below half of syncdelay, rushjob is incremented by
  * one.  The cap keeps the syncer from being pushed to more than half
  * of its normal turn time, otherwise it could take over the cpu.
  *
  * Returns 1 if the request was counted, 0 if the cap had already been
  * reached.
  */
 int
 speedup_syncer()
 {
 	struct thread *td;
 	int accepted;
 
 	td = FIRST_THREAD_IN_PROC(updateproc);
 	sleepq_remove(td, &lbolt);
 
 	accepted = 0;
 	mtx_lock(&sync_mtx);
 	if (rushjob < syncdelay / 2) {
 		rushjob++;
 		stat_rush_requests++;
 		accepted = 1;
 	}
 	mtx_unlock(&sync_mtx);
 	return (accepted);
 }
 
 /*
  * shutdown_pre_sync event handler for the syncer.
  *
  * Unless RB_NOSYNC was requested, remove the syncer from its sleep on
  * lbolt, switch it to the SYNCER_SHUTTING_DOWN state with rushjob reset
  * to zero so that it runs through its work list several times, and
  * then hand off to kproc_shutdown() to tell it to shut down.
  */
 static void
 syncer_shutdown(void *arg, int howto)
 {
 	struct thread *td;
 
 	if ((howto & RB_NOSYNC) == 0) {
 		td = FIRST_THREAD_IN_PROC(updateproc);
 		sleepq_remove(td, &lbolt);
 		mtx_lock(&sync_mtx);
 		syncer_state = SYNCER_SHUTTING_DOWN;
 		rushjob = 0;
 		mtx_unlock(&sync_mtx);
 		kproc_shutdown(arg, howto);
 	}
 }
 
 /*
  * Reassign a buffer from one vnode to another.
  * Used to assign file specific control information
  * (indirect blocks) to the vnode to which they belong.
  *
  * NOTE(review): as written here the buffer stays with its current
  * vnode and bufobj.  It is removed from whichever list it is on and
  * re-added to the dirty list if B_DELWRI is set, or to the clean list
  * otherwise, and the bufobj's presence on the syncer worklist is
  * brought into line with that.  The vnode interlock is taken and
  * released internally.
  */
 void
 reassignbuf(struct buf *bp)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	int delay;
 #ifdef INVARIANTS
 	struct bufv *bv;
 #endif
 
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 	++reassignbufcalls;
 
 	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	/*
 	 * B_PAGING flagged buffers cannot be reassigned because their vp
 	 * is not fully linked in.
 	 */
 	if (bp->b_flags & B_PAGING)
 		panic("cannot reassign paging buffer");
 
 	/*
 	 * Delete from old vnode list, if on one.
 	 */
 	VI_LOCK(vp);
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
 		buf_vlist_remove(bp);
 	else
 		panic("reassignbuf: Buffer %p not on queue.", bp);
 	/*
 	 * If dirty, put on list of dirty buffers; otherwise insert onto list
 	 * of clean buffers.
 	 */
 	if (bp->b_flags & B_DELWRI) {
 		/*
 		 * First dirty buffer for this bufobj: schedule it with the
 		 * syncer, using a delay chosen by vnode type.
 		 */
 		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
 			switch (vp->v_type) {
 			case VDIR:
 				delay = dirdelay;
 				break;
 			case VCHR:
 				delay = metadelay;
 				break;
 			default:
 				delay = filedelay;
 			}
 			vn_syncer_add_to_worklist(bo, delay);
 		}
 		buf_vlist_add(bp, bo, BX_VNDIRTY);
 	} else {
 		buf_vlist_add(bp, bo, BX_VNCLEAN);
 
 		/* No dirty buffers remain: take the bufobj off the worklist. */
 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 			mtx_lock(&sync_mtx);
 			LIST_REMOVE(bo, bo_synclist);
  			syncer_worklist_len--;
 			mtx_unlock(&sync_mtx);
 			bo->bo_flag &= ~BO_ONWORKLST;
 		}
 	}
 #ifdef INVARIANTS
 	/*
 	 * Sanity check: the first and last buffers on both lists must
 	 * point back at this bufobj.  Note that bp is reused as a scratch
 	 * pointer from here on.
 	 */
 	bv = &bo->bo_clean;
 	bp = TAILQ_FIRST(&bv->bv_hd);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bv = &bo->bo_dirty;
 	bp = TAILQ_FIRST(&bv->bv_hd);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	bp = TAILQ_LAST(&bv->bv_hd, buflists);
 	KASSERT(bp == NULL || bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 #endif
 	VI_UNLOCK(vp);
 }
 
 /*
  * Take both a use reference and a hold reference on the vnode.  For a
  * character device vnode with an attached device, the driver's
  * si_usecount is bumped as well (under the dev lock).  The vholdl()
  * will remove the vnode from the free list if it is presently free.
  * Requires the vnode interlock and returns with it held.
  */
 static void
 v_incr_usecount(struct vnode *vp)
 {
 
 	CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n",
 	    vp, vp->v_holdcnt, vp->v_usecount);
 	vp->v_usecount += 1;
 	if (vp->v_type == VCHR) {
 		if (vp->v_rdev != NULL) {
 			dev_lock();
 			vp->v_rdev->si_usecount += 1;
 			dev_unlock();
 		}
 	}
 	vholdl(vp);
 }
 
 /*
  * Convert an existing hold reference into a use+hold pair so that a
  * single later call to v_decr_usecount releases both.  Only the use
  * count (and, for a character device with an attached device, the
  * driver's si_usecount) is incremented; the hold count is untouched.
  */
 static void
 v_upgrade_usecount(struct vnode *vp)
 {
 
 	CTR3(KTR_VFS, "v_upgrade_usecount: vp %p holdcnt %d usecount %d\n",
 	    vp, vp->v_holdcnt, vp->v_usecount);
 	vp->v_usecount += 1;
 	if (vp->v_type != VCHR || vp->v_rdev == NULL)
 		return;
 	dev_lock();
 	vp->v_rdev->si_usecount += 1;
 	dev_unlock();
 }
 
 /*
  * Release one use reference and one hold reference on the vnode, and
  * the driver's si_usecount too if this is a character device with an
  * attached device.  The vnode interlock must be held on entry; the
  * vdropl() at the end releases it, as it may free the vnode.
  */
 static void
 v_decr_usecount(struct vnode *vp)
 {
 
 	CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n",
 	    vp, vp->v_holdcnt, vp->v_usecount);
 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
 	VNASSERT(vp->v_usecount > 0, vp,
 	    ("v_decr_usecount: negative usecount"));
 	vp->v_usecount -= 1;
 	if (vp->v_type == VCHR) {
 		if (vp->v_rdev != NULL) {
 			dev_lock();
 			vp->v_rdev->si_usecount -= 1;
 			dev_unlock();
 		}
 	}
 	vdropl(vp);
 }
 
 /*
  * Release one use reference (and the matching driver si_usecount for a
  * character device with an attached device) while leaving the hold
  * count alone.  The caller is expected to follow up with vdropl() to
  * release the remaining hold.  Keeping the hold count above zero in the
  * meantime lets a vnode with a zero use count be vgone()d without the
  * risk of it landing on a free list.  The vnode interlock must be held.
  */
 static void
 v_decr_useonly(struct vnode *vp)
 {
 
 	CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n",
 	    vp, vp->v_holdcnt, vp->v_usecount);
 	ASSERT_VI_LOCKED(vp, __FUNCTION__);
 	VNASSERT(vp->v_usecount > 0, vp,
 	    ("v_decr_useonly: negative usecount"));
 	vp->v_usecount -= 1;
 	if (vp->v_type != VCHR || vp->v_rdev == NULL)
 		return;
 	dev_lock();
 	vp->v_rdev->si_usecount -= 1;
 	dev_unlock();
 }
 
 /*
  * Grab a particular vnode from the free list, increment its
  * reference count and lock it. The vnode lock bit is set if the
  * vnode is being eliminated in vgone. The process is awakened
  * when the transition is completed, and an error returned to
  * indicate that the vnode is no longer usable (possibly having
  * been changed to a new filesystem type).
  *
  * As implemented here: the vnode interlock is taken unless the caller
  * passes LK_INTERLOCK (meaning it already holds it).  A hold reference
  * is taken, the vnode is locked with vn_lock() using 'flags', and the
  * hold is then upgraded to a use reference.
  *
  * Returns 0 on success, EBUSY if an inactive call is owed and
  * LK_NOWAIT was requested, or the error from vn_lock() (in which case
  * the hold reference has been dropped again).
  */
 int
 vget(struct vnode *vp, int flags, struct thread *td)
 {
 	int oweinact;
 	int oldflags;
 	int error;
 
 	error = 0;
 	oldflags = flags;	/* remembered: flags may be rewritten below */
 	oweinact = 0;
 	VFS_ASSERT_GIANT(vp->v_mount);
 	if ((flags & LK_INTERLOCK) == 0)
 		VI_LOCK(vp);
 	/*
 	 * If the inactive call was deferred because vput() was called
 	 * with a shared lock, we have to do it here before another thread
 	 * gets a reference to data that should be dead.
 	 */
 	if (vp->v_iflag & VI_OWEINACT) {
 		if (flags & LK_NOWAIT) {
 			VI_UNLOCK(vp);
 			return (EBUSY);
 		}
 		/* vinactive() needs the vnode locked exclusively. */
 		flags &= ~LK_TYPE_MASK;
 		flags |= LK_EXCLUSIVE;
 		oweinact = 1;
 	}
 	vholdl(vp);
 	if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
 		vdrop(vp);
 		return (error);
 	}
 	VI_LOCK(vp);
 	/* Upgrade our holdcnt to a usecount. */
 	v_upgrade_usecount(vp);
 	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
 		panic("vget: vn_lock failed to return ENOENT\n");
 	if (oweinact) {
 		/* Recheck: the flag may have been cleared meanwhile. */
 		if (vp->v_iflag & VI_OWEINACT)
 			vinactive(vp, td);
 		VI_UNLOCK(vp);
 		/*
 		 * The caller did not ask for the vnode to be locked; the
 		 * exclusive lock was taken only for vinactive().
 		 */
 		if ((oldflags & LK_TYPE_MASK) == 0)
 			VOP_UNLOCK(vp, 0, td);
 	} else
 		VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * Increase the reference count of a vnode.
  *
  * Takes the vnode interlock, adds a use and a hold reference through
  * v_incr_usecount(), and releases the interlock.
  */
 void
 vref(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	v_incr_usecount(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Return reference count of a vnode.
  *
  * The results of this call are only guaranteed when some mechanism other
  * than the VI lock is used to stop other processes from gaining references
  * to the vnode.  This may be the case if the caller holds the only reference.
  * This is also useful when stale data is acceptable as race conditions may
  * be accounted for by some other means.
  *
  * The value returned is v_usecount, sampled under the vnode interlock.
  */
 int
 vrefcnt(struct vnode *vp)
 {
 	int usecnt;
 
 	VI_LOCK(vp);
 	usecnt = vp->v_usecount;
 	VI_UNLOCK(vp);
 
 	return (usecnt);
 }
 
 
 /*
  * Vnode put/release.
  * If count drops to zero, call inactive routine and return to freelist.
  *
  * Drops one use reference on an unlocked vnode.  If other use
  * references remain, or this is the last one but an inactive call is
  * already in progress (VI_DOINGINACT), only the counts are
  * decremented.  Otherwise the vnode is locked exclusively,
  * vinactive() is run, and the hold reference is dropped last.
  */
 void
 vrele(struct vnode *vp)
 {
 	struct thread *td = curthread;	/* XXX */
 
 	KASSERT(vp != NULL, ("vrele: null vp"));
 	VFS_ASSERT_GIANT(vp->v_mount);
 
 	VI_LOCK(vp);
 
 	/* Skip this v_writecount check if we're going to panic below. */
 	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
 	    ("vrele: missed vn_close"));
 
 	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
 	    vp->v_usecount == 1)) {
 		/* v_decr_usecount() releases the interlock via vdropl(). */
 		v_decr_usecount(vp);
 		return;
 	}
 	if (vp->v_usecount != 1) {
 #ifdef DIAGNOSTIC
 		vprint("vrele: negative ref count", vp);
 #endif
 		VI_UNLOCK(vp);
 		panic("vrele: negative ref cnt");
 	}
 	/*
 	 * We want to hold the vnode until the inactive finishes to
 	 * prevent vgone() races.  We drop the use count here and the
 	 * hold count below when we're done.
 	 */
 	v_decr_useonly(vp);
 	/*
 	 * We must call VOP_INACTIVE with the node locked. Mark
 	 * as VI_DOINGINACT to avoid recursion.
 	 */
 	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
 		VI_LOCK(vp);
 		vinactive(vp, td);
 		VOP_UNLOCK(vp, 0, td);
 	} else
 		VI_LOCK(vp);	/* lock failed; still need it for vdropl() */
 	vdropl(vp);
 }
 
 /*
  * Release an already locked vnode.  This gives the same effects as
  * unlock+vrele(), but takes less time and avoids releasing and
  * re-acquiring the lock (as vrele() acquires the lock internally.)
  *
  * On return the vnode lock has been released and one use reference
  * has been dropped.  If this was the last use reference and the lock
  * is not held exclusively, a non-blocking upgrade is attempted; when
  * that fails the inactive call is left owed (VI_OWEINACT stays set)
  * and is performed later, e.g. by vget().
  */
 void
 vput(struct vnode *vp)
 {
 	struct thread *td = curthread;	/* XXX */
 	int error;
 
 	KASSERT(vp != NULL, ("vput: null vp"));
 	ASSERT_VOP_LOCKED(vp, "vput");
 	VFS_ASSERT_GIANT(vp->v_mount);
 	VI_LOCK(vp);
 	/* Skip this v_writecount check if we're going to panic below. */
 	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
 	    ("vput: missed vn_close"));
 	error = 0;
 
 	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
 	    vp->v_usecount == 1)) {
 		VOP_UNLOCK(vp, 0, td);
 		/* v_decr_usecount() releases the interlock via vdropl(). */
 		v_decr_usecount(vp);
 		return;
 	}
 
 	if (vp->v_usecount != 1) {
 #ifdef DIAGNOSTIC
 		vprint("vput: negative ref count", vp);
 #endif
 		panic("vput: negative ref cnt");
 	}
 	/*
 	 * We want to hold the vnode until the inactive finishes to
 	 * prevent vgone() races.  We drop the use count here and the
 	 * hold count below when we're done.
 	 */
 	v_decr_useonly(vp);
 	/*
 	 * Record the debt before trying the upgrade; vinactive() is what
 	 * clears the flag again.
 	 */
 	vp->v_iflag |= VI_OWEINACT;
 	if (VOP_ISLOCKED(vp, NULL) != LK_EXCLUSIVE) {
 		error = VOP_LOCK(vp, LK_EXCLUPGRADE|LK_INTERLOCK|LK_NOWAIT, td);
 		VI_LOCK(vp);
 		if (error)
 			goto done;
 	}
 	if (vp->v_iflag & VI_OWEINACT)
 		vinactive(vp, td);
 	VOP_UNLOCK(vp, 0, td);
 done:
 	vdropl(vp);
 }
 
 /*
  * Somebody doesn't want the vnode recycled.
  *
  * Takes a hold reference on the vnode: acquires the vnode interlock,
  * calls vholdl(), and releases the interlock.
  */
 void
 vhold(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vholdl(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Take a hold reference on the vnode without touching the interlock;
  * callers such as vhold() take the vnode interlock around this call.
  * After the hold count is incremented, vbusy() is called if
  * VSHOULDBUSY() says so.
  */
 void
 vholdl(struct vnode *vp)
 {
 
 	vp->v_holdcnt++;
 	if (VSHOULDBUSY(vp))
 		vbusy(vp);
 }
 
 /*
  * Note that there is one less who cares about this vnode.  vdrop() is the
  * opposite of vhold().
  *
  * Acquires the vnode interlock and calls vdropl().  There is no
  * VI_UNLOCK() here; vdropl() disposes of the interlock itself.
  */
 void
 vdrop(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	vdropl(vp);
 }
 
 /*
  * Release one hold reference on a vnode whose interlock is held by the
  * caller.
  *
  * If other holds remain, only the interlock is released.  When the
  * last hold goes away, a vnode that has been vgone'd (VI_DOOMED) is
  * handed to vdestroy() and this function returns without unlocking
  * the interlock itself; any other vnode is put on the free list with
  * vfree() and the interlock is then released.
  *
  * Panics if the hold count is not positive on entry.
  */
 static void
 vdropl(struct vnode *vp)
 {
 
 	if (vp->v_holdcnt <= 0)
 		panic("vdrop: holdcnt %d", vp->v_holdcnt);
 	vp->v_holdcnt--;
 
 	/* Someone else still holds the vnode. */
 	if (vp->v_holdcnt != 0) {
 		VI_UNLOCK(vp);
 		return;
 	}
 
 	/* Last hold on a doomed vnode: destroy it. */
 	if ((vp->v_iflag & VI_DOOMED) != 0) {
 		vdestroy(vp);
 		return;
 	}
 
 	/* Last hold on a live vnode: back to the free list. */
 	vfree(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
  * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
  * OWEINACT tracks whether a vnode missed a call to inactive due to a
  * failed lock upgrade.
  *
  * Both the vnode lock and the vnode interlock must be held on entry.
  * The interlock is dropped around the VOP_INACTIVE() call and is held
  * again on return.
  */
 static void
 vinactive(struct vnode *vp, struct thread *td)
 {
 
 	ASSERT_VOP_LOCKED(vp, "vinactive");
 	ASSERT_VI_LOCKED(vp, "vinactive");
 	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
 	    ("vinactive: recursed on VI_DOINGINACT"));
 	vp->v_iflag |= VI_DOINGINACT;
 	/* The owed inactive call is being paid now. */
 	vp->v_iflag &= ~VI_OWEINACT;
 	VI_UNLOCK(vp);
 	VOP_INACTIVE(vp, td);
 	VI_LOCK(vp);
 	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
 	    ("vinactive: lost VI_DOINGINACT"));
 	vp->v_iflag &= ~VI_DOINGINACT;
 }
 
 /*
  * Remove any vnodes in the vnode table belonging to mount point mp.
  *
  * If FORCECLOSE is not specified, there should not be any active ones,
  * return error if any are found (nb: this is a user error, not a
  * system error). If FORCECLOSE is specified, detach any active vnodes
  * that are found.
  *
  * If WRITECLOSE is set, only flush out regular file vnodes open for
  * writing.
  *
  * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
  *
  * `rootrefs' specifies the base reference count for the root vnode
  * of this filesystem. The root vnode is considered busy if its
  * v_usecount exceeds this value. On a successful return, vflush(, td)
  * will call vrele() on the root vnode exactly rootrefs times.
  * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
  * be zero.
  *
  * Returns 0 on success, EBUSY if busy vnodes remain, or the error from
  * VFS_ROOT() when the root vnode cannot be obtained.
  */
 #ifdef DIAGNOSTIC
 /* debug.busyprt: when non-zero, vflush() prints each busy vnode. */
 static int busyprt = 0;		/* print out busy vnodes */
 SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
 #endif
 
 int
 vflush( struct mount *mp, int rootrefs, int flags, struct thread *td)
 {
 	struct vnode *vp, *mvp, *rootvp = NULL;
 	struct vattr vattr;
 	int busy = 0, error;
 
 	CTR1(KTR_VFS, "vflush: mp %p", mp);
 	if (rootrefs > 0) {
 		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
 		    ("vflush: bad args"));
 		/*
 		 * Get the filesystem root vnode. We can vput() it
 		 * immediately, since with rootrefs > 0, it won't go away.
 		 */
 		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0)
 			return (error);
 		vput(rootvp);
 
 	}
 	MNT_ILOCK(mp);
 loop:
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 
 		/*
 		 * Hold the vnode so it stays around while the mount
 		 * interlock is dropped to take the vnode lock.
 		 */
 		VI_LOCK(vp);
 		vholdl(vp);
 		MNT_IUNLOCK(mp);
 		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
 		if (error) {
 			/* Could not lock it: restart the whole scan. */
 			vdrop(vp);
 			MNT_ILOCK(mp);
 			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 			goto loop;
 		}
 		/*
 		 * Skip over a vnodes marked VV_SYSTEM.
 		 */
 		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
 			VOP_UNLOCK(vp, 0, td);
 			vdrop(vp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/*
 		 * If WRITECLOSE is set, flush out unlinked but still open
 		 * files (even if open only for reading) and regular file
 		 * vnodes open for writing.
 		 */
 		if (flags & WRITECLOSE) {
 			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
 			VI_LOCK(vp);
 
 			if ((vp->v_type == VNON ||
 			    (error == 0 && vattr.va_nlink > 0)) &&
 			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
 				VOP_UNLOCK(vp, 0, td);
 				vdropl(vp);
 				MNT_ILOCK(mp);
 				continue;
 			}
 		} else
 			VI_LOCK(vp);
 		/*
 		 * With v_usecount == 0, all we need to do is clear out the
 		 * vnode data structures and we are done.
 		 *
 		 * If FORCECLOSE is set, forcibly close the vnode.
 		 */
 		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
 			VNASSERT(vp->v_usecount == 0 ||
 			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
 			    ("device VNODE %p is FORCECLOSED", vp));
 			vgonel(vp);
 		} else {
 			busy++;
 #ifdef DIAGNOSTIC
 			if (busyprt)
 				vprint("vflush: busy vnode", vp);
 #endif
 		}
 		VOP_UNLOCK(vp, 0, td);
 		vdropl(vp);
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
 		/*
 		 * If just the root vnode is busy, and if its refcount
 		 * is equal to `rootrefs', then go ahead and kill it.
 		 */
 		VI_LOCK(rootvp);
 		KASSERT(busy > 0, ("vflush: not busy"));
 		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
 		    ("vflush: usecount %d < rootrefs %d",
 		     rootvp->v_usecount, rootrefs));
 		if (busy == 1 && rootvp->v_usecount == rootrefs) {
 			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td);
 			vgone(rootvp);
 			VOP_UNLOCK(rootvp, 0, td);
 			busy = 0;
 		} else
 			VI_UNLOCK(rootvp);
 	}
 	if (busy)
 		return (EBUSY);
 	/* Success: release the caller's references on the root vnode. */
 	for (; rootrefs > 0; rootrefs--)
 		vrele(rootvp);
 	return (0);
 }
 
 /*
  * Reclaim a vnode right away if nobody is using it.
  *
  * The caller must hold the vnode lock (asserted).  Returns 1 when the
  * vnode had no users and was passed to vgonel(), 0 when it was left alone.
  */
 int
 vrecycle(struct vnode *vp, struct thread *td)
 {
 
 	ASSERT_VOP_LOCKED(vp, "vrecycle");
 	VI_LOCK(vp);
 	if (vp->v_usecount != 0) {
 		/* Still referenced; nothing to do. */
 		VI_UNLOCK(vp);
 		return (0);
 	}
 	vgonel(vp);
 	VI_UNLOCK(vp);
 	return (1);
 }
 
 /*
  * Eliminate all activity associated with a vnode
  * in preparation for reuse.
  *
  * Wrapper around vgonel(): takes the vnode interlock, calls vgonel(),
  * and drops the interlock again.  vgonel() asserts that the vnode lock
  * is held, so the caller must already own it.
  */
 void
 vgone(struct vnode *vp)
 {
 	VI_LOCK(vp);
 	vgonel(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * vgone, with the vp interlock held.
  *
  * Both the vnode lock and the vnode interlock must be held on entry
  * (asserted below).  The interlock is dropped while the vnode is torn
  * down and is held again on return, on the early-return path as well as
  * on the normal one.
  */
 void
 vgonel(struct vnode *vp)
 {
 	struct thread *td;
 	int oweinact;
 	int active;
 
 	CTR1(KTR_VFS, "vgonel: vp %p", vp);
 	ASSERT_VOP_LOCKED(vp, "vgonel");
 	ASSERT_VI_LOCKED(vp, "vgonel");
 #if 0
 	/* XXX Need to fix ttyvp before I enable this. */
 	VNASSERT(vp->v_holdcnt, vp,
 	    ("vgonel: vp %p has no reference.", vp));
 #endif
 	td = curthread;
 
 	/*
 	 * Don't vgonel if we're already doomed.
 	 */
 	if (vp->v_iflag & VI_DOOMED)
 		return;
 	vp->v_iflag |= VI_DOOMED;
 	/*
 	 * Check to see if the vnode is in use.  If so, we have to call
 	 * VOP_CLOSE() and VOP_INACTIVE().
 	 */
 	/* Sample both values before the interlock is released. */
 	active = vp->v_usecount;
 	oweinact = (vp->v_iflag & VI_OWEINACT);
 	VI_UNLOCK(vp);
 	/*
 	 * Clean out any buffers associated with the vnode.
 	 * If the flush fails, just toss the buffers.
 	 */
 	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
 		(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
 	/* First attempt passes V_SAVE; the fallback passes no flags. */
 	if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0)
 		vinvalbuf(vp, 0, td, 0, 0);
 
 	/*
 	 * If purging an active vnode, it must be closed and
 	 * deactivated before being reclaimed.
 	 */
 	if (active)
 		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
 	if (oweinact || active) {
 		VI_LOCK(vp);
 		/* Skip vinactive() if an inactivation is already running. */
 		if ((vp->v_iflag & VI_DOINGINACT) == 0)
 			vinactive(vp, td);
 		VI_UNLOCK(vp);
 	}
 	/*
 	 * Reclaim the vnode.
 	 */
 	if (VOP_RECLAIM(vp, td))
 		panic("vgone: cannot reclaim");
 	VNASSERT(vp->v_object == NULL, vp,
 	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
 	/*
 	 * Delete from old mount point vnode list.
 	 */
 	delmntque(vp);
 	cache_purge(vp);
 	/*
 	 * Done with purge, reset to the standard lock and invalidate
 	 * the vnode.
 	 */
 	/* The interlock taken here stays held on return. */
 	VI_LOCK(vp);
 	vp->v_vnlock = &vp->v_lock;
 	vp->v_op = &dead_vnodeops;
 	vp->v_tag = "none";
 	vp->v_type = VBAD;
 }
 
 /*
  * Return the use count of the device behind a special-file vnode.
  * The counter is read while holding the device lock.
  */
 int
 vcount(struct vnode *vp)
 {
 	int refs;
 
 	dev_lock();
 	refs = vp->v_rdev->si_usecount;
 	dev_unlock();
 
 	return (refs);
 }
 
 /*
  * Return the use count of a device, given the struct cdev directly.
  * The counter is read while holding the device lock.
  */
 int
 count_dev(struct cdev *dev)
 {
 	int refs;
 
 	dev_lock();
 	refs = dev->si_usecount;
 	dev_unlock();
 
 	return (refs);
 }
 
 /*
  * Print out a description of a vnode.
  */
 /*
  * Printable vnode type names; vn_printf() indexes this table directly
  * with vp->v_type, so the order must follow the vnode type enumeration.
  */
 static char *typename[] =
 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
  "VMARKER"};
 
 /*
  * Print a caller-formatted prefix followed by a multi-line description of
  * vp: address, tag and type, reference counts, a selection of flags, the
  * VM object (if any), the vnode lock state, and finally whatever the
  * filesystem's VOP_PRINT() adds when v_data is set.
  */
 void
 vn_printf(struct vnode *vp, const char *fmt, ...)
 {
 	va_list args;
 	char flagbuf[96];
 
 	va_start(args, fmt);
 	vprintf(fmt, args);
 	va_end(args);
 
 	printf("%p: ", (void *)vp);
 	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
 	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
 	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
 
 	/*
 	 * Each flag name is appended with a leading '|'.  Printing from
 	 * flagbuf + 1 skips the first separator; clearing both leading
 	 * bytes keeps that valid when no flag is set at all.
 	 */
 	flagbuf[0] = flagbuf[1] = '\0';
 	if ((vp->v_vflag & VV_ROOT) != 0)
 		strcat(flagbuf, "|VV_ROOT");
 	if ((vp->v_vflag & VV_TEXT) != 0)
 		strcat(flagbuf, "|VV_TEXT");
 	if ((vp->v_vflag & VV_SYSTEM) != 0)
 		strcat(flagbuf, "|VV_SYSTEM");
 	if ((vp->v_iflag & VI_DOOMED) != 0)
 		strcat(flagbuf, "|VI_DOOMED");
 	if ((vp->v_iflag & VI_FREE) != 0)
 		strcat(flagbuf, "|VI_FREE");
 	printf("    flags (%s)\n", flagbuf + 1);
 
 	if (mtx_owned(VI_MTX(vp)))
 		printf(" VI_LOCKed");
 	if (vp->v_object != NULL) {
 		printf("    v_object %p ref %d pages %d\n",
 		    vp->v_object, vp->v_object->ref_count,
 		    vp->v_object->resident_page_count);
 	}
 
 	printf("    ");
 	lockmgr_printinfo(vp->v_vnlock);
 	printf("\n");
 
 	if (vp->v_data != NULL)
 		VOP_PRINT(vp);
 }
 
 #ifdef DDB
 #include <ddb/ddb.h>
 /*
  * List all of the locked vnodes in the system.
  * Called when debugging the kernel.
  *
  * Every mount on mountlist is visited and each vnode on its list for
  * which VOP_ISLOCKED() reports a lock is printed with vprint().
  */
 DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
 {
 	struct mount *mp, *nmp;
 	struct vnode *vp;
 
 	/*
 	 * Note: because this is DDB, we can't obey the locking semantics
 	 * for these structures, which means we could catch an inconsistent
 	 * state and dereference a nasty pointer.  Not much to be done
 	 * about that.
 	 */
 	printf("Locked vnodes\n");
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		/* Fetch the successor once, before walking mp's vnodes. */
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			if (VOP_ISLOCKED(vp, NULL))
 				vprint("", vp);
 		}
 	}
 }
 #endif
 
 /*
  * Translate a kernel struct vfsconf into the exported struct xvfsconf.
  * The name, type number, reference count and flags are copied; the two
  * pointer members are always reported as NULL.
  */
 static void
 vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
 {
 
 	/*
 	 * These are unused in userland, we keep them
 	 * to not break binary compatibility.
 	 */
 	xvfsp->vfc_next = NULL;
 	xvfsp->vfc_vfsops = NULL;
 
 	xvfsp->vfc_flags = vfsp->vfc_flags;
 	xvfsp->vfc_refcount = vfsp->vfc_refcount;
 	xvfsp->vfc_typenum = vfsp->vfc_typenum;
 	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
 }
 
 /*
  * Top level filesystem related information gathering.
  */
 static int
 sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsconf *vfsp;
 	struct xvfsconf xvfsp;
 	int error;
 
 	error = 0;
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		bzero(&xvfsp, sizeof(xvfsp));
 		vfsconf2x(vfsp, &xvfsp);
 		error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
     "S,xvfsconf", "List of all configured filesystems");
 
 #ifndef BURN_BRIDGES
 static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
 
 static int
 vfs_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int *)arg1 - 1;	/* XXX */
 	u_int namelen = arg2 + 1;	/* XXX */
 	struct vfsconf *vfsp;
 	struct xvfsconf xvfsp;
 
 	printf("WARNING: userland calling deprecated sysctl, "
 	    "please rebuild world\n");
 
 #if 1 || defined(COMPAT_PRELITE2)
 	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
 	if (namelen == 1)
 		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
 #endif
 
 	switch (name[1]) {
 	case VFS_MAXTYPENUM:
 		if (namelen != 2)
 			return (ENOTDIR);
 		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
 	case VFS_CONF:
 		if (namelen != 3)
 			return (ENOTDIR);	/* overloaded */
 		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
 			if (vfsp->vfc_typenum == name[2])
 				break;
 		if (vfsp == NULL)
 			return (EOPNOTSUPP);
 		bzero(&xvfsp, sizeof(xvfsp));
 		vfsconf2x(vfsp, &xvfsp);
 		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
 	}
 	return (EOPNOTSUPP);
 }
 
 static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
 	vfs_sysctl, "Generic filesystem");
 
 #if 1 || defined(COMPAT_PRELITE2)
 
 static int
 sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct vfsconf *vfsp;
 	struct ovfsconf ovfs;
 
 	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
 		bzero(&ovfs, sizeof(ovfs));
 		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
 		strcpy(ovfs.vfc_name, vfsp->vfc_name);
 		ovfs.vfc_index = vfsp->vfc_typenum;
 		ovfs.vfc_refcount = vfsp->vfc_refcount;
 		ovfs.vfc_flags = vfsp->vfc_flags;
 		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
 		if (error)
 			return error;
 	}
 	return 0;
 }
 
 #endif /* 1 || COMPAT_PRELITE2 */
 #endif /* !BURN_BRIDGES */
 
 /*
  * Number of extra xvnode slots sysctl_vnode() allocates on top of
  * numvnodes when sizing its output buffer.
  */
 #define KINFO_VNODESLOP		10
 #ifdef notyet
 /*
  * Dump vnode list (via sysctl).
  *
  * Allocates room for numvnodes + KINFO_VNODESLOP struct xvnode records,
  * fills in one record per vnode found on each mount that can be busied,
  * and copies the filled part out.  When the request has no old buffer,
  * only the size estimate is reported.
  *
  * The array bound is tracked as an element count (maxn).  The byte length
  * (len) must not be compared with the element index n, otherwise the loop
  * can run past the end of the allocation.
  */
 /* ARGSUSED */
 static int
 sysctl_vnode(SYSCTL_HANDLER_ARGS)
 {
 	struct xvnode *xvn;
 	struct thread *td = req->td;
 	struct mount *mp;
 	struct vnode *vp;
 	int error, len, maxn, n;
 
 	/*
 	 * Stale numvnodes access is not fatal here.
 	 */
 	req->lock = 0;
 	maxn = numvnodes + KINFO_VNODESLOP;	/* capacity, in records */
 	len = maxn * sizeof *xvn;		/* capacity, in bytes */
 	if (!req->oldptr)
 		/* Make an estimate */
 		return (SYSCTL_OUT(req, 0, len));
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
 	n = 0;
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
 			continue;
 		MNT_ILOCK(mp);
 		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
 			/* Stop once every allocated record is in use. */
 			if (n == maxn)
 				break;
 			vref(vp);
 			xvn[n].xv_size = sizeof *xvn;
 			xvn[n].xv_vnode = vp;
 			xvn[n].xv_id = 0;	/* XXX compat */
 #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
 			XV_COPY(usecount);
 			XV_COPY(writecount);
 			XV_COPY(holdcnt);
 			XV_COPY(mount);
 			XV_COPY(numoutput);
 			XV_COPY(type);
 #undef XV_COPY
 			xvn[n].xv_flag = vp->v_vflag;
 
 			switch (vp->v_type) {
 			case VREG:
 			case VDIR:
 			case VLNK:
 				break;
 			case VBLK:
 			case VCHR:
 				if (vp->v_rdev == NULL) {
 					vrele(vp);
 					continue;
 				}
 				xvn[n].xv_dev = dev2udev(vp->v_rdev);
 				break;
 			case VSOCK:
 				xvn[n].xv_socket = vp->v_socket;
 				break;
 			case VFIFO:
 				xvn[n].xv_fifo = vp->v_fifoinfo;
 				break;
 			case VNON:
 			case VBAD:
 			default:
 				/* shouldn't happen? */
 				vrele(vp);
 				continue;
 			}
 			vrele(vp);
 			++n;
 		}
 		MNT_IUNLOCK(mp);
 		mtx_lock(&mountlist_mtx);
 		vfs_unbusy(mp, td);
 		if (n == maxn)
 			break;
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
 	free(xvn, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
 	0, 0, sysctl_vnode, "S,xvnode", "");
 #endif
 
 /*
  * Forcibly unmount every filesystem, newest mount first, so that a
  * filesystem is unmounted before the one it is mounted on.
  *
  * A mount whose unmount fails is unlinked from mountlist by hand so the
  * loop always makes progress; the failure is reported unless the
  * filesystem is devfs.
  */
 void
 vfs_unmountall(void)
 {
 	struct thread *td;
 	struct mount *mp;
 	int rv;
 
 	KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread"));
 	td = curthread;
 	/*
 	 * Since this only runs when rebooting, it is not interlocked.
 	 */
 	while (!TAILQ_EMPTY(&mountlist)) {
 		mp = TAILQ_LAST(&mountlist, mntlist);
 		rv = dounmount(mp, MNT_FORCE, td);
 		if (rv == 0) {
 			/* The unmount has removed mp from the mountlist */
 			continue;
 		}
 		TAILQ_REMOVE(&mountlist, mp, mnt_list);
 		/*
 		 * XXX: Due to the way in which we mount the root
 		 * file system off of devfs, devfs will generate a
 		 * "busy" warning when we try to unmount it before
 		 * the root.  Don't print a warning as a result in
 		 * order to avoid false positive errors that may
 		 * cause needless upset.
 		 */
 		if (strcmp(mp->mnt_vfc->vfc_name, "devfs") == 0)
 			continue;
 		printf("unmount of %s failed (",
 		    mp->mnt_stat.f_mntonname);
 		if (rv != EBUSY)
 			printf("%d)\n", rv);
 		else
 			printf("BUSY)\n");
 	}
 }
 
 /*
  * perform msync on all vnodes under a mount point
  * the mount point must be locked.
  *
  * Only vnodes flagged VI_OBJDIRTY are considered.  Unless flags is
  * MNT_WAIT, a vnode that is currently locked is skipped rather than
  * waited for.  Pages are cleaned synchronously for MNT_WAIT and with
  * OBJPC_NOSYNC otherwise.
  */
 void
 vfs_msync(struct mount *mp, int flags)
 {
 	struct vnode *vp, *mvp;
 	struct vm_object *obj;
 
 	MNT_ILOCK(mp);
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
 		if ((vp->v_iflag & VI_OBJDIRTY) &&
 		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
 			/*
 			 * Drop the mount interlock before vget(); the vnode
 			 * interlock is passed to vget() via LK_INTERLOCK.
 			 */
 			MNT_IUNLOCK(mp);
 			if (!vget(vp,
 			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
 			    curthread)) {
 				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
 					vput(vp);
 					MNT_ILOCK(mp);
 					continue;
 				}
 
 				obj = vp->v_object;
 				if (obj != NULL) {
 					VM_OBJECT_LOCK(obj);
 					vm_object_page_clean(obj, 0, 0,
 					    flags == MNT_WAIT ?
 					    OBJPC_SYNC : OBJPC_NOSYNC);
 					VM_OBJECT_UNLOCK(obj);
 				}
 				vput(vp);
 			}
 			/* Retake the mount interlock before iterating on. */
 			MNT_ILOCK(mp);
 		} else
 			VI_UNLOCK(vp);
 	}
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Put a vnode on the free list so it can be recycled.
  *
  * Called with the vnode interlock held (asserted).  A vnode flagged
  * VI_AGE goes to the head of the list, any other to the tail.  On return
  * VI_AGE is cleared and VI_FREE is set.
  */
 static void
 vfree(struct vnode *vp)
 {
 
 	CTR1(KTR_VFS, "vfree vp %p", vp);
 	ASSERT_VI_LOCKED(vp, "vfree");
 	mtx_lock(&vnode_free_list_mtx);
 	VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed."));
 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free"));
 	VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't"));
 	VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp,
 	    ("vfree: Freeing doomed vnode"));
 	if ((vp->v_iflag & VI_AGE) == 0) {
 		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
 	} else {
 		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
 	}
 	vp->v_iflag = (vp->v_iflag & ~VI_AGE) | VI_FREE;
 	freevnodes++;
 	mtx_unlock(&vnode_free_list_mtx);
 }
 
 /*
  * Inverse of vfree(): take a vnode off the free list and clear both
  * VI_FREE and VI_AGE.  Called with the vnode interlock held (asserted).
  */
 static void
 vbusy(struct vnode *vp)
 {
 
 	CTR1(KTR_VFS, "vbusy vp %p", vp);
 	ASSERT_VI_LOCKED(vp, "vbusy");
 	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
 	VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed."));
 
 	mtx_lock(&vnode_free_list_mtx);
 	freevnodes--;
 	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
 	vp->v_iflag &= ~VI_FREE;
 	vp->v_iflag &= ~VI_AGE;
 	mtx_unlock(&vnode_free_list_mtx);
 }
 
 /*
  * Initalize per-vnode helper structure to hold poll-related state.
  */
 void
 v_addpollinfo(struct vnode *vp)
 {
 	struct vpollinfo *vi;
 
 	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
 	if (vp->v_pollinfo != NULL) {
 		uma_zfree(vnodepoll_zone, vi);
 		return;
 	}
 	vp->v_pollinfo = vi;
 	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
 	knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock,
 	    vfs_knlunlock, vfs_knllocked);
 }
 
 /*
  * Record a process's interest in events which might happen to
  * a vnode.  Because poll uses the historic select-style interface
  * internally, this routine serves as both the ``check for any
  * pending events'' and the ``record my interest in future events''
  * functions.  (These are done together, while the lock is held,
  * to avoid race conditions.)
  */
 int
 vn_pollrecord(struct vnode *vp, struct thread *td, int events)
 {
 
 	if (vp->v_pollinfo == NULL)
 		v_addpollinfo(vp);
 	mtx_lock(&vp->v_pollinfo->vpi_lock);
 	if (vp->v_pollinfo->vpi_revents & events) {
 		/*
 		 * This leaves events we are not interested
 		 * in available for the other process which
 		 * which presumably had requested them
 		 * (otherwise they would never have been
 		 * recorded).
 		 */
 		events &= vp->v_pollinfo->vpi_revents;
 		vp->v_pollinfo->vpi_revents &= ~events;
 
 		mtx_unlock(&vp->v_pollinfo->vpi_lock);
 		return events;
 	}
 	vp->v_pollinfo->vpi_events |= events;
 	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
 	mtx_unlock(&vp->v_pollinfo->vpi_lock);
 	return 0;
 }
 
 /*
  * Routine to create and manage a filesystem syncer vnode.
  */
 /* Closing a syncer vnode needs no work: reuse nullop with the VOP type. */
 #define sync_close ((int (*)(struct  vop_close_args *))nullop)
 static int	sync_fsync(struct  vop_fsync_args *);
 static int	sync_inactive(struct  vop_inactive_args *);
 static int	sync_reclaim(struct  vop_reclaim_args *);
 
 /*
  * Operations vector for syncer vnodes.  Anything not listed here is
  * routed through vop_bypass, which is set to VOP_EOPNOTSUPP.
  */
 static struct vop_vector sync_vnodeops = {
 	.vop_bypass =	VOP_EOPNOTSUPP,
 	.vop_close =	sync_close,		/* close */
 	.vop_fsync =	sync_fsync,		/* fsync */
 	.vop_inactive =	sync_inactive,	/* inactive */
 	.vop_reclaim =	sync_reclaim,	/* reclaim */
 	.vop_lock =	vop_stdlock,	/* lock */
 	.vop_unlock =	vop_stdunlock,	/* unlock */
 	.vop_islocked =	vop_stdislocked,	/* islocked */
 };
 
 /*
  * Create a new filesystem syncer vnode for the specified mount point.
  *
  * On success the vnode is placed on the syncer worklist, recorded in
  * mp->mnt_syncer, and 0 is returned.  If getnewvnode() fails,
  * mp->mnt_syncer is cleared and that error is returned.
  */
 int
 vfs_allocate_syncvnode(struct mount *mp)
 {
 	struct vnode *vp;
 	/* Scatter state; static, so it persists across calls. */
 	static long start, incr, next;
 	int error;
 
 	/* Allocate a new vnode */
 	if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) {
 		mp->mnt_syncer = NULL;
 		return (error);
 	}
 	vp->v_type = VNON;
 	/*
 	 * Place the vnode onto the syncer worklist. We attempt to
 	 * scatter them about on the list so that they will go off
 	 * at evenly distributed times even if all the filesystems
 	 * are mounted at once.
 	 */
 	next += incr;
 	if (next == 0 || next > syncer_maxdelay) {
 		/* Halve the stride; start over once it reaches zero. */
 		start /= 2;
 		incr /= 2;
 		if (start == 0) {
 			start = syncer_maxdelay / 2;
 			incr = syncer_maxdelay;
 		}
 		next = start;
 	}
 	VI_LOCK(vp);
 	/* The syncdelay > 0 test avoids a modulo by zero. */
 	vn_syncer_add_to_worklist(&vp->v_bufobj,
 	    syncdelay > 0 ? next % syncdelay : 0);
 	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
 	mtx_lock(&sync_mtx);
 	sync_vnode_count++;
 	mtx_unlock(&sync_mtx);
 	VI_UNLOCK(vp);
 	mp->mnt_syncer = vp;
 	return (0);
 }
 
 /*
  * Do a lazy sync of the filesystem.
  *
  * Does nothing and returns 0 unless a_waitfor is MNT_LAZY.  Also returns
  * 0 without syncing when the mount cannot be busied or a write cannot be
  * started without waiting.  Otherwise returns the result of VFS_SYNC().
  */
 static int
 sync_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *syncvp = ap->a_vp;
 	struct mount *mp = syncvp->v_mount;
 	struct thread *td = ap->a_td;
 	int error, asyncflag;
 	struct bufobj *bo;
 
 	/*
 	 * We only need to do something if this is a lazy evaluation.
 	 */
 	if (ap->a_waitfor != MNT_LAZY)
 		return (0);
 
 	/*
 	 * Move ourselves to the back of the sync list.
 	 */
 	bo = &syncvp->v_bufobj;
 	BO_LOCK(bo);
 	vn_syncer_add_to_worklist(bo, syncdelay);
 	BO_UNLOCK(bo);
 
 	/*
 	 * Walk the list of vnodes pushing all that are dirty and
 	 * not already on the sync list.
 	 */
 	mtx_lock(&mountlist_mtx);
 	/*
 	 * NOTE(review): mountlist_mtx is only unlocked here on failure;
 	 * presumably vfs_busy() releases it on success -- confirm.
 	 */
 	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
 		mtx_unlock(&mountlist_mtx);
 		return (0);
 	}
 	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
 		vfs_unbusy(mp, td);
 		return (0);
 	}
 	/* Clear MNT_ASYNC for the duration of the sync, then restore it. */
 	asyncflag = mp->mnt_flag & MNT_ASYNC;
 	mp->mnt_flag &= ~MNT_ASYNC;
 	vfs_msync(mp, MNT_NOWAIT);
 	error = VFS_SYNC(mp, MNT_LAZY, td);
 	if (asyncflag)
 		mp->mnt_flag |= MNT_ASYNC;
 	vn_finished_write(mp);
 	vfs_unbusy(mp, td);
 	return (error);
 }
 
 /*
  * The syncer vnode is no longer referenced.
  *
  * Inactive handler for syncer vnodes: hands the vnode to vgone() and
  * always returns 0.
  */
 static int
 sync_inactive(struct vop_inactive_args *ap)
 {
 
 	vgone(ap->a_vp);
 	return (0);
 }
 
 /*
  * Decommission a syncer vnode that is no longer needed.
  *
  * Clears the mount's mnt_syncer pointer and, if the vnode's buffer
  * object is still on the syncer worklist, unlinks it and adjusts the
  * counters.  Worklist changes are made under sync_mtx.  Always returns 0.
  */
 static int
 sync_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct bufobj *bo = &vp->v_bufobj;
 
 	VI_LOCK(vp);
 	vp->v_mount->mnt_syncer = NULL;
 	if ((bo->bo_flag & BO_ONWORKLST) != 0) {
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
 		syncer_worklist_len--;
 		sync_vnode_count--;
 		mtx_unlock(&sync_mtx);
 		bo->bo_flag &= ~BO_ONWORKLST;
 	}
 	VI_UNLOCK(vp);
 
 	return (0);
 }
 
 /*
  * Report whether a vnode represents a disk device.
  *
  * Returns non-zero if it does.  When errp is not NULL it receives 0 on
  * success, ENOTBLK if the vnode is not VCHR or the driver lacks D_DISK,
  * and ENXIO if the vnode has no device or the device has no devsw.
  */
 int
 vn_isdisk(struct vnode *vp, int *errp)
 {
 	int rv = 0;
 
 	dev_lock();
 	if (vp->v_type != VCHR) {
 		rv = ENOTBLK;
 	} else if (vp->v_rdev == NULL || vp->v_rdev->si_devsw == NULL) {
 		rv = ENXIO;
 	} else if ((vp->v_rdev->si_devsw->d_flags & D_DISK) == 0) {
 		rv = ENOTBLK;
 	}
 	dev_unlock();
 
 	if (errp != NULL)
 		*errp = rv;
 	return (rv == 0);
 }
 
 /*
  * Common access-control check for filesystem objects.
  *
  * Takes the object's type, mode bits, owner uid and gid, the requested
  * access mode and the caller's credentials.  Exactly one permission
  * class is consulted: owner if the uid matches, else group if the caller
  * is a member, else other.  If that class does not grant everything
  * requested, privilege is tried.
  *
  * privused is optional (obsoleted); when given it is set to 1 if
  * privilege was needed to satisfy the request and to 0 otherwise.
  * Returns 0 on success; on failure EPERM if VADMIN was requested,
  * EACCES otherwise.
  */
 int
 vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
     mode_t acc_mode, struct ucred *cred, int *privused)
 {
 	mode_t dac_granted;
 	mode_t rbit, wbit, xbit;
 #ifdef CAPABILITIES
 	mode_t cap_granted;
 #endif
 
 	if (privused != NULL)
 		*privused = 0;
 
 	/*
 	 * Look for a normal, non-privileged way to access the
 	 * file/directory as requested.  Select the permission class first
 	 * (first match wins), then translate its bits.
 	 */
 	dac_granted = 0;
 	if (cred->cr_uid == file_uid) {
 		/* Owner: also implies VADMIN. */
 		dac_granted |= VADMIN;
 		xbit = S_IXUSR;
 		rbit = S_IRUSR;
 		wbit = S_IWUSR;
 	} else if (groupmember(file_gid, cred)) {
 		xbit = S_IXGRP;
 		rbit = S_IRGRP;
 		wbit = S_IWGRP;
 	} else {
 		xbit = S_IXOTH;
 		rbit = S_IROTH;
 		wbit = S_IWOTH;
 	}
 	if (file_mode & xbit)
 		dac_granted |= VEXEC;
 	if (file_mode & rbit)
 		dac_granted |= VREAD;
 	if (file_mode & wbit)
 		dac_granted |= (VWRITE | VAPPEND);
 
 	if ((acc_mode & dac_granted) == acc_mode)
 		return (0);
 
 	/* The mode bits were not enough; fall back to privilege. */
 	if (suser_cred(cred, SUSER_ALLOWJAIL) == 0) {
 		/* XXX audit: privilege used */
 		if (privused != NULL)
 			*privused = 1;
 		return (0);
 	}
 
 #ifdef CAPABILITIES
 	/*
 	 * Build a capability mask to determine if the set of capabilities
 	 * satisfies the requirements when combined with the granted mask
 	 * from above.
 	 * For each capability, if the capability is required, bitwise
 	 * or the request type onto the cap_granted mask.
 	 */
 	cap_granted = 0;
 
 	if (type == VDIR) {
 		/*
 		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
 		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
 		 */
 		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
 			cap_granted |= VEXEC;
 	} else {
 		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
 		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL))
 			cap_granted |= VEXEC;
 	}
 
 	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
 	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
 		cap_granted |= VREAD;
 
 	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
 	    !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL))
 		cap_granted |= (VWRITE | VAPPEND);
 
 	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
 	    !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL))
 		cap_granted |= VADMIN;
 
 	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
 		/* XXX audit: privilege used */
 		if (privused != NULL)
 			*privused = 1;
 		return (0);
 	}
 #endif
 
 	return ((acc_mode & VADMIN) ? EPERM : EACCES);
 }
 
 /*
  * Decide whether the given credential may access an extended attribute
  * in the given namespace.
  *
  * NOCRED (kernel-invoked) always succeeds.  The system namespace needs
  * superuser credentials as judged by suser_cred(cred, 0); the user
  * namespace defers to VOP_ACCESS() on the vnode.  Any other namespace
  * yields EPERM.
  */
 int
 extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
     struct thread *td, int access)
 {
 
 	/*
 	 * Kernel-invoked always succeeds.
 	 */
 	if (cred == NOCRED)
 		return (0);
 
 	/*
 	 * Do not allow privileged processes in jail to directly
 	 * manipulate system attributes.
 	 *
 	 * XXX What capability should apply here?
 	 * Probably CAP_SYS_SETFFLAG.
 	 */
 	if (attrnamespace == EXTATTR_NAMESPACE_SYSTEM) {
 		/* Potentially should be: return (EPERM); */
 		return (suser_cred(cred, 0));
 	}
 	if (attrnamespace == EXTATTR_NAMESPACE_USER)
 		return (VOP_ACCESS(vp, access, cred, td));
 	return (EPERM);
 }
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * This only exists to suppress warnings from unlocked specfs accesses.  It is
  * no longer ok to have an unlocked VFS.
  *
  * True for VCHR and VBAD vnodes, which the assert_vop_*() checks skip.
  */
 #define	IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
 
 /*
  * Run-time knobs (all under debug.*) that control what vfs_badlock() and
  * the interlock assertions do when a locking violation is detected.
  * Each defaults to 1 (enabled).
  */
 int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "");
 
 int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "");
 
 int vfs_badlock_print = 1;	/* Print lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "");
 
 #ifdef KDB
 int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
 SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "");
 #endif
 
 /*
  * Report a vnode locking violation.  Depending on the vfs_badlock_*
  * knobs this prints a backtrace (KDB kernels only), prints
  * "<str>: <vp> <msg>", and/or enters the debugger, in that order.
  */
 static void
 vfs_badlock(const char *msg, const char *str, struct vnode *vp)
 {
 
 #ifdef KDB
 	if (vfs_badlock_backtrace != 0)
 		kdb_backtrace();
 #endif
 	if (vfs_badlock_print != 0)
 		printf("%s: %p %s\n", str, (void *)vp, msg);
 	if (vfs_badlock_ddb != 0)
 		kdb_enter("lock violation");
 }
 
 /*
  * Complain if the current thread does not own vp's interlock.
  * Does nothing when vfs_badlock_mutex is zero.
  */
 void
 assert_vi_locked(struct vnode *vp, const char *str)
 {
 
 	if (!vfs_badlock_mutex)
 		return;
 	if (mtx_owned(VI_MTX(vp)))
 		return;
 	vfs_badlock("interlock is not locked but should be", str, vp);
 }
 
 /*
  * Complain if the current thread owns vp's interlock.
  * Does nothing when vfs_badlock_mutex is zero.
  */
 void
 assert_vi_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (!vfs_badlock_mutex)
 		return;
 	if (!mtx_owned(VI_MTX(vp)))
 		return;
 	vfs_badlock("interlock is locked but should not be", str, vp);
 }
 
 /*
  * Complain if vp's vnode lock is not held at all.  NULL vnodes and
  * vnodes matched by IGNORE_LOCK() are not checked.
  */
 void
 assert_vop_locked(struct vnode *vp, const char *str)
 {
 
 	if (vp == NULL || IGNORE_LOCK(vp))
 		return;
 	if (VOP_ISLOCKED(vp, NULL) != 0)
 		return;
 	vfs_badlock("is not locked but should be", str, vp);
 }
 
 void
 assert_vop_unlocked(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
 		vfs_badlock("is locked but should not be", str, vp);
 }
 
 void
 assert_vop_elocked(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
 		vfs_badlock("is not exclusive locked but should be", str, vp);
 }
 
 #if 0
 /*
  * Disabled assertions, kept for reference.  They follow the same pattern
  * as the enabled assert_vop_*() checks but test for LK_EXCLOTHER and
  * LK_SHARED respectively.
  */
 void
 assert_vop_elocked_other(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
 		vfs_badlock("is not exclusive locked by another thread",
 		    str, vp);
 }
 
 void
 assert_vop_slocked(struct vnode *vp, const char *str)
 {
 
 	if (vp && !IGNORE_LOCK(vp) &&
 	    VOP_ISLOCKED(vp, curthread) != LK_SHARED)
 		vfs_badlock("is not locked shared but should be", str, vp);
 }
 #endif /* 0 */
 #endif /* DEBUG_VFS_LOCKS */
 
 /*
  * Pre-operation hook for VOP_RENAME.
  *
  * With DEBUG_VFS_LOCKS it checks the locking protocol: no interlocks
  * held, source vnodes unlocked (unless they alias a target vnode), and
  * target vnodes locked.  It then takes a hold on every distinct vnode
  * involved; vop_rename_post() drops the same set of holds.
  */
 void
 vop_rename_pre(void *ap)
 {
 	struct vop_rename_args *a = ap;
 
 #ifdef DEBUG_VFS_LOCKS
 	if (a->a_tvp)
 		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
 
 	/* Check the source (from). */
 	if (a->a_tdvp != a->a_fdvp)
 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
 	/* The vnode tested here is fvp, so the message must name fvp. */
 	if (a->a_tvp != a->a_fvp)
 		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
 
 	/* Check the target. */
 	if (a->a_tvp)
 		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
 	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
 #endif
 	/* Hold each vnode once; aliases are covered by the target's hold. */
 	if (a->a_tdvp != a->a_fdvp)
 		vhold(a->a_fdvp);
 	if (a->a_tvp != a->a_fvp)
 		vhold(a->a_fvp);
 	vhold(a->a_tdvp);
 	if (a->a_tvp)
 		vhold(a->a_tvp);
 }
 
 /*
  * Pre-operation hook for VOP_STRATEGY.  With DEBUG_VFS_LOCKS it reports
  * a buffer that is handed to the strategy routine without being locked;
  * otherwise it does nothing.
  */
 void
 vop_strategy_pre(void *ap)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_strategy_args *args = ap;
 	struct buf *bp = args->a_bp;
 
 	/*
 	 * Cluster ops lock their component buffers but not the IO container.
 	 */
 	if ((bp->b_flags & B_CLUSTER) != 0)
 		return;
 
 	if (BUF_REFCNT(bp) >= 1)
 		return;
 	if (vfs_badlock_print)
 		printf(
 		    "VOP_STRATEGY: bp is not locked but should be\n");
 	if (vfs_badlock_ddb)
 		kdb_enter("lock violation");
 #endif
 }
 
 /*
  * Pre-operation hook for VOP_LOOKUP.  With DEBUG_VFS_LOCKS it asserts
  * that the directory vnode is locked and that its interlock is not held.
  */
 void
 vop_lookup_pre(void *ap)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_lookup_args *args = ap;
 	struct vnode *dvp = args->a_dvp;
 
 	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
 	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
 #endif
 }
 
 /*
  * Post-operation hook for VOP_LOOKUP.  With DEBUG_VFS_LOCKS it asserts
  * that the directory is still locked with its interlock released and,
  * when the lookup succeeded (rc == 0), that the returned vnode is locked.
  */
 void
 vop_lookup_post(void *ap, int rc)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_lookup_args *args = ap;
 	struct vnode *dvp = args->a_dvp;
 	struct vnode *child = *(args->a_vpp);
 
 	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
 	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
 
 	if (rc == 0)
 		ASSERT_VOP_LOCKED(child, "VOP_LOOKUP (child)");
 #endif
 }
 
 /*
  * Pre-operation hook for VOP_LOCK.  With DEBUG_VFS_LOCKS it asserts that
  * the interlock is held exactly when the caller passed LK_INTERLOCK.
  */
 void
 vop_lock_pre(void *ap)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_lock_args *args = ap;
 
 	if ((args->a_flags & LK_INTERLOCK) != 0) {
 		ASSERT_VI_LOCKED(args->a_vp, "VOP_LOCK");
 	} else {
 		ASSERT_VI_UNLOCKED(args->a_vp, "VOP_LOCK");
 	}
 #endif
 }
 
 /*
  * Post-operation hook for VOP_LOCK.  With DEBUG_VFS_LOCKS it asserts
  * that the interlock has been released and, if the lock call succeeded,
  * that the vnode is now locked.
  */
 void
 vop_lock_post(void *ap, int rc)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_lock_args *args = ap;
 
 	ASSERT_VI_UNLOCKED(args->a_vp, "VOP_LOCK");
 	if (rc != 0)
 		return;
 	ASSERT_VOP_LOCKED(args->a_vp, "VOP_LOCK");
 #endif
 }
 
 /*
  * Pre-operation hook for VOP_UNLOCK.  With DEBUG_VFS_LOCKS it asserts
  * that the vnode is locked, and that the interlock is held when the
  * caller passed LK_INTERLOCK.
  */
 void
 vop_unlock_pre(void *ap)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_unlock_args *args = ap;
 
 	if ((args->a_flags & LK_INTERLOCK) != 0)
 		ASSERT_VI_LOCKED(args->a_vp, "VOP_UNLOCK");
 	ASSERT_VOP_LOCKED(args->a_vp, "VOP_UNLOCK");
 #endif
 }
 
 /*
  * Post-operation hook for VOP_UNLOCK.  With DEBUG_VFS_LOCKS it asserts
  * that an interlock passed in with LK_INTERLOCK has been released.
  * The return code is not examined.
  */
 void
 vop_unlock_post(void *ap, int rc)
 {
 #ifdef DEBUG_VFS_LOCKS
 	struct vop_unlock_args *args = ap;
 
 	if ((args->a_flags & LK_INTERLOCK) != 0)
 		ASSERT_VI_UNLOCKED(args->a_vp, "VOP_UNLOCK");
 #endif
 }
 
 /*
  * Post-operation hook for VOP_CREATE: on success, post NOTE_WRITE on the
  * parent directory.
  */
 void
 vop_create_post(void *ap, int rc)
 {
 	struct vop_create_args *args = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(args->a_dvp, NOTE_WRITE);
 }
 
 /*
  * Post-operation hook for VOP_LINK: on success, post NOTE_LINK on the
  * linked vnode and NOTE_WRITE on the target directory.
  */
 void
 vop_link_post(void *ap, int rc)
 {
 	struct vop_link_args *args = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(args->a_vp, NOTE_LINK);
 	VFS_KNOTE_LOCKED(args->a_tdvp, NOTE_WRITE);
 }
 
 /*
  * Post-operation hook for VOP_MKDIR: on success, post
  * NOTE_WRITE | NOTE_LINK on the parent directory.
  */
 void
 vop_mkdir_post(void *ap, int rc)
 {
 	struct vop_mkdir_args *args = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(args->a_dvp, NOTE_WRITE | NOTE_LINK);
 }
 
 /*
  * Post-operation hook for VOP_MKNOD: on success, post NOTE_WRITE on the
  * parent directory.
  */
 void
 vop_mknod_post(void *ap, int rc)
 {
 	struct vop_mknod_args *args = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(args->a_dvp, NOTE_WRITE);
 }
 
 /*
  * Post-operation hook for VOP_REMOVE: on success, post NOTE_WRITE on the
  * parent directory and NOTE_DELETE on the removed vnode.
  */
 void
 vop_remove_post(void *ap, int rc)
 {
 	struct vop_remove_args *args = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(args->a_dvp, NOTE_WRITE);
 	VFS_KNOTE_LOCKED(args->a_vp, NOTE_DELETE);
 }
 
 void
 vop_rename_post(void *ap, int rc)
 {
 	struct vop_rename_args *a = ap;
 
 	if (!rc) {
 		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
 		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
 		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
 		if (a->a_tvp)
 			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
 	}
 	if (a->a_tdvp != a->a_fdvp)
 		vdrop(a->a_fdvp);
 	if (a->a_tvp != a->a_fvp)
 		vdrop(a->a_fvp);
 	vdrop(a->a_tdvp);
 	if (a->a_tvp)
 		vdrop(a->a_tvp);
 }
 
 /*
  * Post-operation hook for VOP_RMDIR: on success, post
  * NOTE_WRITE | NOTE_LINK on the parent directory and NOTE_DELETE on the
  * removed directory.
  */
 void
 vop_rmdir_post(void *ap, int rc)
 {
 	struct vop_rmdir_args *args = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(args->a_dvp, NOTE_WRITE | NOTE_LINK);
 	VFS_KNOTE_LOCKED(args->a_vp, NOTE_DELETE);
 }
 
 /*
  * Post-operation hook for VOP_SETATTR: on success, post NOTE_ATTRIB on
  * the vnode.
  */
 void
 vop_setattr_post(void *ap, int rc)
 {
 	struct vop_setattr_args *args = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(args->a_vp, NOTE_ATTRIB);
 }
 
 /*
  * Post-operation hook for VOP_SYMLINK: on success, post NOTE_WRITE on
  * the parent directory.
  */
 void
 vop_symlink_post(void *ap, int rc)
 {
 	struct vop_symlink_args *args = ap;
 
 	if (rc != 0)
 		return;
 	VFS_KNOTE_LOCKED(args->a_dvp, NOTE_WRITE);
 }
 
 /*
  * Single list of knotes interested in filesystem events; knotes are
  * added and removed by filt_fsattach()/filt_fsdetach() and notified by
  * vfs_event_signal().
  */
 static struct knlist fs_knlist;
 
 /*
  * Initialize fs_knlist.  All lock callbacks are passed as NULL.
  * Registered through SYSINIT below.
  */
 static void
 vfs_event_init(void *arg)
 {
 	knlist_init(&fs_knlist, NULL, NULL, NULL, NULL);
 }
 /* XXX - correct order? */
 SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
 
 /*
  * Post `event' to every knote on fs_knlist.  The fsid argument is not
  * used by this implementation and data is marked __unused.
  */
 void
 vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused)
 {
 
 	KNOTE_UNLOCKED(&fs_knlist, event);
 }
 
 static int	filt_fsattach(struct knote *kn);
 static void	filt_fsdetach(struct knote *kn);
 static int	filt_fsevent(struct knote *kn, long hint);
 
 /*
  * Filter operations for filesystem-event knotes.
  * NOTE(review): the leading 0 is presumably the "is file descriptor"
  * member of struct filterops -- confirm against its declaration.
  */
 struct filterops fs_filtops =
 	{ 0, filt_fsattach, filt_fsdetach, filt_fsevent };
 
 /*
  * Attach a knote to the filesystem event list.  EV_CLEAR is forced on
  * the knote before it is added to fs_knlist.  Always returns 0.
  */
 static int
 filt_fsattach(struct knote *kn)
 {
 
 	kn->kn_flags |= EV_CLEAR;
 	knlist_add(&fs_knlist, kn, 0);
 	return (0);
 }
 
 /*
  * Detach a knote from the filesystem event list (fs_knlist).
  */
 static void
 filt_fsdetach(struct knote *kn)
 {
 
 	knlist_remove(&fs_knlist, kn, 0);
 }
 
 /*
  * Event filter for filesystem knotes: accumulate the hint bits into
  * kn_fflags and report the knote as active whenever any bit is set.
  */
 static int
 filt_fsevent(struct knote *kn, long hint)
 {
 
 	kn->kn_fflags |= hint;
 	return (kn->kn_fflags != 0);
 }
 
 /*
  * vfs.ctl handler: route a sysctl request to the filesystem identified
  * by the fsid in a caller-supplied struct vfsidctl.
  *
  * Returns the SYSCTL_IN() error if the control block cannot be read,
  * EINVAL for an unknown version or a filesystem type name that is
  * neither "*" nor the mount's own type, ENOENT if no mount has that
  * fsid, and otherwise the result of VFS_SYSCTL().
  */
 static int
 sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
 {
 	struct vfsidctl vc;
 	int error;
 	struct mount *mp;
 
 	error = SYSCTL_IN(req, &vc, sizeof(vc));
 	if (error)
 		return (error);
 	if (vc.vc_vers != VFS_CTL_VERS1)
 		return (EINVAL);
 	/*
 	 * The type name was supplied by the caller and is not guaranteed
 	 * to be NUL-terminated; terminate it so the strcmp() calls below
 	 * cannot read past the end of the array.
 	 */
 	vc.vc_fstypename[sizeof(vc.vc_fstypename) - 1] = '\0';
 	mp = vfs_getvfs(&vc.vc_fsid);
 	if (mp == NULL)
 		return (ENOENT);
 	/* ensure that a specific sysctl goes to the right filesystem. */
 	if (strcmp(vc.vc_fstypename, "*") != 0 &&
 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
 		return (EINVAL);
 	}
 	VCTLTOREQ(&vc, req);
 	return (VFS_SYSCTL(mp, vc.vc_op, req));
 }
 
 SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR,
         NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid");
 
 /*
  * Function to initialize a va_filerev field sensibly.
  * XXX: Wouldn't a random number make a lot more sense ??
  */
 u_quad_t
 init_va_filerev(void)
 {
 	struct bintime bt;
 
 	getbinuptime(&bt);
 	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
 }
 
 /*
  * Handlers and filter-operation tables for knotes attached to vnodes.
  * vfs_kqfilter() picks one table per filter type (EVFILT_READ,
  * EVFILT_WRITE, EVFILT_VNODE).  None of the tables has an attach handler
  * (NULL); all share filt_vfsdetach() for detach.
  * NOTE(review): the leading 1 is presumably the "is a file descriptor"
  * member of struct filterops -- confirm against its definition.
  */
 static int	filt_vfsread(struct knote *kn, long hint);
 static int	filt_vfswrite(struct knote *kn, long hint);
 static int	filt_vfsvnode(struct knote *kn, long hint);
 static void	filt_vfsdetach(struct knote *kn);
 static struct filterops vfsread_filtops =
 	{ 1, NULL, filt_vfsdetach, filt_vfsread };
 static struct filterops vfswrite_filtops =
 	{ 1, NULL, filt_vfsdetach, filt_vfswrite };
 static struct filterops vfsvnode_filtops =
 	{ 1, NULL, filt_vfsdetach, filt_vfsvnode };
 
 /*
  * Lock callback taking a vnode as its opaque argument: acquire the vnode
  * lock exclusively on behalf of curthread, retrying (LK_RETRY).
  */
 static void
 vfs_knllock(void *arg)
 {
 	struct vnode *vp;
 
 	vp = arg;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
 }
 
 /*
  * Unlock callback paired with vfs_knllock(): release the vnode lock held
  * by curthread.
  */
 static void
 vfs_knlunlock(void *arg)
 {
 	struct vnode *vp;
 
 	vp = arg;
 	VOP_UNLOCK(vp, 0, curthread);
 }
 
 /*
  * Lock-state callback: returns 1 when VOP_ISLOCKED() reports that
  * curthread holds the vnode lock exclusively, 0 otherwise.
  */
 static int
 vfs_knllocked(void *arg)
 {
 	struct vnode *vp;
 	int status;
 
 	vp = arg;
 	status = VOP_ISLOCKED(vp, curthread);
 	return (status == LK_EXCLUSIVE);
 }
 
 /*
  * Generic kqfilter implementation for vnodes.
  *
  * Selects the filter operations matching the knote's filter type
  * (EVFILT_READ, EVFILT_WRITE or EVFILT_VNODE), records the vnode in
  * kn_hook, makes sure the vnode has poll information and links the knote
  * onto the vnode's note list.
  *
  * Returns 0 on success, EINVAL for any other filter type, and ENOMEM if
  * v_pollinfo is still NULL after v_addpollinfo().
  */
 int
 vfs_kqfilter(struct vop_kqfilter_args *ap)
 {
 	struct vnode *vp;
 	struct knote *kn;
 
 	vp = ap->a_vp;
 	kn = ap->a_kn;
 
 	if (kn->kn_filter == EVFILT_READ)
 		kn->kn_fop = &vfsread_filtops;
 	else if (kn->kn_filter == EVFILT_WRITE)
 		kn->kn_fop = &vfswrite_filtops;
 	else if (kn->kn_filter == EVFILT_VNODE)
 		kn->kn_fop = &vfsvnode_filtops;
 	else
 		return (EINVAL);
 
 	/* The filter handlers recover the vnode from kn_hook. */
 	kn->kn_hook = (caddr_t)vp;
 
 	if (vp->v_pollinfo == NULL) {
 		v_addpollinfo(vp);
 		if (vp->v_pollinfo == NULL)
 			return (ENOMEM);
 	}
 	knlist_add(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
 
 	return (0);
 }
 
 /*
  * Detach handler shared by the vnode filters: unlink the knote from the
  * note list of the vnode stored in kn_hook.  The vnode must still have
  * its v_pollinfo (asserted).
  */
 static void
 filt_vfsdetach(struct knote *kn)
 {
 	struct vnode *vp;
 
 	vp = (struct vnode *)kn->kn_hook;
 	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
 	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
 }
 
 /*ARGSUSED*/
 static int
 filt_vfsread(struct knote *kn, long hint)
 {
 	struct vnode *vp = (struct vnode *)kn->kn_hook;
 	struct vattr va;
 
 	/*
 	 * filesystem is gone, so set the EOF flag and schedule
 	 * the knote for deletion.
 	 */
 	if (hint == NOTE_REVOKE) {
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 		return (1);
 	}
 
 	if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread)) 
 		return (0);
 
 	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
 	return (kn->kn_data != 0);
 }
 
 /*
  * EVFILT_WRITE handler for vnodes: always active, with kn_data set to 0.
  * A NOTE_REVOKE hint (the filesystem is gone) additionally flags EOF and
  * makes the knote one-shot so it is scheduled for deletion.
  */
 /*ARGSUSED*/
 static int
 filt_vfswrite(struct knote *kn, long hint)
 {
 
 	kn->kn_data = 0;
 	if (hint == NOTE_REVOKE)
 		kn->kn_flags |= EV_EOF | EV_ONESHOT;
 	return (1);
 }
 
 /*
  * EVFILT_VNODE handler.
  *
  * When the hint shares any bit with the events the knote subscribed to
  * (kn_sfflags), the whole hint is recorded in kn_fflags.  NOTE_REVOKE
  * always flags EOF and reports the knote active; any other hint reports
  * it active only once some event has been recorded.
  */
 static int
 filt_vfsvnode(struct knote *kn, long hint)
 {
 
 	if ((kn->kn_sfflags & hint) != 0)
 		kn->kn_fflags |= hint;
 	if (hint != NOTE_REVOKE)
 		return (kn->kn_fflags != 0);
 	kn->kn_flags |= EV_EOF;
 	return (1);
 }
 
 /*
  * Copy one directory entry out to the caller's uio and, when the caller
  * asked for cookies (a_ncookies != NULL), append the entry's offset to
  * the cookie array and bump the cookie count.
  *
  * Returns ENAMETOOLONG when the record does not fit in the space left in
  * the uio, the uiomove() error when the copy fails, and 0 otherwise.  On
  * a copy failure any cookie array built so far is released, the caller's
  * array pointer is cleared and the count is reset to zero.
  */
 int
 vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
 {
 	int error;
 
 	if (dp->d_reclen > ap->a_uio->uio_resid)
 		return (ENAMETOOLONG);
 	error = uiomove(dp, dp->d_reclen, ap->a_uio);
 	if (error) {
 		if (ap->a_ncookies != NULL) {
 			/*
 			 * The array lives in *a_cookies (see the realloc()
 			 * below); a_cookies itself is only the address of
 			 * the caller's pointer and must not be freed.
 			 */
 			if (ap->a_cookies != NULL &&
 			    *ap->a_cookies != NULL) {
 				free(*ap->a_cookies, M_TEMP);
 				*ap->a_cookies = NULL;
 			}
 			*ap->a_ncookies = 0;
 		}
 		return (error);
 	}
 	if (ap->a_ncookies == NULL)
 		return (0);
 
 	KASSERT(ap->a_cookies,
 	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
 
 	/* Grow the array by one slot and record this entry's offset. */
 	*ap->a_cookies = realloc(*ap->a_cookies,
 	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
 	(*ap->a_cookies)[*ap->a_ncookies] = off;
 	/* Count the new cookie so the next entry lands in the next slot. */
 	*ap->a_ncookies += 1;
 	return (0);
 }
 
 /*
  * Mark for update the access time of the file if the filesystem
  * supports VA_MARK_ATIME.  This functionality is used by execve
  * and mmap, so we want to avoid the synchronous I/O implied by
  * directly setting va_atime for the sake of efficiency.
  */
 void
 vfs_mark_atime(struct vnode *vp, struct thread *td)
 {
 	struct vattr atimeattr;
 
 	if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) {
 		VATTR_NULL(&atimeattr);
 		atimeattr.va_vaflags |= VA_MARK_ATIME;
 		(void)VOP_SETATTR(vp, &atimeattr, td->td_ucred, td);
 	}
 }
Index: head/sys/sys/mount.h
===================================================================
--- head/sys/sys/mount.h	(revision 156202)
+++ head/sys/sys/mount.h	(revision 156203)
@@ -1,716 +1,712 @@
 /*-
  * Copyright (c) 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)mount.h	8.21 (Berkeley) 5/20/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_MOUNT_H_
 #define _SYS_MOUNT_H_
 
 #include <sys/ucred.h>
 #include <sys/queue.h>
 #ifdef _KERNEL
 #include <sys/lockmgr.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #endif
 
 typedef struct fsid { int32_t val[2]; } fsid_t;	/* filesystem id type */
 
 /*
  * File identifier.
  * These are unique per filesystem on a single machine.
  */
 #define	MAXFIDSZ	16
 
 struct fid {
 	u_short		fid_len;		/* length of data in bytes */
 	u_short		fid_reserved;		/* force longword alignment */
 	char		fid_data[MAXFIDSZ];	/* data (variable length) */
 };
 
 /*
  * filesystem statistics
  */
 #define	MFSNAMELEN	16		/* length of type name including null */
 #define	MNAMELEN	88		/* size of on/from name bufs */
 #define	STATFS_VERSION	0x20030518	/* current version number */
 struct statfs {
 	uint32_t f_version;		/* structure version number */
 	uint32_t f_type;		/* type of filesystem */
 	uint64_t f_flags;		/* copy of mount exported flags */
 	uint64_t f_bsize;		/* filesystem fragment size */
 	uint64_t f_iosize;		/* optimal transfer block size */
 	uint64_t f_blocks;		/* total data blocks in filesystem */
 	uint64_t f_bfree;		/* free blocks in filesystem */
 	int64_t	 f_bavail;		/* free blocks avail to non-superuser */
 	uint64_t f_files;		/* total file nodes in filesystem */
 	int64_t	 f_ffree;		/* free nodes avail to non-superuser */
 	uint64_t f_syncwrites;		/* count of sync writes since mount */
 	uint64_t f_asyncwrites;		/* count of async writes since mount */
 	uint64_t f_syncreads;		/* count of sync reads since mount */
 	uint64_t f_asyncreads;		/* count of async reads since mount */
 	uint64_t f_spare[10];		/* unused spare */
 	uint32_t f_namemax;		/* maximum filename length */
 	uid_t	  f_owner;		/* user that mounted the filesystem */
 	fsid_t	  f_fsid;		/* filesystem id */
 	char	  f_charspare[80];	    /* spare string space */
 	char	  f_fstypename[MFSNAMELEN]; /* filesystem type name */
 	char	  f_mntfromname[MNAMELEN];  /* mounted filesystem */
 	char	  f_mntonname[MNAMELEN];    /* directory on which mounted */
 };
 
 #ifdef _KERNEL
 #define	OMFSNAMELEN	16	/* length of fs type name, including null */
 #define	OMNAMELEN	(88 - 2 * sizeof(long))	/* size of on/from name bufs */
 
 /* XXX getfsstat.2 is out of date with write and read counter changes here. */
 /* XXX statfs.2 is out of date with read counter changes here. */
 struct ostatfs {
 	long	f_spare2;		/* placeholder */
 	long	f_bsize;		/* fundamental filesystem block size */
 	long	f_iosize;		/* optimal transfer block size */
 	long	f_blocks;		/* total data blocks in filesystem */
 	long	f_bfree;		/* free blocks in fs */
 	long	f_bavail;		/* free blocks avail to non-superuser */
 	long	f_files;		/* total file nodes in filesystem */
 	long	f_ffree;		/* free file nodes in fs */
 	fsid_t	f_fsid;			/* filesystem id */
 	uid_t	f_owner;		/* user that mounted the filesystem */
 	int	f_type;			/* type of filesystem */
 	int	f_flags;		/* copy of mount exported flags */
 	long	f_syncwrites;		/* count of sync writes since mount */
 	long	f_asyncwrites;		/* count of async writes since mount */
 	char	f_fstypename[OMFSNAMELEN]; /* fs type name */
 	char	f_mntonname[OMNAMELEN];	/* directory on which mounted */
 	long	f_syncreads;		/* count of sync reads since mount */
 	long	f_asyncreads;		/* count of async reads since mount */
 	short	f_spares1;		/* unused spare */
 	char	f_mntfromname[OMNAMELEN];/* mounted filesystem */
 	short	f_spares2;		/* unused spare */
 	/*
 	 * XXX on machines where longs are aligned to 8-byte boundaries, there
 	 * is an unnamed int32_t here.  This spare was after the apparent end
 	 * of the struct until we bit off the read counters from f_mntonname.
 	 */
 	long	f_spare[2];		/* unused spare */
 };
 
 #define	MMAXOPTIONLEN	65536		/* maximum length of a mount option */
 
 TAILQ_HEAD(vnodelst, vnode);
 
 struct vfsoptlist;
 struct vfsopt;
 
 /*
  * Structure per mounted filesystem.  Each mounted filesystem has an
  * array of operations and an instance record.  The filesystems are
  * put on a doubly linked list.
  *
  * Lock reference:
  * 	m - mountlist_mtx
  *	i - interlock
  *	l - mnt_lock
  *
  * Unmarked fields are considered stable as long as a ref is held.
  *
  */
 struct mount {
 	TAILQ_ENTRY(mount) mnt_list;		/* (m) mount list */
 	struct vfsops	*mnt_op;		/* operations on fs */
 	struct vfsconf	*mnt_vfc;		/* configuration info */
 	struct vnode	*mnt_vnodecovered;	/* vnode we mounted on */
 	struct vnode	*mnt_syncer;		/* syncer vnode */
 	struct lock	mnt_lock;		/* mount structure lock */
 	struct mtx	mnt_mtx;		/* mount structure interlock */
 	int		mnt_ref;		/* (i) Reference count */
 	struct vnodelst	mnt_nvnodelist;		/* (i) list of vnodes */
 	int		mnt_nvnodelistsize;	/* (i) # of vnodes */
 	int		mnt_writeopcount;	/* (i) write syscalls pending */
 	int		mnt_kern_flag;		/* (i) kernel only flags */
 	u_int		mnt_flag;		/* flags shared with user */
 	struct vfsoptlist *mnt_opt;		/* current mount options */
 	struct vfsoptlist *mnt_optnew;		/* new options passed to fs */
 	int		mnt_maxsymlinklen;	/* max size of short symlink */
 	struct statfs	mnt_stat;		/* cache of filesystem stats */
 	struct ucred	*mnt_cred;		/* credentials of mounter */
 	void *		mnt_data;		/* private data */
 	time_t		mnt_time;		/* last time written*/
 	int		mnt_iosize_max;		/* max size for clusters, etc */
 	struct netexport *mnt_export;		/* export list */
 	struct label	*mnt_mntlabel;		/* MAC label for the mount */
 	struct label	*mnt_fslabel;		/* MAC label for the fs */
 	u_int		mnt_hashseed;		/* Random seed for vfs_hash */
 	int		mnt_markercnt;		/* marker vnodes in use */
 	int		mnt_holdcnt;		/* hold count */
 	int		mnt_holdcntwaiters;	/* waits on hold count */
 };
 
 struct vnode *__mnt_vnode_next(struct vnode **mvp, struct mount *mp);
 struct vnode *__mnt_vnode_first(struct vnode **mvp, struct mount *mp);
 void          __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp);
 
 /*
  * Walk the vnodes of mount 'mp': 'vp' receives each vnode in turn and
  * the loop ends when the helper returns NULL.  'mvp' is handed to the
  * helpers by address; judging by __mnt_vnode_markerfree() it holds an
  * iteration marker -- confirm against the helpers' definitions.
  */
 #define	MNT_VNODE_FOREACH(vp, mp, mvp)					\
 	for ((vp) = __mnt_vnode_first(&(mvp), (mp));			\
 	    (vp) != NULL;						\
 	    (vp) = __mnt_vnode_next(&(mvp), (mp)))
 
 /* Release the marker of an abandoned walk; no locking is done here. */
 #define	MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp)			\
 	__mnt_vnode_markerfree(&(mvp), (mp))
 
 /* As above, but bracketed by MNT_ILOCK()/MNT_IUNLOCK(). */
 #define	MNT_VNODE_FOREACH_ABORT(mp, mvp) do {				\
 	MNT_ILOCK(mp);							\
 	MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);			\
 	MNT_IUNLOCK(mp);						\
 } while (0)
 
 /* Lock, unlock and name the mount interlock (mnt_mtx). */
 #define	MNT_ILOCK(mp)	mtx_lock(&(mp)->mnt_mtx)
 #define	MNT_IUNLOCK(mp)	mtx_unlock(&(mp)->mnt_mtx)
 #define	MNT_MTX(mp)	(&(mp)->mnt_mtx)
 
 /*
  * Take and drop a reference on a mount.  mnt_ref is annotated (i) in
  * struct mount, i.e. it is protected by the interlock; these macros do
  * no locking themselves.  Dropping the last reference wakes up anyone
  * sleeping on the mount's address.
  */
 #define	MNT_REF(mp)	(mp)->mnt_ref++
 #define	MNT_REL(mp) do {						\
 	if (--(mp)->mnt_ref == 0)					\
 		wakeup((mp));						\
 } while (0)
 
 #endif /* _KERNEL */
 
 /*
  * User specifiable flags.
  */
 #define	MNT_RDONLY	0x00000001	/* read only filesystem */
 #define	MNT_SYNCHRONOUS	0x00000002	/* filesystem written synchronously */
 #define	MNT_NOEXEC	0x00000004	/* can't exec from filesystem */
 #define	MNT_NOSUID	0x00000008	/* don't honor setuid bits on fs */
 #define	MNT_UNION	0x00000020	/* union with underlying filesystem */
 #define	MNT_ASYNC	0x00000040	/* filesystem written asynchronously */
 #define	MNT_SUIDDIR	0x00100000	/* special handling of SUID on dirs */
 #define	MNT_SOFTDEP	0x00200000	/* soft updates being done */
 #define	MNT_NOSYMFOLLOW	0x00400000	/* do not follow symlinks */
 #define	MNT_MULTILABEL	0x04000000	/* MAC support for individual objects */
 #define	MNT_ACLS	0x08000000	/* ACL support enabled */
 #define	MNT_NOATIME	0x10000000	/* disable update of file access time */
 #define	MNT_NOCLUSTERR	0x40000000	/* disable cluster read */
 #define	MNT_NOCLUSTERW	0x80000000	/* disable cluster write */
 
 /*
  * NFS export related mount flags.
  */
 #define	MNT_EXRDONLY	0x00000080	/* exported read only */
 #define	MNT_EXPORTED	0x00000100	/* filesystem is exported */
 #define	MNT_DEFEXPORTED	0x00000200	/* exported to the world */
 #define	MNT_EXPORTANON	0x00000400	/* use anon uid mapping for everyone */
 #define	MNT_EXKERB	0x00000800	/* exported with Kerberos uid mapping */
 #define	MNT_EXPUBLIC	0x20000000	/* public export (WebNFS) */
 
 /*
  * Flags set by internal operations,
  * but visible to the user.
  * XXX some of these are not quite right.. (I've never seen the root flag set)
  */
 #define	MNT_LOCAL	0x00001000	/* filesystem is stored locally */
 #define	MNT_QUOTA	0x00002000	/* quotas are enabled on filesystem */
 #define	MNT_ROOTFS	0x00004000	/* identifies the root filesystem */
 #define	MNT_USER	0x00008000	/* mounted by a user */
 #define	MNT_IGNORE	0x00800000	/* do not show entry in df */
 
 /*
  * Mask of flags that are visible to statfs().
  * XXX I think that this could now become (~(MNT_CMDFLAGS))
  * but the 'mount' program may need changing to handle this.
  */
 #define	MNT_VISFLAGMASK	(MNT_RDONLY	| MNT_SYNCHRONOUS | MNT_NOEXEC	| \
 			MNT_NOSUID	| MNT_UNION	| \
 			MNT_ASYNC	| MNT_EXRDONLY	| MNT_EXPORTED	| \
 			MNT_DEFEXPORTED	| MNT_EXPORTANON| MNT_EXKERB	| \
 			MNT_LOCAL	| MNT_USER	| MNT_QUOTA	| \
 			MNT_ROOTFS	| MNT_NOATIME	| MNT_NOCLUSTERR| \
 			MNT_NOCLUSTERW	| MNT_SUIDDIR	| MNT_SOFTDEP	| \
 			MNT_IGNORE	| MNT_EXPUBLIC	| MNT_NOSYMFOLLOW | \
 			MNT_MULTILABEL	| MNT_ACLS)
 
 /* Mask of flags that can be updated. */
 #define	MNT_UPDATEMASK (MNT_NOSUID	| MNT_NOEXEC	| \
 			MNT_SYNCHRONOUS	| MNT_UNION	| MNT_ASYNC	| \
 			MNT_NOATIME | \
 			MNT_NOSYMFOLLOW	| MNT_IGNORE	| \
 			MNT_NOCLUSTERR	| MNT_NOCLUSTERW | MNT_SUIDDIR	| \
 			MNT_ACLS	| MNT_USER)
 
 /*
  * External filesystem command modifier flags.
  * Unmount can use the MNT_FORCE flag.
  * XXX: These are not STATES and really should be somewhere else.
  * XXX: MNT_BYFSID collides with MNT_ACLS, but because MNT_ACLS is only used for
  *      mount(2) and MNT_BYFSID is only used for unmount(2) it's harmless.
  */
 #define	MNT_UPDATE	0x00010000	/* not a real mount, just an update */
 #define	MNT_DELEXPORT	0x00020000	/* delete export host lists */
 #define	MNT_RELOAD	0x00040000	/* reload filesystem data */
 #define	MNT_FORCE	0x00080000	/* force unmount or readonly change */
 #define	MNT_SNAPSHOT	0x01000000	/* snapshot the filesystem */
 #define	MNT_BYFSID	0x08000000	/* specify filesystem by ID. */
 #define MNT_CMDFLAGS   (MNT_UPDATE	| MNT_DELEXPORT	| MNT_RELOAD	| \
 			MNT_FORCE	| MNT_SNAPSHOT	| MNT_BYFSID)
 /*
  * Still available.
  */
 #define	MNT_SPARE_0x00000010	0x00000010
 #define	MNT_SPARE_0x02000000	0x02000000
 /*
  * Internal filesystem control flags stored in mnt_kern_flag.
  *
  * MNTK_UNMOUNT locks the mount entry so that name lookup cannot proceed
  * past the mount point.  This keeps the subtree stable during mounts
  * and unmounts.
  *
  * MNTK_UNMOUNTF permits filesystems to detect a forced unmount while
  * dounmount() is still waiting to lock the mountpoint. This allows
  * the filesystem to cancel operations that might otherwise deadlock
  * with the unmount attempt (used by NFS).
  */
 #define MNTK_UNMOUNTF	0x00000001	/* forced unmount in progress */
 #define MNTK_UNMOUNT	0x01000000	/* unmount in progress */
 #define	MNTK_MWAIT	0x02000000	/* waiting for unmount to finish */
 #define	MNTK_SUSPEND	0x08000000	/* request write suspension */
 #define	MNTK_SUSPENDED	0x10000000	/* write operations are suspended */
 #define	MNTK_MPSAFE	0x20000000	/* Filesystem is MPSAFE. */
 #define	MNTK_NOKNOTE	0x80000000	/* Don't send KNOTEs from VOP hooks */
 
 /*
  * Sysctl CTL_VFS definitions.
  *
  * Second level identifier specifies which filesystem. Second level
  * identifier VFS_VFSCONF returns information about all filesystems.
  * Second level identifier VFS_GENERIC is non-terminal.
  */
 #define	VFS_VFSCONF		0	/* get configured filesystems */
 #define	VFS_GENERIC		0	/* generic filesystem information */
 /*
  * Third level identifiers for VFS_GENERIC are given below; third
  * level identifiers for specific filesystems are given in their
  * mount specific header files.
  */
 #define VFS_MAXTYPENUM	1	/* int: highest defined filesystem type */
 #define VFS_CONF	2	/* struct: vfsconf for filesystem given
 				   as next argument */
 
 /*
  * Flags for various system call interfaces.
  *
  * waitfor flags to vfs_sync() and getfsstat()
  */
 #define MNT_WAIT	1	/* synchronously wait for I/O to complete */
 #define MNT_NOWAIT	2	/* start all I/O, but do not wait for it */
 #define MNT_LAZY	3	/* push data not written by filesystem syncer */
 
 /*
  * Generic file handle
  */
 struct fhandle {
 	fsid_t	fh_fsid;	/* Filesystem id of mount point */
 	struct	fid fh_fid;	/* Filesys specific id */
 };
 typedef struct fhandle	fhandle_t;
 
 /*
  * Export arguments for local filesystem mount calls.
  */
 struct export_args {
 	int	ex_flags;		/* export related flags */
 	uid_t	ex_root;		/* mapping for root uid */
 	struct	xucred ex_anon;		/* mapping for anonymous user */
 	struct	sockaddr *ex_addr;	/* net address to which exported */
 	u_char	ex_addrlen;		/* and the net address length */
 	struct	sockaddr *ex_mask;	/* mask of valid bits in saddr */
 	u_char	ex_masklen;		/* and the smask length */
 	char	*ex_indexfile;		/* index file for WebNFS URLs */
 };
 
 /*
  * Structure holding information for a publicly exported filesystem
  * (WebNFS). Currently the specs allow just for one such filesystem.
  */
 struct nfs_public {
 	int		np_valid;	/* Do we hold valid information */
 	fhandle_t	np_handle;	/* Filehandle for pub fs (internal) */
 	struct mount	*np_mount;	/* Mountpoint of exported fs */
 	char		*np_index;	/* Index file */
 };
 
 /*
  * Filesystem configuration information. One of these exists for each
  * type of filesystem supported by the kernel. These are searched at
  * mount time to identify the requested filesystem.
  *
  * XXX: Never change the first two arguments!
  */
 struct vfsconf {
 	u_int	vfc_version;		/* ABI version number */
 	char	vfc_name[MFSNAMELEN];	/* filesystem type name */
 	struct	vfsops *vfc_vfsops;	/* filesystem operations vector */
 	int	vfc_typenum;		/* historic filesystem type number */
 	int	vfc_refcount;		/* number mounted of this type */
 	int	vfc_flags;		/* permanent flags */
 	struct	vfsoptdecl *vfc_opts;	/* mount options */
 	TAILQ_ENTRY(vfsconf) vfc_list;	/* list of vfsconfs */
 };
 
 /* Userland version of the struct vfsconf. */
 struct xvfsconf {
 	struct	vfsops *vfc_vfsops;	/* filesystem operations vector */
 	char	vfc_name[MFSNAMELEN];	/* filesystem type name */
 	int	vfc_typenum;		/* historic filesystem type number */
 	int	vfc_refcount;		/* number mounted of this type */
 	int	vfc_flags;		/* permanent flags */
 	struct	vfsconf *vfc_next;	/* next in list */
 };
 
 /*
  * Older layout of the filesystem configuration record, with a 32-byte
  * type name and an untyped operations pointer.  Only declared when
  * BURN_BRIDGES is not defined; presumably kept for compatibility with
  * old binaries -- confirm against its users.
  */
 #ifndef BURN_BRIDGES
 struct ovfsconf {
 	void	*vfc_vfsops;
 	char	vfc_name[32];
 	int	vfc_index;
 	int	vfc_refcount;
 	int	vfc_flags;
 };
 #endif
 
 /*
  * NB: these flags refer to IMPLEMENTATION properties, not properties of
  * any actual mounts; i.e., it does not make sense to change the flags.
  */
 #define	VFCF_STATIC	0x00010000	/* statically compiled into kernel */
 #define	VFCF_NETWORK	0x00020000	/* may get data over the network */
 #define	VFCF_READONLY	0x00040000	/* writes are not implemented */
 #define VFCF_SYNTHETIC	0x00080000	/* data does not represent real files */
 #define	VFCF_LOOPBACK	0x00100000	/* aliases some other mounted FS */
 #define	VFCF_UNICODE	0x00200000	/* stores file names as Unicode*/
 
 /* Operation code carried in vfsidctl.vc_op and passed to vfs_sysctl. */
 typedef uint32_t fsctlop_t;
 
 /*
  * Argument block for per-filesystem sysctl operations addressed by fsid.
  * VCTLTOREQ() installs vc_ptr/vc_len as the new-value buffer of a sysctl
  * request.
  */
 struct vfsidctl {
 	int		vc_vers;	/* should be VFS_CTL_VERS1 (below) */
 	fsid_t		vc_fsid;	/* fsid to operate on. */
 	char		vc_fstypename[MFSNAMELEN];
 					/* type of fs 'nfs' or '*' */
 	fsctlop_t	vc_op;		/* operation VFS_CTL_* (below) */
 	void		*vc_ptr;	/* pointer to data structure. */
 	size_t		vc_len;		/* sizeof said structure. */
 	u_int32_t	vc_spare[12];	/* spare (must be zero). */
 };
 
 /* vfsidctl API version. */
 #define VFS_CTL_VERS1	0x01
 
 /*
  * New style VFS sysctls, do not reuse/conflict with the namespace for
  * private sysctls.
  * All "global" sysctl ops have bit 16 (0x00010000) set:
  * 0x...1....
  * Private sysctl ops should have that bit unset.
  */
 #define VFS_CTL_QUERY	0x00010001	/* anything wrong? (vfsquery) */
 #define VFS_CTL_TIMEO	0x00010002	/* set timeout for vfs notification */
 #define VFS_CTL_NOLOCKS	0x00010003	/* disable file locking */
 
 /* Result record for the VFS_CTL_QUERY ("anything wrong?") operation. */
 struct vfsquery {
 	u_int32_t	vq_flags;	/* VQ_* flags (below) */
 	u_int32_t	vq_spare[31];	/* spare */
 };
 
 /* vfsquery flags */
 #define VQ_NOTRESP	0x0001	/* server down */
 #define VQ_NEEDAUTH	0x0002	/* server bad auth */
 #define VQ_LOWDISK	0x0004	/* we're low on space */
 #define VQ_MOUNT	0x0008	/* new filesystem arrived */
 #define VQ_UNMOUNT	0x0010	/* filesystem has left */
 #define VQ_DEAD		0x0020	/* filesystem is dead, needs force unmount */
 #define VQ_ASSIST	0x0040	/* filesystem needs assistance from external
 				   program */
 #define VQ_NOTRESPLOCK	0x0080	/* server lockd down */
 #define VQ_FLAG0100	0x0100	/* placeholder */
 #define VQ_FLAG0200	0x0200	/* placeholder */
 #define VQ_FLAG0400	0x0400	/* placeholder */
 #define VQ_FLAG0800	0x0800	/* placeholder */
 #define VQ_FLAG1000	0x1000	/* placeholder */
 #define VQ_FLAG2000	0x2000	/* placeholder */
 #define VQ_FLAG4000	0x4000	/* placeholder */
 #define VQ_FLAG8000	0x8000	/* placeholder */
 
 #ifdef _KERNEL
 /*
  * Point a sysctl request at a vfsidctl's data: the request's new-value
  * buffer becomes vc_ptr/vc_len and its read index restarts at zero.
  */
 #define	VCTLTOREQ(vc, req) do {						\
 	(req)->newidx = 0;						\
 	(req)->newlen = (vc)->vc_len;					\
 	(req)->newptr = (vc)->vc_ptr;					\
 } while (0)
 #endif
 
 struct iovec;
 struct uio;
 
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_MOUNT);
 #endif
 extern int maxvfsconf;		/* highest defined filesystem type */
 extern int nfs_mount_type;	/* vfc_typenum for nfs, or -1 */
 
 TAILQ_HEAD(vfsconfhead, vfsconf);
 extern struct vfsconfhead vfsconf;
 
 /*
  * Operations supported on mounted filesystem.
  */
 struct mount_args;
 struct nameidata;
 struct sysctl_req;
 struct mntarg;
 
 typedef int vfs_cmount_t(struct mntarg *ma, void *data, int flags, struct thread *td);
 typedef int vfs_unmount_t(struct mount *mp, int mntflags, struct thread *td);
 typedef int vfs_root_t(struct mount *mp, int flags, struct vnode **vpp,
 		    struct thread *td);
 typedef	int vfs_quotactl_t(struct mount *mp, int cmds, uid_t uid,
 		    void *arg, struct thread *td);
 typedef	int vfs_statfs_t(struct mount *mp, struct statfs *sbp,
 		    struct thread *td);
 typedef	int vfs_sync_t(struct mount *mp, int waitfor, struct thread *td);
 typedef	int vfs_vget_t(struct mount *mp, ino_t ino, int flags,
 		    struct vnode **vpp);
 typedef	int vfs_fhtovp_t(struct mount *mp, struct fid *fhp, struct vnode **vpp);
 typedef	int vfs_checkexp_t(struct mount *mp, struct sockaddr *nam,
 		    int *extflagsp, struct ucred **credanonp);
 typedef	int vfs_vptofh_t(struct vnode *vp, struct fid *fhp);
 typedef	int vfs_init_t(struct vfsconf *);
 typedef	int vfs_uninit_t(struct vfsconf *);
 typedef	int vfs_extattrctl_t(struct mount *mp, int cmd,
 		    struct vnode *filename_vp, int attrnamespace,
 		    const char *attrname, struct thread *td);
 typedef	int vfs_mount_t(struct mount *mp, struct thread *td);
 typedef int vfs_sysctl_t(struct mount *mp, fsctlop_t op,
 		    struct sysctl_req *req);
 
 /*
  * Operations vector of a filesystem type: one function pointer per
  * vfs_*_t typedef above.  A mount reaches it through mnt_op, normally by
  * way of the VFS_*() macros below.
  */
 struct vfsops {
 	vfs_mount_t		*vfs_mount;
 	vfs_cmount_t		*vfs_cmount;
 	vfs_unmount_t		*vfs_unmount;
 	vfs_root_t		*vfs_root;
 	vfs_quotactl_t		*vfs_quotactl;
 	vfs_statfs_t		*vfs_statfs;
 	vfs_sync_t		*vfs_sync;
 	vfs_vget_t		*vfs_vget;
 	vfs_fhtovp_t		*vfs_fhtovp;
 	vfs_checkexp_t		*vfs_checkexp;
 	vfs_vptofh_t		*vfs_vptofh;
 	vfs_init_t		*vfs_init;
 	vfs_uninit_t		*vfs_uninit;
 	vfs_extattrctl_t	*vfs_extattrctl;
 	vfs_sysctl_t		*vfs_sysctl;
 };
 
 vfs_statfs_t	__vfs_statfs;
 
 /*
  * Dispatch macros: each calls the matching entry of the mount's
  * operations vector (mnt_op).  Two differ from that pattern:
  * VFS_STATFS() calls the function __vfs_statfs() rather than indexing
  * the vector, and VFS_VPTOFH() takes a vnode and finds the vector
  * through its v_mount.  The macros expand MP (or VP) more than once, so
  * the argument should be free of side effects.
  */
 #define VFS_MOUNT(MP, P)    (*(MP)->mnt_op->vfs_mount)(MP, P)
 #define VFS_UNMOUNT(MP, FORCE, P) (*(MP)->mnt_op->vfs_unmount)(MP, FORCE, P)
 #define VFS_ROOT(MP, FLAGS, VPP, P) \
 	(*(MP)->mnt_op->vfs_root)(MP, FLAGS, VPP, P)
 #define VFS_QUOTACTL(MP,C,U,A,P)  (*(MP)->mnt_op->vfs_quotactl)(MP, C, U, A, P)
 #define VFS_STATFS(MP, SBP, P)	  __vfs_statfs((MP), (SBP), (P))
 #define VFS_SYNC(MP, WAIT, P)  (*(MP)->mnt_op->vfs_sync)(MP, WAIT, P)
 #define VFS_VGET(MP, INO, FLAGS, VPP) \
 	(*(MP)->mnt_op->vfs_vget)(MP, INO, FLAGS, VPP)
 #define VFS_FHTOVP(MP, FIDP, VPP) \
 	(*(MP)->mnt_op->vfs_fhtovp)(MP, FIDP, VPP)
 #define	VFS_VPTOFH(VP, FIDP)	  (*(VP)->v_mount->mnt_op->vfs_vptofh)(VP, FIDP)
 #define VFS_CHECKEXP(MP, NAM, EXFLG, CRED) \
 	(*(MP)->mnt_op->vfs_checkexp)(MP, NAM, EXFLG, CRED)
 #define VFS_EXTATTRCTL(MP, C, FN, NS, N, P) \
 	(*(MP)->mnt_op->vfs_extattrctl)(MP, C, FN, NS, N, P)
 #define VFS_SYSCTL(MP, OP, REQ) \
 	(*(MP)->mnt_op->vfs_sysctl)(MP, OP, REQ)
 
 extern int mpsafe_vfs;
 
 /*
  * True when Giant is required for mount MP: always when mpsafe_vfs is
  * off, otherwise only for a non-NULL mount lacking MNTK_MPSAFE.
  */
 #define	VFS_NEEDSGIANT(MP)						\
     (!mpsafe_vfs || ((MP) != NULL && !((MP)->mnt_kern_flag & MNTK_MPSAFE)))
 
 /*
  * Acquire Giant if MP needs it.  Evaluates to 1 when Giant was taken and
  * 0 otherwise; hand the value to VFS_UNLOCK_GIANT() afterwards.
  */
 #define	VFS_LOCK_GIANT(MP) __extension__				\
 ({									\
 	int _locked = VFS_NEEDSGIANT((MP)) ? 1 : 0;			\
 	if (_locked)							\
 		mtx_lock(&Giant);					\
 	_locked;							\
 })
 /*
  * Release Giant if 'locked' (the value returned by VFS_LOCK_GIANT()) is
  * non-zero.  Wrapped in do { } while (0) so the macro behaves as a
  * single statement and can sit in an unbraced if/else.
  */
 #define	VFS_UNLOCK_GIANT(locked) do					\
 {									\
 	if ((locked))							\
 		mtx_unlock(&Giant);					\
 } while (0)
 /* Assert that Giant is owned whenever mount MP requires it. */
 #define	VFS_ASSERT_GIANT(MP) do 					\
 {									\
 	if (VFS_NEEDSGIANT((MP)))					\
 		mtx_assert(&Giant, MA_OWNED);				\
 } while (0)
 
 /*
  * Post 'hint' on vnode 'vp' unless the vnode carries VV_NOKNOTE.  The
  * two variants differ only in the last argument given to VN_KNOTE():
  * 1 for _LOCKED, 0 for _UNLOCKED.
  */
 #define	VFS_KNOTE_LOCKED(vp, hint) do {					\
 	if (!((vp)->v_vflag & VV_NOKNOTE))				\
 		VN_KNOTE((vp), (hint), 1);				\
 } while (0)
 
 #define	VFS_KNOTE_UNLOCKED(vp, hint) do {				\
 	if (!((vp)->v_vflag & VV_NOKNOTE))				\
 		VN_KNOTE((vp), (hint), 0);				\
 } while (0)
 
 #include <sys/module.h>
 
 /*
  * Version numbers.
  */
 #define VFS_VERSION_00	0x19660120
 #define VFS_VERSION	VFS_VERSION_00
 
 /*
  * Declare a filesystem type as a kernel module.
  *
  * Expands to a static struct vfsconf named <fsname>_vfsconf (version
  * VFS_VERSION, name taken from the stringified fsname, type number -1,
  * the given ops vector and flags), a static moduledata_t named
  * <fsname>_mod whose event handler is vfs_modevent() with the vfsconf as
  * its argument, and a DECLARE_MODULE() at SI_SUB_VFS / SI_ORDER_MIDDLE.
  * No trailing semicolon is supplied; the user of the macro provides it.
  */
 #define VFS_SET(vfsops, fsname, flags) \
 	static struct vfsconf fsname ## _vfsconf = {		\
 		.vfc_version = VFS_VERSION,			\
 		.vfc_name = #fsname,				\
 		.vfc_vfsops = &vfsops,				\
 		.vfc_typenum = -1,				\
 		.vfc_flags = flags,				\
 	};							\
 	static moduledata_t fsname ## _mod = {			\
 		#fsname,					\
 		vfs_modevent,					\
 		& fsname ## _vfsconf				\
 	};							\
 	DECLARE_MODULE(fsname, fsname ## _mod, SI_SUB_VFS, SI_ORDER_MIDDLE)
 
 extern	char *mountrootfsname;
 
 /*
  * exported vnode operations
  */
 
 int	dounmount(struct mount *, int, struct thread *);
 
 int	kernel_mount(struct mntarg *ma, int flags);
 int	kernel_vmount(int flags, ...);
 struct mntarg *mount_arg(struct mntarg *ma, const char *name, const void *val, int len);
 struct mntarg *mount_argb(struct mntarg *ma, int flag, const char *name);
 struct mntarg *mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...);
 struct mntarg *mount_argsu(struct mntarg *ma, const char *name, const void *val, int len);
 struct vfsconf *vfs_byname(const char *);
 struct vfsconf *vfs_byname_kld(const char *, struct thread *td, int *);
 void	vfs_event_signal(fsid_t *, u_int32_t, intptr_t);
 int	vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val);
 int	vfs_getopt(struct vfsoptlist *, const char *, void **, int *);
 char	*vfs_getopts(struct vfsoptlist *, const char *, int *error);
 int	vfs_copyopt(struct vfsoptlist *, const char *, void *, int);
 int	vfs_filteropt(struct vfsoptlist *, const char **legal);
 int	vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...);
 int	vfs_setpublicfs			    /* set publicly exported fs */
 	    (struct mount *, struct netexport *, struct export_args *);
 void	vfs_msync(struct mount *, int);
 int	vfs_busy(struct mount *, int, struct mtx *, struct thread *);
 int	vfs_export			 /* process mount export info */
 	    (struct mount *, struct export_args *);
 int	vfs_allocate_syncvnode(struct mount *);
 void	vfs_getnewfsid(struct mount *);
 struct cdev *vfs_getrootfsid(struct mount *);
 struct	mount *vfs_getvfs(fsid_t *);      /* return vfs given fsid */
 int	vfs_modevent(module_t, int, void *);
 void	vfs_mount_error(struct mount *, const char *, ...);
 void	vfs_mountroot(void);			/* mount our root filesystem */
 void	vfs_mountedfrom(struct mount *, const char *from);
 void	vfs_ref(struct mount *);
 void	vfs_rel(struct mount *);
 int	vfs_suser(struct mount *, struct thread *);
 void	vfs_unbusy(struct mount *, struct thread *);
 void	vfs_unmountall(void);
 extern	TAILQ_HEAD(mntlist, mount) mountlist;	/* mounted filesystem list */
 extern	struct mtx mountlist_mtx;
 extern	struct nfs_public nfs_pub;
 
 /*
  * Declarations for these vfs default operations are located in
  * kern/vfs_default.c, they should be used instead of making "dummy"
  * functions or casting entries in the VFS op table to "enopnotsupp()".
  */
 vfs_root_t		vfs_stdroot;
 vfs_quotactl_t		vfs_stdquotactl;
 vfs_statfs_t		vfs_stdstatfs;
 vfs_sync_t		vfs_stdsync;
 vfs_sync_t		vfs_stdnosync;
 vfs_vget_t		vfs_stdvget;
 vfs_fhtovp_t		vfs_stdfhtovp;
 vfs_checkexp_t		vfs_stdcheckexp;
 vfs_vptofh_t		vfs_stdvptofh;
 vfs_init_t		vfs_stdinit;
 vfs_uninit_t		vfs_stduninit;
 vfs_extattrctl_t	vfs_stdextattrctl;
 vfs_sysctl_t		vfs_stdsysctl;
 
-/* XXX - these should be indirect functions!!! */
-int	softdep_fsync(struct vnode *);
-int	softdep_process_worklist(struct mount *);
-
 #else /* !_KERNEL */
 
 #include <sys/cdefs.h>
 
 struct stat;
 
 __BEGIN_DECLS
 int	fhopen(const struct fhandle *, int);
 int	fhstat(const struct fhandle *, struct stat *);
 int	fhstatfs(const struct fhandle *, struct statfs *);
 int	fstatfs(int, struct statfs *);
 int	getfh(const char *, fhandle_t *);
 int	getfsstat(struct statfs *, long, int);
 int	getmntinfo(struct statfs **, int);
 int	lgetfh(const char *, fhandle_t *);
 int	mount(const char *, const char *, int, void *);
 int	nmount(struct iovec *, unsigned int, int);
 int	statfs(const char *, struct statfs *);
 int	unmount(const char *, int);
 
 /* C library stuff */
 int	getvfsbyname(const char *, struct xvfsconf *);
 __END_DECLS
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_MOUNT_H_ */
Index: head/sys/sys/vnode.h
===================================================================
--- head/sys/sys/vnode.h	(revision 156202)
+++ head/sys/sys/vnode.h	(revision 156203)
@@ -1,732 +1,731 @@
 /*-
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vnode.h	8.7 (Berkeley) 2/4/94
  * $FreeBSD$
  */
 
 #ifndef _SYS_VNODE_H_
 #define	_SYS_VNODE_H_
 
 /*
  * XXX - compatibility until lockmgr() goes away or all the #includes are
  * updated.
  */
 #include <sys/lockmgr.h>
 
 #include <sys/bufobj.h>
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/lock.h>
 #include <sys/_mutex.h>
 #include <sys/mutex.h>
 #include <sys/selinfo.h>
 #include <sys/uio.h>
 #include <sys/acl.h>
 #include <sys/ktr.h>
 
 /*
  * The vnode is the focus of all file activity in UNIX.  There is a
  * unique vnode allocated for each active file, each current directory,
  * each mounted-on file, text file, and the root.
  */
 
 /*
  * Vnode types.  VNON means no type.
  */
 enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
 		  VMARKER };
 
 /*
  * Each underlying filesystem allocates its own private area and hangs
  * it from v_data.  If non-null, this area is freed in getnewvnode().
  */
 
 struct namecache;
 
 /*
  * Poll/select state attached to a vnode through its v_pollinfo pointer.
  * vpi_lock protects the members declared after it.
  */
 struct vpollinfo {
 	struct	mtx vpi_lock;		/* lock to protect below */
 	struct	selinfo vpi_selinfo;	/* identity of poller(s) */
 	short	vpi_events;		/* what they are looking for */
 	short	vpi_revents;		/* what has happened */
 };
 
 /*
  * Reading or writing any of these items requires holding the appropriate lock.
  *
  * Lock reference:
  *	c - namecache mutex
  *	f - freelist mutex
  *	G - Giant
  *	i - interlock
  *	m - mntvnodes mutex
  *	p - pollinfo lock
  *	s - spechash mutex
  *	S - syncer mutex
  *	u - Only a reference to the vnode is needed to read.
  *	v - vnode lock
  *
  * Vnodes may be found on many lists.  The general way to deal with operating
  * on a vnode that is on a list is:
  *	1) Lock the list and find the vnode.
  *	2) Lock interlock so that the vnode does not go away.
  *	3) Unlock the list to avoid lock order reversals.
  *	4) vget with LK_INTERLOCK and check for ENOENT, or
  *	5) Check for DOOMED if the vnode lock is not required.
  *	6) Perform your operation, then vput().
  *
  * XXX Not all fields are locked yet and some fields that are marked are not
  * locked consistently.  This is a work in progress.  Requires Giant!
  */
 
 #if defined(_KERNEL) || defined(_KVM_VNODE)
 
 /*
  * The in-kernel vnode.
  *
  * The single letter that opens each member comment is the key of the lock
  * protecting that member, as listed in the "Lock reference" table above
  * (for example "i" is the interlock and "v" is the vnode lock).  Members
  * whose comment carries no key have no locking documented here.
  */
 struct vnode {
 	/*
 	 * Fields which define the identity of the vnode.  These fields are
 	 * owned by the filesystem (XXX: and vgone() ?)
 	 */
 	enum	vtype v_type;			/* u vnode type */
 	const char *v_tag;			/* u type of underlying data */
 	struct	vop_vector *v_op;		/* u vnode operations vector */
 	void	*v_data;			/* u private data for fs */
 
 	/*
 	 * Filesystem instance stuff
 	 */
 	struct	mount *v_mount;			/* u ptr to vfs we are in */
 	TAILQ_ENTRY(vnode) v_nmntvnodes;	/* m vnodes for mount point */
 
 	/*
 	 * Type specific fields, only one applies to any given vnode.
 	 * See #defines below for renaming to v_* namespace.
 	 */
 	union {
 		struct mount	*vu_mount;	/* v ptr to mountpoint (VDIR) */
 		struct socket	*vu_socket;	/* v unix domain net (VSOCK) */
 		struct cdev	*vu_cdev; 	/* v device (VCHR, VBLK) */
 		struct fifoinfo	*vu_fifoinfo;	/* v fifo (VFIFO) */
 	} v_un;
 
 	/*
 	 * vfs_hash:  (mount + inode) -> vnode hash.
 	 */
 	LIST_ENTRY(vnode)	v_hashlist;
 	u_int			v_hash;
 
 	/*
 	 * VFS_namecache stuff
 	 */
 	LIST_HEAD(, namecache) v_cache_src;	/* c Cache entries from us */
 	TAILQ_HEAD(, namecache) v_cache_dst;	/* c Cache entries to us */
 	struct	vnode *v_dd;			/* c .. vnode */
 
 	/*
 	 * clustering stuff
 	 */
 	daddr_t	v_cstart;			/* v start block of cluster */
 	daddr_t	v_lasta;			/* v last allocation  */
 	daddr_t	v_lastw;			/* v last write  */
 	int	v_clen;				/* v length of cur. cluster */
 
 	/*
 	 * Locking
 	 */
 	struct	lock v_lock;			/* u (if fs don't have one) */
 	struct	mtx v_interlock;		/* lock for "i" things */
 	struct	lock *v_vnlock;			/* u pointer to vnode lock */
 	int	v_holdcnt;			/* i prevents recycling. */
 	int	v_usecount;			/* i ref count of users */
 	u_long	v_iflag;			/* i vnode flags (see below) */
 	u_long	v_vflag;			/* v vnode flags */
 	int	v_writecount;			/* v ref count of writers */
 
 	/*
 	 * The machinery of being a vnode
 	 *
 	 * NOTE(review): the "*" key on v_bufobj is not listed in the lock
 	 * reference table; presumably the bufobj carries its own locking --
 	 * TODO confirm.
 	 */
 	TAILQ_ENTRY(vnode) v_freelist;		/* f vnode freelist */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */
 
 	/*
 	 * Hooks for various subsystems and features.
 	 */
 	struct vpollinfo *v_pollinfo;		/* G Poll events, p for *v_pi */
 	struct label *v_label;			/* MAC label for vnode */
 };
 
 #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */
 
 #define	v_mountedhere	v_un.vu_mount
 #define	v_socket	v_un.vu_socket
 #define	v_rdev		v_un.vu_cdev
 #define	v_fifoinfo	v_un.vu_fifoinfo
 
 /* XXX: These are temporary to avoid a source sweep at this time */
 #define v_object	v_bufobj.bo_object
 
 /*
  * Userland version of struct vnode, for sysctl.
  *
  * Kernel pointers are carried as opaque void * addresses.  xv_size holds
  * sizeof(struct xvnode).  Which member of xv_un applies depends on
  * xv_type, as noted on each member.
  */
 struct xvnode {
 	size_t	xv_size;			/* sizeof(struct xvnode) */
 	void	*xv_vnode;			/* address of real vnode */
 	u_long	xv_flag;			/* vnode vflags */
 	int	xv_usecount;			/* reference count of users */
 	int	xv_writecount;			/* reference count of writers */
 	int	xv_holdcnt;			/* page & buffer references */
 	u_long	xv_id;				/* capability identifier */
 	void	*xv_mount;			/* address of parent mount */
 	long	xv_numoutput;			/* num of writes in progress */
 	enum	vtype xv_type;			/* vnode type */
 	union {
 		void	*xvu_socket;		/* socket, if VSOCK */
 		void	*xvu_fifo;		/* fifo, if VFIFO */
 		dev_t	xvu_rdev;		/* maj/min, if VBLK/VCHR */
 		struct {
 			dev_t	xvu_dev;	/* device, if VDIR/VREG/VLNK */
 			ino_t	xvu_ino;	/* id, if VDIR/VREG/VLNK */
 		} xv_uns;
 	} xv_un;
 };
 #define xv_socket	xv_un.xvu_socket
 #define xv_fifo		xv_un.xvu_fifo
 #define xv_rdev		xv_un.xvu_rdev
 #define xv_dev		xv_un.xv_uns.xvu_dev
 #define xv_ino		xv_un.xv_uns.xvu_ino
 
 /*
  * Knote helpers for vnodes.
  *
  * VN_KNLIST_EMPTY(vp) is true when the vnode has no pollinfo or when its
  * knote list is empty.  We don't need to lock the knlist for this test.
  *
  * VN_KNOTE(vp, b, a) hands (b) and (a) to KNOTE() on the vnode's knote
  * list, but only when that list is not empty.  VN_KNOTE_LOCKED and
  * VN_KNOTE_UNLOCKED pass 1 and 0 respectively as the last KNOTE() argument.
  *
  * vp is expanded more than once, so it must not have side effects.  It is
  * parenthesized at every use so that any pointer expression (e.g. *vpp or
  * &vnode) may be passed.
  */
 #define	VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL ||	\
 	    KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note))
 
 #define VN_KNOTE(vp, b, a)					\
 	do {							\
 		if (!VN_KNLIST_EMPTY(vp))			\
 			KNOTE(&(vp)->v_pollinfo->vpi_selinfo.si_note,	\
 			    (b), (a));				\
 	} while (0)
 #define	VN_KNOTE_LOCKED(vp, b)		VN_KNOTE((vp), (b), 1)
 #define	VN_KNOTE_UNLOCKED(vp, b)	VN_KNOTE((vp), (b), 0)
 
 /*
  * Vnode flags.
  *	VI flags are protected by interlock and live in v_iflag
  *	VV flags are protected by the vnode lock and live in v_vflag
  */
 #define	VI_MOUNT	0x0020	/* Mount in progress */
 #define	VI_AGE		0x0040	/* Insert vnode at head of free list */
 #define	VI_DOOMED	0x0080	/* This vnode is being recycled */
 #define	VI_FREE		0x0100	/* This vnode is on the freelist */
 #define	VI_OBJDIRTY	0x0400	/* object might be dirty */
 #define	VI_DOINGINACT	0x0800	/* VOP_INACTIVE is in progress */
 #define	VI_OWEINACT	0x1000	/* Need to call inactive */
 
 #define	VV_ROOT		0x0001	/* root of its filesystem */
 #define	VV_ISTTY	0x0002	/* vnode represents a tty */
 #define	VV_NOSYNC	0x0004	/* unlinked, stop syncing */
 #define	VV_CACHEDLABEL	0x0010	/* Vnode has valid cached MAC label */
 #define	VV_TEXT		0x0020	/* vnode is a pure text prototype */
 #define	VV_COPYONWRITE	0x0040	/* vnode is doing copy-on-write */
 #define	VV_SYSTEM	0x0080	/* vnode being used by kernel */
 #define	VV_PROCDEP	0x0100	/* vnode is process dependent */
 #define	VV_NOKNOTE	0x0200	/* don't activate knotes on this vnode */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
  * is unavailable (getattr) or which is not to be changed (setattr).
  * VNOVAL is defined later in this header; va_vaflags takes the VA_* flags
  * defined immediately after this structure.
  */
 struct vattr {
 	enum vtype	va_type;	/* vnode type (for create) */
 	u_short		va_mode;	/* files access mode and type */
 	short		va_nlink;	/* number of references to file */
 	uid_t		va_uid;		/* owner user id */
 	gid_t		va_gid;		/* owner group id */
 	dev_t		va_fsid;	/* filesystem id */
 	long		va_fileid;	/* file id */
 	u_quad_t	va_size;	/* file size in bytes */
 	long		va_blocksize;	/* blocksize preferred for i/o */
 	struct timespec	va_atime;	/* time of last access */
 	struct timespec	va_mtime;	/* time of last modification */
 	struct timespec	va_ctime;	/* time file changed */
 	struct timespec	va_birthtime;	/* time file created */
 	u_long		va_gen;		/* generation number of file */
 	u_long		va_flags;	/* flags defined for file */
 	dev_t		va_rdev;	/* device the special file represents */
 	u_quad_t	va_bytes;	/* bytes of disk space held by file */
 	u_quad_t	va_filerev;	/* file modification number */
 	u_int		va_vaflags;	/* operations flags, see below */
 	long		va_spare;	/* remain quad aligned */
 };
 
 /*
  * Flags for va_vaflags.
  */
 #define	VA_UTIMES_NULL	0x01		/* utimes argument was NULL */
 #define	VA_EXCLUSIVE	0x02		/* exclusive create request */
 #define	VA_MARK_ATIME	0x04		/* setting atime for execve/mmap */
 
 /*
  * Flags for ioflag. (high 16 bits used to ask for read-ahead and
  * help with write clustering)
  * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h
  *
  * The IO_* flag bits below occupy the low 16 bits.  IO_SEQSHIFT is the
  * shift that places the sequential-access heuristic value in the upper
  * 16 bits, and IO_SEQMAX is that value's maximum.
  */
 #define	IO_UNIT		0x0001		/* do I/O as atomic unit */
 #define	IO_APPEND	0x0002		/* append write to end */
 #define	IO_NDELAY	0x0004		/* FNDELAY flag set in file table */
 #define	IO_NODELOCKED	0x0008		/* underlying node already locked */
 #define	IO_ASYNC	0x0010		/* bawrite rather than bdwrite */
 #define	IO_VMIO		0x0020		/* data already in VMIO space */
 #define	IO_INVAL	0x0040		/* invalidate after I/O */
 #define	IO_SYNC		0x0080		/* do I/O synchronously */
 #define	IO_DIRECT	0x0100		/* attempt to bypass buffer cache */
 #define	IO_EXT		0x0400		/* operate on external attributes */
 #define	IO_NORMAL	0x0800		/* operate on regular data */
 #define	IO_NOMACCHECK	0x1000		/* MAC checks unnecessary */
 
 #define IO_SEQMAX	0x7F		/* seq heuristic max value */
 #define IO_SEQSHIFT	16		/* seq heuristic in upper 16 bits */
 
 /*
  *  Modes.  Some values same as Ixxx entries from inode.h for now.
  */
 #define	VEXEC	000100		/* execute/search permission */
 #define	VWRITE	000200		/* write permission */
 #define	VREAD	000400		/* read permission */
 #define	VSVTX	001000		/* save swapped text even after use */
 #define	VSGID	002000		/* set group id on execution */
 #define	VSUID	004000		/* set user id on execution */
 #define	VADMIN	010000		/* permission to administer */
 #define	VSTAT	020000		/* permission to retrieve attrs */
 #define	VAPPEND	040000		/* permission to write/append */
 #define	VALLPERM	(VEXEC | VWRITE | VREAD | VADMIN | VSTAT | VAPPEND)
 
 /*
  * Token indicating no attribute value yet assigned.
  */
 #define	VNOVAL	(-1)
 
 /*
  * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon)
  */
 #define VLKTIMEOUT	(hz / 20 + 1)
 
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_VNODE);
 #endif
 
 /*
  * Convert between vnode types and inode formats (since POSIX.1
  * defines mode word of stat structure in terms of inode formats).
  */
 extern enum vtype	iftovt_tab[];
 extern int		vttoif_tab[];
 #define	IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
 #define	VTTOIF(indx)	(vttoif_tab[(int)(indx)])
 #define	MAKEIMODE(indx, mode)	(int)(VTTOIF(indx) | (mode))
 
 /*
  * Flags to various vnode functions.
  *
  * Each comment names the function that consumes the flag.  Values are
  * reused between groups (e.g. SKIPSYSTEM, V_SAVE, REVOKEALL and V_WAIT are
  * all 0x0001), so a flag is only meaningful to the function it is
  * documented for.
  */
 #define	SKIPSYSTEM	0x0001	/* vflush: skip vnodes marked VV_SYSTEM */
 #define	FORCECLOSE	0x0002	/* vflush: force file closure */
 #define	WRITECLOSE	0x0004	/* vflush: only close writable files */
 #define	DOCLOSE		0x0008	/* vclean: close active files */
 #define	V_SAVE		0x0001	/* vinvalbuf: sync file first */
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
 #define	REVOKEALL	0x0001	/* vop_revoke: revoke all aliases */
 #define	V_WAIT		0x0001	/* vn_start_write: sleep for suspend */
 #define	V_NOWAIT	0x0002	/* vn_start_write: don't sleep for suspend */
 #define	V_XSLEEP	0x0004	/* vn_start_write: just return after sleep */
 
 #define	VREF(vp)	vref(vp)
 
 #ifdef DIAGNOSTIC
 #define	VATTR_NULL(vap)	vattr_null(vap)
 #else
 #define	VATTR_NULL(vap)	(*(vap) = va_null)	/* initialize a vattr */
 #endif /* DIAGNOSTIC */
 
 #define	NULLVP	((struct vnode *)NULL)
 
 /*
  * Global vnode data.
  */
 extern	struct vnode *rootvnode;	/* root (i.e. "/") vnode */
 extern	int async_io_version;		/* 0 or POSIX version of AIO i'face */
 extern	int desiredvnodes;		/* number of vnodes desired */
 extern	struct uma_zone *namei_zone;
 extern	int prtactive;			/* nonzero to call vprint() */
 extern	struct vattr va_null;		/* predefined null vattr structure */
 
 /*
  * Macro/function to check for client cache inconsistency w.r.t. leasing.
  */
 #define	LEASE_READ	0x1		/* Check lease for readers */
 #define	LEASE_WRITE	0x2		/* Check lease for modifiers */
 
 extern void	(*lease_updatetime)(int deltat);
 
 #define	VI_LOCK(vp)	mtx_lock(&(vp)->v_interlock)
 #define	VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags))
 #define	VI_TRYLOCK(vp)	mtx_trylock(&(vp)->v_interlock)
 #define	VI_UNLOCK(vp)	mtx_unlock(&(vp)->v_interlock)
 #define	VI_MTX(vp)	(&(vp)->v_interlock)
 
 #endif /* _KERNEL */
 
 /*
  * Mods for extensibility.
  */
 
 /*
  * Flags for vdesc_flags:
  */
 #define	VDESC_MAX_VPS		16
 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
 #define	VDESC_VP0_WILLRELE	0x0001
 #define	VDESC_VP1_WILLRELE	0x0002
 #define	VDESC_VP2_WILLRELE	0x0004
 #define	VDESC_VP3_WILLRELE	0x0008
 #define	VDESC_NOMAP_VPP		0x0100
 #define	VDESC_VPP_WILLRELE	0x0200
 
 /*
  * A generic structure.
  * This can be used by bypass routines to identify generic arguments.
  * Only the leading a_desc member is declared here; it points at the
  * vnodeop_desc describing the operation being performed.
  */
 struct vop_generic_args {
 	struct vnodeop_desc *a_desc;
 	/* other random data follows, presumably */
 };
 
 typedef int vop_bypass_t(struct vop_generic_args *);
 
 /*
  * VDESC_NO_OFFSET is used to identify the end of the offset list
  * and in places where no such field exists.  The negative value is
  * parenthesized (like VNOVAL) so it expands safely inside any expression.
  */
 #define VDESC_NO_OFFSET (-1)
 
 /*
  * This structure describes the vnode operation taking place.
  *
  * The *_offset members locate arguments within the operation's argument
  * structure; VDESC_NO_OFFSET marks an argument that is absent and ends
  * the vdesc_vp_offsets list.
  */
 struct vnodeop_desc {
 	char	*vdesc_name;		/* a readable name for debugging */
 	int	 vdesc_flags;		/* VDESC_* flags */
 	vop_bypass_t	*vdesc_call;	/* Function to call */
 
 	/*
 	 * These ops are used by bypass routines to map and locate arguments.
 	 * Creds and procs are not needed in bypass routines, but sometimes
 	 * they are useful to (for example) transport layers.
 	 * Nameidata is useful because it has a cred in it.
 	 */
 	int	*vdesc_vp_offsets;	/* list ended by VDESC_NO_OFFSET */
 	int	vdesc_vpp_offset;	/* return vpp location */
 	int	vdesc_cred_offset;	/* cred location, if any */
 	int	vdesc_thread_offset;	/* thread location, if any */
 	int	vdesc_componentname_offset; /* if any */
 };
 
 #ifdef _KERNEL
 /*
  * A list of all the operation descs.
  */
 extern struct vnodeop_desc *vnodeop_descs[];
 
 #define	VOPARG_OFFSETOF(s_type, field)	__offsetof(s_type, field)
 #define	VOPARG_OFFSETTO(s_type, s_offset, struct_p) \
     ((s_type)(((char*)(struct_p)) + (s_offset)))
 
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * Support code to aid in debugging VFS locking problems.  Not totally
  * reliable since if the thread sleeps between changing the lock
  * state and checking it with the assert, some other thread could
  * change the state.  They are good enough for debugging a single
  * filesystem using a single-threaded test.
  *
  * With DEBUG_VFS_LOCKS defined, each ASSERT_* macro calls the assert_*
  * function of the same name; without it, every ASSERT_* macro expands to
  * nothing, so its arguments are not evaluated.  The entries under #if 0
  * are currently disabled.
  */
 void	assert_vi_locked(struct vnode *vp, const char *str);
 void	assert_vi_unlocked(struct vnode *vp, const char *str);
 void	assert_vop_elocked(struct vnode *vp, const char *str);
 #if 0
 void	assert_vop_elocked_other(struct vnode *vp, const char *str);
 #endif
 void	assert_vop_locked(struct vnode *vp, const char *str);
 #if 0
 void	assert_vop_slocked(struct vnode *vp, const char *str);
 #endif
 void	assert_vop_unlocked(struct vnode *vp, const char *str);
 
 #define	ASSERT_VI_LOCKED(vp, str)	assert_vi_locked((vp), (str))
 #define	ASSERT_VI_UNLOCKED(vp, str)	assert_vi_unlocked((vp), (str))
 #define	ASSERT_VOP_ELOCKED(vp, str)	assert_vop_elocked((vp), (str))
 #if 0
 #define	ASSERT_VOP_ELOCKED_OTHER(vp, str) assert_vop_elocked_other((vp), (str))
 #endif
 #define	ASSERT_VOP_LOCKED(vp, str)	assert_vop_locked((vp), (str))
 #if 0
 #define	ASSERT_VOP_SLOCKED(vp, str)	assert_vop_slocked((vp), (str))
 #endif
 #define	ASSERT_VOP_UNLOCKED(vp, str)	assert_vop_unlocked((vp), (str))
 
 #else /* !DEBUG_VFS_LOCKS */
 
 #define	ASSERT_VI_LOCKED(vp, str)
 #define	ASSERT_VI_UNLOCKED(vp, str)
 #define	ASSERT_VOP_ELOCKED(vp, str)
 #if 0
 #define	ASSERT_VOP_ELOCKED_OTHER(vp, str)
 #endif
 #define	ASSERT_VOP_LOCKED(vp, str)
 #if 0
 #define	ASSERT_VOP_SLOCKED(vp, str)
 #endif
 #define	ASSERT_VOP_UNLOCKED(vp, str)
 #endif /* DEBUG_VFS_LOCKS */
 
 
 /*
  * This call works for vnodes in the kernel.
  */
 #define VCALL(c) ((c)->a_desc->vdesc_call(c))
 
 /*
  * VMIO support inline
  */
 
 extern int vmiodirenable;
 
 static __inline int
 vn_canvmio(struct vnode *vp)
 {
       if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR)))
 		return(TRUE);
 	return(FALSE);
 }
 
 /*
  * Finally, include the default set of vnode operations.
  */
 #include "vnode_if.h"
 
 /*
  * Public vnode manipulation functions.
  */
 struct componentname;
 struct file;
 struct mount;
 struct nameidata;
 struct ostat;
 struct thread;
 struct proc;
 struct stat;
 struct nstat;
 struct ucred;
 struct uio;
 struct vattr;
 struct vnode;
 
 extern int	(*lease_check_hook)(struct vop_lease_args *);
-extern int	(*softdep_process_worklist_hook)(struct mount *);
 
 /* cache_* may belong in namei.h. */
 void	cache_enter(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp);
 int	cache_lookup(struct vnode *dvp, struct vnode **vpp,
 	    struct componentname *cnp);
 void	cache_purge(struct vnode *vp);
 void	cache_purgevfs(struct mount *mp);
 int	cache_leaf_test(struct vnode *vp);
 int	change_dir(struct vnode *vp, struct thread *td);
 int	change_root(struct vnode *vp, struct thread *td);
 void	cvtstat(struct stat *st, struct ostat *ost);
 void	cvtnstat(struct stat *sb, struct nstat *nsb);
 int	getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 	    struct vnode **vpp);
 u_quad_t init_va_filerev(void);
 int	lease_check(struct vop_lease_args *ap);
 int	speedup_syncer(void);
 #define textvp_fullpath(p, rb, rfb) \
 	vn_fullpath(FIRST_THREAD_IN_PROC(p), (p)->p_textvp, rb, rfb)
 int	vn_fullpath(struct thread *td, struct vnode *vn,
 	    char **retbuf, char **freebuf);
 int	vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
 	    gid_t file_gid, mode_t acc_mode, struct ucred *cred,
 	    int *privused);
 int	vaccess_acl_posix1e(enum vtype type, uid_t file_uid,
 	    gid_t file_gid, struct acl *acl, mode_t acc_mode,
 	    struct ucred *cred, int *privused);
 void	vattr_null(struct vattr *vap);
 int	vcount(struct vnode *vp);
 void	vdrop(struct vnode *);
 void	vfs_add_vnodeops(const void *);
 void	vfs_rm_vnodeops(const void *);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int	vget(struct vnode *vp, int lockflag, struct thread *td);
 void	vgone(struct vnode *vp);
 void	vhold(struct vnode *);
 void	vholdl(struct vnode *);
 int	vinvalbuf(struct vnode *vp, int save,
 	    struct thread *td, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
 	    off_t length, int blksize);
 void	vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3);
 #define vprint(label, vp) vn_printf((vp), "%s\n", (label))
 int	vrecycle(struct vnode *vp, struct thread *td);
 int	vn_close(struct vnode *vp,
 	    int flags, struct ucred *file_cred, struct thread *td);
 void	vn_finished_write(struct mount *mp);
 int	vn_isdisk(struct vnode *vp, int *errp);
 int	vn_lock(struct vnode *vp, int flags, struct thread *td);
 int	vn_open(struct nameidata *ndp, int *flagp, int cmode, int fdidx);
 int	vn_open_cred(struct nameidata *ndp, int *flagp, int cmode,
 	    struct ucred *cred, int fdidx);
 int	vn_pollrecord(struct vnode *vp, struct thread *p, int events);
 int	vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, int *aresid,
 	    struct thread *td);
 int	vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base,
 	    size_t len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, size_t *aresid,
 	    struct thread *td);
 int	vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred,
 	    struct ucred *file_cred, struct thread *td);
 int	vn_start_write(struct vnode *vp, struct mount **mpp, int flags);
 int	vn_write_suspend_wait(struct vnode *vp, struct mount *mp,
 	    int flags);
 int	vn_writechk(struct vnode *vp);
 int	vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int *buflen, char *buf, struct thread *td);
 int	vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int buflen, char *buf, struct thread *td);
 int	vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, struct thread *td);
 int	vfs_cache_lookup(struct vop_lookup_args *ap);
 void	vfs_timestamp(struct timespec *);
 void	vfs_write_resume(struct mount *mp);
 int	vfs_write_suspend(struct mount *mp);
 int	vop_stdbmap(struct vop_bmap_args *);
 int	vop_stdfsync(struct vop_fsync_args *);
 int	vop_stdgetwritemount(struct vop_getwritemount_args *);
 int	vop_stdgetpages(struct vop_getpages_args *);
 int	vop_stdinactive(struct vop_inactive_args *);
 int	vop_stdislocked(struct vop_islocked_args *);
 int	vop_stdkqfilter(struct vop_kqfilter_args *);
 int	vop_stdlock(struct vop_lock_args *);
 int	vop_stdputpages(struct vop_putpages_args *);
 int	vop_stdunlock(struct vop_unlock_args *);
 int	vop_nopoll(struct vop_poll_args *);
 int	vop_stdpathconf(struct vop_pathconf_args *);
 int	vop_stdpoll(struct vop_poll_args *);
 int	vop_eopnotsupp(struct vop_generic_args *ap);
 int	vop_ebadf(struct vop_generic_args *ap);
 int	vop_einval(struct vop_generic_args *ap);
 int	vop_enotty(struct vop_generic_args *ap);
 int	vop_null(struct vop_generic_args *ap);
 int	vop_panic(struct vop_generic_args *ap);
 
 /* These are called from within the actual VOPS. */
 void	vop_create_post(void *a, int rc);
 void	vop_link_post(void *a, int rc);
 void	vop_lock_pre(void *a);
 void	vop_lock_post(void *a, int rc);
 void	vop_lookup_post(void *a, int rc);
 void	vop_lookup_pre(void *a);
 void	vop_mkdir_post(void *a, int rc);
 void	vop_mknod_post(void *a, int rc);
 void	vop_remove_post(void *a, int rc);
 void	vop_rename_post(void *a, int rc);
 void	vop_rename_pre(void *a);
 void	vop_rmdir_post(void *a, int rc);
 void	vop_setattr_post(void *a, int rc);
 void	vop_strategy_pre(void *a);
 void	vop_symlink_post(void *a, int rc);
 void	vop_unlock_post(void *a, int rc);
 void	vop_unlock_pre(void *a);
 
 /*
  * VOP_WRITE_PRE()/VOP_WRITE_POST() bracket a write operation.
  *
  * VOP_WRITE_PRE declares the locals va, error, osize, ooffset and noffset
  * directly in the enclosing scope, so both macros must be used in the same
  * scope and those names must otherwise be unused there.  When the vnode
  * has knotes attached it records the pre-write uio offset and file size;
  * if VOP_GETATTR() fails it returns that error from the enclosing function.
  *
  * VOP_WRITE_POST re-reads the uio offset and, if it advanced and knotes are
  * attached, posts NOTE_WRITE, adding NOTE_EXTEND when the new offset lies
  * beyond the recorded size.  Its ret argument is not used.
  */
 #define	VOP_WRITE_PRE(ap)						\
 	struct vattr va;						\
 	int error, osize, ooffset, noffset;				\
 									\
 	osize = ooffset = noffset = 0;					\
 	if (!VN_KNLIST_EMPTY((ap)->a_vp)) {				\
 		error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred,	\
 		    curthread);						\
 		if (error)						\
 			return (error);					\
 		ooffset = (ap)->a_uio->uio_offset;			\
 		osize = va.va_size;					\
 	}
 
 #define VOP_WRITE_POST(ap, ret)						\
 	noffset = (ap)->a_uio->uio_offset;				\
 	if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) {	\
 		VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE			\
 		    | (noffset > osize ? NOTE_EXTEND : 0));		\
 	}
 
 void	vput(struct vnode *vp);
 void	vrele(struct vnode *vp);
 void	vref(struct vnode *vp);
 int	vrefcnt(struct vnode *vp);
 void 	v_addpollinfo(struct vnode *vp);
 
 int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td);
 void vnode_destroy_vobject(struct vnode *vp);
 
 extern struct vop_vector fifo_specops;
 extern struct vop_vector dead_vnodeops;
 extern struct vop_vector default_vnodeops;
 
 #define VOP_PANIC	((void*)(uintptr_t)vop_panic)
 #define VOP_NULL	((void*)(uintptr_t)vop_null)
 #define VOP_EBADF	((void*)(uintptr_t)vop_ebadf)
 #define VOP_ENOTTY	((void*)(uintptr_t)vop_enotty)
 #define VOP_EINVAL	((void*)(uintptr_t)vop_einval)
 #define VOP_EOPNOTSUPP	((void*)(uintptr_t)vop_eopnotsupp)
 
 /* vfs_hash.c */
 typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg);
 
 int vfs_hash_get(struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_rehash(struct vnode *vp, u_int hash);
 void vfs_hash_remove(struct vnode *vp);
 
 int vfs_kqfilter(struct vop_kqfilter_args *);
 void vfs_mark_atime(struct vnode *vp, struct thread *td);
 struct dirent;
 int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off);
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_VNODE_H_ */
Index: head/sys/ufs/ffs/ffs_alloc.c
===================================================================
--- head/sys/ufs/ffs/ffs_alloc.c	(revision 156202)
+++ head/sys/ufs/ffs/ffs_alloc.c	(revision 156203)
@@ -1,2559 +1,2559 @@
 /*-
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Marshall
  * Kirk McKusick and Network Associates Laboratories, the Security
  * Research Division of Network Associates, Inc. under DARPA/SPAWAR
  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
  * research program
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_alloc.c	8.18 (Berkeley) 5/26/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_quota.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 #include <ufs/ufs/ufsmount.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 
 /*
  * Signature of the per-cylinder-group allocators that are handed to
  * ffs_hashalloc() as its last argument (ffs_alloccg, ffs_clusteralloc,
  * and ffs_nodealloccg through a cast).
  */
 typedef ufs2_daddr_t allocfcn_t(struct inode *ip, int cg, ufs2_daddr_t bpref,
 				  int size);
 
 /* Helpers private to this file. */
 static ufs2_daddr_t ffs_alloccg(struct inode *, int, ufs2_daddr_t, int);
 static ufs2_daddr_t
 	      ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t);
 #ifdef DIAGNOSTIC
 static int	ffs_checkblk(struct inode *, ufs2_daddr_t, long);
 #endif
 static ufs2_daddr_t ffs_clusteralloc(struct inode *, int, ufs2_daddr_t, int);
 static void	ffs_clusteracct(struct ufsmount *, struct fs *, struct cg *,
 		    ufs1_daddr_t, int);
 static ino_t	ffs_dirpref(struct inode *);
 static ufs2_daddr_t ffs_fragextend(struct inode *, int, ufs2_daddr_t, int, int);
 static void	ffs_fserr(struct fs *, ino_t, char *);
 static ufs2_daddr_t	ffs_hashalloc
 		(struct inode *, int, ufs2_daddr_t, int, allocfcn_t *);
 static ufs2_daddr_t ffs_nodealloccg(struct inode *, int, ufs2_daddr_t, int);
 static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
 /* Format-specific workers dispatched from ffs_reallocblks(). */
 static int	ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
 static int	ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
 
 /*
  * Allocate a block in the filesystem.
  *
  * The size of the requested block is given, which must be some
  * multiple of fs_fsize and <= fs_bsize.
  * A preference may be optionally specified. If a preference is given
  * the following hierarchy is used to allocate a block:
  *   1) allocate the requested block.
  *   2) allocate a rotationally optimal block in the same cylinder.
  *   3) allocate a block in the same cylinder group.
  *   4) quadratically rehash into other cylinder groups, until an
  *      available block is located.
  * If no block preference is given the following hierarchy is used
  * to allocate a block:
  *   1) allocate a block in the cylinder group that contains the
  *      inode for the file.
  *   2) quadratically rehash into other cylinder groups, until an
  *      available block is located.
  */
 /*
  * Arguments: ip is the inode receiving the block, bpref the preferred
  * location (0, or any value >= fs_size, means no preference), size the
  * request in bytes, cred the credential used for quota and reserve
  * checks.  The lbn argument is not referenced in the body.
  *
  * Returns 0 with the allocated block stored in *bnp, the error from
  * chkdq() when the quota check refuses the request, or ENOSPC.
  *
  * Locking: the caller must hold the UFS mount mutex (asserted below).
  * The quota-error and ENOSPC returns are made with the mutex dropped.
  * NOTE(review): the success return does not unlock here, so it relies on
  * the allocator reached through ffs_hashalloc() to have released the
  * mutex -- confirm against ffs_alloccg().
  */
 int
 ffs_alloc(ip, lbn, bpref, size, cred, bnp)
 	struct inode *ip;
 	ufs2_daddr_t lbn, bpref;
 	int size;
 	struct ucred *cred;
 	ufs2_daddr_t *bnp;
 {
 	struct fs *fs;
 	struct ufsmount *ump;
 	ufs2_daddr_t bno;
 	int cg, reclaimed;
 	static struct timeval lastfail;
 	static int curfail;
 #ifdef QUOTA
 	int error;
 #endif
 
 	*bnp = 0;
 	fs = ip->i_fs;
 	ump = ip->i_ump;
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 #ifdef DIAGNOSTIC
 	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
 		printf("dev = %s, bsize = %ld, size = %d, fs = %s\n",
 		    devtoname(ip->i_dev), (long)fs->fs_bsize, size,
 		    fs->fs_fsmnt);
 		panic("ffs_alloc: bad size");
 	}
 	if (cred == NOCRED)
 		panic("ffs_alloc: missing credential");
 #endif /* DIAGNOSTIC */
 	reclaimed = 0;
 retry:
 #ifdef QUOTA
 	/* Charge the quota up front; chkdq() is called without the mutex. */
 	UFS_UNLOCK(ump);
 	error = chkdq(ip, btodb(size), cred, 0);
 	if (error)
 		return (error);
 	UFS_LOCK(ump);
 #endif
 	/* A full block was asked for but no full block is free. */
 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
 		goto nospace;
 	/* Callers failing the suser check may not dip into the reserve. */
 	if (suser_cred(cred, SUSER_ALLOWJAIL) &&
 	    freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0)
 		goto nospace;
 	if (bpref >= fs->fs_size)
 		bpref = 0;
 	if (bpref == 0)
 		cg = ino_to_cg(fs, ip->i_number);
 	else
 		cg = dtog(fs, bpref);
 	bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg);
 	if (bno > 0) {
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		*bnp = bno;
 		return (0);
 	}
 nospace:
 #ifdef QUOTA
 	UFS_UNLOCK(ump);
 	/*
 	 * Restore user's disk quota because allocation failed.
 	 * This is done under the label rather than before it because the
 	 * early "goto nospace" exits above are taken after chkdq() has
 	 * already charged the request; skipping the refund would leak
 	 * the charge and double it on the retry below.
 	 */
 	(void) chkdq(ip, -btodb(size), cred, FORCE);
 	UFS_LOCK(ump);
 #endif
 	/* Give soft updates one chance to release pending blocks. */
 	if (fs->fs_pendingblocks > 0 && reclaimed == 0) {
 		reclaimed = 1;
 		softdep_request_cleanup(fs, ITOV(ip));
 		goto retry;
 	}
 	UFS_UNLOCK(ump);
 	/* Rate-limit the "filesystem full" complaints. */
 	if (ppsratecheck(&lastfail, &curfail, 1)) {
 		ffs_fserr(fs, ip->i_number, "filesystem full");
 		uprintf("\n%s: write failed, filesystem is full\n",
 		    fs->fs_fsmnt);
 	}
 	return (ENOSPC);
 }
 
 /*
  * Reallocate a fragment to a bigger size
  *
  * The number and size of the old block is given, and a preference
  * and new size is also specified. The allocator attempts to extend
  * the original block. Failing that, the regular block allocator is
  * invoked to get an appropriate block.
  */
 /*
  * Arguments: ip is the inode owning the fragment, lbprev its logical
  * block number, bprev its current disk address, bpref the preferred new
  * location, osize/nsize the old and new sizes in bytes, cred the
  * credential used for quota and reserve checks.
  *
  * Returns 0 with the (resized, zero-extended) buffer in *bpp, the error
  * from bread() or chkdq(), or ENOSPC.
  *
  * Locking: the caller must hold the UFS mount mutex (asserted below).
  * The bread/chkdq error returns and the ENOSPC return are made with the
  * mutex dropped.  NOTE(review): the success returns do not unlock here,
  * so they rely on ffs_fragextend() / the allocator reached through
  * ffs_hashalloc() having released the mutex -- confirm.
  */
 int
 ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, cred, bpp)
 	struct inode *ip;
 	ufs2_daddr_t lbprev;
 	ufs2_daddr_t bprev;
 	ufs2_daddr_t bpref;
 	int osize, nsize;
 	struct ucred *cred;
 	struct buf **bpp;
 {
 	struct vnode *vp;
 	struct fs *fs;
 	struct buf *bp;
 	struct ufsmount *ump;
 	int cg, request, error, reclaimed;
 	ufs2_daddr_t bno;
 	static struct timeval lastfail;
 	static int curfail;
 
 	*bpp = NULL;
 	vp = ITOV(ip);
 	fs = ip->i_fs;
 	bp = NULL;
 	ump = ip->i_ump;
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 #ifdef DIAGNOSTIC
 	if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
 		panic("ffs_realloccg: allocation on suspended filesystem");
 	if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
 	    (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
 		printf(
 		"dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n",
 		    devtoname(ip->i_dev), (long)fs->fs_bsize, osize,
 		    nsize, fs->fs_fsmnt);
 		panic("ffs_realloccg: bad size");
 	}
 	if (cred == NOCRED)
 		panic("ffs_realloccg: missing credential");
 #endif /* DIAGNOSTIC */
 	reclaimed = 0;
 retry:
 	/* Callers failing the suser check may not dip into the reserve. */
 	if (suser_cred(cred, SUSER_ALLOWJAIL) &&
 	    freespace(fs, fs->fs_minfree) -  numfrags(fs, nsize - osize) < 0) {
 		goto nospace;
 	}
 	if (bprev == 0) {
 		printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n",
 		    devtoname(ip->i_dev), (long)fs->fs_bsize, (intmax_t)bprev,
 		    fs->fs_fsmnt);
 		panic("ffs_realloccg: bad bprev");
 	}
 	UFS_UNLOCK(ump);
 	/*
 	 * Allocate the extra space in the buffer.
 	 */
 	error = bread(vp, lbprev, osize, NOCRED, &bp);
 	if (error) {
 		brelse(bp);
 		return (error);
 	}
 
 	/* Buffer has no disk address yet: fill it in from bprev. */
 	if (bp->b_blkno == bp->b_lblkno) {
 		if (lbprev >= NDADDR)
 			panic("ffs_realloccg: lbprev out of range");
 		bp->b_blkno = fsbtodb(fs, bprev);
 	}
 
 #ifdef QUOTA
 	error = chkdq(ip, btodb(nsize - osize), cred, 0);
 	if (error) {
 		brelse(bp);
 		return (error);
 	}
 #endif
 	/*
 	 * Check for extension in the existing location.
 	 */
 	cg = dtog(fs, bprev);
 	UFS_LOCK(ump);
 	bno = ffs_fragextend(ip, cg, bprev, osize, nsize);
 	if (bno) {
 		if (bp->b_blkno != fsbtodb(fs, bno))
 			panic("ffs_realloccg: bad blockno");
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(nsize - osize));
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		allocbuf(bp, nsize);
 		bp->b_flags |= B_DONE;
 		if ((bp->b_flags & (B_MALLOC | B_VMIO)) != B_VMIO)
 			bzero((char *)bp->b_data + osize, nsize - osize);
 		else
 			vfs_bio_clrbuf(bp);
 		*bpp = bp;
 		return (0);
 	}
 	/*
 	 * Allocate a new disk location.
 	 */
 	if (bpref >= fs->fs_size)
 		bpref = 0;
 	switch ((int)fs->fs_optim) {
 	case FS_OPTSPACE:
 		/*
 		 * Allocate an exact sized fragment. Although this makes
 		 * best use of space, we will waste time relocating it if
 		 * the file continues to grow. If the fragmentation is
 		 * less than half of the minimum free reserve, we choose
 		 * to begin optimizing for time.
 		 */
 		request = nsize;
 		if (fs->fs_minfree <= 5 ||
 		    fs->fs_cstotal.cs_nffree >
 		    (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100))
 			break;
 		log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n",
 			fs->fs_fsmnt);
 		fs->fs_optim = FS_OPTTIME;
 		break;
 	case FS_OPTTIME:
 		/*
 		 * At this point we have discovered a file that is trying to
 		 * grow a small fragment to a larger fragment. To save time,
 		 * we allocate a full sized block, then free the unused portion.
 		 * If the file continues to grow, the `ffs_fragextend' call
 		 * above will be able to grow it in place without further
 		 * copying. If aberrant programs cause disk fragmentation to
 		 * grow within 2% of the free reserve, we choose to begin
 		 * optimizing for space.
 		 */
 		request = fs->fs_bsize;
 		if (fs->fs_cstotal.cs_nffree <
 		    (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100)
 			break;
 		log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n",
 			fs->fs_fsmnt);
 		fs->fs_optim = FS_OPTSPACE;
 		break;
 	default:
 		printf("dev = %s, optim = %ld, fs = %s\n",
 		    devtoname(ip->i_dev), (long)fs->fs_optim, fs->fs_fsmnt);
 		panic("ffs_realloccg: bad optim");
 		/* NOTREACHED */
 	}
 	bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg);
 	if (bno > 0) {
 		bp->b_blkno = fsbtodb(fs, bno);
 		/* Without soft updates the old fragment is freed right here. */
 		if (!DOINGSOFTDEP(vp))
 			ffs_blkfree(ump, fs, ip->i_devvp, bprev, (long)osize,
 			    ip->i_number);
 		/* Give back the tail of an over-sized (FS_OPTTIME) request. */
 		if (nsize < request)
 			ffs_blkfree(ump, fs, ip->i_devvp,
 			    bno + numfrags(fs, nsize),
 			    (long)(request - nsize), ip->i_number);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(nsize - osize));
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		allocbuf(bp, nsize);
 		bp->b_flags |= B_DONE;
 		if ((bp->b_flags & (B_MALLOC | B_VMIO)) != B_VMIO)
 			bzero((char *)bp->b_data + osize, nsize - osize);
 		else
 			vfs_bio_clrbuf(bp);
 		*bpp = bp;
 		return (0);
 	}
 #ifdef QUOTA
 	UFS_UNLOCK(ump);
 	/*
 	 * Restore user's disk quota because allocation failed.
 	 */
 	(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
 	UFS_LOCK(ump);
 #endif
 nospace:
 	/*
 	 * no space available
 	 */
 	if (fs->fs_pendingblocks > 0 && reclaimed == 0) {
 		reclaimed = 1;
 		softdep_request_cleanup(fs, vp);
 		UFS_UNLOCK(ump);
 		if (bp) {
 			brelse(bp);
 			/*
 			 * Forget the released buffer: the retry can jump
 			 * straight back to nospace before bread() assigns
 			 * bp again, and must not release it a second time.
 			 */
 			bp = NULL;
 		}
 		UFS_LOCK(ump);
 		goto retry;
 	}
 	UFS_UNLOCK(ump);
 	if (bp)
 		brelse(bp);
 	/* Rate-limit the "filesystem full" complaints. */
 	if (ppsratecheck(&lastfail, &curfail, 1)) {
 		ffs_fserr(fs, ip->i_number, "filesystem full");
 		uprintf("\n%s: write failed, filesystem is full\n",
 		    fs->fs_fsmnt);
 	}
 	return (ENOSPC);
 }
 
 /*
  * Reallocate a sequence of blocks into a contiguous sequence of blocks.
  *
  * The vnode and an array of buffer pointers for a range of sequential
  * logical blocks to be made contiguous is given. The allocator attempts
  * to find a range of sequential blocks starting as close as possible
  * from the end of the allocation for the logical block immediately
  * preceding the current range. If successful, the physical block numbers
  * in the buffer pointers and in the inode are changed to reflect the new
  * allocation. If unsuccessful, the allocation is left unchanged. The
  * success in doing the reallocation is returned. Note that the error
  * return is not reflected back to the user. Rather the previous block
  * allocation will be used.
  */
 
 /* Parent sysctl node for the integer knobs declared below. */
 SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem");
 
 /*
  * When non-zero, the reallocblks workers write the modified indirect
  * blocks with bdwrite() and skip the ffs_update() call on the inode;
  * when zero they use bwrite() and call ffs_update(vp, 1).
  */
 static int doasyncfree = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, "");
 
 /* When zero, ffs_reallocblks() returns ENOSPC without doing any work. */
 static int doreallocblks = 1;
 SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "");
 
 #ifdef DEBUG
 /*
  * While non-zero, the reallocblks workers print the old and new block
  * lists; each traced reallocation decrements it by one.
  */
 static volatile int prtrealloc = 0;
 #endif
 
 int
 ffs_reallocblks(ap)
 	struct vop_reallocblks_args /* {
 		struct vnode *a_vp;
 		struct cluster_save *a_buflist;
 	} */ *ap;
 {
 
 	if (doreallocblks == 0)
 		return (ENOSPC);
 	if (VTOI(ap->a_vp)->i_ump->um_fstype == UFS1)
 		return (ffs_reallocblks_ufs1(ap));
 	return (ffs_reallocblks_ufs2(ap));
 }
 	
 /*
  * UFS1 worker for ffs_reallocblks().
  *
  * ap->a_vp is the file's vnode and ap->a_buflist the cluster of buffers
  * covering consecutive logical blocks.  Returns 0 when the cluster was
  * moved to a newly allocated contiguous run (block pointers in the inode
  * or indirect blocks and the b_blkno of every buffer are rewritten), and
  * ENOSPC in every other case, leaving the existing allocation in place.
  */
 static int
 ffs_reallocblks_ufs1(ap)
 	struct vop_reallocblks_args /* {
 		struct vnode *a_vp;
 		struct cluster_save *a_buflist;
 	} */ *ap;
 {
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp;
 	struct buf *sbp, *ebp;
 	ufs1_daddr_t *bap, *sbap, *ebap = 0;
 	struct cluster_save *buflist;
 	struct ufsmount *ump;
 	ufs_lbn_t start_lbn, end_lbn;
 	ufs1_daddr_t soff, newblk, blkno;
 	ufs2_daddr_t pref;
 	struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp;
 	int i, len, start_lvl, end_lvl, ssize;
 
 	vp = ap->a_vp;
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	ump = ip->i_ump;
 	if (fs->fs_contigsumsize <= 0)
 		return (ENOSPC);
 	buflist = ap->a_buflist;
 	len = buflist->bs_nchildren;
 	start_lbn = buflist->bs_children[0]->b_lblkno;
 	end_lbn = start_lbn + len - 1;
 #ifdef DIAGNOSTIC
 	for (i = 0; i < len; i++)
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 1");
 	for (i = 1; i < len; i++)
 		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
 			panic("ffs_reallocblks: non-logical cluster");
 	blkno = buflist->bs_children[0]->b_blkno;
 	ssize = fsbtodb(fs, fs->fs_frag);
 	for (i = 1; i < len - 1; i++)
 		if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
 			panic("ffs_reallocblks: non-physical cluster %d", i);
 #endif
 	/*
 	 * If the latest allocation is in a new cylinder group, assume that
 	 * the filesystem has decided to move and do not force it back to
 	 * the previous cylinder group.
 	 */
 	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
 	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
 		return (ENOSPC);
 	if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
 	    ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
 		return (ENOSPC);
 	/*
 	 * Get the starting offset and block map for the first block.
 	 */
 	if (start_lvl == 0) {
 		sbap = &ip->i_din1->di_db[0];
 		soff = start_lbn;
 	} else {
 		idp = &start_ap[start_lvl - 1];
 		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
 			brelse(sbp);
 			return (ENOSPC);
 		}
 		sbap = (ufs1_daddr_t *)sbp->b_data;
 		soff = idp->in_off;
 	}
 	/*
 	 * If the block range spans two block maps, get the second map.
 	 */
 	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
 		ssize = len;
 	} else {
 #ifdef DIAGNOSTIC
 		/*
 		 * Only look at start_ap[start_lvl - 1] when the first block
 		 * is mapped through an indirect block; with start_lvl == 0
 		 * the index would be -1 and read outside the array.
 		 */
 		if (start_lvl > 0 &&
 		    start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
 			panic("ffs_reallocblk: start == end");
 #endif
 		/* ssize = number of cluster blocks held in the first map. */
 		ssize = len - (idp->in_off + 1);
 		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
 			goto fail;
 		ebap = (ufs1_daddr_t *)ebp->b_data;
 	}
 	/*
 	 * Find the preferred location for the cluster.
 	 */
 	UFS_LOCK(ump);
 	pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap);
 	/*
 	 * Search the block map looking for an allocation of the desired size.
 	 */
 	if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
 	    len, ffs_clusteralloc)) == 0) {
 		UFS_UNLOCK(ump);
 		goto fail;
 	}
 	/*
 	 * We have found a new contiguous block.
 	 *
 	 * First we have to replace the old block pointers with the new
 	 * block pointers in the inode and indirect blocks associated
 	 * with the file.
 	 */
 #ifdef DEBUG
 	if (prtrealloc)
 		printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number,
 		    (intmax_t)start_lbn, (intmax_t)end_lbn);
 #endif
 	blkno = newblk;
 	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
 		/* Crossed into the second block map: continue at its start. */
 		if (i == ssize) {
 			bap = ebap;
 			soff = -i;
 		}
 #ifdef DIAGNOSTIC
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 2");
 		if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
 			panic("ffs_reallocblks: alloc mismatch");
 #endif
 #ifdef DEBUG
 		if (prtrealloc)
 			printf(" %d,", *bap);
 #endif
 		if (DOINGSOFTDEP(vp)) {
 			if (sbap == &ip->i_din1->di_db[0] && i < ssize)
 				softdep_setup_allocdirect(ip, start_lbn + i,
 				    blkno, *bap, fs->fs_bsize, fs->fs_bsize,
 				    buflist->bs_children[i]);
 			else
 				softdep_setup_allocindir_page(ip, start_lbn + i,
 				    i < ssize ? sbp : ebp, soff + i, blkno,
 				    *bap, buflist->bs_children[i]);
 		}
 		*bap++ = blkno;
 	}
 	/*
 	 * Next we must write out the modified inode and indirect blocks.
 	 * For strict correctness, the writes should be synchronous since
 	 * the old block values may have been written to disk. In practise
 	 * they are almost never written, but if we are concerned about
 	 * strict correctness, the `doasyncfree' flag should be set to zero.
 	 *
 	 * The test on `doasyncfree' should be changed to test a flag
 	 * that shows whether the associated buffers and inodes have
 	 * been written. The flag should be set when the cluster is
 	 * started and cleared whenever the buffer or inode is flushed.
 	 * We can then check below to see if it is set, and do the
 	 * synchronous write only when it has been cleared.
 	 */
 	if (sbap != &ip->i_din1->di_db[0]) {
 		if (doasyncfree)
 			bdwrite(sbp);
 		else
 			bwrite(sbp);
 	} else {
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		if (!doasyncfree)
 			ffs_update(vp, 1);
 	}
 	if (ssize < len) {
 		if (doasyncfree)
 			bdwrite(ebp);
 		else
 			bwrite(ebp);
 	}
 	/*
 	 * Last, free the old blocks and assign the new blocks to the buffers.
 	 */
 #ifdef DEBUG
 	if (prtrealloc)
 		printf("\n\tnew:");
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
 		if (!DOINGSOFTDEP(vp))
 			ffs_blkfree(ump, fs, ip->i_devvp,
 			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
 			    fs->fs_bsize, ip->i_number);
 		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
 #ifdef DIAGNOSTIC
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DEBUG
 		if (prtrealloc)
 			printf(" %d,", blkno);
 #endif
 	}
 #ifdef DEBUG
 	if (prtrealloc) {
 		prtrealloc--;
 		printf("\n");
 	}
 #endif
 	return (0);
 
 fail:
 	/* Release whichever block-map buffers were obtained above. */
 	if (ssize < len)
 		brelse(ebp);
 	if (sbap != &ip->i_din1->di_db[0])
 		brelse(sbp);
 	return (ENOSPC);
 }
 
 /*
  * UFS2 worker for ffs_reallocblks().
  *
  * ap->a_vp is the file's vnode and ap->a_buflist the cluster of buffers
  * covering consecutive logical blocks.  Returns 0 when the cluster was
  * moved to a newly allocated contiguous run (block pointers in the inode
  * or indirect blocks and the b_blkno of every buffer are rewritten), and
  * ENOSPC in every other case, leaving the existing allocation in place.
  */
 static int
 ffs_reallocblks_ufs2(ap)
 	struct vop_reallocblks_args /* {
 		struct vnode *a_vp;
 		struct cluster_save *a_buflist;
 	} */ *ap;
 {
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp;
 	struct buf *sbp, *ebp;
 	ufs2_daddr_t *bap, *sbap, *ebap = 0;
 	struct cluster_save *buflist;
 	struct ufsmount *ump;
 	ufs_lbn_t start_lbn, end_lbn;
 	ufs2_daddr_t soff, newblk, blkno, pref;
 	struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp;
 	int i, len, start_lvl, end_lvl, ssize;
 
 	vp = ap->a_vp;
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	ump = ip->i_ump;
 	if (fs->fs_contigsumsize <= 0)
 		return (ENOSPC);
 	buflist = ap->a_buflist;
 	len = buflist->bs_nchildren;
 	start_lbn = buflist->bs_children[0]->b_lblkno;
 	end_lbn = start_lbn + len - 1;
 #ifdef DIAGNOSTIC
 	for (i = 0; i < len; i++)
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 1");
 	for (i = 1; i < len; i++)
 		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
 			panic("ffs_reallocblks: non-logical cluster");
 	blkno = buflist->bs_children[0]->b_blkno;
 	ssize = fsbtodb(fs, fs->fs_frag);
 	for (i = 1; i < len - 1; i++)
 		if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
 			panic("ffs_reallocblks: non-physical cluster %d", i);
 #endif
 	/*
 	 * If the latest allocation is in a new cylinder group, assume that
 	 * the filesystem has decided to move and do not force it back to
 	 * the previous cylinder group.
 	 */
 	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
 	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
 		return (ENOSPC);
 	if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
 	    ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
 		return (ENOSPC);
 	/*
 	 * Get the starting offset and block map for the first block.
 	 */
 	if (start_lvl == 0) {
 		sbap = &ip->i_din2->di_db[0];
 		soff = start_lbn;
 	} else {
 		idp = &start_ap[start_lvl - 1];
 		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
 			brelse(sbp);
 			return (ENOSPC);
 		}
 		sbap = (ufs2_daddr_t *)sbp->b_data;
 		soff = idp->in_off;
 	}
 	/*
 	 * If the block range spans two block maps, get the second map.
 	 */
 	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
 		ssize = len;
 	} else {
 #ifdef DIAGNOSTIC
 		/*
 		 * Only look at start_ap[start_lvl - 1] when the first block
 		 * is mapped through an indirect block; with start_lvl == 0
 		 * the index would be -1 and read outside the array.
 		 */
 		if (start_lvl > 0 &&
 		    start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
 			panic("ffs_reallocblk: start == end");
 #endif
 		/* ssize = number of cluster blocks held in the first map. */
 		ssize = len - (idp->in_off + 1);
 		if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
 			goto fail;
 		ebap = (ufs2_daddr_t *)ebp->b_data;
 	}
 	/*
 	 * Find the preferred location for the cluster.
 	 */
 	UFS_LOCK(ump);
 	pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap);
 	/*
 	 * Search the block map looking for an allocation of the desired size.
 	 */
 	if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
 	    len, ffs_clusteralloc)) == 0) {
 		UFS_UNLOCK(ump);
 		goto fail;
 	}
 	/*
 	 * We have found a new contiguous block.
 	 *
 	 * First we have to replace the old block pointers with the new
 	 * block pointers in the inode and indirect blocks associated
 	 * with the file.
 	 */
 #ifdef DEBUG
 	if (prtrealloc)
 		printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number,
 		    (intmax_t)start_lbn, (intmax_t)end_lbn);
 #endif
 	blkno = newblk;
 	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
 		/* Crossed into the second block map: continue at its start. */
 		if (i == ssize) {
 			bap = ebap;
 			soff = -i;
 		}
 #ifdef DIAGNOSTIC
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 2");
 		if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
 			panic("ffs_reallocblks: alloc mismatch");
 #endif
 #ifdef DEBUG
 		if (prtrealloc)
 			printf(" %jd,", (intmax_t)*bap);
 #endif
 		if (DOINGSOFTDEP(vp)) {
 			if (sbap == &ip->i_din2->di_db[0] && i < ssize)
 				softdep_setup_allocdirect(ip, start_lbn + i,
 				    blkno, *bap, fs->fs_bsize, fs->fs_bsize,
 				    buflist->bs_children[i]);
 			else
 				softdep_setup_allocindir_page(ip, start_lbn + i,
 				    i < ssize ? sbp : ebp, soff + i, blkno,
 				    *bap, buflist->bs_children[i]);
 		}
 		*bap++ = blkno;
 	}
 	/*
 	 * Next we must write out the modified inode and indirect blocks.
 	 * For strict correctness, the writes should be synchronous since
 	 * the old block values may have been written to disk. In practise
 	 * they are almost never written, but if we are concerned about
 	 * strict correctness, the `doasyncfree' flag should be set to zero.
 	 *
 	 * The test on `doasyncfree' should be changed to test a flag
 	 * that shows whether the associated buffers and inodes have
 	 * been written. The flag should be set when the cluster is
 	 * started and cleared whenever the buffer or inode is flushed.
 	 * We can then check below to see if it is set, and do the
 	 * synchronous write only when it has been cleared.
 	 */
 	if (sbap != &ip->i_din2->di_db[0]) {
 		if (doasyncfree)
 			bdwrite(sbp);
 		else
 			bwrite(sbp);
 	} else {
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		if (!doasyncfree)
 			ffs_update(vp, 1);
 	}
 	if (ssize < len) {
 		if (doasyncfree)
 			bdwrite(ebp);
 		else
 			bwrite(ebp);
 	}
 	/*
 	 * Last, free the old blocks and assign the new blocks to the buffers.
 	 */
 #ifdef DEBUG
 	if (prtrealloc)
 		printf("\n\tnew:");
 #endif
 	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
 		if (!DOINGSOFTDEP(vp))
 			ffs_blkfree(ump, fs, ip->i_devvp,
 			    dbtofsb(fs, buflist->bs_children[i]->b_blkno),
 			    fs->fs_bsize, ip->i_number);
 		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
 #ifdef DIAGNOSTIC
 		if (!ffs_checkblk(ip,
 		   dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
 			panic("ffs_reallocblks: unallocated block 3");
 #endif
 #ifdef DEBUG
 		if (prtrealloc)
 			printf(" %jd,", (intmax_t)blkno);
 #endif
 	}
 #ifdef DEBUG
 	if (prtrealloc) {
 		prtrealloc--;
 		printf("\n");
 	}
 #endif
 	return (0);
 
 fail:
 	/* Release whichever block-map buffers were obtained above. */
 	if (ssize < len)
 		brelse(ebp);
 	if (sbap != &ip->i_din2->di_db[0])
 		brelse(sbp);
 	return (ENOSPC);
 }
 
 /*
  * Allocate an inode in the filesystem.
  *
  * If allocating a directory, use ffs_dirpref to select the inode.
  * If allocating in a directory, the following hierarchy is followed:
  *   1) allocate the preferred inode.
  *   2) allocate an inode in the same cylinder group.
  *   3) quadratically rehash into other cylinder groups, until an
  *      available inode is located.
  * If no inode preference is given the following hierarchy is used
  * to allocate an inode:
  *   1) allocate an inode in cylinder group 0.
  *   2) quadratically rehash into other cylinder groups, until an
  *      available inode is located.
  */
 /*
  * pvp is the parent directory's vnode, mode the file mode of the object
  * being created.  On success returns 0 with the new vnode in *vpp; on
  * failure returns the ffs_vget() error, or ENOSPC when no inode could be
  * allocated.  The cred argument is not referenced in the body.
  */
 int
 ffs_valloc(pvp, mode, cred, vpp)
 	struct vnode *pvp;
 	int mode;
 	struct ucred *cred;
 	struct vnode **vpp;
 {
 	static struct timeval lastfail;
 	static int curfail;
 	struct ufsmount *ump;
 	struct inode *pip, *ip;
 	struct vnode *nvp;
 	struct timespec ts;
 	struct fs *fs;
 	ino_t ino, ipref;
 	int cg, error, isdir;
 
 	*vpp = NULL;
 	pip = VTOI(pvp);
 	ump = pip->i_ump;
 	fs = pip->i_fs;
 	isdir = ((mode & IFMT) == IFDIR);
 
 	UFS_LOCK(ump);
 	if (fs->fs_cstotal.cs_nifree == 0)
 		goto noinodes;
 
 	/* Directories get a computed preference, others follow the parent. */
 	ipref = isdir ? ffs_dirpref(pip) : pip->i_number;
 	if (ipref >= fs->fs_ncg * fs->fs_ipg)
 		ipref = 0;
 	cg = ino_to_cg(fs, ipref);
 
 	/*
 	 * fs_contigdirs[cg] counts directories created back to back in
 	 * this cg: bumped (up to 255) for a directory, decayed (down to
 	 * 0) for anything else.
 	 */
 	if (isdir) {
 		if (fs->fs_contigdirs[cg] < 255)
 			fs->fs_contigdirs[cg]++;
 	} else if (fs->fs_contigdirs[cg] > 0) {
 		fs->fs_contigdirs[cg]--;
 	}
 
 	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode,
 	    (allocfcn_t *)ffs_nodealloccg);
 	if (ino == 0)
 		goto noinodes;
 
 	error = ffs_vget(pvp->v_mount, ino, LK_EXCLUSIVE, vpp);
 	if (error) {
 		/* Could not get a vnode for it: give the inode back. */
 		ffs_vfree(pvp, ino, mode);
 		return (error);
 	}
 	nvp = *vpp;
 	ip = VTOI(nvp);
 
 	/* A freshly allocated inode must not already carry a mode. */
 	if (ip->i_mode) {
 		printf("mode = 0%o, inum = %lu, fs = %s\n",
 		    ip->i_mode, (u_long)ip->i_number, fs->fs_fsmnt);
 		panic("ffs_valloc: dup alloc");
 	}
 	if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) {  /* XXX */
 		printf("free inode %s/%lu had %ld blocks\n",
 		    fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks));
 		DIP_SET(ip, i_blocks, 0);
 	}
 	ip->i_flags = 0;
 	DIP_SET(ip, i_flags, 0);
 
 	/*
 	 * New generation number: advance an existing one, and pick a
 	 * random non-zero value when there was none or it wrapped to 0.
 	 */
 	if (ip->i_gen != 0)
 		ip->i_gen++;
 	if (ip->i_gen == 0)
 		ip->i_gen = arc4random() / 2 + 1;
 	DIP_SET(ip, i_gen, ip->i_gen);
 
 	/* Only the UFS2 dinode has birth-time fields to stamp. */
 	if (fs->fs_magic == FS_UFS2_MAGIC) {
 		vfs_timestamp(&ts);
 		ip->i_din2->di_birthtime = ts.tv_sec;
 		ip->i_din2->di_birthnsec = ts.tv_nsec;
 	}
 	ip->i_flag = 0;
 	vnode_destroy_vobject(nvp);
 	nvp->v_type = VNON;
 	nvp->v_op = (fs->fs_magic == FS_UFS2_MAGIC) ?
 	    &ffs_vnodeops2 : &ffs_vnodeops1;
 	return (0);
 
 noinodes:
 	UFS_UNLOCK(ump);
 	/* Rate-limit the "out of inodes" complaints. */
 	if (ppsratecheck(&lastfail, &curfail, 1)) {
 		ffs_fserr(fs, pip->i_number, "out of inodes");
 		uprintf("\n%s: create/symlink failed, no inodes free\n",
 		    fs->fs_fsmnt);
 	}
 	return (ENOSPC);
 }
 
 /*
  * Find a cylinder group to place a directory.
  *
  * The policy implemented by this algorithm is to allocate a
  * directory inode in the same cylinder group as its parent
  * directory, but also to reserve space for its files inodes
  * and data. Restrict the number of directories which may be
  * allocated one after another in the same cylinder group
  * without intervening allocation of files.
  *
  * If we allocate a first level directory then force allocation
  * in another cylinder group.
  */
 static ino_t
 ffs_dirpref(pip)
 	struct inode *pip;
 {
 	struct fs *fs;
 	int cg, prefcg, dirsize, cgsize;
 	int avgifree, avgbfree, avgndir, curdirsize;
 	int minifree, minbfree, maxndir;
 	int mincg, minndir;
 	int maxcontigdirs;
 
 	mtx_assert(UFS_MTX(pip->i_ump), MA_OWNED);
 	fs = pip->i_fs;
 
 	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
 	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
 
 	/*
 	 * Force allocation in another cg if creating a first level dir.
 	 */
 	ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref");
 	if (ITOV(pip)->v_vflag & VV_ROOT) {
 		prefcg = arc4random() % fs->fs_ncg;
 		mincg = prefcg;
 		minndir = fs->fs_ipg;
 		for (cg = prefcg; cg < fs->fs_ncg; cg++)
 			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
 			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
 			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 				mincg = cg;
 				minndir = fs->fs_cs(fs, cg).cs_ndir;
 			}
 		for (cg = 0; cg < prefcg; cg++)
 			if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
 			    fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
 			    fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 				mincg = cg;
 				minndir = fs->fs_cs(fs, cg).cs_ndir;
 			}
 		return ((ino_t)(fs->fs_ipg * mincg));
 	}
 
 	/*
 	 * Count various limits which used for
 	 * optimal allocation of a directory inode.
 	 */
 	maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
 	minifree = avgifree - avgifree / 4;
 	if (minifree < 1)
 		minifree = 1;
 	minbfree = avgbfree - avgbfree / 4;
 	if (minbfree < 1)
 		minbfree = 1;
 	cgsize = fs->fs_fsize * fs->fs_fpg;
 	dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
 	curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
 	if (dirsize < curdirsize)
 		dirsize = curdirsize;
 	maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255);
 	if (fs->fs_avgfpdir > 0)
 		maxcontigdirs = min(maxcontigdirs,
 				    fs->fs_ipg / fs->fs_avgfpdir);
 	if (maxcontigdirs == 0)
 		maxcontigdirs = 1;
 
 	/*
 	 * Limit number of dirs in one cg and reserve space for 
 	 * regular files, but only if we have no deficit in
 	 * inodes or space.
 	 */
 	prefcg = ino_to_cg(fs, pip->i_number);
 	for (cg = prefcg; cg < fs->fs_ncg; cg++)
 		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
 		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
 	    	    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
 			if (fs->fs_contigdirs[cg] < maxcontigdirs)
 				return ((ino_t)(fs->fs_ipg * cg));
 		}
 	for (cg = 0; cg < prefcg; cg++)
 		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
 		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
 	    	    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
 			if (fs->fs_contigdirs[cg] < maxcontigdirs)
 				return ((ino_t)(fs->fs_ipg * cg));
 		}
 	/*
 	 * This is a backstop when we have deficit in space.
 	 */
 	for (cg = prefcg; cg < fs->fs_ncg; cg++)
 		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
 			return ((ino_t)(fs->fs_ipg * cg));
 	for (cg = 0; cg < prefcg; cg++)
 		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
 			break;
 	return ((ino_t)(fs->fs_ipg * cg));
 }
 
 /*
  * Select the desired position for the next block in a file.  The file is
  * logically divided into sections. The first section is composed of the
  * direct blocks. Each additional section contains fs_maxbpg blocks.
  *
  * If no blocks have been allocated in the first section, the policy is to
  * request a block in the same cylinder group as the inode that describes
  * the file. If no blocks have been allocated in any other section, the
  * policy is to place the section in a cylinder group with a greater than
  * average number of free blocks.  An appropriate cylinder group is found
  * by using a rotor that sweeps the cylinder groups. When a new group of
  * blocks is needed, the sweep begins in the cylinder group following the
  * cylinder group from which the previous allocation was made. The sweep
  * continues until a cylinder group with greater than the average number
  * of free blocks is found. If the allocation is for the first block in an
  * indirect block, the information on the previous allocation is unavailable;
  * here a best guess is made based upon the logical block number being
  * allocated.
  *
  * If a section is already partially allocated, the policy is to
  * contiguously allocate fs_maxcontig blocks. The end of one of these
  * contiguous blocks and the beginning of the next is laid out
  * contiguously if possible.
  */
 ufs2_daddr_t
 ffs_blkpref_ufs1(struct inode *ip, ufs_lbn_t lbn, int indx, ufs1_daddr_t *bap)
 {
 	struct fs *fs;
 	int avgbfree, cg, startcg;
 
 	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
 	fs = ip->i_fs;
 
 	/*
 	 * Inside a partially allocated section: just continue right
 	 * after the previously allocated block.
 	 */
 	if (indx % fs->fs_maxbpg != 0 && bap[indx - 1] != 0)
 		return (bap[indx - 1] + fs->fs_frag);
 
 	/*
 	 * The direct blocks and the first indirect block stay in the
 	 * cylinder group of the inode itself.
 	 */
 	if (lbn < NDADDR + NINDIR(fs)) {
 		cg = ino_to_cg(fs, ip->i_number);
 		return (cgbase(fs, cg) + fs->fs_frag);
 	}
 
 	/*
 	 * Starting a new section: sweep for a cylinder group holding at
 	 * least the average number of free blocks.  Begin after the group
 	 * of the previous block when it is known, otherwise guess from
 	 * the logical block number.
 	 */
 	if (indx != 0 && bap[indx - 1] != 0)
 		startcg = dtog(fs, bap[indx - 1]) + 1;
 	else
 		startcg = ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
 	startcg %= fs->fs_ncg;
 	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 
 	for (cg = startcg; cg < fs->fs_ncg; cg++) {
 		if (fs->fs_cs(fs, cg).cs_nbfree < avgbfree)
 			continue;
 		fs->fs_cgrotor = cg;
 		return (cgbase(fs, cg) + fs->fs_frag);
 	}
 	for (cg = 0; cg <= startcg; cg++) {
 		if (fs->fs_cs(fs, cg).cs_nbfree < avgbfree)
 			continue;
 		fs->fs_cgrotor = cg;
 		return (cgbase(fs, cg) + fs->fs_frag);
 	}
 	/* No group qualifies: no preference. */
 	return (0);
 }
 
 /*
  * Same as above, but for UFS2
  */
 ufs2_daddr_t
 ffs_blkpref_ufs2(struct inode *ip, ufs_lbn_t lbn, int indx, ufs2_daddr_t *bap)
 {
 	struct fs *fs;
 	int avgbfree, cg, startcg;
 
 	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
 	fs = ip->i_fs;
 
 	/*
 	 * Inside a partially allocated section: just continue right
 	 * after the previously allocated block.
 	 */
 	if (indx % fs->fs_maxbpg != 0 && bap[indx - 1] != 0)
 		return (bap[indx - 1] + fs->fs_frag);
 
 	/*
 	 * The direct blocks and the first indirect block stay in the
 	 * cylinder group of the inode itself.
 	 */
 	if (lbn < NDADDR + NINDIR(fs)) {
 		cg = ino_to_cg(fs, ip->i_number);
 		return (cgbase(fs, cg) + fs->fs_frag);
 	}
 
 	/*
 	 * Starting a new section: sweep for a cylinder group holding at
 	 * least the average number of free blocks.  Begin after the group
 	 * of the previous block when it is known, otherwise guess from
 	 * the logical block number.
 	 */
 	if (indx != 0 && bap[indx - 1] != 0)
 		startcg = dtog(fs, bap[indx - 1]) + 1;
 	else
 		startcg = ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
 	startcg %= fs->fs_ncg;
 	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
 
 	for (cg = startcg; cg < fs->fs_ncg; cg++) {
 		if (fs->fs_cs(fs, cg).cs_nbfree < avgbfree)
 			continue;
 		fs->fs_cgrotor = cg;
 		return (cgbase(fs, cg) + fs->fs_frag);
 	}
 	for (cg = 0; cg <= startcg; cg++) {
 		if (fs->fs_cs(fs, cg).cs_nbfree < avgbfree)
 			continue;
 		fs->fs_cgrotor = cg;
 		return (cgbase(fs, cg) + fs->fs_frag);
 	}
 	/* No group qualifies: no preference. */
 	return (0);
 }
 
 /*
  * Implement the cylinder overflow algorithm.
  *
  * The policy implemented by this algorithm is:
  *   1) allocate the block in its requested cylinder group.
  *   2) quadratically rehash on the cylinder group number.
  *   3) brute force search for a free block.
  *
  * Must be called with the UFS lock held.  Will release the lock on success
  * and return with it held on failure.
  */
 /*VARARGS5*/
 static ufs2_daddr_t
 ffs_hashalloc(struct inode *ip, int cg, ufs2_daddr_t pref, int size,
     allocfcn_t *allocator)
 {
 	struct fs *fs;
 	ufs2_daddr_t found;
 	int firstcg, step, tries;
 
 	/*
 	 * size is the request size for data blocks and the file mode for
 	 * inodes; it is handed to the allocator untouched.
 	 */
 	firstcg = cg;
 	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
 #ifdef DIAGNOSTIC
 	if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
 		panic("ffs_hashalloc: allocation on suspended filesystem");
 #endif
 	fs = ip->i_fs;
 
 	/* Pass 1: the preferred cylinder group, honouring pref. */
 	found = (*allocator)(ip, cg, pref, size);
 	if (found != 0)
 		return (found);
 
 	/* Pass 2: rehash, doubling the stride each time and wrapping. */
 	for (step = 1; step < fs->fs_ncg; step *= 2) {
 		cg += step;
 		if (cg >= fs->fs_ncg)
 			cg -= fs->fs_ncg;
 		found = (*allocator)(ip, cg, 0, size);
 		if (found != 0)
 			return (found);
 	}
 
 	/*
 	 * Pass 3: linear sweep.  Offsets 0 and 1 from the original group
 	 * were already covered by the passes above, so begin at offset 2.
 	 */
 	cg = (firstcg + 2) % fs->fs_ncg;
 	for (tries = 2; tries < fs->fs_ncg; tries++) {
 		found = (*allocator)(ip, cg, 0, size);
 		if (found != 0)
 			return (found);
 		if (++cg == fs->fs_ncg)
 			cg = 0;
 	}
 	return (0);
 }
 
 /*
  * Determine whether a fragment can be extended.
  *
  * Check to see if the necessary fragments are available, and
  * if they are, allocate them.
  */
 static ufs2_daddr_t
 ffs_fragextend(ip, cg, bprev, osize, nsize)
 	struct inode *ip;
 	int cg;
 	ufs2_daddr_t bprev;
 	int osize, nsize;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	struct ufsmount *ump;
 	int nffree;
 	long bno;
 	int frags, bbase;
 	int i, error;
 	u_int8_t *blksfree;
 
 	/*
 	 * Try to grow the fragment run that starts at bprev, in cylinder
 	 * group cg, from osize to nsize bytes in place.  Returns bprev on
 	 * success and 0 if the run cannot be extended.
 	 *
 	 * The UFS lock is released below before the cylinder group is
 	 * read, so the caller is expected to hold it on entry.  Every
 	 * "return (0)" path leaves with the lock held; the success path
 	 * leaves with it released.
 	 */
 	ump = ip->i_ump;
 	fs = ip->i_fs;
 	/* Cheap in-core check: not enough free fragments in this group. */
 	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
 		return (0);
 	frags = numfrags(fs, nsize);
 	bbase = fragnum(fs, bprev);
 	if (bbase > fragnum(fs, (bprev + frags - 1))) {
 		/* cannot extend across a block boundary */
 		return (0);
 	}
 	UFS_UNLOCK(ump);
 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
 		(int)fs->fs_cgsize, NOCRED, &bp);
 	if (error)
 		goto fail;
 	cgp = (struct cg *)bp->b_data;
 	if (!cg_chkmagic(cgp))
 		goto fail;
 	bp->b_xflags |= BX_BKGRDWRITE;
 	cgp->cg_old_time = cgp->cg_time = time_second;
 	bno = dtogd(fs, bprev);
 	blksfree = cg_blksfree(cgp);
 	/* Every fragment we want to grow into must currently be free. */
 	for (i = numfrags(fs, osize); i < frags; i++)
 		if (isclr(blksfree, bno + i))
 			goto fail;
 	/*
 	 * the current fragment can be extended
 	 * deduct the count on fragment being extended into
 	 * increase the count on the remaining fragment (if any)
 	 * allocate the extended piece
 	 */
 	/* Find where the free run following the old fragment ends. */
 	for (i = frags; i < fs->fs_frag - bbase; i++)
 		if (isclr(blksfree, bno + i))
 			break;
 	cgp->cg_frsum[i - numfrags(fs, osize)]--;
 	if (i != frags)
 		cgp->cg_frsum[i - frags]++;
 	/* Mark the newly claimed fragments as in use in the bitmap. */
 	for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) {
 		clrbit(blksfree, bno + i);
 		cgp->cg_cs.cs_nffree--;
 		nffree++;
 	}
 	/* The in-core summary counters are updated under the UFS lock. */
 	UFS_LOCK(ump);
 	fs->fs_cstotal.cs_nffree -= nffree;
 	fs->fs_cs(fs, cg).cs_nffree -= nffree;
 	fs->fs_fmod = 1;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
-		softdep_setup_blkmapdep(bp, fs, bprev);
+		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev);
 	bdwrite(bp);
 	return (bprev);
 
 fail:
 	/* Release the buffer and re-take the lock dropped above. */
 	brelse(bp);
 	UFS_LOCK(ump);
 	return (0);
 
 }
 
 /*
  * Determine whether a block can be allocated.
  *
  * Check to see if a block of the appropriate size is available,
  * and if it is, allocate it.
  */
 static ufs2_daddr_t
 ffs_alloccg(ip, cg, bpref, size)
 	struct inode *ip;
 	int cg;
 	ufs2_daddr_t bpref;
 	int size;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	struct ufsmount *ump;
 	ufs1_daddr_t bno;
 	ufs2_daddr_t blkno;
 	int i, allocsiz, error, frags;
 	u_int8_t *blksfree;
 
 	/*
 	 * Allocate size bytes (a full block or a run of fragments) in
 	 * cylinder group cg, preferring bpref.  Returns the allocated
 	 * address, or 0 if nothing suitable was found.
 	 *
 	 * The UFS lock is released below before the cylinder group is
 	 * read, so the caller is expected to hold it on entry.  The
 	 * "return (0)" paths written here leave with the lock held; the
 	 * paths that return blkno leave with it released.
 	 */
 	ump = ip->i_ump;
 	fs = ip->i_fs;
 	/* Cheap in-core check before doing any I/O. */
 	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
 		return (0);
 	UFS_UNLOCK(ump);
 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
 		(int)fs->fs_cgsize, NOCRED, &bp);
 	if (error)
 		goto fail;
 	cgp = (struct cg *)bp->b_data;
 	if (!cg_chkmagic(cgp) ||
 	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
 		goto fail;
 	bp->b_xflags |= BX_BKGRDWRITE;
 	cgp->cg_old_time = cgp->cg_time = time_second;
 	/* Full-block request: hand off to the block allocator. */
 	if (size == fs->fs_bsize) {
 		UFS_LOCK(ump);
 		blkno = ffs_alloccgblk(ip, bp, bpref);
 		ACTIVECLEAR(fs, cg);
 		UFS_UNLOCK(ump);
 		bdwrite(bp);
 		return (blkno);
 	}
 	/*
 	 * check to see if any fragments are already available
 	 * allocsiz is the size which will be allocated, hacking
 	 * it down to a smaller size if necessary
 	 */
 	blksfree = cg_blksfree(cgp);
 	frags = numfrags(fs, size);
 	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
 		if (cgp->cg_frsum[allocsiz] != 0)
 			break;
 	if (allocsiz == fs->fs_frag) {
 		/*
 		 * no fragments were available, so a block will be
 		 * allocated, and hacked up
 		 */
 		if (cgp->cg_cs.cs_nbfree == 0)
 			goto fail;
 		UFS_LOCK(ump);
 		blkno = ffs_alloccgblk(ip, bp, bpref);
 		bno = dtogd(fs, blkno);
 		/* Give the unused tail of the block back as free frags. */
 		for (i = frags; i < fs->fs_frag; i++)
 			setbit(blksfree, bno + i);
 		i = fs->fs_frag - frags;
 		cgp->cg_cs.cs_nffree += i;
 		fs->fs_cstotal.cs_nffree += i;
 		fs->fs_cs(fs, cg).cs_nffree += i;
 		fs->fs_fmod = 1;
 		cgp->cg_frsum[i]++;
 		ACTIVECLEAR(fs, cg);
 		UFS_UNLOCK(ump);
 		bdwrite(bp);
 		return (blkno);
 	}
 	/* A free run of allocsiz fragments exists; locate it. */
 	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
 	if (bno < 0)
 		goto fail;
 	for (i = 0; i < frags; i++)
 		clrbit(blksfree, bno + i);
 	cgp->cg_cs.cs_nffree -= frags;
 	cgp->cg_frsum[allocsiz]--;
 	/* Any leftover of the run becomes a smaller free run. */
 	if (frags != allocsiz)
 		cgp->cg_frsum[allocsiz - frags]++;
 	UFS_LOCK(ump);
 	fs->fs_cstotal.cs_nffree -= frags;
 	fs->fs_cs(fs, cg).cs_nffree -= frags;
 	fs->fs_fmod = 1;
 	blkno = cgbase(fs, cg) + bno;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
-		softdep_setup_blkmapdep(bp, fs, blkno);
+		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno);
 	bdwrite(bp);
 	return (blkno);
 
 fail:
 	/* Release the buffer and re-take the lock dropped above. */
 	brelse(bp);
 	UFS_LOCK(ump);
 	return (0);
 }
 
 /*
  * Allocate a block in a cylinder group.
  *
  * This algorithm implements the following policy:
  *   1) allocate the requested block.
  *   2) allocate a rotationally optimal block in the same cylinder.
  *   3) allocate the next available block on the block rotor for the
  *      specified cylinder group.
  * Note that this routine only allocates fs_bsize blocks; these
  * blocks may be fragmented by the routine that allocates them.
  */
 static ufs2_daddr_t
 ffs_alloccgblk(ip, bp, bpref)
 	struct inode *ip;
 	struct buf *bp;
 	ufs2_daddr_t bpref;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct ufsmount *ump;
 	ufs1_daddr_t bno;
 	ufs2_daddr_t blkno;
 	u_int8_t *blksfree;
 
 	/*
 	 * bp holds the cylinder group to allocate from and bpref is the
 	 * preferred block (0 for none).  Returns the address of the
 	 * allocated block, or 0 if the map search reports failure.
 	 * Entered and left with the UFS lock held (asserted below), but
 	 * the lock is dropped briefly around the soft updates call.
 	 */
 	fs = ip->i_fs;
 	ump = ip->i_ump;
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 	cgp = (struct cg *)bp->b_data;
 	blksfree = cg_blksfree(cgp);
 	if (bpref == 0 || dtog(fs, bpref) != cgp->cg_cgx) {
 		/* No usable preference: start from this group's rotor. */
 		bpref = cgp->cg_rotor;
 	} else {
 		bpref = blknum(fs, bpref);
 		bno = dtogd(fs, bpref);
 		/*
 		 * if the requested block is available, use it
 		 */
 		if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
 			goto gotit;
 	}
 	/*
 	 * Take the next available block in this cylinder group.
 	 */
 	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
 	if (bno < 0)
 		return (0);
 	cgp->cg_rotor = bno;
 gotit:
 	/* Mark the block allocated and update every free-block counter. */
 	blkno = fragstoblks(fs, bno);
 	ffs_clrblock(fs, blksfree, (long)blkno);
 	ffs_clusteracct(ump, fs, cgp, blkno, -1);
 	cgp->cg_cs.cs_nbfree--;
 	fs->fs_cstotal.cs_nbfree--;
 	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
 	fs->fs_fmod = 1;
 	blkno = cgbase(fs, cgp->cg_cgx) + bno;
 	/* XXX Fixme. */
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
-		softdep_setup_blkmapdep(bp, fs, blkno);
+		softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno);
 	UFS_LOCK(ump);
 	return (blkno);
 }
 
 /*
  * Determine whether a cluster can be allocated.
  *
  * We do not currently check for optimal rotational layout if there
  * are multiple choices in the same cylinder group. Instead we just
  * take the first one that we find following bpref.
  */
 static ufs2_daddr_t
 ffs_clusteralloc(ip, cg, bpref, len)
 	struct inode *ip;
 	int cg;
 	ufs2_daddr_t bpref;
 	int len;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	struct ufsmount *ump;
 	int i, run, bit, map, got;
 	ufs2_daddr_t bno;
 	u_char *mapp;
 	int32_t *lp;
 	u_int8_t *blksfree;
 
 	/*
 	 * Allocate len contiguous blocks in cylinder group cg at or
 	 * after bpref.  Returns the address of the first block, or 0 if
 	 * no such cluster was found.
 	 *
 	 * The UFS lock is released below before the cylinder group is
 	 * read, so the caller is expected to hold it on entry.  Every
 	 * "return (0)" path leaves with the lock held; the success path
 	 * leaves with it released.
 	 */
 	fs = ip->i_fs;
 	ump = ip->i_ump;
 	/* In-core hint: this group has no cluster that large. */
 	if (fs->fs_maxcluster[cg] < len)
 		return (0);
 	UFS_UNLOCK(ump);
 	if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize,
 	    NOCRED, &bp))
 		goto fail_lock;
 	cgp = (struct cg *)bp->b_data;
 	if (!cg_chkmagic(cgp))
 		goto fail_lock;
 	bp->b_xflags |= BX_BKGRDWRITE;
 	/*
 	 * Check to see if a cluster of the needed size (or bigger) is
 	 * available in this cylinder group.
 	 */
 	lp = &cg_clustersum(cgp)[len];
 	for (i = len; i <= fs->fs_contigsumsize; i++)
 		if (*lp++ > 0)
 			break;
 	if (i > fs->fs_contigsumsize) {
 		/*
 		 * This is the first time looking for a cluster in this
 		 * cylinder group. Update the cluster summary information
 		 * to reflect the true maximum sized cluster so that
 		 * future cluster allocation requests can avoid reading
 		 * the cylinder group map only to find no clusters.
 		 */
 		lp = &cg_clustersum(cgp)[len - 1];
 		for (i = len - 1; i > 0; i--)
 			if (*lp-- > 0)
 				break;
 		/* Lock is re-taken here, so skip the fail_lock label. */
 		UFS_LOCK(ump);
 		fs->fs_maxcluster[cg] = i;
 		goto fail;
 	}
 	/*
 	 * Search the cluster map to find a big enough cluster.
 	 * We take the first one that we find, even if it is larger
 	 * than we need as we prefer to get one close to the previous
 	 * block allocation. We do not search before the current
 	 * preference point as we do not want to allocate a block
 	 * that is allocated before the previous one (as we will
 	 * then have to wait for another pass of the elevator
 	 * algorithm before it will be read). We prefer to fail and
 	 * be recalled to try an allocation in the next cylinder group.
 	 */
 	if (dtog(fs, bpref) != cg)
 		bpref = 0;
 	else
 		bpref = fragstoblks(fs, dtogd(fs, blknum(fs, bpref)));
 	/*
 	 * Walk the cluster bitmap one bit per block starting at bpref;
 	 * run counts consecutive set (free) bits and got is the block
 	 * index of the bit being examined.
 	 */
 	mapp = &cg_clustersfree(cgp)[bpref / NBBY];
 	map = *mapp++;
 	bit = 1 << (bpref % NBBY);
 	for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
 		if ((map & bit) == 0) {
 			run = 0;
 		} else {
 			run++;
 			if (run == len)
 				break;
 		}
 		/* Advance to the next bit, fetching a new byte as needed. */
 		if ((got & (NBBY - 1)) != (NBBY - 1)) {
 			bit <<= 1;
 		} else {
 			map = *mapp++;
 			bit = 1;
 		}
 	}
 	if (got >= cgp->cg_nclusterblks)
 		goto fail_lock;
 	/*
 	 * Allocate the cluster that we have found.
 	 */
 	blksfree = cg_blksfree(cgp);
 	/* Cross-check the cluster map against the block free map. */
 	for (i = 1; i <= len; i++)
 		if (!ffs_isblock(fs, blksfree, got - run + i))
 			panic("ffs_clusteralloc: map mismatch");
 	bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1);
 	if (dtog(fs, bno) != cg)
 		panic("ffs_clusteralloc: allocated out of group");
 	/* From here on len is measured in fragments, not blocks. */
 	len = blkstofrags(fs, len);
 	UFS_LOCK(ump);
 	for (i = 0; i < len; i += fs->fs_frag)
 		if (ffs_alloccgblk(ip, bp, bno + i) != bno + i)
 			panic("ffs_clusteralloc: lost block");
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	bdwrite(bp);
 	return (bno);
 
 fail_lock:
 	/* Failure detected while unlocked: re-take the lock first. */
 	UFS_LOCK(ump);
 fail:
 	brelse(bp);
 	return (0);
 }
 
 /*
  * Determine whether an inode can be allocated.
  *
  * Check to see if an inode is available, and if it is,
  * allocate it using the following policy:
  *   1) allocate the requested inode.
  *   2) allocate the next available inode after the requested
  *      inode in the specified cylinder group.
  */
 static ufs2_daddr_t
 ffs_nodealloccg(ip, cg, ipref, mode)
 	struct inode *ip;
 	int cg;
 	ufs2_daddr_t ipref;
 	int mode;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp, *ibp;
 	struct ufsmount *ump;
 	u_int8_t *inosused;
 	struct ufs2_dinode *dp2;
 	int error, start, len, loc, map, i;
 
 	/*
 	 * Allocate an inode in cylinder group cg, preferring ipref.
 	 * mode is only examined to see whether a directory is being
 	 * created.  Returns the new inode number, or 0 on failure.
 	 *
 	 * The UFS lock is released below before the cylinder group is
 	 * read, so the caller is expected to hold it on entry.  Every
 	 * "return (0)" path leaves with the lock held; the success path
 	 * leaves with it released.
 	 */
 	fs = ip->i_fs;
 	ump = ip->i_ump;
 	/* Cheap in-core check before doing any I/O. */
 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
 		return (0);
 	UFS_UNLOCK(ump);
 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
 		(int)fs->fs_cgsize, NOCRED, &bp);
 	if (error) {
 		brelse(bp);
 		UFS_LOCK(ump);
 		return (0);
 	}
 	cgp = (struct cg *)bp->b_data;
 	if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) {
 		brelse(bp);
 		UFS_LOCK(ump);
 		return (0);
 	}
 	bp->b_xflags |= BX_BKGRDWRITE;
 	cgp->cg_old_time = cgp->cg_time = time_second;
 	inosused = cg_inosused(cgp);
 	/* Use the preferred inode if its bit is clear (not in use). */
 	if (ipref) {
 		ipref %= fs->fs_ipg;
 		if (isclr(inosused, ipref))
 			goto gotit;
 	}
 	/*
 	 * Otherwise look for a map byte that is not 0xff, first from the
 	 * inode rotor to the end of the map, then from the beginning of
 	 * the map up to and including the rotor's byte.
 	 */
 	start = cgp->cg_irotor / NBBY;
 	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
 	loc = skpc(0xff, len, &inosused[start]);
 	if (loc == 0) {
 		len = start + 1;
 		start = 0;
 		loc = skpc(0xff, len, &inosused[0]);
 		if (loc == 0) {
 			printf("cg = %d, irotor = %ld, fs = %s\n",
 			    cg, (long)cgp->cg_irotor, fs->fs_fsmnt);
 			panic("ffs_nodealloccg: map corrupted");
 			/* NOTREACHED */
 		}
 	}
 	/* i is the index of the byte found; pick its first clear bit. */
 	i = start + len - loc;
 	map = inosused[i];
 	ipref = i * NBBY;
 	for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
 		if ((map & i) == 0) {
 			cgp->cg_irotor = ipref;
 			goto gotit;
 		}
 	}
 	printf("fs = %s\n", fs->fs_fsmnt);
 	panic("ffs_nodealloccg: block not in map");
 	/* NOTREACHED */
 gotit:
 	/*
 	 * Check to see if we need to initialize more inodes.
 	 */
 	ibp = NULL;
 	if (fs->fs_magic == FS_UFS2_MAGIC &&
 	    ipref + INOPB(fs) > cgp->cg_initediblk &&
 	    cgp->cg_initediblk < cgp->cg_niblk) {
 		/*
 		 * Zero the next block of on-disk inodes and give each one
 		 * a random, non-zero generation number.
 		 */
 		ibp = getblk(ip->i_devvp, fsbtodb(fs,
 		    ino_to_fsba(fs, cg * fs->fs_ipg + cgp->cg_initediblk)),
 		    (int)fs->fs_bsize, 0, 0, 0);
 		bzero(ibp->b_data, (int)fs->fs_bsize);
 		dp2 = (struct ufs2_dinode *)(ibp->b_data);
 		for (i = 0; i < INOPB(fs); i++) {
 			dp2->di_gen = arc4random() / 2 + 1;
 			dp2++;
 		}
 		cgp->cg_initediblk += INOPB(fs);
 	}
 	/* Mark the inode in use and update the counters under the lock. */
 	UFS_LOCK(ump);
 	ACTIVECLEAR(fs, cg);
 	setbit(inosused, ipref);
 	cgp->cg_cs.cs_nifree--;
 	fs->fs_cstotal.cs_nifree--;
 	fs->fs_cs(fs, cg).cs_nifree--;
 	fs->fs_fmod = 1;
 	if ((mode & IFMT) == IFDIR) {
 		cgp->cg_cs.cs_ndir++;
 		fs->fs_cstotal.cs_ndir++;
 		fs->fs_cs(fs, cg).cs_ndir++;
 	}
 	UFS_UNLOCK(ump);
 	if (DOINGSOFTDEP(ITOV(ip)))
 		softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref);
 	bdwrite(bp);
 	/* The freshly initialized inode block is written asynchronously. */
 	if (ibp != NULL)
 		bawrite(ibp);
 	return (cg * fs->fs_ipg + ipref);
 }
 
 /*
  * check if a block is free
  */
 static int
 ffs_isfreeblock(struct fs *fs, u_char *cp, ufs1_daddr_t h)
 {
 	int frag, mask;
 
 	/*
 	 * Each block owns fs_frag consecutive bits of the map; the test
 	 * succeeds when every one of those bits is zero.
 	 */
 	frag = (int)fs->fs_frag;
 	if (frag == 8)
 		return (cp[h] == 0);
 	if (frag == 4) {
 		mask = 0x0f << ((h & 0x1) << 2);
 		return ((cp[h >> 1] & mask) == 0);
 	}
 	if (frag == 2) {
 		mask = 0x03 << ((h & 0x3) << 1);
 		return ((cp[h >> 2] & mask) == 0);
 	}
 	if (frag == 1) {
 		mask = 0x01 << (h & 0x7);
 		return ((cp[h >> 3] & mask) == 0);
 	}
 	/* Any other fragments-per-block value is unsupported. */
 	panic("ffs_isfreeblock");
 	return (0);
 }
 
 /*
  * Free a block or fragment.
  *
  * The specified block or fragment is placed back in the
  * free map. If a fragment is deallocated, a possible
  * block reassembly is checked.
  */
 void
 ffs_blkfree(ump, fs, devvp, bno, size, inum)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 {
 	struct cg *cgp;
 	struct buf *bp;
 	ufs1_daddr_t fragno, cgbno;
 	ufs2_daddr_t cgblkno;
 	int i, cg, blk, frags, bbase;
 	u_int8_t *blksfree;
 	struct cdev *dev;
 
 	cg = dtog(fs, bno);
 	if (devvp->v_type != VCHR) {
 		/* devvp is a snapshot */
 		dev = VTOI(devvp)->i_devvp->v_rdev;
 		cgblkno = fragstoblks(fs, cgtod(fs, cg));
 	} else {
 		/* devvp is a normal disk device */
 		dev = devvp->v_rdev;
 		cgblkno = fsbtodb(fs, cgtod(fs, cg));
 		ASSERT_VOP_LOCKED(devvp, "ffs_blkfree");
 		if ((devvp->v_vflag & VV_COPYONWRITE) &&
 		    ffs_snapblkfree(fs, devvp, bno, size, inum))
 			return;
 	}
 #ifdef DIAGNOSTIC
 	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
 	    fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
 		printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n",
 		    devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize,
 		    size, fs->fs_fsmnt);
 		panic("ffs_blkfree: bad size");
 	}
 #endif
 	if ((u_int)bno >= fs->fs_size) {
 		printf("bad block %jd, ino %lu\n", (intmax_t)bno,
 		    (u_long)inum);
 		ffs_fserr(fs, inum, "bad block");
 		return;
 	}
 	if (bread(devvp, cgblkno, (int)fs->fs_cgsize, NOCRED, &bp)) {
 		brelse(bp);
 		return;
 	}
 	cgp = (struct cg *)bp->b_data;
 	if (!cg_chkmagic(cgp)) {
 		brelse(bp);
 		return;
 	}
 	bp->b_xflags |= BX_BKGRDWRITE;
 	cgp->cg_old_time = cgp->cg_time = time_second;
 	cgbno = dtogd(fs, bno);
 	blksfree = cg_blksfree(cgp);
 	UFS_LOCK(ump);
 	if (size == fs->fs_bsize) {
 		fragno = fragstoblks(fs, cgbno);
 		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
 			if (devvp->v_type != VCHR) {
 				UFS_UNLOCK(ump);
 				/* devvp is a snapshot */
 				brelse(bp);
 				return;
 			}
 			printf("dev = %s, block = %jd, fs = %s\n",
 			    devtoname(dev), (intmax_t)bno, fs->fs_fsmnt);
 			panic("ffs_blkfree: freeing free block");
 		}
 		ffs_setblock(fs, blksfree, fragno);
 		ffs_clusteracct(ump, fs, cgp, fragno, 1);
 		cgp->cg_cs.cs_nbfree++;
 		fs->fs_cstotal.cs_nbfree++;
 		fs->fs_cs(fs, cg).cs_nbfree++;
 	} else {
 		bbase = cgbno - fragnum(fs, cgbno);
 		/*
 		 * decrement the counts associated with the old frags
 		 */
 		blk = blkmap(fs, blksfree, bbase);
 		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
 		/*
 		 * deallocate the fragment
 		 */
 		frags = numfrags(fs, size);
 		for (i = 0; i < frags; i++) {
 			if (isset(blksfree, cgbno + i)) {
 				printf("dev = %s, block = %jd, fs = %s\n",
 				    devtoname(dev), (intmax_t)(bno + i),
 				    fs->fs_fsmnt);
 				panic("ffs_blkfree: freeing free frag");
 			}
 			setbit(blksfree, cgbno + i);
 		}
 		cgp->cg_cs.cs_nffree += i;
 		fs->fs_cstotal.cs_nffree += i;
 		fs->fs_cs(fs, cg).cs_nffree += i;
 		/*
 		 * add back in counts associated with the new frags
 		 */
 		blk = blkmap(fs, blksfree, bbase);
 		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
 		/*
 		 * if a complete block has been reassembled, account for it
 		 */
 		fragno = fragstoblks(fs, bbase);
 		if (ffs_isblock(fs, blksfree, fragno)) {
 			cgp->cg_cs.cs_nffree -= fs->fs_frag;
 			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
 			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
 			ffs_clusteracct(ump, fs, cgp, fragno, 1);
 			cgp->cg_cs.cs_nbfree++;
 			fs->fs_cstotal.cs_nbfree++;
 			fs->fs_cs(fs, cg).cs_nbfree++;
 		}
 	}
 	fs->fs_fmod = 1;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	bdwrite(bp);
 }
 
 #ifdef DIAGNOSTIC
 /*
  * Verify allocation of a block or fragment. Returns true if block or
  * fragment is allocated, false if it is free.
  */
 static int
 ffs_checkblk(ip, bno, size)
 	struct inode *ip;
 	ufs2_daddr_t bno;
 	long size;
 {
 	struct fs *fs;
 	struct cg *cgp;
 	struct buf *bp;
 	ufs1_daddr_t cgbno;
 	int i, error, frags, free;
 	u_int8_t *blksfree;
 
 	fs = ip->i_fs;
 	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
 		printf("bsize = %ld, size = %ld, fs = %s\n",
 		    (long)fs->fs_bsize, size, fs->fs_fsmnt);
 		panic("ffs_checkblk: bad size");
 	}
 	if ((u_int)bno >= fs->fs_size)
 		panic("ffs_checkblk: bad block %jd", (intmax_t)bno);
 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, dtog(fs, bno))),
 		(int)fs->fs_cgsize, NOCRED, &bp);
 	if (error)
 		panic("ffs_checkblk: cg bread failed");
 	cgp = (struct cg *)bp->b_data;
 	if (!cg_chkmagic(cgp))
 		panic("ffs_checkblk: cg magic mismatch");
 	bp->b_xflags |= BX_BKGRDWRITE;
 	blksfree = cg_blksfree(cgp);
 	cgbno = dtogd(fs, bno);
 	if (size == fs->fs_bsize) {
 		free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno));
 	} else {
 		frags = numfrags(fs, size);
 		for (free = 0, i = 0; i < frags; i++)
 			if (isset(blksfree, cgbno + i))
 				free++;
 		if (free != 0 && free != frags)
 			panic("ffs_checkblk: partially free fragment");
 	}
 	brelse(bp);
 	return (!free);
 }
 #endif /* DIAGNOSTIC */
 
 /*
  * Free an inode.
  */
 int
 ffs_vfree(struct vnode *pvp, ino_t ino, int mode)
 {
 	struct inode *pip;
 
 	/* Without soft updates the inode is released immediately. */
 	if (!DOINGSOFTDEP(pvp)) {
 		pip = VTOI(pvp);
 		return (ffs_freefile(pip->i_ump, pip->i_fs, pip->i_devvp,
 		    ino, mode));
 	}
 
 	/* With soft updates the release is handed to the softdep code. */
 	softdep_freefile(pvp, ino, mode);
 	return (0);
 }
 
 /*
  * Do the actual free operation.
  * The specified inode is placed back in the free map.
  */
 int
 ffs_freefile(ump, fs, devvp, ino, mode)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct vnode *devvp;
 	ino_t ino;
 	int mode;
 {
 	struct cg *cgp;
 	struct buf *bp;
 	ufs2_daddr_t cgbno;
 	int error, cg;
 	u_int8_t *inosused;
 	struct cdev *dev;
 
 	/*
 	 * Clear inode ino's bit in the inode map of its cylinder group
 	 * and bump the free-inode counters.  devvp is either the disk
 	 * device or a snapshot vnode; mode is only examined to see
 	 * whether a directory is being freed.  Returns the bread() error
 	 * if the cylinder group cannot be read and 0 otherwise, including
 	 * when the cylinder group magic number is wrong.
 	 */
 	cg = ino_to_cg(fs, ino);
 	if (devvp->v_type != VCHR) {
 		/* devvp is a snapshot */
 		dev = VTOI(devvp)->i_devvp->v_rdev;
 		cgbno = fragstoblks(fs, cgtod(fs, cg));
 	} else {
 		/* devvp is a normal disk device */
 		dev = devvp->v_rdev;
 		cgbno = fsbtodb(fs, cgtod(fs, cg));
 	}
 	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
 		panic("ffs_freefile: range: dev = %s, ino = %lu, fs = %s",
 		    devtoname(dev), (u_long)ino, fs->fs_fsmnt);
 	if ((error = bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp))) {
 		brelse(bp);
 		return (error);
 	}
 	cgp = (struct cg *)bp->b_data;
 	if (!cg_chkmagic(cgp)) {
 		brelse(bp);
 		return (0);
 	}
 	bp->b_xflags |= BX_BKGRDWRITE;
 	cgp->cg_old_time = cgp->cg_time = time_second;
 	inosused = cg_inosused(cgp);
 	/* From here on ino is the index within the cylinder group. */
 	ino %= fs->fs_ipg;
 	if (isclr(inosused, ino)) {
 		printf("dev = %s, ino = %lu, fs = %s\n", devtoname(dev),
 		    (u_long)ino + cg * fs->fs_ipg, fs->fs_fsmnt);
 		/* Only fatal when the filesystem is writable. */
 		if (fs->fs_ronly == 0)
 			panic("ffs_freefile: freeing free inode");
 	}
 	clrbit(inosused, ino);
 	/* Pull the rotor back so the freed inode is found again soon. */
 	if (ino < cgp->cg_irotor)
 		cgp->cg_irotor = ino;
 	cgp->cg_cs.cs_nifree++;
 	/* The in-core summary counters are updated under the UFS lock. */
 	UFS_LOCK(ump);
 	fs->fs_cstotal.cs_nifree++;
 	fs->fs_cs(fs, cg).cs_nifree++;
 	if ((mode & IFMT) == IFDIR) {
 		cgp->cg_cs.cs_ndir--;
 		fs->fs_cstotal.cs_ndir--;
 		fs->fs_cs(fs, cg).cs_ndir--;
 	}
 	fs->fs_fmod = 1;
 	ACTIVECLEAR(fs, cg);
 	UFS_UNLOCK(ump);
 	bdwrite(bp);
 	return (0);
 }
 
 /*
  * Check to see if a file is free.
  */
 int
 ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
 {
 	struct buf *bp;
 	struct cg *cgp;
 	ufs2_daddr_t cgbno;
 	int cg, isfree;
 
 	cg = ino_to_cg(fs, ino);
 	if (devvp->v_type == VCHR) {
 		/* a real disk device is addressed in device blocks */
 		cgbno = fsbtodb(fs, cgtod(fs, cg));
 	} else {
 		/* a snapshot is addressed in filesystem blocks */
 		cgbno = fragstoblks(fs, cgtod(fs, cg));
 	}
 
 	/* An inode number beyond the filesystem is reported as free. */
 	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
 		return (1);
 
 	/*
 	 * Report "free" unless the cylinder group can be read, carries a
 	 * valid magic number, and has the inode's bit set.
 	 */
 	isfree = 1;
 	if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp) == 0) {
 		cgp = (struct cg *)bp->b_data;
 		if (cg_chkmagic(cgp)) {
 			ino %= fs->fs_ipg;
 			isfree = isclr(cg_inosused(cgp), ino);
 		}
 	}
 	brelse(bp);
 	return (isfree);
 }
 
 /*
  * Find a block of the specified size in the specified cylinder group.
  *
  * It is a panic if a request is made to find a block if none are
  * available.
  */
 static ufs1_daddr_t
 ffs_mapsearch(fs, cgp, bpref, allocsiz)
 	struct fs *fs;
 	struct cg *cgp;
 	ufs2_daddr_t bpref;
 	int allocsiz;
 {
 	ufs1_daddr_t bno;
 	int start, len, loc, i;
 	int blk, field, subfield, pos;
 	u_int8_t *blksfree;
 
 	/*
 	 * Locate a free run of allocsiz fragments in cylinder group cgp,
 	 * searching from bpref if given and from the fragment rotor
 	 * otherwise.  Returns the fragment number within the group; the
 	 * failure cases panic, so the trailing "return (-1)" only
 	 * matters if panic() returns.
 	 */
 	/*
 	 * find the fragment by searching through the free block
 	 * map for an appropriate bit pattern
 	 */
 	if (bpref)
 		start = dtogd(fs, bpref) / NBBY;
 	else
 		start = cgp->cg_frotor / NBBY;
 	blksfree = cg_blksfree(cgp);
 	/* First pass: from the starting byte to the end of the map. */
 	len = howmany(fs->fs_fpg, NBBY) - start;
 	loc = scanc((u_int)len, (u_char *)&blksfree[start],
 		(u_char *)fragtbl[fs->fs_frag],
 		(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
 	if (loc == 0) {
 		/* Second pass: wrap around and scan from the beginning. */
 		len = start + 1;
 		start = 0;
 		loc = scanc((u_int)len, (u_char *)&blksfree[0],
 			(u_char *)fragtbl[fs->fs_frag],
 			(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
 		if (loc == 0) {
 			printf("start = %d, len = %d, fs = %s\n",
 			    start, len, fs->fs_fsmnt);
 			panic("ffs_alloccg: map corrupted");
 			/* NOTREACHED */
 		}
 	}
 	/*
 	 * presumably loc counts the bytes left unscanned when the match
 	 * was found, so start + len - loc is the matching byte's index
 	 * -- TODO confirm against scanc()
 	 */
 	bno = (start + len - loc) * NBBY;
 	cgp->cg_frotor = bno;
 	/*
 	 * found the byte in the map
 	 * sift through the bits to find the selected frag
 	 */
 	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
 		blk = blkmap(fs, blksfree, bno);
 		/* Shift up one bit before matching the around[] masks. */
 		blk <<= 1;
 		field = around[allocsiz];
 		subfield = inside[allocsiz];
 		/* Slide the pattern across each position in the block. */
 		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
 			if ((blk & field) == subfield)
 				return (bno + pos);
 			field <<= 1;
 			subfield <<= 1;
 		}
 	}
 	printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt);
 	panic("ffs_alloccg: block not in map");
 	return (-1);
 }
 
 /*
  * Update the cluster map because of an allocation or free.
  *
  * Cnt == 1 means free; cnt == -1 means allocating.
  */
 void
 ffs_clusteracct(ump, fs, cgp, blkno, cnt)
 	struct ufsmount *ump;
 	struct fs *fs;
 	struct cg *cgp;
 	ufs1_daddr_t blkno;
 	int cnt;
 {
 	int32_t *sump;
 	int32_t *lp;
 	u_char *freemapp, *mapp;
 	int i, start, end, forw, back, map, bit;
 
 	/*
 	 * Record in cylinder group cgp's cluster map and cluster summary
 	 * that block blkno was freed (cnt == 1) or allocated (cnt == -1),
 	 * then refresh fs_maxcluster for the group.  The UFS mutex must
 	 * be held (asserted below).
 	 */
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 
 	/* Nothing to maintain when cluster summaries are disabled. */
 	if (fs->fs_contigsumsize <= 0)
 		return;
 	freemapp = cg_clustersfree(cgp);
 	sump = cg_clustersum(cgp);
 	/*
 	 * Allocate or clear the actual block.
 	 */
 	if (cnt > 0)
 		setbit(freemapp, blkno);
 	else
 		clrbit(freemapp, blkno);
 	/*
 	 * Find the size of the cluster going forward.
 	 */
 	start = blkno + 1;
 	end = start + fs->fs_contigsumsize;
 	if (end >= cgp->cg_nclusterblks)
 		end = cgp->cg_nclusterblks;
 	mapp = &freemapp[start / NBBY];
 	map = *mapp++;
 	bit = 1 << (start % NBBY);
 	for (i = start; i < end; i++) {
 		if ((map & bit) == 0)
 			break;
 		if ((i & (NBBY - 1)) != (NBBY - 1)) {
 			bit <<= 1;
 		} else {
 			map = *mapp++;
 			bit = 1;
 		}
 	}
 	forw = i - start;
 	/*
 	 * Find the size of the cluster going backward.
 	 *
 	 * NOTE(review): when blkno is 0, start is -1 here, so
 	 * start % NBBY is negative and the shift computing bit is
 	 * undefined in C, even though the loop body never runs because
 	 * end is clamped to -1 -- confirm and consider guarding.
 	 */
 	start = blkno - 1;
 	end = start - fs->fs_contigsumsize;
 	if (end < 0)
 		end = -1;
 	mapp = &freemapp[start / NBBY];
 	map = *mapp--;
 	bit = 1 << (start % NBBY);
 	for (i = start; i > end; i--) {
 		if ((map & bit) == 0)
 			break;
 		if ((i & (NBBY - 1)) != 0) {
 			bit >>= 1;
 		} else {
 			map = *mapp--;
 			bit = 1 << (NBBY - 1);
 		}
 	}
 	back = start - i;
 	/*
 	 * Account for old cluster and the possibly new forward and
 	 * back clusters.
 	 */
 	i = back + forw + 1;
 	/* Sizes above fs_contigsumsize all share the last bucket. */
 	if (i > fs->fs_contigsumsize)
 		i = fs->fs_contigsumsize;
 	sump[i] += cnt;
 	if (back > 0)
 		sump[back] -= cnt;
 	if (forw > 0)
 		sump[forw] -= cnt;
 	/*
 	 * Update cluster summary information.
 	 */
 	/* Largest cluster size with a non-zero count, or 0 if none. */
 	lp = &sump[fs->fs_contigsumsize];
 	for (i = fs->fs_contigsumsize; i > 0; i--)
 		if (*lp-- > 0)
 			break;
 	fs->fs_maxcluster[cgp->cg_cgx] = i;
 }
 
 /*
  * Fserr logs a filesystem error diagnostic at LOG_ERR priority.
  *
  * The message names the current thread's process (pid and command name),
  * the uid from the thread's credential, the inode number passed in, and
  * the fs_fsmnt string of the filesystem.  Its form is:
  *	pid P (comm), uid U inumber I on fsmnt: error message
  *
  * fs   - filesystem the error refers to (only fs_fsmnt is read).
  * inum - inode number to report.
  * cp   - error message text appended to the diagnostic.
  */
 static void
 ffs_fserr(fs, inum, cp)
 	struct fs *fs;
 	ino_t inum;
 	char *cp;
 {
 	struct thread *td = curthread;	/* XXX */
 	struct proc *p = td->td_proc;
 
 	/*
 	 * Print the inode number through uintmax_t with %ju so that the
 	 * conversion matches the argument whatever the width of ino_t and
 	 * large inode numbers are not shown as negative values.
 	 */
 	log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n",
 	    p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum,
 	    fs->fs_fsmnt, cp);
 }
 
 /*
  * This function provides the capability for the fsck program to
  * update an active filesystem. Eleven operations are provided:
  *
  * adjrefcnt(inode, amt) - adjusts the reference count on the
  *	specified inode by the specified amount. Under normal
  *	operation the count should always go down. Decrementing
  *	the count to zero will cause the inode to be freed.
  * adjblkcnt(inode, amt) - adjust the number of blocks used by the
  *	specified inode by the specified amount.
  * adjndir, adjnbfree, adjnifree, adjnffree, adjnumclusters(amt) -
  *	adjust the superblock summary.
  * freedirs(inode, count) - directory inodes [inode..inode + count - 1]
  *	are marked as free. Inodes should never have to be marked
  *	as in use.
  * freefiles(inode, count) - file inodes [inode..inode + count - 1]
  *	are marked as free. Inodes should never have to be marked
  *	as in use.
  * freeblks(blockno, size) - blocks [blockno..blockno + size - 1]
  *	are marked as free. Blocks should never have to be marked
  *	as in use.
  * setflags(flags, set/clear) - the fs_flags field has the specified
  *	flags set (second parameter +1) or cleared (second parameter -1).
  */
 
 /*
  * Every node below is registered under _vfs_ffs with CTLFLAG_WR and is
  * served by the same handler, sysctl_ffs_fsck(), which selects the
  * operation to perform from the OID number of the node that was written.
  */
 static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS);
 
 /* adjrefcnt: adjust an inode's reference (link) count. */
 SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT,
 	0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count");
 
 /* adjblkcnt: adjust an inode's used block count. */
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Adjust Inode Used Blocks Count");
 
 /* adjndir .. adjnumclusters: adjust the superblock summary totals. */
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Adjust number of directories");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Adjust number of free blocks");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Adjust number of free inodes");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Adjust number of free frags");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Adjust number of free clusters");
 
 /* freedirs / freefiles: free a range of directory or file inodes. */
 static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Free Range of Directory Inodes");
 
 static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Free Range of File Inodes");
 
 /* freeblks: free a range of blocks. */
 static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Free Range of Blocks");
 
 /* setflags: set or clear bits in the filesystem flags. */
 static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR,
 	sysctl_ffs_fsck, "Change Filesystem Flags");
 
 #ifdef DEBUG
 /*
  * debug.fsckcmds: when non-zero, sysctl_ffs_fsck() prints a line
  * describing each request it handles.  Only present in DEBUG builds.
  */
 static int fsckcmds = 0;
 SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, "");
 #endif /* DEBUG */
 
 /*
  * Sysctl handler shared by all of the fsck nodes registered under
  * _vfs_ffs.
  *
  * The new value written to the node is copied in as a struct fsck_cmd.
  * cmd.handle is looked up with getvnode() in the current process's
  * descriptor table to find the filesystem to operate on; cmd.value and
  * cmd.size carry the operands, whose meaning depends on the node written
  * (selected by oidp->oid_number).
  *
  * Returns 0 on success, otherwise an errno value:
  *	EBADRPC      - more data was supplied than fits in a struct fsck_cmd
  *	ERPCMISMATCH - cmd.version is not FFS_CMD_VERSION
  *	EINVAL       - no mount was found, its type name is not "ufs", or
  *	               the OID number is not one handled here
  *	EROFS        - the mount has MNT_RDONLY set
  * or whatever error SYSCTL_IN(), getvnode(), ffs_vget() or ffs_freefile()
  * returned.
  */
 static int
 sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
 {
 	struct fsck_cmd cmd;
 	struct ufsmount *ump;
 	struct vnode *vp;
 	struct inode *ip;
 	struct mount *mp;
 	struct fs *fs;
 	ufs2_daddr_t blkno;
 	long blkcnt, blksize;
 	struct file *fp;
 	int filetype, error;
 
 	/* Validate and copy in the command before touching any filesystem. */
 	if (req->newlen > sizeof cmd)
 		return (EBADRPC);
 	if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0)
 		return (error);
 	if (cmd.version != FFS_CMD_VERSION)
 		return (ERPCMISMATCH);
 	if ((error = getvnode(curproc->p_fd, cmd.handle, &fp)) != 0)
 		return (error);
 	/*
 	 * From here on every exit path must pair vn_finished_write() with
 	 * this vn_start_write() and drop the file reference taken above.
 	 */
 	vn_start_write(fp->f_data, &mp, V_WAIT);
 	if (mp == 0 || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) {
 		vn_finished_write(mp);
 		fdrop(fp, curthread);
 		return (EINVAL);
 	}
 	if (mp->mnt_flag & MNT_RDONLY) {
 		vn_finished_write(mp);
 		fdrop(fp, curthread);
 		return (EROFS);
 	}
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	/* Default for the inode-freeing cases; FFS_DIR_FREE overrides it. */
 	filetype = IFREG;
 
 	switch (oidp->oid_number) {
 
 	/* cmd.value holds the flag bits; cmd.size > 0 sets them, else clears. */
 	case FFS_SET_FLAGS:
 #ifdef DEBUG
 		if (fsckcmds)
 			printf("%s: %s flags\n", mp->mnt_stat.f_mntonname,
 			    cmd.size > 0 ? "set" : "clear");
 #endif /* DEBUG */
 		if (cmd.size > 0)
 			fs->fs_flags |= (long)cmd.value;
 		else
 			fs->fs_flags &= ~(long)cmd.value;
 		break;
 
 	/* cmd.value is the inode number, cmd.size the link count delta. */
 	case FFS_ADJ_REFCNT:
 #ifdef DEBUG
 		if (fsckcmds) {
 			printf("%s: adjust inode %jd count by %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
 			    (intmax_t)cmd.size);
 		}
 #endif /* DEBUG */
 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
 			break;
 		ip = VTOI(vp);
 		/* Keep i_nlink, the dinode copy and i_effnlink in step. */
 		ip->i_nlink += cmd.size;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_effnlink += cmd.size;
 		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(vp))
 			softdep_change_linkcnt(ip);
 		vput(vp);
 		break;
 
 	/* cmd.value is the inode number, cmd.size the block count delta. */
 	case FFS_ADJ_BLKCNT:
 #ifdef DEBUG
 		if (fsckcmds) {
 			printf("%s: adjust inode %jd block count by %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
 			    (intmax_t)cmd.size);
 		}
 #endif /* DEBUG */
 		if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
 			break;
 		ip = VTOI(vp);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
 		ip->i_flag |= IN_CHANGE;
 		vput(vp);
 		break;
 
 	/*
 	 * Free cmd.size consecutive inodes starting at cmd.value, passing
 	 * IFDIR or IFREG to ffs_freefile(); stop at the first error.
 	 */
 	case FFS_DIR_FREE:
 		filetype = IFDIR;
 		/* fall through */
 
 	case FFS_FILE_FREE:
 #ifdef DEBUG
 		if (fsckcmds) {
 			if (cmd.size == 1)
 				printf("%s: free %s inode %d\n",
 				    mp->mnt_stat.f_mntonname,
 				    filetype == IFDIR ? "directory" : "file",
 				    (ino_t)cmd.value);
 			else
 				printf("%s: free %s inodes %d-%d\n",
 				    mp->mnt_stat.f_mntonname,
 				    filetype == IFDIR ? "directory" : "file",
 				    (ino_t)cmd.value,
 				    (ino_t)(cmd.value + cmd.size - 1));
 		}
 #endif /* DEBUG */
 		while (cmd.size > 0) {
 			if ((error = ffs_freefile(ump, fs, ump->um_devvp,
 			    cmd.value, filetype)))
 				break;
 			cmd.size -= 1;
 			cmd.value += 1;
 		}
 		break;
 
 	/* Free cmd.size units starting at cmd.value, one block at a time. */
 	case FFS_BLK_FREE:
 #ifdef DEBUG
 		if (fsckcmds) {
 			if (cmd.size == 1)
 				printf("%s: free block %jd\n",
 				    mp->mnt_stat.f_mntonname,
 				    (intmax_t)cmd.value);
 			else
 				printf("%s: free blocks %jd-%jd\n",
 				    mp->mnt_stat.f_mntonname, 
 				    (intmax_t)cmd.value,
 				    (intmax_t)cmd.value + cmd.size - 1);
 		}
 #endif /* DEBUG */
 		blkno = cmd.value;
 		blkcnt = cmd.size;
 		/*
 		 * The first chunk only runs up to the next multiple of
 		 * fs_frag; later chunks are fs_frag long.  Each chunk is
 		 * clipped to what remains and its size is handed to
 		 * ffs_blkfree() multiplied by fs_fsize.
 		 */
 		blksize = fs->fs_frag - (blkno % fs->fs_frag);
 		while (blkcnt > 0) {
 			if (blksize > blkcnt)
 				blksize = blkcnt;
 			ffs_blkfree(ump, fs, ump->um_devvp, blkno,
 			    blksize * fs->fs_fsize, ROOTINO);
 			blkno += blksize;
 			blkcnt -= blksize;
 			blksize = fs->fs_frag;
 		}
 		break;
 
 	/*
 	 * Adjust superblock summaries.  fsck(8) is expected to
 	 * submit deltas when necessary.
 	 *
 	 * In each case below cmd.value is added to one fs_cstotal field.
 	 */
 	case FFS_ADJ_NDIR:
 #ifdef DEBUG
 		if (fsckcmds) {
 			printf("%s: adjust number of directories by %jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DEBUG */
 		fs->fs_cstotal.cs_ndir += cmd.value;
 		break;
 	case FFS_ADJ_NBFREE:
 #ifdef DEBUG
 		if (fsckcmds) {
 			printf("%s: adjust number of free blocks by %+jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DEBUG */
 		fs->fs_cstotal.cs_nbfree += cmd.value;
 		break;
 	case FFS_ADJ_NIFREE:
 #ifdef DEBUG
 		if (fsckcmds) {
 			printf("%s: adjust number of free inodes by %+jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DEBUG */
 		fs->fs_cstotal.cs_nifree += cmd.value;
 		break;
 	case FFS_ADJ_NFFREE:
 #ifdef DEBUG
 		if (fsckcmds) {
 			printf("%s: adjust number of free frags by %+jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DEBUG */
 		fs->fs_cstotal.cs_nffree += cmd.value;
 		break;
 	case FFS_ADJ_NUMCLUSTERS:
 #ifdef DEBUG
 		if (fsckcmds) {
 			printf("%s: adjust number of free clusters by %+jd\n",
 			    mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
 		}
 #endif /* DEBUG */
 		fs->fs_cstotal.cs_numclusters += cmd.value;
 		break;
 
 	/* An OID number that is not one of the operations above. */
 	default:
 #ifdef DEBUG
 		if (fsckcmds) {
 			printf("Invalid request %d from fsck\n",
 			    oidp->oid_number);
 		}
 #endif /* DEBUG */
 		error = EINVAL;
 		break;
 
 	}
 	/* Common exit: release the file and end the write started above. */
 	fdrop(fp, curthread);
 	vn_finished_write(mp);
 	return (error);
 }
Index: head/sys/ufs/ffs/ffs_extern.h
===================================================================
--- head/sys/ufs/ffs/ffs_extern.h	(revision 156202)
+++ head/sys/ufs/ffs/ffs_extern.h	(revision 156203)
@@ -1,126 +1,125 @@
 /*-
  * Copyright (c) 1991, 1993, 1994
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_extern.h	8.6 (Berkeley) 3/30/95
  * $FreeBSD$
  */
 
 #ifndef _UFS_FFS_EXTERN_H
 #define	_UFS_FFS_EXTERN_H
 
 struct buf;
 struct cg;
 struct fid;
 struct fs;
 struct inode;
 struct malloc_type;
 struct mount;
 struct thread;
 struct sockaddr;
 struct statfs;
 struct ucred;
 struct vnode;
 struct vop_fsync_args;
 struct vop_reallocblks_args;
 
 int	ffs_alloc(struct inode *,
 	    ufs2_daddr_t, ufs2_daddr_t, int, struct ucred *, ufs2_daddr_t *);
 int	ffs_balloc_ufs1(struct vnode *a_vp, off_t a_startoffset, int a_size,
             struct ucred *a_cred, int a_flags, struct buf **a_bpp);
 int	ffs_balloc_ufs2(struct vnode *a_vp, off_t a_startoffset, int a_size,
             struct ucred *a_cred, int a_flags, struct buf **a_bpp);
 int	ffs_blkatoff(struct vnode *, off_t, char **, struct buf **);
 void	ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *,
 	    ufs2_daddr_t, long, ino_t);
 ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *);
 ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *);
 int	ffs_checkfreefile(struct fs *, struct vnode *, ino_t);
 void	ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t);
 int	ffs_copyonwrite(struct vnode *, struct buf *);
 int	ffs_flushfiles(struct mount *, int, struct thread *);
 void	ffs_fragacct(struct fs *, int, int32_t [], int);
 int	ffs_freefile(struct ufsmount *, struct fs *, struct vnode *, ino_t,
 	    int);
 int	ffs_isblock(struct fs *, u_char *, ufs1_daddr_t);
 void	ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t);
 int	ffs_mountroot(void);
 int	ffs_reallocblks(struct vop_reallocblks_args *);
 int	ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t,
 	    ufs2_daddr_t, int, int, struct ucred *, struct buf **);
 void	ffs_setblock(struct fs *, u_char *, ufs1_daddr_t);
 int	ffs_snapblkfree(struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t);
 void	ffs_snapremove(struct vnode *vp);
 int	ffs_snapshot(struct mount *mp, char *snapfile);
 void	ffs_snapshot_mount(struct mount *mp);
 void	ffs_snapshot_unmount(struct mount *mp);
 int	ffs_syncvnode(struct vnode *vp, int waitfor);
 int	ffs_truncate(struct vnode *, off_t, int, struct ucred *, struct thread *);
 int	ffs_update(struct vnode *, int);
 int	ffs_valloc(struct vnode *, int, struct ucred *, struct vnode **);
 
 int	ffs_vfree(struct vnode *, ino_t, int);
 vfs_vget_t ffs_vget;
 
 extern struct vop_vector ffs_vnodeops1;
 extern struct vop_vector ffs_fifoops1;
 extern struct vop_vector ffs_vnodeops2;
 extern struct vop_vector ffs_fifoops2;
 
 /*
  * Soft update function prototypes.
  */
 void	softdep_initialize(void);
 void	softdep_uninitialize(void);
 int	softdep_mount(struct vnode *, struct mount *, struct fs *,
 	    struct ucred *);
 void	softdep_move_dependencies(struct buf *, struct buf *);
 int	softdep_flushworklist(struct mount *, int *, struct thread *);
 int	softdep_flushfiles(struct mount *, int, struct thread *);
 void	softdep_update_inodeblock(struct inode *, struct buf *, int);
 void	softdep_load_inodeblock(struct inode *);
 void	softdep_freefile(struct vnode *, ino_t, int);
 int	softdep_request_cleanup(struct fs *, struct vnode *);
 void	softdep_setup_freeblocks(struct inode *, off_t, int);
 void	softdep_setup_inomapdep(struct buf *, struct inode *, ino_t);
-void	softdep_setup_blkmapdep(struct buf *, struct fs *, ufs2_daddr_t);
+void	softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t);
 void	softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t,
 	    ufs2_daddr_t, long, long, struct buf *);
 void	softdep_setup_allocext(struct inode *, ufs_lbn_t, ufs2_daddr_t,
 	    ufs2_daddr_t, long, long, struct buf *);
 void	softdep_setup_allocindir_meta(struct buf *, struct inode *,
 	    struct buf *, int, ufs2_daddr_t);
 void	softdep_setup_allocindir_page(struct inode *, ufs_lbn_t,
 	    struct buf *, int, ufs2_daddr_t, ufs2_daddr_t, struct buf *);
 void	softdep_fsync_mountdev(struct vnode *);
 int	softdep_sync_metadata(struct vnode *);
-/* XXX incorrectly moved to mount.h - should be indirect function */
-#if 0
-int	softdep_fsync(struct vnode *vp);
-#endif
+int     softdep_process_worklist(struct mount *, int);
+int     softdep_fsync(struct vnode *);
+int	softdep_waitidle(struct mount *);
 
 #endif /* !_UFS_FFS_EXTERN_H */
Index: head/sys/ufs/ffs/ffs_snapshot.c
===================================================================
--- head/sys/ufs/ffs/ffs_snapshot.c	(revision 156202)
+++ head/sys/ufs/ffs/ffs_snapshot.c	(revision 156203)
@@ -1,2233 +1,2240 @@
 /*-
  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
  *
  * Further information about snapshots can be obtained from:
  *
  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
  *	1614 Oxford Street		mckusick@mckusick.com
  *	Berkeley, CA 94709-1608		+1-510-843-9542
  *	USA
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/proc.h>
 #include <sys/namei.h>
 #include <sys/sched.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/vnode.h>
 
 #include <geom/geom.h>
 
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 
 #define KERNCRED thread0.td_ucred
 #define DEBUG 1
 
 #include "opt_ffs.h"
 
 #ifdef NO_FFS_SNAPSHOT
 /*
  * Stub used when NO_FFS_SNAPSHOT is defined: no snapshot is created and
  * EINVAL is always returned.
  */
 int
 ffs_snapshot(mp, snapfile)
 	struct mount *mp;
 	char *snapfile;
 {
 	return (EINVAL);
 }
 
 /*
  * Stub used when NO_FFS_SNAPSHOT is defined: the arguments are ignored
  * and EINVAL is always returned.
  */
 int
 ffs_snapblkfree(fs, devvp, bno, size, inum)
 	struct fs *fs;
 	struct vnode *devvp;
 	ufs2_daddr_t bno;
 	long size;
 	ino_t inum;
 {
 	return (EINVAL);
 }
 
 /*
  * Stub used when NO_FFS_SNAPSHOT is defined: does nothing.
  */
 void
 ffs_snapremove(vp)
 	struct vnode *vp;
 {
 }
 
 /*
  * Stub used when NO_FFS_SNAPSHOT is defined: does nothing.
  */
 void
 ffs_snapshot_mount(mp)
 	struct mount *mp;
 {
 }
 
 /*
  * Stub used when NO_FFS_SNAPSHOT is defined: does nothing.
  */
 void
 ffs_snapshot_unmount(mp)
 	struct mount *mp;
 {
 }
 
 /*
  * Stub used when NO_FFS_SNAPSHOT is defined: does nothing.
  */
 void
 ffs_snapgone(ip)
 	struct inode *ip;
 {
 }
 
 /*
  * Stub used when NO_FFS_SNAPSHOT is defined: the buffer is not examined
  * and EINVAL is always returned.
  */
 int
 ffs_copyonwrite(devvp, bp)
 	struct vnode *devvp;
 	struct buf *bp;
 {
 	return (EINVAL);
 }
 
 #else
 
 TAILQ_HEAD(snaphead, inode);
 
 /*
  * Snapshot bookkeeping that ffs_snapshot() reads from, and on first use
  * allocates and stores into, devvp->v_rdev->si_snapdata.
  */
 struct snapdata {
 	/* Tail queue of inodes (struct snaphead is TAILQ_HEAD of inode). */
 	struct snaphead sn_head;
 	/* NOTE(review): presumably the number of entries in sn_blklist -- confirm. */
 	daddr_t sn_listsize;
 	/* NOTE(review): presumably the list of snapshot block numbers -- confirm. */
 	daddr_t *sn_blklist;
 	/* Lock that ffs_snapshot() installs as the snapshot vnode's v_vnlock. */
 	struct lock sn_lock;
 };
 
 static int cgaccount(int, struct vnode *, struct buf *, int);
 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int indiracct_ufs1(struct vnode *, struct vnode *, int,
     ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int indiracct_ufs2(struct vnode *, struct vnode *, int,
     ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
     ufs_lbn_t, int), int);
 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
     struct fs *, ufs_lbn_t, int);
 static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
 
 /*
  * To ensure the consistency of snapshots across crashes, we must
  * synchronously write out copied blocks before allowing the
  * originals to be modified. Because of the rather severe speed
  * penalty that this imposes, the following flag allows this
  * crash persistence to be disabled.
  */
 int dopersistence = 0;
 
 #ifdef DEBUG
 #include <sys/sysctl.h>
 /* debug.dopersistence: read-write access to the dopersistence flag. */
 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
 /* debug.snapdebug: when set, ffs_snapshot() vprint()s each busy vnode. */
 static int snapdebug = 0;
 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
 /* debug.collectsnapstats: when set, ffs_snapshot() records a start time. */
 int collectsnapstats = 0;
 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
 	0, "");
 #endif /* DEBUG */
 
 /*
  * Create a snapshot file and initialize it for the filesystem.
  */
 int
 ffs_snapshot(mp, snapfile)
 	struct mount *mp;
 	char *snapfile;
 {
 	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
 	int error, cg, snaploc;
 	int i, size, len, loc;
 	int flag = mp->mnt_flag;
 	struct timespec starttime = {0, 0}, endtime;
 	char saved_nice = 0;
 	long redo = 0, snaplistsize = 0;
 	int32_t *lp;
 	void *space;
 	struct fs *copy_fs = NULL, *fs;
 	struct thread *td = curthread;
 	struct inode *ip, *xp;
 	struct buf *bp, *nbp, *ibp, *sbp = NULL;
 	struct nameidata nd;
 	struct mount *wrtmp;
 	struct vattr vat;
 	struct vnode *vp, *xvp, *mvp, *devvp;
 	struct uio auio;
 	struct iovec aiov;
 	struct snapdata *sn;
 	struct ufsmount *ump;
 
 	ump = VFSTOUFS(mp);
 	fs = ump->um_fs;
 	/*
 	 * XXX: make sure we don't go to out1 before we setup sn
 	 */
 	sn = (void *)0xdeadbeef;
 
 	/*
 	 * Need to serialize access to snapshot code per filesystem.
 	 */
 	/*
 	 * Assign a snapshot slot in the superblock.
 	 */
 	UFS_LOCK(ump);
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 		if (fs->fs_snapinum[snaploc] == 0)
 			break;
 	UFS_UNLOCK(ump);
 	if (snaploc == FSMAXSNAP)
 		return (ENOSPC);
 	/*
 	 * Create the snapshot file.
 	 */
 restart:
 	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if (nd.ni_vp != NULL) {
 		vput(nd.ni_vp);
 		error = EEXIST;
 	}
 	if (nd.ni_dvp->v_mount != mp)
 		error = EXDEV;
 	if (error) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_dvp == nd.ni_vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		return (error);
 	}
 	VATTR_NULL(&vat);
 	vat.va_type = VREG;
 	vat.va_mode = S_IRUSR;
 	vat.va_vaflags |= VA_EXCLUSIVE;
 	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
 		wrtmp = NULL;
 	if (wrtmp != mp)
 		panic("ffs_snapshot: mount mismatch");
 	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &wrtmp,
 		    V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
 	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
 	vput(nd.ni_dvp);
 	if (error) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vn_finished_write(wrtmp);
 		return (error);
 	}
 	vp = nd.ni_vp;
 	ip = VTOI(vp);
 	devvp = ip->i_devvp;
 	/*
 	 * Allocate and copy the last block contents so as to be able
 	 * to set size to that of the filesystem.
 	 */
 	numblks = howmany(fs->fs_size, fs->fs_frag);
 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
 	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
 	if (error)
 		goto out;
 	ip->i_size = lblktosize(fs, (off_t)numblks);
 	DIP_SET(ip, i_size, ip->i_size);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	if ((error = readblock(vp, bp, numblks - 1)) != 0)
 		goto out;
 	bawrite(bp);
 	/*
 	 * Preallocate critical data structures so that we can copy
 	 * them in without further allocation after we suspend all
 	 * operations on the filesystem. We would like to just release
 	 * the allocated buffers without writing them since they will
 	 * be filled in below once we are ready to go, but this upsets
 	 * the soft update code, so we go ahead and write the new buffers.
 	 *
 	 * Allocate all indirect blocks and mark all of them as not
 	 * needing to be copied.
 	 */
 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
 		if (error)
 			goto out;
 		bawrite(ibp);
 	}
 	/*
 	 * Allocate copies for the superblock and its summary information.
 	 */
 	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
 	    0, &nbp);
 	if (error)
 		goto out;
 	bawrite(nbp);
 	blkno = fragstoblks(fs, fs->fs_csaddr);
 	len = howmany(fs->fs_cssize, fs->fs_bsize);
 	for (loc = 0; loc < len; loc++) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		bawrite(nbp);
 	}
 	/*
 	 * Allocate all cylinder group blocks.
 	 */
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		bawrite(nbp);
 	}
 	/*
 	 * Copy all the cylinder group maps. Although the
 	 * filesystem is still active, we hope that only a few
 	 * cylinder groups will change between now and when we
 	 * suspend operations. Thus, we will be able to quickly
 	 * touch up the few cylinder groups that changed during
 	 * the suspension period.
 	 */
 	len = howmany(fs->fs_ncg, NBBY);
 	MALLOC(space, void *, len, M_DEVBUF, M_WAITOK|M_ZERO);
 	UFS_LOCK(ump);
 	fs->fs_active = space;
 	UFS_UNLOCK(ump);
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out;
 		error = cgaccount(cg, vp, nbp, 1);
 		bawrite(nbp);
 		if (error)
 			goto out;
 	}
 	/*
 	 * Change inode to snapshot type file.
 	 */
 	ip->i_flags |= SF_SNAPSHOT;
 	DIP_SET(ip, i_flags, ip->i_flags);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	/*
 	 * Ensure that the snapshot is completely on disk.
 	 * Since we have marked it as a snapshot it is safe to
 	 * unlock it as no process will be allowed to write to it.
 	 */
 	if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
 		goto out;
 	VOP_UNLOCK(vp, 0, td);
 	/*
 	 * All allocations are done, so we can now snapshot the system.
 	 *
 	 * Rescind nice scheduling while running with the filesystem suspended.
 	 */
 	if (td->td_proc->p_nice > 0) {
 		PROC_LOCK(td->td_proc);
 		mtx_lock_spin(&sched_lock);
 		saved_nice = td->td_proc->p_nice;
 		sched_nice(td->td_proc, 0);
 		mtx_unlock_spin(&sched_lock);
 		PROC_UNLOCK(td->td_proc);
 	}
 	/*
 	 * Suspend operation on filesystem.
 	 */
 	for (;;) {
 		vn_finished_write(wrtmp);
 		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
 			vn_start_write(NULL, &wrtmp, V_WAIT);
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 			goto out;
 		}
 		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
 			break;
 		vn_start_write(NULL, &wrtmp, V_WAIT);
 	}
+	/*
+	 * Wait for all dependency processing to finish after the sync
+	 * triggered by vfs_write_suspend().
+	 */
+	error = softdep_waitidle(vp->v_mount);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	if (error)
+		goto out1;
 	if (collectsnapstats)
 		nanotime(&starttime);
 	/*
 	 * First, copy all the cylinder group maps that have changed.
 	 */
 	for (cg = 0; cg < fs->fs_ncg; cg++) {
 		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
 			continue;
 		redo++;
 		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
 		    fs->fs_bsize, KERNCRED, 0, &nbp);
 		if (error)
 			goto out1;
 		error = cgaccount(cg, vp, nbp, 2);
 		bawrite(nbp);
 		if (error)
 			goto out1;
 	}
 	/*
 	 * Grab a copy of the superblock and its summary information.
 	 * We delay writing it until the suspension is released below.
 	 */
 	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
 	    KERNCRED, &sbp);
 	if (error) {
 		brelse(sbp);
 		sbp = NULL;
 		goto out1;
 	}
 	loc = blkoff(fs, fs->fs_sblockloc);
 	copy_fs = (struct fs *)(sbp->b_data + loc);
 	bcopy(fs, copy_fs, fs->fs_sbsize);
 	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
 		copy_fs->fs_clean = 1;
 	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
 	if (fs->fs_sbsize < size)
 		bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
 	size = blkroundup(fs, fs->fs_cssize);
 	if (fs->fs_contigsumsize > 0)
 		size += fs->fs_ncg * sizeof(int32_t);
 	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
 	copy_fs->fs_csp = space;
 	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
 	space = (char *)space + fs->fs_cssize;
 	loc = howmany(fs->fs_cssize, fs->fs_fsize);
 	i = fs->fs_frag - loc % fs->fs_frag;
 	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
 	if (len > 0) {
 		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
 		    len, KERNCRED, &bp)) != 0) {
 			brelse(bp);
 			free(copy_fs->fs_csp, M_UFSMNT);
 			bawrite(sbp);
 			sbp = NULL;
 			goto out1;
 		}
 		bcopy(bp->b_data, space, (u_int)len);
 		space = (char *)space + len;
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		brelse(bp);
 	}
 	if (fs->fs_contigsumsize > 0) {
 		copy_fs->fs_maxcluster = lp = space;
 		for (i = 0; i < fs->fs_ncg; i++)
 			*lp++ = fs->fs_contigsumsize;
 	}
 	/*
 	 * We must check for active files that have been unlinked
 	 * (e.g., with a zero link count). We have to expunge all
 	 * trace of these files from the snapshot so that they are
 	 * not reclaimed prematurely by fsck or unnecessarily dumped.
 	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
 	 * spec_strategy about writing on a suspended filesystem.
 	 * Note that we skip unlinked snapshot files as they will
 	 * be handled separately below.
 	 *
 	 * We also calculate the needed size for the snapshot list.
 	 */
 	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
 	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
 loop:
 	MNT_VNODE_FOREACH(xvp, mp, mvp) {
 		VI_LOCK(xvp);
 		MNT_IUNLOCK(mp);
 		if ((xvp->v_iflag & VI_DOOMED) ||
 		    xvp->v_usecount == 0 || xvp->v_type == VNON ||
 		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
 			VI_UNLOCK(xvp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/*
 		 * We can skip parent directory vnode because it must have
 		 * this snapshot file in it.
 		 */
 		if (xvp == nd.ni_dvp) {
 			VI_UNLOCK(xvp);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) {
 			MNT_ILOCK(mp);
 			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 			goto loop;
 		}
 		if (snapdebug)
 			vprint("ffs_snapshot: busy vnode", xvp);
 		if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 &&
 		    vat.va_nlink > 0) {
 			VOP_UNLOCK(xvp, 0, td);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		xp = VTOI(xvp);
 		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
 			VOP_UNLOCK(xvp, 0, td);
 			MNT_ILOCK(mp);
 			continue;
 		}
 		/*
 		 * If there is a fragment, clear it here.
 		 */
 		blkno = 0;
 		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
 		if (loc < NDADDR) {
 			len = fragroundup(fs, blkoff(fs, xp->i_size));
 			if (len != 0 && len < fs->fs_bsize) {
 				ffs_blkfree(ump, copy_fs, vp,
 				    DIP(xp, i_db[loc]), len, xp->i_number);
 				blkno = DIP(xp, i_db[loc]);
 				DIP_SET(xp, i_db[loc], 0);
 			}
 		}
 		snaplistsize += 1;
 		if (xp->i_ump->um_fstype == UFS1)
 			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
 			    BLK_NOCOPY);
 		else
 			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
 			    BLK_NOCOPY);
 		if (blkno)
 			DIP_SET(xp, i_db[loc], blkno);
 		if (!error)
 			error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
 			    xp->i_mode);
 		VOP_UNLOCK(xvp, 0, td);
 		if (error) {
 			free(copy_fs->fs_csp, M_UFSMNT);
 			bawrite(sbp);
 			sbp = NULL;
 			MNT_VNODE_FOREACH_ABORT(mp, mvp);
 			goto out1;
 		}
 		MNT_ILOCK(mp);
 	}
 	MNT_IUNLOCK(mp);
 	/*
 	 * If there already exist snapshots on this filesystem, grab a
 	 * reference to their shared lock. If this is the first snapshot
 	 * on this filesystem, we need to allocate a lock for the snapshots
 	 * to share. In either case, acquire the snapshot lock and give
 	 * up our original private lock.
 	 */
 	VI_LOCK(devvp);
 	sn = devvp->v_rdev->si_snapdata;
 	if (sn != NULL) {
 		xp = TAILQ_FIRST(&sn->sn_head);
 		VI_UNLOCK(devvp);
 		VI_LOCK(vp);
 		vp->v_vnlock = &sn->sn_lock;
 	} else {
 		VI_UNLOCK(devvp);
 		sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
 		TAILQ_INIT(&sn->sn_head);
 		lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
 		    LK_CANRECURSE | LK_NOSHARE);
 		VI_LOCK(vp);
 		vp->v_vnlock = &sn->sn_lock;
 		mp_fixme("si_snapdata setting is racey.");
 		devvp->v_rdev->si_snapdata = sn;
 		xp = NULL;
 	}
 	lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
 	    VI_MTX(vp), td);
 	transferlockers(&vp->v_lock, vp->v_vnlock);
 	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 	/*
 	 * If this is the first snapshot on this filesystem, then we need
 	 * to allocate the space for the list of preallocated snapshot blocks.
 	 * This list will be refined below, but this preliminary one will
 	 * keep us out of deadlock until the full one is ready.
 	 */
 	if (xp == NULL) {
 		MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
 		    M_UFSMNT, M_WAITOK);
 		blkp = &snapblklist[1];
 		*blkp++ = lblkno(fs, fs->fs_sblockloc);
 		blkno = fragstoblks(fs, fs->fs_csaddr);
 		for (cg = 0; cg < fs->fs_ncg; cg++) {
 			if (fragstoblks(fs, cgtod(fs, cg) > blkno))
 				break;
 			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
 		}
 		len = howmany(fs->fs_cssize, fs->fs_bsize);
 		for (loc = 0; loc < len; loc++)
 			*blkp++ = blkno + loc;
 		for (; cg < fs->fs_ncg; cg++)
 			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
 		snapblklist[0] = blkp - snapblklist;
 		VI_LOCK(devvp);
 		if (sn->sn_blklist != NULL)
 			panic("ffs_snapshot: non-empty list");
 		sn->sn_blklist = snapblklist;
 		sn->sn_listsize = blkp - snapblklist;
 		VI_UNLOCK(devvp);
 	}
 	/*
 	 * Record snapshot inode. Since this is the newest snapshot,
 	 * it must be placed at the end of the list.
 	 */
 	VI_LOCK(devvp);
 	fs->fs_snapinum[snaploc] = ip->i_number;
 	if (ip->i_nextsnap.tqe_prev != 0)
 		panic("ffs_snapshot: %d already on list", ip->i_number);
 	TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
 	devvp->v_vflag |= VV_COPYONWRITE;
 	VI_UNLOCK(devvp);
 	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
 	vp->v_vflag |= VV_SYSTEM;
 out1:
 	KASSERT(sn != (void *)0xdeadbeef, ("email phk@ and mckusick@"));
 	/*
 	 * Resume operation on filesystem.
 	 */
 	vfs_write_resume(vp->v_mount);
 	vn_start_write(NULL, &wrtmp, V_WAIT);
 	if (collectsnapstats && starttime.tv_sec > 0) {
 		nanotime(&endtime);
 		timespecsub(&endtime, &starttime);
 		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
 		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
 		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
 	}
 	if (sbp == NULL)
 		goto out;
 	/*
 	 * Copy allocation information from all the snapshots in
 	 * this snapshot and then expunge them from its view.
 	 */
 	TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
 		if (xp == ip)
 			break;
 		if (xp->i_ump->um_fstype == UFS1)
 			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
 			    BLK_SNAP);
 		else
 			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
 			    BLK_SNAP);
 		if (error) {
 			fs->fs_snapinum[snaploc] = 0;
 			goto done;
 		}
 	}
 	/*
 	 * Allocate space for the full list of preallocated snapshot blocks.
 	 */
 	MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
 	    M_UFSMNT, M_WAITOK);
 	ip->i_snapblklist = &snapblklist[1];
 	/*
 	 * Expunge the blocks used by the snapshots from the set of
 	 * blocks marked as used in the snapshot bitmaps. Also, collect
 	 * the list of allocated blocks in i_snapblklist.
 	 */
 	if (ip->i_ump->um_fstype == UFS1)
 		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
 	else
 		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
 	if (error) {
 		fs->fs_snapinum[snaploc] = 0;
 		FREE(snapblklist, M_UFSMNT);
 		goto done;
 	}
 	if (snaplistsize < ip->i_snapblklist - snapblklist)
 		panic("ffs_snapshot: list too small");
 	snaplistsize = ip->i_snapblklist - snapblklist;
 	snapblklist[0] = snaplistsize;
 	ip->i_snapblklist = 0;
 	/*
 	 * Write out the list of allocated blocks to the end of the snapshot.
 	 */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = (void *)snapblklist;
 	aiov.iov_len = snaplistsize * sizeof(daddr_t);
 	auio.uio_resid = aiov.iov_len;;
 	auio.uio_offset = ip->i_size;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		fs->fs_snapinum[snaploc] = 0;
 		FREE(snapblklist, M_UFSMNT);
 		goto done;
 	}
 	/*
 	 * Write the superblock and its summary information
 	 * to the snapshot.
 	 */
 	blkno = fragstoblks(fs, fs->fs_csaddr);
 	len = howmany(fs->fs_cssize, fs->fs_bsize);
 	space = copy_fs->fs_csp;
 	for (loc = 0; loc < len; loc++) {
 		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
 		if (error) {
 			brelse(nbp);
 			fs->fs_snapinum[snaploc] = 0;
 			FREE(snapblklist, M_UFSMNT);
 			goto done;
 		}
 		bcopy(space, nbp->b_data, fs->fs_bsize);
 		space = (char *)space + fs->fs_bsize;
 		bawrite(nbp);
 	}
 	/*
 	 * As this is the newest list, it is the most inclusive, so
 	 * should replace the previous list.
 	 */
 	VI_LOCK(devvp);
 	space = sn->sn_blklist;
 	sn->sn_blklist = snapblklist;
 	sn->sn_listsize = snaplistsize;
 	VI_UNLOCK(devvp);
 	if (space != NULL)
 		FREE(space, M_UFSMNT);
 	/*
 	 * If another process is currently writing the buffer containing
 	 * the inode for this snapshot then a deadlock can occur. Drop
 	 * the snapshot lock until the buffer has been written.
 	 */
 	VOP_UNLOCK(vp, 0, td);
 	(void) bread(ip->i_devvp,
 		     fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 		     (int) fs->fs_bsize, NOCRED, &nbp);
 	brelse(nbp);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 done:
 	FREE(copy_fs->fs_csp, M_UFSMNT);
 	bawrite(sbp);
 out:
 	if (saved_nice > 0) {
 		PROC_LOCK(td->td_proc);
 		mtx_lock_spin(&sched_lock);
 		sched_nice(td->td_proc, saved_nice);
 		mtx_unlock_spin(&sched_lock);
 		PROC_UNLOCK(td->td_proc);
 	}
 	UFS_LOCK(ump);
 	if (fs->fs_active != 0) {
 		FREE(fs->fs_active, M_DEVBUF);
 		fs->fs_active = 0;
 	}
 	UFS_UNLOCK(ump);
 	mp->mnt_flag = flag;
 	if (error)
 		(void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
 	(void) ffs_syncvnode(vp, MNT_WAIT);
 	if (error)
 		vput(vp);
 	else
 		VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(wrtmp);
 	return (error);
 }
 
 /*
  * Copy a cylinder group map. All the unallocated blocks are marked
  * BLK_NOCOPY so that the snapshot knows that it need not copy them
  * if they are later written. If passno is one, then this is a first
  * pass, so only setting needs to be done. If passno is 2, then this
  * is a revision to a previous pass which must be undone as the
  * replacement pass is done.
  *
  * Parameters:
  *	cg	index of the cylinder group to account for
  *	vp	vnode of the snapshot file being built
  *	nbp	buffer that receives the copy of the cylinder group block
  *	passno	1 for the initial pass, 2 for a revision pass
  *
  * Returns 0 on success, EIO if the cylinder group fails its magic
  * number check, or the error reported by bread() or UFS_BALLOC().
  * nbp is filled in but never released here; it stays with the caller.
  */
 static int
 cgaccount(cg, vp, nbp, passno)
 	int cg;
 	struct vnode *vp;
 	struct buf *nbp;
 	int passno;
 {
 	struct buf *bp, *ibp;
 	struct inode *ip;
 	struct cg *cgp;
 	struct fs *fs;
 	ufs2_daddr_t base, numblks;
 	int error, len, loc, indiroff;
 
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	/*
 	 * Read the on-disk cylinder group block from the underlying device.
 	 */
 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
 		(int)fs->fs_cgsize, KERNCRED, &bp);
 	if (error) {
 		brelse(bp);
 		return (error);
 	}
 	cgp = (struct cg *)bp->b_data;
 	if (!cg_chkmagic(cgp)) {
 		brelse(bp);
 		return (EIO);
 	}
 	/*
 	 * NOTE(review): ACTIVESET presumably records this group in
 	 * fs->fs_active so the caller can tell which groups have been
 	 * copied; confirm against the macro definition.
 	 */
 	UFS_LOCK(ip->i_ump);
 	ACTIVESET(fs, cg);
 	UFS_UNLOCK(ip->i_ump);
 	/*
 	 * Copy the group into the snapshot's buffer, zero-filling the
 	 * remainder of the block when the group is smaller than a block.
 	 * From here on cgp refers to the copy, not the device buffer.
 	 */
 	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
 	if (fs->fs_cgsize < fs->fs_bsize)
 		bzero(&nbp->b_data[fs->fs_cgsize],
 		    fs->fs_bsize - fs->fs_cgsize);
 	cgp = (struct cg *)nbp->b_data;
 	bqrelse(bp);
 	if (passno == 2)
 		nbp->b_flags |= B_VALIDSUSPWRT;
 	/*
 	 * numblks is the filesystem size in blocks, len the number of
 	 * blocks in one cylinder group and base the block number at which
 	 * this group starts. len is clamped when the group would run up
 	 * to or past the end of the filesystem.
 	 */
 	numblks = howmany(fs->fs_size, fs->fs_frag);
 	len = howmany(fs->fs_fpg, fs->fs_frag);
 	base = cgbase(fs, cg) / fs->fs_frag;
 	if (base + len >= numblks)
 		len = numblks - base - 1;
 	loc = 0;
 	/*
 	 * If this group starts inside the range mapped by the snapshot
 	 * inode's direct pointers, update those pointers first. loc is
 	 * left at NDADDR so the indirect loop below resumes from there.
 	 */
 	if (base < NDADDR) {
 		for ( ; loc < NDADDR; loc++) {
 			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 				DIP_SET(ip, i_db[loc], BLK_NOCOPY);
 			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
 				DIP_SET(ip, i_db[loc], 0);
 			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
 				panic("ffs_snapshot: lost direct block");
 		}
 	}
 	/*
 	 * Fetch the indirect block of the snapshot file that maps the
 	 * next block to be processed (BA_METAONLY is passed so that the
 	 * pointer block itself is returned in ibp).
 	 */
 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
 	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 	if (error) {
 		return (error);
 	}
 	indiroff = (base + loc - NDADDR) % NINDIR(fs);
 	for ( ; loc < len; loc++, indiroff++) {
 		/*
 		 * Ran off the end of the current indirect block: start
 		 * writing it out and move on to the next one.
 		 */
 		if (indiroff >= NINDIR(fs)) {
 			if (passno == 2)
 				ibp->b_flags |= B_VALIDSUSPWRT;
 			bawrite(ibp);
 			error = UFS_BALLOC(vp,
 			    lblktosize(fs, (off_t)(base + loc)),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error) {
 				return (error);
 			}
 			indiroff = 0;
 		}
 		/*
 		 * Same three-way update as for the direct pointers: mark
 		 * free blocks BLK_NOCOPY, undo a stale BLK_NOCOPY on pass 2,
 		 * and treat a pre-existing BLK_NOCOPY on pass 1 as fatal.
 		 * UFS1 and UFS2 differ only in the pointer width.
 		 */
 		if (ip->i_ump->um_fstype == UFS1) {
 			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
 			    [indiroff] == BLK_NOCOPY)
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
 			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
 			    [indiroff] == BLK_NOCOPY)
 				panic("ffs_snapshot: lost indirect block");
 			continue;
 		}
 		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
 			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
 		else if (passno == 2 &&
 		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
 			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
 		else if (passno == 1 &&
 		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
 			panic("ffs_snapshot: lost indirect block");
 	}
 	/* The last indirect block touched is left as a delayed write. */
 	if (passno == 2)
 		ibp->b_flags |= B_VALIDSUSPWRT;
 	bdwrite(ibp);
 	return (0);
 }
 
 /*
  * Before expunging a snapshot inode, note all the
  * blocks that it claims with BLK_SNAP so that fsck will
  * be able to account for those blocks properly and so
  * that this snapshot knows that it need not copy them
  * if the other snapshot holding them is freed. This code
  * is reproduced once each for UFS1 and UFS2.
  */
 static int
 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct inode *cancelip;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int i, error, indiroff;
 	ufs_lbn_t lbn, rlbn;
 	ufs2_daddr_t len, blkno, numblks, blksperindir;
 	struct ufs1_dinode *dip;
 	struct thread *td = curthread;
 	struct buf *bp;
 
 	/*
 	 * Prepare to expunge the inode. If its inode block has not
 	 * yet been copied, then allocate and fill the copy.
 	 */
 	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 	blkno = 0;
 	if (lbn < NDADDR) {
 		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
 	} else {
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
 		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			return (error);
 		indiroff = (lbn - NDADDR) % NINDIR(fs);
 		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
 		bqrelse(bp);
 	}
 	if (blkno != 0) {
 		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
 			return (error);
 	} else {
 		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &bp);
 		if (error)
 			return (error);
 		if ((error = readblock(snapvp, bp, lbn)) != 0)
 			return (error);
 	}
 	/*
 	 * Set a snapshot inode to be a zero length file, regular files
 	 * to be completely unallocated.
 	 */
 	dip = (struct ufs1_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
 	if (expungetype == BLK_NOCOPY)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
 	dip->di_flags &= ~SF_SNAPSHOT;
 	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
 	bdwrite(bp);
 	/*
 	 * Now go through and expunge all the blocks in the file
 	 * using the function requested.
 	 */
 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
 	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
 		return (error);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
 	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
 		return (error);
 	blksperindir = 1;
 	lbn = -NDADDR;
 	len = numblks - NDADDR;
 	rlbn = NDADDR;
 	for (i = 0; len > 0 && i < NIADDR; i++) {
 		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
 		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
 		    blksperindir, fs, acctfunc, expungetype);
 		if (error)
 			return (error);
 		blksperindir *= NINDIR(fs);
 		lbn -= blksperindir + 1;
 		len -= blksperindir;
 		rlbn += blksperindir;
 	}
 	return (0);
 }
 
 /*
  * Descend an indirect block chain for vnode cancelvp accounting for all
  * its indirect blocks in snapvp.
  *
  * Parameters:
  *	snapvp		snapshot vnode handed through to acctfunc
  *	cancelvp	vnode whose indirect block is being walked
  *	level		indirection level; 0 means the block holds
  *			pointers to data blocks
  *	blkno		disk address of the indirect block
  *	lbn		logical block number of the indirect block
  *	rlbn		logical number of the first data block it covers
  *	remblks		number of data blocks still to be accounted for
  *	blksperindir	data blocks covered by each pointer in this block
  *	fs		superblock used for geometry calculations
  *	acctfunc	callback applied to each run of block pointers
  *	expungetype	BLK_NOCOPY or BLK_SNAP, passed through to acctfunc
  *
  * Returns 0 on success or the first error from ufs_getlbns(),
  * readblock() or acctfunc.
  */
 static int
 indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 	    blksperindir, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct vnode *cancelvp;
 	int level;
 	ufs1_daddr_t blkno;
 	ufs_lbn_t lbn;
 	ufs_lbn_t rlbn;
 	ufs_lbn_t remblks;
 	ufs_lbn_t blksperindir;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int error, num, i;
 	ufs_lbn_t subblksperindir;
 	struct indir indirs[NIADDR + 2];
 	ufs1_daddr_t last, *bap;
 	struct buf *bp;
 
 	/*
 	 * A missing indirect block is acceptable only when expunging
 	 * with BLK_NOCOPY; otherwise it is treated as fatal.
 	 */
 	if (blkno == 0) {
 		if (expungetype == BLK_NOCOPY)
 			return (0);
 		panic("indiracct_ufs1: missing indir");
 	}
 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 		return (error);
 	/*
 	 * Validate num and level before they are used to index indirs[],
 	 * so that botched parameters panic instead of first reading
 	 * outside the array.
 	 */
 	if (num < 2 || level < 0 || level > num - 1 ||
 	    lbn != indirs[num - 1 - level].in_lbn)
 		panic("indiracct_ufs1: botched params");
 	/*
 	 * We have to expand bread here since it will deadlock looking
 	 * up the block number for any blocks that are not in the cache.
 	 */
 	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
 	bp->b_blkno = fsbtodb(fs, blkno);
 	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
 		brelse(bp);
 		return (error);
 	}
 	/*
 	 * Account for the block pointers in this indirect block.
 	 * A private copy is made so the buffer can be released before
 	 * the callback and any recursion run.
 	 */
 	last = howmany(remblks, blksperindir);
 	if (last > NINDIR(fs))
 		last = NINDIR(fs);
 	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
 	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 	bqrelse(bp);
 	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
 	    level == 0 ? rlbn : -1, expungetype);
 	if (error || level == 0)
 		goto out;
 	/*
 	 * Account for the block pointers in each of the indirect blocks
 	 * in the levels below us.
 	 */
 	subblksperindir = blksperindir / NINDIR(fs);
 	for (lbn++, level--, i = 0; i < last; i++) {
 		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
 		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 		if (error)
 			goto out;
 		rlbn += blksperindir;
 		lbn -= blksperindir;
 		remblks -= blksperindir;
 	}
 out:
 	FREE(bap, M_DEVBUF);
 	return (error);
 }
 
 /*
  * Do both snap accounting and map accounting.
  */
 static int
 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	int error;
 
 	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
 		return (error);
 	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
 }
 
 /*
  * Identify a set of blocks allocated in a snapshot inode.
  */
 static int
 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	struct inode *ip = VTOI(vp);
 	ufs1_daddr_t blkno, *blkp;
 	ufs_lbn_t lbn;
 	struct buf *ibp;
 	int error;
 
 	for ( ; oldblkp < lastblkp; oldblkp++) {
 		blkno = *oldblkp;
 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 			continue;
 		lbn = fragstoblks(fs, blkno);
 		if (lbn < NDADDR) {
 			blkp = &ip->i_din1->di_db[lbn];
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		} else {
 			error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error)
 				return (error);
 			blkp = &((ufs1_daddr_t *)(ibp->b_data))
 			    [(lbn - NDADDR) % NINDIR(fs)];
 		}
 		/*
 		 * If we are expunging a snapshot vnode and we
 		 * find a block marked BLK_NOCOPY, then it is
 		 * one that has been allocated to this snapshot after
 		 * we took our current snapshot and can be ignored.
 		 */
 		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 			if (lbn >= NDADDR)
 				brelse(ibp);
 		} else {
 			if (*blkp != 0)
 				panic("snapacct_ufs1: bad block");
 			*blkp = expungetype;
 			if (lbn >= NDADDR)
 				bdwrite(ibp);
 		}
 	}
 	return (0);
 }
 
 /*
  * Release, via ffs_blkfree(), every block named in the UFS1 pointer
  * range [oldblkp, lastblkp). Zero and BLK_NOCOPY entries are skipped.
  * A BLK_SNAP entry is translated to the disk address matching its
  * logical block number. When the range describes data blocks (lblkno
  * is not -1 on entry) and expungetype is BLK_SNAP, the logical block
  * number of each real block is appended to the snapshot's
  * i_snapblklist. Always returns 0.
  */
 static int
 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs1_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;
 {
 	struct inode *snapip = VTOI(vp);
 	ino_t snapino = snapip->i_number;
 	ufs1_daddr_t daddr;
 	int record;
 
 	/* Only ranges with a real starting block number are recorded. */
 	record = (lblkno != -1);
 	while (oldblkp < lastblkp) {
 		daddr = *oldblkp;
 		if (daddr != 0 && daddr != BLK_NOCOPY) {
 			if (daddr == BLK_SNAP)
 				daddr = blkstofrags(fs, lblkno);
 			else if (record && expungetype == BLK_SNAP)
 				*snapip->i_snapblklist++ = lblkno;
 			ffs_blkfree(snapip->i_ump, fs, vp, daddr,
 			    fs->fs_bsize, snapino);
 		}
 		oldblkp++;
 		lblkno++;
 	}
 	return (0);
 }
 
 /*
  * Before expunging a snapshot inode, note all the
  * blocks that it claims with BLK_SNAP so that fsck will
  * be able to account for those blocks properly and so
  * that this snapshot knows that it need not copy them
  * if the other snapshot holding them is freed. This code
  * is reproduced once each for UFS1 and UFS2.
  */
 static int
 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct inode *cancelip;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int i, error, indiroff;
 	ufs_lbn_t lbn, rlbn;
 	ufs2_daddr_t len, blkno, numblks, blksperindir;
 	struct ufs2_dinode *dip;
 	struct thread *td = curthread;
 	struct buf *bp;
 
 	/*
 	 * Prepare to expunge the inode. If its inode block has not
 	 * yet been copied, then allocate and fill the copy.
 	 */
 	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
 	blkno = 0;
 	if (lbn < NDADDR) {
 		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
 	} else {
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
 		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			return (error);
 		indiroff = (lbn - NDADDR) % NINDIR(fs);
 		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
 		bqrelse(bp);
 	}
 	if (blkno != 0) {
 		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
 			return (error);
 	} else {
 		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &bp);
 		if (error)
 			return (error);
 		if ((error = readblock(snapvp, bp, lbn)) != 0)
 			return (error);
 	}
 	/*
 	 * Set a snapshot inode to be a zero length file, regular files
 	 * to be completely unallocated.
 	 */
 	dip = (struct ufs2_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, cancelip->i_number);
 	if (expungetype == BLK_NOCOPY)
 		dip->di_mode = 0;
 	dip->di_size = 0;
 	dip->di_blocks = 0;
 	dip->di_flags &= ~SF_SNAPSHOT;
 	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
 	bdwrite(bp);
 	/*
 	 * Now go through and expunge all the blocks in the file
 	 * using the function requested.
 	 */
 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
 	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
 		return (error);
 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
 	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
 		return (error);
 	blksperindir = 1;
 	lbn = -NDADDR;
 	len = numblks - NDADDR;
 	rlbn = NDADDR;
 	for (i = 0; len > 0 && i < NIADDR; i++) {
 		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
 		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
 		    blksperindir, fs, acctfunc, expungetype);
 		if (error)
 			return (error);
 		blksperindir *= NINDIR(fs);
 		lbn -= blksperindir + 1;
 		len -= blksperindir;
 		rlbn += blksperindir;
 	}
 	return (0);
 }
 
 /*
  * Descend an indirect block chain for vnode cancelvp accounting for all
  * its indirect blocks in snapvp.
  *
  * Parameters:
  *	snapvp		snapshot vnode handed through to acctfunc
  *	cancelvp	vnode whose indirect block is being walked
  *	level		indirection level; 0 means the block holds
  *			pointers to data blocks
  *	blkno		disk address of the indirect block
  *	lbn		logical block number of the indirect block
  *	rlbn		logical number of the first data block it covers
  *	remblks		number of data blocks still to be accounted for
  *	blksperindir	data blocks covered by each pointer in this block
  *	fs		superblock used for geometry calculations
  *	acctfunc	callback applied to each run of block pointers
  *	expungetype	BLK_NOCOPY or BLK_SNAP, passed through to acctfunc
  *
  * Returns 0 on success or the first error from ufs_getlbns(),
  * readblock() or acctfunc.
  */
 static int
 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
 	    blksperindir, fs, acctfunc, expungetype)
 	struct vnode *snapvp;
 	struct vnode *cancelvp;
 	int level;
 	ufs2_daddr_t blkno;
 	ufs_lbn_t lbn;
 	ufs_lbn_t rlbn;
 	ufs_lbn_t remblks;
 	ufs_lbn_t blksperindir;
 	struct fs *fs;
 	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
 	    struct fs *, ufs_lbn_t, int);
 	int expungetype;
 {
 	int error, num, i;
 	ufs_lbn_t subblksperindir;
 	struct indir indirs[NIADDR + 2];
 	ufs2_daddr_t last, *bap;
 	struct buf *bp;
 
 	/*
 	 * A missing indirect block is acceptable only when expunging
 	 * with BLK_NOCOPY; otherwise it is treated as fatal.
 	 */
 	if (blkno == 0) {
 		if (expungetype == BLK_NOCOPY)
 			return (0);
 		panic("indiracct_ufs2: missing indir");
 	}
 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
 		return (error);
 	/*
 	 * Validate num and level before they are used to index indirs[],
 	 * so that botched parameters panic instead of first reading
 	 * outside the array.
 	 */
 	if (num < 2 || level < 0 || level > num - 1 ||
 	    lbn != indirs[num - 1 - level].in_lbn)
 		panic("indiracct_ufs2: botched params");
 	/*
 	 * We have to expand bread here since it will deadlock looking
 	 * up the block number for any blocks that are not in the cache.
 	 */
 	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
 	bp->b_blkno = fsbtodb(fs, blkno);
 	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
 	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
 		brelse(bp);
 		return (error);
 	}
 	/*
 	 * Account for the block pointers in this indirect block.
 	 * A private copy is made so the buffer can be released before
 	 * the callback and any recursion run.
 	 */
 	last = howmany(remblks, blksperindir);
 	if (last > NINDIR(fs))
 		last = NINDIR(fs);
 	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
 	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
 	bqrelse(bp);
 	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
 	    level == 0 ? rlbn : -1, expungetype);
 	if (error || level == 0)
 		goto out;
 	/*
 	 * Account for the block pointers in each of the indirect blocks
 	 * in the levels below us.
 	 */
 	subblksperindir = blksperindir / NINDIR(fs);
 	for (lbn++, level--, i = 0; i < last; i++) {
 		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
 		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
 		if (error)
 			goto out;
 		rlbn += blksperindir;
 		lbn -= blksperindir;
 		remblks -= blksperindir;
 	}
 out:
 	FREE(bap, M_DEVBUF);
 	return (error);
 }
 
 /*
  * Do both snap accounting and map accounting.
  */
 static int
 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	int error;
 
 	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
 		return (error);
 	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
 }
 
 /*
  * Identify a set of blocks allocated in a snapshot inode.
  */
 static int
 snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
 {
 	struct inode *ip = VTOI(vp);
 	ufs2_daddr_t blkno, *blkp;
 	ufs_lbn_t lbn;
 	struct buf *ibp;
 	int error;
 
 	for ( ; oldblkp < lastblkp; oldblkp++) {
 		blkno = *oldblkp;
 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
 			continue;
 		lbn = fragstoblks(fs, blkno);
 		if (lbn < NDADDR) {
 			blkp = &ip->i_din2->di_db[lbn];
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		} else {
 			error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			if (error)
 				return (error);
 			blkp = &((ufs2_daddr_t *)(ibp->b_data))
 			    [(lbn - NDADDR) % NINDIR(fs)];
 		}
 		/*
 		 * If we are expunging a snapshot vnode and we
 		 * find a block marked BLK_NOCOPY, then it is
 		 * one that has been allocated to this snapshot after
 		 * we took our current snapshot and can be ignored.
 		 */
 		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
 			if (lbn >= NDADDR)
 				brelse(ibp);
 		} else {
 			if (*blkp != 0)
 				panic("snapacct_ufs2: bad block");
 			*blkp = expungetype;
 			if (lbn >= NDADDR)
 				bdwrite(ibp);
 		}
 	}
 	return (0);
 }
 
 /*
  * Release, via ffs_blkfree(), every block named in the UFS2 pointer
  * range [oldblkp, lastblkp). Zero and BLK_NOCOPY entries are skipped.
  * A BLK_SNAP entry is translated to the disk address matching its
  * logical block number. When the range describes data blocks (lblkno
  * is not -1 on entry) and expungetype is BLK_SNAP, the logical block
  * number of each real block is appended to the snapshot's
  * i_snapblklist. Always returns 0.
  */
 static int
 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
 	struct vnode *vp;
 	ufs2_daddr_t *oldblkp, *lastblkp;
 	struct fs *fs;
 	ufs_lbn_t lblkno;
 	int expungetype;
 {
 	struct inode *snapip = VTOI(vp);
 	ino_t snapino = snapip->i_number;
 	ufs2_daddr_t daddr;
 	int record;
 
 	/* Only ranges with a real starting block number are recorded. */
 	record = (lblkno != -1);
 	while (oldblkp < lastblkp) {
 		daddr = *oldblkp;
 		if (daddr != 0 && daddr != BLK_NOCOPY) {
 			if (daddr == BLK_SNAP)
 				daddr = blkstofrags(fs, lblkno);
 			else if (record && expungetype == BLK_SNAP)
 				*snapip->i_snapblklist++ = lblkno;
 			ffs_blkfree(snapip->i_ump, fs, vp, daddr,
 			    fs->fs_bsize, snapino);
 		}
 		oldblkp++;
 		lblkno++;
 	}
 	return (0);
 }
 
 /*
  * Decrement extra reference on snapshot when last name is removed.
  * It will not be freed until the last open reference goes away.
  */
 void
 ffs_snapgone(ip)
 	struct inode *ip;
 {
 	struct inode *xp;
 	struct fs *fs;
 	int snaploc;
 	struct snapdata *sn;
 	struct ufsmount *ump;
 
 	/*
 	 * Find snapshot in incore list.
 	 */
 	xp = NULL;
 	sn = ip->i_devvp->v_rdev->si_snapdata;
 	if (sn != NULL)
 		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
 			if (xp == ip)
 				break;
 	if (xp != NULL)
 		vrele(ITOV(ip));
 	else if (snapdebug)
 		printf("ffs_snapgone: lost snapshot vnode %d\n",
 		    ip->i_number);
 	/*
 	 * Delete snapshot inode from superblock. Keep list dense.
 	 */
 	fs = ip->i_fs;
 	ump = ip->i_ump;
 	UFS_LOCK(ump);
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
 		if (fs->fs_snapinum[snaploc] == ip->i_number)
 			break;
 	if (snaploc < FSMAXSNAP) {
 		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
 			if (fs->fs_snapinum[snaploc] == 0)
 				break;
 			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
 		}
 		fs->fs_snapinum[snaploc - 1] = 0;
 	}
 	UFS_UNLOCK(ump);
 }
 
 /*
  * Prepare a snapshot file for being removed.
  *
  * Takes the snapshot vnode vp off the device's incore snapshot list
  * (tearing down the shared snapshot state when it was the last one),
  * clears the BLK_NOCOPY and BLK_SNAP markers from its block pointers,
  * offers any blocks it had claimed to the remaining snapshots through
  * ffs_snapblkfree(), and finally clears SF_SNAPSHOT on the inode.
  */
 void
 ffs_snapremove(vp)
 	struct vnode *vp;
 {
 	struct inode *ip;
 	struct vnode *devvp;
 	struct lock *lkp;
 	struct buf *ibp;
 	struct fs *fs;
 	struct thread *td = curthread;
 	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
 	int error, loc, last;
 	struct snapdata *sn;
 
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	devvp = ip->i_devvp;
 	sn = devvp->v_rdev->si_snapdata;
 	/*
 	 * If active, delete from incore list (this snapshot may
 	 * already have been in the process of being deleted, so
 	 * would not have been active).
 	 *
 	 * Clear copy-on-write flag if last snapshot.
 	 */
 	if (ip->i_nextsnap.tqe_prev != 0) {
 		/*
 		 * Acquire the vnode's private lock, unlink the inode from
 		 * the list, point v_vnlock back at the private lock and
 		 * then release the lock v_vnlock pointed at before (lkp).
 		 */
 		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td);
 		VI_LOCK(devvp);
 		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
 		ip->i_nextsnap.tqe_prev = 0;
 		lkp = vp->v_vnlock;
 		vp->v_vnlock = &vp->v_lock;
 		lockmgr(lkp, LK_RELEASE, NULL, td);
 		if (TAILQ_FIRST(&sn->sn_head) != 0) {
 			/* Other snapshots remain; keep the shared state. */
 			VI_UNLOCK(devvp);
 		} else {
 			/*
 			 * That was the last snapshot: detach the block
 			 * list and snapshot data from the device, clear
 			 * VV_COPYONWRITE, drain and destroy the lock and
 			 * free the memory. The LK_DRAIN request passes the
 			 * device vnode interlock with LK_INTERLOCK.
 			 */
 			snapblklist = sn->sn_blklist;
 			sn->sn_blklist = 0;
 			sn->sn_listsize = 0;
 			devvp->v_rdev->si_snapdata = NULL;
 			devvp->v_vflag &= ~VV_COPYONWRITE;
 			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
 			lockmgr(lkp, LK_RELEASE, NULL, td);
 			lockdestroy(lkp);
 			free(sn, M_UFSMNT);
 			FREE(snapblklist, M_UFSMNT);
 		}
 	}
 	/*
 	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
 	 * snapshots that want them (see ffs_snapblkfree below).
 	 *
 	 * A pointer whose value equals the disk address of its own
 	 * logical block is treated as a claimed block; when
 	 * ffs_snapblkfree() returns non-zero the pointer is dropped here
 	 * and the inode's block count is reduced by one filesystem block.
 	 * NOTE(review): the scan starts at direct block 1, so direct
 	 * block 0 is never examined; the reason is not visible here.
 	 */
 	for (blkno = 1; blkno < NDADDR; blkno++) {
 		dblk = DIP(ip, i_db[blkno]);
 		if (dblk == 0)
 			continue;
 		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 			DIP_SET(ip, i_db[blkno], 0);
 		else if ((dblk == blkstofrags(fs, blkno) &&
 		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
 		     ip->i_number))) {
 			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
 			    btodb(fs->fs_bsize));
 			DIP_SET(ip, i_db[blkno], 0);
 		}
 	}
 	/*
 	 * Repeat the scan for the indirectly mapped part of the file,
 	 * one indirect block (NINDIR(fs) pointers) at a time. An indirect
 	 * block that cannot be fetched is skipped without reporting the
 	 * error.
 	 *
 	 * NOTE(review): the claimed-block test compares dblk against
 	 * blkstofrags(fs, blkno), where blkno is the first logical block
 	 * covered by this indirect block, not blkno + loc; verify that
 	 * this is intended. Likewise "last" is derived from fs->fs_size
 	 * minus a block number; confirm the units agree.
 	 */
 	numblks = howmany(ip->i_size, fs->fs_bsize);
 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
 		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 		if (error)
 			continue;
 		if (fs->fs_size - blkno > NINDIR(fs))
 			last = NINDIR(fs);
 		else
 			last = fs->fs_size - blkno;
 		for (loc = 0; loc < last; loc++) {
 			/* UFS1 and UFS2 differ only in pointer width. */
 			if (ip->i_ump->um_fstype == UFS1) {
 				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
 				if (dblk == 0)
 					continue;
 				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 				else if ((dblk == blkstofrags(fs, blkno) &&
 				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
 				     fs->fs_bsize, ip->i_number))) {
 					ip->i_din1->di_blocks -=
 					    btodb(fs->fs_bsize);
 					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
 				}
 				continue;
 			}
 			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
 			if (dblk == 0)
 				continue;
 			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
 				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 			else if ((dblk == blkstofrags(fs, blkno) &&
 			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
 			     fs->fs_bsize, ip->i_number))) {
 				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
 				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
 			}
 		}
 		/* Start writing the updated indirect block. */
 		bawrite(ibp);
 	}
 	/*
 	 * Clear snapshot flag and drop reference.
 	 */
 	ip->i_flags &= ~SF_SNAPSHOT;
 	DIP_SET(ip, i_flags, ip->i_flags);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 }
 
 /*
  * Notification that a block is being freed. Return zero if the free
  * should be allowed to proceed. Return non-zero if the snapshot file
  * wants to claim the block. The block will be claimed if it is an
  * uncopied part of one of the snapshots. It will be freed if it is
  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
  * If a fragment is being freed, then all snapshots that care about
  * it must make a copy since a snapshot file can only claim full sized
  * blocks. Note that if more than one snapshot file maps the block,
  * we can pick one at random to claim it. Since none of the snapshots
  * can change, we are assured that they will all see the same unmodified
  * image. When deleting a snapshot file (see ffs_snapremove above), we
  * must push any of these claimed blocks to one of the other snapshots
  * that maps it. These claimed blocks are easily identified as they will
  * have a block number equal to their logical block number within the
  * snapshot. A copied block can never have this property because they
  * must always have been allocated from a BLK_NOCOPY location.
  */
 int
 ffs_snapblkfree(fs, devvp, bno, size, inum)
 	struct fs *fs;		/* filesystem supplying block/frag geometry */
 	struct vnode *devvp;	/* device vnode; v_rdev->si_snapdata is consulted */
 	ufs2_daddr_t bno;	/* address of the block or fragment being freed */
 	long size;		/* bytes being freed; fs_bsize means a full block */
 	ino_t inum;		/* freeing inode number, used only in DEBUG output */
 {
 	struct buf *ibp, *cbp, *savedcbp = 0;
 	struct thread *td = curthread;
 	struct inode *ip;
 	struct vnode *vp = NULL;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t blkno;
 	int indiroff = 0, error = 0, claimedblk = 0;
 	struct snapdata *sn;
 
 	/*
 	 * Returns 0 when the caller may free the block (including when
 	 * the device has no snapshot data), 1 when a snapshot claimed a
 	 * full-sized block, and otherwise the error from UFS_BALLOC or
 	 * readblock so that the caller does not free the fragment.
 	 */
 	lbn = fragstoblks(fs, bno);
 retry:
 	VI_LOCK(devvp);
 	sn = devvp->v_rdev->si_snapdata;
 	if (sn == NULL) {
 		VI_UNLOCK(devvp);
 		return (0);
 	}
 	/*
 	 * LK_INTERLOCK drops the devvp interlock for us.  LK_SLEEPFAIL
 	 * makes the request fail if we had to sleep, in which case the
 	 * snapshot data is looked up again from scratch.
 	 */
 	if (lockmgr(&sn->sn_lock,
 		    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 		    VI_MTX(devvp), td) != 0)
 		goto retry;
 	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
 		vp = ITOV(ip);
 		/*
 		 * Lookup block being written.
 		 */
 		if (lbn < NDADDR) {
 			blkno = DIP(ip, i_db[lbn]);
 		} else {
 			td->td_pflags |= TDP_COWINPROGRESS;
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			td->td_pflags &= ~TDP_COWINPROGRESS;
 			if (error)
 				break;
 			indiroff = (lbn - NDADDR) % NINDIR(fs);
 			if (ip->i_ump->um_fstype == UFS1)
 				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 			else
 				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 		}
 		/*
 		 * Check to see if block needs to be copied.
 		 */
 		if (blkno == 0) {
 			/*
 			 * A block that we map is being freed. If it has not
 			 * been claimed yet, we will claim or copy it (below).
 			 */
 			claimedblk = 1;
 		} else if (blkno == BLK_SNAP) {
 			/*
 			 * No previous snapshot claimed the block,
 			 * so it will be freed and become a BLK_NOCOPY
 			 * (don't care) for us.
 			 */
 			if (claimedblk)
 				panic("snapblkfree: inconsistent block type");
 			if (lbn < NDADDR) {
 				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
 			} else if (ip->i_ump->um_fstype == UFS1) {
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 				bdwrite(ibp);
 			} else {
 				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
 				    BLK_NOCOPY;
 				bdwrite(ibp);
 			}
 			continue;
 		} else /* BLK_NOCOPY or default */ {
 			/*
 			 * If the snapshot has already copied the block
 			 * (default), or does not care about the block,
 			 * it is not needed.
 			 */
 			if (lbn >= NDADDR)
 				bqrelse(ibp);
 			continue;
 		}
 		/*
 		 * If this is a full size block, we will just grab it
 		 * and assign it to the snapshot inode. Otherwise we
 		 * will proceed to copy it. See explanation for this
 		 * routine as to why only a single snapshot needs to
 		 * claim this block.
 		 */
 		if (size == fs->fs_bsize) {
 #ifdef DEBUG
 			if (snapdebug)
 				printf("%s %d lbn %jd from inum %d\n",
 				    "Grabonremove: snapino", ip->i_number,
 				    (intmax_t)lbn, inum);
 #endif
 			if (lbn < NDADDR) {
 				DIP_SET(ip, i_db[lbn], bno);
 			} else if (ip->i_ump->um_fstype == UFS1) {
 				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
 				bdwrite(ibp);
 			} else {
 				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
 				bdwrite(ibp);
 			}
 			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
 			/*
 			 * Release the lock that was acquired above by
 			 * name rather than through vp->v_vnlock.
 			 */
 			lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
 			return (1);
 		}
 		if (lbn >= NDADDR)
 			bqrelse(ibp);
 		/*
 		 * Allocate the block into which to do the copy. Note that this
 		 * allocation will never require any additional allocations for
 		 * the snapshot inode.
 		 */
 		td->td_pflags |= TDP_COWINPROGRESS;
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			break;
 #ifdef DEBUG
 		if (snapdebug)
 			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
 			    "Copyonremove: snapino ", ip->i_number,
 			    (intmax_t)lbn, "for inum", inum, size,
 			    (intmax_t)cbp->b_blkno);
 #endif
 		/*
 		 * If we have already read the old block contents, then
 		 * simply copy them to the new block. Note that we need
 		 * to synchronously write snapshots that have not been
 		 * unlinked, and hence will be visible after a crash,
 		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) ffs_syncvnode(vp, MNT_WAIT);
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 * On a read error the copy is zero-filled, written out,
 		 * and the scan is abandoned with the error.
 		 */
 		if ((error = readblock(vp, cbp, lbn)) != 0) {
 			bzero(cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) ffs_syncvnode(vp, MNT_WAIT);
 			break;
 		}
 		savedcbp = cbp;
 	}
 	/*
 	 * Note that we need to synchronously write snapshots that
 	 * have not been unlinked, and hence will be visible after
 	 * a crash, to ensure their integrity.
 	 */
 	if (savedcbp) {
 		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
 		if (dopersistence && VTOI(vp)->i_effnlink > 0)
 			(void) ffs_syncvnode(vp, MNT_WAIT);
 	}
 	/*
 	 * If we have been unable to allocate a block in which to do
 	 * the copy, then return non-zero so that the fragment will
 	 * not be freed. Although space will be lost, the snapshot
 	 * will stay consistent.
 	 *
 	 * The lock is released through sn rather than vp: vp is still
 	 * NULL here when the snapshot list was empty, so going through
 	 * vp->v_vnlock would dereference a NULL pointer.
 	 */
 	lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
 	return (error);
 }
 
 /*
  * Associate snapshot files when mounting.
  */
 void
 ffs_snapshot_mount(mp)
 	struct mount *mp;	/* filesystem being mounted */
 {
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct vnode *devvp = ump->um_devvp;
 	struct fs *fs = ump->um_fs;
 	struct thread *td = curthread;
 	struct snapdata *sn;
 	struct vnode *vp;
 	struct inode *ip;
 	struct uio auio;
 	struct iovec aiov;
 	void *snapblklist;
 	char *reason;
 	daddr_t snaplistsize;
 	int error, snaploc, loc;
 
 	/*
 	 * XXX The following needs to be set before ffs_truncate or
 	 * VOP_READ can be called.
 	 */
 	mp->mnt_stat.f_iosize = fs->fs_bsize;
 	/*
 	 * Process each snapshot listed in the superblock.
 	 */
 	vp = NULL;
 	/*
 	 * NOTE(review): sn is sampled once here, before the loop and
 	 * without holding the devvp interlock.
 	 */
 	sn = devvp->v_rdev->si_snapdata;
 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
 		if (fs->fs_snapinum[snaploc] == 0)
 			break;
 		if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
 		    LK_EXCLUSIVE, &vp)) != 0){
 			printf("ffs_snapshot_mount: vget failed %d\n", error);
 			continue;
 		}
 		ip = VTOI(vp);
 		/*
 		 * Reject inodes that lack SF_SNAPSHOT, and snapshots whose
 		 * size is exactly the rounded-up filesystem size (reported
 		 * below as "old format"); the latter are also truncated.
 		 */
 		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
 		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
 			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
 				reason = "non-snapshot";
 			} else {
 				reason = "old format snapshot";
 				(void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
 				(void)ffs_syncvnode(vp, MNT_WAIT);
 			}
 			printf("ffs_snapshot_mount: %s inode %d\n",
 			    reason, fs->fs_snapinum[snaploc]);
 			vput(vp);
 			vp = NULL;
 			/*
 			 * Close the gap in fs_snapinum[] left by the rejected
 			 * entry, then step snaploc back so the entry that
 			 * moved into this slot is examined next.
 			 */
 			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
 				if (fs->fs_snapinum[loc] == 0)
 					break;
 				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
 			}
 			fs->fs_snapinum[loc - 1] = 0;
 			snaploc--;
 			continue;
 		}
 		/*
 		 * If there already exist snapshots on this filesystem, grab a
 		 * reference to their shared lock. If this is the first snapshot
 		 * on this filesystem, we need to allocate a lock for the
 		 * snapshots to share. In either case, acquire the snapshot
 		 * lock and give up our original private lock.
 		 */
 		VI_LOCK(devvp);
 		if (sn != NULL) {
 
 			VI_UNLOCK(devvp);
 			VI_LOCK(vp);
 			vp->v_vnlock = &sn->sn_lock;
 		} else {
 			VI_UNLOCK(devvp);
 			sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
 			TAILQ_INIT(&sn->sn_head);
 			lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
 			    LK_CANRECURSE | LK_NOSHARE);
 			VI_LOCK(vp);
 			vp->v_vnlock = &sn->sn_lock;
 			/* Published while holding vp's interlock, not devvp's. */
 			devvp->v_rdev->si_snapdata = sn;
 		}
 		/*
 		 * Take the shared snapshot lock (dropping vp's interlock),
 		 * move any waiters from the private lock over to it, and
 		 * then release the private lock obtained by ffs_vget.
 		 */
 		lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
 		    VI_MTX(vp), td);
 		transferlockers(&vp->v_lock, vp->v_vnlock);
 		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 		/*
 		 * Link it onto the active snapshot list.
 		 */
 		VI_LOCK(devvp);
 		if (ip->i_nextsnap.tqe_prev != 0)
 			panic("ffs_snapshot_mount: %d already on list",
 			    ip->i_number);
 		else
 			TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
 		vp->v_vflag |= VV_SYSTEM;
 		VI_UNLOCK(devvp);
 		VOP_UNLOCK(vp, 0, td);
 	}
 	/*
 	 * No usable snapshots found.
 	 *
 	 * NOTE(review): vp reflects only the last entry the loop handled;
 	 * it is NULL if that entry was rejected even when earlier entries
 	 * were attached, and after a failed ffs_vget its value depends on
 	 * what ffs_vget stores on error - confirm.
 	 */
 	if (vp == NULL)
 		return;
 	/*
 	 * Allocate the space for the block hints list. We always want to
 	 * use the list from the newest snapshot.
 	 *
 	 * The first read fetches a single daddr_t, the list length, found
 	 * at the rounded-up filesystem size within the snapshot file.
 	 */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = (void *)&snaplistsize;
 	aiov.iov_len = sizeof(snaplistsize);
 	auio.uio_resid = aiov.iov_len;
 	auio.uio_offset =
 	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
 		VOP_UNLOCK(vp, 0, td);
 		return;
 	}
 	/*
 	 * NOTE(review): snaplistsize comes straight from the file and is
 	 * used as an allocation size without any range check.
 	 */
 	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
 	    M_UFSMNT, M_WAITOK);
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = snapblklist;
 	aiov.iov_len = snaplistsize * sizeof (daddr_t);
 	auio.uio_resid = aiov.iov_len;
 	/*
 	 * Back the offset up by the size of the length word so the second
 	 * read starts where the first did (presumably VOP_READ advanced
 	 * uio_offset - confirm); entry 0 of the list is then the length.
 	 */
 	auio.uio_offset -= sizeof(snaplistsize);
 	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
 		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
 		VOP_UNLOCK(vp, 0, td);
 		FREE(snapblklist, M_UFSMNT);
 		return;
 	}
 	VOP_UNLOCK(vp, 0, td);
 	/*
 	 * Install the hint list and only then mark the device as needing
 	 * copy-on-write checks.
 	 */
 	VI_LOCK(devvp);
 	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
 	sn->sn_listsize = snaplistsize;
 	sn->sn_blklist = (daddr_t *)snapblklist;
 	devvp->v_vflag |= VV_COPYONWRITE;
 	VI_UNLOCK(devvp);
 }
 
 /*
  * Disassociate snapshot files when unmounting.
  */
 void
 ffs_snapshot_unmount(mp)
 	struct mount *mp;	/* filesystem being unmounted */
 {
 	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
 	struct snapdata *sn;
 	struct inode *xp;
 	struct vnode *vp;
 
 	/*
 	 * Look up the snapshot data under the device interlock, and cope
 	 * with there being none rather than dereferencing a NULL pointer.
 	 */
 	VI_LOCK(devvp);
 	sn = devvp->v_rdev->si_snapdata;
 	while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != 0) {
 		vp = ITOV(xp);
 		/* Point the vnode back at its private lock. */
 		vp->v_vnlock = &vp->v_lock;
 		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
 		xp->i_nextsnap.tqe_prev = 0;
 		/*
 		 * Only snapshots that are still linked get a vrele();
 		 * the interlock is dropped around the call.
 		 */
 		if (xp->i_effnlink > 0) {
 			VI_UNLOCK(devvp);
 			vrele(vp);
 			VI_LOCK(devvp);
 		}
 	}
 	devvp->v_rdev->si_snapdata = NULL;
 	devvp->v_vflag &= ~VV_COPYONWRITE;
 	VI_UNLOCK(devvp);
 	if (sn != NULL) {
 		if (sn->sn_blklist != NULL) {
 			FREE(sn->sn_blklist, M_UFSMNT);
 			sn->sn_blklist = NULL;
 			sn->sn_listsize = 0;
 		}
 		lockdestroy(&sn->sn_lock);
 		free(sn, M_UFSMNT);
 	}
 	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
 }
 
 /*
  * Check for need to copy block that is about to be written,
  * copying the block if necessary.
  */
 int
 ffs_copyonwrite(devvp, bp)
 	struct vnode *devvp;	/* device vnode carrying the snapshot data */
 	struct buf *bp;		/* buffer about to be written */
 {
 	struct snapdata *sn;
 	struct buf *ibp, *cbp, *savedcbp = 0;
 	struct thread *td = curthread;
 	struct fs *fs;
 	struct inode *ip;
 	struct vnode *vp = 0;
 	ufs2_daddr_t lbn, blkno, *snapblklist;
 	int lower, upper, mid, indiroff, error = 0;
 	int launched_async_io, prev_norunningbuf;
 
 	/*
 	 * Returns 0 when no copy was needed or every copy succeeded,
 	 * otherwise the error from UFS_BALLOC or readblock.
 	 */
 	if ((VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0)
 		return (0);		/* Update on a snapshot file */
 	if (td->td_pflags & TDP_COWINPROGRESS)
 		panic("ffs_copyonwrite: recursive call");
 	/*
 	 * First check to see if it is in the preallocated list.
 	 * By doing this check we avoid several potential deadlocks.
 	 */
 	VI_LOCK(devvp);
 	sn = devvp->v_rdev->si_snapdata;
 	if (sn == NULL ||
 	    TAILQ_FIRST(&sn->sn_head) == NULL) {
 		VI_UNLOCK(devvp);
 		return (0);		/* No snapshot */
 	}
 	ip = TAILQ_FIRST(&sn->sn_head);
 	fs = ip->i_fs;
 	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
 	snapblklist = sn->sn_blklist;
 	/*
 	 * Binary search for lbn.  The search starts at index 1, so
 	 * entry 0 is never compared (presumably it holds the list
 	 * length - confirm against the code that builds the list).
 	 */
 	upper = sn->sn_listsize - 1;
 	lower = 1;
 	while (lower <= upper) {
 		mid = (lower + upper) / 2;
 		if (snapblklist[mid] == lbn)
 			break;
 		if (snapblklist[mid] < lbn)
 			lower = mid + 1;
 		else
 			upper = mid - 1;
 	}
 	/* The bounds have not crossed only if the loop broke on a match. */
 	if (lower <= upper) {
 		VI_UNLOCK(devvp);
 		return (0);
 	}
 	launched_async_io = 0;
 	/* Remember the caller's TDP_NORUNNINGBUF so it can be restored. */
 	prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF;
 	/*
 	 * Since I/O on bp isn't yet in progress and it may be blocked
 	 * for a long time waiting on snaplk, back it out of
 	 * runningbufspace, possibly waking other threads waiting for space.
 	 */
 	runningbufwakeup(bp);
 	/*
 	 * Not in the precomputed list, so check the snapshots.
 	 *
 	 * A failed LK_SLEEPFAIL request means we slept; the snapshot data
 	 * is looked up again, and if it is gone bp is put back into
 	 * runningbufspace before returning.
 	 */
 	while (lockmgr(&sn->sn_lock,
 		       LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
 		       VI_MTX(devvp), td) != 0) {
 		VI_LOCK(devvp);
 		sn = devvp->v_rdev->si_snapdata;
 		if (sn == NULL ||
 		    TAILQ_FIRST(&sn->sn_head) == NULL) {
 			VI_UNLOCK(devvp);
 			if (bp->b_runningbufspace)
 				atomic_add_int(&runningbufspace,
 					       bp->b_runningbufspace);
 			return (0);		/* Snapshot gone */
 		}
 	}
 	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
 		vp = ITOV(ip);
 		/*
 		 * We ensure that everything of our own that needs to be
 		 * copied will be done at the time that ffs_snapshot is
 		 * called. Thus we can skip the check here which can
 		 * deadlock in doing the lookup in UFS_BALLOC.
 		 */
 		if (bp->b_vp == vp)
 			continue;
 		/*
 		 * Check to see if block needs to be copied. We do not have
 		 * to hold the snapshot lock while doing this lookup as it
 		 * will never require any additional allocations for the
 		 * snapshot inode.
 		 */
 		if (lbn < NDADDR) {
 			blkno = DIP(ip, i_db[lbn]);
 		} else {
 			/*
 			 * TDP_NORUNNINGBUF stays set after this call; only
 			 * TDP_COWINPROGRESS is cleared.  The saved value is
 			 * restored once the loop is done.
 			 */
 			td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
 			td->td_pflags &= ~TDP_COWINPROGRESS;
 			if (error)
 				break;
 			indiroff = (lbn - NDADDR) % NINDIR(fs);
 			if (ip->i_ump->um_fstype == UFS1)
 				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
 			else
 				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
 			bqrelse(ibp);
 		}
 #ifdef DIAGNOSTIC
 		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
 			panic("ffs_copyonwrite: bad copy block");
 #endif
 		/* Any non-zero entry means this snapshot needs no copy. */
 		if (blkno != 0)
 			continue;
 		/*
 		 * Allocate the block into which to do the copy. Since
 		 * multiple processes may all try to copy the same block,
 		 * we have to recheck our need to do a copy if we sleep
 		 * waiting for the lock.
 		 *
 		 * Because all snapshots on a filesystem share a single
 		 * lock, we ensure that we will never be in competition
 		 * with another process to allocate a block.
 		 */
 		td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
 		    fs->fs_bsize, KERNCRED, 0, &cbp);
 		td->td_pflags &= ~TDP_COWINPROGRESS;
 		if (error)
 			break;
 #ifdef DEBUG
 		if (snapdebug) {
 			printf("Copyonwrite: snapino %d lbn %jd for ",
 			    ip->i_number, (intmax_t)lbn);
 			if (bp->b_vp == devvp)
 				printf("fs metadata");
 			else
 				printf("inum %d", VTOI(bp->b_vp)->i_number);
 			printf(" lblkno %jd to blkno %jd\n",
 			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
 		}
 #endif
 		/*
 		 * If we have already read the old block contents, then
 		 * simply copy them to the new block. Note that we need
 		 * to synchronously write snapshots that have not been
 		 * unlinked, and hence will be visible after a crash,
 		 * to ensure their integrity.
 		 */
 		if (savedcbp != 0) {
 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) ffs_syncvnode(vp, MNT_WAIT);
 			else
 				launched_async_io = 1;
 			continue;
 		}
 		/*
 		 * Otherwise, read the old block contents into the buffer.
 		 * On a read error the copy is zero-filled, written out,
 		 * and the scan stops with the error.
 		 */
 		if ((error = readblock(vp, cbp, lbn)) != 0) {
 			bzero(cbp->b_data, fs->fs_bsize);
 			bawrite(cbp);
 			if (dopersistence && ip->i_effnlink > 0)
 				(void) ffs_syncvnode(vp, MNT_WAIT);
 			else
 				launched_async_io = 1;
 			break;
 		}
 		savedcbp = cbp;
 	}
 	/*
 	 * Note that we need to synchronously write snapshots that
 	 * have not been unlinked, and hence will be visible after
 	 * a crash, to ensure their integrity.
 	 */
 	if (savedcbp) {
 		vp = savedcbp->b_vp;
 		bawrite(savedcbp);
 		if (dopersistence && VTOI(vp)->i_effnlink > 0)
 			(void) ffs_syncvnode(vp, MNT_WAIT);
 		else
 			launched_async_io = 1;
 	}
 	/*
 	 * vp was set by the loop above; the list was checked to be
 	 * non-empty before the snapshot lock was requested.
 	 */
 	lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td);
 	/* Put TDP_NORUNNINGBUF back the way the caller had it. */
 	td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) |
 		prev_norunningbuf;
 	if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0)
 		waitrunningbufspace();
 	/*
 	 * I/O on bp will now be started, so count it in runningbufspace.
 	 */
 	if (bp->b_runningbufspace)
 		atomic_add_int(&runningbufspace, bp->b_runningbufspace);
 	return (error);
 }
 
 /*
  * Read the specified block into the given buffer.
  * Much of this boiler-plate comes from bwrite().
  */
 static int
 readblock(vp, bp, lbn)
 	struct vnode *vp;
 	struct buf *bp;
 	ufs2_daddr_t lbn;
 {
 	struct inode *snapip;
 	struct bio *rdbio;
 	struct fs *snapfs;
 
 	snapip = VTOI(vp);
 	snapfs = snapip->i_fs;
 
 	/* Describe a read of logical block lbn straight into bp's data. */
 	rdbio = g_alloc_bio();
 	rdbio->bio_cmd = BIO_READ;
 	rdbio->bio_offset =
 	    dbtob(fsbtodb(snapfs, blkstofrags(snapfs, lbn)));
 	rdbio->bio_data = bp->b_data;
 	rdbio->bio_length = bp->b_bcount;
 
 	g_io_request(rdbio, snapip->i_devvp->v_bufobj.bo_private);
 
 	/* Sleep at least once, then poll until the request is flagged done. */
 	for (;;) {
 		msleep(rdbio, NULL, PRIBIO, "snaprdb", hz/10);
 		if ((rdbio->bio_flags & BIO_DONE) != 0)
 			break;
 	}
 
 	/* Hand the completion status back through the buffer. */
 	bp->b_error = rdbio->bio_error;
 	g_destroy_bio(rdbio);
 	return (bp->b_error);
 }
 
 #endif
Index: head/sys/ufs/ffs/ffs_softdep.c
===================================================================
--- head/sys/ufs/ffs/ffs_softdep.c	(revision 156202)
+++ head/sys/ufs/ffs/ffs_softdep.c	(revision 156203)
@@ -1,5989 +1,6048 @@
 /*-
  * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
  *
  * The soft updates code is derived from the appendix of a University
  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
  * "Soft Updates: A Solution to the Metadata Update Problem in File
  * Systems", CSE-TR-254-95, August 1995).
  *
  * Further information about soft updates can be obtained from:
  *
  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
  *	1614 Oxford Street		mckusick@mckusick.com
  *	Berkeley, CA 94709-1608		+1-510-843-9542
  *	USA
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
  */
 #ifndef DIAGNOSTIC
 #define DIAGNOSTIC
 #endif
 #ifndef DEBUG
 #define DEBUG
 #endif
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/kdb.h>
+#include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/vnode.h>
 #include <sys/conf.h>
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/softdep.h>
 #include <ufs/ffs/ffs_extern.h>
 #include <ufs/ufs/ufs_extern.h>
 
 #include <vm/vm.h>
 
 #include "opt_ffs.h"
 
 #ifndef SOFTUPDATES
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 int
 softdep_flushfiles(oldmnt, flags, td)
 	struct mount *oldmnt;
 	int flags;
 	struct thread *td;
 {
 
 	panic("softdep_flushfiles called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: ignores its arguments
  * and reports success.
  */
 int
 softdep_mount(devvp, mp, fs, cred)
 	struct vnode *devvp;
 	struct mount *mp;
 	struct fs *fs;
 	struct ucred *cred;
 {
 
 	return (0);
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: does nothing.
  */
 void 
 softdep_initialize()
 {
 
 	return;
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: does nothing.
  */
 void
 softdep_uninitialize()
 {
 
 	return;
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void
 softdep_setup_inomapdep(bp, ip, newinum)
 	struct buf *bp;
 	struct inode *ip;
 	ino_t newinum;
 {
 
 	panic("softdep_setup_inomapdep called");
 }
 
 void
-softdep_setup_blkmapdep(bp, fs, newblkno)
+softdep_setup_blkmapdep(bp, mp, newblkno)
 	struct buf *bp;
-	struct fs *fs;
+	struct mount *mp;
 	ufs2_daddr_t newblkno;
 {
 
 	panic("softdep_setup_blkmapdep called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void 
 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t newblkno;
 	ufs2_daddr_t oldblkno;
 	long newsize;
 	long oldsize;
 	struct buf *bp;
 {
 	
 	panic("softdep_setup_allocdirect called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: all arguments are
  * ignored and the kernel panics, naming this function in the message.
  */
 void 
 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t newblkno;
 	ufs2_daddr_t oldblkno;
 	long newsize;
 	long oldsize;
 	struct buf *bp;
 {
 	
 	panic("softdep_setup_allocext called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void
 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	struct buf *bp;
 	int ptrno;
 	ufs2_daddr_t newblkno;
 	ufs2_daddr_t oldblkno;
 	struct buf *nbp;
 {
 
 	panic("softdep_setup_allocindir_page called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void
 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 	struct buf *nbp;
 	struct inode *ip;
 	struct buf *bp;
 	int ptrno;
 	ufs2_daddr_t newblkno;
 {
 
 	panic("softdep_setup_allocindir_meta called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void
 softdep_setup_freeblocks(ip, length, flags)
 	struct inode *ip;
 	off_t length;
 	int flags;
 {
 	
 	panic("softdep_setup_freeblocks called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void
 softdep_freefile(pvp, ino, mode)
 		struct vnode *pvp;
 		ino_t ino;
 		int mode;
 {
 
 	panic("softdep_freefile called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 int 
 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 	struct buf *bp;
 	struct inode *dp;
 	off_t diroffset;
 	ino_t newinum;
 	struct buf *newdirbp;
 	int isnewblk;
 {
 
 	panic("softdep_setup_directory_add called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void 
 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
 	struct inode *dp;
 	caddr_t base;
 	caddr_t oldloc;
 	caddr_t newloc;
 	int entrysize;
 {
 
 	panic("softdep_change_directoryentry_offset called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void 
 softdep_setup_remove(bp, dp, ip, isrmdir)
 	struct buf *bp;
 	struct inode *dp;
 	struct inode *ip;
 	int isrmdir;
 {
 	
 	panic("softdep_setup_remove called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void 
 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
 	struct buf *bp;
 	struct inode *dp;
 	struct inode *ip;
 	ino_t newinum;
 	int isrmdir;
 {
 
 	panic("softdep_setup_directory_change called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void
 softdep_change_linkcnt(ip)
 	struct inode *ip;
 {
 
 	panic("softdep_change_linkcnt called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void 
 softdep_load_inodeblock(ip)
 	struct inode *ip;
 {
 
 	panic("softdep_load_inodeblock called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void 
 softdep_update_inodeblock(ip, bp, waitfor)
 	struct inode *ip;
 	struct buf *bp;
 	int waitfor;
 {
 
 	panic("softdep_update_inodeblock called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: does nothing and
  * reports success.
  */
 int
 softdep_fsync(vp)
 	struct vnode *vp;	/* the "in_core" copy of the inode */
 {
 
 	return (0);
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: does nothing.
  */
 void
 softdep_fsync_mountdev(vp)
 	struct vnode *vp;
 {
 
 	return;
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: stores zero through
  * countp and reports success.
  */
 int
 softdep_flushworklist(oldmnt, countp, td)
 	struct mount *oldmnt;
 	int *countp;
 	struct thread *td;
 {
 
 	*countp = 0;
 	return (0);
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: does nothing and
  * reports success.
  */
 int
 softdep_sync_metadata(struct vnode *vp)
 {
 
 	return (0);
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 int
 softdep_slowdown(vp)
 	struct vnode *vp;
 {
 
 	panic("softdep_slowdown called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: panics if ever called.
  */
 void
 softdep_releasefile(ip)
 	struct inode *ip;	/* inode with the zero effective link count */
 {
 
 	panic("softdep_releasefile called");
 }
 
 /*
  * Stub compiled when SOFTUPDATES is not defined: does nothing and
  * returns 0.
  */
 int
 softdep_request_cleanup(fs, vp)
 	struct fs *fs;
 	struct vnode *vp;
 {
 
 	return (0);
 }
 
 #else
 /*
  * These definitions need to be adapted to the system to which
  * this file is being ported.
  */
 /*
  * malloc types defined for the softdep system.
  */
 static MALLOC_DEFINE(M_PAGEDEP, "pagedep","File page dependencies");
 static MALLOC_DEFINE(M_INODEDEP, "inodedep","Inode dependencies");
 static MALLOC_DEFINE(M_NEWBLK, "newblk","New block allocation");
 static MALLOC_DEFINE(M_BMSAFEMAP, "bmsafemap","Block or frag allocated from cyl group map");
 static MALLOC_DEFINE(M_ALLOCDIRECT, "allocdirect","Block or frag dependency for an inode");
 static MALLOC_DEFINE(M_INDIRDEP, "indirdep","Indirect block dependencies");
 static MALLOC_DEFINE(M_ALLOCINDIR, "allocindir","Block dependency for an indirect block");
 static MALLOC_DEFINE(M_FREEFRAG, "freefrag","Previously used frag for an inode");
 static MALLOC_DEFINE(M_FREEBLKS, "freeblks","Blocks freed from an inode");
 static MALLOC_DEFINE(M_FREEFILE, "freefile","Inode deallocated");
 static MALLOC_DEFINE(M_DIRADD, "diradd","New directory entry");
 static MALLOC_DEFINE(M_MKDIR, "mkdir","New directory");
 static MALLOC_DEFINE(M_DIRREM, "dirrem","Directory entry deleted");
 static MALLOC_DEFINE(M_NEWDIRBLK, "newdirblk","Unclaimed new directory block");
 /*
  * M_SAVEDINO has no D_* workitem code and no slot in memtype[] below,
  * so it cannot be reached through DtoM().
  */
 static MALLOC_DEFINE(M_SAVEDINO, "savedino","Saved inodes");
 
 #define M_SOFTDEP_FLAGS	(M_WAITOK | M_USE_RESERVE)
 
 /*
  * Workitem type codes.  Each value doubles as the index of the matching
  * malloc type in memtype[], which is how DtoM() translates a code.
  * D_LAST names the highest valid code.
  */
 #define	D_PAGEDEP	0
 #define	D_INODEDEP	1
 #define	D_NEWBLK	2
 #define	D_BMSAFEMAP	3
 #define	D_ALLOCDIRECT	4
 #define	D_INDIRDEP	5
 #define	D_ALLOCINDIR	6
 #define	D_FREEFRAG	7
 #define	D_FREEBLKS	8
 #define	D_FREEFILE	9
 #define	D_DIRADD	10
 #define	D_MKDIR		11
 #define	D_DIRREM	12
 #define	D_NEWDIRBLK	13
 #define	D_LAST		D_NEWDIRBLK
 
 /* 
  * translate from workitem type to memory type
  * MUST match the defines above, such that memtype[D_XXX] == M_XXX
  */
 static struct malloc_type *memtype[] = {
 	M_PAGEDEP,	/* D_PAGEDEP (0) */
 	M_INODEDEP,	/* D_INODEDEP (1) */
 	M_NEWBLK,	/* D_NEWBLK (2) */
 	M_BMSAFEMAP,	/* D_BMSAFEMAP (3) */
 	M_ALLOCDIRECT,	/* D_ALLOCDIRECT (4) */
 	M_INDIRDEP,	/* D_INDIRDEP (5) */
 	M_ALLOCINDIR,	/* D_ALLOCINDIR (6) */
 	M_FREEFRAG,	/* D_FREEFRAG (7) */
 	M_FREEBLKS,	/* D_FREEBLKS (8) */
 	M_FREEFILE,	/* D_FREEFILE (9) */
 	M_DIRADD,	/* D_DIRADD (10) */
 	M_MKDIR,	/* D_MKDIR (11) */
 	M_DIRREM,	/* D_DIRREM (12) */
 	M_NEWDIRBLK	/* D_NEWDIRBLK (13) == D_LAST */
 };
 
 #define DtoM(type) (memtype[type])
 
 /*
  * Names of malloc types.
  *
  * Map a workitem type code to the short description of its malloc type,
  * or to "???" when the code lies outside 0..D_LAST.  D_LAST is itself a
  * valid index into memtype[], so the bound is inclusive.
  */
 #define TYPENAME(type)  \
 	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
 /*
  * End system adaptation definitions.
  */
 
 /*
  * Forward declarations.
  */
 struct inodedep_hashhead;
 struct newblk_hashhead;
 struct pagedep_hashhead;
 
 /*
  * Internal function prototypes.
  */
 static	void softdep_error(char *, int);
 static	void drain_output(struct vnode *);
 static	struct buf *getdirtybuf(struct buf *, struct mtx *, int);
 static	void clear_remove(struct thread *);
 static	void clear_inodedeps(struct thread *);
 static	int flush_pagedep_deps(struct vnode *, struct mount *,
 	    struct diraddhd *);
-static	int flush_inodedep_deps(struct fs *, ino_t);
+static	int flush_inodedep_deps(struct mount *, ino_t);
 static	int flush_deplist(struct allocdirectlst *, int, int *);
 static	int handle_written_filepage(struct pagedep *, struct buf *);
 static  void diradd_inode_written(struct diradd *, struct inodedep *);
 static	int handle_written_inodeblock(struct inodedep *, struct buf *);
 static	void handle_allocdirect_partdone(struct allocdirect *);
 static	void handle_allocindir_partdone(struct allocindir *);
 static	void initiate_write_filepage(struct pagedep *, struct buf *);
 static	void handle_written_mkdir(struct mkdir *, int);
 static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
 static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
 static	void handle_workitem_freefile(struct freefile *);
 static	void handle_workitem_remove(struct dirrem *, struct vnode *);
 static	struct dirrem *newdirrem(struct buf *, struct inode *,
 	    struct inode *, int, struct dirrem **);
 static	void free_diradd(struct diradd *);
 static	void free_allocindir(struct allocindir *, struct inodedep *);
 static	void free_newdirblk(struct newdirblk *);
 static	int indir_trunc(struct freeblks *, ufs2_daddr_t, int, ufs_lbn_t,
 	    ufs2_daddr_t *);
 static	void deallocate_dependencies(struct buf *, struct inodedep *);
 static	void free_allocdirect(struct allocdirectlst *,
 	    struct allocdirect *, int);
 static	int check_inode_unwritten(struct inodedep *);
 static	int free_inodedep(struct inodedep *);
 static	void handle_workitem_freeblocks(struct freeblks *, int);
 static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
 static	void setup_allocindir_phase2(struct buf *, struct inode *,
 	    struct allocindir *);
 static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
 	    ufs2_daddr_t);
 static	void handle_workitem_freefrag(struct freefrag *);
 static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long);
 static	void allocdirect_merge(struct allocdirectlst *,
 	    struct allocdirect *, struct allocdirect *);
-static	struct bmsafemap *bmsafemap_lookup(struct buf *);
+static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *);
 static	int newblk_find(struct newblk_hashhead *, struct fs *, ufs2_daddr_t,
 	    struct newblk **);
 static	int newblk_lookup(struct fs *, ufs2_daddr_t, int, struct newblk **);
 static	int inodedep_find(struct inodedep_hashhead *, struct fs *, ino_t,
 	    struct inodedep **);
-static	int inodedep_lookup(struct fs *, ino_t, int, struct inodedep **);
+static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
 static	int pagedep_lookup(struct inode *, ufs_lbn_t, int, struct pagedep **);
 static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
 	    struct mount *mp, int, struct pagedep **);
 static	void pause_timer(void *);
-static	int request_cleanup(int);
+static	int request_cleanup(struct mount *, int);
 static	int process_worklist_item(struct mount *, int);
 static	void add_to_worklist(struct worklist *);
+static	void softdep_flush(void);
+static	int softdep_speedup(void);
 
 /*
  * Exported softdep operations.
  */
 static	void softdep_disk_io_initiation(struct buf *);
 static	void softdep_disk_write_complete(struct buf *);
 static	void softdep_deallocate_dependencies(struct buf *);
 static	int softdep_count_dependencies(struct buf *bp, int);
 
 static struct mtx lk;
 MTX_SYSINIT(softdep_lock, &lk, "Softdep Lock", MTX_DEF);
 
 #define ACQUIRE_LOCK(lk)		mtx_lock(lk)
 #define FREE_LOCK(lk)			mtx_unlock(lk)
 
 /*
  * Worklist queue management.
  * These routines require that the lock be held.
  */
 #ifndef /* NOT */ DEBUG
 /*
  * Unchecked variants used when DEBUG is not defined: insertion flags the
  * item ONWORKLIST and links it at the head of the list; removal clears
  * the flag and unlinks the item.
  */
 #define WORKLIST_INSERT(head, item) do {	\
 	(item)->wk_state |= ONWORKLIST;		\
 	LIST_INSERT_HEAD(head, item, wk_list);	\
 } while (0)
 #define WORKLIST_REMOVE(item) do {		\
 	(item)->wk_state &= ~ONWORKLIST;	\
 	LIST_REMOVE(item, wk_list);		\
 } while (0)
-#define WORKITEM_FREE(item, type) FREE(item, DtoM(type))
-
 #else /* DEBUG */
 static	void worklist_insert(struct workhead *, struct worklist *);
 static	void worklist_remove(struct worklist *);
-static	void workitem_free(struct worklist *, int);
 
 #define WORKLIST_INSERT(head, item) worklist_insert(head, item)
 #define WORKLIST_REMOVE(item) worklist_remove(item)
-#define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type)
 
 static void
 worklist_insert(head, item)
 	struct workhead *head;
 	struct worklist *item;
 {
 	int already_listed;
 
 	/* The softdep lock must be held by the caller. */
 	mtx_assert(&lk, MA_OWNED);
 
 	/* An item may sit on at most one worklist at a time. */
 	already_listed = (item->wk_state & ONWORKLIST) != 0;
 	if (already_listed) {
 		panic("worklist_insert: already on list");
 	}
 
 	/* Link at the head and record that the item is now listed. */
 	LIST_INSERT_HEAD(head, item, wk_list);
 	item->wk_state |= ONWORKLIST;
 }
 
 static void
 worklist_remove(item)
 	struct worklist *item;
 {
 	int listed;
 
 	/* The softdep lock must be held by the caller. */
 	mtx_assert(&lk, MA_OWNED);
 
 	/* Removing an item that is not on a list is a fatal error. */
 	listed = (item->wk_state & ONWORKLIST) != 0;
 	if (!listed) {
 		panic("worklist_remove: not on list");
 	}
 
 	/* Unlink the item and record that it is no longer listed. */
 	LIST_REMOVE(item, wk_list);
 	item->wk_state &= ~ONWORKLIST;
 }
+#endif /* DEBUG */
 
+/*
+ * Routines for tracking and managing workitems.
+ */
+static	void workitem_free(struct worklist *, int);
+static	void workitem_alloc(struct worklist *, int, struct mount *);
+
+#define	WORKITEM_FREE(item, type) workitem_free((struct worklist *)(item), (type))
+
 static void
 workitem_free(item, type)
 	struct worklist *item;
 	int type;
 {
+	struct ufsmount *ump;		/* mount the item was charged to */
+	mtx_assert(&lk, MA_OWNED);	/* caller must hold lk */
 
+#ifdef DEBUG	/* sanity checks are compiled only into DEBUG kernels */
 	if (item->wk_state & ONWORKLIST)
 		panic("workitem_free: still on list");
 	if (item->wk_type != type)
 		panic("workitem_free: type mismatch");
+#endif
+	ump = VFSTOUFS(item->wk_mp);	/* wk_mp is set by workitem_alloc */
+	if (--ump->softdep_deps == 0 && ump->softdep_req)
+		wakeup(&ump->softdep_deps);	/* count hit zero; wake sleeper */
 	FREE(item, DtoM(type));
 }
-#endif /* DEBUG */
 
+static void
+workitem_alloc(item, type, mp)	/* init a workitem header, charge it to mp */
+	struct worklist *item;	/* workitem header to initialize */
+	int type;		/* stored in wk_type (callers pass D_* values) */
+	struct mount *mp;	/* mount whose softdep_deps count is bumped */
+{
+	item->wk_type = type;
+	item->wk_mp = mp;
+	item->wk_state = 0;
+	ACQUIRE_LOCK(&lk);	/* takes lk itself: call with lk released */
+	VFSTOUFS(mp)->softdep_deps++;	/* decremented in workitem_free */
+	FREE_LOCK(&lk);
+}
+
 /*
  * Workitem queue management
  */
-static struct workhead softdep_workitem_pending;
-static struct worklist *worklist_tail;
-static int num_on_worklist;	/* number of worklist items to be processed */
-static int softdep_worklist_busy; /* 1 => trying to do unmount */
-static int softdep_worklist_req; /* serialized waiters */
 static int max_softdeps;	/* maximum number of structs before slowdown */
 static int maxindirdeps = 50;	/* max number of indirdeps before slowdown */
 static int tickdelay = 2;	/* number of ticks to pause during slowdown */
 static int proc_waiting;	/* tracks whether we have a timeout posted */
 static int *stat_countp;	/* statistic to count in proc_waiting timeout */
 static struct callout_handle handle; /* handle on posted proc_waiting timeout */
-static struct thread *filesys_syncer; /* proc of filesystem syncer process */
+static int req_pending;		/* softdep_flush wakeup has been requested */
 static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
 #define FLUSH_INODES		1
 static int req_clear_remove;	/* syncer process flush some freeblks */
 #define FLUSH_REMOVE		2
 #define FLUSH_REMOVE_WAIT	3
 /*
  * runtime statistics
  */
 static int stat_worklist_push;	/* number of worklist cleanups */
 static int stat_blk_limit_push;	/* number of times block limit neared */
 static int stat_ino_limit_push;	/* number of times inode limit neared */
 static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
 static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
 static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
 static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
 static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
 static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
 static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
 
 SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, maxindirdeps, CTLFLAG_RW, &maxindirdeps, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,"");
 SYSCTL_INT(_debug, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,"");
 SYSCTL_INT(_debug, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,"");
 SYSCTL_INT(_debug, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, "");
 SYSCTL_INT(_debug, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, "");
-SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &num_on_worklist, 0, "");
+/* SYSCTL_INT(_debug, OID_AUTO, worklist_num, CTLFLAG_RD, &softdep_on_worklist, 0, ""); */
 
 SYSCTL_DECL(_vfs_ffs);
 
 static int compute_summary_at_mount = 0;	/* Whether to recompute the summary at mount time */
 SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
 	   &compute_summary_at_mount, 0, "Recompute summary at mount");
 
+static struct proc *softdepproc;	/* passed to kthread_suspend_check */
+static struct kproc_desc softdep_kp = {	/* descriptor given to kproc_start */
+	"softdepflush",		/* presumably the process name -- confirm */
+	softdep_flush,		/* function the process runs */
+	&softdepproc		/* presumably receives the proc pointer */
+};
+SYSINIT(sdproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, &softdep_kp)
+
+static void
+softdep_flush(void)	/* endless loop run via the softdep_kp descriptor */
+{
+	struct mount *nmp;	/* successor of mp on the mount list */
+	struct mount *mp;
+	struct thread *td;
+	int remaining;		/* items still queued after this pass */
+
+	td = curthread;
+	td->td_pflags |= TDP_NORUNNINGBUF;
+
+	for (;;) {	/* one pass over the mount list per iteration */
+		kthread_suspend_check(softdepproc);
+		ACQUIRE_LOCK(&lk);
+		/*
+		 * If requested, try removing inode or removal dependencies.
+		 */
+		if (req_clear_inodedeps) {
+			clear_inodedeps(td);
+			req_clear_inodedeps -= 1;
+			wakeup_one(&proc_waiting);
+		}
+		if (req_clear_remove) {
+			clear_remove(td);
+			req_clear_remove -= 1;
+			wakeup_one(&proc_waiting);
+		}
+		FREE_LOCK(&lk);
+		remaining = 0;
+		mtx_lock(&mountlist_mtx);
+		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp)  {
+			nmp = TAILQ_NEXT(mp, mnt_list);
+			if ((mp->mnt_flag & MNT_SOFTDEP) == 0)
+				continue;	/* mount has no MNT_SOFTDEP flag */
+			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
+				continue;	/* could not busy it; skip for now */
+			softdep_process_worklist(mp, 0);	/* full == 0 */
+			remaining += VFSTOUFS(mp)->softdep_on_worklist;
+			mtx_lock(&mountlist_mtx);  /* presumably dropped by vfs_busy */
+			nmp = TAILQ_NEXT(mp, mnt_list);	/* refetch successor */
+			vfs_unbusy(mp, td);
+		}
+		mtx_unlock(&mountlist_mtx);
+		if (remaining)
+			continue;	/* work left: loop again without sleeping */
+		ACQUIRE_LOCK(&lk);
+		if (!req_pending)	/* no request: sleep, timeout argument hz */
+			msleep(&req_pending, &lk, PVM, "sdflush", hz);
+		req_pending = 0;	/* request (if any) is now being served */
+		FREE_LOCK(&lk);
+	}
+}
+
+static int
+softdep_speedup(void)	/* wake softdep_flush, then call speedup_syncer */
+{
+	ACQUIRE_LOCK(&lk);
+	if (req_pending == 0) {	/* skip if a request is already pending */
+		req_pending = 1;
+		wakeup(&req_pending);	/* channel softdep_flush sleeps on */
+	}
+	FREE_LOCK(&lk);
+
+	return speedup_syncer();	/* return value is speedup_syncer's */
+}
+
 /*
  * Add an item to the end of the work queue.
  * This routine requires that the lock be held.
  * This is the only routine that adds items to the list.
  * The following routine is the only one that removes items
  * and does so in order from first to last.
  */
 static void
 add_to_worklist(wk)
 	struct worklist *wk;
 {
+	struct ufsmount *ump;
 
 	mtx_assert(&lk, MA_OWNED);
+	ump = VFSTOUFS(wk->wk_mp);
 	if (wk->wk_state & ONWORKLIST)
 		panic("add_to_worklist: already on list");
 	wk->wk_state |= ONWORKLIST;
-	if (LIST_FIRST(&softdep_workitem_pending) == NULL)
-		LIST_INSERT_HEAD(&softdep_workitem_pending, wk, wk_list);
+	if (LIST_FIRST(&ump->softdep_workitem_pending) == NULL)
+		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
 	else
-		LIST_INSERT_AFTER(worklist_tail, wk, wk_list);
-	worklist_tail = wk;
-	num_on_worklist += 1;
+		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
+	ump->softdep_worklist_tail = wk;
+	ump->softdep_on_worklist += 1;
 }
 
 /*
  * Process that runs once per second to handle items in the background queue.
  *
  * Note that we ensure that everything is done in the order in which they
  * appear in the queue. The code below depends on this property to ensure
  * that blocks of a file are freed before the inode itself is freed. This
  * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
  * until all the old ones have been purged from the dependency lists.
  */
 int 
-softdep_process_worklist(matchmnt)
-	struct mount *matchmnt;
+softdep_process_worklist(mp, full)
+	struct mount *mp;
+	int full;
 {
 	struct thread *td = curthread;
 	int cnt, matchcnt, loopcount;
+	struct ufsmount *ump;
 	long starttime;
 
+	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
 	/*
 	 * Record the process identifier of our caller so that we can give
 	 * this process preferential treatment in request_cleanup below.
 	 */
-	filesys_syncer = td;
 	matchcnt = 0;
-
-	/*
-	 * There is no danger of having multiple processes run this
-	 * code, but we have to single-thread it when softdep_flushfiles()
-	 * is in operation to get an accurate count of the number of items
-	 * related to its mount point that are in the list.
-	 */
+	ump = VFSTOUFS(mp);
 	ACQUIRE_LOCK(&lk);
-	if (matchmnt == NULL) {
-		if (softdep_worklist_busy < 0) {
-			FREE_LOCK(&lk);
-			return(-1);
-		}
-		softdep_worklist_busy += 1;
-	}
-
-	/*
-	 * If requested, try removing inode or removal dependencies.
-	 */
-	if (req_clear_inodedeps) {
-		clear_inodedeps(td);
-		req_clear_inodedeps -= 1;
-		wakeup_one(&proc_waiting);
-	}
-	if (req_clear_remove) {
-		clear_remove(td);
-		req_clear_remove -= 1;
-		wakeup_one(&proc_waiting);
-	}
 	loopcount = 1;
 	starttime = time_second;
-	while (num_on_worklist > 0) {
-		if ((cnt = process_worklist_item(matchmnt, 0)) == -1)
+	while (ump->softdep_on_worklist > 0) {
+		if ((cnt = process_worklist_item(mp, 0)) == -1)
 			break;
 		else
 			matchcnt += cnt;
-
 		/*
-		 * If a umount operation wants to run the worklist
-		 * accurately, abort.
-		 */
-		if (softdep_worklist_req && matchmnt == NULL) {
-			matchcnt = -1;
-			break;
-		}
-
-		/*
 		 * If requested, try removing inode or removal dependencies.
 		 */
 		if (req_clear_inodedeps) {
 			clear_inodedeps(td);
 			req_clear_inodedeps -= 1;
 			wakeup_one(&proc_waiting);
 		}
 		if (req_clear_remove) {
 			clear_remove(td);
 			req_clear_remove -= 1;
 			wakeup_one(&proc_waiting);
 		}
 		/*
 		 * We do not generally want to stop for buffer space, but if
 		 * we are really being a buffer hog, we will stop and wait.
 		 */
 		if (loopcount++ % 128 == 0) {
 			FREE_LOCK(&lk);
 			bwillwrite();
 			ACQUIRE_LOCK(&lk);
 		}
 		/*
 		 * Never allow processing to run for more than one
-		 * second. Otherwise the other syncer tasks may get
+		 * second. Otherwise the other mountpoints may get
 		 * excessively backlogged.
 		 */
-		if (starttime != time_second && matchmnt == NULL) {
+		if (!full && starttime != time_second) {
 			matchcnt = -1;
 			break;
 		}
 	}
-	if (matchmnt == NULL) {
-		softdep_worklist_busy -= 1;
-		if (softdep_worklist_req && softdep_worklist_busy == 0)
-			wakeup(&softdep_worklist_req);
-	}
 	FREE_LOCK(&lk);
 	return (matchcnt);
 }
 
 /*
  * Process one item on the worklist.
  */
 static int
-process_worklist_item(matchmnt, flags)
-	struct mount *matchmnt;
+process_worklist_item(mp, flags)
+	struct mount *mp;
 	int flags;
 {
 	struct worklist *wk, *wkend;
-	struct mount *mp;
+	struct ufsmount *ump;
 	struct vnode *vp;
 	int matchcnt = 0;
 
 	mtx_assert(&lk, MA_OWNED);
+	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
 	/*
 	 * If we are being called because of a process doing a
 	 * copy-on-write, then it is not safe to write as we may
 	 * recurse into the copy-on-write routine.
 	 */
 	if (curthread->td_pflags & TDP_COWINPROGRESS)
 		return (-1);
 	/*
 	 * Normally we just process each item on the worklist in order.
 	 * However, if we are in a situation where we cannot lock any
 	 * inodes, we have to skip over any dirrem requests whose
 	 * vnodes are resident and locked.
 	 */
+	ump = VFSTOUFS(mp);
 	vp = NULL;
-	LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) {
+	LIST_FOREACH(wk, &ump->softdep_workitem_pending, wk_list) {
 		if (wk->wk_state & INPROGRESS)
 			continue;
 		if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM)
 			break;
 		wk->wk_state |= INPROGRESS;
 		FREE_LOCK(&lk);
-		ffs_vget(WK_DIRREM(wk)->dm_mnt, WK_DIRREM(wk)->dm_oldinum,
+		ffs_vget(mp, WK_DIRREM(wk)->dm_oldinum,
 		    LK_NOWAIT | LK_EXCLUSIVE, &vp);
 		ACQUIRE_LOCK(&lk);
 		wk->wk_state &= ~INPROGRESS;
 		if (vp != NULL)
 			break;
 	}
 	if (wk == 0)
 		return (-1);
 	/*
 	 * Remove the item to be processed. If we are removing the last
 	 * item on the list, we need to recalculate the tail pointer.
 	 * As this happens rarely and usually when the list is short,
 	 * we just run down the list to find it rather than tracking it
 	 * in the above loop.
 	 */
 	WORKLIST_REMOVE(wk);
-	if (wk == worklist_tail) {
-		LIST_FOREACH(wkend, &softdep_workitem_pending, wk_list)
+	if (wk == ump->softdep_worklist_tail) {
+		LIST_FOREACH(wkend, &ump->softdep_workitem_pending, wk_list)
 			if (LIST_NEXT(wkend, wk_list) == NULL)
 				break;
-		worklist_tail = wkend;
+		ump->softdep_worklist_tail = wkend;
 	}
-	num_on_worklist -= 1;
+	ump->softdep_on_worklist -= 1;
 	FREE_LOCK(&lk);
+	if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
+		panic("process_worklist_item: suspended filesystem");
+	matchcnt++;
 	switch (wk->wk_type) {
 
 	case D_DIRREM:
 		/* removal of a directory entry */
-		mp = WK_DIRREM(wk)->dm_mnt;
-		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
-			panic("%s: dirrem on suspended filesystem",
-				"process_worklist_item");
-		if (mp == matchmnt)
-			matchcnt += 1;
 		handle_workitem_remove(WK_DIRREM(wk), vp);
 		break;
 
 	case D_FREEBLKS:
 		/* releasing blocks and/or fragments from a file */
-		mp = WK_FREEBLKS(wk)->fb_mnt;
-		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
-			panic("%s: freeblks on suspended filesystem",
-				"process_worklist_item");
-		if (mp == matchmnt)
-			matchcnt += 1;
 		handle_workitem_freeblocks(WK_FREEBLKS(wk), flags & LK_NOWAIT);
 		break;
 
 	case D_FREEFRAG:
 		/* releasing a fragment when replaced as a file grows */
-		mp = WK_FREEFRAG(wk)->ff_mnt;
-		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
-			panic("%s: freefrag on suspended filesystem",
-				"process_worklist_item");
-		if (mp == matchmnt)
-			matchcnt += 1;
 		handle_workitem_freefrag(WK_FREEFRAG(wk));
 		break;
 
 	case D_FREEFILE:
 		/* releasing an inode when its link count drops to 0 */
-		mp = WK_FREEFILE(wk)->fx_mnt;
-		if (vn_write_suspend_wait(NULL, mp, V_NOWAIT))
-			panic("%s: freefile on suspended filesystem",
-				"process_worklist_item");
-		if (mp == matchmnt)
-			matchcnt += 1;
 		handle_workitem_freefile(WK_FREEFILE(wk));
 		break;
 
 	default:
 		panic("%s_process_worklist: Unknown type %s",
 		    "softdep", TYPENAME(wk->wk_type));
 		/* NOTREACHED */
 	}
 	ACQUIRE_LOCK(&lk);
 	return (matchcnt);
 }
 
 /*
  * Move dependencies from one buffer to another.
  */
 void
 softdep_move_dependencies(oldbp, newbp)
 	struct buf *oldbp;
 	struct buf *newbp;
 {
 	struct worklist *wk, *wktail;
 
 	if (LIST_FIRST(&newbp->b_dep) != NULL)
 		panic("softdep_move_dependencies: need merge code");
 	wktail = 0;
 	ACQUIRE_LOCK(&lk);
 	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
 		LIST_REMOVE(wk, wk_list);
 		if (wktail == 0)
 			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
 		else
 			LIST_INSERT_AFTER(wktail, wk, wk_list);
 		wktail = wk;
 	}
 	FREE_LOCK(&lk);
 }
 
 /*
  * Purge the work list of all items associated with a particular mount point.
  */
 int
 softdep_flushworklist(oldmnt, countp, td)
 	struct mount *oldmnt;
 	int *countp;
 	struct thread *td;
 {
 	struct vnode *devvp;
 	int count, error = 0;
+	struct ufsmount *ump;
 
 	/*
-	 * Await our turn to clear out the queue, then serialize access.
-	 */
-	ACQUIRE_LOCK(&lk);
-	while (softdep_worklist_busy) {
-		softdep_worklist_req += 1;
-		msleep(&softdep_worklist_req, &lk, PRIBIO, "softflush", 0);
-		softdep_worklist_req -= 1;
-	}
-	softdep_worklist_busy = -1;
-	FREE_LOCK(&lk);
-	/*
 	 * Alternately flush the block device associated with the mount
 	 * point and process any dependencies that the flushing
 	 * creates. We continue until no more worklist dependencies
 	 * are found.
 	 */
 	*countp = 0;
-	devvp = VFSTOUFS(oldmnt)->um_devvp;
-	while ((count = softdep_process_worklist(oldmnt)) > 0) {
+	ump = VFSTOUFS(oldmnt);
+	devvp = ump->um_devvp;
+	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
 		*countp += count;
 		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
 		error = VOP_FSYNC(devvp, MNT_WAIT, td);
 		VOP_UNLOCK(devvp, 0, td);
 		if (error)
 			break;
 	}
+	return (error);
+}
+
+int
+softdep_waitidle(struct mount *mp)	/* wait for mp's softdep_deps to drain */
+{
+	struct ufsmount *ump;
+	int error;		/* 0, or EBUSY if all attempts were used */
+	int i;			/* number of sleeps taken so far */
+
+	ump = VFSTOUFS(mp);
 	ACQUIRE_LOCK(&lk);
-	softdep_worklist_busy = 0;
-	if (softdep_worklist_req)
-		wakeup(&softdep_worklist_req);
+	for (i = 0; i < 10 && ump->softdep_deps; i++) {	/* at most 10 sleeps */
+		ump->softdep_req = 1;	/* ask workitem_free for a wakeup */
+		if (ump->softdep_on_worklist)
+			panic("softdep_waitidle: work added after flush.");
+		msleep(&ump->softdep_deps, &lk, PVM, "softdeps", 1);
+	}
+	ump->softdep_req = 0;	/* no longer waiting */
 	FREE_LOCK(&lk);
+	error = 0;
+	if (i == 10) {	/* NOTE(review): also true if deps drained on last try */
+		error = EBUSY;
+		printf("softdep_waitidle: Failed to flush worklist for %p\n",
+		    mp);
+	}
+
 	return (error);
 }
 
 /*
  * Flush all vnodes and worklist items associated with a specified mount point.
  */
 int
 softdep_flushfiles(oldmnt, flags, td)
 	struct mount *oldmnt;
 	int flags;
 	struct thread *td;
 {
 	int error, count, loopcnt;
 
 	error = 0;
 
 	/*
 	 * Alternately flush the vnodes associated with the mount
 	 * point and process any dependencies that the flushing
 	 * creates. In theory, this loop can happen at most twice,
 	 * but we give it a few extra just to be sure.
 	 */
 	for (loopcnt = 10; loopcnt > 0; loopcnt--) {
 		/*
 		 * Do another flush in case any vnodes were brought in
 		 * as part of the cleanup operations.
 		 */
 		if ((error = ffs_flushfiles(oldmnt, flags, td)) != 0)
 			break;
 		if ((error = softdep_flushworklist(oldmnt, &count, td)) != 0 ||
 		    count == 0)
 			break;
 	}
 	/*
 	 * If we are unmounting then it is an error to fail. If we
 	 * are simply trying to downgrade to read-only, then filesystem
 	 * activity can keep us busy forever, so we just fail with EBUSY.
 	 */
 	if (loopcnt == 0) {
 		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
 			panic("softdep_flushfiles: looping");
 		error = EBUSY;
 	}
+	if (!error)
+		error = softdep_waitidle(oldmnt);
 	return (error);
 }
 
 /*
  * Structure hashing.
  * 
  * There are three types of structures that can be looked up:
  *	1) pagedep structures identified by mount point, inode number,
  *	   and logical block.
  *	2) inodedep structures identified by mount point and inode number.
  *	3) newblk structures identified by mount point and
  *	   physical block number.
  *
  * The "pagedep" and "inodedep" dependency structures are hashed
  * separately from the file blocks and inodes to which they correspond.
  * This separation helps when the in-memory copy of an inode or
  * file block must be replaced. It also obviates the need to access
  * an inode or file page when simply updating (or de-allocating)
  * dependency structures. Lookup of newblk structures is needed to
  * find newly allocated blocks when trying to associate them with
  * their allocdirect or allocindir structure.
  *
  * The lookup routines optionally create and hash a new instance when
  * an existing entry is not found.
  */
 #define DEPALLOC	0x0001	/* allocate structure if lookup fails */
 #define NODELAY		0x0002	/* cannot do background work */
 
 /*
  * Structures and routines associated with pagedep caching.
  */
 LIST_HEAD(pagedep_hashhead, pagedep) *pagedep_hashtbl;
 u_long	pagedep_hash;		/* size of hash table - 1 */
 #define	PAGEDEP_HASH(mp, inum, lbn) \
 	(&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \
 	    pagedep_hash])
 
 static int
 pagedep_find(pagedephd, ino, lbn, mp, flags, pagedeppp)
 	struct pagedep_hashhead *pagedephd;
 	ino_t ino;
 	ufs_lbn_t lbn;
 	struct mount *mp;
 	int flags;
 	struct pagedep **pagedeppp;
 {
 	struct pagedep *pagedep;
 
 	LIST_FOREACH(pagedep, pagedephd, pd_hash)
 		if (ino == pagedep->pd_ino &&
 		    lbn == pagedep->pd_lbn &&
-		    mp == pagedep->pd_mnt)
+		    mp == pagedep->pd_list.wk_mp)
 			break;
 	if (pagedep) {
 		*pagedeppp = pagedep;
 		if ((flags & DEPALLOC) != 0 &&
 		    (pagedep->pd_state & ONWORKLIST) == 0)
 			return (0);
 		return (1);
 	}
 	*pagedeppp = NULL;
 	return (0);
 }
 /*
  * Look up a pagedep. Return 1 if found, 0 if not found or found
  * when asked to allocate but not associated with any buffer.
  * If not found, allocate if DEPALLOC flag is passed.
  * Found or allocated entry is returned in pagedeppp.
  * This routine must be called with splbio interrupts blocked.
  */
 static int
 pagedep_lookup(ip, lbn, flags, pagedeppp)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	int flags;
 	struct pagedep **pagedeppp;
 {
 	struct pagedep *pagedep;
 	struct pagedep_hashhead *pagedephd;
 	struct mount *mp;
 	int ret;
 	int i;
 
 	mtx_assert(&lk, MA_OWNED);
 	mp = ITOV(ip)->v_mount;
 	pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn);
 
 	ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
 	if (*pagedeppp || (flags & DEPALLOC) == 0)
 		return (ret);
 	FREE_LOCK(&lk);
 	MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep),
 	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
+	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
 	ACQUIRE_LOCK(&lk);
 	ret = pagedep_find(pagedephd, ip->i_number, lbn, mp, flags, pagedeppp);
 	if (*pagedeppp) {
-		FREE(pagedep, M_PAGEDEP);
+		WORKITEM_FREE(pagedep, D_PAGEDEP);
 		return (ret);
 	}
-	pagedep->pd_list.wk_type = D_PAGEDEP;
-	pagedep->pd_mnt = mp;
 	pagedep->pd_ino = ip->i_number;
 	pagedep->pd_lbn = lbn;
 	LIST_INIT(&pagedep->pd_dirremhd);
 	LIST_INIT(&pagedep->pd_pendinghd);
 	for (i = 0; i < DAHASHSZ; i++)
 		LIST_INIT(&pagedep->pd_diraddhd[i]);
 	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
 	*pagedeppp = pagedep;
 	return (0);
 }
 
 /*
  * Structures and routines associated with inodedep caching.
  */
 LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
 static u_long	inodedep_hash;	/* size of hash table - 1 */
 static long	num_inodedep;	/* number of inodedep allocated */
 #define	INODEDEP_HASH(fs, inum) \
       (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
 
 static int
 inodedep_find(inodedephd, fs, inum, inodedeppp)
 	struct inodedep_hashhead *inodedephd;
 	struct fs *fs;
 	ino_t inum;
 	struct inodedep **inodedeppp;
 {
 	struct inodedep *inodedep;
 
 	LIST_FOREACH(inodedep, inodedephd, id_hash)
 		if (inum == inodedep->id_ino && fs == inodedep->id_fs)
 			break;
 	if (inodedep) {
 		*inodedeppp = inodedep;
 		return (1);
 	}
 	*inodedeppp = NULL;
 
 	return (0);
 }
 /*
  * Look up an inodedep. Return 1 if found, 0 if not found.
  * If not found, allocate if DEPALLOC flag is passed.
  * Found or allocated entry is returned in inodedeppp.
  * This routine must be called with splbio interrupts blocked.
  */
 static int
-inodedep_lookup(fs, inum, flags, inodedeppp)
-	struct fs *fs;
+inodedep_lookup(mp, inum, flags, inodedeppp)
+	struct mount *mp;
 	ino_t inum;
 	int flags;
 	struct inodedep **inodedeppp;
 {
 	struct inodedep *inodedep;
 	struct inodedep_hashhead *inodedephd;
+	struct fs *fs;
 
 	mtx_assert(&lk, MA_OWNED);
+	fs = VFSTOUFS(mp)->um_fs;
 	inodedephd = INODEDEP_HASH(fs, inum);
 
 	if (inodedep_find(inodedephd, fs, inum, inodedeppp))
 		return (1);
 	if ((flags & DEPALLOC) == 0)
 		return (0);
 	/*
 	 * If we are over our limit, try to improve the situation.
 	 */
-	if (num_inodedep > max_softdeps  && (flags & NODELAY) == 0)
-		request_cleanup(FLUSH_INODES);
+	if (num_inodedep > max_softdeps && (flags & NODELAY) == 0)
+		request_cleanup(mp, FLUSH_INODES);
 	FREE_LOCK(&lk);
 	MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
 		M_INODEDEP, M_SOFTDEP_FLAGS);
+	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
 	ACQUIRE_LOCK(&lk);
 	if (inodedep_find(inodedephd, fs, inum, inodedeppp)) {
-		FREE(inodedep, M_INODEDEP);
+		WORKITEM_FREE(inodedep, D_INODEDEP);
 		return (1);
 	}
 	num_inodedep += 1;
-	inodedep->id_list.wk_type = D_INODEDEP;
 	inodedep->id_fs = fs;
 	inodedep->id_ino = inum;
 	inodedep->id_state = ALLCOMPLETE;
 	inodedep->id_nlinkdelta = 0;
 	inodedep->id_savedino1 = NULL;
 	inodedep->id_savedsize = -1;
 	inodedep->id_savedextsize = -1;
 	inodedep->id_buf = NULL;
 	LIST_INIT(&inodedep->id_pendinghd);
 	LIST_INIT(&inodedep->id_inowait);
 	LIST_INIT(&inodedep->id_bufwait);
 	TAILQ_INIT(&inodedep->id_inoupdt);
 	TAILQ_INIT(&inodedep->id_newinoupdt);
 	TAILQ_INIT(&inodedep->id_extupdt);
 	TAILQ_INIT(&inodedep->id_newextupdt);
 	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
 	*inodedeppp = inodedep;
 	return (0);
 }
 
 /*
  * Structures and routines associated with newblk caching.
  */
 LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl;
 u_long	newblk_hash;		/* size of hash table - 1 */
 #define	NEWBLK_HASH(fs, inum) \
 	(&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash])
 
 static int
 newblk_find(newblkhd, fs, newblkno, newblkpp)
 	struct newblk_hashhead *newblkhd;
 	struct fs *fs;
 	ufs2_daddr_t newblkno;
 	struct newblk **newblkpp;
 {
 	struct newblk *newblk;
 
 	LIST_FOREACH(newblk, newblkhd, nb_hash)
 		if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs)
 			break;
 	if (newblk) {
 		*newblkpp = newblk;
 		return (1);
 	}
 	*newblkpp = NULL;
 	return (0);
 }
 
 /*
  * Look up a newblk. Return 1 if found, 0 if not found.
  * If not found, allocate if DEPALLOC flag is passed.
  * Found or allocated entry is returned in newblkpp.
  */
 static int
 newblk_lookup(fs, newblkno, flags, newblkpp)
 	struct fs *fs;
 	ufs2_daddr_t newblkno;
 	int flags;
 	struct newblk **newblkpp;
 {
 	struct newblk *newblk;
 	struct newblk_hashhead *newblkhd;
 
 	newblkhd = NEWBLK_HASH(fs, newblkno);
 	if (newblk_find(newblkhd, fs, newblkno, newblkpp))
 		return (1);
 	if ((flags & DEPALLOC) == 0)
 		return (0);
 	FREE_LOCK(&lk);
 	MALLOC(newblk, struct newblk *, sizeof(struct newblk),
 		M_NEWBLK, M_SOFTDEP_FLAGS);
 	ACQUIRE_LOCK(&lk);
 	if (newblk_find(newblkhd, fs, newblkno, newblkpp)) {
 		FREE(newblk, M_NEWBLK);
 		return (1);
 	}
 	newblk->nb_state = 0;
 	newblk->nb_fs = fs;
 	newblk->nb_newblkno = newblkno;
 	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
 	*newblkpp = newblk;
 	return (0);
 }
 
 /*
  * Executed during filesystem system initialization before
  * mounting any filesystems.
  */
 void 
 softdep_initialize()
 {
 
 	LIST_INIT(&mkdirlisthd);
-	LIST_INIT(&softdep_workitem_pending);
 	max_softdeps = desiredvnodes * 4;
 	pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
 	    &pagedep_hash);
 	inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
 	newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
 
-	/* hooks through which the main kernel code calls us */
-	softdep_process_worklist_hook = softdep_process_worklist;
-
 	/* initialise bioops hack */
 	bioops.io_start = softdep_disk_io_initiation;
 	bioops.io_complete = softdep_disk_write_complete;
 	bioops.io_deallocate = softdep_deallocate_dependencies;
 	bioops.io_countdeps = softdep_count_dependencies;
 }
 
 /*
  * Executed after all filesystems have been unmounted during
  * filesystem module unload.
  */
 void
 softdep_uninitialize()
 {
 
-	softdep_process_worklist_hook = NULL;
 	hashdestroy(pagedep_hashtbl, M_PAGEDEP, pagedep_hash);
 	hashdestroy(inodedep_hashtbl, M_INODEDEP, inodedep_hash);
 	hashdestroy(newblk_hashtbl, M_NEWBLK, newblk_hash);
 }
 
 /*
  * Called at mount time to notify the dependency code that a
  * filesystem wishes to use it.
  */
 int
 softdep_mount(devvp, mp, fs, cred)
 	struct vnode *devvp;
 	struct mount *mp;
 	struct fs *fs;
 	struct ucred *cred;
 {
 	struct csum_total cstotal;
+	struct ufsmount *ump;
 	struct cg *cgp;
 	struct buf *bp;
 	int error, cyl;
 
 	mp->mnt_flag &= ~MNT_ASYNC;
 	mp->mnt_flag |= MNT_SOFTDEP;
+	ump = VFSTOUFS(mp);
+	LIST_INIT(&ump->softdep_workitem_pending);
+	ump->softdep_worklist_tail = NULL;
+	ump->softdep_on_worklist = 0;
+	ump->softdep_deps = 0;
 	/*
 	 * When doing soft updates, the counters in the
 	 * superblock may have gotten out of sync. Recomputation
 	 * can take a long time and can be deferred for background
 	 * fsck.  However, the old behavior of scanning the cylinder
 	 * groups and recalculating them at mount time is available
 	 * by setting vfs.ffs.compute_summary_at_mount to one.
 	 */
 	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
 		return (0);
 	bzero(&cstotal, sizeof cstotal);
 	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
 		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
 		    fs->fs_cgsize, cred, &bp)) != 0) {
 			brelse(bp);
 			return (error);
 		}
 		cgp = (struct cg *)bp->b_data;
 		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
 		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
 		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
 		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
 		fs->fs_cs(fs, cyl) = cgp->cg_cs;
 		brelse(bp);
 	}
 #ifdef DEBUG
 	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
 		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
 #endif
 	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
 	return (0);
 }
 
 /*
  * Protecting the freemaps (or bitmaps).
  * 
  * To eliminate the need to execute fsck before mounting a filesystem
  * after a power failure, one must (conservatively) guarantee that the
  * on-disk copy of the bitmaps never indicate that a live inode or block is
  * free.  So, when a block or inode is allocated, the bitmap should be
  * updated (on disk) before any new pointers.  When a block or inode is
  * freed, the bitmap should not be updated until all pointers have been
  * reset.  The latter dependency is handled by the delayed de-allocation
  * approach described below for block and inode de-allocation.  The former
  * dependency is handled by calling the following procedure when a block or
  * inode is allocated. When an inode is allocated an "inodedep" is created
  * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
  * Each "inodedep" is also inserted into the hash indexing structure so
  * that any additional link additions can be made dependent on the inode
  * allocation.
  * 
  * The ufs filesystem maintains a number of free block counts (e.g., per
  * cylinder group, per cylinder and per <cylinder, rotational position> pair)
  * in addition to the bitmaps.  These counts are used to improve efficiency
  * during allocation and therefore must be consistent with the bitmaps.
  * There is no convenient way to guarantee post-crash consistency of these
  * counts with simple update ordering, for two main reasons: (1) The counts
  * and bitmaps for a single cylinder group block are not in the same disk
  * sector.  If a disk write is interrupted (e.g., by power failure), one may
  * be written and the other not.  (2) Some of the counts are located in the
  * superblock rather than the cylinder group block. So, we focus our soft
  * updates implementation on protecting the bitmaps. When mounting a
  * filesystem, we recompute the auxiliary counts from the bitmaps.
  */
 
 /*
  * Called just after updating the cylinder group block to allocate an inode.
  */
 void
 softdep_setup_inomapdep(bp, ip, newinum)
 	struct buf *bp;		/* buffer for cylgroup block with inode map */
 	struct inode *ip;	/* inode related to allocation */
 	ino_t newinum;		/* new inode number being allocated */
 {
 	struct inodedep *inodedep;
 	struct bmsafemap *bmsafemap;
 
 	/*
 	 * Create a dependency for the newly allocated inode.
 	 * Panic if it already exists as something is seriously wrong.
 	 * Otherwise add it to the dependency list for the buffer holding
 	 * the cylinder group map from which it was allocated.
 	 */
 	ACQUIRE_LOCK(&lk);
-	if ((inodedep_lookup(ip->i_fs, newinum, DEPALLOC|NODELAY, &inodedep)))
+	if ((inodedep_lookup(UFSTOVFS(ip->i_ump), newinum, DEPALLOC|NODELAY,
+	    &inodedep)))
 		panic("softdep_setup_inomapdep: found inode");
 	inodedep->id_buf = bp;
 	inodedep->id_state &= ~DEPCOMPLETE;
-	bmsafemap = bmsafemap_lookup(bp);
+	bmsafemap = bmsafemap_lookup(inodedep->id_list.wk_mp, bp);
 	LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Called just after updating the cylinder group block to
  * allocate block or fragment.
  */
 void
-softdep_setup_blkmapdep(bp, fs, newblkno)
+softdep_setup_blkmapdep(bp, mp, newblkno)
 	struct buf *bp;		/* buffer for cylgroup block with block map */
-	struct fs *fs;		/* filesystem doing allocation */
+	struct mount *mp;	/* filesystem doing allocation */
 	ufs2_daddr_t newblkno;	/* number of newly allocated block */
 {
 	struct newblk *newblk;
 	struct bmsafemap *bmsafemap;
+	struct fs *fs;		/* um_fs of mp, used for newblk_lookup */
 
+	fs = VFSTOUFS(mp)->um_fs;
 	/*
 	 * Create a dependency for the newly allocated block.
 	 * Add it to the dependency list for the buffer holding
 	 * the cylinder group map from which it was allocated.
 	 */
 	ACQUIRE_LOCK(&lk);
 	if (newblk_lookup(fs, newblkno, DEPALLOC, &newblk) != 0)
 		panic("softdep_setup_blkmapdep: found block");
-	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(bp);
+	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp);
 	LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Find the bmsafemap associated with a cylinder group buffer.
  * If none exists, create one. The buffer must be locked when
  * this routine is called and this routine must be called with
  * splbio interrupts blocked.
  */
 static struct bmsafemap *
-bmsafemap_lookup(bp)
+bmsafemap_lookup(mp, bp)
+	struct mount *mp;	/* mount passed to workitem_alloc() */
 	struct buf *bp;
 {
 	struct bmsafemap *bmsafemap;
 	struct worklist *wk;
 
 	mtx_assert(&lk, MA_OWNED);
 	LIST_FOREACH(wk, &bp->b_dep, wk_list)
 		if (wk->wk_type == D_BMSAFEMAP)
 			return (WK_BMSAFEMAP(wk));
 	FREE_LOCK(&lk);
 	MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap),
 		M_BMSAFEMAP, M_SOFTDEP_FLAGS);
-	bmsafemap->sm_list.wk_type = D_BMSAFEMAP;
-	bmsafemap->sm_list.wk_state = 0;
+	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
 	bmsafemap->sm_buf = bp;
 	LIST_INIT(&bmsafemap->sm_allocdirecthd);
 	LIST_INIT(&bmsafemap->sm_allocindirhd);
 	LIST_INIT(&bmsafemap->sm_inodedephd);
 	LIST_INIT(&bmsafemap->sm_newblkhd);
 	ACQUIRE_LOCK(&lk);
 	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
 	return (bmsafemap);
 }
 
 /*
  * Direct block allocation dependencies.
  * 
  * When a new block is allocated, the corresponding disk locations must be
  * initialized (with zeros or new data) before the on-disk inode points to
  * them.  Also, the freemap from which the block was allocated must be
  * updated (on disk) before the inode's pointer. These two dependencies are
  * independent of each other and are needed for all file blocks and indirect
  * blocks that are pointed to directly by the inode.  Just before the
  * "in-core" version of the inode is updated with a newly allocated block
  * number, a procedure (below) is called to setup allocation dependency
  * structures.  These structures are removed when the corresponding
  * dependencies are satisfied or when the block allocation becomes obsolete
  * (i.e., the file is deleted, the block is de-allocated, or the block is a
  * fragment that gets upgraded).  All of these cases are handled in
  * procedures described later.
  * 
  * When a file extension causes a fragment to be upgraded, either to a larger
  * fragment or to a full block, the on-disk location may change (if the
  * previous fragment could not simply be extended). In this case, the old
  * fragment must be de-allocated, but not until after the inode's pointer has
  * been updated. In most cases, this is handled by later procedures, which
  * will construct a "freefrag" structure to be added to the workitem queue
  * when the inode update is complete (or obsolete).  The main exception to
  * this is when an allocation occurs while a pending allocation dependency
  * (for the same block pointer) remains.  This case is handled in the main
  * allocation dependency setup procedure by immediately freeing the
  * unreferenced fragments.
  */ 
 void 
 softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 	struct inode *ip;	/* inode to which block is being added */
 	ufs_lbn_t lbn;		/* block pointer within inode */
 	ufs2_daddr_t newblkno;	/* disk block number being added */
 	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
 	long newsize;		/* size of new block */
 	long oldsize;		/* size of new block */
 	struct buf *bp;		/* bp for allocated block */
 {
 	struct allocdirect *adp, *oldadp;
 	struct allocdirectlst *adphead;
 	struct bmsafemap *bmsafemap;
 	struct inodedep *inodedep;
 	struct pagedep *pagedep;
 	struct newblk *newblk;
+	struct mount *mp;	/* mount derived from ip->i_ump */
 
+	mp = UFSTOVFS(ip->i_ump);
 	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
 		M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
-	adp->ad_list.wk_type = D_ALLOCDIRECT;
+	workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
 	adp->ad_lbn = lbn;
 	adp->ad_newblkno = newblkno;
 	adp->ad_oldblkno = oldblkno;
 	adp->ad_newsize = newsize;
 	adp->ad_oldsize = oldsize;
 	adp->ad_state = ATTACHED;
 	LIST_INIT(&adp->ad_newdirblk);
 	if (newblkno == oldblkno)
 		adp->ad_freefrag = NULL;
 	else
 		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
 
 	ACQUIRE_LOCK(&lk);
 	if (lbn >= NDADDR) {
 		/* allocating an indirect block */
 		if (oldblkno != 0)
 			panic("softdep_setup_allocdirect: non-zero indir");
 	} else {
 		/*
 		 * Allocating a direct block.
 		 *
 		 * If we are allocating a directory block, then we must
 		 * allocate an associated pagedep to track additions and
 		 * deletions.
 		 */
 		if ((ip->i_mode & IFMT) == IFDIR &&
 		    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
 			WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 	}
 	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
 		panic("softdep_setup_allocdirect: lost block");
 	if (newblk->nb_state == DEPCOMPLETE) {
 		adp->ad_state |= DEPCOMPLETE;
 		adp->ad_buf = NULL;
 	} else {
 		bmsafemap = newblk->nb_bmsafemap;
 		adp->ad_buf = bmsafemap->sm_buf;
 		LIST_REMOVE(newblk, nb_deps);
 		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
 	}
 	LIST_REMOVE(newblk, nb_hash);
 	FREE(newblk, M_NEWBLK);
 
-	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
+	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
 	adp->ad_inodedep = inodedep;
 	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 	/*
 	 * The list of allocdirects must be kept in sorted and ascending
 	 * order so that the rollback routines can quickly determine the
 	 * first uncommitted block (the size of the file stored on disk
 	 * ends at the end of the lowest committed fragment, or if there
 	 * are no fragments, at the end of the highest committed block).
 	 * Since files generally grow, the typical case is that the new
 	 * block is to be added at the end of the list. We speed this
 	 * special case by checking against the last allocdirect in the
 	 * list before laboriously traversing the list looking for the
 	 * insertion point.
 	 */
 	adphead = &inodedep->id_newinoupdt;
 	oldadp = TAILQ_LAST(adphead, allocdirectlst);
 	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
 		/* insert at end of list */
 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 		if (oldadp != NULL && oldadp->ad_lbn == lbn)
 			allocdirect_merge(adphead, adp, oldadp);
 		FREE_LOCK(&lk);
 		return;
 	}
 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
 		if (oldadp->ad_lbn >= lbn)
 			break;
 	}
 	if (oldadp == NULL)
 		panic("softdep_setup_allocdirect: lost entry");
 	/* insert in middle of list */
 	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 	if (oldadp->ad_lbn == lbn)
 		allocdirect_merge(adphead, adp, oldadp);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Replace an old allocdirect dependency with a newer one.
  * This routine must be called with splbio interrupts blocked.
  */
 static void
 allocdirect_merge(adphead, newadp, oldadp)
 	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
 	struct allocdirect *newadp;	/* allocdirect being added */
 	struct allocdirect *oldadp;	/* existing allocdirect being checked */
 {
 	struct worklist *wk;
 	struct freefrag *freefrag;
 	struct newdirblk *newdirblk;
 
 	mtx_assert(&lk, MA_OWNED);
 	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
 	    newadp->ad_oldsize != oldadp->ad_newsize ||
 	    newadp->ad_lbn >= NDADDR)
 		panic("%s %jd != new %jd || old size %ld != new %ld",
 		    "allocdirect_merge: old blkno",
 		    (intmax_t)newadp->ad_oldblkno,
 		    (intmax_t)oldadp->ad_newblkno,
 		    newadp->ad_oldsize, oldadp->ad_newsize);
 	newadp->ad_oldblkno = oldadp->ad_oldblkno;
 	newadp->ad_oldsize = oldadp->ad_oldsize;
 	/*
 	 * If the old dependency had a fragment to free or had never
 	 * previously had a block allocated, then the new dependency
 	 * can immediately post its freefrag and adopt the old freefrag.
 	 * This action is done by swapping the freefrag dependencies.
 	 * The new dependency gains the old one's freefrag, and the
 	 * old one gets the new one and then immediately puts it on
 	 * the worklist when it is freed by free_allocdirect. It is
 	 * not possible to do this swap when the old dependency had a
 	 * non-zero size but no previous fragment to free. This condition
 	 * arises when the new block is an extension of the old block.
 	 * Here, the first part of the fragment allocated to the new
 	 * dependency is part of the block currently claimed on disk by
 	 * the old dependency, so cannot legitimately be freed until the
 	 * conditions for the new dependency are fulfilled.
 	 */
 	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
 		freefrag = newadp->ad_freefrag;
 		newadp->ad_freefrag = oldadp->ad_freefrag;
 		oldadp->ad_freefrag = freefrag;
 	}
 	/*
 	 * If we are tracking a new directory-block allocation,
 	 * move it from the old allocdirect to the new allocdirect.
 	 */
 	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
 		newdirblk = WK_NEWDIRBLK(wk);
 		WORKLIST_REMOVE(&newdirblk->db_list);
 		if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL)
 			panic("allocdirect_merge: extra newdirblk");
 		WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list);
 	}
 	free_allocdirect(adphead, oldadp, 0);
 }
 		
 /*
  * Allocate a new freefrag structure if needed.
  */
 static struct freefrag *
 newfreefrag(ip, blkno, size)
 	struct inode *ip;
 	ufs2_daddr_t blkno;
 	long size;
 {
 	struct freefrag *freefrag;
 	struct fs *fs;
 
 	if (blkno == 0)
 		return (NULL);
 	fs = ip->i_fs;
 	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
 		panic("newfreefrag: frag size");
 	MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag),
 		M_FREEFRAG, M_SOFTDEP_FLAGS);
-	freefrag->ff_list.wk_type = D_FREEFRAG;
-	freefrag->ff_state = 0;
+	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
 	freefrag->ff_inum = ip->i_number;
-	freefrag->ff_mnt = ITOV(ip)->v_mount;
 	freefrag->ff_blkno = blkno;
 	freefrag->ff_fragsize = size;
 	return (freefrag);
 }
 
 /*
  * This workitem de-allocates fragments that were replaced during
  * file block allocation.
  */
 static void 
 handle_workitem_freefrag(freefrag)
 	struct freefrag *freefrag;
 {
-	struct ufsmount *ump = VFSTOUFS(freefrag->ff_mnt);
+	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
 
 	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
 	    freefrag->ff_fragsize, freefrag->ff_inum);
-	FREE(freefrag, M_FREEFRAG);
+	ACQUIRE_LOCK(&lk);	/* WORKITEM_FREE runs with lk held */
+	WORKITEM_FREE(freefrag, D_FREEFRAG);
+	FREE_LOCK(&lk);
 }
 
 /*
  * Set up a dependency structure for an external attributes data block.
  * This routine follows much of the structure of softdep_setup_allocdirect.
  * See the description of softdep_setup_allocdirect above for details.
  */
 void 
 softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
 	struct inode *ip;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t newblkno;
 	ufs2_daddr_t oldblkno;
 	long newsize;
 	long oldsize;
 	struct buf *bp;
 {
 	struct allocdirect *adp, *oldadp;
 	struct allocdirectlst *adphead;
 	struct bmsafemap *bmsafemap;
 	struct inodedep *inodedep;
 	struct newblk *newblk;
+	struct mount *mp;	/* mount derived from ip->i_ump */
 
+	mp = UFSTOVFS(ip->i_ump);
 	MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect),
 		M_ALLOCDIRECT, M_SOFTDEP_FLAGS|M_ZERO);
-	adp->ad_list.wk_type = D_ALLOCDIRECT;
+	workitem_alloc(&adp->ad_list, D_ALLOCDIRECT, mp);
 	adp->ad_lbn = lbn;
 	adp->ad_newblkno = newblkno;
 	adp->ad_oldblkno = oldblkno;
 	adp->ad_newsize = newsize;
 	adp->ad_oldsize = oldsize;
 	adp->ad_state = ATTACHED | EXTDATA;
 	LIST_INIT(&adp->ad_newdirblk);
 	if (newblkno == oldblkno)
 		adp->ad_freefrag = NULL;
 	else
 		adp->ad_freefrag = newfreefrag(ip, oldblkno, oldsize);
 
 	ACQUIRE_LOCK(&lk);
 	if (newblk_lookup(ip->i_fs, newblkno, 0, &newblk) == 0)
 		panic("softdep_setup_allocext: lost block");
 
-	inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep);
+	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
 	adp->ad_inodedep = inodedep;
 
 	if (newblk->nb_state == DEPCOMPLETE) {
 		adp->ad_state |= DEPCOMPLETE;
 		adp->ad_buf = NULL;
 	} else {
 		bmsafemap = newblk->nb_bmsafemap;
 		adp->ad_buf = bmsafemap->sm_buf;
 		LIST_REMOVE(newblk, nb_deps);
 		LIST_INSERT_HEAD(&bmsafemap->sm_allocdirecthd, adp, ad_deps);
 	}
 	LIST_REMOVE(newblk, nb_hash);
 	FREE(newblk, M_NEWBLK);
 
 	WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
 	if (lbn >= NXADDR)
 		panic("softdep_setup_allocext: lbn %lld > NXADDR",
 		    (long long)lbn);
 	/*
 	 * The list of allocdirects must be kept in sorted and ascending
 	 * order so that the rollback routines can quickly determine the
 	 * first uncommitted block (the size of the file stored on disk
 	 * ends at the end of the lowest committed fragment, or if there
 	 * are no fragments, at the end of the highest committed block).
 	 * Since files generally grow, the typical case is that the new
 	 * block is to be added at the end of the list. We speed this
 	 * special case by checking against the last allocdirect in the
 	 * list before laboriously traversing the list looking for the
 	 * insertion point.
 	 */
 	adphead = &inodedep->id_newextupdt;
 	oldadp = TAILQ_LAST(adphead, allocdirectlst);
 	if (oldadp == NULL || oldadp->ad_lbn <= lbn) {
 		/* insert at end of list */
 		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
 		if (oldadp != NULL && oldadp->ad_lbn == lbn)
 			allocdirect_merge(adphead, adp, oldadp);
 		FREE_LOCK(&lk);
 		return;
 	}
 	TAILQ_FOREACH(oldadp, adphead, ad_next) {
 		if (oldadp->ad_lbn >= lbn)
 			break;
 	}
 	if (oldadp == NULL)
 		panic("softdep_setup_allocext: lost entry");
 	/* insert in middle of list */
 	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
 	if (oldadp->ad_lbn == lbn)
 		allocdirect_merge(adphead, adp, oldadp);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Indirect block allocation dependencies.
  * 
  * The same dependencies that exist for a direct block also exist when
  * a new block is allocated and pointed to by an entry in a block of
  * indirect pointers. The undo/redo states described above are also
  * used here. Because an indirect block contains many pointers that
  * may have dependencies, a second copy of the entire in-memory indirect
  * block is kept. The buffer cache copy is always completely up-to-date.
  * The second copy, which is used only as a source for disk writes,
  * contains only the safe pointers (i.e., those that have no remaining
  * update dependencies). The second copy is freed when all pointers
  * are safe. The cache is not allowed to replace indirect blocks with
  * pending update dependencies. If a buffer containing an indirect
  * block with dependencies is written, these routines will mark it
  * dirty again. It can only be successfully written once all the
  * dependencies are removed. The ffs_fsync routine in conjunction with
  * softdep_sync_metadata work together to get all the dependencies
  * removed so that a file can be successfully written to disk. Three
  * procedures are used when setting up indirect block pointer
  * dependencies. The division is necessary because of the organization
  * of the "balloc" routine and because of the distinction between file
  * pages and file metadata blocks.
  */
 
 /*
  * Allocate a new allocindir structure.
  */
 static struct allocindir *
 newallocindir(ip, ptrno, newblkno, oldblkno)
 	struct inode *ip;	/* inode for file being extended */
 	int ptrno;		/* offset of pointer in indirect block */
 	ufs2_daddr_t newblkno;	/* disk block number being added */
 	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
 {
 	struct allocindir *aip;
 
 	MALLOC(aip, struct allocindir *, sizeof(struct allocindir),
 		M_ALLOCINDIR, M_SOFTDEP_FLAGS|M_ZERO);
-	aip->ai_list.wk_type = D_ALLOCINDIR;
+	workitem_alloc(&aip->ai_list, D_ALLOCINDIR, UFSTOVFS(ip->i_ump));
 	aip->ai_state = ATTACHED;
 	aip->ai_offset = ptrno;
 	aip->ai_newblkno = newblkno;
 	aip->ai_oldblkno = oldblkno;
 	aip->ai_freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize);
 	return (aip);
 }
 
 /*
  * Called just before setting an indirect block pointer
  * to a newly allocated file page.
  */
 void
 softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
 	struct inode *ip;	/* inode for file being extended */
 	ufs_lbn_t lbn;		/* allocated block number within file */
 	struct buf *bp;		/* buffer with indirect blk referencing page */
 	int ptrno;		/* offset of pointer in indirect block */
 	ufs2_daddr_t newblkno;	/* disk block number being added */
 	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
 	struct buf *nbp;	/* buffer holding allocated page */
 {
 	struct allocindir *aip;
 	struct pagedep *pagedep;
 
 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
 	aip = newallocindir(ip, ptrno, newblkno, oldblkno);
 	ACQUIRE_LOCK(&lk);
 	/*
 	 * If we are allocating a directory page, then we must
 	 * allocate an associated pagedep to track additions and
 	 * deletions.
 	 */
 	if ((ip->i_mode & IFMT) == IFDIR &&
 	    pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
 		WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
 	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
 	setup_allocindir_phase2(bp, ip, aip);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Called just before setting an indirect block pointer to a
  * newly allocated indirect block.
  */
 void
 softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
 	struct buf *nbp;	/* newly allocated indirect block */
 	struct inode *ip;	/* inode for file being extended */
 	struct buf *bp;		/* indirect block referencing allocated block */
 	int ptrno;		/* offset of pointer in indirect block */
 	ufs2_daddr_t newblkno;	/* disk block number being added */
 {
 	struct allocindir *aip;
 
 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
 	aip = newallocindir(ip, ptrno, newblkno, 0);
 	ACQUIRE_LOCK(&lk);
 	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
 	setup_allocindir_phase2(bp, ip, aip);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Called to finish the allocation of the "aip" allocated
  * by one of the two routines above.
  */
 static void 
 setup_allocindir_phase2(bp, ip, aip)
 	struct buf *bp;		/* in-memory copy of the indirect block */
 	struct inode *ip;	/* inode for file being extended */
 	struct allocindir *aip;	/* allocindir allocated by the above routines */
 {
 	struct worklist *wk;
 	struct indirdep *indirdep, *newindirdep;
 	struct bmsafemap *bmsafemap;
 	struct allocindir *oldaip;
 	struct freefrag *freefrag;
 	struct newblk *newblk;
 	ufs2_daddr_t blkno;
 
 	mtx_assert(&lk, MA_OWNED);
 	if (bp->b_lblkno >= 0)
 		panic("setup_allocindir_phase2: not indir blk");
 	for (indirdep = NULL, newindirdep = NULL; ; ) {
 		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 			if (wk->wk_type != D_INDIRDEP)
 				continue;
 			indirdep = WK_INDIRDEP(wk);
 			break;
 		}
 		if (indirdep == NULL && newindirdep) {
 			indirdep = newindirdep;
 			WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
 			newindirdep = NULL;
 		}
 		if (indirdep) {
 			if (newblk_lookup(ip->i_fs, aip->ai_newblkno, 0,
 			    &newblk) == 0)
 				panic("setup_allocindir: lost block");
 			if (newblk->nb_state == DEPCOMPLETE) {
 				aip->ai_state |= DEPCOMPLETE;
 				aip->ai_buf = NULL;
 			} else {
 				bmsafemap = newblk->nb_bmsafemap;
 				aip->ai_buf = bmsafemap->sm_buf;
 				LIST_REMOVE(newblk, nb_deps);
 				LIST_INSERT_HEAD(&bmsafemap->sm_allocindirhd,
 				    aip, ai_deps);
 			}
 			LIST_REMOVE(newblk, nb_hash);
 			FREE(newblk, M_NEWBLK);
 			aip->ai_indirdep = indirdep;
 			/*
 			 * Check to see if there is an existing dependency
 			 * for this block. If there is, merge the old
 			 * dependency into the new one.
 			 */
 			if (aip->ai_oldblkno == 0)
 				oldaip = NULL;
 			else
 
 				LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next)
 					if (oldaip->ai_offset == aip->ai_offset)
 						break;
 			freefrag = NULL;
 			if (oldaip != NULL) {
 				if (oldaip->ai_newblkno != aip->ai_oldblkno)
 					panic("setup_allocindir_phase2: blkno");
 				aip->ai_oldblkno = oldaip->ai_oldblkno;
 				freefrag = aip->ai_freefrag;
 				aip->ai_freefrag = oldaip->ai_freefrag;
 				oldaip->ai_freefrag = NULL;
 				free_allocindir(oldaip, NULL);
 			}
 			LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
 			if (ip->i_ump->um_fstype == UFS1)
 				((ufs1_daddr_t *)indirdep->ir_savebp->b_data)
 				    [aip->ai_offset] = aip->ai_oldblkno;
 			else
 				((ufs2_daddr_t *)indirdep->ir_savebp->b_data)
 				    [aip->ai_offset] = aip->ai_oldblkno;
 			FREE_LOCK(&lk);
 			if (freefrag != NULL)
 				handle_workitem_freefrag(freefrag);
 		} else
 			FREE_LOCK(&lk);
 		if (newindirdep) {
 			newindirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
 			brelse(newindirdep->ir_savebp);
+			ACQUIRE_LOCK(&lk);	/* free the spare under lk */
 			WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
+			if (indirdep)
+				break;	/* done; lk stays held */
+			FREE_LOCK(&lk);	/* drop lk before MALLOC below */
 		}
 		if (indirdep) {
 			ACQUIRE_LOCK(&lk);
 			break;
 		}
 		MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep),
 			M_INDIRDEP, M_SOFTDEP_FLAGS);
-		newindirdep->ir_list.wk_type = D_INDIRDEP;
+		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP,
+		    UFSTOVFS(ip->i_ump));
 		newindirdep->ir_state = ATTACHED;
 		if (ip->i_ump->um_fstype == UFS1)
 			newindirdep->ir_state |= UFS1FMT;
 		LIST_INIT(&newindirdep->ir_deplisthd);
 		LIST_INIT(&newindirdep->ir_donehd);
 		if (bp->b_blkno == bp->b_lblkno) {
 			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
 			    NULL, NULL);
 			bp->b_blkno = blkno;
 		}
 		newindirdep->ir_savebp =
 		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
 		BUF_KERNPROC(newindirdep->ir_savebp);
 		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
 		ACQUIRE_LOCK(&lk);
 	}
 }
 
 /*
  * Block de-allocation dependencies.
  * 
  * When blocks are de-allocated, the on-disk pointers must be nullified before
  * the blocks are made available for use by other files.  (The true
  * requirement is that old pointers must be nullified before new on-disk
  * pointers are set.  We chose this slightly more stringent requirement to
  * reduce complexity.) Our implementation handles this dependency by updating
  * the inode (or indirect block) appropriately but delaying the actual block
  * de-allocation (i.e., freemap and free space count manipulation) until
  * after the updated versions reach stable storage.  After the disk is
  * updated, the blocks can be safely de-allocated whenever it is convenient.
  * This implementation handles only the common case of reducing a file's
  * length to zero. Other cases are handled by the conventional synchronous
  * write approach.
  *
  * The ffs implementation with which we worked double-checks
  * the state of the block pointers and file size as it reduces
  * a file's length.  Some of this code is replicated here in our
  * soft updates implementation.  The freeblks->fb_chkcnt field is
  * used to transfer a part of this information to the procedure
  * that eventually de-allocates the blocks.
  *
  * This routine should be called from the routine that shortens
  * a file's length, before the inode's size or block pointers
  * are modified. It will save the block pointer information for
  * later release and zero the inode so that the calling routine
  * can release it.
  */
 void
 softdep_setup_freeblocks(ip, length, flags)
 	struct inode *ip;	/* The inode whose length is to be reduced */
 	off_t length;		/* The new length for the file */
 	int flags;		/* IO_EXT and/or IO_NORMAL */
 {
 	struct freeblks *freeblks;
 	struct inodedep *inodedep;
 	struct allocdirect *adp;
 	struct vnode *vp;
 	struct buf *bp;
 	struct fs *fs;
 	ufs2_daddr_t extblocks, datablocks;
+	struct mount *mp;	/* mount derived from ip->i_ump */
 	int i, delay, error;
 
 	fs = ip->i_fs;
+	mp = UFSTOVFS(ip->i_ump);
 	if (length != 0)
 		panic("softdep_setup_freeblocks: non-zero length");
 	MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
 		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
-	freeblks->fb_list.wk_type = D_FREEBLKS;
+	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
 	freeblks->fb_state = ATTACHED;
 	freeblks->fb_uid = ip->i_uid;
 	freeblks->fb_previousinum = ip->i_number;
 	freeblks->fb_devvp = ip->i_devvp;
-	freeblks->fb_mnt = ITOV(ip)->v_mount;
 	extblocks = 0;
 	if (fs->fs_magic == FS_UFS2_MAGIC)
 		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
 	datablocks = DIP(ip, i_blocks) - extblocks;
 	if ((flags & IO_NORMAL) == 0) {
 		freeblks->fb_oldsize = 0;
 		freeblks->fb_chkcnt = 0;
 	} else {
 		freeblks->fb_oldsize = ip->i_size;
 		ip->i_size = 0;
 		DIP_SET(ip, i_size, 0);
 		freeblks->fb_chkcnt = datablocks;
 		for (i = 0; i < NDADDR; i++) {
 			freeblks->fb_dblks[i] = DIP(ip, i_db[i]);
 			DIP_SET(ip, i_db[i], 0);
 		}
 		for (i = 0; i < NIADDR; i++) {
 			freeblks->fb_iblks[i] = DIP(ip, i_ib[i]);
 			DIP_SET(ip, i_ib[i], 0);
 		}
 		/*
 		 * If the file was removed, then the space being freed was
 		 * accounted for then (see softdep_filereleased()). If the
 		 * file is merely being truncated, then we account for it now.
 		 */
 		if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
 			UFS_LOCK(ip->i_ump);
 			fs->fs_pendingblocks += datablocks;
 			UFS_UNLOCK(ip->i_ump);
 		}
 	}
 	if ((flags & IO_EXT) == 0) {
 		freeblks->fb_oldextsize = 0;
 	} else {
 		freeblks->fb_oldextsize = ip->i_din2->di_extsize;
 		ip->i_din2->di_extsize = 0;
 		freeblks->fb_chkcnt += extblocks;
 		for (i = 0; i < NXADDR; i++) {
 			freeblks->fb_eblks[i] = ip->i_din2->di_extb[i];
 			ip->i_din2->di_extb[i] = 0;
 		}
 	}
 	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - freeblks->fb_chkcnt);
 	/*
 	 * Push the zero'ed inode to to its disk buffer so that we are free
 	 * to delete its dependencies below. Once the dependencies are gone
 	 * the buffer can be safely released.
 	 */
 	if ((error = bread(ip->i_devvp,
 	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
 		brelse(bp);
 		softdep_error("softdep_setup_freeblocks", error);
 	}
 	if (ip->i_ump->um_fstype == UFS1)
 		*((struct ufs1_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
 	else
 		*((struct ufs2_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
 	/*
 	 * Find and eliminate any inode dependencies.
 	 */
 	ACQUIRE_LOCK(&lk);
-	(void) inodedep_lookup(fs, ip->i_number, DEPALLOC, &inodedep);
+	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
 	if ((inodedep->id_state & IOSTARTED) != 0)
 		panic("softdep_setup_freeblocks: inode busy");
 	/*
 	 * Add the freeblks structure to the list of operations that
 	 * must await the zero'ed inode being written to disk. If we
 	 * still have a bitmap dependency (delay == 0), then the inode
 	 * has never been written to disk, so we can process the
 	 * freeblks below once we have deleted the dependencies.
 	 */
 	delay = (inodedep->id_state & DEPCOMPLETE);
 	if (delay)
 		WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
 	/*
 	 * Because the file length has been truncated to zero, any
 	 * pending block allocation dependency structures associated
 	 * with this inode are obsolete and can simply be de-allocated.
 	 * We must first merge the two dependency lists to get rid of
 	 * any duplicate freefrag structures, then purge the merged list.
 	 * If we still have a bitmap dependency, then the inode has never
 	 * been written to disk, so we can free any fragments without delay.
 	 */
 	if (flags & IO_NORMAL) {
 		merge_inode_lists(&inodedep->id_newinoupdt,
 		    &inodedep->id_inoupdt);
 		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
 			free_allocdirect(&inodedep->id_inoupdt, adp, delay);
 	}
 	if (flags & IO_EXT) {
 		merge_inode_lists(&inodedep->id_newextupdt,
 		    &inodedep->id_extupdt);
 		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
 			free_allocdirect(&inodedep->id_extupdt, adp, delay);
 	}
 	FREE_LOCK(&lk);
 	bdwrite(bp);
 	/*
 	 * We must wait for any I/O in progress to finish so that
 	 * all potential buffers on the dirty list will be visible.
 	 * Once they are all there, walk the list and get rid of
 	 * any dependencies.
 	 */
 	vp = ITOV(ip);
 	VI_LOCK(vp);
 	drain_output(vp);
 restart:
 	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
 		if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
 		    ((flags & IO_NORMAL) == 0 &&
 		      (bp->b_xflags & BX_ALTDATA) == 0))
 			continue;
 		if ((bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT)) == NULL)
 			goto restart;
 		VI_UNLOCK(vp);
 		ACQUIRE_LOCK(&lk);
-		(void) inodedep_lookup(fs, ip->i_number, 0, &inodedep);
+		(void) inodedep_lookup(mp, ip->i_number, 0, &inodedep);
 		deallocate_dependencies(bp, inodedep);
 		FREE_LOCK(&lk);
 		bp->b_flags |= B_INVAL | B_NOCACHE;
 		brelse(bp);
 		VI_LOCK(vp);
 		goto restart;
 	}
 	VI_UNLOCK(vp);
 	ACQUIRE_LOCK(&lk);
-	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0)
+	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
 		(void) free_inodedep(inodedep);
 
 	if(delay) {
 		freeblks->fb_state |= DEPCOMPLETE;
 		/*
 		 * If the inode with zeroed block pointers is now on disk
 		 * we can start freeing blocks. Add freeblks to the worklist
 		 * instead of calling  handle_workitem_freeblocks directly as
 		 * it is more likely that additional IO is needed to complete
 		 * the request here than in the !delay case.
 		 */  
 		if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
 			add_to_worklist(&freeblks->fb_list);
 	}
 
 	FREE_LOCK(&lk);
 	/*
 	 * If the inode has never been written to disk (delay == 0),
 	 * then we can process the freeblks now that we have deleted
 	 * the dependencies.
 	 */
 	if (!delay)
 		handle_workitem_freeblocks(freeblks, 0);
 }
 
 /*
  * Reclaim any dependency structures from a buffer that is about to
  * be reallocated to a new vnode. The buffer must be locked, thus,
  * no I/O completion operations can occur while we are manipulating
  * its associated dependencies. The mutex is held so that other I/O's
  * associated with related dependencies do not occur.
  */
 static void
 deallocate_dependencies(bp, inodedep)
 	struct buf *bp;
 	struct inodedep *inodedep;
 {
 	struct worklist *wk;
 	struct indirdep *indirdep;
 	struct allocindir *aip;
 	struct pagedep *pagedep;
 	struct dirrem *dirrem;
 	struct diradd *dap;
 	int i;
 
 	mtx_assert(&lk, MA_OWNED);
 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 		switch (wk->wk_type) {
 
 		case D_INDIRDEP:
 			indirdep = WK_INDIRDEP(wk);
 			/*
 			 * None of the indirect pointers will ever be visible,
 			 * so they can simply be tossed. GOINGAWAY ensures
 			 * that allocated pointers will be saved in the buffer
 			 * cache until they are freed. Note that they will
 			 * only be able to be found by their physical address
 			 * since the inode mapping the logical address will
 			 * be gone. The save buffer used for the safe copy
 			 * was allocated in setup_allocindir_phase2 using
 			 * the physical address so it could be used for this
 			 * purpose. Hence we swap the safe copy with the real
 			 * copy, allowing the safe copy to be freed and holding
 			 * on to the real copy for later use in indir_trunc.
 			 */
 			if (indirdep->ir_state & GOINGAWAY)
 				panic("deallocate_dependencies: already gone");
 			indirdep->ir_state |= GOINGAWAY;
 			VFSTOUFS(bp->b_vp->v_mount)->um_numindirdeps += 1;
 			while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
 				free_allocindir(aip, inodedep);
 			if (bp->b_lblkno >= 0 ||
 			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
 				panic("deallocate_dependencies: not indir");
 			bcopy(bp->b_data, indirdep->ir_savebp->b_data,
 			    bp->b_bcount);
 			WORKLIST_REMOVE(wk);
 			WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, wk);
 			continue;
 
 		case D_PAGEDEP:
 			pagedep = WK_PAGEDEP(wk);
 			/*
 			 * None of the directory additions will ever be
 			 * visible, so they can simply be tossed.
 			 */
 			for (i = 0; i < DAHASHSZ; i++)
 				while ((dap =
 				    LIST_FIRST(&pagedep->pd_diraddhd[i])))
 					free_diradd(dap);
 			while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != 0)
 				free_diradd(dap);
 			/*
 			 * Copy any directory remove dependencies to the list
 			 * to be processed after the zero'ed inode is written.
 			 * If the inode has already been written, then they 
 			 * can be dumped directly onto the work list.
 			 */
-			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
+			while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd))) {
 				LIST_REMOVE(dirrem, dm_next);
 				dirrem->dm_dirinum = pagedep->pd_ino;
 				if (inodedep == NULL ||
 				    (inodedep->id_state & ALLCOMPLETE) ==
 				     ALLCOMPLETE)
 					add_to_worklist(&dirrem->dm_list);
 				else
 					WORKLIST_INSERT(&inodedep->id_bufwait,
 					    &dirrem->dm_list);
 			}
 			if ((pagedep->pd_state & NEWBLOCK) != 0) {
 				LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list)
 					if (wk->wk_type == D_NEWDIRBLK &&
 					    WK_NEWDIRBLK(wk)->db_pagedep ==
 					      pagedep)
 						break;
 				if (wk != NULL) {
 					WORKLIST_REMOVE(wk);
 					free_newdirblk(WK_NEWDIRBLK(wk));
 				} else
 					panic("deallocate_dependencies: "
 					      "lost pagedep");
 			}
 			WORKLIST_REMOVE(&pagedep->pd_list);
 			LIST_REMOVE(pagedep, pd_hash);
 			WORKITEM_FREE(pagedep, D_PAGEDEP);
 			continue;
 
 		case D_ALLOCINDIR:
 			free_allocindir(WK_ALLOCINDIR(wk), inodedep);
 			continue;
 
 		case D_ALLOCDIRECT:
 		case D_INODEDEP:
 			panic("deallocate_dependencies: Unexpected type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 
 		default:
 			panic("deallocate_dependencies: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 }
 
 /*
  * Free an allocdirect. Generate a new freefrag work request if appropriate.
- * This routine must be called with splbio interrupts blocked.
+ * This routine must be called with the lk mutex held.
  */
 static void
 free_allocdirect(adphead, adp, delay)
 	struct allocdirectlst *adphead;
 	struct allocdirect *adp;
 	int delay;
 {
 	struct newdirblk *newdirblk;
 	struct worklist *wk;
 
 	mtx_assert(&lk, MA_OWNED);
 	if ((adp->ad_state & DEPCOMPLETE) == 0)
 		LIST_REMOVE(adp, ad_deps);
 	TAILQ_REMOVE(adphead, adp, ad_next);
 	if ((adp->ad_state & COMPLETE) == 0)
 		WORKLIST_REMOVE(&adp->ad_list);
 	if (adp->ad_freefrag != NULL) {
 		if (delay)
 			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
 			    &adp->ad_freefrag->ff_list);
 		else
 			add_to_worklist(&adp->ad_freefrag->ff_list);
 	}
 	if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) {
 		newdirblk = WK_NEWDIRBLK(wk);
 		WORKLIST_REMOVE(&newdirblk->db_list);
 		if (LIST_FIRST(&adp->ad_newdirblk) != NULL)
 			panic("free_allocdirect: extra newdirblk");
 		if (delay)
 			WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
 			    &newdirblk->db_list);
 		else
 			free_newdirblk(newdirblk);
 	}
 	WORKITEM_FREE(adp, D_ALLOCDIRECT);
 }
 
 /*
  * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
- * This routine must be called with splbio interrupts blocked.
+ * This routine must be called with the lk mutex held.
  */
 static void
 free_newdirblk(newdirblk)
 	struct newdirblk *newdirblk;
 {
 	struct pagedep *pagedep;
 	struct diradd *dap;
 	int i;
 
 	mtx_assert(&lk, MA_OWNED);
 	/*
 	 * If the pagedep is still linked onto the directory buffer
 	 * dependency chain, then some of the entries on the
 	 * pd_pendinghd list may not be committed to disk yet. In
 	 * this case, we will simply clear the NEWBLOCK flag and
 	 * let the pd_pendinghd list be processed when the pagedep
 	 * is next written. If the pagedep is no longer on the buffer
 	 * dependency chain, then all the entries on the pd_pending
 	 * list are committed to disk and we can free them here.
 	 */
 	pagedep = newdirblk->db_pagedep;
 	pagedep->pd_state &= ~NEWBLOCK;
 	if ((pagedep->pd_state & ONWORKLIST) == 0)
 		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
 			free_diradd(dap);
 	/*
 	 * If no dependencies remain, the pagedep will be freed.
 	 */
 	for (i = 0; i < DAHASHSZ; i++)
 		if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL)
 			break;
 	if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) {
 		LIST_REMOVE(pagedep, pd_hash);
 		WORKITEM_FREE(pagedep, D_PAGEDEP);
 	}
 	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
 }
 
 /*
  * Prepare an inode to be freed. The actual free operation is not
  * done until the zero'ed inode has been written to disk.
  */
 void
 softdep_freefile(pvp, ino, mode)
 	struct vnode *pvp;
 	ino_t ino;
 	int mode;
 {
 	struct inode *ip = VTOI(pvp);
 	struct inodedep *inodedep;
 	struct freefile *freefile;
 
 	/*
 	 * This sets up the inode de-allocation dependency.
 	 */
 	MALLOC(freefile, struct freefile *, sizeof(struct freefile),
 		M_FREEFILE, M_SOFTDEP_FLAGS);
-	freefile->fx_list.wk_type = D_FREEFILE;
-	freefile->fx_list.wk_state = 0;
+	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
 	freefile->fx_mode = mode;
 	freefile->fx_oldinum = ino;
 	freefile->fx_devvp = ip->i_devvp;
-	freefile->fx_mnt = ITOV(ip)->v_mount;
 	if ((ip->i_flag & IN_SPACECOUNTED) == 0) {
 		UFS_LOCK(ip->i_ump);
 		ip->i_fs->fs_pendinginodes += 1;
 		UFS_UNLOCK(ip->i_ump);
 	}
 
 	/*
 	 * If the inodedep does not exist, then the zero'ed inode has
 	 * been written to disk. If the allocated inode has never been
 	 * written to disk, then the on-disk inode is zero'ed. In either
 	 * case we can free the file immediately.
 	 */
 	ACQUIRE_LOCK(&lk);
-	if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 ||
+	if (inodedep_lookup(pvp->v_mount, ino, 0, &inodedep) == 0 ||
 	    check_inode_unwritten(inodedep)) {
 		FREE_LOCK(&lk);
 		handle_workitem_freefile(freefile);
 		return;
 	}
 	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Check to see if an inode has never been written to disk. If
  * so free the inodedep and return success, otherwise return failure.
- * This routine must be called with splbio interrupts blocked.
+ * This routine must be called with the lk mutex held.
  *
  * If we still have a bitmap dependency, then the inode has never
  * been written to disk. Drop the dependency as it is no longer
  * necessary since the inode is being deallocated. We set the
  * ALLCOMPLETE flags since the bitmap now properly shows that the
  * inode is not allocated. Even if the inode is actively being
  * written, it has been rolled back to its zero'ed state, so we
  * are ensured that a zero inode is what is on the disk. For short
  * lived files, this change will usually result in removing all the
  * dependencies from the inode so that it can be freed immediately.
  */
 static int
 check_inode_unwritten(inodedep)
 	struct inodedep *inodedep;
 {
 
 	mtx_assert(&lk, MA_OWNED);
 	if ((inodedep->id_state & DEPCOMPLETE) != 0 ||
 	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
 	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
 	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
 	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
 	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
 	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
 	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
 	    inodedep->id_nlinkdelta != 0)
 		return (0);
 
 	/*
 	 * Another process might be in initiate_write_inodeblock_ufs[12]
 	 * trying to allocate memory without holding "Softdep Lock".
 	 */
 	if ((inodedep->id_state & IOSTARTED) != 0 &&
 	    inodedep->id_savedino1 == NULL)
 		return (0);
 
 	inodedep->id_state |= ALLCOMPLETE;
 	LIST_REMOVE(inodedep, id_deps);
 	inodedep->id_buf = NULL;
 	if (inodedep->id_state & ONWORKLIST)
 		WORKLIST_REMOVE(&inodedep->id_list);
 	if (inodedep->id_savedino1 != NULL) {
 		FREE(inodedep->id_savedino1, M_SAVEDINO);
 		inodedep->id_savedino1 = NULL;
 	}
 	if (free_inodedep(inodedep) == 0)
 		panic("check_inode_unwritten: busy inode");
 	return (1);
 }
 
 /*
  * Try to free an inodedep structure. Return 1 if it could be freed.
  */
 static int
 free_inodedep(inodedep)
 	struct inodedep *inodedep;
 {
 
 	mtx_assert(&lk, MA_OWNED);
 	if ((inodedep->id_state & ONWORKLIST) != 0 ||
 	    (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
 	    LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
 	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
 	    LIST_FIRST(&inodedep->id_inowait) != NULL ||
 	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
 	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
 	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
 	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
 	    inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL)
 		return (0);
 	LIST_REMOVE(inodedep, id_hash);
 	WORKITEM_FREE(inodedep, D_INODEDEP);
 	num_inodedep -= 1;
 	return (1);
 }
 
 /*
  * This workitem routine performs the block de-allocation.
  * The workitem is added to the pending list after the updated
  * inode block has been written to disk.  As mentioned above,
  * checks regarding the number of blocks de-allocated (compared
  * to the number of blocks allocated for the file) are also
  * performed in this function.
  */
 static void
 handle_workitem_freeblocks(freeblks, flags)
 	struct freeblks *freeblks;
 	int flags;
 {
 	struct inode *ip;
 	struct vnode *vp;
 	struct fs *fs;
 	struct ufsmount *ump;
 	int i, nblocks, level, bsize;
 	ufs2_daddr_t bn, blocksreleased = 0;
 	int error, allerror = 0;
 	ufs_lbn_t baselbns[NIADDR], tmpval;
 	int fs_pendingblocks;
 
-	ump = VFSTOUFS(freeblks->fb_mnt);
+	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 	fs = ump->um_fs;
 	fs_pendingblocks = 0;
 	tmpval = 1;
 	baselbns[0] = NDADDR;
 	for (i = 1; i < NIADDR; i++) {
 		tmpval *= NINDIR(fs);
 		baselbns[i] = baselbns[i - 1] + tmpval;
 	}
 	nblocks = btodb(fs->fs_bsize);
 	blocksreleased = 0;
 	/*
 	 * Release all extended attribute blocks or frags.
 	 */
 	if (freeblks->fb_oldextsize > 0) {
 		for (i = (NXADDR - 1); i >= 0; i--) {
 			if ((bn = freeblks->fb_eblks[i]) == 0)
 				continue;
 			bsize = sblksize(fs, freeblks->fb_oldextsize, i);
 			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
 			    freeblks->fb_previousinum);
 			blocksreleased += btodb(bsize);
 		}
 	}
 	/*
 	 * Release all data blocks or frags.
 	 */
 	if (freeblks->fb_oldsize > 0) {
 		/*
 		 * Indirect blocks first.
 		 */
 		for (level = (NIADDR - 1); level >= 0; level--) {
 			if ((bn = freeblks->fb_iblks[level]) == 0)
 				continue;
 			if ((error = indir_trunc(freeblks, fsbtodb(fs, bn),
-			    level, baselbns[level], &blocksreleased)) == 0)
+			    level, baselbns[level], &blocksreleased)) != 0)
 				allerror = error;
 			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn,
 			    fs->fs_bsize, freeblks->fb_previousinum);
 			fs_pendingblocks += nblocks;
 			blocksreleased += nblocks;
 		}
 		/*
 		 * All direct blocks or frags.
 		 */
 		for (i = (NDADDR - 1); i >= 0; i--) {
 			if ((bn = freeblks->fb_dblks[i]) == 0)
 				continue;
 			bsize = sblksize(fs, freeblks->fb_oldsize, i);
 			ffs_blkfree(ump, fs, freeblks->fb_devvp, bn, bsize,
 			    freeblks->fb_previousinum);
 			fs_pendingblocks += btodb(bsize);
 			blocksreleased += btodb(bsize);
 		}
 	}
 	UFS_LOCK(ump);
 	fs->fs_pendingblocks -= fs_pendingblocks;
 	UFS_UNLOCK(ump);
 	/*
 	 * If we still have not finished background cleanup, then check
 	 * to see if the block count needs to be adjusted.
 	 */
 	if (freeblks->fb_chkcnt != blocksreleased &&
 	    (fs->fs_flags & FS_UNCLEAN) != 0 &&
-	    ffs_vget(freeblks->fb_mnt, freeblks->fb_previousinum,
+	    ffs_vget(freeblks->fb_list.wk_mp, freeblks->fb_previousinum,
 	    (flags & LK_NOWAIT) | LK_EXCLUSIVE, &vp) == 0) {
 		ip = VTOI(vp);
 		DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + \
 		    freeblks->fb_chkcnt - blocksreleased);
 		ip->i_flag |= IN_CHANGE;
 		vput(vp);
 	}
 
 #ifdef DIAGNOSTIC
 	if (freeblks->fb_chkcnt != blocksreleased &&
 	    ((fs->fs_flags & FS_UNCLEAN) == 0 || (flags & LK_NOWAIT) != 0))
 		printf("handle_workitem_freeblocks: block count\n");
 	if (allerror)
 		softdep_error("handle_workitem_freeblks", allerror);
 #endif /* DIAGNOSTIC */
 
+	ACQUIRE_LOCK(&lk);
 	WORKITEM_FREE(freeblks, D_FREEBLKS);
+	FREE_LOCK(&lk);
 }
 
 /*
  * Release blocks associated with the inode ip and stored in the indirect
  * block dbn. If level is greater than SINGLE, the block is an indirect block
- * and recursive calls to indirtrunc must be used to cleanse other indirect
+ * and recursive calls to indir_trunc must be used to cleanse other indirect
  * blocks.
  */
 static int
 indir_trunc(freeblks, dbn, level, lbn, countp)
 	struct freeblks *freeblks;
 	ufs2_daddr_t dbn;
 	int level;
 	ufs_lbn_t lbn;
 	ufs2_daddr_t *countp;
 {
 	struct buf *bp;
 	struct fs *fs;
 	struct worklist *wk;
 	struct indirdep *indirdep;
 	struct ufsmount *ump;
 	ufs1_daddr_t *bap1 = 0;
 	ufs2_daddr_t nb, *bap2 = 0;
 	ufs_lbn_t lbnadd;
 	int i, nblocks, ufs1fmt;
 	int error, allerror = 0;
 	int fs_pendingblocks;
 
-	ump = VFSTOUFS(freeblks->fb_mnt);
+	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
 	fs = ump->um_fs;
 	fs_pendingblocks = 0;
 	lbnadd = 1;
 	for (i = level; i > 0; i--)
 		lbnadd *= NINDIR(fs);
 	/*
 	 * Get buffer of block pointers to be freed. This routine is not
 	 * called until the zero'ed inode has been written, so it is safe
 	 * to free blocks as they are encountered. Because the inode has
 	 * been zero'ed, calls to bmap on these blocks will fail. So, we
 	 * have to use the on-disk address and the block device for the
 	 * filesystem to look them up. If the file was deleted before its
 	 * indirect blocks were all written to disk, the routine that set
 	 * us up (deallocate_dependencies) will have arranged to leave
 	 * a complete copy of the indirect block in memory for our use.
 	 * Otherwise we have to read the blocks in from the disk.
 	 */
 #ifdef notyet
 	bp = getblk(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, 0, 0,
 	    GB_NOCREAT);
 #else
 	bp = incore(&freeblks->fb_devvp->v_bufobj, dbn);
 #endif
 	ACQUIRE_LOCK(&lk);
 	if (bp != NULL && (wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 		if (wk->wk_type != D_INDIRDEP ||
 		    (indirdep = WK_INDIRDEP(wk))->ir_savebp != bp ||
 		    (indirdep->ir_state & GOINGAWAY) == 0)
 			panic("indir_trunc: lost indirdep");
 		WORKLIST_REMOVE(wk);
 		WORKITEM_FREE(indirdep, D_INDIRDEP);
 		if (LIST_FIRST(&bp->b_dep) != NULL)
 			panic("indir_trunc: dangling dep");
-		VFSTOUFS(freeblks->fb_mnt)->um_numindirdeps -= 1;
+		ump->um_numindirdeps -= 1;
 		FREE_LOCK(&lk);
 	} else {
 #ifdef notyet
 		if (bp)
 			brelse(bp);
 #endif
 		FREE_LOCK(&lk);
 		error = bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
 		    NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			return (error);
 		}
 	}
 	/*
 	 * Recursively free indirect blocks.
 	 */
-	if (VFSTOUFS(freeblks->fb_mnt)->um_fstype == UFS1) {
+	if (ump->um_fstype == UFS1) {
 		ufs1fmt = 1;
 		bap1 = (ufs1_daddr_t *)bp->b_data;
 	} else {
 		ufs1fmt = 0;
 		bap2 = (ufs2_daddr_t *)bp->b_data;
 	}
 	nblocks = btodb(fs->fs_bsize);
 	for (i = NINDIR(fs) - 1; i >= 0; i--) {
 		if (ufs1fmt)
 			nb = bap1[i];
 		else
 			nb = bap2[i];
 		if (nb == 0)
 			continue;
 		if (level != 0) {
 			if ((error = indir_trunc(freeblks, fsbtodb(fs, nb),
 			     level - 1, lbn + (i * lbnadd), countp)) != 0)
 				allerror = error;
 		}
 		ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize,
 		    freeblks->fb_previousinum);
 		fs_pendingblocks += nblocks;
 		*countp += nblocks;
 	}
 	UFS_LOCK(ump);
 	fs->fs_pendingblocks -= fs_pendingblocks;
 	UFS_UNLOCK(ump);
 	bp->b_flags |= B_INVAL | B_NOCACHE;
 	brelse(bp);
 	return (allerror);
 }
 
 /*
  * Free an allocindir.
- * This routine must be called with splbio interrupts blocked.
+ * This routine must be called with the lk mutex held.
  */
 static void
 free_allocindir(aip, inodedep)
 	struct allocindir *aip;
 	struct inodedep *inodedep;
 {
 	struct freefrag *freefrag;
 
 	mtx_assert(&lk, MA_OWNED);
 	if ((aip->ai_state & DEPCOMPLETE) == 0)
 		LIST_REMOVE(aip, ai_deps);
 	if (aip->ai_state & ONWORKLIST)
 		WORKLIST_REMOVE(&aip->ai_list);
 	LIST_REMOVE(aip, ai_next);
 	if ((freefrag = aip->ai_freefrag) != NULL) {
 		if (inodedep == NULL)
 			add_to_worklist(&freefrag->ff_list);
 		else
 			WORKLIST_INSERT(&inodedep->id_bufwait,
 			    &freefrag->ff_list);
 	}
 	WORKITEM_FREE(aip, D_ALLOCINDIR);
 }
 
 /*
  * Directory entry addition dependencies.
  * 
  * When adding a new directory entry, the inode (with its incremented link
  * count) must be written to disk before the directory entry's pointer to it.
  * Also, if the inode is newly allocated, the corresponding freemap must be
  * updated (on disk) before the directory entry's pointer. These requirements
  * are met via undo/redo on the directory entry's pointer, which consists
  * simply of the inode number.
  * 
  * As directory entries are added and deleted, the free space within a
  * directory block can become fragmented.  The ufs filesystem will compact
  * a fragmented directory block to make space for a new entry. When this
  * occurs, the offsets of previously added entries change. Any "diradd"
  * dependency structures corresponding to these entries must be updated with
  * the new offsets.
  */
 
 /*
  * This routine is called after the in-memory inode's link
  * count has been incremented, but before the directory entry's
  * pointer to the inode has been set.
  */
 int
 softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
 	struct buf *bp;		/* buffer containing directory block */
 	struct inode *dp;	/* inode for directory */
 	off_t diroffset;	/* offset of new entry in directory */
 	ino_t newinum;		/* inode referenced by new directory entry */
 	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
 	int isnewblk;		/* entry is in a newly allocated block */
 {
 	int offset;		/* offset of new entry within directory block */
 	ufs_lbn_t lbn;		/* block in directory containing new entry */
 	struct fs *fs;
 	struct diradd *dap;
 	struct allocdirect *adp;
 	struct pagedep *pagedep;
 	struct inodedep *inodedep;
 	struct newdirblk *newdirblk = 0;
 	struct mkdir *mkdir1, *mkdir2;
+	struct mount *mp;
 
 	/*
 	 * Whiteouts have no dependencies.
 	 */
 	if (newinum == WINO) {
 		if (newdirbp != NULL)
 			bdwrite(newdirbp);
 		return (0);
 	}
-
+	mp = UFSTOVFS(dp->i_ump);
 	fs = dp->i_fs;
 	lbn = lblkno(fs, diroffset);
 	offset = blkoff(fs, diroffset);
 	MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD,
 		M_SOFTDEP_FLAGS|M_ZERO);
-	dap->da_list.wk_type = D_DIRADD;
+	workitem_alloc(&dap->da_list, D_DIRADD, mp);
 	dap->da_offset = offset;
 	dap->da_newinum = newinum;
 	dap->da_state = ATTACHED;
 	if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) {
 		MALLOC(newdirblk, struct newdirblk *, sizeof(struct newdirblk),
 		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
-		newdirblk->db_list.wk_type = D_NEWDIRBLK;
-		newdirblk->db_state = 0;
+		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
 	}
 	if (newdirbp == NULL) {
 		dap->da_state |= DEPCOMPLETE;
 		ACQUIRE_LOCK(&lk);
 	} else {
 		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
 		MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
 		    M_SOFTDEP_FLAGS);
-		mkdir1->md_list.wk_type = D_MKDIR;
+		workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
 		mkdir1->md_state = MKDIR_BODY;
 		mkdir1->md_diradd = dap;
 		MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR,
 		    M_SOFTDEP_FLAGS);
-		mkdir2->md_list.wk_type = D_MKDIR;
+		workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
 		mkdir2->md_state = MKDIR_PARENT;
 		mkdir2->md_diradd = dap;
 		/*
 		 * Dependency on "." and ".." being written to disk.
 		 */
 		mkdir1->md_buf = newdirbp;
 		ACQUIRE_LOCK(&lk);
 		LIST_INSERT_HEAD(&mkdirlisthd, mkdir1, md_mkdirs);
 		WORKLIST_INSERT(&newdirbp->b_dep, &mkdir1->md_list);
 		FREE_LOCK(&lk);
 		bdwrite(newdirbp);
 		/*
 		 * Dependency on link count increase for parent directory
 		 */
 		ACQUIRE_LOCK(&lk);
-		if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0
+		if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0
 		    || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
 			dap->da_state &= ~MKDIR_PARENT;
 			WORKITEM_FREE(mkdir2, D_MKDIR);
 		} else {
 			LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
 			WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
 		}
 	}
 	/*
 	 * Link into parent directory pagedep to await its being written.
 	 */
 	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
 		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 	dap->da_pagedep = pagedep;
 	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
 	    da_pdlist);
 	/*
 	 * Link into its inodedep. Put it on the id_bufwait list if the inode
 	 * is not yet written. If it is written, do the post-inode write
 	 * processing to put it on the id_pendinghd list.
 	 */
-	(void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
+	(void) inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
 	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
 		diradd_inode_written(dap, inodedep);
 	else
 		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
 	if (isnewblk) {
 		/*
 		 * Directories growing into indirect blocks are rare
 		 * enough and the frequency of new block allocation
 		 * in those cases even more rare, that we choose not
 		 * to bother tracking them. Rather we simply force the
 		 * new directory entry to disk.
 		 */
 		if (lbn >= NDADDR) {
 			FREE_LOCK(&lk);
 			/*
 			 * We only have a new allocation when at the
 			 * beginning of a new block, not when we are
 			 * expanding into an existing block.
 			 */
 			if (blkoff(fs, diroffset) == 0)
 				return (1);
 			return (0);
 		}
 		/*
 		 * We only have a new allocation when at the beginning
 		 * of a new fragment, not when we are expanding into an
 		 * existing fragment. Also, there is nothing to do if we
 		 * are already tracking this block.
 		 */
 		if (fragoff(fs, diroffset) != 0) {
 			FREE_LOCK(&lk);
 			return (0);
 		}
 		if ((pagedep->pd_state & NEWBLOCK) != 0) {
-			FREE_LOCK(&lk);
 			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
+			FREE_LOCK(&lk);
 			return (0);
 		}
 		/*
 		 * Find our associated allocdirect and have it track us.
 		 */
-		if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0)
+		if (inodedep_lookup(mp, dp->i_number, 0, &inodedep) == 0)
 			panic("softdep_setup_directory_add: lost inodedep");
 		adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst);
 		if (adp == NULL || adp->ad_lbn != lbn)
 			panic("softdep_setup_directory_add: lost entry");
 		pagedep->pd_state |= NEWBLOCK;
 		newdirblk->db_pagedep = pagedep;
 		WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list);
 	}
 	FREE_LOCK(&lk);
 	return (0);
 }
 
 /*
  * This procedure is called to change the offset of a directory
  * entry when compacting a directory block which must be owned
  * exclusively by the caller. Note that the actual entry movement
  * must be done in this procedure to ensure that no I/O completions
  * occur while the move is in progress.
  */
 void 
 softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize)
 	struct inode *dp;	/* inode for directory */
 	caddr_t base;		/* address of dp->i_offset */
 	caddr_t oldloc;		/* address of old directory location */
 	caddr_t newloc;		/* address of new directory location */
 	int entrysize;		/* size of directory entry */
 {
 	int offset, oldoffset, newoffset;
 	struct pagedep *pagedep;
 	struct diradd *dap;
 	ufs_lbn_t lbn;
 
 	ACQUIRE_LOCK(&lk);
 	lbn = lblkno(dp->i_fs, dp->i_offset);
 	offset = blkoff(dp->i_fs, dp->i_offset);
 	if (pagedep_lookup(dp, lbn, 0, &pagedep) == 0)
 		goto done;
 	oldoffset = offset + (oldloc - base);
 	newoffset = offset + (newloc - base);
 
 	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) {
 		if (dap->da_offset != oldoffset)
 			continue;
 		dap->da_offset = newoffset;
 		if (DIRADDHASH(newoffset) == DIRADDHASH(oldoffset))
 			break;
 		LIST_REMOVE(dap, da_pdlist);
 		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(newoffset)],
 		    dap, da_pdlist);
 		break;
 	}
 	if (dap == NULL) {
 
 		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) {
 			if (dap->da_offset == oldoffset) {
 				dap->da_offset = newoffset;
 				break;
 			}
 		}
 	}
 done:
 	bcopy(oldloc, newloc, entrysize);
 	FREE_LOCK(&lk);
 }
 
 /*
  * Free a diradd dependency structure. This routine must be called
- * with splbio interrupts blocked.
+ * with the lk mutex held.
  */
 static void
 free_diradd(dap)
 	struct diradd *dap;
 {
 	struct dirrem *dirrem;
 	struct pagedep *pagedep;
 	struct inodedep *inodedep;
 	struct mkdir *mkdir, *nextmd;
 
 	mtx_assert(&lk, MA_OWNED);
 	WORKLIST_REMOVE(&dap->da_list);
 	LIST_REMOVE(dap, da_pdlist);
 	if ((dap->da_state & DIRCHG) == 0) {
 		pagedep = dap->da_pagedep;
 	} else {
 		dirrem = dap->da_previous;
 		pagedep = dirrem->dm_pagedep;
 		dirrem->dm_dirinum = pagedep->pd_ino;
 		add_to_worklist(&dirrem->dm_list);
 	}
-	if (inodedep_lookup(VFSTOUFS(pagedep->pd_mnt)->um_fs, dap->da_newinum,
+	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
 	    0, &inodedep) != 0)
 		(void) free_inodedep(inodedep);
 	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
 		for (mkdir = LIST_FIRST(&mkdirlisthd); mkdir; mkdir = nextmd) {
 			nextmd = LIST_NEXT(mkdir, md_mkdirs);
 			if (mkdir->md_diradd != dap)
 				continue;
 			dap->da_state &= ~mkdir->md_state;
 			WORKLIST_REMOVE(&mkdir->md_list);
 			LIST_REMOVE(mkdir, md_mkdirs);
 			WORKITEM_FREE(mkdir, D_MKDIR);
 		}
 		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
 			panic("free_diradd: unfound ref");
 	}
 	WORKITEM_FREE(dap, D_DIRADD);
 }
 
 /*
  * Directory entry removal dependencies.
  * 
  * When removing a directory entry, the entry's inode pointer must be
  * zero'ed on disk before the corresponding inode's link count is decremented
  * (possibly freeing the inode for re-use). This dependency is handled by
  * updating the directory entry but delaying the inode count reduction until
  * after the directory block has been written to disk. After this point, the
  * inode count can be decremented whenever it is convenient.
  */
 
 /*
  * This routine should be called immediately after removing
  * a directory entry.  The inode's link count should not be
  * decremented by the calling procedure -- the soft updates
  * code will do this task when it is safe.
  */
 void 
 softdep_setup_remove(bp, dp, ip, isrmdir)
 	struct buf *bp;		/* buffer containing directory block */
 	struct inode *dp;	/* inode for the directory being modified */
 	struct inode *ip;	/* inode for directory entry being removed */
 	int isrmdir;		/* indicates if doing RMDIR */
 {
 	struct dirrem *dirrem, *prevdirrem;
 
 	/*
 	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.
 	 */
 	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
 
 	/*
 	 * If the COMPLETE flag is clear, then there were no active
 	 * entries and we want to roll back to a zeroed entry until
 	 * the new inode is committed to disk. If the COMPLETE flag is
 	 * set then we have deleted an entry that never made it to
 	 * disk. If the entry we deleted resulted from a name change,
 	 * then the old name still resides on disk. We cannot delete
 	 * its inode (returned to us in prevdirrem) until the zeroed
 	 * directory entry gets to disk. The new inode has never been
 	 * referenced on the disk, so can be deleted immediately.
 	 */
 	if ((dirrem->dm_state & COMPLETE) == 0) {
 		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
 		    dm_next);
 		FREE_LOCK(&lk);
 	} else {
 		if (prevdirrem != NULL)
 			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
 			    prevdirrem, dm_next);
 		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
 		FREE_LOCK(&lk);
 		handle_workitem_remove(dirrem, NULL);
 	}
 }
 
 /*
  * Allocate a new dirrem if appropriate and return it along with
  * its associated pagedep. Called without a lock, returns with lock.
  */
 static long num_dirrem;		/* number of dirrem allocated */
 static struct dirrem *
 newdirrem(bp, dp, ip, isrmdir, prevdirremp)
 	struct buf *bp;		/* buffer containing directory block */
 	struct inode *dp;	/* inode for the directory being modified */
 	struct inode *ip;	/* inode for directory entry being removed */
 	int isrmdir;		/* indicates if doing RMDIR */
 	struct dirrem **prevdirremp; /* previously referenced inode, if any */
 {
 	int offset;
 	ufs_lbn_t lbn;
 	struct diradd *dap;
 	struct dirrem *dirrem;
 	struct pagedep *pagedep;
 
 	/*
 	 * Whiteouts have no deletion dependencies.
 	 */
 	if (ip == NULL)
 		panic("newdirrem: whiteout");
 	/*
 	 * If we are over our limit, try to improve the situation.
 	 * Limiting the number of dirrem structures will also limit
 	 * the number of freefile and freeblks structures.
 	 */
 	ACQUIRE_LOCK(&lk);
 	if (num_dirrem > max_softdeps / 2)
-		(void) request_cleanup(FLUSH_REMOVE);
+		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_REMOVE);
 	num_dirrem += 1;
 	FREE_LOCK(&lk);
 	MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem),
 		M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO);
-	dirrem->dm_list.wk_type = D_DIRREM;
+	workitem_alloc(&dirrem->dm_list, D_DIRREM, ITOV(dp)->v_mount);
 	dirrem->dm_state = isrmdir ? RMDIR : 0;
-	dirrem->dm_mnt = ITOV(ip)->v_mount;
 	dirrem->dm_oldinum = ip->i_number;
 	*prevdirremp = NULL;
 
 	ACQUIRE_LOCK(&lk);
 	lbn = lblkno(dp->i_fs, dp->i_offset);
 	offset = blkoff(dp->i_fs, dp->i_offset);
 	if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
 		WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
 	dirrem->dm_pagedep = pagedep;
 	/*
 	 * Check for a diradd dependency for the same directory entry.
 	 * If present, then both dependencies become obsolete and can
-	 * be de-allocated. Check for an entry on both the pd_dirraddhd
+	 * be de-allocated. Check for an entry on both the pd_diraddhd
 	 * list and the pd_pendinghd list.
 	 */
 
 	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
 		if (dap->da_offset == offset)
 			break;
 	if (dap == NULL) {
 
 		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
 			if (dap->da_offset == offset)
 				break;
 		if (dap == NULL)
 			return (dirrem);
 	}
 	/*
 	 * Must be ATTACHED at this point.
 	 */
 	if ((dap->da_state & ATTACHED) == 0)
 		panic("newdirrem: not ATTACHED");
 	if (dap->da_newinum != ip->i_number)
 		panic("newdirrem: inum %d should be %d",
 		    ip->i_number, dap->da_newinum);
 	/*
 	 * If we are deleting a changed name that never made it to disk,
 	 * then return the dirrem describing the previous inode (which
 	 * represents the inode currently referenced from this entry on disk).
 	 */
 	if ((dap->da_state & DIRCHG) != 0) {
 		*prevdirremp = dap->da_previous;
 		dap->da_state &= ~DIRCHG;
 		dap->da_pagedep = pagedep;
 	}
 	/*
 	 * We are deleting an entry that never made it to disk.
 	 * Mark it COMPLETE so we can delete its inode immediately.
 	 */
 	dirrem->dm_state |= COMPLETE;
 	free_diradd(dap);
 	return (dirrem);
 }
 
 /*
  * Directory entry change dependencies.
  * 
  * Changing an existing directory entry requires that an add operation
  * be completed first followed by a deletion. The semantics for the addition
  * are identical to the description of adding a new entry above except
  * that the rollback is to the old inode number rather than zero. Once
  * the addition dependency is completed, the removal is done as described
  * in the removal routine above.
  */
 
 /*
  * This routine should be called immediately after changing
  * a directory entry.  The inode's link count should not be
  * decremented by the calling procedure -- the soft updates
  * code will perform this task when it is safe.
  *
  * Unless the new entry is a whiteout (newinum == WINO), a diradd is
  * allocated for it before newdirrem() is called. Per the comment at
  * the call site, newdirrem() returns with the softdep lock (lk)
  * acquired; both exit paths of this routine release it with FREE_LOCK.
  */
 void 
 softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
 	struct buf *bp;		/* buffer containing directory block */
 	struct inode *dp;	/* inode for the directory being modified */
 	struct inode *ip;	/* inode for directory entry being removed */
 	ino_t newinum;		/* new inode number for changed entry */
 	int isrmdir;		/* indicates if doing RMDIR */
 {
 	int offset;
 	struct diradd *dap = NULL;
 	struct dirrem *dirrem, *prevdirrem;
 	struct pagedep *pagedep;
 	struct inodedep *inodedep;
+	struct mount *mp;
 
 	/* Offset of the changed entry within its directory block. */
 	offset = blkoff(dp->i_fs, dp->i_offset);
 	/* Mount point handed to workitem_alloc() and inodedep_lookup(). */
+	mp = UFSTOVFS(dp->i_ump);
 
 	/*
 	 * Whiteouts do not need diradd dependencies.
 	 */
 	if (newinum != WINO) {
 		MALLOC(dap, struct diradd *, sizeof(struct diradd),
 		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
-		dap->da_list.wk_type = D_DIRADD;
+		workitem_alloc(&dap->da_list, D_DIRADD, mp);
 		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
 		dap->da_offset = offset;
 		dap->da_newinum = newinum;
 	}
 
 	/*
 	 * Allocate a new dirrem and ACQUIRE_LOCK.
 	 */
 	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
 	pagedep = dirrem->dm_pagedep;
 	/*
 	 * The possible values for isrmdir:
 	 *	0 - non-directory file rename
 	 *	1 - directory rename within same directory
 	 *   inum - directory rename to new directory of given inode number
 	 * When renaming to a new directory, we are both deleting and
 	 * creating a new directory entry, so the link count on the new
 	 * directory should not change. Thus we do not need the followup
 	 * dirrem which is usually done in handle_workitem_remove. We set
 	 * the DIRCHG flag to tell handle_workitem_remove to skip the 
 	 * followup dirrem.
 	 */
 	if (isrmdir > 1)
 		dirrem->dm_state |= DIRCHG;
 
 	/*
 	 * Whiteouts have no additional dependencies,
 	 * so just put the dirrem on the correct list.
 	 */
 	if (newinum == WINO) {
 		if ((dirrem->dm_state & COMPLETE) == 0) {
 			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
 			    dm_next);
 		} else {
 			dirrem->dm_dirinum = pagedep->pd_ino;
 			add_to_worklist(&dirrem->dm_list);
 		}
 		FREE_LOCK(&lk);
 		return;
 	}
 
 	/*
 	 * If the COMPLETE flag is clear, then there were no active
 	 * entries and we want to roll back to the previous inode until
 	 * the new inode is committed to disk. If the COMPLETE flag is
 	 * set, then we have deleted an entry that never made it to disk.
 	 * If the entry we deleted resulted from a name change, then the old
 	 * inode reference still resides on disk. Any rollback that we do
 	 * needs to be to that old inode (returned to us in prevdirrem). If
 	 * the entry we deleted resulted from a create, then there is
 	 * no entry on the disk, so we want to roll back to zero rather
 	 * than the uncommitted inode. In either of the COMPLETE cases we
 	 * want to immediately free the unwritten and unreferenced inode.
 	 *
 	 * Note that dap is non-NULL from here on: the whiteout case, the
 	 * only one in which no diradd was allocated, has already returned.
 	 */
 	if ((dirrem->dm_state & COMPLETE) == 0) {
 		dap->da_previous = dirrem;
 	} else {
 		if (prevdirrem != NULL) {
 			dap->da_previous = prevdirrem;
 		} else {
 			dap->da_state &= ~DIRCHG;
 			dap->da_pagedep = pagedep;
 		}
 		dirrem->dm_dirinum = pagedep->pd_ino;
 		add_to_worklist(&dirrem->dm_list);
 	}
 	/*
 	 * Link into its inodedep. Put it on the id_bufwait list if the inode
 	 * is not yet written. If it is written, do the post-inode write
 	 * processing to put it on the id_pendinghd list.
 	 */
-	if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 ||
+	if (inodedep_lookup(mp, newinum, DEPALLOC, &inodedep) == 0 ||
 	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
 		dap->da_state |= COMPLETE;
 		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
 		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
 	} else {
 		/* Hash the diradd by its offset within the directory block. */
 		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
 		    dap, da_pdlist);
 		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
 	}
 	FREE_LOCK(&lk);
 }
 
 /*
  * Called whenever the link count on an inode is changed.
  * It creates an inode dependency so that the new reference(s)
  * to the inode cannot be committed to disk until the updated
  * inode has been written.
  *
  * While holding lk, the difference i_nlink - i_effnlink is stored in
  * the inodedep's id_nlinkdelta. Panics if i_nlink is smaller than
  * i_effnlink.
  */
 void
 softdep_change_linkcnt(ip)
 	struct inode *ip;	/* the inode with the increased link count */
 {
 	struct inodedep *inodedep;
 
 	ACQUIRE_LOCK(&lk);
 	/*
 	 * The return value is ignored and inodedep is used unconditionally
 	 * below; presumably DEPALLOC makes the lookup create the inodedep
 	 * when none exists -- TODO confirm against inodedep_lookup().
 	 */
-	(void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep);
+	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
+	    DEPALLOC, &inodedep);
 	if (ip->i_nlink < ip->i_effnlink)
 		panic("softdep_change_linkcnt: bad delta");
 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 	FREE_LOCK(&lk);
 }
 
 /*
  * Called when the effective link count and the reference count
  * on an inode drops to zero. At this point there are no names
  * referencing the file in the filesystem and no active file
  * references. The space associated with the file will be freed
  * as soon as the necessary soft dependencies are cleared.
  *
  * The file's blocks (less any UFS2 external attribute blocks) and
  * one inode are added to the filesystem's fs_pendingblocks and
  * fs_pendinginodes counters. IN_SPACECOUNTED is set on the inode so
  * that repeated calls do not count the space twice.
  */
 void
 softdep_releasefile(ip)
 	struct inode *ip;	/* inode with the zero effective link count */
 {
 	struct inodedep *inodedep;
 	struct fs *fs;
 	int extblocks;
 
 	/*
 	 * NOTE(review): the panic text below says "softdep_filerelease"
 	 * although this function is softdep_releasefile.
 	 */
 	if (ip->i_effnlink > 0)
 		panic("softdep_filerelease: file still referenced");
 	/*
 	 * We may be called several times as the real reference count
 	 * drops to zero. We only want to account for the space once.
 	 */
 	if (ip->i_flag & IN_SPACECOUNTED)
 		return;
 	/*
 	 * We have to deactivate a snapshot otherwise copyonwrites may
 	 * add blocks and the cleanup may remove blocks after we have
 	 * tried to account for them.
 	 */
 	if ((ip->i_flags & SF_SNAPSHOT) != 0)
 		ffs_snapremove(ITOV(ip));
 	/*
 	 * If we are tracking an nlinkdelta, we have to also remember
 	 * whether we accounted for the freed space yet.
 	 */
 	ACQUIRE_LOCK(&lk);
 	/* Only an already existing inodedep is marked; none is created. */
-	if ((inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep)))
+	if ((inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep)))
 		inodedep->id_state |= SPACECOUNTED;
 	FREE_LOCK(&lk);
 	fs = ip->i_fs;
 	/*
 	 * For UFS2, compute the disk blocks occupied by the external
 	 * attribute area; they are excluded from the pending block count.
 	 */
 	extblocks = 0;
 	if (fs->fs_magic == FS_UFS2_MAGIC)
 		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
 	/* The pending counters are updated under the per-mount UFS lock. */
 	UFS_LOCK(ip->i_ump);
 	ip->i_fs->fs_pendingblocks += DIP(ip, i_blocks) - extblocks;
 	ip->i_fs->fs_pendinginodes += 1;
 	UFS_UNLOCK(ip->i_ump);
 	ip->i_flag |= IN_SPACECOUNTED;
 }
 
 /*
  * This workitem decrements the inode's link count.
  * If the link count reaches zero, the file is removed.
  *
  * dirrem describes the removed directory entry. If xp is non-NULL it
  * is used as the vnode of the removed inode; otherwise the vnode for
  * dm_oldinum is obtained with ffs_vget() (LK_EXCLUSIVE). The vnode is
  * released with vput() on every path that got past the lookup.
  *
  * For a directory removal that is not a rename to a new parent, the
  * same dirrem is reused afterwards to drop the parent's link count:
  * either at once through a recursive call, or later by queueing it on
  * the inodedep's id_inowait list.
  */
 static void 
 handle_workitem_remove(dirrem, xp)
 	struct dirrem *dirrem;
 	struct vnode *xp;
 {
 	struct thread *td = curthread;
 	struct inodedep *inodedep;
 	struct vnode *vp;
 	struct inode *ip;
 	ino_t oldinum;
 	int error;
 
 	/* Use the caller's vnode when given, else look the inode up. */
 	if ((vp = xp) == NULL &&
-	    (error = ffs_vget(dirrem->dm_mnt, dirrem->dm_oldinum, LK_EXCLUSIVE,
-	     &vp)) != 0) {
+	    (error = ffs_vget(dirrem->dm_list.wk_mp,
+	    dirrem->dm_oldinum, LK_EXCLUSIVE, &vp)) != 0) {
 		softdep_error("handle_workitem_remove: vget", error);
 		return;
 	}
 	ip = VTOI(vp);
 	ACQUIRE_LOCK(&lk);
 	/* The inodedep for the removed inode must exist at this point. */
-	if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0)
+	if ((inodedep_lookup(dirrem->dm_list.wk_mp,
+	    dirrem->dm_oldinum, 0, &inodedep)) == 0)
 		panic("handle_workitem_remove: lost inodedep");
 	/*
 	 * Normal file deletion.
 	 */
 	if ((dirrem->dm_state & RMDIR) == 0) {
 		ip->i_nlink--;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 		if (ip->i_nlink < ip->i_effnlink)
 			panic("handle_workitem_remove: bad file delta");
 		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 		num_dirrem -= 1;
+		WORKITEM_FREE(dirrem, D_DIRREM);
 		FREE_LOCK(&lk);
 		vput(vp);
-		WORKITEM_FREE(dirrem, D_DIRREM);
 		return;
 	}
 	/*
 	 * Directory deletion. Decrement reference count for both the
 	 * just deleted parent directory entry and the reference for ".".
 	 * Next truncate the directory to length zero. When the
 	 * truncation completes, arrange to have the reference count on
 	 * the parent decremented to account for the loss of "..".
 	 */
 	ip->i_nlink -= 2;
 	DIP_SET(ip, i_nlink, ip->i_nlink);
 	ip->i_flag |= IN_CHANGE;
 	if (ip->i_nlink < ip->i_effnlink)
 		panic("handle_workitem_remove: bad dir delta");
 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 	/* lk is dropped across the truncation and retaken afterwards. */
 	FREE_LOCK(&lk);
 	if ((error = ffs_truncate(vp, (off_t)0, 0, td->td_ucred, td)) != 0)
 		softdep_error("handle_workitem_remove: truncate", error);
 	ACQUIRE_LOCK(&lk);
 	/*
 	 * Rename a directory to a new parent. Since we are both deleting
 	 * and creating a new directory entry, the link count on the new
 	 * directory should not change. Thus we skip the followup dirrem.
 	 */
 	if (dirrem->dm_state & DIRCHG) {
 		num_dirrem -= 1;
+		WORKITEM_FREE(dirrem, D_DIRREM);
 		FREE_LOCK(&lk);
 		vput(vp);
-		WORKITEM_FREE(dirrem, D_DIRREM);
 		return;
 	}
 	/*
 	 * If the inodedep does not exist, then the zero'ed inode has
 	 * been written to disk. If the allocated inode has never been
 	 * written to disk, then the on-disk inode is zero'ed. In either
 	 * case we can remove the file immediately.
 	 *
 	 * The dirrem is first retargeted at the parent directory: its
 	 * state is cleared (so it is handled as a normal file deletion)
 	 * and dm_oldinum is replaced by dm_dirinum.
 	 */
 	dirrem->dm_state = 0;
 	oldinum = dirrem->dm_oldinum;
 	dirrem->dm_oldinum = dirrem->dm_dirinum;
-	if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 ||
-	    check_inode_unwritten(inodedep)) {
+	if (inodedep_lookup(dirrem->dm_list.wk_mp, oldinum,
+	    0, &inodedep) == 0 || check_inode_unwritten(inodedep)) {
 		FREE_LOCK(&lk);
 		vput(vp);
 		/* Process the parent's decrement now; its vnode is looked up. */
 		handle_workitem_remove(dirrem, NULL);
 		return;
 	}
 	/* Otherwise defer the parent's decrement via id_inowait. */
 	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
 	FREE_LOCK(&lk);
 	ip->i_flag |= IN_CHANGE;
 	ffs_update(vp, 0);
 	vput(vp);
 }
 
 /*
  * Inode de-allocation dependencies.
  * 
  * When an inode's link count is reduced to zero, it can be de-allocated. We
  * found it convenient to postpone de-allocation until after the inode is
  * written to disk with its new link count (zero).  At this point, all of the
  * on-disk inode's block pointers are nullified and, with careful dependency
  * list ordering, all dependencies related to the inode will be satisfied and
  * the corresponding dependency structures de-allocated.  So, if/when the
  * inode is reused, there will be no mixing of old dependencies with new
  * ones.  This artificial dependency is set up by the block de-allocation
  * procedure above (softdep_setup_freeblocks) and completed by the
  * following procedure.
  *
  * handle_workitem_freefile() decrements the filesystem's pending inode
  * count, releases the inode with ffs_freefile() and then frees the
  * freefile workitem itself. An ffs_freefile() failure is reported
  * through softdep_error() and the workitem is still freed.
  */
 static void 
 handle_workitem_freefile(freefile)
 	struct freefile *freefile;
 {
 	struct fs *fs;
 	struct inodedep *idp;
 	struct ufsmount *ump;
 	int error;
 
-	ump = VFSTOUFS(freefile->fx_mnt);
+	ump = VFSTOUFS(freefile->fx_list.wk_mp);
 	fs = ump->um_fs;
 #ifdef DEBUG
 	/*
 	 * Sanity check: a nonzero lookup result means an inodedep for
 	 * this inode still exists, which is treated as fatal here.
 	 */
 	ACQUIRE_LOCK(&lk);
-	error = inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp);
+	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
 	FREE_LOCK(&lk);
 	if (error)
 		panic("handle_workitem_freefile: inodedep survived");
 #endif
 	/* The pending inode count is adjusted under the per-mount UFS lock. */
 	UFS_LOCK(ump);
 	fs->fs_pendinginodes -= 1;
 	UFS_UNLOCK(ump);
-	if ((error = ffs_freefile(VFSTOUFS(freefile->fx_mnt), fs,
-	    freefile->fx_devvp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
+	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
+	    freefile->fx_oldinum, freefile->fx_mode)) != 0)
 		softdep_error("handle_workitem_freefile", error);
+	ACQUIRE_LOCK(&lk);
 	WORKITEM_FREE(freefile, D_FREEFILE);
+	FREE_LOCK(&lk);
 }
 
 
 /*
  * Detach a marker element from the work list it currently sits on
  * and hand back the element that followed it (NULL when the marker
  * was the last element).
  */
 static __inline struct worklist *
 markernext(struct worklist *marker)
 {
 	struct worklist *successor = LIST_NEXT(marker, wk_list);

 	/* The successor was captured first; unlinking does not disturb it. */
 	LIST_REMOVE(marker, wk_list);
 	return (successor);
 }
 
 /*
  * Disk writes.
  * 
  * The dependency structures constructed above are most actively used when file
  * system blocks are written to disk.  No constraints are placed on when a
  * block can be written, but unsatisfied update dependencies are made safe by
  * modifying (or replacing) the source memory for the duration of the disk
  * write.  When the disk write completes, the memory block is again brought
  * up-to-date.
  *
  * In-core inode structure reclamation.
  * 
  * Because there are a finite number of "in-core" inode structures, they are
  * reused regularly.  By transferring all inode-related dependencies to the
  * in-memory inode block and indexing them separately (via "inodedep"s), we
  * can allow "in-core" inode structures to be reused at any time and avoid
  * any increase in contention.
  *
  * Called just before entering the device driver to initiate a new disk I/O.
  * The buffer must be locked, thus, no I/O completion operations can occur
  * while we are manipulating its associated dependencies.
  *
  * The buffer's dependency list is walked with the help of a marker
  * element: the marker is linked in right after the item about to be
  * processed, and markernext() later unlinks it and yields whatever
  * follows it. This keeps the walk valid even though processing an item
  * may remove and free that item and may drop and retake lk.
  */
 static void 
 softdep_disk_io_initiation(bp)
 	struct buf *bp;		/* structure describing disk write to occur */
 {
 	struct worklist *wk;
 	struct worklist marker;
 	struct indirdep *indirdep;
 	struct inodedep *inodedep;
 
 	/*
 	 * We only care about write operations. There should never
 	 * be dependencies for reads.
 	 */
 	if (bp->b_iocmd != BIO_WRITE)
 		panic("softdep_disk_io_initiation: not write");
 
 	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
 	PHOLD(curproc);			/* Don't swap out kernel stack */
 
 	ACQUIRE_LOCK(&lk);
 	/*
 	 * Do any necessary pre-I/O processing.
 	 */
 	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
 	     wk = markernext(&marker)) {
 		LIST_INSERT_AFTER(wk, &marker, wk_list);
 		switch (wk->wk_type) {
 
 		case D_PAGEDEP:
 			initiate_write_filepage(WK_PAGEDEP(wk), bp);
 			continue;
 
 		case D_INODEDEP:
 			/* Pick the rollback routine matching the dinode format. */
 			inodedep = WK_INODEDEP(wk);
 			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
 				initiate_write_inodeblock_ufs1(inodedep, bp);
 			else
 				initiate_write_inodeblock_ufs2(inodedep, bp);
 			continue;
 
 		case D_INDIRDEP:
 			indirdep = WK_INDIRDEP(wk);
 			if (indirdep->ir_state & GOINGAWAY)
 				panic("disk_io_initiation: indirdep gone");
 			/*
 			 * If there are no remaining dependencies, this
 			 * will be writing the real pointers, so the
 			 * dependency can be freed.
 			 */
 			if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
 				/*
 				 * Named sbp so that it does not shadow the
 				 * bp parameter, which is still needed below.
 				 */
 				struct buf *sbp;
 
 				sbp = indirdep->ir_savebp;
 				sbp->b_flags |= B_INVAL | B_NOCACHE;
 				/* inline expand WORKLIST_REMOVE(wk); */
 				wk->wk_state &= ~ONWORKLIST;
 				LIST_REMOVE(wk, wk_list);
 				WORKITEM_FREE(indirdep, D_INDIRDEP);
 				/* brelse() is called with lk released. */
 				FREE_LOCK(&lk);
 				brelse(sbp);
 				ACQUIRE_LOCK(&lk);
 				continue;
 			}
 			/*
 			 * Replace up-to-date version with safe version.
 			 * The allocation is done with lk released.
 			 */
 			FREE_LOCK(&lk);
 			MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount,
 			    M_INDIRDEP, M_SOFTDEP_FLAGS);
 			ACQUIRE_LOCK(&lk);
 			indirdep->ir_state &= ~ATTACHED;
 			indirdep->ir_state |= UNDONE;
 			/* Stash the current contents, then write the saved copy. */
 			bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
 			bcopy(indirdep->ir_savebp->b_data, bp->b_data,
 			    bp->b_bcount);
 			continue;
 
 		case D_MKDIR:
 		case D_BMSAFEMAP:
 		case D_ALLOCDIRECT:
 		case D_ALLOCINDIR:
 			/* Nothing to do before the write for these types. */
 			continue;
 
 		default:
 			panic("handle_disk_io_initiation: Unexpected type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 	FREE_LOCK(&lk);
 	PRELE(curproc);			/* Allow swapout of kernel stack */
 }
 
 /*
  * Roll back the entries of a directory block that still have
  * unsatisfied diradd dependencies, just before the block is written.
  * Invoked from softdep_disk_io_initiation() above for each pagedep
  * attached to the buffer. The buffer must be locked, so no I/O
  * completion operations can occur while its dependencies are being
  * manipulated.
  */
 static void
 initiate_write_filepage(pagedep, bp)
 	struct pagedep *pagedep;
 	struct buf *bp;
 {
 	struct diradd *dap;
 	struct direct *ep;
 	int bucket;

 	if ((pagedep->pd_state & IOSTARTED) != 0) {
 		/*
 		 * Only a driver that does not understand chaining can
 		 * get us here. In that case biodone reissues the call
 		 * to strategy for the incomplete buffers.
 		 */
 		printf("initiate_write_filepage: already started\n");
 		return;
 	}
 	pagedep->pd_state |= IOSTARTED;
 	/* Visit every diradd in every hash bucket of this pagedep. */
 	for (bucket = 0; bucket < DAHASHSZ; bucket++) {
 		LIST_FOREACH(dap, &pagedep->pd_diraddhd[bucket], da_pdlist) {
 			ep = (struct direct *)
 			    ((char *)bp->b_data + dap->da_offset);
 			if (ep->d_ino != dap->da_newinum)
 				panic("%s: dir inum %d != new %d",
 				    "initiate_write_filepage",
 				    ep->d_ino, dap->da_newinum);
 			/*
 			 * A changed entry reverts to the inode number held
 			 * by its previous dirrem; any other entry reverts
 			 * to zero.
 			 */
 			ep->d_ino = (dap->da_state & DIRCHG) != 0 ?
 			    dap->da_previous->dm_oldinum : 0;
 			dap->da_state = (dap->da_state & ~ATTACHED) | UNDONE;
 		}
 	}
 }
 
 /*
  * Version of initiate_write_inodeblock that handles UFS1 dinodes.
  * Note that any bug fixes made to this routine must be done in the
  * version found below.
  *
  * Called from within the procedure above to deal with unsatisfied
  * allocation dependencies in an inodeblock. The buffer must be
  * locked, thus, no I/O completion operations can occur while we
  * are manipulating its associated dependencies.
  *
  * The routine edits the copy of the dinode inside the buffer about to
  * be written: the whole dinode is zeroed (keeping di_gen) when the
  * inode's bitmap dependency is not complete, otherwise block pointers
  * and the size are rolled back according to the allocdirects queued
  * on id_inoupdt. The original size is remembered in id_savedsize.
  */
 static void 
 initiate_write_inodeblock_ufs1(inodedep, bp)
 	struct inodedep *inodedep;
 	struct buf *bp;			/* The inode block */
 {
 	struct allocdirect *adp, *lastadp;
 	struct ufs1_dinode *dp;
 	struct ufs1_dinode *sip;
 	struct fs *fs;
 	ufs_lbn_t i, prevlbn = 0;
 	int deplist;
 
 	if (inodedep->id_state & IOSTARTED)
 		panic("initiate_write_inodeblock_ufs1: already started");
 	inodedep->id_state |= IOSTARTED;
 	fs = inodedep->id_fs;
 	/* Locate this inode's dinode within the inode block buffer. */
 	dp = (struct ufs1_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, inodedep->id_ino);
 	/*
 	 * If the bitmap is not yet written, then the allocated
 	 * inode cannot be written to disk.
 	 */
 	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 		if (inodedep->id_savedino1 != NULL)
 			panic("initiate_write_inodeblock_ufs1: I/O underway");
 		/* The save area is allocated with lk released. */
 		FREE_LOCK(&lk);
 		MALLOC(sip, struct ufs1_dinode *,
 		    sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
 		ACQUIRE_LOCK(&lk);
 		/* Save the real dinode, then write zeros with di_gen kept. */
 		inodedep->id_savedino1 = sip;
 		*inodedep->id_savedino1 = *dp;
 		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
 		dp->di_gen = inodedep->id_savedino1->di_gen;
 		return;
 	}
 	/*
 	 * If no dependencies, then there is nothing to roll back.
 	 */
 	inodedep->id_savedsize = dp->di_size;
 	inodedep->id_savedextsize = 0;
 	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL)
 		return;
 	/*
 	 * Set the dependencies to busy.
 	 *
 	 * Under DIAGNOSTIC, deplist collects one bit per logical block
 	 * number that has a dependency; it is consulted by the "lost dep"
 	 * checks further down.
 	 */
 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 	     adp = TAILQ_NEXT(adp, ad_next)) {
 #ifdef DIAGNOSTIC
 		if (deplist != 0 && prevlbn >= adp->ad_lbn)
 			panic("softdep_write_inodeblock: lbn order");
 		prevlbn = adp->ad_lbn;
 		if (adp->ad_lbn < NDADDR &&
 		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
 			panic("%s: direct pointer #%jd mismatch %d != %jd",
 			    "softdep_write_inodeblock",
 			    (intmax_t)adp->ad_lbn,
 			    dp->di_db[adp->ad_lbn],
 			    (intmax_t)adp->ad_newblkno);
 		if (adp->ad_lbn >= NDADDR &&
 		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
 			panic("%s: indirect pointer #%jd mismatch %d != %jd",
 			    "softdep_write_inodeblock",
 			    (intmax_t)adp->ad_lbn - NDADDR,
 			    dp->di_ib[adp->ad_lbn - NDADDR],
 			    (intmax_t)adp->ad_newblkno);
 		deplist |= 1 << adp->ad_lbn;
 		if ((adp->ad_state & ATTACHED) == 0)
 			panic("softdep_write_inodeblock: Unknown state 0x%x",
 			    adp->ad_state);
 #endif /* DIAGNOSTIC */
 		adp->ad_state &= ~ATTACHED;
 		adp->ad_state |= UNDONE;
 	}
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
 	 * might have fragments that were not the last block in the file
 	 * which would corrupt the filesystem.
 	 */
 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 		/* Indirect block entries are handled by the final loop. */
 		if (adp->ad_lbn >= NDADDR)
 			break;
 		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
 		/* keep going until hitting a rollback to a frag */
 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 			continue;
 		/*
 		 * Rolled back to a fragment: the file ends here, so every
 		 * later direct pointer and all indirect pointers are cleared.
 		 */
 		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
 		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
 #ifdef DIAGNOSTIC
 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
 				panic("softdep_write_inodeblock: lost dep1");
 #endif /* DIAGNOSTIC */
 			dp->di_db[i] = 0;
 		}
 		for (i = 0; i < NIADDR; i++) {
 #ifdef DIAGNOSTIC
 			if (dp->di_ib[i] != 0 &&
 			    (deplist & ((1 << NDADDR) << i)) == 0)
 				panic("softdep_write_inodeblock: lost dep2");
 #endif /* DIAGNOSTIC */
 			dp->di_ib[i] = 0;
 		}
 		return;
 	}
 	/*
 	 * If we have zero'ed out the last allocated block of the file,
 	 * roll back the size to the last currently allocated block.
 	 * We know that this last allocated block is full-sized as
 	 * we already checked for fragments in the loop above.
 	 */
 	if (lastadp != NULL &&
 	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
 		/* Scan downward for the highest non-zero direct pointer. */
 		for (i = lastadp->ad_lbn; i >= 0; i--)
 			if (dp->di_db[i] != 0)
 				break;
 		dp->di_size = (i + 1) * fs->fs_bsize;
 	}
 	/*
 	 * The only dependencies are for indirect blocks.
 	 *
 	 * The file size for indirect block additions is not guaranteed.
 	 * Such a guarantee would be non-trivial to achieve. The conventional
 	 * synchronous write implementation also does not make this guarantee.
 	 * Fsck should catch and fix discrepancies. Arguably, the file size
 	 * can be over-estimated without destroying integrity when the file
 	 * moves into the indirect blocks (i.e., is large). If we want to
 	 * postpone fsck, we are stuck with this argument.
 	 */
 	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
 		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
 }
 		
 /*
  * Version of initiate_write_inodeblock that handles UFS2 dinodes.
  * Note that any bug fixes made to this routine must be done in the
  * version found above.
  *
  * Called from within the procedure above to deal with unsatisfied
  * allocation dependencies in an inodeblock. The buffer must be
  * locked, thus, no I/O completion operations can occur while we
  * are manipulating its associated dependencies.
  *
  * Compared with the UFS1 version, this routine additionally rolls
  * back the external attribute area (di_extb / di_extsize) using the
  * allocdirects on id_extupdt before it handles the file data
  * allocdirects on id_inoupdt. The original sizes are remembered in
  * id_savedsize and id_savedextsize.
  */
 static void 
 initiate_write_inodeblock_ufs2(inodedep, bp)
 	struct inodedep *inodedep;
 	struct buf *bp;			/* The inode block */
 {
 	struct allocdirect *adp, *lastadp;
 	struct ufs2_dinode *dp;
 	struct ufs2_dinode *sip;
 	struct fs *fs;
 	ufs_lbn_t i, prevlbn = 0;
 	int deplist;
 
 	if (inodedep->id_state & IOSTARTED)
 		panic("initiate_write_inodeblock_ufs2: already started");
 	inodedep->id_state |= IOSTARTED;
 	fs = inodedep->id_fs;
 	/* Locate this inode's dinode within the inode block buffer. */
 	dp = (struct ufs2_dinode *)bp->b_data +
 	    ino_to_fsbo(fs, inodedep->id_ino);
 	/*
 	 * If the bitmap is not yet written, then the allocated
 	 * inode cannot be written to disk.
 	 */
 	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 		if (inodedep->id_savedino2 != NULL)
 			panic("initiate_write_inodeblock_ufs2: I/O underway");
 		/* The save area is allocated with lk released. */
 		FREE_LOCK(&lk);
 		MALLOC(sip, struct ufs2_dinode *,
 		    sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS);
 		ACQUIRE_LOCK(&lk);
 		/* Save the real dinode, then write zeros with di_gen kept. */
 		inodedep->id_savedino2 = sip;
 		*inodedep->id_savedino2 = *dp;
 		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
 		dp->di_gen = inodedep->id_savedino2->di_gen;
 		return;
 	}
 	/*
 	 * If no dependencies, then there is nothing to roll back.
 	 */
 	inodedep->id_savedsize = dp->di_size;
 	inodedep->id_savedextsize = dp->di_extsize;
 	if (TAILQ_FIRST(&inodedep->id_inoupdt) == NULL &&
 	    TAILQ_FIRST(&inodedep->id_extupdt) == NULL)
 		return;
 	/*
 	 * Set the ext data dependencies to busy.
 	 *
 	 * Under DIAGNOSTIC, deplist collects one bit per logical block
 	 * number that has a dependency; it is consulted by the "lost dep"
 	 * checks further down.
 	 */
 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
 	     adp = TAILQ_NEXT(adp, ad_next)) {
 #ifdef DIAGNOSTIC
 		if (deplist != 0 && prevlbn >= adp->ad_lbn)
 			panic("softdep_write_inodeblock: lbn order");
 		prevlbn = adp->ad_lbn;
 		if (dp->di_extb[adp->ad_lbn] != adp->ad_newblkno)
 			panic("%s: direct pointer #%jd mismatch %jd != %jd",
 			    "softdep_write_inodeblock",
 			    (intmax_t)adp->ad_lbn,
 			    (intmax_t)dp->di_extb[adp->ad_lbn],
 			    (intmax_t)adp->ad_newblkno);
 		deplist |= 1 << adp->ad_lbn;
 		if ((adp->ad_state & ATTACHED) == 0)
 			panic("softdep_write_inodeblock: Unknown state 0x%x",
 			    adp->ad_state);
 #endif /* DIAGNOSTIC */
 		adp->ad_state &= ~ATTACHED;
 		adp->ad_state |= UNDONE;
 	}
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
 	 * might have fragments that were not the last block in the ext
 	 * data which would corrupt the filesystem.
 	 */
 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 		dp->di_extb[adp->ad_lbn] = adp->ad_oldblkno;
 		/* keep going until hitting a rollback to a frag */
 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 			continue;
 		dp->di_extsize = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
 		for (i = adp->ad_lbn + 1; i < NXADDR; i++) {
 #ifdef DIAGNOSTIC
 			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
 				panic("softdep_write_inodeblock: lost dep1");
 #endif /* DIAGNOSTIC */
 			dp->di_extb[i] = 0;
 		}
 		/*
 		 * The ext size has been fixed above; clearing lastadp
 		 * skips the size adjustment that follows this loop. Unlike
 		 * the file data loop below, we break rather than return so
 		 * that the file data dependencies are still processed.
 		 */
 		lastadp = NULL;
 		break;
 	}
 	/*
 	 * If we have zero'ed out the last allocated block of the ext
 	 * data, roll back the size to the last currently allocated block.
 	 * We know that this last allocated block is full-sized as
 	 * we already checked for fragments in the loop above.
 	 */
 	if (lastadp != NULL &&
 	    dp->di_extsize <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
 		/* Scan downward for the highest non-zero ext pointer. */
 		for (i = lastadp->ad_lbn; i >= 0; i--)
 			if (dp->di_extb[i] != 0)
 				break;
 		dp->di_extsize = (i + 1) * fs->fs_bsize;
 	}
 	/*
 	 * Set the file data dependencies to busy.
 	 */
 	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 	     adp = TAILQ_NEXT(adp, ad_next)) {
 #ifdef DIAGNOSTIC
 		if (deplist != 0 && prevlbn >= adp->ad_lbn)
 			panic("softdep_write_inodeblock: lbn order");
 		prevlbn = adp->ad_lbn;
 		if (adp->ad_lbn < NDADDR &&
 		    dp->di_db[adp->ad_lbn] != adp->ad_newblkno)
 			panic("%s: direct pointer #%jd mismatch %jd != %jd",
 			    "softdep_write_inodeblock",
 			    (intmax_t)adp->ad_lbn,
 			    (intmax_t)dp->di_db[adp->ad_lbn],
 			    (intmax_t)adp->ad_newblkno);
 		if (adp->ad_lbn >= NDADDR &&
 		    dp->di_ib[adp->ad_lbn - NDADDR] != adp->ad_newblkno)
 			panic("%s indirect pointer #%jd mismatch %jd != %jd",
 			    "softdep_write_inodeblock:",
 			    (intmax_t)adp->ad_lbn - NDADDR,
 			    (intmax_t)dp->di_ib[adp->ad_lbn - NDADDR],
 			    (intmax_t)adp->ad_newblkno);
 		deplist |= 1 << adp->ad_lbn;
 		if ((adp->ad_state & ATTACHED) == 0)
 			panic("softdep_write_inodeblock: Unknown state 0x%x",
 			    adp->ad_state);
 #endif /* DIAGNOSTIC */
 		adp->ad_state &= ~ATTACHED;
 		adp->ad_state |= UNDONE;
 	}
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
 	 * might have fragments that were not the last block in the file
 	 * which would corrupt the filesystem.
 	 */
 	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
 	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
 		/* Indirect block entries are handled by the final loop. */
 		if (adp->ad_lbn >= NDADDR)
 			break;
 		dp->di_db[adp->ad_lbn] = adp->ad_oldblkno;
 		/* keep going until hitting a rollback to a frag */
 		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
 			continue;
 		/*
 		 * Rolled back to a fragment: the file ends here, so every
 		 * later direct pointer and all indirect pointers are cleared.
 		 */
 		dp->di_size = fs->fs_bsize * adp->ad_lbn + adp->ad_oldsize;
 		for (i = adp->ad_lbn + 1; i < NDADDR; i++) {
 #ifdef DIAGNOSTIC
 			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
 				panic("softdep_write_inodeblock: lost dep2");
 #endif /* DIAGNOSTIC */
 			dp->di_db[i] = 0;
 		}
 		for (i = 0; i < NIADDR; i++) {
 #ifdef DIAGNOSTIC
 			if (dp->di_ib[i] != 0 &&
 			    (deplist & ((1 << NDADDR) << i)) == 0)
 				panic("softdep_write_inodeblock: lost dep3");
 #endif /* DIAGNOSTIC */
 			dp->di_ib[i] = 0;
 		}
 		return;
 	}
 	/*
 	 * If we have zero'ed out the last allocated block of the file,
 	 * roll back the size to the last currently allocated block.
 	 * We know that this last allocated block is full-sized as
 	 * we already checked for fragments in the loop above.
 	 */
 	if (lastadp != NULL &&
 	    dp->di_size <= (lastadp->ad_lbn + 1) * fs->fs_bsize) {
 		/* Scan downward for the highest non-zero direct pointer. */
 		for (i = lastadp->ad_lbn; i >= 0; i--)
 			if (dp->di_db[i] != 0)
 				break;
 		dp->di_size = (i + 1) * fs->fs_bsize;
 	}
 	/*
 	 * The only dependencies are for indirect blocks.
 	 *
 	 * The file size for indirect block additions is not guaranteed.
 	 * Such a guarantee would be non-trivial to achieve. The conventional
 	 * synchronous write implementation also does not make this guarantee.
 	 * Fsck should catch and fix discrepancies. Arguably, the file size
 	 * can be over-estimated without destroying integrity when the file
 	 * moves into the indirect blocks (i.e., is large). If we want to
 	 * postpone fsck, we are stuck with this argument.
 	 */
 	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
 		dp->di_ib[adp->ad_lbn - NDADDR] = 0;
 }
 
 /*
  * This routine is called during the completion interrupt
  * service routine for a disk write (from the procedure called
  * by the device driver to inform the filesystem caches of
  * a request completion).  It should be called early in this
  * procedure, before the block is made available to other
  * processes or other routines are called.
  *
  * Every workitem is taken off the buffer's dependency list and
  * processed according to its type. Items that still have work left
  * are parked on a local "reattach" list and put back on the buffer
  * once the whole list has been drained.
  */
 static void 
 softdep_disk_write_complete(bp)
 	struct buf *bp;		/* describes the completed disk write */
 {
 	struct worklist *wk;
 	struct worklist *owk;
 	struct workhead reattach;
 	struct newblk *newblk;
 	struct allocindir *aip;
 	struct allocdirect *adp;
 	struct indirdep *indirdep;
 	struct inodedep *inodedep;
 	struct bmsafemap *bmsafemap;
 
 	/*
 	 * If an error occurred while doing the write, then the data
 	 * has not hit the disk and the dependencies cannot be unrolled.
 	 */
 	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
 		return;
 	LIST_INIT(&reattach);
 	/*
 	 * This lock must not be released anywhere in this code segment.
 	 */
 	ACQUIRE_LOCK(&lk);
 	/* owk remembers the previous item to catch a list that never drains. */
 	owk = NULL;
 	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		if (wk == owk)
 			panic("duplicate worklist: %p\n", wk);
 		owk = wk;
 		switch (wk->wk_type) {
 
 		case D_PAGEDEP:
 			/* A nonzero return means the pagedep must stay attached. */
 			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_INODEDEP:
 			/* A nonzero return means the inodedep must stay attached. */
 			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
 				WORKLIST_INSERT(&reattach, wk);
 			continue;
 
 		case D_BMSAFEMAP:
 			/*
 			 * Mark everything hanging off the bmsafemap as
 			 * DEPCOMPLETE, detach it, and then free the
 			 * bmsafemap itself.
 			 */
 			bmsafemap = WK_BMSAFEMAP(wk);
 			while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkhd))) {
 				newblk->nb_state |= DEPCOMPLETE;
 				newblk->nb_bmsafemap = NULL;
 				LIST_REMOVE(newblk, nb_deps);
 			}
 			while ((adp =
 			   LIST_FIRST(&bmsafemap->sm_allocdirecthd))) {
 				adp->ad_state |= DEPCOMPLETE;
 				adp->ad_buf = NULL;
 				LIST_REMOVE(adp, ad_deps);
 				handle_allocdirect_partdone(adp);
 			}
 			while ((aip =
 			    LIST_FIRST(&bmsafemap->sm_allocindirhd))) {
 				aip->ai_state |= DEPCOMPLETE;
 				aip->ai_buf = NULL;
 				LIST_REMOVE(aip, ai_deps);
 				handle_allocindir_partdone(aip);
 			}
 			while ((inodedep =
 			     LIST_FIRST(&bmsafemap->sm_inodedephd)) != NULL) {
 				inodedep->id_state |= DEPCOMPLETE;
 				LIST_REMOVE(inodedep, id_deps);
 				inodedep->id_buf = NULL;
 			}
 			WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
 			continue;
 
 		case D_MKDIR:
 			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
 			continue;
 
 		case D_ALLOCDIRECT:
 			adp = WK_ALLOCDIRECT(wk);
 			adp->ad_state |= COMPLETE;
 			handle_allocdirect_partdone(adp);
 			continue;
 
 		case D_ALLOCINDIR:
 			aip = WK_ALLOCINDIR(wk);
 			aip->ai_state |= COMPLETE;
 			handle_allocindir_partdone(aip);
 			continue;
 
 		case D_INDIRDEP:
 			indirdep = WK_INDIRDEP(wk);
 			if (indirdep->ir_state & GOINGAWAY)
 				panic("disk_write_complete: indirdep gone");
 			/*
 			 * Copy the saved contents back into the buffer and
 			 * release the save area.
 			 */
 			bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
 			FREE(indirdep->ir_saveddata, M_INDIRDEP);
 			indirdep->ir_saveddata = 0;
 			indirdep->ir_state &= ~UNDONE;
 			indirdep->ir_state |= ATTACHED;
 			/*
 			 * Each call is expected to take aip off ir_donehd;
 			 * if it is still at the head, the loop could never
 			 * terminate, so panic.
 			 */
 			while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
 				handle_allocindir_partdone(aip);
 				if (aip == LIST_FIRST(&indirdep->ir_donehd))
 					panic("disk_write_complete: not gone");
 			}
 			/* Keep the indirdep on the buffer and redirty it. */
 			WORKLIST_INSERT(&reattach, wk);
 			if ((bp->b_flags & B_DELWRI) == 0)
 				stat_indir_blk_ptrs++;
 			bdirty(bp);
 			continue;
 
 		default:
 			panic("handle_disk_write_complete: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 	/*
 	 * Reattach any requests that must be redone.
 	 */
 	while ((wk = LIST_FIRST(&reattach)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		WORKLIST_INSERT(&bp->b_dep, wk);
 	}
 	FREE_LOCK(&lk);
 }
 
 /*
  * Called from within softdep_disk_write_complete above. Note that
  * this routine is always called from interrupt level with further
  * splbio interrupts blocked.
  *
  * Does nothing unless adp has reached ALLCOMPLETE. Otherwise it walks
  * the inode's current allocdirect list (id_extupdt for EXTDATA entries,
  * id_inoupdt for all others). If adp is on that list and no entry ahead
  * of it has a fragment-sized ad_oldsize, adp and the run of ALLCOMPLETE
  * entries that immediately follow it are handed to free_allocdirect().
  */
 static void 
 handle_allocdirect_partdone(adp)
 	struct allocdirect *adp;	/* the completed allocdirect */
 {
 	struct allocdirectlst *listhead;
 	struct allocdirect *listadp;
 	struct inodedep *inodedep;
 	long bsize, delay;
 
 	/* Nothing to do until every completion bit has been set. */
 	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return;
 	if (adp->ad_buf != NULL)
 		panic("handle_allocdirect_partdone: dangling dep");
 	/*
 	 * The on-disk inode cannot claim to be any larger than the last
 	 * fragment that has been written. Otherwise, the on-disk inode
 	 * might have fragments that were not the last block in the file
 	 * which would corrupt the filesystem. Thus, we cannot free any
 	 * allocdirects after one whose ad_oldblkno claims a fragment as
 	 * these blocks must be rolled back to zero before writing the inode.
 	 * We check the currently active set of allocdirects in id_inoupdt
 	 * or id_extupdt as appropriate.
 	 */
 	inodedep = adp->ad_inodedep;
 	bsize = inodedep->id_fs->fs_bsize;
 	if (adp->ad_state & EXTDATA)
 		listhead = &inodedep->id_extupdt;
 	else
 		listhead = &inodedep->id_inoupdt;
 	TAILQ_FOREACH(listadp, listhead, ad_next) {
 		/* found our block */
 		if (listadp == adp)
 			break;
 		/* continue if ad_oldlbn is not a fragment */
 		if (listadp->ad_oldsize == 0 ||
 		    listadp->ad_oldsize == bsize)
 			continue;
 		/* hit a fragment */
 		return;
 	}
 	/*
 	 * If we have reached the end of the current list without
 	 * finding the just finished dependency, then it must be
 	 * on the future dependency list. Future dependencies cannot
 	 * be freed until they are moved to the current list.
 	 */
 	if (listadp == NULL) {
 #ifdef DEBUG
 		/*
 		 * Sanity check only: confirm that adp really is on the
 		 * matching "new" list before giving up on it.
 		 */
 		if (adp->ad_state & EXTDATA)
 			listhead = &inodedep->id_newextupdt;
 		else
 			listhead = &inodedep->id_newinoupdt;
 		TAILQ_FOREACH(listadp, listhead, ad_next)
 			/* found our block */
 			if (listadp == adp)
 				break;
 		if (listadp == NULL)
 			panic("handle_allocdirect_partdone: lost dep");
 #endif /* DEBUG */
 		return;
 	}
 	/*
 	 * If we have found the just finished dependency, then free
 	 * it along with anything that follows it that is complete.
 	 * If the inode still has a bitmap dependency, then it has
 	 * never been written to disk, hence the on-disk inode cannot
 	 * reference the old fragment so we can free it without delay.
 	 */
 	delay = (inodedep->id_state & DEPCOMPLETE);
 	for (; adp; adp = listadp) {
 		/* Save the successor before adp is passed to free_allocdirect(). */
 		listadp = TAILQ_NEXT(adp, ad_next);
 		/* Stop at the first entry that is not yet fully complete. */
 		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
 			return;
 		free_allocdirect(listhead, adp, delay);
 	}
 }
 
 /*
  * Called from within softdep_disk_write_complete above. Note that
  * this routine is always called from interrupt level with further
  * splbio interrupts blocked.
  */
 static void
 handle_allocindir_partdone(aip)
 	struct allocindir *aip;		/* the completed allocindir */
 {
 	struct indirdep *indirdep;
 
 	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
 		return;
 	if (aip->ai_buf != NULL)
 		panic("handle_allocindir_partdone: dangling dependency");
 	indirdep = aip->ai_indirdep;
 	if (indirdep->ir_state & UNDONE) {
 		LIST_REMOVE(aip, ai_next);
 		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
 		return;
 	}
 	if (indirdep->ir_state & UFS1FMT)
 		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
 		    aip->ai_newblkno;
 	else
 		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
 		    aip->ai_newblkno;
 	LIST_REMOVE(aip, ai_next);
 	if (aip->ai_freefrag != NULL)
 		add_to_worklist(&aip->ai_freefrag->ff_list);
 	WORKITEM_FREE(aip, D_ALLOCINDIR);
 }
 
 /*
  * Called from within softdep_disk_write_complete above to restore
  * in-memory inode block contents to their most up-to-date state. Note
  * that this routine is always called from interrupt level with further
  * splbio interrupts blocked.
  *
  * Return value:
  *   1  the rolled-back inode was restored from id_savedino1 and the
  *      buffer was redirtied;
  *   0  the inodedep was freed, or no allocdirects remain on id_inoupdt
  *      and id_extupdt;
  *   otherwise the value of hadchanges (nonzero if any pointer or size
  *   was rolled forward in the buffer).
  */
 static int 
 handle_written_inodeblock(inodedep, bp)
 	struct inodedep *inodedep;
 	struct buf *bp;		/* buffer containing the inode block */
 {
 	struct worklist *wk, *filefree;
 	struct allocdirect *adp, *nextadp;
 	struct ufs1_dinode *dp1 = NULL;
 	struct ufs2_dinode *dp2 = NULL;
 	int hadchanges, fstype;
 
 	if ((inodedep->id_state & IOSTARTED) == 0)
 		panic("handle_written_inodeblock: not started");
 	inodedep->id_state &= ~IOSTARTED;
 	/* Locate this inode's slot inside the buffer; only one of dp1/dp2 is set. */
 	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
 		fstype = UFS1;
 		dp1 = (struct ufs1_dinode *)bp->b_data +
 		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
 	} else {
 		fstype = UFS2;
 		dp2 = (struct ufs2_dinode *)bp->b_data +
 		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
 	}
 	/*
 	 * If we had to rollback the inode allocation because of
 	 * bitmaps being incomplete, then simply restore it.
 	 * Keep the block dirty so that it will not be reclaimed until
 	 * all associated dependencies have been cleared and the
 	 * corresponding updates written to disk.
 	 */
 	if (inodedep->id_savedino1 != NULL) {
 		if (fstype == UFS1)
 			*dp1 = *inodedep->id_savedino1;
 		else
 			*dp2 = *inodedep->id_savedino2;
 		FREE(inodedep->id_savedino1, M_SAVEDINO);
 		inodedep->id_savedino1 = NULL;
 		if ((bp->b_flags & B_DELWRI) == 0)
 			stat_inode_bitmap++;
 		bdirty(bp);
 		return (1);
 	}
 	inodedep->id_state |= COMPLETE;
 	/*
 	 * Roll forward anything that had to be rolled back before 
 	 * the inode could be updated.
 	 */
 	hadchanges = 0;
 	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
 		nextadp = TAILQ_NEXT(adp, ad_next);
 		if (adp->ad_state & ATTACHED)
 			panic("handle_written_inodeblock: new entry");
 		/*
 		 * Logical blocks below NDADDR live in di_db[]; anything at or
 		 * above NDADDR indexes di_ib[] after subtracting NDADDR.
 		 */
 		if (fstype == UFS1) {
 			if (adp->ad_lbn < NDADDR) {
 				if (dp1->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
 					panic("%s %s #%jd mismatch %d != %jd",
 					    "handle_written_inodeblock:",
 					    "direct pointer",
 					    (intmax_t)adp->ad_lbn,
 					    dp1->di_db[adp->ad_lbn],
 					    (intmax_t)adp->ad_oldblkno);
 				dp1->di_db[adp->ad_lbn] = adp->ad_newblkno;
 			} else {
 				if (dp1->di_ib[adp->ad_lbn - NDADDR] != 0)
 					panic("%s: %s #%jd allocated as %d",
 					    "handle_written_inodeblock",
 					    "indirect pointer",
 					    (intmax_t)adp->ad_lbn - NDADDR,
 					    dp1->di_ib[adp->ad_lbn - NDADDR]);
 				dp1->di_ib[adp->ad_lbn - NDADDR] =
 				    adp->ad_newblkno;
 			}
 		} else {
 			if (adp->ad_lbn < NDADDR) {
 				if (dp2->di_db[adp->ad_lbn]!=adp->ad_oldblkno)
 					panic("%s: %s #%jd %s %jd != %jd",
 					    "handle_written_inodeblock",
 					    "direct pointer",
 					    (intmax_t)adp->ad_lbn, "mismatch",
 					    (intmax_t)dp2->di_db[adp->ad_lbn],
 					    (intmax_t)adp->ad_oldblkno);
 				dp2->di_db[adp->ad_lbn] = adp->ad_newblkno;
 			} else {
 				if (dp2->di_ib[adp->ad_lbn - NDADDR] != 0)
 					panic("%s: %s #%jd allocated as %jd",
 					    "handle_written_inodeblock",
 					    "indirect pointer",
 					    (intmax_t)adp->ad_lbn - NDADDR,
 					    (intmax_t)
 					    dp2->di_ib[adp->ad_lbn - NDADDR]);
 				dp2->di_ib[adp->ad_lbn - NDADDR] =
 				    adp->ad_newblkno;
 			}
 		}
 		adp->ad_state &= ~UNDONE;
 		adp->ad_state |= ATTACHED;
 		hadchanges = 1;
 	}
 	/*
 	 * Same roll forward for the external attribute blocks. This loop
 	 * dereferences dp2 unconditionally.
 	 * NOTE(review): presumably id_extupdt is always empty on UFS1,
 	 * where dp2 stays NULL -- confirm.
 	 */
 	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
 		nextadp = TAILQ_NEXT(adp, ad_next);
 		if (adp->ad_state & ATTACHED)
 			panic("handle_written_inodeblock: new entry");
 		if (dp2->di_extb[adp->ad_lbn] != adp->ad_oldblkno)
 			panic("%s: direct pointers #%jd %s %jd != %jd",
 			    "handle_written_inodeblock",
 			    (intmax_t)adp->ad_lbn, "mismatch",
 			    (intmax_t)dp2->di_extb[adp->ad_lbn],
 			    (intmax_t)adp->ad_oldblkno);
 		dp2->di_extb[adp->ad_lbn] = adp->ad_newblkno;
 		adp->ad_state &= ~UNDONE;
 		adp->ad_state |= ATTACHED;
 		hadchanges = 1;
 	}
 	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
 		stat_direct_blk_ptrs++;
 	/*
 	 * Reset the file size to its most up-to-date value.
 	 */
 	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
 		panic("handle_written_inodeblock: bad size");
 	if (fstype == UFS1) {
 		if (dp1->di_size != inodedep->id_savedsize) {
 			dp1->di_size = inodedep->id_savedsize;
 			hadchanges = 1;
 		}
 	} else {
 		if (dp2->di_size != inodedep->id_savedsize) {
 			dp2->di_size = inodedep->id_savedsize;
 			hadchanges = 1;
 		}
 		if (dp2->di_extsize != inodedep->id_savedextsize) {
 			dp2->di_extsize = inodedep->id_savedextsize;
 			hadchanges = 1;
 		}
 	}
 	/* -1 marks the saved sizes as consumed; see the panic check above. */
 	inodedep->id_savedsize = -1;
 	inodedep->id_savedextsize = -1;
 	/*
 	 * If there were any rollbacks in the inode block, then it must be
 	 * marked dirty so that it will eventually get written back in
 	 * its correct form.
 	 */
 	if (hadchanges)
 		bdirty(bp);
 	/*
 	 * Process any allocdirects that completed during the update.
 	 */
 	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
 		handle_allocdirect_partdone(adp);
 	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
 		handle_allocdirect_partdone(adp);
 	/*
 	 * Process deallocations that were held pending until the
 	 * inode had been written to disk. Freeing of the inode
 	 * is delayed until after all blocks have been freed to
 	 * avoid creation of new <vfsid, inum, lbn> triples
 	 * before the old ones have been deleted.
 	 */
 	filefree = NULL;
 	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		switch (wk->wk_type) {
 
 		case D_FREEFILE:
 			/*
 			 * We defer adding filefree to the worklist until
 			 * all other additions have been made to ensure
 			 * that it will be done after all the old blocks
 			 * have been freed.
 			 */
 			if (filefree != NULL)
 				panic("handle_written_inodeblock: filefree");
 			filefree = wk;
 			continue;
 
 		case D_MKDIR:
 			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
 			continue;
 
 		case D_DIRADD:
 			diradd_inode_written(WK_DIRADD(wk), inodedep);
 			continue;
 
 		case D_FREEBLKS:
 			/* Only queue the freeblks once it is fully complete. */
 			wk->wk_state |= COMPLETE;
 			if ((wk->wk_state  & ALLCOMPLETE) != ALLCOMPLETE)
 				continue;
 			 /* -- fall through -- */
 		case D_FREEFRAG:
 		case D_DIRREM:
 			add_to_worklist(wk);
 			continue;
 
 		case D_NEWDIRBLK:
 			free_newdirblk(WK_NEWDIRBLK(wk));
 			continue;
 
 		default:
 			panic("handle_written_inodeblock: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 	if (filefree != NULL) {
 		/* A pending file free requires that the inodedep can go away now. */
 		if (free_inodedep(inodedep) == 0)
 			panic("handle_written_inodeblock: live inodedep");
 		add_to_worklist(filefree);
 		return (0);
 	}
 
 	/*
 	 * If no outstanding dependencies, free it.
 	 */
 	if (free_inodedep(inodedep) ||
 	    (TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
 	     TAILQ_FIRST(&inodedep->id_extupdt) == 0))
 		return (0);
 	return (hadchanges);
 }
 
 /*
  * Note on a diradd that the inode it depends on has reached the disk.
  * The diradd is marked COMPLETE; once it is ALLCOMPLETE it is moved to
  * the pending list of its pagedep. In every case it is then attached to
  * the inodedep's id_pendinghd list.
  * This routine must be called with splbio interrupts blocked.
  */
 static void
 diradd_inode_written(dap, inodedep)
 	struct diradd *dap;
 	struct inodedep *inodedep;
 {
 	struct pagedep *owner;
 
 	dap->da_state |= COMPLETE;
 	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 		/* A changed entry finds its pagedep through the old dirrem. */
 		owner = (dap->da_state & DIRCHG) != 0 ?
 		    dap->da_previous->dm_pagedep : dap->da_pagedep;
 		LIST_REMOVE(dap, da_pdlist);
 		LIST_INSERT_HEAD(&owner->pd_pendinghd, dap, da_pdlist);
 	}
 	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
 }
 
 /*
  * Retire one mkdir dependency (MKDIR_BODY or MKDIR_PARENT, given by
  * "type") now that the write it was waiting for has finished. The
  * matching bit is cleared in the associated diradd; when neither mkdir
  * bit remains the diradd becomes DEPCOMPLETE, and once it is ALLCOMPLETE
  * it is moved to its pagedep's pending list. The mkdir itself is then
  * unlinked and freed.
  */
 static void
 handle_written_mkdir(mkdir, type)
 	struct mkdir *mkdir;
 	int type;
 {
 	struct diradd *entry;
 	struct pagedep *owner;
 
 	if (mkdir->md_state != type)
 		panic("handle_written_mkdir: bad type");
 	entry = mkdir->md_diradd;
 	entry->da_state &= ~type;
 	if ((entry->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
 		entry->da_state |= DEPCOMPLETE;
 	if ((entry->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 		/* A changed entry finds its pagedep through the old dirrem. */
 		owner = (entry->da_state & DIRCHG) != 0 ?
 		    entry->da_previous->dm_pagedep : entry->da_pagedep;
 		LIST_REMOVE(entry, da_pdlist);
 		LIST_INSERT_HEAD(&owner->pd_pendinghd, entry, da_pdlist);
 	}
 	LIST_REMOVE(mkdir, md_mkdirs);
 	WORKITEM_FREE(mkdir, D_MKDIR);
 }
 
 /*
  * Called from within softdep_disk_write_complete above.
  * A write operation was just completed. Removed inodes can
  * now be freed and associated block pointers may be committed.
  * Note that this routine is always called from interrupt level
  * with further splbio interrupts blocked.
  *
  * Returns 1 if any directory entries were rolled forward in the
  * buffer (the buffer is redirtied in that case), otherwise 0.
  */
 static int 
 handle_written_filepage(pagedep, bp)
 	struct pagedep *pagedep;
 	struct buf *bp;		/* buffer containing the written page */
 {
 	struct dirrem *dirrem;
 	struct diradd *dap, *nextdap;
 	struct direct *ep;
 	int i, chgs;
 
 	if ((pagedep->pd_state & IOSTARTED) == 0)
 		panic("handle_written_filepage: not started");
 	pagedep->pd_state &= ~IOSTARTED;
 	/*
 	 * Process any directory removals that have been committed.
 	 */
 	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
 		LIST_REMOVE(dirrem, dm_next);
 		dirrem->dm_dirinum = pagedep->pd_ino;
 		add_to_worklist(&dirrem->dm_list);
 	}
 	/*
 	 * Free any directory additions that have been committed.
 	 * If it is a newly allocated block, we have to wait until
 	 * the on-disk directory inode claims the new block.
 	 */
 	if ((pagedep->pd_state & NEWBLOCK) == 0)
 		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
 			free_diradd(dap);
 	/*
 	 * Uncommitted directory entries must be restored.
 	 */
 	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
 		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
 		     dap = nextdap) {
 			/* Save the successor; dap may be moved to another list below. */
 			nextdap = LIST_NEXT(dap, da_pdlist);
 			if (dap->da_state & ATTACHED)
 				panic("handle_written_filepage: attached");
 			/* Put the real inode number back into the entry at da_offset. */
 			ep = (struct direct *)
 			    ((char *)bp->b_data + dap->da_offset);
 			ep->d_ino = dap->da_newinum;
 			dap->da_state &= ~UNDONE;
 			dap->da_state |= ATTACHED;
 			chgs = 1;
 			/*
 			 * If the inode referenced by the directory has
 			 * been written out, then the dependency can be
 			 * moved to the pending list.
 			 */
 			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
 				LIST_REMOVE(dap, da_pdlist);
 				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
 				    da_pdlist);
 			}
 		}
 	}
 	/*
 	 * If there were any rollbacks in the directory, then it must be
 	 * marked dirty so that it will eventually get written back in
 	 * its correct form.
 	 */
 	if (chgs) {
 		if ((bp->b_flags & B_DELWRI) == 0)
 			stat_dir_entry++;
 		bdirty(bp);
 		return (1);
 	}
 	/*
 	 * If we are not waiting for a new directory block to be
 	 * claimed by its inode, then the pagedep will be freed.
 	 * Otherwise it will remain to track any new entries on
 	 * the page in case they are fsync'ed.
 	 */
 	if ((pagedep->pd_state & NEWBLOCK) == 0) {
 		LIST_REMOVE(pagedep, pd_hash);
 		WORKITEM_FREE(pagedep, D_PAGEDEP);
 	}
 	return (0);
 }
 
 /*
  * Writing back in-core inode structures.
  * 
  * The filesystem only accesses an inode's contents when it occupies an
  * "in-core" inode structure.  These "in-core" structures are separate from
  * the page frames used to cache inode blocks.  Only the latter are
  * transferred to/from the disk.  So, when the updated contents of the
  * "in-core" inode structure are copied to the corresponding in-memory inode
  * block, the dependencies are also transferred.  The following procedure is
  * called when copying a dirty "in-core" inode to a cached inode block.
  */
 
 /*
  * Called when an inode is loaded from disk. If the effective link count
  * differed from the actual link count when it was last flushed, then we
  * need to ensure that the correct effective link count is put back.
  *
  * The lookup is done with lk held. If no inodedep exists, i_effnlink is
  * left equal to i_nlink.
  */
 void 
 softdep_load_inodeblock(ip)
 	struct inode *ip;	/* the "in_core" copy of the inode */
 {
 	struct inodedep *inodedep;
 
 	/*
 	 * Check for alternate nlink count.
 	 */
 	ip->i_effnlink = ip->i_nlink;
 	ACQUIRE_LOCK(&lk);
-	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
+	if (inodedep_lookup(UFSTOVFS(ip->i_ump),
+	    ip->i_number, 0, &inodedep) == 0) {
 		FREE_LOCK(&lk);
 		return;
 	}
 	/* Apply the link-count delta recorded in the inodedep. */
 	ip->i_effnlink -= inodedep->id_nlinkdelta;
 	/* Carry the SPACECOUNTED state over to the in-core inode flags. */
 	if (inodedep->id_state & SPACECOUNTED)
 		ip->i_flag |= IN_SPACECOUNTED;
 	FREE_LOCK(&lk);
 }
 
 /*
  * This routine is called just before the "in-core" inode
  * information is to be copied to the in-memory inode block.
  * Recall that an inode block contains several inodes. If
  * the force flag is set, then the dependencies will be
  * cleared so that the update can always be made. Note that
  * the buffer is locked when this routine is called, so we
  * will never be in the middle of writing the inode block 
  * to disk.
  *
  * lk is acquired here and released on every return path.
  */
 void 
 softdep_update_inodeblock(ip, bp, waitfor)
 	struct inode *ip;	/* the "in_core" copy of the inode */
 	struct buf *bp;		/* the buffer containing the inode block */
 	int waitfor;		/* nonzero => update must be allowed */
 {
 	struct inodedep *inodedep;
 	struct worklist *wk;
+	struct mount *mp;
 	struct buf *ibp;
 	int error;
 
 	/*
 	 * If the effective link count is not equal to the actual link
 	 * count, then we must track the difference in an inodedep while
 	 * the inode is (potentially) tossed out of the cache. Otherwise,
 	 * if there is no existing inodedep, then there are no dependencies
 	 * to track.
 	 */
+	mp = UFSTOVFS(ip->i_ump);
 	ACQUIRE_LOCK(&lk);
-	if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) {
+	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
 		FREE_LOCK(&lk);
 		if (ip->i_effnlink != ip->i_nlink)
 			panic("softdep_update_inodeblock: bad link count");
 		return;
 	}
 	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
 		panic("softdep_update_inodeblock: bad delta");
 	/*
 	 * Changes have been initiated. Anything depending on these
 	 * changes cannot occur until this inode has been written.
 	 */
 	inodedep->id_state &= ~COMPLETE;
 	if ((inodedep->id_state & ONWORKLIST) == 0)
 		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
 	/*
 	 * Any new dependencies associated with the incore inode must 
 	 * now be moved to the list associated with the buffer holding
 	 * the in-memory copy of the inode. Once merged process any
 	 * allocdirects that are completed by the merger.
 	 */
 	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
 	if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
 	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
 	if (TAILQ_FIRST(&inodedep->id_extupdt) != NULL)
 		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt));
 	/*
 	 * Now that the inode has been pushed into the buffer, the
 	 * operations dependent on the inode being written to disk
 	 * can be moved to the id_bufwait so that they will be
 	 * processed when the buffer I/O completes.
 	 */
 	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
 		WORKLIST_REMOVE(wk);
 		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
 	}
 	/*
 	 * Newly allocated inodes cannot be written until the bitmap
 	 * that allocates them has been written (indicated by
 	 * DEPCOMPLETE being set in id_state). If we are doing a
 	 * forced sync (e.g., an fsync on a file), we force the bitmap
 	 * to be written so that the update can be done.
 	 */
 	if (waitfor == 0) {
 		FREE_LOCK(&lk);
 		return;
 	}
 retry:
 	/* Bitmap already on disk: nothing left to force. */
 	if ((inodedep->id_state & DEPCOMPLETE) != 0) {
 		FREE_LOCK(&lk);
 		return;
 	}
 	ibp = inodedep->id_buf;
 	ibp = getdirtybuf(ibp, &lk, MNT_WAIT);
 	if (ibp == NULL) {
 		/*
 		 * If ibp came back as NULL, the dependency could have been
 		 * freed while we slept.  Look it up again, and check to see
 		 * that it has completed.
 		 */
-		if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) != 0)
+		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
 			goto retry;
 		FREE_LOCK(&lk);
 		return;
 	}
 	/* Drop lk before the synchronous write of the bitmap buffer. */
 	FREE_LOCK(&lk);
 	if ((error = bwrite(ibp)) != 0)
 		softdep_error("softdep_update_inodeblock: bwrite", error);
 }
 
 /*
  * Merge a new inode dependency list (such as id_newinoupdt) into an
  * old inode dependency list (such as id_inoupdt), keeping the result
  * ordered by ad_lbn. When both lists hold an entry for the same lbn the
  * pair is combined with allocdirect_merge(). On return the new list is
  * empty. This routine must be called with splbio interrupts blocked.
  */
 static void
 merge_inode_lists(newlisthead, oldlisthead)
 	struct allocdirectlst *newlisthead;
 	struct allocdirectlst *oldlisthead;
 {
 	struct allocdirect *cur, *incoming;
 
 	cur = TAILQ_FIRST(oldlisthead);
 	incoming = TAILQ_FIRST(newlisthead);
 	while (cur != NULL && incoming != NULL) {
 		if (cur->ad_lbn >= incoming->ad_lbn) {
 			/* The incoming entry belongs in front of cur. */
 			TAILQ_REMOVE(newlisthead, incoming, ad_next);
 			TAILQ_INSERT_BEFORE(cur, incoming, ad_next);
 			if (cur->ad_lbn == incoming->ad_lbn) {
 				allocdirect_merge(oldlisthead, incoming,
 				    cur);
 				cur = incoming;
 			}
 			incoming = TAILQ_FIRST(newlisthead);
 		} else {
 			/* Keep scanning the old list for the insert point. */
 			cur = TAILQ_NEXT(cur, ad_next);
 		}
 	}
 	/* Whatever is left on the new list goes at the tail of the old one. */
 	for (incoming = TAILQ_FIRST(newlisthead); incoming != NULL;
 	    incoming = TAILQ_FIRST(newlisthead)) {
 		TAILQ_REMOVE(newlisthead, incoming, ad_next);
 		TAILQ_INSERT_TAIL(oldlisthead, incoming, ad_next);
 	}
 }
 
 /*
  * If we are doing an fsync, then we must ensure that any directory
  * entries for the inode have been written after the inode gets to disk.
  *
  * Returns 0 when there is no inodedep or once the pending diradds have
  * been processed; otherwise returns the first error reported by
  * ffs_vget, ffs_update, ffs_syncvnode, bread or bwrite.
  */
 int
 softdep_fsync(vp)
 	struct vnode *vp;	/* the "in_core" copy of the inode */
 {
 	struct inodedep *inodedep;
 	struct pagedep *pagedep;
 	struct worklist *wk;
 	struct diradd *dap;
-	struct mount *mnt;
+	struct mount *mp;
 	struct vnode *pvp;
 	struct inode *ip;
 	struct buf *bp;
 	struct fs *fs;
 	struct thread *td = curthread;
 	int error, flushparent;
 	ino_t parentino;
 	ufs_lbn_t lbn;
 
 	ip = VTOI(vp);
 	fs = ip->i_fs;
+	mp = vp->v_mount;
 	ACQUIRE_LOCK(&lk);
-	if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) {
+	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
 		FREE_LOCK(&lk);
 		return (0);
 	}
 	/* Every list except id_pendinghd must already be empty here. */
 	if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
 	    LIST_FIRST(&inodedep->id_bufwait) != NULL ||
 	    TAILQ_FIRST(&inodedep->id_extupdt) != NULL ||
 	    TAILQ_FIRST(&inodedep->id_newextupdt) != NULL ||
 	    TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
 	    TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
 		panic("softdep_fsync: pending ops");
 	for (error = 0, flushparent = 0; ; ) {
 		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
 			break;
 		if (wk->wk_type != D_DIRADD)
 			panic("softdep_fsync: Unexpected type %s",
 			    TYPENAME(wk->wk_type));
 		dap = WK_DIRADD(wk);
 		/*
 		 * Flush our parent if this directory entry has a MKDIR_PARENT
 		 * dependency or is contained in a newly allocated block.
 		 */
 		if (dap->da_state & DIRCHG)
 			pagedep = dap->da_previous->dm_pagedep;
 		else
 			pagedep = dap->da_pagedep;
-		mnt = pagedep->pd_mnt;
 		/* Copy what is needed from the pagedep while lk is still held. */
 		parentino = pagedep->pd_ino;
 		lbn = pagedep->pd_lbn;
 		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
 			panic("softdep_fsync: dirty");
 		if ((dap->da_state & MKDIR_PARENT) ||
 		    (pagedep->pd_state & NEWBLOCK))
 			flushparent = 1;
 		else
 			flushparent = 0;
 		/*
 		 * If we are being fsync'ed as part of vgone'ing this vnode,
 		 * then we will not be able to release and recover the
 		 * vnode below, so we just have to give up on writing its
 		 * directory entry out. It will eventually be written, just
 		 * not now, but then the user was not asking to have it
 		 * written, so we are not breaking any promises.
 		 */
 		if (vp->v_iflag & VI_DOOMED)
 			break;
 		/*
 		 * We prevent deadlock by always fetching inodes from the
 		 * root, moving down the directory tree. Thus, when fetching
 		 * our parent directory, we first try to get the lock. If
 		 * that fails, we must unlock ourselves before requesting
 		 * the lock on our parent. See the comment in ufs_lookup
 		 * for details on possible races.
 		 */
 		FREE_LOCK(&lk);
-		if (ffs_vget(mnt, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
+		if (ffs_vget(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp)) {
 			VOP_UNLOCK(vp, 0, td);
-			error = ffs_vget(mnt, parentino, LK_EXCLUSIVE, &pvp);
+			error = ffs_vget(mp, parentino, LK_EXCLUSIVE, &pvp);
 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 			if (error != 0)
 				return (error);
 		}
 		/*
 		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
 		 * that are contained in direct blocks will be resolved by 
 		 * doing a ffs_update. Pagedeps contained in indirect blocks
 		 * may require a complete sync'ing of the directory. So, we
 		 * try the cheap and fast ffs_update first, and if that fails,
 		 * then we do the slower ffs_syncvnode of the directory.
 		 */
 		if (flushparent) {
 			if ((error = ffs_update(pvp, 1)) != 0) {
 				vput(pvp);
 				return (error);
 			}
 			/*
 			 * NOTE(review): pagedep is dereferenced here after lk
 			 * was released above -- confirm that it cannot have
 			 * been freed in the meantime.
 			 */
 			if ((pagedep->pd_state & NEWBLOCK) &&
 			    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
 				vput(pvp);
 				return (error);
 			}
 		}
 		/*
 		 * Flush directory page containing the inode's name.
 		 */
 		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
 		    &bp);
 		if (error == 0)
 			error = bwrite(bp);
 		else
 			brelse(bp);
 		vput(pvp);
 		if (error != 0)
 			return (error);
 		/* Retake lk and look the inodedep up again before the next pass. */
 		ACQUIRE_LOCK(&lk);
-		if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
+		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
 			break;
 	}
 	FREE_LOCK(&lk);
 	return (0);
 }
 
 /*
  * Flush all the dirty bitmaps associated with the block device
  * before flushing the rest of the dirty blocks so as to reduce
  * the number of dependencies that will have to be rolled back.
  *
  * Panics if vp is not a disk vnode. The dirty list is rescanned from
  * the start after every write that is issued.
  */
 void
 softdep_fsync_mountdev(vp)
 	struct vnode *vp;
 {
 	struct buf *bp, *nbp;
 	struct worklist *wk;
 
 	if (!vn_isdisk(vp, NULL))
 		panic("softdep_fsync_mountdev: vnode not a disk");
 restart:
 	ACQUIRE_LOCK(&lk);
 	VI_LOCK(vp);
 	TAILQ_FOREACH_SAFE(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs, nbp) {
 		/* 
 		 * If it is already scheduled, skip to the next buffer.
 		 */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
 			continue;
 
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("softdep_fsync_mountdev: not dirty");
 		/*
 		 * We are only interested in bitmaps with outstanding
 		 * dependencies.
 		 */
 		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
 		    wk->wk_type != D_BMSAFEMAP ||
 		    (bp->b_vflags & BV_BKGRDINPROG)) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		/*
 		 * Both locks are dropped before the write is started, so
 		 * the scan restarts from the top rather than continuing.
 		 */
 		VI_UNLOCK(vp);
 		FREE_LOCK(&lk);
 		bremfree(bp);
 		(void) bawrite(bp);
 		goto restart;
 	}
 	FREE_LOCK(&lk);
 	/* drain_output() is called with the vnode interlock still held. */
 	drain_output(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * This routine is called when we are trying to synchronously flush a
  * file. This routine must eliminate any filesystem metadata dependencies
  * so that the syncing routine can succeed by pushing the dirty blocks
  * associated with the file. If any I/O errors occur, they are returned.
  *
  * Returns 0 immediately if the vnode is not using soft updates, and 0
  * once no dirty buffers remain. Any error from flush_inodedep_deps,
  * flush_pagedep_deps or bwrite aborts the scan and is returned after
  * the buffer being examined has been handed to bawrite().
  */
 int
 softdep_sync_metadata(struct vnode *vp)
 {
 	struct pagedep *pagedep;
 	struct allocdirect *adp;
 	struct allocindir *aip;
 	struct buf *bp, *nbp;
 	struct worklist *wk;
 	int i, error, waitfor;
 
 	if (!DOINGSOFTDEP(vp))
 		return (0);
 	/*
 	 * Ensure that any direct block dependencies have been cleared.
 	 */
 	ACQUIRE_LOCK(&lk);
-	if ((error = flush_inodedep_deps(VTOI(vp)->i_fs, VTOI(vp)->i_number))) {
+	if ((error = flush_inodedep_deps(vp->v_mount, VTOI(vp)->i_number))) {
 		FREE_LOCK(&lk);
 		return (error);
 	}
 	FREE_LOCK(&lk);
 	/*
 	 * For most files, the only metadata dependencies are the
 	 * cylinder group maps that allocate their inode or blocks.
 	 * The block allocation dependencies can be found by traversing
 	 * the dependency lists for any buffers that remain on their
 	 * dirty buffer list. The inode allocation dependency will
 	 * be resolved when the inode is updated with MNT_WAIT.
 	 * This work is done in two passes. The first pass grabs most
 	 * of the buffers and begins asynchronously writing them. The
 	 * only way to wait for these asynchronous writes is to sleep
 	 * on the filesystem vnode which may stay busy for a long time
 	 * if the filesystem is active. So, instead, we make a second
 	 * pass over the dependencies blocking on each write. In the
 	 * usual case we will be blocking against a write that we
 	 * initiated, so when it is done the dependency will have been
 	 * resolved. Thus the second pass is expected to end quickly.
 	 */
 	waitfor = MNT_NOWAIT;
 
 top:
 	/*
 	 * We must wait for any I/O in progress to finish so that
 	 * all potential buffers on the dirty list will be visible.
 	 */
 	VI_LOCK(vp);
 	drain_output(vp);
 	while ((bp = TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd)) != NULL) {
 		bp = getdirtybuf(bp, VI_MTX(vp), MNT_WAIT);
 		if (bp)
 			break;
 	}
 	VI_UNLOCK(vp);
 	if (bp == NULL)
 		return (0);
 loop:
 	/* While syncing snapshots, we must allow recursive lookups */
 	bp->b_lock.lk_flags |= LK_CANRECURSE;
 	ACQUIRE_LOCK(&lk);
 	/*
 	 * As we hold the buffer locked, none of its dependencies
 	 * will disappear.
 	 */
 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 		switch (wk->wk_type) {
 
 		case D_ALLOCDIRECT:
 			adp = WK_ALLOCDIRECT(wk);
 			if (adp->ad_state & DEPCOMPLETE)
 				continue;
 			nbp = adp->ad_buf;
 			nbp = getdirtybuf(nbp, &lk, waitfor);
 			if (nbp == NULL)
 				continue;
 			FREE_LOCK(&lk);
 			if (waitfor == MNT_NOWAIT) {
 				bawrite(nbp);
 			} else if ((error = bwrite(nbp)) != 0) {
 				break;
 			}
 			ACQUIRE_LOCK(&lk);
 			continue;
 
 		case D_ALLOCINDIR:
 			aip = WK_ALLOCINDIR(wk);
 			if (aip->ai_state & DEPCOMPLETE)
 				continue;
 			nbp = aip->ai_buf;
 			nbp = getdirtybuf(nbp, &lk, waitfor);
 			if (nbp == NULL)
 				continue;
 			FREE_LOCK(&lk);
 			if (waitfor == MNT_NOWAIT) {
 				bawrite(nbp);
 			} else if ((error = bwrite(nbp)) != 0) {
 				break;
 			}
 			ACQUIRE_LOCK(&lk);
 			continue;
 
 		case D_INDIRDEP:
 		restart:
 
 			LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) {
 				if (aip->ai_state & DEPCOMPLETE)
 					continue;
 				nbp = aip->ai_buf;
 				nbp = getdirtybuf(nbp, &lk, MNT_WAIT);
 				if (nbp == NULL)
 					goto restart;
 				FREE_LOCK(&lk);
 				if ((error = bwrite(nbp)) != 0) {
 					/*
 					 * A plain break would only leave this
 					 * inner loop and reach the continue
 					 * below, losing the error and resuming
 					 * the outer scan with lk released.
 					 */
 					goto loop_end;
 				}
 				ACQUIRE_LOCK(&lk);
 				goto restart;
 			}
 			continue;
 
 		case D_INODEDEP:
-			if ((error = flush_inodedep_deps(WK_INODEDEP(wk)->id_fs,
+			if ((error = flush_inodedep_deps(wk->wk_mp,
 			    WK_INODEDEP(wk)->id_ino)) != 0) {
 				FREE_LOCK(&lk);
 				break;
 			}
 			continue;
 
 		case D_PAGEDEP:
 			/*
 			 * We are trying to sync a directory that may
 			 * have dependencies on both its own metadata
 			 * and/or dependencies on the inodes of any
 			 * recently allocated files. We walk its diradd
 			 * lists pushing out the associated inode.
 			 */
 			pagedep = WK_PAGEDEP(wk);
 			for (i = 0; i < DAHASHSZ; i++) {
 				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
 					continue;
 				if ((error =
-				    flush_pagedep_deps(vp, pagedep->pd_mnt,
+				    flush_pagedep_deps(vp, wk->wk_mp,
 						&pagedep->pd_diraddhd[i]))) {
 					FREE_LOCK(&lk);
 					/*
 					 * Leave through the common error exit;
 					 * a break would only end the hash loop
 					 * and the error would be dropped.
 					 */
 					goto loop_end;
 				}
 			}
 			continue;
 
 		case D_MKDIR:
 			/*
 			 * This case should never happen if the vnode has
 			 * been properly sync'ed. However, if this function
 			 * is used at a place where the vnode has not yet
 			 * been sync'ed, this dependency can show up. So,
 			 * rather than panic, just flush it.
 			 */
 			nbp = WK_MKDIR(wk)->md_buf;
 			nbp = getdirtybuf(nbp, &lk, waitfor);
 			if (nbp == NULL)
 				continue;
 			FREE_LOCK(&lk);
 			if (waitfor == MNT_NOWAIT) {
 				bawrite(nbp);
 			} else if ((error = bwrite(nbp)) != 0) {
 				break;
 			}
 			ACQUIRE_LOCK(&lk);
 			continue;
 
 		case D_BMSAFEMAP:
 			/*
 			 * This case should never happen if the vnode has
 			 * been properly sync'ed. However, if this function
 			 * is used at a place where the vnode has not yet
 			 * been sync'ed, this dependency can show up. So,
 			 * rather than panic, just flush it.
 			 */
 			nbp = WK_BMSAFEMAP(wk)->sm_buf;
 			nbp = getdirtybuf(nbp, &lk, waitfor);
 			if (nbp == NULL)
 				continue;
 			FREE_LOCK(&lk);
 			if (waitfor == MNT_NOWAIT) {
 				bawrite(nbp);
 			} else if ((error = bwrite(nbp)) != 0) {
 				break;
 			}
 			ACQUIRE_LOCK(&lk);
 			continue;
 
 		default:
 			panic("softdep_sync_metadata: Unknown type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	loop_end:
 		/* We reach here only in error and unlocked */
 		if (error == 0)
 			panic("softdep_sync_metadata: zero error");
 		bp->b_lock.lk_flags &= ~LK_CANRECURSE;
 		bawrite(bp);
 		return (error);
 	}
 	FREE_LOCK(&lk);
 	/* Lock the next dirty buffer before letting go of the current one. */
 	VI_LOCK(vp);
 	while ((nbp = TAILQ_NEXT(bp, b_bobufs)) != NULL) {
 		nbp = getdirtybuf(nbp, VI_MTX(vp), MNT_WAIT);
 		if (nbp)
 			break;
 	}
 	VI_UNLOCK(vp);
 	bp->b_lock.lk_flags &= ~LK_CANRECURSE;
 	bawrite(bp);
 	if (nbp != NULL) {
 		bp = nbp;
 		goto loop;
 	}
 	/*
 	 * The brief unlock is to allow any pent up dependency
 	 * processing to be done. Then proceed with the second pass.
 	 */
 	if (waitfor == MNT_NOWAIT) {
 		waitfor = MNT_WAIT;
 		goto top;
 	}
 
 	/*
 	 * If we have managed to get rid of all the dirty buffers,
 	 * then we are done. For certain directories and block
 	 * devices, we may need to do further work.
 	 *
 	 * We must wait for any I/O in progress to finish so that
 	 * all potential buffers on the dirty list will be visible.
 	 */
 	VI_LOCK(vp);
 	drain_output(vp);
 	VI_UNLOCK(vp);
 	return (0);
 }
 
 /*
  * Flush the dependencies associated with an inodedep.
  *
  * mp and ino name the inodedep to work on.  Returns 0 when no inodedep
  * is found or when both passes have completed, otherwise the error that
  * flush_deplist() stored through its errorp argument.
  *
  * The loop opens with FREE_LOCK(&lk), so the caller has to hold lk on
  * entry; every return path leaves lk held.
  */
 static int
-flush_inodedep_deps(fs, ino)
-	struct fs *fs;
+flush_inodedep_deps(mp, ino)
+	struct mount *mp;
 	ino_t ino;
 {
 	struct inodedep *inodedep;
 	int error, waitfor;
 
 	/*
 	 * This work is done in two passes. The first pass grabs most
 	 * of the buffers and begins asynchronously writing them. The
 	 * only way to wait for these asynchronous writes is to sleep
 	 * on the filesystem vnode which may stay busy for a long time
 	 * if the filesystem is active. So, instead, we make a second
 	 * pass over the dependencies blocking on each write. In the
 	 * usual case we will be blocking against a write that we
 	 * initiated, so when it is done the dependency will have been
 	 * resolved. Thus the second pass is expected to end quickly.
 	 * We give a brief window at the top of the loop to allow
 	 * any pending I/O to complete.
 	 */
 	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
 		/* Set by a flush_deplist() call on the previous iteration. */
 		if (error)
 			return (error);
 		/* The brief window: let go of lk and take it straight back. */
 		FREE_LOCK(&lk);
 		ACQUIRE_LOCK(&lk);
 		/* Looked up afresh each time round, since lk was released. */
-		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
+		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
 			return (0);
 		/*
 		 * flush_deplist() returns non-zero after it has written a
 		 * buffer, failed to obtain one in MNT_WAIT mode, or hit a
 		 * write error.  In each case restart the current pass.
 		 */
 		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
 		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
 		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
 		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
 			continue;
 		/*
 		 * If pass2, we are done, otherwise do pass 2.
 		 */
 		if (waitfor == MNT_WAIT)
 			break;
 		waitfor = MNT_WAIT;
 	}
 	/*
 	 * Try freeing inodedep in case all dependencies have been removed.
 	 */
-	if (inodedep_lookup(fs, ino, 0, &inodedep) != 0)
+	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
 		(void) free_inodedep(inodedep);
 	return (0);
 }
 
 /*
  * Flush an inode dependency list.
  *
  * Walks listhead for the first allocdirect that is not DEPCOMPLETE and
  * tries to write the buffer it points at.  waitfor selects an
  * asynchronous (MNT_NOWAIT) or synchronous write; a synchronous write
  * error is stored in *errorp.
  *
  * Returns 1 when the caller should rescan: a buffer was written, a
  * buffer could not be obtained in MNT_WAIT mode, or a write failed.
  * Returns 0 when the walk reached the end of the list without doing
  * any of those.
  *
  * Must be called with lk held (asserted below); lk is held on return.
  */
 static int
 flush_deplist(listhead, waitfor, errorp)
 	struct allocdirectlst *listhead;
 	int waitfor;
 	int *errorp;
 {
 	struct allocdirect *adp;
 	struct buf *bp;
 
 	mtx_assert(&lk, MA_OWNED);
 	TAILQ_FOREACH(adp, listhead, ad_next) {
 		if (adp->ad_state & DEPCOMPLETE)
 			continue;
 		bp = adp->ad_buf;
 		bp = getdirtybuf(bp, &lk, waitfor);
 		if (bp == NULL) {
 			/* First pass: skip buffers that are not available. */
 			if (waitfor == MNT_NOWAIT)
 				continue;
 			return (1);
 		}
 		/* lk is not held across the write. */
 		FREE_LOCK(&lk);
 		if (waitfor == MNT_NOWAIT) {
 			bawrite(bp);
 		} else if ((*errorp = bwrite(bp)) != 0) {
 			ACQUIRE_LOCK(&lk);
 			return (1);
 		}
 		ACQUIRE_LOCK(&lk);
 		/* lk was released, so stop walking and have the caller rescan. */
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
  *
  * pvp is the vnode pushed with ffs_update() when an entry still has
  * MKDIR_PARENT set; mp is the mount used for ffs_vget() and for the
  * inodedep lookup; diraddhdp is the diradd list that is drained.
  *
  * Returns 0 once the list is empty, or the first error reported by
  * ffs_update, ffs_vget, ffs_syncvnode, bread or bwrite.
  *
  * lk is released around each blocking call below, so the caller must
  * hold it on entry.  Every error "break" happens with lk released and
  * the tail of the function re-acquires it, so lk is held on return in
  * both the success and the error case.
  */
 static int
 flush_pagedep_deps(pvp, mp, diraddhdp)
 	struct vnode *pvp;
 	struct mount *mp;
 	struct diraddhd *diraddhdp;
 {
 	struct inodedep *inodedep;
 	struct ufsmount *ump;
 	struct diradd *dap;
 	struct vnode *vp;
 	int error = 0;
 	struct buf *bp;
 	ino_t inum;
 
 	ump = VFSTOUFS(mp);
 	/* Each iteration works on whatever entry is currently first. */
 	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
 		/*
 		 * Flush ourselves if this directory entry
 		 * has a MKDIR_PARENT dependency.
 		 */
 		if (dap->da_state & MKDIR_PARENT) {
 			FREE_LOCK(&lk);
 			if ((error = ffs_update(pvp, 1)) != 0)
 				break;
 			ACQUIRE_LOCK(&lk);
 			/*
 			 * If that cleared dependencies, go on to next.
 			 */
 			if (dap != LIST_FIRST(diraddhdp))
 				continue;
 			if (dap->da_state & MKDIR_PARENT)
 				panic("flush_pagedep_deps: MKDIR_PARENT");
 		}
 		/*
 		 * A newly allocated directory must have its "." and
 		 * ".." entries written out before its name can be
 		 * committed in its parent. We do not want or need
 		 * the full semantics of a synchronous ffs_syncvnode as
 		 * that may end up here again, once for each directory
 		 * level in the filesystem. Instead, we push the blocks
 		 * and wait for them to clear. We have to fsync twice
 		 * because the first call may choose to defer blocks
 		 * that still have dependencies, but deferral will
 		 * happen at most once.
 		 */
 		inum = dap->da_newinum;
 		if (dap->da_state & MKDIR_BODY) {
 			FREE_LOCK(&lk);
 			if ((error = ffs_vget(mp, inum, LK_EXCLUSIVE, &vp)))
 				break;
 			/* Two MNT_NOWAIT syncs, as explained above. */
 			if ((error=ffs_syncvnode(vp, MNT_NOWAIT)) ||
 			    (error=ffs_syncvnode(vp, MNT_NOWAIT))) {
 				vput(vp);
 				break;
 			}
 			/* Wait for the writes started above to finish. */
 			VI_LOCK(vp);
 			drain_output(vp);
 			VI_UNLOCK(vp);
 			vput(vp);
 			ACQUIRE_LOCK(&lk);
 			/*
 			 * If that cleared dependencies, go on to next.
 			 */
 			if (dap != LIST_FIRST(diraddhdp))
 				continue;
 			if (dap->da_state & MKDIR_BODY)
 				panic("flush_pagedep_deps: MKDIR_BODY");
 		}
 		/*
 		 * Flush the inode on which the directory entry depends.
 		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
 		 * the only remaining dependency is that the updated inode
 		 * count must get pushed to disk. The inode has already
 		 * been pushed into its inode buffer (via VOP_UPDATE) at
 		 * the time of the reference count change. So we need only
 		 * locate that buffer, ensure that there will be no rollback
 		 * caused by a bitmap dependency, then write the inode buffer.
 		 */
 retry:
-		if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0)
+		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
 			panic("flush_pagedep_deps: lost inode");
 		/*
 		 * If the inode still has bitmap dependencies,
 		 * push them to disk.
 		 */
 		if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 			bp = inodedep->id_buf;
 			bp = getdirtybuf(bp, &lk, MNT_WAIT);
 			/* Buffer not obtained: look the inodedep up again. */
 			if (bp == NULL)
 				goto retry;
 			FREE_LOCK(&lk);
 			if ((error = bwrite(bp)) != 0)
 				break;
 			ACQUIRE_LOCK(&lk);
 			if (dap != LIST_FIRST(diraddhdp))
 				continue;
 		}
 		/*
 		 * If the inode is still sitting in a buffer waiting
 		 * to be written, push it to disk.
 		 */
 		FREE_LOCK(&lk);
 		if ((error = bread(ump->um_devvp,
 		    fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)),
 		    (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) {
 			brelse(bp);
 			break;
 		}
 		if ((error = bwrite(bp)) != 0)
 			break;
 		ACQUIRE_LOCK(&lk);
 		/*
 		 * If we have failed to get rid of all the dependencies
 		 * then something is seriously wrong.
 		 */
 		if (dap == LIST_FIRST(diraddhdp))
 			panic("flush_pagedep_deps: flush failed");
 	}
 	/* Error exits left the loop with lk released; take it back. */
 	if (error)
 		ACQUIRE_LOCK(&lk);
 	return (error);
 }
 
 /*
  * A large burst of file addition or deletion activity can drive the
  * memory load excessively high. First attempt to slow things down
  * using the techniques below. If that fails, this routine requests
  * the offending operations to fall back to running synchronously
  * until the memory load returns to a reasonable level.
  *
  * Returns 0 while every counter checked below is under its limit and
  * 1 otherwise; each 1 result is counted in stat_sync_limit_hit.
  */
 int
 softdep_slowdown(vp)
 	struct vnode *vp;
 {
 	int max_softdeps_hard;
 
 	/* The hard limit is max_softdeps plus ten percent. */
 	max_softdeps_hard = max_softdeps * 11 / 10;
 	/*
 	 * num_dirrem is held to half the hard limit, num_inodedep to the
 	 * full hard limit, and the indirdep count of vp's mount to
 	 * maxindirdeps.
 	 */
 	if (num_dirrem < max_softdeps_hard / 2 &&
 	    num_inodedep < max_softdeps_hard &&
 	    VFSTOUFS(vp->v_mount)->um_numindirdeps < maxindirdeps)
   		return (0);
 	/* Only the indirdep limit triggers a speedup request. */
 	if (VFSTOUFS(vp->v_mount)->um_numindirdeps >= maxindirdeps)
-		speedup_syncer();
+		softdep_speedup();
 	stat_sync_limit_hit += 1;
 	return (1);
 }
 
 /*
  * Called by the allocation routines when they are about to fail
  * in the hope that we can free up some disk space.
  * 
  * First check to see if the work list has anything on it. If it has,
  * clean up entries until we successfully free some space. Because this
  * process holds inodes locked, we cannot handle any remove requests
  * that might block on a locked inode as that could lead to deadlock.
  * If the worklist yields no free space, encourage the syncer daemon
  * to help us. In no event will we try for longer than tickdelay seconds.
  *
  * Returns 1 when the loop ends because fs_pendingblocks is no longer
  * positive or cs_nbfree has risen above the threshold computed on
  * entry, and 0 when ffs_update() fails or the deadline passes.
  *
  * The UFS mutex of vp's mount must be held on entry (asserted below).
  * It is dropped around ffs_update() and around the worklist processing
  * and is held again on every return.
  */
 int
 softdep_request_cleanup(fs, vp)
 	struct fs *fs;
 	struct vnode *vp;
 {
 	struct ufsmount *ump;
 	long starttime;
 	ufs2_daddr_t needed;
 	int error;
 
 	ump = VTOI(vp)->i_ump;
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 	/* Threshold: the free block count now plus fs_contigsumsize. */
 	needed = fs->fs_cstotal.cs_nbfree + fs->fs_contigsumsize;
 	/* Despite its name this is the deadline, not the start time. */
 	starttime = time_second + tickdelay;
 	/*
 	 * If we are being called because of a process doing a
 	 * copy-on-write, then it is not safe to update the vnode
 	 * as we may recurse into the copy-on-write routine.
 	 */
 	if (!(curthread->td_pflags & TDP_COWINPROGRESS)) {
 		UFS_UNLOCK(ump);
 		error = ffs_update(vp, 1);
 		UFS_LOCK(ump);
 		if (error != 0)
 			return (0);
 	}
 	while (fs->fs_pendingblocks > 0 && fs->fs_cstotal.cs_nbfree <= needed) {
 		if (time_second > starttime)
 			return (0);
 		/* Swap the UFS mutex for lk while working on the worklist. */
 		UFS_UNLOCK(ump);
 		ACQUIRE_LOCK(&lk);
-		if (softdep_worklist_busy < 0) {
-			request_cleanup(FLUSH_REMOVE_WAIT);
-			FREE_LOCK(&lk);
-			UFS_LOCK(ump);
-			return (0);
-		}
-		softdep_worklist_busy += 1;
-		if (num_on_worklist > 0 &&
-		    process_worklist_item(NULL, LK_NOWAIT) != -1) {
+		if (ump->softdep_on_worklist > 0 &&
+		    process_worklist_item(UFSTOVFS(ump), LK_NOWAIT) != -1) {
 			stat_worklist_push += 1;
-			softdep_worklist_busy -= 1;
-			if (softdep_worklist_req && softdep_worklist_busy == 0)
-				wakeup(&softdep_worklist_req);
 			FREE_LOCK(&lk);
 			UFS_LOCK(ump);
 			continue;
 		}
-		softdep_worklist_busy -= 1;
-		if (softdep_worklist_req && softdep_worklist_busy == 0)
-			wakeup(&softdep_worklist_req);
-		request_cleanup(FLUSH_REMOVE_WAIT);
+		request_cleanup(UFSTOVFS(ump), FLUSH_REMOVE_WAIT);
 		FREE_LOCK(&lk);
 		UFS_LOCK(ump);
 	}
 	return (1);
 }
 
 /*
  * If memory utilization has gotten too high, deliberately slow things
  * down and speed up the I/O processing.
  *
  * mp is the mount whose worklist is examined; resource is one of
  * FLUSH_INODES, FLUSH_REMOVE or FLUSH_REMOVE_WAIT (anything else
  * panics).
  *
  * Returns 0 for threads that are never held up, and when
  * softdep_speedup() returns non-zero for a request other than
  * FLUSH_REMOVE_WAIT.  Returns 1 after processing worklist items or
  * after sleeping.
  *
  * Must be called with lk held (asserted below).
  */
 /* NOTE(review): syncertd is not referenced inside request_cleanup(). */
+extern struct thread *syncertd;
 static int
-request_cleanup(resource)
+request_cleanup(mp, resource)
+	struct mount *mp;
 	int resource;
 {
 	struct thread *td = curthread;
+	struct ufsmount *ump;
 
 	mtx_assert(&lk, MA_OWNED);
 	/*
-	 * We never hold up the filesystem syncer process.
+	 * We never hold up the filesystem syncer or buf daemon.
 	 */
-	if (td == filesys_syncer || (td->td_pflags & TDP_SOFTDEP))
+	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
 		return (0);
+	ump = VFSTOUFS(mp);
 	/*
 	 * First check to see if the work list has gotten backlogged.
 	 * If it has, co-opt this process to help clean up two entries.
 	 * Because this process may hold inodes locked, we cannot
 	 * handle any remove requests that might block on a locked
 	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
 	 * to avoid recursively processing the worklist.
 	 */
-	if (num_on_worklist > max_softdeps / 10 &&
-	    softdep_worklist_busy >= 0) {
-		softdep_worklist_busy += 1;
+	if (ump->softdep_on_worklist > max_softdeps / 10) {
 		td->td_pflags |= TDP_SOFTDEP;
-		process_worklist_item(NULL, LK_NOWAIT);
-		process_worklist_item(NULL, LK_NOWAIT);
+		process_worklist_item(mp, LK_NOWAIT);
+		process_worklist_item(mp, LK_NOWAIT);
 		td->td_pflags &= ~TDP_SOFTDEP;
 		stat_worklist_push += 2;
-		softdep_worklist_busy -= 1;
-		if (softdep_worklist_req && softdep_worklist_busy == 0)
-			wakeup(&softdep_worklist_req);
 		return(1);
 	}
 	/*
 	 * Next, we attempt to speed up the syncer process. If that
 	 * is successful, then we allow the process to continue.
 	 */
-	if (speedup_syncer() && resource != FLUSH_REMOVE_WAIT)
+	if (softdep_speedup() && resource != FLUSH_REMOVE_WAIT)
 		return(0);
 	/*
 	 * If we are resource constrained on inode dependencies, try
 	 * flushing some dirty inodes. Otherwise, we are constrained
 	 * by file deletions, so try accelerating flushes of directories
 	 * with removal dependencies. We would like to do the cleanup
 	 * here, but we probably hold an inode locked at this point and 
 	 * that might deadlock against one that we try to clean. So,
 	 * the best that we can do is request the syncer daemon to do
 	 * the cleanup for us.
 	 */
 	switch (resource) {
 
 	case FLUSH_INODES:
 		stat_ino_limit_push += 1;
 		req_clear_inodedeps += 1;
 		/* pause_timer() increments whatever stat_countp points at. */
 		stat_countp = &stat_ino_limit_hit;
 		break;
 
 	case FLUSH_REMOVE:
 	case FLUSH_REMOVE_WAIT:
 		stat_blk_limit_push += 1;
 		req_clear_remove += 1;
 		stat_countp = &stat_blk_limit_hit;
 		break;
 
 	default:
 		panic("request_cleanup: unknown type");
 	}
 	/*
 	 * Hopefully the syncer daemon will catch up and awaken us.
 	 * We wait at most tickdelay before proceeding in any case.
 	 */
 	proc_waiting += 1;
 	/*
 	 * Start the timer unless one is already pending; the delay is
 	 * tickdelay, but never less than 2.
 	 */
 	if (handle.callout == NULL)
 		handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
 	/* Sleep on proc_waiting; pause_timer() issues the wakeup_one(). */
 	msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
 	proc_waiting -= 1;
 	return (1);
 }
 
 /*
  * Awaken processes pausing in request_cleanup and clear proc_waiting
  * to indicate that there is no longer a timer running.
  *
  * Timer callback armed by request_cleanup(); arg is not used.  It
  * takes lk itself, so it must not be entered with lk held.
  */
 static void
 pause_timer(arg)
 	void *arg;
 {
 
 	ACQUIRE_LOCK(&lk);
 	/* Count the hit against the statistic chosen in request_cleanup(). */
 	*stat_countp += 1;
 	/* Only one sleeper is released per timer expiry. */
 	wakeup_one(&proc_waiting);
 	/*
 	 * Re-arm while sleepers remain; otherwise clear handle.callout,
 	 * which is what request_cleanup() tests before starting a timer.
 	 */
 	if (proc_waiting > 0)
 		handle = timeout(pause_timer, 0, tickdelay > 2 ? tickdelay : 2);
 	else
 		handle.callout = NULL;
 	FREE_LOCK(&lk);
 }
 
 /*
  * Flush out a directory with at least one removal dependency in an effort to
  * reduce the number of dirrem, freefile, and freeblks dependency structures.
  *
  * At most one directory is attempted per call: the function returns
  * right after the first ffs_vget(), whether or not it succeeded.
  * The td argument is not used in the body.
  *
  * Must be called with lk held (asserted below).  lk is released around
  * the vnode work and is held again on return.
  */
 static void
 clear_remove(td)
 	struct thread *td;
 {
 	struct pagedep_hashhead *pagedephd;
 	struct pagedep *pagedep;
 	/* Rotating bucket cursor, kept across calls. */
 	static int next = 0;
 	struct mount *mp;
 	struct vnode *vp;
 	int error, cnt;
 	ino_t ino;
 
 	mtx_assert(&lk, MA_OWNED);
 
 	/* Visit each hash bucket at most once, starting at the cursor. */
 	for (cnt = 0; cnt < pagedep_hash; cnt++) {
 		pagedephd = &pagedep_hashtbl[next++];
 		if (next >= pagedep_hash)
 			next = 0;
 		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
 			/* Only pagedeps with a pending dirrem are of interest. */
 			if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL)
 				continue;
 			/* Copy what is needed before lk is released below. */
-			mp = pagedep->pd_mnt;
+			mp = pagedep->pd_list.wk_mp;
 			ino = pagedep->pd_ino;
 			/* V_NOWAIT: skip this pagedep rather than block. */
 			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
 				continue;
 			FREE_LOCK(&lk);
 			if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp))) {
 				softdep_error("clear_remove: vget", error);
 				vn_finished_write(mp);
 				ACQUIRE_LOCK(&lk);
 				return;
 			}
 			/* A sync failure is reported but does not stop the drain. */
 			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
 				softdep_error("clear_remove: fsync", error);
 			VI_LOCK(vp);
 			drain_output(vp);
 			VI_UNLOCK(vp);
 			vput(vp);
 			vn_finished_write(mp);
 			ACQUIRE_LOCK(&lk);
 			return;
 		}
 	}
 }
 
 /*
  * Clear out a block of dirty inodes in an effort to reduce
  * the number of inodedep dependency structures.
  *
  * The td argument is not used in the body.
  *
  * Must be called with lk held (asserted below).  lk is released around
  * the vnode work for each inode and is held again on return.
  */
 static void
 clear_inodedeps(td)
 	struct thread *td;
 {
 	struct inodedep_hashhead *inodedephd;
 	struct inodedep *inodedep;
 	/* Rotating bucket cursor, kept across calls. */
 	static int next = 0;
 	struct mount *mp;
 	struct vnode *vp;
 	struct fs *fs;
 	int error, cnt;
 	ino_t firstino, lastino, ino;
 
 	mtx_assert(&lk, MA_OWNED);
 	/*
 	 * Pick a random inode dependency to be cleared.
 	 * We will then gather up all the inodes in its block 
 	 * that have dependencies and flush them out.
 	 *
 	 * In practice the pick is the first entry of the next non-empty
 	 * bucket at or after the rotating cursor "next".
 	 * NOTE(review): inodedep is only assigned inside this loop, so
 	 * the NULL test after it relies on inodedep_hash being positive.
 	 */
 	for (cnt = 0; cnt < inodedep_hash; cnt++) {
 		inodedephd = &inodedep_hashtbl[next++];
 		if (next >= inodedep_hash)
 			next = 0;
 		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
 			break;
 	}
 	if (inodedep == NULL)
 		return;
-	/*
-	 * Ugly code to find mount point given pointer to superblock.
-	 */
 	fs = inodedep->id_fs;
-	TAILQ_FOREACH(mp, &mountlist, mnt_list)
-		if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs)
-			break;
+	mp = inodedep->id_list.wk_mp;
 	/*
 	 * Find the last inode in the block with dependencies.
 	 *
 	 * firstino masks off the low bits of id_ino; that is the first
 	 * inode of the block provided INOPB(fs) is a power of two
 	 * (assumed here, TODO confirm).  The downward scan stops at
 	 * firstino without looking it up.
 	 */
 	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
 	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
-		if (inodedep_lookup(fs, lastino, 0, &inodedep) != 0)
+		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
 			break;
 	/*
 	 * Asynchronously push all but the last inode with dependencies.
 	 * Synchronously push the last inode with dependencies to ensure
 	 * that the inode block gets written to free up the inodedeps.
 	 */
 	for (ino = firstino; ino <= lastino; ino++) {
 		/* Skip inodes that have no inodedep at this moment. */
-		if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
+		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
 			continue;
 		/* V_NOWAIT: skip this inode rather than block. */
 		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
 			continue;
 		FREE_LOCK(&lk);
 		if ((error = ffs_vget(mp, ino, LK_EXCLUSIVE, &vp)) != 0) {
 			softdep_error("clear_inodedeps: vget", error);
 			vn_finished_write(mp);
 			ACQUIRE_LOCK(&lk);
 			return;
 		}
 		/* Sync failures are reported but do not stop the loop. */
 		if (ino == lastino) {
 			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
 				softdep_error("clear_inodedeps: fsync1", error);
 		} else {
 			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
 				softdep_error("clear_inodedeps: fsync2", error);
 			VI_LOCK(vp);
 			drain_output(vp);
 			VI_UNLOCK(vp);
 		}
 		vput(vp);
 		vn_finished_write(mp);
 		ACQUIRE_LOCK(&lk);
 	}
 }
 
 /*
  * Function to determine if the buffer has outstanding dependencies
  * that will cause a roll-back if the buffer is written. If wantcount
  * is set, return number of dependencies, otherwise just yes or no.
  *
  * With wantcount clear the scan stops at the first dependency found,
  * so the result is 0 or 1.  lk is taken and released inside, so the
  * caller must not hold it.
  */
 static int
 softdep_count_dependencies(bp, wantcount)
 	struct buf *bp;
 	int wantcount;
 {
 	struct worklist *wk;
 	struct inodedep *inodedep;
 	struct indirdep *indirdep;
 	struct allocindir *aip;
 	struct pagedep *pagedep;
 	struct diradd *dap;
 	int i, retval;
 
 	retval = 0;
 	ACQUIRE_LOCK(&lk);
 	/* Examine every worklist item attached to the buffer. */
 	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
 		switch (wk->wk_type) {
 
 		case D_INODEDEP:
 			/* Each of the three conditions counts at most once. */
 			inodedep = WK_INODEDEP(wk);
 			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
 				/* bitmap allocation dependency */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
 				/* direct block pointer dependency */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
 				/* direct block pointer dependency */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			continue;
 
 		case D_INDIRDEP:
 			indirdep = WK_INDIRDEP(wk);
 
 			/* One count per allocindir on the dependency list. */
 			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
 				/* indirect block pointer dependency */
 				retval += 1;
 				if (!wantcount)
 					goto out;
 			}
 			continue;
 
 		case D_PAGEDEP:
 			pagedep = WK_PAGEDEP(wk);
 			/* One count per diradd in any of the hash chains. */
 			for (i = 0; i < DAHASHSZ; i++) {
 
 				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
 					/* directory entry dependency */
 					retval += 1;
 					if (!wantcount)
 						goto out;
 				}
 			}
 			continue;
 
 		case D_BMSAFEMAP:
 		case D_ALLOCDIRECT:
 		case D_ALLOCINDIR:
 		case D_MKDIR:
 			/* never a dependency on these blocks */
 			continue;
 
 		default:
 			/*
 			 * NOTE(review): the message names
 			 * softdep_check_for_rollback, not this function.
 			 */
 			panic("softdep_check_for_rollback: Unexpected type %s",
 			    TYPENAME(wk->wk_type));
 			/* NOTREACHED */
 		}
 	}
 out:
 	FREE_LOCK(&lk);
 	return retval;
 }
 
 /*
  * Acquire exclusive access to a buffer.
  * Must be called with a locked mtx parameter.
  * Return acquired buffer or NULL on failure.
  *
  * On success the buffer is locked, has B_DELWRI set and has been
  * passed to bremfree().  NULL is returned when the buffer lock is not
  * obtained, when a background write is in progress, or when the buffer
  * is not marked B_DELWRI.
  *
  * With waitfor == MNT_WAIT the function may sleep and release mtx
  * while doing so; it then returns NULL even if the lock was obtained,
  * so the caller has to re-validate its state and try again.  The
  * paths that unlock mtx explicitly re-take it with mtx_lock() before
  * returning.
  */
 static struct buf *
 getdirtybuf(bp, mtx, waitfor)
 	struct buf *bp;
 	struct mtx *mtx;
 	int waitfor;
 {
 	int error;
 
 	mtx_assert(mtx, MA_OWNED);
 	/* Try without sleeping first. */
 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
 		if (waitfor != MNT_WAIT)
 			return (NULL);
 		/* Sleep for the buffer lock, handing mtx over as interlock. */
 		error = BUF_LOCK(bp,
 		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, mtx);
 		/*
 		 * Even if we successfully acquire bp here, we have dropped
 		 * mtx, which may violate our guarantee.
 		 */
 		if (error == 0)
 			BUF_UNLOCK(bp);
 		else if (error != ENOLCK)
 			panic("getdirtybuf: inconsistent lock: %d", error);
 		mtx_lock(mtx);
 		return (NULL);
 	}
 	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 		/*
 		 * When mtx is the softdep lock, swap it for the bufobj
 		 * lock, sleep there if the background write is still in
 		 * progress, and take mtx back before returning.
 		 */
 		if (mtx == &lk && waitfor == MNT_WAIT) {
 			mtx_unlock(mtx);
 			BO_LOCK(bp->b_bufobj);
 			BUF_UNLOCK(bp);
 			/* Re-test now that the bufobj lock is held. */
 			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 				bp->b_vflags |= BV_BKGRDWAIT;
 				msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj),
 				       PRIBIO | PDROP, "getbuf", 0);
 			} else
 				BO_UNLOCK(bp->b_bufobj);
 			mtx_lock(mtx);
 			return (NULL);
 		}
 		BUF_UNLOCK(bp);
 		if (waitfor != MNT_WAIT)
 			return (NULL);
 		/*
 		 * The mtx argument must be bp->b_vp's mutex in
 		 * this case.
 		 */
 #ifdef	DEBUG_VFS_LOCKS
 		if (bp->b_vp->v_type != VCHR)
 			ASSERT_VI_LOCKED(bp->b_vp, "getdirtybuf");
 #endif
 		bp->b_vflags |= BV_BKGRDWAIT;
 		msleep(&bp->b_xflags, mtx, PRIBIO, "getbuf", 0);
 		return (NULL);
 	}
 	/* A buffer that is not a delayed write is of no use to the caller. */
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		BUF_UNLOCK(bp);
 		return (NULL);
 	}
 	bremfree(bp);
 	return (bp);
 }
 
 /*
  * Wait for pending output on a vnode to complete.
  * Must be called with vnode lock and interlock locked.
  *
  * XXX: Should just be a call to bufobj_wwait().
  */
 static void
 drain_output(vp)
 	struct vnode *vp;
 {
 	ASSERT_VOP_LOCKED(vp, "drain_output");
 	ASSERT_VI_LOCKED(vp, "drain_output");
 
 	/*
 	 * Sleep on bo_numoutput, with the vnode interlock as the sleep
 	 * mutex, until the count reaches zero.  BO_WWAIT is set before
 	 * each sleep.
 	 */
 	while (vp->v_bufobj.bo_numoutput) {
 		vp->v_bufobj.bo_flag |= BO_WWAIT;
 		msleep((caddr_t)&vp->v_bufobj.bo_numoutput,
 		    VI_MTX(vp), PRIBIO + 1, "drainvp", 0);
 	}
 }
 
 /*
  * Called whenever a buffer that is being invalidated or reallocated
  * contains dependencies. This should only happen if an I/O error has
  * occurred. The routine is called with the buffer locked.
  *
  * Both paths through the body end in panic(): without BIO_ERROR set
  * the dependencies are reported as dangling, otherwise the error is
  * logged against the mount point name first.
  */ 
 static void
 softdep_deallocate_dependencies(bp)
 	struct buf *bp;
 {
 
 	if ((bp->b_ioflags & BIO_ERROR) == 0)
 		panic("softdep_deallocate_dependencies: dangling deps");
 	/* The "mounted on" name is passed as the message prefix. */
 	softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
 	panic("softdep_deallocate_dependencies: unrecovered I/O error");
 }
 
 /*
  * Function to handle asynchronous write errors in the filesystem.
  *
  * func is printed as the message prefix (callers in this file pass
  * either a "function: operation" tag or a mount point name) and error
  * is the error number.  The error is only reported; nothing is
  * recovered.
  */
 static void
 softdep_error(func, error)
 	char *func;
 	int error;
 {
 
 	/* XXX should do something better! */
 	printf("%s: got error %d while accessing filesystem\n", func, error);
 }
 
 #endif /* SOFTUPDATES */
Index: head/sys/ufs/ffs/softdep.h
===================================================================
--- head/sys/ufs/ffs/softdep.h	(revision 156202)
+++ head/sys/ufs/ffs/softdep.h	(revision 156203)
@@ -1,590 +1,585 @@
 /*-
  * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
  *
  * The soft updates code is derived from the appendix of a University
  * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
  * "Soft Updates: A Solution to the Metadata Update Problem in File
  * Systems", CSE-TR-254-95, August 1995).
  *
  * Further information about soft updates can be obtained from:
  *
  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
  *	1614 Oxford Street		mckusick@mckusick.com
  *	Berkeley, CA 94709-1608		+1-510-843-9542
  *	USA
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)softdep.h	9.7 (McKusick) 6/21/00
  * $FreeBSD$
  */
 
 #include <sys/queue.h>
 
 /*
  * Allocation dependencies are handled with undo/redo on the in-memory
  * copy of the data. A particular data dependency is eliminated when
  * it is ALLCOMPLETE: that is ATTACHED, DEPCOMPLETE, and COMPLETE.
  * 
  * ATTACHED means that the data is not currently being written to
  * disk. UNDONE means that the data has been rolled back to a safe
  * state for writing to the disk. When the I/O completes, the data is
  * restored to its current form and the state reverts to ATTACHED.
  * The data must be locked throughout the rollback, I/O, and roll
  * forward so that the rolled back information is never visible to
  * user processes. The COMPLETE flag indicates that the item has been
  * written. For example, a dependency that requires that an inode be
  * written will be marked COMPLETE after the inode has been written
  * to disk. The DEPCOMPLETE flag indicates that any other dependencies,
  * such as the writing of a cylinder group map, have been
  * completed. A dependency structure may be freed only when both it
  * and its dependencies have completed and any rollbacks that are in
  * progress have finished as indicated by the set of ALLCOMPLETE flags
  * all being set. The two MKDIR flags indicate additional dependencies
  * that must be done when creating a new directory. MKDIR_BODY is
  * cleared when the directory data block containing the "." and ".."
  * entries has been written. MKDIR_PARENT is cleared when the parent
  * inode with the increased link count for ".." has been written. When
  * both MKDIR flags have been cleared, the DEPCOMPLETE flag is set to
  * indicate that the directory dependencies have been completed. The
  * writing of the directory inode itself sets the COMPLETE flag which
  * then allows the directory entry for the new directory to be written
  * to disk. The RMDIR flag marks a dirrem structure as representing
  * the removal of a directory rather than a file. When the removal
  * dependencies are completed, additional work needs to be done
  * (truncation of the "." and ".." entries, an additional decrement
  * of the associated inode, and a decrement of the parent inode). The
  * DIRCHG flag marks a diradd structure as representing the changing
  * of an existing entry rather than the addition of a new one. When
  * the update is complete the dirrem associated with the inode for
  * the old name must be added to the worklist to do the necessary
  * reference count decrement. The GOINGAWAY flag indicates that the
  * data structure is frozen from further change until its dependencies
  * have been completed and its resources freed after which it will be
  * discarded. The IOSTARTED flag prevents multiple calls to the I/O
  * start routine from doing multiple rollbacks. The SPACECOUNTED flag
  * says that the file's space has been accounted to the pending free
  * space count. The NEWBLOCK flag marks pagedep structures that have
  * just been allocated, so must be claimed by the inode before all
  * dependencies are complete. The INPROGRESS flag marks worklist
  * structures that are still on the worklist, but are being considered
  * for action by some process. The UFS1FMT flag indicates that the
  * inode being processed is a ufs1 format. The EXTDATA flag indicates
  * that the allocdirect describes an extended-attributes dependency.
  * The ONWORKLIST flag shows whether the structure is currently linked
  * onto a worklist.
  */
 #define	ATTACHED	0x0001	/* data not currently being written */
 #define	UNDONE		0x0002	/* data rolled back to a safe state */
 #define	COMPLETE	0x0004	/* the item has been written */
 #define	DEPCOMPLETE	0x0008	/* other dependencies have completed */
 #define	MKDIR_PARENT	0x0010	/* diradd & mkdir only */
 #define	MKDIR_BODY	0x0020	/* diradd & mkdir only */
 #define	RMDIR		0x0040	/* dirrem only */
 #define	DIRCHG		0x0080	/* diradd & dirrem only */
 #define	GOINGAWAY	0x0100	/* indirdep only */
 #define	IOSTARTED	0x0200	/* inodedep & pagedep only */
 #define	SPACECOUNTED	0x0400	/* inodedep only */
 #define	NEWBLOCK	0x0800	/* pagedep only */
 #define	INPROGRESS	0x1000	/* dirrem, freeblks, freefrag, freefile only */
 #define	UFS1FMT		0x2000	/* indirdep only */
 #define	EXTDATA		0x4000	/* allocdirect only */
 #define ONWORKLIST	0x8000	/* structure is linked onto a worklist */
 
 #define	ALLCOMPLETE	(ATTACHED | COMPLETE | DEPCOMPLETE) /* dependency may be eliminated */
 
 /*
  * The workitem queue.
  * 
  * It is sometimes useful and/or necessary to clean up certain dependencies
  * in the background rather than during execution of an application process
  * or interrupt service routine. To realize this, we append dependency
  * structures corresponding to such tasks to a "workitem" queue. In a soft
  * updates implementation, most pending workitems should not wait for more
  * than a couple of seconds, so the filesystem syncer process awakens once
  * per second to process the items on the queue.
  */
 
 /* LIST_HEAD(workhead, worklist);	-- declared in buf.h */
 
 /*
  * Each request can be linked onto a work queue through its worklist structure.
  * To avoid the need for a pointer to the structure itself, this structure
  * MUST be declared FIRST in each type in which it appears! If more than one
  * worklist is needed in the structure, then a wk_data field must be added
  * and the macros below changed to use it.
  */
 struct worklist {
+	struct mount		*wk_mp;		/* Mount we live in */
 	LIST_ENTRY(worklist)	wk_list;	/* list of work requests */
 	unsigned short		wk_type;	/* type of request */
 	unsigned short		wk_state;	/* state flags */
 };
 /*
  * Each WK_* macro below is a plain pointer cast from a worklist pointer to
  * the named dependency structure. The cast is only meaningful because the
  * worklist must be the first member of every structure that embeds it.
  */
 #define WK_DATA(wk) ((void *)(wk))
 #define WK_PAGEDEP(wk) ((struct pagedep *)(wk))
 #define WK_INODEDEP(wk) ((struct inodedep *)(wk))
-#define WK_NEWBLK(wk) ((struct newblk *)(wk))
 #define WK_BMSAFEMAP(wk) ((struct bmsafemap *)(wk))
 #define WK_ALLOCDIRECT(wk) ((struct allocdirect *)(wk))
 #define WK_INDIRDEP(wk) ((struct indirdep *)(wk))
 #define WK_ALLOCINDIR(wk) ((struct allocindir *)(wk))
 #define WK_FREEFRAG(wk) ((struct freefrag *)(wk))
 #define WK_FREEBLKS(wk) ((struct freeblks *)(wk))
 #define WK_FREEFILE(wk) ((struct freefile *)(wk))
 #define WK_DIRADD(wk) ((struct diradd *)(wk))
 #define WK_MKDIR(wk) ((struct mkdir *)(wk))
 #define WK_DIRREM(wk) ((struct dirrem *)(wk))
 #define WK_NEWDIRBLK(wk) ((struct newdirblk *)(wk))
 
 /*
  * Various types of lists
  */
 LIST_HEAD(dirremhd, dirrem);		/* used by pagedep pd_dirremhd */
 LIST_HEAD(diraddhd, diradd);		/* used by pagedep pd_diraddhd[], pd_pendinghd */
 LIST_HEAD(newblkhd, newblk);		/* used by bmsafemap sm_newblkhd */
 LIST_HEAD(inodedephd, inodedep);	/* used by bmsafemap sm_inodedephd */
 LIST_HEAD(allocindirhd, allocindir);	/* used by indirdep and bmsafemap */
 LIST_HEAD(allocdirecthd, allocdirect);	/* used by bmsafemap sm_allocdirecthd */
 TAILQ_HEAD(allocdirectlst, allocdirect); /* used by the inodedep update lists */
 
 /*
  * The "pagedep" structure tracks the various dependencies related to
  * a particular directory page. If a directory page has any dependencies,
  * it will have a pagedep linked to its associated buffer. The
  * pd_dirremhd list holds the list of dirrem requests which decrement
  * inode reference counts. These requests are processed after the
  * directory page with the corresponding zero'ed entries has been
  * written. The pd_diraddhd list maintains the list of diradd requests
  * which cannot be committed until their corresponding inode has been
  * written to disk. Because a directory may have many new entries
  * being created, several lists are maintained hashed on bits of the
  * offset of the entry into the directory page to keep the lists from
  * getting too long. Once a new directory entry has been cleared to
  * be written, it is moved to the pd_pendinghd list. After the new
  * entry has been written to disk it is removed from the pd_pendinghd
  * list, any removed operations are done, and the dependency structure
  * is freed.
  */
 #define DAHASHSZ 5	/* number of pd_diraddhd hash chains */
 #define DIRADDHASH(offset) (((offset) >> 2) % DAHASHSZ) /* chain index: offset / 4, modulo DAHASHSZ */
 struct pagedep {
 	struct	worklist pd_list;	/* page buffer */
 #	define	pd_state pd_list.wk_state /* check for multiple I/O starts */
 	LIST_ENTRY(pagedep) pd_hash;	/* hashed lookup */
-	struct	mount *pd_mnt;		/* associated mount point */
 	ino_t	pd_ino;			/* associated file */
 	ufs_lbn_t pd_lbn;		/* block within file */
 	struct	dirremhd pd_dirremhd;	/* dirrem's waiting for page */
 	struct	diraddhd pd_diraddhd[DAHASHSZ]; /* diradd dir entry updates */
 	struct	diraddhd pd_pendinghd;	/* directory entries awaiting write */
 };
 
 /*
  * The "inodedep" structure tracks the set of dependencies associated
  * with an inode. One task that it must manage is delayed operations
  * (i.e., work requests that must be held until the inodedep's associated
  * inode has been written to disk). Getting an inode from its incore 
  * state to the disk requires two steps to be taken by the filesystem
  * in this order: first the inode must be copied to its disk buffer by
  * the VOP_UPDATE operation; second the inode's buffer must be written
  * to disk. To ensure that both operations have happened in the required
  * order, the inodedep maintains two lists. Delayed operations are
  * placed on the id_inowait list. When the VOP_UPDATE is done, all
  * operations on the id_inowait list are moved to the id_bufwait list.
  * When the buffer is written, the items on the id_bufwait list can be
  * safely moved to the work queue to be processed. A second task of the
  * inodedep structure is to track the status of block allocation within
  * the inode.  Each block that is allocated is represented by an
  * "allocdirect" structure (see below). It is linked onto the id_newinoupdt
  * list until both its contents and its allocation in the cylinder
  * group map have been written to disk. Once these dependencies have been
  * satisfied, it is removed from the id_newinoupdt list and any followup
  * actions such as releasing the previous block or fragment are placed
  * on the id_inowait list. When an inode is updated (a VOP_UPDATE is
  * done), the "inodedep" structure is linked onto the buffer through
  * its worklist. Thus, it will be notified when the buffer is about
  * to be written and when it is done. At the update time, all the
  * elements on the id_newinoupdt list are moved to the id_inoupdt list
  * since those changes are now relevant to the copy of the inode in the
  * buffer. Also at update time, the tasks on the id_inowait list are
  * moved to the id_bufwait list so that they will be executed when
  * the updated inode has been written to disk. When the buffer containing
  * the inode is written to disk, any updates listed on the id_inoupdt
  * list are rolled back as they are not yet safe. Following the write,
  * the changes are once again rolled forward and any actions on the
  * id_bufwait list are processed (since those actions are now safe).
  * The entries on the id_inoupdt and id_newinoupdt lists must be kept
  * sorted by logical block number to speed the calculation of the size
  * of the rolled back inode (see explanation in initiate_write_inodeblock).
  * When a directory entry is created, it is represented by a diradd.
  * The diradd is added to the id_inowait list as it cannot be safely
  * written to disk until the inode that it represents is on disk. After
  * the inode is written, the id_bufwait list is processed and the diradd
  * entries are moved to the id_pendinghd list where they remain until
  * the directory block containing the name has been written to disk.
  * The purpose of keeping the entries on the id_pendinghd list is so that
  * the softdep_fsync function can find and push the inode's directory
  * name(s) as part of the fsync operation for that file.
  */
 struct inodedep {
 	struct	worklist id_list;	/* buffer holding inode block */
 #	define	id_state id_list.wk_state /* inode dependency state */
 	LIST_ENTRY(inodedep) id_hash;	/* hashed lookup */
 	struct	fs *id_fs;		/* associated filesystem */
 	ino_t	id_ino;			/* dependent inode */
 	nlink_t	id_nlinkdelta;		/* saved effective link count */
 	LIST_ENTRY(inodedep) id_deps;	/* bmsafemap's list of inodedep's */
 	struct	buf *id_buf;		/* related bmsafemap (if pending) */
 	long	id_savedextsize;	/* ext size saved during rollback */
 	off_t	id_savedsize;		/* file size saved during rollback */
 	struct	workhead id_pendinghd;	/* entries awaiting directory write */
 	struct	workhead id_bufwait;	/* operations after inode written */
 	struct	workhead id_inowait;	/* operations waiting inode update */
 	struct	allocdirectlst id_inoupdt; /* updates before inode written */
 	struct	allocdirectlst id_newinoupdt; /* updates when inode written */
 	struct	allocdirectlst id_extupdt; /* extdata updates pre-inode write */
 	struct	allocdirectlst id_newextupdt; /* extdata updates at ino write */
 	/* The two saved-dinode pointers share storage; use the macros below. */
 	union {
 	struct	ufs1_dinode *idu_savedino1; /* saved ufs1_dinode contents */
 	struct	ufs2_dinode *idu_savedino2; /* saved ufs2_dinode contents */
 	} id_un;
 };
 #define id_savedino1 id_un.idu_savedino1 /* shorthand for the ufs1 union member */
 #define id_savedino2 id_un.idu_savedino2 /* shorthand for the ufs2 union member */
 
 /*
  * A "newblk" structure is attached to a bmsafemap structure when a block
  * or fragment is allocated from a cylinder group. Its state is set to
  * DEPCOMPLETE when its cylinder group map is written. It is consumed by
  * an associated allocdirect or allocindir allocation which will attach
  * themselves to the bmsafemap structure if the newblk's DEPCOMPLETE flag
  * is not set (i.e., its cylinder group map has not been written).
  */ 
 struct newblk {
 	/* No struct worklist is embedded here; state lives in nb_state. */
 	LIST_ENTRY(newblk) nb_hash;	/* hashed lookup */
 	struct	fs *nb_fs;		/* associated filesystem */
 	int	nb_state;		/* state of bitmap dependency */
 	ufs2_daddr_t nb_newblkno;	/* allocated block number */
 	LIST_ENTRY(newblk) nb_deps;	/* bmsafemap's list of newblk's */
 	struct	bmsafemap *nb_bmsafemap; /* associated bmsafemap */
 };
 
 /*
  * A "bmsafemap" structure maintains a list of dependency structures
  * that depend on the update of a particular cylinder group map.
  * It has lists for newblks, allocdirects, allocindirs, and inodedeps.
  * It is attached to the buffer of a cylinder group block when any of
  * these things are allocated from the cylinder group. It is freed
  * after the cylinder group map is written and the state of its
  * dependencies are updated with DEPCOMPLETE to indicate that it has
  * been processed.
  */
 struct bmsafemap {
 	struct	worklist sm_list;	/* cylgrp buffer */
 	struct	buf *sm_buf;		/* associated buffer */
 	/* One list per kind of structure waiting on this cylinder group map. */
 	struct	allocdirecthd sm_allocdirecthd; /* allocdirect deps */
 	struct	allocindirhd sm_allocindirhd; /* allocindir deps */
 	struct	inodedephd sm_inodedephd; /* inodedep deps */
 	struct	newblkhd sm_newblkhd;	/* newblk deps */
 };
 
 /*
  * An "allocdirect" structure is attached to an "inodedep" when a new block
  * or fragment is allocated and pointed to by the inode described by
  * "inodedep". The worklist is linked to the buffer that holds the block.
  * When the block is first allocated, it is linked to the bmsafemap
  * structure associated with the buffer holding the cylinder group map
  * from which it was allocated. When the cylinder group map is written
  * to disk, ad_state has the DEPCOMPLETE flag set. When the block itself
  * is written, the COMPLETE flag is set. Once both the cylinder group map
  * and the data itself have been written, it is safe to write the inode
  * that claims the block. If there was a previous fragment that had been
  * allocated before the file was increased in size, the old fragment may
  * be freed once the inode claiming the new block is written to disk.
  * This ad_fragfree request is attached to the id_inowait list of the
  * associated inodedep (pointed to by ad_inodedep) for processing after
  * the inode is written. When a block is allocated to a directory, an
  * fsync of a file whose name is within that block must ensure not only
  * that the block containing the file name has been written, but also
  * that the on-disk inode references that block. When a new directory
  * block is created, we allocate a newdirblk structure which is linked
  * to the associated allocdirect (on its ad_newdirblk list). When the
  * allocdirect has been satisfied, the newdirblk structure is moved to
  * the inodedep id_bufwait list of its directory to await the inode
  * being written. When the inode is written, the directory entries are
  * fully committed and can be deleted from their pagedep->pd_pendinghd
  * and inodedep->id_pendinghd lists.
  */
 struct allocdirect {
 	/* Worklist header first, as the WK_ALLOCDIRECT() cast requires. */
 	struct	worklist ad_list;	/* buffer holding block */
 #	define	ad_state ad_list.wk_state /* block pointer state */
 	TAILQ_ENTRY(allocdirect) ad_next; /* inodedep's list of allocdirect's */
 	ufs_lbn_t ad_lbn;		/* block within file */
 	ufs2_daddr_t ad_newblkno;	/* new value of block pointer */
 	ufs2_daddr_t ad_oldblkno;	/* old value of block pointer */
 	long	ad_newsize;		/* size of new block */
 	long	ad_oldsize;		/* size of old block */
 	LIST_ENTRY(allocdirect) ad_deps; /* bmsafemap's list of allocdirect's */
 	struct	buf *ad_buf;		/* cylgrp buffer (if pending) */
 	struct	inodedep *ad_inodedep;	/* associated inodedep */
 	struct	freefrag *ad_freefrag;	/* fragment to be freed (if any) */
 	struct	workhead ad_newdirblk;	/* dir block to notify when written */
 };
 
 /*
  * A single "indirdep" structure manages all allocation dependencies for
  * pointers in an indirect block. The up-to-date state of the indirect
  * block is stored in ir_savedata. The set of pointers that may be safely
  * written to the disk is stored in ir_safecopy. The state field is used
  * only to track whether the buffer is currently being written (in which
  * case it is not safe to update ir_safecopy). Ir_deplisthd contains the
  * list of allocindir structures, one for each block that needs to be
  * written to disk. Once the block and its bitmap allocation have been
  * written the safecopy can be updated to reflect the allocation and the
  * allocindir structure freed. If ir_state indicates that an I/O on the
  * indirect block is in progress when ir_safecopy is to be updated, the
  * update is deferred by placing the allocindir on the ir_donehd list.
  * When the I/O on the indirect block completes, the entries on the
  * ir_donehd list are processed by updating their corresponding ir_safecopy
  * pointers and then freeing the allocindir structure.
  */
 struct indirdep {
 	struct	worklist ir_list;	/* buffer holding indirect block */
 #	define	ir_state ir_list.wk_state /* indirect block pointer state */
 	caddr_t ir_saveddata;		/* buffer cache contents */
 	/*
 	 * NOTE(review): the comment above this structure calls the safe copy
 	 * "ir_safecopy"; the member holding it is presumably ir_savebp.
 	 */
 	struct	buf *ir_savebp;		/* buffer holding safe copy */
 	struct	allocindirhd ir_donehd;	/* done waiting to update safecopy */
 	struct	allocindirhd ir_deplisthd; /* allocindir deps for this block */
 };
 
 /*
  * An "allocindir" structure is attached to an "indirdep" when a new block
  * is allocated and pointed to by the indirect block described by the
  * "indirdep". The worklist is linked to the buffer that holds the new block.
  * When the block is first allocated, it is linked to the bmsafemap
  * structure associated with the buffer holding the cylinder group map
  * from which it was allocated. When the cylinder group map is written
  * to disk, ai_state has the DEPCOMPLETE flag set. When the block itself
  * is written, the COMPLETE flag is set. Once both the cylinder group map
  * and the data itself have been written, it is safe to write the entry in
  * the indirect block that claims the block; the "allocindir" dependency 
  * can then be freed as it is no longer applicable.
  */
 struct allocindir {
 	/*
 	 * NOTE(review): the comment above this structure says the worklist is
 	 * linked to the buffer holding the new block, while the member
 	 * comment below says "indirect block" -- confirm which is current.
 	 */
 	struct	worklist ai_list;	/* buffer holding indirect block */
 #	define	ai_state ai_list.wk_state /* indirect block pointer state */
 	LIST_ENTRY(allocindir) ai_next;	/* indirdep's list of allocindir's */
 	int	ai_offset;		/* pointer offset in indirect block */
 	ufs2_daddr_t ai_newblkno;	/* new block pointer value */
 	ufs2_daddr_t ai_oldblkno;	/* old block pointer value */
 	struct	freefrag *ai_freefrag;	/* block to be freed when complete */
 	struct	indirdep *ai_indirdep;	/* address of associated indirdep */
 	LIST_ENTRY(allocindir) ai_deps;	/* bmsafemap's list of allocindir's */
 	struct	buf *ai_buf;		/* cylgrp buffer (if pending) */
 };
 
 /*
  * A "freefrag" structure is attached to an "inodedep" when a previously
  * allocated fragment is replaced with a larger fragment, rather than extended.
  * The "freefrag" structure is constructed and attached when the replacement
  * block is first allocated. It is processed after the inode claiming the
  * bigger block that replaces it has been written to disk. Note that the
  * ff_state field is used to store the uid, so may lose data. However,
  * the uid is used only in printing an error message, so is not critical.
  * Keeping it in a short keeps the data structure down to 32 bytes.
  */
 struct freefrag {
 	struct	worklist ff_list;	/* id_inowait or delayed worklist */
 	/* ff_state aliases wk_state, an unsigned short, so a uid may be truncated. */
 #	define	ff_state ff_list.wk_state /* owning user; should be uid_t */
-	struct	mount *ff_mnt;		/* associated mount point */
 	ufs2_daddr_t ff_blkno;		/* fragment physical block number */
 	long	ff_fragsize;		/* size of fragment being deleted */
 	ino_t	ff_inum;		/* owning inode number */
 };
 
 /*
  * A "freeblks" structure is attached to an "inodedep" when the
  * corresponding file's length is reduced to zero. It records all
  * the information needed to free the blocks of a file after its
  * zero'ed inode has been written to disk.
  */
 struct freeblks {
 	struct	worklist fb_list;	/* id_inowait or delayed worklist */
 #	define	fb_state fb_list.wk_state /* inode and dirty block state */
 	ino_t	fb_previousinum;	/* inode of previous owner of blocks */
 	uid_t	fb_uid;			/* uid of previous owner of blocks */
 	struct	vnode *fb_devvp;	/* filesystem device vnode */
-	struct	mount *fb_mnt;		/* associated mount point */
 	long	fb_oldextsize;		/* previous ext data size */
 	off_t	fb_oldsize;		/* previous file size */
 	ufs2_daddr_t fb_chkcnt;		/* used to check cnt of blks released */
 	ufs2_daddr_t fb_dblks[NDADDR];	/* direct blk ptrs to deallocate */
 	ufs2_daddr_t fb_iblks[NIADDR];	/* indirect blk ptrs to deallocate */
 	ufs2_daddr_t fb_eblks[NXADDR];	/* ext data blk ptrs to deallocate */
 };
 
 /*
  * A "freefile" structure is attached to an inode when its
  * link count is reduced to zero. It marks the inode as free in
  * the cylinder group map after the zero'ed inode has been written
  * to disk and any associated blocks and fragments have been freed.
  */
 struct freefile {
 	/* No state alias is defined for this structure's worklist. */
 	struct	worklist fx_list;	/* id_inowait or delayed worklist */
 	mode_t	fx_mode;		/* mode of inode */
 	ino_t	fx_oldinum;		/* inum of the unlinked file */
 	struct	vnode *fx_devvp;	/* filesystem device vnode */
-	struct	mount *fx_mnt;		/* associated mount point */
 };
 
 /*
  * A "diradd" structure is linked to an "inodedep" id_inowait list when a
  * new directory entry is allocated that references the inode described
  * by "inodedep". When the inode itself is written (either the initial
  * allocation for new inodes or with the increased link count for
  * existing inodes), the COMPLETE flag is set in da_state. If the entry
  * is for a newly allocated inode, the "inodedep" structure is associated
  * with a bmsafemap which prevents the inode from being written to disk
  * until the cylinder group has been updated. Thus the da_state COMPLETE
  * flag cannot be set until the inode bitmap dependency has been removed.
  * When creating a new file, it is safe to write the directory entry that
  * claims the inode once the referenced inode has been written. Since
  * writing the inode clears the bitmap dependencies, the DEPCOMPLETE flag
  * in the diradd can be set unconditionally when creating a file. When
  * creating a directory, there are two additional dependencies described by
  * mkdir structures (see their description below). When these dependencies
  * are resolved the DEPCOMPLETE flag is set in the diradd structure.
  * If there are multiple links created to the same inode, there will be
  * a separate diradd structure created for each link. The diradd is
  * linked onto the pd_diraddhd list of the pagedep for the directory
  * page that contains the entry. When a directory page is written,
  * the pd_diraddhd list is traversed to roll back any entries that are
  * not yet ready to be written to disk. If a directory entry is being
  * changed (by rename) rather than added, the DIRCHG flag is set and
  * the da_previous entry points to the entry that will be "removed"
  * once the new entry has been committed. During rollback, entries
  * with da_previous are replaced with the previous inode number rather
  * than zero.
  *
  * The overlaying of da_pagedep and da_previous is done to keep the
  * structure down to 32 bytes in size on a 32-bit machine. If a
  * da_previous entry is present, the pointer to its pagedep is available
  * in the associated dirrem entry. If the DIRCHG flag is set, the
  * da_previous entry is valid; if not set the da_pagedep entry is valid.
  * The DIRCHG flag never changes; it is set when the structure is created
  * if appropriate and is never cleared.
  */
 struct diradd {
 	struct	worklist da_list;	/* id_inowait or id_pendinghd list */
 #	define	da_state da_list.wk_state /* state of the new directory entry */
 	LIST_ENTRY(diradd) da_pdlist;	/* pagedep holding directory block */
 	doff_t	da_offset;		/* offset of new dir entry in dir blk */
 	ino_t	da_newinum;		/* inode number for the new dir entry */
 	/* Which member is valid depends on the DIRCHG flag in da_state. */
 	union {
 	struct	dirrem *dau_previous;	/* entry being replaced in dir change */
 	struct	pagedep *dau_pagedep;	/* pagedep dependency for addition */
 	} da_un;
 };
 #define da_previous da_un.dau_previous /* valid when DIRCHG is set */
 #define da_pagedep da_un.dau_pagedep /* valid when DIRCHG is not set */
 
 /*
  * Two "mkdir" structures are needed to track the additional dependencies
  * associated with creating a new directory entry. Normally a directory
  * addition can be committed as soon as the newly referenced inode has been
  * written to disk with its increased link count. When a directory is
  * created there are two additional dependencies: writing the directory
  * data block containing the "." and ".." entries (MKDIR_BODY) and writing
  * the parent inode with the increased link count for ".." (MKDIR_PARENT).
  * These additional dependencies are tracked by two mkdir structures that
  * reference the associated "diradd" structure. When they have completed,
  * they set the DEPCOMPLETE flag on the diradd so that it knows that its
  * extra dependencies have been completed. The md_state field is used only
  * to identify which type of dependency the mkdir structure is tracking.
  * It is not used in the mainline code for any purpose other than consistency
  * checking. All the mkdir structures in the system are linked together on
  * a list. This list is needed so that a diradd can find its associated
  * mkdir structures and deallocate them if it is prematurely freed (as for
  * example if a mkdir is immediately followed by a rmdir of the same directory).
  * Here, the free of the diradd must traverse the list to find the associated
  * mkdir structures that reference it. The deletion would be faster if the
  * diradd structure were simply augmented to have two pointers that referenced
  * the associated mkdir's. However, this would increase the size of the diradd
  * structure from 32 to 64-bits to speed a very infrequent operation.
  */
 struct mkdir {
 	struct	worklist md_list;	/* id_inowait or buffer holding dir */
 #	define	md_state md_list.wk_state /* type: MKDIR_PARENT or MKDIR_BODY */
 	struct	diradd *md_diradd;	/* associated diradd */
 	struct	buf *md_buf;		/* MKDIR_BODY: buffer holding dir */
 	LIST_ENTRY(mkdir) md_mkdirs;	/* list of all mkdirs */
 };
 /*
  * Head of the list linking all mkdir structures (through md_mkdirs).
  * NOTE(review): this both declares the head type and defines the variable
  * mkdirlisthd inside a header, so every file including it carries a
  * definition -- confirm that is intended.
  */
 LIST_HEAD(mkdirlist, mkdir) mkdirlisthd;
 
 /*
  * A "dirrem" structure describes an operation to decrement the link
  * count on an inode. The dirrem structure is attached to the pd_dirremhd
  * list of the pagedep for the directory page that contains the entry.
  * It is processed after the directory page with the deleted entry has
  * been written to disk.
  *
  * The overlaying of dm_pagedep and dm_dirinum is done to keep the
  * structure down to 32 bytes in size on a 32-bit machine. It works
  * because they are never used concurrently.
  */
 struct dirrem {
 	struct	worklist dm_list;	/* delayed worklist */
 #	define	dm_state dm_list.wk_state /* state of the old directory entry */
 	LIST_ENTRY(dirrem) dm_next;	/* pagedep's list of dirrem's */
-	struct	mount *dm_mnt;		/* associated mount point */
 	ino_t	dm_oldinum;		/* inum of the removed dir entry */
 	/* The two members share storage and are never used concurrently. */
 	union {
 	struct	pagedep *dmu_pagedep;	/* pagedep dependency for remove */
 	ino_t	dmu_dirinum;		/* parent inode number (for rmdir) */
 	} dm_un;
 };
 #define dm_pagedep dm_un.dmu_pagedep /* shorthand for the pagedep union member */
 #define dm_dirinum dm_un.dmu_dirinum /* shorthand for the parent-inum union member */
 
 /*
  * A "newdirblk" structure tracks the progress of a newly allocated
  * directory block from its creation until it is claimed by its on-disk
  * inode. When a block is allocated to a directory, an fsync of a file
  * whose name is within that block must ensure not only that the block
  * containing the file name has been written, but also that the on-disk
  * inode references that block. When a new directory block is created,
  * we allocate a newdirblk structure which is linked to the associated
  * allocdirect (on its ad_newdirblk list). When the allocdirect has been
  * satisfied, the newdirblk structure is moved to the inodedep id_bufwait
  * list of its directory to await the inode being written. When the inode
  * is written, the directory entries are fully committed and can be
  * deleted from their pagedep->pd_pendinghd and inodedep->id_pendinghd
  * lists. Note that we could track directory blocks allocated to indirect
  * blocks using a similar scheme with the allocindir structures. Rather
  * than adding this level of complexity, we simply write those newly 
  * allocated indirect blocks synchronously as such allocations are rare.
  */
 struct newdirblk {
 	/*
 	 * NOTE(review): the comment above this structure describes the
 	 * allocdirect ad_newdirblk list and the inodedep id_bufwait list,
 	 * not the lists named in the member comment below -- confirm.
 	 */
 	struct	worklist db_list;	/* id_inowait or pg_newdirblk */
 #	define	db_state db_list.wk_state /* unused */
 	struct	pagedep *db_pagedep;	/* associated pagedep */
 };
Index: head/sys/ufs/ufs/ufsmount.h
===================================================================
--- head/sys/ufs/ufs/ufsmount.h	(revision 156202)
+++ head/sys/ufs/ufs/ufsmount.h	(revision 156203)
@@ -1,125 +1,133 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufsmount.h	8.6 (Berkeley) 3/30/95
  * $FreeBSD$
  */
 
 #ifndef _UFS_UFS_UFSMOUNT_H_
 #define _UFS_UFS_UFSMOUNT_H_
 
+#include <sys/buf.h>	/* XXX For struct workhead. */
+
 /*
  * Arguments to mount UFS-based filesystems (declared outside _KERNEL).
  */
 struct ufs_args {
 	char	*fspec;			/* block special device to mount */
 	struct	export_args export;	/* network export information */
 };
 
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_UFSMNT);
 #endif
 
 struct buf;
 struct inode;
 struct nameidata;
 struct timeval;
 struct ucred;
 struct uio;
 struct vnode;
 struct ufs_extattr_per_mount;	/* NOTE(review): ufsmount embeds this by value, so the full definition must already be visible */
 
 /* Per-mount UFS state; VFSTOUFS() casts a mount's mnt_data to this type. */
 struct ufsmount {
 	struct	mount *um_mountp;		/* filesystem vfs structure */
 	struct cdev *um_dev;			/* device mounted */
 	struct g_consumer *um_cp;		/* NOTE(review): presumably the consumer for um_dev; confirm */
 	struct bufobj *um_bo;			/* Buffer cache object */
 	struct	vnode *um_devvp;		/* block device mounted vnode */
 	u_long	um_fstype;			/* type of filesystem */
 	struct	fs *um_fs;			/* pointer to superblock */
 	struct	ufs_extattr_per_mount um_extattr;	/* extended attrs */
 	u_long	um_nindir;			/* indirect ptrs per block; see MNINDIR() */
 	u_long	um_bptrtodb;			/* indir ptr to disk block; shift count in blkptrtodb() */
 	u_long	um_seqinc;			/* inc between seq blocks; see is_sequential() */
 	struct	mtx um_lock;			/* Protects ufsmount & fs; see UFS_LOCK() */
 	long	um_numindirdeps;		/* outstanding indirdeps */
+	struct workhead softdep_workitem_pending; /* softdep work queue */
+	struct worklist *softdep_worklist_tail;	/* Tail pointer for above */
+	int	softdep_on_worklist;		/* Items on the worklist */
+	int	softdep_deps;			/* Total dependency count */
+	int	softdep_req;			/* Wakeup when deps hits 0. */
 	struct	vnode *um_quotas[MAXQUOTAS];	/* pointer to quota files */
 	struct	ucred *um_cred[MAXQUOTAS];	/* quota file access cred */
 	time_t	um_btime[MAXQUOTAS];		/* block quota time limit */
 	time_t	um_itime[MAXQUOTAS];		/* inode quota time limit */
 	char	um_qflags[MAXQUOTAS];		/* quota specific flags */
 	int64_t	um_savedmaxfilesize;		/* XXX - limit maxfilesize */
 	int	(*um_balloc)(struct vnode *, off_t, int, struct ucred *, int, struct buf **);	/* via UFS_BALLOC() */
 	int	(*um_blkatoff)(struct vnode *, off_t, char **, struct buf **);	/* via UFS_BLKATOFF() */
 	int	(*um_truncate)(struct vnode *, off_t, int, struct ucred *, struct thread *);	/* via UFS_TRUNCATE() */
 	int	(*um_update)(struct vnode *, int);	/* via UFS_UPDATE() */
 	int	(*um_valloc)(struct vnode *, int, struct ucred *, struct vnode **);	/* via UFS_VALLOC() */
 	int	(*um_vfree)(struct vnode *, ino_t, int);	/* via UFS_VFREE() */
 	void	(*um_ifree)(struct ufsmount *, struct inode *);	/* via UFS_IFREE() */
 };
 
 #define UFS_BALLOC(aa, bb, cc, dd, ee, ff) VFSTOUFS((aa)->v_mount)->um_balloc(aa, bb, cc, dd, ee, ff)
 #define UFS_BLKATOFF(aa, bb, cc, dd) VFSTOUFS((aa)->v_mount)->um_blkatoff(aa, bb, cc, dd)
 #define UFS_TRUNCATE(aa, bb, cc, dd, ee) VFSTOUFS((aa)->v_mount)->um_truncate(aa, bb, cc, dd, ee)
 #define UFS_UPDATE(aa, bb) VFSTOUFS((aa)->v_mount)->um_update(aa, bb)
 #define UFS_VALLOC(aa, bb, cc, dd) VFSTOUFS((aa)->v_mount)->um_valloc(aa, bb, cc, dd)
 #define UFS_VFREE(aa, bb, cc) VFSTOUFS((aa)->v_mount)->um_vfree(aa, bb, cc)
 #define UFS_IFREE(aa, bb) ((aa)->um_ifree(aa, bb))	/* aa is a ufsmount here, not a vnode; every macro above expands aa twice */
 
 #define	UFS_LOCK(aa)	mtx_lock(&(aa)->um_lock)	/* mtx_lock() on the ufsmount's um_lock */
 #define	UFS_UNLOCK(aa)	mtx_unlock(&(aa)->um_lock)	/* mtx_unlock() on the same mutex */
 #define	UFS_MTX(aa)	(&(aa)->um_lock)		/* address of um_lock itself */
 
 /*
  * Filesystem types (NOTE(review): presumably the um_fstype values; confirm)
  */
 #define UFS1	1
 #define UFS2	2
 
 /*
  * Quota state flags (NOTE(review): presumably kept in um_qflags[]; confirm).
  */
 #define	QTF_OPENING	0x01			/* Q_QUOTAON in progress */
 #define	QTF_CLOSING	0x02			/* Q_QUOTAOFF in progress */
 
 /* Convert mount ptr to ufsmount ptr. */
 #define VFSTOUFS(mp)	((struct ufsmount *)((mp)->mnt_data))
+#define	UFSTOVFS(ump)	((ump)->um_mountp)	/* and ufsmount ptr back to mount ptr */
 
 /*
  * Macros to access filesystem parameters in the ufsmount structure.
  * Used by ufs_bmap.  Each argument is parenthesized and expanded once.
  */
 #define MNINDIR(ump)			((ump)->um_nindir)
 #define	blkptrtodb(ump, b)		((b) << (ump)->um_bptrtodb)
 #define	is_sequential(ump, a, b)	((b) == (a) + (ump)->um_seqinc)
 #endif /* _KERNEL */
 
 #endif /* !_UFS_UFS_UFSMOUNT_H_ */